├── .gitattributes
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── README.md
├── benchmark
│   ├── bench_backend.sh
│   ├── bench_blas_a100.sh
│   ├── bench_blas_v100.sh
│   ├── bench_comm.sh
│   ├── bench_cublas_a100.sh
│   ├── bench_cublas_v100.sh
│   ├── bench_evaluator_a100.sh
│   ├── bench_evaluator_v100.sh
│   ├── bench_groupsz.sh
│   ├── bench_numgate.sh
│   ├── bench_pergate.sh
│   ├── bench_scale.sh
│   ├── bench_sharemem.sh
│   ├── bench_weak.sh
│   ├── blas.cu
│   ├── plot
│   │   └── plot.py
│   └── preprocess.sh
├── cmake
│   └── FindNccl.cmake
├── evaluator-preprocess
│   └── process.cpp
├── main.cpp
├── micro-benchmark
│   ├── bench-blas.cpp
│   ├── local-ctr.cpp
│   ├── local-single.cpp
│   └── two-group-h.cpp
├── scripts
│   ├── .gitignore
│   ├── check.sh
│   ├── check_wrapper.sh
│   ├── coalescing.sh
│   ├── compare.py
│   ├── env.sh
│   ├── gen_stdout.sh
│   ├── gpu-bind.sh
│   ├── init.sh
│   ├── run-multi-GPU.sh
│   ├── run-multi-node.sh
│   ├── run-single.sh
│   └── run.sh
├── src
│   ├── CMakeLists.txt
│   ├── circuit.cpp
│   ├── circuit.h
│   ├── compiler.cpp
│   ├── compiler.h
│   ├── evaluator.cpp
│   ├── evaluator.h
│   ├── executor.cpp
│   ├── executor.h
│   ├── gate.cpp
│   ├── gate.h
│   ├── kernel.h
│   ├── kernelOpt.cu
│   ├── kernelSimple.cu
│   ├── kernelUtils.cu
│   ├── kernels
│   │   ├── baseline.cu
│   │   ├── lookup.cu
│   │   └── swizzle.cu
│   ├── logger.cpp
│   ├── logger.h
│   ├── schedule.cpp
│   ├── schedule.h
│   ├── utils.cpp
│   └── utils.h
└── tests
    ├── input
    │   ├── basis_change_24.qasm
    │   ├── basis_change_25.qasm
    │   ├── basis_change_26.qasm
    │   ├── basis_change_27.qasm
    │   ├── basis_change_28.qasm
    │   ├── basis_change_29.qasm
    │   ├── basis_change_30.qasm
    │   ├── bv_28.qasm
    │   ├── hidden_shift_28.qasm
    │   ├── qaoa_28.qasm
    │   ├── qft_28.qasm
    │   ├── quantum_volume_28.qasm
    │   └── supremacy_28.qasm
    └── output
        ├── basis_change_25.log
        ├── basis_change_28.log
        ├── basis_change_30.log
        ├── bv_28.log
        ├── hidden_shift_28.log
        ├── qaoa_28.log
        ├── qft_28.log
        ├── quantum_volume_28.log
        └── supremacy_28.log
/.gitattributes:
--------------------------------------------------------------------------------
1 | tests/input/*.qasm filter=lfs diff=lfs merge=lfs -text
2 | tests/output/*.log filter=lfs diff=lfs merge=lfs -text
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | tests/
3 | evaluator-preprocess/parameter-files*
4 | .vscode/
5 | *.sqlite
6 | *.qdrep
7 | *.log
8 | *.profile
9 | blas
10 | *.pdf
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "third-party/cutt"]
2 | path = third-party/cutt
3 | url = https://github.com/heheda12345/cutt.git
4 | [submodule "third-party/dbg-macro"]
5 | path = third-party/dbg-macro
6 | url = https://github.com/sharkdp/dbg-macro.git
7 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.1)
2 | project(QCSimulatorRoot)
3 | set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}")
4 | find_package(CUDA REQUIRED)
5 | find_package(OpenMP REQUIRED)
6 | find_package(Nccl REQUIRED)
7 | find_package(MPI REQUIRED)
8 |
9 | find_library(CUTT cutt "${PROJECT_SOURCE_DIR}/third-party/cutt/cutt/lib")
10 | include_directories(${PROJECT_SOURCE_DIR}/third-party/cutt/cutt/include)
11 | include_directories(${PROJECT_SOURCE_DIR}/third-party/dbg-macro)
12 | MESSAGE(STATUS "Found CUTT: ${CUTT}")
13 |
14 | set(CMAKE_CXX_FLAGS "-std=c++14 -O2 -g -Wall ${OpenMP_CXX_FLAGS}")
15 | set(CUDA_NVCC_FLAGS "-Xcompiler -fopenmp -std=c++14 -O2 -g -arch=compute_70 -code=sm_70 --ptxas-options=-v -lineinfo -keep")
16 | set(BACKEND "group" CACHE STRING "Backend mode, one of [serial, group, group-serial, blas, mix, blas-advance]")
17 | MESSAGE(STATUS "Backend: ${BACKEND}")
18 |
19 | option(SHOW_SCHEDULE "Print the schedule" ON)
20 | option(SHOW_SUMMARY "Show the running details" ON)
21 | option(MEASURE_STAGE "Measure time of each stage" OFF)
22 | option(MICRO_BENCH "Compile micro-benchmarks" OFF)
23 | option(EVALUATOR_PREPROCESS "compile evaluator preprocess" OFF)
24 | option(DISABLE_ASSERT "Disable assert in CUDA runtime" ON)
25 | option(USE_DOUBLE "double or float" ON)
26 | option(ENABLE_OVERLAP "overlap" ON)
27 | option(USE_MPI "use mpi" OFF)
28 | option(OVERLAP_MAT "overlap initMatrix" ON)
29 | option(LOG_EVALUATOR "show logging of evaluator" OFF)
30 |
31 | if (BACKEND STREQUAL "serial")
32 | add_definitions(-DBACKEND=0)
33 | elseif(BACKEND STREQUAL "group")
34 | add_definitions(-DBACKEND=1)
35 | elseif(BACKEND STREQUAL "group-serial")
36 | add_definitions(-DBACKEND=2)
37 | elseif(BACKEND STREQUAL "blas")
38 | add_definitions(-DBACKEND=3)
39 | elseif(BACKEND STREQUAL "mix")
40 | add_definitions(-DBACKEND=4)
41 | elseif(BACKEND STREQUAL "blas-advance")
42 | add_definitions(-DBACKEND=5)
43 | else()
44 | MESSAGE(FATAL_ERROR "invalid mode")
45 | endif()
46 |
47 | if (SHOW_SCHEDULE)
48 | add_definitions(-DSHOW_SCHEDULE)
49 | endif(SHOW_SCHEDULE)
50 | if (SHOW_SUMMARY)
51 | add_definitions(-DSHOW_SUMMARY)
52 | endif(SHOW_SUMMARY)
53 | if (MEASURE_STAGE)
54 | add_definitions(-DMEASURE_STAGE)
55 | endif(MEASURE_STAGE)
56 | if (DISABLE_ASSERT)
57 | add_definitions(-DNDEBUG)
58 | else()
59 | add_definitions(-DDEBUG)
60 | endif(DISABLE_ASSERT)
61 | if (ENABLE_OVERLAP)
62 | add_definitions(-DENABLE_OVERLAP)
63 | endif(ENABLE_OVERLAP)
64 | if (USE_DOUBLE)
65 | MESSAGE(STATUS "Float type: Double")
66 | add_definitions(-DUSE_DOUBLE)
67 | else()
68 | MESSAGE(STATUS "Float type: Float")
69 | endif(USE_DOUBLE)
70 | if (OVERLAP_MAT)
71 | add_definitions(-DOVERLAP_MAT)
72 | endif(OVERLAP_MAT)
73 |
74 | if (USE_MPI)
75 | add_definitions(-DUSE_MPI=1)
76 | else()
77 | add_definitions(-DUSE_MPI=0)
78 | endif(USE_MPI)
79 |
80 | set(COALESCE "3" CACHE STRING "coalescing size")
81 | MESSAGE(STATUS "coalesce = ${COALESCE}")
82 | add_definitions(-DCOALESCE_GLOBAL_DEFINED=${COALESCE})
83 |
84 | set(MAT "6" CACHE STRING "mat size")
85 | MESSAGE(STATUS "mat size = ${MAT}")
86 | add_definitions(-DBLAS_MAT_LIMIT_DEFINED=${MAT})
87 |
88 | set(MIN_MAT "4" CACHE STRING "min mat size")
89 | MESSAGE(STATUS "min mat size = ${MIN_MAT}")
90 | add_definitions(-DMIN_MAT_SIZE_DEFINED=${MIN_MAT})
91 |
92 | set(THREAD_DEP "7" CACHE STRING "thread dep")
93 | MESSAGE(STATUS "thread_dep = ${THREAD_DEP}")
94 | add_definitions(-DTHREAD_DEP_DEFINED=${THREAD_DEP})
95 |
96 | if (EVALUATOR_PREPROCESS)
97 | set(PROCESS process)
98 | add_executable(process evaluator-preprocess/process.cpp)
99 | target_link_libraries(process QCSimulator ${CUTT} ${OpenMP_CXX_FLAGS} ${CUDA_CUBLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARY})
100 | add_definitions(-DUSE_EVALUATOR_PREPROCESS)
101 | endif(EVALUATOR_PREPROCESS)
102 |
103 | if(LOG_EVALUATOR)
104 | add_definitions(-DLOG_EVALUATOR)
105 | endif(LOG_EVALUATOR)
106 |
107 | include_directories ("${PROJECT_SOURCE_DIR}/src")
108 | add_subdirectory("src")
109 | add_executable(main main.cpp)
110 | target_link_libraries(main QCSimulator ${CUTT} ${OpenMP_CXX_FLAGS} ${CUDA_CUBLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARY})
111 |
112 | if (MICRO_BENCH)
113 | set(BENCHMARKS local-single local-ctr two-group-h bench-blas)
114 | foreach(BENCHMARK IN LISTS BENCHMARKS)
115 | add_executable(${BENCHMARK} micro-benchmark/${BENCHMARK}.cpp)
116 | target_link_libraries(${BENCHMARK} QCSimulator ${CUTT} ${OpenMP_CXX_FLAGS} ${CUDA_CUBLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARY})
117 | endforeach(BENCHMARK IN LISTS BENCHMARKS)
118 | endif(MICRO_BENCH)
119 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HyQuas
2 |
3 | HyQuas is a **Hy**brid partitioner based **Qua**ntum circuit **S**imulation system on GPU. It supports single-GPU, single-node multi-GPU, and multi-node multi-GPU quantum circuit simulation.
4 |
5 | For single-GPU simulation, it provides two highly optimized methods, *OShareMem* and *TransMM*. The *OShareMem* method optimizes shared-memory based quantum circuit simulation, while the *TransMM* method converts quantum circuit simulation into standard matrix operations, enabling the use of highly optimized libraries like cuBLAS and powerful hardware like Tensor Cores, and yielding a significant speedup over previous gate-merging based simulation. Moreover, HyQuas selects the better of the two methods for different parts of a given quantum circuit according to its pattern.
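
As a minimal illustration of the *TransMM* idea (a sketch only, not the actual HyQuas kernels; it assumes the fused qubits already occupy the lowest-order positions and skips the tensor-transpose step handled by the bundled `cutt` library), a fused 2^k x 2^k gate matrix can be applied to an n-qubit state with a single complex GEMM, the same call pattern used in `benchmark/blas.cu`:

```cpp
#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Illustrative only: apply one fused k-qubit gate matrix (2^k x 2^k) to an
// n-qubit state vector by viewing the 2^n amplitudes as a (2^k) x (2^(n-k))
// column-major matrix and issuing a single complex GEMM.
int main() {
    const int n = 20;                 // total qubits (illustrative)
    const int k = 6;                  // fused group size (illustrative)
    const long long dim = 1LL << n;   // number of amplitudes
    const int K = 1 << k;             // fused gate-matrix dimension

    cuDoubleComplex *state, *gate, *out;
    cudaMalloc(&state, sizeof(cuDoubleComplex) * dim);
    cudaMalloc(&gate,  sizeof(cuDoubleComplex) * K * K);
    cudaMalloc(&out,   sizeof(cuDoubleComplex) * dim);

    cublasHandle_t handle;
    cublasCreate(&handle);

    cuDoubleComplex one  = make_cuDoubleComplex(1.0, 0.0);
    cuDoubleComplex zero = make_cuDoubleComplex(0.0, 0.0);

    // out(K, dim/K) = gate(K, K) * state(K, dim/K)
    cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                K, (int)(dim / K), K,
                &one, gate, K,
                state, K,
                &zero, out, K);

    cudaDeviceSynchronize();
    printf("applied a %d-qubit fused gate to a %d-qubit state via one ZGEMM\n", k, n);

    cublasDestroy(handle);
    cudaFree(state); cudaFree(gate); cudaFree(out);
    return 0;
}
```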
6 |
7 | For distributed simulation, it provides a GPU-centric communication pipelining approach. It can utilize the high-throughput NVLink connections to make the simulation even faster while still preserving low communication traffic.
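
The sketch below gives the general shape of such a pipeline (illustrative only, not HyQuas's actual communication scheme; the two-GPU peer pattern, chunk size, and omitted compute kernels are placeholders): the state-vector exchange is split into chunks, and each chunk's `ncclSend`/`ncclRecv` is issued on a dedicated stream so it can overlap with computation launched on a separate compute stream.

```cpp
#include <cstdio>
#include <cuda_runtime.h>
#include <nccl.h>

// Illustrative pipelined exchange between two GPUs: the buffer is split into
// chunks, and each chunk is sent/received on a communication stream so that
// compute kernels (omitted) can run concurrently on a separate stream.
int main() {
    const int nDev = 2;
    int devs[2] = {0, 1};
    const size_t chunk = 1 << 20;     // elements per pipelined chunk (placeholder)
    const int nChunks = 8;

    ncclComm_t comms[2];
    ncclCommInitAll(comms, nDev, devs);

    double *sendbuf[2], *recvbuf[2];
    cudaStream_t commStream[2];
    for (int i = 0; i < nDev; i++) {
        cudaSetDevice(devs[i]);
        cudaMalloc(&sendbuf[i], chunk * nChunks * sizeof(double));
        cudaMalloc(&recvbuf[i], chunk * nChunks * sizeof(double));
        cudaStreamCreate(&commStream[i]);
    }

    for (int c = 0; c < nChunks; c++) {
        ncclGroupStart();
        for (int i = 0; i < nDev; i++) {
            ncclSend(sendbuf[i] + c * chunk, chunk, ncclDouble, 1 - i, comms[i], commStream[i]);
            ncclRecv(recvbuf[i] + c * chunk, chunk, ncclDouble, 1 - i, comms[i], commStream[i]);
        }
        ncclGroupEnd();
        // Compute kernels on already-received chunks would be launched here on a
        // separate compute stream, overlapping with the next chunk's transfer.
    }

    for (int i = 0; i < nDev; i++) {
        cudaSetDevice(devs[i]);
        cudaStreamSynchronize(commStream[i]);
        cudaStreamDestroy(commStream[i]);
        cudaFree(sendbuf[i]);
        cudaFree(recvbuf[i]);
        ncclCommDestroy(comms[i]);
    }
    printf("pipelined exchange of %d chunks finished\n", nChunks);
    return 0;
}
```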
8 |
9 | Experimental results show that HyQuas achieves significant speedups over state-of-the-art quantum circuit simulation systems, both on a single GPU and on a GPU cluster.
10 |
11 | ## Compile and Run
12 | 1. Get the source code
13 | ```bash
14 | git clone https://github.com/thu-pacman/HyQuas.git --recursive
15 | ```
16 |
17 | 2. Specify the compute capability in `CMakeLists.txt` (`CUDA_NVCC_FLAGS`) and `third-party/cutt/Makefile` (`GENCODE_FLAGS`)
18 |
19 | 3. Prepare the following dependencies
20 | * cmake (tested on 3.12.3)
21 | * cuda (tested on 10.2.89 and 11.0.2)
22 | * g++ (compatible with cuda)
23 | * cublas (with the same version of cuda)
24 | * openmpi (tested on 4.0.5)
25 | * nccl (fully tested on 2.9.6-1; 2.7.8-1 is known not to work, as it gets blocked in an NCCL-simulated MPI_Sendrecv)
26 | Then update environment variables such as `CUDA_HOME`, `NCCL_ROOT`, `PATH`, `LIBRARY_PATH`, `LD_LIBRARY_PATH`, and `CPATH` in `scripts/env.sh`.
27 |
28 | 4. Compile the tensor transpose library `cutt`
29 |
30 | ```bash
31 | cd third-party/cutt
32 | make -j
33 | ```
34 |
35 | 5. Specify the root directory
36 | ```bash
37 | export HYQUAS_ROOT=${The_directory_running_git_clone}/HyQuas
38 | ```
39 |
40 | 6. Prepare the database for the time predictor
41 | ```bash
42 | mkdir -p evaluator-preprocess/parameter-files
43 | cd benchmark
44 | ./preprocess.sh
45 | ```
46 |
47 | 7. Example usages of HyQuas:
48 | HyQuas uses all GPUs it can detect, so please control the number of GPUs with `CUDA_VISIBLE_DEVICES`.
49 | * Run a single circuit with single GPU
50 | ```bash
51 | cd scripts
52 | ./run-single.sh
53 | ```
54 |
55 | * Run a single circuit with multiple GPUs in one node
56 | ```bash
57 | cd scripts
58 | ./run-multi-GPU.sh
59 | ```
60 |
61 | * Run a single circuit with multiple GPUs in multiple nodes
62 | Please modify the `-host` option first.
63 | ```bash
64 | cd scripts
65 | ./run-multi-node.sh
66 | ```
67 |
68 | * Run all circuits and check the correctness (the script tries both with and without MPI)
69 | ```bash
70 | cd scripts
71 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./check.sh
72 | ```
73 |
74 | **Please use the commands in check.sh when evaluating the performance of HyQuas: the run-\*.sh scripts compile the simulator in debug mode, while check.sh compiles it in release mode.**
75 |
76 | For more ways to use our simulator (such as using only the *OShareMem* or *TransMM* method, or turning off the overlap of communication and computation), and for reproducing the results in our ICS'21 paper, please refer to the `benchmark/` directory.
77 |
78 | It also supports the following **unstable** features. See our dev branch for details.
79 | * Simulating more qubits by storing the state in CPU memory while still computing on the GPU.
80 | * An imperative mode, so that you do not need to explicitly call `c->compile();` and `c->run()`.
81 | * Support for more control qubits.
82 | * Support for some two-qubit gates.
83 | * Fast measurement of quantum state.
84 |
85 | # Cite
86 | To cite HyQuas, you can use the following BibTex:
87 | ```
88 | @inproceedings{10.1145/3447818.3460357,
89 | author = {Zhang, Chen and Song, Zeyu and Wang, Haojie and Rong, Kaiyuan and Zhai, Jidong},
90 | title = {HyQuas: Hybrid Partitioner Based Quantum Circuit Simulation System on GPU},
91 | year = {2021},
92 | isbn = {9781450383356},
93 | publisher = {Association for Computing Machinery},
94 | address = {New York, NY, USA},
95 | url = {https://doi.org/10.1145/3447818.3460357},
96 | doi = {10.1145/3447818.3460357},
97 | booktitle = {Proceedings of the ACM International Conference on Supercomputing},
98 | pages = {443–454},
99 | numpages = {12},
100 | keywords = {quantum computing, GPU computing, simulation},
101 | location = {Virtual Event, USA},
102 | series = {ICS '21}
103 | }
104 |
105 | ```
106 |
--------------------------------------------------------------------------------
/benchmark/bench_backend.sh:
--------------------------------------------------------------------------------
1 | set -e
2 | export CUDA_VISIBLE_DEVICES=0
3 | export MPIRUN_CONFIG=""
4 | head=../build/logs/`date +%Y%m%d-%H%M%S`
5 |
6 | cd ../scripts
7 |
8 | name=$head-group
9 | mkdir -p $name
10 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DUSE_MPI=off 2>&1 | tee $name/std.out
11 | grep -r "Time Cost" $name/*.log | tee ../benchmark/logs/backend.log
12 |
13 | name=$head-blas
14 | mkdir -p $name
15 | ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DUSE_MPI=off 2>&1 | tee $name/std.out
16 | grep -r "Time Cost" $name/*.log | tee -a ../benchmark/logs/backend.log
17 |
18 | name=$head-mix
19 | mkdir -p $name
20 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off 2>&1 | tee $name/std.out
21 | grep -r "Time Cost" $name/*.log | tee -a ../benchmark/logs/backend.log
22 |
23 |
--------------------------------------------------------------------------------
/benchmark/bench_blas_a100.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_VISIBLE_DEVICES=0
3 | ulimit -s unlimited
4 |
5 | source /opt/spack/share/spack/setup-env.sh
6 | spack load cuda@11
7 | NVPROF_COMMAND="nsys nvprof --profile-from-start=off -o test"
8 | export MPIRUN_CONFIG=""
9 | export tests_28="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28"
10 | export tests="$tests_25 $tests_28 $tests_30"
11 |
12 | head=../build/logs/`date +%Y%m%d-%H%M%S`
13 | logdir=../benchmark/logs/
14 | echo tests=$tests
15 | cd ../scripts
16 |
17 | name=$head-m3
18 | mkdir -p $name
19 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=3 -DMIN_MAT=3 2>&1 | tee $name/std.out
20 | echo "+++++ 3" | tee $logdir/blas-profile.log
21 | for test in ${tests[*]}; do
22 | echo "===== $test" | tee -a $name/circ.profile
23 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
24 | grep "cutlass" tmp.profile | tee -a $name/circ.profile
25 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
26 | done
27 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
28 | name3=$name
29 |
30 | name=$head-m4
31 | mkdir -p $name
32 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=4 2>&1 | tee $name/std.out
33 | echo "+++++ 4" | tee -a $logdir/blas-profile.log
34 | for test in ${tests[*]}; do
35 | echo "===== $test" | tee -a $name/circ.profile
36 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
37 | grep "cutlass" tmp.profile | tee -a $name/circ.profile
38 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
39 | done
40 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
41 | name4=$name
42 |
43 | name=$head-m5
44 | mkdir -p $name
45 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=5 2>&1 | tee $name/std.out
46 | echo "+++++ 5" | tee -a $logdir/blas-profile.log
47 | for test in ${tests[*]}; do
48 | echo "===== $test" | tee -a $name/circ.profile
49 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
50 | grep "cutlass" tmp.profile | tee -a $name/circ.profile
51 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
52 | done
53 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
54 | name5=$name
55 |
56 | name=$head-m6
57 | mkdir -p $name
58 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=6 2>&1 | tee $name/std.out
59 | echo "+++++ 6" | tee -a $logdir/blas-profile.log
60 | for test in ${tests[*]}; do
61 | echo "===== $test" | tee -a $name/circ.profile
62 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
63 | grep "cutlass" tmp.profile | tee -a $name/circ.profile
64 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
65 | done
66 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
67 | name6=$name
68 |
69 | name=$head-m7
70 | mkdir -p $name
71 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=7 2>&1 | tee $name/std.out
72 | echo "+++++ 7" | tee -a $logdir/blas-profile.log
73 | for test in ${tests[*]}; do
74 | echo "===== $test" | tee -a $name/circ.profile
75 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
76 | grep "cutlass" tmp.profile | tee -a $name/circ.profile
77 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
78 | done
79 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
80 | name7=$name
81 |
82 | name=$head-m8
83 | mkdir -p $name
84 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=8 2>&1 | tee $name/std.out
85 | echo "+++++ 8" | tee -a $logdir/blas-profile.log
86 | for test in ${tests[*]}; do
87 | echo "===== $test" | tee -a $name/circ.profile
88 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
89 | grep "cutlass" tmp.profile | tee -a $name/circ.profile
90 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
91 | done
92 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
93 | name8=$name
94 |
95 | name=$head-m9
96 | mkdir -p $name
97 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=9 2>&1 | tee $name/std.out
98 | echo "+++++ 9" | tee -a $logdir/blas-profile.log
99 | for test in ${tests[*]}; do
100 | echo "===== $test" | tee -a $name/circ.profile
101 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
102 | grep "cutlass" tmp.profile | tee -a $name/circ.profile
103 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
104 | done
105 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
106 | name9=$name
107 |
108 | name=$head-m10
109 | mkdir -p $name
110 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=10 2>&1 | tee $name/std.out
111 | echo "+++++ 10" | tee -a $logdir/blas-profile.log
112 | for test in ${tests[*]}; do
113 | echo "===== $test" | tee -a $name/circ.profile
114 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
115 | grep "cutlass" tmp.profile | tee -a $name/circ.profile
116 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
117 | done
118 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
119 | name10=$name
120 |
121 | grep -r "Time Cost" $head-m*/*.log | tee $logdir/blas.log
122 | grep -r "Total Groups" $head-*/*.log | tee -a $logdir/blas.log
123 |
--------------------------------------------------------------------------------
/benchmark/bench_blas_v100.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_VISIBLE_DEVICES=0
3 | ulimit -s unlimited
4 |
5 | export MPIRUN_CONFIG=""
6 | NVPROF_COMMAND="nvprof --profile-from-start off --csv"
7 |
8 | export tests_28="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28"
9 | export tests="$tests_28"
10 |
11 | head=../build/logs/`date +%Y%m%d-%H%M%S`
12 | logdir=../benchmark/logs/
13 | echo tests=$tests
14 | cd ../scripts
15 |
16 | name=$head-m3
17 | mkdir -p $name
18 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=3 -DMIN_MAT=3 2>&1 | tee $name/std.out
19 | echo "+++++ 3" | tee $logdir/transmm-profile.log
20 | for test in ${tests[*]}; do
21 | echo "===== $test" | tee -a $name/circ.profile
22 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
23 | grep "volta" tmp.profile | tee -a $name/circ.profile
24 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
25 | done
26 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
27 | name3=$name
28 |
29 | name=$head-m4
30 | mkdir -p $name
31 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=4 2>&1 | tee $name/std.out
32 | echo "+++++ 4" | tee -a $logdir/transmm-profile.log
33 | for test in ${tests[*]}; do
34 | echo "===== $test" | tee -a $name/circ.profile
35 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
36 | grep "volta" tmp.profile | tee -a $name/circ.profile
37 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
38 | done
39 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
40 | name4=$name
41 |
42 | name=$head-m5
43 | mkdir -p $name
44 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=5 2>&1 | tee $name/std.out
45 | echo "+++++ 5" | tee -a $logdir/transmm-profile.log
46 | for test in ${tests[*]}; do
47 | echo "===== $test" | tee -a $name/circ.profile
48 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
49 | grep "volta" tmp.profile | tee -a $name/circ.profile
50 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
51 | done
52 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
53 | name5=$name
54 |
55 | name=$head-m6
56 | mkdir -p $name
57 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=6 2>&1 | tee $name/std.out
58 | echo "+++++ 6" | tee -a $logdir/transmm-profile.log
59 | for test in ${tests[*]}; do
60 | echo "===== $test" | tee -a $name/circ.profile
61 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
62 | grep "volta" tmp.profile | tee -a $name/circ.profile
63 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
64 | done
65 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
66 | name6=$name
67 |
68 | name=$head-m7
69 | mkdir -p $name
70 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=7 2>&1 | tee $name/std.out
71 | echo "+++++ 7" | tee -a $logdir/transmm-profile.log
72 | for test in ${tests[*]}; do
73 | echo "===== $test" | tee -a $name/circ.profile
74 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
75 | grep "volta" tmp.profile | tee -a $name/circ.profile
76 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
77 | done
78 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
79 | name7=$name
80 |
81 | name=$head-m8
82 | mkdir -p $name
83 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=8 2>&1 | tee $name/std.out
84 | echo "+++++ 8" | tee -a $logdir/transmm-profile.log
85 | for test in ${tests[*]}; do
86 | echo "===== $test" | tee -a $name/circ.profile
87 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
88 | grep "volta" tmp.profile | tee -a $name/circ.profile
89 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
90 | done
91 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
92 | name8=$name
93 |
94 | name=$head-m9
95 | mkdir -p $name
96 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=9 2>&1 | tee $name/std.out
97 | echo "+++++ 9" | tee -a $logdir/transmm-profile.log
98 | for test in ${tests[*]}; do
99 | echo "===== $test" | tee -a $name/circ.profile
100 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
101 | grep "volta" tmp.profile | tee -a $name/circ.profile
102 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
103 | done
104 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
105 | name9=$name
106 |
107 | name=$head-m10
108 | mkdir -p $name
109 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=10 2>&1 | tee $name/std.out
110 | echo "+++++ 10" | tee -a $logdir/transmm-profile.log
111 | for test in ${tests[*]}; do
112 | echo "===== $test" | tee -a $name/circ.profile
113 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
114 | grep "volta" tmp.profile | tee -a $name/circ.profile
115 | grep "void transpose" tmp.profile | tee -a $name/circ.profile
116 | done
117 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
118 | name10=$name
119 |
120 | grep -r "Time Cost" $head-m*/*.log | tee $logdir/transmm.log
121 | grep -r "Total Groups" $head-*/*.log | tee -a $logdir/transmm.log
122 |
--------------------------------------------------------------------------------
/benchmark/bench_comm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | mkdir -p logs
3 | mkdir -p logs/bench_comm
4 | mkdir -p logs/bench_comm/4V100
5 | mkdir -p logs/bench_comm/2V100
6 | cd ../scripts
7 | source ./init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off 2>&1
8 | cd ../benchmark
9 |
10 | tests="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28"
11 |
12 | echo "test 4V100"
13 |
14 | for test in $tests; do
15 | echo $test
16 | CUDA_VISIBLE_DEVICES=0,1,2,3 nvprof --print-gpu-trace ../build/main ../tests/input/$test.qasm 1>logs/bench_comm/4V100/$test.log 2>logs/bench_comm/4V100/$test.out
17 | done
18 |
19 | echo "test 2V100"
20 |
21 | for test in $tests; do
22 | echo $test
23 | CUDA_VISIBLE_DEVICES=0,1 nvprof --print-gpu-trace ../build/main ../tests/input/$test.qasm 1>logs/bench_comm/2V100/$test.log 2>logs/bench_comm/2V100/$test.out
24 | done
25 |
--------------------------------------------------------------------------------
/benchmark/bench_cublas_a100.sh:
--------------------------------------------------------------------------------
1 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=26
2 | echo N_QUBIT=26
3 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log
4 |
5 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=27
6 | echo N_QUBIT=27
3 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log
8 |
9 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=28
10 | echo N_QUBIT=28
7 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-a100.log
12 |
13 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=29
14 | echo N_QUBIT=29
11 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-a100.log
16 |
17 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=30
18 | echo N_QUBIT=30
15 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-a100.log
20 |
--------------------------------------------------------------------------------
/benchmark/bench_cublas_v100.sh:
--------------------------------------------------------------------------------
1 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=24
2 | echo N_QUBIT=24
3 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-v100.log
4 |
5 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=25
6 | echo N_QUBIT=25
7 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log
8 |
9 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=26
10 | echo N_QUBIT=26
11 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log
12 |
13 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=27
14 | echo N_QUBIT=27
15 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log
16 |
17 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=28
18 | echo N_QUBIT=28
19 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log
20 |
--------------------------------------------------------------------------------
/benchmark/bench_evaluator_a100.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | mkdir -p logs
3 | mkdir -p logs/evaluator_a100
4 |
5 | cd ../scripts
6 | echo "OShareMem"
7 | source ./init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on 2>&1
8 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/OShareMem.log
9 | cd ../benchmark
10 |
11 | cd ../scripts
12 | echo "TransMM MAT=5"
13 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=5 2>&1
14 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/TransMM_5.log
15 | cd ../benchmark
16 |
17 | cd ../scripts
18 | echo "TransMM MAT=6"
19 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=6 2>&1
20 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/TransMM_6.log
21 | cd ../benchmark
22 |
23 | cd ../scripts
24 | echo "TransMM MAT=7"
25 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=7 2>&1
26 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/TransMM_7.log
27 | cd ../benchmark
28 |
--------------------------------------------------------------------------------
/benchmark/bench_evaluator_v100.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | mkdir -p logs
3 | mkdir -p logs/evaluator_v100
4 |
5 | cd ../scripts
6 | echo "OShareMem"
7 | source ./init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on 2>&1
8 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/OShareMem.log
9 | cd ../benchmark
10 |
11 | cd ../scripts
12 | echo "TransMM MAT=5"
13 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=5 2>&1
14 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/TransMM_5.log
15 | cd ../benchmark
16 |
17 | cd ../scripts
18 | echo "TransMM MAT=6"
19 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=6 2>&1
20 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/TransMM_6.log
21 | cd ../benchmark
22 |
23 | cd ../scripts
24 | echo "TransMM MAT=7"
25 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=7 2>&1
26 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/TransMM_7.log
27 | cd ../benchmark
28 |
--------------------------------------------------------------------------------
/benchmark/bench_groupsz.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ulimit -s unlimited
3 |
4 | cd ../scripts
5 |
6 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=3 -DMIN_MAT=3
7 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee ../benchmark/logs/groupsz-tm.log
8 |
9 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=4 -DMIN_MAT=4
10 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
11 |
12 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=5 -DMIN_MAT=5
13 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
14 |
15 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=6 -DMIN_MAT=6
16 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
17 |
18 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=7 -DMIN_MAT=7
19 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
20 |
21 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=8 -DMIN_MAT=8
22 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
23 |
24 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=9 -DMIN_MAT=9
25 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
26 |
27 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=10 -DMIN_MAT=10
28 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
29 |
--------------------------------------------------------------------------------
/benchmark/bench_numgate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export CUDA_VISIBLE_DEVICES=0
4 | head=../build/logs/`date +%Y%m%d-%H%M%S`
5 |
6 | cd ../scripts
7 |
8 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu
9 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on -DTHREAD_DEP=9 2>&1
10 | CUDA_VISIBLE_DEVICES=0 ./two-group-h | tee ../benchmark/logs/numgate-sm.log
11 |
12 | cp ../src/kernels/swizzle.cu ../src/kernelOpt.cu
13 |
--------------------------------------------------------------------------------
/benchmark/bench_pergate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | MPIRUN_CONFIG=""
3 |
4 | cd ../scripts
5 |
6 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu
7 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on -DTHREAD_DEP=9 2>&1
8 | echo "baseline" | tee pergate.log
9 | CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log
10 |
11 | cd ../scripts
12 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu
13 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on 2>&1
14 | echo "multitask" | tee -a pergate.log
15 | CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log
16 |
17 | cd ../scripts
18 | cp ../src/kernels/lookup.cu ../src/kernelOpt.cu
19 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on 2>&1
20 | echo "lookup" | tee -a pergate.log
21 | CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log
22 |
23 | cd ../scripts
24 | cp ../src/kernels/swizzle.cu ../src/kernelOpt.cu
25 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on 2>&1
26 | echo "bank" | tee -a pergate.log
27 | CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log
28 |
29 | cp pergate.log ../benchmark/logs
30 | cat pergate.log
31 |
--------------------------------------------------------------------------------
/benchmark/bench_scale.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | head=../build/logs/`date +%Y%m%d-%H%M%S`
3 |
4 |
5 | cd ../scripts
6 | export CUDA_VISIBLE_DEVICES=0
7 | export MPIRUN_CONFIG=""
8 |
9 | name=$head-1gpu-o
10 | mkdir -p $name
11 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on 2>&1 | tee $name/std.out
12 | name1=$name
13 |
14 | export CUDA_VISIBLE_DEVICES=0,1
15 | name=$head-2gpu-o
16 | mkdir -p $name
17 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on 2>&1 | tee $name/std.out
18 | name2=$name
19 |
20 | export CUDA_VISIBLE_DEVICES=0,1,2,3
21 | name=$head-4gpu-o
22 | mkdir -p $name
23 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on 2>&1 | tee $name/std.out
24 | name3=$name
25 |
26 | export CUDA_VISIBLE_DEVICES=0
27 | name=$head-1gpu-s
28 | mkdir -p $name
29 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=off 2>&1 | tee $name/std.out
30 | name1=$name
31 |
32 | export CUDA_VISIBLE_DEVICES=0,1
33 | name=$head-2gpu-s
34 | mkdir -p $name
35 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=off 2>&1 | tee $name/std.out
36 | name2=$name
37 |
38 | export CUDA_VISIBLE_DEVICES=0,1,2,3
39 | name=$head-4gpu-s
40 | mkdir -p $name
41 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=off 2>&1 | tee $name/std.out
42 | name3=$name
43 |
44 |
45 | grep -r "Time Cost" $head-*/*.log | tee ../benchmark/logs/scale.log
46 |
47 | export CUDA_VISIBLE_DEVICES=0,1,2,3
48 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on
49 | nvprof ./main ../tests/input/hidden_shift_28.qasm 2>&1 | tee ../benchmark/logs/hs.log
50 |
--------------------------------------------------------------------------------
/benchmark/bench_sharemem.sh:
--------------------------------------------------------------------------------
1 | set -e
2 | export CUDA_VISIBLE_DEVICES=0
3 | export MPIRUN_CONFIG=""
4 | name=../build/logs/`date +%Y%m%d-%H%M%S`
5 |
6 | cd ../scripts
7 |
8 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu
9 |
10 | mkdir -p $name
11 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on -DTHREAD_DEP=9 2>&1 | tee $name/std.out
12 | grep -r "Time Cost" $name/*.log | tee ../benchmark/logs/sharemem.log
13 |
14 | cp ../src/kernels/swizzle.cu ../src/kernelOpt.cu
15 |
--------------------------------------------------------------------------------
/benchmark/bench_weak.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off
3 | LOG=../benchmark/logs
4 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_24.qasm 2>&1 | tee $LOG/weak.log
5 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_24.qasm 2>&1 | tee -a $LOG/weak.log
6 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_24.qasm 2>&1 | tee -a $LOG/weak.log
7 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_25.qasm 2>&1 | tee -a $LOG/weak.log
8 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_25.qasm 2>&1 | tee -a $LOG/weak.log
9 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_25.qasm 2>&1 | tee -a $LOG/weak.log
10 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_26.qasm 2>&1 | tee -a $LOG/weak.log
11 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_26.qasm 2>&1 | tee -a $LOG/weak.log
12 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_26.qasm 2>&1 | tee -a $LOG/weak.log
13 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_27.qasm 2>&1 | tee -a $LOG/weak.log
14 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_27.qasm 2>&1 | tee -a $LOG/weak.log
15 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_27.qasm 2>&1 | tee -a $LOG/weak.log
16 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_28.qasm 2>&1 | tee -a $LOG/weak.log
17 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_28.qasm 2>&1 | tee -a $LOG/weak.log
18 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_28.qasm 2>&1 | tee -a $LOG/weak.log
19 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_29.qasm 2>&1 | tee -a $LOG/weak.log
20 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_29.qasm 2>&1 | tee -a $LOG/weak.log
21 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_30.qasm 2>&1 | tee -a $LOG/weak.log
22 |
23 | grep -r "Logger" $LOG/weak.log | tee $LOG/weak_summary.log
--------------------------------------------------------------------------------
/benchmark/blas.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cuda_runtime.h>
3 | #include <cublas_v2.h>
4 |
5 |
6 | static const char *cublasGetErrorString(cublasStatus_t error)
7 | {
8 | switch (error)
9 | {
10 | case CUBLAS_STATUS_SUCCESS:
11 | return "CUBLAS_STATUS_SUCCESS";
12 | case CUBLAS_STATUS_NOT_INITIALIZED:
13 | return "CUBLAS_STATUS_NOT_INITIALIZED";
14 | case CUBLAS_STATUS_ALLOC_FAILED:
15 | return "CUBLAS_STATUS_ALLOC_FAILED";
16 | case CUBLAS_STATUS_INVALID_VALUE:
17 | return "CUBLAS_STATUS_INVALID_VALUE";
18 | case CUBLAS_STATUS_ARCH_MISMATCH:
19 | return "CUBLAS_STATUS_ARCH_MISMATCH";
20 | case CUBLAS_STATUS_MAPPING_ERROR:
21 | return "CUBLAS_STATUS_MAPPING_ERROR";
22 | case CUBLAS_STATUS_EXECUTION_FAILED:
23 | return "CUBLAS_STATUS_EXECUTION_FAILED";
24 | case CUBLAS_STATUS_INTERNAL_ERROR:
25 | return "CUBLAS_STATUS_INTERNAL_ERROR";
26 | default:
27 | return "";
28 | }
29 | return "";
30 | }
31 |
32 | #define checkCudaErrors(stmt) { \
33 | cudaError_t err = stmt; \
34 | if (err != cudaSuccess) { \
35 | fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cudaGetErrorString(err)); \
36 | exit(1); \
37 | } \
38 | }
39 |
40 | #define checkCuttErrors(stmt) { \
41 | cuttResult err = stmt; \
42 | if (err != CUTT_SUCCESS) { \
43 | fprintf(stderr, "%s in file %s, function %s, line %i.\n", #stmt, __FILE__, __FUNCTION__, __LINE__); \
44 | exit(1); \
45 | } \
46 | }
47 |
48 | #define checkBlasErrors(stmt) { \
49 | cublasStatus_t err = stmt; \
50 | if (err != CUBLAS_STATUS_SUCCESS) { \
51 | fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cublasGetErrorString(err)); \
52 | exit(1); \
53 | } \
54 | }
55 |
56 | int main() {
57 | int nq = N_QUBIT;
58 | cuDoubleComplex* arr;
59 | cuDoubleComplex* mat;
60 | cuDoubleComplex* result;
61 | checkCudaErrors(cudaMalloc(&arr, sizeof(cuDoubleComplex) << nq));
62 | checkCudaErrors(cudaMalloc(&mat, sizeof(cuDoubleComplex) * 1024 * 1024));
63 | checkCudaErrors(cudaMalloc(&result, sizeof(cuDoubleComplex) << nq));
64 | cublasHandle_t handle;
65 | checkBlasErrors(cublasCreate(&handle));
66 | // checkBlasErrors(cublasSetMathMode(handle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION));
67 | int numElements = 1 << nq;
68 | cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0), beta = make_cuDoubleComplex(0.0, 0.0);
69 | cudaEvent_t start, stop;
70 | checkCudaErrors(cudaEventCreate(&start));
71 | checkCudaErrors(cudaEventCreate(&stop));
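    // Sweep the fused gate-matrix dimension K = 2, 4, ..., 512: each iteration times
    // 100 ZGEMM calls that treat the 2^nq amplitudes as a K x (2^nq / K) column-major
    // matrix multiplied by a K x K matrix, printing the per-call time in milliseconds.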
72 | for (int K = 2; K < 1024; K <<= 1) {
73 | printf("K = %d\n", K);
74 | for (int i = 0; i < 100; i++) {
75 | checkCudaErrors(cudaEventRecord(start));
76 |
77 | checkBlasErrors(cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
78 | K, numElements / K, K, // M, N, K
79 | &alpha, mat, K, // alpha, a, lda
80 | arr, K, // b, ldb
81 | &beta, result, K // beta, c, ldc
82 | ));
83 |
84 | float time;
85 | checkCudaErrors(cudaEventRecord(stop));
86 | cudaEventSynchronize(stop);
87 | cudaEventElapsedTime(&time, start, stop);
88 | printf("%.10f ", time);
89 | }
90 | printf("\n");
91 | }
92 | return 0;
93 | }
94 |
--------------------------------------------------------------------------------
/benchmark/preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source ../scripts/init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DDISABLE_ASSERT=on -DENABLE_OVERLAP=on -DMEASURE_STAGE=off -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMAT=7
3 | CUDA_VISIBLE_DEVICES=0 ./process
--------------------------------------------------------------------------------
/cmake/FindNccl.cmake:
--------------------------------------------------------------------------------
1 | if (NCCL_LIBRARY)
2 | if(NOT USE_NCCL_LIB_PATH)
3 | # Don't cache NCCL_LIBRARY to enable switching between static and shared.
4 | unset(NCCL_LIBRARY CACHE)
5 | endif(NOT USE_NCCL_LIB_PATH)
6 | endif()
7 |
8 | if (BUILD_WITH_SHARED_NCCL)
9 | # libnccl.so
10 | set(NCCL_LIB_NAME nccl)
11 | else ()
12 | # libnccl_static.a
13 | set(NCCL_LIB_NAME nccl_static)
14 | endif (BUILD_WITH_SHARED_NCCL)
15 |
16 | find_path(NCCL_INCLUDE_DIR
17 | NAMES nccl.h
18 | PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)
19 |
20 | find_library(NCCL_LIBRARY
21 | NAMES ${NCCL_LIB_NAME}
22 | PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)
23 |
24 | message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
25 |
26 | include(FindPackageHandleStandardArgs)
27 | find_package_handle_standard_args(Nccl DEFAULT_MSG
28 | NCCL_INCLUDE_DIR NCCL_LIBRARY)
29 |
30 | mark_as_advanced(
31 | NCCL_INCLUDE_DIR
32 | NCCL_LIBRARY
33 | )
--------------------------------------------------------------------------------
/evaluator-preprocess/process.cpp:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cstdlib>
3 | #include <cstring>
4 | #include <string>
5 | #include <algorithm>
6 | #include <chrono>
7 | #include <cuda_runtime.h>
8 | #include <cublas_v2.h>
9 | #include <cutt.h>
10 | #include <vector>
11 | #include "circuit.h"
12 | #include "logger.h"
13 | using namespace std;
14 |
15 | #define DIFF_QUBIT_NUMS 7
16 | int qubit_nums[DIFF_QUBIT_NUMS] = {22, 23, 24, 25, 26, 27, 28};
17 |
18 | FILE* curr_file;
19 |
20 | #define CALC_ALL_PARAM 0
21 | #define CALC_PARTIAL_PARAM 1
22 | const int param_type = CALC_PARTIAL_PARAM;
23 |
24 | void procPerGateSingle(int numQubits) {
25 | int num_gates = 512;
26 | for (int i = int(GateType::U1); i < int(GateType::TOTAL); i++) {
27 | printf("single gate %s\n", Gate::get_name(GateType(i)).c_str());
28 | if(param_type == CALC_ALL_PARAM) {
29 | for (int j = 0; j < LOCAL_QUBIT_SIZE; j++) {
30 | Circuit c(numQubits);
31 | for (int k = 0; k < num_gates; k++) {
32 | c.addGate(Gate::random(j, j + 1, GateType(i)));
33 | }
34 | c.compile();
35 | int time = c.run(false);
36 | fprintf(curr_file, "%d ", time);
37 | }
38 | }
39 | else {
40 | Circuit c(numQubits);
41 | for (int k = 0; k < num_gates; k++) {
42 | c.addGate(Gate::random(1, 1 + 1, GateType(i)));
43 | }
44 | c.compile();
45 | int time = c.run(false);
46 | fprintf(curr_file, "%d ", time);
47 | }
48 | fprintf(curr_file, "\n");
49 | }
50 | fprintf(curr_file, "\n");
51 | }
52 |
53 | void procPerGateCtr(int numQubits) {
54 | int num_gates = 512;
55 | for (int g = int(GateType::CNOT); g <= int(GateType::CRZ); g++) {
56 | printf("control gate %s\n", Gate::get_name(GateType(g)).c_str());
57 | if(param_type == CALC_ALL_PARAM) {
58 | for (int i = 0; i < LOCAL_QUBIT_SIZE; i++) {
59 | for (int j = 0; j < LOCAL_QUBIT_SIZE; j++) {
60 | if (i == j) { fprintf(curr_file, "0 "); continue; }
61 | Circuit c(numQubits);
62 | for (int k = 0; k < num_gates; k++) {
63 | c.addGate(Gate::control(i, j, GateType(g)));
64 | }
65 | c.compile();
66 | int time = c.run(false);
67 | fprintf(curr_file, "%d ", time);
68 | }
69 | fprintf(curr_file, "\n");
70 | }
71 | }
72 | else {
73 | Circuit c(numQubits);
74 | for (int k = 0; k < num_gates; k++) {
75 | c.addGate(Gate::control(0, 2, GateType(g)));
76 | }
77 | c.compile();
78 | int time = c.run(false);
79 | fprintf(curr_file, "%d ", time);
80 | }
81 | fprintf(curr_file, "\n");
82 | }
83 | }
84 |
85 | void procBLAS(int numQubits) {
86 | cuDoubleComplex* arr;
87 | cuDoubleComplex* mat;
88 | cuDoubleComplex* result;
89 | checkCudaErrors(cudaMalloc(&arr, sizeof(cuDoubleComplex) << numQubits));
90 | checkCudaErrors(cudaMalloc(&mat, sizeof(cuDoubleComplex) << 20));
91 | checkCudaErrors(cudaMalloc(&result, sizeof(cuDoubleComplex) << numQubits));
92 | cublasHandle_t handle;
93 | checkBlasErrors(cublasCreate(&handle));
94 | qindex numElements = qindex(1) << numQubits;
95 | cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0), beta = make_cuDoubleComplex(0.0, 0.0);
96 | cudaEvent_t start, stop;
97 | checkCudaErrors(cudaEventCreate(&start));
98 | checkCudaErrors(cudaEventCreate(&stop));
99 | for (int K = 1; K < 1024; K <<= 1) {
100 | printf("blas calculating K = %d\n", K);
101 | double sum_time = 0.0;
102 | for (int i = 0; i < 100; i++) {
103 | checkCudaErrors(cudaEventRecord(start));
104 |
105 | checkBlasErrors(cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
106 | K, numElements / K, K, // M, N, K
107 | &alpha, mat, K, // alpha, a, lda
108 | arr, K, // b, ldb
109 | &beta, result, K // beta, c, ldc
110 | ));
111 |
112 | float time;
113 | checkCudaErrors(cudaEventRecord(stop));
114 | cudaEventSynchronize(stop);
115 | cudaEventElapsedTime(&time, start, stop);
116 | sum_time += time;
117 | //printf("%.10f ", time);
118 |
119 | }
120 | //printf("\n");
121 | fprintf(curr_file, "%d %f\n", K, sum_time / 100);
122 | }
123 | fprintf(curr_file, "\n");
124 | checkCudaErrors(cudaFree(arr));
125 | checkCudaErrors(cudaFree(mat));
126 | checkCudaErrors(cudaFree(result));
127 | }
128 |
129 | void procCutt(int numQubits) {
130 | double *in, *out;
131 | checkCudaErrors(cudaMalloc(&in, sizeof(double2) << numQubits));
132 | checkCudaErrors(cudaMalloc(&out, sizeof(double2) << numQubits));
133 | int dim[numQubits];
134 | for (int i = 0; i < numQubits; i++) dim[i] = 2;
135 | int total = 0;
136 | double sum_time = 0.0;
137 | for (int change = 1; change <= 20; change ++) {
138 | int perm[numQubits];
139 | printf("Cutt calculating change = %d\n", change);
140 | for (int tt = 0; tt < 100; tt++) {
141 | for (int i = 0; i < numQubits; i++) perm[i] = i;
142 | for (int i = 0; i < change; i++) {
143 | std::swap(perm[rand() % numQubits], perm[rand() % numQubits]);
144 | }
145 | cuttHandle plan;
146 | checkCuttErrors(cuttPlan(&plan, numQubits, dim, perm, sizeof(double2), 0));
147 | cudaEvent_t start, stop;
148 | float time;
149 | checkCudaErrors(cudaEventCreate(&start));
150 | checkCudaErrors(cudaEventCreate(&stop));
151 | checkCudaErrors(cudaEventRecord(start, 0));
152 | checkCuttErrors(cuttExecute(plan, in, out));
153 | checkCudaErrors(cudaEventRecord(stop, 0));
154 | checkCudaErrors(cudaEventSynchronize(stop));
155 | checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
156 | //printf("%.10f ms ", time);
157 | total ++;
158 | sum_time += time;
159 | }
160 | //printf("\n");
161 | }
162 | fprintf(curr_file, "%f\n", sum_time / total);
163 | checkCudaErrors(cudaFree(in));
164 | checkCudaErrors(cudaFree(out));
165 | }
166 |
167 | void process(int numQubits) {
168 | printf("processing qubit number : %d\n", numQubits);
169 | string file_name = string("../evaluator-preprocess/parameter-files/") + to_string(numQubits) + string("qubits.out");
170 | curr_file = fopen(file_name.c_str(), "w");
171 | fprintf(curr_file, "%d\n", param_type);
172 |
173 | procPerGateSingle(numQubits);
174 | procPerGateCtr(numQubits);
175 | procBLAS(numQubits);
176 | procCutt(numQubits);
177 | fclose(curr_file);
178 | }
179 |
180 | int main()
181 | {
182 | auto start = chrono::system_clock::now();
183 | MyGlobalVars::init();
184 | for(int i = 0; i < DIFF_QUBIT_NUMS; i++) {
185 | process(qubit_nums[i]);
186 | }
187 | auto end = chrono::system_clock::now();
188 | printf("process time %d ms\n", chrono::duration_cast(end - start).count());
189 | return 0;
190 | }
191 |
--------------------------------------------------------------------------------
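The parameter files written by process() are the ones Evaluator::loadParam (src/evaluator.cpp) reads back, so the write order above must match the read order there. Roughly, each <n>qubits.out file contains:

    <param_type>
    one timing line per single-qubit gate type (GateType::U1 up to GateType::TOTAL), from procPerGateSingle
    one timing line per controlled gate type (GateType::CNOT through GateType::CRZ), from procPerGateCtr
    one "K average_time" line per K = 1, 2, ..., 512, from procBLAS
    the average cuTT transpose time, from procCutt

The per-gate entries are total times of 512-gate circuits measured by Circuit::run, the BLAS entries average 100 Zgemm calls per K, and the cuTT entry averages 100 random transpositions for each of 20 permutation sizes.
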
/main.cpp:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cstring>
3 | #include <cmath>
4 | #include <cassert>
5 | #include <memory>
6 | #include "circuit.h"
7 | #include "logger.h"
8 | using namespace std;
9 | const int BUFFER_SIZE = 1000;
10 | char buffer[BUFFER_SIZE];
11 |
12 | std::vector<int> parse_qid(char buf[]) {
13 | std::vector<int> ret;
14 | int l = strlen(buf);
15 | for (int i = 0; i < l; i++) {
16 | if (buf[i] >= '0' && buf[i] <= '9') {
17 | int j = i, x = 0;
18 | while (buf[j] >= '0' && buf[j] <= '9') {
19 | x = x * 10 + (int)(buf[j] - '0');
20 | j++;
21 | }
22 | i = j - 1;
23 | ret.push_back(x);
24 | }
25 | }
26 | return ret;
27 | }
28 |
29 | std::pair<std::string, std::vector<qreal>> parse_gate(char buf[]) {
30 | qreal pi = acos(-1);
31 | int l = strlen(buf);
32 | std::string name;
33 | int i = 0;
34 | while (i < l) {
35 | if (buf[i] != '(')
36 | name += buf[i];
37 | else
38 | break;
39 | i++;
40 | }
41 | std::vector<qreal> params;
42 | while (i < l) {
43 | i++;
44 | std::string st;
45 | while (buf[i] != ',' && buf[i] != ')') {
46 | st += buf[i];
47 | i++;
48 | }
49 | qreal param = 1;
50 | if (st[0] == 'p' && st[1] == 'i' && st[2] == '*') {
51 | param = pi;
52 | st = st.erase(0, 3);
53 | } else if (st[0] == 'p' && st[1] == 'i' && st[2] == '/') {
54 | param = -pi;
55 | st = st.erase(0, 3);
56 | }
57 | if (param > 0)
58 | param *= std::stod(st);
59 | else
60 | param = pi / std::stod(st);
61 | params.push_back(param);
62 | if (buf[i] == ')')
63 | break;
64 | }
65 | return std::make_pair(name, params);
66 | }
67 |
68 | std::unique_ptr<Circuit> parse_circuit(const std::string &filename) {
69 | FILE* f = nullptr;
70 | if ((f = fopen(filename.c_str(), "r")) == NULL) {
71 | printf("fail to open %s\n", filename.c_str());
72 | exit(1);
73 | }
74 | int n = -1;
75 | std::unique_ptr<Circuit> c = nullptr;
76 | while (fscanf(f, "%s", buffer) != EOF) {
77 | if (strcmp(buffer, "//") == 0 || strcmp(buffer, "OPENQASM") == 0 || strcmp(buffer, "include") == 0) {
78 | } else if (strcmp(buffer, "qreg") == 0) {
79 | fscanf(f, "%*c%*c%*c%d", &n);
80 | c = std::make_unique<Circuit>(n);
81 | } else if (strcmp(buffer, "cx") == 0) {
82 | fscanf(f, "%s", buffer);
83 | auto qid = parse_qid(buffer);
84 | assert(qid.size() == 2);
85 | c->addGate(Gate::CNOT(qid[0], qid[1]));
86 | // printf("cx %d %d\n", qid[0], qid[1]);
87 | } else if (strcmp(buffer, "ccx") == 0) {
88 | fscanf(f, "%s", buffer);
89 | auto qid = parse_qid(buffer);
90 | assert(qid.size() == 3);
91 | c->addGate(Gate::CCX(qid[0], qid[1], qid[2]));
92 | // printf("ccx %d %d %d\n", qid[0], qid[1], qid[2]);
93 | } else if (strcmp(buffer, "cy") == 0) {
94 | fscanf(f, "%s", buffer);
95 | auto qid = parse_qid(buffer);
96 | assert(qid.size() == 2);
97 | c->addGate(Gate::CY(qid[0], qid[1]));
98 | // printf("cy %d %d\n", qid[0], qid[1]);
99 | } else if (strcmp(buffer, "cz") == 0) {
100 | fscanf(f, "%s", buffer);
101 | auto qid = parse_qid(buffer);
102 | assert(qid.size() == 2);
103 | c->addGate(Gate::CZ(qid[0], qid[1]));
104 | // printf("cz %d %d\n", qid[0], qid[1]);
105 | } else if (strcmp(buffer, "h") == 0) {
106 | fscanf(f, "%s", buffer);
107 | auto qid = parse_qid(buffer);
108 | assert(qid.size() == 1);
109 | c->addGate(Gate::H(qid[0]));
110 | // printf("h %d\n", qid[0]);
111 | } else if (strcmp(buffer, "x") == 0) {
112 | fscanf(f, "%s", buffer);
113 | auto qid = parse_qid(buffer);
114 | assert(qid.size() == 1);
115 | c->addGate(Gate::X(qid[0]));
116 | // printf("x %d\n", qid[0]);
117 | } else if (strcmp(buffer, "y") == 0) {
118 | fscanf(f, "%s", buffer);
119 | auto qid = parse_qid(buffer);
120 | assert(qid.size() == 1);
121 | c->addGate(Gate::Y(qid[0]));
122 | // printf("y %d\n", qid[0]);
123 | } else if (strcmp(buffer, "z") == 0) {
124 | fscanf(f, "%s", buffer);
125 | auto qid = parse_qid(buffer);
126 | assert(qid.size() == 1);
127 | c->addGate(Gate::Z(qid[0]));
128 | // printf("z %d\n", qid[0]);
129 | } else if (strcmp(buffer, "s") == 0) {
130 | fscanf(f, "%s", buffer);
131 | auto qid = parse_qid(buffer);
132 | assert(qid.size() == 1);
133 | c->addGate(Gate::S(qid[0]));
134 | // printf("s %d\n", qid[0]);
135 | } else if (strcmp(buffer, "sdg") == 0) {
136 | fscanf(f, "%s", buffer);
137 | auto qid = parse_qid(buffer);
138 | assert(qid.size() == 1);
139 | c->addGate(Gate::SDG(qid[0]));
140 | // printf("s %d\n", qid[0]);
141 | } else if (strcmp(buffer, "t") == 0) {
142 | fscanf(f, "%s", buffer);
143 | auto qid = parse_qid(buffer);
144 | assert(qid.size() == 1);
145 | c->addGate(Gate::T(qid[0]));
146 | // printf("t %d\n", qid[0]);
147 | } else if (strcmp(buffer, "tdg") == 0) {
148 | fscanf(f, "%s", buffer);
149 | auto qid = parse_qid(buffer);
150 | assert(qid.size() == 1);
151 | c->addGate(Gate::TDG(qid[0]));
152 | // printf("t %d\n", qid[0]);
153 | } else {
154 | auto gate = parse_gate(buffer);
155 | if (gate.first == "crx") {
156 | assert(gate.second.size() == 1);
157 | fscanf(f, "%s", buffer);
158 | auto qid = parse_qid(buffer);
159 | assert(qid.size() == 2);
160 | c->addGate(Gate::CRX(qid[0], qid[1], gate.second[0]));
161 | // printf("crx %d %d %f\n", qid[0], qid[1], gate.second[0]);
162 | } else if (gate.first == "cry") {
163 | assert(gate.second.size() == 1);
164 | fscanf(f, "%s", buffer);
165 | auto qid = parse_qid(buffer);
166 | assert(qid.size() == 2);
167 | c->addGate(Gate::CRY(qid[0], qid[1], gate.second[0]));
168 | // printf("cry %d %d %f\n", qid[0], qid[1], gate.second[0]);
169 | } else if (gate.first == "crz") {
170 | assert(gate.second.size() == 1);
171 | fscanf(f, "%s", buffer);
172 | auto qid = parse_qid(buffer);
173 | assert(qid.size() == 2);
174 | c->addGate(Gate::CRZ(qid[0], qid[1], gate.second[0]));
175 | // printf("crz %d %d %f\n", qid[0], qid[1], gate.second[0]);
176 | } else if (gate.first == "cu1") {
177 | assert(gate.second.size() == 1);
178 | fscanf(f, "%s", buffer);
179 | auto qid = parse_qid(buffer);
180 | assert(qid.size() == 2);
181 | c->addGate(Gate::CU1(qid[0], qid[1], gate.second[0]));
182 | // printf("cu1 %d %d %f\n", qid[0], qid[1], gate.second[0]);
183 | } else if (gate.first == "u1") {
184 | assert(gate.second.size() == 1);
185 | fscanf(f, "%s", buffer);
186 | auto qid = parse_qid(buffer);
187 | assert(qid.size() == 1);
188 | c->addGate(Gate::U1(qid[0], gate.second[0]));
189 | // printf("u1 %d %f\n", qid[0], gate.second[0]);
190 | } else if (gate.first == "u3") {
191 | assert(gate.second.size() == 3);
192 | fscanf(f, "%s", buffer);
193 | auto qid = parse_qid(buffer);
194 | assert(qid.size() == 1);
195 | c->addGate(Gate::U3(qid[0], gate.second[0], gate.second[1], gate.second[2]));
196 | // printf("u3 %d %f %f %f\n", qid[0], gate.second[0], gate.second[1], gate.second[2]);
197 | } else if (gate.first == "rx") {
198 | assert(gate.second.size() == 1);
199 | fscanf(f, "%s", buffer);
200 | auto qid = parse_qid(buffer);
201 | assert(qid.size() == 1);
202 | c->addGate(Gate::RX(qid[0], gate.second[0]));
203 | // printf("rx %d %f\n", qid[0], gate.second[0]);
204 | } else if (gate.first == "ry") {
205 | assert(gate.second.size() == 1);
206 | fscanf(f, "%s", buffer);
207 | auto qid = parse_qid(buffer);
208 | assert(qid.size() == 1);
209 | c->addGate(Gate::RY(qid[0], gate.second[0]));
210 | // printf("ry %d %f\n", qid[0], gate.second[0]);
211 | } else if (gate.first == "rz") {
212 | assert(gate.second.size() == 1);
213 | fscanf(f, "%s", buffer);
214 | auto qid = parse_qid(buffer);
215 | assert(qid.size() == 1);
216 | c->addGate(Gate::RZ(qid[0], gate.second[0]));
217 | // printf("rz %d %f\n", qid[0], gate.second[0]);
218 | } else {
219 | printf("unrecognized token %s\n", buffer);
220 | exit(1);
221 | }
222 | }
223 | fgets(buffer, BUFFER_SIZE, f);
224 | }
225 | fclose(f);
226 | if (c == nullptr) {
227 | printf("fail to load circuit\n");
228 | exit(1);
229 | }
230 | return std::move(c);
231 | }
232 |
233 | int main(int argc, char* argv[]) {
234 | #if USE_MPI
235 | MyMPI::init();
236 | #endif
237 | MyGlobalVars::init();
238 | std::unique_ptr<Circuit> c;
239 | if (argc != 2) {
240 | printf("./parser qasmfile\n");
241 | exit(1);
242 | }
243 | c = parse_circuit(std::string(argv[1]));
244 | c->compile();
245 | c->run();
246 | c->printState();
247 | Logger::print();
248 | #if USE_MPI
249 | checkMPIErrors(MPI_Finalize());
250 | #endif
251 | return 0;
252 | }
--------------------------------------------------------------------------------
/micro-benchmark/bench-blas.cpp:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cstring>
3 | #include <cmath>
4 | #include <cassert>
5 | #include <memory>
6 | #include "circuit.h"
7 | #include "logger.h"
8 | using namespace std;
9 |
10 | int main(int argc, char* argv[]) {
11 | MyGlobalVars::init();
12 | int n = 28;
13 | printf("MATSIZE %d ", BLAS_MAT_LIMIT);
14 | for (int tt = 0; tt < 5; tt++) {
15 | Circuit c(n);
16 | for (int i = 0; i < 10 * BLAS_MAT_LIMIT; i++) {
17 | c.addGate(Gate::H(i % (BLAS_MAT_LIMIT * 2)));
18 | }
19 | c.compile();
20 | int time = c.run(false);
21 | printf("%d ", time);
22 | }
23 | printf("\n");
24 | return 0;
25 | }
--------------------------------------------------------------------------------
/micro-benchmark/local-ctr.cpp:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cstring>
3 | #include <cmath>
4 | #include <cassert>
5 | #include <memory>
6 | #include "circuit.h"
7 | #include "logger.h"
8 | using namespace std;
9 |
10 | int main(int argc, char* argv[]) {
11 | MyGlobalVars::init();
12 | int n = 28;
13 | int num_gates = 512;
14 | for (int g = int(GateType::CNOT); g <= int(GateType::CRZ); g++) {
15 | printf("%s\n", Gate::get_name(GateType(g)).c_str());
16 | for (int i = 0; i < LOCAL_QUBIT_SIZE; i++) {
17 | for (int j = 0; j < LOCAL_QUBIT_SIZE; j++) {
18 | if (i == j) { printf(" "); continue; }
19 | Circuit c(n);
20 | for (int k = 0; k < num_gates; k++) {
21 | c.addGate(Gate::control(i, j, GateType(g)));
22 | }
23 | c.compile();
24 | int time = c.run(false);
25 | printf("%d ", time);
26 | fflush(stdout);
27 | }
28 | printf("\n");
29 | }
30 | }
31 | return 0;
32 | }
--------------------------------------------------------------------------------
/micro-benchmark/local-single.cpp:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cstring>
3 | #include <cmath>
4 | #include <cassert>
5 | #include <memory>
6 | #include "circuit.h"
7 | #include "logger.h"
8 | using namespace std;
9 |
10 | int main(int argc, char* argv[]) {
11 | MyGlobalVars::init();
12 | int n = 28;
13 | int num_gates = 512;
14 | for (int i = int(GateType::U1); i < int(GateType::TOTAL); i++) {
15 | printf("%s: ", Gate::get_name(GateType(i)).c_str());
16 | for (int j = 0; j < LOCAL_QUBIT_SIZE; j++) {
17 | Circuit c(n);
18 | for (int k = 0; k < num_gates; k++) {
19 | c.addGate(Gate::random(j, j + 1, GateType(i)));
20 | }
21 | c.compile();
22 | int time = c.run(false);
23 | printf("%d ", time);
24 | fflush(stdout);
25 | }
26 | printf("\n");
27 | }
28 | return 0;
29 | }
--------------------------------------------------------------------------------
/micro-benchmark/two-group-h.cpp:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cstring>
3 | #include <cmath>
4 | #include <cassert>
5 | #include <memory>
6 | #include "circuit.h"
7 | #include "logger.h"
8 | using namespace std;
9 |
10 | int main(int argc, char* argv[]) {
11 | MyGlobalVars::init();
12 | for (int i = 6; i < 200; i += 6) {
13 | printf("%d:", i);
14 | for (int tt = 0; tt < 5; tt++) {
15 | Circuit c(28);
16 | for (int j = 0; j < i; j++)
17 | c.addGate(Gate::H(j % 6));
18 | c.compile();
19 | int time = c.run(false);
20 | printf("%d ", time);
21 | }
22 | printf("\n");
23 | }
24 | return 0;
25 | }
--------------------------------------------------------------------------------
/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | _check.sh
2 | _run.sh
3 |
--------------------------------------------------------------------------------
/scripts/check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | name=../build/logs/`date +%Y%m%d-%H%M%S`
3 | mkdir -p $name
4 |
5 | # command for no_mpi
6 | MPIRUN_CONFIG="" ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on -DUSE_MPI=off -DDISABLE_ASSERT=on -DMAT=7 2>&1 | tee $name/std.out
7 |
8 | # command for mpi
9 | MPIRUN_CONFIG="`which mpirun` -x GPUPerRank=2 -host nico3:2 ../scripts/env.sh ../scripts/gpu-bind.sh"
10 | MPIRUN_CONFIG=$MPIRUN_CONFIG ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on -DUSE_MPI=on -DDISABLE_ASSERT=on -DMAT=7 2>&1 | tee $name/std.out
11 |
--------------------------------------------------------------------------------
/scripts/check_wrapper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 | source init.sh ${@: 2}
4 | input_dir=../tests/input
5 | std_dir=../tests/output
6 |
7 | for test in ${tests[*]}; do
8 | $MPIRUN_CONFIG ./main $input_dir/$test.qasm > $1/$test.log
9 | grep "Logger" $1/$test.log
10 | done
11 |
12 | set +x
13 | set +e
14 |
15 | for test in ${tests[*]}; do
16 | line=`cat $std_dir/$test.log | wc -l`
17 | echo $test
18 | grep -Ev "Logger|CLUSTER" $1/$test.log > tmp.log
19 | diff -q -B $std_dir/$test.log tmp.log || true
20 | done
21 |
22 | grep -Er "Logger:.*Time" $1/*.log
23 |
--------------------------------------------------------------------------------
/scripts/coalescing.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_VISIBLE_DEVICES=0
3 |
4 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c0
5 | mkdir -p $name
6 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=0 2>&1 | tee $name/std.out
7 | name1=$name
8 |
9 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c1
10 | mkdir -p $name
11 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=1 2>&1 | tee $name/std.out
12 | name2=$name
13 |
14 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c2
15 | mkdir -p $name
16 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=2 2>&1 | tee $name/std.out
17 | name3=$name
18 |
19 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c3
20 | mkdir -p $name
21 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=3 2>&1 | tee $name/std.out
22 | name4=$name
23 |
24 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c4
25 | mkdir -p $name
26 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=4 2>&1 | tee $name/std.out
27 | name5=$name
28 |
29 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c5
30 | mkdir -p $name
31 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=5 2>&1 | tee $name/std.out
32 | name6=$name
33 |
34 | tail -n 9 $name1/std.out
35 | tail -n 9 $name2/std.out
36 | tail -n 9 $name3/std.out
37 | tail -n 9 $name4/std.out
38 | tail -n 9 $name5/std.out
39 | tail -n 9 $name6/std.out
40 |
--------------------------------------------------------------------------------
/scripts/compare.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import numpy as np
4 | cases = ['adder_26', 'basis_change_28', 'bv_28', 'hidden_shift_28', 'ising_25', 'qaoa_28', 'qft_28', 'quantum_volume_28', 'supremacy_28']
5 | std_dir = sys.argv[1]
6 | my_dir = sys.argv[2]
7 |
8 | for case in cases:
9 | std = []
10 | with open(os.path.join(std_dir, case + '.log')) as f:
11 | for s in f.readlines():
12 | a, b = s.strip().split()[2:4]
13 | std.append([float(a), float(b)])
14 | std = np.array(std)
15 | std[np.abs(std) < 1e-10] = 0
16 |
17 | my = []
18 | with open(os.path.join(my_dir, case + '.log')) as f:
19 | for s in f.readlines():
20 | if s.startswith('Logger'):
21 | continue
22 | a, b = s.strip().split()[2:4]
23 | my.append([float(a), float(b)])
24 | my = np.array(my)
25 | my[np.abs(my) < 1e-10] = 0
26 | if (std.shape != my.shape):
27 | print("[{}]".format(case), "shape not match")
28 | continue
29 | err = np.abs(std-my)
30 | rela = np.abs(std - my) / (np.maximum(np.abs(std), np.abs(my)) + 1e-10)
31 | print("[{}]".format(case),
32 | "err:", np.max(err), np.argmax(err),
33 | "rela:", np.max(rela), np.argmax(rela))
--------------------------------------------------------------------------------
/scripts/env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | case $(hostname -s) in
3 | nico*)
4 | echo "[CLUSTER] nico"
5 | source /opt/spack/share/spack/setup-env.sh
6 | spack load cuda@10.2.89 /v5oqq5n
7 | spack load openmpi@4.0.5 /h5eun6a
8 | export NCCL_ROOT=/home/heheda/tools/nccl/build
9 | export CPATH=$NCCL_ROOT/include:$CPATH
10 | export LIBRARY_PATH=$NCCL_ROOT/lib:$LIBRARY_PATH
11 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib:$LD_LIBRARY_PATH
12 | ;;
13 | gorgon*)
14 | echo "[CLUSTER] gorgon"
15 | source /usr/local/Modules/init/bash
16 | module load cuda-10.2/cuda
17 | module load cmake-3.12.3
18 | module load openmpi-3.0.0
19 | ;;
20 | i*)
21 | echo "[CLUSTER] scc"
22 | source /opt/spack/share/spack/setup-env.sh
23 | spack load cuda@10.2.89 /tlfcinz
24 | spack load openmpi@3.1.6 /5aaect6
25 | ;;
26 | hanzo)
27 | echo "[CLUSTER] hanzo"
28 | source /opt/spack/share/spack/setup-env.sh
29 | export PATH=$HOME/package/cmake-3.19.2-Linux-x86_64/bin:/usr/mpi/gcc/openmpi-4.1.0rc5/bin:$PATH
30 | # use system mpi
31 | export CPATH=/usr/mpi/gcc/openmpi-4.1.0rc5/include:${CPATH-}
32 | spack load gcc@8.3.0 /liymwyb
33 | spack load cuda@10.2.89 /tlfcinz
34 | ;;
35 | nova)
36 | echo "[CLUSTER] nova"
37 | source /opt/spack/share/spack/setup-env.sh
38 | spack load cuda@11 /njgeoec
39 | spack load openmpi /dfes7hw
40 | esac
41 |
42 | $@
--------------------------------------------------------------------------------
/scripts/gen_stdout.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source init.sh -DBACKEND=1 -DSHOW_SUMMARY=off
3 | for test in ${tests[*]}; do
4 | echo $test
5 | ./main ../tests/input/$test.qasm > ../tests/output/$test.log
6 | done
7 |
--------------------------------------------------------------------------------
/scripts/gpu-bind.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | rank=$OMPI_COMM_WORLD_LOCAL_RANK
3 | GPU_start=$(( $rank * $GPUPerRank ))
4 | GPU_end=$(( ($rank + 1) * $GPUPerRank - 1 ))
5 | GPU=`echo $(for i in $(seq $GPU_start $GPU_end); do printf "$i,"; done)`
6 | CUDA_VISIBLE_DEVICES=$GPU $@
--------------------------------------------------------------------------------
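For example, with GPUPerRank=2 the process with OpenMPI local rank 1 gets GPU_start=2 and GPU_end=3, so the wrapped command runs with CUDA_VISIBLE_DEVICES=2,3, (the trailing comma comes from the printf loop). The GPUPerRank variable is exported on the mpirun command lines in check.sh, run.sh and run-multi-node.sh.
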
/scripts/init.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -u
3 | set -e
4 |
5 | source env.sh ""
6 |
7 | mkdir -p $HYQUAS_ROOT/build
8 | cd $HYQUAS_ROOT/build
9 | rm CMakeCache.txt || true
10 | cmake $* ..
11 | make clean
12 | make -j
13 |
14 | if [ -z "${tests-}" ]
15 | then
16 | export tests_25="basis_change_25 bv_25 hidden_shift_25 qaoa_25 qft_25 quantum_volume_25 supremacy_25"
17 | export tests_28="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28"
18 | export tests_30="basis_change_30 bv_30 hidden_shift_30 qaoa_30 qft_30 quantum_volume_30 supremacy_30"
19 | export tests_scale="basis_change_24 basis_change_25 basis_change_26 basis_change_27 basis_change_28"
20 |
21 | export tests=($tests_28)
22 | fi
23 |
--------------------------------------------------------------------------------
/scripts/run-multi-GPU.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=on -DMICRO_BENCH=on -DUSE_DOUBLE=on -DDISABLE_ASSERT=off -DENABLE_OVERLAP=on -DMEASURE_STAGE=off -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMAT=7
3 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/supremacy_28.qasm
4 |
--------------------------------------------------------------------------------
/scripts/run-multi-node.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=on -DMICRO_BENCH=on -DUSE_DOUBLE=on -DDISABLE_ASSERT=off -DENABLE_OVERLAP=on -DMEASURE_STAGE=off -DEVALUATOR_PREPROCESS=on -DUSE_MPI=on -DMAT=7
3 | `which mpirun` -host nico3:2 -x GPUPerRank=2 ../scripts/env.sh ../scripts/gpu-bind.sh ./main ../tests/input/qft_28.qasm
4 |
--------------------------------------------------------------------------------
/scripts/run-single.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=on -DMICRO_BENCH=on -DUSE_DOUBLE=on -DDISABLE_ASSERT=off -DENABLE_OVERLAP=on -DMEASURE_STAGE=off -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMAT=7
3 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/supremacy_28.qasm
4 |
--------------------------------------------------------------------------------
/scripts/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=on -DMICRO_BENCH=on -DUSE_DOUBLE=on -DDISABLE_ASSERT=off -DENABLE_OVERLAP=on -DMEASURE_STAGE=off -DEVALUATOR_PREPROCESS=on -DUSE_MPI=on -DMAT=7
3 | `which mpirun` -host nico1:2,nico2:2 -x GPUPerRank=4 ../scripts/env.sh ../scripts/gpu-bind.sh ./main ../tests/input/qft_28.qasm
4 | # CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/qft_28.qasm
--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | project(QCSimulator)
2 | aux_source_directory(. SRC_CXX)
3 |
4 | cuda_add_library(QCSimulator ${SRC_CXX})
5 |
--------------------------------------------------------------------------------
/src/circuit.cpp:
--------------------------------------------------------------------------------
1 | #include "circuit.h"
2 |
3 | #include <cstdio>
4 | #include <cstring>
5 | #include <algorithm>
6 | #include <chrono>
7 | #include <cassert>
8 | #include <cuda_profiler_api.h>
9 | #include "utils.h"
10 | #include "kernel.h"
11 | #include "compiler.h"
12 | #include "logger.h"
13 | #include "executor.h"
14 | using namespace std;
15 |
16 | int Circuit::run(bool copy_back, bool destroy) {
17 | kernelInit(deviceStateVec, numQubits);
18 | for (int i = 0; i < MyGlobalVars::localGPUs; i++) {
19 | checkCudaErrors(cudaSetDevice(i));
20 | checkCudaErrors(cudaProfilerStart());
21 | }
22 | auto start = chrono::system_clock::now();
23 | #if BACKEND == 0
24 | kernelExecSimple(deviceStateVec[0], numQubits, gates);
25 | #elif BACKEND == 1 || BACKEND == 3 || BACKEND == 4 || BACKEND == 5
26 | Executor(deviceStateVec, numQubits, schedule).run();
27 | #elif BACKEND == 2
28 | gates.clear();
29 | for (size_t lgID = 0; lgID < schedule.localGroups.size(); lgID++) {
30 | auto& lg = schedule.localGroups[lgID];
31 | for (size_t ggID = 0; ggID < lg.overlapGroups.size(); ggID++) {
32 | auto& gg = lg.overlapGroups[ggID];
33 | for (auto& g: gg.gates)
34 | gates.push_back(g);
35 | }
36 | // if (lgID == 2) break;
37 | for (size_t ggID = 0; ggID < lg.fullGroups.size(); ggID++) {
38 | auto& gg = lg.fullGroups[ggID];
39 | for (auto& g: gg.gates)
40 | gates.push_back(g);
41 | }
42 | }
43 | schedule.finalState = State(numQubits);
44 | kernelExecSimple(deviceStateVec[0], numQubits, gates);
45 | #endif
46 | auto end = chrono::system_clock::now();
47 | for (int i = 0; i < MyGlobalVars::localGPUs; i++) {
48 | checkCudaErrors(cudaSetDevice(i));
49 | checkCudaErrors(cudaProfilerStop());
50 | }
51 | auto duration = chrono::duration_cast<chrono::microseconds>(end - start);
52 | Logger::add("Time Cost: %d us", int(duration.count()));
53 |
54 | if (copy_back) {
55 | result.resize(1ll << numQubits); // very slow ...
56 | #if BACKEND == 0 || BACKEND == 2
57 | kernelDeviceToHost((qComplex*)result.data(), deviceStateVec[0], numQubits);
58 | #else
59 | qindex elements = 1ll << (numQubits - MyGlobalVars::bit);
60 | for (int g = 0; g < MyGlobalVars::localGPUs; g++) {
61 | kernelDeviceToHost((qComplex*)result.data() + elements * g, deviceStateVec[g], numQubits - MyGlobalVars::bit);
62 | }
63 | #endif
64 | }
65 | if (destroy) {
66 | for (int g = 0; g < MyGlobalVars::localGPUs; g++) {
67 | kernelDestroy(deviceStateVec[g]);
68 | }
69 | }
70 | return duration.count();
71 | }
72 |
73 | void Circuit::dumpGates() {
74 | int totalGates = gates.size();
75 | printf("total Gates: %d\n", totalGates);
76 | int L = 3;
77 | for (const Gate& gate: gates) {
78 | for (int i = 0; i < numQubits; i++) {
79 | if (i == gate.controlQubit) {
80 | printf(".");
81 | for (int j = 1; j < L; j++) printf(" ");
82 | } else if (i == gate.targetQubit) {
83 | printf("%s", gate.name.c_str());
84 | for (int j = gate.name.length(); j < L; j++)
85 | printf(" ");
86 | } else {
87 | printf("|");
88 | for (int j = 1; j < L; j++) printf(" ");
89 | }
90 | }
91 | printf("\n");
92 | }
93 | }
94 |
95 | qindex Circuit::toPhysicalID(qindex idx) {
96 | qindex id = 0;
97 | auto& pos = schedule.finalState.pos;
98 | for (int i = 0; i < numQubits; i++) {
99 | if (idx >> i & 1)
100 | id |= qindex(1) << pos[i];
101 | }
102 | return id;
103 | }
104 |
105 | qindex Circuit::toLogicID(qindex idx) {
106 | qindex id = 0;
107 | auto& pos = schedule.finalState.pos;
108 | for (int i = 0; i < numQubits; i++) {
109 | if (idx >> pos[i] & 1)
110 | id |= qindex(1) << i;
111 | }
112 | return id;
113 | }
114 |
115 | ResultItem Circuit::ampAt(qindex idx) {
116 | qindex id = toPhysicalID(idx);
117 | return ResultItem(idx, make_qComplex(result[id].x, result[id].y));
118 | }
119 |
120 | qComplex Circuit::ampAtGPU(qindex idx) {
121 | qindex id = toPhysicalID(idx);
122 | qComplex ret;
123 | #if USE_MPI
124 | qindex localAmps = (1ll << numQubits) / MyMPI::commSize;
125 | qindex rankID = id / localAmps;
126 |
127 | if (!USE_MPI || MyMPI::rank == rankID) {
128 | int localID = id % localAmps;
129 | #else
130 | int localID = id;
131 | #endif
132 | qindex localGPUAmp = (1ll << numQubits) / MyGlobalVars::numGPUs;
133 | int gpuID = localID / localGPUAmp;
134 | qindex localGPUID = localID % localGPUAmp;
135 | checkCudaErrors(cudaSetDevice(gpuID));
136 | ret = kernelGetAmp(deviceStateVec[gpuID], localGPUID);
137 | #if USE_MPI
138 | }
139 | MPI_Bcast(&ret, 1, MPI_Complex, rankID, MPI_COMM_WORLD);
140 | #endif
141 | return ret;
142 | }
143 |
144 | bool Circuit::localAmpAt(qindex idx, ResultItem& item) {
145 | qindex localAmps = (1ll << numQubits) / MyMPI::commSize;
146 | qindex id = toPhysicalID(idx);
147 | if (id / localAmps == MyMPI::rank) {
148 | // printf("%d belongs to rank %d\n", idx, MyMPI::rank);
149 | qindex localID = id % localAmps;
150 | item = ResultItem(idx, make_qComplex(result[localID].x, result[localID].y));
151 | return true;
152 | }
153 | return false;
154 | }
155 |
156 | void Circuit::masterCompile() {
157 | Logger::add("Total Gates %d", int(gates.size()));
158 | #if BACKEND == 1 || BACKEND == 2 || BACKEND == 3 || BACKEND == 4 || BACKEND == 5
159 | Compiler compiler(numQubits, gates);
160 | schedule = compiler.run();
161 | int totalGroups = 0;
162 | for (auto& lg: schedule.localGroups) totalGroups += lg.fullGroups.size();
163 | int fullGates = 0, overlapGates = 0;
164 | for (auto& lg: schedule.localGroups) {
165 | for (auto& gg: lg.fullGroups) fullGates += gg.gates.size();
166 | for (auto& gg: lg.overlapGroups) overlapGates += gg.gates.size();
167 | }
168 | Logger::add("Total Groups: %d %d %d %d", int(schedule.localGroups.size()), totalGroups, fullGates, overlapGates);
169 | #ifdef SHOW_SCHEDULE
170 | schedule.dump(numQubits);
171 | #endif
172 | #else
173 | schedule.finalState = State(numQubits);
174 | #endif
175 | }
176 |
177 | void Circuit::compile() {
178 | auto start = chrono::system_clock::now();
179 | #if USE_MPI
180 | if (MyMPI::rank == 0) {
181 | masterCompile();
182 | auto s = schedule.serialize();
183 | int bufferSize = (int) s.size();
184 | checkMPIErrors(MPI_Bcast(&bufferSize, 1, MPI_INT, 0, MPI_COMM_WORLD));
185 | checkMPIErrors(MPI_Bcast(s.data(), bufferSize, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD));
186 | int cur = 0;
187 | // schedule = Schedule::deserialize(s.data(), cur);
188 | } else {
189 | int bufferSize;
190 | checkMPIErrors(MPI_Bcast(&bufferSize, 1, MPI_INT, 0, MPI_COMM_WORLD));
191 | unsigned char* buffer = new unsigned char [bufferSize];
192 | checkMPIErrors(MPI_Bcast(buffer, bufferSize, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD));
193 | int cur = 0;
194 | schedule = Schedule::deserialize(buffer, cur);
195 | delete[] buffer;
196 | fflush(stdout);
197 | }
198 | #else
199 | masterCompile();
200 | #endif
201 | auto mid = chrono::system_clock::now();
202 | schedule.initCuttPlans(numQubits - MyGlobalVars::bit);
203 | #ifndef OVERLAP_MAT
204 | schedule.initMatrix(numQubits);
205 | #endif
206 | auto end = chrono::system_clock::now();
207 | auto duration1 = chrono::duration_cast<chrono::microseconds>(mid - start);
208 | auto duration2 = chrono::duration_cast<chrono::microseconds>(end - mid);
209 | Logger::add("Compile Time: %d us + %d us = %d us", int(duration1.count()), int(duration2.count()), int(duration1.count()) + int(duration2.count()));
210 | }
211 |
212 | #if USE_MPI
213 | void Circuit::gatherAndPrint(const std::vector<ResultItem>& results) {
214 | if (MyMPI::rank == 0) {
215 | int size = results.size();
216 | int sizes[MyMPI::commSize];
217 | MPI_Gather(&size, 1, MPI_INT, sizes, 1, MPI_INT, 0, MPI_COMM_WORLD);
218 | int disp[MyMPI::commSize + 1];
219 | disp[0] = 0;
220 | for (int i = 0; i < MyMPI::commSize; i++)
221 | disp[i + 1] = disp[i] + sizes[i];
222 | int totalItem = disp[MyMPI::commSize];
223 | ResultItem* collected = new ResultItem[totalItem];
224 | for (int i = 0; i < MyMPI::commSize; i++)
225 | sizes[i] *= sizeof(ResultItem);
226 | for (int i = 0; i < MyMPI::commSize; i++)
227 | disp[i] *= sizeof(ResultItem);
228 | MPI_Gatherv(
229 | results.data(), results.size() * sizeof(ResultItem), MPI_UNSIGNED_CHAR,
230 | collected, sizes, disp,
231 | MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD
232 | );
233 | sort(collected, collected + totalItem);
234 | for (int i = 0; i < totalItem; i++)
235 | collected[i].print();
236 | delete[] collected;
237 | } else {
238 | int size = results.size();
239 | MPI_Gather(&size, 1, MPI_INT, nullptr, 1, MPI_INT, 0, MPI_COMM_WORLD);
240 | MPI_Gatherv(
241 | results.data(), results.size() * sizeof(ResultItem), MPI_UNSIGNED_CHAR,
242 | nullptr, nullptr, nullptr,
243 | MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD
244 | );
245 | }
246 | }
247 | #endif
248 |
249 |
250 | void Circuit::printState() {
251 | #if USE_MPI
252 | std::vector<ResultItem> results;
253 | ResultItem item;
254 | for (int i = 0; i < 128; i++) {
255 | if (localAmpAt(i, item)) {
256 | results.push_back(item);
257 | }
258 | }
259 | gatherAndPrint(results);
260 | #ifdef SHOW_SCHEDULE
261 | results.clear();
262 | for (int i = 0; i < numQubits; i++) {
263 | if (localAmpAt(1ll << i, item)) {
264 | results.push_back(item);
265 | }
266 | }
267 | if (localAmpAt((1ll << numQubits) - 1, item)) {
268 | results.push_back(item);
269 | }
270 | gatherAndPrint(results);
271 | #endif
272 | results.clear();
273 | int numLocalAmps = (1ll << numQubits) / MyMPI::commSize;
274 | for (qindex i = 0; i < numLocalAmps; i++) {
275 | if (result[i].x * result[i].x + result[i].y * result[i].y > 0.001) {
276 | qindex logicID = toLogicID(i + numLocalAmps * MyMPI::rank);
277 | if (logicID >= 128) {
278 | // printf("large amp %d belongs to %d\n", logicID, MyMPI::rank);
279 | results.push_back(ResultItem(logicID, result[i]));
280 | }
281 | }
282 | }
283 | gatherAndPrint(results);
284 | #else
285 | std::vector<ResultItem> results;
286 | for (int i = 0; i < 128; i++) {
287 | results.push_back(ampAt(i));
288 | }
289 | #ifdef SHOW_SCHEDULE
290 | for (int i = 0; i < numQubits; i++) {
291 | results.push_back(ampAt(1ll << i));
292 | }
293 | results.push_back(ampAt((1ll << numQubits) - 1));
294 | #endif
295 | for (auto& item: results)
296 | item.print();
297 | results.clear();
298 | for (qindex i = 0; i < (1ll << numQubits); i++) {
299 | if (result[i].x * result[i].x + result[i].y * result[i].y > 0.001) {
300 | qindex logicID = toLogicID(i);
301 | if (logicID >= 128) {
302 | results.push_back(ResultItem(toLogicID(i), result[i]));
303 | }
304 | }
305 | }
306 | sort(results.begin(), results.end());
307 | for (auto& item: results)
308 | item.print();
309 | #endif
310 | }
--------------------------------------------------------------------------------
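To make the bit permutation in toPhysicalID / toLogicID concrete, here is a minimal standalone C++ sketch; the pos array below is made up for illustration, while in the simulator it comes from schedule.finalState.pos. toLogicID applies the inverse permutation.

#include <cstdint>
#include <cstdio>

// Illustration only: permute index bits the way Circuit::toPhysicalID does.
// pos[i] is the physical position of logical qubit i.
int64_t toPhysical(int64_t idx, const int* pos, int numQubits) {
    int64_t id = 0;
    for (int i = 0; i < numQubits; i++)
        if (idx >> i & 1)
            id |= int64_t(1) << pos[i];
    return id;
}

int main() {
    const int pos[3] = {2, 0, 1};  // hypothetical layout: q0->bit2, q1->bit0, q2->bit1
    // logical index 3 = 0b011 has q0 and q1 set, so the physical index sets bits 2 and 0: 0b101 = 5
    printf("%lld\n", (long long) toPhysical(3, pos, 3));  // prints 5
    return 0;
}
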
/src/circuit.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <cstdio>
4 | #include <vector>
5 | #include "utils.h"
6 | #include "gate.h"
7 | #include "schedule.h"
8 |
9 | struct ResultItem {
10 | ResultItem() = default;
11 | ResultItem(const qindex& idx, const qComplex& amp): idx(idx), amp(amp) {}
12 | qindex idx;
13 | qComplex amp;
14 | void print() {
15 | printf("%lld %.12f: %.12f %.12f\n", idx, amp.x * amp.x + amp.y * amp.y, zero_wrapper(amp.x), zero_wrapper(amp.y));
16 | }
17 | bool operator < (const ResultItem& b) { return idx < b.idx; }
18 | };
19 |
20 | class Circuit {
21 | public:
22 | Circuit(int numQubits): numQubits(numQubits) {}
23 | void compile();
24 | int run(bool copy_back = true, bool destroy = true);
25 | void addGate(const Gate& gate) {
26 | gates.push_back(gate);
27 | }
28 | void dumpGates();
29 | void printState();
30 | ResultItem ampAt(qindex idx);
31 | qComplex ampAtGPU(qindex idx);
32 | bool localAmpAt(qindex idx, ResultItem& item);
33 | const int numQubits;
34 |
35 | private:
36 | qindex toPhysicalID(qindex idx);
37 | qindex toLogicID(qindex idx);
38 | void masterCompile();
39 | #if USE_MPI
40 | void gatherAndPrint(const std::vector<ResultItem>& results);
41 | #endif
42 | std::vector<Gate> gates;
43 | std::vector<qComplex*> deviceStateVec;
44 | std::vector<std::vector<qComplex*>> deviceMats;
45 | Schedule schedule;
46 | std::vector<qComplex> result;
47 | };
--------------------------------------------------------------------------------
/src/compiler.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <vector>
3 | #include <set>
4 | #include <utility>
5 | #include "schedule.h"
6 | #include "utils.h"
7 | #include "gate.h"
8 |
9 | class Compiler {
10 | public:
11 | Compiler(int numQubits, std::vector<Gate> inputGates);
12 | Schedule run();
13 | private:
14 | void fillLocals(LocalGroup& lg);
15 | std::vector<std::pair<std::vector<Gate>, qindex>> moveToNext(LocalGroup& lg);
16 | int numQubits;
17 | int localSize;
18 | int shareSize;
19 | bool enableGlobal;
20 | std::vector<Gate> gates;
21 | };
22 |
23 | template <int MAX_GATES>
24 | class OneLayerCompiler {
25 | public:
26 | OneLayerCompiler(int numQubits, const std::vector<Gate>& inputGates);
27 | protected:
28 | int numQubits;
29 | std::vector<Gate> remainGates;
30 | std::vector<int> getGroupOpt(bool full[], qindex related[], bool enableGlobal, int localSize, qindex localQubits);
31 | void removeGatesOpt(const std::vector<int>& remove);
32 | std::set<int> remain;
33 | };
34 |
35 | class SimpleCompiler: public OneLayerCompiler<2048> {
36 | public:
37 | SimpleCompiler(int numQubits, int localSize, qindex localQubits, const std::vector<Gate>& inputGates, bool enableGlobal, qindex whiteList = 0, qindex required = 0);
38 | LocalGroup run();
39 | private:
40 | int localSize;
41 | qindex localQubits;
42 | bool enableGlobal;
43 | qindex whiteList;
44 | qindex required;
45 | };
46 |
47 | class AdvanceCompiler: public OneLayerCompiler<512> {
48 | public:
49 | AdvanceCompiler(int numQubits, qindex localQubits, qindex blasForbid, std::vector<Gate> inputGates);
50 | LocalGroup run(State &state, bool usePerGate, bool useBLAS, int preGateSize, int blasSize, int cuttSize);
51 | private:
52 | qindex localQubits;
53 | qindex blasForbid;
54 | };
55 |
56 | class ChunkCompiler: public OneLayerCompiler<512> {
57 | public:
58 | ChunkCompiler(int numQubits, int localSize, int chunkSize, const std::vector<Gate> &inputGates);
59 | LocalGroup run();
60 | private:
61 | int localSize, chunkSize;
62 | };
--------------------------------------------------------------------------------
/src/evaluator.cpp:
--------------------------------------------------------------------------------
1 | #include "evaluator.h"
2 |
3 | Evaluator* Evaluator::instance_ptr = nullptr;
4 |
5 | Evaluator::Evaluator() {
6 | memset(num_qbits_loaded_param, 0, sizeof(num_qbits_loaded_param));
7 | #ifndef USE_EVALUATOR_PREPROCESS
8 | num_qbits_loaded_param[28] = true;
9 | memcpy(pergate_single_perf[28][int(GateType::U1)], V100_U1, sizeof(double) * LOCAL_QUBIT_SIZE);
10 | memcpy(pergate_single_perf[28][int(GateType::U2)], V100_U2, sizeof(double) * LOCAL_QUBIT_SIZE);
11 | memcpy(pergate_single_perf[28][int(GateType::U3)], V100_U3, sizeof(double) * LOCAL_QUBIT_SIZE);
12 | memcpy(pergate_single_perf[28][int(GateType::H )], V100_H , sizeof(double) * LOCAL_QUBIT_SIZE);
13 | memcpy(pergate_single_perf[28][int(GateType::X )], V100_X , sizeof(double) * LOCAL_QUBIT_SIZE);
14 | memcpy(pergate_single_perf[28][int(GateType::Y )], V100_Y , sizeof(double) * LOCAL_QUBIT_SIZE);
15 | memcpy(pergate_single_perf[28][int(GateType::Z )], V100_Z , sizeof(double) * LOCAL_QUBIT_SIZE);
16 | memcpy(pergate_single_perf[28][int(GateType::S )], V100_S , sizeof(double) * LOCAL_QUBIT_SIZE);
17 | memcpy(pergate_single_perf[28][int(GateType::SDG )], V100_SDG , sizeof(double) * LOCAL_QUBIT_SIZE);
18 | memcpy(pergate_single_perf[28][int(GateType::T )], V100_T , sizeof(double) * LOCAL_QUBIT_SIZE);
19 | memcpy(pergate_single_perf[28][int(GateType::TDG )], V100_TDG , sizeof(double) * LOCAL_QUBIT_SIZE);
20 | memcpy(pergate_single_perf[28][int(GateType::RX)], V100_RX, sizeof(double) * LOCAL_QUBIT_SIZE);
21 | memcpy(pergate_single_perf[28][int(GateType::RY)], V100_RY, sizeof(double) * LOCAL_QUBIT_SIZE);
22 | memcpy(pergate_single_perf[28][int(GateType::RZ)], V100_RZ, sizeof(double) * LOCAL_QUBIT_SIZE);
23 |
24 | memcpy(pergate_ctr_perf[28][int(GateType::CNOT)], V100_CN , sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
25 | memcpy(pergate_ctr_perf[28][int(GateType::CY )], V100_CY , sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
26 | memcpy(pergate_ctr_perf[28][int(GateType::CZ )], V100_CZ , sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
27 | memcpy(pergate_ctr_perf[28][int(GateType::CRX )], V100_CRX, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
28 | memcpy(pergate_ctr_perf[28][int(GateType::CRY )], V100_CRY, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
29 | memcpy(pergate_ctr_perf[28][int(GateType::CU1 )], V100_CU1, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
30 | memcpy(pergate_ctr_perf[28][int(GateType::CRZ )], V100_CRZ, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
31 |
32 | BLAS_perf[28][6] = 23.068396;
33 | cutt_cost[28] = 11.367814;
34 | #endif
35 | }
36 |
37 | void Evaluator::loadPergateSingle(int numQubits, FILE* qbit_param, GateType gate_type) {
38 | if(param_type == CALC_ALL_PARAM) {
39 | for(int i = 0; i < LOCAL_QUBIT_SIZE; i++) {
40 | fscanf(qbit_param, "%lf", &pergate_single_perf[numQubits][int(gate_type)][i]);
41 | }
42 | }
43 | else {
44 | fscanf(qbit_param, "%lf", &pergate_single_perf[numQubits][int(gate_type)][1]);
45 | }
46 | }
47 |
48 | void Evaluator::loadPergateCtr(int numQubits, FILE* qbit_param, GateType gate_type) {
49 | if(param_type == CALC_ALL_PARAM) {
50 | for(int i = 0; i < LOCAL_QUBIT_SIZE; i++)
51 | for(int j = 0; j < LOCAL_QUBIT_SIZE; j++) {
52 | fscanf(qbit_param, "%lf", &pergate_ctr_perf[numQubits][int(gate_type)][i][j]);
53 | }
54 | }
55 | else {
56 | fscanf(qbit_param, "%lf", &pergate_ctr_perf[numQubits][int(gate_type)][0][2]);
57 | }
58 | }
59 |
60 | void Evaluator::loadParam(int numQubits) {
61 | if(num_qbits_loaded_param[numQubits])
62 | return;
63 | #ifdef USE_EVALUATOR_PREPROCESS
64 | FILE* qbit_param;
65 | std::string param_file_name = std::string("../evaluator-preprocess/parameter-files/")
66 | + std::to_string(numQubits) + std::string("qubits.out");
67 | if((qbit_param = fopen(param_file_name.c_str(), "r"))) {
68 | fscanf(qbit_param, "%d", &param_type);
69 |
70 | loadPergateSingle(numQubits, qbit_param, GateType::U1);
71 | loadPergateSingle(numQubits, qbit_param, GateType::U2);
72 | loadPergateSingle(numQubits, qbit_param, GateType::U3);
73 | loadPergateSingle(numQubits, qbit_param, GateType::H );
74 | loadPergateSingle(numQubits, qbit_param, GateType::X );
75 | loadPergateSingle(numQubits, qbit_param, GateType::Y );
76 | loadPergateSingle(numQubits, qbit_param, GateType::Z );
77 | loadPergateSingle(numQubits, qbit_param, GateType::S );
78 | loadPergateSingle(numQubits, qbit_param, GateType::SDG);
79 | loadPergateSingle(numQubits, qbit_param, GateType::T );
80 | loadPergateSingle(numQubits, qbit_param, GateType::TDG);
81 | loadPergateSingle(numQubits, qbit_param, GateType::RX);
82 | loadPergateSingle(numQubits, qbit_param, GateType::RY);
83 | loadPergateSingle(numQubits, qbit_param, GateType::RZ);
84 |
85 | loadPergateCtr(numQubits, qbit_param, GateType::CNOT);
86 | loadPergateCtr(numQubits, qbit_param, GateType::CY );
87 | loadPergateCtr(numQubits, qbit_param, GateType::CZ );
88 | loadPergateCtr(numQubits, qbit_param, GateType::CRX );
89 | loadPergateCtr(numQubits, qbit_param, GateType::CRY );
90 | loadPergateCtr(numQubits, qbit_param, GateType::CU1 );
91 | loadPergateCtr(numQubits, qbit_param, GateType::CRZ );
92 |
93 | for (int K = 1, i = 0; K < 1024; K <<= 1, i++) {
94 | fscanf(qbit_param, "%*d%lf", &BLAS_perf[numQubits][i]);
95 | }
96 | fscanf(qbit_param, "%lf", &cutt_cost[numQubits]);
97 | fclose(qbit_param);
98 | } else {
99 | printf("Parameter file not find for qubit number %d\n", numQubits);
100 | fflush(stdout);
101 | exit(1);
102 | }
103 | num_qbits_loaded_param[numQubits] = true;
104 | #else
105 | printf("Use option USE_EVALUATOR_PREPROCESS for non-default qubit number %d\n", numQubits);
106 | fflush(stdout);
107 | exit(1);
108 | #endif
109 | }
110 |
111 | double Evaluator::perfPerGate(int numQubits, const GateGroup* gg) {
112 | double tim_pred = 0;
113 | loadParam(numQubits);
114 | for(auto gate : (gg -> gates)) {
115 | switch(gate.type) {
116 | case GateType::CCX :
117 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break;
118 | case GateType::CNOT :
119 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break;
120 | case GateType::CY :
121 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CY)][0][2]; break;
122 | case GateType::CZ :
123 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CZ)][0][2]; break;
124 | case GateType::CRX :
125 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRX)][0][2]; break;
126 | case GateType::CRY :
127 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRY)][0][2]; break;
128 | case GateType::CU1 :
129 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CU1)][0][2]; break;
130 | case GateType::CRZ :
131 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRZ)][0][2]; break;
132 | case GateType::U1 :
133 | tim_pred += pergate_single_perf[numQubits][int(GateType::U1)][1]; break;
134 | case GateType::U2 :
135 | tim_pred += pergate_single_perf[numQubits][int(GateType::U2)][1]; break;
136 | case GateType::U3 :
137 | tim_pred += pergate_single_perf[numQubits][int(GateType::U3)][1]; break;
138 | case GateType::H :
139 | tim_pred += pergate_single_perf[numQubits][int(GateType::H)][1]; break;
140 | case GateType::X :
141 | tim_pred += pergate_single_perf[numQubits][int(GateType::X)][1]; break;
142 | case GateType::Y :
143 | tim_pred += pergate_single_perf[numQubits][int(GateType::Y)][1]; break;
144 | case GateType::Z :
145 | tim_pred += pergate_single_perf[numQubits][int(GateType::Z)][1]; break;
146 | case GateType::S :
147 | tim_pred += pergate_single_perf[numQubits][int(GateType::S)][1]; break;
148 | case GateType::SDG :
149 | tim_pred += pergate_single_perf[numQubits][int(GateType::SDG)][1]; break;
150 | case GateType::T :
151 | tim_pred += pergate_single_perf[numQubits][int(GateType::T)][1]; break;
152 | case GateType::TDG :
153 | tim_pred += pergate_single_perf[numQubits][int(GateType::TDG)][1]; break;
154 | case GateType::RX :
155 | tim_pred += pergate_single_perf[numQubits][int(GateType::RX)][1]; break;
156 | case GateType::RY :
157 | tim_pred += pergate_single_perf[numQubits][int(GateType::RY)][1]; break;
158 | case GateType::RZ :
159 | tim_pred += pergate_single_perf[numQubits][int(GateType::RZ)][1]; break;
160 | default:
161 | printf("meet wrong gate : %s\n", Gate::get_name(gate.type).c_str());
162 | UNREACHABLE()
163 | }
164 | }
165 | return tim_pred / 1000 / 512 + pergate_group_overhead * (1 << numQubits);
166 | }
167 |
168 | double Evaluator::perfPerGate(int numQubits, const std::vector<GateType>& types) {
169 | double tim_pred = 0;
170 | loadParam(numQubits);
171 | for(auto ty : types) {
172 | switch(ty) {
173 | case GateType::CCX :
174 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break;
175 | case GateType::CNOT :
176 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break;
177 | case GateType::CY :
178 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CY)][0][2]; break;
179 | case GateType::CZ :
180 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CZ)][0][2]; break;
181 | case GateType::CRX :
182 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRX)][0][2]; break;
183 | case GateType::CRY :
184 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRY)][0][2]; break;
185 | case GateType::CU1 :
186 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CU1)][0][2]; break;
187 | case GateType::CRZ :
188 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRZ)][0][2]; break;
189 | case GateType::U1 :
190 | tim_pred += pergate_single_perf[numQubits][int(GateType::U1)][1]; break;
191 | case GateType::U2 :
192 | tim_pred += pergate_single_perf[numQubits][int(GateType::U2)][1]; break;
193 | case GateType::U3 :
194 | tim_pred += pergate_single_perf[numQubits][int(GateType::U3)][1]; break;
195 | case GateType::H :
196 | tim_pred += pergate_single_perf[numQubits][int(GateType::H)][1]; break;
197 | case GateType::X :
198 | tim_pred += pergate_single_perf[numQubits][int(GateType::X)][1]; break;
199 | case GateType::Y :
200 | tim_pred += pergate_single_perf[numQubits][int(GateType::Y)][1]; break;
201 | case GateType::Z :
202 | tim_pred += pergate_single_perf[numQubits][int(GateType::Z)][1]; break;
203 | case GateType::S :
204 | tim_pred += pergate_single_perf[numQubits][int(GateType::S)][1]; break;
205 | case GateType::SDG :
206 | tim_pred += pergate_single_perf[numQubits][int(GateType::SDG)][1]; break;
207 | case GateType::T :
208 | tim_pred += pergate_single_perf[numQubits][int(GateType::T)][1]; break;
209 | case GateType::TDG :
210 | tim_pred += pergate_single_perf[numQubits][int(GateType::TDG)][1]; break;
211 | case GateType::RX :
212 | tim_pred += pergate_single_perf[numQubits][int(GateType::RX)][1]; break;
213 | case GateType::RY :
214 | tim_pred += pergate_single_perf[numQubits][int(GateType::RY)][1]; break;
215 | case GateType::RZ :
216 | tim_pred += pergate_single_perf[numQubits][int(GateType::RZ)][1]; break;
217 | default:
218 | printf("meet wrong gate : %s\n", Gate::get_name(ty).c_str());
219 | UNREACHABLE()
220 | }
221 | }
222 | return tim_pred / 1000 / 512 + pergate_group_overhead * (1 << numQubits);
223 | }
224 |
225 | double Evaluator::perfBLAS(int numQubits, int blasSize) {
226 | loadParam(numQubits);
227 | //double bias = (numQubits < 28) ? ((qindex)1 << (28 - numQubits)) : (1.0 / ((qindex)1 << (numQubits - 28)));
228 | return BLAS_perf[numQubits][blasSize] + cutt_cost[numQubits];
229 | }
230 |
231 | bool Evaluator::PerGateOrBLAS(const GateGroup* gg_pergate, const GateGroup* gg_blas, int numQubits, int blasSize) {
232 | double pergate = perfPerGate(numQubits, gg_pergate);
233 | double blas = perfBLAS(numQubits, blasSize);
234 | return pergate / (gg_pergate -> gates).size() < blas / (gg_blas -> gates).size();
235 | }
--------------------------------------------------------------------------------
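In short, PerGateOrBLAS picks the per-gate backend exactly when its predicted time per gate is lower:

    perfPerGate(n, gg_pergate) / |gg_pergate.gates|  <  perfBLAS(n, blasSize) / |gg_blas.gates|

where perfBLAS is the measured Zgemm time for the chosen matrix size plus the cuTT transpose cost, and perfPerGate sums, for each gate in the group, the microbenchmarked time of its gate type (microseconds for a 512-gate circuit), converts it to per-gate milliseconds by dividing by 512 and 1000, and adds a per-group overhead proportional to 2^n.
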
/src/evaluator.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "schedule.h"
3 | #include "utils.h"
4 | #include "gate.h"
5 |
6 | #define GATE_NUM 24
7 | #define MAX_QBITS 40
8 |
9 | #define CALC_ALL_PARAM 0
10 | #define CALC_PARTIAL_PARAM 1
11 |
12 | /*
13 | * Builds a performance model to choose between the BLAS and per-gate backends.
14 | * Implemented as a singleton class.
15 | **/
16 | class Evaluator {
17 | private:
18 | const double V100_U1[LOCAL_QUBIT_SIZE] = {235,225,225,225,225,225,224,225,225,225};
19 | const double V100_U2[LOCAL_QUBIT_SIZE] = {470,469,469,469,469,469,469,470,469,469};
20 | const double V100_U3[LOCAL_QUBIT_SIZE] = {469,469,469,469,469,469,469,469,469,469};
21 | const double V100_H[LOCAL_QUBIT_SIZE] = {352,352,352,352,352,352,352,352,352,352};
22 | const double V100_X[LOCAL_QUBIT_SIZE] = {350,350,350,350,350,350,350,350,350,350};
23 | const double V100_Y[LOCAL_QUBIT_SIZE] = {350,350,350,350,350,349,349,350,350,350};
24 | const double V100_Z[LOCAL_QUBIT_SIZE] = {194,194,194,194,194,194,194,194,194,194};
25 | const double V100_S[LOCAL_QUBIT_SIZE] = {209,209,209,209,209,209,209,209,209,209};
26 | const double V100_SDG[LOCAL_QUBIT_SIZE] = {209,209,209,209,209,209,209,209,209,209}; // TODO
27 | const double V100_T[LOCAL_QUBIT_SIZE] = {216,216,216,216,216,216,217,216,216,216};
28 | const double V100_TDG[LOCAL_QUBIT_SIZE] = {216,216,216,216,216,216,217,216,216,216}; // TODO
29 | const double V100_RX[LOCAL_QUBIT_SIZE] = {370,370,370,370,370,370,370,370,370,370};
30 | const double V100_RY[LOCAL_QUBIT_SIZE] = {367,367,367,367,367,367,367,367,367,367};
31 | const double V100_RZ[LOCAL_QUBIT_SIZE] = {369,369,369,369,369,369,369,369,369,369};
32 |
33 | const double V100_CN[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
34 | 0,213,195,345,193,193,193,193,193,193,
35 | 193,0,193,193,345,193,193,193,193,193,
36 | 193,193,0,193,193,345,193,193,193,193,
37 | 345,193,193,0,193,193,193,193,193,193,
38 | 193,345,193,193,0,193,193,193,193,193,
39 | 193,193,345,193,193,0,193,193,193,193,
40 | 193,193,193,193,193,193,0,193,193,193,
41 | 193,193,193,193,193,193,193,0,193,193,
42 | 193,193,193,193,193,193,193,193,0,193,
43 | 193,193,193,193,193,193,193,193,193,0,
44 | };
45 | const double V100_CY[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
46 | 0,193,193,346,193,193,193,193,193,193,
47 | 193,0,193,193,346,193,193,193,193,193,
48 | 193,193,0,193,193,345,193,193,193,193,
49 | 346,193,193,0,193,193,192,193,193,193,
50 | 193,345,193,193,0,193,193,193,193,193,
51 | 193,193,345,193,193,0,193,193,193,193,
52 | 193,193,193,193,193,193,0,193,192,193,
53 | 193,193,193,193,193,193,193,0,193,193,
54 | 193,193,193,193,193,192,193,193,0,193,
55 | 193,193,192,193,193,192,193,193,193,0,
56 | };
57 | const double V100_CZ[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
58 | 0,137,137,191,137,137,137,137,137,137,
59 | 137,0,137,137,190,137,137,137,137,137,
60 | 137,137,0,137,137,191,137,137,137,137,
61 | 190,137,137,0,137,137,137,137,137,137,
62 | 137,190,137,137,0,137,137,137,137,137,
63 | 137,137,191,137,137,0,137,137,137,137,
64 | 137,137,137,137,137,137,0,137,137,137,
65 | 137,137,137,137,137,137,137,0,137,137,
66 | 137,137,137,137,137,137,137,137,0,137,
67 | 137,137,137,137,137,137,137,137,137,0,
68 | };
69 | const double V100_CRX[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
70 | 0,224,224,358,224,224,223,224,224,224,
71 | 224,0,224,224,358,224,224,224,223,224,
72 | 224,224,0,224,224,358,223,224,224,223,
73 | 358,224,223,0,224,223,224,223,223,224,
74 | 223,358,224,224,0,224,224,223,223,224,
75 | 223,223,358,224,224,0,223,224,224,224,
76 | 224,223,223,223,224,224,0,224,224,224,
77 | 224,224,224,224,224,224,223,0,224,223,
78 | 224,224,224,224,224,224,224,223,0,224,
79 | 224,224,224,224,224,224,224,224,223,0,
80 | };
81 | const double V100_CRY[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
82 | 0,225,225,356,225,225,225,225,225,225,
83 | 225,0,225,225,356,225,224,225,225,225,
84 | 225,225,0,225,224,356,225,225,225,225,
85 | 356,225,225,0,225,225,225,225,224,225,
86 | 225,356,225,225,0,225,224,225,225,225,
87 | 225,225,356,225,225,0,225,225,225,225,
88 | 225,225,225,225,225,224,0,225,225,225,
89 | 225,225,225,225,224,225,225,0,225,225,
90 | 225,225,225,225,225,225,225,225,0,225,
91 | 225,225,225,225,225,225,225,225,225,0,
92 | };
93 | const double V100_CU1[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
94 | // FIXME
95 | 0,225,225,356,225,225,225,225,225,225,
96 | 225,0,225,225,356,225,224,225,225,225,
97 | 225,225,0,225,224,356,225,225,225,225,
98 | 356,225,225,0,225,225,225,225,224,225,
99 | 225,356,225,225,0,225,224,225,225,225,
100 | 225,225,356,225,225,0,225,225,225,225,
101 | 225,225,225,225,225,224,0,225,225,225,
102 | 225,225,225,225,224,225,225,0,225,225,
103 | 225,225,225,225,225,225,225,225,0,225,
104 | 225,225,225,225,225,225,225,225,225,0,
105 | };
106 | const double V100_CRZ[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
107 | 0,224,224,359,224,224,224,224,224,224,
108 | 224,0,224,224,359,224,224,224,224,224,
109 | 224,224,0,224,224,359,224,224,224,224,
110 | 359,224,224,0,224,224,224,224,224,224,
111 | 224,359,224,224,0,224,224,224,224,224,
112 | 224,224,359,224,224,0,224,224,224,224,
113 | 224,224,224,224,224,224,0,224,224,224,
114 | 224,224,224,224,224,224,224,0,224,224,
115 | 224,224,224,224,224,224,224,224,0,224,
116 | 224,224,224,224,224,224,224,224,224,0,
117 | };
118 |
119 | // per-gate backend: single-qubit gate performance over 512 runs, indexed by qubit count
120 | double pergate_single_perf[MAX_QBITS + 1][GATE_NUM][LOCAL_QUBIT_SIZE];
121 | // per-gate backend: controlled-gate performance over 512 runs, indexed by qubit count
122 | double pergate_ctr_perf[MAX_QBITS + 1][GATE_NUM][LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE];
123 | // BLAS backend performance, indexed by [numQubits][blasSize]
124 | double BLAS_perf[MAX_QBITS + 1][MAX_QBITS + 1];
125 | double cutt_cost[MAX_QBITS + 1];
126 | bool num_qbits_loaded_param[MAX_QBITS + 1];
127 | const double pergate_group_overhead = 1.0 / (1 << 27); // overhead of one per-gate group
128 |
129 | int param_type;
130 |
131 | Evaluator();
132 |
133 | static Evaluator* instance_ptr;
134 | public:
135 | static Evaluator* getInstance() {
136 | if(instance_ptr == nullptr) {
137 | instance_ptr = new Evaluator;
138 | }
139 | return instance_ptr;
140 | }
141 | void loadPergateSingle(int numQubits, FILE* qbit_param, GateType gate_type);
142 | void loadPergateCtr(int numQubits, FILE* qbit_param, GateType gate_type);
143 | void loadParam(int numQubits);
144 | double perfPerGate(int numQubits, const GateGroup* gg);
145 | double perfPerGate(int numQubits, const std::vector<GateType>& types);
146 | double perfBLAS(int numQubits, int blasSize);
147 | // returns true if the per-gate backend is chosen over the BLAS backend
148 | bool PerGateOrBLAS(const GateGroup* gg_pergate, const GateGroup* gg_blas, int numQubits, int blasSize);
149 | };
150 |
--------------------------------------------------------------------------------
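As the header comment notes, Evaluator is a lazily constructed singleton: callers never instantiate it directly but go through getInstance(), which allocates the object on first use and returns the same pointer afterwards. The sketch below isolates that pattern in a self-contained class; the names Model and score are hypothetical and only stand in for Evaluator and its query methods.

// Self-contained illustration of the lazy-pointer singleton pattern used by
// Evaluator::getInstance() (single-threaded use assumed, as in the original).
#include <cstdio>

class Model {
private:
    Model() { std::printf("constructed once\n"); }  // private ctor, like Evaluator()
    static Model* instance_ptr;                     // allocated on first use
public:
    static Model* getInstance() {
        if (instance_ptr == nullptr) {
            instance_ptr = new Model;
        }
        return instance_ptr;
    }
    double score(int n) const { return 1.0 / n; }   // hypothetical query method
};

Model* Model::instance_ptr = nullptr;

int main() {
    Model* a = Model::getInstance();
    Model* b = Model::getInstance();  // returns the same object; no second construction
    std::printf("same instance: %s, score: %f\n",
                a == b ? "yes" : "no", a->score(4));
    return 0;
}

This form is not thread-safe and never frees the instance; the object simply lives for the whole program run.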
/src/executor.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "utils.h"
3 |
4 | #include
5 | #include
6 | #include