├── .gitattributes ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── benchmark ├── bench_backend.sh ├── bench_blas_a100.sh ├── bench_blas_v100.sh ├── bench_comm.sh ├── bench_cublas_a100.sh ├── bench_cublas_v100.sh ├── bench_evaluator_a100.sh ├── bench_evaluator_v100.sh ├── bench_groupsz.sh ├── bench_numgate.sh ├── bench_pergate.sh ├── bench_scale.sh ├── bench_sharemem.sh ├── bench_weak.sh ├── blas.cu ├── plot │ └── plot.py └── preprocess.sh ├── cmake └── FindNccl.cmake ├── evaluator-preprocess └── process.cpp ├── main.cpp ├── micro-benchmark ├── bench-blas.cpp ├── local-ctr.cpp ├── local-single.cpp └── two-group-h.cpp ├── scripts ├── .gitignore ├── check.sh ├── check_wrapper.sh ├── coalescing.sh ├── compare.py ├── env.sh ├── gen_stdout.sh ├── gpu-bind.sh ├── init.sh ├── run-multi-GPU.sh ├── run-multi-node.sh ├── run-single.sh └── run.sh ├── src ├── CMakeLists.txt ├── circuit.cpp ├── circuit.h ├── compiler.cpp ├── compiler.h ├── evaluator.cpp ├── evaluator.h ├── executor.cpp ├── executor.h ├── gate.cpp ├── gate.h ├── kernel.h ├── kernelOpt.cu ├── kernelSimple.cu ├── kernelUtils.cu ├── kernels │ ├── baseline.cu │ ├── lookup.cu │ └── swizzle.cu ├── logger.cpp ├── logger.h ├── schedule.cpp ├── schedule.h ├── utils.cpp └── utils.h └── tests ├── input ├── basis_change_24.qasm ├── basis_change_25.qasm ├── basis_change_26.qasm ├── basis_change_27.qasm ├── basis_change_28.qasm ├── basis_change_29.qasm ├── basis_change_30.qasm ├── bv_28.qasm ├── hidden_shift_28.qasm ├── qaoa_28.qasm ├── qft_28.qasm ├── quantum_volume_28.qasm └── supremacy_28.qasm └── output ├── basis_change_25.log ├── basis_change_28.log ├── basis_change_30.log ├── bv_28.log ├── hidden_shift_28.log ├── qaoa_28.log ├── qft_28.log ├── quantum_volume_28.log └── supremacy_28.log /.gitattributes: -------------------------------------------------------------------------------- 1 | tests/input/*.qasm filter=lfs diff=lfs merge=lfs -text 2 | tests/output/*.log filter=lfs diff=lfs 
merge=lfs -text 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | tests/ 3 | evaluator-preprocess/parameter-files* 4 | .vscode/ 5 | *.sqlite 6 | *.qdrep 7 | *.log 8 | *.profile 9 | blas 10 | *.pdf -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third-party/cutt"] 2 | path = third-party/cutt 3 | url = https://github.com/heheda12345/cutt.git 4 | [submodule "third-party/dbg-macro"] 5 | path = third-party/dbg-macro 6 | url = https://github.com/sharkdp/dbg-macro.git 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.1)
2 | project(QCSimulatorRoot)
3 | 
4 | # Top-level build script. It locates the toolchain and third-party libraries,
5 | # turns the cache options below into preprocessor definitions, and builds the
6 | # `main` driver plus the optional `process` tool and micro-benchmarks. All of
7 | # them link against QCSimulator (presumably defined by src/CMakeLists.txt).
8 | 
9 | # Lets find_package(Nccl) pick up cmake/FindNccl.cmake.
10 | set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}")
11 | find_package(CUDA REQUIRED)
12 | find_package(OpenMP REQUIRED)
13 | find_package(Nccl REQUIRED)
14 | find_package(MPI REQUIRED)
15 | 
16 | # cutt is a submodule that must be compiled by hand before configuring; stop
17 | # here with a clear hint instead of failing later on a CUTT-NOTFOUND link item.
18 | find_library(CUTT cutt "${PROJECT_SOURCE_DIR}/third-party/cutt/cutt/lib")
19 | if(NOT CUTT)
20 |   message(FATAL_ERROR "cutt library not found; build it first: cd third-party/cutt && make")
21 | endif()
22 | include_directories(${PROJECT_SOURCE_DIR}/third-party/cutt/cutt/include)
23 | include_directories(${PROJECT_SOURCE_DIR}/third-party/dbg-macro)
24 | message(STATUS "Found CUTT: ${CUTT}")
25 | 
26 | # NOTE: both assignments replace (rather than extend) flags supplied by the user.
27 | # -arch/-code select compute capability 7.0; edit them for other GPUs.
28 | set(CMAKE_CXX_FLAGS "-std=c++14 -O2 -g -Wall ${OpenMP_CXX_FLAGS}")
29 | set(CUDA_NVCC_FLAGS "-Xcompiler -fopenmp -std=c++14 -O2 -g -arch=compute_70 -code=sm_70 --ptxas-options=-v -lineinfo -keep")
30 | 
31 | # Simulation backend. The zero-based position of a name in BACKEND_CHOICES is
32 | # the integer handed to the sources as -DBACKEND=<n>, so never reorder the list.
33 | set(BACKEND_CHOICES serial group group-serial blas mix blas-advance)
34 | set(BACKEND "group" CACHE STRING "Backend mode, one of [serial, group, group-serial, blas, mix, blas-advance]")
35 | set_property(CACHE BACKEND PROPERTY STRINGS ${BACKEND_CHOICES})
36 | message(STATUS "Backend: ${BACKEND}")
37 | list(FIND BACKEND_CHOICES "${BACKEND}" BACKEND_ID)
38 | if(BACKEND_ID EQUAL -1)
39 |   # An unknown backend must abort the configure step: otherwise BACKEND would
40 |   # be left undefined for every translation unit.
41 |   string(REPLACE ";" ", " BACKEND_CHOICES_TEXT "${BACKEND_CHOICES}")
42 |   message(FATAL_ERROR "invalid mode: BACKEND=\"${BACKEND}\", expected one of [${BACKEND_CHOICES_TEXT}]")
43 | endif()
44 | add_definitions(-DBACKEND=${BACKEND_ID})
45 | 
46 | # Boolean switches (help strings kept verbatim).
47 | option(SHOW_SCHEDULE "Print the schedule" ON)
48 | option(SHOW_SUMMARY "Show the running details" ON)
49 | option(MEASURE_STAGE "Measure time of each stage" OFF)
50 | option(MICRO_BENCH "Compile micro-benchmarks" OFF)
51 | option(EVALUATOR_PREPROCESS "compile evaluator preprocess" OFF)
52 | # ON defines NDEBUG, OFF defines DEBUG (see below).
53 | option(DISABLE_ASSERT "Use assert in cuda runtime" ON)
54 | option(USE_DOUBLE "double or float" ON)
55 | option(ENABLE_OVERLAP "overlap" ON)
56 | option(USE_MPI "use mpi" OFF)
57 | option(OVERLAP_MAT "overlap initMatirx" ON)
58 | option(LOG_EVALUATOR "show logging of evaluator" OFF)
59 | 
60 | # Switches that map one-to-one onto a macro of the same name.
61 | foreach(FLAG SHOW_SCHEDULE SHOW_SUMMARY MEASURE_STAGE ENABLE_OVERLAP OVERLAP_MAT LOG_EVALUATOR)
62 |   if(${FLAG})
63 |     add_definitions(-D${FLAG})
64 |   endif()
65 | endforeach()
66 | 
67 | if(DISABLE_ASSERT)
68 |   add_definitions(-DNDEBUG)
69 | else()
70 |   add_definitions(-DDEBUG)
71 | endif()
72 | 
73 | if(USE_DOUBLE)
74 |   message(STATUS "Float type: Double")
75 |   add_definitions(-DUSE_DOUBLE)
76 | else()
77 |   message(STATUS "Float type: Float")
78 | endif()
79 | 
80 | # USE_MPI is always defined, as 1 or 0.
81 | if(USE_MPI)
82 |   add_definitions(-DUSE_MPI=1)
83 | else()
84 |   add_definitions(-DUSE_MPI=0)
85 | endif()
86 | 
87 | # Numeric tunables, each forwarded to the sources as a *_DEFINED macro.
88 | set(COALESCE "3" CACHE STRING "coalescing size")
89 | message(STATUS "coalesce = ${COALESCE}")
90 | add_definitions(-DCOALESCE_GLOBAL_DEFINED=${COALESCE})
91 | 
92 | set(MAT "6" CACHE STRING "mat size")
93 | message(STATUS "mat size = ${MAT}")
94 | add_definitions(-DBLAS_MAT_LIMIT_DEFINED=${MAT})
95 | 
96 | set(MIN_MAT "4" CACHE STRING "min mat size")
97 | message(STATUS "min mat size = ${MIN_MAT}")
98 | add_definitions(-DMIN_MAT_SIZE_DEFINED=${MIN_MAT})
99 | 
100 | set(THREAD_DEP "7" CACHE STRING "thread dep")
101 | message(STATUS "thread_dep = ${THREAD_DEP}")
102 | add_definitions(-DTHREAD_DEP_DEFINED=${THREAD_DEP})
103 | 
104 | # Link line shared by every executable below. The keyword-less
105 | # target_link_libraries signature is kept on purpose: src/ may also link these
106 | # targets, and CMake forbids mixing the plain and keyword forms on one target.
107 | set(QCSIM_LINK_LIBS QCSimulator ${CUTT} ${OpenMP_CXX_FLAGS} ${CUDA_CUBLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARY})
108 | 
109 | if(EVALUATOR_PREPROCESS)
110 |   # PROCESS is not read in this file; it stays set because src/ inherits it.
111 |   set(PROCESS process)
112 |   add_executable(process evaluator-preprocess/process.cpp)
113 |   target_link_libraries(process ${QCSIM_LINK_LIBS})
114 |   add_definitions(-DUSE_EVALUATOR_PREPROCESS)
115 | endif()
116 | 
117 | include_directories("${PROJECT_SOURCE_DIR}/src")
118 | add_subdirectory("src")
119 | add_executable(main main.cpp)
120 | target_link_libraries(main ${QCSIM_LINK_LIBS})
121 | 
122 | if(MICRO_BENCH)
123 |   # One executable per source file in micro-benchmark/.
124 |   set(BENCHMARKS local-single local-ctr two-group-h bench-blas)
125 |   foreach(BENCHMARK IN LISTS BENCHMARKS)
126 |     add_executable(${BENCHMARK} micro-benchmark/${BENCHMARK}.cpp)
127 |     target_link_libraries(${BENCHMARK} ${QCSIM_LINK_LIBS})
128 |   endforeach()
129 | endif()
130 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HyQuas 2 | 3 | HyQuas is a **Hy**brid partitioner based **Qua**ntum circuit **S**imulation system on GPU, which supports single-GPU, single-node multi-GPU, and multi-node multi-GPU quantum circuit simulation. 4 | 5 | For single-GPU simulation, it provides two highly optimized methods, *OShareMem* and *TransMM*. The *OShareMem* method speeds up shared-memory based quantum circuit simulation by $2.67\times$. The *TransMM* method converts quantum circuit simulation to standard operations and enables the usage of highly optimized libraries like cuBLAS and powerful hardware like Tensor Cores. It leads to up to $8.43\times$ speedup over previous gate-merging based simulation. Moreover, HyQuas can select the better simulation method for different parts of a given quantum circuit according to its pattern. 6 | 7 | For distributed simulation, it provides a GPU-centric communication pipelining approach. 
It can utilize the high-throughput NVLink connections to make the simulation even faster while still preserving low communication traffic. 8 | 9 | Experimental results show that HyQuas can achieve up to $10.71\times$ speedup on a single GPU and $227\times$ speedup on a GPU cluster over state-of-the-art quantum circuit simulation systems. 10 | 11 | ## Compile and Run 12 | 1. Get the source code 13 | ```bash 14 | git clone https://github.com/thu-pacman/HyQuas.git --recursive 15 | ``` 16 | 17 | 2. Specify the compute capability in `CMakeLists.txt` (`CUDA_NVCC_FLAGS`) and `third-party/cutt/Makefile` (`GENCODE_FLAGS`) 18 | 19 | 3. Prepare the following dependencies 20 | * cmake (tested on 3.12.3) 21 | * cuda (tested on 10.2.89 and 11.0.2) 22 | * g++ (compatible with cuda) 23 | * cublas (with the same version of cuda) 24 | * openmpi (tested on 4.0.5) 25 | * nccl (fully tested on 2.9.6-1; 2.7.8-1 is known not to work, because it blocks in an NCCL-simulated MPI_Sendrecv) 26 | Then update environment variables like `CUDA_HOME`, `NCCL_ROOT`, `$PATH`, `$LIBRARY_PATH`, `$LD_LIBRARY_PATH`, `CPATH` in `scripts/env.sh`. 27 | 28 | 4. Compile the tensor transpose library `cutt` 29 | 30 | ```bash 31 | cd third-party/cutt 32 | make -j 33 | ``` 34 | 35 | 5. Specify the root directory 36 | ```bash 37 | export HYQUAS_ROOT=${The_directory_running_git_clone}/HyQuas 38 | ``` 39 | 40 | 6. Prepare the database for the time predictor 41 | ```bash 42 | mkdir -p evaluator-preprocess/parameter-files 43 | cd benchmark 44 | ./preprocess.sh 45 | ``` 46 | 47 | 7. Example usages of HyQuas: 48 | HyQuas will use all GPUs it can detect, so please control the number of GPUs with `CUDA_VISIBLE_DEVICES`. 
49 | * Run a single circuit with a single GPU 50 | ```bash 51 | cd scripts 52 | ./run-single.sh 53 | ``` 54 | 55 | * Run a single circuit with multiple GPUs in one node 56 | ```bash 57 | cd scripts 58 | ./run-multi-GPU.sh 59 | ``` 60 | 61 | * Run a single circuit with multiple GPUs in multiple nodes 62 | Please modify the `-host` argument first. 63 | ```bash 64 | cd scripts 65 | ./run-multi-node.sh 66 | ``` 67 | 68 | * Run all circuits and check the correctness (the script tries both with and without MPI) 69 | ```bash 70 | cd scripts 71 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./check.sh 72 | ``` 73 | 74 | **Please use the commands in check.sh when evaluating the performance of HyQuas, because the run-\*.sh scripts compile the simulator in debug mode while check.sh compiles it in release mode.** 75 | 76 | For more ways to use our simulator (like using only the *OShareMem* method or the *TransMM* method, or turning off the overlap of communication and computation), and for reproducing our results in the ICS'21 paper, please refer to our `benchmark/` directory. 77 | 78 | It also supports the following **unstable** features now. See our dev branch for details. 79 | * Simulating more qubits by saving the state in CPU memory while still computing on the GPU. 80 | * An imperative mode, so that you do not need to explicitly call `c->compile();` and `c->run()`. 81 | * Support for more control qubits. 82 | * Support for some two-qubit gates. 83 | * Fast measurement of quantum state. 
84 | 85 | # Cite 86 | To cite HyQuas, you can use the following BibTex: 87 | ``` 88 | @inproceedings{10.1145/3447818.3460357, 89 | author = {Zhang, Chen and Song, Zeyu and Wang, Haojie and Rong, Kaiyuan and Zhai, Jidong}, 90 | title = {HyQuas: Hybrid Partitioner Based Quantum Circuit Simulation System on GPU}, 91 | year = {2021}, 92 | isbn = {9781450383356}, 93 | publisher = {Association for Computing Machinery}, 94 | address = {New York, NY, USA}, 95 | url = {https://doi.org/10.1145/3447818.3460357}, 96 | doi = {10.1145/3447818.3460357}, 97 | booktitle = {Proceedings of the ACM International Conference on Supercomputing}, 98 | pages = {443–454}, 99 | numpages = {12}, 100 | keywords = {quantum computing, GPU computing, simulation}, 101 | location = {Virtual Event, USA}, 102 | series = {ICS '21} 103 | } 104 | 105 | ``` 106 | -------------------------------------------------------------------------------- /benchmark/bench_backend.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | export CUDA_VISIBLE_DEVICES=0 3 | export MPIRUN_CONFIG="" 4 | head=../build/logs/`date +%Y%m%d-%H%M%S` 5 | 6 | cd ../scripts 7 | 8 | name=$head-group 9 | mkdir -p $name 10 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DUSE_MPI=off 2>&1 | tee $name/std.out 11 | grep -r "Time Cost" $name/*.log | tee ../benchmark/logs/backend.log 12 | 13 | name=$head-blas 14 | mkdir -p $name 15 | ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DUSE_MPI=off 2>&1 | tee $name/std.out 16 | grep -r "Time Cost" $name/*.log | tee -a ../benchmark/logs/backend.log 17 | 18 | name=$head-mix 19 | mkdir -p $name 20 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off 2>&1 | tee $name/std.out 21 | grep -r "Time Cost" $name/*.log | tee -a ../benchmark/logs/backend.log 22 | 23 | 
-------------------------------------------------------------------------------- /benchmark/bench_blas_a100.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | ulimit -s unlimited 4 | 5 | source /opt/spack/share/spack/setup-env.sh 6 | spack load cuda@11 7 | NVPROF_COMMAND="nsys nvprof --profile-from-start=off -o test" 8 | export MPIRUN_CONFIG="" 9 | export tests_28="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28" 10 | export tests="$tests_25 $tests_28 $tests_30" 11 | 12 | head=../build/logs/`date +%Y%m%d-%H%M%S` 13 | logdir=../benchmark/logs/ 14 | echo tests=$tests 15 | cd ../scripts 16 | 17 | name=$head-m3 18 | mkdir -p $name 19 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=3 -DMIN_MAT=3 2>&1 | tee $name/std.out 20 | echo "+++++ 3" | tee $logdir/blas-profile.log 21 | for test in ${tests[*]}; do 22 | echo "===== $test" | tee -a $name/circ.profile 23 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile 24 | grep "cutlass" tmp.profile | tee -a $name/circ.profile 25 | grep "void transpose" tmp.profile | tee -a $name/circ.profile 26 | done 27 | cat $name/circ.profile | tee -a $logdir/blas-profile.log 28 | name3=$name 29 | 30 | name=$head-m4 31 | mkdir -p $name 32 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=4 2>&1 | tee $name/std.out 33 | echo "+++++ 4" | tee -a $logdir/blas-profile.log 34 | for test in ${tests[*]}; do 35 | echo "===== $test" | tee -a $name/circ.profile 36 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile 37 | grep "cutlass" tmp.profile | tee -a $name/circ.profile 38 | grep "void transpose" tmp.profile | tee -a $name/circ.profile 39 | done 40 | cat $name/circ.profile | tee -a $logdir/blas-profile.log 41 | name4=$name 42 | 43 | 
name=$head-m5 44 | mkdir -p $name 45 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=5 2>&1 | tee $name/std.out 46 | echo "+++++ 5" | tee -a $logdir/blas-profile.log 47 | for test in ${tests[*]}; do 48 | echo "===== $test" | tee -a $name/circ.profile 49 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile 50 | grep "cutlass" tmp.profile | tee -a $name/circ.profile 51 | grep "void transpose" tmp.profile | tee -a $name/circ.profile 52 | done 53 | cat $name/circ.profile | tee -a $logdir/blas-profile.log 54 | name5=$name 55 | 56 | name=$head-m6 57 | mkdir -p $name 58 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=6 2>&1 | tee $name/std.out 59 | echo "+++++ 6" | tee -a $logdir/blas-profile.log 60 | for test in ${tests[*]}; do 61 | echo "===== $test" | tee -a $name/circ.profile 62 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile 63 | grep "cutlass" tmp.profile | tee -a $name/circ.profile 64 | grep "void transpose" tmp.profile | tee -a $name/circ.profile 65 | done 66 | cat $name/circ.profile | tee -a $logdir/blas-profile.log 67 | name6=$name 68 | 69 | name=$head-m7 70 | mkdir -p $name 71 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=7 2>&1 | tee $name/std.out 72 | echo "+++++ 7" | tee -a $logdir/blas-profile.log 73 | for test in ${tests[*]}; do 74 | echo "===== $test" | tee -a $name/circ.profile 75 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile 76 | grep "cutlass" tmp.profile | tee -a $name/circ.profile 77 | grep "void transpose" tmp.profile | tee -a $name/circ.profile 78 | done 79 | cat $name/circ.profile | tee -a $logdir/blas-profile.log 80 | name7=$name 81 | 82 | name=$head-m8 83 | mkdir -p $name 84 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on 
-DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=8 2>&1 | tee $name/std.out 85 | echo "+++++ 8" | tee -a $logdir/blas-profile.log 86 | for test in ${tests[*]}; do 87 | echo "===== $test" | tee -a $name/circ.profile 88 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile 89 | grep "cutlass" tmp.profile | tee -a $name/circ.profile 90 | grep "void transpose" tmp.profile | tee -a $name/circ.profile 91 | done 92 | cat $name/circ.profile | tee -a $logdir/blas-profile.log 93 | name8=$name 94 | 95 | name=$head-m9 96 | mkdir -p $name 97 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=9 2>&1 | tee $name/std.out 98 | echo "+++++ 9" | tee -a $logdir/blas-profile.log 99 | for test in ${tests[*]}; do 100 | echo "===== $test" | tee -a $name/circ.profile 101 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile 102 | grep "cutlass" tmp.profile | tee -a $name/circ.profile 103 | grep "void transpose" tmp.profile | tee -a $name/circ.profile 104 | done 105 | cat $name/circ.profile | tee -a $logdir/blas-profile.log 106 | name9=$name 107 | 108 | name=$head-m10 109 | mkdir -p $name 110 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=10 2>&1 | tee $name/std.out 111 | echo "+++++ 10" | tee -a $logdir/blas-profile.log 112 | for test in ${tests[*]}; do 113 | echo "===== $test" | tee -a $name/circ.profile 114 | $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile 115 | grep "cutlass" tmp.profile | tee -a $name/circ.profile 116 | grep "void transpose" tmp.profile | tee -a $name/circ.profile 117 | done 118 | cat $name/circ.profile | tee -a $logdir/blas-profile.log 119 | name10=$name 120 | 121 | grep -r "Time Cost" $head-m*/*.log | tee $logdir/blas.log 122 | grep -r "Total Groups" $head-*/*.log | tee -a $logdir/blas.log 123 | 
--------------------------------------------------------------------------------
/benchmark/bench_blas_v100.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# BACKEND=blas sweep over MAT = 3..10 on GPU 0.
#
# For every MAT value the script
#   1. calls ../scripts/check_wrapper.sh with the matching -D flags,
#   2. runs every test circuit under nvprof and keeps the rows that contain
#      "volta" or "void transpose",
#   3. appends those rows to one combined profile log.
#
# Run it from the benchmark/ directory; after the `cd` below every relative
# path is resolved from ../scripts.
#
# Outputs:
#   ../build/logs/<timestamp>-m<MAT>/std.out        output of check_wrapper.sh
#   ../build/logs/<timestamp>-m<MAT>/circ.profile   filtered nvprof rows
#   ../benchmark/logs/transmm-profile.log           all circ.profile files, in MAT order
#   ../benchmark/logs/transmm.log                   "Time Cost" and "Total Groups" lines
export CUDA_VISIBLE_DEVICES=0
ulimit -s unlimited

export MPIRUN_CONFIG=""
NVPROF_COMMAND="nvprof --profile-from-start off --csv"

export tests_28="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28"
export tests="$tests_28"

head=../build/logs/$(date +%Y%m%d-%H%M%S)
logdir=../benchmark/logs
echo tests=$tests
cd ../scripts || exit 1

# tee does not create missing directories, so make sure the log directory exists.
mkdir -p "$logdir"

# Start from an empty combined profile; every MAT value appends to it.
profile_log=$logdir/transmm-profile.log
: > "$profile_log"

for mat in 3 4 5 6 7 8 9 10; do
    name=$head-m$mat
    mkdir -p "$name"

    # Only the MAT=3 configuration also passes MIN_MAT.
    extra_flags=""
    if [ "$mat" -eq 3 ]; then
        extra_flags="-DMIN_MAT=3"
    fi

    tests=$tests ./check_wrapper.sh "$name" -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=$mat $extra_flags 2>&1 | tee "$name/std.out"

    echo "+++++ $mat" | tee -a "$profile_log"
    # $tests is a space-separated list; it is left unquoted so it splits into words.
    for test in $tests; do
        echo "===== $test" | tee -a "$name/circ.profile"
        # $NVPROF_COMMAND is left unquoted so it expands to the command plus its options.
        $NVPROF_COMMAND ../build/main ../tests/input/$test.qasm 2>&1 | tee tmp.profile
        grep "volta" tmp.profile | tee -a "$name/circ.profile"
        grep "void transpose" tmp.profile | tee -a "$name/circ.profile"
    done
    cat "$name/circ.profile" | tee -a "$profile_log"
done

# The globs must stay outside the quotes so the shell expands them.
grep -r "Time Cost" $head-m*/*.log | tee "$logdir/transmm.log"
grep -r "Total Groups" $head-*/*.log | tee -a "$logdir/transmm.log"
--------------------------------------------------------------------------------
/benchmark/bench_comm.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p logs 3 | mkdir -p logs/bench_comm 4 | mkdir -p logs/bench_comm/4V100 5 | mkdir -p logs/bench_comm/2V100 6 | cd ../scripts 7 | source ./init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off 2>&1 8 | cd ../benchmark 9 | 10 | tests="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28" 11 | 12 | echo "test 4V100" 13 | 14 | for test in $tests; do 15 | echo $test 16 | CUDA_VISIBLE_DEVICES=0,1,2,3 nvprof --print-gpu-trace ../build/main ../tests/input/$test.qasm 1>logs/bench_comm/4V100/$test.log 2>logs/bench_comm/4V100/$test.out 17 | done 18 | 19 | echo "test 2V100" 20 | 21 | for test in $tests; do 22 | echo $test 23 | CUDA_VISIBLE_DEVICES=0,1 nvprof --print-gpu-trace ../build/main ../tests/input/$test.qasm 1>logs/bench_comm/2V100/$test.log 2>logs/bench_comm/2V100/$test.out 24 | done 25 | -------------------------------------------------------------------------------- /benchmark/bench_cublas_a100.sh: -------------------------------------------------------------------------------- 1 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=26 2 | echo N_QUBIT=26 3 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log 4 | 5 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=27 6 | echo N_QUBIT=27 7 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log 8 | 9 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=28 10 | echo N_QUBIT=28 11 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log 12 | 13 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=29 14 | echo N_QUBIT=29 15 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log 16 | 17 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=30 18 | echo N_QUBIT=30 19 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log 20 | -------------------------------------------------------------------------------- /benchmark/bench_cublas_v100.sh: 
-------------------------------------------------------------------------------- 1 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=24 2 | echo N_QUBIT=24 3 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-v100.log 4 | 5 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=25 6 | echo N_QUBIT=25 7 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log 8 | 9 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=26 10 | echo N_QUBIT=26 11 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log 12 | 13 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=27 14 | echo N_QUBIT=27 15 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log 16 | 17 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=28 18 | echo N_QUBIT=28 19 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log 20 | -------------------------------------------------------------------------------- /benchmark/bench_evaluator_a100.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p logs 3 | mkdir -p logs/evaluator_a100 4 | 5 | cd ../scripts 6 | echo "OShareMem" 7 | source ./init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on 2>&1 8 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/OShareMem.log 9 | cd ../benchmark 10 | 11 | cd ../scripts 12 | echo "TransMM MAT=5" 13 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=5 2>&1 14 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/TransMM_5.log 15 | cd ../benchmark 16 | 17 | cd ../scripts 18 | echo "TransMM MAT=6" 19 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on 
-DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=6 2>&1 20 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/TransMM_6.log 21 | cd ../benchmark 22 | 23 | cd ../scripts 24 | echo "TransMM MAT=7" 25 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=7 2>&1 26 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/TransMM_7.log 27 | cd ../benchmark 28 | -------------------------------------------------------------------------------- /benchmark/bench_evaluator_v100.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p logs 3 | mkdir -p logs/evaluator_v100 4 | 5 | cd ../scripts 6 | echo "OShareMem" 7 | source ./init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on 2>&1 8 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/OShareMem.log 9 | cd ../benchmark 10 | 11 | cd ../scripts 12 | echo "TransMM MAT=5" 13 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=5 2>&1 14 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/TransMM_5.log 15 | cd ../benchmark 16 | 17 | cd ../scripts 18 | echo "TransMM MAT=6" 19 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=6 2>&1 20 | CUDA_VISIBLE_DEVICES=0 ../build/main 
../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/TransMM_6.log 21 | cd ../benchmark 22 | 23 | cd ../scripts 24 | echo "TransMM MAT=7" 25 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=7 2>&1 26 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/TransMM_7.log 27 | cd ../benchmark 28 | -------------------------------------------------------------------------------- /benchmark/bench_groupsz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ulimit -s unlimited 3 | 4 | cd ../scripts 5 | 6 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=3 -DMIN_MAT=3 7 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee ../benchmark/logs/groupsz-tm.log 8 | 9 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=4 -DMIN_MAT=4 10 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log 11 | 12 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=5 -DMIN_MAT=5 13 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log 14 | 15 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=6 -DMIN_MAT=6 16 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log 17 | 18 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=7 -DMIN_MAT=7 19 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log 20 | 21 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on 
-DUSE_DOUBLE=on -DMAT=8 -DMIN_MAT=8 22 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log 23 | 24 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=9 -DMIN_MAT=9 25 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log 26 | 27 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=10 -DMIN_MAT=10 28 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log 29 | -------------------------------------------------------------------------------- /benchmark/bench_numgate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | head=../build/logs/`date +%Y%m%d-%H%M%S` 5 | 6 | cd ../scripts 7 | 8 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu 9 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on -DTHREAD_DEP=9 2>&1 10 | CUDA_VISIBLE_DEVICES=0 ./two-group-h | tee ../benchmark/logs/numgate-sm.log 11 | 12 | cp ../src/kernels/swizzle.cu ../src/kernelOpt.cu 13 | -------------------------------------------------------------------------------- /benchmark/bench_pergate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MPIRUN_CONFIG="" 3 | 4 | cd ../scripts 5 | 6 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu 7 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on -DTHREAD_DEP=9 2>&1 8 | echo "baseline" | tee pergate.log 9 | CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log 10 | 11 | cd ../scripts 12 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu 13 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on 2>&1 14 | echo "multitask" | tee -a pergate.log 15 | 
CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log 16 | 17 | cd ../scripts 18 | cp ../src/kernels/lookup.cu ../src/kernelOpt.cu 19 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on 2>&1 20 | echo "lookup" | tee -a pergate.log 21 | CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log 22 | 23 | cd ../scripts 24 | cp ../src/kernels/swizzle.cu ../src/kernelOpt.cu 25 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on 2>&1 26 | echo "bank" | tee -a pergate.log 27 | CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log 28 | 29 | cp pergate.log ../benchmark/logs 30 | cat pergate.log 31 | -------------------------------------------------------------------------------- /benchmark/bench_scale.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | head=../build/logs/`date +%Y%m%d-%H%M%S` 3 | 4 | 5 | cd ../scripts 6 | export CUDA_VISIBLE_DEVICES=0 7 | export MPIRUN_CONFIG="" 8 | 9 | name=$head-1gpu-o 10 | mkdir -p $name 11 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on 2>&1 | tee $name/std.out 12 | name1=$name 13 | 14 | export CUDA_VISIBLE_DEVICES=0,1 15 | name=$head-2gpu-o 16 | mkdir -p $name 17 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on 2>&1 | tee $name/std.out 18 | name2=$name 19 | 20 | export CUDA_VISIBLE_DEVICES=0,1,2,3 21 | name=$head-4gpu-o 22 | mkdir -p $name 23 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on 2>&1 | tee $name/std.out 24 | name3=$name 25 | 26 | export CUDA_VISIBLE_DEVICES=0 27 | name=$head-1gpu-s 28 | mkdir -p $name 29 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on 
-DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=off 2>&1 | tee $name/std.out 30 | name1=$name 31 | 32 | export CUDA_VISIBLE_DEVICES=0,1 33 | name=$head-2gpu-s 34 | mkdir -p $name 35 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=off 2>&1 | tee $name/std.out 36 | name2=$name 37 | 38 | export CUDA_VISIBLE_DEVICES=0,1,2,3 39 | name=$head-4gpu-s 40 | mkdir -p $name 41 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=off 2>&1 | tee $name/std.out 42 | name3=$name 43 | 44 | 45 | grep -r "Time Cost" $head-*/*.log | tee ../benchmark/logs/scale.log 46 | 47 | export CUDA_VISIBLE_DEVICES=0,1,2,3 48 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on 49 | nvprof ./main ../tests/input/hidden_shift_28.qasm 2>&1 | tee ../benchmark/logs/hs.log 50 | -------------------------------------------------------------------------------- /benchmark/bench_sharemem.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | export CUDA_VISIBLE_DEVICES=0 3 | export MPIRUN_CONFIG="" 4 | name=../build/logs/`date +%Y%m%d-%H%M%S` 5 | 6 | cd ../scripts 7 | 8 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu 9 | 10 | mkdir -p $name 11 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on -DTHREAD_DEP=9 2>&1 | tee $name/std.out 12 | grep -r "Time Cost" $name/*.log | tee ../benchmark/logs/sharemem.log 13 | 14 | cp ../src/kernels/swizzle.cu ../src/kernelOpt.cu 15 | -------------------------------------------------------------------------------- /benchmark/bench_weak.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ../scripts/init.sh -DBACKEND=mix 
-DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off 3 | LOG=../benchmark/logs 4 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_24.qasm 2>&1 | tee $LOG/weak.log 5 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_24.qasm 2>&1 | tee -a $LOG/weak.log 6 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_24.qasm 2>&1 | tee -a $LOG/weak.log 7 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_25.qasm 2>&1 | tee -a $LOG/weak.log 8 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_25.qasm 2>&1 | tee -a $LOG/weak.log 9 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_25.qasm 2>&1 | tee -a $LOG/weak.log 10 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_26.qasm 2>&1 | tee -a $LOG/weak.log 11 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_26.qasm 2>&1 | tee -a $LOG/weak.log 12 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_26.qasm 2>&1 | tee -a $LOG/weak.log 13 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_27.qasm 2>&1 | tee -a $LOG/weak.log 14 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_27.qasm 2>&1 | tee -a $LOG/weak.log 15 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_27.qasm 2>&1 | tee -a $LOG/weak.log 16 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_28.qasm 2>&1 | tee -a $LOG/weak.log 17 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_28.qasm 2>&1 | tee -a $LOG/weak.log 18 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_28.qasm 2>&1 | tee -a $LOG/weak.log 19 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_29.qasm 2>&1 | tee -a $LOG/weak.log 20 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_29.qasm 2>&1 | tee -a $LOG/weak.log 21 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_30.qasm 2>&1 | tee -a $LOG/weak.log 22 | 23 | grep -r "Logger" 
$LOG/weak.log | tee $LOG/weak_summary.log -------------------------------------------------------------------------------- /benchmark/blas.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | static const char *cublasGetErrorString(cublasStatus_t error) 7 | { 8 | switch (error) 9 | { 10 | case CUBLAS_STATUS_SUCCESS: 11 | return "CUBLAS_STATUS_SUCCESS"; 12 | case CUBLAS_STATUS_NOT_INITIALIZED: 13 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 14 | case CUBLAS_STATUS_ALLOC_FAILED: 15 | return "CUBLAS_STATUS_ALLOC_FAILED"; 16 | case CUBLAS_STATUS_INVALID_VALUE: 17 | return "CUBLAS_STATUS_INVALID_VALUE"; 18 | case CUBLAS_STATUS_ARCH_MISMATCH: 19 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 20 | case CUBLAS_STATUS_MAPPING_ERROR: 21 | return "CUBLAS_STATUS_MAPPING_ERROR"; 22 | case CUBLAS_STATUS_EXECUTION_FAILED: 23 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 24 | case CUBLAS_STATUS_INTERNAL_ERROR: 25 | return "CUBLAS_STATUS_INTERNAL_ERROR"; 26 | default: 27 | return ""; 28 | } 29 | return ""; 30 | } 31 | 32 | #define checkCudaErrors(stmt) { \ 33 | cudaError_t err = stmt; \ 34 | if (err != cudaSuccess) { \ 35 | fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cudaGetErrorString(err)); \ 36 | exit(1); \ 37 | } \ 38 | } 39 | 40 | #define checkCuttErrors(stmt) { \ 41 | cuttResult err = stmt; \ 42 | if (err != CUTT_SUCCESS) { \ 43 | fprintf(stderr, "%s in file %s, function %s, line %i.\n", #stmt, __FILE__, __FUNCTION__, __LINE__); \ 44 | exit(1); \ 45 | } \ 46 | } 47 | 48 | #define checkBlasErrors(stmt) { \ 49 | cublasStatus_t err = stmt; \ 50 | if (err != CUBLAS_STATUS_SUCCESS) { \ 51 | fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cublasGetErrorString(err)); \ 52 | exit(1); \ 53 | } \ 54 | } 55 | 56 | int main() { 57 | int nq = N_QUBIT; 58 | cuDoubleComplex* arr; 59 | 
cuDoubleComplex* mat; 60 | cuDoubleComplex* result; 61 | checkCudaErrors(cudaMalloc(&arr, sizeof(cuDoubleComplex) << nq)); 62 | checkCudaErrors(cudaMalloc(&mat, sizeof(cuDoubleComplex) * 1024 * 1024)); 63 | checkCudaErrors(cudaMalloc(&result, sizeof(cuDoubleComplex) << nq)); 64 | cublasHandle_t handle; 65 | checkBlasErrors(cublasCreate(&handle)); 66 | // checkBlasErrors(cublasSetMathMode(handle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION)); 67 | int numElements = 1 << nq; 68 | cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0), beta = make_cuDoubleComplex(0.0, 0.0); 69 | cudaEvent_t start, stop; 70 | checkCudaErrors(cudaEventCreate(&start)); 71 | checkCudaErrors(cudaEventCreate(&stop)); 72 | for (int K = 2; K < 1024; K <<= 1) { 73 | printf("K = %d\n", K); 74 | for (int i = 0; i < 100; i++) { 75 | checkCudaErrors(cudaEventRecord(start)); 76 | 77 | checkBlasErrors(cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 78 | K, numElements / K, K, // M, N, K 79 | &alpha, mat, K, // alpha, a, lda 80 | arr, K, // b, ldb 81 | &beta, result, K // beta, c, ldc 82 | )); 83 | 84 | float time; 85 | checkCudaErrors(cudaEventRecord(stop)); 86 | cudaEventSynchronize(stop); 87 | cudaEventElapsedTime(&time, start, stop); 88 | printf("%.10f ", time); 89 | } 90 | printf("\n"); 91 | } 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /benchmark/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ../scripts/init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DDISABLE_ASSERT=on -DENABLE_OVERLAP=on -DMEASURE_STAGE=off -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMAT=7 3 | CUDA_VISIBLE_DEVICES=0 ./process -------------------------------------------------------------------------------- /cmake/FindNccl.cmake: -------------------------------------------------------------------------------- 1 | if (NCCL_LIBRARY) 2 | if(NOT 
USE_NCCL_LIB_PATH) 3 | # Don't cache NCCL_LIBRARY to enable switching between static and shared. 4 | unset(NCCL_LIBRARY CACHE) 5 | endif(NOT USE_NCCL_LIB_PATH) 6 | endif() 7 | 8 | if (BUILD_WITH_SHARED_NCCL) 9 | # libnccl.so 10 | set(NCCL_LIB_NAME nccl) 11 | else () 12 | # libnccl_static.a 13 | set(NCCL_LIB_NAME nccl_static) 14 | endif (BUILD_WITH_SHARED_NCCL) 15 | 16 | find_path(NCCL_INCLUDE_DIR 17 | NAMES nccl.h 18 | PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include) 19 | 20 | find_library(NCCL_LIBRARY 21 | NAMES ${NCCL_LIB_NAME} 22 | PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib) 23 | 24 | message(STATUS "Using nccl library: ${NCCL_LIBRARY}") 25 | 26 | include(FindPackageHandleStandardArgs) 27 | find_package_handle_standard_args(Nccl DEFAULT_MSG 28 | NCCL_INCLUDE_DIR NCCL_LIBRARY) 29 | 30 | mark_as_advanced( 31 | NCCL_INCLUDE_DIR 32 | NCCL_LIBRARY 33 | ) -------------------------------------------------------------------------------- /evaluator-preprocess/process.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "circuit.h" 12 | #include "logger.h" 13 | using namespace std; 14 | 15 | #define DIFF_QUBIT_NUMS 7 16 | int qubit_nums[DIFF_QUBIT_NUMS] = {22, 23, 24, 25, 26, 27, 28}; 17 | 18 | FILE* curr_file; 19 | 20 | #define CALC_ALL_PARAM 0 21 | #define CALC_PARTIAL_PARAM 1 22 | const int param_type = CALC_PARTIAL_PARAM; 23 | 24 | void procPerGateSingle(int numQubits) { 25 | int num_gates = 512; 26 | for (int i = int(GateType::U1); i < int(GateType::TOTAL); i++) { 27 | printf("single gate %s\n", Gate::get_name(GateType(i)).c_str()); 28 | if(param_type == CALC_ALL_PARAM) { 29 | for (int j = 0; j < LOCAL_QUBIT_SIZE; j++) { 30 | Circuit c(numQubits); 31 | for (int k = 0; k < num_gates; k++) { 32 | c.addGate(Gate::random(j, j + 1, GateType(i))); 33 | } 34 | c.compile(); 35 | int time = 
c.run(false); 36 | fprintf(curr_file, "%d ", time); 37 | } 38 | } 39 | else { 40 | Circuit c(numQubits); 41 | for (int k = 0; k < num_gates; k++) { 42 | c.addGate(Gate::random(1, 1 + 1, GateType(i))); 43 | } 44 | c.compile(); 45 | int time = c.run(false); 46 | fprintf(curr_file, "%d ", time); 47 | } 48 | fprintf(curr_file, "\n"); 49 | } 50 | fprintf(curr_file, "\n"); 51 | } 52 | 53 | void procPerGateCtr(int numQubits) { 54 | int num_gates = 512; 55 | for (int g = int(GateType::CNOT); g <= int(GateType::CRZ); g++) { 56 | printf("control gate %s\n", Gate::get_name(GateType(g)).c_str()); 57 | if(param_type == CALC_ALL_PARAM) { 58 | for (int i = 0; i < LOCAL_QUBIT_SIZE; i++) { 59 | for (int j = 0; j < LOCAL_QUBIT_SIZE; j++) { 60 | if (i == j) { fprintf(curr_file, "0 "); continue; } 61 | Circuit c(numQubits); 62 | for (int k = 0; k < num_gates; k++) { 63 | c.addGate(Gate::control(i, j, GateType(g))); 64 | } 65 | c.compile(); 66 | int time = c.run(false); 67 | fprintf(curr_file, "%d ", time); 68 | } 69 | fprintf(curr_file, "\n"); 70 | } 71 | } 72 | else { 73 | Circuit c(numQubits); 74 | for (int k = 0; k < num_gates; k++) { 75 | c.addGate(Gate::control(0, 2, GateType(g))); 76 | } 77 | c.compile(); 78 | int time = c.run(false); 79 | fprintf(curr_file, "%d ", time); 80 | } 81 | fprintf(curr_file, "\n"); 82 | } 83 | } 84 | 85 | void procBLAS(int numQubits) { 86 | cuDoubleComplex* arr; 87 | cuDoubleComplex* mat; 88 | cuDoubleComplex* result; 89 | checkCudaErrors(cudaMalloc(&arr, sizeof(cuDoubleComplex) << numQubits)); 90 | checkCudaErrors(cudaMalloc(&mat, sizeof(cuDoubleComplex) << 20)); 91 | checkCudaErrors(cudaMalloc(&result, sizeof(cuDoubleComplex) << numQubits)); 92 | cublasHandle_t handle; 93 | checkBlasErrors(cublasCreate(&handle)); 94 | qindex numElements = qindex(1) << numQubits; 95 | cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0), beta = make_cuDoubleComplex(0.0, 0.0); 96 | cudaEvent_t start, stop; 97 | checkCudaErrors(cudaEventCreate(&start)); 98 | 
checkCudaErrors(cudaEventCreate(&stop)); 99 | for (int K = 1; K < 1024; K <<= 1) { 100 | printf("blas calculating K = %d\n", K); 101 | double sum_time = 0.0; 102 | for (int i = 0; i < 100; i++) { 103 | checkCudaErrors(cudaEventRecord(start)); 104 | 105 | checkBlasErrors(cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 106 | K, numElements / K, K, // M, N, K 107 | &alpha, mat, K, // alpha, a, lda 108 | arr, K, // b, ldb 109 | &beta, result, K // beta, c, ldc 110 | )); 111 | 112 | float time; 113 | checkCudaErrors(cudaEventRecord(stop)); 114 | cudaEventSynchronize(stop); 115 | cudaEventElapsedTime(&time, start, stop); 116 | sum_time += time; 117 | //printf("%.10f ", time); 118 | 119 | } 120 | //printf("\n"); 121 | fprintf(curr_file, "%d %f\n", K, sum_time / 100); 122 | } 123 | fprintf(curr_file, "\n"); 124 | checkCudaErrors(cudaFree(arr)); 125 | checkCudaErrors(cudaFree(mat)); 126 | checkCudaErrors(cudaFree(result)); 127 | } 128 | 129 | void procCutt(int numQubits) { 130 | double *in, *out; 131 | checkCudaErrors(cudaMalloc(&in, sizeof(double2) << numQubits)); 132 | checkCudaErrors(cudaMalloc(&out, sizeof(double2) << numQubits)); 133 | int dim[numQubits]; 134 | for (int i = 0; i < numQubits; i++) dim[i] = 2; 135 | int total = 0; 136 | double sum_time = 0.0; 137 | for (int change = 1; change <= 20; change ++) { 138 | int perm[numQubits]; 139 | printf("Cutt calculating change = %d\n", change); 140 | for (int tt = 0; tt < 100; tt++) { 141 | for (int i = 0; i < numQubits; i++) perm[i] = i; 142 | for (int i = 0; i < change; i++) { 143 | std::swap(perm[rand() % numQubits], perm[rand() % numQubits]); 144 | } 145 | cuttHandle plan; 146 | checkCuttErrors(cuttPlan(&plan, numQubits, dim, perm, sizeof(double2), 0)); 147 | cudaEvent_t start, stop; 148 | float time; 149 | checkCudaErrors(cudaEventCreate(&start)); 150 | checkCudaErrors(cudaEventCreate(&stop)); 151 | checkCudaErrors(cudaEventRecord(start, 0)); 152 | checkCuttErrors(cuttExecute(plan, in, out)); 153 | 
checkCudaErrors(cudaEventRecord(stop, 0)); 154 | checkCudaErrors(cudaEventSynchronize(stop)); 155 | checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); 156 | //printf("%.10f ms ", time); 157 | total ++; 158 | sum_time += time; 159 | } 160 | //printf("\n"); 161 | } 162 | fprintf(curr_file, "%f\n", sum_time / total); 163 | checkCudaErrors(cudaFree(in)); 164 | checkCudaErrors(cudaFree(out)); 165 | } 166 | 167 | void process(int numQubits) { 168 | printf("processing qubit number : %d\n", numQubits); 169 | string file_name = string("../evaluator-preprocess/parameter-files/") + to_string(numQubits) + string("qubits.out"); 170 | curr_file = fopen(file_name.c_str(), "w"); 171 | fprintf(curr_file, "%d\n", param_type); 172 | 173 | procPerGateSingle(numQubits); 174 | procPerGateCtr(numQubits); 175 | procBLAS(numQubits); 176 | procCutt(numQubits); 177 | fclose(curr_file); 178 | } 179 | 180 | int main() 181 | { 182 | auto start = chrono::system_clock::now(); 183 | MyGlobalVars::init(); 184 | for(int i = 0; i < DIFF_QUBIT_NUMS; i++) { 185 | process(qubit_nums[i]); 186 | } 187 | auto end = chrono::system_clock::now(); 188 | printf("process time %d ms\n", chrono::duration_cast(end - start).count()); 189 | return 0; 190 | } 191 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "circuit.h" 7 | #include "logger.h" 8 | using namespace std; 9 | const int BUFFER_SIZE = 1000; 10 | char buffer[BUFFER_SIZE]; 11 | 12 | std::vector parse_qid(char buf[]) { 13 | std::vector ret; 14 | int l = strlen(buf); 15 | for (int i = 0; i < l; i++) { 16 | if (buf[i] >= '0' && buf[i] <= '9') { 17 | int j = i, x = 0; 18 | while (buf[j] >= '0' && buf[j] <= '9') { 19 | x = x * 10 + (int)(buf[j] - '0'); 20 | j++; 21 | } 22 | i = j - 1; 23 | ret.push_back(x); 24 | } 25 | } 26 | return ret; 27 | } 28 | 29 
| std::pair> parse_gate(char buf[]) { 30 | qreal pi = acos(-1); 31 | int l = strlen(buf); 32 | std::string name; 33 | int i = 0; 34 | while (i < l) { 35 | if (buf[i] != '(') 36 | name += buf[i]; 37 | else 38 | break; 39 | i++; 40 | } 41 | std::vector params; 42 | while (i < l) { 43 | i++; 44 | std::string st; 45 | while (buf[i] != ',' && buf[i] != ')') { 46 | st += buf[i]; 47 | i++; 48 | } 49 | qreal param = 1; 50 | if (st[0] == 'p' && st[1] == 'i' && st[2] == '*') { 51 | param = pi; 52 | st = st.erase(0, 3); 53 | } else if (st[0] == 'p' && st[1] == 'i' && st[2] == '/') { 54 | param = -pi; 55 | st = st.erase(0, 3); 56 | } 57 | if (param > 0) 58 | param *= std::stod(st); 59 | else 60 | param = pi / std::stod(st); 61 | params.push_back(param); 62 | if (buf[i] == ')') 63 | break; 64 | } 65 | return std::make_pair(name, params); 66 | } 67 | 68 | std::unique_ptr parse_circuit(const std::string &filename) { 69 | FILE* f = nullptr; 70 | if ((f = fopen(filename.c_str(), "r")) == NULL) { 71 | printf("fail to open %s\n", filename.c_str()); 72 | exit(1); 73 | } 74 | int n = -1; 75 | std::unique_ptr c = nullptr; 76 | while (fscanf(f, "%s", buffer) != EOF) { 77 | if (strcmp(buffer, "//") == 0 || strcmp(buffer, "OPENQASM") == 0 || strcmp(buffer, "include") == 0) { 78 | } else if (strcmp(buffer, "qreg") == 0) { 79 | fscanf(f, "%*c%*c%*c%d", &n); 80 | c = std::make_unique(n); 81 | } else if (strcmp(buffer, "cx") == 0) { 82 | fscanf(f, "%s", buffer); 83 | auto qid = parse_qid(buffer); 84 | assert(qid.size() == 2); 85 | c->addGate(Gate::CNOT(qid[0], qid[1])); 86 | // printf("cx %d %d\n", qid[0], qid[1]); 87 | } else if (strcmp(buffer, "ccx") == 0) { 88 | fscanf(f, "%s", buffer); 89 | auto qid = parse_qid(buffer); 90 | assert(qid.size() == 3); 91 | c->addGate(Gate::CCX(qid[0], qid[1], qid[2])); 92 | // printf("ccx %d %d %d\n", qid[0], qid[1], qid[2]); 93 | } else if (strcmp(buffer, "cy") == 0) { 94 | fscanf(f, "%s", buffer); 95 | auto qid = parse_qid(buffer); 96 | assert(qid.size() 
== 2); 97 | c->addGate(Gate::CY(qid[0], qid[1])); 98 | // printf("cy %d %d\n", qid[0], qid[1]); 99 | } else if (strcmp(buffer, "cz") == 0) { 100 | fscanf(f, "%s", buffer); 101 | auto qid = parse_qid(buffer); 102 | assert(qid.size() == 2); 103 | c->addGate(Gate::CZ(qid[0], qid[1])); 104 | // printf("cz %d %d\n", qid[0], qid[1]); 105 | } else if (strcmp(buffer, "h") == 0) { 106 | fscanf(f, "%s", buffer); 107 | auto qid = parse_qid(buffer); 108 | assert(qid.size() == 1); 109 | c->addGate(Gate::H(qid[0])); 110 | // printf("h %d\n", qid[0]); 111 | } else if (strcmp(buffer, "x") == 0) { 112 | fscanf(f, "%s", buffer); 113 | auto qid = parse_qid(buffer); 114 | assert(qid.size() == 1); 115 | c->addGate(Gate::X(qid[0])); 116 | // printf("x %d\n", qid[0]); 117 | } else if (strcmp(buffer, "y") == 0) { 118 | fscanf(f, "%s", buffer); 119 | auto qid = parse_qid(buffer); 120 | assert(qid.size() == 1); 121 | c->addGate(Gate::Y(qid[0])); 122 | // printf("y %d\n", qid[0]); 123 | } else if (strcmp(buffer, "z") == 0) { 124 | fscanf(f, "%s", buffer); 125 | auto qid = parse_qid(buffer); 126 | assert(qid.size() == 1); 127 | c->addGate(Gate::Z(qid[0])); 128 | // printf("z %d\n", qid[0]); 129 | } else if (strcmp(buffer, "s") == 0) { 130 | fscanf(f, "%s", buffer); 131 | auto qid = parse_qid(buffer); 132 | assert(qid.size() == 1); 133 | c->addGate(Gate::S(qid[0])); 134 | // printf("s %d\n", qid[0]); 135 | } else if (strcmp(buffer, "sdg") == 0) { 136 | fscanf(f, "%s", buffer); 137 | auto qid = parse_qid(buffer); 138 | assert(qid.size() == 1); 139 | c->addGate(Gate::SDG(qid[0])); 140 | // printf("s %d\n", qid[0]); 141 | } else if (strcmp(buffer, "t") == 0) { 142 | fscanf(f, "%s", buffer); 143 | auto qid = parse_qid(buffer); 144 | assert(qid.size() == 1); 145 | c->addGate(Gate::T(qid[0])); 146 | // printf("t %d\n", qid[0]); 147 | } else if (strcmp(buffer, "tdg") == 0) { 148 | fscanf(f, "%s", buffer); 149 | auto qid = parse_qid(buffer); 150 | assert(qid.size() == 1); 151 | 
c->addGate(Gate::TDG(qid[0])); 152 | // printf("t %d\n", qid[0]); 153 | } else { 154 | auto gate = parse_gate(buffer); 155 | if (gate.first == "crx") { 156 | assert(gate.second.size() == 1); 157 | fscanf(f, "%s", buffer); 158 | auto qid = parse_qid(buffer); 159 | assert(qid.size() == 2); 160 | c->addGate(Gate::CRX(qid[0], qid[1], gate.second[0])); 161 | // printf("crx %d %d %f\n", qid[0], qid[1], gate.second[0]); 162 | } else if (gate.first == "cry") { 163 | assert(gate.second.size() == 1); 164 | fscanf(f, "%s", buffer); 165 | auto qid = parse_qid(buffer); 166 | assert(qid.size() == 2); 167 | c->addGate(Gate::CRY(qid[0], qid[1], gate.second[0])); 168 | // printf("cry %d %d %f\n", qid[0], qid[1], gate.second[0]); 169 | } else if (gate.first == "crz") { 170 | assert(gate.second.size() == 1); 171 | fscanf(f, "%s", buffer); 172 | auto qid = parse_qid(buffer); 173 | assert(qid.size() == 2); 174 | c->addGate(Gate::CRZ(qid[0], qid[1], gate.second[0])); 175 | // printf("crz %d %d %f\n", qid[0], qid[1], gate.second[0]); 176 | } else if (gate.first == "cu1") { 177 | assert(gate.second.size() == 1); 178 | fscanf(f, "%s", buffer); 179 | auto qid = parse_qid(buffer); 180 | assert(qid.size() == 2); 181 | c->addGate(Gate::CU1(qid[0], qid[1], gate.second[0])); 182 | // printf("cu1 %d %d %f\n", qid[0], qid[1], gate.second[0]); 183 | } else if (gate.first == "u1") { 184 | assert(gate.second.size() == 1); 185 | fscanf(f, "%s", buffer); 186 | auto qid = parse_qid(buffer); 187 | assert(qid.size() == 1); 188 | c->addGate(Gate::U1(qid[0], gate.second[0])); 189 | // printf("u1 %d %f\n", qid[0], gate.second[0]); 190 | } else if (gate.first == "u3") { 191 | assert(gate.second.size() == 3); 192 | fscanf(f, "%s", buffer); 193 | auto qid = parse_qid(buffer); 194 | assert(qid.size() == 1); 195 | c->addGate(Gate::U3(qid[0], gate.second[0], gate.second[1], gate.second[2])); 196 | // printf("u3 %d %f %f %f\n", qid[0], gate.second[0], gate.second[1], gate.second[2]); 197 | } else if (gate.first == 
"rx") { 198 | assert(gate.second.size() == 1); 199 | fscanf(f, "%s", buffer); 200 | auto qid = parse_qid(buffer); 201 | assert(qid.size() == 1); 202 | c->addGate(Gate::RX(qid[0], gate.second[0])); 203 | // printf("rx %d %f\n", qid[0], gate.second[0]); 204 | } else if (gate.first == "ry") { 205 | assert(gate.second.size() == 1); 206 | fscanf(f, "%s", buffer); 207 | auto qid = parse_qid(buffer); 208 | assert(qid.size() == 1); 209 | c->addGate(Gate::RY(qid[0], gate.second[0])); 210 | // printf("ry %d %f\n", qid[0], gate.second[0]); 211 | } else if (gate.first == "rz") { 212 | assert(gate.second.size() == 1); 213 | fscanf(f, "%s", buffer); 214 | auto qid = parse_qid(buffer); 215 | assert(qid.size() == 1); 216 | c->addGate(Gate::RZ(qid[0], gate.second[0])); 217 | // printf("rz %d %f\n", qid[0], gate.second[0]); 218 | } else { 219 | printf("unrecognized token %s\n", buffer); 220 | exit(1); 221 | } 222 | } 223 | fgets(buffer, BUFFER_SIZE, f); 224 | } 225 | fclose(f); 226 | if (c == nullptr) { 227 | printf("fail to load circuit\n"); 228 | exit(1); 229 | } 230 | return std::move(c); 231 | } 232 | 233 | int main(int argc, char* argv[]) { 234 | #if USE_MPI 235 | MyMPI::init(); 236 | #endif 237 | MyGlobalVars::init(); 238 | std::unique_ptr c; 239 | if (argc != 2) { 240 | printf("./parser qasmfile\n"); 241 | exit(1); 242 | } 243 | c = parse_circuit(std::string(argv[1])); 244 | c->compile(); 245 | c->run(); 246 | c->printState(); 247 | Logger::print(); 248 | #if USE_MPI 249 | checkMPIErrors(MPI_Finalize()); 250 | #endif 251 | return 0; 252 | } -------------------------------------------------------------------------------- /micro-benchmark/bench-blas.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "circuit.h" 7 | #include "logger.h" 8 | using namespace std; 9 | 10 | int main(int argc, char* argv[]) { 11 | MyGlobalVars::init(); 12 | int n = 28; 13 | printf("MATSIZE 
%d ", BLAS_MAT_LIMIT); 14 | for (int tt = 0; tt < 5; tt++) { 15 | Circuit c(n); 16 | for (int i = 0; i < 10 * BLAS_MAT_LIMIT; i++) { 17 | c.addGate(Gate::H(i % (BLAS_MAT_LIMIT * 2))); 18 | } 19 | c.compile(); 20 | int time = c.run(false); 21 | printf("%d ", time); 22 | } 23 | printf("\n"); 24 | return 0; 25 | } -------------------------------------------------------------------------------- /micro-benchmark/local-ctr.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "circuit.h" 7 | #include "logger.h" 8 | using namespace std; 9 | 10 | int main(int argc, char* argv[]) { 11 | MyGlobalVars::init(); 12 | int n = 28; 13 | int num_gates = 512; 14 | for (int g = int(GateType::CNOT); g <= int(GateType::CRZ); g++) { 15 | printf("%s\n", Gate::get_name(GateType(g)).c_str()); 16 | for (int i = 0; i < LOCAL_QUBIT_SIZE; i++) { 17 | for (int j = 0; j < LOCAL_QUBIT_SIZE; j++) { 18 | if (i == j) { printf(" "); continue; } 19 | Circuit c(n); 20 | for (int k = 0; k < num_gates; k++) { 21 | c.addGate(Gate::control(i, j, GateType(g))); 22 | } 23 | c.compile(); 24 | int time = c.run(false); 25 | printf("%d ", time); 26 | fflush(stdout); 27 | } 28 | printf("\n"); 29 | } 30 | } 31 | return 0; 32 | } -------------------------------------------------------------------------------- /micro-benchmark/local-single.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "circuit.h" 7 | #include "logger.h" 8 | using namespace std; 9 | 10 | int main(int argc, char* argv[]) { 11 | MyGlobalVars::init(); 12 | int n = 28; 13 | int num_gates = 512; 14 | for (int i = int(GateType::U1); i < int(GateType::TOTAL); i++) { 15 | printf("%s: ", Gate::get_name(GateType(i)).c_str()); 16 | for (int j = 0; j < LOCAL_QUBIT_SIZE; j++) { 17 | Circuit c(n); 18 | for (int k = 0; k < num_gates; k++) { 
19 | c.addGate(Gate::random(j, j + 1, GateType(i))); 20 | } 21 | c.compile(); 22 | int time = c.run(false); 23 | printf("%d ", time); 24 | fflush(stdout); 25 | } 26 | printf("\n"); 27 | } 28 | return 0; 29 | } -------------------------------------------------------------------------------- /micro-benchmark/two-group-h.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "circuit.h" 7 | #include "logger.h" 8 | using namespace std; 9 | 10 | int main(int argc, char* argv[]) { 11 | MyGlobalVars::init(); 12 | for (int i = 6; i < 200; i += 6) { 13 | printf("%d:", i); 14 | for (int tt = 0; tt < 5; tt++) { 15 | Circuit c(28); 16 | for (int j = 0; j < i; j++) 17 | c.addGate(Gate::H(j % 6)); 18 | c.compile(); 19 | int time = c.run(false); 20 | printf("%d ", time); 21 | } 22 | printf("\n"); 23 | } 24 | return 0; 25 | } -------------------------------------------------------------------------------- /scripts/.gitignore: -------------------------------------------------------------------------------- 1 | _check.sh 2 | _run.sh 3 | -------------------------------------------------------------------------------- /scripts/check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | name=../build/logs/`date +%Y%m%d-%H%M%S` 3 | mkdir -p $name 4 | 5 | # command for no_mpi 6 | MPIRUN_CONFIG="" ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on -DUSE_MPI=off -DDISABLE_ASSERT=on -DMAT=7 2>&1 | tee $name/std.out 7 | 8 | # command for mpi 9 | MPIRUN_CONFIG="`which mpirun` -x GPUPerRank=2 -host nico3:2 ../scripts/env.sh ../scripts/gpu-bind.sh" 10 | MPIRUN_CONFIG=$MPIRUN_CONFIG ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on -DUSE_MPI=on -DDISABLE_ASSERT=on 
-DMAT=7 -DUSE_MPI=on 2>&1 | tee $name/std.out 11 | -------------------------------------------------------------------------------- /scripts/check_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | source init.sh ${@: 2} 4 | input_dir=../tests/input 5 | std_dir=../tests/output 6 | 7 | for test in ${tests[*]}; do 8 | $MPIRUN_CONFIG ./main $input_dir/$test.qasm > $1/$test.log 9 | grep "Logger" $1/$test.log 10 | done 11 | 12 | set +x 13 | set +e 14 | 15 | for test in ${tests[*]}; do 16 | line=`cat $std_dir/$test.log | wc -l` 17 | echo $test 18 | grep -Ev "Logger|CLUSTER" $1/$test.log > tmp.log 19 | diff -q -B $std_dir/$test.log tmp.log || true 20 | done 21 | 22 | grep -Er "Logger:.*Time" $1/*.log 23 | -------------------------------------------------------------------------------- /scripts/coalescing.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c0 5 | mkdir -p $name 6 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -D COALESCE=0 2>&1 | tee $name/std.out 7 | name1=$name 8 | 9 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c1 10 | mkdir -p $name 11 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=1 2>&1 | tee $name/std.out 12 | name2=$name 13 | 14 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c2 15 | mkdir -p $name 16 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=2 2>&1 | tee $name/std.out 17 | name3=$name 18 | 19 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c3 20 | mkdir -p $name 21 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=3 2>&1 | tee $name/std.out 22 | name4=$name 23 | 24 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c4 25 | mkdir -p 
$name 26 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=4 2>&1 | tee $name/std.out 27 | name5=$name 28 | 29 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c5 30 | mkdir -p $name 31 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=5 2>&1 | tee $name/std.out 32 | name6=$name 33 | 34 | tail -n 9 $name1/std.out 35 | tail -n 9 $name2/std.out 36 | tail -n 9 $name3/std.out 37 | tail -n 9 $name4/std.out 38 | tail -n 9 $name5/std.out 39 | tail -n 9 $name6/std.out 40 | -------------------------------------------------------------------------------- /scripts/compare.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import numpy as np 4 | cases = ['adder_26', 'basis_change_28', 'bv_28', 'hidden_shift_28', 'ising_25', 'qaoa_28', 'qft_28', 'quantum_volume_28', 'supremacy_28'] 5 | std_dir = sys.argv[1] 6 | my_dir = sys.argv[2] 7 | 8 | for case in cases: 9 | std = [] 10 | with open(os.path.join(std_dir, case + '.log')) as f: 11 | for s in f.readlines(): 12 | a, b = s.strip().split()[2:4] 13 | std.append([float(a), float(b)]) 14 | std = np.array(std) 15 | std[np.abs(std) < 1e-10] = 0 16 | 17 | my = [] 18 | with open(os.path.join(my_dir, case + '.log')) as f: 19 | for s in f.readlines(): 20 | if s.startswith('Logger'): 21 | continue 22 | a, b = s.strip().split()[2:4] 23 | my.append([float(a), float(b)]) 24 | my = np.array(my) 25 | my[np.abs(my) < 1e-10] = 0 26 | if (std.shape != my.shape): 27 | print("[{}]".format(case), "shape not match") 28 | continue 29 | err = np.abs(std-my) 30 | rela = np.abs(std - my) / (np.maximum(np.abs(std), np.abs(my)) + 1e-10) 31 | print("[{}]".format(case), 32 | "err:", np.max(err), np.argmax(err), 33 | "rela:", np.max(rela), np.argmax(rela)) -------------------------------------------------------------------------------- /scripts/env.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | case $(hostname -s) in 3 | nico*) 4 | echo "[CLUSTER] nico" 5 | source /opt/spack/share/spack/setup-env.sh 6 | spack load cuda@10.2.89 /v5oqq5n 7 | spack load openmpi@4.0.5 /h5eun6a 8 | export NCCL_ROOT=/home/heheda/tools/nccl/build 9 | export CPATH=$NCCL_ROOT/include:$CPATH 10 | export LIBRARY_PATH=$NCCL_ROOT/lib:$LIBRARY_PATH 11 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib:$LD_LIBRARY_PATH 12 | ;; 13 | gorgon*) 14 | echo "[CLUSTER] gorgon" 15 | source /usr/local/Modules/init/bash 16 | module load cuda-10.2/cuda 17 | module load cmake-3.12.3 18 | module load openmpi-3.0.0 19 | ;; 20 | i*) 21 | echo "[CLUSTER] scc" 22 | source /opt/spack/share/spack/setup-env.sh 23 | spack load cuda@10.2.89 /tlfcinz 24 | spack load openmpi@3.1.6 /5aaect6 25 | ;; 26 | hanzo) 27 | echo "[CLUSTER] hanzo" 28 | source /opt/spack/share/spack/setup-env.sh 29 | export PATH=$HOME/package/cmake-3.19.2-Linux-x86_64/bin:/usr/mpi/gcc/openmpi-4.1.0rc5/bin:$PATH 30 | # use system mpi 31 | export CPATH=/usr/mpi/gcc/openmpi-4.1.0rc5/include:${CPATH-} 32 | spack load gcc@8.3.0 /liymwyb 33 | spack load cuda@10.2.89 /tlfcinz 34 | ;; 35 | nova) 36 | echo "[CLUSTER] nova" 37 | source /opt/spack/share/spack/setup-env.sh 38 | spack load cuda@11 /njgeoec 39 | spack load openmpi /dfes7hw 40 | esac 41 | 42 | $@ -------------------------------------------------------------------------------- /scripts/gen_stdout.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source init.sh -DBACKEND=1 -DSHOW_SUMMARY=off 3 | for test in ${tests[*]}; do 4 | echo $test 5 | ./main ../tests/input/$test.qasm > ../tests/output/$test.log 6 | done 7 | -------------------------------------------------------------------------------- /scripts/gpu-bind.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rank=$OMPI_COMM_WORLD_LOCAL_RANK 
3 | GPU_start=$(( $rank * $GPUPerRank )) 4 | GPU_end=$(( ($rank + 1) * $GPUPerRank - 1 )) 5 | GPU=`echo $(for i in $(seq $GPU_start $GPU_end); do printf "$i,"; done)` 6 | CUDA_VISIBLE_DEVICES=$GPU $@ -------------------------------------------------------------------------------- /scripts/init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -u 3 | set -e 4 | 5 | source env.sh "" 6 | 7 | mkdir -p $HYQUAS_ROOT/build 8 | cd $HYQUAS_ROOT/build 9 | rm CMakeCache.txt || true 10 | cmake $* .. 11 | make clean 12 | make -j 13 | 14 | if [ -z "${tests-}" ] 15 | then 16 | export tests_25="basis_change_25 bv_25 hidden_shift_25 qaoa_25 qft_25 quantum_volume_25 supremacy_25" 17 | export tests_28="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28" 18 | export tests_30="basis_change_30 bv_30 hidden_shift_30 qaoa_30 qft_30 quantum_volume_30 supremacy_30" 19 | export tests_scale="basis_change_24 basis_change_25 basis_change_26 basis_change_27 basis_change_28" 20 | 21 | export tests=($tests_28) 22 | fi 23 | -------------------------------------------------------------------------------- /scripts/run-multi-GPU.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=on -DMICRO_BENCH=on -DUSE_DOUBLE=on -DDISABLE_ASSERT=off -DENABLE_OVERLAP=on -DMEASURE_STAGE=off -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMAT=7 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/supremacy_28.qasm 4 | -------------------------------------------------------------------------------- /scripts/run-multi-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=on -DMICRO_BENCH=on -DUSE_DOUBLE=on -DDISABLE_ASSERT=off -DENABLE_OVERLAP=on -DMEASURE_STAGE=off 
-DEVALUATOR_PREPROCESS=on -DUSE_MPI=on -DMAT=7 3 | `which mpirun` -host nico3:2 -x GPUPerRank=2 ../scripts/env.sh ../scripts/gpu-bind.sh ./main ../tests/input/qft_28.qasm 4 | -------------------------------------------------------------------------------- /scripts/run-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=on -DMICRO_BENCH=on -DUSE_DOUBLE=on -DDISABLE_ASSERT=off -DENABLE_OVERLAP=on -DMEASURE_STAGE=off -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMAT=7 3 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/supremacy_28.qasm 4 | -------------------------------------------------------------------------------- /scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=on -DMICRO_BENCH=on -DUSE_DOUBLE=on -DDISABLE_ASSERT=off -DENABLE_OVERLAP=on -DMEASURE_STAGE=off -DEVALUATOR_PREPROCESS=on -DUSE_MPI=on -DMAT=7 3 | `which mpirun` -host nico1:2,nico2:2 -x GPUPerRank=4 ../scripts/env.sh ../scripts/gpu-bind.sh ./main ../tests/input/qft_28.qasm 4 | # CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/qft_28.qasm -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(QCSimulator) 2 | aux_source_directory(. 
SRC_CXX) 3 | 4 | cuda_add_library(QCSimulator ${SRC_CXX}) 5 | -------------------------------------------------------------------------------- /src/circuit.cpp: -------------------------------------------------------------------------------- 1 | #include "circuit.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "utils.h" 10 | #include "kernel.h" 11 | #include "compiler.h" 12 | #include "logger.h" 13 | #include "executor.h" 14 | using namespace std; 15 | 16 | int Circuit::run(bool copy_back, bool destroy) { 17 | kernelInit(deviceStateVec, numQubits); 18 | for (int i = 0; i < MyGlobalVars::localGPUs; i++) { 19 | checkCudaErrors(cudaSetDevice(i)); 20 | checkCudaErrors(cudaProfilerStart()); 21 | } 22 | auto start = chrono::system_clock::now(); 23 | #if BACKEND == 0 24 | kernelExecSimple(deviceStateVec[0], numQubits, gates); 25 | #elif BACKEND == 1 || BACKEND == 3 || BACKEND == 4 || BACKEND == 5 26 | Executor(deviceStateVec, numQubits, schedule).run(); 27 | #elif BACKEND == 2 28 | gates.clear(); 29 | for (size_t lgID = 0; lgID < schedule.localGroups.size(); lgID++) { 30 | auto& lg = schedule.localGroups[lgID]; 31 | for (size_t ggID = 0; ggID < lg.overlapGroups.size(); ggID++) { 32 | auto& gg = lg.overlapGroups[ggID]; 33 | for (auto& g: gg.gates) 34 | gates.push_back(g); 35 | } 36 | // if (lgID == 2) break; 37 | for (size_t ggID = 0; ggID < lg.fullGroups.size(); ggID++) { 38 | auto& gg = lg.fullGroups[ggID]; 39 | for (auto& g: gg.gates) 40 | gates.push_back(g); 41 | } 42 | } 43 | schedule.finalState = State(numQubits); 44 | kernelExecSimple(deviceStateVec[0], numQubits, gates); 45 | #endif 46 | auto end = chrono::system_clock::now(); 47 | for (int i = 0; i < MyGlobalVars::localGPUs; i++) { 48 | checkCudaErrors(cudaSetDevice(i)); 49 | checkCudaErrors(cudaProfilerStop()); 50 | } 51 | auto duration = chrono::duration_cast(end - start); 52 | Logger::add("Time Cost: %d us", int(duration.count())); 53 | 54 | if (copy_back) { 55 
| result.resize(1ll << numQubits); // very slow ... 56 | #if BACKEND == 0 || BACKEND == 2 57 | kernelDeviceToHost((qComplex*)result.data(), deviceStateVec[0], numQubits); 58 | #else 59 | qindex elements = 1ll << (numQubits - MyGlobalVars::bit); 60 | for (int g = 0; g < MyGlobalVars::localGPUs; g++) { 61 | kernelDeviceToHost((qComplex*)result.data() + elements * g, deviceStateVec[g], numQubits - MyGlobalVars::bit); 62 | } 63 | #endif 64 | } 65 | if (destroy) { 66 | for (int g = 0; g < MyGlobalVars::localGPUs; g++) { 67 | kernelDestroy(deviceStateVec[g]); 68 | } 69 | } 70 | return duration.count(); 71 | } 72 | 73 | void Circuit::dumpGates() { 74 | int totalGates = gates.size(); 75 | printf("total Gates: %d\n", totalGates); 76 | int L = 3; 77 | for (const Gate& gate: gates) { 78 | for (int i = 0; i < numQubits; i++) { 79 | if (i == gate.controlQubit) { 80 | printf("."); 81 | for (int j = 1; j < L; j++) printf(" "); 82 | } else if (i == gate.targetQubit) { 83 | printf("%s", gate.name.c_str()); 84 | for (int j = gate.name.length(); j < L; j++) 85 | printf(" "); 86 | } else { 87 | printf("|"); 88 | for (int j = 1; j < L; j++) printf(" "); 89 | } 90 | } 91 | printf("\n"); 92 | } 93 | } 94 | 95 | qindex Circuit::toPhysicalID(qindex idx) { 96 | qindex id = 0; 97 | auto& pos = schedule.finalState.pos; 98 | for (int i = 0; i < numQubits; i++) { 99 | if (idx >> i & 1) 100 | id |= qindex(1) << pos[i]; 101 | } 102 | return id; 103 | } 104 | 105 | qindex Circuit::toLogicID(qindex idx) { 106 | qindex id = 0; 107 | auto& pos = schedule.finalState.pos; 108 | for (int i = 0; i < numQubits; i++) { 109 | if (idx >> pos[i] & 1) 110 | id |= qindex(1) << i; 111 | } 112 | return id; 113 | } 114 | 115 | ResultItem Circuit::ampAt(qindex idx) { 116 | qindex id = toPhysicalID(idx); 117 | return ResultItem(idx, make_qComplex(result[id].x, result[id].y)); 118 | } 119 | 120 | qComplex Circuit::ampAtGPU(qindex idx) { 121 | qindex id = toPhysicalID(idx); 122 | qComplex ret; 123 | #if USE_MPI 124 | 
qindex localAmps = (1ll << numQubits) / MyMPI::commSize; 125 | qindex rankID = id / localAmps; 126 | 127 | if (!USE_MPI || MyMPI::rank == rankID) { 128 | int localID = id % localAmps; 129 | #else 130 | int localID = id; 131 | #endif 132 | qindex localGPUAmp = (1ll << numQubits) / MyGlobalVars::numGPUs; 133 | int gpuID = localID / localGPUAmp; 134 | qindex localGPUID = localID % localGPUAmp; 135 | checkCudaErrors(cudaSetDevice(gpuID)); 136 | ret = kernelGetAmp(deviceStateVec[gpuID], localGPUID); 137 | #if USE_MPI 138 | } 139 | MPI_Bcast(&ret, 1, MPI_Complex, rankID, MPI_COMM_WORLD); 140 | #endif 141 | return ret; 142 | } 143 | 144 | bool Circuit::localAmpAt(qindex idx, ResultItem& item) { 145 | qindex localAmps = (1ll << numQubits) / MyMPI::commSize; 146 | qindex id = toPhysicalID(idx); 147 | if (id / localAmps == MyMPI::rank) { 148 | // printf("%d belongs to rank %d\n", idx, MyMPI::rank); 149 | qindex localID = id % localAmps; 150 | item = ResultItem(idx, make_qComplex(result[localID].x, result[localID].y)); 151 | return true; 152 | } 153 | return false; 154 | } 155 | 156 | void Circuit::masterCompile() { 157 | Logger::add("Total Gates %d", int(gates.size())); 158 | #if BACKEND == 1 || BACKEND == 2 || BACKEND == 3 || BACKEND == 4 || BACKEND == 5 159 | Compiler compiler(numQubits, gates); 160 | schedule = compiler.run(); 161 | int totalGroups = 0; 162 | for (auto& lg: schedule.localGroups) totalGroups += lg.fullGroups.size(); 163 | int fullGates = 0, overlapGates = 0; 164 | for (auto& lg: schedule.localGroups) { 165 | for (auto& gg: lg.fullGroups) fullGates += gg.gates.size(); 166 | for (auto& gg: lg.overlapGroups) overlapGates += gg.gates.size(); 167 | } 168 | Logger::add("Total Groups: %d %d %d %d", int(schedule.localGroups.size()), totalGroups, fullGates, overlapGates); 169 | #ifdef SHOW_SCHEDULE 170 | schedule.dump(numQubits); 171 | #endif 172 | #else 173 | schedule.finalState = State(numQubits); 174 | #endif 175 | } 176 | 177 | void Circuit::compile() { 178 | 
auto start = chrono::system_clock::now(); 179 | #if USE_MPI 180 | if (MyMPI::rank == 0) { 181 | masterCompile(); 182 | auto s = schedule.serialize(); 183 | int bufferSize = (int) s.size(); 184 | checkMPIErrors(MPI_Bcast(&bufferSize, 1, MPI_INT, 0, MPI_COMM_WORLD)); 185 | checkMPIErrors(MPI_Bcast(s.data(), bufferSize, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD)); 186 | int cur = 0; 187 | // schedule = Schedule::deserialize(s.data(), cur); 188 | } else { 189 | int bufferSize; 190 | checkMPIErrors(MPI_Bcast(&bufferSize, 1, MPI_INT, 0, MPI_COMM_WORLD)); 191 | unsigned char* buffer = new unsigned char [bufferSize]; 192 | checkMPIErrors(MPI_Bcast(buffer, bufferSize, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD)); 193 | int cur = 0; 194 | schedule = Schedule::deserialize(buffer, cur); 195 | delete[] buffer; 196 | fflush(stdout); 197 | } 198 | #else 199 | masterCompile(); 200 | #endif 201 | auto mid = chrono::system_clock::now(); 202 | schedule.initCuttPlans(numQubits - MyGlobalVars::bit); 203 | #ifndef OVERLAP_MAT 204 | schedule.initMatrix(numQubits); 205 | #endif 206 | auto end = chrono::system_clock::now(); 207 | auto duration1 = chrono::duration_cast(mid - start); 208 | auto duration2 = chrono::duration_cast(end - mid); 209 | Logger::add("Compile Time: %d us + %d us = %d us", int(duration1.count()), int(duration2.count()), int(duration1.count()) + int(duration2.count())); 210 | } 211 | 212 | #if USE_MPI 213 | void Circuit::gatherAndPrint(const std::vector& results) { 214 | if (MyMPI::rank == 0) { 215 | int size = results.size(); 216 | int sizes[MyMPI::commSize]; 217 | MPI_Gather(&size, 1, MPI_INT, sizes, 1, MPI_INT, 0, MPI_COMM_WORLD); 218 | int disp[MyMPI::commSize + 1]; 219 | disp[0] = 0; 220 | for (int i = 0; i < MyMPI::commSize; i++) 221 | disp[i + 1] = disp[i] + sizes[i]; 222 | int totalItem = disp[MyMPI::commSize]; 223 | ResultItem* collected = new ResultItem[totalItem]; 224 | for (int i = 0; i < MyMPI::commSize; i++) 225 | sizes[i] *= sizeof(ResultItem); 226 | for (int i = 0; 
i < MyMPI::commSize; i++) 227 | disp[i] *= sizeof(ResultItem); 228 | MPI_Gatherv( 229 | results.data(), results.size() * sizeof(ResultItem), MPI_UNSIGNED_CHAR, 230 | collected, sizes, disp, 231 | MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD 232 | ); 233 | sort(collected, collected + totalItem); 234 | for (int i = 0; i < totalItem; i++) 235 | collected[i].print(); 236 | delete[] collected; 237 | } else { 238 | int size = results.size(); 239 | MPI_Gather(&size, 1, MPI_INT, nullptr, 1, MPI_INT, 0, MPI_COMM_WORLD); 240 | MPI_Gatherv( 241 | results.data(), results.size() * sizeof(ResultItem), MPI_UNSIGNED_CHAR, 242 | nullptr, nullptr, nullptr, 243 | MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD 244 | ); 245 | } 246 | } 247 | #endif 248 | 249 | 250 | void Circuit::printState() { 251 | #if USE_MPI 252 | std::vector results; 253 | ResultItem item; 254 | for (int i = 0; i < 128; i++) { 255 | if (localAmpAt(i, item)) { 256 | results.push_back(item); 257 | } 258 | } 259 | gatherAndPrint(results); 260 | #ifdef SHOW_SCHEDULE 261 | results.clear(); 262 | for (int i = 0; i < numQubits; i++) { 263 | if (localAmpAt(1ll << i, item)) { 264 | results.push_back(item); 265 | } 266 | } 267 | if (localAmpAt((1ll << numQubits) - 1, item)) { 268 | results.push_back(item); 269 | } 270 | gatherAndPrint(results); 271 | #endif 272 | results.clear(); 273 | int numLocalAmps = (1ll << numQubits) / MyMPI::commSize; 274 | for (qindex i = 0; i < numLocalAmps; i++) { 275 | if (result[i].x * result[i].x + result[i].y * result[i].y > 0.001) { 276 | qindex logicID = toLogicID(i + numLocalAmps * MyMPI::rank); 277 | if (logicID >= 128) { 278 | // printf("large amp %d belongs to %d\n", logicID, MyMPI::rank); 279 | results.push_back(ResultItem(logicID, result[i])); 280 | } 281 | } 282 | } 283 | gatherAndPrint(results); 284 | #else 285 | std::vector results; 286 | for (int i = 0; i < 128; i++) { 287 | results.push_back(ampAt(i)); 288 | } 289 | #ifdef SHOW_SCHEDULE 290 | for (int i = 0; i < numQubits; i++) { 291 | 
results.push_back(ampAt(1ll << i)); 292 | } 293 | results.push_back(ampAt((1ll << numQubits) - 1)); 294 | #endif 295 | for (auto& item: results) 296 | item.print(); 297 | results.clear(); 298 | for (qindex i = 0; i < (1ll << numQubits); i++) { 299 | if (result[i].x * result[i].x + result[i].y * result[i].y > 0.001) { 300 | qindex logicID = toLogicID(i); 301 | if (logicID >= 128) { 302 | results.push_back(ResultItem(toLogicID(i), result[i])); 303 | } 304 | } 305 | } 306 | sort(results.begin(), results.end()); 307 | for (auto& item: results) 308 | item.print(); 309 | #endif 310 | } -------------------------------------------------------------------------------- /src/circuit.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "utils.h" 6 | #include "gate.h" 7 | #include "schedule.h" 8 | 9 | struct ResultItem { 10 | ResultItem() = default; 11 | ResultItem(const qindex& idx, const qComplex& amp): idx(idx), amp(amp) {} 12 | qindex idx; 13 | qComplex amp; 14 | void print() { 15 | printf("%lld %.12f: %.12f %.12f\n", idx, amp.x * amp.x + amp.y * amp.y, zero_wrapper(amp.x), zero_wrapper(amp.y)); 16 | } 17 | bool operator < (const ResultItem& b) { return idx < b.idx; } 18 | }; 19 | 20 | class Circuit { 21 | public: 22 | Circuit(int numQubits): numQubits(numQubits) {} 23 | void compile(); 24 | int run(bool copy_back = true, bool destroy = true); 25 | void addGate(const Gate& gate) { 26 | gates.push_back(gate); 27 | } 28 | void dumpGates(); 29 | void printState(); 30 | ResultItem ampAt(qindex idx); 31 | qComplex ampAtGPU(qindex idx); 32 | bool localAmpAt(qindex idx, ResultItem& item); 33 | const int numQubits; 34 | 35 | private: 36 | qindex toPhysicalID(qindex idx); 37 | qindex toLogicID(qindex idx); 38 | void masterCompile(); 39 | #if USE_MPI 40 | void gatherAndPrint(const std::vector& results); 41 | #endif 42 | std::vector gates; 43 | std::vector deviceStateVec; 44 | std::vector> 
deviceMats; 45 | Schedule schedule; 46 | std::vector result; 47 | }; -------------------------------------------------------------------------------- /src/compiler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "schedule.h" 6 | #include "utils.h" 7 | #include "gate.h" 8 | 9 | class Compiler { 10 | public: 11 | Compiler(int numQubits, std::vector inputGates); 12 | Schedule run(); 13 | private: 14 | void fillLocals(LocalGroup& lg); 15 | std::vector, qindex>> moveToNext(LocalGroup& lg); 16 | int numQubits; 17 | int localSize; 18 | int shareSize; 19 | bool enableGlobal; 20 | std::vector gates; 21 | }; 22 | 23 | template 24 | class OneLayerCompiler { 25 | public: 26 | OneLayerCompiler(int numQubits, const std::vector& inputGates); 27 | protected: 28 | int numQubits; 29 | std::vector remainGates; 30 | std::vector getGroupOpt(bool full[], qindex related[], bool enableGlobal, int localSize, qindex localQubits); 31 | void removeGatesOpt(const std::vector& remove); 32 | std::set remain; 33 | }; 34 | 35 | class SimpleCompiler: public OneLayerCompiler<2048> { 36 | public: 37 | SimpleCompiler(int numQubits, int localSize, qindex localQubits, const std::vector& inputGates, bool enableGlobal, qindex whiteList = 0, qindex required = 0); 38 | LocalGroup run(); 39 | private: 40 | int localSize; 41 | qindex localQubits; 42 | bool enableGlobal; 43 | qindex whiteList; 44 | qindex required; 45 | }; 46 | 47 | class AdvanceCompiler: public OneLayerCompiler<512> { 48 | public: 49 | AdvanceCompiler(int numQubits, qindex localQubits, qindex blasForbid, std::vector inputGates); 50 | LocalGroup run(State &state, bool usePerGate, bool useBLAS, int preGateSize, int blasSize, int cuttSize); 51 | private: 52 | qindex localQubits; 53 | qindex blasForbid; 54 | }; 55 | 56 | class ChunkCompiler: public OneLayerCompiler<512> { 57 | public: 58 | ChunkCompiler(int numQubits, int localSize, int 
chunkSize, const std::vector &inputGates); 59 | LocalGroup run(); 60 | private: 61 | int localSize, chunkSize; 62 | }; -------------------------------------------------------------------------------- /src/evaluator.cpp: -------------------------------------------------------------------------------- 1 | #include "evaluator.h" 2 | 3 | Evaluator* Evaluator::instance_ptr = nullptr; 4 | 5 | Evaluator::Evaluator() { 6 | memset(num_qbits_loaded_param, 0, sizeof(num_qbits_loaded_param)); 7 | #ifndef USE_EVALUATOR_PREPROCESS 8 | num_qbits_loaded_param[28] = true; 9 | memcpy(pergate_single_perf[28][int(GateType::U1)], V100_U1, sizeof(double) * LOCAL_QUBIT_SIZE); 10 | memcpy(pergate_single_perf[28][int(GateType::U2)], V100_U2, sizeof(double) * LOCAL_QUBIT_SIZE); 11 | memcpy(pergate_single_perf[28][int(GateType::U3)], V100_U3, sizeof(double) * LOCAL_QUBIT_SIZE); 12 | memcpy(pergate_single_perf[28][int(GateType::H )], V100_H , sizeof(double) * LOCAL_QUBIT_SIZE); 13 | memcpy(pergate_single_perf[28][int(GateType::X )], V100_X , sizeof(double) * LOCAL_QUBIT_SIZE); 14 | memcpy(pergate_single_perf[28][int(GateType::Y )], V100_Y , sizeof(double) * LOCAL_QUBIT_SIZE); 15 | memcpy(pergate_single_perf[28][int(GateType::Z )], V100_Z , sizeof(double) * LOCAL_QUBIT_SIZE); 16 | memcpy(pergate_single_perf[28][int(GateType::S )], V100_S , sizeof(double) * LOCAL_QUBIT_SIZE); 17 | memcpy(pergate_single_perf[28][int(GateType::SDG )], V100_SDG , sizeof(double) * LOCAL_QUBIT_SIZE); 18 | memcpy(pergate_single_perf[28][int(GateType::T )], V100_T , sizeof(double) * LOCAL_QUBIT_SIZE); 19 | memcpy(pergate_single_perf[28][int(GateType::TDG )], V100_TDG , sizeof(double) * LOCAL_QUBIT_SIZE); 20 | memcpy(pergate_single_perf[28][int(GateType::RX)], V100_RX, sizeof(double) * LOCAL_QUBIT_SIZE); 21 | memcpy(pergate_single_perf[28][int(GateType::RY)], V100_RY, sizeof(double) * LOCAL_QUBIT_SIZE); 22 | memcpy(pergate_single_perf[28][int(GateType::RZ)], V100_RZ, sizeof(double) * LOCAL_QUBIT_SIZE); 23 | 24 | 
memcpy(pergate_ctr_perf[28][int(GateType::CNOT)], V100_CN , sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE); 25 | memcpy(pergate_ctr_perf[28][int(GateType::CY )], V100_CY , sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE); 26 | memcpy(pergate_ctr_perf[28][int(GateType::CZ )], V100_CZ , sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE); 27 | memcpy(pergate_ctr_perf[28][int(GateType::CRX )], V100_CRX, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE); 28 | memcpy(pergate_ctr_perf[28][int(GateType::CRY )], V100_CRY, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE); 29 | memcpy(pergate_ctr_perf[28][int(GateType::CU1 )], V100_CU1, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE); 30 | memcpy(pergate_ctr_perf[28][int(GateType::CRZ )], V100_CRZ, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE); 31 | 32 | BLAS_perf[28][6] = 23.068396; 33 | cutt_cost[28] = 11.367814; 34 | #endif 35 | } 36 | 37 | void Evaluator::loadPergateSingle(int numQubits, FILE* qbit_param, GateType gate_type) { 38 | if(param_type == CALC_ALL_PARAM) { 39 | for(int i = 0; i < LOCAL_QUBIT_SIZE; i++) { 40 | fscanf(qbit_param, "%lf", &pergate_single_perf[numQubits][int(gate_type)][i]); 41 | } 42 | } 43 | else { 44 | fscanf(qbit_param, "%lf", &pergate_single_perf[numQubits][int(gate_type)][1]); 45 | } 46 | } 47 | 48 | void Evaluator::loadPergateCtr(int numQubits, FILE* qbit_param, GateType gate_type) { 49 | if(param_type == CALC_ALL_PARAM) { 50 | for(int i = 0; i < LOCAL_QUBIT_SIZE; i++) 51 | for(int j = 0; j < LOCAL_QUBIT_SIZE; j++) { 52 | fscanf(qbit_param, "%lf", &pergate_ctr_perf[numQubits][int(gate_type)][i][j]); 53 | } 54 | } 55 | else { 56 | fscanf(qbit_param, "%lf", &pergate_ctr_perf[numQubits][int(gate_type)][0][2]); 57 | } 58 | } 59 | 60 | void Evaluator::loadParam(int numQubits) { 61 | if(num_qbits_loaded_param[numQubits]) 62 | return; 63 | #ifdef USE_EVALUATOR_PREPROCESS 64 | FILE* qbit_param; 65 | std::string param_file_name = 
std::string("../evaluator-preprocess/parameter-files/") 66 | + std::to_string(numQubits) + std::string("qubits.out"); 67 | if((qbit_param = fopen(param_file_name.c_str(), "r"))) { 68 | fscanf(qbit_param, "%d", ¶m_type); 69 | 70 | loadPergateSingle(numQubits, qbit_param, GateType::U1); 71 | loadPergateSingle(numQubits, qbit_param, GateType::U2); 72 | loadPergateSingle(numQubits, qbit_param, GateType::U3); 73 | loadPergateSingle(numQubits, qbit_param, GateType::H ); 74 | loadPergateSingle(numQubits, qbit_param, GateType::X ); 75 | loadPergateSingle(numQubits, qbit_param, GateType::Y ); 76 | loadPergateSingle(numQubits, qbit_param, GateType::Z ); 77 | loadPergateSingle(numQubits, qbit_param, GateType::S ); 78 | loadPergateSingle(numQubits, qbit_param, GateType::SDG); 79 | loadPergateSingle(numQubits, qbit_param, GateType::T ); 80 | loadPergateSingle(numQubits, qbit_param, GateType::TDG); 81 | loadPergateSingle(numQubits, qbit_param, GateType::RX); 82 | loadPergateSingle(numQubits, qbit_param, GateType::RY); 83 | loadPergateSingle(numQubits, qbit_param, GateType::RZ); 84 | 85 | loadPergateCtr(numQubits, qbit_param, GateType::CNOT); 86 | loadPergateCtr(numQubits, qbit_param, GateType::CY ); 87 | loadPergateCtr(numQubits, qbit_param, GateType::CZ ); 88 | loadPergateCtr(numQubits, qbit_param, GateType::CRX ); 89 | loadPergateCtr(numQubits, qbit_param, GateType::CRY ); 90 | loadPergateCtr(numQubits, qbit_param, GateType::CU1 ); 91 | loadPergateCtr(numQubits, qbit_param, GateType::CRZ ); 92 | 93 | for (int K = 1, i = 0; K < 1024; K <<= 1, i++) { 94 | fscanf(qbit_param, "%*d%lf", &BLAS_perf[numQubits][i]); 95 | } 96 | fscanf(qbit_param, "%lf", &cutt_cost[numQubits]); 97 | fclose(qbit_param); 98 | } else { 99 | printf("Parameter file not find for qubit number %d\n", numQubits); 100 | fflush(stdout); 101 | exit(1); 102 | } 103 | num_qbits_loaded_param[numQubits] = true; 104 | #else 105 | printf("Use option USE_EVALUATOR_PREPROCESS for non-default qubit number %d\n", 
numQubits); 106 | fflush(stdout); 107 | exit(1); 108 | #endif 109 | } 110 | 111 | double Evaluator::perfPerGate(int numQubits, const GateGroup* gg) { 112 | double tim_pred = 0; 113 | loadParam(numQubits); 114 | for(auto gate : (gg -> gates)) { 115 | switch(gate.type) { 116 | case GateType::CCX : 117 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break; 118 | case GateType::CNOT : 119 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break; 120 | case GateType::CY : 121 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CY)][0][2]; break; 122 | case GateType::CZ : 123 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CZ)][0][2]; break; 124 | case GateType::CRX : 125 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRX)][0][2]; break; 126 | case GateType::CRY : 127 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRY)][0][2]; break; 128 | case GateType::CU1 : 129 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CU1)][0][2]; break; 130 | case GateType::CRZ : 131 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRZ)][0][2]; break; 132 | case GateType::U1 : 133 | tim_pred += pergate_single_perf[numQubits][int(GateType::U1)][1]; break; 134 | case GateType::U2 : 135 | tim_pred += pergate_single_perf[numQubits][int(GateType::U2)][1]; break; 136 | case GateType::U3 : 137 | tim_pred += pergate_single_perf[numQubits][int(GateType::U3)][1]; break; 138 | case GateType::H : 139 | tim_pred += pergate_single_perf[numQubits][int(GateType::H)][1]; break; 140 | case GateType::X : 141 | tim_pred += pergate_single_perf[numQubits][int(GateType::X)][1]; break; 142 | case GateType::Y : 143 | tim_pred += pergate_single_perf[numQubits][int(GateType::Y)][1]; break; 144 | case GateType::Z : 145 | tim_pred += pergate_single_perf[numQubits][int(GateType::Z)][1]; break; 146 | case GateType::S : 147 | tim_pred += pergate_single_perf[numQubits][int(GateType::S)][1]; break; 148 | case GateType::SDG : 149 | tim_pred += 
pergate_single_perf[numQubits][int(GateType::SDG)][1]; break; 150 | case GateType::T : 151 | tim_pred += pergate_single_perf[numQubits][int(GateType::T)][1]; break; 152 | case GateType::TDG : 153 | tim_pred += pergate_single_perf[numQubits][int(GateType::TDG)][1]; break; 154 | case GateType::RX : 155 | tim_pred += pergate_single_perf[numQubits][int(GateType::RX)][1]; break; 156 | case GateType::RY : 157 | tim_pred += pergate_single_perf[numQubits][int(GateType::RY)][1]; break; 158 | case GateType::RZ : 159 | tim_pred += pergate_single_perf[numQubits][int(GateType::RZ)][1]; break; 160 | default: 161 | printf("meet wrong gate : %s\n", Gate::get_name(gate.type).c_str()); 162 | UNREACHABLE() 163 | } 164 | } 165 | return tim_pred / 1000 / 512 + pergate_group_overhead * double(1ll << numQubits); // 64-bit shift: numQubits may go up to MAX_QBITS (40), which overflows a 32-bit int 166 | } 167 | 168 | double Evaluator::perfPerGate(int numQubits, const std::vector& types) { 169 | double tim_pred = 0; 170 | loadParam(numQubits); 171 | for(auto ty : types) { 172 | switch(ty) { 173 | case GateType::CCX : 174 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break; 175 | case GateType::CNOT : 176 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break; 177 | case GateType::CY : 178 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CY)][0][2]; break; 179 | case GateType::CZ : 180 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CZ)][0][2]; break; 181 | case GateType::CRX : 182 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRX)][0][2]; break; 183 | case GateType::CRY : 184 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRY)][0][2]; break; 185 | case GateType::CU1 : 186 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CU1)][0][2]; break; 187 | case GateType::CRZ : 188 | tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRZ)][0][2]; break; 189 | case GateType::U1 : 190 | tim_pred += pergate_single_perf[numQubits][int(GateType::U1)][1]; break; 191 | case GateType::U2 : 192 | tim_pred += 
pergate_single_perf[numQubits][int(GateType::U2)][1]; break; 193 | case GateType::U3 : 194 | tim_pred += pergate_single_perf[numQubits][int(GateType::U3)][1]; break; 195 | case GateType::H : 196 | tim_pred += pergate_single_perf[numQubits][int(GateType::H)][1]; break; 197 | case GateType::X : 198 | tim_pred += pergate_single_perf[numQubits][int(GateType::X)][1]; break; 199 | case GateType::Y : 200 | tim_pred += pergate_single_perf[numQubits][int(GateType::Y)][1]; break; 201 | case GateType::Z : 202 | tim_pred += pergate_single_perf[numQubits][int(GateType::Z)][1]; break; 203 | case GateType::S : 204 | tim_pred += pergate_single_perf[numQubits][int(GateType::S)][1]; break; 205 | case GateType::SDG : 206 | tim_pred += pergate_single_perf[numQubits][int(GateType::SDG)][1]; break; 207 | case GateType::T : 208 | tim_pred += pergate_single_perf[numQubits][int(GateType::T)][1]; break; 209 | case GateType::TDG : 210 | tim_pred += pergate_single_perf[numQubits][int(GateType::TDG)][1]; break; 211 | case GateType::RX : 212 | tim_pred += pergate_single_perf[numQubits][int(GateType::RX)][1]; break; 213 | case GateType::RY : 214 | tim_pred += pergate_single_perf[numQubits][int(GateType::RY)][1]; break; 215 | case GateType::RZ : 216 | tim_pred += pergate_single_perf[numQubits][int(GateType::RZ)][1]; break; 217 | default: 218 | printf("meet wrong gate : %s\n", Gate::get_name(ty).c_str()); 219 | UNREACHABLE() 220 | } 221 | } 222 | return tim_pred / 1000 / 512 + pergate_group_overhead * double(1ll << numQubits); // 64-bit shift: numQubits may go up to MAX_QBITS (40), which overflows a 32-bit int 223 | } 224 | 225 | double Evaluator::perfBLAS(int numQubits, int blasSize) { 226 | loadParam(numQubits); 227 | //double bias = (numQubits < 28) ? 
((qindex)1 << (28 - numQubits)) : (1.0 / ((qindex)1 << (numQubits - 28))); 228 | return BLAS_perf[numQubits][blasSize] + cutt_cost[numQubits]; 229 | } 230 | 231 | bool Evaluator::PerGateOrBLAS(const GateGroup* gg_pergate, const GateGroup* gg_blas, int numQubits, int blasSize) { 232 | double pergate = perfPerGate(numQubits, gg_pergate); 233 | double blas = perfBLAS(numQubits, blasSize); 234 | return pergate / (gg_pergate -> gates).size() < blas / (gg_blas -> gates).size(); 235 | } -------------------------------------------------------------------------------- /src/evaluator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "schedule.h" 3 | #include "utils.h" 4 | #include "gate.h" 5 | 6 | #define GATE_NUM 24 7 | #define MAX_QBITS 40 8 | 9 | #define CALC_ALL_PARAM 0 10 | #define CALC_PARTIAL_PARAM 1 11 | 12 | /* 13 | * build performance model to choose between BLAS and perGate backend 14 | * Is a singleton class 15 | **/ 16 | class Evaluator { 17 | private: 18 | const double V100_U1[LOCAL_QUBIT_SIZE] = {235,225,225,225,225,225,224,225,225,225}; 19 | const double V100_U2[LOCAL_QUBIT_SIZE] = {470,469,469,469,469,469,469,470,469,469}; 20 | const double V100_U3[LOCAL_QUBIT_SIZE] = {469,469,469,469,469,469,469,469,469,469}; 21 | const double V100_H[LOCAL_QUBIT_SIZE] = {352,352,352,352,352,352,352,352,352,352}; 22 | const double V100_X[LOCAL_QUBIT_SIZE] = {350,350,350,350,350,350,350,350,350,350}; 23 | const double V100_Y[LOCAL_QUBIT_SIZE] = {350,350,350,350,350,349,349,350,350,350}; 24 | const double V100_Z[LOCAL_QUBIT_SIZE] = {194,194,194,194,194,194,194,194,194,194}; 25 | const double V100_S[LOCAL_QUBIT_SIZE] = {209,209,209,209,209,209,209,209,209,209}; 26 | const double V100_SDG[LOCAL_QUBIT_SIZE] = {209,209,209,209,209,209,209,209,209,209}; // TODO 27 | const double V100_T[LOCAL_QUBIT_SIZE] = {216,216,216,216,216,216,217,216,216,216}; 28 | const double V100_TDG[LOCAL_QUBIT_SIZE] = 
{216,216,216,216,216,216,217,216,216,216}; // TODO 29 | const double V100_RX[LOCAL_QUBIT_SIZE] = {370,370,370,370,370,370,370,370,370,370}; 30 | const double V100_RY[LOCAL_QUBIT_SIZE] = {367,367,367,367,367,367,367,367,367,367}; 31 | const double V100_RZ[LOCAL_QUBIT_SIZE] = {369,369,369,369,369,369,369,369,369,369}; 32 | 33 | const double V100_CN[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = { 34 | 0,213,195,345,193,193,193,193,193,193, 35 | 193,0,193,193,345,193,193,193,193,193, 36 | 193,193,0,193,193,345,193,193,193,193, 37 | 345,193,193,0,193,193,193,193,193,193, 38 | 193,345,193,193,0,193,193,193,193,193, 39 | 193,193,345,193,193,0,193,193,193,193, 40 | 193,193,193,193,193,193,0,193,193,193, 41 | 193,193,193,193,193,193,193,0,193,193, 42 | 193,193,193,193,193,193,193,193,0,193, 43 | 193,193,193,193,193,193,193,193,193,0, 44 | }; 45 | const double V100_CY[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = { 46 | 0,193,193,346,193,193,193,193,193,193, 47 | 193,0,193,193,346,193,193,193,193,193, 48 | 193,193,0,193,193,345,193,193,193,193, 49 | 346,193,193,0,193,193,192,193,193,193, 50 | 193,345,193,193,0,193,193,193,193,193, 51 | 193,193,345,193,193,0,193,193,193,193, 52 | 193,193,193,193,193,193,0,193,192,193, 53 | 193,193,193,193,193,193,193,0,193,193, 54 | 193,193,193,193,193,192,193,193,0,193, 55 | 193,193,192,193,193,192,193,193,193,0, 56 | }; 57 | const double V100_CZ[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = { 58 | 0,137,137,191,137,137,137,137,137,137, 59 | 137,0,137,137,190,137,137,137,137,137, 60 | 137,137,0,137,137,191,137,137,137,137, 61 | 190,137,137,0,137,137,137,137,137,137, 62 | 137,190,137,137,0,137,137,137,137,137, 63 | 137,137,191,137,137,0,137,137,137,137, 64 | 137,137,137,137,137,137,0,137,137,137, 65 | 137,137,137,137,137,137,137,0,137,137, 66 | 137,137,137,137,137,137,137,137,0,137, 67 | 137,137,137,137,137,137,137,137,137,0, 68 | }; 69 | const double V100_CRX[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = { 70 | 0,224,224,358,224,224,223,224,224,224, 71 | 
224,0,224,224,358,224,224,224,223,224, 72 | 224,224,0,224,224,358,223,224,224,223, 73 | 358,224,223,0,224,223,224,223,223,224, 74 | 223,358,224,224,0,224,224,223,223,224, 75 | 223,223,358,224,224,0,223,224,224,224, 76 | 224,223,223,223,224,224,0,224,224,224, 77 | 224,224,224,224,224,224,223,0,224,223, 78 | 224,224,224,224,224,224,224,223,0,224, 79 | 224,224,224,224,224,224,224,224,223,0, 80 | }; 81 | const double V100_CRY[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = { 82 | 0,225,225,356,225,225,225,225,225,225, 83 | 225,0,225,225,356,225,224,225,225,225, 84 | 225,225,0,225,224,356,225,225,225,225, 85 | 356,225,225,0,225,225,225,225,224,225, 86 | 225,356,225,225,0,225,224,225,225,225, 87 | 225,225,356,225,225,0,225,225,225,225, 88 | 225,225,225,225,225,224,0,225,225,225, 89 | 225,225,225,225,224,225,225,0,225,225, 90 | 225,225,225,225,225,225,225,225,0,225, 91 | 225,225,225,225,225,225,225,225,225,0, 92 | }; 93 | const double V100_CU1[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = { 94 | // FIXME 95 | 0,225,225,356,225,225,225,225,225,225, 96 | 225,0,225,225,356,225,224,225,225,225, 97 | 225,225,0,225,224,356,225,225,225,225, 98 | 356,225,225,0,225,225,225,225,224,225, 99 | 225,356,225,225,0,225,224,225,225,225, 100 | 225,225,356,225,225,0,225,225,225,225, 101 | 225,225,225,225,225,224,0,225,225,225, 102 | 225,225,225,225,224,225,225,0,225,225, 103 | 225,225,225,225,225,225,225,225,0,225, 104 | 225,225,225,225,225,225,225,225,225,0, 105 | }; 106 | const double V100_CRZ[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = { 107 | 0,224,224,359,224,224,224,224,224,224, 108 | 224,0,224,224,359,224,224,224,224,224, 109 | 224,224,0,224,224,359,224,224,224,224, 110 | 359,224,224,0,224,224,224,224,224,224, 111 | 224,359,224,224,0,224,224,224,224,224, 112 | 224,224,359,224,224,0,224,224,224,224, 113 | 224,224,224,224,224,224,0,224,224,224, 114 | 224,224,224,224,224,224,224,0,224,224, 115 | 224,224,224,224,224,224,224,224,0,224, 116 | 224,224,224,224,224,224,224,224,224,0, 117 | }; 118 | 119 | // pergate 
single gate performance for 512 runs; first index is the qubit count (the built-in V100 defaults fill only [28]) 120 | double pergate_single_perf[MAX_QBITS + 1][GATE_NUM][LOCAL_QUBIT_SIZE]; 121 | // pergate control gate performance for 512 runs; first index is the qubit count (the built-in V100 defaults fill only [28]) 122 | double pergate_ctr_perf[MAX_QBITS + 1][GATE_NUM][LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE]; 123 | // BLAS_perf[numQubits][blasSize] + cutt_cost[numQubits] is what perfBLAS returns 124 | double BLAS_perf[MAX_QBITS + 1][MAX_QBITS + 1]; 125 | double cutt_cost[MAX_QBITS + 1]; 126 | bool num_qbits_loaded_param[MAX_QBITS + 1]; 127 | const double pergate_group_overhead = 1.0 / (1 << 27); // overhead of one pergate group; perfPerGate scales it by 2^numQubits 128 | 129 | int param_type; 130 | 131 | Evaluator(); 132 | 133 | static Evaluator* instance_ptr; 134 | public: 135 | static Evaluator* getInstance() { 136 | if(instance_ptr == nullptr) { 137 | instance_ptr = new Evaluator; 138 | } 139 | return instance_ptr; 140 | } 141 | void loadPergateSingle(int numQubits, FILE* qbit_param, GateType gate_type); 142 | void loadPergateCtr(int numQubits, FILE* qbit_param, GateType gate_type); 143 | void loadParam(int numQubits); 144 | double perfPerGate(int numQubits, const GateGroup* gg); 145 | double perfPerGate(int numQubits, const std::vector& types); 146 | double perfBLAS(int numQubits, int blasSize); 147 | // return True if choose pergate over BLAS 148 | bool PerGateOrBLAS(const GateGroup* gg_pergate, const GateGroup* gg_blas, int numQubits, int blasSize); 149 | }; 150 | -------------------------------------------------------------------------------- /src/executor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "utils.h" 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "schedule.h" 9 | 10 | class Executor { 11 | public: 12 | Executor(std::vector deviceStateVec, int numQubits, Schedule& schedule); 13 | void run(); 14 | private: 15 | // instructions 16 | void transpose(std::vector plans); 17 | void all2all(int commSize, std::vector comm); 18 | void setState(const State& newState) { state = newState; } 19 | void 
applyGateGroup(GateGroup& gg, int sliceID = -1); 20 | void applyPerGateGroup(GateGroup& gg); 21 | void applyBlasGroup(GateGroup& gg); 22 | void applyPerGateGroupSliced(GateGroup& gg, int sliceID); 23 | void applyBlasGroupSliced(GateGroup& gg, int sliceID); 24 | void finalize(); 25 | void storeState(); 26 | void loadState(); 27 | void sliceBarrier(int sliceID); 28 | void allBarrier(); 29 | 30 | // utils 31 | qindex toPhyQubitSet(qindex logicQubitset) const; 32 | qindex fillRelatedQubits(qindex related) const; 33 | KernelGate getGate(const Gate& gate, int part_id, int numLocalQubits, qindex relatedLogicQb, const std::map& toID) const; 34 | 35 | // internal 36 | void prepareBitMap(qindex relatedQubits, unsigned int& blockHot, unsigned int& threadBias, int numLocalQubits); // allocate threadBias 37 | std::map getLogicShareMap(qindex relatedQubits, int numLocalQubits) const; // input: physical, output logic -> share 38 | 39 | State state; 40 | State oldState; 41 | std::vector commEvents; // commEvents[slice][gpuID] 42 | std::vector partID; // partID[slice][gpuID] 43 | std::vector peer; // peer[slice][gpuID] 44 | 45 | // constants 46 | std::vector threadBias; 47 | std::vector deviceStateVec; 48 | std::vector deviceBuffer; 49 | int numQubits; 50 | int numSlice, numSliceBit; 51 | 52 | //schedule 53 | Schedule& schedule; 54 | 55 | }; -------------------------------------------------------------------------------- /src/gate.cpp: -------------------------------------------------------------------------------- 1 | #include "gate.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | static int globalGateID = 0; 8 | 9 | Gate Gate::CCX(int controlQubit, int controlQubit2, int targetQubit) { 10 | Gate g; 11 | g.gateID = ++ globalGateID; 12 | g.type = GateType::CCX; 13 | g.mat[0][0] = make_qComplex(0); g.mat[0][1] = make_qComplex(1); 14 | g.mat[1][0] = make_qComplex(1); g.mat[1][1] = make_qComplex(0); 15 | g.name = "CCX"; 16 | g.targetQubit = targetQubit; 17 | g.controlQubit = 
controlQubit; 18 | g.controlQubit2 = controlQubit2; 19 | return g; 20 | 21 | } 22 | 23 | Gate Gate::CNOT(int controlQubit, int targetQubit) { 24 | Gate g; 25 | g.gateID = ++ globalGateID; 26 | g.type = GateType::CNOT; 27 | g.mat[0][0] = make_qComplex(0); g.mat[0][1] = make_qComplex(1); 28 | g.mat[1][0] = make_qComplex(1); g.mat[1][1] = make_qComplex(0); 29 | g.name = "CN"; 30 | g.targetQubit = targetQubit; 31 | g.controlQubit = controlQubit; 32 | return g; 33 | } 34 | 35 | Gate Gate::CY(int controlQubit, int targetQubit) { 36 | Gate g; 37 | g.gateID = ++ globalGateID; 38 | g.type = GateType::CY; 39 | g.mat[0][0] = make_qComplex(0); g.mat[0][1] = make_qComplex(0, -1); 40 | g.mat[1][0] = make_qComplex(0, 1); g.mat[1][1] = make_qComplex(0); 41 | g.name = "CY"; 42 | g.targetQubit = targetQubit; 43 | g.controlQubit = controlQubit; 44 | return g; 45 | } 46 | 47 | Gate Gate::CZ(int controlQubit, int targetQubit) { 48 | Gate g; 49 | g.gateID = ++ globalGateID; 50 | g.type = GateType::CZ; 51 | g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0); 52 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(-1); 53 | g.name = "CZ"; 54 | g.targetQubit = targetQubit; 55 | g.controlQubit = controlQubit; 56 | return g; 57 | } 58 | 59 | Gate Gate::CRX(int controlQubit, int targetQubit, qreal angle) { 60 | Gate g; 61 | g.gateID = ++ globalGateID; 62 | g.type = GateType::CRX; 63 | g.mat[0][0] = make_qComplex(cos(angle/2.0)); g.mat[0][1] = make_qComplex(0, -sin(angle/2.0)); 64 | g.mat[1][0] = make_qComplex(0, -sin(angle/2.0)); g.mat[1][1] = make_qComplex(cos(angle/2.0)); 65 | g.name = "CRX"; 66 | g.targetQubit = targetQubit; 67 | g.controlQubit = controlQubit; 68 | return g; 69 | } 70 | 71 | Gate Gate::CRY(int controlQubit, int targetQubit, qreal angle) { 72 | Gate g; 73 | g.gateID = ++ globalGateID; 74 | g.type = GateType::CRY; 75 | g.mat[0][0] = make_qComplex(cos(angle/2.0)); g.mat[0][1] = make_qComplex(-sin(angle/2.0)); 76 | g.mat[1][0] = 
make_qComplex(sin(angle/2.0)); g.mat[1][1] = make_qComplex(cos(angle/2.0)); 77 | g.name = "CRY"; 78 | g.targetQubit = targetQubit; 79 | g.controlQubit = controlQubit; 80 | return g; 81 | } 82 | 83 | Gate Gate::CU1(int controlQubit, int targetQubit, qreal lambda) { 84 | Gate g; 85 | g.gateID = ++ globalGateID; 86 | g.type = GateType::CU1; 87 | g.mat[0][0] = make_qComplex(1); 88 | g.mat[0][1] = make_qComplex(0); 89 | g.mat[1][0] = make_qComplex(0); 90 | g.mat[1][1] = make_qComplex(cos(lambda), sin(lambda)); 91 | g.name = "CU1"; 92 | g.targetQubit = targetQubit; 93 | g.controlQubit = controlQubit; 94 | return g; 95 | } 96 | 97 | Gate Gate::CRZ(int controlQubit, int targetQubit, qreal angle) { 98 | Gate g; 99 | g.gateID = ++ globalGateID; 100 | g.type = GateType::CRZ; 101 | g.mat[0][0] = make_qComplex(cos(angle/2), -sin(angle/2)); g.mat[0][1] = make_qComplex(0); 102 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(cos(angle/2), sin(angle/2)); 103 | g.name = "CRZ"; 104 | g.targetQubit = targetQubit; 105 | g.controlQubit = controlQubit; 106 | return g; 107 | } 108 | 109 | 110 | Gate Gate::U1(int targetQubit, qreal lambda) { 111 | Gate g; 112 | g.gateID = ++ globalGateID; 113 | g.type = GateType::U1; 114 | g.mat[0][0] = make_qComplex(1); 115 | g.mat[0][1] = make_qComplex(0); 116 | g.mat[1][0] = make_qComplex(0); 117 | g.mat[1][1] = make_qComplex(cos(lambda), sin(lambda)); 118 | g.name = "U1"; 119 | g.targetQubit = targetQubit; 120 | g.controlQubit = -1; 121 | return g; 122 | } 123 | 124 | Gate Gate::U2(int targetQubit, qreal phi, qreal lambda) { 125 | Gate g; 126 | g.gateID = ++ globalGateID; 127 | g.type = GateType::U2; 128 | g.mat[0][0] = make_qComplex(1.0 / sqrt(2)); 129 | g.mat[0][1] = make_qComplex(-cos(lambda) / sqrt(2), -sin(lambda) / sqrt(2)); 130 | g.mat[1][0] = make_qComplex(cos(phi) / sqrt(2), sin(phi) / sqrt(2)); 131 | g.mat[1][1] = make_qComplex(cos(lambda + phi) / sqrt(2), sin(lambda + phi) / sqrt(2)); 132 | g.name = "U2"; 133 | g.targetQubit = 
targetQubit; 134 | g.controlQubit = -1; 135 | return g; 136 | } 137 | 138 | Gate Gate::U3(int targetQubit, qreal theta, qreal phi, qreal lambda) { 139 | Gate g; 140 | g.gateID = ++ globalGateID; 141 | g.type = GateType::U3; 142 | g.mat[0][0] = make_qComplex(cos(theta / 2)); 143 | g.mat[0][1] = make_qComplex(-cos(lambda) * sin(theta / 2), -sin(lambda) * sin(theta / 2)); 144 | g.mat[1][0] = make_qComplex(cos(phi) * sin(theta / 2), sin(phi) * sin(theta / 2)); 145 | g.mat[1][1] = make_qComplex(cos(phi + lambda) * cos(theta / 2), sin(phi + lambda) * cos(theta / 2)); 146 | g.name = "U3"; 147 | g.targetQubit = targetQubit; 148 | g.controlQubit = -1; 149 | return g; 150 | } 151 | 152 | Gate Gate::H(int targetQubit) { 153 | Gate g; 154 | g.gateID = ++ globalGateID; 155 | g.type = GateType::H; 156 | g.mat[0][0] = make_qComplex(1/sqrt(2)); g.mat[0][1] = make_qComplex(1/sqrt(2)); 157 | g.mat[1][0] = make_qComplex(1/sqrt(2)); g.mat[1][1] = make_qComplex(-1/sqrt(2)); 158 | g.name = "H"; 159 | g.targetQubit = targetQubit; 160 | g.controlQubit = -1; 161 | return g; 162 | } 163 | 164 | Gate Gate::X(int targetQubit) { 165 | Gate g; 166 | g.gateID = ++ globalGateID; 167 | g.type = GateType::X; 168 | g.mat[0][0] = make_qComplex(0); g.mat[0][1] = make_qComplex(1); 169 | g.mat[1][0] = make_qComplex(1); g.mat[1][1] = make_qComplex(0); 170 | g.name = "X"; 171 | g.targetQubit = targetQubit; 172 | g.controlQubit = -1; 173 | return g; 174 | } 175 | 176 | Gate Gate::Y(int targetQubit) { 177 | Gate g; 178 | g.gateID = ++ globalGateID; 179 | g.type = GateType::Y; 180 | g.mat[0][0] = make_qComplex(0); g.mat[0][1] = make_qComplex(0, -1); 181 | g.mat[1][0] = make_qComplex(0, 1); g.mat[1][1] = make_qComplex(0); 182 | g.name = "Y"; 183 | g.targetQubit = targetQubit; 184 | g.controlQubit = -1; 185 | return g; 186 | } 187 | 188 | Gate Gate::Z(int targetQubit) { 189 | Gate g; 190 | g.gateID = ++ globalGateID; 191 | g.type = GateType::Z; 192 | g.mat[0][0] = make_qComplex(1); g.mat[0][1] = 
make_qComplex(0); 193 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(-1); 194 | g.name = "Z"; 195 | g.targetQubit = targetQubit; 196 | g.controlQubit = -1; 197 | return g; 198 | } 199 | 200 | Gate Gate::S(int targetQubit) { 201 | Gate g; 202 | g.gateID = ++ globalGateID; 203 | g.type = GateType::S; 204 | g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0); 205 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(0, 1); 206 | g.name = "S"; 207 | g.targetQubit = targetQubit; 208 | g.controlQubit = -1; 209 | return g; 210 | } 211 | 212 | Gate Gate::SDG(int targetQubit) { 213 | Gate g; 214 | g.gateID = ++ globalGateID; 215 | g.type = GateType::SDG; 216 | g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0); 217 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(0, -1); 218 | g.name = "SDG"; 219 | g.targetQubit = targetQubit; 220 | g.controlQubit = -1; 221 | return g; 222 | } 223 | 224 | Gate Gate::T(int targetQubit) { 225 | Gate g; 226 | g.gateID = ++ globalGateID; 227 | g.type = GateType::T; 228 | g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0); 229 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(1/sqrt(2), 1/sqrt(2)); 230 | g.name = "T"; 231 | g.targetQubit = targetQubit; 232 | g.controlQubit = -1; 233 | return g; 234 | } 235 | 236 | Gate Gate::TDG(int targetQubit) { // T-dagger: diag(1, (1 - i)/sqrt(2)), the conjugate of T 237 | Gate g; 238 | g.gateID = ++ globalGateID; 239 | g.type = GateType::TDG; // own enum value so that type-based dispatch distinguishes TDG from T 240 | g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0); 241 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(1/sqrt(2), -1/sqrt(2)); 242 | g.name = "TDG"; 243 | g.targetQubit = targetQubit; 244 | g.controlQubit = -1; 245 | return g; 246 | } 247 | 248 | Gate Gate::RX(int targetQubit, qreal angle) { 249 | Gate g; 250 | g.gateID = ++ globalGateID; 251 | g.type = GateType::RX; 252 | g.mat[0][0] = make_qComplex(cos(angle/2.0)); g.mat[0][1] = make_qComplex(0, -sin(angle/2.0)); 253 | g.mat[1][0] = make_qComplex(0, 
-sin(angle/2.0)); g.mat[1][1] = make_qComplex(cos(angle/2.0)); 254 | g.name = "RX"; 255 | g.targetQubit = targetQubit; 256 | g.controlQubit = -1; 257 | return g; 258 | } 259 | 260 | Gate Gate::RY(int targetQubit, qreal angle) { 261 | Gate g; 262 | g.gateID = ++ globalGateID; 263 | g.type = GateType::RY; 264 | g.mat[0][0] = make_qComplex(cos(angle/2.0)); g.mat[0][1] = make_qComplex(-sin(angle/2.0)); 265 | g.mat[1][0] = make_qComplex(sin(angle/2.0)); g.mat[1][1] = make_qComplex(cos(angle/2.0)); 266 | g.name = "RY"; 267 | g.targetQubit = targetQubit; 268 | g.controlQubit = -1; 269 | return g; 270 | } 271 | 272 | Gate Gate::RZ(int targetQubit, qreal angle) { 273 | Gate g; 274 | g.gateID = ++ globalGateID; 275 | g.type = GateType::RZ; 276 | g.mat[0][0] = make_qComplex(cos(angle/2), -sin(angle/2)); g.mat[0][1] = make_qComplex(0); 277 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(cos(angle/2), sin(angle/2)); 278 | g.name = "RZ"; 279 | g.targetQubit = targetQubit; 280 | g.controlQubit = -1; 281 | return g; 282 | } 283 | 284 | Gate Gate::ID(int targetQubit) { 285 | Gate g; 286 | g.gateID = ++ globalGateID; 287 | g.type = GateType::ID; 288 | g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0); 289 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(1); 290 | g.name = "ID"; 291 | g.targetQubit = targetQubit; 292 | g.controlQubit = -1; 293 | return g; 294 | } 295 | 296 | Gate Gate::GII(int targetQubit) { 297 | Gate g; 298 | g.gateID = ++ globalGateID; 299 | g.type = GateType::GII; 300 | g.mat[0][0] = make_qComplex(0, 1); g.mat[0][1] = make_qComplex(0); 301 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(0, 1); 302 | g.name = "GII"; 303 | g.targetQubit = targetQubit; 304 | g.controlQubit = -1; 305 | return g; 306 | } 307 | 308 | Gate Gate::GZZ(int targetQubit) { 309 | Gate g; 310 | g.gateID = ++ globalGateID; 311 | g.type = GateType::GZZ; 312 | g.mat[0][0] = make_qComplex(-1); g.mat[0][1] = make_qComplex(0); 313 | g.mat[1][0] = 
make_qComplex(0); g.mat[1][1] = make_qComplex(-1); 314 | g.name = "GZZ"; 315 | g.targetQubit = targetQubit; 316 | g.controlQubit = -1; 317 | return g; 318 | } 319 | 320 | Gate Gate::GOC(int targetQubit, qreal real, qreal imag) { 321 | Gate g; 322 | g.gateID = ++ globalGateID; 323 | g.type = GateType::GOC; 324 | g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0); 325 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(real, imag); 326 | g.name = "GOC"; 327 | g.targetQubit = targetQubit; 328 | g.controlQubit = -1; 329 | return g; 330 | } 331 | 332 | Gate Gate::GCC(int targetQubit, qreal real, qreal imag) { 333 | Gate g; 334 | g.gateID = ++ globalGateID; 335 | g.type = GateType::GCC; 336 | g.mat[0][0] = make_qComplex(real, imag); g.mat[0][1] = make_qComplex(0); 337 | g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(real, imag); 338 | g.name = "GCC"; 339 | g.targetQubit = targetQubit; 340 | g.controlQubit = -1; 341 | return g; 342 | } 343 | 344 | auto gen_01_float = []() { 345 | return rand() * 1.0 / RAND_MAX; 346 | }; 347 | auto gen_0_2pi_float = []() { 348 | return gen_01_float() * acos(-1) * 2; 349 | }; 350 | 351 | Gate Gate::random(int lo, int hi) { 352 | int type = rand() % int(GateType::TOTAL); 353 | return random(lo, hi, GateType(type)); 354 | } 355 | 356 | Gate Gate::random(int lo, int hi, GateType type) { 357 | auto gen_c2_id = [lo, hi](int &t, int &c1, int &c2) { 358 | assert(hi - lo >= 3); 359 | do { 360 | c2 = rand() % (hi - lo) + lo; 361 | c1 = rand() % (hi - lo) + lo; 362 | t = rand() % (hi - lo) + lo; 363 | } while (c2 == c1 || c2 == t || c1 == t); 364 | }; 365 | auto gen_c1_id = [lo, hi](int &t, int &c1) { 366 | assert(hi - lo >= 2); 367 | do { 368 | c1 = rand() % (hi - lo) + lo; 369 | t = rand() % (hi - lo) + lo; 370 | } while (c1 == t); 371 | }; 372 | auto gen_single_id = [lo, hi](int &t) { 373 | t = rand() % (hi - lo) + lo; 374 | }; 375 | switch (type) { 376 | case GateType::CCX: { 377 | int t, c1, c2; 378 | 
gen_c2_id(t, c1, c2); 379 | return CCX(c1, c2, t); 380 | } 381 | case GateType::CNOT: { 382 | int t, c1; 383 | gen_c1_id(t, c1); 384 | return CNOT(c1, t); 385 | } 386 | case GateType::CY: { 387 | int t, c1; 388 | gen_c1_id(t, c1); 389 | return CY(c1, t); 390 | } 391 | case GateType::CZ: { 392 | int t, c1; 393 | gen_c1_id(t, c1); 394 | return CZ(c1, t); 395 | } 396 | case GateType::CRX: { 397 | int t, c1; 398 | gen_c1_id(t, c1); 399 | return CRX(c1, t, gen_0_2pi_float()); 400 | } 401 | case GateType::CRY: { 402 | int t, c1; 403 | gen_c1_id(t, c1); 404 | return CRY(c1, t, gen_0_2pi_float()); 405 | } 406 | case GateType::CU1: { 407 | int t, c1; 408 | gen_c1_id(t, c1); 409 | return CU1(c1, t, gen_0_2pi_float()); 410 | } 411 | case GateType::CRZ: { 412 | int t, c1; 413 | gen_c1_id(t, c1); 414 | return CRZ(c1, t, gen_0_2pi_float()); 415 | } 416 | case GateType::U1: { 417 | int t; 418 | gen_single_id(t); 419 | return U1(t, gen_0_2pi_float()); 420 | } 421 | case GateType::U2: { 422 | int t; 423 | gen_single_id(t); 424 | return U2(t, gen_0_2pi_float(), gen_0_2pi_float()); 425 | } 426 | case GateType::U3: { 427 | int t; 428 | gen_single_id(t); 429 | return U3(t, gen_0_2pi_float(), gen_0_2pi_float(), gen_0_2pi_float()); 430 | } 431 | case GateType::H: { 432 | int t; 433 | gen_single_id(t); 434 | return H(t); 435 | } 436 | case GateType::X: { 437 | int t; 438 | gen_single_id(t); 439 | return X(t); 440 | } 441 | case GateType::Y: { 442 | int t; 443 | gen_single_id(t); 444 | return Y(t); 445 | } 446 | case GateType::Z: { 447 | int t; 448 | gen_single_id(t); 449 | return Z(t); 450 | } 451 | case GateType::S: { 452 | int t; 453 | gen_single_id(t); 454 | return S(t); 455 | } 456 | case GateType::SDG: { 457 | int t; 458 | gen_single_id(t); 459 | return SDG(t); 460 | } 461 | case GateType::T: { 462 | int t; 463 | gen_single_id(t); 464 | return T(t); 465 | } 466 | case GateType::TDG: { 467 | int t; 468 | gen_single_id(t); 469 | return TDG(t); 470 | } 471 | case GateType::RX: { 472 | 
int t; 473 | gen_single_id(t); 474 | return RX(t, gen_0_2pi_float()); 475 | } 476 | case GateType::RY: { 477 | int t; 478 | gen_single_id(t); 479 | return RY(t, gen_0_2pi_float()); 480 | } 481 | case GateType::RZ: { 482 | int t; 483 | gen_single_id(t); 484 | return RZ(t, gen_0_2pi_float()); 485 | } 486 | default: { 487 | printf("invalid %d\n", (int) type); 488 | assert(false); 489 | } 490 | } 491 | exit(1); 492 | } 493 | 494 | Gate Gate::control(int controlQubit, int targetQubit, GateType type) { 495 | switch (type) { 496 | case GateType::CNOT: { 497 | return CNOT(controlQubit, targetQubit); 498 | } 499 | case GateType::CY: { 500 | return CY(controlQubit, targetQubit); 501 | } 502 | case GateType::CZ: { 503 | return CZ(controlQubit, targetQubit); 504 | } 505 | case GateType::CRX: { 506 | return CRX(controlQubit, targetQubit, gen_0_2pi_float()); 507 | } 508 | case GateType::CRY: { 509 | return CRY(controlQubit, targetQubit, gen_0_2pi_float()); 510 | } 511 | case GateType::CU1: { 512 | return CU1(controlQubit, targetQubit, gen_0_2pi_float()); 513 | } 514 | case GateType::CRZ: { 515 | return CRZ(controlQubit, targetQubit, gen_0_2pi_float()); 516 | } 517 | default: { 518 | assert(false); 519 | } 520 | } 521 | exit(1); 522 | } 523 | 524 | GateType Gate::toCU(GateType type) { 525 | if (type == GateType::CCX) { 526 | return GateType::CNOT; 527 | } else { 528 | UNREACHABLE() 529 | } 530 | } 531 | 532 | GateType Gate::toU(GateType type) { 533 | switch (type) { 534 | case GateType::CCX: 535 | case GateType::CNOT: 536 | return GateType::X; 537 | case GateType::CY: 538 | return GateType::Y; 539 | case GateType::CZ: 540 | return GateType::Z; 541 | case GateType::CRX: 542 | return GateType::RX; 543 | case GateType::CRY: 544 | return GateType::RY; 545 | case GateType::CU1: 546 | return GateType::U1; 547 | case GateType::CRZ: 548 | return GateType::RZ; 549 | default: 550 | UNREACHABLE() 551 | } 552 | } 553 | 554 | std::string Gate::get_name(GateType ty) { 555 | return random(0, 
10, ty).name; 556 | } 557 | 558 | std::vector Gate::serialize() const { 559 | auto name_len = name.length(); 560 | int len = 561 | sizeof(name_len) + name.length() + 1 + sizeof(gateID) + sizeof(type) + sizeof(mat) 562 | + sizeof(targetQubit) + sizeof(controlQubit) + sizeof(controlQubit2); 563 | std::vector ret; ret.resize(len); 564 | unsigned char* arr = ret.data(); 565 | int cur = 0; 566 | SERIALIZE_STEP(gateID); 567 | SERIALIZE_STEP(type); 568 | memcpy(arr + cur, mat, sizeof(mat)); cur += sizeof(qComplex) * 4; 569 | SERIALIZE_STEP(name_len); 570 | strcpy(reinterpret_cast(arr) + cur, name.c_str()); cur += name_len + 1; 571 | SERIALIZE_STEP(targetQubit); 572 | SERIALIZE_STEP(controlQubit); 573 | SERIALIZE_STEP(controlQubit2); 574 | assert(cur == len); 575 | return ret; 576 | } 577 | 578 | Gate Gate::deserialize(const unsigned char* arr, int& cur) { 579 | Gate g; 580 | DESERIALIZE_STEP(g.gateID); 581 | DESERIALIZE_STEP(g.type); 582 | memcpy(g.mat, arr + cur, sizeof(g.mat)); cur += sizeof(qComplex) * 4; 583 | decltype(g.name.length()) name_len; DESERIALIZE_STEP(name_len); 584 | g.name = std::string(reinterpret_cast(arr) + cur, name_len); cur += name_len + 1; 585 | DESERIALIZE_STEP(g.targetQubit); 586 | DESERIALIZE_STEP(g.controlQubit); 587 | DESERIALIZE_STEP(g.controlQubit2); 588 | return g; 589 | } -------------------------------------------------------------------------------- /src/gate.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "utils.h" 6 | 7 | enum class GateType { 8 | CCX, CNOT, CY, CZ, CRX, CRY, CU1, CRZ, U1, U2, U3, H, X, Y, Z, S, SDG, T, TDG, RX, RY, RZ, TOTAL, ID, GII, GZZ, GOC, GCC 9 | }; 10 | 11 | struct Gate { 12 | int gateID; 13 | GateType type; 14 | qComplex mat[2][2]; 15 | std::string name; 16 | int targetQubit; 17 | int controlQubit; // -1 if no control 18 | int controlQubit2; // -1 if no control 19 | Gate(): controlQubit(-1), controlQubit2(-1) {}; 20 
| Gate(const Gate&) = default; 21 | bool isControlGate() const { 22 | return controlQubit != -1; 23 | } 24 | bool isC2Gate() const { 25 | return controlQubit2 != -1; 26 | } 27 | bool isDiagonal() const { 28 | return type == GateType::CZ || type == GateType::CU1 || type == GateType::CRZ || type == GateType::U1 || type == GateType::Z || type == GateType::S || type == GateType::SDG || type == GateType::T || type == GateType::TDG || type == GateType::RZ; 29 | } 30 | static Gate CCX(int c1, int c2, int targetQubit); 31 | static Gate CNOT(int controlQubit, int targetQubit); 32 | static Gate CY(int controlQubit, int targetQubit); 33 | static Gate CZ(int controlQubit, int targetQubit); 34 | static Gate CRX(int controlQubit, int targetQubit, qreal angle); 35 | static Gate CRY(int controlQubit, int targetQubit, qreal angle); 36 | static Gate CU1(int controlQubit, int targetQubit, qreal lambda); 37 | static Gate CRZ(int controlQubit, int targetQubit, qreal angle); 38 | static Gate U1(int targetQubit, qreal lambda); 39 | static Gate U2(int targetQubit, qreal phi, qreal lambda); 40 | static Gate U3(int targetQubit, qreal theta, qreal phi, qreal lambda); 41 | static Gate H(int targetQubit); 42 | static Gate X(int targetQubit); 43 | static Gate Y(int targetQubit); 44 | static Gate Z(int targetQubit); 45 | static Gate S(int targetQubit); 46 | static Gate SDG(int targetQubit); 47 | static Gate T(int targetQubit); 48 | static Gate TDG(int targetQubit); 49 | static Gate RX(int targetQubit, qreal angle); 50 | static Gate RY(int targetQubit, qreal angle); 51 | static Gate RZ(int targetQubit, qreal angle); 52 | static Gate ID(int targetQubit); 53 | static Gate GII(int targetQubit); 54 | static Gate GTT(int targetQubit); 55 | static Gate GZZ(int targetQubit); 56 | static Gate GOC(int targetQubit, qreal real, qreal imag); 57 | static Gate GCC(int targetQubit, qreal real, qreal imag); 58 | static Gate random(int lo, int hi); 59 | static Gate random(int lo, int hi, GateType type); 60 | 
static Gate control(int controlQubit, int targetQubit, GateType type); 61 | static GateType toCU(GateType type); 62 | static GateType toU(GateType type); 63 | static std::string get_name(GateType ty); 64 | std::vector serialize() const; 65 | static Gate deserialize(const unsigned char* arr, int& cur); 66 | }; 67 | // Flat per-gate record: qubit indices, locality flags, and the 2x2 matrix split into real/imaginary scalars. 68 | struct KernelGate { 69 | int targetQubit; 70 | int controlQubit; 71 | int controlQubit2; 72 | GateType type; 73 | char targetIsGlobal; // 0-local 1-global 74 | char controlIsGlobal; // 0-local 1-global 2-not control 75 | char control2IsGlobal; // 0-local 1-global 2-not control 76 | qreal r00, i00, r01, i01, r10, i10, r11, i11; // rXY / iXY = real / imaginary part of mat[X][Y] 77 | // Two-control form: stores every argument and flattens mat into the r/i fields. 78 | KernelGate( 79 | GateType type_, 80 | int controlQubit2_, char control2IsGlobal_, 81 | int controlQubit_, char controlIsGlobal_, 82 | int targetQubit_, char targetIsGlobal_, 83 | const qComplex mat[2][2] 84 | ): 85 | targetQubit(targetQubit_), controlQubit(controlQubit_), controlQubit2(controlQubit2_), 86 | type(type_), 87 | targetIsGlobal(targetIsGlobal_), controlIsGlobal(controlIsGlobal_), control2IsGlobal(control2IsGlobal_), 88 | r00(mat[0][0].x), i00(mat[0][0].y), r01(mat[0][1].x), i01(mat[0][1].y), 89 | r10(mat[1][0].x), i10(mat[1][0].y), r11(mat[1][1].x), i11(mat[1][1].y) {} 90 | // One-control form: the second control is absent, so its qubit is -1 and its flag is 2 (not control). 91 | KernelGate( 92 | GateType type_, 93 | int controlQubit_, char controlIsGlobal_, 94 | int targetQubit_, char targetIsGlobal_, 95 | const qComplex mat[2][2] 96 | ): KernelGate(type_, -1, 2, controlQubit_, controlIsGlobal_, targetQubit_, targetIsGlobal_, mat) {} 97 | // No-control form: both control slots are absent (qubit -1, flag 2). 98 | KernelGate( 99 | GateType type_, 100 | int targetQubit_, char targetIsGlobal_, 101 | const qComplex mat[2][2] 102 | ): KernelGate(type_, -1, 2, -1, 2, targetQubit_, targetIsGlobal_, mat) {} 103 | 104 | KernelGate() = default; 105 | // Identity gate on local qubit 0 with no controls. 106 | static KernelGate ID() { 107 | qComplex mat[2][2] = {{make_qComplex(1, 0), make_qComplex(0, 0)}, {make_qComplex(0, 0), make_qComplex(1, 0)}}; // one complex value per element; a flat {1, 0, 0, 1} list would be consumed scalar-by-scalar if qComplex is a plain {x, y} aggregate 108 | return KernelGate(GateType::ID, 0, 0, mat); 109 | } 110 | }; -------------------------------------------------------------------------------- /src/kernel.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "gate.h" 7 | #include "utils.h" 8 | #include "compiler.h" 9 | #include "circuit.h" 10 | 11 | // kernelSimple 12 | void kernelInit(std::vector &deviceStateVec, int numQubits); 13 | void kernelExecSimple(qComplex* deviceStateVec, int numQubits, const std::vector & gates); 14 | qreal kernelMeasure(qComplex* deviceStateVec, int numQubits, int targetQubit); 15 | qComplex kernelGetAmp(qComplex* deviceStateVec, qindex idx); 16 | void kernelDeviceToHost(qComplex* hostStateVec, qComplex* deviceStateVec, int numQubits); 17 | void kernelDestroy(qComplex* deviceStateVec); 18 | void cuttPlanInit(std::vector& plans); 19 | 20 | // kernelOpt 21 | void initControlIdx(); 22 | // call cudaSetDevice() before this function 23 | void copyGatesToSymbol(KernelGate* hostGates, int numGates, cudaStream_t& stream, int gpuID); 24 | 25 | // call cudaSetDevice() before this function 26 | void launchExecutor(int gridDim, qComplex* deviceStateVec, unsigned int* threadBias, int numLocalQubits, int numGates, unsigned int blockHot, unsigned int enumerate, cudaStream_t& stream, int gpuID); 27 | 28 | 29 | // kernelUtils: NaN check, vector print, and an endless-loop kernel launcher 30 | void isnanTest(qComplex* data, int n, cudaStream_t& stream); 31 | void printVector(qComplex* data, int n, cudaStream_t& stream); 32 | void whileTrue(); -------------------------------------------------------------------------------- /src/kernelUtils.cu: -------------------------------------------------------------------------------- 1 | #include "kernel.h" 2 | #include 3 | #include 4 | #include 5 | // Each thread walks data[threadIdx.x], data[threadIdx.x + blockDim.x], ... below n; on a NaN real or imaginary part it prints the index and traps. 6 | __global__ void isnanTestKernel(qComplex *data, int n) { // with gridDim == 1 7 | for (int i = threadIdx.x; i < n; i += blockDim.x) { 8 | if (isnan(data[i].x) || isnan(data[i].y)) { 9 | printf("nan at %d\n", i); 10 | asm("trap;"); 11 | } 12 | } 13 | } 14 | 15 | __global__ void printVectorKernel(qComplex *data, int n) { // with gridDim == 1
&& blockDim == 1 16 | for (int i = 0; i < n; i++) 17 | printf("(%f, %f)", data[i].x, data[i].y); 18 | printf("\n"); 19 | } 20 | 21 | __global__ void whileTrueKernel() { 22 | while (true); 23 | } 24 | 25 | void isnanTest(qComplex* data, int n, cudaStream_t& stream) { 26 | isnanTestKernel<<<1, 32, 0, stream>>>(data, n / 32); 27 | } 28 | 29 | void printVector(qComplex* data, int n, cudaStream_t& stream) { 30 | printVectorKernel<<<1, 1, 0, stream>>>(data, n); 31 | } 32 | 33 | void whileTrue() { 34 | whileTrueKernel<<<1,1>>>(); 35 | } 36 | -------------------------------------------------------------------------------- /src/kernels/baseline.cu: -------------------------------------------------------------------------------- 1 | #include "kernel.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "gate.h" 7 | #include "executor.h" 8 | using namespace std; 9 | 10 | extern __shared__ qComplex shm[1< loIdx_device; 17 | std::vector shiftAt_device; 18 | 19 | 20 | __device__ __forceinline__ void XSingle(int loIdx, int hiIdx) { 21 | qComplex v = shm[loIdx]; 22 | shm[loIdx] = shm[hiIdx]; 23 | shm[hiIdx] = v; 24 | } 25 | 26 | __device__ __forceinline__ void YSingle(int loIdx, int hiIdx) { 27 | qComplex lo = shm[loIdx]; 28 | qComplex hi = shm[hiIdx]; 29 | 30 | shm[loIdx] = make_qComplex(hi.y, -hi.x); 31 | shm[hiIdx] = make_qComplex(-lo.y, lo.x); 32 | } 33 | 34 | __device__ __forceinline__ void ZHi(int hiIdx) { 35 | qComplex v = shm[hiIdx]; 36 | shm[hiIdx] = make_qComplex(-v.x, -v.y); 37 | } 38 | 39 | 40 | __device__ __forceinline__ void RXSingle(int loIdx, int hiIdx, qreal alpha, qreal beta) { 41 | qComplex lo = shm[loIdx]; 42 | qComplex hi = shm[hiIdx]; 43 | shm[loIdx] = make_qComplex(alpha * lo.x + beta * hi.y, alpha * lo.y - beta * hi.x); 44 | shm[hiIdx] = make_qComplex(alpha * hi.x + beta * lo.y, alpha * hi.y - beta * lo.x); 45 | } 46 | 47 | __device__ __forceinline__ void RYSingle(int loIdx, int hiIdx, qreal alpha, qreal beta) { 48 | qComplex lo = shm[loIdx]; 
49 | qComplex hi = shm[hiIdx]; 50 | shm[loIdx] = make_qComplex(alpha * lo.x - beta * hi.x, alpha * lo.y - beta * hi.y); 51 | shm[hiIdx] = make_qComplex(beta * lo.x + alpha * hi.x, beta * lo.y + alpha * hi.y); 52 | } 53 | 54 | __device__ __forceinline__ void RZSingle(int loIdx, int hiIdx, qreal alpha, qreal beta){ 55 | qComplex lo = shm[loIdx]; 56 | qComplex hi = shm[hiIdx]; 57 | shm[loIdx] = make_qComplex(alpha * lo.x + beta * lo.y, alpha * lo.y - beta * lo.x); 58 | shm[hiIdx] = make_qComplex(alpha * hi.x - beta * hi.y, alpha * hi.y + beta * hi.x); 59 | } 60 | 61 | __device__ __forceinline__ void RZLo(int loIdx, qreal alpha, qreal beta) { 62 | qComplex lo = shm[loIdx]; 63 | shm[loIdx] = make_qComplex(alpha * lo.x + beta * lo.y, alpha * lo.y - beta * lo.x); 64 | } 65 | 66 | __device__ __forceinline__ void RZHi(int hiIdx, qreal alpha, qreal beta){ 67 | qComplex hi = shm[hiIdx]; 68 | shm[hiIdx] = make_qComplex(alpha * hi.x - beta * hi.y, alpha * hi.y + beta * hi.x); 69 | } 70 | 71 | #define COMPLEX_MULTIPLY_REAL(v0, v1) (v0.x * v1.x - v0.y * v1.y) 72 | #define COMPLEX_MULTIPLY_IMAG(v0, v1) (v0.x * v1.y + v0.y * v1.x) 73 | 74 | __device__ __forceinline__ void U1Hi(int hiIdx, qComplex p) { 75 | qComplex hi = shm[hiIdx]; 76 | shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(hi, p), COMPLEX_MULTIPLY_IMAG(hi, p)); 77 | } 78 | 79 | __device__ __forceinline__ void USingle(int loIdx, int hiIdx, qComplex v00, qComplex v01, qComplex v10, qComplex v11) { 80 | qComplex lo = shm[loIdx]; 81 | qComplex hi = shm[hiIdx]; 82 | shm[loIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(lo, v00) + COMPLEX_MULTIPLY_REAL(hi, v01), 83 | COMPLEX_MULTIPLY_IMAG(lo, v00) + COMPLEX_MULTIPLY_IMAG(hi, v01)); 84 | shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(lo, v10) + COMPLEX_MULTIPLY_REAL(hi, v11), 85 | COMPLEX_MULTIPLY_IMAG(lo, v10) + COMPLEX_MULTIPLY_IMAG(hi, v11)); 86 | } 87 | 88 | __device__ __forceinline__ void HSingle(int loIdx, int hiIdx) { 89 | qComplex lo = shm[loIdx]; 90 | qComplex hi = 
shm[hiIdx]; 91 | shm[loIdx] = make_qComplex(recRoot2 * (lo.x + hi.x), recRoot2 * (lo.y + hi.y)); 92 | shm[hiIdx] = make_qComplex(recRoot2 * (lo.x - hi.x), recRoot2 * (lo.y - hi.y)); 93 | } 94 | 95 | __device__ __forceinline__ void SHi(int hiIdx) { 96 | qComplex hi = shm[hiIdx]; 97 | shm[hiIdx] = make_qComplex(-hi.y, hi.x); 98 | } 99 | 100 | __device__ __forceinline__ void SDGHi(int hiIdx) { 101 | qComplex hi = shm[hiIdx]; 102 | shm[hiIdx] = make_qComplex(hi.y, -hi.x); 103 | } 104 | 105 | __device__ __forceinline__ void THi(int hiIdx) { 106 | qComplex hi = shm[hiIdx]; 107 | shm[hiIdx] = make_qComplex(recRoot2 * (hi.x - hi.y), recRoot2 * (hi.x + hi.y)); 108 | } 109 | 110 | __device__ __forceinline__ void TDGHi(int hiIdx) { 111 | qComplex hi = shm[hiIdx]; 112 | shm[hiIdx] = make_qComplex(recRoot2 * (hi.x + hi.y), recRoot2 * (hi.x - hi.y)); 113 | } 114 | 115 | __device__ __forceinline__ void GIISingle(int loIdx, int hiIdx) { 116 | qComplex lo = shm[loIdx]; 117 | shm[loIdx] = make_qComplex(-lo.y, lo.x); 118 | qComplex hi = shm[hiIdx]; 119 | shm[hiIdx] = make_qComplex(-hi.y, hi.x); 120 | } 121 | 122 | __device__ __forceinline__ void GII(int idx) { 123 | qComplex v = shm[idx]; 124 | shm[idx] = make_qComplex(-v.y, v.x); 125 | } 126 | 127 | __device__ __forceinline__ void GZZSingle(int loIdx, int hiIdx) { 128 | qComplex lo = shm[loIdx]; 129 | shm[loIdx] = make_qComplex(-lo.x, -lo.y); 130 | qComplex hi = shm[hiIdx]; 131 | shm[hiIdx] = make_qComplex(-hi.x, -hi.y); 132 | } 133 | 134 | __device__ __forceinline__ void GZZ(int idx) { 135 | qComplex v = shm[idx]; 136 | shm[idx] = make_qComplex(-v.x, -v.y); 137 | } 138 | 139 | __device__ __forceinline__ void GCCSingle(int loIdx, int hiIdx, qComplex p) { 140 | qComplex lo = shm[loIdx]; 141 | shm[loIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(lo, p), COMPLEX_MULTIPLY_IMAG(lo, p)); 142 | qComplex hi = shm[hiIdx]; 143 | shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(hi, p), COMPLEX_MULTIPLY_IMAG(hi, p)); 144 | } 145 | 146 | __device__ 
__forceinline__ void GCC(int idx, qComplex p) { 147 | qComplex v = shm[idx]; 148 | shm[idx] = make_qComplex(COMPLEX_MULTIPLY_REAL(v, p), COMPLEX_MULTIPLY_IMAG(v, p)); 149 | } 150 | 151 | #define FOLLOW_NEXT(TYPE) \ 152 | case GateType::TYPE: // no break 153 | 154 | #define CASE_CONTROL(TYPE, OP) \ 155 | case GateType::TYPE: { \ 156 | for (int j = threadIdx.x; j < m; j += blockSize) { \ 157 | int lo = ((j >> smallQubit) << (smallQubit + 1)) | (j & maskSmall); \ 158 | lo = ((lo >> largeQubit) << (largeQubit + 1)) | (lo & maskLarge); \ 159 | lo |= 1 << controlQubit; \ 160 | int hi = lo | (1 << targetQubit); \ 161 | OP; \ 162 | } \ 163 | break; \ 164 | } 165 | 166 | #define CASE_SINGLE(TYPE, OP) \ 167 | case GateType::TYPE: { \ 168 | for (int j = threadIdx.x; j < m; j += blockSize) { \ 169 | int lo = ((j >> targetQubit) << (targetQubit + 1)) | (j & maskTarget); \ 170 | int hi = lo | (1 << targetQubit); \ 171 | OP; \ 172 | } \ 173 | break;\ 174 | } 175 | 176 | #define CASE_LO_HI(TYPE, OP_LO, OP_HI) \ 177 | case GateType::TYPE: { \ 178 | int m = 1 << LOCAL_QUBIT_SIZE; \ 179 | if (!isHighBlock){ \ 180 | for (int j = threadIdx.x; j < m; j += blockSize) { \ 181 | OP_LO; \ 182 | } \ 183 | } else { \ 184 | for (int j = threadIdx.x; j < m; j += blockSize) { \ 185 | OP_HI; \ 186 | } \ 187 | } \ 188 | break; \ 189 | } 190 | 191 | #define CASE_SKIPLO_HI(TYPE, OP_HI) \ 192 | case GateType::TYPE: { \ 193 | if (!isHighBlock) continue; \ 194 | int m = 1 << LOCAL_QUBIT_SIZE; \ 195 | for (int j = threadIdx.x; j < m; j += blockSize) { \ 196 | OP_HI; \ 197 | } \ 198 | break; \ 199 | } 200 | 201 | #define LOHI_SAME(TYPE, OP) \ 202 | case GateType::TYPE: { \ 203 | int m = 1 << LOCAL_QUBIT_SIZE; \ 204 | for (int j = threadIdx.x; j < m; j += blockSize) { \ 205 | OP; \ 206 | } \ 207 | break; \ 208 | } 209 | 210 | #define ID_BREAK() \ 211 | case GateType::ID: { \ 212 | break; \ 213 | } 214 | 215 | template 216 | __device__ void doCompute(int numGates, int* loArr, int* shiftAt) { 217 | for (int 
i = 0; i < numGates; i++) { 218 | int controlQubit = deviceGates[i].controlQubit; 219 | int targetQubit = deviceGates[i].targetQubit; 220 | char controlIsGlobal = deviceGates[i].controlIsGlobal; 221 | char targetIsGlobal = deviceGates[i].targetIsGlobal; 222 | if (deviceGates[i].type == GateType::CCX) { 223 | int controlQubit2 = deviceGates[i].controlQubit2; 224 | int control2IsGlobal = deviceGates[i].control2IsGlobal; 225 | if (!control2IsGlobal) { 226 | int m = 1 << (LOCAL_QUBIT_SIZE - 1); 227 | assert(!controlIsGlobal && !targetIsGlobal); 228 | assert(deviceGates[i].type == GateType::CCX); 229 | int maskTarget = (1 << targetQubit) - 1; 230 | for (int j = threadIdx.x; j < m; j += blockSize) { 231 | int lo = ((j >> targetQubit) << (targetQubit + 1)) | (j & maskTarget); 232 | if (!(lo >> controlQubit & 1) || !(lo >> controlQubit2 & 1)) 233 | continue; 234 | int hi = lo | (1 << targetQubit); 235 | XSingle(lo, hi); 236 | } 237 | continue; 238 | } 239 | if (control2IsGlobal == 1 && !((blockIdx.x >> controlQubit2) & 1)) { 240 | continue; 241 | } 242 | } 243 | if (!controlIsGlobal) { 244 | if (!targetIsGlobal) { 245 | int m = 1 << (LOCAL_QUBIT_SIZE - 2); 246 | int smallQubit = controlQubit > targetQubit ? targetQubit : controlQubit; 247 | int largeQubit = controlQubit > targetQubit ? 
controlQubit : targetQubit; 248 | int maskSmall = (1 << smallQubit) - 1; 249 | int maskLarge = (1 << largeQubit) - 1; 250 | switch (deviceGates[i].type) { 251 | FOLLOW_NEXT(CCX) 252 | CASE_CONTROL(CNOT, XSingle(lo, hi)) 253 | CASE_CONTROL(CY, YSingle(lo, hi)) 254 | CASE_CONTROL(CZ, ZHi(hi)) 255 | CASE_CONTROL(CRX, RXSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i01)) 256 | CASE_CONTROL(CRY, RYSingle(lo, hi, deviceGates[i].r00, deviceGates[i].r10)) 257 | CASE_CONTROL(CU1, U1Hi(hi, make_qComplex(deviceGates[i].r11, deviceGates[i].i11))) 258 | CASE_CONTROL(CRZ, RZSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i00)) 259 | default: { 260 | assert(false); 261 | } 262 | } 263 | } else { 264 | assert(deviceGates[i].type == GateType::CZ || deviceGates[i].type == GateType::CU1 || deviceGates[i].type == GateType::CRZ); 265 | bool isHighBlock = (blockIdx.x >> targetQubit) & 1; 266 | int m = 1 << (LOCAL_QUBIT_SIZE - 1); 267 | int maskControl = (1 << controlQubit) - 1; 268 | if (!isHighBlock){ 269 | if (deviceGates[i].type == GateType::CRZ) { 270 | for (int j = threadIdx.x; j < m; j += blockSize) { 271 | int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl) | (1 << controlQubit); 272 | RZLo(x, deviceGates[i].r00, -deviceGates[i].i00); 273 | } 274 | } 275 | } else { 276 | switch (deviceGates[i].type) { 277 | case GateType::CZ: { 278 | for (int j = threadIdx.x; j < m; j += blockSize) { 279 | int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl) | (1 << controlQubit); 280 | ZHi(x); 281 | } 282 | break; 283 | } 284 | case GateType::CU1: { 285 | for (int j = threadIdx.x; j < m; j += blockSize) { 286 | int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl) | (1 << controlQubit); 287 | U1Hi(x, make_qComplex(deviceGates[i].r11, deviceGates[i].i11)); 288 | } 289 | break; 290 | } 291 | case GateType::CRZ: { 292 | for (int j = threadIdx.x; j < m; j += blockSize) { 293 | int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & 
maskControl) | (1 << controlQubit); 294 | RZHi(x, deviceGates[i].r00, -deviceGates[i].i00); 295 | } 296 | break; 297 | } 298 | default: { 299 | assert(false); 300 | } 301 | } 302 | } 303 | } 304 | } else { 305 | if (controlIsGlobal == 1 && !((blockIdx.x >> controlQubit) & 1)) { 306 | continue; 307 | } 308 | if (!targetIsGlobal) { 309 | int m = 1 << (LOCAL_QUBIT_SIZE - 1); 310 | int maskTarget = (1 << targetQubit) - 1; 311 | switch (deviceGates[i].type) { 312 | FOLLOW_NEXT(GOC) 313 | FOLLOW_NEXT(CU1) 314 | CASE_SINGLE(U1, U1Hi(hi, make_qComplex(deviceGates[i].r11, deviceGates[i].i11))) 315 | FOLLOW_NEXT(U2) 316 | CASE_SINGLE(U3, USingle(lo, hi, make_qComplex(deviceGates[i].r00, deviceGates[i].i00), make_qComplex(deviceGates[i].r01, deviceGates[i].i01), make_qComplex(deviceGates[i].r10, deviceGates[i].i10), make_qComplex(deviceGates[i].r11, deviceGates[i].i11))); 317 | CASE_SINGLE(H, HSingle(lo, hi)) 318 | FOLLOW_NEXT(X) 319 | FOLLOW_NEXT(CNOT) 320 | CASE_SINGLE(CCX, XSingle(lo, hi)) 321 | FOLLOW_NEXT(Y) 322 | CASE_SINGLE(CY, YSingle(lo, hi)) 323 | FOLLOW_NEXT(Z) 324 | CASE_SINGLE(CZ, ZHi(hi)) 325 | FOLLOW_NEXT(RX) 326 | CASE_SINGLE(CRX, RXSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i01)) 327 | FOLLOW_NEXT(RY) 328 | CASE_SINGLE(CRY, RYSingle(lo, hi, deviceGates[i].r00, deviceGates[i].r10)) 329 | FOLLOW_NEXT(RZ) 330 | CASE_SINGLE(CRZ, RZSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i00)) 331 | CASE_SINGLE(S, SHi(hi)) 332 | CASE_SINGLE(SDG, SDGHi(hi)) 333 | CASE_SINGLE(T, THi(hi)) 334 | CASE_SINGLE(TDG, TDGHi(hi)) 335 | CASE_SINGLE(GII, GIISingle(lo, hi)) 336 | CASE_SINGLE(GZZ, GZZSingle(lo, hi)) 337 | CASE_SINGLE(GCC, GCCSingle(lo, hi, make_qComplex(deviceGates[i].r00, deviceGates[i].i00))) 338 | ID_BREAK() 339 | default: { 340 | assert(false); 341 | } 342 | } 343 | } else { 344 | bool isHighBlock = (blockIdx.x >> targetQubit) & 1; 345 | switch (deviceGates[i].type) { 346 | FOLLOW_NEXT(RZ) 347 | CASE_LO_HI(CRZ, RZLo(j, deviceGates[i].r00, 
-deviceGates[i].i00), RZHi(j, deviceGates[i].r00, -deviceGates[i].i00)) 348 | FOLLOW_NEXT(Z) 349 | CASE_SKIPLO_HI(CZ, ZHi(j)) 350 | CASE_SKIPLO_HI(S, SHi(j)) 351 | CASE_SKIPLO_HI(SDG, SDGHi(j)) 352 | CASE_SKIPLO_HI(T, THi(j)) 353 | CASE_SKIPLO_HI(TDG, TDGHi(j)) 354 | FOLLOW_NEXT(GOC) 355 | FOLLOW_NEXT(CU1) 356 | CASE_SKIPLO_HI(U1, U1Hi(j, make_qComplex(deviceGates[i].r11, deviceGates[i].i11))) 357 | LOHI_SAME(GII, GII(j)) 358 | LOHI_SAME(GZZ, GZZ(j)) 359 | LOHI_SAME(GCC, GCC(j, make_qComplex(deviceGates[i].r00, deviceGates[i].i00))) 360 | ID_BREAK() 361 | default: { 362 | assert(false); 363 | } 364 | } 365 | } 366 | } 367 | __syncthreads(); 368 | } 369 | } 370 | 371 | __device__ void fetchData(qComplex* a, unsigned int* threadBias, unsigned int idx, unsigned int blockHot, unsigned int enumerate, int numLocalQubits) { 372 | if (threadIdx.x == 0) { 373 | int bid = blockIdx.x; 374 | unsigned int bias = 0; 375 | for (unsigned int bit = 1; bit < (1u << numLocalQubits); bit <<= 1) { 376 | if (blockHot & bit) { 377 | if (bid & 1) 378 | bias |= bit; 379 | bid >>= 1; 380 | } 381 | } 382 | blockBias = bias; 383 | } 384 | __syncthreads(); 385 | unsigned int bias = blockBias | threadBias[threadIdx.x]; 386 | int x; 387 | unsigned int y; 388 | for (x = ((1 << (LOCAL_QUBIT_SIZE - THREAD_DEP)) - 1) << THREAD_DEP | threadIdx.x, y = enumerate; 389 | x >= 0; 390 | x -= (1 << THREAD_DEP), y = enumerate & (y - 1)) { 391 | 392 | shm[x] = a[bias | y]; 393 | } 394 | } 395 | 396 | __device__ void saveData(qComplex* a, unsigned int* threadBias, unsigned int enumerate) { 397 | unsigned int bias = blockBias | threadBias[threadIdx.x]; 398 | int x; 399 | unsigned y; 400 | for (x = ((1 << (LOCAL_QUBIT_SIZE - THREAD_DEP)) - 1) << THREAD_DEP | threadIdx.x, y = enumerate; 401 | x >= 0; 402 | x -= (1 << THREAD_DEP), y = enumerate & (y - 1)) { 403 | 404 | a[bias | y] = shm[x]; 405 | } 406 | } 407 | 408 | template 409 | __global__ void run(qComplex* a, unsigned int* threadBias, int* loArr, int* 
shiftAt, int numLocalQubits, int numGates, unsigned int blockHot, unsigned int enumerate) { 410 | unsigned int idx = (unsigned int) blockIdx.x * blockSize + threadIdx.x; 411 | fetchData(a, threadBias, idx, blockHot, enumerate, numLocalQubits); 412 | __syncthreads(); 413 | doCompute(numGates, loArr, shiftAt); 414 | __syncthreads(); 415 | saveData(a, threadBias, enumerate); 416 | } 417 | 418 | #if BACKEND == 1 || BACKEND == 3 || BACKEND == 4 || BACKEND == 5 419 | void initControlIdx() { 420 | loIdx_device.resize(MyGlobalVars::localGPUs); 421 | shiftAt_device.resize(MyGlobalVars::localGPUs); 422 | } 423 | #endif 424 | 425 | void copyGatesToSymbol(KernelGate* hostGates, int numGates, cudaStream_t& stream, int gpuID) { 426 | checkCudaErrors(cudaMemcpyToSymbolAsync(deviceGates, hostGates + gpuID * numGates, sizeof(KernelGate) * numGates, 0, cudaMemcpyDefault, stream)); 427 | } 428 | 429 | void launchExecutor(int gridDim, qComplex* deviceStateVec, unsigned int* threadBias, int numLocalQubits, int numGates, unsigned int blockHot, unsigned int enumerate, cudaStream_t& stream, int gpuID) { 430 | run<1<<<>> 431 | (deviceStateVec, threadBias, loIdx_device[gpuID], shiftAt_device[gpuID], numLocalQubits, numGates, blockHot, enumerate); 432 | } -------------------------------------------------------------------------------- /src/kernels/lookup.cu: -------------------------------------------------------------------------------- 1 | #include "kernel.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "gate.h" 7 | #include "executor.h" 8 | using namespace std; 9 | 10 | extern __shared__ qComplex shm[1< loIdx_device; 17 | std::vector shiftAt_device; 18 | 19 | 20 | __device__ __forceinline__ void XSingle(int loIdx, int hiIdx) { 21 | qComplex v = shm[loIdx]; 22 | shm[loIdx] = shm[hiIdx]; 23 | shm[hiIdx] = v; 24 | } 25 | 26 | __device__ __forceinline__ void YSingle(int loIdx, int hiIdx) { 27 | qComplex lo = shm[loIdx]; 28 | qComplex hi = shm[hiIdx]; 29 | 30 | shm[loIdx] = 
make_qComplex(hi.y, -hi.x); 31 | shm[hiIdx] = make_qComplex(-lo.y, lo.x); 32 | } 33 | 34 | __device__ __forceinline__ void ZHi(int hiIdx) { 35 | qComplex v = shm[hiIdx]; 36 | shm[hiIdx] = make_qComplex(-v.x, -v.y); 37 | } 38 | 39 | 40 | __device__ __forceinline__ void RXSingle(int loIdx, int hiIdx, qreal alpha, qreal beta) { 41 | qComplex lo = shm[loIdx]; 42 | qComplex hi = shm[hiIdx]; 43 | shm[loIdx] = make_qComplex(alpha * lo.x + beta * hi.y, alpha * lo.y - beta * hi.x); 44 | shm[hiIdx] = make_qComplex(alpha * hi.x + beta * lo.y, alpha * hi.y - beta * lo.x); 45 | } 46 | 47 | __device__ __forceinline__ void RYSingle(int loIdx, int hiIdx, qreal alpha, qreal beta) { 48 | qComplex lo = shm[loIdx]; 49 | qComplex hi = shm[hiIdx]; 50 | shm[loIdx] = make_qComplex(alpha * lo.x - beta * hi.x, alpha * lo.y - beta * hi.y); 51 | shm[hiIdx] = make_qComplex(beta * lo.x + alpha * hi.x, beta * lo.y + alpha * hi.y); 52 | } 53 | 54 | __device__ __forceinline__ void RZSingle(int loIdx, int hiIdx, qreal alpha, qreal beta){ 55 | qComplex lo = shm[loIdx]; 56 | qComplex hi = shm[hiIdx]; 57 | shm[loIdx] = make_qComplex(alpha * lo.x + beta * lo.y, alpha * lo.y - beta * lo.x); 58 | shm[hiIdx] = make_qComplex(alpha * hi.x - beta * hi.y, alpha * hi.y + beta * hi.x); 59 | } 60 | 61 | __device__ __forceinline__ void RZLo(int loIdx, qreal alpha, qreal beta) { 62 | qComplex lo = shm[loIdx]; 63 | shm[loIdx] = make_qComplex(alpha * lo.x + beta * lo.y, alpha * lo.y - beta * lo.x); 64 | } 65 | 66 | __device__ __forceinline__ void RZHi(int hiIdx, qreal alpha, qreal beta){ 67 | qComplex hi = shm[hiIdx]; 68 | shm[hiIdx] = make_qComplex(alpha * hi.x - beta * hi.y, alpha * hi.y + beta * hi.x); 69 | } 70 | 71 | #define COMPLEX_MULTIPLY_REAL(v0, v1) (v0.x * v1.x - v0.y * v1.y) 72 | #define COMPLEX_MULTIPLY_IMAG(v0, v1) (v0.x * v1.y + v0.y * v1.x) 73 | 74 | __device__ __forceinline__ void U1Hi(int hiIdx, qComplex p) { 75 | qComplex hi = shm[hiIdx]; 76 | shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(hi, 
p), COMPLEX_MULTIPLY_IMAG(hi, p)); 77 | } 78 | 79 | __device__ __forceinline__ void USingle(int loIdx, int hiIdx, qComplex v00, qComplex v01, qComplex v10, qComplex v11) { 80 | qComplex lo = shm[loIdx]; 81 | qComplex hi = shm[hiIdx]; 82 | shm[loIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(lo, v00) + COMPLEX_MULTIPLY_REAL(hi, v01), 83 | COMPLEX_MULTIPLY_IMAG(lo, v00) + COMPLEX_MULTIPLY_IMAG(hi, v01)); 84 | shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(lo, v10) + COMPLEX_MULTIPLY_REAL(hi, v11), 85 | COMPLEX_MULTIPLY_IMAG(lo, v10) + COMPLEX_MULTIPLY_IMAG(hi, v11)); 86 | } 87 | 88 | __device__ __forceinline__ void HSingle(int loIdx, int hiIdx) { 89 | qComplex lo = shm[loIdx]; 90 | qComplex hi = shm[hiIdx]; 91 | shm[loIdx] = make_qComplex(recRoot2 * (lo.x + hi.x), recRoot2 * (lo.y + hi.y)); 92 | shm[hiIdx] = make_qComplex(recRoot2 * (lo.x - hi.x), recRoot2 * (lo.y - hi.y)); 93 | } 94 | 95 | __device__ __forceinline__ void SHi(int hiIdx) { 96 | qComplex hi = shm[hiIdx]; 97 | shm[hiIdx] = make_qComplex(-hi.y, hi.x); 98 | } 99 | 100 | __device__ __forceinline__ void SDGHi(int hiIdx) { 101 | qComplex hi = shm[hiIdx]; 102 | shm[hiIdx] = make_qComplex(hi.y, -hi.x); 103 | } 104 | 105 | __device__ __forceinline__ void THi(int hiIdx) { 106 | qComplex hi = shm[hiIdx]; 107 | shm[hiIdx] = make_qComplex(recRoot2 * (hi.x - hi.y), recRoot2 * (hi.x + hi.y)); 108 | } 109 | 110 | __device__ __forceinline__ void TDGHi(int hiIdx) { 111 | qComplex hi = shm[hiIdx]; 112 | shm[hiIdx] = make_qComplex(recRoot2 * (hi.x + hi.y), recRoot2 * (hi.x - hi.y)); 113 | } 114 | 115 | __device__ __forceinline__ void GIISingle(int loIdx, int hiIdx) { 116 | qComplex lo = shm[loIdx]; 117 | shm[loIdx] = make_qComplex(-lo.y, lo.x); 118 | qComplex hi = shm[hiIdx]; 119 | shm[hiIdx] = make_qComplex(-hi.y, hi.x); 120 | } 121 | 122 | __device__ __forceinline__ void GII(int idx) { 123 | qComplex v = shm[idx]; 124 | shm[idx] = make_qComplex(-v.y, v.x); 125 | } 126 | 127 | __device__ __forceinline__ void GZZSingle(int 
loIdx, int hiIdx) { 128 | qComplex lo = shm[loIdx]; 129 | shm[loIdx] = make_qComplex(-lo.x, -lo.y); 130 | qComplex hi = shm[hiIdx]; 131 | shm[hiIdx] = make_qComplex(-hi.x, -hi.y); 132 | } 133 | 134 | __device__ __forceinline__ void GZZ(int idx) { 135 | qComplex v = shm[idx]; 136 | shm[idx] = make_qComplex(-v.x, -v.y); 137 | } 138 | 139 | __device__ __forceinline__ void GCCSingle(int loIdx, int hiIdx, qComplex p) { 140 | qComplex lo = shm[loIdx]; 141 | shm[loIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(lo, p), COMPLEX_MULTIPLY_IMAG(lo, p)); 142 | qComplex hi = shm[hiIdx]; 143 | shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(hi, p), COMPLEX_MULTIPLY_IMAG(hi, p)); 144 | } 145 | 146 | __device__ __forceinline__ void GCC(int idx, qComplex p) { 147 | qComplex v = shm[idx]; 148 | shm[idx] = make_qComplex(COMPLEX_MULTIPLY_REAL(v, p), COMPLEX_MULTIPLY_IMAG(v, p)); 149 | } 150 | 151 | #define FOLLOW_NEXT(TYPE) \ 152 | case GateType::TYPE: // no break 153 | 154 | #define CASE_CONTROL(TYPE, OP) \ 155 | case GateType::TYPE: { \ 156 | OP; \ 157 | lo += add; hi += add; \ 158 | OP; \ 159 | break; \ 160 | } 161 | 162 | #define CASE_SINGLE(TYPE, OP) \ 163 | case GateType::TYPE: { \ 164 | for (int task = 0; task < 4; task ++) { \ 165 | OP; \ 166 | lo += add[task]; hi += add[task]; \ 167 | } \ 168 | break;\ 169 | } 170 | 171 | #define CASE_LO_HI(TYPE, OP_LO, OP_HI) \ 172 | case GateType::TYPE: { \ 173 | int m = 1 << LOCAL_QUBIT_SIZE; \ 174 | if (!isHighBlock){ \ 175 | for (int j = threadIdx.x; j < m; j += blockSize) { \ 176 | OP_LO; \ 177 | } \ 178 | } else { \ 179 | for (int j = threadIdx.x; j < m; j += blockSize) { \ 180 | OP_HI; \ 181 | } \ 182 | } \ 183 | break; \ 184 | } 185 | 186 | #define CASE_SKIPLO_HI(TYPE, OP_HI) \ 187 | case GateType::TYPE: { \ 188 | if (!isHighBlock) continue; \ 189 | int m = 1 << LOCAL_QUBIT_SIZE; \ 190 | for (int j = threadIdx.x; j < m; j += blockSize) { \ 191 | OP_HI; \ 192 | } \ 193 | break; \ 194 | } 195 | 196 | #define LOHI_SAME(TYPE, OP) \ 197 | 
case GateType::TYPE: { \
    int m = 1 << LOCAL_QUBIT_SIZE; \
    for (int j = threadIdx.x; j < m; j += blockSize) { \
        OP; \
    } \
    break; \
}

/* Case label for the identity gate: nothing to do. */
#define ID_BREAK() \
case GateType::ID: { \
    break; \
}

/*
 * Applies gates deviceGates[0 .. numGates) in order to the amplitudes held in
 * shared memory (`shm`).
 *
 * For each gate, the control/target "is global" flags select one of the code
 * paths below.  When a qubit is flagged global, the matching bit of
 * blockIdx.x is inspected instead of an index bit inside `shm`.  Each path
 * that touches `shm` ends at the __syncthreads() at the bottom of the loop,
 * so the next gate observes a consistent buffer.
 *
 * The `add` strides (512/256/128 and the add[] table) are literal constants
 * keyed on qubit positions 8 and 9; presumably they assume
 * LOCAL_QUBIT_SIZE == 10 with 128 threads per block -- TODO confirm.
 *
 * `loArr` and `shiftAt` are accepted but not referenced in this body.
 *
 * NOTE(review): the bare `template` keyword below has lost its parameter
 * list in this copy of the file; the body uses `blockSize`, which is
 * presumably that parameter.
 */
template
__device__ void doCompute(int numGates, int* loArr, int* shiftAt) {
    for (int i = 0; i < numGates; i++) {
        int controlQubit = deviceGates[i].controlQubit;
        int targetQubit = deviceGates[i].targetQubit;
        char controlIsGlobal = deviceGates[i].controlIsGlobal;
        char targetIsGlobal = deviceGates[i].targetIsGlobal;
        if (deviceGates[i].type == GateType::CCX) {
            int controlQubit2 = deviceGates[i].controlQubit2;
            int control2IsGlobal = deviceGates[i].control2IsGlobal;
            if (!control2IsGlobal) {
                // All three qubits are local: swap every (lo, hi) pair whose
                // two control bits are both set.
                int m = 1 << (LOCAL_QUBIT_SIZE - 1);
                assert(!controlIsGlobal && !targetIsGlobal);
                assert(deviceGates[i].type == GateType::CCX);
                int maskTarget = (1 << targetQubit) - 1;
                for (int j = threadIdx.x; j < m; j += blockSize) {
                    // Insert a zero bit at position targetQubit.
                    int lo = ((j >> targetQubit) << (targetQubit + 1)) | (j & maskTarget);
                    if (!(lo >> controlQubit & 1) || !(lo >> controlQubit2 & 1))
                        continue;
                    int hi = lo | (1 << targetQubit);
                    XSingle(lo, hi);
                }
                // This path wrote to shm and skips the barrier at the end of
                // the loop body, so synchronise here before the next gate
                // reads the buffer.  The branch depends only on
                // deviceGates[i], so every thread of the block reaches it.
                __syncthreads();
                continue;
            }
            // Second control is global and not set for this block: no-op.
            if (control2IsGlobal == 1 && !((blockIdx.x >> controlQubit2) & 1)) {
                continue;
            }
        }
        if (!controlIsGlobal) {
            if (!targetIsGlobal) {
                // Local control, local target.
                int m = 1 << (LOCAL_QUBIT_SIZE - 2);
                int smallQubit = controlQubit > targetQubit ? targetQubit : controlQubit;
                int largeQubit = controlQubit > targetQubit ? controlQubit : targetQubit;
                int maskSmall = (1 << smallQubit) - 1;
                int maskLarge = (1 << largeQubit) - 1;
                // Insert zero bits at both qubit positions, then force the
                // control bit to 1.
                int lo = ((threadIdx.x >> smallQubit) << (smallQubit + 1)) | (threadIdx.x & maskSmall);
                lo = ((lo >> largeQubit) << (largeQubit + 1)) | (lo & maskLarge);
                lo |= 1 << controlQubit;
                int hi = lo | (1 << targetQubit);
                // Offset to the second pair handled by this thread.
                int add = 512;
                if (controlQubit == 9 || targetQubit == 9) {
                    add = 256;
                    if (controlQubit == 8 || targetQubit == 8)
                        add = 128;
                }
                switch (deviceGates[i].type) {
                    FOLLOW_NEXT(CCX)
                    CASE_CONTROL(CNOT, XSingle(lo, hi))
                    CASE_CONTROL(CY, YSingle(lo, hi))
                    CASE_CONTROL(CZ, ZHi(hi))
                    CASE_CONTROL(CRX, RXSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i01))
                    CASE_CONTROL(CRY, RYSingle(lo, hi, deviceGates[i].r00, deviceGates[i].r10))
                    CASE_CONTROL(CU1, U1Hi(hi, make_qComplex(deviceGates[i].r11, deviceGates[i].i11)))
                    CASE_CONTROL(CRZ, RZSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i00))
                    default: {
                        assert(false);
                    }
                }
            } else {
                // Local control, global target: only diagonal gates are
                // accepted here, and they act on entries whose control bit
                // is set.
                assert(deviceGates[i].type == GateType::CZ || deviceGates[i].type == GateType::CU1 || deviceGates[i].type == GateType::CRZ);
                bool isHighBlock = (blockIdx.x >> targetQubit) & 1;
                int m = 1 << (LOCAL_QUBIT_SIZE - 1);
                int maskControl = (1 << controlQubit) - 1;
                if (!isHighBlock){
                    if (deviceGates[i].type == GateType::CRZ) {
                        for (int j = threadIdx.x; j < m; j += blockSize) {
                            int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl) | (1 << controlQubit);
                            RZLo(x, deviceGates[i].r00, -deviceGates[i].i00);
                        }
                    }
                } else {
                    switch (deviceGates[i].type) {
                        case GateType::CZ: {
                            for (int j = threadIdx.x; j < m; j += blockSize) {
                                int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl) | (1 << controlQubit);
                                ZHi(x);
                            }
                            break;
                        }
                        case GateType::CU1: {
                            for (int j = threadIdx.x; j < m; j += blockSize) {
                                int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl) | (1 << controlQubit);
                                U1Hi(x, make_qComplex(deviceGates[i].r11, deviceGates[i].i11));
                            }
                            break;
                        }
                        case GateType::CRZ: {
                            for (int j = threadIdx.x; j < m; j += blockSize) {
                                int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl) | (1 << controlQubit);
                                RZHi(x, deviceGates[i].r00, -deviceGates[i].i00);
                            }
                            break;
                        }
                        default: {
                            assert(false);
                        }
                    }
                }
            }
        } else {
            // Control is flagged global (or the gate has no local control).
            // With flag value 1 the gate only fires in blocks whose control
            // bit is set.
            if (controlIsGlobal == 1 && !((blockIdx.x >> controlQubit) & 1)) {
                continue;
            }
            if (!targetIsGlobal) {
                // Local target: each thread handles four (lo, hi) pairs.
                int m = 1 << (LOCAL_QUBIT_SIZE - 1);
                int maskTarget = (1 << targetQubit) - 1;
                int add[4];
                if (targetQubit < 8) {
                    add[0] = add[1] = add[2] = 256;
                } else if (targetQubit == 8) {
                    add[0] = 128; add[1] = 384; add[2] = 128;
                } else { // targetQubit == 9
                    add[0] = add[1] = add[2] = 128;
                }
                // CASE_SINGLE also reads add[3] after its last OP; the
                // resulting lo/hi are never used, but give the slot a
                // defined value instead of reading an indeterminate one.
                add[3] = 0;
                int lo = ((threadIdx.x >> targetQubit) << (targetQubit + 1)) | (threadIdx.x & maskTarget);
                int hi = lo | (1 << targetQubit);
                switch (deviceGates[i].type) {
                    FOLLOW_NEXT(GOC)
                    FOLLOW_NEXT(CU1)
                    CASE_SINGLE(U1, U1Hi(hi, make_qComplex(deviceGates[i].r11, deviceGates[i].i11)))
                    FOLLOW_NEXT(U2)
                    CASE_SINGLE(U3, USingle(lo, hi, make_qComplex(deviceGates[i].r00, deviceGates[i].i00), make_qComplex(deviceGates[i].r01, deviceGates[i].i01), make_qComplex(deviceGates[i].r10, deviceGates[i].i10), make_qComplex(deviceGates[i].r11, deviceGates[i].i11)));
                    CASE_SINGLE(H, HSingle(lo, hi))
                    FOLLOW_NEXT(X)
                    FOLLOW_NEXT(CNOT)
                    CASE_SINGLE(CCX, XSingle(lo, hi))
                    FOLLOW_NEXT(Y)
                    CASE_SINGLE(CY, YSingle(lo, hi))
                    FOLLOW_NEXT(Z)
                    CASE_SINGLE(CZ, ZHi(hi))
                    FOLLOW_NEXT(RX)
                    CASE_SINGLE(CRX, RXSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i01))
                    FOLLOW_NEXT(RY)
                    CASE_SINGLE(CRY, RYSingle(lo, hi, deviceGates[i].r00, deviceGates[i].r10))
                    FOLLOW_NEXT(RZ)
                    CASE_SINGLE(CRZ, RZSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i00))
                    CASE_SINGLE(S, SHi(hi))
                    CASE_SINGLE(SDG, SDGHi(hi))
                    CASE_SINGLE(T, THi(hi))
                    CASE_SINGLE(TDG, TDGHi(hi))
                    CASE_SINGLE(GII, GIISingle(lo, hi))
                    CASE_SINGLE(GZZ, GZZSingle(lo, hi))
                    CASE_SINGLE(GCC, GCCSingle(lo, hi, make_qComplex(deviceGates[i].r00, deviceGates[i].i00)))
                    ID_BREAK()
                    default: {
                        assert(false);
                    }
                }
            } else {
                // Global target: the whole block is either the "low" or the
                // "high" half, so the gate acts on every entry of shm.
                bool isHighBlock = (blockIdx.x >> targetQubit) & 1;
                switch (deviceGates[i].type) {
                    FOLLOW_NEXT(RZ)
                    CASE_LO_HI(CRZ, RZLo(j, deviceGates[i].r00, -deviceGates[i].i00), RZHi(j, deviceGates[i].r00, -deviceGates[i].i00))
                    FOLLOW_NEXT(Z)
                    CASE_SKIPLO_HI(CZ, ZHi(j))
                    CASE_SKIPLO_HI(S, SHi(j))
                    CASE_SKIPLO_HI(SDG, SDGHi(j))
                    CASE_SKIPLO_HI(T, THi(j))
                    CASE_SKIPLO_HI(TDG, TDGHi(j))
                    FOLLOW_NEXT(GOC)
                    FOLLOW_NEXT(CU1)
                    CASE_SKIPLO_HI(U1, U1Hi(j, make_qComplex(deviceGates[i].r11, deviceGates[i].i11)))
                    LOHI_SAME(GII, GII(j))
                    LOHI_SAME(GZZ, GZZ(j))
                    LOHI_SAME(GCC, GCC(j, make_qComplex(deviceGates[i].r00, deviceGates[i].i00)))
                    ID_BREAK()
                    default: {
                        assert(false);
                    }
                }
            }
        }
        __syncthreads();
    }
}

/*
 * Loads this block's slice of the state vector `a` into `shm`.
 *
 * Thread 0 first builds `blockBias` by scattering the bits of blockIdx.x,
 * lowest first, onto the set bits of `blockHot` (only bit positions below
 * numLocalQubits are considered).  After the barrier each thread combines
 * that with its own threadBias[threadIdx.x] and copies
 * 1 << (LOCAL_QUBIT_SIZE - THREAD_DEP) elements: `x` walks shm downwards in
 * steps of 1 << THREAD_DEP while `y` steps through the sub-masks of
 * `enumerate` via y = enumerate & (y - 1).
 *
 * `idx` is accepted but not used.
 */
__device__ void fetchData(qComplex* a, unsigned int* threadBias, unsigned int idx, unsigned int blockHot, unsigned int enumerate, int numLocalQubits) {
    if (threadIdx.x == 0) {
        int bid = blockIdx.x;
        unsigned int bias = 0;
        for (unsigned int bit = 1; bit < (1u << numLocalQubits); bit <<= 1) {
            if (blockHot & bit) {
                if (bid & 1)
                    bias |= bit;
                bid >>= 1;
            }
        }
        blockBias = bias;
    }
    __syncthreads();
    unsigned int bias = blockBias | threadBias[threadIdx.x];
    int x;
    unsigned int y;
    for (x = ((1 << (LOCAL_QUBIT_SIZE - THREAD_DEP)) - 1) << THREAD_DEP | threadIdx.x, y = enumerate;
        x >= 0;
        x -= (1 << THREAD_DEP), y = enumerate & (y - 1)) {

        shm[x] = a[bias | y];
    }
}

/*
 * Writes `shm` back to the state vector `a`, using the same (x, y) walk as
 * fetchData().  Relies on `blockBias` having been set by fetchData() earlier
 * in the same kernel.
 */
__device__ void saveData(qComplex* a, unsigned int* threadBias, unsigned int enumerate) {
    unsigned int bias = blockBias | threadBias[threadIdx.x];
    int x;
    unsigned y;
    for (x = ((1 << (LOCAL_QUBIT_SIZE - THREAD_DEP)) - 1) << THREAD_DEP | threadIdx.x, y = enumerate;
        x >= 0;
        x -= (1 << THREAD_DEP), y = enumerate & (y - 1)) {

        a[bias | y] = shm[x];
    }
}

/*
 * Kernel entry point: load the block's amplitudes, apply all gates, store
 * them back.  Barriers separate the three phases.
 *
 * NOTE(review): as with doCompute, the template parameter list (and the
 * explicit template argument on the doCompute call) appear to have been
 * lost in this copy of the file.
 */
template
__global__ void run(qComplex* a, unsigned int* threadBias, int* loArr, int* shiftAt, int numLocalQubits, int numGates, unsigned int blockHot, unsigned int enumerate) {
    unsigned int idx = (unsigned int) blockIdx.x * blockSize + threadIdx.x;
    fetchData(a, threadBias, idx, blockHot, enumerate, numLocalQubits);
    __syncthreads();
    doCompute(numGates, loArr, shiftAt);
    __syncthreads();
    saveData(a, threadBias, enumerate);
}

#if BACKEND == 1 || BACKEND == 3 || BACKEND == 4 || BACKEND == 5
/* Sizes the per-GPU loIdx / shiftAt pointer tables to one entry per local GPU. */
void initControlIdx() {
    loIdx_device.resize(MyGlobalVars::localGPUs);
    shiftAt_device.resize(MyGlobalVars::localGPUs);
}
#endif

/*
 * Asynchronously copies `numGates` gates for GPU `gpuID` into the
 * `deviceGates` symbol on `stream`.  The host array is laid out as
 * consecutive runs of `numGates` entries, one run per GPU.
 */
void copyGatesToSymbol(KernelGate* hostGates, int numGates, cudaStream_t& stream, int gpuID) {
    checkCudaErrors(cudaMemcpyToSymbolAsync(deviceGates, hostGates + gpuID * numGates, sizeof(KernelGate) * numGates, 0, cudaMemcpyDefault, stream));
}

/*
 * Launches the `run` kernel for GPU `gpuID`.
 *
 * NOTE(review): the template argument and launch configuration on the next
 * line appear to have been truncated in this copy of the file; `gridDim` and
 * `stream` are presumably part of the missing launch configuration.
 */
void launchExecutor(int gridDim, qComplex* deviceStateVec, unsigned int* threadBias, int numLocalQubits, int numGates, unsigned int blockHot, unsigned int enumerate, cudaStream_t& stream, int gpuID) {
    run<1<<<>>
        (deviceStateVec, threadBias, loIdx_device[gpuID], shiftAt_device[gpuID], numLocalQubits,
numGates, blockHot, enumerate); 447 | } -------------------------------------------------------------------------------- /src/logger.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | Logger* Logger::instance = NULL; -------------------------------------------------------------------------------- /src/logger.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | class Logger { 10 | static Logger* instance; 11 | public: 12 | static void add(const char* format, ...) { 13 | #ifdef SHOW_SUMMARY 14 | Logger::init(); 15 | char buffer[1024]; 16 | va_list args; 17 | va_start(args, format); 18 | vsprintf(buffer, format, args); 19 | va_end(args); 20 | instance -> infos.push_back(std::string(buffer)); 21 | #endif 22 | } 23 | 24 | inline static void print() { 25 | #ifdef SHOW_SUMMARY 26 | Logger::init(); 27 | char proc_info[100]; 28 | #if USE_MPI 29 | sprintf(proc_info, "[%d]", MyMPI::rank); 30 | #else 31 | sprintf(proc_info, "%s", ""); // printf("") will cause compilee warning "-Wformat-zero-length" 32 | #endif 33 | for (auto& s: instance -> infos) { 34 | std::cout << "Logger" << proc_info << ": " << s << std::endl; 35 | } 36 | instance -> infos.clear(); 37 | #endif 38 | } 39 | 40 | private: 41 | Logger() = default; 42 | static void init() { 43 | if (instance == NULL) { 44 | instance = new Logger(); 45 | } 46 | } 47 | private: 48 | std::vector infos; 49 | }; 50 | -------------------------------------------------------------------------------- /src/schedule.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "utils.h" 7 | #include "gate.h" 8 | 9 | enum class Backend { 10 | None, PerGate, BLAS 11 | }; 12 | 13 | std::string to_string(Backend b); 14 | 15 | struct State { 16 | 
std::vector pos; 17 | std::vector layout; 18 | State() = default; 19 | State(const State&) = default; 20 | State(const std::vector& p, const std::vector& l): pos(p), layout(l) {}; 21 | State(int numQubits) { 22 | pos.clear(); 23 | for (int i = 0; i < numQubits; i++) { 24 | pos.push_back(i); 25 | } 26 | layout.clear(); 27 | for (int i = 0; i < numQubits; i++) { 28 | layout.push_back(i); 29 | } 30 | } 31 | 32 | std::vector serialize() const; 33 | static State deserialize(const unsigned char* arr, int& cur); 34 | }; 35 | 36 | struct GateGroup { 37 | std::vector gates; 38 | qindex relatedQubits; 39 | State state; 40 | std::vector cuttPerm; 41 | int matQubit; 42 | Backend backend; 43 | 44 | std::vector cuttPlans; 45 | 46 | std::vector> matrix; 47 | std::vector deviceMats; 48 | 49 | GateGroup(GateGroup&&) = default; 50 | GateGroup& operator = (GateGroup&&) = default; 51 | GateGroup(): relatedQubits(0) {} 52 | GateGroup copyGates(); 53 | 54 | static GateGroup merge(const GateGroup& a, const GateGroup& b); 55 | static qindex newRelated(qindex old, const Gate& g, qindex localQubits, bool enableGlobal); 56 | void addGate(const Gate& g, qindex localQubits, bool enableGlobal); 57 | 58 | bool contains(int i) { return (relatedQubits >> i) & 1; } 59 | 60 | std::vector serialize() const; 61 | static GateGroup deserialize(const unsigned char* arr, int& cur); 62 | 63 | State initState(const State& oldState, int numLocalQubits); 64 | State initPerGateState(const State& oldState); 65 | State initBlasState(const State& oldState, int numLocalQubit); 66 | void initCPUMatrix(int numLocalQubit); 67 | void initGPUMatrix(); 68 | void initMatrix(int numLocalQubit); 69 | void getCuttPlanPointers(int numLocalQubits, std::vector &cuttPlanPointers, std::vector &cuttPermPointers, std::vector &locals); 70 | }; 71 | 72 | struct LocalGroup { 73 | State state; 74 | int a2aCommSize; 75 | std::vector a2aComm; 76 | std::vector cuttPerm; 77 | 78 | std::vector overlapGroups; 79 | std::vector fullGroups; 80 
| qindex relatedQubits; 81 | 82 | std::vector cuttPlans; 83 | 84 | LocalGroup() = default; 85 | LocalGroup(LocalGroup&&) = default; 86 | 87 | bool contains(int i) { return (relatedQubits >> i) & 1; } 88 | State initState(const State& oldState, int numQubits, const std::vector& newGlobals, qindex overlapGlobals, qindex overlapRelated); 89 | void getCuttPlanPointers(int numLocalQubits, std::vector &cuttPlanPointers, std::vector &cuttPermPointers, std::vector &locals, bool isFirstGroup = false); 90 | State initFirstGroupState(const State& oldState, int numQubits, const std::vector& newGlobals); 91 | std::vector serialize() const; 92 | static LocalGroup deserialize(const unsigned char* arr, int& cur); 93 | }; 94 | 95 | struct Schedule { 96 | std::vector localGroups; 97 | State finalState; 98 | 99 | void dump(int numQubits); 100 | std::vector serialize() const; 101 | static Schedule deserialize(const unsigned char* arr, int& cur); 102 | void initMatrix(int numQubits); 103 | void initCuttPlans(int numLocalQubits); 104 | }; 105 | 106 | void removeGates(std::vector& remain, const std::vector& remove); // remain := remain - remove -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #include 4 | #include "logger.h" 5 | 6 | namespace MyGlobalVars { 7 | int numGPUs; 8 | int localGPUs; 9 | int bit; 10 | std::unique_ptr streams; 11 | std::unique_ptr streams_comm; 12 | std::unique_ptr blasHandles; 13 | #if USE_MPI 14 | std::unique_ptr ncclComms; 15 | #endif 16 | 17 | void init() { 18 | checkCudaErrors(cudaGetDeviceCount(&localGPUs)); 19 | #if USE_MPI 20 | numGPUs = MyMPI::commSize * localGPUs; 21 | #else 22 | numGPUs = localGPUs; 23 | #endif 24 | Logger::add("Local GPU: %d", localGPUs); 25 | bit = get_bit(numGPUs); 26 | 27 | streams = std::make_unique(MyGlobalVars::localGPUs); 28 | streams_comm = 
std::make_unique(MyGlobalVars::localGPUs); 29 | blasHandles = std::make_unique(MyGlobalVars::localGPUs); 30 | checkCuttErrors(cuttInit()); 31 | for (int i = 0; i < localGPUs; i++) { 32 | checkCudaErrors(cudaSetDevice(i)); 33 | cudaDeviceProp prop; 34 | cudaGetDeviceProperties(&prop, i); 35 | Logger::add("[%d] %s", i, prop.name); 36 | for (int j = 0; j < localGPUs; j++) 37 | if (i != j && (i ^ j) < 4) { 38 | checkCudaErrors(cudaDeviceEnablePeerAccess(j, 0)); 39 | } 40 | checkCudaErrors(cudaStreamCreate(&streams[i]);) 41 | checkBlasErrors(cublasCreate(&blasHandles[i])); 42 | checkBlasErrors(cublasSetStream(blasHandles[i], streams[i])); 43 | checkCudaErrors(cudaStreamCreate(&streams_comm[i])); 44 | checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); 45 | } 46 | #if USE_MPI 47 | checkMPIErrors(MPI_Barrier(MPI_COMM_WORLD)); 48 | ncclUniqueId id; 49 | if (MyMPI::rank == 0) 50 | checkNCCLErrors(ncclGetUniqueId(&id)); 51 | checkMPIErrors(MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); 52 | ncclComms = std::make_unique(MyGlobalVars::localGPUs); 53 | checkNCCLErrors(ncclGroupStart()); 54 | for (int i = 0; i < localGPUs; i++) { 55 | checkCudaErrors(cudaSetDevice(i)); 56 | checkNCCLErrors(ncclCommInitRank(&ncclComms[i], numGPUs, id, MyMPI::rank * localGPUs + i)); 57 | } 58 | checkNCCLErrors(ncclGroupEnd()); 59 | #endif 60 | } 61 | }; 62 | 63 | namespace MyMPI { 64 | int rank; 65 | int commSize; 66 | int commBit; 67 | void init() { 68 | #if USE_MPI 69 | MPI_Init(nullptr, nullptr); 70 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 71 | MPI_Comm_size(MPI_COMM_WORLD, &commSize); 72 | #endif 73 | } 74 | }; 75 | 76 | 77 | qreal zero_wrapper(qreal x) { 78 | const qreal eps = 1e-14; 79 | if (x > -eps && x < eps) { 80 | return 0; 81 | } else { 82 | return x; 83 | } 84 | } 85 | 86 | qComplex operator * (const qComplex& a, const qComplex& b) { 87 | return make_qComplex(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x); 88 | } 89 | 90 | qComplex operator + (const 
qComplex& a, const qComplex& b) { 91 | return make_qComplex(a.x + b.x, a.y + b.y); 92 | } 93 | 94 | bool isUnitary(std::unique_ptr& mat, int n) { 95 | qComplex result[n * n]; 96 | memset(result, 0, sizeof(result)); 97 | for (int k = 0; k < n; k++) 98 | #pragma omp parallel for 99 | for (int i = 0; i < n; i++) 100 | for (int j = 0; j < n; j++) { 101 | qComplex v1 = mat[k * n + i]; 102 | v1.y = - v1.y; 103 | result[i * n + j] = result[i * n + j] + v1 * mat[k * n + j]; 104 | } 105 | bool wa = 0; 106 | qreal eps = 1e-8; 107 | #pragma omp parallel for 108 | for (int i = 0; i < n; i++) { 109 | qComplex val = result[i * n + i]; 110 | if (fabs(val.x - 1) > eps || fabs(val.y) > eps) { 111 | wa = 1; 112 | } 113 | for (int j = 0; j < n; j++) { 114 | if (i == j) 115 | continue; 116 | qComplex val = result[i * n + j]; 117 | if (fabs(val.x) > eps || fabs(val.y) > eps) 118 | wa = 1; 119 | } 120 | } 121 | if (wa) { 122 | for (int i = 0; i < n; i++) { 123 | for (int j = 0; j < n; j++) 124 | printf("(%.2f %.2f) ", result[i * n + j].x, result[i * n + j].y); 125 | printf("\n"); 126 | } 127 | exit(1); 128 | } 129 | return 1; 130 | } 131 | 132 | qComplex make_qComplex(qreal x) { 133 | return make_qComplex(x, 0.0); 134 | } 135 | 136 | bool operator < (const qComplex& a, const qComplex& b) { 137 | return a.x == b.x ? 
a.y < b.y : a.x < b.x; 138 | } 139 | 140 | int get_bit(int n) { 141 | int x = n; 142 | int bit = -1; 143 | while (x) { 144 | bit ++; 145 | x >>= 1; 146 | } 147 | if (n == 0 || (1 << bit) != n) { 148 | printf("Must be pow of two: %d\n", n); 149 | exit(1); 150 | } 151 | return bit; 152 | } -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #if USE_MPI 12 | #include 13 | #include 14 | #endif 15 | 16 | #ifdef USE_DOUBLE 17 | typedef double qreal; 18 | typedef long long qindex; 19 | typedef cuDoubleComplex qComplex; 20 | #define make_qComplex make_cuDoubleComplex 21 | #define MPI_Complex MPI_C_DOUBLE_COMPLEX 22 | #define cublasGEMM cublasZgemm 23 | #define NCCL_FLOAT_TYPE ncclDouble 24 | #else 25 | typedef float qreal; 26 | typedef long long qindex; 27 | typedef cuFloatComplex qComplex; 28 | #define make_qComplex make_cuFloatComplex 29 | #define MPI_Complex MPI_C_COMPLEX 30 | #define cublasGEMM cublasCgemm 31 | #define NCCL_FLOAT_TYPE ncclFloat 32 | #endif 33 | 34 | #define SERIALIZE_STEP(x) { *reinterpret_cast(arr + cur) = x; cur += sizeof(x); } 35 | #define DESERIALIZE_STEP(x) { x = *reinterpret_cast(arr + cur); cur += sizeof(x); } 36 | 37 | #define SERIALIZE_VECTOR(x, result) { \ 38 | auto tmp_chars = reinterpret_cast(x.data()); \ 39 | result.insert(result.end(), tmp_chars, tmp_chars + sizeof(decltype(x)::value_type) * x.size()); \ 40 | } 41 | 42 | #define DESERIALIZE_VECTOR(x, size) { \ 43 | x.resize(size); \ 44 | auto tmp_size = sizeof(decltype(x)::value_type) * size; \ 45 | memcpy(x.data(), arr + cur, tmp_size); \ 46 | cur += tmp_size; \ 47 | } 48 | 49 | 50 | #define UNREACHABLE() { \ 51 | printf("file %s line %i: unreachable!\n", __FILE__, __LINE__); \ 52 | fflush(stdout); \ 53 | exit(1); \ 54 | } 55 | 56 | const int 
LOCAL_QUBIT_SIZE = 10; // is hardcoded 57 | const int BLAS_MAT_LIMIT = BLAS_MAT_LIMIT_DEFINED; 58 | const int THREAD_DEP = THREAD_DEP_DEFINED; // 1 << THREAD_DEP threads per block 59 | const int COALESCE_GLOBAL = COALESCE_GLOBAL_DEFINED; 60 | const int MAX_GATE = 600; 61 | const int MIN_MAT_SIZE = MIN_MAT_SIZE_DEFINED; 62 | 63 | static const char *cublasGetErrorString(cublasStatus_t error) { 64 | switch (error) 65 | { 66 | case CUBLAS_STATUS_SUCCESS: 67 | return "CUBLAS_STATUS_SUCCESS"; 68 | case CUBLAS_STATUS_NOT_INITIALIZED: 69 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 70 | case CUBLAS_STATUS_ALLOC_FAILED: 71 | return "CUBLAS_STATUS_ALLOC_FAILED"; 72 | case CUBLAS_STATUS_INVALID_VALUE: 73 | return "CUBLAS_STATUS_INVALID_VALUE"; 74 | case CUBLAS_STATUS_ARCH_MISMATCH: 75 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 76 | case CUBLAS_STATUS_MAPPING_ERROR: 77 | return "CUBLAS_STATUS_MAPPING_ERROR"; 78 | case CUBLAS_STATUS_EXECUTION_FAILED: 79 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 80 | case CUBLAS_STATUS_INTERNAL_ERROR: 81 | return "CUBLAS_STATUS_INTERNAL_ERROR"; 82 | default: 83 | return ""; 84 | } 85 | UNREACHABLE() 86 | } 87 | 88 | static const char *cuttGetErrorString(cuttResult error) { 89 | switch (error) { 90 | case CUTT_INVALID_PLAN: 91 | return "CUTT_INVALID_PLAN"; 92 | case CUTT_INVALID_PARAMETER: 93 | return "CUTT_INVALID_PARAMETER"; 94 | case CUTT_INVALID_DEVICE: 95 | return "CUTT_INVALID_DEVICE"; 96 | case CUTT_INTERNAL_ERROR: 97 | return "CUTT_INTERNAL_ERROR"; 98 | case CUTT_UNDEFINED_ERROR: 99 | return "CUTT_UNDEFINED_ERROR"; 100 | default: 101 | return ""; 102 | } 103 | UNREACHABLE() 104 | } 105 | 106 | #define checkCudaErrors(stmt) { \ 107 | cudaError_t err = stmt; \ 108 | if (err != cudaSuccess) { \ 109 | fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cudaGetErrorString(err)); \ 110 | exit(1); \ 111 | } \ 112 | } 113 | 114 | #define checkCuttErrors(stmt) { \ 115 | cuttResult err = stmt; 
\ 116 | if (err != CUTT_SUCCESS) { \ 117 | fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cuttGetErrorString(err)); \ 118 | exit(1); \ 119 | } \ 120 | } 121 | 122 | #define checkBlasErrors(stmt) { \ 123 | cublasStatus_t err = stmt; \ 124 | if (err != CUBLAS_STATUS_SUCCESS) { \ 125 | fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cublasGetErrorString(err)); \ 126 | exit(1); \ 127 | } \ 128 | } 129 | 130 | #define checkMPIErrors(stmt) { \ 131 | int err = stmt; \ 132 | if(err != MPI_SUCCESS) { \ 133 | fprintf(stderr, "%s in file %s, function %s, line %i: %04d\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err); \ 134 | exit(1); \ 135 | } \ 136 | } 137 | 138 | #define checkNCCLErrors(stmt) { \ 139 | ncclResult_t err= stmt; \ 140 | if (err != ncclSuccess) { \ 141 | fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, ncclGetErrorString(err)); \ 142 | exit(1); \ 143 | } \ 144 | } 145 | 146 | namespace MyGlobalVars { 147 | extern int numGPUs; 148 | extern int localGPUs; 149 | extern int bit; 150 | extern std::unique_ptr streams; 151 | extern std::unique_ptr streams_comm; 152 | extern std::unique_ptr blasHandles; 153 | #if USE_MPI 154 | extern std::unique_ptr ncclComms; 155 | #endif 156 | void init(); 157 | }; 158 | 159 | namespace MyMPI { 160 | extern int rank; 161 | extern int commSize; 162 | extern int commBit; 163 | void init(); 164 | }; 165 | 166 | template 167 | int bitCount(T x) { 168 | int ret = 0; 169 | for (T i = x; i; i -= i & (-i)) { 170 | ret++; 171 | } 172 | return ret; 173 | } 174 | 175 | qreal zero_wrapper(qreal x); 176 | 177 | qComplex operator * (const qComplex& a, const qComplex& b); 178 | qComplex operator + (const qComplex& a, const qComplex& b); 179 | 180 | bool isUnitary(std::unique_ptr& mat, int n); 181 | 182 | qComplex make_qComplex(qreal x); 183 | bool operator < 
(const qComplex& a, const qComplex& b); 184 | 185 | int get_bit(int n); -------------------------------------------------------------------------------- /tests/input/basis_change_24.qasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fe6754aecac445ac4b97dd30a70acda95de1601c66071ecefab725f2dc98825e 3 | size 99591 4 | -------------------------------------------------------------------------------- /tests/input/basis_change_25.qasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9e8d52fdbb24e5e46135dba5af55cf6a6b7d9c3ec2c112334717038aa9335247 3 | size 108252 4 | -------------------------------------------------------------------------------- /tests/input/basis_change_26.qasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:57ee32983271f084130829d39fb163ce6813e2b917278cb0e505bada417757a8 3 | size 117400 4 | -------------------------------------------------------------------------------- /tests/input/basis_change_27.qasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c22f6398109ad330eafc6fbe4594e49cf5d9d0020196c9e86e222bc1cd8d12a8 3 | size 126859 4 | -------------------------------------------------------------------------------- /tests/input/basis_change_28.qasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8e398bd661c1789dd2b152f3e14e37c7a882d496dd7eb2be90a7719ca102cae2 3 | size 136363 4 | -------------------------------------------------------------------------------- /tests/input/basis_change_29.qasm: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:890b79f448ccab1be394103b563c4d2028b2add24eb328da99261f9c2734a66b 3 | size 146669 4 | -------------------------------------------------------------------------------- /tests/input/basis_change_30.qasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e7373714031b32cafbb3c6ed7b74e6cd11eaa0f53bee92af2805d8eca7a5c39e 3 | size 157405 4 | -------------------------------------------------------------------------------- /tests/input/bv_28.qasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4588f9f0693bd5a1b2714bb09582c72cd2ce14a7d60d223ffa9fca10301e4165 3 | size 954 4 | -------------------------------------------------------------------------------- /tests/input/hidden_shift_28.qasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5febba63b537eca2d1580aa6306014321867e2fbc7195f480781f9bc4f229808 3 | size 1771 4 | -------------------------------------------------------------------------------- /tests/input/qaoa_28.qasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ed161b0a92906f81b6d8242884d85094b7021fe35fc720a4f8f1b0967bbad638 3 | size 44039 4 | -------------------------------------------------------------------------------- /tests/input/qft_28.qasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fae2455fdb0f3828d6bb3b8aebd4762ed10a16250806a7b70377329ee8b71294 3 | size 9634 4 | -------------------------------------------------------------------------------- /tests/input/quantum_volume_28.qasm: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cd7939d6e1f39eed7651cc8c1a7b8270f9a138c7c7929850e827545849390606 3 | size 81064 4 | -------------------------------------------------------------------------------- /tests/input/supremacy_28.qasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c98d622067ac9f0eeece5e57234375c2d721ad64c82c543914e9fec6354bc22d 3 | size 14961 4 | -------------------------------------------------------------------------------- /tests/output/basis_change_25.log: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c141d408cfc426fdd41e8d3beb3adebe15e64fc3d065ba3df1bbfe8e442d72ff 3 | size 6291 4 | -------------------------------------------------------------------------------- /tests/output/basis_change_28.log: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a0567b2dbab58f720a04957a6667ac00a0ce923e994c5aaa8e96dec07be17428 3 | size 6290 4 | -------------------------------------------------------------------------------- /tests/output/basis_change_30.log: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f3620a2c66c37dbd55e140de898625ace24400ab327921603255bad09d53c3d2 3 | size 6291 4 | -------------------------------------------------------------------------------- /tests/output/bv_28.log: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e03bc93f14dc98db5a74d4b35458d158c811b5d2b087a6bed4c60eb6864234c5 3 | size 6403 4 | 
-------------------------------------------------------------------------------- /tests/output/hidden_shift_28.log: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a074bd1082dc9b9a15c0d5b806194e4ff6b5e148b04467997203a888d0e2f6e8 3 | size 6346 4 | -------------------------------------------------------------------------------- /tests/output/qaoa_28.log: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e90c51de4d9027fa716dcbb9a00ac5d43ad7445c5b2d651cf1c0178307e63b97 3 | size 6386 4 | -------------------------------------------------------------------------------- /tests/output/qft_28.log: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:985cd66bcbc2fd4facdafa5fc793b8ad30d96abf28b69d8608d67a34eb39d6e7 3 | size 6290 4 | -------------------------------------------------------------------------------- /tests/output/quantum_volume_28.log: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f128865203bbd925e59e9af603997cfc4ff11fde2b5b366153bdc30771c4ee6d 3 | size 6414 4 | -------------------------------------------------------------------------------- /tests/output/supremacy_28.log: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c2dd93160bc32bb9745f28c1562cee6fc737212e119de6b04e803d6b7e114f82 3 | size 6420 4 | --------------------------------------------------------------------------------