├── .gitattributes
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── README.md
├── benchmark
    ├── bench_backend.sh
    ├── bench_blas_a100.sh
    ├── bench_blas_v100.sh
    ├── bench_comm.sh
    ├── bench_cublas_a100.sh
    ├── bench_cublas_v100.sh
    ├── bench_evaluator_a100.sh
    ├── bench_evaluator_v100.sh
    ├── bench_groupsz.sh
    ├── bench_numgate.sh
    ├── bench_pergate.sh
    ├── bench_scale.sh
    ├── bench_sharemem.sh
    ├── bench_weak.sh
    ├── blas.cu
    ├── plot
    │   └── plot.py
    └── preprocess.sh
├── cmake
    └── FindNccl.cmake
├── evaluator-preprocess
    └── process.cpp
├── main.cpp
├── micro-benchmark
    ├── bench-blas.cpp
    ├── local-ctr.cpp
    ├── local-single.cpp
    └── two-group-h.cpp
├── scripts
    ├── .gitignore
    ├── check.sh
    ├── check_wrapper.sh
    ├── coalescing.sh
    ├── compare.py
    ├── env.sh
    ├── gen_stdout.sh
    ├── gpu-bind.sh
    ├── init.sh
    ├── run-multi-GPU.sh
    ├── run-multi-node.sh
    ├── run-single.sh
    └── run.sh
├── src
    ├── CMakeLists.txt
    ├── circuit.cpp
    ├── circuit.h
    ├── compiler.cpp
    ├── compiler.h
    ├── evaluator.cpp
    ├── evaluator.h
    ├── executor.cpp
    ├── executor.h
    ├── gate.cpp
    ├── gate.h
    ├── kernel.h
    ├── kernelOpt.cu
    ├── kernelSimple.cu
    ├── kernelUtils.cu
    ├── kernels
    │   ├── baseline.cu
    │   ├── lookup.cu
    │   └── swizzle.cu
    ├── logger.cpp
    ├── logger.h
    ├── schedule.cpp
    ├── schedule.h
    ├── utils.cpp
    └── utils.h
└── tests
    ├── input
        ├── basis_change_24.qasm
        ├── basis_change_25.qasm
        ├── basis_change_26.qasm
        ├── basis_change_27.qasm
        ├── basis_change_28.qasm
        ├── basis_change_29.qasm
        ├── basis_change_30.qasm
        ├── bv_28.qasm
        ├── hidden_shift_28.qasm
        ├── qaoa_28.qasm
        ├── qft_28.qasm
        ├── quantum_volume_28.qasm
        └── supremacy_28.qasm
    └── output
        ├── basis_change_25.log
        ├── basis_change_28.log
        ├── basis_change_30.log
        ├── bv_28.log
        ├── hidden_shift_28.log
        ├── qaoa_28.log
        ├── qft_28.log
        ├── quantum_volume_28.log
        └── supremacy_28.log


/.gitattributes:
--------------------------------------------------------------------------------
1 | tests/input/*.qasm filter=lfs diff=lfs merge=lfs -text
2 | tests/output/*.log filter=lfs diff=lfs merge=lfs -text
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | build/
 2 | tests/
 3 | evaluator-preprocess/parameter-files*
 4 | .vscode/
 5 | *.sqlite
 6 | *.qdrep
 7 | *.log
 8 | *.profile
 9 | blas
10 | *.pdf


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "third-party/cutt"]
2 | 	path = third-party/cutt
3 | 	url = https://github.com/heheda12345/cutt.git
4 | [submodule "third-party/dbg-macro"]
5 | 	path = third-party/dbg-macro
6 | 	url = https://github.com/sharkdp/dbg-macro.git
7 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.1)
  2 | project(QCSimulatorRoot)
  3 | set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}")  # lets find_package() see cmake/FindNccl.cmake
  4 | find_package(CUDA REQUIRED)
  5 | find_package(OpenMP REQUIRED)
  6 | find_package(Nccl REQUIRED)
  7 | find_package(MPI REQUIRED)
  8 | 
  9 | # cutt is looked up under third-party/cutt/cutt/lib; its headers and dbg-macro are added to the include path.
 10 | find_library(CUTT cutt "${PROJECT_SOURCE_DIR}/third-party/cutt/cutt/lib")
 11 | include_directories("${PROJECT_SOURCE_DIR}/third-party/cutt/cutt/include" "${PROJECT_SOURCE_DIR}/third-party/dbg-macro")
 12 | message(STATUS "Found CUTT: ${CUTT}")
 13 | # Host and nvcc flags (the GPU architecture is hard-coded to sm_70 here), then the backend selector.
 14 | set(CMAKE_CXX_FLAGS "-std=c++14 -O2 -g -Wall ${OpenMP_CXX_FLAGS}")
 15 | set(CUDA_NVCC_FLAGS "-Xcompiler -fopenmp -std=c++14 -O2 -g -arch=compute_70 -code=sm_70 --ptxas-options=-v -lineinfo -keep")
 16 | set(BACKEND "group" CACHE STRING "Backend mode, one of [serial, group, group-serial, blas, mix, blas-advance]")
 17 | message(STATUS "Backend: ${BACKEND}")
 18 | # Build-time feature switches; they are consumed below as compile definitions or as extra targets.
 19 | option(SHOW_SCHEDULE "Print the schedule" ON)
 20 | option(SHOW_SUMMARY "Show the running details" ON)
 21 | option(MEASURE_STAGE "Measure time of each stage" OFF)
 22 | option(MICRO_BENCH "Compile micro-benchmarks" OFF)
 23 | option(EVALUATOR_PREPROCESS "compile evaluator preprocess" OFF)
 24 | option(DISABLE_ASSERT "Disable assert (defines NDEBUG; OFF defines DEBUG instead)" ON)
 25 | option(USE_DOUBLE "double or float" ON)
 26 | option(ENABLE_OVERLAP "overlap" ON)
 27 | option(USE_MPI "use mpi" OFF)
 28 | option(OVERLAP_MAT "overlap initMatrix" ON)
 29 | option(LOG_EVALUATOR "show logging of evaluator" OFF)
 30 | # Map the BACKEND cache string onto the numeric BACKEND macro; an unknown name aborts configuration.
 31 | if (BACKEND STREQUAL "serial")
 32 |     add_definitions(-DBACKEND=0)
 33 | elseif(BACKEND STREQUAL "group")
 34 |     add_definitions(-DBACKEND=1)
 35 | elseif(BACKEND STREQUAL "group-serial")
 36 |     add_definitions(-DBACKEND=2)
 37 | elseif(BACKEND STREQUAL "blas")
 38 |     add_definitions(-DBACKEND=3)
 39 | elseif(BACKEND STREQUAL "mix")
 40 |     add_definitions(-DBACKEND=4)
 41 | elseif(BACKEND STREQUAL "blas-advance")
 42 |     add_definitions(-DBACKEND=5)
 43 | else()  # "ERROR" is not a message() mode keyword (it is printed as text and configuration continues); FATAL_ERROR stops here
 44 |     message(FATAL_ERROR "invalid mode: BACKEND='${BACKEND}' (expected one of serial, group, group-serial, blas, mix, blas-advance)")
 45 | endif()
 46 | # Translate the boolean options declared above into preprocessor definitions.
 47 | if(SHOW_SCHEDULE)
 48 |     add_definitions(-DSHOW_SCHEDULE)
 49 | endif()
 50 | if(SHOW_SUMMARY)
 51 |     add_definitions(-DSHOW_SUMMARY)
 52 | endif()
 53 | if(MEASURE_STAGE)
 54 |     add_definitions(-DMEASURE_STAGE)
 55 | endif()
 56 | if(DISABLE_ASSERT)
 57 |     add_definitions(-DNDEBUG)
 58 | else()
 59 |     add_definitions(-DDEBUG)
 60 | endif()
 61 | if(ENABLE_OVERLAP)
 62 |     add_definitions(-DENABLE_OVERLAP)
 63 | endif()
 64 | if(USE_DOUBLE)
 65 |     message(STATUS "Float type: Double")
 66 |     add_definitions(-DUSE_DOUBLE)
 67 | else()
 68 |     message(STATUS "Float type: Float")
 69 | endif()
 70 | if(OVERLAP_MAT)
 71 |     add_definitions(-DOVERLAP_MAT)
 72 | endif()
 73 | # USE_MPI is always defined: 1 when the option is ON, 0 otherwise.
 74 | if(USE_MPI)
 75 |     add_definitions(-DUSE_MPI=1)
 76 | else()
 77 |     add_definitions(-DUSE_MPI=0)
 78 | endif()
 79 | # Numeric tuning knobs: each cache string is echoed and forwarded to the compiler as a *_DEFINED macro.
 80 | set(COALESCE "3" CACHE STRING "coalescing size")
 81 | add_definitions(-DCOALESCE_GLOBAL_DEFINED=${COALESCE})
 82 | message(STATUS "coalesce = ${COALESCE}")
 83 | 
 84 | set(MAT "6" CACHE STRING "mat size")
 85 | add_definitions(-DBLAS_MAT_LIMIT_DEFINED=${MAT})
 86 | message(STATUS "mat size = ${MAT}")
 87 | 
 88 | set(MIN_MAT "4" CACHE STRING "min mat size")
 89 | add_definitions(-DMIN_MAT_SIZE_DEFINED=${MIN_MAT})
 90 | message(STATUS "min mat size = ${MIN_MAT}")
 91 | 
 92 | set(THREAD_DEP "7" CACHE STRING "thread dep")
 93 | add_definitions(-DTHREAD_DEP_DEFINED=${THREAD_DEP})
 94 | message(STATUS "thread_dep = ${THREAD_DEP}")
 95 | # Optional evaluator-preprocess tool (target "process") and the evaluator logging switch.
 96 | if(EVALUATOR_PREPROCESS)
 97 |     set(PROCESS process)
 98 |     add_executable(process evaluator-preprocess/process.cpp)
 99 |     target_link_libraries(process QCSimulator ${CUTT} ${OpenMP_CXX_FLAGS} ${CUDA_CUBLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARY})
100 |     add_definitions(-DUSE_EVALUATOR_PREPROCESS)
101 | endif()
102 | 
103 | if(LOG_EVALUATOR)
104 |     add_definitions(-DLOG_EVALUATOR)
105 | endif()
106 | # The simulator library lives in src/; "main" and the optional micro-benchmarks link the same library set.
107 | include_directories("${PROJECT_SOURCE_DIR}/src")
108 | add_subdirectory(src)
109 | add_executable(main main.cpp)
110 | target_link_libraries(main QCSimulator ${CUTT} ${OpenMP_CXX_FLAGS} ${CUDA_CUBLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARY})
111 | 
112 | if(MICRO_BENCH)
113 |     set(BENCHMARKS local-single local-ctr two-group-h bench-blas)
114 |     foreach(bench IN LISTS BENCHMARKS)
115 |         add_executable(${bench} micro-benchmark/${bench}.cpp)
116 |         target_link_libraries(${bench} QCSimulator ${CUTT} ${OpenMP_CXX_FLAGS} ${CUDA_CUBLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARY})
117 |     endforeach()
118 | endif()
119 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # HyQuas
  2 | 
  3 | HyQuas is a **Hy**brid partitioner based **Qua**ntum circuit **S**imulation system on GPU, which supports single-GPU, single-node-multi-GPU, and multi-node-multi-GPU quantum circuit simulation.
  4 | 
  5 | For single-GPU simulation, it provides two highly optimized methods, *OShareMem* and *TransMM*. The *OShareMem* method speeds up shared-memory based quantum circuit simulation by <img src="https://render.githubusercontent.com/render/math?math=2.67 \times">. The *TransMM* method converts quantum circuit simulation to standard operations and enables the usage of highly optimized libraries like cuBLAS and powerful hardware like Tensor Cores. It leads to up to <img src="https://render.githubusercontent.com/render/math?math=8.43 \times"> speedup over previous gate-merging based simulation. Moreover, it can select the better simulation method for different parts of a given quantum circuit according to its pattern. 
  6 | 
  7 | For distributed simulation, it provides a GPU-centric communication pipelining approach. It can utilize the high-throughput NVLink connections to make the simulation even faster while still preserving low communication traffic.
  8 | 
  9 | Experimental results show that HyQuas can achieve up to <img src="https://render.githubusercontent.com/render/math?math=10.71 \times"> speedup on a single GPU and <img src="https://render.githubusercontent.com/render/math?math=227 \times"> speedup on a GPU cluster over state-of-the-art quantum circuit simulation systems.
 10 | 
 11 | ## Compile and Run
 12 | 1. Get the source code
 13 |     ```bash
 14 |     git clone https://github.com/thu-pacman/HyQuas.git --recursive
 15 |     ```
 16 | 
 17 | 2. Specify the compute capability in `CMakeLists.txt` (`CUDA_NVCC_FLAGS`) and `third-party/cutt/Makefile` (`GENCODE_FLAGS`)
 18 | 
 19 | 3. Prepare the following dependencies
 20 |     * cmake (tested on 3.12.3)
 21 |     * cuda (tested on 10.2.89 and 11.0.2)
 22 |     * g++ (compatible with cuda)
 23 |     * cublas (with the same version of cuda)
 24 |     * openmpi (tested on 4.0.5)
 25 |     * nccl (Fully tested on 2.9.6-1. Known that 2.7.8-1 cannot work. It will be blocked in an NCCL simulated MPI_Sendrecv.)
 26 |     And update environment variables like `CUDA_HOME`, `NCCL_ROOT`, `$PATH`, `$LIBRARY_PATH`, `$LD_LIBRARY_PATH`, `CPATH` in `scripts/env.sh`.
 27 | 
 28 | 4. Compile the tensor transpose library `cutt`
 29 | 
 30 |     ```bash
 31 |     cd third-party/cutt
 32 |     make -j
 33 |     ```
 34 | 
 35 | 5. Specify the root directory
 36 |     ```bash
 37 |     export HYQUAS_ROOT=${The_directory_running_git_clone}/HyQuas
 38 |     ```
 39 | 
 40 | 6. Prepare the database for the time predictor
 41 |     ```bash
 42 |     mkdir -p evaluator-preprocess/parameter-files
 43 |     cd benchmark
 44 |     ./preprocess.sh
 45 |     ```
 46 | 
 47 | 7. Example usages of HyQuas:
 48 |     HyQuas will use all GPUs it can detect, so please control the number of GPU by `CUDA_VISIBLE_DEVICES`.
 49 |     * Run a single circuit with single GPU
 50 |         ```bash
 51 |         cd scripts
 52 |         ./run-single.sh
 53 |         ```
 54 | 
 55 |     * Run a single circuit with multiple GPUs in one node
 56 |         ```bash
 57 |         cd scripts
 58 |         ./run-multi-GPU.sh
 59 |         ```
 60 | 
 61 |     * Run a single circuit with multiple GPUs in multiple nodes
 62 |         Please modify the `-host` first.
 63 |         ```bash
 64 |         cd scripts
 65 |         ./run-multi-node.sh
 66 |         ```
 67 | 
 68 |     * Run all circuits and check the correctness (the script tries both with and without MPI)
 69 |         ```bash
 70 |         cd scripts
 71 |         CUDA_VISIBLE_DEVICES=0,1,2,3 ./check.sh
 72 |         ```
 73 | 
 74 | **Please use the commands in check.sh for evaluating the performance of HyQuas, because the run-\*.sh scripts compile the simulator in debug mode whereas check.sh compiles it in release mode.**
 75 | 
 76 | For more ways to use our simulator (like only using the *OShareMem* method or *TransMM* method, or turning off the overlap of communication and computation), and for reproducing our results in the ICS'21 paper, please refer to our `benchmark/` directory.
 77 | 
 78 | It also supports the following **unstable** features now. See our dev branch for details.
 79 | * Simulating more qubits by saving the state in CPU memory while still computing on the GPU.
 80 | * An imperative mode, so that you do not need to explicitly call `c->compile();` and `c->run()`.
 81 | * Support for more control qubits.
 82 | * Support for some two-qubit gates.
 83 | * Fast measurement of quantum state.
 84 | 
 85 | # Cite
 86 | To cite HyQuas, you can use the following BibTex:
 87 | ```
 88 | @inproceedings{10.1145/3447818.3460357,
 89 |     author = {Zhang, Chen and Song, Zeyu and Wang, Haojie and Rong, Kaiyuan and Zhai, Jidong},
 90 |     title = {HyQuas: Hybrid Partitioner Based Quantum Circuit Simulation System on GPU},
 91 |     year = {2021},
 92 |     isbn = {9781450383356},
 93 |     publisher = {Association for Computing Machinery},
 94 |     address = {New York, NY, USA},
 95 |     url = {https://doi.org/10.1145/3447818.3460357},
 96 |     doi = {10.1145/3447818.3460357},
 97 |     booktitle = {Proceedings of the ACM International Conference on Supercomputing},
 98 |     pages = {443–454},
 99 |     numpages = {12},
100 |     keywords = {quantum computing, GPU computing, simulation},
101 |     location = {Virtual Event, USA},
102 |     series = {ICS '21}
103 | }
104 | 
105 | ```
106 | 


--------------------------------------------------------------------------------
/benchmark/bench_backend.sh:
--------------------------------------------------------------------------------
 1 | set -e  # stop at the first failing command (for a pipeline, only the last command's status counts)
 2 | export CUDA_VISIBLE_DEVICES=0
 3 | export MPIRUN_CONFIG=""
 4 | head=../build/logs/`date +%Y%m%d-%H%M%S`
 5 | # Everything below runs from scripts/; every path is relative to that directory.
 6 | cd ../scripts
 7 | mkdir -p ../benchmark/logs  # tee cannot create the directory itself; without it, set -e aborts after the first backend
 8 | name=$head-group
 9 | mkdir -p $name
10 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DUSE_MPI=off 2>&1 | tee $name/std.out
11 | grep -r "Time Cost" $name/*.log | tee ../benchmark/logs/backend.log
12 | # The remaining backends append (tee -a) to the summary that the group run started.
13 | name=$head-blas
14 | mkdir -p $name
15 | ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DUSE_MPI=off 2>&1 | tee $name/std.out
16 | grep -r "Time Cost" $name/*.log | tee -a ../benchmark/logs/backend.log
17 | 
18 | name=$head-mix
19 | mkdir -p $name
20 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off 2>&1 | tee $name/std.out
21 | grep -r "Time Cost" $name/*.log | tee -a ../benchmark/logs/backend.log
22 | 
23 | 


--------------------------------------------------------------------------------
/benchmark/bench_blas_a100.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | export CUDA_VISIBLE_DEVICES=0
  3 | ulimit -s unlimited
  4 | # Load CUDA 11 through spack; profiling below goes through "nsys nvprof".
  5 | source /opt/spack/share/spack/setup-env.sh
  6 | spack load cuda@11
  7 | NVPROF_COMMAND="nsys nvprof --profile-from-start=off -o test"
  8 | export MPIRUN_CONFIG=""
  9 | export tests_28="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28"
 10 | export tests="$tests_25 $tests_28 $tests_30"
 11 | # NOTE: tests_25 and tests_30 are not set in this script; they expand to nothing unless inherited from the environment.
 12 | head=../build/logs/`date +%Y%m%d-%H%M%S`
 13 | logdir=../benchmark/logs/
 14 | echo tests=$tests
 15 | cd ../scripts
 16 | mkdir -p $logdir  # logdir is relative to scripts/; tee cannot create it, and the summary logs would be lost
 17 | name=$head-m3
 18 | mkdir -p $name
 19 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=3 -DMIN_MAT=3 2>&1 | tee $name/std.out
 20 | echo "+++++ 3" | tee $logdir/blas-profile.log 
 21 | for test in ${tests[*]}; do
 22 |     echo "===== $test" | tee -a $name/circ.profile
 23 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 24 |     grep "cutlass" tmp.profile | tee -a $name/circ.profile
 25 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 26 | done
 27 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
 28 | name3=$name
 29 | 
 30 | name=$head-m4
 31 | mkdir -p $name
 32 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=4 2>&1 | tee $name/std.out
 33 | echo "+++++ 4" | tee -a $logdir/blas-profile.log
 34 | for test in ${tests[*]}; do
 35 |     echo "===== $test" | tee -a $name/circ.profile
 36 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 37 |     grep "cutlass" tmp.profile | tee -a $name/circ.profile
 38 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 39 | done
 40 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
 41 | name4=$name
 42 | 
 43 | name=$head-m5
 44 | mkdir -p $name
 45 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=5 2>&1 | tee $name/std.out
 46 | echo "+++++ 5" | tee -a $logdir/blas-profile.log
 47 | for test in ${tests[*]}; do
 48 |     echo "===== $test" | tee -a $name/circ.profile
 49 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 50 |     grep "cutlass" tmp.profile | tee -a $name/circ.profile
 51 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 52 | done
 53 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
 54 | name5=$name
 55 | 
 56 | name=$head-m6
 57 | mkdir -p $name
 58 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=6 2>&1 | tee $name/std.out
 59 | echo "+++++ 6" | tee -a $logdir/blas-profile.log
 60 | for test in ${tests[*]}; do
 61 |     echo "===== $test" | tee -a $name/circ.profile
 62 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 63 |     grep "cutlass" tmp.profile | tee -a $name/circ.profile
 64 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 65 | done
 66 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
 67 | name6=$name
 68 | 
 69 | name=$head-m7
 70 | mkdir -p $name
 71 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=7 2>&1 | tee $name/std.out
 72 | echo "+++++ 7" | tee -a $logdir/blas-profile.log
 73 | for test in ${tests[*]}; do
 74 |     echo "===== $test" | tee -a $name/circ.profile
 75 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 76 |     grep "cutlass" tmp.profile | tee -a $name/circ.profile
 77 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 78 | done
 79 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
 80 | name7=$name
 81 | 
 82 | name=$head-m8
 83 | mkdir -p $name
 84 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=8 2>&1 | tee $name/std.out
 85 | echo "+++++ 8" | tee -a $logdir/blas-profile.log
 86 | for test in ${tests[*]}; do
 87 |     echo "===== $test" | tee -a $name/circ.profile
 88 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 89 |     grep "cutlass" tmp.profile | tee -a $name/circ.profile
 90 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 91 | done
 92 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
 93 | name8=$name
 94 | 
 95 | name=$head-m9
 96 | mkdir -p $name
 97 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=9 2>&1 | tee $name/std.out
 98 | echo "+++++ 9" | tee -a $logdir/blas-profile.log
 99 | for test in ${tests[*]}; do
100 |     echo "===== $test" | tee -a $name/circ.profile
101 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
102 |     grep "cutlass" tmp.profile | tee -a $name/circ.profile
103 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
104 | done
105 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
106 | name9=$name
107 | 
108 | name=$head-m10
109 | mkdir -p $name
110 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=10 2>&1 | tee $name/std.out
111 | echo "+++++ 10" | tee -a $logdir/blas-profile.log
112 | for test in ${tests[*]}; do
113 |     echo "===== $test" | tee -a $name/circ.profile
114 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
115 |     grep "cutlass" tmp.profile | tee -a $name/circ.profile
116 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
117 | done
118 | cat $name/circ.profile | tee -a $logdir/blas-profile.log
119 | name10=$name
120 | 
121 | grep -r "Time Cost" $head-m*/*.log | tee $logdir/blas.log
122 | grep -r "Total Groups" $head-*/*.log | tee -a $logdir/blas.log
123 | 


--------------------------------------------------------------------------------
/benchmark/bench_blas_v100.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | export CUDA_VISIBLE_DEVICES=0
  3 | ulimit -s unlimited
  4 |  
  5 | export MPIRUN_CONFIG=""
  6 | NVPROF_COMMAND="nvprof --profile-from-start off --csv"
  7 | 
  8 | export tests_28="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28"
  9 | export tests="$tests_28"
 10 | 
 11 | head=../build/logs/`date +%Y%m%d-%H%M%S`
 12 | logdir=../benchmark/logs/
 13 | echo tests=$tests
 14 | cd ../scripts
 15 | 
 16 | name=$head-m3
 17 | mkdir -p $name
 18 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=3 -DMIN_MAT=3 2>&1 | tee $name/std.out
 19 | echo "+++++ 3" | tee $logdir/transmm-profile.log 
 20 | for test in ${tests[*]}; do
 21 |     echo "===== $test" | tee -a $name/circ.profile
 22 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 23 |     grep "volta" tmp.profile | tee -a $name/circ.profile
 24 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 25 | done
 26 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
 27 | name3=$name
 28 | 
 29 | name=$head-m4
 30 | mkdir -p $name
 31 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=4 2>&1 | tee $name/std.out
 32 | echo "+++++ 4" | tee -a $logdir/transmm-profile.log
 33 | for test in ${tests[*]}; do
 34 |     echo "===== $test" | tee -a $name/circ.profile
 35 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 36 |     grep "volta" tmp.profile | tee -a $name/circ.profile
 37 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 38 | done
 39 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
 40 | name4=$name
 41 | 
 42 | name=$head-m5
 43 | mkdir -p $name
 44 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=5 2>&1 | tee $name/std.out
 45 | echo "+++++ 5" | tee -a $logdir/transmm-profile.log
 46 | for test in ${tests[*]}; do
 47 |     echo "===== $test" | tee -a $name/circ.profile
 48 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 49 |     grep "volta" tmp.profile | tee -a $name/circ.profile
 50 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 51 | done
 52 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
 53 | name5=$name
 54 | 
 55 | name=$head-m6
 56 | mkdir -p $name
 57 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=6 2>&1 | tee $name/std.out
 58 | echo "+++++ 6" | tee -a $logdir/transmm-profile.log
 59 | for test in ${tests[*]}; do
 60 |     echo "===== $test" | tee -a $name/circ.profile
 61 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 62 |     grep "volta" tmp.profile | tee -a $name/circ.profile
 63 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 64 | done
 65 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
 66 | name6=$name
 67 | 
 68 | name=$head-m7
 69 | mkdir -p $name
 70 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=7 2>&1 | tee $name/std.out
 71 | echo "+++++ 7" | tee -a $logdir/transmm-profile.log
 72 | for test in ${tests[*]}; do
 73 |     echo "===== $test" | tee -a $name/circ.profile
 74 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 75 |     grep "volta" tmp.profile | tee -a $name/circ.profile
 76 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 77 | done
 78 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
 79 | name7=$name
 80 | 
 81 | name=$head-m8
 82 | mkdir -p $name
 83 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=8 2>&1 | tee $name/std.out
 84 | echo "+++++ 8" | tee -a $logdir/transmm-profile.log
 85 | for test in ${tests[*]}; do
 86 |     echo "===== $test" | tee -a $name/circ.profile
 87 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
 88 |     grep "volta" tmp.profile | tee -a $name/circ.profile
 89 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
 90 | done
 91 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
 92 | name8=$name
 93 | 
 94 | name=$head-m9
 95 | mkdir -p $name
 96 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=9 2>&1 | tee $name/std.out
 97 | echo "+++++ 9" | tee -a $logdir/transmm-profile.log
 98 | for test in ${tests[*]}; do
 99 |     echo "===== $test" | tee -a $name/circ.profile
100 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
101 |     grep "volta" tmp.profile | tee -a $name/circ.profile
102 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
103 | done
104 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
105 | name9=$name
106 | 
107 | name=$head-m10
108 | mkdir -p $name
109 | tests=$tests ./check_wrapper.sh $name -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMAT=10 2>&1 | tee $name/std.out
110 | echo "+++++ 10" | tee -a $logdir/transmm-profile.log
111 | for test in ${tests[*]}; do
112 |     echo "===== $test" | tee -a $name/circ.profile
113 |     $NVPROF_COMMAND ../build/main  ../tests/input/$test.qasm 2>&1 | tee tmp.profile
114 |     grep "volta" tmp.profile | tee -a $name/circ.profile
115 |     grep "void transpose" tmp.profile | tee -a $name/circ.profile
116 | done
117 | cat $name/circ.profile | tee -a $logdir/transmm-profile.log
118 | name10=$name
119 | 
120 | grep -r "Time Cost" $head-m*/*.log | tee $logdir/transmm.log
121 | grep -r "Total Groups" $head-*/*.log | tee -a $logdir/transmm.log
122 | 


--------------------------------------------------------------------------------
/benchmark/bench_comm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Record nvprof GPU traces of every 28-qubit test circuit, first with four
 3 | # visible GPUs and then with two. Per circuit, stdout goes to
 4 | # logs/bench_comm/<config>/<test>.log and stderr to <test>.out next to it.
 5 | mkdir -p logs/bench_comm/4V100 logs/bench_comm/2V100
 6 | cd ../scripts
 7 | source ./init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off 2>&1
 8 | cd ../benchmark
 9 | 
10 | tests="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28"
11 | 
12 | # trace_all <config label> <CUDA_VISIBLE_DEVICES value>
13 | trace_all() {
14 |     local config=$1 devices=$2 circuit
15 |     echo "test $config"
16 |     for circuit in $tests; do
17 |         echo $circuit
18 |         CUDA_VISIBLE_DEVICES=$devices nvprof --print-gpu-trace ../build/main ../tests/input/$circuit.qasm 1>logs/bench_comm/$config/$circuit.log 2>logs/bench_comm/$config/$circuit.out
19 |     done
20 | }
21 | 
22 | trace_all 4V100 0,1,2,3
23 | trace_all 2V100 0,1
24 | 


--------------------------------------------------------------------------------
/benchmark/bench_cublas_a100.sh:
--------------------------------------------------------------------------------
 1 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=26
 2 | echo N_QUBIT=26
 3 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log
 4 | 
 5 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=27
 6 | echo N_QUBIT=27
 7 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log
 8 | 
 9 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=28
10 | echo N_QUBIT=28
11 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log
12 | 
13 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=29
14 | echo N_QUBIT=29
15 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log
16 | 
17 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=30
18 | echo N_QUBIT=30
19 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-a100.log
20 | 


--------------------------------------------------------------------------------
/benchmark/bench_cublas_v100.sh:
--------------------------------------------------------------------------------
 1 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=24
 2 | echo N_QUBIT=24
 3 | CUDA_VISIBLE_DEVICES=0 ./blas | tee logs/cublas-v100.log
 4 | 
 5 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=25
 6 | echo N_QUBIT=25
 7 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log
 8 | 
 9 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=26
10 | echo N_QUBIT=26
11 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log
12 | 
13 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=27
14 | echo N_QUBIT=27
15 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log
16 | 
17 | nvcc blas.cu -o blas -lcublas -O3 -DN_QUBIT=28
18 | echo N_QUBIT=28
19 | CUDA_VISIBLE_DEVICES=0 ./blas | tee -a logs/cublas-v100.log
20 | 


--------------------------------------------------------------------------------
/benchmark/bench_evaluator_a100.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | mkdir -p logs
 3 | mkdir -p logs/evaluator_a100
 4 | 
 5 | cd ../scripts
 6 | echo "OShareMem"
 7 | source ./init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on 2>&1
 8 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/OShareMem.log
 9 | cd ../benchmark
10 | 
11 | cd ../scripts
12 | echo "TransMM MAT=5"
13 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=5 2>&1
14 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/TransMM_5.log
15 | cd ../benchmark
16 | 
17 | cd ../scripts
18 | echo "TransMM MAT=6"
19 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=6 2>&1
20 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/TransMM_6.log
21 | cd ../benchmark
22 | 
23 | cd ../scripts
24 | echo "TransMM MAT=7"
25 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=7 2>&1
26 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_a100/TransMM_7.log
27 | cd ../benchmark
28 | 


--------------------------------------------------------------------------------
/benchmark/bench_evaluator_v100.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | mkdir -p logs
 3 | mkdir -p logs/evaluator_v100
 4 | 
 5 | cd ../scripts
 6 | echo "OShareMem"
 7 | source ./init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on 2>&1
 8 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/OShareMem.log
 9 | cd ../benchmark
10 | 
11 | cd ../scripts
12 | echo "TransMM MAT=5"
13 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=5 2>&1
14 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/TransMM_5.log
15 | cd ../benchmark
16 | 
17 | cd ../scripts
18 | echo "TransMM MAT=6"
19 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=6 2>&1
20 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/TransMM_6.log
21 | cd ../benchmark
22 | 
23 | cd ../scripts
24 | echo "TransMM MAT=7"
25 | source ./init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMEASURE_STAGE=on -DLOG_EVALUATOR=on -DOVERLAP_MAT=off -DMAT=7 2>&1
26 | CUDA_VISIBLE_DEVICES=0 ../build/main ../tests/input/basis_change_28.qasm 2>&1 1>../benchmark/logs/evaluator_v100/TransMM_7.log
27 | cd ../benchmark
28 | 


--------------------------------------------------------------------------------
/benchmark/bench_groupsz.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | ulimit -s unlimited
 3 | 
 4 | cd ../scripts
 5 | 
 6 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=3 -DMIN_MAT=3
 7 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee ../benchmark/logs/groupsz-tm.log
 8 | 
 9 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=4 -DMIN_MAT=4
10 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
11 | 
12 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=5 -DMIN_MAT=5
13 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
14 | 
15 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=6 -DMIN_MAT=6
16 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
17 | 
18 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=7 -DMIN_MAT=7
19 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
20 | 
21 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=8 -DMIN_MAT=8
22 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
23 | 
24 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=9 -DMIN_MAT=9
25 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
26 | 
27 | source ../scripts/init.sh -DBACKEND=blas -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DMAT=10 -DMIN_MAT=10
28 | CUDA_VISIBLE_DEVICES=0 ./bench-blas | tee -a ../benchmark/logs/groupsz-tm.log
29 | 


--------------------------------------------------------------------------------
/benchmark/bench_numgate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Swap in the baseline kernel, source init.sh with THREAD_DEP=9, run ./two-group-h on GPU 0, then copy swizzle.cu back.
 3 | export CUDA_VISIBLE_DEVICES=0
 4 | head=../build/logs/`date +%Y%m%d-%H%M%S`
 5 | # NOTE(review): `head` is not read again in this script; it may only matter to the sourced init.sh -- confirm.
 6 | cd ../scripts
 7 | # Install kernels/baseline.cu as src/kernelOpt.cu before init.sh is sourced.
 8 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu
 9 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on -DTHREAD_DEP=9 2>&1
10 | CUDA_VISIBLE_DEVICES=0 ./two-group-h | tee ../benchmark/logs/numgate-sm.log
11 | # Copy swizzle.cu over kernelOpt.cu again (presumably the default kernel -- confirm).
12 | cp ../src/kernels/swizzle.cu ../src/kernelOpt.cu
13 | 


--------------------------------------------------------------------------------
/benchmark/bench_pergate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | MPIRUN_CONFIG=""
 3 | 
 4 | cd ../scripts
 5 | 
 6 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu
 7 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on -DTHREAD_DEP=9 2>&1
 8 | echo "baseline" | tee pergate.log
 9 | CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log
10 | 
11 | cd ../scripts
12 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu
13 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on 2>&1
14 | echo "multitask" | tee -a pergate.log
15 | CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log
16 | 
17 | cd ../scripts
18 | cp ../src/kernels/lookup.cu ../src/kernelOpt.cu
19 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on 2>&1
20 | echo "lookup" | tee -a pergate.log
21 | CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log
22 | 
23 | cd ../scripts
24 | cp ../src/kernels/swizzle.cu ../src/kernelOpt.cu
25 | source init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on 2>&1
26 | echo "bank" | tee -a pergate.log
27 | CUDA_VISIBLE_DEVICES=0 ./local-single 2>&1 | tee -a pergate.log
28 | 
29 | cp pergate.log ../benchmark/logs
30 | cat pergate.log
31 | 


--------------------------------------------------------------------------------
/benchmark/bench_scale.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | head=../build/logs/`date +%Y%m%d-%H%M%S`
 3 | 
 4 | 
 5 | cd ../scripts
 6 | export CUDA_VISIBLE_DEVICES=0
 7 | export MPIRUN_CONFIG=""
 8 | 
 9 | name=$head-1gpu-o
10 | mkdir -p $name
11 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on 2>&1 | tee $name/std.out
12 | name1=$name
13 | 
14 | export CUDA_VISIBLE_DEVICES=0,1
15 | name=$head-2gpu-o
16 | mkdir -p $name
17 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on 2>&1 | tee $name/std.out
18 | name2=$name
19 | 
20 | export CUDA_VISIBLE_DEVICES=0,1,2,3
21 | name=$head-4gpu-o
22 | mkdir -p $name
23 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on  -DENABLE_OVERLAP=on 2>&1 | tee $name/std.out
24 | name3=$name
25 | 
26 | export CUDA_VISIBLE_DEVICES=0
27 | name=$head-1gpu-s
28 | mkdir -p $name
29 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=off 2>&1 | tee $name/std.out
30 | name1=$name
31 | 
32 | export CUDA_VISIBLE_DEVICES=0,1
33 | name=$head-2gpu-s
34 | mkdir -p $name
35 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=off 2>&1 | tee $name/std.out
36 | name2=$name
37 | 
38 | export CUDA_VISIBLE_DEVICES=0,1,2,3
39 | name=$head-4gpu-s
40 | mkdir -p $name
41 | ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=off 2>&1 | tee $name/std.out
42 | name3=$name
43 | 
44 | 
45 | grep -r "Time Cost" $head-*/*.log | tee ../benchmark/logs/scale.log
46 | 
47 | export CUDA_VISIBLE_DEVICES=0,1,2,3
48 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on
49 | nvprof ./main ../tests/input/hidden_shift_28.qasm 2>&1 | tee ../benchmark/logs/hs.log
50 | 


--------------------------------------------------------------------------------
/benchmark/bench_sharemem.sh:
--------------------------------------------------------------------------------
 1 | set -e  # note: without pipefail a pipeline's status is tee's, so a failing check_wrapper.sh or grep below does not abort
 2 | export CUDA_VISIBLE_DEVICES=0
 3 | export MPIRUN_CONFIG=""
 4 | name=../build/logs/`date +%Y%m%d-%H%M%S`
 5 | # $name is a timestamped run directory; the relative path is resolved after the cd below.
 6 | cd ../scripts
 7 | # Install kernels/baseline.cu as src/kernelOpt.cu for this run.
 8 | cp ../src/kernels/baseline.cu ../src/kernelOpt.cu
 9 | # Run the check wrapper with THREAD_DEP=9 and keep its "Time Cost" lines in sharemem.log.
10 | mkdir -p $name
11 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DMICRO_BENCH=on -DTHREAD_DEP=9 2>&1 | tee $name/std.out
12 | grep -r "Time Cost" $name/*.log | tee ../benchmark/logs/sharemem.log
13 | # Copy swizzle.cu over kernelOpt.cu again (presumably the default kernel -- confirm).
14 | cp ../src/kernels/swizzle.cu ../src/kernelOpt.cu
15 | 


--------------------------------------------------------------------------------
/benchmark/bench_weak.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | source ../scripts/init.sh -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off
 3 | LOG=../benchmark/logs
 4 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_24.qasm 2>&1 | tee $LOG/weak.log
 5 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_24.qasm 2>&1 | tee -a $LOG/weak.log
 6 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_24.qasm 2>&1 | tee -a $LOG/weak.log
 7 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_25.qasm 2>&1 | tee -a $LOG/weak.log
 8 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_25.qasm 2>&1 | tee -a $LOG/weak.log
 9 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_25.qasm 2>&1 | tee -a $LOG/weak.log
10 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_26.qasm 2>&1 | tee -a $LOG/weak.log
11 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_26.qasm 2>&1 | tee -a $LOG/weak.log
12 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_26.qasm 2>&1 | tee -a $LOG/weak.log
13 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_27.qasm 2>&1 | tee -a $LOG/weak.log
14 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_27.qasm 2>&1 | tee -a $LOG/weak.log
15 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_27.qasm 2>&1 | tee -a $LOG/weak.log
16 | CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/basis_change_28.qasm 2>&1 | tee -a $LOG/weak.log
17 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_28.qasm 2>&1 | tee -a $LOG/weak.log
18 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_28.qasm 2>&1 | tee -a $LOG/weak.log
19 | CUDA_VISIBLE_DEVICES=0,1 ./main ../tests/input/basis_change_29.qasm 2>&1 | tee -a $LOG/weak.log
20 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_29.qasm 2>&1 | tee -a $LOG/weak.log
21 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/basis_change_30.qasm 2>&1 | tee -a $LOG/weak.log
22 | 
23 | grep -r "Logger" $LOG/weak.log | tee $LOG/weak_summary.log


--------------------------------------------------------------------------------
/benchmark/blas.cu:
--------------------------------------------------------------------------------
 1 | #include <cublas_v2.h>
 2 | #include <cuComplex.h>
 3 | #include <cstdio>
 4 | 
 5 | 
 6 | // Return the enumerator name of a cublasStatus_t as a string literal so that
 7 | // checkBlasErrors can print a readable message.
 8 | // Values that are not listed below yield "<unknown>". The returned pointer
 9 | // refers to a string literal and must not be freed.
10 | static const char *cublasGetErrorString(cublasStatus_t error)
11 | {
12 |     switch (error)
13 |     {
14 |         case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
15 |         case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
16 |         case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
17 |         case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
18 |         case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
19 |         case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
20 |         case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
21 |         case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
22 |         // Also part of cublasStatus_t; without these cases they print as "<unknown>".
23 |         case CUBLAS_STATUS_NOT_SUPPORTED:    return "CUBLAS_STATUS_NOT_SUPPORTED";
24 |         case CUBLAS_STATUS_LICENSE_ERROR:    return "CUBLAS_STATUS_LICENSE_ERROR";
25 |         default:
26 |             break;
27 |     }
28 |     // Single fall-back for every value not matched above.
29 |     return "<unknown>";
30 | }
31 | 
32 | #define checkCudaErrors(stmt) {                                 \
33 |     cudaError_t err = stmt;                            \
34 |     if (err != cudaSuccess) {                          \
35 |       fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cudaGetErrorString(err)); \
36 |       exit(1); \
37 |     }                                                  \
38 | }
39 | 
40 | #define checkCuttErrors(stmt) {                                 \
41 |     cuttResult err = stmt;                            \
42 |     if (err != CUTT_SUCCESS) {                          \
43 |       fprintf(stderr, "%s in file %s, function %s, line %i.\n", #stmt, __FILE__, __FUNCTION__, __LINE__); \
44 |       exit(1); \
45 |     }                                                  \
46 | }
47 | 
48 | #define checkBlasErrors(stmt) { \
49 |     cublasStatus_t err = stmt; \
50 |     if (err != CUBLAS_STATUS_SUCCESS) {                          \
51 |       fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cublasGetErrorString(err)); \
52 |       exit(1); \
53 |     } \
54 | }
55 | 
56 | // Time cublasZgemm for K = 2, 4, ..., 512: a K x K matrix times a K x (2^N_QUBIT / K) matrix,
57 | // 100 launches per K, printing each launch's elapsed time. The device buffers are never initialised.
58 | int main() {
59 |     int nq = N_QUBIT;
60 |     cuDoubleComplex* arr;
61 |     cuDoubleComplex* mat;
62 |     cuDoubleComplex* result;
63 |     checkCudaErrors(cudaMalloc(&arr, sizeof(cuDoubleComplex) << nq));
64 |     checkCudaErrors(cudaMalloc(&mat, sizeof(cuDoubleComplex) * 1024 * 1024));
65 |     checkCudaErrors(cudaMalloc(&result, sizeof(cuDoubleComplex) << nq));
66 |     cublasHandle_t handle;
67 |     checkBlasErrors(cublasCreate(&handle));
68 |     int numElements = 1 << nq;
69 |     cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0), beta = make_cuDoubleComplex(0.0, 0.0);
70 |     cudaEvent_t start, stop;
71 |     checkCudaErrors(cudaEventCreate(&start));
72 |     checkCudaErrors(cudaEventCreate(&stop));
73 |     for (int K = 2; K < 1024; K <<= 1) {
74 |         printf("K = %d\n", K);
75 |         for (int i = 0; i < 100; i++) {
76 |             checkCudaErrors(cudaEventRecord(start));
77 |             checkBlasErrors(cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
78 |                 K, numElements / K, K, // M, N, K
79 |                 &alpha, mat, K, // alpha, a, lda
80 |                 arr, K, // b, ldb
81 |                 &beta, result, K // beta, c, ldc
82 |             ));
83 |             checkCudaErrors(cudaEventRecord(stop));
84 |             checkCudaErrors(cudaEventSynchronize(stop));  // checked: a failed sync would otherwise go unnoticed
85 |             float time = 0.0f;
86 |             checkCudaErrors(cudaEventElapsedTime(&time, start, stop));  // checked: never print an unset time
87 |             printf("%.10f ", time);
88 |         }
89 |         printf("\n");
90 |     }
91 |     checkCudaErrors(cudaEventDestroy(start));  // release events, handle and device buffers
92 |     checkCudaErrors(cudaEventDestroy(stop));
93 |     checkBlasErrors(cublasDestroy(handle));
94 |     checkCudaErrors(cudaFree(arr));
95 |     checkCudaErrors(cudaFree(mat));
96 |     checkCudaErrors(cudaFree(result));
97 |     return 0;
98 | }
99 | 


--------------------------------------------------------------------------------
/benchmark/preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source ../scripts/init.sh -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DMICRO_BENCH=on -DUSE_DOUBLE=on -DDISABLE_ASSERT=on -DENABLE_OVERLAP=on -DMEASURE_STAGE=off -DEVALUATOR_PREPROCESS=on -DUSE_MPI=off -DMAT=7  # init.sh presumably configures and builds with these options -- confirm
3 | CUDA_VISIBLE_DEVICES=0 ./process  # run the ./process binary with only GPU 0 visible


--------------------------------------------------------------------------------
/cmake/FindNccl.cmake:
--------------------------------------------------------------------------------
 1 | # FindNccl
 2 | # --------
 3 | # Locate the NCCL header and library.
 4 | #
 5 | # Inputs:
 6 | #   NCCL_ROOT              - install prefix to search (CMake or environment variable).
 7 | #   BUILD_WITH_SHARED_NCCL - look for libnccl.so instead of libnccl_static.a.
 8 | #   USE_NCCL_LIB_PATH      - keep an already cached NCCL_LIBRARY instead of searching again.
 9 | # Results:
10 | #   NCCL_INCLUDE_DIR, NCCL_LIBRARY, and the found status reported by
11 | #   find_package_handle_standard_args(Nccl ...).
12 | 
13 | # Don't cache NCCL_LIBRARY to enable switching between static and shared.
14 | if(NCCL_LIBRARY AND NOT USE_NCCL_LIB_PATH)
15 |   unset(NCCL_LIBRARY CACHE)
16 | endif()
17 | 
18 | if(BUILD_WITH_SHARED_NCCL)
19 |   set(NCCL_LIB_NAME nccl)         # libnccl.so
20 | else()
21 |   set(NCCL_LIB_NAME nccl_static)  # libnccl_static.a
22 | endif()
23 | 
24 | find_path(NCCL_INCLUDE_DIR
25 |   NAMES nccl.h
26 |   PATHS "$ENV{NCCL_ROOT}/include" "${NCCL_ROOT}/include")
27 | 
28 | # Some installs place the library under lib64 rather than lib.
29 | find_library(NCCL_LIBRARY
30 |   NAMES ${NCCL_LIB_NAME}
31 |   PATHS "$ENV{NCCL_ROOT}/lib" "${NCCL_ROOT}/lib"
32 |         "$ENV{NCCL_ROOT}/lib64" "${NCCL_ROOT}/lib64")
33 | 
34 | message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
35 | 
36 | include(FindPackageHandleStandardArgs)
37 | find_package_handle_standard_args(Nccl DEFAULT_MSG
38 |                                   NCCL_INCLUDE_DIR NCCL_LIBRARY)
39 | 
40 | mark_as_advanced(
41 |   NCCL_INCLUDE_DIR
42 |   NCCL_LIBRARY
43 | )


--------------------------------------------------------------------------------
/evaluator-preprocess/process.cpp:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <fstream>
  3 | #include <cstring>
  4 | #include <regex>
  5 | #include <cmath>
  6 | #include <chrono>
  7 | #include <cuda.h>
  8 | #include <cuda_runtime.h>
  9 | #include <cublas_v2.h>
 10 | #include <cuComplex.h>
 11 | #include "circuit.h"
 12 | #include "logger.h"
 13 | using namespace std;
 14 | 
 15 | #define DIFF_QUBIT_NUMS 7
 16 | int qubit_nums[DIFF_QUBIT_NUMS] = {22, 23, 24, 25, 26, 27, 28};
 17 | 
 18 | FILE* curr_file;
 19 | 
 20 | #define CALC_ALL_PARAM 0
 21 | #define CALC_PARTIAL_PARAM 1
 22 | const int param_type = CALC_PARTIAL_PARAM;
 23 | 
 24 | // For every gate type from GateType::U1 up to (excluding) GateType::TOTAL, build a circuit
 25 | // of 512 gates from Gate::random, compile and run it, and write the int returned by
 26 | // Circuit::run(false) to curr_file: one line per gate type, then one blank line.
 27 | void procPerGateSingle(int numQubits) {
 28 |     const int gateCount = 512;
 29 |     auto measure = [&](int qubit, GateType type) {
 30 |         Circuit circ(numQubits);
 31 |         for (int n = 0; n < gateCount; n++) {
 32 |             circ.addGate(Gate::random(qubit, qubit + 1, type));
 33 |         }
 34 |         circ.compile();
 35 |         int elapsed = circ.run(false);
 36 |         fprintf(curr_file, "%d ", elapsed);
 37 |     };
 38 |     for (int t = int(GateType::U1); t < int(GateType::TOTAL); t++) {
 39 |         const GateType type = GateType(t);
 40 |         printf("single gate %s\n", Gate::get_name(type).c_str());
 41 |         if (param_type != CALC_ALL_PARAM) {
 42 |             measure(1, type);  // partial mode: a single measurement with arguments (1, 2)
 43 |         } else {
 44 |             for (int q = 0; q < LOCAL_QUBIT_SIZE; q++) {
 45 |                 measure(q, type);
 46 |             }
 47 |         }
 48 |         fprintf(curr_file, "\n");
 49 |     }
 50 |     fprintf(curr_file, "\n");
 51 | }
 52 | 
 53 | void procPerGateCtr(int numQubits) {
 54 |     int num_gates = 512;
 55 |     for (int g = int(GateType::CNOT); g <= int(GateType::CRZ); g++) {
 56 |         printf("control gate %s\n", Gate::get_name(GateType(g)).c_str());
 57 |         if(param_type == CALC_ALL_PARAM) {
 58 |             for (int i = 0; i < LOCAL_QUBIT_SIZE; i++) {
 59 |                 for (int j = 0; j < LOCAL_QUBIT_SIZE; j++) {
 60 |                     if (i == j) { fprintf(curr_file, "0 "); continue; }
 61 |                     Circuit c(numQubits);
 62 |                     for (int k = 0; k < num_gates; k++) {
 63 |                         c.addGate(Gate::control(i, j, GateType(g)));
 64 |                     }
 65 |                     c.compile();
 66 |                     int time = c.run(false);
 67 |                     fprintf(curr_file, "%d ", time);
 68 |                 }
 69 |                 fprintf(curr_file, "\n");
 70 |             }
 71 |         }
 72 |         else {
 73 |             Circuit c(numQubits);
 74 |             for (int k = 0; k < num_gates; k++) {
 75 |                 c.addGate(Gate::control(0, 2, GateType(g)));
 76 |             }
 77 |             c.compile();
 78 |             int time = c.run(false);
 79 |             fprintf(curr_file, "%d ", time);
 80 |         }
 81 |         fprintf(curr_file, "\n");
 82 |     }
 83 | }
 84 | 
 85 | void procBLAS(int numQubits) {
 86 |     cuDoubleComplex* arr;
 87 |     cuDoubleComplex* mat;
 88 |     cuDoubleComplex* result;
 89 |     checkCudaErrors(cudaMalloc(&arr, sizeof(cuDoubleComplex) << numQubits));
 90 |     checkCudaErrors(cudaMalloc(&mat, sizeof(cuDoubleComplex) << 20));
 91 |     checkCudaErrors(cudaMalloc(&result, sizeof(cuDoubleComplex) << numQubits));
 92 |     cublasHandle_t handle;
 93 |     checkBlasErrors(cublasCreate(&handle));
 94 |     qindex numElements = qindex(1) << numQubits;
 95 |     cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0), beta = make_cuDoubleComplex(0.0, 0.0);
 96 |     cudaEvent_t start, stop;
 97 |     checkCudaErrors(cudaEventCreate(&start));
 98 |     checkCudaErrors(cudaEventCreate(&stop));
 99 |     for (int K = 1; K < 1024; K <<= 1) {
100 |         printf("blas calculating K = %d\n", K);
101 |         double sum_time = 0.0;
102 |         for (int i = 0; i < 100; i++) {
103 |             checkCudaErrors(cudaEventRecord(start));
104 |             
105 |             checkBlasErrors(cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
106 |                 K, numElements / K, K, // M, N, K
107 |                 &alpha, mat, K, // alpha, a, lda
108 |                 arr, K, // b, ldb
109 |                 &beta, result, K // beta, c, ldc
110 |             ));
111 | 
112 |             float time;
113 |             checkCudaErrors(cudaEventRecord(stop));
114 |             cudaEventSynchronize(stop);
115 |             cudaEventElapsedTime(&time, start, stop);
116 |             sum_time += time;
117 |             //printf("%.10f ", time);
118 |             
119 |         }
120 |         //printf("\n");
121 |         fprintf(curr_file, "%d %f\n", K, sum_time / 100);
122 |     }
123 |     fprintf(curr_file, "\n");
124 |     checkCudaErrors(cudaFree(arr));
125 |     checkCudaErrors(cudaFree(mat));
126 |     checkCudaErrors(cudaFree(result));
127 | }
128 | 
129 | void procCutt(int numQubits) {
130 |     double *in, *out;
131 |     checkCudaErrors(cudaMalloc(&in, sizeof(double2) << numQubits));
132 |     checkCudaErrors(cudaMalloc(&out, sizeof(double2) << numQubits));
133 |     int dim[numQubits];
134 |     for (int i = 0; i < numQubits; i++) dim[i] = 2;
135 |     int total = 0;
136 |     double sum_time = 0.0;
137 |     for (int change = 1; change <= 20; change ++) {
138 |         int perm[numQubits];
139 |         printf("Cutt calculating  change = %d\n", change);
140 |         for (int tt = 0; tt < 100; tt++) {
141 |             for (int i = 0; i < numQubits; i++) perm[i] = i;
142 |             for (int i = 0; i < change; i++) {
143 |                 std::swap(perm[rand() % numQubits], perm[rand() % numQubits]);
144 |             }
145 |             cuttHandle plan;
146 |             checkCuttErrors(cuttPlan(&plan, numQubits, dim, perm, sizeof(double2), 0));
147 |             cudaEvent_t start, stop;
148 |             float time;
149 |             checkCudaErrors(cudaEventCreate(&start));
150 |             checkCudaErrors(cudaEventCreate(&stop));
151 |             checkCudaErrors(cudaEventRecord(start, 0));
152 |             checkCuttErrors(cuttExecute(plan, in, out));
153 |             checkCudaErrors(cudaEventRecord(stop, 0));
154 |             checkCudaErrors(cudaEventSynchronize(stop));
155 |             checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
156 |             //printf("%.10f ms ", time);
157 |             total ++;
158 |             sum_time += time;
159 |         }
160 |         //printf("\n");
161 |     }
162 |     fprintf(curr_file, "%f\n", sum_time / total);
163 |     checkCudaErrors(cudaFree(in));
164 |     checkCudaErrors(cudaFree(out));
165 | }
166 | 
167 | void process(int numQubits) {
168 |     printf("processing qubit number : %d\n", numQubits);
169 |     string file_name = string("../evaluator-preprocess/parameter-files/") + to_string(numQubits) + string("qubits.out"); 
170 |     curr_file = fopen(file_name.c_str(), "w");
171 |     fprintf(curr_file, "%d\n", param_type);
172 | 
173 |     procPerGateSingle(numQubits);
174 |     procPerGateCtr(numQubits);
175 |     procBLAS(numQubits);
176 |     procCutt(numQubits);
177 |     fclose(curr_file);
178 | }
179 | 
180 | int main()
181 | {
182 |     auto start = chrono::system_clock::now();
183 |     MyGlobalVars::init();
184 |     for(int i = 0; i < DIFF_QUBIT_NUMS; i++) {
185 |         process(qubit_nums[i]);
186 |     }
187 |     auto end = chrono::system_clock::now();
188 |     printf("process time %d ms\n", chrono::duration_cast<chrono::milliseconds>(end - start).count());
189 |     return 0;
190 | }
191 | 


--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <fstream>
  3 | #include <cstring>
  4 | #include <regex>
  5 | #include <cmath>
  6 | #include "circuit.h"
  7 | #include "logger.h"
  8 | using namespace std;
  9 | const int BUFFER_SIZE = 1000;
 10 | char buffer[BUFFER_SIZE];
 11 | 
 12 | std::vector<int> parse_qid(char buf[]) {
 13 |     std::vector<int> ret;
 14 |     int l = strlen(buf);
 15 |     for (int i = 0; i < l; i++) {
 16 |         if (buf[i] >= '0' && buf[i] <= '9') {
 17 |             int j = i, x = 0;
 18 |             while (buf[j] >= '0' && buf[j] <= '9') {
 19 |                 x = x * 10 + (int)(buf[j] - '0');
 20 |                 j++;
 21 |             }
 22 |             i = j - 1;
 23 |             ret.push_back(x);
 24 |         }
 25 |     }
 26 |     return ret;
 27 | }
 28 | 
 29 | std::pair<std::string, std::vector<qreal>> parse_gate(char buf[]) {
 30 |     qreal pi = acos(-1);
 31 |     int l = strlen(buf);
 32 |     std::string name;
 33 |     int i = 0;
 34 |     while (i < l) {
 35 |         if (buf[i] != '(')
 36 |             name += buf[i];
 37 |         else
 38 |             break;
 39 |         i++;
 40 |     }
 41 |     std::vector<qreal> params;
 42 |     while (i < l) {
 43 |         i++;
 44 |         std::string st;
 45 |         while (buf[i] != ',' && buf[i] != ')') {
 46 |             st += buf[i];
 47 |             i++;
 48 |         }
 49 |         qreal param = 1;
 50 |         if (st[0] == 'p' && st[1] == 'i' && st[2] == '*') {
 51 |             param = pi;
 52 |             st = st.erase(0, 3);
 53 |         } else if (st[0] == 'p' && st[1] == 'i' && st[2] == '/') {
 54 |             param = -pi;
 55 |             st = st.erase(0, 3);
 56 |         }
 57 |         if (param > 0)
 58 |             param *= std::stod(st);
 59 |         else
 60 |             param = pi / std::stod(st);
 61 |         params.push_back(param);
 62 |         if (buf[i] == ')')
 63 |             break;
 64 |     }
 65 |     return std::make_pair(name, params);
 66 | }
 67 | 
 68 | std::unique_ptr<Circuit> parse_circuit(const std::string &filename) {
 69 |     FILE* f = nullptr;
 70 |     if ((f = fopen(filename.c_str(), "r")) == NULL) {
 71 |         printf("fail to open %s\n", filename.c_str());
 72 |         exit(1);
 73 |     }
 74 |     int n = -1;
 75 |     std::unique_ptr<Circuit> c = nullptr;
 76 |     while (fscanf(f, "%s", buffer) != EOF) {
 77 |         if (strcmp(buffer, "//") == 0 || strcmp(buffer, "OPENQASM") == 0 || strcmp(buffer, "include") == 0) {
 78 |         } else if (strcmp(buffer, "qreg") == 0) {
 79 |             fscanf(f, "%*c%*c%*c%d", &n);
 80 |             c = std::make_unique<Circuit>(n);
 81 |         } else if (strcmp(buffer, "cx") == 0) {
 82 |             fscanf(f, "%s", buffer);
 83 |             auto qid = parse_qid(buffer);
 84 |             assert(qid.size() == 2);
 85 |             c->addGate(Gate::CNOT(qid[0], qid[1]));
 86 |             // printf("cx %d %d\n", qid[0], qid[1]);
 87 |         } else if (strcmp(buffer, "ccx") == 0) {
 88 |             fscanf(f, "%s", buffer);
 89 |             auto qid = parse_qid(buffer);
 90 |             assert(qid.size() == 3);
 91 |             c->addGate(Gate::CCX(qid[0], qid[1], qid[2]));
 92 |             // printf("ccx %d %d %d\n", qid[0], qid[1], qid[2]);
 93 |         } else if (strcmp(buffer, "cy") == 0) {
 94 |             fscanf(f, "%s", buffer);
 95 |             auto qid = parse_qid(buffer);
 96 |             assert(qid.size() == 2);
 97 |             c->addGate(Gate::CY(qid[0], qid[1]));
 98 |             // printf("cy %d %d\n", qid[0], qid[1]);
 99 |         } else if (strcmp(buffer, "cz") == 0) {
100 |             fscanf(f, "%s", buffer);
101 |             auto qid = parse_qid(buffer);
102 |             assert(qid.size() == 2);
103 |             c->addGate(Gate::CZ(qid[0], qid[1]));
104 |             // printf("cz %d %d\n", qid[0], qid[1]);
105 |         } else if (strcmp(buffer, "h") == 0) {
106 |             fscanf(f, "%s", buffer);
107 |             auto qid = parse_qid(buffer);
108 |             assert(qid.size() == 1);
109 |             c->addGate(Gate::H(qid[0]));
110 |             // printf("h %d\n", qid[0]);
111 |         } else if (strcmp(buffer, "x") == 0) {
112 |             fscanf(f, "%s", buffer);
113 |             auto qid = parse_qid(buffer);
114 |             assert(qid.size() == 1);
115 |             c->addGate(Gate::X(qid[0]));
116 |             // printf("x %d\n", qid[0]);
117 |         } else if (strcmp(buffer, "y") == 0) {
118 |             fscanf(f, "%s", buffer);
119 |             auto qid = parse_qid(buffer);
120 |             assert(qid.size() == 1);
121 |             c->addGate(Gate::Y(qid[0]));
122 |             // printf("y %d\n", qid[0]);
123 |         } else if (strcmp(buffer, "z") == 0) {
124 |             fscanf(f, "%s", buffer);
125 |             auto qid = parse_qid(buffer);
126 |             assert(qid.size() == 1);
127 |             c->addGate(Gate::Z(qid[0]));
128 |             // printf("z %d\n", qid[0]);
129 |         } else if (strcmp(buffer, "s") == 0) {
130 |             fscanf(f, "%s", buffer);
131 |             auto qid = parse_qid(buffer);
132 |             assert(qid.size() == 1);
133 |             c->addGate(Gate::S(qid[0]));
134 |             // printf("s %d\n", qid[0]);
135 |         } else if (strcmp(buffer, "sdg") == 0) {
136 |             fscanf(f, "%s", buffer);
137 |             auto qid = parse_qid(buffer);
138 |             assert(qid.size() == 1);
139 |             c->addGate(Gate::SDG(qid[0]));
140 |             // printf("s %d\n", qid[0]);
141 |         } else if (strcmp(buffer, "t") == 0) {
142 |             fscanf(f, "%s", buffer);
143 |             auto qid = parse_qid(buffer);
144 |             assert(qid.size() == 1);
145 |             c->addGate(Gate::T(qid[0]));
146 |             // printf("t %d\n", qid[0]);
147 |         }  else if (strcmp(buffer, "tdg") == 0) {
148 |             fscanf(f, "%s", buffer);
149 |             auto qid = parse_qid(buffer);
150 |             assert(qid.size() == 1);
151 |             c->addGate(Gate::TDG(qid[0]));
152 |             // printf("t %d\n", qid[0]);
153 |         } else {
154 |             auto gate = parse_gate(buffer);
155 |             if (gate.first == "crx") {
156 |                 assert(gate.second.size() == 1);
157 |                 fscanf(f, "%s", buffer);
158 |                 auto qid = parse_qid(buffer);
159 |                 assert(qid.size() == 2);
160 |                 c->addGate(Gate::CRX(qid[0], qid[1], gate.second[0]));
161 |                 // printf("crx %d %d %f\n", qid[0], qid[1], gate.second[0]);
162 |             } else if (gate.first == "cry") {
163 |                 assert(gate.second.size() == 1);
164 |                 fscanf(f, "%s", buffer);
165 |                 auto qid = parse_qid(buffer);
166 |                 assert(qid.size() == 2);
167 |                 c->addGate(Gate::CRY(qid[0], qid[1], gate.second[0]));
168 |                 // printf("cry %d %d %f\n", qid[0], qid[1], gate.second[0]);
169 |             } else if (gate.first == "crz") {
170 |                 assert(gate.second.size() == 1);
171 |                 fscanf(f, "%s", buffer);
172 |                 auto qid = parse_qid(buffer);
173 |                 assert(qid.size() == 2);
174 |                 c->addGate(Gate::CRZ(qid[0], qid[1], gate.second[0]));
175 |                 // printf("crz %d %d %f\n", qid[0], qid[1], gate.second[0]);
176 |             }  else if (gate.first == "cu1") {
177 |                 assert(gate.second.size() == 1);
178 |                 fscanf(f, "%s", buffer);
179 |                 auto qid = parse_qid(buffer);
180 |                 assert(qid.size() == 2);
181 |                 c->addGate(Gate::CU1(qid[0], qid[1], gate.second[0]));
182 |                 // printf("cu1 %d %d %f\n", qid[0], qid[1], gate.second[0]);
183 |             } else if (gate.first == "u1") {
184 |                 assert(gate.second.size() == 1);
185 |                 fscanf(f, "%s", buffer);
186 |                 auto qid = parse_qid(buffer);
187 |                 assert(qid.size() == 1);
188 |                 c->addGate(Gate::U1(qid[0], gate.second[0]));
189 |                 // printf("u1 %d %f\n", qid[0], gate.second[0]);
190 |             } else if (gate.first == "u3") {
191 |                 assert(gate.second.size() == 3);
192 |                 fscanf(f, "%s", buffer);
193 |                 auto qid = parse_qid(buffer);
194 |                 assert(qid.size() == 1);
195 |                 c->addGate(Gate::U3(qid[0], gate.second[0], gate.second[1], gate.second[2]));
196 |                 // printf("u3 %d %f %f %f\n", qid[0], gate.second[0], gate.second[1], gate.second[2]);
197 |             } else if (gate.first == "rx") {
198 |                 assert(gate.second.size() == 1);
199 |                 fscanf(f, "%s", buffer);
200 |                 auto qid = parse_qid(buffer);
201 |                 assert(qid.size() == 1);
202 |                 c->addGate(Gate::RX(qid[0], gate.second[0]));
203 |                 // printf("rx %d %f\n", qid[0], gate.second[0]);
204 |             } else if (gate.first == "ry") {
205 |                 assert(gate.second.size() == 1);
206 |                 fscanf(f, "%s", buffer);
207 |                 auto qid = parse_qid(buffer);
208 |                 assert(qid.size() == 1);
209 |                 c->addGate(Gate::RY(qid[0], gate.second[0]));
210 |                 // printf("ry %d %f\n", qid[0], gate.second[0]);
211 |             } else if (gate.first == "rz") {
212 |                 assert(gate.second.size() == 1);
213 |                 fscanf(f, "%s", buffer);
214 |                 auto qid = parse_qid(buffer);
215 |                 assert(qid.size() == 1);
216 |                 c->addGate(Gate::RZ(qid[0], gate.second[0]));
217 |                 // printf("rz %d %f\n", qid[0], gate.second[0]);
218 |             } else {
219 |                 printf("unrecognized token %s\n", buffer);
220 |                 exit(1);
221 |             }
222 |         }
223 |         fgets(buffer, BUFFER_SIZE, f);
224 |     }
225 |     fclose(f);
226 |     if (c == nullptr) {
227 |         printf("fail to load circuit\n");
228 |         exit(1);
229 |     }
230 |     return std::move(c);
231 | }
232 | 
233 | int main(int argc, char* argv[]) {
234 |     #if USE_MPI
235 |         MyMPI::init();
236 |     #endif
237 |     MyGlobalVars::init();
238 |     std::unique_ptr<Circuit> c;
239 |     if (argc != 2) {
240 |         printf("./parser qasmfile\n");
241 |         exit(1);
242 |     }
243 |     c = parse_circuit(std::string(argv[1]));
244 |     c->compile();
245 |     c->run();
246 |     c->printState();
247 |     Logger::print();
248 |     #if USE_MPI 
249 |         checkMPIErrors(MPI_Finalize());
250 |     #endif
251 |     return 0;
252 | }


--------------------------------------------------------------------------------
/micro-benchmark/bench-blas.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <fstream>
 3 | #include <cstring>
 4 | #include <regex>
 5 | #include <cmath>
 6 | #include "circuit.h"
 7 | #include "logger.h"
 8 | using namespace std;
 9 | 
10 | int main(int argc, char* argv[]) {
11 |     MyGlobalVars::init();
12 |     int n = 28;
13 |     printf("MATSIZE %d ", BLAS_MAT_LIMIT);
14 |     for (int tt = 0; tt < 5; tt++) {
15 |         Circuit c(n);
16 |         for (int i = 0; i < 10 * BLAS_MAT_LIMIT; i++) {
17 |             c.addGate(Gate::H(i % (BLAS_MAT_LIMIT * 2)));
18 |         }
19 |         c.compile();
20 |         int time = c.run(false);
21 |         printf("%d ", time);
22 |     }
23 |     printf("\n");
24 |     return 0;
25 | }


--------------------------------------------------------------------------------
/micro-benchmark/local-ctr.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <fstream>
 3 | #include <cstring>
 4 | #include <regex>
 5 | #include <cmath>
 6 | #include "circuit.h"
 7 | #include "logger.h"
 8 | using namespace std;
 9 | 
10 | int main(int argc, char* argv[]) {
11 |     MyGlobalVars::init();
12 |     int n = 28;
13 |     int num_gates = 512;
14 |     for (int g = int(GateType::CNOT); g <= int(GateType::CRZ); g++) {
15 |         printf("%s\n", Gate::get_name(GateType(g)).c_str());
16 |         for (int i = 0; i < LOCAL_QUBIT_SIZE; i++) {
17 |             for (int j = 0; j < LOCAL_QUBIT_SIZE; j++) {
18 |                 if (i == j) { printf("    "); continue; }
19 |                 Circuit c(n);
20 |                 for (int k = 0; k < num_gates; k++) {
21 |                     c.addGate(Gate::control(i, j, GateType(g)));
22 |                 }
23 |                 c.compile();
24 |                 int time = c.run(false);
25 |                 printf("%d ", time);
26 |                 fflush(stdout);
27 |             }
28 |             printf("\n");
29 |         }
30 |     }
31 |     return 0;
32 | }


--------------------------------------------------------------------------------
/micro-benchmark/local-single.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <fstream>
 3 | #include <cstring>
 4 | #include <regex>
 5 | #include <cmath>
 6 | #include "circuit.h"
 7 | #include "logger.h"
 8 | using namespace std;
 9 | 
10 | int main(int argc, char* argv[]) {
11 |     MyGlobalVars::init();
12 |     int n = 28;
13 |     int num_gates = 512;
14 |     for (int i = int(GateType::U1); i < int(GateType::TOTAL); i++) {
15 |         printf("%s: ", Gate::get_name(GateType(i)).c_str());
16 |         for (int j = 0; j < LOCAL_QUBIT_SIZE; j++) {
17 |             Circuit c(n);
18 |             for (int k = 0; k < num_gates; k++) {
19 |                 c.addGate(Gate::random(j, j + 1, GateType(i)));
20 |             }
21 |             c.compile();
22 |             int time = c.run(false);
23 |             printf("%d ", time);
24 |             fflush(stdout);
25 |         }
26 |         printf("\n");
27 |     }
28 |     return 0;
29 | }


--------------------------------------------------------------------------------
/micro-benchmark/two-group-h.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <fstream>
 3 | #include <cstring>
 4 | #include <regex>
 5 | #include <cmath>
 6 | #include "circuit.h"
 7 | #include "logger.h"
 8 | using namespace std;
 9 | 
10 | int main(int argc, char* argv[]) {
11 |     MyGlobalVars::init();
12 |     for (int i = 6; i < 200; i += 6) {
13 |         printf("%d:", i);
14 |         for (int tt = 0; tt < 5; tt++) {
15 |             Circuit c(28);
16 |             for (int j = 0; j < i; j++)
17 |                 c.addGate(Gate::H(j % 6));
18 |             c.compile();
19 |             int time = c.run(false);
20 |             printf("%d ", time);
21 |         }
22 |         printf("\n");
23 |     }
24 |     return 0;
25 | }


--------------------------------------------------------------------------------
/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | _check.sh
2 | _run.sh
3 | 


--------------------------------------------------------------------------------
/scripts/check.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | name=../build/logs/`date +%Y%m%d-%H%M%S`
 3 | mkdir -p $name
 4 | 
 5 | # command for no_mpi
 6 | MPIRUN_CONFIG="" ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on -DUSE_MPI=off -DDISABLE_ASSERT=on -DMAT=7 2>&1 | tee $name/std.out
 7 | 
 8 | # command for mpi
 9 | MPIRUN_CONFIG="`which mpirun` -x GPUPerRank=2 -host nico3:2 ../scripts/env.sh ../scripts/gpu-bind.sh"
10 | MPIRUN_CONFIG=$MPIRUN_CONFIG ./check_wrapper.sh $name -DBACKEND=mix -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DEVALUATOR_PREPROCESS=on -DENABLE_OVERLAP=on -DUSE_MPI=on -DDISABLE_ASSERT=on -DMAT=7 -DUSE_MPI=on 2>&1 | tee $name/std.out
11 | 


--------------------------------------------------------------------------------
/scripts/check_wrapper.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -x
 3 | source init.sh ${@: 2}
 4 | input_dir=../tests/input
 5 | std_dir=../tests/output
 6 | 
 7 | for test in ${tests[*]}; do
 8 |     $MPIRUN_CONFIG ./main $input_dir/$test.qasm > $1/$test.log
 9 |     grep "Logger" $1/$test.log
10 | done
11 | 
12 | set +x
13 | set +e
14 | 
15 | for test in ${tests[*]}; do
16 |     line=`cat $std_dir/$test.log | wc -l`
17 |     echo $test
18 |     grep -Ev "Logger|CLUSTER" $1/$test.log > tmp.log
19 |     diff -q -B $std_dir/$test.log tmp.log || true
20 | done
21 | 
22 | grep -Er "Logger:.*Time" $1/*.log 
23 | 


--------------------------------------------------------------------------------
/scripts/coalescing.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | export CUDA_VISIBLE_DEVICES=0
 3 | 
 4 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c0
 5 | mkdir -p $name
 6 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -D COALESCE=0 2>&1 | tee $name/std.out
 7 | name1=$name
 8 | 
 9 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c1
10 | mkdir -p $name
11 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=1 2>&1 | tee $name/std.out
12 | name2=$name
13 | 
14 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c2
15 | mkdir -p $name
16 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=2 2>&1 | tee $name/std.out
17 | name3=$name
18 | 
19 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c3
20 | mkdir -p $name
21 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=3 2>&1 | tee $name/std.out
22 | name4=$name
23 | 
24 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c4
25 | mkdir -p $name
26 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=4 2>&1 | tee $name/std.out
27 | name5=$name
28 | 
29 | name=../build/logs/`date +%Y%m%d-%H%M%S`-c5
30 | mkdir -p $name
31 | ./check_wrapper.sh $name -DBACKEND=group -DSHOW_SUMMARY=on -DSHOW_SCHEDULE=off -DUSE_DOUBLE=on -DCOALESCE=5 2>&1 | tee $name/std.out
32 | name6=$name
33 | 
34 | tail -n 9 $name1/std.out
35 | tail -n 9 $name2/std.out
36 | tail -n 9 $name3/std.out
37 | tail -n 9 $name4/std.out
38 | tail -n 9 $name5/std.out
39 | tail -n 9 $name6/std.out
40 | 


--------------------------------------------------------------------------------
/scripts/compare.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import numpy as np
 4 | cases = ['adder_26', 'basis_change_28', 'bv_28', 'hidden_shift_28', 'ising_25', 'qaoa_28', 'qft_28', 'quantum_volume_28', 'supremacy_28']
 5 | std_dir = sys.argv[1]
 6 | my_dir = sys.argv[2]
 7 | 
 8 | for case in cases:
 9 |     std = []
10 |     with open(os.path.join(std_dir, case + '.log')) as f:
11 |         for s in f.readlines():
12 |             a, b = s.strip().split()[2:4]
13 |             std.append([float(a), float(b)])
14 |     std = np.array(std)
15 |     std[np.abs(std) < 1e-10] = 0
16 | 
17 |     my = []
18 |     with open(os.path.join(my_dir, case + '.log')) as f:
19 |         for s in f.readlines():
20 |             if s.startswith('Logger'):
21 |                 continue
22 |             a, b = s.strip().split()[2:4]
23 |             my.append([float(a), float(b)])
24 |     my = np.array(my)
25 |     my[np.abs(my) < 1e-10] = 0
26 |     if (std.shape != my.shape):
27 |         print("[{}]".format(case), "shape not match")
28 |         continue
29 |     err = np.abs(std-my)
30 |     rela = np.abs(std - my) / (np.maximum(np.abs(std), np.abs(my)) + 1e-10)
31 |     print("[{}]".format(case),
32 |         "err:", np.max(err), np.argmax(err),
33 |         "rela:", np.max(rela), np.argmax(rela))


--------------------------------------------------------------------------------
/scripts/env.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | case $(hostname -s) in
 3 |   nico*)
 4 |     echo "[CLUSTER] nico"
 5 |     source /opt/spack/share/spack/setup-env.sh
 6 |     spack load cuda@10.2.89 /v5oqq5n
 7 |     spack load openmpi@4.0.5 /h5eun6a
 8 |     export NCCL_ROOT=/home/heheda/tools/nccl/build
 9 |     export CPATH=$NCCL_ROOT/include:$CPATH
10 |     export LIBRARY_PATH=$NCCL_ROOT/lib:$LIBRARY_PATH
11 |     export LD_LIBRARY_PATH=$NCCL_ROOT/lib:$LD_LIBRARY_PATH
12 |     ;;
13 |   gorgon*)
14 |     echo "[CLUSTER] gorgon"
15 |     source /usr/local/Modules/init/bash
16 |     module load cuda-10.2/cuda
17 |     module load cmake-3.12.3
18 |     module load openmpi-3.0.0
19 |     ;;
20 |   i*)
21 |     echo "[CLUSTER] scc"
22 |     source /opt/spack/share/spack/setup-env.sh
23 |     spack load cuda@10.2.89 /tlfcinz
24 |     spack load openmpi@3.1.6 /5aaect6
25 |     ;;
26 |   hanzo)
27 |     echo "[CLUSTER] hanzo"
28 |     source /opt/spack/share/spack/setup-env.sh
29 |     export PATH=$HOME/package/cmake-3.19.2-Linux-x86_64/bin:/usr/mpi/gcc/openmpi-4.1.0rc5/bin:$PATH
30 |     # use system mpi
31 |     export CPATH=/usr/mpi/gcc/openmpi-4.1.0rc5/include:${CPATH-}
32 |     spack load gcc@8.3.0 /liymwyb
33 |     spack load cuda@10.2.89 /tlfcinz
34 |     ;;
35 |   nova)
36 |     echo "[CLUSTER] nova"
37 |     source /opt/spack/share/spack/setup-env.sh
38 |     spack load cuda@11 /njgeoec
39 |     spack load openmpi /dfes7hw
40 | esac
41 | 
42 | $@


--------------------------------------------------------------------------------
/scripts/gen_stdout.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source init.sh -DBACKEND=1 -DSHOW_SUMMARY=off
3 | for test in ${tests[*]}; do
4 |     echo $test
5 |     ./main ../tests/input/$test.qasm > ../tests/output/$test.log
6 | done
7 | 


--------------------------------------------------------------------------------
/scripts/gpu-bind.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Usage (under Open MPI's mpirun): gpu-bind.sh <command> [args...]
#
# Restricts the wrapped command to a contiguous block of GPUs chosen from the
# rank's node-local index: local rank r sees devices
#   r * GPUPerRank .. (r + 1) * GPUPerRank - 1
#
# Required environment:
#   OMPI_COMM_WORLD_LOCAL_RANK  node-local rank, provided by mpirun
#   GPUPerRank                  number of GPUs to give each rank
rank=${OMPI_COMM_WORLD_LOCAL_RANK:?must be started through Open MPI mpirun}
per_rank=${GPUPerRank:?set GPUPerRank to the number of GPUs per rank}

GPU_start=$(( rank * per_rank ))
GPU_end=$(( GPU_start + per_rank - 1 ))

# Comma-separated device list, e.g. "2,3" (no trailing comma).
GPU=$(seq -s, "$GPU_start" "$GPU_end")

CUDA_VISIBLE_DEVICES=$GPU "$@"


--------------------------------------------------------------------------------
/scripts/init.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -u
 3 | set -e
 4 | 
 5 | source env.sh ""
 6 | 
 7 | mkdir -p $HYQUAS_ROOT/build
 8 | cd $HYQUAS_ROOT/build
 9 | rm CMakeCache.txt || true
10 | cmake $* ..
11 | make clean
12 | make -j
13 | 
14 | if [ -z "${tests-}" ]
15 | then
16 |   export tests_25="basis_change_25 bv_25 hidden_shift_25 qaoa_25 qft_25 quantum_volume_25 supremacy_25"
17 |   export tests_28="basis_change_28 bv_28 hidden_shift_28 qaoa_28 qft_28 quantum_volume_28 supremacy_28"
18 |   export tests_30="basis_change_30 bv_30 hidden_shift_30 qaoa_30 qft_30 quantum_volume_30 supremacy_30"
19 |   export tests_scale="basis_change_24 basis_change_25 basis_change_26 basis_change_27 basis_change_28"
20 | 
21 |   export tests=($tests_28)
22 | fi
23 | 


--------------------------------------------------------------------------------
/scripts/run-multi-GPU.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Rebuilds the project without MPI and runs supremacy_28 on GPUs 0-3 of the
# local machine.
source ../scripts/init.sh \
    -DBACKEND=mix \
    -DSHOW_SUMMARY=on \
    -DSHOW_SCHEDULE=on \
    -DMICRO_BENCH=on \
    -DUSE_DOUBLE=on \
    -DDISABLE_ASSERT=off \
    -DENABLE_OVERLAP=on \
    -DMEASURE_STAGE=off \
    -DEVALUATOR_PREPROCESS=on \
    -DUSE_MPI=off \
    -DMAT=7
CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/supremacy_28.qasm
4 | 


--------------------------------------------------------------------------------
/scripts/run-multi-node.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Rebuilds the project with MPI and runs qft_28 through mpirun with two ranks
# on host nico3, two GPUs per rank (bound by gpu-bind.sh).
source ../scripts/init.sh \
    -DBACKEND=mix \
    -DSHOW_SUMMARY=on \
    -DSHOW_SCHEDULE=on \
    -DMICRO_BENCH=on \
    -DUSE_DOUBLE=on \
    -DDISABLE_ASSERT=off \
    -DENABLE_OVERLAP=on \
    -DMEASURE_STAGE=off \
    -DEVALUATOR_PREPROCESS=on \
    -DUSE_MPI=on \
    -DMAT=7
$(which mpirun) -host nico3:2 -x GPUPerRank=2 ../scripts/env.sh ../scripts/gpu-bind.sh ./main ../tests/input/qft_28.qasm
4 | 


--------------------------------------------------------------------------------
/scripts/run-single.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Rebuilds the project without MPI and runs supremacy_28 on GPU 0 only.
source ../scripts/init.sh \
    -DBACKEND=mix \
    -DSHOW_SUMMARY=on \
    -DSHOW_SCHEDULE=on \
    -DMICRO_BENCH=on \
    -DUSE_DOUBLE=on \
    -DDISABLE_ASSERT=off \
    -DENABLE_OVERLAP=on \
    -DMEASURE_STAGE=off \
    -DEVALUATOR_PREPROCESS=on \
    -DUSE_MPI=off \
    -DMAT=7
CUDA_VISIBLE_DEVICES=0 ./main ../tests/input/supremacy_28.qasm
4 | 


--------------------------------------------------------------------------------
/scripts/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Rebuilds the project with MPI and runs qft_28 through mpirun with two ranks
# on each of nico1 and nico2, GPUPerRank=4 (bound by gpu-bind.sh).
source ../scripts/init.sh \
    -DBACKEND=mix \
    -DSHOW_SUMMARY=on \
    -DSHOW_SCHEDULE=on \
    -DMICRO_BENCH=on \
    -DUSE_DOUBLE=on \
    -DDISABLE_ASSERT=off \
    -DENABLE_OVERLAP=on \
    -DMEASURE_STAGE=off \
    -DEVALUATOR_PREPROCESS=on \
    -DUSE_MPI=on \
    -DMAT=7
$(which mpirun) -host nico1:2,nico2:2 -x GPUPerRank=4 ../scripts/env.sh ../scripts/gpu-bind.sh ./main ../tests/input/qft_28.qasm
# CUDA_VISIBLE_DEVICES=0,1,2,3 ./main ../tests/input/qft_28.qasm


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
# Builds the QCSimulator library from the sources found in this directory.
project(QCSimulator)
# Collect the source files of the current directory into SRC_CXX.
aux_source_directory(. SRC_CXX)

# cuda_add_library comes from the FindCUDA module; this assumes the parent
# CMakeLists.txt has already run find_package(CUDA) -- TODO confirm.
cuda_add_library(QCSimulator ${SRC_CXX})
5 | 


--------------------------------------------------------------------------------
/src/circuit.cpp:
--------------------------------------------------------------------------------
  1 | #include "circuit.h"
  2 | 
  3 | #include <cstdio>
  4 | #include <assert.h>
  5 | #include <chrono>
  6 | #include <mpi.h>
  7 | #include <algorithm>
  8 | #include <cuda_profiler_api.h>
  9 | #include "utils.h"
 10 | #include "kernel.h"
 11 | #include "compiler.h"
 12 | #include "logger.h"
 13 | #include "executor.h"
 14 | using namespace std;
 15 | 
 16 | int Circuit::run(bool copy_back, bool destroy) {
 17 |     kernelInit(deviceStateVec, numQubits);
 18 |     for (int i = 0; i < MyGlobalVars::localGPUs; i++) {
 19 |         checkCudaErrors(cudaSetDevice(i));
 20 |         checkCudaErrors(cudaProfilerStart());
 21 |     }
 22 |     auto start = chrono::system_clock::now();
 23 | #if BACKEND == 0
 24 |     kernelExecSimple(deviceStateVec[0], numQubits, gates);
 25 | #elif BACKEND == 1 || BACKEND == 3 || BACKEND == 4 || BACKEND == 5
 26 |     Executor(deviceStateVec, numQubits, schedule).run();
 27 | #elif BACKEND == 2
 28 |     gates.clear();
 29 |     for (size_t lgID = 0; lgID < schedule.localGroups.size(); lgID++) {
 30 |         auto& lg = schedule.localGroups[lgID];
 31 |         for (size_t ggID = 0; ggID < lg.overlapGroups.size(); ggID++) {
 32 |             auto& gg = lg.overlapGroups[ggID];
 33 |             for (auto& g: gg.gates)
 34 |                 gates.push_back(g);
 35 |         }
 36 |         // if (lgID == 2) break;
 37 |         for (size_t ggID = 0; ggID < lg.fullGroups.size(); ggID++) {
 38 |             auto& gg = lg.fullGroups[ggID];
 39 |             for (auto& g: gg.gates)
 40 |                 gates.push_back(g);
 41 |         }
 42 |     }
 43 |     schedule.finalState = State(numQubits);
 44 |     kernelExecSimple(deviceStateVec[0], numQubits, gates);
 45 | #endif
 46 |     auto end = chrono::system_clock::now();
 47 |     for (int i = 0; i < MyGlobalVars::localGPUs; i++) {
 48 |         checkCudaErrors(cudaSetDevice(i));
 49 |         checkCudaErrors(cudaProfilerStop());
 50 |     }
 51 |     auto duration = chrono::duration_cast<chrono::microseconds>(end - start);
 52 |     Logger::add("Time Cost: %d us", int(duration.count()));
 53 | 
 54 |     if (copy_back) {
 55 |         result.resize(1ll << numQubits); // very slow ...
 56 | #if BACKEND == 0 || BACKEND == 2
 57 |         kernelDeviceToHost((qComplex*)result.data(), deviceStateVec[0], numQubits);
 58 | #else
 59 |         qindex elements = 1ll << (numQubits - MyGlobalVars::bit);
 60 |         for (int g = 0; g < MyGlobalVars::localGPUs; g++) {
 61 |             kernelDeviceToHost((qComplex*)result.data() + elements * g, deviceStateVec[g], numQubits - MyGlobalVars::bit);
 62 |         }
 63 | #endif
 64 |     }
 65 |     if (destroy) {
 66 |         for (int g = 0; g < MyGlobalVars::localGPUs; g++) {
 67 |             kernelDestroy(deviceStateVec[g]);
 68 |         }
 69 |     }
 70 |     return duration.count();
 71 | }
 72 | 
 73 | void Circuit::dumpGates() {
 74 |     int totalGates = gates.size();
 75 |     printf("total Gates: %d\n", totalGates);
 76 |     int L = 3;
 77 |     for (const Gate& gate: gates) {
 78 |         for (int i = 0; i < numQubits; i++) {
 79 |             if (i == gate.controlQubit) {
 80 |                 printf(".");
 81 |                 for (int j = 1; j < L; j++) printf(" ");
 82 |             } else if (i == gate.targetQubit) {
 83 |                 printf("%s", gate.name.c_str());
 84 |                 for (int j = gate.name.length(); j < L; j++)
 85 |                     printf(" ");
 86 |             } else {
 87 |                 printf("|");
 88 |                 for (int j = 1; j < L; j++) printf(" ");
 89 |             }
 90 |         }
 91 |         printf("\n");
 92 |     }
 93 | }
 94 | 
 95 | qindex Circuit::toPhysicalID(qindex idx) {
 96 |     qindex id = 0;
 97 |     auto& pos = schedule.finalState.pos;
 98 |     for (int i = 0; i < numQubits; i++) {
 99 |         if (idx >> i & 1)
100 |             id |= qindex(1) << pos[i];
101 |     }
102 |     return id;
103 | }
104 | 
105 | qindex Circuit::toLogicID(qindex idx) {
106 |     qindex id = 0;
107 |     auto& pos = schedule.finalState.pos;
108 |     for (int i = 0; i < numQubits; i++) {
109 |         if (idx >> pos[i] & 1)
110 |             id |= qindex(1) << i;
111 |     }
112 |     return id;
113 | }
114 | 
115 | ResultItem Circuit::ampAt(qindex idx) {
116 |     qindex id = toPhysicalID(idx);
117 |     return ResultItem(idx, make_qComplex(result[id].x, result[id].y));
118 | }
119 | 
120 | qComplex Circuit::ampAtGPU(qindex idx) {
121 |     qindex id = toPhysicalID(idx);
122 |     qComplex ret;
123 | #if USE_MPI
124 |     qindex localAmps = (1ll << numQubits) / MyMPI::commSize;
125 |     qindex rankID = id / localAmps;
126 | 
127 |     if (!USE_MPI || MyMPI::rank == rankID) {
128 |         int localID = id % localAmps;
129 | #else
130 |         int localID = id;
131 | #endif
132 |         qindex localGPUAmp = (1ll << numQubits) / MyGlobalVars::numGPUs;
133 |         int gpuID = localID / localGPUAmp;
134 |         qindex localGPUID = localID % localGPUAmp;
135 |         checkCudaErrors(cudaSetDevice(gpuID));
136 |         ret = kernelGetAmp(deviceStateVec[gpuID], localGPUID);
137 | #if USE_MPI
138 |     }
139 |     MPI_Bcast(&ret, 1, MPI_Complex, rankID, MPI_COMM_WORLD);
140 | #endif
141 |     return ret;
142 | }
143 | 
144 | bool Circuit::localAmpAt(qindex idx, ResultItem& item) {
145 |     qindex localAmps = (1ll << numQubits) / MyMPI::commSize;
146 |     qindex id = toPhysicalID(idx);
147 |     if (id / localAmps == MyMPI::rank) {
148 |         // printf("%d belongs to rank %d\n", idx, MyMPI::rank);
149 |         qindex localID = id % localAmps;
150 |         item = ResultItem(idx, make_qComplex(result[localID].x, result[localID].y));
151 |         return true;
152 |     }
153 |     return false;
154 | }
155 | 
156 | void Circuit::masterCompile() {
157 |     Logger::add("Total Gates %d", int(gates.size()));
158 | #if BACKEND == 1 || BACKEND == 2 || BACKEND == 3 || BACKEND == 4 || BACKEND == 5
159 |     Compiler compiler(numQubits, gates);
160 |     schedule = compiler.run();
161 |     int totalGroups = 0;
162 |     for (auto& lg: schedule.localGroups) totalGroups += lg.fullGroups.size();
163 |     int fullGates = 0, overlapGates = 0;
164 |     for (auto& lg: schedule.localGroups) {
165 |         for (auto& gg: lg.fullGroups) fullGates += gg.gates.size();
166 |         for (auto& gg: lg.overlapGroups) overlapGates += gg.gates.size();
167 |     }
168 |     Logger::add("Total Groups: %d %d %d %d", int(schedule.localGroups.size()), totalGroups, fullGates, overlapGates);
169 | #ifdef SHOW_SCHEDULE
170 |     schedule.dump(numQubits);
171 | #endif
172 | #else
173 |     schedule.finalState = State(numQubits);
174 | #endif
175 | }
176 | 
177 | void Circuit::compile() {
178 |     auto start = chrono::system_clock::now();
179 | #if USE_MPI
180 |     if (MyMPI::rank == 0) {
181 |         masterCompile();
182 |         auto s = schedule.serialize();
183 |         int bufferSize = (int) s.size();
184 |         checkMPIErrors(MPI_Bcast(&bufferSize, 1, MPI_INT, 0, MPI_COMM_WORLD));
185 |         checkMPIErrors(MPI_Bcast(s.data(), bufferSize, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD));
186 |         int cur = 0;
187 |         // schedule = Schedule::deserialize(s.data(), cur);
188 |     } else {
189 |         int bufferSize;
190 |         checkMPIErrors(MPI_Bcast(&bufferSize, 1, MPI_INT, 0, MPI_COMM_WORLD));
191 |         unsigned char* buffer = new unsigned char [bufferSize];
192 |         checkMPIErrors(MPI_Bcast(buffer, bufferSize, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD));
193 |         int cur = 0;
194 |         schedule = Schedule::deserialize(buffer, cur);
195 |         delete[] buffer;
196 |         fflush(stdout);
197 |     }
198 | #else
199 |     masterCompile();
200 | #endif
201 |     auto mid = chrono::system_clock::now();
202 |     schedule.initCuttPlans(numQubits - MyGlobalVars::bit);
203 | #ifndef OVERLAP_MAT
204 |     schedule.initMatrix(numQubits);
205 | #endif
206 |     auto end = chrono::system_clock::now();
207 |     auto duration1 = chrono::duration_cast<chrono::microseconds>(mid - start);
208 |     auto duration2 = chrono::duration_cast<chrono::microseconds>(end - mid);
209 |     Logger::add("Compile Time: %d us + %d us = %d us", int(duration1.count()), int(duration2.count()), int(duration1.count()) + int(duration2.count()));
210 | }
211 | 
212 | #if USE_MPI
213 | void Circuit::gatherAndPrint(const std::vector<ResultItem>& results) {
214 |     if (MyMPI::rank == 0) {
215 |         int size = results.size();
216 |         int sizes[MyMPI::commSize];
217 |         MPI_Gather(&size, 1, MPI_INT, sizes, 1, MPI_INT, 0, MPI_COMM_WORLD);
218 |         int disp[MyMPI::commSize + 1];
219 |         disp[0] = 0;
220 |         for (int i = 0; i < MyMPI::commSize; i++)
221 |             disp[i + 1] = disp[i] + sizes[i];
222 |         int totalItem = disp[MyMPI::commSize];
223 |         ResultItem* collected = new ResultItem[totalItem];
224 |         for (int i = 0; i < MyMPI::commSize; i++)
225 |             sizes[i] *= sizeof(ResultItem);
226 |         for (int i = 0; i < MyMPI::commSize; i++)
227 |             disp[i] *= sizeof(ResultItem);
228 |         MPI_Gatherv(
229 |             results.data(), results.size() * sizeof(ResultItem), MPI_UNSIGNED_CHAR,
230 |             collected, sizes, disp,
231 |             MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD
232 |         );
233 |         sort(collected, collected + totalItem);
234 |         for (int i = 0; i < totalItem; i++)
235 |             collected[i].print();
236 |         delete[] collected;
237 |     } else {
238 |         int size = results.size();
239 |         MPI_Gather(&size, 1, MPI_INT, nullptr, 1, MPI_INT, 0, MPI_COMM_WORLD);
240 |         MPI_Gatherv(
241 |             results.data(), results.size() * sizeof(ResultItem), MPI_UNSIGNED_CHAR,
242 |             nullptr, nullptr, nullptr,
243 |             MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD
244 |         );
245 |     }
246 | }
247 | #endif
248 | 
249 | 
250 | void Circuit::printState() {
251 | #if USE_MPI
252 |     std::vector<ResultItem> results;
253 |     ResultItem item;
254 |     for (int i = 0; i < 128; i++) {
255 |         if (localAmpAt(i, item)) {
256 |             results.push_back(item);
257 |         }
258 |     }
259 |     gatherAndPrint(results);
260 | #ifdef SHOW_SCHEDULE
261 |     results.clear();
262 |     for (int i = 0; i < numQubits; i++) {
263 |         if (localAmpAt(1ll << i, item)) {
264 |             results.push_back(item);
265 |         }
266 |     }
267 |     if (localAmpAt((1ll << numQubits) - 1, item)) {
268 |         results.push_back(item);
269 |     }
270 |     gatherAndPrint(results);
271 | #endif
272 |     results.clear();
273 |     int numLocalAmps = (1ll << numQubits) / MyMPI::commSize;
274 |     for (qindex i = 0; i < numLocalAmps; i++) {
275 |         if (result[i].x * result[i].x + result[i].y * result[i].y > 0.001) {
276 |             qindex logicID = toLogicID(i + numLocalAmps * MyMPI::rank);
277 |             if (logicID >= 128) {
278 |                 // printf("large amp %d belongs to %d\n", logicID, MyMPI::rank);
279 |                 results.push_back(ResultItem(logicID, result[i]));
280 |             }
281 |         }
282 |     }
283 |     gatherAndPrint(results);
284 | #else
285 |     std::vector<ResultItem> results;
286 |     for (int i = 0; i < 128; i++) {
287 |         results.push_back(ampAt(i));
288 |     }
289 | #ifdef SHOW_SCHEDULE
290 |     for (int i = 0; i < numQubits; i++) {
291 |         results.push_back(ampAt(1ll << i));
292 |     }
293 |     results.push_back(ampAt((1ll << numQubits) - 1));
294 | #endif
295 |     for (auto& item: results)
296 |         item.print();
297 |     results.clear();
298 |     for (qindex i = 0; i < (1ll << numQubits); i++) {
299 |         if (result[i].x * result[i].x + result[i].y * result[i].y > 0.001) {
300 |             qindex logicID = toLogicID(i);
301 |             if (logicID >= 128) {
302 |                 results.push_back(ResultItem(toLogicID(i), result[i]));
303 |             }
304 |         }
305 |     }
306 |     sort(results.begin(), results.end());
307 |     for (auto& item: results)
308 |         item.print();
309 | #endif
310 | }


--------------------------------------------------------------------------------
/src/circuit.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | #include <vector>
 5 | #include "utils.h"
 6 | #include "gate.h"
 7 | #include "schedule.h"
 8 | 
 9 | struct ResultItem {
10 |     ResultItem() = default;
11 |     ResultItem(const qindex& idx, const qComplex& amp): idx(idx), amp(amp) {}
12 |     qindex idx;
13 |     qComplex amp;
14 |     void print() {
15 |         printf("%lld %.12f: %.12f %.12f\n", idx, amp.x * amp.x + amp.y * amp.y, zero_wrapper(amp.x), zero_wrapper(amp.y));
16 |     }
17 |     bool operator < (const ResultItem& b) { return idx < b.idx; }
18 | };
19 | 
20 | class Circuit {
21 | public:
22 |     Circuit(int numQubits): numQubits(numQubits) {}
23 |     void compile();
24 |     int run(bool copy_back = true, bool destroy = true);
25 |     void addGate(const Gate& gate) {
26 |         gates.push_back(gate);
27 |     }
28 |     void dumpGates();
29 |     void printState();
30 |     ResultItem ampAt(qindex idx);
31 |     qComplex ampAtGPU(qindex idx);
32 |     bool localAmpAt(qindex idx, ResultItem& item);
33 |     const int numQubits;
34 | 
35 | private:
36 |     qindex toPhysicalID(qindex idx);
37 |     qindex toLogicID(qindex idx);
38 |     void masterCompile();
39 | #if USE_MPI
40 |     void gatherAndPrint(const std::vector<ResultItem>& results);
41 | #endif
42 |     std::vector<Gate> gates;
43 |     std::vector<qComplex*> deviceStateVec;
44 |     std::vector<std::vector<qComplex*>> deviceMats;
45 |     Schedule schedule;
46 |     std::vector<qComplex> result;
47 | };


--------------------------------------------------------------------------------
/src/compiler.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <vector>
 3 | #include <set>
 4 | #include <bitset>
 5 | #include "schedule.h"
 6 | #include "utils.h"
 7 | #include "gate.h"
 8 | 
 9 | class Compiler {
10 | public:
11 |     Compiler(int numQubits, std::vector<Gate> inputGates);
12 |     Schedule run();
13 | private:
14 |     void fillLocals(LocalGroup& lg);
15 |     std::vector<std::pair<std::vector<Gate>, qindex>> moveToNext(LocalGroup& lg);
16 |     int numQubits;
17 |     int localSize;
18 |     int shareSize;
19 |     bool enableGlobal;
20 |     std::vector<Gate> gates;
21 | };
22 | 
23 | template<int MAX_GATES>
24 | class OneLayerCompiler {
25 | public:
26 |     OneLayerCompiler(int numQubits, const std::vector<Gate>& inputGates);
27 | protected:
28 |     int numQubits;
29 |     std::vector<Gate> remainGates;
30 |     std::vector<int> getGroupOpt(bool full[], qindex related[], bool enableGlobal, int localSize, qindex localQubits);
31 |     void removeGatesOpt(const std::vector<int>& remove);
32 |     std::set<int> remain;
33 | };
34 | 
35 | class SimpleCompiler: public OneLayerCompiler<2048> {
36 | public:
37 |     SimpleCompiler(int numQubits, int localSize, qindex localQubits, const std::vector<Gate>& inputGates, bool enableGlobal, qindex whiteList = 0, qindex required = 0);
38 |     LocalGroup run();
39 | private:
40 |     int localSize;
41 |     qindex localQubits;
42 |     bool enableGlobal;
43 |     qindex whiteList;
44 |     qindex required;
45 | };
46 | 
47 | class AdvanceCompiler: public OneLayerCompiler<512> {
48 | public:
49 |     AdvanceCompiler(int numQubits, qindex localQubits, qindex blasForbid, std::vector<Gate> inputGates);
50 |     LocalGroup run(State &state, bool usePerGate, bool useBLAS, int preGateSize, int blasSize, int cuttSize);
51 | private:
52 |     qindex localQubits;
53 |     qindex blasForbid;
54 | };
55 | 
56 | class ChunkCompiler: public OneLayerCompiler<512> {
57 | public:
58 |     ChunkCompiler(int numQubits, int localSize, int chunkSize, const std::vector<Gate> &inputGates);
59 |     LocalGroup run();
60 | private:
61 |     int localSize, chunkSize;
62 | };


--------------------------------------------------------------------------------
/src/evaluator.cpp:
--------------------------------------------------------------------------------
  1 | #include "evaluator.h"
  2 | 
  3 | Evaluator* Evaluator::instance_ptr = nullptr;
  4 | 
  5 | Evaluator::Evaluator() {
  6 |     memset(num_qbits_loaded_param, 0, sizeof(num_qbits_loaded_param));
  7 | #ifndef USE_EVALUATOR_PREPROCESS
  8 |     num_qbits_loaded_param[28] = true;
  9 |     memcpy(pergate_single_perf[28][int(GateType::U1)], V100_U1, sizeof(double) * LOCAL_QUBIT_SIZE);
 10 |     memcpy(pergate_single_perf[28][int(GateType::U2)], V100_U2, sizeof(double) * LOCAL_QUBIT_SIZE);
 11 |     memcpy(pergate_single_perf[28][int(GateType::U3)], V100_U3, sizeof(double) * LOCAL_QUBIT_SIZE);
 12 |     memcpy(pergate_single_perf[28][int(GateType::H )], V100_H , sizeof(double) * LOCAL_QUBIT_SIZE);
 13 |     memcpy(pergate_single_perf[28][int(GateType::X )], V100_X , sizeof(double) * LOCAL_QUBIT_SIZE);
 14 |     memcpy(pergate_single_perf[28][int(GateType::Y )], V100_Y , sizeof(double) * LOCAL_QUBIT_SIZE);
 15 |     memcpy(pergate_single_perf[28][int(GateType::Z )], V100_Z , sizeof(double) * LOCAL_QUBIT_SIZE);
 16 |     memcpy(pergate_single_perf[28][int(GateType::S )], V100_S , sizeof(double) * LOCAL_QUBIT_SIZE);
 17 |     memcpy(pergate_single_perf[28][int(GateType::SDG )], V100_SDG , sizeof(double) * LOCAL_QUBIT_SIZE);
 18 |     memcpy(pergate_single_perf[28][int(GateType::T )], V100_T , sizeof(double) * LOCAL_QUBIT_SIZE);
 19 |     memcpy(pergate_single_perf[28][int(GateType::TDG )], V100_TDG , sizeof(double) * LOCAL_QUBIT_SIZE);
 20 |     memcpy(pergate_single_perf[28][int(GateType::RX)], V100_RX, sizeof(double) * LOCAL_QUBIT_SIZE);
 21 |     memcpy(pergate_single_perf[28][int(GateType::RY)], V100_RY, sizeof(double) * LOCAL_QUBIT_SIZE);
 22 |     memcpy(pergate_single_perf[28][int(GateType::RZ)], V100_RZ, sizeof(double) * LOCAL_QUBIT_SIZE);
 23 | 
 24 |     memcpy(pergate_ctr_perf[28][int(GateType::CNOT)], V100_CN , sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
 25 |     memcpy(pergate_ctr_perf[28][int(GateType::CY  )], V100_CY , sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
 26 |     memcpy(pergate_ctr_perf[28][int(GateType::CZ  )], V100_CZ , sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
 27 |     memcpy(pergate_ctr_perf[28][int(GateType::CRX )], V100_CRX, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
 28 |     memcpy(pergate_ctr_perf[28][int(GateType::CRY )], V100_CRY, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
 29 |     memcpy(pergate_ctr_perf[28][int(GateType::CU1 )], V100_CU1, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
 30 |     memcpy(pergate_ctr_perf[28][int(GateType::CRZ )], V100_CRZ, sizeof(double) * LOCAL_QUBIT_SIZE * LOCAL_QUBIT_SIZE);
 31 | 
 32 |     BLAS_perf[28][6] = 23.068396;
 33 |     cutt_cost[28] = 11.367814;
 34 | #endif
 35 | }
 36 | 
 37 | void Evaluator::loadPergateSingle(int numQubits, FILE* qbit_param, GateType gate_type) {
 38 |     if(param_type == CALC_ALL_PARAM) {
 39 |         for(int i = 0; i < LOCAL_QUBIT_SIZE; i++) {
 40 |             fscanf(qbit_param, "%lf", &pergate_single_perf[numQubits][int(gate_type)][i]);
 41 |         }
 42 |     }
 43 |     else {
 44 |         fscanf(qbit_param, "%lf", &pergate_single_perf[numQubits][int(gate_type)][1]);
 45 |     }
 46 | }
 47 | 
 48 | void Evaluator::loadPergateCtr(int numQubits, FILE* qbit_param, GateType gate_type) {
 49 |     if(param_type == CALC_ALL_PARAM) {
 50 |         for(int i = 0; i < LOCAL_QUBIT_SIZE; i++)
 51 |             for(int j = 0; j < LOCAL_QUBIT_SIZE; j++) {
 52 |                 fscanf(qbit_param, "%lf", &pergate_ctr_perf[numQubits][int(gate_type)][i][j]);
 53 |             }
 54 |     }
 55 |     else {
 56 |         fscanf(qbit_param, "%lf", &pergate_ctr_perf[numQubits][int(gate_type)][0][2]);       
 57 |     }
 58 | } 
 59 | 
 60 | void Evaluator::loadParam(int numQubits) {
 61 |     if(num_qbits_loaded_param[numQubits])
 62 |         return;
 63 | #ifdef USE_EVALUATOR_PREPROCESS    
 64 |     FILE* qbit_param;
 65 |     std::string param_file_name = std::string("../evaluator-preprocess/parameter-files/") 
 66 |         + std::to_string(numQubits) + std::string("qubits.out");
 67 |     if((qbit_param = fopen(param_file_name.c_str(), "r"))) {
 68 |         fscanf(qbit_param, "%d", &param_type);
 69 | 
 70 |         loadPergateSingle(numQubits, qbit_param, GateType::U1);
 71 |         loadPergateSingle(numQubits, qbit_param, GateType::U2);
 72 |         loadPergateSingle(numQubits, qbit_param, GateType::U3);
 73 |         loadPergateSingle(numQubits, qbit_param, GateType::H );
 74 |         loadPergateSingle(numQubits, qbit_param, GateType::X );
 75 |         loadPergateSingle(numQubits, qbit_param, GateType::Y );
 76 |         loadPergateSingle(numQubits, qbit_param, GateType::Z );
 77 |         loadPergateSingle(numQubits, qbit_param, GateType::S );
 78 |         loadPergateSingle(numQubits, qbit_param, GateType::SDG);
 79 |         loadPergateSingle(numQubits, qbit_param, GateType::T );
 80 |         loadPergateSingle(numQubits, qbit_param, GateType::TDG);
 81 |         loadPergateSingle(numQubits, qbit_param, GateType::RX);
 82 |         loadPergateSingle(numQubits, qbit_param, GateType::RY);
 83 |         loadPergateSingle(numQubits, qbit_param, GateType::RZ);
 84 | 
 85 |         loadPergateCtr(numQubits, qbit_param, GateType::CNOT);
 86 |         loadPergateCtr(numQubits, qbit_param, GateType::CY  );
 87 |         loadPergateCtr(numQubits, qbit_param, GateType::CZ  );
 88 |         loadPergateCtr(numQubits, qbit_param, GateType::CRX );
 89 |         loadPergateCtr(numQubits, qbit_param, GateType::CRY );
 90 |         loadPergateCtr(numQubits, qbit_param, GateType::CU1 );
 91 |         loadPergateCtr(numQubits, qbit_param, GateType::CRZ );
 92 | 
 93 |         for (int K = 1, i = 0; K < 1024; K <<= 1, i++) {
 94 |             fscanf(qbit_param, "%*d%lf", &BLAS_perf[numQubits][i]);
 95 |         }
 96 |         fscanf(qbit_param, "%lf", &cutt_cost[numQubits]);
 97 |         fclose(qbit_param);
 98 |     } else {
 99 |         printf("Parameter file not find for qubit number %d\n", numQubits);
100 |         fflush(stdout);
101 |         exit(1);
102 |     }
103 |     num_qbits_loaded_param[numQubits] = true;
104 | #else
105 |     printf("Use option USE_EVALUATOR_PREPROCESS for non-default qubit number %d\n", numQubits);
106 |     fflush(stdout);
107 |     exit(1);
108 | #endif
109 | }
110 | 
111 | double Evaluator::perfPerGate(int numQubits, const GateGroup* gg) {
112 |     double tim_pred = 0;
113 |     loadParam(numQubits);
114 |     for(auto gate : (gg -> gates)) {
115 |         switch(gate.type) {
116 |             case GateType::CCX : 
117 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break;
118 |             case GateType::CNOT : 
119 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break;
120 |             case GateType::CY : 
121 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CY)][0][2]; break;
122 |             case GateType::CZ : 
123 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CZ)][0][2]; break;
124 |             case GateType::CRX : 
125 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRX)][0][2]; break;
126 |             case GateType::CRY : 
127 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRY)][0][2]; break;
128 |             case GateType::CU1 :
129 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CU1)][0][2]; break;
130 |             case GateType::CRZ : 
131 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRZ)][0][2]; break;
132 |             case GateType::U1 : 
133 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::U1)][1]; break;
134 |             case GateType::U2 : 
135 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::U2)][1]; break;
136 |             case GateType::U3 : 
137 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::U3)][1]; break;
138 |             case GateType::H : 
139 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::H)][1]; break;
140 |             case GateType::X : 
141 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::X)][1]; break;
142 |             case GateType::Y : 
143 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::Y)][1]; break;
144 |             case GateType::Z : 
145 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::Z)][1]; break;
146 |             case GateType::S : 
147 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::S)][1]; break;
148 |             case GateType::SDG : 
149 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::SDG)][1]; break;
150 |             case GateType::T : 
151 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::T)][1]; break;
152 |             case GateType::TDG : 
153 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::TDG)][1]; break;
154 |             case GateType::RX : 
155 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::RX)][1]; break;
156 |             case GateType::RY : 
157 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::RY)][1]; break;
158 |             case GateType::RZ : 
159 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::RZ)][1]; break;
160 |             default:
161 |                 printf("meet wrong gate : %s\n", Gate::get_name(gate.type).c_str());
162 |                 UNREACHABLE()
163 |         }
164 |     }
165 |     return tim_pred / 1000 / 512 + pergate_group_overhead * (1 << numQubits);
166 | }
167 | 
168 | double Evaluator::perfPerGate(int numQubits, const std::vector<GateType>& types) {
169 |     double tim_pred = 0;
170 |     loadParam(numQubits);
171 |     for(auto ty : types) {
172 |         switch(ty) {
173 |             case GateType::CCX : 
174 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break;
175 |             case GateType::CNOT : 
176 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CNOT)][0][2]; break;
177 |             case GateType::CY : 
178 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CY)][0][2]; break;
179 |             case GateType::CZ : 
180 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CZ)][0][2]; break;
181 |             case GateType::CRX : 
182 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRX)][0][2]; break;
183 |             case GateType::CRY : 
184 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRY)][0][2]; break;
185 |             case GateType::CU1 :
186 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CU1)][0][2]; break;
187 |             case GateType::CRZ : 
188 |                 tim_pred += pergate_ctr_perf[numQubits][int(GateType::CRZ)][0][2]; break;
189 |             case GateType::U1 : 
190 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::U1)][1]; break;
191 |             case GateType::U2 : 
192 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::U2)][1]; break;
193 |             case GateType::U3 : 
194 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::U3)][1]; break;
195 |             case GateType::H : 
196 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::H)][1]; break;
197 |             case GateType::X : 
198 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::X)][1]; break;
199 |             case GateType::Y : 
200 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::Y)][1]; break;
201 |             case GateType::Z : 
202 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::Z)][1]; break;
203 |             case GateType::S : 
204 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::S)][1]; break;
205 |             case GateType::SDG : 
206 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::SDG)][1]; break;
207 |             case GateType::T : 
208 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::T)][1]; break;
209 |             case GateType::TDG : 
210 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::TDG)][1]; break;
211 |             case GateType::RX : 
212 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::RX)][1]; break;
213 |             case GateType::RY : 
214 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::RY)][1]; break;
215 |             case GateType::RZ : 
216 |                 tim_pred += pergate_single_perf[numQubits][int(GateType::RZ)][1]; break;
217 |             default:
218 |                 printf("meet wrong gate : %s\n", Gate::get_name(ty).c_str());
219 |                 UNREACHABLE()
220 |         }
221 |     }
222 |     return tim_pred / 1000 / 512 + pergate_group_overhead * (1 << numQubits);
223 | }
224 | 
225 | double Evaluator::perfBLAS(int numQubits, int blasSize) {
226 |     loadParam(numQubits);
227 |     //double bias = (numQubits < 28) ? ((qindex)1 << (28 - numQubits)) : (1.0 / ((qindex)1 << (numQubits - 28)));
228 |     return BLAS_perf[numQubits][blasSize] + cutt_cost[numQubits];
229 | }
230 | 
231 | bool Evaluator::PerGateOrBLAS(const GateGroup* gg_pergate, const GateGroup* gg_blas, int numQubits, int blasSize) {
232 |     double pergate = perfPerGate(numQubits, gg_pergate);
233 |     double blas = perfBLAS(numQubits, blasSize);
234 |     return pergate / (gg_pergate -> gates).size() < blas / (gg_blas -> gates).size();
235 | }


--------------------------------------------------------------------------------
/src/evaluator.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include "schedule.h"
  3 | #include "utils.h"
  4 | #include "gate.h"
  5 | 
  6 | #define GATE_NUM 24
  7 | #define MAX_QBITS 40
  8 | 
  9 | #define CALC_ALL_PARAM 0
 10 | #define CALC_PARTIAL_PARAM 1
 11 | 
 12 | /*
 13 | * Performance model used to choose between the BLAS backend and the per-gate backend.
 14 | * This class is a singleton: the constructor is private and the single instance is
      * obtained through Evaluator::getInstance().
 15 | **/
 16 | class Evaluator {
 17 | private:
          // Built-in per-gate constants, one table per gate type.
          // Single-qubit gates have one entry per qubit index; controlled gates have a
          // LOCAL_QUBIT_SIZE x LOCAL_QUBIT_SIZE table whose diagonal is 0.
          // NOTE(review): the V100_ prefix suggests timings measured on a V100 GPU, and the
          // two indices of the 2-D tables are presumably control/target qubit — confirm
          // both (and the index order) against evaluator.cpp.
 18 |     const double V100_U1[LOCAL_QUBIT_SIZE] = {235,225,225,225,225,225,224,225,225,225};
 19 |     const double V100_U2[LOCAL_QUBIT_SIZE] = {470,469,469,469,469,469,469,470,469,469};
 20 |     const double V100_U3[LOCAL_QUBIT_SIZE] = {469,469,469,469,469,469,469,469,469,469};
 21 |     const double V100_H[LOCAL_QUBIT_SIZE]  = {352,352,352,352,352,352,352,352,352,352};
 22 |     const double V100_X[LOCAL_QUBIT_SIZE]  = {350,350,350,350,350,350,350,350,350,350};
 23 |     const double V100_Y[LOCAL_QUBIT_SIZE]  = {350,350,350,350,350,349,349,350,350,350};
 24 |     const double V100_Z[LOCAL_QUBIT_SIZE]  = {194,194,194,194,194,194,194,194,194,194};
 25 |     const double V100_S[LOCAL_QUBIT_SIZE]  = {209,209,209,209,209,209,209,209,209,209};
 26 |     const double V100_SDG[LOCAL_QUBIT_SIZE]  = {209,209,209,209,209,209,209,209,209,209}; // TODO
 27 |     const double V100_T[LOCAL_QUBIT_SIZE]  = {216,216,216,216,216,216,217,216,216,216};
 28 |     const double V100_TDG[LOCAL_QUBIT_SIZE]  = {216,216,216,216,216,216,217,216,216,216}; // TODO
 29 |     const double V100_RX[LOCAL_QUBIT_SIZE] = {370,370,370,370,370,370,370,370,370,370};
 30 |     const double V100_RY[LOCAL_QUBIT_SIZE] = {367,367,367,367,367,367,367,367,367,367};
 31 |     const double V100_RZ[LOCAL_QUBIT_SIZE] = {369,369,369,369,369,369,369,369,369,369};
 32 | 
 33 |     const double V100_CN[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
 34 |         0,213,195,345,193,193,193,193,193,193,
 35 |         193,0,193,193,345,193,193,193,193,193,
 36 |         193,193,0,193,193,345,193,193,193,193,
 37 |         345,193,193,0,193,193,193,193,193,193,
 38 |         193,345,193,193,0,193,193,193,193,193,
 39 |         193,193,345,193,193,0,193,193,193,193,
 40 |         193,193,193,193,193,193,0,193,193,193,
 41 |         193,193,193,193,193,193,193,0,193,193,
 42 |         193,193,193,193,193,193,193,193,0,193,
 43 |         193,193,193,193,193,193,193,193,193,0,
 44 |     };
 45 |     const double V100_CY[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
 46 |         0,193,193,346,193,193,193,193,193,193,
 47 |         193,0,193,193,346,193,193,193,193,193,
 48 |         193,193,0,193,193,345,193,193,193,193,
 49 |         346,193,193,0,193,193,192,193,193,193,
 50 |         193,345,193,193,0,193,193,193,193,193,
 51 |         193,193,345,193,193,0,193,193,193,193,
 52 |         193,193,193,193,193,193,0,193,192,193,
 53 |         193,193,193,193,193,193,193,0,193,193,
 54 |         193,193,193,193,193,192,193,193,0,193,
 55 |         193,193,192,193,193,192,193,193,193,0,
 56 |     };
 57 |     const double V100_CZ[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
 58 |         0,137,137,191,137,137,137,137,137,137,
 59 |         137,0,137,137,190,137,137,137,137,137,
 60 |         137,137,0,137,137,191,137,137,137,137,
 61 |         190,137,137,0,137,137,137,137,137,137,
 62 |         137,190,137,137,0,137,137,137,137,137,
 63 |         137,137,191,137,137,0,137,137,137,137,
 64 |         137,137,137,137,137,137,0,137,137,137,
 65 |         137,137,137,137,137,137,137,0,137,137,
 66 |         137,137,137,137,137,137,137,137,0,137,
 67 |         137,137,137,137,137,137,137,137,137,0,
 68 |     };
 69 |     const double V100_CRX[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
 70 |         0,224,224,358,224,224,223,224,224,224,
 71 |         224,0,224,224,358,224,224,224,223,224,
 72 |         224,224,0,224,224,358,223,224,224,223,
 73 |         358,224,223,0,224,223,224,223,223,224,
 74 |         223,358,224,224,0,224,224,223,223,224,
 75 |         223,223,358,224,224,0,223,224,224,224,
 76 |         224,223,223,223,224,224,0,224,224,224,
 77 |         224,224,224,224,224,224,223,0,224,223,
 78 |         224,224,224,224,224,224,224,223,0,224,
 79 |         224,224,224,224,224,224,224,224,223,0,
 80 |     };
 81 |     const double V100_CRY[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
 82 |         0,225,225,356,225,225,225,225,225,225,
 83 |         225,0,225,225,356,225,224,225,225,225,
 84 |         225,225,0,225,224,356,225,225,225,225,
 85 |         356,225,225,0,225,225,225,225,224,225,
 86 |         225,356,225,225,0,225,224,225,225,225,
 87 |         225,225,356,225,225,0,225,225,225,225,
 88 |         225,225,225,225,225,224,0,225,225,225,
 89 |         225,225,225,225,224,225,225,0,225,225,
 90 |         225,225,225,225,225,225,225,225,0,225,
 91 |         225,225,225,225,225,225,225,225,225,0,
 92 |     };
 93 |     const double V100_CU1[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
 94 |         // FIXME: these values are identical to the V100_CRY table above
 95 |         0,225,225,356,225,225,225,225,225,225,
 96 |         225,0,225,225,356,225,224,225,225,225,
 97 |         225,225,0,225,224,356,225,225,225,225,
 98 |         356,225,225,0,225,225,225,225,224,225,
 99 |         225,356,225,225,0,225,224,225,225,225,
100 |         225,225,356,225,225,0,225,225,225,225,
101 |         225,225,225,225,225,224,0,225,225,225,
102 |         225,225,225,225,224,225,225,0,225,225,
103 |         225,225,225,225,225,225,225,225,0,225,
104 |         225,225,225,225,225,225,225,225,225,0,
105 |     };
106 |     const double V100_CRZ[LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE] = {
107 |         0,224,224,359,224,224,224,224,224,224,
108 |         224,0,224,224,359,224,224,224,224,224,
109 |         224,224,0,224,224,359,224,224,224,224,
110 |         359,224,224,0,224,224,224,224,224,224,
111 |         224,359,224,224,0,224,224,224,224,224,
112 |         224,224,359,224,224,0,224,224,224,224,
113 |         224,224,224,224,224,224,0,224,224,224,
114 |         224,224,224,224,224,224,224,0,224,224,
115 |         224,224,224,224,224,224,224,224,0,224,
116 |         224,224,224,224,224,224,224,224,224,0,
117 |     };
118 | 
119 |     // pergate single-qubit gate performance, indexed [numQubits][gate type][qubit]; perfPerGate divides the summed values by 1000 and by 512 (presumably 512 timed runs)
120 |     double pergate_single_perf[MAX_QBITS + 1][GATE_NUM][LOCAL_QUBIT_SIZE];
121 |     // pergate control gate performance, indexed [numQubits][gate type][qubit][qubit]
122 |     double pergate_ctr_perf[MAX_QBITS + 1][GATE_NUM][LOCAL_QUBIT_SIZE][LOCAL_QUBIT_SIZE];
123 |     // BLAS backend performance, indexed [numQubits][blasSize]; perfBLAS adds cutt_cost[numQubits] to it
124 |     double BLAS_perf[MAX_QBITS + 1][MAX_QBITS + 1];
125 |     double cutt_cost[MAX_QBITS + 1];
          // NOTE(review): presumably records which qubit counts loadParam() has already handled — confirm
126 |     bool num_qbits_loaded_param[MAX_QBITS + 1];
          // overhead of one pergate group; perfPerGate scales it by (1 << numQubits)
127 |     const double pergate_group_overhead = 1.0 / (1 << 27);
128 | 
129 |     int param_type;
130 | 
131 |     Evaluator();
132 |     
133 |     static Evaluator* instance_ptr;
134 | public:
          // Returns the single instance, allocating it with new on the first call.
          // No locking is done here and the instance is never deleted in this header.
135 |     static Evaluator* getInstance() {
136 |         if(instance_ptr == nullptr) {
137 |             instance_ptr = new Evaluator;
138 |         }
139 |         return instance_ptr;
140 |     }
141 |     void loadPergateSingle(int numQubits, FILE* qbit_param, GateType gate_type);
142 |     void loadPergateCtr(int numQubits, FILE* qbit_param, GateType gate_type);
143 |     void loadParam(int numQubits);
144 |     double perfPerGate(int numQubits, const GateGroup* gg);
145 |     double perfPerGate(int numQubits, const std::vector<GateType>& types);
          // returns BLAS_perf[numQubits][blasSize] + cutt_cost[numQubits] after calling loadParam(numQubits)
146 |     double perfBLAS(int numQubits, int blasSize);
147 |     // return True if choose pergate over BLAS (per-gate predicted cost per gate is strictly lower)
148 |     bool PerGateOrBLAS(const GateGroup* gg_pergate, const GateGroup* gg_blas, int numQubits, int blasSize);
149 | };
150 | 


--------------------------------------------------------------------------------
/src/executor.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "utils.h"
 3 | 
 4 | #include <cutt.h>
 5 | #include <vector>
 6 | #include <map>
 7 | 
 8 | #include "schedule.h"
 9 | 
10 | class Executor {
     // Runs a Schedule against a set of device state vectors.
     // Only declarations are visible here; behavioural notes below are hedged where
     // they rely on names rather than on code in this header.
11 | public:
         // Takes one state-vector pointer per entry of deviceStateVec, the qubit count and
         // the schedule; the schedule is held by reference (see the `schedule` member), so
         // it must outlive this Executor.
12 |     Executor(std::vector<qComplex*> deviceStateVec, int numQubits, Schedule& schedule);
13 |     void run();
14 | private:
15 |     // instructions
16 |     void transpose(std::vector<cuttHandle> plans);
17 |     void all2all(int commSize, std::vector<int> comm);
18 |     void setState(const State& newState) { state = newState; }
         // NOTE(review): sliceID defaults to -1, presumably meaning "not sliced" — confirm in executor.cpp
19 |     void applyGateGroup(GateGroup& gg, int sliceID = -1);
20 |     void applyPerGateGroup(GateGroup& gg);
21 |     void applyBlasGroup(GateGroup& gg);
22 |     void applyPerGateGroupSliced(GateGroup& gg, int sliceID);
23 |     void applyBlasGroupSliced(GateGroup& gg, int sliceID);
24 |     void finalize();
25 |     void storeState();
26 |     void loadState();
27 |     void sliceBarrier(int sliceID);
28 |     void allBarrier();
29 | 
30 |     // utils
31 |     qindex toPhyQubitSet(qindex logicQubitset) const;
32 |     qindex fillRelatedQubits(qindex related) const;
33 |     KernelGate getGate(const Gate& gate, int part_id, int numLocalQubits, qindex relatedLogicQb, const std::map<int, int>& toID) const;
34 | 
35 |     // internal
36 |     void prepareBitMap(qindex relatedQubits, unsigned int& blockHot, unsigned int& threadBias, int numLocalQubits); // allocate threadBias
37 |     std::map<int, int> getLogicShareMap(qindex relatedQubits, int numLocalQubits) const; // input: physical, output logic -> share
38 | 
39 |     State state;
40 |     State oldState;
         // NOTE(review): the three vectors below are one-dimensional although their comments
         // describe [slice][gpuID] indexing — presumably a flattened layout; confirm in executor.cpp
41 |     std::vector<cudaEvent_t> commEvents; // commEvents[slice][gpuID]
42 |     std::vector<int> partID; // partID[slice][gpuID]
43 |     std::vector<int> peer; // peer[slice][gpuID]
44 | 
45 |     // constants
46 |     std::vector<unsigned int*> threadBias;
47 |     std::vector<qComplex*> deviceStateVec;
48 |     std::vector<qComplex*> deviceBuffer;
49 |     int numQubits;
50 |     int numSlice, numSliceBit;
51 | 
52 |     //schedule
53 |     Schedule& schedule;
54 |     
55 | };


--------------------------------------------------------------------------------
/src/gate.cpp:
--------------------------------------------------------------------------------
  1 | #include "gate.h"
  2 | 
  3 | #include <cmath>
  4 | #include <cstring>
  5 | #include <assert.h>
  6 | 
  7 | static int globalGateID = 0; // file-local counter; every Gate factory below assigns ++globalGateID, so IDs start at 1
  8 | 
  9 | Gate Gate::CCX(int controlQubit, int controlQubit2, int targetQubit) {
 10 |     Gate g;
 11 |     g.gateID = ++ globalGateID;
 12 |     g.type = GateType::CCX;
 13 |     g.mat[0][0] = make_qComplex(0); g.mat[0][1] = make_qComplex(1);
 14 |     g.mat[1][0] = make_qComplex(1); g.mat[1][1] = make_qComplex(0);
 15 |     g.name = "CCX";
 16 |     g.targetQubit = targetQubit;
 17 |     g.controlQubit = controlQubit;
 18 |     g.controlQubit2 = controlQubit2;
 19 |     return g;
 20 | 
 21 | }
 22 | 
 23 | Gate Gate::CNOT(int controlQubit, int targetQubit) {
 24 |     Gate g;
 25 |     g.gateID = ++ globalGateID;
 26 |     g.type = GateType::CNOT;
 27 |     g.mat[0][0] = make_qComplex(0); g.mat[0][1] = make_qComplex(1);
 28 |     g.mat[1][0] = make_qComplex(1); g.mat[1][1] = make_qComplex(0);
 29 |     g.name = "CN";
 30 |     g.targetQubit = targetQubit;
 31 |     g.controlQubit = controlQubit;
 32 |     return g;
 33 | }
 34 | 
 35 | Gate Gate::CY(int controlQubit, int targetQubit) {
 36 |     Gate g;
 37 |     g.gateID = ++ globalGateID;
 38 |     g.type = GateType::CY;
 39 |     g.mat[0][0] = make_qComplex(0); g.mat[0][1] = make_qComplex(0, -1);
 40 |     g.mat[1][0] = make_qComplex(0, 1); g.mat[1][1] = make_qComplex(0);
 41 |     g.name = "CY";
 42 |     g.targetQubit = targetQubit;
 43 |     g.controlQubit = controlQubit;
 44 |     return g;
 45 | }
 46 | 
 47 | Gate Gate::CZ(int controlQubit, int targetQubit) {
 48 |     Gate g;
 49 |     g.gateID = ++ globalGateID;
 50 |     g.type = GateType::CZ;
 51 |     g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0);
 52 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(-1);
 53 |     g.name = "CZ";
 54 |     g.targetQubit = targetQubit;
 55 |     g.controlQubit = controlQubit;
 56 |     return g;
 57 | }
 58 | 
 59 | Gate Gate::CRX(int controlQubit, int targetQubit, qreal angle) {
 60 |     Gate g;
 61 |     g.gateID = ++ globalGateID;
 62 |     g.type = GateType::CRX;
 63 |     g.mat[0][0] = make_qComplex(cos(angle/2.0)); g.mat[0][1] = make_qComplex(0, -sin(angle/2.0));
 64 |     g.mat[1][0] = make_qComplex(0, -sin(angle/2.0)); g.mat[1][1] = make_qComplex(cos(angle/2.0));
 65 |     g.name = "CRX";
 66 |     g.targetQubit = targetQubit;
 67 |     g.controlQubit = controlQubit;
 68 |     return g;
 69 | }
 70 | 
 71 | Gate Gate::CRY(int controlQubit, int targetQubit, qreal angle) {
 72 |     Gate g;
 73 |     g.gateID = ++ globalGateID;
 74 |     g.type = GateType::CRY;
 75 |     g.mat[0][0] = make_qComplex(cos(angle/2.0)); g.mat[0][1] = make_qComplex(-sin(angle/2.0));
 76 |     g.mat[1][0] = make_qComplex(sin(angle/2.0)); g.mat[1][1] = make_qComplex(cos(angle/2.0));
 77 |     g.name = "CRY";
 78 |     g.targetQubit = targetQubit;
 79 |     g.controlQubit = controlQubit;
 80 |     return g;
 81 | }
 82 | 
 83 | Gate Gate::CU1(int controlQubit, int targetQubit, qreal lambda) {
 84 |     Gate g;
 85 |     g.gateID = ++ globalGateID;
 86 |     g.type = GateType::CU1;
 87 |     g.mat[0][0] = make_qComplex(1);
 88 |     g.mat[0][1] = make_qComplex(0);
 89 |     g.mat[1][0] = make_qComplex(0);
 90 |     g.mat[1][1] = make_qComplex(cos(lambda), sin(lambda));
 91 |     g.name = "CU1";
 92 |     g.targetQubit = targetQubit;
 93 |     g.controlQubit = controlQubit;
 94 |     return g;
 95 | }
 96 | 
 97 | Gate Gate::CRZ(int controlQubit, int targetQubit, qreal angle) {
 98 |     Gate g;
 99 |     g.gateID = ++ globalGateID;
100 |     g.type = GateType::CRZ;
101 |     g.mat[0][0] = make_qComplex(cos(angle/2), -sin(angle/2)); g.mat[0][1] = make_qComplex(0);
102 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(cos(angle/2), sin(angle/2));
103 |     g.name = "CRZ";
104 |     g.targetQubit = targetQubit;
105 |     g.controlQubit = controlQubit;
106 |     return g;
107 | }
108 | 
109 | 
110 | Gate Gate::U1(int targetQubit, qreal lambda) {
111 |     Gate g;
112 |     g.gateID = ++ globalGateID;
113 |     g.type = GateType::U1;
114 |     g.mat[0][0] = make_qComplex(1);
115 |     g.mat[0][1] = make_qComplex(0);
116 |     g.mat[1][0] = make_qComplex(0);
117 |     g.mat[1][1] = make_qComplex(cos(lambda), sin(lambda));
118 |     g.name = "U1";
119 |     g.targetQubit = targetQubit;
120 |     g.controlQubit = -1;
121 |     return g;
122 | }
123 | 
124 | Gate Gate::U2(int targetQubit, qreal phi, qreal lambda) {
125 |     Gate g;
126 |     g.gateID = ++ globalGateID;
127 |     g.type = GateType::U2;
128 |     g.mat[0][0] = make_qComplex(1.0 / sqrt(2));
129 |     g.mat[0][1] = make_qComplex(-cos(lambda) / sqrt(2), -sin(lambda) / sqrt(2));
130 |     g.mat[1][0] = make_qComplex(cos(phi) / sqrt(2), sin(phi) / sqrt(2));
131 |     g.mat[1][1] = make_qComplex(cos(lambda + phi) / sqrt(2), sin(lambda + phi) / sqrt(2));
132 |     g.name = "U2";
133 |     g.targetQubit = targetQubit;
134 |     g.controlQubit = -1;
135 |     return g;
136 | }
137 | 
138 | Gate Gate::U3(int targetQubit, qreal theta, qreal phi, qreal lambda) {
139 |     Gate g;
140 |     g.gateID = ++ globalGateID;
141 |     g.type = GateType::U3;
142 |     g.mat[0][0] = make_qComplex(cos(theta / 2));
143 |     g.mat[0][1] = make_qComplex(-cos(lambda) * sin(theta / 2), -sin(lambda) * sin(theta / 2));
144 |     g.mat[1][0] = make_qComplex(cos(phi) * sin(theta / 2), sin(phi) * sin(theta / 2));
145 |     g.mat[1][1] = make_qComplex(cos(phi + lambda) * cos(theta / 2), sin(phi + lambda) * cos(theta / 2));
146 |     g.name = "U3";
147 |     g.targetQubit = targetQubit;
148 |     g.controlQubit = -1;
149 |     return g;
150 | }
151 | 
152 | Gate Gate::H(int targetQubit) {
153 |     Gate g;
154 |     g.gateID = ++ globalGateID;
155 |     g.type = GateType::H;
156 |     g.mat[0][0] = make_qComplex(1/sqrt(2)); g.mat[0][1] = make_qComplex(1/sqrt(2));
157 |     g.mat[1][0] = make_qComplex(1/sqrt(2)); g.mat[1][1] = make_qComplex(-1/sqrt(2));
158 |     g.name = "H";
159 |     g.targetQubit = targetQubit;
160 |     g.controlQubit = -1;
161 |     return g;
162 | }
163 | 
164 | Gate Gate::X(int targetQubit) {
165 |     Gate g;
166 |     g.gateID = ++ globalGateID;
167 |     g.type = GateType::X;
168 |     g.mat[0][0] = make_qComplex(0); g.mat[0][1] = make_qComplex(1);
169 |     g.mat[1][0] = make_qComplex(1); g.mat[1][1] = make_qComplex(0);
170 |     g.name = "X";
171 |     g.targetQubit = targetQubit;
172 |     g.controlQubit = -1;
173 |     return g;
174 | }
175 | 
176 | Gate Gate::Y(int targetQubit) {
177 |     Gate g;
178 |     g.gateID = ++ globalGateID;
179 |     g.type = GateType::Y;
180 |     g.mat[0][0] = make_qComplex(0); g.mat[0][1] = make_qComplex(0, -1);
181 |     g.mat[1][0] = make_qComplex(0, 1); g.mat[1][1] = make_qComplex(0);
182 |     g.name = "Y";
183 |     g.targetQubit = targetQubit;
184 |     g.controlQubit = -1;
185 |     return g;
186 | }
187 | 
188 | Gate Gate::Z(int targetQubit) {
189 |     Gate g;
190 |     g.gateID = ++ globalGateID;
191 |     g.type = GateType::Z;
192 |     g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0);
193 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(-1);
194 |     g.name = "Z";
195 |     g.targetQubit = targetQubit;
196 |     g.controlQubit = -1;
197 |     return g;
198 | }
199 | 
200 | Gate Gate::S(int targetQubit) {
201 |     Gate g;
202 |     g.gateID = ++ globalGateID;
203 |     g.type = GateType::S;
204 |     g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0);
205 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(0, 1);
206 |     g.name = "S";
207 |     g.targetQubit = targetQubit;
208 |     g.controlQubit = -1;
209 |     return g;
210 | }
211 | 
212 | Gate Gate::SDG(int targetQubit) {
213 |     Gate g;
214 |     g.gateID = ++ globalGateID;
215 |     g.type = GateType::SDG;
216 |     g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0);
217 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(0, -1);
218 |     g.name = "SDG";
219 |     g.targetQubit = targetQubit;
220 |     g.controlQubit = -1;
221 |     return g;
222 | }
223 | 
224 | Gate Gate::T(int targetQubit) {
225 |     Gate g;
226 |     g.gateID = ++ globalGateID;
227 |     g.type = GateType::T;
228 |     g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0);
229 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(1/sqrt(2), 1/sqrt(2));
230 |     g.name = "T";
231 |     g.targetQubit = targetQubit;
232 |     g.controlQubit = -1;
233 |     return g;
234 | }
235 | 
236 | Gate Gate::TDG(int targetQubit) {
237 |     Gate g;
238 |     g.gateID = ++ globalGateID;
239 |     g.type = GateType::T;
240 |     g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0);
241 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(1/sqrt(2), -1/sqrt(2));
242 |     g.name = "TDG";
243 |     g.targetQubit = targetQubit;
244 |     g.controlQubit = -1;
245 |     return g;
246 | }
247 | 
248 | Gate Gate::RX(int targetQubit, qreal angle) {
249 |     Gate g;
250 |     g.gateID = ++ globalGateID;
251 |     g.type = GateType::RX;
252 |     g.mat[0][0] = make_qComplex(cos(angle/2.0)); g.mat[0][1] = make_qComplex(0, -sin(angle/2.0));
253 |     g.mat[1][0] = make_qComplex(0, -sin(angle/2.0)); g.mat[1][1] = make_qComplex(cos(angle/2.0));
254 |     g.name = "RX";
255 |     g.targetQubit = targetQubit;
256 |     g.controlQubit = -1;
257 |     return g;
258 | }
259 | 
260 | Gate Gate::RY(int targetQubit, qreal angle) {
261 |     Gate g;
262 |     g.gateID = ++ globalGateID;
263 |     g.type = GateType::RY;
264 |     g.mat[0][0] = make_qComplex(cos(angle/2.0)); g.mat[0][1] = make_qComplex(-sin(angle/2.0));
265 |     g.mat[1][0] = make_qComplex(sin(angle/2.0)); g.mat[1][1] = make_qComplex(cos(angle/2.0));
266 |     g.name = "RY";
267 |     g.targetQubit = targetQubit;
268 |     g.controlQubit = -1;
269 |     return g;
270 | }
271 | 
272 | Gate Gate::RZ(int targetQubit, qreal angle) {
273 |     Gate g;
274 |     g.gateID = ++ globalGateID;
275 |     g.type = GateType::RZ;
276 |     g.mat[0][0] = make_qComplex(cos(angle/2), -sin(angle/2)); g.mat[0][1] = make_qComplex(0);
277 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(cos(angle/2), sin(angle/2));
278 |     g.name = "RZ";
279 |     g.targetQubit = targetQubit;
280 |     g.controlQubit = -1;
281 |     return g;
282 | }
283 | 
284 | Gate Gate::ID(int targetQubit) {
285 |     Gate g;
286 |     g.gateID = ++ globalGateID;
287 |     g.type = GateType::ID;
288 |     g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0);
289 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(1);
290 |     g.name = "ID";
291 |     g.targetQubit = targetQubit;
292 |     g.controlQubit = -1;
293 |     return g;
294 | }
295 | 
296 | Gate Gate::GII(int targetQubit) {
297 |     Gate g;
298 |     g.gateID = ++ globalGateID;
299 |     g.type = GateType::GII;
300 |     g.mat[0][0] = make_qComplex(0, 1); g.mat[0][1] = make_qComplex(0);
301 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(0, 1);
302 |     g.name = "GII";
303 |     g.targetQubit = targetQubit;
304 |     g.controlQubit = -1;
305 |     return g;
306 | }
307 | 
308 | Gate Gate::GZZ(int targetQubit) {
309 |     Gate g;
310 |     g.gateID = ++ globalGateID;
311 |     g.type = GateType::GZZ;
312 |     g.mat[0][0] = make_qComplex(-1); g.mat[0][1] = make_qComplex(0);
313 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(-1);
314 |     g.name = "GZZ";
315 |     g.targetQubit = targetQubit;
316 |     g.controlQubit = -1;
317 |     return g;
318 | }
319 | 
320 | Gate Gate::GOC(int targetQubit, qreal real, qreal imag) {
321 |     Gate g;
322 |     g.gateID = ++ globalGateID;
323 |     g.type = GateType::GOC;
324 |     g.mat[0][0] = make_qComplex(1); g.mat[0][1] = make_qComplex(0);
325 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(real, imag);
326 |     g.name = "GOC";
327 |     g.targetQubit = targetQubit;
328 |     g.controlQubit = -1;
329 |     return g;
330 | }
331 | 
332 | Gate Gate::GCC(int targetQubit, qreal real, qreal imag) {
333 |     Gate g;
334 |     g.gateID = ++ globalGateID;
335 |     g.type = GateType::GCC;
336 |     g.mat[0][0] = make_qComplex(real, imag); g.mat[0][1] = make_qComplex(0);
337 |     g.mat[1][0] = make_qComplex(0); g.mat[1][1] = make_qComplex(real, imag);
338 |     g.name = "GCC";
339 |     g.targetQubit = targetQubit;
340 |     g.controlQubit = -1;
341 |     return g;
342 | }
343 | 
// Pseudo-random double in [0, 1], taken from one rand() draw.
auto gen_01_float = []() {
    const double draw = rand() * 1.0;
    return draw / RAND_MAX;
};
// Pseudo-random angle in [0, 2*pi]: one gen_01_float() sample scaled by
// acos(-1) (pi) and then by 2.
auto gen_0_2pi_float = []() {
    const auto unit = gen_01_float();
    return unit * acos(-1) * 2;
};
350 | 
351 | Gate Gate::random(int lo, int hi) {
352 |     int type = rand() % int(GateType::TOTAL);
353 |     return random(lo, hi, GateType(type));
354 | }
355 | 
356 | Gate Gate::random(int lo, int hi, GateType type) {
357 |     auto gen_c2_id = [lo, hi](int &t, int &c1, int &c2) {
358 |         assert(hi - lo >= 3);
359 |         do {
360 |             c2 = rand() % (hi - lo) + lo;
361 |             c1 = rand() % (hi - lo) + lo;
362 |             t = rand() % (hi - lo) + lo;
363 |         } while (c2 == c1 || c2 == t || c1 == t);
364 |     };
365 |     auto gen_c1_id = [lo, hi](int &t, int &c1) {
366 |         assert(hi - lo >= 2);
367 |         do {
368 |             c1 = rand() % (hi - lo) + lo;
369 |             t = rand() % (hi - lo) + lo;
370 |         } while (c1 == t);
371 |     };
372 |     auto gen_single_id = [lo, hi](int &t) {
373 |         t = rand() % (hi - lo) + lo;
374 |     };
375 |     switch (type) {
376 |         case GateType::CCX: {
377 |             int t, c1, c2;
378 |             gen_c2_id(t, c1, c2);
379 |             return CCX(c1, c2, t);
380 |         }
381 |         case GateType::CNOT: {
382 |             int t, c1;
383 |             gen_c1_id(t, c1);
384 |             return CNOT(c1, t);
385 |         }
386 |         case GateType::CY: {
387 |             int t, c1;
388 |             gen_c1_id(t, c1);
389 |             return CY(c1, t);
390 |         }
391 |         case GateType::CZ: {
392 |             int t, c1;
393 |             gen_c1_id(t, c1);
394 |             return CZ(c1, t);
395 |         }
396 |         case GateType::CRX: {
397 |             int t, c1;
398 |             gen_c1_id(t, c1);
399 |             return CRX(c1, t, gen_0_2pi_float());
400 |         }
401 |         case GateType::CRY: {
402 |             int t, c1;
403 |             gen_c1_id(t, c1);
404 |             return CRY(c1, t, gen_0_2pi_float());
405 |         }
406 |         case GateType::CU1: {
407 |             int t, c1;
408 |             gen_c1_id(t, c1);
409 |             return CU1(c1, t, gen_0_2pi_float());
410 |         }
411 |         case GateType::CRZ: {
412 |             int t, c1;
413 |             gen_c1_id(t, c1);
414 |             return CRZ(c1, t, gen_0_2pi_float());
415 |         }
416 |         case GateType::U1: {
417 |             int t;
418 |             gen_single_id(t);
419 |             return U1(t, gen_0_2pi_float());
420 |         }
421 |         case GateType::U2: {
422 |             int t;
423 |             gen_single_id(t);
424 |             return U2(t, gen_0_2pi_float(), gen_0_2pi_float());
425 |         }
426 |         case GateType::U3: {
427 |             int t;
428 |             gen_single_id(t);
429 |             return U3(t, gen_0_2pi_float(), gen_0_2pi_float(), gen_0_2pi_float());
430 |         }
431 |         case GateType::H: {
432 |             int t;
433 |             gen_single_id(t);
434 |             return H(t);
435 |         }
436 |         case GateType::X: {
437 |             int t;
438 |             gen_single_id(t);
439 |             return X(t);
440 |         }
441 |         case GateType::Y: {
442 |             int t;
443 |             gen_single_id(t);
444 |             return Y(t);
445 |         }
446 |         case GateType::Z: {
447 |             int t;
448 |             gen_single_id(t);
449 |             return Z(t);
450 |         }
451 |         case GateType::S: {
452 |             int t;
453 |             gen_single_id(t);
454 |             return S(t);
455 |         }
456 |         case GateType::SDG: {
457 |             int t;
458 |             gen_single_id(t);
459 |             return SDG(t);
460 |         }
461 |         case GateType::T: {
462 |             int t;
463 |             gen_single_id(t);
464 |             return T(t);
465 |         }
466 |         case GateType::TDG: {
467 |             int t;
468 |             gen_single_id(t);
469 |             return TDG(t);
470 |         }
471 |         case GateType::RX: {
472 |             int t;
473 |             gen_single_id(t);
474 |             return RX(t, gen_0_2pi_float());
475 |         }
476 |         case GateType::RY: {
477 |             int t;
478 |             gen_single_id(t);
479 |             return RY(t, gen_0_2pi_float());
480 |         }
481 |         case GateType::RZ: {
482 |             int t;
483 |             gen_single_id(t);
484 |             return RZ(t, gen_0_2pi_float());
485 |         }
486 |         default: {
487 |             printf("invalid %d\n", (int) type);
488 |             assert(false);
489 |         }
490 |     }
491 |     exit(1);
492 | }
493 | 
494 | Gate Gate::control(int controlQubit, int targetQubit, GateType type) {
495 |     switch (type) {
496 |         case GateType::CNOT: {
497 |             return CNOT(controlQubit, targetQubit);
498 |         }
499 |         case GateType::CY: {
500 |             return CY(controlQubit, targetQubit);
501 |         }
502 |         case GateType::CZ: {
503 |             return CZ(controlQubit, targetQubit);
504 |         }
505 |         case GateType::CRX: {
506 |             return CRX(controlQubit, targetQubit, gen_0_2pi_float());
507 |         }
508 |         case GateType::CRY: {
509 |             return CRY(controlQubit, targetQubit, gen_0_2pi_float());
510 |         }
511 |         case GateType::CU1: {
512 |             return CU1(controlQubit, targetQubit, gen_0_2pi_float());
513 |         }
514 |         case GateType::CRZ: {
515 |             return CRZ(controlQubit, targetQubit, gen_0_2pi_float());
516 |         }
517 |         default: {
518 |             assert(false);
519 |         }
520 |     }
521 |     exit(1);
522 | }
523 | 
524 | GateType Gate::toCU(GateType type) {
525 |     if (type == GateType::CCX) {
526 |         return GateType::CNOT;
527 |     } else {
528 |         UNREACHABLE()
529 |     }
530 | }
531 | 
532 | GateType Gate::toU(GateType type) {
533 |     switch (type) {
534 |         case GateType::CCX:
535 |         case GateType::CNOT:
536 |             return GateType::X;
537 |         case GateType::CY:
538 |             return GateType::Y;
539 |         case GateType::CZ:
540 |             return GateType::Z;
541 |         case GateType::CRX:
542 |             return GateType::RX;
543 |         case GateType::CRY:
544 |             return GateType::RY;
545 |         case GateType::CU1:
546 |             return GateType::U1;
547 |         case GateType::CRZ:
548 |             return GateType::RZ;
549 |         default:
550 |             UNREACHABLE()
551 |     }
552 | }
553 | 
554 | std::string Gate::get_name(GateType ty) {
555 |     return random(0, 10, ty).name;
556 | }
557 | 
// Packs this gate into a freshly allocated byte buffer and returns it.
// Field order in the buffer: gateID, type, mat, name length, name bytes plus a
// terminating NUL, targetQubit, controlQubit, controlQubit2. Gate::deserialize
// reads the fields back in the same order.
// NOTE(review): SERIALIZE_STEP is defined outside this file; it presumably writes
// its argument at arr + cur and advances cur by the argument's size — confirm in utils.h.
558 | std::vector<unsigned char> Gate::serialize() const {
559 |     auto name_len = name.length();
          // total size: length prefix + name + NUL + every fixed-size field
560 |     int len =
561 |         sizeof(name_len) + name.length() + 1 + sizeof(gateID) + sizeof(type) + sizeof(mat)
562 |         + sizeof(targetQubit) + sizeof(controlQubit) + sizeof(controlQubit2);
563 |     std::vector<unsigned char> ret; ret.resize(len);
564 |     unsigned char* arr = ret.data();
565 |     int cur = 0;
566 |     SERIALIZE_STEP(gateID);
567 |     SERIALIZE_STEP(type);
          // the 2x2 matrix is copied as one raw block of four qComplex values
568 |     memcpy(arr + cur, mat, sizeof(mat)); cur += sizeof(qComplex) * 4;
569 |     SERIALIZE_STEP(name_len);
          // strcpy also writes the terminating NUL, hence the + 1
570 |     strcpy(reinterpret_cast<char*>(arr) + cur, name.c_str()); cur += name_len + 1;
571 |     SERIALIZE_STEP(targetQubit);
572 |     SERIALIZE_STEP(controlQubit);
573 |     SERIALIZE_STEP(controlQubit2);
          // sanity check: every byte of the buffer has been accounted for
574 |     assert(cur == len);
575 |     return ret;
576 | }
577 | 
// Rebuilds a Gate from the byte layout written by Gate::serialize.
// arr is the start of the buffer and cur is the read offset into it; cur is
// advanced past the bytes consumed so the caller can continue reading after
// this gate. No bounds checking is done on arr.
// NOTE(review): DESERIALIZE_STEP is defined outside this file; it presumably reads
// its argument from arr + cur and advances cur by the argument's size — confirm in utils.h.
578 | Gate Gate::deserialize(const unsigned char* arr, int& cur) {
579 |     Gate g;
580 |     DESERIALIZE_STEP(g.gateID);
581 |     DESERIALIZE_STEP(g.type);
582 |     memcpy(g.mat, arr + cur, sizeof(g.mat)); cur += sizeof(qComplex) * 4;
583 |     decltype(g.name.length()) name_len; DESERIALIZE_STEP(name_len);
          // read exactly name_len characters, then skip the NUL that serialize() wrote
584 |     g.name = std::string(reinterpret_cast<const char*>(arr) + cur, name_len); cur += name_len + 1;
585 |     DESERIALIZE_STEP(g.targetQubit);
586 |     DESERIALIZE_STEP(g.controlQubit);
587 |     DESERIALIZE_STEP(g.controlQubit2);
588 |     return g;
589 | }


--------------------------------------------------------------------------------
/src/gate.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <string>
  4 | #include <vector>
  5 | #include "utils.h"
  6 | 
  7 | enum class GateType {
  8 |     CCX, CNOT, CY, CZ, CRX, CRY, CU1, CRZ, U1, U2, U3, H, X, Y, Z, S, SDG, T, TDG, RX, RY, RZ, TOTAL, ID, GII, GZZ, GOC, GCC 
  9 | };
 10 | 
 11 | struct Gate { // Host-side record of one gate: id, type, 2x2 matrix, name and the qubits involved.
 12 |     int gateID;
 13 |     GateType type;
 14 |     qComplex mat[2][2];
 15 |     std::string name;
 16 |     int targetQubit;
 17 |     int controlQubit;  // stays -1 when the gate has no control
 18 |     int controlQubit2; // stays -1 when the gate has no second control
 19 |     Gate(): controlQubit(-1), controlQubit2(-1) {}
 20 |     Gate(const Gate&) = default;
 21 |     // A gate counts as controlled once its first control slot differs from the -1 sentinel.
 22 |     bool isControlGate() const { return !(controlQubit == -1); }
 23 |     // A gate counts as doubly controlled once its second control slot differs from -1.
 24 |     bool isC2Gate() const { return !(controlQubit2 == -1); }
 25 |     // True exactly for the gate types enumerated in the switch below.
 26 |     bool isDiagonal() const {
 27 |         switch (type) { case GateType::CZ: case GateType::CU1: case GateType::CRZ: case GateType::U1: case GateType::Z:
 28 |             case GateType::S: case GateType::SDG: case GateType::T: case GateType::TDG: case GateType::RZ: return true; default: return false; }
 29 |     }
 30 |     static Gate CCX(int c1, int c2, int targetQubit); // factories: two controls
 31 |     static Gate CNOT(int controlQubit, int targetQubit); // factories: one control
 32 |     static Gate CY(int controlQubit, int targetQubit);
 33 |     static Gate CZ(int controlQubit, int targetQubit);
 34 |     static Gate CRX(int controlQubit, int targetQubit, qreal angle);
 35 |     static Gate CRY(int controlQubit, int targetQubit, qreal angle);
 36 |     static Gate CU1(int controlQubit, int targetQubit, qreal lambda);
 37 |     static Gate CRZ(int controlQubit, int targetQubit, qreal angle);
 38 |     static Gate U1(int targetQubit, qreal lambda); // factories: target only
 39 |     static Gate U2(int targetQubit, qreal phi, qreal lambda);
 40 |     static Gate U3(int targetQubit, qreal theta, qreal phi, qreal lambda);
 41 |     static Gate H(int targetQubit);
 42 |     static Gate X(int targetQubit);
 43 |     static Gate Y(int targetQubit);
 44 |     static Gate Z(int targetQubit);
 45 |     static Gate S(int targetQubit);
 46 |     static Gate SDG(int targetQubit);
 47 |     static Gate T(int targetQubit);
 48 |     static Gate TDG(int targetQubit);
 49 |     static Gate RX(int targetQubit, qreal angle);
 50 |     static Gate RY(int targetQubit, qreal angle);
 51 |     static Gate RZ(int targetQubit, qreal angle);
 52 |     static Gate ID(int targetQubit);
 53 |     static Gate GII(int targetQubit);
 54 |     static Gate GTT(int targetQubit);
 55 |     static Gate GZZ(int targetQubit);
 56 |     static Gate GOC(int targetQubit, qreal real, qreal imag);
 57 |     static Gate GCC(int targetQubit, qreal real, qreal imag);
 58 |     static Gate random(int lo, int hi); // random-gate factories
 59 |     static Gate random(int lo, int hi, GateType type);
 60 |     static Gate control(int controlQubit, int targetQubit, GateType type);
 61 |     static GateType toCU(GateType type); // type conversions
 62 |     static GateType toU(GateType type);
 63 |     static std::string get_name(GateType ty);
 64 |     std::vector<unsigned char> serialize() const; // byte (de)serialization; deserialize advances cur
 65 |     static Gate deserialize(const unsigned char* arr, int& cur);
 66 | };
 67 | 
 68 | struct KernelGate { // Flattened gate record for the kernels: qubit indices, locality flags and the 2x2 matrix as 8 scalars.
 69 |     int targetQubit;
 70 |     int controlQubit;
 71 |     int controlQubit2;
 72 |     GateType type;
 73 |     char targetIsGlobal;  // 0-local 1-global
 74 |     char controlIsGlobal; // 0-local 1-global 2-not control
 75 |     char control2IsGlobal; // 0-local 1-global 2-not control
 76 |     qreal r00, i00, r01, i01, r10, i10, r11, i11; // rRC / iRC = mat[R][C].x / mat[R][C].y
 77 |     // Full form (two controls): stores every argument and unpacks mat element by element.
 78 |     KernelGate(
 79 |         GateType type_,
 80 |         int controlQubit2_, char control2IsGlobal_, 
 81 |         int controlQubit_, char controlIsGlobal_,
 82 |         int targetQubit_, char targetIsGlobal_,
 83 |         const qComplex mat[2][2]
 84 |     ):
 85 |         targetQubit(targetQubit_), controlQubit(controlQubit_), controlQubit2(controlQubit2_),
 86 |         type(type_),
 87 |         targetIsGlobal(targetIsGlobal_), controlIsGlobal(controlIsGlobal_), control2IsGlobal(control2IsGlobal_),
 88 |         r00(mat[0][0].x), i00(mat[0][0].y), r01(mat[0][1].x), i01(mat[0][1].y),
 89 |         r10(mat[1][0].x), i10(mat[1][0].y), r11(mat[1][1].x), i11(mat[1][1].y) {}
 90 |     // One-control form. NOTE(review): (2, -1) lands in (controlQubit2_, control2IsGlobal_), the reverse of the "qubit -1 / flag 2" convention in the field comments; left as is because other kernels may rely on it - confirm.
 91 |     KernelGate(
 92 |         GateType type_,
 93 |         int controlQubit_, char controlIsGlobal_,
 94 |         int targetQubit_, char targetIsGlobal_,
 95 |         const qComplex mat[2][2]
 96 |     ): KernelGate(type_, 2, -1, controlQubit_, controlIsGlobal_, targetQubit_, targetIsGlobal_, mat) {}
 97 |     // No-control form: both control slots receive the same (2, -1) pair as above.
 98 |     KernelGate(
 99 |         GateType type_,
100 |         int targetQubit_, char targetIsGlobal_,
101 |         const qComplex mat[2][2]
102 |     ): KernelGate(type_, 2, -1, 2, -1, targetQubit_, targetIsGlobal_, mat) {}
103 | 
104 |     KernelGate() = default;
105 |     // Identity gate on qubit 0, flagged local.
106 |     static KernelGate ID() {
107 |         qComplex mat[2][2] = {{{1, 0}, {0, 0}}, {{0, 0}, {1, 0}}}; // fully braced: a flat {1, 0, 0, 1} fills (x, y) pairs in order and does not yield the identity
108 |         return KernelGate(GateType::ID, 0, 0, mat);
109 |     }
110 | };


--------------------------------------------------------------------------------
/src/kernel.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <vector>
 4 | #include <cutt.h>
 5 | 
 6 | #include "gate.h"
 7 | #include "utils.h"
 8 | #include "compiler.h"
 9 | #include "circuit.h"
10 | 
11 | // kernelSimple
12 | void kernelInit(std::vector<qComplex*> &deviceStateVec, int numQubits);
13 | void kernelExecSimple(qComplex* deviceStateVec, int numQubits, const std::vector<Gate> & gates);
14 | qreal kernelMeasure(qComplex* deviceStateVec, int numQubits, int targetQubit);
15 | qComplex kernelGetAmp(qComplex* deviceStateVec, qindex idx);
16 | void kernelDeviceToHost(qComplex* hostStateVec, qComplex* deviceStateVec, int numQubits);
17 | void kernelDestroy(qComplex* deviceStateVec);
18 | void cuttPlanInit(std::vector<cuttHandle>& plans);
19 | 
20 | // kernelOpt (definitions of the three functions below are visible in kernels/baseline.cu)
21 | void initControlIdx(); // sizes the per-GPU pointer tables that launchExecutor indexes by gpuID
22 | // call cudaSetDevice() before this function
23 | void copyGatesToSymbol(KernelGate* hostGates, int numGates, cudaStream_t& stream, int gpuID); // copies numGates entries starting at hostGates[gpuID * numGates] into the device gate table, asynchronously on stream
24 | 
25 | // call cudaSetDevice() before this function
26 | void launchExecutor(int gridDim, qComplex* deviceStateVec, unsigned int* threadBias, int numLocalQubits, int numGates, unsigned int blockHot, unsigned int enumerate, cudaStream_t& stream, int gpuID); // launches the gate kernel with gridDim blocks on stream
27 | 
28 | 
29 | // kernelUtils
30 | void isnanTest(qComplex* data, int n, cudaStream_t& stream); // debug: enqueues a NaN scan of data on stream; the device traps when a NaN is found
31 | void printVector(qComplex* data, int n, cudaStream_t& stream); // debug: prints n (re, im) pairs from a single device thread
32 | void whileTrue(); // debug: launches a kernel that loops forever


--------------------------------------------------------------------------------
/src/kernelUtils.cu:
--------------------------------------------------------------------------------
 1 | #include "kernel.h"
 2 | #include <assert.h>
 3 | #include <cstdio>
 4 | #include <cuda.h>
 5 | 
 6 | __global__ void isnanTestKernel(qComplex *data, int n) { // with gridDim == 1
 7 |     // Each thread starts at its own threadIdx.x and advances by blockDim.x while below n.
 8 |     for (int pos = threadIdx.x; pos < n; pos += blockDim.x) {
 9 |         if (!isnan(data[pos].x) && !isnan(data[pos].y)) continue; // neither component is NaN
10 |         printf("nan at %d\n", pos);
11 |         asm("trap;"); // abort the kernel at the offending element
12 |     }
13 | }
14 | 
15 | __global__ void printVectorKernel(qComplex *data, int n) { // with gridDim == 1 && blockDim == 1
16 |     int k = 0;
17 |     while (k < n) { printf("(%f, %f)", data[k].x, data[k].y); ++k; } // one "(re, im)" pair per element, no separators
18 |     printf("\n"); // end the line after the last pair
19 | }
20 | 
21 | __global__ void whileTrueKernel() { // spins forever; never returns
22 |     for (;;) {}
23 | }
24 | 
25 | void isnanTest(qComplex* data, int n, cudaStream_t& stream) { // debug helper: scan data[0..n) for NaNs with one 32-thread block on stream
26 |     isnanTestKernel<<<1, 32, 0, stream>>>(data, n); // the kernel already strides by blockDim.x, so it needs the full element count (n / 32 only covered the first 1/32 of the data)
27 | }
28 | 
29 | void printVector(qComplex* data, int n, cudaStream_t& stream) { // debug helper: print n amplitudes from the device
30 |     printVectorKernel<<<dim3(1), dim3(1), 0, stream>>>(data, n); // one block of one thread, as the kernel expects
31 | }
32 | 
33 | void whileTrue() { // debug helper: launch the never-ending kernel (single thread, default stream)
34 |     whileTrueKernel<<<dim3(1), dim3(1)>>>();
35 | }
36 | 


--------------------------------------------------------------------------------
/src/kernels/baseline.cu:
--------------------------------------------------------------------------------
  1 | #include "kernel.h"
  2 | #include <cstdio>
  3 | #include <assert.h>
  4 | #include <map>
  5 | #include <omp.h>
  6 | #include "gate.h"
  7 | #include "executor.h"
  8 | using namespace std;
  9 | 
 10 | extern __shared__ qComplex shm[1<<LOCAL_QUBIT_SIZE]; // per-block amplitude buffer that the gate helpers below read and write
 11 | extern __shared__ qindex blockBias; // written by thread 0 in fetchData, read by all threads in fetchData/saveData
 12 | 
 13 | __device__ __constant__ qreal recRoot2 = 0.70710678118654752440084436210485; // 1/sqrt(2); more elegant way?
 14 | __constant__ KernelGate deviceGates[MAX_GATE]; // gate list read by doCompute, filled by copyGatesToSymbol
 15 | 
 16 | std::vector<int*> loIdx_device; // resized to the local GPU count in initControlIdx; entry gpuID is passed to run() as loArr
 17 | std::vector<int*> shiftAt_device; // same sizing; entry gpuID is passed to run() as shiftAt
 18 | 
 19 | 
 20 | __device__ __forceinline__ void XSingle(int loIdx, int hiIdx) { // exchange the two amplitudes of the pair
 21 |     const qComplex upper = shm[hiIdx];
 22 |     shm[hiIdx] = shm[loIdx];
 23 |     shm[loIdx] = upper;
 24 | }
 25 | 
 26 | __device__ __forceinline__ void YSingle(int loIdx, int hiIdx) { // lo' = -i * hi, hi' = i * lo
 27 |     const qComplex a1 = shm[hiIdx];
 28 |     const qComplex a0 = shm[loIdx];
 29 |     // (y, -x) is multiplication by -i; (-y, x) is multiplication by i.
 30 |     shm[loIdx] = make_qComplex(a1.y, -a1.x);
 31 |     shm[hiIdx] = make_qComplex(-a0.y, a0.x);
 32 | }
 33 | 
 34 | __device__ __forceinline__ void ZHi(int hiIdx) { // negate the amplitude stored at hiIdx
 35 |     const qComplex amp = shm[hiIdx];
 36 |     shm[hiIdx] = make_qComplex(-amp.x, -amp.y);
 37 | }
 38 | 
 39 | 
 40 | __device__ __forceinline__ void RXSingle(int loIdx, int hiIdx, qreal alpha, qreal beta) { // lo' = alpha*lo - i*beta*hi, hi' = alpha*hi - i*beta*lo
 41 |     const qComplex a1 = shm[hiIdx];
 42 |     const qComplex a0 = shm[loIdx];
 43 |     shm[loIdx] = make_qComplex(alpha * a0.x + beta * a1.y, alpha * a0.y - beta * a1.x);
 44 |     shm[hiIdx] = make_qComplex(alpha * a1.x + beta * a0.y, alpha * a1.y - beta * a0.x);
 45 | }
 46 | 
 47 | __device__ __forceinline__ void RYSingle(int loIdx, int hiIdx, qreal alpha, qreal beta) { // lo' = alpha*lo - beta*hi, hi' = beta*lo + alpha*hi
 48 |     const qComplex a1 = shm[hiIdx];
 49 |     const qComplex a0 = shm[loIdx];
 50 |     shm[loIdx] = make_qComplex(alpha * a0.x - beta * a1.x, alpha * a0.y - beta * a1.y);
 51 |     shm[hiIdx] = make_qComplex(beta * a0.x + alpha * a1.x, beta * a0.y + alpha * a1.y);
 52 | }
 53 | 
 54 | __device__ __forceinline__ void RZSingle(int loIdx, int hiIdx, qreal alpha, qreal beta) { // lo' = (alpha - i*beta)*lo, hi' = (alpha + i*beta)*hi
 55 |     const qComplex a1 = shm[hiIdx];
 56 |     const qComplex a0 = shm[loIdx];
 57 |     shm[loIdx] = make_qComplex(alpha * a0.x + beta * a0.y, alpha * a0.y - beta * a0.x);
 58 |     shm[hiIdx] = make_qComplex(alpha * a1.x - beta * a1.y, alpha * a1.y + beta * a1.x);
 59 | }
 60 | 
 61 | __device__ __forceinline__ void RZLo(int loIdx, qreal alpha, qreal beta) { // scale the amplitude at loIdx by (alpha - i*beta)
 62 |     const qComplex amp = shm[loIdx];
 63 |     shm[loIdx] = make_qComplex(alpha * amp.x + beta * amp.y, alpha * amp.y - beta * amp.x);
 64 | }
 65 | 
 66 | __device__ __forceinline__ void RZHi(int hiIdx, qreal alpha, qreal beta) { // scale the amplitude at hiIdx by (alpha + i*beta)
 67 |     const qComplex amp = shm[hiIdx];
 68 |     shm[hiIdx] = make_qComplex(alpha * amp.x - beta * amp.y, alpha * amp.y + beta * amp.x);
 69 | }
 70 | 
 71 | #define COMPLEX_MULTIPLY_REAL(v0, v1) ((v0).x * (v1).x - (v0).y * (v1).y) /* Re(v0 * v1); arguments are parenthesized so any expression can be passed (each is evaluated twice) */
 72 | #define COMPLEX_MULTIPLY_IMAG(v0, v1) ((v0).x * (v1).y + (v0).y * (v1).x) /* Im(v0 * v1); same argument rules as above */
 73 | 
 74 | __device__ __forceinline__ void U1Hi(int hiIdx, qComplex p) { // multiply the amplitude at hiIdx by the complex factor p
 75 |     const qComplex amp = shm[hiIdx];
 76 |     shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(amp, p), COMPLEX_MULTIPLY_IMAG(amp, p));
 77 | }
 78 | 
 79 | __device__ __forceinline__ void USingle(int loIdx, int hiIdx, qComplex v00, qComplex v01, qComplex v10, qComplex v11) { // lo' = lo*v00 + hi*v01, hi' = lo*v10 + hi*v11
 80 |     const qComplex a1 = shm[hiIdx];
 81 |     const qComplex a0 = shm[loIdx];
 82 |     shm[loIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(a0, v00) + COMPLEX_MULTIPLY_REAL(a1, v01),
 83 |                                COMPLEX_MULTIPLY_IMAG(a0, v00) + COMPLEX_MULTIPLY_IMAG(a1, v01));
 84 |     shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(a0, v10) + COMPLEX_MULTIPLY_REAL(a1, v11),
 85 |                                COMPLEX_MULTIPLY_IMAG(a0, v10) + COMPLEX_MULTIPLY_IMAG(a1, v11));
 86 | }
 87 | 
 88 | __device__ __forceinline__ void HSingle(int loIdx, int hiIdx) { // lo' = (lo + hi) / sqrt(2), hi' = (lo - hi) / sqrt(2)
 89 |     const qComplex a1 = shm[hiIdx];
 90 |     const qComplex a0 = shm[loIdx];
 91 |     shm[loIdx] = make_qComplex(recRoot2 * (a0.x + a1.x), recRoot2 * (a0.y + a1.y));
 92 |     shm[hiIdx] = make_qComplex(recRoot2 * (a0.x - a1.x), recRoot2 * (a0.y - a1.y));
 93 | }
 94 | 
 95 | __device__ __forceinline__ void SHi(int hiIdx) { // multiply the amplitude at hiIdx by i
 96 |     const qComplex amp = shm[hiIdx];
 97 |     shm[hiIdx] = make_qComplex(-amp.y, amp.x);
 98 | }
 99 | 
100 | __device__ __forceinline__ void SDGHi(int hiIdx) { // multiply the amplitude at hiIdx by -i
101 |     const qComplex amp = shm[hiIdx];
102 |     shm[hiIdx] = make_qComplex(amp.y, -amp.x);
103 | }
104 | 
105 | __device__ __forceinline__ void THi(int hiIdx) { // multiply the amplitude at hiIdx by (1 + i) / sqrt(2)
106 |     const qComplex amp = shm[hiIdx];
107 |     shm[hiIdx] = make_qComplex(recRoot2 * (amp.x - amp.y), recRoot2 * (amp.x + amp.y));
108 | }
109 | 
110 | __device__ __forceinline__ void TDGHi(int hiIdx) { // multiply the amplitude at hiIdx by (1 - i) / sqrt(2), the conjugate of the THi factor
111 |     qComplex hi = shm[hiIdx];
112 |     shm[hiIdx] = make_qComplex(recRoot2 * (hi.x + hi.y), recRoot2 * (hi.y - hi.x)); // (1 - i)(x + iy) = (x + y) + i(y - x); the imaginary part was previously (x - y)
113 | }
114 | 
115 | __device__ __forceinline__ void GIISingle(int loIdx, int hiIdx) { // multiply both amplitudes of the pair by i
116 |     const qComplex first = shm[loIdx];
117 |     shm[loIdx] = make_qComplex(-first.y, first.x);
118 |     const qComplex second = shm[hiIdx]; // loaded after the first store, as before
119 |     shm[hiIdx] = make_qComplex(-second.y, second.x);
120 | }
121 | 
122 | __device__ __forceinline__ void GII(int idx) { // multiply the amplitude at idx by i
123 |     const qComplex amp = shm[idx];
124 |     shm[idx] = make_qComplex(-amp.y, amp.x);
125 | }
126 | 
127 | __device__ __forceinline__ void GZZSingle(int loIdx, int hiIdx) { // negate both amplitudes of the pair
128 |     const qComplex first = shm[loIdx];
129 |     shm[loIdx] = make_qComplex(-first.x, -first.y);
130 |     const qComplex second = shm[hiIdx]; // loaded after the first store, as before
131 |     shm[hiIdx] = make_qComplex(-second.x, -second.y);
132 | }
133 | 
134 | __device__ __forceinline__ void GZZ(int idx) { // negate the amplitude at idx
135 |     const qComplex amp = shm[idx];
136 |     shm[idx] = make_qComplex(-amp.x, -amp.y);
137 | }
138 | 
139 | __device__ __forceinline__ void GCCSingle(int loIdx, int hiIdx, qComplex p) { // multiply both amplitudes of the pair by p
140 |     const qComplex first = shm[loIdx];
141 |     shm[loIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(first, p), COMPLEX_MULTIPLY_IMAG(first, p));
142 |     const qComplex second = shm[hiIdx]; // loaded after the first store, as before
143 |     shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(second, p), COMPLEX_MULTIPLY_IMAG(second, p));
144 | }
145 | 
146 | __device__ __forceinline__ void GCC(int idx, qComplex p) { // multiply the amplitude at idx by p
147 |     const qComplex amp = shm[idx];
148 |     shm[idx] = make_qComplex(COMPLEX_MULTIPLY_REAL(amp, p), COMPLEX_MULTIPLY_IMAG(amp, p));
149 | }
150 | // FOLLOW_NEXT: a bare case label with no body, so TYPE shares the body of the next CASE_* entry in the switch.
151 | #define FOLLOW_NEXT(TYPE) \
152 | case GateType::TYPE: // no break
153 | // CASE_CONTROL: for each j < m, opens a zero bit at smallQubit and at largeQubit, sets the control bit, and runs OP on (lo, hi = lo | target bit). Uses m, smallQubit, largeQubit, maskSmall, maskLarge, controlQubit, targetQubit and blockSize from the expansion site.
154 | #define CASE_CONTROL(TYPE, OP) \
155 | case GateType::TYPE: { \
156 |     for (int j = threadIdx.x; j < m; j += blockSize) { \
157 |         int lo = ((j >> smallQubit) << (smallQubit + 1)) | (j & maskSmall); \
158 |         lo = ((lo >> largeQubit) << (largeQubit + 1)) | (lo & maskLarge); \
159 |         lo |= 1 << controlQubit; \
160 |         int hi = lo | (1 << targetQubit); \
161 |         OP; \
162 |     } \
163 |     break; \
164 | }
165 | // CASE_SINGLE: for each j < m, opens a zero bit at targetQubit to form lo, sets it to form hi, and runs OP. Uses m, targetQubit, maskTarget and blockSize from the expansion site.
166 | #define CASE_SINGLE(TYPE, OP) \
167 | case GateType::TYPE: { \
168 |     for (int j = threadIdx.x; j < m; j += blockSize) { \
169 |         int lo = ((j >> targetQubit) << (targetQubit + 1)) | (j & maskTarget); \
170 |         int hi = lo | (1 << targetQubit); \
171 |         OP; \
172 |     } \
173 |     break;\
174 | }
175 | // CASE_LO_HI: visits every j below 1 << LOCAL_QUBIT_SIZE and runs OP_LO when isHighBlock is false, OP_HI otherwise.
176 | #define CASE_LO_HI(TYPE, OP_LO, OP_HI) \
177 | case GateType::TYPE: { \
178 |     int m = 1 << LOCAL_QUBIT_SIZE; \
179 |     if (!isHighBlock){ \
180 |         for (int j = threadIdx.x; j < m; j += blockSize) { \
181 |             OP_LO; \
182 |         } \
183 |     } else { \
184 |         for (int j = threadIdx.x; j < m; j += blockSize) { \
185 |             OP_HI; \
186 |         } \
187 |     } \
188 |     break; \
189 | }
190 | // CASE_SKIPLO_HI: `continue`s the enclosing loop when isHighBlock is false (so it must expand inside a loop); otherwise runs OP_HI for every j.
191 | #define CASE_SKIPLO_HI(TYPE, OP_HI) \
192 | case GateType::TYPE: { \
193 |     if (!isHighBlock) continue; \
194 |     int m = 1 << LOCAL_QUBIT_SIZE; \
195 |     for (int j = threadIdx.x; j < m; j += blockSize) { \
196 |         OP_HI; \
197 |     } \
198 |     break; \
199 | }
200 | // LOHI_SAME: runs OP for every j below 1 << LOCAL_QUBIT_SIZE regardless of isHighBlock.
201 | #define LOHI_SAME(TYPE, OP) \
202 | case GateType::TYPE: { \
203 |     int m = 1 << LOCAL_QUBIT_SIZE; \
204 |     for (int j = threadIdx.x; j < m; j += blockSize) { \
205 |         OP; \
206 |     } \
207 |     break; \
208 | }
209 | // ID_BREAK: case for GateType::ID that does nothing.
210 | #define ID_BREAK() \
211 | case GateType::ID: { \
212 |     break; \
213 | }
214 | // Applies deviceGates[0..numGates) in order to the shared buffer shm, dispatching on gate type and on whether control/target are local (flag 0) or not. loArr and shiftAt are accepted but unused here.
215 | template <unsigned int blockSize>
216 | __device__ void doCompute(int numGates, int* loArr, int* shiftAt) {
217 |     for (int i = 0; i < numGates; i++) {
218 |         int controlQubit = deviceGates[i].controlQubit;
219 |         int targetQubit = deviceGates[i].targetQubit;
220 |         char controlIsGlobal = deviceGates[i].controlIsGlobal;
221 |         char targetIsGlobal = deviceGates[i].targetIsGlobal;
222 |         if (deviceGates[i].type == GateType::CCX) {
223 |             int controlQubit2 = deviceGates[i].controlQubit2;
224 |             int control2IsGlobal = deviceGates[i].control2IsGlobal;
225 |             if (!control2IsGlobal) { // second control is local: handled entirely here
226 |                 int m = 1 << (LOCAL_QUBIT_SIZE - 1);
227 |                 assert(!controlIsGlobal && !targetIsGlobal);
228 |                 int maskTarget = (1 << targetQubit) - 1;
229 |                 for (int j = threadIdx.x; j < m; j += blockSize) {
230 |                     int lo = ((j >> targetQubit) << (targetQubit + 1)) | (j & maskTarget);
231 |                     if (!(lo >> controlQubit & 1) || !(lo >> controlQubit2 & 1))
232 |                         continue;
233 |                     int hi = lo | (1 << targetQubit);
234 |                     XSingle(lo, hi);
235 |                 }
236 |                 __syncthreads(); // this path skips the barrier at the end of the loop body, so publish the swaps before the next gate reads shm
237 |                 continue;
238 |             }
239 |             if (control2IsGlobal == 1 && !((blockIdx.x >> controlQubit2) & 1)) { // global second control is 0 for this block: nothing written, no barrier needed
240 |                 continue;
241 |             }
242 |         }
243 |         if (!controlIsGlobal) { // control qubit is local
244 |             if (!targetIsGlobal) {
245 |                 int m = 1 << (LOCAL_QUBIT_SIZE - 2);
246 |                 int smallQubit = controlQubit > targetQubit ? targetQubit : controlQubit;
247 |                 int largeQubit = controlQubit > targetQubit ? controlQubit : targetQubit;
248 |                 int maskSmall = (1 << smallQubit) - 1;
249 |                 int maskLarge = (1 << largeQubit) - 1;
250 |                 switch (deviceGates[i].type) {
251 |                     FOLLOW_NEXT(CCX)
252 |                     CASE_CONTROL(CNOT, XSingle(lo, hi))
253 |                     CASE_CONTROL(CY, YSingle(lo, hi))
254 |                     CASE_CONTROL(CZ, ZHi(hi))
255 |                     CASE_CONTROL(CRX, RXSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i01))
256 |                     CASE_CONTROL(CRY, RYSingle(lo, hi, deviceGates[i].r00, deviceGates[i].r10))
257 |                     CASE_CONTROL(CU1, U1Hi(hi, make_qComplex(deviceGates[i].r11, deviceGates[i].i11)))
258 |                     CASE_CONTROL(CRZ, RZSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i00))
259 |                     default: {
260 |                         assert(false);
261 |                     }
262 |                 }
263 |             } else { // local control, non-local target: only CZ / CU1 / CRZ are accepted
264 |                 assert(deviceGates[i].type == GateType::CZ || deviceGates[i].type == GateType::CU1 || deviceGates[i].type == GateType::CRZ);
265 |                 bool isHighBlock = (blockIdx.x >> targetQubit) & 1;
266 |                 int m = 1 << (LOCAL_QUBIT_SIZE - 1);
267 |                 int maskControl = (1 << controlQubit) - 1;
268 |                 if (!isHighBlock){
269 |                     if (deviceGates[i].type == GateType::CRZ) {
270 |                         for (int j = threadIdx.x; j < m; j += blockSize) {
271 |                             int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl)  | (1 << controlQubit);
272 |                             RZLo(x, deviceGates[i].r00, -deviceGates[i].i00);
273 |                         }
274 |                     }
275 |                 } else {
276 |                     switch (deviceGates[i].type) {
277 |                         case GateType::CZ: {
278 |                             for (int j = threadIdx.x; j < m; j += blockSize) {
279 |                                 int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl)  | (1 << controlQubit);
280 |                                 ZHi(x);
281 |                             }
282 |                             break;
283 |                         }
284 |                         case GateType::CU1: {
285 |                             for (int j = threadIdx.x; j < m; j += blockSize) {
286 |                                 int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl)  | (1 << controlQubit);
287 |                                 U1Hi(x, make_qComplex(deviceGates[i].r11, deviceGates[i].i11));
288 |                             }
289 |                             break;
290 |                         }
291 |                         case GateType::CRZ: {
292 |                             for (int j = threadIdx.x; j < m; j += blockSize) {
293 |                                 int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl)  | (1 << controlQubit);
294 |                                 RZHi(x, deviceGates[i].r00, -deviceGates[i].i00);
295 |                             }
296 |                             break;
297 |                         }
298 |                         default: {
299 |                             assert(false);
300 |                         }
301 |                     }
302 |                 }
303 |             }
304 |         } else { // control flag is non-zero: either a global control or no control at all
305 |             if (controlIsGlobal == 1 && !((blockIdx.x >> controlQubit) & 1)) { // global control is 0 for this block: nothing written, no barrier needed
306 |                 continue;
307 |             }
308 |             if (!targetIsGlobal) {
309 |                 int m = 1 << (LOCAL_QUBIT_SIZE - 1);
310 |                 int maskTarget = (1 << targetQubit) - 1;
311 |                 switch (deviceGates[i].type) {
312 |                     FOLLOW_NEXT(GOC)
313 |                     FOLLOW_NEXT(CU1)
314 |                     CASE_SINGLE(U1, U1Hi(hi, make_qComplex(deviceGates[i].r11, deviceGates[i].i11)))
315 |                     FOLLOW_NEXT(U2)
316 |                     CASE_SINGLE(U3, USingle(lo, hi, make_qComplex(deviceGates[i].r00, deviceGates[i].i00), make_qComplex(deviceGates[i].r01, deviceGates[i].i01), make_qComplex(deviceGates[i].r10, deviceGates[i].i10), make_qComplex(deviceGates[i].r11, deviceGates[i].i11)));
317 |                     CASE_SINGLE(H, HSingle(lo, hi))
318 |                     FOLLOW_NEXT(X)
319 |                     FOLLOW_NEXT(CNOT)
320 |                     CASE_SINGLE(CCX, XSingle(lo, hi))
321 |                     FOLLOW_NEXT(Y)
322 |                     CASE_SINGLE(CY, YSingle(lo, hi))
323 |                     FOLLOW_NEXT(Z)
324 |                     CASE_SINGLE(CZ, ZHi(hi))
325 |                     FOLLOW_NEXT(RX)
326 |                     CASE_SINGLE(CRX, RXSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i01))
327 |                     FOLLOW_NEXT(RY)
328 |                     CASE_SINGLE(CRY, RYSingle(lo, hi, deviceGates[i].r00, deviceGates[i].r10))
329 |                     FOLLOW_NEXT(RZ)
330 |                     CASE_SINGLE(CRZ, RZSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i00))
331 |                     CASE_SINGLE(S, SHi(hi))
332 |                     CASE_SINGLE(SDG, SDGHi(hi))
333 |                     CASE_SINGLE(T, THi(hi))
334 |                     CASE_SINGLE(TDG, TDGHi(hi))
335 |                     CASE_SINGLE(GII, GIISingle(lo, hi))
336 |                     CASE_SINGLE(GZZ, GZZSingle(lo, hi))
337 |                     CASE_SINGLE(GCC, GCCSingle(lo, hi, make_qComplex(deviceGates[i].r00, deviceGates[i].i00)))
338 |                     ID_BREAK()
339 |                     default: {
340 |                         assert(false);
341 |                     }
342 |                 }
343 |             } else { // non-local target: the whole block sits on one side of the target bit
344 |                 bool isHighBlock = (blockIdx.x >> targetQubit) & 1;
345 |                 switch (deviceGates[i].type) {
346 |                     FOLLOW_NEXT(RZ)
347 |                     CASE_LO_HI(CRZ, RZLo(j, deviceGates[i].r00, -deviceGates[i].i00), RZHi(j, deviceGates[i].r00, -deviceGates[i].i00))
348 |                     FOLLOW_NEXT(Z)
349 |                     CASE_SKIPLO_HI(CZ, ZHi(j))
350 |                     CASE_SKIPLO_HI(S, SHi(j))
351 |                     CASE_SKIPLO_HI(SDG, SDGHi(j))
352 |                     CASE_SKIPLO_HI(T, THi(j))
353 |                     CASE_SKIPLO_HI(TDG, TDGHi(j))
354 |                     FOLLOW_NEXT(GOC)
355 |                     FOLLOW_NEXT(CU1)
356 |                     CASE_SKIPLO_HI(U1, U1Hi(j, make_qComplex(deviceGates[i].r11, deviceGates[i].i11)))
357 |                     LOHI_SAME(GII, GII(j))
358 |                     LOHI_SAME(GZZ, GZZ(j))
359 |                     LOHI_SAME(GCC, GCC(j, make_qComplex(deviceGates[i].r00, deviceGates[i].i00)))
360 |                     ID_BREAK()
361 |                     default: {
362 |                         assert(false);
363 |                     }
364 |                 }
365 |             }
366 |         }
367 |         __syncthreads(); // make this gate's writes to shm visible before the next gate starts
368 |     }
369 | }
370 | 
371 | __device__ void fetchData(qComplex* a, unsigned int* threadBias, unsigned int idx, unsigned int blockHot, unsigned int enumerate, int numLocalQubits) { // loads this block's amplitudes from a into shm; idx is not used
372 |     if (threadIdx.x == 0) {
373 |         int bid = blockIdx.x;
374 |         unsigned int bias = 0;
375 |         for (unsigned int bit = 1; bit < (1u << numLocalQubits); bit <<= 1) { // scatter the low bits of blockIdx.x, in order, into the positions where blockHot has a 1
376 |             if (blockHot & bit) {
377 |                 if (bid & 1)
378 |                     bias |= bit;
379 |                 bid >>= 1;
380 |             }
381 |         }
382 |         blockBias = bias;
383 |     }
384 |     __syncthreads(); // blockBias was written by thread 0 only
385 |     unsigned int bias = blockBias | threadBias[threadIdx.x];
386 |     int x;
387 |     unsigned int y;
388 |     for (x = ((1 << (LOCAL_QUBIT_SIZE - THREAD_DEP)) - 1) << THREAD_DEP | threadIdx.x, y = enumerate; // x starts at this thread's highest shm slot, y at the full enumerate mask
389 |         x >= 0;
390 |         x -= (1 << THREAD_DEP), y = enumerate & (y - 1)) { // x steps down by 1 << THREAD_DEP; y moves to the next smaller sub-mask of enumerate
391 |         
392 |         shm[x] = a[bias | y];
393 |     }
394 | }
395 | 
396 | __device__ void saveData(qComplex* a, unsigned int* threadBias, unsigned int enumerate) { // writes shm back to a using the same index walk as fetchData; relies on blockBias set there
397 |     unsigned int bias = blockBias | threadBias[threadIdx.x];
398 |     int x;
399 |     unsigned y;
400 |     for (x = ((1 << (LOCAL_QUBIT_SIZE - THREAD_DEP)) - 1) << THREAD_DEP | threadIdx.x, y = enumerate; // x starts at this thread's highest shm slot, y at the full enumerate mask
401 |         x >= 0;
402 |         x -= (1 << THREAD_DEP), y = enumerate & (y - 1)) { // x steps down by 1 << THREAD_DEP; y moves to the next smaller sub-mask of enumerate
403 |         
404 |         a[bias | y] = shm[x];
405 |     }
406 | }
407 | 
408 | template <unsigned int blockSize>
409 | __global__ void run(qComplex* a, unsigned int* threadBias, int* loArr, int* shiftAt, int numLocalQubits, int numGates, unsigned int blockHot, unsigned int enumerate) { // load -> apply gates -> store, with a block barrier between phases
410 |     const unsigned int flatTid = threadIdx.x + blockSize * (unsigned int) blockIdx.x;
411 |     fetchData(a, threadBias, flatTid, blockHot, enumerate, numLocalQubits); // stage amplitudes from a into shm
412 |     __syncthreads();
413 |     doCompute<blockSize>(numGates, loArr, shiftAt); // run the gate list on shm
414 |     __syncthreads();
415 |     saveData(a, threadBias, enumerate); // write shm back to a
416 | }
417 | 
418 | #if BACKEND == 1 || BACKEND == 3 || BACKEND == 4 || BACKEND == 5
419 | void initControlIdx() { // give both per-GPU pointer tables one entry per local GPU
420 |     shiftAt_device.resize(MyGlobalVars::localGPUs);
421 |     loIdx_device.resize(MyGlobalVars::localGPUs);
422 | }
423 | #endif
424 | 
425 | void copyGatesToSymbol(KernelGate* hostGates, int numGates, cudaStream_t& stream, int gpuID) { // uploads GPU gpuID's slice of hostGates (numGates entries) into deviceGates, asynchronously on stream
426 |     checkCudaErrors(cudaMemcpyToSymbolAsync(deviceGates, hostGates + gpuID * numGates, sizeof(KernelGate) * numGates, 0, cudaMemcpyDefault, stream));
427 | }
428 | 
429 | void launchExecutor(int gridDim, qComplex* deviceStateVec, unsigned int* threadBias, int numLocalQubits, int numGates, unsigned int blockHot, unsigned int enumerate, cudaStream_t& stream, int gpuID) { // launch run<> with gridDim blocks of 1 << THREAD_DEP threads on stream
430 |     constexpr unsigned int kThreads = 1 << THREAD_DEP;
431 |     run<kThreads><<<gridDim, kThreads, 0, stream>>>(deviceStateVec, threadBias, loIdx_device[gpuID], shiftAt_device[gpuID], numLocalQubits, numGates, blockHot, enumerate);
432 | }


--------------------------------------------------------------------------------
/src/kernels/lookup.cu:
--------------------------------------------------------------------------------
  1 | #include "kernel.h"
  2 | #include <cstdio>
  3 | #include <assert.h>
  4 | #include <map>
  5 | #include <omp.h>
  6 | #include "gate.h"
  7 | #include "executor.h"
  8 | using namespace std;
  9 | 
 10 | extern __shared__ qComplex shm[1<<LOCAL_QUBIT_SIZE]; // shared amplitude buffer that the gate helpers below read and write
 11 | extern __shared__ qindex blockBias;
 12 | 
 13 | __device__ __constant__ qreal recRoot2 = 0.70710678118654752440084436210485; // 1/sqrt(2); more elegant way?
 14 | __constant__ KernelGate deviceGates[MAX_GATE]; // constant-memory gate table with room for MAX_GATE entries
 15 | 
 16 | std::vector<int*> loIdx_device; // host-side vector of int pointers (presumably one per GPU, as in baseline.cu - confirm)
 17 | std::vector<int*> shiftAt_device; // host-side vector of int pointers (presumably one per GPU, as in baseline.cu - confirm)
 18 | 
 19 | 
 20 | __device__ __forceinline__ void XSingle(int loIdx, int hiIdx) { // exchange the two amplitudes of the pair
 21 |     const qComplex upper = shm[hiIdx];
 22 |     shm[hiIdx] = shm[loIdx];
 23 |     shm[loIdx] = upper;
 24 | }
 25 | 
 26 | __device__ __forceinline__ void YSingle(int loIdx, int hiIdx) { // lo' = -i * hi, hi' = i * lo
 27 |     const qComplex a1 = shm[hiIdx];
 28 |     const qComplex a0 = shm[loIdx];
 29 |     // (y, -x) is multiplication by -i; (-y, x) is multiplication by i.
 30 |     shm[loIdx] = make_qComplex(a1.y, -a1.x);
 31 |     shm[hiIdx] = make_qComplex(-a0.y, a0.x);
 32 | }
 33 | 
 34 | __device__ __forceinline__ void ZHi(int hiIdx) { // negate the amplitude stored at hiIdx
 35 |     const qComplex amp = shm[hiIdx];
 36 |     shm[hiIdx] = make_qComplex(-amp.x, -amp.y);
 37 | }
 38 | 
 39 | 
 40 | __device__ __forceinline__ void RXSingle(int loIdx, int hiIdx, qreal alpha, qreal beta) { // lo' = alpha*lo - i*beta*hi, hi' = alpha*hi - i*beta*lo
 41 |     const qComplex a1 = shm[hiIdx];
 42 |     const qComplex a0 = shm[loIdx];
 43 |     shm[loIdx] = make_qComplex(alpha * a0.x + beta * a1.y, alpha * a0.y - beta * a1.x);
 44 |     shm[hiIdx] = make_qComplex(alpha * a1.x + beta * a0.y, alpha * a1.y - beta * a0.x);
 45 | }
 46 | 
 47 | __device__ __forceinline__ void RYSingle(int loIdx, int hiIdx, qreal alpha, qreal beta) { // lo' = alpha*lo - beta*hi, hi' = beta*lo + alpha*hi
 48 |     const qComplex a1 = shm[hiIdx];
 49 |     const qComplex a0 = shm[loIdx];
 50 |     shm[loIdx] = make_qComplex(alpha * a0.x - beta * a1.x, alpha * a0.y - beta * a1.y);
 51 |     shm[hiIdx] = make_qComplex(beta * a0.x + alpha * a1.x, beta * a0.y + alpha * a1.y);
 52 | }
 53 | 
 54 | __device__ __forceinline__ void RZSingle(int loIdx, int hiIdx, qreal alpha, qreal beta) { // lo' = (alpha - i*beta)*lo, hi' = (alpha + i*beta)*hi
 55 |     const qComplex a1 = shm[hiIdx];
 56 |     const qComplex a0 = shm[loIdx];
 57 |     shm[loIdx] = make_qComplex(alpha * a0.x + beta * a0.y, alpha * a0.y - beta * a0.x);
 58 |     shm[hiIdx] = make_qComplex(alpha * a1.x - beta * a1.y, alpha * a1.y + beta * a1.x);
 59 | }
 60 | 
 61 | __device__ __forceinline__ void RZLo(int loIdx, qreal alpha, qreal beta) { // scale the amplitude at loIdx by (alpha - i*beta)
 62 |     const qComplex amp = shm[loIdx];
 63 |     shm[loIdx] = make_qComplex(alpha * amp.x + beta * amp.y, alpha * amp.y - beta * amp.x);
 64 | }
 65 | 
 66 | __device__ __forceinline__ void RZHi(int hiIdx, qreal alpha, qreal beta) { // scale the amplitude at hiIdx by (alpha + i*beta)
 67 |     const qComplex amp = shm[hiIdx];
 68 |     shm[hiIdx] = make_qComplex(alpha * amp.x - beta * amp.y, alpha * amp.y + beta * amp.x);
 69 | }
 70 | 
 71 | #define COMPLEX_MULTIPLY_REAL(v0, v1) ((v0).x * (v1).x - (v0).y * (v1).y) /* Re(v0 * v1); arguments are parenthesized so any expression can be passed (each is evaluated twice) */
 72 | #define COMPLEX_MULTIPLY_IMAG(v0, v1) ((v0).x * (v1).y + (v0).y * (v1).x) /* Im(v0 * v1); same argument rules as above */
 73 | 
 74 | __device__ __forceinline__ void U1Hi(int hiIdx, qComplex p) { // multiply the amplitude at hiIdx by the complex factor p
 75 |     const qComplex amp = shm[hiIdx];
 76 |     shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(amp, p), COMPLEX_MULTIPLY_IMAG(amp, p));
 77 | }
 78 | 
 79 | __device__ __forceinline__ void USingle(int loIdx, int hiIdx, qComplex v00, qComplex v01, qComplex v10, qComplex v11) { // lo' = lo*v00 + hi*v01, hi' = lo*v10 + hi*v11
 80 |     const qComplex a1 = shm[hiIdx];
 81 |     const qComplex a0 = shm[loIdx];
 82 |     shm[loIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(a0, v00) + COMPLEX_MULTIPLY_REAL(a1, v01),
 83 |                                COMPLEX_MULTIPLY_IMAG(a0, v00) + COMPLEX_MULTIPLY_IMAG(a1, v01));
 84 |     shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(a0, v10) + COMPLEX_MULTIPLY_REAL(a1, v11),
 85 |                                COMPLEX_MULTIPLY_IMAG(a0, v10) + COMPLEX_MULTIPLY_IMAG(a1, v11));
 86 | }
 87 | 
 88 | __device__ __forceinline__ void HSingle(int loIdx, int hiIdx) { // lo' = (lo + hi) / sqrt(2), hi' = (lo - hi) / sqrt(2)
 89 |     const qComplex a1 = shm[hiIdx];
 90 |     const qComplex a0 = shm[loIdx];
 91 |     shm[loIdx] = make_qComplex(recRoot2 * (a0.x + a1.x), recRoot2 * (a0.y + a1.y));
 92 |     shm[hiIdx] = make_qComplex(recRoot2 * (a0.x - a1.x), recRoot2 * (a0.y - a1.y));
 93 | }
 94 | 
 95 | __device__ __forceinline__ void SHi(int hiIdx) { // multiply the amplitude at hiIdx by i
 96 |     const qComplex amp = shm[hiIdx];
 97 |     shm[hiIdx] = make_qComplex(-amp.y, amp.x);
 98 | }
 99 | 
100 | __device__ __forceinline__ void SDGHi(int hiIdx) { // multiply the amplitude at hiIdx by -i
101 |     const qComplex amp = shm[hiIdx];
102 |     shm[hiIdx] = make_qComplex(amp.y, -amp.x);
103 | }
104 | 
105 | __device__ __forceinline__ void THi(int hiIdx) { // multiply the amplitude at hiIdx by (1 + i) / sqrt(2)
106 |     const qComplex amp = shm[hiIdx];
107 |     shm[hiIdx] = make_qComplex(recRoot2 * (amp.x - amp.y), recRoot2 * (amp.x + amp.y));
108 | }
109 | 
110 | __device__ __forceinline__ void TDGHi(int hiIdx) { // multiply the amplitude at hiIdx by (1 - i) / sqrt(2), the conjugate of the THi factor
111 |     qComplex hi = shm[hiIdx];
112 |     shm[hiIdx] = make_qComplex(recRoot2 * (hi.x + hi.y), recRoot2 * (hi.y - hi.x)); // (1 - i)(x + iy) = (x + y) + i(y - x); the imaginary part was previously (x - y)
113 | }
114 | 
115 | __device__ __forceinline__ void GIISingle(int loIdx, int hiIdx) { // multiply both amplitudes of the pair by i
116 |     const qComplex first = shm[loIdx];
117 |     shm[loIdx] = make_qComplex(-first.y, first.x);
118 |     const qComplex second = shm[hiIdx]; // loaded after the first store, as before
119 |     shm[hiIdx] = make_qComplex(-second.y, second.x);
120 | }
121 | 
122 | __device__ __forceinline__ void GII(int idx) { // multiply the amplitude at idx by i
123 |     const qComplex amp = shm[idx];
124 |     shm[idx] = make_qComplex(-amp.y, amp.x);
125 | }
126 | 
127 | __device__ __forceinline__ void GZZSingle(int loIdx, int hiIdx) { // negate both amplitudes of the pair
128 |     const qComplex first = shm[loIdx];
129 |     shm[loIdx] = make_qComplex(-first.x, -first.y);
130 |     const qComplex second = shm[hiIdx]; // loaded after the first store, as before
131 |     shm[hiIdx] = make_qComplex(-second.x, -second.y);
132 | }
133 | 
134 | __device__ __forceinline__ void GZZ(int idx) { // negate the amplitude at idx
135 |     const qComplex amp = shm[idx];
136 |     shm[idx] = make_qComplex(-amp.x, -amp.y);
137 | }
138 | 
139 | __device__ __forceinline__ void GCCSingle(int loIdx, int hiIdx, qComplex p) {
140 |     qComplex lo = shm[loIdx];
141 |     shm[loIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(lo, p), COMPLEX_MULTIPLY_IMAG(lo, p));
142 |     qComplex hi = shm[hiIdx];
143 |     shm[hiIdx] = make_qComplex(COMPLEX_MULTIPLY_REAL(hi, p), COMPLEX_MULTIPLY_IMAG(hi, p));
144 | }
145 | 
146 | __device__ __forceinline__ void GCC(int idx, qComplex p) {
147 |     qComplex v = shm[idx];
148 |     shm[idx] = make_qComplex(COMPLEX_MULTIPLY_REAL(v, p), COMPLEX_MULTIPLY_IMAG(v, p));
149 | }
150 | 
// Helper macros that generate the `case` arms of the gate-type switches in doCompute.
// They rely on names from the expansion site (lo, hi, add, isHighBlock, blockSize, j).

// Emits only a case label, so TYPE falls through into the case arm that follows it.
#define FOLLOW_NEXT(TYPE) \
case GateType::TYPE: // no break

// Case arm that runs OP, advances lo and hi by the scalar `add`, runs OP a second time, then breaks.
#define CASE_CONTROL(TYPE, OP) \
case GateType::TYPE: { \
    OP; \
    lo += add; hi += add; \
    OP; \
    break; \
}

// Case arm that runs OP four times; after each run lo and hi are advanced by add[task].
// Note that add[3] is read after the last OP even though the updated lo/hi are not used again.
#define CASE_SINGLE(TYPE, OP) \
case GateType::TYPE: { \
    for (int task = 0; task < 4; task ++) { \
        OP; \
        lo += add[task]; hi += add[task]; \
    } \
    break;\
}

// Case arm that walks j = threadIdx.x, threadIdx.x + blockSize, ... over all 2^LOCAL_QUBIT_SIZE
// indices and applies OP_LO when isHighBlock is false, OP_HI otherwise.
#define CASE_LO_HI(TYPE, OP_LO, OP_HI) \
case GateType::TYPE: { \
    int m = 1 << LOCAL_QUBIT_SIZE; \
    if (!isHighBlock){ \
        for (int j = threadIdx.x; j < m; j += blockSize) { \
            OP_LO; \
        } \
    } else { \
        for (int j = threadIdx.x; j < m; j += blockSize) { \
            OP_HI; \
        } \
    } \
    break; \
}

// Like CASE_LO_HI but does nothing when isHighBlock is false: it `continue`s the enclosing
// loop at the expansion site (so whatever follows the switch in that iteration is skipped).
#define CASE_SKIPLO_HI(TYPE, OP_HI) \
case GateType::TYPE: { \
    if (!isHighBlock) continue; \
    int m = 1 << LOCAL_QUBIT_SIZE; \
    for (int j = threadIdx.x; j < m; j += blockSize) { \
        OP_HI; \
    } \
    break; \
}

// Case arm that applies the same OP to every index j regardless of isHighBlock.
#define LOHI_SAME(TYPE, OP) \
case GateType::TYPE: { \
    int m = 1 << LOCAL_QUBIT_SIZE; \
    for (int j = threadIdx.x; j < m; j += blockSize) { \
        OP; \
    } \
    break; \
}

// Case arm for GateType::ID that does nothing.
#define ID_BREAK() \
case GateType::ID: { \
    break; \
}
209 | 
210 | template <unsigned int blockSize>
211 | __device__ void doCompute(int numGates, int* loArr, int* shiftAt) {
212 |     for (int i = 0; i < numGates; i++) {
213 |         int controlQubit = deviceGates[i].controlQubit;
214 |         int targetQubit = deviceGates[i].targetQubit;
215 |         char controlIsGlobal = deviceGates[i].controlIsGlobal;
216 |         char targetIsGlobal = deviceGates[i].targetIsGlobal;
217 |         if (deviceGates[i].type == GateType::CCX) {
218 |             int controlQubit2 = deviceGates[i].controlQubit2;
219 |             int control2IsGlobal = deviceGates[i].control2IsGlobal;
220 |             if (!control2IsGlobal) {
221 |                 int m = 1 << (LOCAL_QUBIT_SIZE - 1);
222 |                 assert(!controlIsGlobal && !targetIsGlobal);
223 |                 assert(deviceGates[i].type == GateType::CCX);
224 |                 int maskTarget = (1 << targetQubit) - 1;
225 |                 for (int j = threadIdx.x; j < m; j += blockSize) {
226 |                     int lo = ((j >> targetQubit) << (targetQubit + 1)) | (j & maskTarget);
227 |                     if (!(lo >> controlQubit & 1) || !(lo >> controlQubit2 & 1))
228 |                         continue;
229 |                     int hi = lo | (1 << targetQubit);
230 |                     XSingle(lo, hi);
231 |                 }
232 |                 continue;
233 |             }
234 |             if (control2IsGlobal == 1 && !((blockIdx.x >> controlQubit2) & 1)) {
235 |                 continue;
236 |             }
237 |         }
238 |         if (!controlIsGlobal) {
239 |             if (!targetIsGlobal) {
240 |                 int m = 1 << (LOCAL_QUBIT_SIZE - 2);
241 |                 int smallQubit = controlQubit > targetQubit ? targetQubit : controlQubit;
242 |                 int largeQubit = controlQubit > targetQubit ? controlQubit : targetQubit;
243 |                 int maskSmall = (1 << smallQubit) - 1;
244 |                 int maskLarge = (1 << largeQubit) - 1;
245 |                 int lo = ((threadIdx.x >> smallQubit) << (smallQubit + 1)) | (threadIdx.x & maskSmall);
246 |                 lo = ((lo >> largeQubit) << (largeQubit + 1)) | (lo & maskLarge);
247 |                 lo |= 1 << controlQubit;
248 |                 int hi = lo | (1 << targetQubit);
249 |                 int add = 512;
250 |                 if (controlQubit == 9 || targetQubit == 9) {
251 |                     add = 256;
252 |                     if (controlQubit == 8 || targetQubit == 8)
253 |                         add = 128;
254 |                 }
255 |                 switch (deviceGates[i].type) {
256 |                     FOLLOW_NEXT(CCX)
257 |                     CASE_CONTROL(CNOT, XSingle(lo, hi))
258 |                     CASE_CONTROL(CY, YSingle(lo, hi))
259 |                     CASE_CONTROL(CZ, ZHi(hi))
260 |                     CASE_CONTROL(CRX, RXSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i01))
261 |                     CASE_CONTROL(CRY, RYSingle(lo, hi, deviceGates[i].r00, deviceGates[i].r10))
262 |                     CASE_CONTROL(CU1, U1Hi(hi, make_qComplex(deviceGates[i].r11, deviceGates[i].i11)))
263 |                     CASE_CONTROL(CRZ, RZSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i00))
264 |                     default: {
265 |                         assert(false);
266 |                     }
267 |                 }
268 |             } else {
269 |                 assert(deviceGates[i].type == GateType::CZ || deviceGates[i].type == GateType::CU1 || deviceGates[i].type == GateType::CRZ);
270 |                 bool isHighBlock = (blockIdx.x >> targetQubit) & 1;
271 |                 int m = 1 << (LOCAL_QUBIT_SIZE - 1);
272 |                 int maskControl = (1 << controlQubit) - 1;
273 |                 if (!isHighBlock){
274 |                     if (deviceGates[i].type == GateType::CRZ) {
275 |                         for (int j = threadIdx.x; j < m; j += blockSize) {
276 |                             int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl)  | (1 << controlQubit);
277 |                             RZLo(x, deviceGates[i].r00, -deviceGates[i].i00);
278 |                         }
279 |                     }
280 |                 } else {
281 |                     switch (deviceGates[i].type) {
282 |                         case GateType::CZ: {
283 |                             for (int j = threadIdx.x; j < m; j += blockSize) {
284 |                                 int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl)  | (1 << controlQubit);
285 |                                 ZHi(x);
286 |                             }
287 |                             break;    
288 |                         }
289 |                         case GateType::CU1: {
290 |                             for (int j = threadIdx.x; j < m; j += blockSize) {
291 |                                 int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl)  | (1 << controlQubit);
292 |                                 U1Hi(x, make_qComplex(deviceGates[i].r11, deviceGates[i].i11));
293 |                             }
294 |                             break;
295 |                         }
296 |                         case GateType::CRZ: {
297 |                             for (int j = threadIdx.x; j < m; j += blockSize) {
298 |                                 int x = ((j >> controlQubit) << (controlQubit + 1)) | (j & maskControl)  | (1 << controlQubit);
299 |                                 RZHi(x, deviceGates[i].r00, -deviceGates[i].i00);
300 |                             }
301 |                             break;
302 |                         }
303 |                         default: {
304 |                             assert(false);
305 |                         }
306 |                     }
307 |                 }
308 |             }
309 |         } else {
310 |             if (controlIsGlobal == 1 && !((blockIdx.x >> controlQubit) & 1)) {
311 |                 continue;
312 |             }
313 |             if (!targetIsGlobal) {
314 |                 int m = 1 << (LOCAL_QUBIT_SIZE - 1);
315 |                 int maskTarget = (1 << targetQubit) - 1;
316 |                 int add[4];
317 |                 if (targetQubit < 8) {
318 |                     add[0] = add[1] = add[2] = 256;
319 |                 } else if (targetQubit == 8) {
320 |                     add[0] = 128; add[1] = 384; add[2] = 128;
321 |                 } else { // targetQubit == 9
322 |                     add[0] = add[1] = add[2] = 128;
323 |                 }
324 |                 int lo = ((threadIdx.x >> targetQubit) << (targetQubit + 1)) | (threadIdx.x & maskTarget);
325 |                 int hi = lo | (1 << targetQubit); 
326 |                 switch (deviceGates[i].type) {
327 |                     FOLLOW_NEXT(GOC)
328 |                     FOLLOW_NEXT(CU1)
329 |                     CASE_SINGLE(U1, U1Hi(hi, make_qComplex(deviceGates[i].r11, deviceGates[i].i11)))
330 |                     FOLLOW_NEXT(U2)
331 |                     CASE_SINGLE(U3, USingle(lo, hi, make_qComplex(deviceGates[i].r00, deviceGates[i].i00), make_qComplex(deviceGates[i].r01, deviceGates[i].i01), make_qComplex(deviceGates[i].r10, deviceGates[i].i10), make_qComplex(deviceGates[i].r11, deviceGates[i].i11)));
332 |                     CASE_SINGLE(H, HSingle(lo, hi))
333 |                     FOLLOW_NEXT(X)
334 |                     FOLLOW_NEXT(CNOT)
335 |                     CASE_SINGLE(CCX, XSingle(lo, hi))
336 |                     FOLLOW_NEXT(Y)
337 |                     CASE_SINGLE(CY, YSingle(lo, hi))
338 |                     FOLLOW_NEXT(Z)
339 |                     CASE_SINGLE(CZ, ZHi(hi))
340 |                     FOLLOW_NEXT(RX)
341 |                     CASE_SINGLE(CRX, RXSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i01))
342 |                     FOLLOW_NEXT(RY)
343 |                     CASE_SINGLE(CRY, RYSingle(lo, hi, deviceGates[i].r00, deviceGates[i].r10))
344 |                     FOLLOW_NEXT(RZ)
345 |                     CASE_SINGLE(CRZ, RZSingle(lo, hi, deviceGates[i].r00, -deviceGates[i].i00))
346 |                     CASE_SINGLE(S, SHi(hi))
347 |                     CASE_SINGLE(SDG, SDGHi(hi))
348 |                     CASE_SINGLE(T, THi(hi))
349 |                     CASE_SINGLE(TDG, TDGHi(hi))
350 |                     CASE_SINGLE(GII, GIISingle(lo, hi))
351 |                     CASE_SINGLE(GZZ, GZZSingle(lo, hi))
352 |                     CASE_SINGLE(GCC, GCCSingle(lo, hi, make_qComplex(deviceGates[i].r00, deviceGates[i].i00)))
353 |                     ID_BREAK()
354 |                     default: {
355 |                         assert(false);
356 |                     }
357 |                 }
358 |             } else {
359 |                 bool isHighBlock = (blockIdx.x >> targetQubit) & 1;
360 |                 switch (deviceGates[i].type) {
361 |                     FOLLOW_NEXT(RZ)
362 |                     CASE_LO_HI(CRZ, RZLo(j, deviceGates[i].r00, -deviceGates[i].i00), RZHi(j, deviceGates[i].r00, -deviceGates[i].i00))
363 |                     FOLLOW_NEXT(Z)
364 |                     CASE_SKIPLO_HI(CZ, ZHi(j))
365 |                     CASE_SKIPLO_HI(S, SHi(j))
366 |                     CASE_SKIPLO_HI(SDG, SDGHi(j))
367 |                     CASE_SKIPLO_HI(T, THi(j))
368 |                     CASE_SKIPLO_HI(TDG, TDGHi(j))
369 |                     FOLLOW_NEXT(GOC)
370 |                     FOLLOW_NEXT(CU1)
371 |                     CASE_SKIPLO_HI(U1, U1Hi(j, make_qComplex(deviceGates[i].r11, deviceGates[i].i11)))
372 |                     LOHI_SAME(GII, GII(j))
373 |                     LOHI_SAME(GZZ, GZZ(j))
374 |                     LOHI_SAME(GCC, GCC(j, make_qComplex(deviceGates[i].r00, deviceGates[i].i00)))
375 |                     ID_BREAK()
376 |                     default: {
377 |                         assert(false);
378 |                     }
379 |                 }
380 |             }
381 |         }
382 |         __syncthreads();
383 |     }
384 | }
385 | 
386 | __device__ void fetchData(qComplex* a, unsigned int* threadBias, unsigned int idx, unsigned int blockHot, unsigned int enumerate, int numLocalQubits) {
387 |     if (threadIdx.x == 0) {
388 |         int bid = blockIdx.x;
389 |         unsigned int bias = 0;
390 |         for (unsigned int bit = 1; bit < (1u << numLocalQubits); bit <<= 1) {
391 |             if (blockHot & bit) {
392 |                 if (bid & 1)
393 |                     bias |= bit;
394 |                 bid >>= 1;
395 |             }
396 |         }
397 |         blockBias = bias;
398 |     }
399 |     __syncthreads();
400 |     unsigned int bias = blockBias | threadBias[threadIdx.x];
401 |     int x;
402 |     unsigned int y;
403 |     for (x = ((1 << (LOCAL_QUBIT_SIZE - THREAD_DEP)) - 1) << THREAD_DEP | threadIdx.x, y = enumerate;
404 |         x >= 0;
405 |         x -= (1 << THREAD_DEP), y = enumerate & (y - 1)) {
406 |         
407 |         shm[x] = a[bias | y];
408 |     }
409 | }
410 | 
411 | __device__ void saveData(qComplex* a, unsigned int* threadBias, unsigned int enumerate) {
412 |     unsigned int bias = blockBias | threadBias[threadIdx.x];
413 |     int x;
414 |     unsigned y;
415 |     for (x = ((1 << (LOCAL_QUBIT_SIZE - THREAD_DEP)) - 1) << THREAD_DEP | threadIdx.x, y = enumerate;
416 |         x >= 0;
417 |         x -= (1 << THREAD_DEP), y = enumerate & (y - 1)) {
418 |         
419 |         a[bias | y] = shm[x];
420 |     }
421 | }
422 | 
423 | template <unsigned int blockSize>
424 | __global__ void run(qComplex* a, unsigned int* threadBias, int* loArr, int* shiftAt, int numLocalQubits, int numGates, unsigned int blockHot, unsigned int enumerate) {
425 |     unsigned int idx = (unsigned int) blockIdx.x * blockSize + threadIdx.x;
426 |     fetchData(a, threadBias, idx, blockHot, enumerate, numLocalQubits);
427 |     __syncthreads();
428 |     doCompute<blockSize>(numGates, loArr, shiftAt);
429 |     __syncthreads();
430 |     saveData(a, threadBias, enumerate);
431 | }
432 | 
#if BACKEND == 1 || BACKEND == 3 || BACKEND == 4 || BACKEND == 5
// Sizes the per-GPU pointer tables so that there is one entry per local GPU.
void initControlIdx() {
    const int gpuCount = MyGlobalVars::localGPUs;
    loIdx_device.resize(gpuCount);
    shiftAt_device.resize(gpuCount);
}
#endif
439 | 
// Asynchronously copies numGates KernelGate entries into the deviceGates symbol on `stream`.
// The source is the gpuID-th slice of hostGates, i.e. hostGates[gpuID * numGates ...].
// Exits the process via checkCudaErrors if the copy cannot be enqueued.
void copyGatesToSymbol(KernelGate* hostGates, int numGates, cudaStream_t& stream, int gpuID) {
    checkCudaErrors(cudaMemcpyToSymbolAsync(deviceGates, hostGates + gpuID * numGates, sizeof(KernelGate) * numGates, 0, cudaMemcpyDefault, stream));
}
443 | 
444 | void launchExecutor(int gridDim, qComplex* deviceStateVec, unsigned int* threadBias, int numLocalQubits, int numGates, unsigned int blockHot, unsigned int enumerate, cudaStream_t& stream, int gpuID) {
445 |     run<1<<THREAD_DEP><<<gridDim, 1<<THREAD_DEP, 0, stream>>>
446 |         (deviceStateVec, threadBias, loIdx_device[gpuID], shiftAt_device[gpuID], numLocalQubits, numGates, blockHot, enumerate);
447 | }


--------------------------------------------------------------------------------
/src/logger.cpp:
--------------------------------------------------------------------------------
1 | #include<logger.h>
2 | 
3 | Logger* Logger::instance = NULL;


--------------------------------------------------------------------------------
/src/logger.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <vector>
 3 | #include <string>
 4 | #include <iostream>
 5 | #include <cstdio>
 6 | #include <stdarg.h>
 7 | #include <utils.h>
 8 | 
 9 | class Logger {
10 |     static Logger* instance;
11 | public:
12 |     static void add(const char* format, ...) {
13 | #ifdef SHOW_SUMMARY
14 |         Logger::init();
15 |         char buffer[1024];
16 |         va_list args;
17 |         va_start(args, format);
18 |         vsprintf(buffer, format, args);
19 |         va_end(args);
20 |         instance -> infos.push_back(std::string(buffer));
21 | #endif
22 |     }
23 | 
24 |     inline static void print() {
25 | #ifdef SHOW_SUMMARY
26 |         Logger::init();
27 |         char proc_info[100];
28 |         #if USE_MPI
29 |             sprintf(proc_info, "[%d]", MyMPI::rank);
30 |         #else
31 |             sprintf(proc_info, "%s", ""); // printf("") will cause compilee warning "-Wformat-zero-length"
32 |         #endif
33 |         for (auto& s: instance -> infos) {
34 |             std::cout << "Logger" << proc_info << ": " << s << std::endl;
35 |         }
36 |         instance -> infos.clear();
37 | #endif
38 |     }
39 | 
40 | private:
41 |     Logger() = default;
42 |     static void init() {
43 |         if (instance == NULL) {
44 |             instance = new Logger();
45 |         }
46 |     }
47 | private:
48 |     std::vector<std::string> infos;
49 | };
50 | 


--------------------------------------------------------------------------------
/src/schedule.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <vector>
  3 | #include <cutt.h>
  4 | #include <memory>
  5 | #include <string>
  6 | #include "utils.h"
  7 | #include "gate.h"
  8 | 
// Execution backend tag stored in each GateGroup.
// NOTE(review): the names suggest "not chosen yet", a per-gate kernel and a BLAS/matrix
// path respectively -- confirm against the code that switches on this enum.
enum class Backend {
    None, PerGate, BLAS
};

// Returns a textual name for a Backend value (defined outside this header).
std::string to_string(Backend b);
 14 | 
 15 | struct State {
 16 |     std::vector<int> pos;
 17 |     std::vector<int> layout;
 18 |     State() = default;
 19 |     State(const State&) = default;
 20 |     State(const std::vector<int>& p, const std::vector<int>& l): pos(p), layout(l) {};
 21 |     State(int numQubits) {
 22 |         pos.clear();
 23 |         for (int i = 0; i < numQubits; i++) {
 24 |             pos.push_back(i);
 25 |         }
 26 |         layout.clear();
 27 |         for (int i = 0; i < numQubits; i++) {
 28 |             layout.push_back(i);
 29 |         }
 30 |     }
 31 | 
 32 |     std::vector<unsigned char> serialize() const;
 33 |     static State deserialize(const unsigned char* arr, int& cur);
 34 | };
 35 | 
 36 | struct GateGroup {
 37 |     std::vector<Gate> gates;
 38 |     qindex relatedQubits;
 39 |     State state;
 40 |     std::vector<int> cuttPerm;
 41 |     int matQubit;
 42 |     Backend backend;
 43 | 
 44 |     std::vector<cuttHandle> cuttPlans;
 45 | 
 46 |     std::vector<std::unique_ptr<qComplex[]>> matrix;
 47 |     std::vector<qComplex*> deviceMats;
 48 | 
 49 |     GateGroup(GateGroup&&) = default;
 50 |     GateGroup& operator = (GateGroup&&) = default;
 51 |     GateGroup(): relatedQubits(0) {}
 52 |     GateGroup copyGates();
 53 | 
 54 |     static GateGroup merge(const GateGroup& a, const GateGroup& b);
 55 |     static qindex newRelated(qindex old, const Gate& g, qindex localQubits, bool enableGlobal);
 56 |     void addGate(const Gate& g, qindex localQubits, bool enableGlobal);
 57 |     
 58 |     bool contains(int i) { return (relatedQubits >> i) & 1; }
 59 |     
 60 |     std::vector<unsigned char> serialize() const;
 61 |     static GateGroup deserialize(const unsigned char* arr, int& cur);
 62 | 
 63 |     State initState(const State& oldState, int numLocalQubits);
 64 |     State initPerGateState(const State& oldState);
 65 |     State initBlasState(const State& oldState, int numLocalQubit);
 66 |     void initCPUMatrix(int numLocalQubit);
 67 |     void initGPUMatrix();
 68 |     void initMatrix(int numLocalQubit);
 69 |     void getCuttPlanPointers(int numLocalQubits, std::vector<cuttHandle*> &cuttPlanPointers, std::vector<int*> &cuttPermPointers, std::vector<int> &locals);
 70 | };
 71 | 
 72 | struct LocalGroup {
 73 |     State state;
 74 |     int a2aCommSize;
 75 |     std::vector<int> a2aComm;
 76 |     std::vector<int> cuttPerm;
 77 | 
 78 |     std::vector<GateGroup> overlapGroups;
 79 |     std::vector<GateGroup> fullGroups;
 80 |     qindex relatedQubits;
 81 | 
 82 |     std::vector<cuttHandle> cuttPlans;
 83 |     
 84 |     LocalGroup() = default;
 85 |     LocalGroup(LocalGroup&&) = default;
 86 | 
 87 |     bool contains(int i) { return (relatedQubits >> i) & 1; }
 88 |     State initState(const State& oldState, int numQubits, const std::vector<int>& newGlobals, qindex overlapGlobals, qindex overlapRelated);
 89 |     void getCuttPlanPointers(int numLocalQubits, std::vector<cuttHandle*> &cuttPlanPointers, std::vector<int*> &cuttPermPointers, std::vector<int> &locals, bool isFirstGroup = false);
 90 |     State initFirstGroupState(const State& oldState, int numQubits, const std::vector<int>& newGlobals);
 91 |     std::vector<unsigned char> serialize() const;
 92 |     static LocalGroup deserialize(const unsigned char* arr, int& cur);
 93 | };
 94 | 
// Top-level container: an ordered list of LocalGroups plus the State after the last one.
// All member functions are defined outside this header.
struct Schedule {
    std::vector<LocalGroup> localGroups;
    State finalState;
    
    void dump(int numQubits);
    // Byte-level (de)serialization; `cur` is the read offset into arr and is advanced.
    std::vector<unsigned char> serialize() const;
    static Schedule deserialize(const unsigned char* arr, int& cur);
    void initMatrix(int numQubits);
    void initCuttPlans(int numLocalQubits);
};
105 | 
106 | void removeGates(std::vector<Gate>& remain, const std::vector<Gate>& remove); // remain := remain - remove        


--------------------------------------------------------------------------------
/src/utils.cpp:
--------------------------------------------------------------------------------
#include "utils.h"

#include <cmath>
#include <cstdlib>
#include <cstring>
#include <vector>
#include "logger.h"
  5 | 
  6 | namespace MyGlobalVars {
  7 | int numGPUs;
  8 | int localGPUs;
  9 | int bit;
 10 | std::unique_ptr<cudaStream_t[]> streams;
 11 | std::unique_ptr<cudaStream_t[]> streams_comm;
 12 | std::unique_ptr<cublasHandle_t[]> blasHandles;
 13 | #if USE_MPI
 14 | std::unique_ptr<ncclComm_t[]> ncclComms;
 15 | #endif
 16 | 
 17 | void init() {
 18 |     checkCudaErrors(cudaGetDeviceCount(&localGPUs));
 19 |     #if USE_MPI
 20 |         numGPUs = MyMPI::commSize * localGPUs;
 21 |     #else
 22 |         numGPUs = localGPUs;
 23 |     #endif
 24 |     Logger::add("Local GPU: %d", localGPUs);
 25 |     bit = get_bit(numGPUs);
 26 | 
 27 |     streams = std::make_unique<cudaStream_t[]>(MyGlobalVars::localGPUs);
 28 |     streams_comm = std::make_unique<cudaStream_t[]>(MyGlobalVars::localGPUs);
 29 |     blasHandles = std::make_unique<cublasHandle_t[]>(MyGlobalVars::localGPUs);
 30 |     checkCuttErrors(cuttInit());
 31 |     for (int i = 0; i < localGPUs; i++) {
 32 |         checkCudaErrors(cudaSetDevice(i));
 33 |         cudaDeviceProp prop;
 34 |         cudaGetDeviceProperties(&prop, i);
 35 |         Logger::add("[%d] %s", i, prop.name);
 36 |         for (int j = 0; j < localGPUs; j++)
 37 |             if (i != j && (i ^ j) < 4) {
 38 |                 checkCudaErrors(cudaDeviceEnablePeerAccess(j, 0));
 39 |             }
 40 |         checkCudaErrors(cudaStreamCreate(&streams[i]);)
 41 |         checkBlasErrors(cublasCreate(&blasHandles[i]));
 42 |         checkBlasErrors(cublasSetStream(blasHandles[i], streams[i]));
 43 |         checkCudaErrors(cudaStreamCreate(&streams_comm[i]));
 44 |         checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
 45 |     }
 46 |     #if USE_MPI
 47 |         checkMPIErrors(MPI_Barrier(MPI_COMM_WORLD));
 48 |         ncclUniqueId id;
 49 |         if (MyMPI::rank == 0)
 50 |             checkNCCLErrors(ncclGetUniqueId(&id));
 51 |         checkMPIErrors(MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));
 52 |         ncclComms = std::make_unique<ncclComm_t[]>(MyGlobalVars::localGPUs);
 53 |         checkNCCLErrors(ncclGroupStart());
 54 |         for (int i = 0; i < localGPUs; i++) {
 55 |             checkCudaErrors(cudaSetDevice(i));
 56 |             checkNCCLErrors(ncclCommInitRank(&ncclComms[i], numGPUs, id, MyMPI::rank * localGPUs + i));
 57 |         }
 58 |         checkNCCLErrors(ncclGroupEnd());
 59 |     #endif
 60 | }
 61 | };
 62 | 
 63 | namespace MyMPI {
 64 | int rank;
 65 | int commSize;
 66 | int commBit;
 67 | void init() {
 68 | #if USE_MPI
 69 |     MPI_Init(nullptr, nullptr);
 70 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 71 |     MPI_Comm_size(MPI_COMM_WORLD, &commSize);
 72 | #endif
 73 | }
 74 | };
 75 | 
 76 | 
 77 | qreal zero_wrapper(qreal x) {
 78 |     const qreal eps = 1e-14;
 79 |     if (x > -eps && x < eps) {
 80 |         return 0;
 81 |     } else {
 82 |         return x;
 83 |     }
 84 | }
 85 | 
 86 | qComplex operator * (const qComplex& a, const qComplex& b) {
 87 |     return make_qComplex(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
 88 | }
 89 | 
 90 | qComplex operator + (const qComplex& a, const qComplex& b) {
 91 |     return make_qComplex(a.x + b.x, a.y + b.y);
 92 | }
 93 | 
 94 | bool isUnitary(std::unique_ptr<qComplex[]>& mat, int n) {
 95 |     qComplex result[n * n];
 96 |     memset(result, 0, sizeof(result));
 97 |     for (int k = 0; k < n; k++)
 98 |         #pragma omp parallel for
 99 |         for (int i = 0; i < n; i++)
100 |             for (int j = 0; j < n; j++) {
101 |                 qComplex v1 = mat[k * n + i];
102 |                 v1.y = - v1.y;
103 |                 result[i * n + j] = result[i * n + j] + v1 * mat[k * n + j];
104 |             }
105 |     bool wa = 0;
106 |     qreal eps = 1e-8;
107 |     #pragma omp parallel for
108 |     for (int i = 0; i < n; i++) {
109 |         qComplex val = result[i * n + i];
110 |         if (fabs(val.x - 1) > eps || fabs(val.y) > eps) {
111 |             wa = 1;
112 |         }
113 |         for (int j = 0; j < n; j++) {
114 |             if (i == j)
115 |                 continue;
116 |             qComplex val = result[i * n + j];
117 |             if (fabs(val.x) > eps || fabs(val.y) > eps)
118 |                 wa = 1;
119 |         }
120 |     }
121 |     if (wa) {
122 |         for (int i = 0; i < n; i++) {
123 |             for (int j = 0; j < n; j++)
124 |                 printf("(%.2f %.2f) ", result[i * n + j].x, result[i * n + j].y);
125 |             printf("\n");
126 |         }
127 |         exit(1);
128 |     }
129 |     return 1;
130 | }
131 | 
132 | qComplex make_qComplex(qreal x) {
133 |     return make_qComplex(x, 0.0);
134 | }
135 | 
136 | bool operator < (const qComplex& a, const qComplex& b) {
137 |         return a.x == b.x ? a.y < b.y : a.x < b.x;
138 | }
139 | 
// Returns log2(n) for a positive power of two n.
// For any other n (zero, negative, or not a power of two) prints
// "Must be pow of two: <n>" and exits with status 1.
// Negative input is rejected up front; previously it looped forever because an arithmetic
// right shift of a negative value never reaches zero.
int get_bit(int n) {
    if (n <= 0 || (n & (n - 1)) != 0) {
        printf("Must be pow of two: %d\n", n);
        exit(1);
    }
    int bit = 0;
    while ((1 << bit) != n) {
        bit ++;
    }
    return bit;
}


--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cstdio>
  4 | #include <cuComplex.h>
  5 | #include <cuda.h>
  6 | #include <cutt.h>
  7 | #include <cuda_runtime.h>
  8 | #include <memory>
  9 | #include <cublas_v2.h>
 10 | 
 11 | #if USE_MPI
 12 | #include <mpi.h>
 13 | #include <nccl.h>
 14 | #endif
 15 | 
// Precision selection. Defining USE_DOUBLE switches the scalar type, the complex type and
// the matching constructor / MPI datatype / cuBLAS GEMM / NCCL datatype to their
// double-precision variants; otherwise the single-precision ones are used.
// qindex is `long long` in both configurations.
#ifdef USE_DOUBLE
typedef double qreal;
typedef long long qindex;
typedef cuDoubleComplex qComplex;
#define make_qComplex make_cuDoubleComplex
#define MPI_Complex MPI_C_DOUBLE_COMPLEX
#define cublasGEMM cublasZgemm
#define NCCL_FLOAT_TYPE ncclDouble
#else
typedef float qreal;
typedef long long qindex;
typedef cuFloatComplex qComplex;
#define make_qComplex make_cuFloatComplex
#define MPI_Complex MPI_C_COMPLEX
#define cublasGEMM cublasCgemm
#define NCCL_FLOAT_TYPE ncclFloat
#endif
 33 | 
// Byte-buffer (de)serialization helpers. All of them expect the expansion site to provide
// a byte pointer named `arr` and an integer offset named `cur`.

// Stores x at arr + cur through a reinterpret_cast'ed pointer and advances cur by sizeof(x).
// NOTE(review): this writes through a possibly misaligned pointer -- confirm all call sites
// keep `cur` suitably aligned for decltype(x).
#define SERIALIZE_STEP(x) { *reinterpret_cast<decltype(x)*>(arr + cur) = x; cur += sizeof(x); }
// Reads x back from arr + cur (same layout as SERIALIZE_STEP) and advances cur by sizeof(x).
#define DESERIALIZE_STEP(x) { x = *reinterpret_cast<const decltype(x)*>(arr + cur); cur += sizeof(x); }

// Appends the raw bytes of vector x's elements to the byte vector `result`.
// Does not use arr / cur and does not store the element count.
#define SERIALIZE_VECTOR(x, result) { \
    auto tmp_chars = reinterpret_cast<const unsigned char*>(x.data()); \
    result.insert(result.end(), tmp_chars, tmp_chars + sizeof(decltype(x)::value_type) * x.size()); \
}

// Resizes vector x to `size` elements, fills it with memcpy from arr + cur and advances cur.
// The expansion site must make memcpy visible (e.g. by including <cstring>).
#define DESERIALIZE_VECTOR(x, size) { \
    x.resize(size); \
    auto tmp_size = sizeof(decltype(x)::value_type) * size; \
    memcpy(x.data(), arr + cur, tmp_size); \
    cur += tmp_size; \
}


// Prints the current file and line with "unreachable!", flushes stdout and exits with status 1.
// Expands to a brace block, so it is used without a trailing semicolon.
#define UNREACHABLE() { \
    printf("file %s line %i: unreachable!\n", __FILE__, __LINE__); \
    fflush(stdout); \
    exit(1); \
}
 55 | 
// Compile-time configuration. The *_DEFINED values are expected to be supplied from outside
// this header (they are not defined here -- presumably via compiler -D flags).
const int LOCAL_QUBIT_SIZE = 10; // is hardcoded
const int BLAS_MAT_LIMIT = BLAS_MAT_LIMIT_DEFINED;
const int THREAD_DEP = THREAD_DEP_DEFINED; // 1 << THREAD_DEP threads per block
const int COALESCE_GLOBAL = COALESCE_GLOBAL_DEFINED;
const int MAX_GATE = 600; // NOTE(review): presumably the capacity of the device gate array -- confirm
const int MIN_MAT_SIZE = MIN_MAT_SIZE_DEFINED;
 62 | 
 63 | static const char *cublasGetErrorString(cublasStatus_t error) {
 64 |     switch (error)
 65 |     {
 66 |         case CUBLAS_STATUS_SUCCESS:
 67 |             return "CUBLAS_STATUS_SUCCESS";
 68 |         case CUBLAS_STATUS_NOT_INITIALIZED:
 69 |             return "CUBLAS_STATUS_NOT_INITIALIZED";
 70 |         case CUBLAS_STATUS_ALLOC_FAILED:
 71 |             return "CUBLAS_STATUS_ALLOC_FAILED";
 72 |         case CUBLAS_STATUS_INVALID_VALUE:
 73 |             return "CUBLAS_STATUS_INVALID_VALUE";
 74 |         case CUBLAS_STATUS_ARCH_MISMATCH:
 75 |             return "CUBLAS_STATUS_ARCH_MISMATCH";
 76 |         case CUBLAS_STATUS_MAPPING_ERROR:
 77 |             return "CUBLAS_STATUS_MAPPING_ERROR";
 78 |         case CUBLAS_STATUS_EXECUTION_FAILED:
 79 |             return "CUBLAS_STATUS_EXECUTION_FAILED";
 80 |         case CUBLAS_STATUS_INTERNAL_ERROR:
 81 |             return "CUBLAS_STATUS_INTERNAL_ERROR";
 82 |         default:
 83 |             return "<unknown>";
 84 |     }
 85 |     UNREACHABLE()
 86 | }
 87 | 
 88 | static const char *cuttGetErrorString(cuttResult error) {
 89 |     switch (error) {
 90 |         case CUTT_INVALID_PLAN:
 91 |             return "CUTT_INVALID_PLAN";
 92 |         case CUTT_INVALID_PARAMETER:
 93 |             return "CUTT_INVALID_PARAMETER";
 94 |         case CUTT_INVALID_DEVICE:
 95 |             return "CUTT_INVALID_DEVICE";
 96 |         case CUTT_INTERNAL_ERROR:
 97 |             return "CUTT_INTERNAL_ERROR";
 98 |         case CUTT_UNDEFINED_ERROR:
 99 |             return "CUTT_UNDEFINED_ERROR";
100 |         default:
101 |             return "<unknown>";
102 |     }
103 |     UNREACHABLE()
104 | }
105 | 
106 | #define checkCudaErrors(stmt) {                                 \
107 |     cudaError_t err = stmt;                            \
108 |     if (err != cudaSuccess) {                          \
109 |       fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cudaGetErrorString(err)); \
110 |       exit(1); \
111 |     }                                                  \
112 | }
113 | 
114 | #define checkCuttErrors(stmt) {                                 \
115 |     cuttResult err = stmt;                            \
116 |     if (err != CUTT_SUCCESS) {                          \
117 |       fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cuttGetErrorString(err)); \
118 |       exit(1); \
119 |     }                                                  \
120 | }
121 | 
122 | #define checkBlasErrors(stmt) { /* Evaluate stmt once; on any status other than CUBLAS_STATUS_SUCCESS, report to stderr and exit(1). */ \
123 |     cublasStatus_t err = stmt; \
124 |     if (err != CUBLAS_STATUS_SUCCESS) {                          \
125 |       fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, cublasGetErrorString(err)); /* statement text, location, numeric status, status name */ \
126 |       exit(1); \
127 |     } \
128 | } // NOTE(review): bare brace block, not do { ... } while (0); a trailing semicolon before an else would break compilation.
129 | 
130 | #define checkMPIErrors(stmt) { /* Evaluate stmt once; on any result other than MPI_SUCCESS, report to stderr and exit(1). */ \
131 |   int err = stmt;                                      \
132 |   if(err != MPI_SUCCESS) {                          \
133 |     fprintf(stderr, "%s in file %s, function %s, line %i: %04d\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err); /* only the numeric code is printed; no error-string lookup is done here */ \
134 |       exit(1); \
135 |   }                                                 \
136 | } // NOTE(review): bare brace block, not do { ... } while (0); a trailing semicolon before an else would break compilation.
137 | 
138 | #define checkNCCLErrors(stmt) { /* Evaluate stmt once; on any result other than ncclSuccess, report to stderr and exit(1). */ \
139 |   ncclResult_t err= stmt;                             \
140 |   if (err != ncclSuccess) {                            \
141 |     fprintf(stderr, "%s in file %s, function %s, line %i: %04d %s\n", #stmt, __FILE__, __FUNCTION__, __LINE__, err, ncclGetErrorString(err)); /* statement text, location, numeric code, NCCL message */ \
142 |       exit(1); \
143 |   }                                                 \
144 | } // NOTE(review): bare brace block, not do { ... } while (0); a trailing semicolon before an else would break compilation.
145 | 
146 | namespace MyGlobalVars { // Declarations only: each variable below is extern, so its definition lives elsewhere.
147 |     extern int numGPUs;
148 |     extern int localGPUs;
149 |     extern int bit; // NOTE(review): meaning is not evident from this header - confirm against the definition.
150 |     extern std::unique_ptr<cudaStream_t[]> streams; // Owning pointer to an array of CUDA streams.
151 |     extern std::unique_ptr<cudaStream_t[]> streams_comm; // A second stream array; presumably reserved for communication - confirm.
152 |     extern std::unique_ptr<cublasHandle_t[]> blasHandles; // Owning pointer to an array of cuBLAS handles.
153 | #if USE_MPI
154 |     extern std::unique_ptr<ncclComm_t[]> ncclComms; // Declared only when USE_MPI evaluates non-zero.
155 | #endif
156 |     void init(); // Declared here; the body is not part of this excerpt.
157 | };
158 | 
159 | namespace MyMPI { // Declarations only: the extern ints below are defined elsewhere.
160 |     extern int rank;
161 |     extern int commSize;
162 |     extern int commBit; // NOTE(review): presumably log2 of commSize - confirm against the definition.
163 |     void init(); // Declared here; the body is not part of this excerpt.
164 | };
165 | 
166 | template<typename T> // T needs unary minus, binary &, -= and contextual conversion to bool.
167 | int bitCount(T x) { // Count how many times the loop step can be applied to x before it becomes zero; for integer T that is the number of set bits.
168 |     int ret = 0;
169 |     for (T i = x; i; i -= i & (-i)) { // For integer T, i & (-i) isolates the lowest set bit; subtracting it clears that bit.
170 |         ret++;
171 |     }
172 |     return ret; // NOTE(review): for a signed T holding its minimum value, -i overflows - confirm callers never pass that.
173 | }
174 | 
175 | qreal zero_wrapper(qreal x); // Declaration only. NOTE(review): behaviour is not visible here - confirm against the definition.
176 | 
177 | qComplex operator * (const qComplex& a, const qComplex& b); // Binary * for qComplex operands; defined elsewhere.
178 | qComplex operator + (const qComplex& a, const qComplex& b); // Binary + for qComplex operands; defined elsewhere.
179 | 
180 | bool isUnitary(std::unique_ptr<qComplex[]>& mat, int n); // NOTE(review): presumably tests an n-by-n matrix stored in mat - confirm layout and tolerance.
181 | 
182 | qComplex make_qComplex(qreal x); // Returns a qComplex given one qreal. NOTE(review): imaginary part presumably zero - confirm.
183 | bool operator < (const qComplex& a, const qComplex& b); // Ordering on qComplex; the comparison rule is defined elsewhere.
184 | 
185 | int get_bit(int n); // NOTE(review): semantics are not evident from the signature - confirm against the definition.


--------------------------------------------------------------------------------
/tests/input/basis_change_24.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fe6754aecac445ac4b97dd30a70acda95de1601c66071ecefab725f2dc98825e
3 | size 99591
4 | 


--------------------------------------------------------------------------------
/tests/input/basis_change_25.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9e8d52fdbb24e5e46135dba5af55cf6a6b7d9c3ec2c112334717038aa9335247
3 | size 108252
4 | 


--------------------------------------------------------------------------------
/tests/input/basis_change_26.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:57ee32983271f084130829d39fb163ce6813e2b917278cb0e505bada417757a8
3 | size 117400
4 | 


--------------------------------------------------------------------------------
/tests/input/basis_change_27.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c22f6398109ad330eafc6fbe4594e49cf5d9d0020196c9e86e222bc1cd8d12a8
3 | size 126859
4 | 


--------------------------------------------------------------------------------
/tests/input/basis_change_28.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8e398bd661c1789dd2b152f3e14e37c7a882d496dd7eb2be90a7719ca102cae2
3 | size 136363
4 | 


--------------------------------------------------------------------------------
/tests/input/basis_change_29.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:890b79f448ccab1be394103b563c4d2028b2add24eb328da99261f9c2734a66b
3 | size 146669
4 | 


--------------------------------------------------------------------------------
/tests/input/basis_change_30.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e7373714031b32cafbb3c6ed7b74e6cd11eaa0f53bee92af2805d8eca7a5c39e
3 | size 157405
4 | 


--------------------------------------------------------------------------------
/tests/input/bv_28.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4588f9f0693bd5a1b2714bb09582c72cd2ce14a7d60d223ffa9fca10301e4165
3 | size 954
4 | 


--------------------------------------------------------------------------------
/tests/input/hidden_shift_28.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5febba63b537eca2d1580aa6306014321867e2fbc7195f480781f9bc4f229808
3 | size 1771
4 | 


--------------------------------------------------------------------------------
/tests/input/qaoa_28.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ed161b0a92906f81b6d8242884d85094b7021fe35fc720a4f8f1b0967bbad638
3 | size 44039
4 | 


--------------------------------------------------------------------------------
/tests/input/qft_28.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fae2455fdb0f3828d6bb3b8aebd4762ed10a16250806a7b70377329ee8b71294
3 | size 9634
4 | 


--------------------------------------------------------------------------------
/tests/input/quantum_volume_28.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:cd7939d6e1f39eed7651cc8c1a7b8270f9a138c7c7929850e827545849390606
3 | size 81064
4 | 


--------------------------------------------------------------------------------
/tests/input/supremacy_28.qasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c98d622067ac9f0eeece5e57234375c2d721ad64c82c543914e9fec6354bc22d
3 | size 14961
4 | 


--------------------------------------------------------------------------------
/tests/output/basis_change_25.log:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c141d408cfc426fdd41e8d3beb3adebe15e64fc3d065ba3df1bbfe8e442d72ff
3 | size 6291
4 | 


--------------------------------------------------------------------------------
/tests/output/basis_change_28.log:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:a0567b2dbab58f720a04957a6667ac00a0ce923e994c5aaa8e96dec07be17428
3 | size 6290
4 | 


--------------------------------------------------------------------------------
/tests/output/basis_change_30.log:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f3620a2c66c37dbd55e140de898625ace24400ab327921603255bad09d53c3d2
3 | size 6291
4 | 


--------------------------------------------------------------------------------
/tests/output/bv_28.log:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e03bc93f14dc98db5a74d4b35458d158c811b5d2b087a6bed4c60eb6864234c5
3 | size 6403
4 | 


--------------------------------------------------------------------------------
/tests/output/hidden_shift_28.log:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:a074bd1082dc9b9a15c0d5b806194e4ff6b5e148b04467997203a888d0e2f6e8
3 | size 6346
4 | 


--------------------------------------------------------------------------------
/tests/output/qaoa_28.log:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e90c51de4d9027fa716dcbb9a00ac5d43ad7445c5b2d651cf1c0178307e63b97
3 | size 6386
4 | 


--------------------------------------------------------------------------------
/tests/output/qft_28.log:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:985cd66bcbc2fd4facdafa5fc793b8ad30d96abf28b69d8608d67a34eb39d6e7
3 | size 6290
4 | 


--------------------------------------------------------------------------------
/tests/output/quantum_volume_28.log:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f128865203bbd925e59e9af603997cfc4ff11fde2b5b366153bdc30771c4ee6d
3 | size 6414
4 | 


--------------------------------------------------------------------------------
/tests/output/supremacy_28.log:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c2dd93160bc32bb9745f28c1562cee6fc737212e119de6b04e803d6b7e114f82
3 | size 6420
4 | 


--------------------------------------------------------------------------------