├── .github
│   └── workflows
│       └── ci.yml
├── .gitignore
├── COPYRIGHT
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
├── ec-gpu-gen
│   ├── Cargo.toml
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   ├── build.rs
│   └── src
│       ├── cl
│       │   ├── common.cl
│       │   ├── ec.cl
│       │   ├── fft.cl
│       │   ├── field.cl
│       │   ├── field2.cl
│       │   ├── multiexp.cl
│       │   └── test.cl
│       ├── error.rs
│       ├── fft.rs
│       ├── fft_cpu.rs
│       ├── lib.rs
│       ├── multiexp.rs
│       ├── multiexp_cpu.rs
│       ├── program.rs
│       ├── source.rs
│       └── threadpool.rs
├── ec-gpu
│   ├── Cargo.toml
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   └── src
│       └── lib.rs
├── gpu-tests
│   ├── Cargo.toml
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   ├── README.md
│   ├── benches
│   │   └── multiexp.rs
│   ├── build.rs
│   ├── src
│   │   └── lib.rs
│   └── tests
│       ├── fft.rs
│       └── multiexp.rs
├── release.toml
└── rust-toolchain

/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on: [pull_request, push]
 4 | 
 5 | # Cancel a job if there's a new one started on the same branch.
 6 | # Based on https://stackoverflow.com/questions/58895283/stop-already-running-workflow-job-in-github-actions/67223051#67223051
 7 | concurrency:
 8 |   group: ${{ github.ref }}
 9 |   cancel-in-progress: true
10 | 
11 | env:
12 |   CARGO_INCREMENTAL: 0
13 |   RUST_BACKTRACE: 1
14 |   # Faster crates.io index checkout.
15 |   CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
16 | 
17 | jobs:
18 |   set-msrv:
19 |     runs-on: ubuntu-latest
20 |     outputs:
21 |       msrv: ${{ steps.msrv.outputs.MSRV }}
22 |     steps:
23 |       - uses: actions/checkout@v4
24 |       - name: Get MSRV from rust-toolchain
25 |         id: msrv
26 |         run: |
27 |           MSRV=$(cat ./rust-toolchain)
28 |           echo "MSRV=$MSRV" | tee --append "$GITHUB_OUTPUT"
29 | 
30 |   linux:
31 |     needs: set-msrv
32 |     runs-on: ubuntu-latest
33 |     name: Build and test
34 |     steps:
35 |       - uses: actions/checkout@v4
36 |       - uses: dtolnay/rust-toolchain@master
37 |         with:
38 |           toolchain: ${{needs.set-msrv.outputs.msrv}}
39 |       - name: Install required packages
40 |         run: sudo apt install --no-install-recommends --yes ocl-icd-opencl-dev nvidia-cuda-toolkit
41 |       - name: Build with default features
42 |         run: cargo build --workspace
43 |       # Machine has no GPU installed, hence run without the `cuda` or `opencl` feature.
44 | - name: Run tests without default features 45 | run: cargo test --workspace --no-default-features -- --nocapture 46 | 47 | clippy_check: 48 | needs: set-msrv 49 | runs-on: ubuntu-latest 50 | name: Clippy 51 | steps: 52 | - uses: actions/checkout@v4 53 | - uses: dtolnay/rust-toolchain@master 54 | with: 55 | toolchain: ${{ needs.set-msrv.outputs.msrv }} 56 | components: clippy 57 | - name: Install required packages 58 | run: sudo apt install --no-install-recommends --yes ocl-icd-opencl-dev nvidia-cuda-dev 59 | - name: Run cargo clippy default features 60 | run: cargo clippy --workspace --all-targets -- -D warnings 61 | - name: Run cargo clippy with cuda and opencl features 62 | run: cargo clippy --workspace --all-targets --features cuda,opencl -- -D warnings 63 | - name: Run cargo clippy with cuda feature 64 | run: cargo clippy --workspace --all-targets --no-default-features --features cuda -- -D warnings 65 | - name: Run cargo clippy with opencl feature 66 | run: cargo clippy --workspace --all-targets --no-default-features --features opencl -- -D warnings 67 | 68 | check_fmt_and_docs: 69 | needs: set-msrv 70 | runs-on: ubuntu-latest 71 | name: Checking fmt and docs 72 | steps: 73 | - uses: actions/checkout@v4 74 | - uses: dtolnay/rust-toolchain@master 75 | with: 76 | toolchain: ${{ needs.set-msrv.outputs.msrv }} 77 | components: rustfmt 78 | - name: fmt 79 | run: cargo fmt --all -- --check 80 | - name: Docs 81 | env: 82 | # Making sure that the documentation can be built without having the NVIDIA toolkit 83 | # installed. 84 | DOCS_RS: true 85 | run: | 86 | cargo rustdoc --package ec-gpu --all-features -- -D warnings 87 | cargo rustdoc --package ec-gpu-gen --all-features -- -D warnings 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyrights in the "ff-cl-gen" library are retained by their contributors. No 2 | copyright assignment is required to contribute to the "ff-cl-gen" library. 3 | 4 | The "ff-cl-gen" library is licensed under either of 5 | 6 | * Apache License, Version 2.0, (see ./LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0) 7 | * MIT license (see ./LICENSE-MIT or http://opensource.org/licenses/MIT) 8 | 9 | at your option. 10 | 11 | Unless you explicitly state otherwise, any contribution intentionally 12 | submitted for inclusion in the work by you, as defined in the Apache-2.0 13 | license, shall be dual licensed as above, without any additional terms or 14 | conditions. 15 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "ec-gpu", 4 | "ec-gpu-gen", 5 | "gpu-tests", 6 | ] 7 | resolver = "2" 8 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `ec-gpu` & `ec-gpu-gen` 2 | 3 | [![crates.io][crate-image-ec-gpu]][crate-link-ec-gpu] 4 | [![Documentation][doc-image-ec-gpu]][doc-link-ec-gpu] 5 | [![Build Status][build-image-ec-gpu]][build-link-ec-gpu] 6 | ![minimum rustc 1.51][msrv-image-ec-gpu] 7 | [![dependency status][deps-image-ec-gpu]][deps-link-ec-gpu] 8 | 9 | [![crates.io][crate-image-ec-gpu-gen]][crate-link-ec-gpu-gen] 10 | [![Documentation][doc-image-ec-gpu-gen]][doc-link-ec-gpu-gen] 11 | [![Build Status][build-image-ec-gpu-gen]][build-link-ec-gpu-gen] 12 | ![minimum rustc 1.51][msrv-image-ec-gpu-gen] 13 | [![dependency status][deps-image-ec-gpu-gen]][deps-link-ec-gpu-gen] 14 | 15 | CUDA/OpenCL code generator for finite-field arithmetic over prime fields and elliptic curve arithmetic constructed with Rust. 
 16 | 
 17 | Notes:
 18 | - Limbs are 32 or 64 bits long, at your choice (on CUDA only 32-bit limbs are supported).
 19 | - The library assumes that the most significant bit of your prime field's modulus is unset. This allows for cheap reductions.
 20 | 
 21 | ## Usage
 22 | 
 23 | ### Quickstart
 24 | 
 25 | Generating CUDA/OpenCL code for `blstrs` `Scalar` elements:
 26 | 
 27 | ```rust
 28 | use blstrs::Scalar;
 29 | use ec_gpu_gen::SourceBuilder;
 30 | 
 31 | let source = SourceBuilder::new()
 32 |     .add_field::<Scalar>()
 33 |     .build_64_bit_limbs();
 34 | ```
 35 | 
 36 | ### Integration into your library
 37 | 
 38 | This crate usually creates GPU kernels at compile-time. For CUDA it generates a [fatbin], while for OpenCL it only generates the source code, which is then compiled at run-time.
 39 | 
 40 | In order to make things easier to use, there are helper functions available. You put some code into `build.rs` that generates the kernels, and some code into your library which then consumes those generated kernels. The kernels will be directly embedded into your program/library. If something goes wrong, you will get an error at compile-time.
 41 | 
 42 | In this example we will make use of the FFT functionality. Add to your `build.rs`:
 43 | 
 44 | ```rust
 45 | use blstrs::Scalar;
 46 | use ec_gpu_gen::SourceBuilder;
 47 | 
 48 | fn main() {
 49 |     let source_builder = SourceBuilder::new().add_fft::<Scalar>();
 50 |     ec_gpu_gen::generate(&source_builder);
 51 | }
 52 | ```
 53 | 
 54 | The `ec_gpu_gen::generate()` function takes care of the actual code generation/compilation. It will automatically create a CUDA and/or OpenCL kernel. It also defines two environment variables, which are meant for internal use: `_EC_GPU_CUDA_KERNEL_FATBIN`, which points to the compiled CUDA kernel, and `_EC_GPU_OPENCL_KERNEL_SOURCE`, which points to the generated OpenCL source.
 55 | 
 56 | Those variables are then picked up by the `ec_gpu_gen::program!()` macro, which generates a program for a given GPU device. Using FFT within your library would then look like this:
 57 | 
 58 | ```rust
 59 | use ec_gpu_gen::{
 60 |     rust_gpu_tools::Device,
 61 | };
 62 | 
 63 | let devices = Device::all();
 64 | let programs = devices
 65 |     .iter()
 66 |     .map(|device| ec_gpu_gen::program!(device))
 67 |     .collect::<Result<Vec<_>, _>>()
 68 |     .expect("Cannot create programs!");
 69 | 
 70 | let mut kern = FftKernel::<Scalar>::create(programs).expect("Cannot initialize kernel!");
 71 | kern.radix_fft_many(&mut [&mut coeffs], &[omega], &[log_d]).expect("GPU FFT failed!");
 72 | ```
 73 | 
 74 | ## Feature flags
 75 | 
 76 | This crate supports CUDA and OpenCL, which can be enabled with the `cuda` and `opencl` feature flags.
 77 | 
 78 | ### Environment variables
 79 | 
 80 | - `EC_GPU_CUDA_NVCC_ARGS`
 81 | 
 82 |   By default the CUDA kernel is compiled for several architectures, which may take a long time. `EC_GPU_CUDA_NVCC_ARGS` can be used to override those arguments. The input and output file will still be automatically set.
 83 | 
 84 |   ```console
 85 |   // Example for compiling the kernel for only the Turing architecture.
 86 |   EC_GPU_CUDA_NVCC_ARGS="--fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75"
 87 |   ```
 88 | 
 89 | - `EC_GPU_FRAMEWORK`
 90 | 
 91 |   When the library is built with both CUDA and OpenCL support, you can choose which one to use at run time. The default is `cuda`; it is used when the variable is unset or set to any other (invalid) value. The other possible value is `opencl`.
 92 | 
 93 |   ```console
 94 |   // Example for setting it to OpenCL.
 95 |   EC_GPU_FRAMEWORK=opencl
 96 |   ```
 97 | 
 98 | - `EC_GPU_NUM_THREADS`
 99 | 
100 |   Restricts the number of threads used in the library. The default is set to the number of logical cores reported on the machine.
101 | 
102 |   ```console
103 |   // Example for setting the maximum number of threads to 6.
104 |   EC_GPU_NUM_THREADS=6
105 |   ```
106 | 
107 | 
108 | ## License
109 | 
110 | Licensed under either of
111 | 
112 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
113 |    http://www.apache.org/licenses/LICENSE-2.0)
114 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
115 | 
116 | at your option.
117 | 
118 | ### Contribution
119 | 
120 | Unless you explicitly state otherwise, any contribution intentionally
121 | submitted for inclusion in the work by you, as defined in the Apache-2.0
122 | license, shall be dual licensed as above, without any additional terms or
123 | conditions.
124 | 
125 | 
126 | [crate-image-ec-gpu]: https://img.shields.io/crates/v/ec-gpu.svg
127 | [crate-link-ec-gpu]: https://crates.io/crates/ec-gpu
128 | [doc-image-ec-gpu]: https://docs.rs/ec-gpu/badge.svg
129 | [doc-link-ec-gpu]: https://docs.rs/ec-gpu
130 | [build-image-ec-gpu]: https://circleci.com/gh/filecoin-project/ec-gpu.svg?style=shield
131 | [build-link-ec-gpu]: https://circleci.com/gh/filecoin-project/ec-gpu
132 | [msrv-image-ec-gpu]: https://img.shields.io/badge/rustc-1.54+-blue.svg
133 | [deps-image-ec-gpu]: https://deps.rs/repo/github/filecoin-project/ec-gpu/status.svg
134 | [deps-link-ec-gpu]: https://deps.rs/repo/github/filecoin-project/ec-gpu
135 | 
136 | 
137 | [crate-image-ec-gpu-gen]: https://img.shields.io/crates/v/ec-gpu-gen.svg
138 | [crate-link-ec-gpu-gen]: https://crates.io/crates/ec-gpu-gen
139 | [doc-image-ec-gpu-gen]: https://docs.rs/ec-gpu-gen/badge.svg
140 | [doc-link-ec-gpu-gen]: https://docs.rs/ec-gpu-gen
141 | [build-image-ec-gpu-gen]: https://circleci.com/gh/filecoin-project/ec-gpu.svg?style=shield
142 | [build-link-ec-gpu-gen]: https://circleci.com/gh/filecoin-project/ec-gpu
143 | [msrv-image-ec-gpu-gen]: https://img.shields.io/badge/rustc-1.54+-blue.svg
144 | [deps-image-ec-gpu-gen]: https://deps.rs/repo/github/filecoin-project/ec-gpu/status.svg
145 | [deps-link-ec-gpu-gen]: https://deps.rs/repo/github/filecoin-project/ec-gpu
146 | 
147 | [Fast Fourier transform]: https://en.wikipedia.org/wiki/Fast_Fourier_transform
148 | [fatbin]: https://en.wikipedia.org/wiki/Fat_binary#Heterogeneous_computing
149 | 
--------------------------------------------------------------------------------
/ec-gpu-gen/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ec-gpu-gen"
 3 | version = "0.7.1"
 4 | authors = ["dignifiedquire "]
 5 | edition = "2021"
 6 | description = "Code generator for field and elliptic curve operations on the GPU"
 7 | homepage = "https://github.com/filecoin-project/ff-cl-gen"
 8 | repository = "https://github.com/filecoin-project/ff-cl-gen"
 9 | license = "MIT/Apache-2.0"
10 | rust-version = "1.83.0"
11 | 
12 | [dependencies]
13 | bitvec = "1.0.1"
14 | crossbeam-channel = "0.5.1"
15 | ec-gpu = "0.2.0"
16 | execute = "0.2.9"
17 | ff = { version = "0.13.0", default-features = false }
18 | group = "0.13.0"
19 | hex = "0.4"
20 | log = "0.4.14"
21 | num_cpus = "1.13.0"
22 | once_cell = "1.8.0"
23 | rayon = "1.5.1"
24 | rust-gpu-tools = { version = "0.7.0", default-features = false, optional = true }
25 | sha2 = "0.10"
26 | thiserror = "1.0.30"
27 | yastl = "0.1.2"
28 | 
29 | [dev-dependencies]
30 | # NOTE vmx 2022-07-07: Using the `__private_bench` feature of `blstrs` is just
31 | # temporary until https://github.com/zkcrypto/group/pull/29 is fixed. Then
32 | # we won't need the exports of `Fp` and `Fp2` any more.
33 | #blstrs = { version = "0.6.0", features = ["__private_bench"], optional = true }
34 | blstrs = { version = "0.7.0", features = ["__private_bench", "gpu"] }
35 | rand = "0.8"
36 | lazy_static = "1.2"
37 | pairing = "0.23.0"
38 | temp-env = "0.3.0"
39 | rand_core = "0.6.3"
40 | rand_xorshift = "0.3.0"
41 | 
42 | [features]
43 | default = []
44 | cuda = ["rust-gpu-tools/cuda"]
45 | opencl = ["rust-gpu-tools/opencl"]
46 | 
--------------------------------------------------------------------------------
/ec-gpu-gen/LICENSE-APACHE:
--------------------------------------------------------------------------------
 1 | ../LICENSE-APACHE
--------------------------------------------------------------------------------
/ec-gpu-gen/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | ../LICENSE-MIT
--------------------------------------------------------------------------------
/ec-gpu-gen/build.rs:
--------------------------------------------------------------------------------
 1 | fn main() {
 2 |     // This is intentionally empty. It's only there so that `OUT_DIR` is set, which is
 3 |     // used by one of the tests.
 4 | }
 5 | 
--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/common.cl:
--------------------------------------------------------------------------------
  1 | // Defines to make the code work with both CUDA and OpenCL
  2 | #ifdef __NVCC__
  3 |   #define DEVICE __device__
  4 |   #define GLOBAL
  5 |   #define KERNEL extern "C" __global__
  6 |   #define LOCAL __shared__
  7 |   #define CONSTANT __constant__
  8 | 
  9 |   #define GET_GLOBAL_ID() blockIdx.x * blockDim.x + threadIdx.x
 10 |   #define GET_GROUP_ID() blockIdx.x
 11 |   #define GET_LOCAL_ID() threadIdx.x
 12 |   #define GET_LOCAL_SIZE() blockDim.x
 13 |   #define BARRIER_LOCAL() __syncthreads()
 14 | 
 15 |   typedef unsigned char uchar;
 16 | 
 17 |   #define CUDA
 18 | #else // OpenCL
 19 |   #define DEVICE
 20 |   #define GLOBAL __global
 21 |   #define KERNEL __kernel
 22 |   #define LOCAL __local
 23 |   #define CONSTANT __constant
 24 | 
 25 |   #define GET_GLOBAL_ID() get_global_id(0)
 26 |   #define GET_GROUP_ID() get_group_id(0)
 27 |   #define GET_LOCAL_ID() get_local_id(0)
 28 |   #define GET_LOCAL_SIZE() get_local_size(0)
 29 |   #define BARRIER_LOCAL() barrier(CLK_LOCAL_MEM_FENCE)
 30 | #endif
 31 | 
 32 | #ifdef __NV_CL_C_VERSION
 33 | #define OPENCL_NVIDIA
 34 | #endif
 35 | 
 36 | #if defined(__WinterPark__) || defined(__BeaverCreek__) || defined(__Turks__) || \
 37 |     defined(__Caicos__) || defined(__Tahiti__) || defined(__Pitcairn__) || \
 38 |     defined(__Capeverde__) || defined(__Cayman__) || defined(__Barts__) || \
 39 |     defined(__Cypress__) || defined(__Juniper__) || defined(__Redwood__) || \
 40 |     defined(__Cedar__) || defined(__ATI_RV770__) || defined(__ATI_RV730__) || \
 41 |     defined(__ATI_RV710__) || defined(__Loveland__) || defined(__GPU__) || \
 42 |     defined(__Hawaii__)
 43 | #define AMD
 44 | #endif
 45 | 
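// Added commentary on the helpers that follow: they implement multiply-accumulate
// with carry propagation. On NVIDIA hardware (CUDA or OpenCL) they use inline PTX
// `.cc` instructions that thread the hardware carry flag; the portable fall-backs
// reconstruct the carry by comparing the truncated result against an input.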
 46 | // Returns a * b + c + d, puts the carry in d
 47 | DEVICE ulong mac_with_carry_64(ulong a, ulong b, ulong c, ulong *d) {
 48 |   #if defined(OPENCL_NVIDIA) || defined(CUDA)
 49 |     ulong lo, hi;
 50 |     asm("mad.lo.cc.u64 %0, %2, %3, %4;\r\n"
 51 |         "madc.hi.u64 %1, %2, %3, 0;\r\n"
 52 |         "add.cc.u64 %0, %0, %5;\r\n"
 53 |         "addc.u64 %1, %1, 0;\r\n"
 54 |         : "=l"(lo), "=l"(hi) : "l"(a), "l"(b), "l"(c), "l"(*d));
 55 |     *d = hi;
 56 |     return lo;
 57 |   #else
 58 |     ulong lo = a * b + c;
 59 |     ulong hi = mad_hi(a, b, (ulong)(lo < c));
 60 |     a = lo;
 61 |     lo += *d;
 62 |     hi += (lo < a);
 63 |     *d = hi;
 64 |     return lo;
 65 |   #endif
 66 | }
 67 | 
 68 | // Returns a + b, puts the carry in b
 69 | DEVICE ulong add_with_carry_64(ulong a, ulong *b) {
 70 |   #if defined(OPENCL_NVIDIA) || defined(CUDA)
 71 |     ulong lo, hi;
 72 |     asm("add.cc.u64 %0, %2, %3;\r\n"
 73 |         "addc.u64 %1, 0, 0;\r\n"
 74 |         : "=l"(lo), "=l"(hi) : "l"(a), "l"(*b));
 75 |     *b = hi;
 76 |     return lo;
 77 |   #else
 78 |     ulong lo = a + *b;
 79 |     *b = lo < a;
 80 |     return lo;
 81 |   #endif
 82 | }
 83 | 
 84 | // Returns a * b + c + d, puts the carry in d
 85 | DEVICE uint mac_with_carry_32(uint a, uint b, uint c, uint *d) {
 86 |   ulong res = (ulong)a * b + c + *d;
 87 |   *d = res >> 32;
 88 |   return res;
 89 | }
 90 | 
 91 | // Returns a + b, puts the carry in b
 92 | DEVICE uint add_with_carry_32(uint a, uint *b) {
 93 |   #if defined(OPENCL_NVIDIA) || defined(CUDA)
 94 |     uint lo, hi;
 95 |     asm("add.cc.u32 %0, %2, %3;\r\n"
 96 |         "addc.u32 %1, 0, 0;\r\n"
 97 |         : "=r"(lo), "=r"(hi) : "r"(a), "r"(*b));
 98 |     *b = hi;
 99 |     return lo;
100 |   #else
101 |     uint lo = a + *b;
102 |     *b = lo < a;
103 |     return lo;
104 |   #endif
105 | }
106 | 
107 | // Reverse the given bits. It's used by the FFT kernel.
108 | DEVICE uint bitreverse(uint n, uint bits) {
109 |   uint r = 0;
110 |   for(int i = 0; i < bits; i++) {
111 |     r = (r << 1) | (n & 1);
112 |     n >>= 1;
113 |   }
114 |   return r;
115 | }
116 | 
117 | #ifdef CUDA
118 | // CUDA doesn't support local buffers ("dynamic shared memory" in CUDA lingo) as function
119 | // arguments; only a single globally defined extern value is available. Use `uchar` so that
120 | // the size is always specified in bytes.
121 | extern LOCAL uchar cuda_shared[];
122 | 
123 | typedef uint uint32_t;
124 | typedef int int32_t;
125 | typedef uint limb;
126 | 
127 | DEVICE inline uint32_t add_cc(uint32_t a, uint32_t b) {
128 |   uint32_t r;
129 | 
130 |   asm volatile ("add.cc.u32 %0, %1, %2;" : "=r"(r) : "r"(a), "r"(b));
131 |   return r;
132 | }
133 | 
134 | DEVICE inline uint32_t addc_cc(uint32_t a, uint32_t b) {
135 |   uint32_t r;
136 | 
137 |   asm volatile ("addc.cc.u32 %0, %1, %2;" : "=r"(r) : "r"(a), "r"(b));
138 |   return r;
139 | }
140 | 
141 | DEVICE inline uint32_t addc(uint32_t a, uint32_t b) {
142 |   uint32_t r;
143 | 
144 |   asm volatile ("addc.u32 %0, %1, %2;" : "=r"(r) : "r"(a), "r"(b));
145 |   return r;
146 | }
147 | 
148 | 
149 | DEVICE inline uint32_t madlo(uint32_t a, uint32_t b, uint32_t c) {
150 |   uint32_t r;
151 | 
152 |   asm volatile ("mad.lo.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
153 |   return r;
154 | }
155 | 
156 | DEVICE inline uint32_t madlo_cc(uint32_t a, uint32_t b, uint32_t c) {
157 |   uint32_t r;
158 | 
159 |   asm volatile ("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
160 |   return r;
161 | }
162 | 
163 | DEVICE inline uint32_t madloc_cc(uint32_t a, uint32_t b, uint32_t c) {
164 |   uint32_t r;
165 | 
166 |   asm volatile ("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
167 |   return r;
168 | }
169 | 
170 | DEVICE inline uint32_t madloc(uint32_t a, uint32_t b, uint32_t c) {
171 |   uint32_t r;
172 | 
173 |   asm volatile ("madc.lo.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
174 |   return r;
175 | }
176 | 
177 | DEVICE inline uint32_t madhi(uint32_t a, uint32_t b, uint32_t c) {
178 |   uint32_t r;
179 | 
180 |   asm volatile ("mad.hi.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
181 |   return r;
182 | }
183 | 
184 | DEVICE inline uint32_t madhi_cc(uint32_t a, uint32_t b, uint32_t c) {
185 |   uint32_t r;
186 | 
187 |   asm volatile ("mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
188 |   return r;
189 | }
190 | 
191 | DEVICE inline uint32_t madhic_cc(uint32_t a, uint32_t b, uint32_t c) {
192 |   uint32_t r;
193 | 
194 |   asm volatile ("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
195 |   return r;
196 | }
197 | 
198 | DEVICE inline uint32_t madhic(uint32_t a, uint32_t b, uint32_t c) {
199 |   uint32_t r;
200 | 
201 |   asm volatile ("madc.hi.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
202 |   return r;
203 | }
204 | 
205 | typedef struct {
206 |   int32_t _position;
207 | } chain_t;
208 | 
209 | DEVICE inline
210 | void chain_init(chain_t *c) {
211 |   c->_position = 0;
212 | }
213 | 
214 | DEVICE inline
215 | uint32_t chain_add(chain_t *ch, uint32_t a, uint32_t b) {
216 |   uint32_t r;
217 | 
218 |   ch->_position++;
219 |   if(ch->_position==1)
220 |     r=add_cc(a, b);
221 |   else
222 |     r=addc_cc(a, b);
223 |   return r;
224 | }
225 | 
226 | DEVICE inline
227 | uint32_t chain_madlo(chain_t *ch, uint32_t a, uint32_t b, uint32_t c) {
228 |   uint32_t r;
229 | 
230 |   ch->_position++;
231 |   if(ch->_position==1)
232 |     r=madlo_cc(a, b, c);
233 |   else
234 |     r=madloc_cc(a, b, c);
235 |   return r;
236 | }
237 | 
238 | DEVICE inline
239 | uint32_t chain_madhi(chain_t *ch, uint32_t a, uint32_t b, uint32_t c) {
240 |   uint32_t r;
241 | 
242 |   ch->_position++;
243 |   if(ch->_position==1)
244 |     r=madhi_cc(a, b, c);
245 |   else
246 |     r=madhic_cc(a, b, c);
247 |   return r;
248 | }
249 | #endif
250 | 
--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/ec.cl:
--------------------------------------------------------------------------------
 1 | // Elliptic curve operations (Short Weierstrass Jacobian form)
 2 | 
 3 | #define POINT_ZERO ((POINT_jacobian){FIELD_ZERO, FIELD_ONE, FIELD_ZERO})
 4 | 
 5 | typedef struct {
 6 |   FIELD x;
 7 |   FIELD y;
 8 | } POINT_affine;
 9 | 
10 | typedef struct {
11 |   FIELD x;
12 |   FIELD y;
13 |   FIELD z;
14 | } POINT_jacobian;
15 | 
16 | // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
17 | DEVICE POINT_jacobian POINT_double(POINT_jacobian inp) {
18 |   const FIELD local_zero = FIELD_ZERO;
19 |   if(FIELD_eq(inp.z, local_zero)) {
20 |     return inp;
21 |   }
22 | 
23 |   const FIELD a = FIELD_sqr(inp.x); // A = X1^2
24 |   const FIELD b = FIELD_sqr(inp.y); // B = Y1^2
25 |   FIELD c = FIELD_sqr(b); // C = B^2
26 | 
27 |   // D = 2*((X1+B)^2-A-C)
28 |   FIELD d = FIELD_add(inp.x, b);
29 |   d = FIELD_sqr(d); d = FIELD_sub(FIELD_sub(d, a), c); d = FIELD_double(d);
30 | 
31 |   const FIELD e = FIELD_add(FIELD_double(a), a); // E = 3*A
32 |   const FIELD f = FIELD_sqr(e); // F = E^2
33 | 
34 |   inp.z = FIELD_mul(inp.y, inp.z); inp.z = FIELD_double(inp.z); // Z3 = 2*Y1*Z1
35 |   inp.x = FIELD_sub(FIELD_sub(f, d), d); // X3 = F-2*D
36 | 
37 |   // Y3 = E*(D-X3)-8*C
38 |   c = FIELD_double(c); c = FIELD_double(c); c = FIELD_double(c);
39 |   inp.y = FIELD_sub(FIELD_mul(FIELD_sub(d, inp.x), e), c);
40 | 
41 |   return inp;
42 | }
43 | 
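// Mixed addition (added note): the second operand of the function below is given
// in affine coordinates (implicit Z2 = 1), which saves several field
// multiplications compared to the general Jacobian addition further down.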
 44 | // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl
 45 | DEVICE POINT_jacobian POINT_add_mixed(POINT_jacobian a, POINT_affine b) {
 46 |   const FIELD local_zero = FIELD_ZERO;
 47 |   if(FIELD_eq(a.z, local_zero)) {
 48 |     const FIELD local_one = FIELD_ONE;
 49 |     a.x = b.x;
 50 |     a.y = b.y;
 51 |     a.z = local_one;
 52 |     return a;
 53 |   }
 54 | 
 55 |   const FIELD z1z1 = FIELD_sqr(a.z);
 56 |   const FIELD u2 = FIELD_mul(b.x, z1z1);
 57 |   const FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1);
 58 | 
 59 |   if(FIELD_eq(a.x, u2) && FIELD_eq(a.y, s2)) {
 60 |     return POINT_double(a);
 61 |   }
 62 | 
 63 |   const FIELD h = FIELD_sub(u2, a.x); // H = U2-X1
 64 |   const FIELD hh = FIELD_sqr(h); // HH = H^2
 65 |   FIELD i = FIELD_double(hh); i = FIELD_double(i); // I = 4*HH
 66 |   FIELD j = FIELD_mul(h, i); // J = H*I
 67 |   FIELD r = FIELD_sub(s2, a.y); r = FIELD_double(r); // r = 2*(S2-Y1)
 68 |   const FIELD v = FIELD_mul(a.x, i); // V = X1*I
 69 | 
 70 |   POINT_jacobian ret;
 71 | 
 72 |   // X3 = r^2 - J - 2*V
 73 |   ret.x = FIELD_sub(FIELD_sub(FIELD_sqr(r), j), FIELD_double(v));
 74 | 
 75 |   // Y3 = r*(V-X3)-2*Y1*J
 76 |   j = FIELD_mul(a.y, j); j = FIELD_double(j);
 77 |   ret.y = FIELD_sub(FIELD_mul(FIELD_sub(v, ret.x), r), j);
 78 | 
 79 |   // Z3 = (Z1+H)^2-Z1Z1-HH
 80 |   ret.z = FIELD_add(a.z, h); ret.z = FIELD_sub(FIELD_sub(FIELD_sqr(ret.z), z1z1), hh);
 81 |   return ret;
 82 | }
 83 | 
 84 | // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 85 | DEVICE POINT_jacobian POINT_add(POINT_jacobian a, POINT_jacobian b) {
 86 | 
 87 |   const FIELD local_zero = FIELD_ZERO;
 88 |   if(FIELD_eq(a.z, local_zero)) return b;
 89 |   if(FIELD_eq(b.z, local_zero)) return a;
 90 | 
 91 |   const FIELD z1z1 = FIELD_sqr(a.z); // Z1Z1 = Z1^2
 92 |   const FIELD z2z2 = FIELD_sqr(b.z); // Z2Z2 = Z2^2
 93 |   const FIELD u1 = FIELD_mul(a.x, z2z2); // U1 = X1*Z2Z2
 94 |   const FIELD u2 = FIELD_mul(b.x, z1z1); // U2 = X2*Z1Z1
 95 |   FIELD s1 = FIELD_mul(FIELD_mul(a.y, b.z), z2z2); // S1 = Y1*Z2*Z2Z2
 96 |   const FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1); // S2 = Y2*Z1*Z1Z1
 97 | 
 98 |   if(FIELD_eq(u1, u2) && FIELD_eq(s1, s2))
 99 |     return POINT_double(a);
100 |   else {
101 |     const FIELD h = FIELD_sub(u2, u1); // H = U2-U1
102 |     FIELD i = FIELD_double(h); i = FIELD_sqr(i); // I = (2*H)^2
103 |     const FIELD j = FIELD_mul(h, i); // J = H*I
104 |     FIELD r = FIELD_sub(s2, s1); r = FIELD_double(r); // r = 2*(S2-S1)
105 |     const FIELD v = FIELD_mul(u1, i); // V = U1*I
106 |     a.x = FIELD_sub(FIELD_sub(FIELD_sub(FIELD_sqr(r), j), v), v); // X3 = r^2 - J - 2*V
107 | 
108 |     // Y3 = r*(V - X3) - 2*S1*J
109 |     a.y = FIELD_mul(FIELD_sub(v, a.x), r);
110 |     s1 = FIELD_mul(s1, j); s1 = FIELD_double(s1); // S1 = S1 * J * 2
111 |     a.y = FIELD_sub(a.y, s1);
112 | 
113 |     // Z3 = ((Z1+Z2)^2 - Z1Z1 - Z2Z2)*H
114 |     a.z = FIELD_add(a.z, b.z); a.z = FIELD_sqr(a.z);
115 |     a.z = FIELD_sub(FIELD_sub(a.z, z1z1), z2z2);
116 |     a.z = FIELD_mul(a.z, h);
117 | 
118 |     return a;
119 |   }
120 | }
121 | 
--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/fft.cl:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * FFT algorithm is inspired by: http://www.bealto.com/gpu-fft_group-1.html
 3 |  */
 4 | KERNEL void FIELD_radix_fft(GLOBAL FIELD* x, // Source buffer
 5 |                             GLOBAL FIELD* y, // Destination buffer
 6 |                             GLOBAL FIELD* pq, // Precalculated twiddle factors
 7 |                             GLOBAL FIELD* omegas, // [omega, omega^2, omega^4, ...]
 8 |                             LOCAL FIELD* u_arg, // Local buffer to store intermediary values
 9 |                             uint n, // Number of elements
10 |                             uint lgp, // Log2 of `p` (Read more in the link above)
11 |                             uint deg, // 1=>radix2, 2=>radix4, 3=>radix8, ...
12 |                             uint max_deg) // Maximum degree supported, according to `pq` and `omegas`
13 | {
14 | // CUDA doesn't support local buffers ("shared memory" in CUDA lingo) as function arguments,
15 | // so ignore that argument and use the globally defined extern memory instead.
16 | #ifdef CUDA
17 |   // There can only be a single dynamic shared memory item, hence cast it to the type we need.
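  // The size of this region is not declared here: it is supplied by the host at
  // kernel launch time. The Rust side passes `LocalBuffer::<F>::new(1 << deg)`
  // (see `radix_fft` in ec-gpu-gen/src/fft.rs), so `cuda_shared` holds the
  // 2^deg FIELD elements this work-group operates on.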
18 | FIELD* u = (FIELD*)cuda_shared; 19 | #else 20 | LOCAL FIELD* u = u_arg; 21 | #endif 22 | 23 | uint lid = GET_LOCAL_ID(); 24 | uint lsize = GET_LOCAL_SIZE(); 25 | uint index = GET_GROUP_ID(); 26 | uint t = n >> deg; 27 | uint p = 1 << lgp; 28 | uint k = index & (p - 1); 29 | 30 | x += index; 31 | y += ((index - k) << deg) + k; 32 | 33 | uint count = 1 << deg; // 2^deg 34 | uint counth = count >> 1; // Half of count 35 | 36 | uint counts = count / lsize * lid; 37 | uint counte = counts + count / lsize; 38 | 39 | // Compute powers of twiddle 40 | const FIELD twiddle = FIELD_pow_lookup(omegas, (n >> lgp >> deg) * k); 41 | FIELD tmp = FIELD_pow(twiddle, counts); 42 | for(uint i = counts; i < counte; i++) { 43 | u[i] = FIELD_mul(tmp, x[i*t]); 44 | tmp = FIELD_mul(tmp, twiddle); 45 | } 46 | BARRIER_LOCAL(); 47 | 48 | const uint pqshift = max_deg - deg; 49 | for(uint rnd = 0; rnd < deg; rnd++) { 50 | const uint bit = counth >> rnd; 51 | for(uint i = counts >> 1; i < counte >> 1; i++) { 52 | const uint di = i & (bit - 1); 53 | const uint i0 = (i << 1) - di; 54 | const uint i1 = i0 + bit; 55 | tmp = u[i0]; 56 | u[i0] = FIELD_add(u[i0], u[i1]); 57 | u[i1] = FIELD_sub(tmp, u[i1]); 58 | if(di != 0) u[i1] = FIELD_mul(pq[di << rnd << pqshift], u[i1]); 59 | } 60 | 61 | BARRIER_LOCAL(); 62 | } 63 | 64 | for(uint i = counts >> 1; i < counte >> 1; i++) { 65 | y[i*p] = u[bitreverse(i, deg)]; 66 | y[(i+counth)*p] = u[bitreverse(i + counth, deg)]; 67 | } 68 | } 69 | 70 | /// Multiplies all of the elements by `field` 71 | KERNEL void FIELD_mul_by_field(GLOBAL FIELD* elements, 72 | uint n, 73 | FIELD field) { 74 | const uint gid = GET_GLOBAL_ID(); 75 | elements[gid] = FIELD_mul(elements[gid], field); 76 | } 77 | -------------------------------------------------------------------------------- /ec-gpu-gen/src/cl/field.cl: -------------------------------------------------------------------------------- 1 | // FinalityLabs - 2019 2 | // Arbitrary size prime-field arithmetic library (add, sub, mul, pow) 3 | 4 | #define FIELD_BITS (FIELD_LIMBS * FIELD_LIMB_BITS) 5 | #if FIELD_LIMB_BITS == 32 6 | #define FIELD_mac_with_carry mac_with_carry_32 7 | #define FIELD_add_with_carry add_with_carry_32 8 | #elif FIELD_LIMB_BITS == 64 9 | #define FIELD_mac_with_carry mac_with_carry_64 10 | #define FIELD_add_with_carry add_with_carry_64 11 | #endif 12 | 13 | // Greater than or equal 14 | DEVICE bool FIELD_gte(FIELD a, FIELD b) { 15 | for(char i = FIELD_LIMBS - 1; i >= 0; i--){ 16 | if(a.val[i] > b.val[i]) 17 | return true; 18 | if(a.val[i] < b.val[i]) 19 | return false; 20 | } 21 | return true; 22 | } 23 | 24 | // Equals 25 | DEVICE bool FIELD_eq(FIELD a, FIELD b) { 26 | for(uchar i = 0; i < FIELD_LIMBS; i++) 27 | if(a.val[i] != b.val[i]) 28 | return false; 29 | return true; 30 | } 31 | 32 | // Normal addition 33 | #if defined(OPENCL_NVIDIA) || defined(CUDA) 34 | #define FIELD_add_ FIELD_add_nvidia 35 | #define FIELD_sub_ FIELD_sub_nvidia 36 | #else 37 | DEVICE FIELD FIELD_add_(FIELD a, FIELD b) { 38 | bool carry = 0; 39 | for(uchar i = 0; i < FIELD_LIMBS; i++) { 40 | FIELD_limb old = a.val[i]; 41 | a.val[i] += b.val[i] + carry; 42 | carry = carry ? old >= a.val[i] : old > a.val[i]; 43 | } 44 | return a; 45 | } 46 | FIELD FIELD_sub_(FIELD a, FIELD b) { 47 | bool borrow = 0; 48 | for(uchar i = 0; i < FIELD_LIMBS; i++) { 49 | FIELD_limb old = a.val[i]; 50 | a.val[i] -= b.val[i] + borrow; 51 | borrow = borrow ? 
old <= a.val[i] : old < a.val[i]; 52 | } 53 | return a; 54 | } 55 | #endif 56 | 57 | // Modular subtraction 58 | DEVICE FIELD FIELD_sub(FIELD a, FIELD b) { 59 | FIELD res = FIELD_sub_(a, b); 60 | if(!FIELD_gte(a, b)) res = FIELD_add_(res, FIELD_P); 61 | return res; 62 | } 63 | 64 | // Modular addition 65 | DEVICE FIELD FIELD_add(FIELD a, FIELD b) { 66 | FIELD res = FIELD_add_(a, b); 67 | if(FIELD_gte(res, FIELD_P)) res = FIELD_sub_(res, FIELD_P); 68 | return res; 69 | } 70 | 71 | 72 | #ifdef CUDA 73 | // Code based on the work from Supranational, with special thanks to Niall Emmart: 74 | // 75 | // We would like to acknowledge Niall Emmart at Nvidia for his significant 76 | // contribution of concepts and code for generating efficient SASS on 77 | // Nvidia GPUs. The following papers may be of interest: 78 | // Optimizing Modular Multiplication for NVIDIA's Maxwell GPUs 79 | // https://ieeexplore.ieee.org/document/7563271 80 | // 81 | // Faster modular exponentiation using double precision floating point 82 | // arithmetic on the GPU 83 | // https://ieeexplore.ieee.org/document/8464792 84 | 85 | DEVICE void FIELD_reduce(uint32_t accLow[FIELD_LIMBS], uint32_t np0, uint32_t fq[FIELD_LIMBS]) { 86 | // accLow is an IN and OUT vector 87 | // count must be even 88 | const uint32_t count = FIELD_LIMBS; 89 | uint32_t accHigh[FIELD_LIMBS]; 90 | uint32_t bucket=0, lowCarry=0, highCarry=0, q; 91 | int32_t i, j; 92 | 93 | #pragma unroll 94 | for(i=0;i= xLimbs 168 | DEVICE inline 169 | void FIELD_mult_v1(uint32_t *x, uint32_t *y, uint32_t *xy) { 170 | const uint32_t xLimbs = FIELD_LIMBS; 171 | const uint32_t yLimbs = FIELD_LIMBS; 172 | const uint32_t xyLimbs = FIELD_LIMBS * 2; 173 | uint32_t temp[FIELD_LIMBS * 2]; 174 | uint32_t carry = 0; 175 | 176 | #pragma unroll 177 | for (int32_t i = 0; i < xyLimbs; i++) { 178 | temp[i] = 0; 179 | } 180 | 181 | #pragma unroll 182 | for (int32_t i = 0; i < xLimbs; i++) { 183 | chain_t chain1; 184 | chain_init(&chain1); 185 | #pragma unroll 186 | for (int32_t j = 0; j < yLimbs; j++) { 187 | if ((i + j) % 2 == 1) { 188 | temp[i + j - 1] = chain_madlo(&chain1, x[i], y[j], temp[i + j - 1]); 189 | temp[i + j] = chain_madhi(&chain1, x[i], y[j], temp[i + j]); 190 | } 191 | } 192 | if (i % 2 == 1) { 193 | temp[i + yLimbs - 1] = chain_add(&chain1, 0, 0); 194 | } 195 | } 196 | 197 | #pragma unroll 198 | for (int32_t i = xyLimbs - 1; i > 0; i--) { 199 | temp[i] = temp[i - 1]; 200 | } 201 | temp[0] = 0; 202 | 203 | #pragma unroll 204 | for (int32_t i = 0; i < xLimbs; i++) { 205 | chain_t chain2; 206 | chain_init(&chain2); 207 | 208 | #pragma unroll 209 | for (int32_t j = 0; j < yLimbs; j++) { 210 | if ((i + j) % 2 == 0) { 211 | temp[i + j] = chain_madlo(&chain2, x[i], y[j], temp[i + j]); 212 | temp[i + j + 1] = chain_madhi(&chain2, x[i], y[j], temp[i + j + 1]); 213 | } 214 | } 215 | if ((i + yLimbs) % 2 == 0 && i != yLimbs - 1) { 216 | temp[i + yLimbs] = chain_add(&chain2, temp[i + yLimbs], carry); 217 | temp[i + yLimbs + 1] = chain_add(&chain2, temp[i + yLimbs + 1], 0); 218 | carry = chain_add(&chain2, 0, 0); 219 | } 220 | if ((i + yLimbs) % 2 == 1 && i != yLimbs - 1) { 221 | carry = chain_add(&chain2, carry, 0); 222 | } 223 | } 224 | 225 | #pragma unroll 226 | for(int32_t i = 0; i < xyLimbs; i++) { 227 | xy[i] = temp[i]; 228 | } 229 | } 230 | 231 | DEVICE FIELD FIELD_mul_nvidia(FIELD a, FIELD b) { 232 | // Perform full multiply 233 | limb ab[2 * FIELD_LIMBS]; 234 | FIELD_mult_v1(a.val, b.val, ab); 235 | 236 | uint32_t io[FIELD_LIMBS]; 237 | #pragma unroll 238 | for(int 
i=0;i= 1; i--) 321 | a.val[i] = (a.val[i] << 1) | (a.val[i - 1] >> (FIELD_LIMB_BITS - 1)); 322 | a.val[0] <<= 1; 323 | if(FIELD_gte(a, FIELD_P)) a = FIELD_sub_(a, FIELD_P); 324 | return a; 325 | } 326 | 327 | // Modular exponentiation (Exponentiation by Squaring) 328 | // https://en.wikipedia.org/wiki/Exponentiation_by_squaring 329 | DEVICE FIELD FIELD_pow(FIELD base, uint exponent) { 330 | FIELD res = FIELD_ONE; 331 | while(exponent > 0) { 332 | if (exponent & 1) 333 | res = FIELD_mul(res, base); 334 | exponent = exponent >> 1; 335 | base = FIELD_sqr(base); 336 | } 337 | return res; 338 | } 339 | 340 | 341 | // Store squares of the base in a lookup table for faster evaluation. 342 | DEVICE FIELD FIELD_pow_lookup(GLOBAL FIELD *bases, uint exponent) { 343 | FIELD res = FIELD_ONE; 344 | uint i = 0; 345 | while(exponent > 0) { 346 | if (exponent & 1) 347 | res = FIELD_mul(res, bases[i]); 348 | exponent = exponent >> 1; 349 | i++; 350 | } 351 | return res; 352 | } 353 | 354 | DEVICE FIELD FIELD_mont(FIELD a) { 355 | return FIELD_mul(a, FIELD_R2); 356 | } 357 | 358 | DEVICE FIELD FIELD_unmont(FIELD a) { 359 | FIELD one = FIELD_ZERO; 360 | one.val[0] = 1; 361 | return FIELD_mul(a, one); 362 | } 363 | 364 | // Get `i`th bit (From most significant digit) of the field. 365 | DEVICE bool FIELD_get_bit(FIELD l, uint i) { 366 | return (l.val[FIELD_LIMBS - 1 - i / FIELD_LIMB_BITS] >> (FIELD_LIMB_BITS - 1 - (i % FIELD_LIMB_BITS))) & 1; 367 | } 368 | 369 | // Get `window` consecutive bits, (Starting from `skip`th bit) from the field. 370 | DEVICE uint FIELD_get_bits(FIELD l, uint skip, uint window) { 371 | uint ret = 0; 372 | for(uint i = 0; i < window; i++) { 373 | ret <<= 1; 374 | ret |= FIELD_get_bit(l, skip + i); 375 | } 376 | return ret; 377 | } 378 | -------------------------------------------------------------------------------- /ec-gpu-gen/src/cl/field2.cl: -------------------------------------------------------------------------------- 1 | // Fp2 Extension Field where u^2 + 1 = 0 2 | 3 | #define FIELD2_LIMB_BITS FIELD_LIMB_BITS 4 | #define FIELD2_ZERO ((FIELD2){FIELD_ZERO, FIELD_ZERO}) 5 | #define FIELD2_ONE ((FIELD2){FIELD_ONE, FIELD_ZERO}) 6 | 7 | typedef struct { 8 | FIELD c0; 9 | FIELD c1; 10 | } FIELD2; // Represents: c0 + u * c1 11 | 12 | DEVICE bool FIELD2_eq(FIELD2 a, FIELD2 b) { 13 | return FIELD_eq(a.c0, b.c0) && FIELD_eq(a.c1, b.c1); 14 | } 15 | DEVICE FIELD2 FIELD2_sub(FIELD2 a, FIELD2 b) { 16 | a.c0 = FIELD_sub(a.c0, b.c0); 17 | a.c1 = FIELD_sub(a.c1, b.c1); 18 | return a; 19 | } 20 | DEVICE FIELD2 FIELD2_add(FIELD2 a, FIELD2 b) { 21 | a.c0 = FIELD_add(a.c0, b.c0); 22 | a.c1 = FIELD_add(a.c1, b.c1); 23 | return a; 24 | } 25 | DEVICE FIELD2 FIELD2_double(FIELD2 a) { 26 | a.c0 = FIELD_double(a.c0); 27 | a.c1 = FIELD_double(a.c1); 28 | return a; 29 | } 30 | 31 | /* 32 | * (a_0 + u * a_1)(b_0 + u * b_1) = a_0 * b_0 - a_1 * b_1 + u * (a_0 * b_1 + a_1 * b_0) 33 | * Therefore: 34 | * c_0 = a_0 * b_0 - a_1 * b_1 35 | * c_1 = (a_0 * b_1 + a_1 * b_0) = (a_0 + a_1) * (b_0 + b_1) - a_0 * b_0 - a_1 * b_1 36 | */ 37 | DEVICE FIELD2 FIELD2_mul(FIELD2 a, FIELD2 b) { 38 | const FIELD aa = FIELD_mul(a.c0, b.c0); 39 | const FIELD bb = FIELD_mul(a.c1, b.c1); 40 | const FIELD o = FIELD_add(b.c0, b.c1); 41 | a.c1 = FIELD_add(a.c1, a.c0); 42 | a.c1 = FIELD_mul(a.c1, o); 43 | a.c1 = FIELD_sub(a.c1, aa); 44 | a.c1 = FIELD_sub(a.c1, bb); 45 | a.c0 = FIELD_sub(aa, bb); 46 | return a; 47 | } 48 | 49 | /* 50 | * (a_0 + u * a_1)(a_0 + u * a_1) = a_0 ^ 2 - a_1 ^ 2 + u * 2 * a_0 * a_1 51 | * Therefore: 52 | * c_0 
= (a_0 * a_0 - a_1 * a_1) = (a_0 + a_1)(a_0 - a_1)
53 |  * c_1 = 2 * a_0 * a_1
54 |  */
55 | DEVICE FIELD2 FIELD2_sqr(FIELD2 a) {
56 |   const FIELD ab = FIELD_mul(a.c0, a.c1);
57 |   const FIELD c0c1 = FIELD_add(a.c0, a.c1);
58 |   a.c0 = FIELD_mul(FIELD_sub(a.c0, a.c1), c0c1);
59 |   a.c1 = FIELD_double(ab);
60 |   return a;
61 | }
62 | 
--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/multiexp.cl:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Same multiexp algorithm used in Bellman, with some modifications.
 3 |  * https://github.com/zkcrypto/bellman/blob/10c5010fd9c2ca69442dc9775ea271e286e776d8/src/multiexp.rs#L174
 4 |  * The CPU version of multiexp parallelism is done by dividing the exponent
 5 |  * values into smaller windows, and then applying a sequence of rounds to each
 6 |  * window. The GPU kernel not only assigns a thread to each window but also
 7 |  * divides the bases into several groups, which greatly increases the number of
 8 |  * threads running in parallel for calculating a multiexp instance.
 9 |  */
10 | 
11 | KERNEL void POINT_multiexp(
12 |     GLOBAL POINT_affine *bases,
13 |     GLOBAL POINT_jacobian *buckets,
14 |     GLOBAL POINT_jacobian *results,
15 |     GLOBAL EXPONENT *exps,
16 |     uint n,
17 |     uint num_groups,
18 |     uint num_windows,
19 |     uint window_size) {
20 | 
21 |   // We have `num_windows` * `num_groups` threads per multiexp.
22 |   const uint gid = GET_GLOBAL_ID();
23 |   if(gid >= num_windows * num_groups) return;
24 | 
25 |   // We have (2^window_size - 1) buckets.
26 |   const uint bucket_len = ((1 << window_size) - 1);
27 | 
28 |   // Each thread has its own set of buckets in global memory.
29 |   buckets += bucket_len * gid;
30 | 
31 |   const POINT_jacobian local_zero = POINT_ZERO;
32 |   for(uint i = 0; i < bucket_len; i++) buckets[i] = local_zero;
33 | 
34 |   // Num of elements in each group. Round the number up (ceil).
35 |   const uint len = (n + num_groups - 1) / num_groups;
36 | 
37 |   // This thread runs the multiexp algorithm on elements from `nstart` to `nend`
38 |   // on the window [`bits`, `bits` + `w`)
39 |   const uint nstart = len * (gid / num_windows);
40 |   const uint nend = min(nstart + len, n);
41 |   const uint bits = (gid % num_windows) * window_size;
42 |   const ushort w = min((ushort)window_size, (ushort)(EXPONENT_BITS - bits));
43 | 
44 |   POINT_jacobian res = POINT_ZERO;
45 |   for(uint i = nstart; i < nend; i++) {
46 |     uint ind = EXPONENT_get_bits(exps[i], bits, w);
47 | 
48 |     #if defined(OPENCL_NVIDIA) || defined(CUDA)
49 |       // O_o, weird optimization, having a single special case makes it
50 |       // tremendously faster!
51 |       // 511 is chosen because it's half of the maximum bucket len, but
52 |       // any other number works... Bigger indices seem to be better...
53 |       if(ind == 511) buckets[510] = POINT_add_mixed(buckets[510], bases[i]);
54 |       else if(ind--) buckets[ind] = POINT_add_mixed(buckets[ind], bases[i]);
55 |     #else
56 |       if(ind--) buckets[ind] = POINT_add_mixed(buckets[ind], bases[i]);
57 |     #endif
58 |   }
59 | 
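  // At this point bucket `j` holds the sum of all bases in this thread's range
  // whose window value is `j + 1`; windows with value 0 were skipped entirely by
  // the `if(ind--)` check above, which is why only 2^window_size - 1 buckets
  // are needed.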
60 |   // Summation by parts
61 |   // e.g. 3a + 2b + 1c = a +
62 |   //                    (a) + b +
63 |   //                    ((a) + b) + c
64 |   POINT_jacobian acc = POINT_ZERO;
65 |   for(int j = bucket_len - 1; j >= 0; j--) {
66 |     acc = POINT_add(acc, buckets[j]);
67 |     res = POINT_add(res, acc);
68 |   }
69 | 
70 |   results[gid] = res;
71 | }
72 | 
--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/test.cl:
--------------------------------------------------------------------------------
 1 | KERNEL void test_add(FIELD a, FIELD b, GLOBAL FIELD *result) {
 2 |   *result = FIELD_add(a, b);
 3 | }
 4 | 
 5 | KERNEL void test_mul(FIELD a, FIELD b, GLOBAL FIELD *result) {
 6 |   *result = FIELD_mul(a, b);
 7 | }
 8 | 
 9 | KERNEL void test_sub(FIELD a, FIELD b, GLOBAL FIELD *result) {
10 |   *result = FIELD_sub(a, b);
11 | }
12 | 
13 | KERNEL void test_pow(FIELD a, uint b, GLOBAL FIELD *result) {
14 |   *result = FIELD_pow(a, b);
15 | }
16 | 
17 | KERNEL void test_mont(FIELD a, GLOBAL FIELD *result) {
18 |   *result = FIELD_mont(a);
19 | }
20 | 
21 | KERNEL void test_unmont(FIELD a, GLOBAL FIELD *result) {
22 |   *result = FIELD_unmont(a);
23 | }
24 | 
25 | KERNEL void test_sqr(FIELD a, GLOBAL FIELD *result) {
26 |   *result = FIELD_sqr(a);
27 | }
28 | 
29 | KERNEL void test_double(FIELD a, GLOBAL FIELD *result) {
30 |   *result = FIELD_double(a);
31 | }
32 | 
--------------------------------------------------------------------------------
/ec-gpu-gen/src/error.rs:
--------------------------------------------------------------------------------
 1 | use std::io;
 2 | 
 3 | #[cfg(any(feature = "cuda", feature = "opencl"))]
 4 | use rust_gpu_tools::GPUError;
 5 | 
 6 | /// Errors of this library.
 7 | #[derive(thiserror::Error, Debug)]
 8 | pub enum EcError {
 9 |     /// A simple error that is described by a string.
10 |     #[error("EcError: {0}")]
11 |     Simple(&'static str),
12 | 
13 |     /// Error in case a GPU kernel execution was aborted.
14 |     #[cfg(any(feature = "cuda", feature = "opencl"))]
15 |     #[error("GPU call was aborted!")]
16 |     Aborted,
17 | 
18 |     /// An error that is bubbled up from the rust-gpu-tools library.
19 |     #[cfg(any(feature = "cuda", feature = "opencl"))]
20 |     #[error("GPU tools error: {0}")]
21 |     GpuTools(#[from] GPUError),
22 | 
23 |     /// IO error.
24 |     #[error("Encountered an I/O error: {0}")]
25 |     Io(#[from] io::Error),
26 | }
27 | 
28 | /// Result wrapper that is always using [`EcError`] as error.
29 | pub type EcResult<T> = std::result::Result<T, EcError>;
30 | 
--------------------------------------------------------------------------------
/ec-gpu-gen/src/fft.rs:
--------------------------------------------------------------------------------
 1 | use std::cmp;
 2 | use std::sync::{Arc, RwLock};
 3 | 
 4 | use ec_gpu::GpuName;
 5 | use ff::Field;
 6 | use log::{error, info};
 7 | use rust_gpu_tools::{program_closures, LocalBuffer, Program};
 8 | 
 9 | use crate::error::{EcError, EcResult};
10 | use crate::threadpool::THREAD_POOL;
11 | 
12 | const LOG2_MAX_ELEMENTS: usize = 32; // At most 2^32 elements are supported.
13 | const MAX_LOG2_RADIX: u32 = 8; // Radix256
14 | const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 7; // 128
15 | 
16 | /// FFT kernel for a single GPU.
17 | pub struct SingleFftKernel<'a, F>
18 | where
19 |     F: Field + GpuName,
20 | {
21 |     program: Program,
22 |     /// An optional function which will be called at places where it is possible to abort the FFT
23 |     /// calculations. If it returns true, the calculation will be aborted with an
24 |     /// [`EcError::Aborted`].
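    /// The abort check happens between FFT rounds, right before each kernel launch.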
25 | maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>, 26 | _phantom: std::marker::PhantomData, 27 | } 28 | 29 | impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> { 30 | /// Create a new FFT instance for the given device. 31 | /// 32 | /// The `maybe_abort` function is called when it is possible to abort the computation, without 33 | /// leaving the GPU in a weird state. If that function returns `true`, execution is aborted. 34 | pub fn create( 35 | program: Program, 36 | maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>, 37 | ) -> EcResult { 38 | Ok(SingleFftKernel { 39 | program, 40 | maybe_abort, 41 | _phantom: Default::default(), 42 | }) 43 | } 44 | 45 | /// Performs FFT on `input` 46 | /// * `omega` - Special value `omega` is used for FFT over finite-fields 47 | /// * `log_n` - Specifies log2 of number of elements 48 | pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> { 49 | let closures = program_closures!(|program, input: &mut [F]| -> EcResult<()> { 50 | let n = 1 << log_n; 51 | // All usages are safe as the buffers are initialized from either the host or the GPU 52 | // before they are read. 53 | let mut src_buffer = unsafe { program.create_buffer::(n)? }; 54 | let mut dst_buffer = unsafe { program.create_buffer::(n)? }; 55 | // The precalculated values pq` and `omegas` are valid for radix degrees up to `max_deg` 56 | let max_deg = cmp::min(MAX_LOG2_RADIX, log_n); 57 | 58 | // Precalculate: 59 | // [omega^(0/(2^(deg-1))), omega^(1/(2^(deg-1))), ..., omega^((2^(deg-1)-1)/(2^(deg-1)))] 60 | let mut pq = vec![F::ZERO; 1 << max_deg >> 1]; 61 | let twiddle = omega.pow_vartime([(n >> max_deg) as u64]); 62 | pq[0] = F::ONE; 63 | if max_deg > 1 { 64 | pq[1] = twiddle; 65 | for i in 2..(1 << max_deg >> 1) { 66 | pq[i] = pq[i - 1]; 67 | pq[i].mul_assign(&twiddle); 68 | } 69 | } 70 | let pq_buffer = program.create_buffer_from_slice(&pq)?; 71 | 72 | // Precalculate [omega, omega^2, omega^4, omega^8, ..., omega^(2^31)] 73 | let mut omegas = vec![F::ZERO; 32]; 74 | omegas[0] = *omega; 75 | for i in 1..LOG2_MAX_ELEMENTS { 76 | omegas[i] = omegas[i - 1].pow_vartime([2u64]); 77 | } 78 | let omegas_buffer = program.create_buffer_from_slice(&omegas)?; 79 | 80 | program.write_from_buffer(&mut src_buffer, &*input)?; 81 | // Specifies log2 of `p`, (http://www.bealto.com/gpu-fft_group-1.html) 82 | let mut log_p = 0u32; 83 | // Each iteration performs a FFT round 84 | while log_p < log_n { 85 | if let Some(maybe_abort) = &self.maybe_abort { 86 | if maybe_abort() { 87 | return Err(EcError::Aborted); 88 | } 89 | } 90 | 91 | // 1=>radix2, 2=>radix4, 3=>radix8, ... 
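                // (Illustrative walk-through, assuming log_n = 10 and MAX_LOG2_RADIX = 8:
                // the first pass runs with deg = 8, raising log_p from 0 to 8; the second
                // runs with deg = 2, reaching log_p = 10 and ending the loop.)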
92 | let deg = cmp::min(max_deg, log_n - log_p); 93 | 94 | let n = 1u32 << log_n; 95 | let local_work_size = 1 << cmp::min(deg - 1, MAX_LOG2_LOCAL_WORK_SIZE); 96 | let global_work_size = n >> deg; 97 | let kernel_name = format!("{}_radix_fft", F::name()); 98 | let kernel = program.create_kernel( 99 | &kernel_name, 100 | global_work_size as usize, 101 | local_work_size as usize, 102 | )?; 103 | kernel 104 | .arg(&src_buffer) 105 | .arg(&dst_buffer) 106 | .arg(&pq_buffer) 107 | .arg(&omegas_buffer) 108 | .arg(&LocalBuffer::::new(1 << deg)) 109 | .arg(&n) 110 | .arg(&log_p) 111 | .arg(°) 112 | .arg(&max_deg) 113 | .run()?; 114 | 115 | log_p += deg; 116 | std::mem::swap(&mut src_buffer, &mut dst_buffer); 117 | } 118 | 119 | program.read_into_buffer(&src_buffer, input)?; 120 | 121 | Ok(()) 122 | }); 123 | 124 | self.program.run(closures, input) 125 | } 126 | } 127 | 128 | /// One FFT kernel for each GPU available. 129 | pub struct FftKernel<'a, F> 130 | where 131 | F: Field + GpuName, 132 | { 133 | kernels: Vec>, 134 | } 135 | 136 | impl<'a, F> FftKernel<'a, F> 137 | where 138 | F: Field + GpuName, 139 | { 140 | /// Create new kernels, one for each given device. 141 | pub fn create(programs: Vec) -> EcResult { 142 | Self::create_optional_abort(programs, None) 143 | } 144 | 145 | /// Create new kernels, one for each given device, with early abort hook. 146 | /// 147 | /// The `maybe_abort` function is called when it is possible to abort the computation, without 148 | /// leaving the GPU in a weird state. If that function returns `true`, execution is aborted. 149 | pub fn create_with_abort( 150 | programs: Vec, 151 | maybe_abort: &'a (dyn Fn() -> bool + Send + Sync), 152 | ) -> EcResult { 153 | Self::create_optional_abort(programs, Some(maybe_abort)) 154 | } 155 | 156 | fn create_optional_abort( 157 | programs: Vec, 158 | maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>, 159 | ) -> EcResult { 160 | let kernels: Vec<_> = programs 161 | .into_iter() 162 | .filter_map(|program| { 163 | let device_name = program.device_name().to_string(); 164 | let kernel = SingleFftKernel::::create(program, maybe_abort); 165 | if let Err(ref e) = kernel { 166 | error!( 167 | "Cannot initialize kernel for device '{}'! Error: {}", 168 | device_name, e 169 | ); 170 | } 171 | kernel.ok() 172 | }) 173 | .collect(); 174 | 175 | if kernels.is_empty() { 176 | return Err(EcError::Simple("No working GPUs found!")); 177 | } 178 | info!("FFT: {} working device(s) selected. ", kernels.len()); 179 | for (i, k) in kernels.iter().enumerate() { 180 | info!("FFT: Device {}: {}", i, k.program.device_name(),); 181 | } 182 | 183 | Ok(Self { kernels }) 184 | } 185 | 186 | /// Performs FFT on `input` 187 | /// * `omega` - Special value `omega` is used for FFT over finite-fields 188 | /// * `log_n` - Specifies log2 of number of elements 189 | /// 190 | /// Uses the first available GPU. 191 | pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> { 192 | self.kernels[0].radix_fft(input, omega, log_n) 193 | } 194 | 195 | /// Performs FFT on `inputs` 196 | /// * `omega` - Special value `omega` is used for FFT over finite-fields 197 | /// * `log_n` - Specifies log2 of number of elements 198 | /// 199 | /// Uses all available GPUs to distribute the work. 
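    /// A hedged usage sketch (assumes `kern: FftKernel<Scalar>` and that the caller
    /// prepared one matching `omega`/`log_n` pair per input):
    ///
    /// ```no_compile
    /// kern.radix_fft_many(&mut [&mut a, &mut b], &[omega_a, omega_b], &[log_a, log_b])?;
    /// ```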
200 | pub fn radix_fft_many( 201 | &mut self, 202 | inputs: &mut [&mut [F]], 203 | omegas: &[F], 204 | log_ns: &[u32], 205 | ) -> EcResult<()> { 206 | let n = inputs.len(); 207 | let num_devices = self.kernels.len(); 208 | let chunk_size = ((n as f64) / (num_devices as f64)).ceil() as usize; 209 | 210 | let result = Arc::new(RwLock::new(Ok(()))); 211 | 212 | THREAD_POOL.scoped(|s| { 213 | for (((inputs, omegas), log_ns), kern) in inputs 214 | .chunks_mut(chunk_size) 215 | .zip(omegas.chunks(chunk_size)) 216 | .zip(log_ns.chunks(chunk_size)) 217 | .zip(self.kernels.iter_mut()) 218 | { 219 | let result = result.clone(); 220 | s.execute(move || { 221 | for ((input, omega), log_n) in 222 | inputs.iter_mut().zip(omegas.iter()).zip(log_ns.iter()) 223 | { 224 | if result.read().unwrap().is_err() { 225 | break; 226 | } 227 | 228 | if let Err(err) = kern.radix_fft(input, omega, *log_n) { 229 | *result.write().unwrap() = Err(err); 230 | break; 231 | } 232 | } 233 | }); 234 | } 235 | }); 236 | 237 | Arc::try_unwrap(result).unwrap().into_inner().unwrap() 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /ec-gpu-gen/src/fft_cpu.rs: -------------------------------------------------------------------------------- 1 | use ff::PrimeField; 2 | 3 | use crate::threadpool::Worker; 4 | 5 | /// Calculate the Fast Fourier Transform on the CPU (single-threaded). 6 | /// 7 | /// The input `a` is mutated and contains the result when this function returns. The length of the 8 | /// input vector must be `2^log_n`. 9 | #[allow(clippy::many_single_char_names)] 10 | pub fn serial_fft(a: &mut [F], omega: &F, log_n: u32) { 11 | fn bitreverse(mut n: u32, l: u32) -> u32 { 12 | let mut r = 0; 13 | for _ in 0..l { 14 | r = (r << 1) | (n & 1); 15 | n >>= 1; 16 | } 17 | r 18 | } 19 | 20 | let n = a.len() as u32; 21 | assert_eq!(n, 1 << log_n); 22 | 23 | for k in 0..n { 24 | let rk = bitreverse(k, log_n); 25 | if k < rk { 26 | a.swap(rk as usize, k as usize); 27 | } 28 | } 29 | 30 | let mut m = 1; 31 | for _ in 0..log_n { 32 | let w_m = omega.pow_vartime([u64::from(n / (2 * m))]); 33 | 34 | let mut k = 0; 35 | while k < n { 36 | let mut w = F::ONE; 37 | for j in 0..m { 38 | let mut t = a[(k + j + m) as usize]; 39 | t *= w; 40 | let mut tmp = a[(k + j) as usize]; 41 | tmp -= t; 42 | a[(k + j + m) as usize] = tmp; 43 | a[(k + j) as usize] += t; 44 | w *= w_m; 45 | } 46 | 47 | k += 2 * m; 48 | } 49 | 50 | m *= 2; 51 | } 52 | } 53 | 54 | /// Calculate the Fast Fourier Transform on the CPU (multithreaded). 55 | /// 56 | /// The result is is written to the input `a`. 57 | /// The number of threads used will be `2^log_threads`. 58 | /// There must be more items to process than threads. 
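/// A minimal sketch, assuming `coeffs` holds `1 << 10` field elements and `omega`
/// is a matching `2^10`-th root of unity, run on `2^2` threads:
///
/// ```no_compile
/// parallel_fft::<Scalar>(&mut coeffs, &Worker::new(), &omega, 10, 2);
/// ```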
59 | pub fn parallel_fft( 60 | a: &mut [F], 61 | worker: &Worker, 62 | omega: &F, 63 | log_n: u32, 64 | log_threads: u32, 65 | ) { 66 | assert!(log_n >= log_threads); 67 | 68 | let num_threads = 1 << log_threads; 69 | let log_new_n = log_n - log_threads; 70 | let mut tmp = vec![vec![F::ZERO; 1 << log_new_n]; num_threads]; 71 | let new_omega = omega.pow_vartime([num_threads as u64]); 72 | 73 | worker.scope(0, |scope, _| { 74 | let a = &*a; 75 | 76 | for (j, tmp) in tmp.iter_mut().enumerate() { 77 | scope.execute(move || { 78 | // Shuffle into a sub-FFT 79 | let omega_j = omega.pow_vartime([j as u64]); 80 | let omega_step = omega.pow_vartime([(j as u64) << log_new_n]); 81 | 82 | let mut elt = F::ONE; 83 | for (i, tmp) in tmp.iter_mut().enumerate() { 84 | for s in 0..num_threads { 85 | let idx = (i + (s << log_new_n)) % (1 << log_n); 86 | let mut t = a[idx]; 87 | t *= elt; 88 | *tmp += t; 89 | elt *= omega_step; 90 | } 91 | elt *= omega_j; 92 | } 93 | 94 | // Perform sub-FFT 95 | serial_fft::(tmp, &new_omega, log_new_n); 96 | }); 97 | } 98 | }); 99 | 100 | // TODO: does this hurt or help? 101 | worker.scope(a.len(), |scope, chunk| { 102 | let tmp = &tmp; 103 | 104 | for (idx, a) in a.chunks_mut(chunk).enumerate() { 105 | scope.execute(move || { 106 | let mut idx = idx * chunk; 107 | let mask = (1 << log_threads) - 1; 108 | for a in a { 109 | *a = tmp[idx & mask][idx >> log_threads]; 110 | idx += 1; 111 | } 112 | }); 113 | } 114 | }); 115 | } 116 | 117 | #[cfg(test)] 118 | mod tests { 119 | use super::*; 120 | 121 | use std::cmp::min; 122 | 123 | use blstrs::Scalar as Fr; 124 | use ff::PrimeField; 125 | use rand_core::RngCore; 126 | 127 | fn omega(num_coeffs: usize) -> F { 128 | // Compute omega, the 2^exp primitive root of unity 129 | let exp = (num_coeffs as f32).log2().floor() as u32; 130 | let mut omega = F::ROOT_OF_UNITY; 131 | for _ in exp..F::S { 132 | omega = omega.square(); 133 | } 134 | omega 135 | } 136 | 137 | #[test] 138 | fn parallel_fft_consistency() { 139 | fn test_consistency(rng: &mut R) { 140 | let worker = Worker::new(); 141 | 142 | for _ in 0..5 { 143 | for log_d in 0..10 { 144 | let d = 1 << log_d; 145 | 146 | let mut v1_coeffs = (0..d).map(|_| F::random(&mut *rng)).collect::>(); 147 | let mut v2_coeffs = v1_coeffs.clone(); 148 | let v1_omega = omega::(v1_coeffs.len()); 149 | let v2_omega = v1_omega; 150 | 151 | for log_threads in log_d..min(log_d + 1, 3) { 152 | parallel_fft::(&mut v1_coeffs, &worker, &v1_omega, log_d, log_threads); 153 | serial_fft::(&mut v2_coeffs, &v2_omega, log_d); 154 | 155 | assert!(v1_coeffs == v2_coeffs); 156 | } 157 | } 158 | } 159 | } 160 | 161 | let rng = &mut rand::thread_rng(); 162 | 163 | test_consistency::(rng); 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /ec-gpu-gen/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(missing_docs)] 2 | //! CUDA/OpenCL code generator for finite-field arithmetic over prime fields and elliptic curve 3 | //! arithmetic constructed with Rust. 4 | //! 5 | //! There is also support for Fast Fourier Transform and Multiexponentiation. 6 | //! 7 | //! This crate usually creates GPU kernels at compile-time. CUDA generates a [fatbin], which OpenCL only generates the source code, which is then compiled at run-time. 8 | //! 9 | //! In order to make things easier to use, there are helper functions available. 
You would put some code into `build.rs` that generates the kernels, and some code into your library that consumes those generated kernels. The kernels will be directly embedded into your program/library. If something goes wrong, you will get an error at compile-time.
10 | //!
11 | //! In this example we will make use of the FFT functionality. Add to your `build.rs`:
12 | //!
13 | //! ```no_run
14 | //! use blstrs::Scalar;
15 | //! use ec_gpu_gen::SourceBuilder;
16 | //!
17 | //! let source_builder = SourceBuilder::new().add_fft::<Scalar>();
18 | //! ec_gpu_gen::generate(&source_builder);
19 | //! ```
20 | //!
21 | //! `ec_gpu_gen::generate()` takes care of the actual code generation/compilation. It will automatically create a CUDA and/or OpenCL kernel. It will define two environment variables, which are meant for internal use: `_EC_GPU_CUDA_KERNEL_FATBIN`, which points to the compiled CUDA kernel, and `_EC_GPU_OPENCL_KERNEL_SOURCE`, which points to the generated OpenCL source.
22 | //!
23 | //! Those variables are then picked up by the `ec_gpu_gen::program!()` macro, which generates a program for a given GPU device. Using FFT within your library would then look like this:
24 | //!
25 | //! ```no_compile
26 | //! use blstrs::Scalar;
27 | //! use ec_gpu_gen::{
28 | //!     fft::FftKernel, rust_gpu_tools::Device,
29 | //! };
30 | //!
31 | //! let devices = Device::all();
32 | //! let programs = devices
33 | //!     .iter()
34 | //!     .map(|device| ec_gpu_gen::program!(device))
35 | //!     .collect::<Result<Vec<_>, _>>()
36 | //!     .expect("Cannot create programs!");
37 | //!
38 | //! let mut kern = FftKernel::<Scalar>::create(programs).expect("Cannot initialize kernel!");
39 | //! kern.radix_fft_many(&mut [&mut coeffs], &[omega], &[log_d]).expect("GPU FFT failed!");
40 | //! ```
41 | //!
42 | //! Feature flags
43 | //! -------------
44 | //!
45 | //! CUDA and OpenCL are supported; each can be enabled with the `cuda` and `opencl` [feature flags].
46 | //!
47 | //! [fatbin]: https://en.wikipedia.org/wiki/Fat_binary#Heterogeneous_computing
48 | //! [feature flags]: https://doc.rust-lang.org/cargo/reference/manifest.html#the-features-section
49 | mod error;
50 | #[cfg(any(feature = "cuda", feature = "opencl"))]
51 | mod program;
52 | mod source;
53 | 
54 | /// Fast Fourier Transform on the GPU.
55 | #[cfg(any(feature = "cuda", feature = "opencl"))]
56 | pub mod fft;
57 | /// Fast Fourier Transform on the CPU.
58 | pub mod fft_cpu;
59 | /// Multiexponentiation on the GPU.
60 | #[cfg(any(feature = "cuda", feature = "opencl"))]
61 | pub mod multiexp;
62 | /// Multiexponentiation on the CPU.
63 | pub mod multiexp_cpu;
64 | /// Helpers for multithreaded code.
65 | pub mod threadpool;
66 | 
67 | /// Re-export rust-gpu-tools as things like [`rust_gpu_tools::Device`] might be needed.
68 | #[cfg(any(feature = "cuda", feature = "opencl"))] 69 | pub use rust_gpu_tools; 70 | 71 | pub use error::{EcError, EcResult}; 72 | pub use source::{generate, SourceBuilder}; 73 | -------------------------------------------------------------------------------- /ec-gpu-gen/src/multiexp.rs: -------------------------------------------------------------------------------- 1 | use std::ops::AddAssign; 2 | use std::sync::{Arc, RwLock}; 3 | 4 | use ec_gpu::GpuName; 5 | use ff::PrimeField; 6 | use group::{prime::PrimeCurveAffine, Group}; 7 | use log::{error, info}; 8 | use rust_gpu_tools::{program_closures, Device, Program}; 9 | use yastl::Scope; 10 | 11 | use crate::{ 12 | error::{EcError, EcResult}, 13 | threadpool::Worker, 14 | }; 15 | 16 | /// On the GPU, the exponents are split into windows, this is the maximum number of such windows. 17 | const MAX_WINDOW_SIZE: usize = 10; 18 | /// In CUDA this is the number of blocks per grid (grid size). 19 | const LOCAL_WORK_SIZE: usize = 128; 20 | /// Let 20% of GPU memory be free, this is an arbitrary value. 21 | const MEMORY_PADDING: f64 = 0.2f64; 22 | /// The Nvidia Ampere architecture is compute capability major version 8. 23 | const AMPERE: u32 = 8; 24 | 25 | /// Divide and ceil to the next value. 26 | const fn div_ceil(a: usize, b: usize) -> usize { 27 | if a % b == 0 { 28 | a / b 29 | } else { 30 | (a / b) + 1 31 | } 32 | } 33 | 34 | /// The number of units the work is split into. One unit will result in one CUDA thread. 35 | /// 36 | /// Based on empirical results, it turns out that on Nvidia devices with the Ampere architecture, 37 | /// it's faster to use two times the number of work units. 38 | const fn work_units(compute_units: u32, compute_capabilities: Option<(u32, u32)>) -> usize { 39 | match compute_capabilities { 40 | Some((AMPERE, _)) => LOCAL_WORK_SIZE * compute_units as usize * 2, 41 | _ => LOCAL_WORK_SIZE * compute_units as usize, 42 | } 43 | } 44 | 45 | /// Multiexp kernel for a single GPU. 46 | pub struct SingleMultiexpKernel<'a, G> 47 | where 48 | G: PrimeCurveAffine, 49 | { 50 | program: Program, 51 | /// The number of exponentiations the GPU can handle in a single execution of the kernel. 52 | n: usize, 53 | /// The number of units the work is split into. It will results in this amount of threads on 54 | /// the GPU. 55 | work_units: usize, 56 | /// An optional function which will be called at places where it is possible to abort the 57 | /// multiexp calculations. If it returns true, the calculation will be aborted with an 58 | /// [`EcError::Aborted`]. 59 | maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>, 60 | 61 | _phantom: std::marker::PhantomData, 62 | } 63 | 64 | /// Calculates the maximum number of terms that can be put onto the GPU memory. 65 | fn calc_chunk_size(mem: u64, work_units: usize) -> usize 66 | where 67 | G: PrimeCurveAffine, 68 | G::Scalar: PrimeField, 69 | { 70 | let aff_size = std::mem::size_of::(); 71 | let exp_size = exp_size::(); 72 | let proj_size = std::mem::size_of::(); 73 | 74 | // Leave `MEMORY_PADDING` percent of the memory free. 75 | let max_memory = ((mem as f64) * (1f64 - MEMORY_PADDING)) as usize; 76 | // The amount of memory (in bytes) of a single term. 77 | let term_size = aff_size + exp_size; 78 | // The number of buckets needed for one work unit 79 | let max_buckets_per_work_unit = 1 << MAX_WINDOW_SIZE; 80 | // The amount of memory (in bytes) we need for the intermediate steps (buckets). 
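    // (Illustrative arithmetic, not a measurement: with 128 * 80 = 10_240 work
    // units, 2^10 buckets each, and a 144-byte projective point such as blstrs
    // G1, the buckets alone occupy 10_240 * 1024 * 144 bytes, about 1.4 GiB.)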
81 | let buckets_size = work_units * max_buckets_per_work_unit * proj_size; 82 | // The amount of memory (in bytes) we need for the results. 83 | let results_size = work_units * proj_size; 84 | 85 | (max_memory - buckets_size - results_size) / term_size 86 | } 87 | 88 | /// The size of the exponent in bytes. 89 | /// 90 | /// It's the actual bytes size it needs in memory, not it's theoretical bit size. 91 | fn exp_size() -> usize { 92 | std::mem::size_of::() 93 | } 94 | 95 | impl<'a, G> SingleMultiexpKernel<'a, G> 96 | where 97 | G: PrimeCurveAffine + GpuName, 98 | { 99 | /// Create a new Multiexp kernel instance for a device. 100 | /// 101 | /// The `maybe_abort` function is called when it is possible to abort the computation, without 102 | /// leaving the GPU in a weird state. If that function returns `true`, execution is aborted. 103 | pub fn create( 104 | program: Program, 105 | device: &Device, 106 | maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>, 107 | ) -> EcResult { 108 | let mem = device.memory(); 109 | let compute_units = device.compute_units(); 110 | let compute_capability = device.compute_capability(); 111 | let work_units = work_units(compute_units, compute_capability); 112 | let chunk_size = calc_chunk_size::(mem, work_units); 113 | 114 | Ok(SingleMultiexpKernel { 115 | program, 116 | n: chunk_size, 117 | work_units, 118 | maybe_abort, 119 | _phantom: std::marker::PhantomData, 120 | }) 121 | } 122 | 123 | /// Run the actual multiexp computation on the GPU. 124 | /// 125 | /// The number of `bases` and `exponents` are determined by [`SingleMultiexpKernel`]`::n`, this 126 | /// means that it is guaranteed that this amount of calculations fit on the GPU this kernel is 127 | /// running on. 128 | pub fn multiexp( 129 | &self, 130 | bases: &[G], 131 | exponents: &[::Repr], 132 | ) -> EcResult { 133 | assert_eq!(bases.len(), exponents.len()); 134 | 135 | if let Some(maybe_abort) = &self.maybe_abort { 136 | if maybe_abort() { 137 | return Err(EcError::Aborted); 138 | } 139 | } 140 | let window_size = self.calc_window_size(bases.len()); 141 | // windows_size * num_windows needs to be >= 256 in order for the kernel to work correctly. 142 | let num_windows = div_ceil(256, window_size); 143 | let num_groups = self.work_units / num_windows; 144 | let bucket_len = 1 << window_size; 145 | 146 | // Each group will have `num_windows` threads and as there are `num_groups` groups, there will 147 | // be `num_groups` * `num_windows` threads in total. 148 | // Each thread will use `num_groups` * `num_windows` * `bucket_len` buckets. 149 | 150 | let closures = program_closures!(|program, _arg| -> EcResult> { 151 | let base_buffer = program.create_buffer_from_slice(bases)?; 152 | let exp_buffer = program.create_buffer_from_slice(exponents)?; 153 | 154 | // It is safe as the GPU will initialize that buffer 155 | let bucket_buffer = 156 | unsafe { program.create_buffer::(self.work_units * bucket_len)? }; 157 | // It is safe as the GPU will initialize that buffer 158 | let result_buffer = unsafe { program.create_buffer::(self.work_units)? }; 159 | 160 | // The global work size follows CUDA's definition and is the number of 161 | // `LOCAL_WORK_SIZE` sized thread groups. 
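            // (E.g., assuming num_windows * num_groups = 10_240 threads and
            // LOCAL_WORK_SIZE = 128, this launches div_ceil(10_240, 128) = 80
            // thread groups.)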
162 | let global_work_size = div_ceil(num_windows * num_groups, LOCAL_WORK_SIZE); 163 | 164 | let kernel_name = format!("{}_multiexp", G::name()); 165 | let kernel = program.create_kernel(&kernel_name, global_work_size, LOCAL_WORK_SIZE)?; 166 | 167 | kernel 168 | .arg(&base_buffer) 169 | .arg(&bucket_buffer) 170 | .arg(&result_buffer) 171 | .arg(&exp_buffer) 172 | .arg(&(bases.len() as u32)) 173 | .arg(&(num_groups as u32)) 174 | .arg(&(num_windows as u32)) 175 | .arg(&(window_size as u32)) 176 | .run()?; 177 | 178 | let mut results = vec![G::Curve::identity(); self.work_units]; 179 | program.read_into_buffer(&result_buffer, &mut results)?; 180 | 181 | Ok(results) 182 | }); 183 | 184 | let results = self.program.run(closures, ())?; 185 | 186 | // Using the algorithm below, we can calculate the final result by accumulating the results 187 | // of those `NUM_GROUPS` * `NUM_WINDOWS` threads. 188 | let mut acc = G::Curve::identity(); 189 | let mut bits = 0; 190 | let exp_bits = exp_size::() * 8; 191 | for i in 0..num_windows { 192 | let w = std::cmp::min(window_size, exp_bits - bits); 193 | for _ in 0..w { 194 | acc = acc.double(); 195 | } 196 | for g in 0..num_groups { 197 | acc.add_assign(&results[g * num_windows + i]); 198 | } 199 | bits += w; // Process the next window 200 | } 201 | 202 | Ok(acc) 203 | } 204 | 205 | /// Calculates the window size, based on the given number of terms. 206 | /// 207 | /// For best performance, the window size is reduced, so that maximum parallelism is possible. 208 | /// If you e.g. have put only a subset of the terms into the GPU memory, then a smaller window 209 | /// size leads to more windows, hence more units to work on, as we split the work into 210 | /// `num_windows * num_groups`. 211 | fn calc_window_size(&self, num_terms: usize) -> usize { 212 | // The window size was determined by running the `gpu_multiexp_consistency` test and 213 | // looking at the resulting numbers. 214 | let window_size = ((div_ceil(num_terms, self.work_units) as f64).log2() as usize) + 2; 215 | std::cmp::min(window_size, MAX_WINDOW_SIZE) 216 | } 217 | } 218 | 219 | /// A struct that contains several multiexp kernels for different devices. 220 | pub struct MultiexpKernel<'a, G> 221 | where 222 | G: PrimeCurveAffine, 223 | { 224 | kernels: Vec>, 225 | } 226 | 227 | impl<'a, G> MultiexpKernel<'a, G> 228 | where 229 | G: PrimeCurveAffine + GpuName, 230 | { 231 | /// Create new kernels, one for each given device. 232 | pub fn create(programs: Vec, devices: &[&Device]) -> EcResult { 233 | Self::create_optional_abort(programs, devices, None) 234 | } 235 | 236 | /// Create new kernels, one for each given device, with early abort hook. 237 | /// 238 | /// The `maybe_abort` function is called when it is possible to abort the computation, without 239 | /// leaving the GPU in a weird state. If that function returns `true`, execution is aborted. 
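    /// A hedged usage sketch (the `CANCEL` flag is illustrative, not part of this crate):
    ///
    /// ```no_compile
    /// use std::sync::atomic::{AtomicBool, Ordering};
    /// static CANCEL: AtomicBool = AtomicBool::new(false);
    /// let abort = || CANCEL.load(Ordering::Relaxed);
    /// let mut kern = MultiexpKernel::<G1Affine>::create_with_abort(programs, &devices, &abort)?;
    /// ```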
240 | pub fn create_with_abort( 241 | programs: Vec, 242 | devices: &[&Device], 243 | maybe_abort: &'a (dyn Fn() -> bool + Send + Sync), 244 | ) -> EcResult { 245 | Self::create_optional_abort(programs, devices, Some(maybe_abort)) 246 | } 247 | 248 | fn create_optional_abort( 249 | programs: Vec, 250 | devices: &[&Device], 251 | maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>, 252 | ) -> EcResult { 253 | let kernels: Vec<_> = programs 254 | .into_iter() 255 | .zip(devices.iter()) 256 | .filter_map(|(program, device)| { 257 | let device_name = program.device_name().to_string(); 258 | let kernel = SingleMultiexpKernel::create(program, device, maybe_abort); 259 | if let Err(ref e) = kernel { 260 | error!( 261 | "Cannot initialize kernel for device '{}'! Error: {}", 262 | device_name, e 263 | ); 264 | } 265 | kernel.ok() 266 | }) 267 | .collect(); 268 | 269 | if kernels.is_empty() { 270 | return Err(EcError::Simple("No working GPUs found!")); 271 | } 272 | info!("Multiexp: {} working device(s) selected.", kernels.len()); 273 | for (i, k) in kernels.iter().enumerate() { 274 | info!( 275 | "Multiexp: Device {}: {} (Chunk-size: {})", 276 | i, 277 | k.program.device_name(), 278 | k.n 279 | ); 280 | } 281 | Ok(MultiexpKernel { kernels }) 282 | } 283 | 284 | /// Calculate multiexp on all available GPUs. 285 | /// 286 | /// It needs to run within a [`yastl::Scope`]. This method usually isn't called directly, use 287 | /// [`MultiexpKernel::multiexp`] instead. 288 | pub fn parallel_multiexp<'s>( 289 | &'s mut self, 290 | scope: &Scope<'s>, 291 | bases: &'s [G], 292 | exps: &'s [::Repr], 293 | results: &'s mut [G::Curve], 294 | error: Arc>>, 295 | ) { 296 | let num_devices = self.kernels.len(); 297 | let num_exps = exps.len(); 298 | // The maximum number of exponentiations per device. 299 | let chunk_size = ((num_exps as f64) / (num_devices as f64)).ceil() as usize; 300 | 301 | for (((bases, exps), kern), result) in bases 302 | .chunks(chunk_size) 303 | .zip(exps.chunks(chunk_size)) 304 | // NOTE vmx 2021-11-17: This doesn't need to be a mutable iterator. But when it isn't 305 | // there will be errors that the OpenCL CommandQueue cannot be shared between threads 306 | // safely. 307 | .zip(self.kernels.iter_mut()) 308 | .zip(results.iter_mut()) 309 | { 310 | let error = error.clone(); 311 | scope.execute(move || { 312 | let mut acc = G::Curve::identity(); 313 | for (bases, exps) in bases.chunks(kern.n).zip(exps.chunks(kern.n)) { 314 | if error.read().unwrap().is_err() { 315 | break; 316 | } 317 | match kern.multiexp(bases, exps) { 318 | Ok(result) => acc.add_assign(&result), 319 | Err(e) => { 320 | *error.write().unwrap() = Err(e); 321 | break; 322 | } 323 | } 324 | } 325 | if error.read().unwrap().is_ok() { 326 | *result = acc; 327 | } 328 | }); 329 | } 330 | } 331 | 332 | /// Calculate multiexp. 333 | /// 334 | /// This is the main entry point. 
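    /// A hedged usage sketch (assumes the caller prepared `bases` and `exps` as
    /// `Arc`ed vectors; `skip` is 0 when no bases are skipped):
    ///
    /// ```no_compile
    /// let acc = kern.multiexp(&Worker::new(), bases.clone(), exps.clone(), 0)?;
    /// ```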
335 | pub fn multiexp( 336 | &mut self, 337 | pool: &Worker, 338 | bases_arc: Arc>, 339 | exps: Arc::Repr>>, 340 | skip: usize, 341 | ) -> EcResult { 342 | // Bases are skipped by `self.1` elements, when converted from (Arc>, usize) to Source 343 | // https://github.com/zkcrypto/bellman/blob/10c5010fd9c2ca69442dc9775ea271e286e776d8/src/multiexp.rs#L38 344 | let bases = &bases_arc[skip..(skip + exps.len())]; 345 | let exps = &exps[..]; 346 | 347 | let mut results = Vec::new(); 348 | let error = Arc::new(RwLock::new(Ok(()))); 349 | 350 | pool.scoped(|s| { 351 | results = vec![G::Curve::identity(); self.kernels.len()]; 352 | self.parallel_multiexp(s, bases, exps, &mut results, error.clone()); 353 | }); 354 | 355 | Arc::try_unwrap(error) 356 | .expect("only one ref left") 357 | .into_inner() 358 | .unwrap()?; 359 | 360 | let mut acc = G::Curve::identity(); 361 | for r in results { 362 | acc.add_assign(&r); 363 | } 364 | 365 | Ok(acc) 366 | } 367 | 368 | /// Returns the number of kernels (one per device). 369 | pub fn num_kernels(&self) -> usize { 370 | self.kernels.len() 371 | } 372 | } 373 | -------------------------------------------------------------------------------- /ec-gpu-gen/src/multiexp_cpu.rs: -------------------------------------------------------------------------------- 1 | #![allow(missing_docs)] 2 | use std::convert::TryInto; 3 | use std::io; 4 | use std::iter; 5 | use std::ops::AddAssign; 6 | use std::sync::Arc; 7 | 8 | use bitvec::prelude::{BitVec, Lsb0}; 9 | use ff::{Field, PrimeField}; 10 | use group::{prime::PrimeCurveAffine, Group}; 11 | use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; 12 | 13 | use crate::error::EcError; 14 | use crate::threadpool::{Waiter, Worker}; 15 | 16 | /// An object that builds a source of bases. 17 | pub trait SourceBuilder: Send + Sync + 'static + Clone { 18 | type Source: Source; 19 | 20 | #[allow(clippy::wrong_self_convention)] 21 | fn new(self) -> Self::Source; 22 | fn get(self) -> (Arc>, usize); 23 | } 24 | 25 | /// A source of bases, like an iterator. 26 | pub trait Source { 27 | /// Parses the element from the source. Fails if the point is at infinity. 28 | fn add_assign_mixed(&mut self, to: &mut ::Curve) -> Result<(), EcError>; 29 | 30 | /// Skips `amt` elements from the source, avoiding deserialization. 31 | fn skip(&mut self, amt: usize) -> Result<(), EcError>; 32 | } 33 | 34 | impl SourceBuilder for (Arc>, usize) { 35 | type Source = (Arc>, usize); 36 | 37 | fn new(self) -> (Arc>, usize) { 38 | (self.0.clone(), self.1) 39 | } 40 | 41 | fn get(self) -> (Arc>, usize) { 42 | (self.0.clone(), self.1) 43 | } 44 | } 45 | 46 | impl Source for (Arc>, usize) { 47 | fn add_assign_mixed(&mut self, to: &mut ::Curve) -> Result<(), EcError> { 48 | if self.0.len() <= self.1 { 49 | return Err(io::Error::new( 50 | io::ErrorKind::UnexpectedEof, 51 | "Expected more bases from source.", 52 | ) 53 | .into()); 54 | } 55 | 56 | if self.0[self.1].is_identity().into() { 57 | return Err(EcError::Simple( 58 | "Encountered an identity element in the CRS.", 59 | )); 60 | } 61 | 62 | to.add_assign(&self.0[self.1]); 63 | 64 | self.1 += 1; 65 | 66 | Ok(()) 67 | } 68 | 69 | fn skip(&mut self, amt: usize) -> Result<(), EcError> { 70 | if self.0.len() <= self.1 { 71 | return Err(io::Error::new( 72 | io::ErrorKind::UnexpectedEof, 73 | "Expected more bases from source.", 74 | ) 75 | .into()); 76 | } 77 | 78 | self.1 += amt; 79 | 80 | Ok(()) 81 | } 82 | } 83 | 84 | pub trait QueryDensity: Sized { 85 | /// Returns whether the base exists. 
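    /// (The iterator yields one `bool` per base: `true` marks a base that takes
    /// part in the multiexp. `FullDensity` below illustrates this by repeating
    /// `true` indefinitely.)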
86 | type Iter: Iterator; 87 | 88 | fn iter(self) -> Self::Iter; 89 | fn get_query_size(self) -> Option; 90 | fn generate_exps(self, exponents: Arc>) -> Arc>; 91 | } 92 | 93 | #[derive(Clone)] 94 | pub struct FullDensity; 95 | 96 | impl AsRef for FullDensity { 97 | fn as_ref(&self) -> &FullDensity { 98 | self 99 | } 100 | } 101 | 102 | impl QueryDensity for &FullDensity { 103 | type Iter = iter::Repeat; 104 | 105 | fn iter(self) -> Self::Iter { 106 | iter::repeat(true) 107 | } 108 | 109 | fn get_query_size(self) -> Option { 110 | None 111 | } 112 | 113 | fn generate_exps(self, exponents: Arc>) -> Arc> { 114 | exponents 115 | } 116 | } 117 | 118 | #[derive(Clone, PartialEq, Eq, Debug, Default)] 119 | pub struct DensityTracker { 120 | pub bv: BitVec, 121 | pub total_density: usize, 122 | } 123 | 124 | impl<'a> QueryDensity for &'a DensityTracker { 125 | type Iter = bitvec::slice::BitValIter<'a, usize, Lsb0>; 126 | 127 | fn iter(self) -> Self::Iter { 128 | self.bv.iter().by_vals() 129 | } 130 | 131 | fn get_query_size(self) -> Option { 132 | Some(self.bv.len()) 133 | } 134 | 135 | fn generate_exps(self, exponents: Arc>) -> Arc> { 136 | let exps: Vec<_> = exponents 137 | .iter() 138 | .zip(self.bv.iter()) 139 | .filter_map(|(&e, d)| if *d { Some(e) } else { None }) 140 | .collect(); 141 | 142 | Arc::new(exps) 143 | } 144 | } 145 | 146 | impl DensityTracker { 147 | pub fn new() -> DensityTracker { 148 | DensityTracker { 149 | bv: BitVec::new(), 150 | total_density: 0, 151 | } 152 | } 153 | 154 | pub fn add_element(&mut self) { 155 | self.bv.push(false); 156 | } 157 | 158 | pub fn inc(&mut self, idx: usize) { 159 | if !self.bv.get(idx).unwrap() { 160 | self.bv.set(idx, true); 161 | self.total_density += 1; 162 | } 163 | } 164 | 165 | pub fn get_total_density(&self) -> usize { 166 | self.total_density 167 | } 168 | 169 | /// Extend by concatenating `other`. If `is_input_density` is true, then we are tracking an input density, 170 | /// and other may contain a redundant input for the `One` element. Coalesce those as needed and track the result. 171 | pub fn extend(&mut self, other: &Self, is_input_density: bool) { 172 | if other.bv.is_empty() { 173 | // Nothing to do if other is empty. 174 | return; 175 | } 176 | 177 | if self.bv.is_empty() { 178 | // If self is empty, assume other's density. 179 | self.total_density = other.total_density; 180 | self.bv.resize(other.bv.len(), false); 181 | self.bv.copy_from_bitslice(&*other.bv); 182 | return; 183 | } 184 | 185 | if is_input_density { 186 | // Input densities need special handling to coalesce their first inputs. 187 | 188 | if other.bv[0] { 189 | // If other's first bit is set, 190 | if self.bv[0] { 191 | // And own first bit is set, then decrement total density so the final sum doesn't overcount. 192 | self.total_density -= 1; 193 | } else { 194 | // Otherwise, set own first bit. 195 | self.bv.set(0, true); 196 | } 197 | } 198 | // Now discard other's first bit, having accounted for it above, and extend self by remaining bits. 199 | self.bv.extend(other.bv.iter().skip(1)); 200 | } else { 201 | // Not an input density, just extend straightforwardly. 202 | self.bv.extend(other.bv.iter()); 203 | } 204 | 205 | // Since any needed adjustments to total densities have been made, just sum the totals and keep the sum. 206 | self.total_density += other.total_density; 207 | } 208 | } 209 | 210 | // Right shift the repr of a field element by `n` bits. 
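// (Worked example: shr(&mut [0x01, 0x02], 1) turns the little-endian value
// 0x0201 = 513 into 0x0100 = 256, i.e. the bytes [0x00, 0x01].)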
211 | fn shr(le_bytes: &mut [u8], mut n: u32) { 212 | if n >= 8 * le_bytes.len() as u32 { 213 | le_bytes.iter_mut().for_each(|byte| *byte = 0); 214 | return; 215 | } 216 | 217 | // Shift each full byte towards the least significant end. 218 | while n >= 8 { 219 | let mut replacement = 0; 220 | for byte in le_bytes.iter_mut().rev() { 221 | std::mem::swap(&mut replacement, byte); 222 | } 223 | n -= 8; 224 | } 225 | 226 | // Starting at the most significant byte, shift the byte's `n` least significant bits into the 227 | // `n` most significant bits of the next byte. 228 | if n > 0 { 229 | let mut shift_in = 0; 230 | for byte in le_bytes.iter_mut().rev() { 231 | // Copy the byte's `n` least significant bits. 232 | let shift_out = *byte << (8 - n); 233 | // Shift the byte by `n` bits; zeroing its `n` most significant bits. 234 | *byte >>= n; 235 | // Replace the `n` most significant bits with the bits shifted out of the previous byte. 236 | *byte |= shift_in; 237 | shift_in = shift_out; 238 | } 239 | } 240 | } 241 | 242 | fn multiexp_inner( 243 | bases: S, 244 | density_map: D, 245 | exponents: Arc::Repr>>, 246 | c: u32, 247 | ) -> Result<::Curve, EcError> 248 | where 249 | for<'a> &'a Q: QueryDensity, 250 | D: Send + Sync + 'static + Clone + AsRef, 251 | G: PrimeCurveAffine, 252 | S: SourceBuilder, 253 | { 254 | // Perform this region of the multiexp 255 | let this = move |bases: S, 256 | density_map: D, 257 | exponents: Arc::Repr>>, 258 | skip: u32| 259 | -> Result<_, EcError> { 260 | // Accumulate the result 261 | let mut acc = G::Curve::identity(); 262 | 263 | // Build a source for the bases 264 | let mut bases = bases.new(); 265 | 266 | // Create space for the buckets 267 | let mut buckets = vec![::Curve::identity(); (1 << c) - 1]; 268 | 269 | let zero = G::Scalar::ZERO.to_repr(); 270 | let one = G::Scalar::ONE.to_repr(); 271 | 272 | // only the first round uses this 273 | let handle_trivial = skip == 0; 274 | 275 | // Sort the bases into buckets 276 | for (&exp, density) in exponents.iter().zip(density_map.as_ref().iter()) { 277 | if density { 278 | if exp.as_ref() == zero.as_ref() { 279 | bases.skip(1)?; 280 | } else if exp.as_ref() == one.as_ref() { 281 | if handle_trivial { 282 | bases.add_assign_mixed(&mut acc)?; 283 | } else { 284 | bases.skip(1)?; 285 | } 286 | } else { 287 | let mut exp = exp; 288 | shr(exp.as_mut(), skip); 289 | let exp = u64::from_le_bytes(exp.as_ref()[..8].try_into().unwrap()) % (1 << c); 290 | 291 | if exp != 0 { 292 | bases.add_assign_mixed(&mut buckets[(exp - 1) as usize])?; 293 | } else { 294 | bases.skip(1)?; 295 | } 296 | } 297 | } 298 | } 299 | 300 | // Summation by parts 301 | // e.g. 3a + 2b + 1c = a + 302 | // (a) + b + 303 | // ((a) + b) + c 304 | let mut running_sum = G::Curve::identity(); 305 | for exp in buckets.into_iter().rev() { 306 | running_sum.add_assign(&exp); 307 | acc.add_assign(&running_sum); 308 | } 309 | 310 | Ok(acc) 311 | }; 312 | 313 | let parts = (0..::NUM_BITS) 314 | .into_par_iter() 315 | .step_by(c as usize) 316 | .map(|skip| this(bases.clone(), density_map.clone(), exponents.clone(), skip)) 317 | .collect::>>(); 318 | 319 | parts.into_iter().rev().try_fold( 320 | ::Curve::identity(), 321 | |mut acc, part| { 322 | for _ in 0..c { 323 | acc = acc.double(); 324 | } 325 | 326 | acc.add_assign(&part?); 327 | Ok(acc) 328 | }, 329 | ) 330 | } 331 | 332 | /// Perform multi-exponentiation. The caller is responsible for ensuring the 333 | /// query size is the same as the number of exponents. 
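/// A hedged usage sketch mirroring the test at the bottom of this file (`g` are
/// `Arc`ed affine bases, `v` the `Arc`ed scalar reprs):
///
/// ```no_compile
/// let acc = multiexp_cpu(&Worker::new(), (g.clone(), 0), FullDensity, v.clone())
///     .wait()?;
/// ```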
334 | pub fn multiexp_cpu<'b, Q, D, G, S>( 335 | pool: &Worker, 336 | bases: S, 337 | density_map: D, 338 | exponents: Arc::Repr>>, 339 | ) -> Waiter::Curve, EcError>> 340 | where 341 | for<'a> &'a Q: QueryDensity, 342 | D: Send + Sync + 'static + Clone + AsRef, 343 | G: PrimeCurveAffine, 344 | S: SourceBuilder, 345 | { 346 | let c = if exponents.len() < 32 { 347 | 3u32 348 | } else { 349 | (f64::from(exponents.len() as u32)).ln().ceil() as u32 350 | }; 351 | 352 | if let Some(query_size) = density_map.as_ref().get_query_size() { 353 | // If the density map has a known query size, it should not be 354 | // inconsistent with the number of exponents. 355 | assert!(query_size == exponents.len()); 356 | } 357 | 358 | pool.compute(move || multiexp_inner(bases, density_map, exponents, c)) 359 | } 360 | 361 | #[cfg(test)] 362 | mod tests { 363 | use super::*; 364 | 365 | use blstrs::Bls12; 366 | use group::Curve; 367 | use pairing::Engine; 368 | use rand::Rng; 369 | use rand_core::SeedableRng; 370 | use rand_xorshift::XorShiftRng; 371 | 372 | #[test] 373 | fn test_with_bls12() { 374 | fn naive_multiexp( 375 | bases: Arc>, 376 | exponents: &[G::Scalar], 377 | ) -> G::Curve { 378 | assert_eq!(bases.len(), exponents.len()); 379 | 380 | let mut acc = G::Curve::identity(); 381 | 382 | for (base, exp) in bases.iter().zip(exponents.iter()) { 383 | acc.add_assign(&base.mul(*exp)); 384 | } 385 | 386 | acc 387 | } 388 | 389 | const SAMPLES: usize = 1 << 14; 390 | 391 | let rng = &mut rand::thread_rng(); 392 | let v: Vec<::Fr> = (0..SAMPLES) 393 | .map(|_| ::Fr::random(&mut *rng)) 394 | .collect(); 395 | let g = Arc::new( 396 | (0..SAMPLES) 397 | .map(|_| ::G1::random(&mut *rng).to_affine()) 398 | .collect::>(), 399 | ); 400 | 401 | let now = std::time::Instant::now(); 402 | let naive = naive_multiexp(g.clone(), &v); 403 | println!("Naive: {}", now.elapsed().as_millis()); 404 | 405 | let now = std::time::Instant::now(); 406 | let pool = Worker::new(); 407 | 408 | let v = Arc::new(v.into_iter().map(|fr| fr.to_repr()).collect()); 409 | let fast = multiexp_cpu(&pool, (g, 0), FullDensity, v).wait().unwrap(); 410 | 411 | println!("Fast: {}", now.elapsed().as_millis()); 412 | 413 | assert_eq!(naive, fast); 414 | } 415 | 416 | #[test] 417 | fn test_extend_density_regular() { 418 | let mut rng = XorShiftRng::from_seed([ 419 | 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 420 | 0xbc, 0xe5, 421 | ]); 422 | 423 | for k in &[2, 4, 8] { 424 | for j in &[10, 20, 50] { 425 | let count: usize = k * j; 426 | 427 | let mut tracker_full = DensityTracker::new(); 428 | let mut partial_trackers: Vec = Vec::with_capacity(count / k); 429 | for i in 0..count { 430 | if i % k == 0 { 431 | partial_trackers.push(DensityTracker::new()); 432 | } 433 | 434 | let index: usize = i / k; 435 | if rng.gen() { 436 | tracker_full.add_element(); 437 | partial_trackers[index].add_element(); 438 | } 439 | 440 | if !partial_trackers[index].bv.is_empty() { 441 | let idx = rng.gen_range(0..partial_trackers[index].bv.len()); 442 | let offset: usize = partial_trackers 443 | .iter() 444 | .take(index) 445 | .map(|t| t.bv.len()) 446 | .sum(); 447 | tracker_full.inc(offset + idx); 448 | partial_trackers[index].inc(idx); 449 | } 450 | } 451 | 452 | let mut tracker_combined = DensityTracker::new(); 453 | for tracker in partial_trackers.into_iter() { 454 | tracker_combined.extend(&tracker, false); 455 | } 456 | assert_eq!(tracker_combined, tracker_full); 457 | } 458 | } 459 | } 460 | 461 | #[test] 462 | fn 
test_extend_density_input() { 463 | let mut rng = XorShiftRng::from_seed([ 464 | 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, 465 | 0xbc, 0xe5, 466 | ]); 467 | let trials = 10; 468 | let max_bits = 10; 469 | let max_density = max_bits; 470 | 471 | // Create an empty DensityTracker. 472 | let empty = DensityTracker::new; 473 | 474 | // Create a random DensityTracker with first bit unset. 475 | let unset = |rng: &mut XorShiftRng| { 476 | let mut dt = DensityTracker::new(); 477 | dt.add_element(); 478 | let n = rng.gen_range(1..max_bits); 479 | let target_density = rng.gen_range(0..max_density); 480 | for _ in 1..n { 481 | dt.add_element(); 482 | } 483 | 484 | for _ in 0..target_density { 485 | if n > 1 { 486 | let to_inc = rng.gen_range(1..n); 487 | dt.inc(to_inc); 488 | } 489 | } 490 | assert!(!dt.bv[0]); 491 | assert_eq!(n, dt.bv.len()); 492 | dbg!(&target_density, &dt.total_density); 493 | 494 | dt 495 | }; 496 | 497 | // Create a random DensityTracker with first bit set. 498 | let set = |rng: &mut XorShiftRng| { 499 | let mut dt = unset(rng); 500 | dt.inc(0); 501 | dt 502 | }; 503 | 504 | for _ in 0..trials { 505 | { 506 | // Both empty. 507 | let (mut e1, e2) = (empty(), empty()); 508 | e1.extend(&e2, true); 509 | assert_eq!(empty(), e1); 510 | } 511 | { 512 | // First empty, second unset. 513 | let (mut e1, u1) = (empty(), unset(&mut rng)); 514 | e1.extend(&u1.clone(), true); 515 | assert_eq!(u1, e1); 516 | } 517 | { 518 | // First empty, second set. 519 | let (mut e1, s1) = (empty(), set(&mut rng)); 520 | e1.extend(&s1.clone(), true); 521 | assert_eq!(s1, e1); 522 | } 523 | { 524 | // First set, second empty. 525 | let (mut s1, e1) = (set(&mut rng), empty()); 526 | let s2 = s1.clone(); 527 | s1.extend(&e1, true); 528 | assert_eq!(s1, s2); 529 | } 530 | { 531 | // First unset, second empty. 532 | let (mut u1, e1) = (unset(&mut rng), empty()); 533 | let u2 = u1.clone(); 534 | u1.extend(&e1, true); 535 | assert_eq!(u1, u2); 536 | } 537 | { 538 | // First unset, second unset. 539 | let (mut u1, u2) = (unset(&mut rng), unset(&mut rng)); 540 | let expected_total = u1.total_density + u2.total_density; 541 | u1.extend(&u2, true); 542 | assert_eq!(expected_total, u1.total_density); 543 | assert!(!u1.bv[0]); 544 | } 545 | { 546 | // First unset, second set. 547 | let (mut u1, s1) = (unset(&mut rng), set(&mut rng)); 548 | let expected_total = u1.total_density + s1.total_density; 549 | u1.extend(&s1, true); 550 | assert_eq!(expected_total, u1.total_density); 551 | assert!(u1.bv[0]); 552 | } 553 | { 554 | // First set, second unset. 555 | let (mut s1, u1) = (set(&mut rng), unset(&mut rng)); 556 | let expected_total = s1.total_density + u1.total_density; 557 | s1.extend(&u1, true); 558 | assert_eq!(expected_total, s1.total_density); 559 | assert!(s1.bv[0]); 560 | } 561 | { 562 | // First set, second set. 563 | let (mut s1, s2) = (set(&mut rng), set(&mut rng)); 564 | let expected_total = s1.total_density + s2.total_density - 1; 565 | s1.extend(&s2, true); 566 | assert_eq!(expected_total, s1.total_density); 567 | assert!(s1.bv[0]); 568 | } 569 | } 570 | } 571 | } 572 | -------------------------------------------------------------------------------- /ec-gpu-gen/src/program.rs: -------------------------------------------------------------------------------- 1 | #[macro_export] 2 | /// Helper macro to create a program for a device. 3 | /// 4 | /// It will embed the CUDA fatbin/OpenCL source code within your binary. 
The source needs to be 5 | /// generated via [`crate::source::generate`] in your `build.rs`. 6 | /// 7 | /// It returns a `[crate::rust_gpu_tools::Program`] instance. 8 | macro_rules! program { 9 | ($device:ident) => {{ 10 | use $crate::rust_gpu_tools::{Framework, GPUError, Program}; 11 | (|device: &Device| -> Result { 12 | // Selects a CUDA or OpenCL on the `EC_GPU_FRAMEWORK` environment variable and the 13 | // compile-time features. 14 | // 15 | // You cannot select CUDA if the library was compiled without support for it. 16 | let default_framework = device.framework(); 17 | let framework = match ::std::env::var("EC_GPU_FRAMEWORK") { 18 | Ok(env) => match env.as_ref() { 19 | "cuda" => { 20 | #[cfg(feature = "cuda")] 21 | { 22 | Framework::Cuda 23 | } 24 | 25 | #[cfg(not(feature = "cuda"))] 26 | return Err($crate::EcError::Simple("CUDA framework is not supported, please compile with the `cuda` feature enabled.")) 27 | } 28 | "opencl" => { 29 | #[cfg(feature = "opencl")] 30 | { 31 | Framework::Opencl 32 | } 33 | 34 | #[cfg(not(feature = "opencl"))] 35 | return Err($crate::EcError::Simple("OpenCL framework is not supported, please compile with the `opencl` feature enabled.")) 36 | } 37 | _ => default_framework, 38 | }, 39 | Err(_) => default_framework, 40 | }; 41 | 42 | match framework { 43 | #[cfg(feature = "cuda")] 44 | Framework::Cuda => { 45 | let kernel = include_bytes!(env!("_EC_GPU_CUDA_KERNEL_FATBIN")); 46 | let cuda_device = device.cuda_device().ok_or(GPUError::DeviceNotFound)?; 47 | let program = $crate::rust_gpu_tools::cuda::Program::from_bytes(cuda_device, kernel)?; 48 | Ok(Program::Cuda(program)) 49 | } 50 | #[cfg(feature = "opencl")] 51 | Framework::Opencl => { 52 | let source = include_str!(env!("_EC_GPU_OPENCL_KERNEL_SOURCE")); 53 | let opencl_device = device.opencl_device().ok_or(GPUError::DeviceNotFound)?; 54 | let program = $crate::rust_gpu_tools::opencl::Program::from_opencl(opencl_device, source)?; 55 | Ok(Program::Opencl(program)) 56 | } 57 | } 58 | })($device) 59 | }}; 60 | } 61 | -------------------------------------------------------------------------------- /ec-gpu-gen/src/source.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | use std::fmt::{self, Write}; 3 | use std::hash::{Hash, Hasher}; 4 | use std::marker::PhantomData; 5 | use std::mem; 6 | #[cfg(any(feature = "opencl", feature = "cuda"))] 7 | use std::path::PathBuf; 8 | #[cfg(any(feature = "opencl", feature = "cuda"))] 9 | use std::{env, fs}; 10 | 11 | use ec_gpu::{GpuField, GpuName}; 12 | use group::prime::PrimeCurveAffine; 13 | 14 | static COMMON_SRC: &str = include_str!("cl/common.cl"); 15 | static FIELD_SRC: &str = include_str!("cl/field.cl"); 16 | static FIELD2_SRC: &str = include_str!("cl/field2.cl"); 17 | static EC_SRC: &str = include_str!("cl/ec.cl"); 18 | static FFT_SRC: &str = include_str!("cl/fft.cl"); 19 | static MULTIEXP_SRC: &str = include_str!("cl/multiexp.cl"); 20 | 21 | #[derive(Clone, Copy)] 22 | enum Limb32Or64 { 23 | Limb32, 24 | Limb64, 25 | } 26 | 27 | /// This trait is used to uniquely identify items by some identifier (`name`) and to return the GPU 28 | /// source code they produce. 29 | trait NameAndSource { 30 | /// The name to identify the item. 31 | fn name(&self) -> String; 32 | /// The GPU source code that is generated. 
33 | fn source(&self, limb: Limb32Or64) -> String; 34 | } 35 | 36 | impl PartialEq for dyn NameAndSource { 37 | fn eq(&self, other: &Self) -> bool { 38 | self.name() == other.name() 39 | } 40 | } 41 | 42 | impl Eq for dyn NameAndSource {} 43 | 44 | impl Hash for dyn NameAndSource { 45 | fn hash(&self, state: &mut H) { 46 | self.name().hash(state) 47 | } 48 | } 49 | 50 | /// Prints the name by default, the source code of the 32-bit limb in the alternate mode via 51 | /// `{:#?}`. 52 | impl fmt::Debug for dyn NameAndSource { 53 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 54 | if f.alternate() { 55 | f.debug_map() 56 | .entries(vec![ 57 | ("name", self.name()), 58 | ("source", self.source(Limb32Or64::Limb32)), 59 | ]) 60 | .finish() 61 | } else { 62 | write!(f, "{:?}", self.name()) 63 | } 64 | } 65 | } 66 | 67 | /// A field that might also be an extension field. 68 | /// 69 | /// When the field is an extension field, we also add its sub-field to the list of fields. This 70 | /// enum is used to indicate that it's a sub-field that has a corresponding extension field. This 71 | /// way we can make sure that when the source is generated, that also the source for the sub-field 72 | /// is generated, while not having duplicated field definitions. 73 | // Storing the sub-field as a string is a bit of a hack around Rust's type system. If we would 74 | // store the generic type, then the enum would need to be generic over two fields, even in 75 | // the case when no extension field is used. This would make the API harder to use. 76 | #[derive(Debug)] 77 | enum Field { 78 | /// A field, might be an extension field. 79 | Field(PhantomData), 80 | /// A sub-field with the given name that has a corresponding extension field. 81 | SubField(String), 82 | } 83 | 84 | impl Field { 85 | /// Create a new field for the given generic type. 86 | pub fn new() -> Self { 87 | // By default it's added as a field. If it's an extension field, then the `add_field()` 88 | // function will create a copy of it, as `SubField` variant. 89 | Self::Field(PhantomData) 90 | } 91 | } 92 | 93 | impl Default for Field { 94 | fn default() -> Self { 95 | Self::new() 96 | } 97 | } 98 | 99 | fn field_source(limb: Limb32Or64) -> String { 100 | match limb { 101 | Limb32Or64::Limb32 => [ 102 | params::(), 103 | field_add_sub_nvidia::().expect("preallocated"), 104 | String::from(FIELD_SRC), 105 | ] 106 | .join("\n"), 107 | Limb32Or64::Limb64 => [ 108 | params::(), 109 | field_add_sub_nvidia::().expect("preallocated"), 110 | String::from(FIELD_SRC), 111 | ] 112 | .join("\n"), 113 | } 114 | } 115 | 116 | impl NameAndSource for Field { 117 | fn name(&self) -> String { 118 | match self { 119 | Self::Field(_) => F::name(), 120 | Self::SubField(name) => name.to_string(), 121 | } 122 | } 123 | 124 | fn source(&self, limb: Limb32Or64) -> String { 125 | match self { 126 | Self::Field(_) => { 127 | // If it's an extension field. 128 | if let Some(sub_field_name) = F::sub_field_name() { 129 | String::from(FIELD2_SRC) 130 | .replace("FIELD2", &F::name()) 131 | .replace("FIELD", &sub_field_name) 132 | } else { 133 | field_source::(limb).replace("FIELD", &F::name()) 134 | } 135 | } 136 | Self::SubField(sub_field_name) => { 137 | // The `GpuField` implementation of the extension field contains the constants of 138 | // the sub-field. Hence we can just forward the `F`. It's important that those 139 | // functions do *not* use the name of the field, else we might generate the 140 | // sub-field named like the extension field. 
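            // (Illustrative example: given an extension field `Fq2` over a sub-field
            // `Fq`, this arm emits the plain field source with `FIELD` rewritten to
            // `Fq`, while the `Field` arm above emits the `field2.cl` source with
            // `FIELD2` rewritten to `Fq2`.)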
141 |                 field_source::<F>(limb).replace("FIELD", sub_field_name)
142 |             }
143 |         }
144 |     }
145 | }
146 | 
147 | /// Struct that generates FFT GPU source code.
148 | struct Fft<F>(PhantomData<F>);
149 | 
150 | impl<F: GpuName> NameAndSource for Fft<F> {
151 |     fn name(&self) -> String {
152 |         F::name()
153 |     }
154 | 
155 |     fn source(&self, _limb: Limb32Or64) -> String {
156 |         String::from(FFT_SRC).replace("FIELD", &F::name())
157 |     }
158 | }
159 | 
160 | /// Struct that generates multiexp GPU source code.
161 | struct Multiexp<P, F, Exp> {
162 |     curve_point: PhantomData<P>,
163 |     field: PhantomData<F>,
164 |     exponent: PhantomData<Exp>,
165 | }
166 | 
167 | impl<P, F, Exp> Multiexp<P, F, Exp> {
168 |     pub fn new() -> Self {
169 |         Self {
170 |             curve_point: PhantomData::<P>,
171 |             field: PhantomData::<F>,
172 |             exponent: PhantomData::<Exp>,
173 |         }
174 |     }
175 | }
176 | 
177 | impl<P: GpuName, F: GpuName, Exp: GpuName> NameAndSource for Multiexp<P, F, Exp> {
178 |     fn name(&self) -> String {
179 |         P::name()
180 |     }
181 | 
182 |     fn source(&self, _limb: Limb32Or64) -> String {
183 |         let ec = String::from(EC_SRC)
184 |             .replace("FIELD", &F::name())
185 |             .replace("POINT", &P::name());
186 |         let multiexp = String::from(MULTIEXP_SRC)
187 |             .replace("POINT", &P::name())
188 |             .replace("EXPONENT", &Exp::name());
189 |         [ec, multiexp].concat()
190 |     }
191 | }
192 | 
193 | /// Builder to create the source code of a GPU kernel.
194 | ///
195 | /// # Example
196 | ///
197 | /// ```
198 | /// use blstrs::{Fp, Fp2, G1Affine, G2Affine, Scalar};
199 | /// use ec_gpu_gen::SourceBuilder;
200 | ///
201 | /// # #[cfg(any(feature = "cuda", feature = "opencl"))]
202 | /// let source = SourceBuilder::new()
203 | ///     .add_fft::<Scalar>()
204 | ///     .add_multiexp::<G1Affine, Fp>()
205 | ///     .add_multiexp::<G2Affine, Fp2>()
206 | ///     .build_32_bit_limbs();
207 | /// ```
208 | // In the `HashSet`s the concrete types cannot be used, as each item of the set should be able to
209 | // have its own (different) generic type.
210 | // We distinguish between extension fields and other fields, as sub-fields need to be defined
211 | // first in the source code (due to being C, where the order of declaration matters).
212 | pub struct SourceBuilder {
213 |     /// The [`Field`]s that are used in this kernel.
214 |     fields: HashSet<Box<dyn NameAndSource>>,
215 |     /// The extension [`Field`]s that are used in this kernel.
216 |     extension_fields: HashSet<Box<dyn NameAndSource>>,
217 |     /// The [`Fft`]s that are used in this kernel.
218 |     ffts: HashSet<Box<dyn NameAndSource>>,
219 |     /// The [`Multiexp`]s that are used in this kernel.
220 |     multiexps: HashSet<Box<dyn NameAndSource>>,
221 |     /// Additional source that is appended at the end of the generated source.
222 |     extra_sources: Vec<String>,
223 | }
224 | 
225 | impl SourceBuilder {
226 |     /// Create a new configuration to generate a GPU kernel.
227 |     pub fn new() -> Self {
228 |         Self {
229 |             fields: HashSet::new(),
230 |             extension_fields: HashSet::new(),
231 |             ffts: HashSet::new(),
232 |             multiexps: HashSet::new(),
233 |             extra_sources: Vec::new(),
234 |         }
235 |     }
236 | 
237 |     /// Add a field to the configuration.
238 |     ///
239 |     /// If it is an extension field, then the extension field *and* the sub-field is added.
240 |     pub fn add_field<F>(mut self) -> Self
241 |     where
242 |         F: GpuField + 'static,
243 |     {
244 |         let field = Field::<F>::new();
245 |         // If it's an extension field, also add the corresponding sub-field.
246 |         if let Some(sub_field_name) = F::sub_field_name() {
247 |             self.extension_fields.insert(Box::new(field));
248 |             let sub_field = Field::<F>::SubField(sub_field_name);
249 |             self.fields.insert(Box::new(sub_field));
250 |         } else {
251 |             self.fields.insert(Box::new(field));
252 |         }
253 |         self
254 |     }
255 | 
256 |     /// Add an FFT kernel function to the configuration.
257 |     pub fn add_fft<F>(self) -> Self
258 |     where
259 |         F: GpuField + 'static,
260 |     {
261 |         let mut config = self.add_field::<F>();
262 |         let fft = Fft::<F>(PhantomData);
263 |         config.ffts.insert(Box::new(fft));
264 |         config
265 |     }
266 | 
267 |     /// Add a Multiexp kernel function to the configuration.
268 |     ///
269 |     /// The field must be given explicitly, as currently it cannot be derived from the curve point
270 |     /// directly.
271 |     pub fn add_multiexp<C, F>(self) -> Self
272 |     where
273 |         C: PrimeCurveAffine + GpuName,
274 |         C::Scalar: GpuField,
275 |         F: GpuField + 'static,
276 |     {
277 |         let mut config = self.add_field::<C::Scalar>().add_field::<F>();
278 |         let multiexp = Multiexp::<C, F, C::Scalar>::new();
279 |         config.multiexps.insert(Box::new(multiexp));
280 |         config
281 |     }
282 | 
283 |     /// Appends some given source at the end of the generated source.
284 |     ///
285 |     /// This is useful for cases where you use this library as a building block, but have your
286 |     /// own kernel implementation. If this function is called several times, then those sources
287 |     /// are appended in that call order.
288 |     pub fn append_source(mut self, source: String) -> Self {
289 |         self.extra_sources.push(source);
290 |         self
291 |     }
292 | 
293 |     /// Generate the GPU kernel source code based on the current configuration with 32-bit limbs.
294 |     ///
295 |     /// On CUDA 32-bit limbs are recommended.
296 |     pub fn build_32_bit_limbs(&self) -> String {
297 |         self.build(Limb32Or64::Limb32)
298 |     }
299 | 
300 |     /// Generate the GPU kernel source code based on the current configuration with 64-bit limbs.
301 |     ///
302 |     /// On OpenCL 64-bit limbs are recommended.
303 |     pub fn build_64_bit_limbs(&self) -> String {
304 |         self.build(Limb32Or64::Limb64)
305 |     }
306 | 
307 |     /// Generate the GPU kernel source code based on the current configuration.
308 |     fn build(&self, limb_size: Limb32Or64) -> String {
309 |         let fields = self
310 |             .fields
311 |             .iter()
312 |             .map(|field| field.source(limb_size))
313 |             .collect();
314 |         let extension_fields = self
315 |             .extension_fields
316 |             .iter()
317 |             .map(|field| field.source(limb_size))
318 |             .collect();
319 |         let ffts = self.ffts.iter().map(|fft| fft.source(limb_size)).collect();
320 |         let multiexps = self
321 |             .multiexps
322 |             .iter()
323 |             .map(|multiexp| multiexp.source(limb_size))
324 |             .collect();
325 |         let extra_sources = self.extra_sources.join("\n");
326 |         [
327 |             COMMON_SRC.to_string(),
328 |             fields,
329 |             extension_fields,
330 |             ffts,
331 |             multiexps,
332 |             extra_sources,
333 |         ]
334 |         .join("\n\n")
335 |     }
336 | }
337 | 
338 | impl Default for SourceBuilder {
339 |     fn default() -> Self {
340 |         Self::new()
341 |     }
342 | }
343 | 
344 | /// Trait to implement limbs of different underlying bit sizes.
345 | pub trait Limb: Sized + Clone + Copy {
346 |     /// The underlying size of the limb, e.g. `u32`.
347 |     type LimbType: Clone + std::fmt::Display;
348 |     /// Returns the value representing zero.
349 |     fn zero() -> Self;
350 |     /// Returns a new limb.
351 |     fn new(val: Self::LimbType) -> Self;
352 |     /// Returns the raw value of the limb.
353 |     fn value(&self) -> Self::LimbType;
354 |     /// Returns the bit size of the limb.
355 |     fn bits() -> usize {
356 |         mem::size_of::<Self::LimbType>() * 8
357 |     }
358 |     /// Returns a tuple with the strings that PTX is using to describe the type and the register.
359 |     fn ptx_info() -> (&'static str, &'static str);
360 |     /// Returns the type that OpenCL is using to represent the limb.
361 |     fn opencl_type() -> &'static str;
362 |     /// Returns the limbs that represent the multiplicative identity of the given field.
363 |     fn one_limbs<F: GpuField>() -> Vec<Self>;
364 |     /// Returns the field modulus in non-Montgomery form as a vector of `Self::LimbType` (least
365 |     /// significant limb first).
366 |     fn modulus_limbs<F: GpuField>() -> Vec<Self>;
367 |     /// Calculate the `INV` parameter of the Montgomery reduction algorithm for 32/64-bit limbs.
368 |     /// * `a` - The first limb of the modulus.
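    /// (Each `inv = inv * inv * a` step in the implementations below doubles the
    /// number of correct low bits of `a^-1 mod 2^bits`; negating the result at the
    /// end yields `INV = -a^-1 mod 2^bits`, the constant Montgomery reduction needs.)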
369 |     fn calc_inv(a: Self) -> Self;
370 |     /// Returns the limbs that represent `R ^ 2 mod P`.
371 |     fn calculate_r2<F: GpuField>() -> Vec<Self>;
372 | }
373 | 
374 | /// A 32-bit limb.
375 | #[derive(Clone, Copy)]
376 | pub struct Limb32(u32);
377 | impl Limb for Limb32 {
378 |     type LimbType = u32;
379 |     fn zero() -> Self {
380 |         Self(0)
381 |     }
382 |     fn new(val: Self::LimbType) -> Self {
383 |         Self(val)
384 |     }
385 |     fn value(&self) -> Self::LimbType {
386 |         self.0
387 |     }
388 |     fn ptx_info() -> (&'static str, &'static str) {
389 |         ("u32", "r")
390 |     }
391 |     fn opencl_type() -> &'static str {
392 |         "uint"
393 |     }
394 |     fn one_limbs<F: GpuField>() -> Vec<Self> {
395 |         F::one().into_iter().map(Self::new).collect()
396 |     }
397 |     fn modulus_limbs<F: GpuField>() -> Vec<Self> {
398 |         F::modulus().into_iter().map(Self::new).collect()
399 |     }
400 |     fn calc_inv(a: Self) -> Self {
401 |         let mut inv = 1u32;
402 |         for _ in 0..31 {
403 |             inv = inv.wrapping_mul(inv);
404 |             inv = inv.wrapping_mul(a.value());
405 |         }
406 |         Self(inv.wrapping_neg())
407 |     }
408 |     fn calculate_r2<F: GpuField>() -> Vec<Self> {
409 |         F::r2().into_iter().map(Self::new).collect()
410 |     }
411 | }
412 | 
413 | /// A 64-bit limb.
414 | #[derive(Clone, Copy)]
415 | pub struct Limb64(u64);
416 | impl Limb for Limb64 {
417 |     type LimbType = u64;
418 |     fn zero() -> Self {
419 |         Self(0)
420 |     }
421 |     fn new(val: Self::LimbType) -> Self {
422 |         Self(val)
423 |     }
424 |     fn value(&self) -> Self::LimbType {
425 |         self.0
426 |     }
427 |     fn ptx_info() -> (&'static str, &'static str) {
428 |         ("u64", "l")
429 |     }
430 |     fn opencl_type() -> &'static str {
431 |         "ulong"
432 |     }
433 |     fn one_limbs<F: GpuField>() -> Vec<Self> {
434 |         F::one()
435 |             .chunks(2)
436 |             .map(|chunk| Self::new(((chunk[1] as u64) << 32) + (chunk[0] as u64)))
437 |             .collect()
438 |     }
439 | 
440 |     fn modulus_limbs<F: GpuField>() -> Vec<Self> {
441 |         F::modulus()
442 |             .chunks(2)
443 |             .map(|chunk| Self::new(((chunk[1] as u64) << 32) + (chunk[0] as u64)))
444 |             .collect()
445 |     }
446 | 
447 |     fn calc_inv(a: Self) -> Self {
448 |         let mut inv = 1u64;
449 |         for _ in 0..63 {
450 |             inv = inv.wrapping_mul(inv);
451 |             inv = inv.wrapping_mul(a.value());
452 |         }
453 |         Self(inv.wrapping_neg())
454 |     }
455 |     fn calculate_r2<F: GpuField>() -> Vec<Self> {
456 |         F::r2()
457 |             .chunks(2)
458 |             .map(|chunk| Self::new(((chunk[1] as u64) << 32) + (chunk[0] as u64)))
459 |             .collect()
460 |     }
461 | }
462 | 
463 | fn const_field<L: Limb>(name: &str, limbs: Vec<L>) -> String {
464 |     format!(
465 |         "CONSTANT FIELD {} = {{ {{ {} }} }};",
466 |         name,
467 |         limbs
468 |             .iter()
469 |             .map(|l| l.value().to_string())
470 |             .collect::<Vec<_>>()
471 |             .join(", ")
472 |     )
473 | }
474 | 
475 | /// Generates CUDA/OpenCL constants and type definitions of the prime field `F`.
476 | fn params<F, L>() -> String
477 | where
478 |     F: GpuField,
479 |     L: Limb,
480 | {
481 |     let one = L::one_limbs::<F>(); // Get Montgomery form of F::one()
482 |     let p = L::modulus_limbs::<F>(); // Get field modulus in non-Montgomery form
483 |     let r2 = L::calculate_r2::<F>();
484 |     let limbs = one.len(); // Number of limbs
485 |     let inv = L::calc_inv(p[0]);
486 |     let limb_def = format!("#define FIELD_limb {}", L::opencl_type());
487 |     let limbs_def = format!("#define FIELD_LIMBS {}", limbs);
488 |     let limb_bits_def = format!("#define FIELD_LIMB_BITS {}", L::bits());
489 |     let p_def = const_field("FIELD_P", p);
490 |     let r2_def = const_field("FIELD_R2", r2);
491 |     let one_def = const_field("FIELD_ONE", one);
492 |     let zero_def = const_field("FIELD_ZERO", vec![L::zero(); limbs]);
493 |     let inv_def = format!("#define FIELD_INV {}", inv.value());
494 |     let typedef = "typedef struct { FIELD_limb val[FIELD_LIMBS]; } FIELD;".to_string();
495 |     [
496 |         limb_def,
497 |         limbs_def,
498 |         limb_bits_def,
499 |         inv_def,
500 |         typedef,
501 |         one_def,
502 |         p_def,
503 |         r2_def,
504 |         zero_def,
505 |     ]
506 |     .join("\n")
507 | }
508 | 
509 | /// Generates the PTX assembly implementation of FIELD_add_/FIELD_sub_.
510 | fn field_add_sub_nvidia<F, L>() -> Result<String, std::fmt::Error>
511 | where
512 |     F: GpuField,
513 |     L: Limb,
514 | {
515 |     let mut result = String::new();
516 |     let (ptx_type, ptx_reg) = L::ptx_info();
517 | 
518 |     writeln!(result, "#if defined(OPENCL_NVIDIA) || defined(CUDA)\n")?;
519 |     for op in &["sub", "add"] {
520 |         let len = L::one_limbs::<F>().len();
521 | 
522 |         writeln!(
523 |             result,
524 |             "DEVICE FIELD FIELD_{}_nvidia(FIELD a, FIELD b) {{",
525 |             op
526 |         )?;
527 |         if len > 1 {
528 |             write!(result, "asm(")?;
529 |             writeln!(result, "\"{}.cc.{} %0, %0, %{};\\r\\n\"", op, ptx_type, len)?;
530 | 
531 |             for i in 1..len - 1 {
532 |                 writeln!(
533 |                     result,
534 |                     "\"{}c.cc.{} %{}, %{}, %{};\\r\\n\"",
535 |                     op,
536 |                     ptx_type,
537 |                     i,
538 |                     i,
539 |                     len + i
540 |                 )?;
541 |             }
542 |             writeln!(
543 |                 result,
544 |                 "\"{}c.{} %{}, %{}, %{};\\r\\n\"",
545 |                 op,
546 |                 ptx_type,
547 |                 len - 1,
548 |                 len - 1,
549 |                 2 * len - 1
550 |             )?;
551 | 
552 |             write!(result, ":")?;
553 |             for n in 0..len {
554 |                 write!(result, "\"+{}\"(a.val[{}])", ptx_reg, n)?;
555 |                 if n != len - 1 {
556 |                     write!(result, ", ")?;
557 |                 }
558 |             }
559 | 
560 |             write!(result, "\n:")?;
561 |             for n in 0..len {
562 |                 write!(result, "\"{}\"(b.val[{}])", ptx_reg, n)?;
563 |                 if n != len - 1 {
564 |                     write!(result, ", ")?;
565 |                 }
566 |             }
567 |             writeln!(result, ");")?;
568 |         }
569 |         writeln!(result, "return a;\n}}")?;
570 |     }
571 |     writeln!(result, "#endif")?;
572 | 
573 |     Ok(result)
574 | }
575 | 
576 | /// Convenience function to generate a kernel/source based on a source builder.
577 | ///
578 | /// When the `cuda` feature is enabled it will compile a CUDA fatbin. The path to the file is
579 | /// stored in the `_EC_GPU_CUDA_KERNEL_FATBIN` environment variable, which will automatically be
580 | /// used by the `ec-gpu-gen` functionality that needs a kernel.
581 | ///
582 | 
583 | /// When the `opencl` feature is enabled it will generate the source code for OpenCL. The path to
584 | /// the source file is stored in the `_EC_GPU_OPENCL_KERNEL_SOURCE` environment variable, which will
585 | /// automatically be used by the `ec-gpu-gen` functionality that needs a kernel. OpenCL compiles
586 | /// the source at run time.
587 | #[allow(unused_variables)]
588 | pub fn generate(source_builder: &SourceBuilder) {
589 |     #[cfg(feature = "cuda")]
590 |     generate_cuda(source_builder);
591 |     #[cfg(feature = "opencl")]
592 |     generate_opencl(source_builder);
593 | }
594 | 
595 | #[cfg(feature = "cuda")]
596 | fn generate_cuda(source_builder: &SourceBuilder) -> PathBuf {
597 |     use sha2::{Digest, Sha256};
598 | 
599 |     // This is a hack for when no properly compiled kernel is needed. That's the case when the
600 |     // documentation is built on docs.rs and when Clippy is run. We can use arbitrary bytes as
601 |     // input then.
602 |     if env::var("DOCS_RS").is_ok() || cfg!(clippy) {
603 |         println!("cargo:rustc-env=_EC_GPU_CUDA_KERNEL_FATBIN=../build.rs");
604 |         return PathBuf::from("../build.rs");
605 |     }
606 | 
607 |     let kernel_source = source_builder.build_32_bit_limbs();
608 |     let out_dir = env::var("OUT_DIR").expect("OUT_DIR was not set.");
609 | 
610 |     // Make it possible to override the default options. The source and output file are
611 |     // always set automatically.
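    // For example (hypothetical flags, shown only to illustrate the override; when the variable
    // is set it replaces the default flags entirely, so `--fatbin` and the architecture flags
    // must be supplied as part of it):
    //
    //     EC_GPU_CUDA_NVCC_ARGS="--fatbin --gpu-architecture=sm_70" cargo build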
612 |     let mut nvcc = match env::var("EC_GPU_CUDA_NVCC_ARGS") {
613 |         Ok(args) => execute::command(format!("nvcc {}", args)),
614 |         Err(_) => {
615 |             let mut command = std::process::Command::new("nvcc");
616 |             command
617 |                 .arg("--optimize=6")
618 |                 // Compile with as many threads as CPUs are available.
619 |                 .arg("--threads=0")
620 |                 .arg("--fatbin")
621 |                 .arg("--gpu-architecture=sm_86")
622 |                 .arg("--generate-code=arch=compute_86,code=sm_86")
623 |                 .arg("--generate-code=arch=compute_80,code=sm_80")
624 |                 .arg("--generate-code=arch=compute_75,code=sm_75");
625 |             command
626 |         }
627 |     };
628 | 
629 |     // Hash the source and the compile flags. Use that as the filename, so that the kernel is only
630 |     // rebuilt if any of them change.
631 |     let mut hasher = Sha256::new();
632 |     hasher.update(kernel_source.as_bytes());
633 |     hasher.update(format!("{:?}", &nvcc));
634 |     let kernel_digest = hex::encode(hasher.finalize());
635 | 
636 |     let source_path: PathBuf = [&out_dir, &format!("{}.cu", &kernel_digest)]
637 |         .iter()
638 |         .collect();
639 |     let fatbin_path: PathBuf = [&out_dir, &format!("{}.fatbin", &kernel_digest)]
640 |         .iter()
641 |         .collect();
642 | 
643 |     fs::write(&source_path, &kernel_source).unwrap_or_else(|_| {
644 |         panic!(
645 |             "Cannot write kernel source at {}.",
646 |             source_path.to_str().unwrap()
647 |         )
648 |     });
649 | 
650 |     // Only compile if the output doesn't exist yet.
651 |     if !fatbin_path.as_path().exists() {
652 |         let status = nvcc
653 |             .arg("--output-file")
654 |             .arg(&fatbin_path)
655 |             .arg(&source_path)
656 |             .status()
657 |             .expect("Cannot run nvcc. Install the NVIDIA toolkit or disable the `cuda` feature.");
658 | 
659 |         if !status.success() {
660 |             panic!(
661 |                 "nvcc failed. See the kernel source at {}",
662 |                 source_path.to_str().unwrap()
663 |             );
664 |         }
665 |     }
666 | 
667 |     // The idea to put the path to the fatbin into a compile-time env variable is from
668 |     // https://github.com/LutzCle/fast-interconnects-demo/blob/b80ea8e04825167f486ab8ac1b5d67cf7dd51d2c/rust-demo/build.rs
669 |     println!(
670 |         "cargo:rustc-env=_EC_GPU_CUDA_KERNEL_FATBIN={}",
671 |         fatbin_path.to_str().unwrap()
672 |     );
673 | 
674 |     fatbin_path
675 | }
676 | 
677 | #[cfg(feature = "opencl")]
678 | fn generate_opencl(source_builder: &SourceBuilder) -> PathBuf {
679 |     let kernel_source = source_builder.build_64_bit_limbs();
680 |     let out_dir = env::var("OUT_DIR").expect("OUT_DIR was not set.");
681 | 
682 |     // Generating the kernel source is cheap, hence use a fixed name and overwrite it on every
683 |     // build.
684 |     let source_path: PathBuf = [&out_dir, "kernel.cl"].iter().collect();
685 | 
686 |     fs::write(&source_path, kernel_source).unwrap_or_else(|_| {
687 |         panic!(
688 |             "Cannot write kernel source at {}.",
689 |             source_path.to_str().unwrap()
690 |         )
691 |     });
692 | 
693 |     // For OpenCL we only need the kernel source; it is compiled at run time.
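    // A sketch of how the variable can then be consumed at compile time (not necessarily the
    // verbatim implementation inside `ec-gpu-gen`):
    //
    //     static KERNEL_SOURCE: &str = include_str!(env!("_EC_GPU_OPENCL_KERNEL_SOURCE"));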
694 |     println!(
695 |         "cargo:rustc-env=_EC_GPU_OPENCL_KERNEL_SOURCE={}",
696 |         source_path.to_str().unwrap()
697 |     );
698 | 
699 |     source_path
700 | }
701 | 
702 | #[cfg(all(test, any(feature = "opencl", feature = "cuda")))]
703 | mod tests {
704 |     use super::*;
705 | 
706 |     use std::sync::Mutex;
707 | 
708 |     #[cfg(feature = "cuda")]
709 |     use rust_gpu_tools::cuda;
710 |     #[cfg(feature = "opencl")]
711 |     use rust_gpu_tools::opencl;
712 |     use rust_gpu_tools::{program_closures, Device, GPUError, Program};
713 | 
714 |     use blstrs::Scalar;
715 |     use ff::{Field as _, PrimeField};
716 |     use lazy_static::lazy_static;
717 |     use rand::{thread_rng, Rng};
718 | 
719 |     static TEST_SRC: &str = include_str!("./cl/test.cl");
720 | 
721 |     #[derive(PartialEq, Debug, Clone, Copy)]
722 |     #[repr(transparent)]
723 |     pub struct GpuScalar(pub Scalar);
724 |     impl Default for GpuScalar {
725 |         fn default() -> Self {
726 |             Self(Scalar::ZERO)
727 |         }
728 |     }
729 | 
730 |     #[cfg(feature = "cuda")]
731 |     impl cuda::KernelArgument for GpuScalar {
732 |         fn as_c_void(&self) -> *mut std::ffi::c_void {
733 |             &self.0 as *const _ as _
734 |         }
735 |     }
736 | 
737 |     #[cfg(feature = "opencl")]
738 |     impl opencl::KernelArgument for GpuScalar {
739 |         fn push(&self, kernel: &mut opencl::Kernel) {
740 |             unsafe { kernel.builder.set_arg(&self.0) };
741 |         }
742 |     }
743 | 
744 |     /// The `run` call needs to return a result; use this struct as a placeholder.
745 |     #[derive(Debug)]
746 |     struct NoError;
747 |     impl From<GPUError> for NoError {
748 |         fn from(_error: GPUError) -> Self {
749 |             Self
750 |         }
751 |     }
752 | 
753 |     fn test_source() -> SourceBuilder {
754 |         let test_source = String::from(TEST_SRC).replace("FIELD", &Scalar::name());
755 |         SourceBuilder::new()
756 |             .add_field::<Scalar>()
757 |             .append_source(test_source)
758 |     }
759 | 
760 |     #[cfg(feature = "cuda")]
761 |     lazy_static! {
762 |         static ref CUDA_PROGRAM: Mutex<Program> = {
763 |             use std::ffi::CString;
764 | 
765 |             let source = test_source();
766 |             let fatbin_path = generate_cuda(&source);
767 | 
768 |             let device = *Device::all().first().expect("Cannot get a default device.");
769 |             let cuda_device = device.cuda_device().unwrap();
770 |             let fatbin_path_cstring =
771 |                 CString::new(fatbin_path.to_str().expect("path is not valid UTF-8."))
772 |                     .expect("path contains NULL byte.");
773 |             let program =
774 |                 cuda::Program::from_binary(cuda_device, fatbin_path_cstring.as_c_str()).unwrap();
775 |             Mutex::new(Program::Cuda(program))
776 |         };
777 |     }
778 | 
779 |     #[cfg(feature = "opencl")]
780 |     lazy_static! {
781 |         static ref OPENCL_PROGRAM: Mutex<(Program, Program)> = {
782 |             let device = *Device::all().first().expect("Cannot get a default device");
783 |             let opencl_device = device.opencl_device().unwrap();
784 |             let source_32 = test_source().build_32_bit_limbs();
785 |             let program_32 = opencl::Program::from_opencl(opencl_device, &source_32).unwrap();
786 |             let source_64 = test_source().build_64_bit_limbs();
787 |             let program_64 = opencl::Program::from_opencl(opencl_device, &source_64).unwrap();
788 |             Mutex::new((Program::Opencl(program_32), Program::Opencl(program_64)))
789 |         };
790 |     }
791 | 
792 |     fn call_kernel(name: &str, scalars: &[GpuScalar], uints: &[u32]) -> Scalar {
793 |         let closures = program_closures!(|program, _args| -> Result<Scalar, NoError> {
794 |             let mut cpu_buffer = vec![GpuScalar::default()];
795 |             let buffer = program.create_buffer_from_slice(&cpu_buffer).unwrap();
796 | 
797 |             let mut kernel = program.create_kernel(name, 1, 64).unwrap();
798 |             for scalar in scalars {
799 |                 kernel = kernel.arg(scalar);
800 |             }
801 |             for uint in uints {
802 |                 kernel = kernel.arg(uint);
803 |             }
804 |             kernel.arg(&buffer).run().unwrap();
805 | 
806 |             program.read_into_buffer(&buffer, &mut cpu_buffer).unwrap();
807 |             Ok(cpu_buffer[0].0)
808 |         });
809 | 
810 |         // For CUDA we only test 32-bit limbs.
811 |         #[cfg(all(feature = "cuda", not(feature = "opencl")))]
812 |         return CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap();
813 | 
814 |         // For OpenCL we test both 32-bit and 64-bit limbs.
815 |         #[cfg(all(feature = "opencl", not(feature = "cuda")))]
816 |         {
817 |             let result_32 = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap();
818 |             let result_64 = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap();
819 |             assert_eq!(
820 |                 result_32, result_64,
821 |                 "Results for 32-bit and 64-bit limbs must be the same."
822 |             );
823 |             result_32
824 |         }
825 | 
826 |         // When both features are enabled, check if the results are the same.
827 |         #[cfg(all(feature = "cuda", feature = "opencl"))]
828 |         {
829 |             let cuda_result = CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap();
830 |             let opencl_32_result = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap();
831 |             let opencl_64_result = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap();
832 |             assert_eq!(
833 |                 opencl_32_result, opencl_64_result,
834 |                 "Results for 32-bit and 64-bit limbs on OpenCL must be the same."
835 |             );
836 |             assert_eq!(
837 |                 cuda_result, opencl_32_result,
838 |                 "Results for CUDA and OpenCL must be the same."
839 |             );
840 |             cuda_result
841 |         }
842 |     }
843 | 
844 |     #[test]
845 |     fn test_add() {
846 |         let mut rng = thread_rng();
847 |         for _ in 0..10 {
848 |             let a = Scalar::random(&mut rng);
849 |             let b = Scalar::random(&mut rng);
850 |             let c = a + b;
851 | 
852 |             assert_eq!(
853 |                 call_kernel("test_add", &[GpuScalar(a), GpuScalar(b)], &[]),
854 |                 c
855 |             );
856 |         }
857 |     }
858 | 
859 |     #[test]
860 |     fn test_sub() {
861 |         let mut rng = thread_rng();
862 |         for _ in 0..10 {
863 |             let a = Scalar::random(&mut rng);
864 |             let b = Scalar::random(&mut rng);
865 |             let c = a - b;
866 |             assert_eq!(
867 |                 call_kernel("test_sub", &[GpuScalar(a), GpuScalar(b)], &[]),
868 |                 c
869 |             );
870 |         }
871 |     }
872 | 
873 |     #[test]
874 |     fn test_mul() {
875 |         let mut rng = thread_rng();
876 |         for _ in 0..10 {
877 |             let a = Scalar::random(&mut rng);
878 |             let b = Scalar::random(&mut rng);
879 |             let c = a * b;
880 | 
881 |             assert_eq!(
882 |                 call_kernel("test_mul", &[GpuScalar(a), GpuScalar(b)], &[]),
883 |                 c
884 |             );
885 |         }
886 |     }
887 | 
888 |     #[test]
889 |     fn test_pow() {
890 |         let mut rng = thread_rng();
891 |         for _ in 0..10 {
892 |             let a = Scalar::random(&mut rng);
893 |             let b = rng.gen::<u32>();
894 |             let c = a.pow_vartime([b as u64]);
895 |             assert_eq!(call_kernel("test_pow", &[GpuScalar(a)], &[b]), c);
896 |         }
897 |     }
898 | 
899 |     #[test]
900 |     fn test_sqr() {
901 |         let mut rng = thread_rng();
902 |         for _ in 0..10 {
903 |             let a = Scalar::random(&mut rng);
904 |             let b = a.square();
905 | 
906 |             assert_eq!(call_kernel("test_sqr", &[GpuScalar(a)], &[]), b);
907 |         }
908 |     }
909 | 
910 |     #[test]
911 |     fn test_double() {
912 |         let mut rng = thread_rng();
913 |         for _ in 0..10 {
914 |             let a = Scalar::random(&mut rng);
915 |             let b = a.double();
916 | 
917 |             assert_eq!(call_kernel("test_double", &[GpuScalar(a)], &[]), b);
918 |         }
919 |     }
920 | 
921 |     #[test]
922 |     fn test_unmont() {
923 |         let mut rng = thread_rng();
924 |         for _ in 0..10 {
925 |             let a = Scalar::random(&mut rng);
926 |             let b: Scalar = unsafe { std::mem::transmute(a.to_repr()) };
927 |             assert_eq!(call_kernel("test_unmont", &[GpuScalar(a)], &[]), b);
928 |         }
929 |     }
930 | 
931 |     #[test]
932 |     fn test_mont() {
933 |         let mut rng = thread_rng();
934 |         for _ in 0..10 {
935 |             let a_repr = Scalar::random(&mut rng).to_repr();
936 |             let a: Scalar = unsafe { std::mem::transmute(a_repr) };
937 |             let b = Scalar::from_repr(a_repr).unwrap();
938 |             assert_eq!(call_kernel("test_mont", &[GpuScalar(a)], &[]), b);
939 |         }
940 |     }
941 | }
942 |
--------------------------------------------------------------------------------
/ec-gpu-gen/src/threadpool.rs:
--------------------------------------------------------------------------------
1 | //! An interface for dealing with the kinds of parallel computations involved.
2 | use std::env;
3 | 
4 | use crossbeam_channel::{bounded, Receiver, SendError};
5 | use log::trace;
6 | use once_cell::sync::Lazy;
7 | use yastl::Pool;
8 | 
9 | /// The number of threads the thread pool should use.
10 | ///
11 | /// By default it's equal to the number of CPUs, but it can be changed with the
12 | /// `EC_GPU_NUM_THREADS` environment variable.
13 | static NUM_THREADS: Lazy<usize> = Lazy::new(read_num_threads);
14 | 
15 | /// The thread pool that is used for the computations.
16 | ///
17 | /// By default, its size is equal to the number of CPUs. It can be set to a different value with
18 | /// the `EC_GPU_NUM_THREADS` environment variable.
19 | pub static THREAD_POOL: Lazy<Pool> = Lazy::new(|| Pool::new(*NUM_THREADS));
20 | 
21 | /// Returns the number of threads.
22 | ///
23 | /// The number can be set with the `EC_GPU_NUM_THREADS` environment variable. If it isn't set, it
24 | /// defaults to the number of CPUs the system has.
25 | fn read_num_threads() -> usize {
26 |     env::var("EC_GPU_NUM_THREADS")
27 |         .ok()
28 |         .and_then(|num| num.parse::<usize>().ok())
29 |         .unwrap_or_else(num_cpus::get)
30 | }
31 | 
32 | /// A worker operates on a pool of threads.
33 | #[derive(Clone, Default)]
34 | pub struct Worker {}
35 | 
36 | impl Worker {
37 |     /// Returns a new worker.
38 |     pub fn new() -> Worker {
39 |         Worker {}
40 |     }
41 | 
42 |     /// Returns the binary logarithm (floored) of the number of threads.
43 |     ///
44 |     /// This means the number of threads is `2^log_num_threads()`.
45 |     pub fn log_num_threads(&self) -> u32 {
46 |         log2_floor(*NUM_THREADS)
47 |     }
48 | 
49 |     /// Executes a function in a thread and returns a [`Waiter`] immediately.
50 |     pub fn compute<F, R>(&self, f: F) -> Waiter<R>
51 |     where
52 |         F: FnOnce() -> R + Send + 'static,
53 |         R: Send + 'static,
54 |     {
55 |         let (sender, receiver) = bounded(1);
56 | 
57 |         THREAD_POOL.spawn(move || {
58 |             let res = f();
59 |             // Best effort. We run it in a separate thread, so the receiver might not exist
60 |             // anymore, but that's OK. It only means that we are not interested in the result.
61 |             // A message is logged though, as concurrency issues are hard to debug and this might
62 |             // help in such cases.
63 |             if let Err(SendError(_)) = sender.send(res) {
64 |                 trace!("Cannot send result");
65 |             }
66 |         });
67 | 
68 |         Waiter { receiver }
69 |     }
70 | 
71 |     /// Executes a function and returns the result once it is finished.
72 |     ///
73 |     /// The function gets the [`yastl::Scope`] as well as the `chunk_size` as parameters. The
74 |     /// `chunk_size` is the number of elements per thread.
75 |     pub fn scope<'a, F, R>(&self, elements: usize, f: F) -> R
76 |     where
77 |         F: FnOnce(&yastl::Scope<'a>, usize) -> R,
78 |     {
79 |         let chunk_size = if elements < *NUM_THREADS {
80 |             1
81 |         } else {
82 |             elements / *NUM_THREADS
83 |         };
84 | 
85 |         THREAD_POOL.scoped(|scope| f(scope, chunk_size))
86 |     }
87 | 
88 |     /// Executes the passed-in function and returns the result once it is finished.
89 |     pub fn scoped<'a, F, R>(&self, f: F) -> R
90 |     where
91 |         F: FnOnce(&yastl::Scope<'a>) -> R,
92 |     {
93 |         let (sender, receiver) = bounded(1);
94 |         THREAD_POOL.scoped(|s| {
95 |             let res = f(s);
96 |             sender.send(res).unwrap();
97 |         });
98 | 
99 |         receiver.recv().unwrap()
100 |     }
101 | }
102 | 
103 | /// A future that is waiting for a result.
104 | pub struct Waiter<T> {
105 |     receiver: Receiver<T>,
106 | }
107 | 
108 | impl<T> Waiter<T> {
109 |     /// Wait for the result.
110 |     pub fn wait(&self) -> T {
111 |         self.receiver.recv().unwrap()
112 |     }
113 | 
114 |     /// One-off sending.
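    ///
    /// This wraps an already computed value, so that code which may also receive results from
    /// [`Worker::compute`] can treat both cases uniformly. A minimal sketch:
    ///
    /// ```ignore
    /// let waiter = Waiter::done(5);
    /// assert_eq!(waiter.wait(), 5);
    /// ```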
115 |     pub fn done(val: T) -> Self {
116 |         let (sender, receiver) = bounded(1);
117 |         sender.send(val).unwrap();
118 | 
119 |         Waiter { receiver }
120 |     }
121 | }
122 | 
123 | fn log2_floor(num: usize) -> u32 {
124 |     assert!(num > 0);
125 | 
126 |     let mut pow = 0;
127 | 
128 |     while (1 << (pow + 1)) <= num {
129 |         pow += 1;
130 |     }
131 | 
132 |     pow
133 | }
134 | 
135 | #[cfg(test)]
136 | mod tests {
137 |     use super::*;
138 | 
139 |     #[test]
140 |     fn test_log2_floor() {
141 |         assert_eq!(log2_floor(1), 0);
142 |         assert_eq!(log2_floor(3), 1);
143 |         assert_eq!(log2_floor(4), 2);
144 |         assert_eq!(log2_floor(5), 2);
145 |         assert_eq!(log2_floor(6), 2);
146 |         assert_eq!(log2_floor(7), 2);
147 |         assert_eq!(log2_floor(8), 3);
148 |     }
149 | 
150 |     #[test]
151 |     fn test_read_num_threads() {
152 |         let num_cpus = num_cpus::get();
153 |         temp_env::with_var("EC_GPU_NUM_THREADS", None::<&str>, || {
154 |             assert_eq!(
155 |                 read_num_threads(),
156 |                 num_cpus,
157 |                 "By default the number of threads matches the number of CPUs."
158 |             );
159 |         });
160 | 
161 |         temp_env::with_var("EC_GPU_NUM_THREADS", Some("1234"), || {
162 |             assert_eq!(
163 |                 read_num_threads(),
164 |                 1234,
165 |                 "Number of threads matches the environment variable."
166 |             );
167 |         });
168 |     }
169 | }
170 |
--------------------------------------------------------------------------------
/ec-gpu/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "ec-gpu"
3 | version = "0.2.0"
4 | authors = ["dignifiedquire "]
5 | edition = "2021"
6 | description = "Traits for field and elliptic curve operations on GPUs"
7 | homepage = "https://github.com/filecoin-project/ff-cl-gen"
8 | repository = "https://github.com/filecoin-project/ff-cl-gen"
9 | license = "MIT/Apache-2.0"
10 | 
11 | [dependencies]
12 |
--------------------------------------------------------------------------------
/ec-gpu/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 | ../LICENSE-APACHE
--------------------------------------------------------------------------------
/ec-gpu/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | ../LICENSE-MIT
--------------------------------------------------------------------------------
/ec-gpu/src/lib.rs:
--------------------------------------------------------------------------------
1 | /// The name that is used in the GPU source code to identify the item that is used.
2 | pub trait GpuName {
3 |     /// A unique name for the item.
4 |     ///
5 |     /// To make the uniqueness easier to implement, use the [`name`] macro. It produces a unique
6 |     /// name, based on the module path and the type of the item itself. That identifier might not
7 |     /// be stable across different versions of a crate, but this is OK as kernel sources/binaries
8 |     /// are always bundled with a library and not re-used between versions.
9 |     ///
10 |     /// # Example
11 |     ///
12 |     /// ```
13 |     /// struct Fp;
14 |     ///
15 |     /// impl ec_gpu::GpuName for Fp {
16 |     ///     fn name() -> String {
17 |     ///         ec_gpu::name!()
18 |     ///     }
19 |     /// }
20 |     /// ```
21 |     fn name() -> String;
22 | }
23 | 
24 | /// A prime field that returns the values in a representation that is suited for use on a GPU.
25 | pub trait GpuField: GpuName {
26 |     /// Returns `1` as a vector of 32-bit limbs in little-endian non-Montgomery form (least
27 |     /// significant limb first).
28 |     fn one() -> Vec<u32>;
29 | 
30 |     /// Returns `R ^ 2 mod P` as a vector of 32-bit limbs in little-endian non-Montgomery form
31 |     /// (least significant limb first).
32 |     fn r2() -> Vec<u32>;
33 | 
34 |     /// Returns the field modulus as a vector of 32-bit limbs in non-Montgomery form (least
35 |     /// significant limb first).
36 |     fn modulus() -> Vec<u32>;
37 | 
38 |     /// If the field is an extension field, then the name of the sub-field is returned.
39 |     fn sub_field_name() -> Option<String> {
40 |         None
41 |     }
42 | }
43 | 
44 | /// Macro to get a unique name of an item.
45 | ///
46 | /// The name is a string that consists of the module path and the type name. All non-alphanumeric
47 | /// characters are replaced with underscores, so that it's an identifier that doesn't cause any
48 | /// issues with C compilers.
49 | #[macro_export]
50 | macro_rules! name {
51 |     () => {{
52 |         let mod_path = module_path!();
53 |         let type_name = core::any::type_name::<Self>();
54 |         let name = if type_name.starts_with(mod_path) {
55 |             type_name.into()
56 |         } else {
57 |             [mod_path, "__", type_name].concat()
58 |         };
59 |         name.replace(|c: char| !c.is_ascii_alphanumeric(), "_")
60 |     }};
61 | }
62 |
--------------------------------------------------------------------------------
/gpu-tests/Cargo.toml:
--------------------------------------------------------------------------------
1 | # NOTE vmx 2022-07-07: Using the `__private_bench` feature of `blstrs` is just temporary until
2 | # https://github.com/zkcrypto/group/pull/29 is fixed. Then we won't need the exports of `Fp` and
3 | # `Fp2` any more.
4 | [package]
5 | name = "gpu-tests"
6 | version = "0.1.0"
7 | edition = "2021"
8 | description = "Tests for the ec-gpu project"
9 | homepage = "https://github.com/filecoin-project/ec-gpu"
10 | repository = "https://github.com/filecoin-project/ec-gpu"
11 | license = "MIT/Apache-2.0"
12 | publish = false
13 | 
14 | [dev-dependencies]
15 | blstrs = { version = "0.7.0", features = ["__private_bench"] }
16 | criterion = "0.4"
17 | ec-gpu = "0.2"
18 | ec-gpu-gen = { path = "../ec-gpu-gen", default-features = false }
19 | ff = { version = "0.13.0", default-features = false }
20 | fil_logger = "0.1.6"
21 | group = "0.13.0"
22 | pairing = "0.23.0"
23 | rand = "0.8"
24 | rayon = "1.5.3"
25 | 
26 | [build-dependencies]
27 | blstrs = { version = "0.7.0", features = ["__private_bench"] }
28 | ec-gpu-gen = { path = "../ec-gpu-gen" }
29 | 
30 | [features]
31 | default = ["cuda", "opencl"]
32 | cuda = ["blstrs/gpu", "ec-gpu-gen/cuda"]
33 | opencl = ["blstrs/gpu", "ec-gpu-gen/opencl"]
34 | 
35 | [[bench]]
36 | name = "multiexp"
37 | harness = false
38 |
--------------------------------------------------------------------------------
/gpu-tests/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 | ../LICENSE-APACHE
--------------------------------------------------------------------------------
/gpu-tests/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | ../LICENSE-MIT
--------------------------------------------------------------------------------
/gpu-tests/README.md:
--------------------------------------------------------------------------------
1 | # `gpu-tests`
2 | 
3 | This crate is for running tests. Usually kernels are created at compile time, hence a `build.rs` is needed. `ec-gpu-gen` is just a toolkit and doesn't provide pre-defined kernels. This crate separates those concerns and also shows how `ec-gpu-gen` can be used.
4 | 
5 | ## Usage
6 | 
7 | ```console
8 | cargo test
9 | ```
10 | 
11 | ## Feature flags
12 | 
13 | By default `cuda` and `opencl` are enabled. If you want to run the tests/benchmarks with only one of them, you can do so:
14 | 
15 | ```console
16 | cargo test --no-default-features --features opencl
17 | ```
18 | 
19 | ## License
20 | 
21 | Licensed under either of
22 | 
23 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
24 |   http://www.apache.org/licenses/LICENSE-2.0)
25 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
26 | 
27 | at your option.
28 | 
29 | ### Contribution
30 | 
31 | Unless you explicitly state otherwise, any contribution intentionally
32 | submitted for inclusion in the work by you, as defined in the Apache-2.0
33 | license, shall be dual licensed as above, without any additional terms or
34 | conditions.
35 |
--------------------------------------------------------------------------------
/gpu-tests/benches/multiexp.rs:
--------------------------------------------------------------------------------
1 | use std::sync::Arc;
2 | 
3 | use blstrs::Bls12;
4 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
5 | use ec_gpu_gen::{
6 |     multiexp::MultiexpKernel, multiexp_cpu::SourceBuilder, rust_gpu_tools::Device,
7 |     threadpool::Worker,
8 | };
9 | use ff::{Field, PrimeField};
10 | use group::{Curve, Group};
11 | use pairing::Engine;
12 | use rayon::iter::{IntoParallelIterator, ParallelIterator};
13 | 
14 | /// The power that will be used to define the maximum number of elements. The number of elements
15 | /// is `2^MAX_ELEMENTS_POWER`.
16 | const MAX_ELEMENTS_POWER: usize = 29;
17 | /// The maximum number of elements for this benchmark.
18 | const MAX_ELEMENTS: usize = 1 << MAX_ELEMENTS_POWER;
19 | 
20 | fn bench_multiexp(crit: &mut Criterion) {
21 |     let mut group = crit.benchmark_group("multiexp");
22 |     // The differences between runs are so small that a low sample size is OK.
23 |     group.sample_size(10);
24 | 
25 |     let devices = Device::all();
26 |     let programs = devices
27 |         .iter()
28 |         .map(|device| ec_gpu_gen::program!(device))
29 |         .collect::<Result<Vec<_>, _>>()
30 |         .expect("Cannot create programs!");
31 |     let mut kern = MultiexpKernel::<<Bls12 as Engine>::G1Affine>::create(programs, &devices)
32 |         .expect("Cannot initialize kernel!");
33 |     let pool = Worker::new();
34 |     let max_bases: Vec<_> = (0..MAX_ELEMENTS)
35 |         .into_par_iter()
36 |         .map(|_| <Bls12 as Engine>::G1::random(rand::thread_rng()).to_affine())
37 |         .collect();
38 |     let max_exponents: Vec<_> = (0..MAX_ELEMENTS)
39 |         .into_par_iter()
40 |         .map(|_| <Bls12 as Engine>::Fr::random(rand::thread_rng()).to_repr())
41 |         .collect();
42 | 
43 |     let num_elements: Vec<_> = (10..MAX_ELEMENTS_POWER).map(|shift| 1 << shift).collect();
44 |     for num in num_elements {
45 |         group.bench_with_input(BenchmarkId::from_parameter(num), &num, |bencher, &num| {
46 |             let (bases, skip) = SourceBuilder::get((Arc::new(max_bases[0..num].to_vec()), 0));
47 |             let exponents = Arc::new(max_exponents[0..num].to_vec());
48 | 
49 |             bencher.iter(|| {
50 |                 black_box(
51 |                     kern.multiexp(&pool, bases.clone(), exponents.clone(), skip)
52 |                         .unwrap(),
53 |                 );
54 |             })
55 |         });
56 |     }
57 |     group.finish();
58 | }
59 | 
60 | criterion_group!(benches, bench_multiexp);
61 | criterion_main!(benches);
62 |
--------------------------------------------------------------------------------
/gpu-tests/build.rs:
--------------------------------------------------------------------------------
1 | #[cfg(not(any(feature = "cuda", feature = "opencl")))]
2 | fn main() {}
3 | 
4 | #[cfg(any(feature = "cuda", feature = "opencl"))]
5 | fn main() {
6 |     use blstrs::{Fp, Fp2, G1Affine, G2Affine, Scalar};
7 |     use ec_gpu_gen::SourceBuilder;
8 | 
9 |     let source_builder = SourceBuilder::new()
10 |         .add_fft::<Scalar>()
11 |         .add_multiexp::<G1Affine, Fp>()
12 |         .add_multiexp::<G2Affine, Fp2>();
13 |     ec_gpu_gen::generate(&source_builder);
14 | }
15 |
--------------------------------------------------------------------------------
/gpu-tests/src/lib.rs:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/gpu-tests/tests/fft.rs:
--------------------------------------------------------------------------------
1 | #![cfg(any(feature = "cuda", feature = "opencl"))]
2 | 
3 | use std::time::Instant;
4 | 
5 | use blstrs::Scalar as Fr;
6 | use ec_gpu_gen::{
7 |     fft::FftKernel,
8 |     fft_cpu::{parallel_fft, serial_fft},
9 |     rust_gpu_tools::Device,
10 |     threadpool::Worker,
11 | };
12 | use ff::{Field, PrimeField};
13 | 
14 | fn omega<F: PrimeField>(num_coeffs: usize) -> F {
15 |     // Compute omega, the 2^exp primitive root of unity.
16 |     let exp = (num_coeffs as f32).log2().floor() as u32;
17 |     let mut omega = F::ROOT_OF_UNITY;
18 |     for _ in exp..F::S {
19 |         omega = omega.square();
20 |     }
21 |     omega
22 | }
23 | 
24 | #[test]
25 | pub fn gpu_fft_consistency() {
26 |     fil_logger::maybe_init();
27 |     let mut rng = rand::thread_rng();
28 | 
29 |     let worker = Worker::new();
30 |     let log_threads = worker.log_num_threads();
31 |     let devices = Device::all();
32 |     let programs = devices
33 |         .iter()
34 |         .map(|device| ec_gpu_gen::program!(device))
35 |         .collect::<Result<Vec<_>, _>>()
36 |         .expect("Cannot create programs!");
37 |     let mut kern = FftKernel::<Fr>::create(programs).expect("Cannot initialize kernel!");
38 | 
39 |     for log_d in 1..=20 {
40 |         let d = 1 << log_d;
41 | 
42 |         let mut v1_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
43 |         let v1_omega = omega::<Fr>(v1_coeffs.len());
44 |         let mut v2_coeffs = v1_coeffs.clone();
45 |         let v2_omega = v1_omega;
46 | 
47 |         println!("Testing FFT for {} elements...", d);
48 | 
49 |         let mut now = Instant::now();
50 |         kern.radix_fft_many(&mut [&mut v1_coeffs], &[v1_omega], &[log_d])
51 |             .expect("GPU FFT failed!");
52 |         let gpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
53 |         println!("GPU took {}ms.", gpu_dur);
54 | 
55 |         now = Instant::now();
56 |         if log_d <= log_threads {
57 |             serial_fft::<Fr>(&mut v2_coeffs, &v2_omega, log_d);
58 |         } else {
59 |             parallel_fft::<Fr>(&mut v2_coeffs, &worker, &v2_omega, log_d, log_threads);
60 |         }
61 |         let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
62 |         println!("CPU ({} cores) took {}ms.", 1 << log_threads, cpu_dur);
63 | 
64 |         println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32);
65 | 
66 |         assert!(v1_coeffs == v2_coeffs);
67 |         println!("============================");
68 |     }
69 | }
70 | 
71 | #[test]
72 | pub fn gpu_fft_many_consistency() {
73 |     fil_logger::maybe_init();
74 |     let mut rng = rand::thread_rng();
75 | 
76 |     let worker = Worker::new();
77 |     let log_threads = worker.log_num_threads();
78 |     let devices = Device::all();
79 |     let programs = devices
80 |         .iter()
81 |         .map(|device| ec_gpu_gen::program!(device))
82 |         .collect::<Result<Vec<_>, _>>()
83 |         .expect("Cannot create programs!");
84 |     let mut kern = FftKernel::<Fr>::create(programs).expect("Cannot initialize kernel!");
85 | 
86 |     for log_d in 1..=20 {
87 |         let d = 1 << log_d;
88 | 
89 |         let mut v11_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
90 |         let mut v12_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
91 |         let mut v13_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
92 |         let v11_omega = omega::<Fr>(v11_coeffs.len());
93 |         let v12_omega = omega::<Fr>(v12_coeffs.len());
94 |         let v13_omega = omega::<Fr>(v13_coeffs.len());
95 | 
96 |         let mut v21_coeffs = v11_coeffs.clone();
97 |         let mut v22_coeffs = v12_coeffs.clone();
98 |         let mut v23_coeffs = v13_coeffs.clone();
99 |         let v21_omega = v11_omega;
100 |         let v22_omega = v12_omega;
101 |         let v23_omega = v13_omega;
102 | 
103 |         println!("Testing FFT3 for {} elements...", d);
104 | 
105 |         let mut now = Instant::now();
106 |         kern.radix_fft_many(
107 |             &mut [&mut v11_coeffs, &mut v12_coeffs, &mut v13_coeffs],
108 |             &[v11_omega, v12_omega, v13_omega],
109 |             &[log_d, log_d, log_d],
110 |         )
111 |         .expect("GPU FFT failed!");
112 |         let gpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
113 |         println!("GPU took {}ms.", gpu_dur);
114 | 
115 |         now = Instant::now();
116 |         if log_d <= log_threads {
117 |             serial_fft::<Fr>(&mut v21_coeffs, &v21_omega, log_d);
118 |             serial_fft::<Fr>(&mut v22_coeffs, &v22_omega, log_d);
119 |             serial_fft::<Fr>(&mut v23_coeffs, &v23_omega, log_d);
120 |         } else {
121 |             parallel_fft::<Fr>(&mut v21_coeffs, &worker, &v21_omega, log_d, log_threads);
122 |             parallel_fft::<Fr>(&mut v22_coeffs, &worker, &v22_omega, log_d, log_threads);
123 |             parallel_fft::<Fr>(&mut v23_coeffs, &worker, &v23_omega, log_d, log_threads);
124 |         }
125 |         let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
126 |         println!("CPU ({} cores) took {}ms.", 1 << log_threads, cpu_dur);
127 | 
128 |         println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32);
129 | 
130 |         assert!(v11_coeffs == v21_coeffs);
131 |         assert!(v12_coeffs == v22_coeffs);
132 |         assert!(v13_coeffs == v23_coeffs);
133 | 
134 |         println!("============================");
135 |     }
136 | }
137 |
--------------------------------------------------------------------------------
/gpu-tests/tests/multiexp.rs:
--------------------------------------------------------------------------------
1 | #![cfg(any(feature = "cuda", feature = "opencl"))]
2 | 
3 | use std::sync::Arc;
4 | use std::time::Instant;
5 | 
6 | use blstrs::Bls12;
7 | use ec_gpu::GpuName;
8 | use ec_gpu_gen::multiexp_cpu::{multiexp_cpu, FullDensity, QueryDensity, SourceBuilder};
9 | use ec_gpu_gen::{
10 |     multiexp::MultiexpKernel, program, rust_gpu_tools::Device, threadpool::Worker, EcError,
11 | };
12 | use ff::{Field, PrimeField};
13 | use group::Curve;
14 | use group::{prime::PrimeCurveAffine, Group};
15 | use pairing::Engine;
16 | 
17 | fn multiexp_gpu<Q, D, G, S>(
18 |     pool: &Worker,
19 |     bases: S,
20 |     density_map: D,
21 |     exponents: Arc<Vec<<G::Scalar as PrimeField>::Repr>>,
22 |     kern: &mut MultiexpKernel<G>,
23 | ) -> Result<G::Curve, EcError>
24 | where
25 |     for<'a> &'a Q: QueryDensity,
26 |     D: Send + Sync + 'static + Clone + AsRef<Q>,
27 |     G: PrimeCurveAffine + GpuName,
28 |     S: SourceBuilder<G>,
29 | {
30 |     let exps = density_map.as_ref().generate_exps::<G::Scalar>(exponents);
31 |     let (bss, skip) = bases.get();
32 |     kern.multiexp(pool, bss, exps, skip).map_err(Into::into)
33 | }
34 | 
35 | #[test]
36 | fn gpu_multiexp_consistency() {
37 |     fil_logger::maybe_init();
38 |     const MAX_LOG_D: usize = 16;
39 |     const START_LOG_D: usize = 10;
40 |     let devices = Device::all();
41 |     let programs = devices
42 |         .iter()
43 |         .map(|device| crate::program!(device))
44 |         .collect::<Result<Vec<_>, _>>()
45 |         .expect("Cannot create programs!");
46 |     let mut kern = MultiexpKernel::<<Bls12 as Engine>::G1Affine>::create(programs, &devices)
47 |         .expect("Cannot initialize kernel!");
48 |     let pool = Worker::new();
49 | 
50 |     let mut rng = rand::thread_rng();
51 | 
52 |     let mut bases = (0..(1 << START_LOG_D))
53 |         .map(|_| <Bls12 as Engine>::G1::random(&mut rng).to_affine())
54 |         .collect::<Vec<_>>();
55 | 
56 |     for log_d in START_LOG_D..=MAX_LOG_D {
57 |         let g = Arc::new(bases.clone());
58 | 
59 |         let samples = 1 << log_d;
60 |         println!("Testing Multiexp for {} elements...", samples);
61 | 
62 |         let v = Arc::new(
63 |             (0..samples)
64 |                 .map(|_| <Bls12 as Engine>::Fr::random(&mut rng).to_repr())
65 |                 .collect::<Vec<_>>(),
66 |         );
67 | 
68 |         let mut now = Instant::now();
69 |         let gpu = multiexp_gpu(&pool, (g.clone(), 0), FullDensity, v.clone(), &mut kern).unwrap();
70 |         let gpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
71 |         println!("GPU took {}ms.", gpu_dur);
72 | 
73 |         now = Instant::now();
74 |         let cpu = multiexp_cpu(&pool, (g.clone(), 0), FullDensity, v.clone())
75 |             .wait()
76 |             .unwrap();
77 |         let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
78 |         println!("CPU took {}ms.", cpu_dur);
79 | 
80 |         println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32);
81 | 
82 |         assert_eq!(cpu, gpu);
83 | 
84 |         println!("============================");
85 | 
86 |         bases = [bases.clone(), bases.clone()].concat();
87 |     }
88 | }
89 |
--------------------------------------------------------------------------------
/release.toml:
--------------------------------------------------------------------------------
1 | consolidate-commits = false
2 |
--------------------------------------------------------------------------------
/rust-toolchain:
--------------------------------------------------------------------------------
1 | 1.83.0
2 |
--------------------------------------------------------------------------------