├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── COPYRIGHT
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
├── ec-gpu-gen
    ├── Cargo.toml
    ├── LICENSE-APACHE
    ├── LICENSE-MIT
    ├── build.rs
    └── src
    │   ├── cl
    │       ├── common.cl
    │       ├── ec.cl
    │       ├── fft.cl
    │       ├── field.cl
    │       ├── field2.cl
    │       ├── multiexp.cl
    │       └── test.cl
    │   ├── error.rs
    │   ├── fft.rs
    │   ├── fft_cpu.rs
    │   ├── lib.rs
    │   ├── multiexp.rs
    │   ├── multiexp_cpu.rs
    │   ├── program.rs
    │   ├── source.rs
    │   └── threadpool.rs
├── ec-gpu
    ├── Cargo.toml
    ├── LICENSE-APACHE
    ├── LICENSE-MIT
    └── src
    │   └── lib.rs
├── gpu-tests
    ├── Cargo.toml
    ├── LICENSE-APACHE
    ├── LICENSE-MIT
    ├── README.md
    ├── benches
    │   └── multiexp.rs
    ├── build.rs
    ├── src
    │   └── lib.rs
    └── tests
    │   ├── fft.rs
    │   └── multiexp.rs
├── release.toml
└── rust-toolchain


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on: [pull_request, push]
 4 | 
 5 | # Cancel a job if there's a new on on the same branch started.
 6 | # Based on https://stackoverflow.com/questions/58895283/stop-already-running-workflow-job-in-github-actions/67223051#67223051
 7 | concurrency:
 8 |   group: ${{ github.ref }}
 9 |   cancel-in-progress: true
10 | 
11 | env:
12 |   CARGO_INCREMENTAL: 0
13 |   RUST_BACKTRACE: 1
14 |   # Faster crates.io index checkout.
15 |   CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
16 | 
17 | jobs:
18 |   set-msrv:
19 |     runs-on: ubuntu-latest
20 |     outputs:
21 |       msrv: ${{ steps.msrv.outputs.MSRV }}
22 |     steps:
23 |       - uses: actions/checkout@v4
24 |       - name: Get MSRV from rust-toolchain
25 |         id: msrv
26 |         run: |
27 |           MSRV=$(cat ./rust-toolchain)
28 |           echo "MSRV=$MSRV" | tee --append "$GITHUB_OUTPUT"
29 | 
30 |   linux:
31 |     needs: set-msrv
32 |     runs-on: ubuntu-latest
33 |     name: Build and test
34 |     steps:
35 |       - uses: actions/checkout@v4
36 |       - uses: dtolnay/rust-toolchain@master
37 |         with:
38 |           toolchain: ${{needs.set-msrv.outputs.msrv}}
39 |       - name: Install required packages
40 |         run: sudo apt install --no-install-recommends --yes ocl-icd-opencl-dev nvidia-cuda-toolkit
41 |       - name: Build with default features
42 |         run: cargo build --workspace
43 |       # Machine has no GPU installed, hence run without the `cuda` or `opencl` feature.
44 |       - name: Run tests without default features
45 |         run: cargo test --workspace --no-default-features -- --nocapture
46 | 
47 |   clippy_check:
48 |     needs: set-msrv
49 |     runs-on: ubuntu-latest
50 |     name: Clippy
51 |     steps:
52 |       - uses: actions/checkout@v4
53 |       - uses: dtolnay/rust-toolchain@master
54 |         with:
55 |           toolchain: ${{ needs.set-msrv.outputs.msrv }}
56 |           components: clippy
57 |       - name: Install required packages
58 |         run: sudo apt install --no-install-recommends --yes ocl-icd-opencl-dev nvidia-cuda-dev
59 |       - name: Run cargo clippy default features
60 |         run: cargo clippy --workspace --all-targets -- -D warnings
61 |       - name: Run cargo clippy with cuda and opencl features
62 |         run: cargo clippy --workspace --all-targets --features cuda,opencl -- -D warnings
63 |       - name: Run cargo clippy with cuda feature
64 |         run: cargo clippy --workspace --all-targets --no-default-features --features cuda -- -D warnings
65 |       - name: Run cargo clippy with opencl feature
66 |         run: cargo clippy --workspace --all-targets --no-default-features --features opencl -- -D warnings
67 | 
68 |   check_fmt_and_docs:
69 |     needs: set-msrv
70 |     runs-on: ubuntu-latest
71 |     name: Checking fmt and docs
72 |     steps:
73 |       - uses: actions/checkout@v4
74 |       - uses: dtolnay/rust-toolchain@master
75 |         with:
76 |           toolchain: ${{ needs.set-msrv.outputs.msrv }}
77 |           components: rustfmt
78 |       - name: fmt
79 |         run: cargo fmt --all -- --check
80 |       - name: Docs
81 |         env:
82 |           # Making sure that the documentation can be built without having the NVIDIA toolkit
83 |           # installed.
84 |           DOCS_RS: true
85 |         run: |
86 |           cargo rustdoc --package ec-gpu --all-features -- -D warnings
87 |           cargo rustdoc --package ec-gpu-gen --all-features -- -D warnings
88 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | Cargo.lock
3 | 


--------------------------------------------------------------------------------
/COPYRIGHT:
--------------------------------------------------------------------------------
 1 | Copyrights in the "ff-cl-gen" library are retained by their contributors. No
 2 | copyright assignment is required to contribute to the "ff-cl-gen" library.
 3 | 
 4 | The "ff-cl-gen" library is licensed under either of
 5 | 
 6 |  * Apache License, Version 2.0, (see ./LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0)
 7 |  * MIT license (see ./LICENSE-MIT or http://opensource.org/licenses/MIT)
 8 | 
 9 | at your option.
10 | 
11 | Unless you explicitly state otherwise, any contribution intentionally
12 | submitted for inclusion in the work by you, as defined in the Apache-2.0
13 | license, shall be dual licensed as above, without any additional terms or
14 | conditions.
15 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [workspace]
2 | members = [
3 |   "ec-gpu",
4 |   "ec-gpu-gen",
5 |   "gpu-tests",
6 | ]
7 | resolver = "2"
8 | 


--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------
  1 |                               Apache License
  2 |                         Version 2.0, January 2004
  3 |                      http://www.apache.org/licenses/
  4 | 
  5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 | 1. Definitions.
  8 | 
  9 |    "License" shall mean the terms and conditions for use, reproduction,
 10 |    and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |    "Licensor" shall mean the copyright owner or entity authorized by
 13 |    the copyright owner that is granting the License.
 14 | 
 15 |    "Legal Entity" shall mean the union of the acting entity and all
 16 |    other entities that control, are controlled by, or are under common
 17 |    control with that entity. For the purposes of this definition,
 18 |    "control" means (i) the power, direct or indirect, to cause the
 19 |    direction or management of such entity, whether by contract or
 20 |    otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |    outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |    "You" (or "Your") shall mean an individual or Legal Entity
 24 |    exercising permissions granted by this License.
 25 | 
 26 |    "Source" form shall mean the preferred form for making modifications,
 27 |    including but not limited to software source code, documentation
 28 |    source, and configuration files.
 29 | 
 30 |    "Object" form shall mean any form resulting from mechanical
 31 |    transformation or translation of a Source form, including but
 32 |    not limited to compiled object code, generated documentation,
 33 |    and conversions to other media types.
 34 | 
 35 |    "Work" shall mean the work of authorship, whether in Source or
 36 |    Object form, made available under the License, as indicated by a
 37 |    copyright notice that is included in or attached to the work
 38 |    (an example is provided in the Appendix below).
 39 | 
 40 |    "Derivative Works" shall mean any work, whether in Source or Object
 41 |    form, that is based on (or derived from) the Work and for which the
 42 |    editorial revisions, annotations, elaborations, or other modifications
 43 |    represent, as a whole, an original work of authorship. For the purposes
 44 |    of this License, Derivative Works shall not include works that remain
 45 |    separable from, or merely link (or bind by name) to the interfaces of,
 46 |    the Work and Derivative Works thereof.
 47 | 
 48 |    "Contribution" shall mean any work of authorship, including
 49 |    the original version of the Work and any modifications or additions
 50 |    to that Work or Derivative Works thereof, that is intentionally
 51 |    submitted to Licensor for inclusion in the Work by the copyright owner
 52 |    or by an individual or Legal Entity authorized to submit on behalf of
 53 |    the copyright owner. For the purposes of this definition, "submitted"
 54 |    means any form of electronic, verbal, or written communication sent
 55 |    to the Licensor or its representatives, including but not limited to
 56 |    communication on electronic mailing lists, source code control systems,
 57 |    and issue tracking systems that are managed by, or on behalf of, the
 58 |    Licensor for the purpose of discussing and improving the Work, but
 59 |    excluding communication that is conspicuously marked or otherwise
 60 |    designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |    "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |    on behalf of whom a Contribution has been received by Licensor and
 64 |    subsequently incorporated within the Work.
 65 | 
 66 | 2. Grant of Copyright License. Subject to the terms and conditions of
 67 |    this License, each Contributor hereby grants to You a perpetual,
 68 |    worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |    copyright license to reproduce, prepare Derivative Works of,
 70 |    publicly display, publicly perform, sublicense, and distribute the
 71 |    Work and such Derivative Works in Source or Object form.
 72 | 
 73 | 3. Grant of Patent License. Subject to the terms and conditions of
 74 |    this License, each Contributor hereby grants to You a perpetual,
 75 |    worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |    (except as stated in this section) patent license to make, have made,
 77 |    use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |    where such license applies only to those patent claims licensable
 79 |    by such Contributor that are necessarily infringed by their
 80 |    Contribution(s) alone or by combination of their Contribution(s)
 81 |    with the Work to which such Contribution(s) was submitted. If You
 82 |    institute patent litigation against any entity (including a
 83 |    cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |    or a Contribution incorporated within the Work constitutes direct
 85 |    or contributory patent infringement, then any patent licenses
 86 |    granted to You under this License for that Work shall terminate
 87 |    as of the date such litigation is filed.
 88 | 
 89 | 4. Redistribution. You may reproduce and distribute copies of the
 90 |    Work or Derivative Works thereof in any medium, with or without
 91 |    modifications, and in Source or Object form, provided that You
 92 |    meet the following conditions:
 93 | 
 94 |    (a) You must give any other recipients of the Work or
 95 |        Derivative Works a copy of this License; and
 96 | 
 97 |    (b) You must cause any modified files to carry prominent notices
 98 |        stating that You changed the files; and
 99 | 
100 |    (c) You must retain, in the Source form of any Derivative Works
101 |        that You distribute, all copyright, patent, trademark, and
102 |        attribution notices from the Source form of the Work,
103 |        excluding those notices that do not pertain to any part of
104 |        the Derivative Works; and
105 | 
106 |    (d) If the Work includes a "NOTICE" text file as part of its
107 |        distribution, then any Derivative Works that You distribute must
108 |        include a readable copy of the attribution notices contained
109 |        within such NOTICE file, excluding those notices that do not
110 |        pertain to any part of the Derivative Works, in at least one
111 |        of the following places: within a NOTICE text file distributed
112 |        as part of the Derivative Works; within the Source form or
113 |        documentation, if provided along with the Derivative Works; or,
114 |        within a display generated by the Derivative Works, if and
115 |        wherever such third-party notices normally appear. The contents
116 |        of the NOTICE file are for informational purposes only and
117 |        do not modify the License. You may add Your own attribution
118 |        notices within Derivative Works that You distribute, alongside
119 |        or as an addendum to the NOTICE text from the Work, provided
120 |        that such additional attribution notices cannot be construed
121 |        as modifying the License.
122 | 
123 |    You may add Your own copyright statement to Your modifications and
124 |    may provide additional or different license terms and conditions
125 |    for use, reproduction, or distribution of Your modifications, or
126 |    for any such Derivative Works as a whole, provided Your use,
127 |    reproduction, and distribution of the Work otherwise complies with
128 |    the conditions stated in this License.
129 | 
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 |    any Contribution intentionally submitted for inclusion in the Work
132 |    by You to the Licensor shall be under the terms and conditions of
133 |    this License, without any additional terms or conditions.
134 |    Notwithstanding the above, nothing herein shall supersede or modify
135 |    the terms of any separate license agreement you may have executed
136 |    with Licensor regarding such Contributions.
137 | 
138 | 6. Trademarks. This License does not grant permission to use the trade
139 |    names, trademarks, service marks, or product names of the Licensor,
140 |    except as required for reasonable and customary use in describing the
141 |    origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 |    agreed to in writing, Licensor provides the Work (and each
145 |    Contributor provides its Contributions) on an "AS IS" BASIS,
146 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |    implied, including, without limitation, any warranties or conditions
148 |    of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |    PARTICULAR PURPOSE. You are solely responsible for determining the
150 |    appropriateness of using or redistributing the Work and assume any
151 |    risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 |    whether in tort (including negligence), contract, or otherwise,
155 |    unless required by applicable law (such as deliberate and grossly
156 |    negligent acts) or agreed to in writing, shall any Contributor be
157 |    liable to You for damages, including any direct, indirect, special,
158 |    incidental, or consequential damages of any character arising as a
159 |    result of this License or out of the use or inability to use the
160 |    Work (including but not limited to damages for loss of goodwill,
161 |    work stoppage, computer failure or malfunction, or any and all
162 |    other commercial damages or losses), even if such Contributor
163 |    has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 |    the Work or Derivative Works thereof, You may choose to offer,
167 |    and charge a fee for, acceptance of support, warranty, indemnity,
168 |    or other liability obligations and/or rights consistent with this
169 |    License. However, in accepting such obligations, You may act only
170 |    on Your own behalf and on Your sole responsibility, not on behalf
171 |    of any other Contributor, and only if You agree to indemnify,
172 |    defend, and hold each Contributor harmless for any liability
173 |    incurred by, or claims asserted against, such Contributor by reason
174 |    of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 |    To apply the Apache License to your work, attach the following
181 |    boilerplate notice, with the fields enclosed by brackets "[]"
182 |    replaced with your own identifying information. (Don't include
183 |    the brackets!)  The text should be enclosed in the appropriate
184 |    comment syntax for the file format. We also recommend that a
185 |    file or class name and description of purpose be included on the
186 |    same "printed page" as the copyright notice for easier
187 |    identification within third-party archives.
188 | 
189 | Copyright [yyyy] [name of copyright owner]
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | 	http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 


--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | Permission is hereby granted, free of charge, to any
 2 | person obtaining a copy of this software and associated
 3 | documentation files (the "Software"), to deal in the
 4 | Software without restriction, including without
 5 | limitation the rights to use, copy, modify, merge,
 6 | publish, distribute, sublicense, and/or sell copies of
 7 | the Software, and to permit persons to whom the Software
 8 | is furnished to do so, subject to the following
 9 | conditions:
10 | 
11 | The above copyright notice and this permission notice
12 | shall be included in all copies or substantial portions
13 | of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 | DEALINGS IN THE SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # `ec-gpu` & `ec-gpu-gen`
  2 | 
  3 | [![crates.io][crate-image-ec-gpu]][crate-link-ec-gpu]
  4 | [![Documentation][doc-image-ec-gpu]][doc-link-ec-gpu]
  5 | [![Build Status][build-image-ec-gpu]][build-link-ec-gpu]
  6 | ![minimum rustc 1.51][msrv-image-ec-gpu]
  7 | [![dependency status][deps-image-ec-gpu]][deps-link-ec-gpu]
  8 | 
  9 | [![crates.io][crate-image-ec-gpu-gen]][crate-link-ec-gpu-gen]
 10 | [![Documentation][doc-image-ec-gpu-gen]][doc-link-ec-gpu-gen]
 11 | [![Build Status][build-image-ec-gpu-gen]][build-link-ec-gpu-gen]
 12 | ![minimum rustc 1.51][msrv-image-ec-gpu-gen]
 13 | [![dependency status][deps-image-ec-gpu-gen]][deps-link-ec-gpu-gen]
 14 | 
 15 | CUDA/OpenCL code generator for finite-field arithmetic over prime fields and elliptic curve arithmetic constructed with Rust.
 16 | 
 17 | Notes:
 18 |  - Limbs are 32/64-bit long, by your choice (on CUDA only 32-bit limbs are supported).
 19 |  - The library assumes that the most significant bit of your prime-field is unset. This allows for cheap reductions.
 20 | 
 21 | ## Usage
 22 | 
 23 | ### Quickstart
 24 | 
 25 | Generating CUDA/OpenCL codes for `blstrs` Scalar elements:
 26 | 
 27 | ```rust
 28 | use blstrs::Scalar;
 29 | use ec_gpu_gen::SourceBuilder;
 30 | 
 31 | let source = SourceBuilder::new()
 32 |     .add_field::<Scalar>()
 33 |     .build_64_bit_limbs();
 34 | ```
 35 | 
 36 | ### Integration into your library
 37 | 
 38 | This crate usually creates GPU kernels at compile-time. CUDA generates a [fatbin], which OpenCL only generates the source code, which is then compiled at run-time.
 39 | 
 40 | In order to make things easier to use, there are helper functions available. You would put some code into `build.rs`, that generates the kernels, and some code into your library which then consumes those generated kernels. The kernels will be directly embedded into your program/library. If something goes wrong, you will get an error at compile-time.
 41 | 
 42 | In this example we will make use of the FFT functionality. Add to your `build.rs`:
 43 | 
 44 | ```rust
 45 | use blstrs::Scalar;
 46 | use ec_gpu_gen::SourceBuilder;
 47 | 
 48 | fn main() {
 49 |     let source_builder = SourceBuilder::new().add_fft::<Scalar>()
 50 |     ec_gpu_gen::generate(&source_builder);
 51 | }
 52 | ```
 53 | 
 54 | The `ec_gpu_gen::generate()` takes care of the actual code generation/compilation. It will automatically create a CUDA and/or OpenCL kernel. It will define two environment variables, which are meant for internal use. `_EC_GPU_CUDA_KERNEL_FATBIN` that points to the compiled CUDA kernel, and `_EC_GPU_OPENCL_KERNEL_SOURCE` that points to the generated OpenCL source.
 55 | 
 56 | Those variables are then picked up by the `ec_gpu_gen::program!()` macro, which generates a program, for a given GPU device. Using FFT within your library would then look like this:
 57 | 
 58 | ```rust
 59 | use ec_gpu_gen::{
 60 |     rust_gpu_tools::Device,
 61 | };
 62 | 
 63 | let devices = Device::all();
 64 | let programs = devices
 65 |     .iter()
 66 |     .map(|device| ec_gpu_gen::program!(device))
 67 |     .collect::<Result<_, _>>()
 68 |     .expect("Cannot create programs!");
 69 | 
 70 | let mut kern = FftKernel::<Fr>::create(programs).expect("Cannot initialize kernel!");
 71 | kern.radix_fft_many(&mut [&mut coeffs], &[omega], &[log_d]).expect("GPU FFT failed!");
 72 | ```
 73 | 
 74 | ## Feature flags
 75 | 
 76 | This crate supports CUDA and OpenCL, which can be enabled with the `cuda` and `opencl` feature flags.
 77 | 
 78 | ### Environment variables
 79 | 
 80 |  - `EC_GPU_CUDA_NVCC_ARGS`
 81 | 
 82 |      By default the CUDA kernel is compiled for several architectures, which may take a long time. `EC_GPU_CUDA_NVCC_ARGS` can be used to override those arguments. The input and output file will still be automatically set.
 83 | 
 84 |     ```console
 85 |     // Example for compiling the kernel for only the Turing architecture.
 86 |     EC_GPU_CUDA_NVCC_ARGS="--fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75"
 87 |     ```
 88 | 
 89 |  - `EC_GPU_FRAMEWORK`
 90 | 
 91 |     When the library is built with both CUDA and OpenCL support, you can choose which one to use at run time. The default is `cuda`, when you set nothing or any other (invalid) value. The other possible value is `opencl`.
 92 | 
 93 |     ```console
 94 |     // Example for setting it to OpenCL.
 95 |     EC_GPU_FRAMEWORK=opencl
 96 |     ```
 97 | 
 98 |  - `EC_GPU_NUM_THREADS`
 99 | 
100 |    Restricts the number of threads used in the library. The default is set to the number of logical cores reported on the machine.
101 | 
102 |     ```console
103 |     // Example for setting the maximum number of threads to 6.
104 |     EC_GPU_NUM_THREADS=6
105 |     ```
106 | 
107 | 
108 | ## License
109 | 
110 | Licensed under either of
111 | 
112 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
113 |    http://www.apache.org/licenses/LICENSE-2.0)
114 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
115 | 
116 | at your option.
117 | 
118 | ### Contribution
119 | 
120 | Unless you explicitly state otherwise, any contribution intentionally
121 | submitted for inclusion in the work by you, as defined in the Apache-2.0
122 | license, shall be dual licensed as above, without any additional terms or
123 | conditions.
124 | 
125 | 
126 | [crate-image-ec-gpu]: https://img.shields.io/crates/v/ec-gpu.svg
127 | [crate-link-ec-gpu]: https://crates.io/crates/ec-gpu
128 | [doc-image-ec-gpu]: https://docs.rs/ec-gpu/badge.svg
129 | [doc-link-ec-gpu]: https://docs.rs/ec-gpu
130 | [build-image-ec-gpu]: https://circleci.com/gh/filecoin-project/ec-gpu.svg?style=shield
131 | [build-link-ec-gpu]: https://circleci.com/gh/filecoin-project/ec-gpu
132 | [msrv-image-ec-gpu]: https://img.shields.io/badge/rustc-1.54+-blue.svg
133 | [deps-image-ec-gpu]: https://deps.rs/repo/github/filecoin-projectt/ec-gpu/status.svg
134 | [deps-link-ec-gpu]: https://deps.rs/repo/github/filecoin-project/ec-gpu
135 | 
136 | 
137 | [crate-image-ec-gpu-gen]: https://img.shields.io/crates/v/ec-gpu-gen.svg
138 | [crate-link-ec-gpu-gen]: https://crates.io/crates/ec-gpu-gen
139 | [doc-image-ec-gpu-gen]: https://docs.rs/ec-gpu-gen/badge.svg
140 | [doc-link-ec-gpu-gen]: https://docs.rs/ec-gpu-gen
141 | [build-image-ec-gpu-gen]: https://circleci.com/gh/filecoin-project/ec-gpu.svg?style=shield
142 | [build-link-ec-gpu-gen]: https://circleci.com/gh/filecoin-project/ec-gpu
143 | [msrv-image-ec-gpu-gen]: https://img.shields.io/badge/rustc-1.54+-blue.svg
144 | [deps-image-ec-gpu-gen]: https://deps.rs/repo/github/filecoin-projectt/ec-gpu/status.svg
145 | [deps-link-ec-gpu-gen]: https://deps.rs/repo/github/filecoin-project/ec-gpu
146 | 
147 | [Fast Fourier transform]: https://en.wikipedia.org/wiki/Fast_Fourier_transform
148 | [fatbin]: https://en.wikipedia.org/wiki/Fat_binary#Heterogeneous_computing
149 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ec-gpu-gen"
 3 | version = "0.7.1"
 4 | authors = ["dignifiedquire <me@dignifiedquire.com>"]
 5 | edition = "2021"
 6 | description = "Code generator for field and eliptic curve operations on the GPUs"
 7 | homepage = "https://github.com/filecoin-project/ff-cl-gen"
 8 | repository = "https://github.com/filecoin-project/ff-cl-gen"
 9 | license = "MIT/Apache-2.0"
10 | rust-version = "1.83.0"
11 | 
12 | [dependencies]
13 | bitvec = "1.0.1"
14 | crossbeam-channel = "0.5.1"
15 | ec-gpu = "0.2.0"
16 | execute = "0.2.9"
17 | ff = { version = "0.13.0", default-features = false }
18 | group = "0.13.0"
19 | hex = "0.4"
20 | log = "0.4.14"
21 | num_cpus = "1.13.0"
22 | once_cell = "1.8.0"
23 | rayon = "1.5.1"
24 | rust-gpu-tools = { version = "0.7.0", default-features = false, optional = true }
25 | sha2 = "0.10"
26 | thiserror = "1.0.30"
27 | yastl = "0.1.2"
28 | 
29 | [dev-dependencies]
30 | # NOTE vmx 2022-07-07: Using the `__private_bench` feature of `blstrs` is just
31 | # temporarily until https://github.com/zkcrypto/group/pull/29 is fixed. Then
32 | # we won't need the exports of `Fp` and `Fp2` any more.
33 | #blstrs = { version = "0.6.0", features = ["__private_bench"], optional = true }
34 | blstrs = { version = "0.7.0", features = ["__private_bench", "gpu"] }
35 | rand = "0.8"
36 | lazy_static = "1.2"
37 | pairing = "0.23.0"
38 | temp-env = "0.3.0"
39 | rand_core = "0.6.3"
40 | rand_xorshift = "0.3.0"
41 | 
42 | [features]
43 | default = []
44 | cuda = ["rust-gpu-tools/cuda"]
45 | opencl = ["rust-gpu-tools/opencl"]
46 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 | ../LICENSE-APACHE


--------------------------------------------------------------------------------
/ec-gpu-gen/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | ../LICENSE-MIT


--------------------------------------------------------------------------------
/ec-gpu-gen/build.rs:
--------------------------------------------------------------------------------
1 | fn main() {
2 |     // This is intentionally empty. It's only there so that `OUT_DIR` is set, which is
3 |     // used by one of the tests.
4 | }
5 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/common.cl:
--------------------------------------------------------------------------------
  1 | // Defines to make the code work with both, CUDA and OpenCL
  2 | #ifdef __NVCC__
  3 |   #define DEVICE __device__
  4 |   #define GLOBAL
  5 |   #define KERNEL extern "C" __global__
  6 |   #define LOCAL __shared__
  7 |   #define CONSTANT __constant__
  8 | 
  9 |   #define GET_GLOBAL_ID() blockIdx.x * blockDim.x + threadIdx.x
 10 |   #define GET_GROUP_ID() blockIdx.x
 11 |   #define GET_LOCAL_ID() threadIdx.x
 12 |   #define GET_LOCAL_SIZE() blockDim.x
 13 |   #define BARRIER_LOCAL() __syncthreads()
 14 | 
 15 |   typedef unsigned char uchar;
 16 | 
 17 |   #define CUDA
 18 | #else // OpenCL
 19 |   #define DEVICE
 20 |   #define GLOBAL __global
 21 |   #define KERNEL __kernel
 22 |   #define LOCAL __local
 23 |   #define CONSTANT __constant
 24 | 
 25 |   #define GET_GLOBAL_ID() get_global_id(0)
 26 |   #define GET_GROUP_ID() get_group_id(0)
 27 |   #define GET_LOCAL_ID() get_local_id(0)
 28 |   #define GET_LOCAL_SIZE() get_local_size(0)
 29 |   #define BARRIER_LOCAL() barrier(CLK_LOCAL_MEM_FENCE)
 30 | #endif
 31 | 
 32 | #ifdef __NV_CL_C_VERSION
 33 | #define OPENCL_NVIDIA
 34 | #endif
 35 | 
 36 | #if defined(__WinterPark__) || defined(__BeaverCreek__) || defined(__Turks__) || \
 37 |     defined(__Caicos__) || defined(__Tahiti__) || defined(__Pitcairn__) || \
 38 |     defined(__Capeverde__) || defined(__Cayman__) || defined(__Barts__) || \
 39 |     defined(__Cypress__) || defined(__Juniper__) || defined(__Redwood__) || \
 40 |     defined(__Cedar__) || defined(__ATI_RV770__) || defined(__ATI_RV730__) || \
 41 |     defined(__ATI_RV710__) || defined(__Loveland__) || defined(__GPU__) || \
 42 |     defined(__Hawaii__)
 43 | #define AMD
 44 | #endif
 45 | 
 46 | // Returns a * b + c + d, puts the carry in d
 47 | DEVICE ulong mac_with_carry_64(ulong a, ulong b, ulong c, ulong *d) {
 48 |   #if defined(OPENCL_NVIDIA) || defined(CUDA)
 49 |     ulong lo, hi;
 50 |     asm("mad.lo.cc.u64 %0, %2, %3, %4;\r\n"
 51 |         "madc.hi.u64 %1, %2, %3, 0;\r\n"
 52 |         "add.cc.u64 %0, %0, %5;\r\n"
 53 |         "addc.u64 %1, %1, 0;\r\n"
 54 |         : "=l"(lo), "=l"(hi) : "l"(a), "l"(b), "l"(c), "l"(*d));
 55 |     *d = hi;
 56 |     return lo;
 57 |   #else
 58 |     ulong lo = a * b + c;
 59 |     ulong hi = mad_hi(a, b, (ulong)(lo < c));
 60 |     a = lo;
 61 |     lo += *d;
 62 |     hi += (lo < a);
 63 |     *d = hi;
 64 |     return lo;
 65 |   #endif
 66 | }
 67 | 
 68 | // Returns a + b, puts the carry in d
 69 | DEVICE ulong add_with_carry_64(ulong a, ulong *b) {
 70 |   #if defined(OPENCL_NVIDIA) || defined(CUDA)
 71 |     ulong lo, hi;
 72 |     asm("add.cc.u64 %0, %2, %3;\r\n"
 73 |         "addc.u64 %1, 0, 0;\r\n"
 74 |         : "=l"(lo), "=l"(hi) : "l"(a), "l"(*b));
 75 |     *b = hi;
 76 |     return lo;
 77 |   #else
 78 |     ulong lo = a + *b;
 79 |     *b = lo < a;
 80 |     return lo;
 81 |   #endif
 82 | }
 83 | 
 84 | // Returns a * b + c + d, puts the carry in d
 85 | DEVICE uint mac_with_carry_32(uint a, uint b, uint c, uint *d) {
 86 |   ulong res = (ulong)a * b + c + *d;
 87 |   *d = res >> 32;
 88 |   return res;
 89 | }
 90 | 
 91 | // Returns a + b, puts the carry in b
 92 | DEVICE uint add_with_carry_32(uint a, uint *b) {
 93 |   #if defined(OPENCL_NVIDIA) || defined(CUDA)
 94 |     uint lo, hi;
 95 |     asm("add.cc.u32 %0, %2, %3;\r\n"
 96 |         "addc.u32 %1, 0, 0;\r\n"
 97 |         : "=r"(lo), "=r"(hi) : "r"(a), "r"(*b));
 98 |     *b = hi;
 99 |     return lo;
100 |   #else
101 |     uint lo = a + *b;
102 |     *b = lo < a;
103 |     return lo;
104 |   #endif
105 | }
106 | 
107 | // Reverse the given bits. It's used by the FFT kernel.
108 | DEVICE uint bitreverse(uint n, uint bits) {
109 |   uint r = 0;
110 |   for(int i = 0; i < bits; i++) {
111 |     r = (r << 1) | (n & 1);
112 |     n >>= 1;
113 |   }
114 |   return r;
115 | }
116 | 
117 | #ifdef CUDA
118 | // CUDA doesn't support local buffers ("dynamic shared memory" in CUDA lingo) as function
119 | // arguments, but only a single globally defined extern value. Use `uchar` so that it is always
120 | // allocated by the number of bytes.
121 | extern LOCAL uchar cuda_shared[];
122 | 
123 | typedef uint uint32_t;
124 | typedef int  int32_t;
125 | typedef uint limb;
126 | 
127 | DEVICE inline uint32_t add_cc(uint32_t a, uint32_t b) {
128 |   uint32_t r;
129 | 
130 |   asm volatile ("add.cc.u32 %0, %1, %2;" : "=r"(r) : "r"(a), "r"(b));
131 |   return r;
132 | }
133 | 
134 | DEVICE inline uint32_t addc_cc(uint32_t a, uint32_t b) {
135 |   uint32_t r;
136 | 
137 |   asm volatile ("addc.cc.u32 %0, %1, %2;" : "=r"(r) : "r"(a), "r"(b));
138 |   return r;
139 | }
140 | 
141 | DEVICE inline uint32_t addc(uint32_t a, uint32_t b) {
142 |   uint32_t r;
143 | 
144 |   asm volatile ("addc.u32 %0, %1, %2;" : "=r"(r) : "r"(a), "r"(b));
145 |   return r;
146 | }
147 | 
148 | 
149 | DEVICE inline uint32_t madlo(uint32_t a, uint32_t b, uint32_t c) {
150 |   uint32_t r;
151 | 
152 |   asm volatile ("mad.lo.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
153 |   return r;
154 | }
155 | 
156 | DEVICE inline uint32_t madlo_cc(uint32_t a, uint32_t b, uint32_t c) {
157 |   uint32_t r;
158 | 
159 |   asm volatile ("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
160 |   return r;
161 | }
162 | 
163 | DEVICE inline uint32_t madloc_cc(uint32_t a, uint32_t b, uint32_t c) {
164 |   uint32_t r;
165 | 
166 |   asm volatile ("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
167 |   return r;
168 | }
169 | 
170 | DEVICE inline uint32_t madloc(uint32_t a, uint32_t b, uint32_t c) {
171 |   uint32_t r;
172 | 
173 |   asm volatile ("madc.lo.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
174 |   return r;
175 | }
176 | 
177 | DEVICE inline uint32_t madhi(uint32_t a, uint32_t b, uint32_t c) {
178 |   uint32_t r;
179 | 
180 |   asm volatile ("mad.hi.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
181 |   return r;
182 | }
183 | 
184 | DEVICE inline uint32_t madhi_cc(uint32_t a, uint32_t b, uint32_t c) {
185 |   uint32_t r;
186 | 
187 |   asm volatile ("mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
188 |   return r;
189 | }
190 | 
191 | DEVICE inline uint32_t madhic_cc(uint32_t a, uint32_t b, uint32_t c) {
192 |   uint32_t r;
193 | 
194 |   asm volatile ("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
195 |   return r;
196 | }
197 | 
198 | DEVICE inline uint32_t madhic(uint32_t a, uint32_t b, uint32_t c) {
199 |   uint32_t r;
200 | 
201 |   asm volatile ("madc.hi.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
202 |   return r;
203 | }
204 | 
205 | typedef struct {
206 |   int32_t _position;
207 | } chain_t;
208 | 
209 | DEVICE inline
210 | void chain_init(chain_t *c) {
211 |   c->_position = 0;
212 | }
213 | 
214 | DEVICE inline
215 | uint32_t chain_add(chain_t *ch, uint32_t a, uint32_t b) {
216 |   uint32_t r;
217 | 
218 |   ch->_position++;
219 |   if(ch->_position==1)
220 |     r=add_cc(a, b);
221 |   else
222 |     r=addc_cc(a, b);
223 |   return r;
224 | }
225 | 
226 | DEVICE inline
227 | uint32_t chain_madlo(chain_t *ch, uint32_t a, uint32_t b, uint32_t c) {
228 |   uint32_t r;
229 | 
230 |   ch->_position++;
231 |   if(ch->_position==1)
232 |     r=madlo_cc(a, b, c);
233 |   else
234 |     r=madloc_cc(a, b, c);
235 |   return r;
236 | }
237 | 
238 | DEVICE inline
239 | uint32_t chain_madhi(chain_t *ch, uint32_t a, uint32_t b, uint32_t c) {
240 |   uint32_t r;
241 | 
242 |   ch->_position++;
243 |   if(ch->_position==1)
244 |     r=madhi_cc(a, b, c);
245 |   else
246 |     r=madhic_cc(a, b, c);
247 |   return r;
248 | }
249 | #endif
250 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/ec.cl:
--------------------------------------------------------------------------------
  1 | // Elliptic curve operations (Short Weierstrass Jacobian form)
  2 | 
  3 | #define POINT_ZERO ((POINT_jacobian){FIELD_ZERO, FIELD_ONE, FIELD_ZERO})
  4 | 
  5 | typedef struct {
  6 |   FIELD x;
  7 |   FIELD y;
  8 | } POINT_affine;
  9 | 
 10 | typedef struct {
 11 |   FIELD x;
 12 |   FIELD y;
 13 |   FIELD z;
 14 | } POINT_jacobian;
 15 | 
 16 | // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
 17 | DEVICE POINT_jacobian POINT_double(POINT_jacobian inp) {
 18 |   const FIELD local_zero = FIELD_ZERO;
 19 |   if(FIELD_eq(inp.z, local_zero)) {
 20 |       return inp;
 21 |   }
 22 | 
 23 |   const FIELD a = FIELD_sqr(inp.x); // A = X1^2
 24 |   const FIELD b = FIELD_sqr(inp.y); // B = Y1^2
 25 |   FIELD c = FIELD_sqr(b); // C = B^2
 26 | 
 27 |   // D = 2*((X1+B)2-A-C)
 28 |   FIELD d = FIELD_add(inp.x, b);
 29 |   d = FIELD_sqr(d); d = FIELD_sub(FIELD_sub(d, a), c); d = FIELD_double(d);
 30 | 
 31 |   const FIELD e = FIELD_add(FIELD_double(a), a); // E = 3*A
 32 |   const FIELD f = FIELD_sqr(e);
 33 | 
 34 |   inp.z = FIELD_mul(inp.y, inp.z); inp.z = FIELD_double(inp.z); // Z3 = 2*Y1*Z1
 35 |   inp.x = FIELD_sub(FIELD_sub(f, d), d); // X3 = F-2*D
 36 | 
 37 |   // Y3 = E*(D-X3)-8*C
 38 |   c = FIELD_double(c); c = FIELD_double(c); c = FIELD_double(c);
 39 |   inp.y = FIELD_sub(FIELD_mul(FIELD_sub(d, inp.x), e), c);
 40 | 
 41 |   return inp;
 42 | }
 43 | 
 44 | // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl
 45 | DEVICE POINT_jacobian POINT_add_mixed(POINT_jacobian a, POINT_affine b) {
 46 |   const FIELD local_zero = FIELD_ZERO;
 47 |   if(FIELD_eq(a.z, local_zero)) {
 48 |     const FIELD local_one = FIELD_ONE;
 49 |     a.x = b.x;
 50 |     a.y = b.y;
 51 |     a.z = local_one;
 52 |     return a;
 53 |   }
 54 | 
 55 |   const FIELD z1z1 = FIELD_sqr(a.z);
 56 |   const FIELD u2 = FIELD_mul(b.x, z1z1);
 57 |   const FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1);
 58 | 
 59 |   if(FIELD_eq(a.x, u2) && FIELD_eq(a.y, s2)) {
 60 |       return POINT_double(a);
 61 |   }
 62 | 
 63 |   const FIELD h = FIELD_sub(u2, a.x); // H = U2-X1
 64 |   const FIELD hh = FIELD_sqr(h); // HH = H^2
 65 |   FIELD i = FIELD_double(hh); i = FIELD_double(i); // I = 4*HH
 66 |   FIELD j = FIELD_mul(h, i); // J = H*I
 67 |   FIELD r = FIELD_sub(s2, a.y); r = FIELD_double(r); // r = 2*(S2-Y1)
 68 |   const FIELD v = FIELD_mul(a.x, i);
 69 | 
 70 |   POINT_jacobian ret;
 71 | 
 72 |   // X3 = r^2 - J - 2*V
 73 |   ret.x = FIELD_sub(FIELD_sub(FIELD_sqr(r), j), FIELD_double(v));
 74 | 
 75 |   // Y3 = r*(V-X3)-2*Y1*J
 76 |   j = FIELD_mul(a.y, j); j = FIELD_double(j);
 77 |   ret.y = FIELD_sub(FIELD_mul(FIELD_sub(v, ret.x), r), j);
 78 | 
 79 |   // Z3 = (Z1+H)^2-Z1Z1-HH
 80 |   ret.z = FIELD_add(a.z, h); ret.z = FIELD_sub(FIELD_sub(FIELD_sqr(ret.z), z1z1), hh);
 81 |   return ret;
 82 | }
 83 | 
 84 | // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 85 | DEVICE POINT_jacobian POINT_add(POINT_jacobian a, POINT_jacobian b) {
 86 | 
 87 |   const FIELD local_zero = FIELD_ZERO;
 88 |   if(FIELD_eq(a.z, local_zero)) return b;
 89 |   if(FIELD_eq(b.z, local_zero)) return a;
 90 | 
 91 |   const FIELD z1z1 = FIELD_sqr(a.z); // Z1Z1 = Z1^2
 92 |   const FIELD z2z2 = FIELD_sqr(b.z); // Z2Z2 = Z2^2
 93 |   const FIELD u1 = FIELD_mul(a.x, z2z2); // U1 = X1*Z2Z2
 94 |   const FIELD u2 = FIELD_mul(b.x, z1z1); // U2 = X2*Z1Z1
 95 |   FIELD s1 = FIELD_mul(FIELD_mul(a.y, b.z), z2z2); // S1 = Y1*Z2*Z2Z2
 96 |   const FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1); // S2 = Y2*Z1*Z1Z1
 97 | 
 98 |   if(FIELD_eq(u1, u2) && FIELD_eq(s1, s2))
 99 |     return POINT_double(a);
100 |   else {
101 |     const FIELD h = FIELD_sub(u2, u1); // H = U2-U1
102 |     FIELD i = FIELD_double(h); i = FIELD_sqr(i); // I = (2*H)^2
103 |     const FIELD j = FIELD_mul(h, i); // J = H*I
104 |     FIELD r = FIELD_sub(s2, s1); r = FIELD_double(r); // r = 2*(S2-S1)
105 |     const FIELD v = FIELD_mul(u1, i); // V = U1*I
106 |     a.x = FIELD_sub(FIELD_sub(FIELD_sub(FIELD_sqr(r), j), v), v); // X3 = r^2 - J - 2*V
107 | 
108 |     // Y3 = r*(V - X3) - 2*S1*J
109 |     a.y = FIELD_mul(FIELD_sub(v, a.x), r);
110 |     s1 = FIELD_mul(s1, j); s1 = FIELD_double(s1); // S1 = S1 * J * 2
111 |     a.y = FIELD_sub(a.y, s1);
112 | 
113 |     // Z3 = ((Z1+Z2)^2 - Z1Z1 - Z2Z2)*H
114 |     a.z = FIELD_add(a.z, b.z); a.z = FIELD_sqr(a.z);
115 |     a.z = FIELD_sub(FIELD_sub(a.z, z1z1), z2z2);
116 |     a.z = FIELD_mul(a.z, h);
117 | 
118 |     return a;
119 |   }
120 | }
121 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/fft.cl:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * FFT algorithm is inspired from: http://www.bealto.com/gpu-fft_group-1.html
 3 |  */
 4 | KERNEL void FIELD_radix_fft(GLOBAL FIELD* x, // Source buffer
 5 |                       GLOBAL FIELD* y, // Destination buffer
 6 |                       GLOBAL FIELD* pq, // Precalculated twiddle factors
 7 |                       GLOBAL FIELD* omegas, // [omega, omega^2, omega^4, ...]
 8 |                       LOCAL FIELD* u_arg, // Local buffer to store intermediary values
 9 |                       uint n, // Number of elements
10 |                       uint lgp, // Log2 of `p` (Read more in the link above)
11 |                       uint deg, // 1=>radix2, 2=>radix4, 3=>radix8, ...
12 |                       uint max_deg) // Maximum degree supported, according to `pq` and `omegas`
13 | {
14 | // CUDA doesn't support local buffers ("shared memory" in CUDA lingo) as function arguments,
15 | // ignore that argument and use the globally defined extern memory instead.
16 | #ifdef CUDA
17 |   // There can only be a single dynamic shared memory item, hence cast it to the type we need.
18 |   FIELD* u = (FIELD*)cuda_shared;
19 | #else
20 |   LOCAL FIELD* u = u_arg;
21 | #endif
22 | 
23 |   uint lid = GET_LOCAL_ID();
24 |   uint lsize = GET_LOCAL_SIZE();
25 |   uint index = GET_GROUP_ID();
26 |   uint t = n >> deg;
27 |   uint p = 1 << lgp;
28 |   uint k = index & (p - 1);
29 | 
30 |   x += index;
31 |   y += ((index - k) << deg) + k;
32 | 
33 |   uint count = 1 << deg; // 2^deg
34 |   uint counth = count >> 1; // Half of count
35 | 
36 |   uint counts = count / lsize * lid;
37 |   uint counte = counts + count / lsize;
38 | 
39 |   // Compute powers of twiddle
40 |   const FIELD twiddle = FIELD_pow_lookup(omegas, (n >> lgp >> deg) * k);
41 |   FIELD tmp = FIELD_pow(twiddle, counts);
42 |   for(uint i = counts; i < counte; i++) {
43 |     u[i] = FIELD_mul(tmp, x[i*t]);
44 |     tmp = FIELD_mul(tmp, twiddle);
45 |   }
46 |   BARRIER_LOCAL();
47 | 
48 |   const uint pqshift = max_deg - deg;
49 |   for(uint rnd = 0; rnd < deg; rnd++) {
50 |     const uint bit = counth >> rnd;
51 |     for(uint i = counts >> 1; i < counte >> 1; i++) {
52 |       const uint di = i & (bit - 1);
53 |       const uint i0 = (i << 1) - di;
54 |       const uint i1 = i0 + bit;
55 |       tmp = u[i0];
56 |       u[i0] = FIELD_add(u[i0], u[i1]);
57 |       u[i1] = FIELD_sub(tmp, u[i1]);
58 |       if(di != 0) u[i1] = FIELD_mul(pq[di << rnd << pqshift], u[i1]);
59 |     }
60 | 
61 |     BARRIER_LOCAL();
62 |   }
63 | 
64 |   for(uint i = counts >> 1; i < counte >> 1; i++) {
65 |     y[i*p] = u[bitreverse(i, deg)];
66 |     y[(i+counth)*p] = u[bitreverse(i + counth, deg)];
67 |   }
68 | }
69 | 
70 | /// Multiplies all of the elements by `field`
71 | KERNEL void FIELD_mul_by_field(GLOBAL FIELD* elements,
72 |                         uint n,
73 |                         FIELD field) {
74 |   const uint gid = GET_GLOBAL_ID();
75 |   elements[gid] = FIELD_mul(elements[gid], field);
76 | }
77 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/field.cl:
--------------------------------------------------------------------------------
  1 | // FinalityLabs - 2019
  2 | // Arbitrary size prime-field arithmetic library (add, sub, mul, pow)
  3 | 
  4 | #define FIELD_BITS (FIELD_LIMBS * FIELD_LIMB_BITS)
  5 | #if FIELD_LIMB_BITS == 32
  6 |   #define FIELD_mac_with_carry mac_with_carry_32
  7 |   #define FIELD_add_with_carry add_with_carry_32
  8 | #elif FIELD_LIMB_BITS == 64
  9 |   #define FIELD_mac_with_carry mac_with_carry_64
 10 |   #define FIELD_add_with_carry add_with_carry_64
 11 | #endif
 12 | 
 13 | // Greater than or equal
 14 | DEVICE bool FIELD_gte(FIELD a, FIELD b) {
 15 |   for(char i = FIELD_LIMBS - 1; i >= 0; i--){
 16 |     if(a.val[i] > b.val[i])
 17 |       return true;
 18 |     if(a.val[i] < b.val[i])
 19 |       return false;
 20 |   }
 21 |   return true;
 22 | }
 23 | 
 24 | // Equals
 25 | DEVICE bool FIELD_eq(FIELD a, FIELD b) {
 26 |   for(uchar i = 0; i < FIELD_LIMBS; i++)
 27 |     if(a.val[i] != b.val[i])
 28 |       return false;
 29 |   return true;
 30 | }
 31 | 
 32 | // Normal addition
 33 | #if defined(OPENCL_NVIDIA) || defined(CUDA)
 34 |   #define FIELD_add_ FIELD_add_nvidia
 35 |   #define FIELD_sub_ FIELD_sub_nvidia
 36 | #else
 37 |   DEVICE FIELD FIELD_add_(FIELD a, FIELD b) {
 38 |     bool carry = 0;
 39 |     for(uchar i = 0; i < FIELD_LIMBS; i++) {
 40 |       FIELD_limb old = a.val[i];
 41 |       a.val[i] += b.val[i] + carry;
 42 |       carry = carry ? old >= a.val[i] : old > a.val[i];
 43 |     }
 44 |     return a;
 45 |   }
 46 |   FIELD FIELD_sub_(FIELD a, FIELD b) {
 47 |     bool borrow = 0;
 48 |     for(uchar i = 0; i < FIELD_LIMBS; i++) {
 49 |       FIELD_limb old = a.val[i];
 50 |       a.val[i] -= b.val[i] + borrow;
 51 |       borrow = borrow ? old <= a.val[i] : old < a.val[i];
 52 |     }
 53 |     return a;
 54 |   }
 55 | #endif
 56 | 
 57 | // Modular subtraction
 58 | DEVICE FIELD FIELD_sub(FIELD a, FIELD b) {
 59 |   FIELD res = FIELD_sub_(a, b);
 60 |   if(!FIELD_gte(a, b)) res = FIELD_add_(res, FIELD_P);
 61 |   return res;
 62 | }
 63 | 
 64 | // Modular addition
 65 | DEVICE FIELD FIELD_add(FIELD a, FIELD b) {
 66 |   FIELD res = FIELD_add_(a, b);
 67 |   if(FIELD_gte(res, FIELD_P)) res = FIELD_sub_(res, FIELD_P);
 68 |   return res;
 69 | }
 70 | 
 71 | 
 72 | #ifdef CUDA
 73 | // Code based on the work from Supranational, with special thanks to Niall Emmart:
 74 | //
 75 | // We would like to acknowledge Niall Emmart at Nvidia for his significant
 76 | // contribution of concepts and code for generating efficient SASS on
 77 | // Nvidia GPUs. The following papers may be of interest:
 78 | //     Optimizing Modular Multiplication for NVIDIA's Maxwell GPUs
 79 | //     https://ieeexplore.ieee.org/document/7563271
 80 | //
 81 | //     Faster modular exponentiation using double precision floating point
 82 | //     arithmetic on the GPU
 83 | //     https://ieeexplore.ieee.org/document/8464792
 84 | 
 85 | DEVICE void FIELD_reduce(uint32_t accLow[FIELD_LIMBS], uint32_t np0, uint32_t fq[FIELD_LIMBS]) {
 86 |   // accLow is an IN and OUT vector
 87 |   // count must be even
 88 |   const uint32_t count = FIELD_LIMBS;
 89 |   uint32_t accHigh[FIELD_LIMBS];
 90 |   uint32_t bucket=0, lowCarry=0, highCarry=0, q;
 91 |   int32_t  i, j;
 92 | 
 93 |   #pragma unroll
 94 |   for(i=0;i<count;i++)
 95 |     accHigh[i]=0;
 96 | 
 97 |   // bucket is used so we don't have to push a carry all the way down the line
 98 | 
 99 |   #pragma unroll
100 |   for(j=0;j<count;j++) {       // main iteration
101 |     if(j%2==0) {
102 |       add_cc(bucket, 0xFFFFFFFF);
103 |       accLow[0]=addc_cc(accLow[0], accHigh[1]);
104 |       bucket=addc(0, 0);
105 | 
106 |       q=accLow[0]*np0;
107 | 
108 |       chain_t chain1;
109 |       chain_init(&chain1);
110 | 
111 |       #pragma unroll
112 |       for(i=0;i<count;i+=2) {
113 |         accLow[i]=chain_madlo(&chain1, q, fq[i], accLow[i]);
114 |         accLow[i+1]=chain_madhi(&chain1, q, fq[i], accLow[i+1]);
115 |       }
116 |       lowCarry=chain_add(&chain1, 0, 0);
117 | 
118 |       chain_t chain2;
119 |       chain_init(&chain2);
120 |       for(i=0;i<count-2;i+=2) {
121 |         accHigh[i]=chain_madlo(&chain2, q, fq[i+1], accHigh[i+2]);    // note the shift down
122 |         accHigh[i+1]=chain_madhi(&chain2, q, fq[i+1], accHigh[i+3]);
123 |       }
124 |       accHigh[i]=chain_madlo(&chain2, q, fq[i+1], highCarry);
125 |       accHigh[i+1]=chain_madhi(&chain2, q, fq[i+1], 0);
126 |     }
127 |     else {
128 |       add_cc(bucket, 0xFFFFFFFF);
129 |       accHigh[0]=addc_cc(accHigh[0], accLow[1]);
130 |       bucket=addc(0, 0);
131 | 
132 |       q=accHigh[0]*np0;
133 | 
134 |       chain_t chain3;
135 |       chain_init(&chain3);
136 |       #pragma unroll
137 |       for(i=0;i<count;i+=2) {
138 |         accHigh[i]=chain_madlo(&chain3, q, fq[i], accHigh[i]);
139 |         accHigh[i+1]=chain_madhi(&chain3, q, fq[i], accHigh[i+1]);
140 |       }
141 |       highCarry=chain_add(&chain3, 0, 0);
142 | 
143 |       chain_t chain4;
144 |       chain_init(&chain4);
145 |       for(i=0;i<count-2;i+=2) {
146 |         accLow[i]=chain_madlo(&chain4, q, fq[i+1], accLow[i+2]);    // note the shift down
147 |         accLow[i+1]=chain_madhi(&chain4, q, fq[i+1], accLow[i+3]);
148 |       }
149 |       accLow[i]=chain_madlo(&chain4, q, fq[i+1], lowCarry);
150 |       accLow[i+1]=chain_madhi(&chain4, q, fq[i+1], 0);
151 |     }
152 |   }
153 | 
154 |   // at this point, accHigh needs to be shifted back a word and added to accLow
155 |   // we'll use one other trick.  Bucket is either 0 or 1 at this point, so we
156 |   // can just push it into the carry chain.
157 | 
158 |   chain_t chain5;
159 |   chain_init(&chain5);
160 |   chain_add(&chain5, bucket, 0xFFFFFFFF);    // push the carry into the chain
161 |   #pragma unroll
162 |   for(i=0;i<count-1;i++)
163 |     accLow[i]=chain_add(&chain5, accLow[i], accHigh[i+1]);
164 |   accLow[i]=chain_add(&chain5, accLow[i], highCarry);
165 | }
166 | 
167 | // Requirement: yLimbs >= xLimbs
168 | DEVICE inline
169 | void FIELD_mult_v1(uint32_t *x, uint32_t *y, uint32_t *xy) {
170 |   const uint32_t xLimbs  = FIELD_LIMBS;
171 |   const uint32_t yLimbs  = FIELD_LIMBS;
172 |   const uint32_t xyLimbs = FIELD_LIMBS * 2;
173 |   uint32_t temp[FIELD_LIMBS * 2];
174 |   uint32_t carry = 0;
175 | 
176 |   #pragma unroll
177 |   for (int32_t i = 0; i < xyLimbs; i++) {
178 |     temp[i] = 0;
179 |   }
180 | 
181 |   #pragma unroll
182 |   for (int32_t i = 0; i < xLimbs; i++) {
183 |     chain_t chain1;
184 |     chain_init(&chain1);
185 |     #pragma unroll
186 |     for (int32_t j = 0; j < yLimbs; j++) {
187 |       if ((i + j) % 2 == 1) {
188 |         temp[i + j - 1] = chain_madlo(&chain1, x[i], y[j], temp[i + j - 1]);
189 |         temp[i + j]     = chain_madhi(&chain1, x[i], y[j], temp[i + j]);
190 |       }
191 |     }
192 |     if (i % 2 == 1) {
193 |       temp[i + yLimbs - 1] = chain_add(&chain1, 0, 0);
194 |     }
195 |   }
196 | 
197 |   #pragma unroll
198 |   for (int32_t i = xyLimbs - 1; i > 0; i--) {
199 |     temp[i] = temp[i - 1];
200 |   }
201 |   temp[0] = 0;
202 | 
203 |   #pragma unroll
204 |   for (int32_t i = 0; i < xLimbs; i++) {
205 |     chain_t chain2;
206 |     chain_init(&chain2);
207 | 
208 |     #pragma unroll
209 |     for (int32_t j = 0; j < yLimbs; j++) {
210 |       if ((i + j) % 2 == 0) {
211 |         temp[i + j]     = chain_madlo(&chain2, x[i], y[j], temp[i + j]);
212 |         temp[i + j + 1] = chain_madhi(&chain2, x[i], y[j], temp[i + j + 1]);
213 |       }
214 |     }
215 |     if ((i + yLimbs) % 2 == 0 && i != yLimbs - 1) {
216 |       temp[i + yLimbs]     = chain_add(&chain2, temp[i + yLimbs], carry);
217 |       temp[i + yLimbs + 1] = chain_add(&chain2, temp[i + yLimbs + 1], 0);
218 |       carry = chain_add(&chain2, 0, 0);
219 |     }
220 |     if ((i + yLimbs) % 2 == 1 && i != yLimbs - 1) {
221 |       carry = chain_add(&chain2, carry, 0);
222 |     }
223 |   }
224 | 
225 |   #pragma unroll
226 |   for(int32_t i = 0; i < xyLimbs; i++) {
227 |     xy[i] = temp[i];
228 |   }
229 | }
230 | 
231 | DEVICE FIELD FIELD_mul_nvidia(FIELD a, FIELD b) {
232 |   // Perform full multiply
233 |   limb ab[2 * FIELD_LIMBS];
234 |   FIELD_mult_v1(a.val, b.val, ab);
235 | 
236 |   uint32_t io[FIELD_LIMBS];
237 |   #pragma unroll
238 |   for(int i=0;i<FIELD_LIMBS;i++) {
239 |     io[i]=ab[i];
240 |   }
241 |   FIELD_reduce(io, FIELD_INV, FIELD_P.val);
242 | 
243 |   // Add io to the upper words of ab
244 |   ab[FIELD_LIMBS] = add_cc(ab[FIELD_LIMBS], io[0]);
245 |   int j;
246 |   #pragma unroll
247 |   for (j = 1; j < FIELD_LIMBS - 1; j++) {
248 |     ab[j + FIELD_LIMBS] = addc_cc(ab[j + FIELD_LIMBS], io[j]);
249 |   }
250 |   ab[2 * FIELD_LIMBS - 1] = addc(ab[2 * FIELD_LIMBS - 1], io[FIELD_LIMBS - 1]);
251 | 
252 |   FIELD r;
253 |   #pragma unroll
254 |   for (int i = 0; i < FIELD_LIMBS; i++) {
255 |     r.val[i] = ab[i + FIELD_LIMBS];
256 |   }
257 | 
258 |   if (FIELD_gte(r, FIELD_P)) {
259 |     r = FIELD_sub_(r, FIELD_P);
260 |   }
261 | 
262 |   return r;
263 | }
264 | 
265 | #endif
266 | 
267 | // Modular multiplication
268 | DEVICE FIELD FIELD_mul_default(FIELD a, FIELD b) {
269 |   /* CIOS Montgomery multiplication, inspired from Tolga Acar's thesis:
270 |    * https://www.microsoft.com/en-us/research/wp-content/uploads/1998/06/97Acar.pdf
271 |    * Learn more:
272 |    * https://en.wikipedia.org/wiki/Montgomery_modular_multiplication
273 |    * https://alicebob.cryptoland.net/understanding-the-montgomery-reduction-algorithm/
274 |    */
275 |   FIELD_limb t[FIELD_LIMBS + 2] = {0};
276 |   for(uchar i = 0; i < FIELD_LIMBS; i++) {
277 |     FIELD_limb carry = 0;
278 |     for(uchar j = 0; j < FIELD_LIMBS; j++)
279 |       t[j] = FIELD_mac_with_carry(a.val[j], b.val[i], t[j], &carry);
280 |     t[FIELD_LIMBS] = FIELD_add_with_carry(t[FIELD_LIMBS], &carry);
281 |     t[FIELD_LIMBS + 1] = carry;
282 | 
283 |     carry = 0;
284 |     FIELD_limb m = FIELD_INV * t[0];
285 |     FIELD_mac_with_carry(m, FIELD_P.val[0], t[0], &carry);
286 |     for(uchar j = 1; j < FIELD_LIMBS; j++)
287 |       t[j - 1] = FIELD_mac_with_carry(m, FIELD_P.val[j], t[j], &carry);
288 | 
289 |     t[FIELD_LIMBS - 1] = FIELD_add_with_carry(t[FIELD_LIMBS], &carry);
290 |     t[FIELD_LIMBS] = t[FIELD_LIMBS + 1] + carry;
291 |   }
292 | 
293 |   FIELD result;
294 |   for(uchar i = 0; i < FIELD_LIMBS; i++) result.val[i] = t[i];
295 | 
296 |   if(FIELD_gte(result, FIELD_P)) result = FIELD_sub_(result, FIELD_P);
297 | 
298 |   return result;
299 | }
300 | 
301 | #ifdef CUDA
302 | DEVICE FIELD FIELD_mul(FIELD a, FIELD b) {
303 |   return FIELD_mul_nvidia(a, b);
304 | }
305 | #else
306 | DEVICE FIELD FIELD_mul(FIELD a, FIELD b) {
307 |   return FIELD_mul_default(a, b);
308 | }
309 | #endif
310 | 
311 | // Squaring is a special case of multiplication which can be done ~1.5x faster.
312 | // https://stackoverflow.com/a/16388571/1348497
313 | DEVICE FIELD FIELD_sqr(FIELD a) {
314 |   return FIELD_mul(a, a);
315 | }
316 | 
317 | // Left-shift the limbs by one bit and subtract by modulus in case of overflow.
318 | // Faster version of FIELD_add(a, a)
319 | DEVICE FIELD FIELD_double(FIELD a) {
320 |   for(uchar i = FIELD_LIMBS - 1; i >= 1; i--)
321 |     a.val[i] = (a.val[i] << 1) | (a.val[i - 1] >> (FIELD_LIMB_BITS - 1));
322 |   a.val[0] <<= 1;
323 |   if(FIELD_gte(a, FIELD_P)) a = FIELD_sub_(a, FIELD_P);
324 |   return a;
325 | }
326 | 
327 | // Modular exponentiation (Exponentiation by Squaring)
328 | // https://en.wikipedia.org/wiki/Exponentiation_by_squaring
329 | DEVICE FIELD FIELD_pow(FIELD base, uint exponent) {
330 |   FIELD res = FIELD_ONE;
331 |   while(exponent > 0) {
332 |     if (exponent & 1)
333 |       res = FIELD_mul(res, base);
334 |     exponent = exponent >> 1;
335 |     base = FIELD_sqr(base);
336 |   }
337 |   return res;
338 | }
339 | 
340 | 
341 | // Store squares of the base in a lookup table for faster evaluation.
342 | DEVICE FIELD FIELD_pow_lookup(GLOBAL FIELD *bases, uint exponent) {
343 |   FIELD res = FIELD_ONE;
344 |   uint i = 0;
345 |   while(exponent > 0) {
346 |     if (exponent & 1)
347 |       res = FIELD_mul(res, bases[i]);
348 |     exponent = exponent >> 1;
349 |     i++;
350 |   }
351 |   return res;
352 | }
353 | 
354 | DEVICE FIELD FIELD_mont(FIELD a) {
355 |   return FIELD_mul(a, FIELD_R2);
356 | }
357 | 
358 | DEVICE FIELD FIELD_unmont(FIELD a) {
359 |   FIELD one = FIELD_ZERO;
360 |   one.val[0] = 1;
361 |   return FIELD_mul(a, one);
362 | }
363 | 
364 | // Get `i`th bit (From most significant digit) of the field.
365 | DEVICE bool FIELD_get_bit(FIELD l, uint i) {
366 |   return (l.val[FIELD_LIMBS - 1 - i / FIELD_LIMB_BITS] >> (FIELD_LIMB_BITS - 1 - (i % FIELD_LIMB_BITS))) & 1;
367 | }
368 | 
369 | // Get `window` consecutive bits, (Starting from `skip`th bit) from the field.
370 | DEVICE uint FIELD_get_bits(FIELD l, uint skip, uint window) {
371 |   uint ret = 0;
372 |   for(uint i = 0; i < window; i++) {
373 |     ret <<= 1;
374 |     ret |= FIELD_get_bit(l, skip + i);
375 |   }
376 |   return ret;
377 | }
378 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/field2.cl:
--------------------------------------------------------------------------------
 1 | // Fp2 Extension Field where u^2 + 1 = 0
 2 | 
 3 | #define FIELD2_LIMB_BITS FIELD_LIMB_BITS
 4 | #define FIELD2_ZERO ((FIELD2){FIELD_ZERO, FIELD_ZERO})
 5 | #define FIELD2_ONE ((FIELD2){FIELD_ONE, FIELD_ZERO})
 6 | 
 7 | typedef struct {
 8 |   FIELD c0;
 9 |   FIELD c1;
10 | } FIELD2; // Represents: c0 + u * c1
11 | 
12 | DEVICE bool FIELD2_eq(FIELD2 a, FIELD2 b) {
13 |   return FIELD_eq(a.c0, b.c0) && FIELD_eq(a.c1, b.c1);
14 | }
15 | DEVICE FIELD2 FIELD2_sub(FIELD2 a, FIELD2 b) {
16 |   a.c0 = FIELD_sub(a.c0, b.c0);
17 |   a.c1 = FIELD_sub(a.c1, b.c1);
18 |   return a;
19 | }
20 | DEVICE FIELD2 FIELD2_add(FIELD2 a, FIELD2 b) {
21 |   a.c0 = FIELD_add(a.c0, b.c0);
22 |   a.c1 = FIELD_add(a.c1, b.c1);
23 |   return a;
24 | }
25 | DEVICE FIELD2 FIELD2_double(FIELD2 a) {
26 |   a.c0 = FIELD_double(a.c0);
27 |   a.c1 = FIELD_double(a.c1);
28 |   return a;
29 | }
30 | 
31 | /*
32 |  * (a_0 + u * a_1)(b_0 + u * b_1) = a_0 * b_0 - a_1 * b_1 + u * (a_0 * b_1 + a_1 * b_0)
33 |  * Therefore:
34 |  * c_0 = a_0 * b_0 - a_1 * b_1
35 |  * c_1 = (a_0 * b_1 + a_1 * b_0) = (a_0 + a_1) * (b_0 + b_1) - a_0 * b_0 - a_1 * b_1
36 |  */
37 | DEVICE FIELD2 FIELD2_mul(FIELD2 a, FIELD2 b) {
38 |   const FIELD aa = FIELD_mul(a.c0, b.c0);
39 |   const FIELD bb = FIELD_mul(a.c1, b.c1);
40 |   const FIELD o = FIELD_add(b.c0, b.c1);
41 |   a.c1 = FIELD_add(a.c1, a.c0);
42 |   a.c1 = FIELD_mul(a.c1, o);
43 |   a.c1 = FIELD_sub(a.c1, aa);
44 |   a.c1 = FIELD_sub(a.c1, bb);
45 |   a.c0 = FIELD_sub(aa, bb);
46 |   return a;
47 | }
48 | 
49 | /*
50 |  * (a_0 + u * a_1)(a_0 + u * a_1) = a_0 ^ 2 - a_1 ^ 2 + u * 2 * a_0 * a_1
51 |  * Therefore:
52 |  * c_0 = (a_0 * a_0 - a_1 * a_1) = (a_0 + a_1)(a_0 - a_1)
53 |  * c_1 = 2 * a_0 * a_1
54 |  */
55 | DEVICE FIELD2 FIELD2_sqr(FIELD2 a) {
56 |   const FIELD ab = FIELD_mul(a.c0, a.c1);
57 |   const FIELD c0c1 = FIELD_add(a.c0, a.c1);
58 |   a.c0 = FIELD_mul(FIELD_sub(a.c0, a.c1), c0c1);
59 |   a.c1 = FIELD_double(ab);
60 |   return a;
61 | }
62 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/multiexp.cl:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Same multiexp algorithm used in Bellman, with some modifications.
 3 |  * https://github.com/zkcrypto/bellman/blob/10c5010fd9c2ca69442dc9775ea271e286e776d8/src/multiexp.rs#L174
 4 |  * The CPU version of multiexp parallelism is done by dividing the exponent
 5 |  * values into smaller windows, and then applying a sequence of rounds to each
 6 |  * window. The GPU kernel not only assigns a thread to each window but also
 7 |  * divides the bases into several groups which highly increases the number of
 8 |  * threads running in parallel for calculating a multiexp instance.
 9 |  */
10 | 
11 | KERNEL void POINT_multiexp(
12 |     GLOBAL POINT_affine *bases,
13 |     GLOBAL POINT_jacobian *buckets,
14 |     GLOBAL POINT_jacobian *results,
15 |     GLOBAL EXPONENT *exps,
16 |     uint n,
17 |     uint num_groups,
18 |     uint num_windows,
19 |     uint window_size) {
20 | 
21 |   // We have `num_windows` * `num_groups` threads per multiexp.
22 |   const uint gid = GET_GLOBAL_ID();
23 |   if(gid >= num_windows * num_groups) return;
24 | 
25 |   // We have (2^window_size - 1) buckets.
26 |   const uint bucket_len = ((1 << window_size) - 1);
27 | 
28 |   // Each thread has its own set of buckets in global memory.
29 |   buckets += bucket_len * gid;
30 | 
31 |   const POINT_jacobian local_zero = POINT_ZERO;
32 |   for(uint i = 0; i < bucket_len; i++) buckets[i] = local_zero;
33 | 
34 |   // Num of elements in each group. Round the number up (ceil).
35 |   const uint len = (n + num_groups - 1) / num_groups;
36 | 
37 |   // This thread runs the multiexp algorithm on elements from `nstart` to `nened`
38 |   // on the window [`bits`, `bits` + `w`)
39 |   const uint nstart = len * (gid / num_windows);
40 |   const uint nend = min(nstart + len, n);
41 |   const uint bits = (gid % num_windows) * window_size;
42 |   const ushort w = min((ushort)window_size, (ushort)(EXPONENT_BITS - bits));
43 | 
44 |   POINT_jacobian res = POINT_ZERO;
45 |   for(uint i = nstart; i < nend; i++) {
46 |     uint ind = EXPONENT_get_bits(exps[i], bits, w);
47 | 
48 |     #if defined(OPENCL_NVIDIA) || defined(CUDA)
49 |       // O_o, weird optimization, having a single special case makes it
50 |       // tremendously faster!
51 |       // 511 is chosen because it's half of the maximum bucket len, but
52 |       // any other number works... Bigger indices seems to be better...
53 |       if(ind == 511) buckets[510] = POINT_add_mixed(buckets[510], bases[i]);
54 |       else if(ind--) buckets[ind] = POINT_add_mixed(buckets[ind], bases[i]);
55 |     #else
56 |       if(ind--) buckets[ind] = POINT_add_mixed(buckets[ind], bases[i]);
57 |     #endif
58 |   }
59 | 
60 |   // Summation by parts
61 |   // e.g. 3a + 2b + 1c = a +
62 |   //                    (a) + b +
63 |   //                    ((a) + b) + c
64 |   POINT_jacobian acc = POINT_ZERO;
65 |   for(int j = bucket_len - 1; j >= 0; j--) {
66 |     acc = POINT_add(acc, buckets[j]);
67 |     res = POINT_add(res, acc);
68 |   }
69 | 
70 |   results[gid] = res;
71 | }
72 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/cl/test.cl:
--------------------------------------------------------------------------------
 1 | KERNEL void test_add(FIELD a, FIELD b, GLOBAL FIELD *result) {
 2 |   *result = FIELD_add(a, b);
 3 | }
 4 | 
 5 | KERNEL void test_mul(FIELD a, FIELD b, GLOBAL FIELD *result) {
 6 |   *result = FIELD_mul(a, b);
 7 | }
 8 | 
 9 | KERNEL void test_sub(FIELD a, FIELD b, GLOBAL FIELD *result) {
10 |   *result = FIELD_sub(a, b);
11 | }
12 | 
13 | KERNEL void test_pow(FIELD a, uint b, GLOBAL FIELD *result) {
14 |   *result = FIELD_pow(a, b);
15 | }
16 | 
17 | KERNEL void test_mont(FIELD a, GLOBAL FIELD *result) {
18 |   *result = FIELD_mont(a);
19 | }
20 | 
21 | KERNEL void test_unmont(FIELD a, GLOBAL FIELD *result) {
22 |   *result = FIELD_unmont(a);
23 | }
24 | 
25 | KERNEL void test_sqr(FIELD a, GLOBAL FIELD *result) {
26 |   *result = FIELD_sqr(a);
27 | }
28 | 
29 | KERNEL void test_double(FIELD a, GLOBAL FIELD *result) {
30 |   *result = FIELD_double(a);
31 | }
32 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/error.rs:
--------------------------------------------------------------------------------
 1 | use std::io;
 2 | 
 3 | #[cfg(any(feature = "cuda", feature = "opencl"))]
 4 | use rust_gpu_tools::GPUError;
 5 | 
 6 | /// Errors of this library.
 7 | #[derive(thiserror::Error, Debug)]
 8 | pub enum EcError {
 9 |     /// A simple error that is described by a string.
10 |     #[error("EcError: {0}")]
11 |     Simple(&'static str),
12 | 
13 |     /// Error in case a GPU kernel execution was aborted.
14 |     #[cfg(any(feature = "cuda", feature = "opencl"))]
15 |     #[error("GPU call was aborted!")]
16 |     Aborted,
17 | 
18 |     /// An error that is bubbled up from the rust-gpu-tools library.
19 |     #[cfg(any(feature = "cuda", feature = "opencl"))]
20 |     #[error("GPU tools error: {0}")]
21 |     GpuTools(#[from] GPUError),
22 | 
23 |     /// IO error.
24 |     #[error("Encountered an I/O error: {0}")]
25 |     Io(#[from] io::Error),
26 | }
27 | 
28 | /// Result wrapper that is always using [`EcError`] as error.
29 | pub type EcResult<T> = std::result::Result<T, EcError>;
30 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/fft.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp;
  2 | use std::sync::{Arc, RwLock};
  3 | 
  4 | use ec_gpu::GpuName;
  5 | use ff::Field;
  6 | use log::{error, info};
  7 | use rust_gpu_tools::{program_closures, LocalBuffer, Program};
  8 | 
  9 | use crate::error::{EcError, EcResult};
 10 | use crate::threadpool::THREAD_POOL;
 11 | 
 12 | const LOG2_MAX_ELEMENTS: usize = 32; // At most 2^32 elements is supported.
 13 | const MAX_LOG2_RADIX: u32 = 8; // Radix256
 14 | const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 7; // 128
 15 | 
 16 | /// FFT kernel for a single GPU.
 17 | pub struct SingleFftKernel<'a, F>
 18 | where
 19 |     F: Field + GpuName,
 20 | {
 21 |     program: Program,
 22 |     /// An optional function which will be called at places where it is possible to abort the FFT
 23 |     /// calculations. If it returns true, the calculation will be aborted with an
 24 |     /// [`EcError::Aborted`].
 25 |     maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
 26 |     _phantom: std::marker::PhantomData<F>,
 27 | }
 28 | 
 29 | impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
 30 |     /// Create a new FFT instance for the given device.
 31 |     ///
 32 |     /// The `maybe_abort` function is called when it is possible to abort the computation, without
 33 |     /// leaving the GPU in a weird state. If that function returns `true`, execution is aborted.
 34 |     pub fn create(
 35 |         program: Program,
 36 |         maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
 37 |     ) -> EcResult<Self> {
 38 |         Ok(SingleFftKernel {
 39 |             program,
 40 |             maybe_abort,
 41 |             _phantom: Default::default(),
 42 |         })
 43 |     }
 44 | 
 45 |     /// Performs FFT on `input`
 46 |     /// * `omega` - Special value `omega` is used for FFT over finite-fields
 47 |     /// * `log_n` - Specifies log2 of number of elements
 48 |     pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> {
 49 |         let closures = program_closures!(|program, input: &mut [F]| -> EcResult<()> {
 50 |             let n = 1 << log_n;
 51 |             // All usages are safe as the buffers are initialized from either the host or the GPU
 52 |             // before they are read.
 53 |             let mut src_buffer = unsafe { program.create_buffer::<F>(n)? };
 54 |             let mut dst_buffer = unsafe { program.create_buffer::<F>(n)? };
 55 |             // The precalculated values pq` and `omegas` are valid for radix degrees up to `max_deg`
 56 |             let max_deg = cmp::min(MAX_LOG2_RADIX, log_n);
 57 | 
 58 |             // Precalculate:
 59 |             // [omega^(0/(2^(deg-1))), omega^(1/(2^(deg-1))), ..., omega^((2^(deg-1)-1)/(2^(deg-1)))]
 60 |             let mut pq = vec![F::ZERO; 1 << max_deg >> 1];
 61 |             let twiddle = omega.pow_vartime([(n >> max_deg) as u64]);
 62 |             pq[0] = F::ONE;
 63 |             if max_deg > 1 {
 64 |                 pq[1] = twiddle;
 65 |                 for i in 2..(1 << max_deg >> 1) {
 66 |                     pq[i] = pq[i - 1];
 67 |                     pq[i].mul_assign(&twiddle);
 68 |                 }
 69 |             }
 70 |             let pq_buffer = program.create_buffer_from_slice(&pq)?;
 71 | 
 72 |             // Precalculate [omega, omega^2, omega^4, omega^8, ..., omega^(2^31)]
 73 |             let mut omegas = vec![F::ZERO; 32];
 74 |             omegas[0] = *omega;
 75 |             for i in 1..LOG2_MAX_ELEMENTS {
 76 |                 omegas[i] = omegas[i - 1].pow_vartime([2u64]);
 77 |             }
 78 |             let omegas_buffer = program.create_buffer_from_slice(&omegas)?;
 79 | 
 80 |             program.write_from_buffer(&mut src_buffer, &*input)?;
 81 |             // Specifies log2 of `p`, (http://www.bealto.com/gpu-fft_group-1.html)
 82 |             let mut log_p = 0u32;
 83 |             // Each iteration performs a FFT round
 84 |             while log_p < log_n {
 85 |                 if let Some(maybe_abort) = &self.maybe_abort {
 86 |                     if maybe_abort() {
 87 |                         return Err(EcError::Aborted);
 88 |                     }
 89 |                 }
 90 | 
 91 |                 // 1=>radix2, 2=>radix4, 3=>radix8, ...
 92 |                 let deg = cmp::min(max_deg, log_n - log_p);
 93 | 
 94 |                 let n = 1u32 << log_n;
 95 |                 let local_work_size = 1 << cmp::min(deg - 1, MAX_LOG2_LOCAL_WORK_SIZE);
 96 |                 let global_work_size = n >> deg;
 97 |                 let kernel_name = format!("{}_radix_fft", F::name());
 98 |                 let kernel = program.create_kernel(
 99 |                     &kernel_name,
100 |                     global_work_size as usize,
101 |                     local_work_size as usize,
102 |                 )?;
103 |                 kernel
104 |                     .arg(&src_buffer)
105 |                     .arg(&dst_buffer)
106 |                     .arg(&pq_buffer)
107 |                     .arg(&omegas_buffer)
108 |                     .arg(&LocalBuffer::<F>::new(1 << deg))
109 |                     .arg(&n)
110 |                     .arg(&log_p)
111 |                     .arg(&deg)
112 |                     .arg(&max_deg)
113 |                     .run()?;
114 | 
115 |                 log_p += deg;
116 |                 std::mem::swap(&mut src_buffer, &mut dst_buffer);
117 |             }
118 | 
119 |             program.read_into_buffer(&src_buffer, input)?;
120 | 
121 |             Ok(())
122 |         });
123 | 
124 |         self.program.run(closures, input)
125 |     }
126 | }
127 | 
128 | /// One FFT kernel for each GPU available.
129 | pub struct FftKernel<'a, F>
130 | where
131 |     F: Field + GpuName,
132 | {
133 |     kernels: Vec<SingleFftKernel<'a, F>>,
134 | }
135 | 
136 | impl<'a, F> FftKernel<'a, F>
137 | where
138 |     F: Field + GpuName,
139 | {
140 |     /// Create new kernels, one for each given device.
141 |     pub fn create(programs: Vec<Program>) -> EcResult<Self> {
142 |         Self::create_optional_abort(programs, None)
143 |     }
144 | 
145 |     /// Create new kernels, one for each given device, with early abort hook.
146 |     ///
147 |     /// The `maybe_abort` function is called when it is possible to abort the computation, without
148 |     /// leaving the GPU in a weird state. If that function returns `true`, execution is aborted.
149 |     pub fn create_with_abort(
150 |         programs: Vec<Program>,
151 |         maybe_abort: &'a (dyn Fn() -> bool + Send + Sync),
152 |     ) -> EcResult<Self> {
153 |         Self::create_optional_abort(programs, Some(maybe_abort))
154 |     }
155 | 
156 |     fn create_optional_abort(
157 |         programs: Vec<Program>,
158 |         maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
159 |     ) -> EcResult<Self> {
160 |         let kernels: Vec<_> = programs
161 |             .into_iter()
162 |             .filter_map(|program| {
163 |                 let device_name = program.device_name().to_string();
164 |                 let kernel = SingleFftKernel::<F>::create(program, maybe_abort);
165 |                 if let Err(ref e) = kernel {
166 |                     error!(
167 |                         "Cannot initialize kernel for device '{}'! Error: {}",
168 |                         device_name, e
169 |                     );
170 |                 }
171 |                 kernel.ok()
172 |             })
173 |             .collect();
174 | 
175 |         if kernels.is_empty() {
176 |             return Err(EcError::Simple("No working GPUs found!"));
177 |         }
178 |         info!("FFT: {} working device(s) selected. ", kernels.len());
179 |         for (i, k) in kernels.iter().enumerate() {
180 |             info!("FFT: Device {}: {}", i, k.program.device_name(),);
181 |         }
182 | 
183 |         Ok(Self { kernels })
184 |     }
185 | 
186 |     /// Performs FFT on `input`
187 |     /// * `omega` - Special value `omega` is used for FFT over finite-fields
188 |     /// * `log_n` - Specifies log2 of number of elements
189 |     ///
190 |     /// Uses the first available GPU.
191 |     pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> {
192 |         self.kernels[0].radix_fft(input, omega, log_n)
193 |     }
194 | 
195 |     /// Performs FFT on `inputs`
196 |     /// * `omega` - Special value `omega` is used for FFT over finite-fields
197 |     /// * `log_n` - Specifies log2 of number of elements
198 |     ///
199 |     /// Uses all available GPUs to distribute the work.
200 |     pub fn radix_fft_many(
201 |         &mut self,
202 |         inputs: &mut [&mut [F]],
203 |         omegas: &[F],
204 |         log_ns: &[u32],
205 |     ) -> EcResult<()> {
206 |         let n = inputs.len();
207 |         let num_devices = self.kernels.len();
208 |         let chunk_size = ((n as f64) / (num_devices as f64)).ceil() as usize;
209 | 
210 |         let result = Arc::new(RwLock::new(Ok(())));
211 | 
212 |         THREAD_POOL.scoped(|s| {
213 |             for (((inputs, omegas), log_ns), kern) in inputs
214 |                 .chunks_mut(chunk_size)
215 |                 .zip(omegas.chunks(chunk_size))
216 |                 .zip(log_ns.chunks(chunk_size))
217 |                 .zip(self.kernels.iter_mut())
218 |             {
219 |                 let result = result.clone();
220 |                 s.execute(move || {
221 |                     for ((input, omega), log_n) in
222 |                         inputs.iter_mut().zip(omegas.iter()).zip(log_ns.iter())
223 |                     {
224 |                         if result.read().unwrap().is_err() {
225 |                             break;
226 |                         }
227 | 
228 |                         if let Err(err) = kern.radix_fft(input, omega, *log_n) {
229 |                             *result.write().unwrap() = Err(err);
230 |                             break;
231 |                         }
232 |                     }
233 |                 });
234 |             }
235 |         });
236 | 
237 |         Arc::try_unwrap(result).unwrap().into_inner().unwrap()
238 |     }
239 | }
240 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/fft_cpu.rs:
--------------------------------------------------------------------------------
  1 | use ff::PrimeField;
  2 | 
  3 | use crate::threadpool::Worker;
  4 | 
  5 | /// Calculate the Fast Fourier Transform on the CPU (single-threaded).
  6 | ///
  7 | /// The input `a` is mutated and contains the result when this function returns. The length of the
  8 | /// input vector must be `2^log_n`.
  9 | #[allow(clippy::many_single_char_names)]
 10 | pub fn serial_fft<F: PrimeField>(a: &mut [F], omega: &F, log_n: u32) {
 11 |     fn bitreverse(mut n: u32, l: u32) -> u32 {
 12 |         let mut r = 0;
 13 |         for _ in 0..l {
 14 |             r = (r << 1) | (n & 1);
 15 |             n >>= 1;
 16 |         }
 17 |         r
 18 |     }
 19 | 
 20 |     let n = a.len() as u32;
 21 |     assert_eq!(n, 1 << log_n);
 22 | 
 23 |     for k in 0..n {
 24 |         let rk = bitreverse(k, log_n);
 25 |         if k < rk {
 26 |             a.swap(rk as usize, k as usize);
 27 |         }
 28 |     }
 29 | 
 30 |     let mut m = 1;
 31 |     for _ in 0..log_n {
 32 |         let w_m = omega.pow_vartime([u64::from(n / (2 * m))]);
 33 | 
 34 |         let mut k = 0;
 35 |         while k < n {
 36 |             let mut w = F::ONE;
 37 |             for j in 0..m {
 38 |                 let mut t = a[(k + j + m) as usize];
 39 |                 t *= w;
 40 |                 let mut tmp = a[(k + j) as usize];
 41 |                 tmp -= t;
 42 |                 a[(k + j + m) as usize] = tmp;
 43 |                 a[(k + j) as usize] += t;
 44 |                 w *= w_m;
 45 |             }
 46 | 
 47 |             k += 2 * m;
 48 |         }
 49 | 
 50 |         m *= 2;
 51 |     }
 52 | }
 53 | 
 54 | /// Calculate the Fast Fourier Transform on the CPU (multithreaded).
 55 | ///
 56 | /// The result is is written to the input `a`.
 57 | /// The number of threads used will be `2^log_threads`.
 58 | /// There must be more items to process than threads.
 59 | pub fn parallel_fft<F: PrimeField>(
 60 |     a: &mut [F],
 61 |     worker: &Worker,
 62 |     omega: &F,
 63 |     log_n: u32,
 64 |     log_threads: u32,
 65 | ) {
 66 |     assert!(log_n >= log_threads);
 67 | 
 68 |     let num_threads = 1 << log_threads;
 69 |     let log_new_n = log_n - log_threads;
 70 |     let mut tmp = vec![vec![F::ZERO; 1 << log_new_n]; num_threads];
 71 |     let new_omega = omega.pow_vartime([num_threads as u64]);
 72 | 
 73 |     worker.scope(0, |scope, _| {
 74 |         let a = &*a;
 75 | 
 76 |         for (j, tmp) in tmp.iter_mut().enumerate() {
 77 |             scope.execute(move || {
 78 |                 // Shuffle into a sub-FFT
 79 |                 let omega_j = omega.pow_vartime([j as u64]);
 80 |                 let omega_step = omega.pow_vartime([(j as u64) << log_new_n]);
 81 | 
 82 |                 let mut elt = F::ONE;
 83 |                 for (i, tmp) in tmp.iter_mut().enumerate() {
 84 |                     for s in 0..num_threads {
 85 |                         let idx = (i + (s << log_new_n)) % (1 << log_n);
 86 |                         let mut t = a[idx];
 87 |                         t *= elt;
 88 |                         *tmp += t;
 89 |                         elt *= omega_step;
 90 |                     }
 91 |                     elt *= omega_j;
 92 |                 }
 93 | 
 94 |                 // Perform sub-FFT
 95 |                 serial_fft::<F>(tmp, &new_omega, log_new_n);
 96 |             });
 97 |         }
 98 |     });
 99 | 
100 |     // TODO: does this hurt or help?
101 |     worker.scope(a.len(), |scope, chunk| {
102 |         let tmp = &tmp;
103 | 
104 |         for (idx, a) in a.chunks_mut(chunk).enumerate() {
105 |             scope.execute(move || {
106 |                 let mut idx = idx * chunk;
107 |                 let mask = (1 << log_threads) - 1;
108 |                 for a in a {
109 |                     *a = tmp[idx & mask][idx >> log_threads];
110 |                     idx += 1;
111 |                 }
112 |             });
113 |         }
114 |     });
115 | }
116 | 
117 | #[cfg(test)]
118 | mod tests {
119 |     use super::*;
120 | 
121 |     use std::cmp::min;
122 | 
123 |     use blstrs::Scalar as Fr;
124 |     use ff::PrimeField;
125 |     use rand_core::RngCore;
126 | 
127 |     fn omega<F: PrimeField>(num_coeffs: usize) -> F {
128 |         // Compute omega, the 2^exp primitive root of unity
129 |         let exp = (num_coeffs as f32).log2().floor() as u32;
130 |         let mut omega = F::ROOT_OF_UNITY;
131 |         for _ in exp..F::S {
132 |             omega = omega.square();
133 |         }
134 |         omega
135 |     }
136 | 
137 |     #[test]
138 |     fn parallel_fft_consistency() {
139 |         fn test_consistency<F: PrimeField, R: RngCore>(rng: &mut R) {
140 |             let worker = Worker::new();
141 | 
142 |             for _ in 0..5 {
143 |                 for log_d in 0..10 {
144 |                     let d = 1 << log_d;
145 | 
146 |                     let mut v1_coeffs = (0..d).map(|_| F::random(&mut *rng)).collect::<Vec<_>>();
147 |                     let mut v2_coeffs = v1_coeffs.clone();
148 |                     let v1_omega = omega::<F>(v1_coeffs.len());
149 |                     let v2_omega = v1_omega;
150 | 
151 |                     for log_threads in log_d..min(log_d + 1, 3) {
152 |                         parallel_fft::<F>(&mut v1_coeffs, &worker, &v1_omega, log_d, log_threads);
153 |                         serial_fft::<F>(&mut v2_coeffs, &v2_omega, log_d);
154 | 
155 |                         assert!(v1_coeffs == v2_coeffs);
156 |                     }
157 |                 }
158 |             }
159 |         }
160 | 
161 |         let rng = &mut rand::thread_rng();
162 | 
163 |         test_consistency::<Fr, _>(rng);
164 |     }
165 | }
166 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![warn(missing_docs)]
 2 | //! CUDA/OpenCL code generator for finite-field arithmetic over prime fields and elliptic curve
 3 | //! arithmetic constructed with Rust.
 4 | //!
 5 | //! There is also support for Fast Fourier Transform and Multiexponentiation.
 6 | //!
 7 | //! This crate usually creates GPU kernels at compile-time. CUDA generates a [fatbin], which OpenCL only generates the source code, which is then compiled at run-time.
 8 | //!
 9 | //! In order to make things easier to use, there are helper functions available. You would put some code into `build.rs`, that generates the kernels, and some code into your library which then consumes those generated kernels. The kernels will be directly embedded into your program/library. If something goes wrong, you will get an error at compile-time.
10 | //!
11 | //! In this example we will make use of the FFT functionality. Add to your `build.rs`:
12 | //!
13 | //! ```no_run
14 | //! use blstrs::Scalar;
15 | //! use ec_gpu_gen::SourceBuilder;
16 | //!
17 | //! let source_builder = SourceBuilder::new().add_fft::<Scalar>();
18 | //! ec_gpu_gen::generate(&source_builder);
19 | //! ```
20 | //!
21 | //! The `ec_gpu_gen::generate()` takes care of the actual code generation/compilation. It will automatically create a CUDA and/or OpenCL kernel. It will define two environment variables, which are meant for internal use. `_EC_GPU_CUDA_KERNEL_FATBIN` that points to the compiled CUDA kernel, and `_EC_GPU_OPENCL_KERNEL_SOURCE` that points to the generated OpenCL source.
22 | //!
23 | //! Those variables are then picked up by the `ec_gpu_gen::program!()` macro, which generates a program, for a given GPU device. Using FFT within your library would then look like this:
24 | //!
25 | //! ```no_compile
26 | //! use blstrs::Scalar;
27 | //! use ec_gpu_gen::{
28 | //!     rust_gpu_tools::Device,
29 | //! };
30 | //!
31 | //! let devices = Device::all();
32 | //! let programs = devices
33 | //!     .iter()
34 | //!     .map(|device| ec_gpu_gen::program!(device))
35 | //!     .collect::<Result<_, _>>()
36 | //!     .expect("Cannot create programs!");
37 | //!
38 | //! let mut kern = FftKernel::<Scalar>::create(programs).expect("Cannot initialize kernel!");
39 | //! kern.radix_fft_many(&mut [&mut coeffs], &[omega], &[log_d]).expect("GPU FFT failed!");
40 | //! ```
41 | //!
42 | //! Feature flags
43 | //! -------------
44 | //!
45 | //! CUDA and OpenCL are supported, each be enabled with the `cuda` and `opencl` [feature flags].
46 | //!
47 | //! [fatbin]: https://en.wikipedia.org/wiki/Fat_binary#Heterogeneous_computing
48 | //! [feature flags]: https://doc.rust-lang.org/cargo/reference/manifest.html#the-features-section
49 | mod error;
50 | #[cfg(any(feature = "cuda", feature = "opencl"))]
51 | mod program;
52 | mod source;
53 | 
54 | /// Fast Fourier Transform on the GPU.
55 | #[cfg(any(feature = "cuda", feature = "opencl"))]
56 | pub mod fft;
57 | /// Fast Fourier Transform on the CPU.
58 | pub mod fft_cpu;
59 | /// Multiexponentiation on the GPU.
60 | #[cfg(any(feature = "cuda", feature = "opencl"))]
61 | pub mod multiexp;
62 | /// Multiexponentiation on the CPU.
63 | pub mod multiexp_cpu;
64 | /// Helpers for multithreaded code.
65 | pub mod threadpool;
66 | 
67 | /// Re-export rust-gpu-tools as things like [`rust_gpu_tools::Device`] might be needed.
68 | #[cfg(any(feature = "cuda", feature = "opencl"))]
69 | pub use rust_gpu_tools;
70 | 
71 | pub use error::{EcError, EcResult};
72 | pub use source::{generate, SourceBuilder};
73 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/multiexp.rs:
--------------------------------------------------------------------------------
  1 | use std::ops::AddAssign;
  2 | use std::sync::{Arc, RwLock};
  3 | 
  4 | use ec_gpu::GpuName;
  5 | use ff::PrimeField;
  6 | use group::{prime::PrimeCurveAffine, Group};
  7 | use log::{error, info};
  8 | use rust_gpu_tools::{program_closures, Device, Program};
  9 | use yastl::Scope;
 10 | 
 11 | use crate::{
 12 |     error::{EcError, EcResult},
 13 |     threadpool::Worker,
 14 | };
 15 | 
 16 | /// On the GPU, the exponents are split into windows, this is the maximum number of such windows.
 17 | const MAX_WINDOW_SIZE: usize = 10;
 18 | /// In CUDA this is the number of blocks per grid (grid size).
 19 | const LOCAL_WORK_SIZE: usize = 128;
 20 | /// Let 20% of GPU memory be free, this is an arbitrary value.
 21 | const MEMORY_PADDING: f64 = 0.2f64;
 22 | /// The Nvidia Ampere architecture is compute capability major version 8.
 23 | const AMPERE: u32 = 8;
 24 | 
 25 | /// Divide and ceil to the next value.
 26 | const fn div_ceil(a: usize, b: usize) -> usize {
 27 |     if a % b == 0 {
 28 |         a / b
 29 |     } else {
 30 |         (a / b) + 1
 31 |     }
 32 | }
 33 | 
 34 | /// The number of units the work is split into. One unit will result in one CUDA thread.
 35 | ///
 36 | /// Based on empirical results, it turns out that on Nvidia devices with the Ampere architecture,
 37 | /// it's faster to use two times the number of work units.
 38 | const fn work_units(compute_units: u32, compute_capabilities: Option<(u32, u32)>) -> usize {
 39 |     match compute_capabilities {
 40 |         Some((AMPERE, _)) => LOCAL_WORK_SIZE * compute_units as usize * 2,
 41 |         _ => LOCAL_WORK_SIZE * compute_units as usize,
 42 |     }
 43 | }
 44 | 
 45 | /// Multiexp kernel for a single GPU.
 46 | pub struct SingleMultiexpKernel<'a, G>
 47 | where
 48 |     G: PrimeCurveAffine,
 49 | {
 50 |     program: Program,
 51 |     /// The number of exponentiations the GPU can handle in a single execution of the kernel.
 52 |     n: usize,
 53 |     /// The number of units the work is split into. It will results in this amount of threads on
 54 |     /// the GPU.
 55 |     work_units: usize,
 56 |     /// An optional function which will be called at places where it is possible to abort the
 57 |     /// multiexp calculations. If it returns true, the calculation will be aborted with an
 58 |     /// [`EcError::Aborted`].
 59 |     maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
 60 | 
 61 |     _phantom: std::marker::PhantomData<G::Scalar>,
 62 | }
 63 | 
 64 | /// Calculates the maximum number of terms that can be put onto the GPU memory.
 65 | fn calc_chunk_size<G>(mem: u64, work_units: usize) -> usize
 66 | where
 67 |     G: PrimeCurveAffine,
 68 |     G::Scalar: PrimeField,
 69 | {
 70 |     let aff_size = std::mem::size_of::<G>();
 71 |     let exp_size = exp_size::<G::Scalar>();
 72 |     let proj_size = std::mem::size_of::<G::Curve>();
 73 | 
 74 |     // Leave `MEMORY_PADDING` percent of the memory free.
 75 |     let max_memory = ((mem as f64) * (1f64 - MEMORY_PADDING)) as usize;
 76 |     // The amount of memory (in bytes) of a single term.
 77 |     let term_size = aff_size + exp_size;
 78 |     // The number of buckets needed for one work unit
 79 |     let max_buckets_per_work_unit = 1 << MAX_WINDOW_SIZE;
 80 |     // The amount of memory (in bytes) we need for the intermediate steps (buckets).
 81 |     let buckets_size = work_units * max_buckets_per_work_unit * proj_size;
 82 |     // The amount of memory (in bytes) we need for the results.
 83 |     let results_size = work_units * proj_size;
 84 | 
 85 |     (max_memory - buckets_size - results_size) / term_size
 86 | }
 87 | 
 88 | /// The size of the exponent in bytes.
 89 | ///
 90 | /// It's the actual bytes size it needs in memory, not it's theoretical bit size.
 91 | fn exp_size<F: PrimeField>() -> usize {
 92 |     std::mem::size_of::<F::Repr>()
 93 | }
 94 | 
 95 | impl<'a, G> SingleMultiexpKernel<'a, G>
 96 | where
 97 |     G: PrimeCurveAffine + GpuName,
 98 | {
 99 |     /// Create a new Multiexp kernel instance for a device.
100 |     ///
101 |     /// The `maybe_abort` function is called when it is possible to abort the computation, without
102 |     /// leaving the GPU in a weird state. If that function returns `true`, execution is aborted.
103 |     pub fn create(
104 |         program: Program,
105 |         device: &Device,
106 |         maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
107 |     ) -> EcResult<Self> {
108 |         let mem = device.memory();
109 |         let compute_units = device.compute_units();
110 |         let compute_capability = device.compute_capability();
111 |         let work_units = work_units(compute_units, compute_capability);
112 |         let chunk_size = calc_chunk_size::<G>(mem, work_units);
113 | 
114 |         Ok(SingleMultiexpKernel {
115 |             program,
116 |             n: chunk_size,
117 |             work_units,
118 |             maybe_abort,
119 |             _phantom: std::marker::PhantomData,
120 |         })
121 |     }
122 | 
123 |     /// Run the actual multiexp computation on the GPU.
124 |     ///
125 |     /// The number of `bases` and `exponents` are determined by [`SingleMultiexpKernel`]`::n`, this
126 |     /// means that it is guaranteed that this amount of calculations fit on the GPU this kernel is
127 |     /// running on.
128 |     pub fn multiexp(
129 |         &self,
130 |         bases: &[G],
131 |         exponents: &[<G::Scalar as PrimeField>::Repr],
132 |     ) -> EcResult<G::Curve> {
133 |         assert_eq!(bases.len(), exponents.len());
134 | 
135 |         if let Some(maybe_abort) = &self.maybe_abort {
136 |             if maybe_abort() {
137 |                 return Err(EcError::Aborted);
138 |             }
139 |         }
140 |         let window_size = self.calc_window_size(bases.len());
141 |         // windows_size * num_windows needs to be >= 256 in order for the kernel to work correctly.
142 |         let num_windows = div_ceil(256, window_size);
143 |         let num_groups = self.work_units / num_windows;
144 |         let bucket_len = 1 << window_size;
145 | 
146 |         // Each group will have `num_windows` threads and as there are `num_groups` groups, there will
147 |         // be `num_groups` * `num_windows` threads in total.
148 |         // Each thread will use `num_groups` * `num_windows` * `bucket_len` buckets.
149 | 
150 |         let closures = program_closures!(|program, _arg| -> EcResult<Vec<G::Curve>> {
151 |             let base_buffer = program.create_buffer_from_slice(bases)?;
152 |             let exp_buffer = program.create_buffer_from_slice(exponents)?;
153 | 
154 |             // It is safe as the GPU will initialize that buffer
155 |             let bucket_buffer =
156 |                 unsafe { program.create_buffer::<G::Curve>(self.work_units * bucket_len)? };
157 |             // It is safe as the GPU will initialize that buffer
158 |             let result_buffer = unsafe { program.create_buffer::<G::Curve>(self.work_units)? };
159 | 
160 |             // The global work size follows CUDA's definition and is the number of
161 |             // `LOCAL_WORK_SIZE` sized thread groups.
162 |             let global_work_size = div_ceil(num_windows * num_groups, LOCAL_WORK_SIZE);
163 | 
164 |             let kernel_name = format!("{}_multiexp", G::name());
165 |             let kernel = program.create_kernel(&kernel_name, global_work_size, LOCAL_WORK_SIZE)?;
166 | 
167 |             kernel
168 |                 .arg(&base_buffer)
169 |                 .arg(&bucket_buffer)
170 |                 .arg(&result_buffer)
171 |                 .arg(&exp_buffer)
172 |                 .arg(&(bases.len() as u32))
173 |                 .arg(&(num_groups as u32))
174 |                 .arg(&(num_windows as u32))
175 |                 .arg(&(window_size as u32))
176 |                 .run()?;
177 | 
178 |             let mut results = vec![G::Curve::identity(); self.work_units];
179 |             program.read_into_buffer(&result_buffer, &mut results)?;
180 | 
181 |             Ok(results)
182 |         });
183 | 
184 |         let results = self.program.run(closures, ())?;
185 | 
186 |         // Using the algorithm below, we can calculate the final result by accumulating the results
187 |         // of those `NUM_GROUPS` * `NUM_WINDOWS` threads.
188 |         let mut acc = G::Curve::identity();
189 |         let mut bits = 0;
190 |         let exp_bits = exp_size::<G::Scalar>() * 8;
191 |         for i in 0..num_windows {
192 |             let w = std::cmp::min(window_size, exp_bits - bits);
193 |             for _ in 0..w {
194 |                 acc = acc.double();
195 |             }
196 |             for g in 0..num_groups {
197 |                 acc.add_assign(&results[g * num_windows + i]);
198 |             }
199 |             bits += w; // Process the next window
200 |         }
201 | 
202 |         Ok(acc)
203 |     }
204 | 
205 |     /// Calculates the window size, based on the given number of terms.
206 |     ///
207 |     /// For best performance, the window size is reduced, so that maximum parallelism is possible.
208 |     /// If you e.g. have put only a subset of the terms into the GPU memory, then a smaller window
209 |     /// size leads to more windows, hence more units to work on, as we split the work into
210 |     /// `num_windows * num_groups`.
211 |     fn calc_window_size(&self, num_terms: usize) -> usize {
212 |         // The window size was determined by running the `gpu_multiexp_consistency` test and
213 |         // looking at the resulting numbers.
214 |         let window_size = ((div_ceil(num_terms, self.work_units) as f64).log2() as usize) + 2;
215 |         std::cmp::min(window_size, MAX_WINDOW_SIZE)
216 |     }
217 | }
218 | 
219 | /// A struct that contains several multiexp kernels for different devices.
220 | pub struct MultiexpKernel<'a, G>
221 | where
222 |     G: PrimeCurveAffine,
223 | {
224 |     kernels: Vec<SingleMultiexpKernel<'a, G>>,
225 | }
226 | 
227 | impl<'a, G> MultiexpKernel<'a, G>
228 | where
229 |     G: PrimeCurveAffine + GpuName,
230 | {
231 |     /// Create new kernels, one for each given device.
232 |     pub fn create(programs: Vec<Program>, devices: &[&Device]) -> EcResult<Self> {
233 |         Self::create_optional_abort(programs, devices, None)
234 |     }
235 | 
236 |     /// Create new kernels, one for each given device, with early abort hook.
237 |     ///
238 |     /// The `maybe_abort` function is called when it is possible to abort the computation, without
239 |     /// leaving the GPU in a weird state. If that function returns `true`, execution is aborted.
240 |     pub fn create_with_abort(
241 |         programs: Vec<Program>,
242 |         devices: &[&Device],
243 |         maybe_abort: &'a (dyn Fn() -> bool + Send + Sync),
244 |     ) -> EcResult<Self> {
245 |         Self::create_optional_abort(programs, devices, Some(maybe_abort))
246 |     }
247 | 
248 |     fn create_optional_abort(
249 |         programs: Vec<Program>,
250 |         devices: &[&Device],
251 |         maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
252 |     ) -> EcResult<Self> {
253 |         let kernels: Vec<_> = programs
254 |             .into_iter()
255 |             .zip(devices.iter())
256 |             .filter_map(|(program, device)| {
257 |                 let device_name = program.device_name().to_string();
258 |                 let kernel = SingleMultiexpKernel::create(program, device, maybe_abort);
259 |                 if let Err(ref e) = kernel {
260 |                     error!(
261 |                         "Cannot initialize kernel for device '{}'! Error: {}",
262 |                         device_name, e
263 |                     );
264 |                 }
265 |                 kernel.ok()
266 |             })
267 |             .collect();
268 | 
269 |         if kernels.is_empty() {
270 |             return Err(EcError::Simple("No working GPUs found!"));
271 |         }
272 |         info!("Multiexp: {} working device(s) selected.", kernels.len());
273 |         for (i, k) in kernels.iter().enumerate() {
274 |             info!(
275 |                 "Multiexp: Device {}: {} (Chunk-size: {})",
276 |                 i,
277 |                 k.program.device_name(),
278 |                 k.n
279 |             );
280 |         }
281 |         Ok(MultiexpKernel { kernels })
282 |     }
283 | 
284 |     /// Calculate multiexp on all available GPUs.
285 |     ///
286 |     /// It needs to run within a [`yastl::Scope`]. This method usually isn't called directly, use
287 |     /// [`MultiexpKernel::multiexp`] instead.
288 |     pub fn parallel_multiexp<'s>(
289 |         &'s mut self,
290 |         scope: &Scope<'s>,
291 |         bases: &'s [G],
292 |         exps: &'s [<G::Scalar as PrimeField>::Repr],
293 |         results: &'s mut [G::Curve],
294 |         error: Arc<RwLock<EcResult<()>>>,
295 |     ) {
296 |         let num_devices = self.kernels.len();
297 |         let num_exps = exps.len();
298 |         // The maximum number of exponentiations per device.
299 |         let chunk_size = ((num_exps as f64) / (num_devices as f64)).ceil() as usize;
300 | 
301 |         for (((bases, exps), kern), result) in bases
302 |             .chunks(chunk_size)
303 |             .zip(exps.chunks(chunk_size))
304 |             // NOTE vmx 2021-11-17: This doesn't need to be a mutable iterator. But when it isn't
305 |             // there will be errors that the OpenCL CommandQueue cannot be shared between threads
306 |             // safely.
307 |             .zip(self.kernels.iter_mut())
308 |             .zip(results.iter_mut())
309 |         {
310 |             let error = error.clone();
311 |             scope.execute(move || {
312 |                 let mut acc = G::Curve::identity();
313 |                 for (bases, exps) in bases.chunks(kern.n).zip(exps.chunks(kern.n)) {
314 |                     if error.read().unwrap().is_err() {
315 |                         break;
316 |                     }
317 |                     match kern.multiexp(bases, exps) {
318 |                         Ok(result) => acc.add_assign(&result),
319 |                         Err(e) => {
320 |                             *error.write().unwrap() = Err(e);
321 |                             break;
322 |                         }
323 |                     }
324 |                 }
325 |                 if error.read().unwrap().is_ok() {
326 |                     *result = acc;
327 |                 }
328 |             });
329 |         }
330 |     }
331 | 
332 |     /// Calculate multiexp.
333 |     ///
334 |     /// This is the main entry point.
335 |     pub fn multiexp(
336 |         &mut self,
337 |         pool: &Worker,
338 |         bases_arc: Arc<Vec<G>>,
339 |         exps: Arc<Vec<<G::Scalar as PrimeField>::Repr>>,
340 |         skip: usize,
341 |     ) -> EcResult<G::Curve> {
342 |         // Bases are skipped by `self.1` elements, when converted from (Arc<Vec<G>>, usize) to Source
343 |         // https://github.com/zkcrypto/bellman/blob/10c5010fd9c2ca69442dc9775ea271e286e776d8/src/multiexp.rs#L38
344 |         let bases = &bases_arc[skip..(skip + exps.len())];
345 |         let exps = &exps[..];
346 | 
347 |         let mut results = Vec::new();
348 |         let error = Arc::new(RwLock::new(Ok(())));
349 | 
350 |         pool.scoped(|s| {
351 |             results = vec![G::Curve::identity(); self.kernels.len()];
352 |             self.parallel_multiexp(s, bases, exps, &mut results, error.clone());
353 |         });
354 | 
355 |         Arc::try_unwrap(error)
356 |             .expect("only one ref left")
357 |             .into_inner()
358 |             .unwrap()?;
359 | 
360 |         let mut acc = G::Curve::identity();
361 |         for r in results {
362 |             acc.add_assign(&r);
363 |         }
364 | 
365 |         Ok(acc)
366 |     }
367 | 
368 |     /// Returns the number of kernels (one per device).
369 |     pub fn num_kernels(&self) -> usize {
370 |         self.kernels.len()
371 |     }
372 | }
373 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/multiexp_cpu.rs:
--------------------------------------------------------------------------------
  1 | #![allow(missing_docs)]
  2 | use std::convert::TryInto;
  3 | use std::io;
  4 | use std::iter;
  5 | use std::ops::AddAssign;
  6 | use std::sync::Arc;
  7 | 
  8 | use bitvec::prelude::{BitVec, Lsb0};
  9 | use ff::{Field, PrimeField};
 10 | use group::{prime::PrimeCurveAffine, Group};
 11 | use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
 12 | 
 13 | use crate::error::EcError;
 14 | use crate::threadpool::{Waiter, Worker};
 15 | 
 16 | /// An object that builds a source of bases.
 17 | pub trait SourceBuilder<G: PrimeCurveAffine>: Send + Sync + 'static + Clone {
 18 |     type Source: Source<G>;
 19 | 
 20 |     #[allow(clippy::wrong_self_convention)]
 21 |     fn new(self) -> Self::Source;
 22 |     fn get(self) -> (Arc<Vec<G>>, usize);
 23 | }
 24 | 
 25 | /// A source of bases, like an iterator.
 26 | pub trait Source<G: PrimeCurveAffine> {
 27 |     /// Parses the element from the source. Fails if the point is at infinity.
 28 |     fn add_assign_mixed(&mut self, to: &mut <G as PrimeCurveAffine>::Curve) -> Result<(), EcError>;
 29 | 
 30 |     /// Skips `amt` elements from the source, avoiding deserialization.
 31 |     fn skip(&mut self, amt: usize) -> Result<(), EcError>;
 32 | }
 33 | 
 34 | impl<G: PrimeCurveAffine> SourceBuilder<G> for (Arc<Vec<G>>, usize) {
 35 |     type Source = (Arc<Vec<G>>, usize);
 36 | 
 37 |     fn new(self) -> (Arc<Vec<G>>, usize) {
 38 |         (self.0.clone(), self.1)
 39 |     }
 40 | 
 41 |     fn get(self) -> (Arc<Vec<G>>, usize) {
 42 |         (self.0.clone(), self.1)
 43 |     }
 44 | }
 45 | 
 46 | impl<G: PrimeCurveAffine> Source<G> for (Arc<Vec<G>>, usize) {
 47 |     fn add_assign_mixed(&mut self, to: &mut <G as PrimeCurveAffine>::Curve) -> Result<(), EcError> {
 48 |         if self.0.len() <= self.1 {
 49 |             return Err(io::Error::new(
 50 |                 io::ErrorKind::UnexpectedEof,
 51 |                 "Expected more bases from source.",
 52 |             )
 53 |             .into());
 54 |         }
 55 | 
 56 |         if self.0[self.1].is_identity().into() {
 57 |             return Err(EcError::Simple(
 58 |                 "Encountered an identity element in the CRS.",
 59 |             ));
 60 |         }
 61 | 
 62 |         to.add_assign(&self.0[self.1]);
 63 | 
 64 |         self.1 += 1;
 65 | 
 66 |         Ok(())
 67 |     }
 68 | 
 69 |     fn skip(&mut self, amt: usize) -> Result<(), EcError> {
 70 |         if self.0.len() <= self.1 {
 71 |             return Err(io::Error::new(
 72 |                 io::ErrorKind::UnexpectedEof,
 73 |                 "Expected more bases from source.",
 74 |             )
 75 |             .into());
 76 |         }
 77 | 
 78 |         self.1 += amt;
 79 | 
 80 |         Ok(())
 81 |     }
 82 | }
 83 | 
 84 | pub trait QueryDensity: Sized {
 85 |     /// Returns whether the base exists.
 86 |     type Iter: Iterator<Item = bool>;
 87 | 
 88 |     fn iter(self) -> Self::Iter;
 89 |     fn get_query_size(self) -> Option<usize>;
 90 |     fn generate_exps<F: PrimeField>(self, exponents: Arc<Vec<F::Repr>>) -> Arc<Vec<F::Repr>>;
 91 | }
 92 | 
 93 | #[derive(Clone)]
 94 | pub struct FullDensity;
 95 | 
 96 | impl AsRef<FullDensity> for FullDensity {
 97 |     fn as_ref(&self) -> &FullDensity {
 98 |         self
 99 |     }
100 | }
101 | 
102 | impl QueryDensity for &FullDensity {
103 |     type Iter = iter::Repeat<bool>;
104 | 
105 |     fn iter(self) -> Self::Iter {
106 |         iter::repeat(true)
107 |     }
108 | 
109 |     fn get_query_size(self) -> Option<usize> {
110 |         None
111 |     }
112 | 
113 |     fn generate_exps<F: PrimeField>(self, exponents: Arc<Vec<F::Repr>>) -> Arc<Vec<F::Repr>> {
114 |         exponents
115 |     }
116 | }
117 | 
118 | #[derive(Clone, PartialEq, Eq, Debug, Default)]
119 | pub struct DensityTracker {
120 |     pub bv: BitVec,
121 |     pub total_density: usize,
122 | }
123 | 
124 | impl<'a> QueryDensity for &'a DensityTracker {
125 |     type Iter = bitvec::slice::BitValIter<'a, usize, Lsb0>;
126 | 
127 |     fn iter(self) -> Self::Iter {
128 |         self.bv.iter().by_vals()
129 |     }
130 | 
131 |     fn get_query_size(self) -> Option<usize> {
132 |         Some(self.bv.len())
133 |     }
134 | 
135 |     fn generate_exps<F: PrimeField>(self, exponents: Arc<Vec<F::Repr>>) -> Arc<Vec<F::Repr>> {
136 |         let exps: Vec<_> = exponents
137 |             .iter()
138 |             .zip(self.bv.iter())
139 |             .filter_map(|(&e, d)| if *d { Some(e) } else { None })
140 |             .collect();
141 | 
142 |         Arc::new(exps)
143 |     }
144 | }
145 | 
146 | impl DensityTracker {
147 |     pub fn new() -> DensityTracker {
148 |         DensityTracker {
149 |             bv: BitVec::new(),
150 |             total_density: 0,
151 |         }
152 |     }
153 | 
154 |     pub fn add_element(&mut self) {
155 |         self.bv.push(false);
156 |     }
157 | 
158 |     pub fn inc(&mut self, idx: usize) {
159 |         if !self.bv.get(idx).unwrap() {
160 |             self.bv.set(idx, true);
161 |             self.total_density += 1;
162 |         }
163 |     }
164 | 
165 |     pub fn get_total_density(&self) -> usize {
166 |         self.total_density
167 |     }
168 | 
169 |     /// Extend by concatenating `other`. If `is_input_density` is true, then we are tracking an input density,
170 |     /// and other may contain a redundant input for the `One` element. Coalesce those as needed and track the result.
171 |     pub fn extend(&mut self, other: &Self, is_input_density: bool) {
172 |         if other.bv.is_empty() {
173 |             // Nothing to do if other is empty.
174 |             return;
175 |         }
176 | 
177 |         if self.bv.is_empty() {
178 |             // If self is empty, assume other's density.
179 |             self.total_density = other.total_density;
180 |             self.bv.resize(other.bv.len(), false);
181 |             self.bv.copy_from_bitslice(&*other.bv);
182 |             return;
183 |         }
184 | 
185 |         if is_input_density {
186 |             // Input densities need special handling to coalesce their first inputs.
187 | 
188 |             if other.bv[0] {
189 |                 // If other's first bit is set,
190 |                 if self.bv[0] {
191 |                     // And own first bit is set, then decrement total density so the final sum doesn't overcount.
192 |                     self.total_density -= 1;
193 |                 } else {
194 |                     // Otherwise, set own first bit.
195 |                     self.bv.set(0, true);
196 |                 }
197 |             }
198 |             // Now discard other's first bit, having accounted for it above, and extend self by remaining bits.
199 |             self.bv.extend(other.bv.iter().skip(1));
200 |         } else {
201 |             // Not an input density, just extend straightforwardly.
202 |             self.bv.extend(other.bv.iter());
203 |         }
204 | 
205 |         // Since any needed adjustments to total densities have been made, just sum the totals and keep the sum.
206 |         self.total_density += other.total_density;
207 |     }
208 | }
209 | 
210 | // Right shift the repr of a field element by `n` bits.
211 | fn shr(le_bytes: &mut [u8], mut n: u32) {
212 |     if n >= 8 * le_bytes.len() as u32 {
213 |         le_bytes.iter_mut().for_each(|byte| *byte = 0);
214 |         return;
215 |     }
216 | 
217 |     // Shift each full byte towards the least significant end.
218 |     while n >= 8 {
219 |         let mut replacement = 0;
220 |         for byte in le_bytes.iter_mut().rev() {
221 |             std::mem::swap(&mut replacement, byte);
222 |         }
223 |         n -= 8;
224 |     }
225 | 
226 |     // Starting at the most significant byte, shift the byte's `n` least significant bits into the
227 |     // `n` most significant bits of the next byte.
228 |     if n > 0 {
229 |         let mut shift_in = 0;
230 |         for byte in le_bytes.iter_mut().rev() {
231 |             // Copy the byte's `n` least significant bits.
232 |             let shift_out = *byte << (8 - n);
233 |             // Shift the byte by `n` bits; zeroing its `n` most significant bits.
234 |             *byte >>= n;
235 |             // Replace the `n` most significant bits with the bits shifted out of the previous byte.
236 |             *byte |= shift_in;
237 |             shift_in = shift_out;
238 |         }
239 |     }
240 | }
241 | 
242 | fn multiexp_inner<Q, D, G, S>(
243 |     bases: S,
244 |     density_map: D,
245 |     exponents: Arc<Vec<<G::Scalar as PrimeField>::Repr>>,
246 |     c: u32,
247 | ) -> Result<<G as PrimeCurveAffine>::Curve, EcError>
248 | where
249 |     for<'a> &'a Q: QueryDensity,
250 |     D: Send + Sync + 'static + Clone + AsRef<Q>,
251 |     G: PrimeCurveAffine,
252 |     S: SourceBuilder<G>,
253 | {
254 |     // Perform this region of the multiexp
255 |     let this = move |bases: S,
256 |                      density_map: D,
257 |                      exponents: Arc<Vec<<G::Scalar as PrimeField>::Repr>>,
258 |                      skip: u32|
259 |           -> Result<_, EcError> {
260 |         // Accumulate the result
261 |         let mut acc = G::Curve::identity();
262 | 
263 |         // Build a source for the bases
264 |         let mut bases = bases.new();
265 | 
266 |         // Create space for the buckets
267 |         let mut buckets = vec![<G as PrimeCurveAffine>::Curve::identity(); (1 << c) - 1];
268 | 
269 |         let zero = G::Scalar::ZERO.to_repr();
270 |         let one = G::Scalar::ONE.to_repr();
271 | 
272 |         // only the first round uses this
273 |         let handle_trivial = skip == 0;
274 | 
275 |         // Sort the bases into buckets
276 |         for (&exp, density) in exponents.iter().zip(density_map.as_ref().iter()) {
277 |             if density {
278 |                 if exp.as_ref() == zero.as_ref() {
279 |                     bases.skip(1)?;
280 |                 } else if exp.as_ref() == one.as_ref() {
281 |                     if handle_trivial {
282 |                         bases.add_assign_mixed(&mut acc)?;
283 |                     } else {
284 |                         bases.skip(1)?;
285 |                     }
286 |                 } else {
287 |                     let mut exp = exp;
288 |                     shr(exp.as_mut(), skip);
289 |                     let exp = u64::from_le_bytes(exp.as_ref()[..8].try_into().unwrap()) % (1 << c);
290 | 
291 |                     if exp != 0 {
292 |                         bases.add_assign_mixed(&mut buckets[(exp - 1) as usize])?;
293 |                     } else {
294 |                         bases.skip(1)?;
295 |                     }
296 |                 }
297 |             }
298 |         }
299 | 
300 |         // Summation by parts
301 |         // e.g. 3a + 2b + 1c = a +
302 |         //                    (a) + b +
303 |         //                    ((a) + b) + c
304 |         let mut running_sum = G::Curve::identity();
305 |         for exp in buckets.into_iter().rev() {
306 |             running_sum.add_assign(&exp);
307 |             acc.add_assign(&running_sum);
308 |         }
309 | 
310 |         Ok(acc)
311 |     };
312 | 
313 |     let parts = (0..<G::Scalar as PrimeField>::NUM_BITS)
314 |         .into_par_iter()
315 |         .step_by(c as usize)
316 |         .map(|skip| this(bases.clone(), density_map.clone(), exponents.clone(), skip))
317 |         .collect::<Vec<Result<_, _>>>();
318 | 
319 |     parts.into_iter().rev().try_fold(
320 |         <G as PrimeCurveAffine>::Curve::identity(),
321 |         |mut acc, part| {
322 |             for _ in 0..c {
323 |                 acc = acc.double();
324 |             }
325 | 
326 |             acc.add_assign(&part?);
327 |             Ok(acc)
328 |         },
329 |     )
330 | }
331 | 
332 | /// Perform multi-exponentiation. The caller is responsible for ensuring the
333 | /// query size is the same as the number of exponents.
334 | pub fn multiexp_cpu<'b, Q, D, G, S>(
335 |     pool: &Worker,
336 |     bases: S,
337 |     density_map: D,
338 |     exponents: Arc<Vec<<G::Scalar as PrimeField>::Repr>>,
339 | ) -> Waiter<Result<<G as PrimeCurveAffine>::Curve, EcError>>
340 | where
341 |     for<'a> &'a Q: QueryDensity,
342 |     D: Send + Sync + 'static + Clone + AsRef<Q>,
343 |     G: PrimeCurveAffine,
344 |     S: SourceBuilder<G>,
345 | {
346 |     let c = if exponents.len() < 32 {
347 |         3u32
348 |     } else {
349 |         (f64::from(exponents.len() as u32)).ln().ceil() as u32
350 |     };
351 | 
352 |     if let Some(query_size) = density_map.as_ref().get_query_size() {
353 |         // If the density map has a known query size, it should not be
354 |         // inconsistent with the number of exponents.
355 |         assert!(query_size == exponents.len());
356 |     }
357 | 
358 |     pool.compute(move || multiexp_inner(bases, density_map, exponents, c))
359 | }
360 | 
361 | #[cfg(test)]
362 | mod tests {
363 |     use super::*;
364 | 
365 |     use blstrs::Bls12;
366 |     use group::Curve;
367 |     use pairing::Engine;
368 |     use rand::Rng;
369 |     use rand_core::SeedableRng;
370 |     use rand_xorshift::XorShiftRng;
371 | 
372 |     #[test]
373 |     fn test_with_bls12() {
374 |         fn naive_multiexp<G: PrimeCurveAffine>(
375 |             bases: Arc<Vec<G>>,
376 |             exponents: &[G::Scalar],
377 |         ) -> G::Curve {
378 |             assert_eq!(bases.len(), exponents.len());
379 | 
380 |             let mut acc = G::Curve::identity();
381 | 
382 |             for (base, exp) in bases.iter().zip(exponents.iter()) {
383 |                 acc.add_assign(&base.mul(*exp));
384 |             }
385 | 
386 |             acc
387 |         }
388 | 
389 |         const SAMPLES: usize = 1 << 14;
390 | 
391 |         let rng = &mut rand::thread_rng();
392 |         let v: Vec<<Bls12 as Engine>::Fr> = (0..SAMPLES)
393 |             .map(|_| <Bls12 as Engine>::Fr::random(&mut *rng))
394 |             .collect();
395 |         let g = Arc::new(
396 |             (0..SAMPLES)
397 |                 .map(|_| <Bls12 as Engine>::G1::random(&mut *rng).to_affine())
398 |                 .collect::<Vec<_>>(),
399 |         );
400 | 
401 |         let now = std::time::Instant::now();
402 |         let naive = naive_multiexp(g.clone(), &v);
403 |         println!("Naive: {}", now.elapsed().as_millis());
404 | 
405 |         let now = std::time::Instant::now();
406 |         let pool = Worker::new();
407 | 
408 |         let v = Arc::new(v.into_iter().map(|fr| fr.to_repr()).collect());
409 |         let fast = multiexp_cpu(&pool, (g, 0), FullDensity, v).wait().unwrap();
410 | 
411 |         println!("Fast: {}", now.elapsed().as_millis());
412 | 
413 |         assert_eq!(naive, fast);
414 |     }
415 | 
416 |     #[test]
417 |     fn test_extend_density_regular() {
418 |         let mut rng = XorShiftRng::from_seed([
419 |             0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06,
420 |             0xbc, 0xe5,
421 |         ]);
422 | 
423 |         for k in &[2, 4, 8] {
424 |             for j in &[10, 20, 50] {
425 |                 let count: usize = k * j;
426 | 
427 |                 let mut tracker_full = DensityTracker::new();
428 |                 let mut partial_trackers: Vec<DensityTracker> = Vec::with_capacity(count / k);
429 |                 for i in 0..count {
430 |                     if i % k == 0 {
431 |                         partial_trackers.push(DensityTracker::new());
432 |                     }
433 | 
434 |                     let index: usize = i / k;
435 |                     if rng.gen() {
436 |                         tracker_full.add_element();
437 |                         partial_trackers[index].add_element();
438 |                     }
439 | 
440 |                     if !partial_trackers[index].bv.is_empty() {
441 |                         let idx = rng.gen_range(0..partial_trackers[index].bv.len());
442 |                         let offset: usize = partial_trackers
443 |                             .iter()
444 |                             .take(index)
445 |                             .map(|t| t.bv.len())
446 |                             .sum();
447 |                         tracker_full.inc(offset + idx);
448 |                         partial_trackers[index].inc(idx);
449 |                     }
450 |                 }
451 | 
452 |                 let mut tracker_combined = DensityTracker::new();
453 |                 for tracker in partial_trackers.into_iter() {
454 |                     tracker_combined.extend(&tracker, false);
455 |                 }
456 |                 assert_eq!(tracker_combined, tracker_full);
457 |             }
458 |         }
459 |     }
460 | 
461 |     #[test]
462 |     fn test_extend_density_input() {
463 |         let mut rng = XorShiftRng::from_seed([
464 |             0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06,
465 |             0xbc, 0xe5,
466 |         ]);
467 |         let trials = 10;
468 |         let max_bits = 10;
469 |         let max_density = max_bits;
470 | 
471 |         // Create an empty DensityTracker.
472 |         let empty = DensityTracker::new;
473 | 
474 |         // Create a random DensityTracker with first bit unset.
475 |         let unset = |rng: &mut XorShiftRng| {
476 |             let mut dt = DensityTracker::new();
477 |             dt.add_element();
478 |             let n = rng.gen_range(1..max_bits);
479 |             let target_density = rng.gen_range(0..max_density);
480 |             for _ in 1..n {
481 |                 dt.add_element();
482 |             }
483 | 
484 |             for _ in 0..target_density {
485 |                 if n > 1 {
486 |                     let to_inc = rng.gen_range(1..n);
487 |                     dt.inc(to_inc);
488 |                 }
489 |             }
490 |             assert!(!dt.bv[0]);
491 |             assert_eq!(n, dt.bv.len());
492 |             dbg!(&target_density, &dt.total_density);
493 | 
494 |             dt
495 |         };
496 | 
497 |         // Create a random DensityTracker with first bit set.
498 |         let set = |rng: &mut XorShiftRng| {
499 |             let mut dt = unset(rng);
500 |             dt.inc(0);
501 |             dt
502 |         };
503 | 
504 |         for _ in 0..trials {
505 |             {
506 |                 // Both empty.
507 |                 let (mut e1, e2) = (empty(), empty());
508 |                 e1.extend(&e2, true);
509 |                 assert_eq!(empty(), e1);
510 |             }
511 |             {
512 |                 // First empty, second unset.
513 |                 let (mut e1, u1) = (empty(), unset(&mut rng));
514 |                 e1.extend(&u1.clone(), true);
515 |                 assert_eq!(u1, e1);
516 |             }
517 |             {
518 |                 // First empty, second set.
519 |                 let (mut e1, s1) = (empty(), set(&mut rng));
520 |                 e1.extend(&s1.clone(), true);
521 |                 assert_eq!(s1, e1);
522 |             }
523 |             {
524 |                 // First set, second empty.
525 |                 let (mut s1, e1) = (set(&mut rng), empty());
526 |                 let s2 = s1.clone();
527 |                 s1.extend(&e1, true);
528 |                 assert_eq!(s1, s2);
529 |             }
530 |             {
531 |                 // First unset, second empty.
532 |                 let (mut u1, e1) = (unset(&mut rng), empty());
533 |                 let u2 = u1.clone();
534 |                 u1.extend(&e1, true);
535 |                 assert_eq!(u1, u2);
536 |             }
537 |             {
538 |                 // First unset, second unset.
539 |                 let (mut u1, u2) = (unset(&mut rng), unset(&mut rng));
540 |                 let expected_total = u1.total_density + u2.total_density;
541 |                 u1.extend(&u2, true);
542 |                 assert_eq!(expected_total, u1.total_density);
543 |                 assert!(!u1.bv[0]);
544 |             }
545 |             {
546 |                 // First unset, second set.
547 |                 let (mut u1, s1) = (unset(&mut rng), set(&mut rng));
548 |                 let expected_total = u1.total_density + s1.total_density;
549 |                 u1.extend(&s1, true);
550 |                 assert_eq!(expected_total, u1.total_density);
551 |                 assert!(u1.bv[0]);
552 |             }
553 |             {
554 |                 // First set, second unset.
555 |                 let (mut s1, u1) = (set(&mut rng), unset(&mut rng));
556 |                 let expected_total = s1.total_density + u1.total_density;
557 |                 s1.extend(&u1, true);
558 |                 assert_eq!(expected_total, s1.total_density);
559 |                 assert!(s1.bv[0]);
560 |             }
561 |             {
562 |                 // First set, second set.
563 |                 let (mut s1, s2) = (set(&mut rng), set(&mut rng));
564 |                 let expected_total = s1.total_density + s2.total_density - 1;
565 |                 s1.extend(&s2, true);
566 |                 assert_eq!(expected_total, s1.total_density);
567 |                 assert!(s1.bv[0]);
568 |             }
569 |         }
570 |     }
571 | }
572 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/program.rs:
--------------------------------------------------------------------------------
 1 | #[macro_export]
 2 | /// Helper macro to create a program for a device.
 3 | ///
 4 | /// It will embed the CUDA fatbin/OpenCL source code within your binary. The source needs to be
 5 | /// generated via [`crate::source::generate`] in your `build.rs`.
 6 | ///
 7 | /// It returns a `[crate::rust_gpu_tools::Program`] instance.
 8 | macro_rules! program {
 9 |     ($device:ident) => {{
10 |         use $crate::rust_gpu_tools::{Framework, GPUError, Program};
11 |         (|device: &Device| -> Result<Program, $crate::EcError> {
12 |             // Selects a CUDA or OpenCL on the `EC_GPU_FRAMEWORK` environment variable and the
13 |             // compile-time features.
14 |             //
15 |             // You cannot select CUDA if the library was compiled without support for it.
16 |             let default_framework = device.framework();
17 |             let framework = match ::std::env::var("EC_GPU_FRAMEWORK") {
18 |                 Ok(env) => match env.as_ref() {
19 |                     "cuda" => {
20 |                         #[cfg(feature = "cuda")]
21 |                         {
22 |                             Framework::Cuda
23 |                         }
24 | 
25 |                         #[cfg(not(feature = "cuda"))]
26 |                         return Err($crate::EcError::Simple("CUDA framework is not supported, please compile with the `cuda` feature enabled."))
27 |                     }
28 |                     "opencl" => {
29 |                         #[cfg(feature = "opencl")]
30 |                         {
31 |                             Framework::Opencl
32 |                         }
33 | 
34 |                         #[cfg(not(feature = "opencl"))]
35 |                         return Err($crate::EcError::Simple("OpenCL framework is not supported, please compile with the `opencl` feature enabled."))
36 |                     }
37 |                     _ => default_framework,
38 |                 },
39 |                 Err(_) => default_framework,
40 |             };
41 | 
42 |             match framework {
43 |                 #[cfg(feature = "cuda")]
44 |                 Framework::Cuda => {
45 |                     let kernel = include_bytes!(env!("_EC_GPU_CUDA_KERNEL_FATBIN"));
46 |                     let cuda_device = device.cuda_device().ok_or(GPUError::DeviceNotFound)?;
47 |                     let program = $crate::rust_gpu_tools::cuda::Program::from_bytes(cuda_device, kernel)?;
48 |                     Ok(Program::Cuda(program))
49 |                 }
50 |                 #[cfg(feature = "opencl")]
51 |                 Framework::Opencl => {
52 |                     let source = include_str!(env!("_EC_GPU_OPENCL_KERNEL_SOURCE"));
53 |                     let opencl_device = device.opencl_device().ok_or(GPUError::DeviceNotFound)?;
54 |                     let program = $crate::rust_gpu_tools::opencl::Program::from_opencl(opencl_device, source)?;
55 |                     Ok(Program::Opencl(program))
56 |                 }
57 |             }
58 |         })($device)
59 |     }};
60 | }
61 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/source.rs:
--------------------------------------------------------------------------------
  1 | use std::collections::HashSet;
  2 | use std::fmt::{self, Write};
  3 | use std::hash::{Hash, Hasher};
  4 | use std::marker::PhantomData;
  5 | use std::mem;
  6 | #[cfg(any(feature = "opencl", feature = "cuda"))]
  7 | use std::path::PathBuf;
  8 | #[cfg(any(feature = "opencl", feature = "cuda"))]
  9 | use std::{env, fs};
 10 | 
 11 | use ec_gpu::{GpuField, GpuName};
 12 | use group::prime::PrimeCurveAffine;
 13 | 
 14 | static COMMON_SRC: &str = include_str!("cl/common.cl");
 15 | static FIELD_SRC: &str = include_str!("cl/field.cl");
 16 | static FIELD2_SRC: &str = include_str!("cl/field2.cl");
 17 | static EC_SRC: &str = include_str!("cl/ec.cl");
 18 | static FFT_SRC: &str = include_str!("cl/fft.cl");
 19 | static MULTIEXP_SRC: &str = include_str!("cl/multiexp.cl");
 20 | 
 21 | #[derive(Clone, Copy)]
 22 | enum Limb32Or64 {
 23 |     Limb32,
 24 |     Limb64,
 25 | }
 26 | 
 27 | /// This trait is used to uniquely identify items by some identifier (`name`) and to return the GPU
 28 | /// source code they produce.
 29 | trait NameAndSource {
 30 |     /// The name to identify the item.
 31 |     fn name(&self) -> String;
 32 |     /// The GPU source code that is generated.
 33 |     fn source(&self, limb: Limb32Or64) -> String;
 34 | }
 35 | 
 36 | impl PartialEq for dyn NameAndSource {
 37 |     fn eq(&self, other: &Self) -> bool {
 38 |         self.name() == other.name()
 39 |     }
 40 | }
 41 | 
 42 | impl Eq for dyn NameAndSource {}
 43 | 
 44 | impl Hash for dyn NameAndSource {
 45 |     fn hash<H: Hasher>(&self, state: &mut H) {
 46 |         self.name().hash(state)
 47 |     }
 48 | }
 49 | 
 50 | /// Prints the name by default, the source code of the 32-bit limb in the alternate mode via
 51 | /// `{:#?}`.
 52 | impl fmt::Debug for dyn NameAndSource {
 53 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 54 |         if f.alternate() {
 55 |             f.debug_map()
 56 |                 .entries(vec![
 57 |                     ("name", self.name()),
 58 |                     ("source", self.source(Limb32Or64::Limb32)),
 59 |                 ])
 60 |                 .finish()
 61 |         } else {
 62 |             write!(f, "{:?}", self.name())
 63 |         }
 64 |     }
 65 | }
 66 | 
 67 | /// A field that might also be an extension field.
 68 | ///
 69 | /// When the field is an extension field, we also add its sub-field to the list of fields. This
 70 | /// enum is used to indicate that it's a sub-field that has a corresponding extension field. This
 71 | /// way we can make sure that when the source is generated, that also the source for the sub-field
 72 | /// is generated, while not having duplicated field definitions.
 73 | // Storing the sub-field as a string is a bit of a hack around Rust's type system. If we would
 74 | // store the generic type, then the enum would need to be generic over two fields, even in
 75 | // the case when no extension field is used. This would make the API harder to use.
 76 | #[derive(Debug)]
 77 | enum Field<F: GpuField> {
 78 |     /// A field, might be an extension field.
 79 |     Field(PhantomData<F>),
 80 |     /// A sub-field with the given name that has a corresponding extension field.
 81 |     SubField(String),
 82 | }
 83 | 
 84 | impl<F: GpuField> Field<F> {
 85 |     /// Create a new field for the given generic type.
 86 |     pub fn new() -> Self {
 87 |         // By default it's added as a field. If it's an extension field, then the `add_field()`
 88 |         // function will create a copy of it, as `SubField` variant.
 89 |         Self::Field(PhantomData)
 90 |     }
 91 | }
 92 | 
 93 | impl<F: GpuField> Default for Field<F> {
 94 |     fn default() -> Self {
 95 |         Self::new()
 96 |     }
 97 | }
 98 | 
 99 | fn field_source<F: GpuField>(limb: Limb32Or64) -> String {
100 |     match limb {
101 |         Limb32Or64::Limb32 => [
102 |             params::<F, Limb32>(),
103 |             field_add_sub_nvidia::<F, Limb32>().expect("preallocated"),
104 |             String::from(FIELD_SRC),
105 |         ]
106 |         .join("\n"),
107 |         Limb32Or64::Limb64 => [
108 |             params::<F, Limb64>(),
109 |             field_add_sub_nvidia::<F, Limb64>().expect("preallocated"),
110 |             String::from(FIELD_SRC),
111 |         ]
112 |         .join("\n"),
113 |     }
114 | }
115 | 
116 | impl<F: GpuField> NameAndSource for Field<F> {
117 |     fn name(&self) -> String {
118 |         match self {
119 |             Self::Field(_) => F::name(),
120 |             Self::SubField(name) => name.to_string(),
121 |         }
122 |     }
123 | 
124 |     fn source(&self, limb: Limb32Or64) -> String {
125 |         match self {
126 |             Self::Field(_) => {
127 |                 // If it's an extension field.
128 |                 if let Some(sub_field_name) = F::sub_field_name() {
129 |                     String::from(FIELD2_SRC)
130 |                         .replace("FIELD2", &F::name())
131 |                         .replace("FIELD", &sub_field_name)
132 |                 } else {
133 |                     field_source::<F>(limb).replace("FIELD", &F::name())
134 |                 }
135 |             }
136 |             Self::SubField(sub_field_name) => {
137 |                 // The `GpuField` implementation of the extension field contains the constants of
138 |                 // the sub-field. Hence we can just forward the `F`. It's important that those
139 |                 // functions do *not* use the name of the field, else we might generate the
140 |                 // sub-field named like the extension field.
141 |                 field_source::<F>(limb).replace("FIELD", sub_field_name)
142 |             }
143 |         }
144 |     }
145 | }
146 | 
147 | /// Struct that generates FFT GPU source code.
148 | struct Fft<F: GpuName>(PhantomData<F>);
149 | 
150 | impl<F: GpuName> NameAndSource for Fft<F> {
151 |     fn name(&self) -> String {
152 |         F::name()
153 |     }
154 | 
155 |     fn source(&self, _limb: Limb32Or64) -> String {
156 |         String::from(FFT_SRC).replace("FIELD", &F::name())
157 |     }
158 | }
159 | 
160 | /// Struct that generates multiexp GPU smource code.
161 | struct Multiexp<P: GpuName, F: GpuName, Exp: GpuName> {
162 |     curve_point: PhantomData<P>,
163 |     field: PhantomData<F>,
164 |     exponent: PhantomData<Exp>,
165 | }
166 | 
167 | impl<P: GpuName, F: GpuName, Exp: GpuName> Multiexp<P, F, Exp> {
168 |     pub fn new() -> Self {
169 |         Self {
170 |             curve_point: PhantomData::<P>,
171 |             field: PhantomData::<F>,
172 |             exponent: PhantomData::<Exp>,
173 |         }
174 |     }
175 | }
176 | 
177 | impl<P: GpuName, F: GpuName, Exp: GpuName> NameAndSource for Multiexp<P, F, Exp> {
178 |     fn name(&self) -> String {
179 |         P::name()
180 |     }
181 | 
182 |     fn source(&self, _limb: Limb32Or64) -> String {
183 |         let ec = String::from(EC_SRC)
184 |             .replace("FIELD", &F::name())
185 |             .replace("POINT", &P::name());
186 |         let multiexp = String::from(MULTIEXP_SRC)
187 |             .replace("POINT", &P::name())
188 |             .replace("EXPONENT", &Exp::name());
189 |         [ec, multiexp].concat()
190 |     }
191 | }
192 | 
193 | /// Builder to create the source code of a GPU kernel.
194 | ///
195 | /// # Example
196 | ///
197 | /// ```
198 | /// use blstrs::{Fp, Fp2, G1Affine, G2Affine, Scalar};
199 | /// use ec_gpu_gen::SourceBuilder;
200 | ///
201 | /// # #[cfg(any(feature = "cuda", feature = "opencl"))]
202 | /// let source = SourceBuilder::new()
203 | ///     .add_fft::<Scalar>()
204 | ///     .add_multiexp::<G1Affine, Fp>()
205 | ///     .add_multiexp::<G2Affine, Fp2>()
206 | ///     .build_32_bit_limbs();
207 | ///```
208 | // In the `HashSet`s the concrete types cannot be used, as each item of the set should be able to
209 | // have its own (different) generic type.
210 | // We distinguish between extension fields and other fields as sub-fields need to be defined first
211 | // in the source code (due to being C, where the order of declaration matters).
212 | pub struct SourceBuilder {
213 |     /// The [`Field`]s that are used in this kernel.
214 |     fields: HashSet<Box<dyn NameAndSource>>,
215 |     /// The extension [`Field`]s that are used in this kernel.
216 |     extension_fields: HashSet<Box<dyn NameAndSource>>,
217 |     /// The [`Fft`]s that are used in this kernel.
218 |     ffts: HashSet<Box<dyn NameAndSource>>,
219 |     /// The [`Multiexp`]s that are used in this kernel.
220 |     multiexps: HashSet<Box<dyn NameAndSource>>,
221 |     /// Additional source that is appended at the end of the generated source.
222 |     extra_sources: Vec<String>,
223 | }
224 | 
225 | impl SourceBuilder {
226 |     /// Create a new configuration to generation a GPU kernel.
227 |     pub fn new() -> Self {
228 |         Self {
229 |             fields: HashSet::new(),
230 |             extension_fields: HashSet::new(),
231 |             ffts: HashSet::new(),
232 |             multiexps: HashSet::new(),
233 |             extra_sources: Vec::new(),
234 |         }
235 |     }
236 | 
237 |     /// Add a field to the configuration.
238 |     ///
239 |     /// If it is an extension field, then the extension field *and* the sub-field is added.
240 |     pub fn add_field<F>(mut self) -> Self
241 |     where
242 |         F: GpuField + 'static,
243 |     {
244 |         let field = Field::<F>::new();
245 |         // If it's an extension field, also add the corresponding sub-field.
246 |         if let Some(sub_field_name) = F::sub_field_name() {
247 |             self.extension_fields.insert(Box::new(field));
248 |             let sub_field = Field::<F>::SubField(sub_field_name);
249 |             self.fields.insert(Box::new(sub_field));
250 |         } else {
251 |             self.fields.insert(Box::new(field));
252 |         }
253 |         self
254 |     }
255 | 
256 |     /// Add an FFT kernel function to the configuration.
257 |     pub fn add_fft<F>(self) -> Self
258 |     where
259 |         F: GpuField + 'static,
260 |     {
261 |         let mut config = self.add_field::<F>();
262 |         let fft = Fft::<F>(PhantomData);
263 |         config.ffts.insert(Box::new(fft));
264 |         config
265 |     }
266 | 
267 |     /// Add an Multiexp kernel function to the configuration.
268 |     ///
269 |     /// The field must be given explicitly as currently it cannot derived from the curve point
270 |     /// directly.
271 |     pub fn add_multiexp<C, F>(self) -> Self
272 |     where
273 |         C: PrimeCurveAffine + GpuName,
274 |         C::Scalar: GpuField,
275 |         F: GpuField + 'static,
276 |     {
277 |         let mut config = self.add_field::<F>().add_field::<C::Scalar>();
278 |         let multiexp = Multiexp::<C, F, C::Scalar>::new();
279 |         config.multiexps.insert(Box::new(multiexp));
280 |         config
281 |     }
282 | 
283 |     /// Appends some given source at the end of the generated source.
284 |     ///
285 |     /// This is useful for cases where you use this library as building block, but have your own
286 |     /// kernel implementation. If this function is is called several times, then those sources are
287 |     /// appended in that call order.
288 |     pub fn append_source(mut self, source: String) -> Self {
289 |         self.extra_sources.push(source);
290 |         self
291 |     }
292 | 
293 |     /// Generate the GPU kernel source code based on the current configuration with 32-bit limbs.
294 |     ///
295 |     /// On CUDA 32-bit limbs are recommended.
296 |     pub fn build_32_bit_limbs(&self) -> String {
297 |         self.build(Limb32Or64::Limb32)
298 |     }
299 | 
300 |     /// Generate the GPU kernel source code based on the current configuration with 64-bit limbs.
301 |     ///
302 |     /// On OpenCL 32-bit limbs are recommended.
303 |     pub fn build_64_bit_limbs(&self) -> String {
304 |         self.build(Limb32Or64::Limb64)
305 |     }
306 | 
307 |     /// Generate the GPU kernel source code based on the current configuration.
308 |     fn build(&self, limb_size: Limb32Or64) -> String {
309 |         let fields = self
310 |             .fields
311 |             .iter()
312 |             .map(|field| field.source(limb_size))
313 |             .collect();
314 |         let extension_fields = self
315 |             .extension_fields
316 |             .iter()
317 |             .map(|field| field.source(limb_size))
318 |             .collect();
319 |         let ffts = self.ffts.iter().map(|fft| fft.source(limb_size)).collect();
320 |         let multiexps = self
321 |             .multiexps
322 |             .iter()
323 |             .map(|multiexp| multiexp.source(limb_size))
324 |             .collect();
325 |         let extra_sources = self.extra_sources.join("\n");
326 |         [
327 |             COMMON_SRC.to_string(),
328 |             fields,
329 |             extension_fields,
330 |             ffts,
331 |             multiexps,
332 |             extra_sources,
333 |         ]
334 |         .join("\n\n")
335 |     }
336 | }
337 | 
338 | impl Default for SourceBuilder {
339 |     fn default() -> Self {
340 |         Self::new()
341 |     }
342 | }
343 | 
344 | /// Trait to implement limbs of different underlying bit sizes.
345 | pub trait Limb: Sized + Clone + Copy {
346 |     /// The underlying size of the limb, e.g. `u32`
347 |     type LimbType: Clone + std::fmt::Display;
348 |     /// Returns the value representing zero.
349 |     fn zero() -> Self;
350 |     /// Returns a new limb.
351 |     fn new(val: Self::LimbType) -> Self;
352 |     /// Returns the raw value of the limb.
353 |     fn value(&self) -> Self::LimbType;
354 |     /// Returns the bit size of the limb.
355 |     fn bits() -> usize {
356 |         mem::size_of::<Self::LimbType>() * 8
357 |     }
358 |     /// Returns a tuple with the strings that PTX is using to describe the type and the register.
359 |     fn ptx_info() -> (&'static str, &'static str);
360 |     /// Returns the type that OpenCL is using to represent the limb.
361 |     fn opencl_type() -> &'static str;
362 |     /// Returns the limbs that represent the multiplicative identity of the given field.
363 |     fn one_limbs<F: GpuField>() -> Vec<Self>;
364 |     /// Returns the field modulus in non-Montgomery form as a vector of `Self::LimbType` (least
365 |     /// significant limb first).
366 |     fn modulus_limbs<F: GpuField>() -> Vec<Self>;
367 |     /// Calculate the `INV` parameter of Montgomery reduction algorithm for 32/64bit limbs
368 |     /// * `a` - Is the first limb of modulus.
369 |     fn calc_inv(a: Self) -> Self;
370 |     /// Returns the limbs that represent `R ^ 2 mod P`.
371 |     fn calculate_r2<F: GpuField>() -> Vec<Self>;
372 | }
373 | 
374 | /// A 32-bit limb.
375 | #[derive(Clone, Copy)]
376 | pub struct Limb32(u32);
377 | impl Limb for Limb32 {
378 |     type LimbType = u32;
379 |     fn zero() -> Self {
380 |         Self(0)
381 |     }
382 |     fn new(val: Self::LimbType) -> Self {
383 |         Self(val)
384 |     }
385 |     fn value(&self) -> Self::LimbType {
386 |         self.0
387 |     }
388 |     fn ptx_info() -> (&'static str, &'static str) {
389 |         ("u32", "r")
390 |     }
391 |     fn opencl_type() -> &'static str {
392 |         "uint"
393 |     }
394 |     fn one_limbs<F: GpuField>() -> Vec<Self> {
395 |         F::one().into_iter().map(Self::new).collect()
396 |     }
397 |     fn modulus_limbs<F: GpuField>() -> Vec<Self> {
398 |         F::modulus().into_iter().map(Self::new).collect()
399 |     }
400 |     fn calc_inv(a: Self) -> Self {
401 |         let mut inv = 1u32;
402 |         for _ in 0..31 {
403 |             inv = inv.wrapping_mul(inv);
404 |             inv = inv.wrapping_mul(a.value());
405 |         }
406 |         Self(inv.wrapping_neg())
407 |     }
408 |     fn calculate_r2<F: GpuField>() -> Vec<Self> {
409 |         F::r2().into_iter().map(Self::new).collect()
410 |     }
411 | }
412 | 
413 | /// A 64-bit limb.
414 | #[derive(Clone, Copy)]
415 | pub struct Limb64(u64);
416 | impl Limb for Limb64 {
417 |     type LimbType = u64;
418 |     fn zero() -> Self {
419 |         Self(0)
420 |     }
421 |     fn new(val: Self::LimbType) -> Self {
422 |         Self(val)
423 |     }
424 |     fn value(&self) -> Self::LimbType {
425 |         self.0
426 |     }
427 |     fn ptx_info() -> (&'static str, &'static str) {
428 |         ("u64", "l")
429 |     }
430 |     fn opencl_type() -> &'static str {
431 |         "ulong"
432 |     }
433 |     fn one_limbs<F: GpuField>() -> Vec<Self> {
434 |         F::one()
435 |             .chunks(2)
436 |             .map(|chunk| Self::new(((chunk[1] as u64) << 32) + (chunk[0] as u64)))
437 |             .collect()
438 |     }
439 | 
440 |     fn modulus_limbs<F: GpuField>() -> Vec<Self> {
441 |         F::modulus()
442 |             .chunks(2)
443 |             .map(|chunk| Self::new(((chunk[1] as u64) << 32) + (chunk[0] as u64)))
444 |             .collect()
445 |     }
446 | 
447 |     fn calc_inv(a: Self) -> Self {
448 |         let mut inv = 1u64;
449 |         for _ in 0..63 {
450 |             inv = inv.wrapping_mul(inv);
451 |             inv = inv.wrapping_mul(a.value());
452 |         }
453 |         Self(inv.wrapping_neg())
454 |     }
455 |     fn calculate_r2<F: GpuField>() -> Vec<Self> {
456 |         F::r2()
457 |             .chunks(2)
458 |             .map(|chunk| Self::new(((chunk[1] as u64) << 32) + (chunk[0] as u64)))
459 |             .collect()
460 |     }
461 | }
462 | 
463 | fn const_field<L: Limb>(name: &str, limbs: Vec<L>) -> String {
464 |     format!(
465 |         "CONSTANT FIELD {} = {{ {{ {} }} }};",
466 |         name,
467 |         limbs
468 |             .iter()
469 |             .map(|l| l.value().to_string())
470 |             .collect::<Vec<_>>()
471 |             .join(", ")
472 |     )
473 | }
474 | 
475 | /// Generates CUDA/OpenCL constants and type definitions of prime-field `F`
476 | fn params<F, L>() -> String
477 | where
478 |     F: GpuField,
479 |     L: Limb,
480 | {
481 |     let one = L::one_limbs::<F>(); // Get Montgomery form of F::one()
482 |     let p = L::modulus_limbs::<F>(); // Get field modulus in non-Montgomery form
483 |     let r2 = L::calculate_r2::<F>();
484 |     let limbs = one.len(); // Number of limbs
485 |     let inv = L::calc_inv(p[0]);
486 |     let limb_def = format!("#define FIELD_limb {}", L::opencl_type());
487 |     let limbs_def = format!("#define FIELD_LIMBS {}", limbs);
488 |     let limb_bits_def = format!("#define FIELD_LIMB_BITS {}", L::bits());
489 |     let p_def = const_field("FIELD_P", p);
490 |     let r2_def = const_field("FIELD_R2", r2);
491 |     let one_def = const_field("FIELD_ONE", one);
492 |     let zero_def = const_field("FIELD_ZERO", vec![L::zero(); limbs]);
493 |     let inv_def = format!("#define FIELD_INV {}", inv.value());
494 |     let typedef = "typedef struct { FIELD_limb val[FIELD_LIMBS]; } FIELD;".to_string();
495 |     [
496 |         limb_def,
497 |         limbs_def,
498 |         limb_bits_def,
499 |         inv_def,
500 |         typedef,
501 |         one_def,
502 |         p_def,
503 |         r2_def,
504 |         zero_def,
505 |     ]
506 |     .join("\n")
507 | }
508 | 
509 | /// Generates PTX-Assembly implementation of FIELD_add_/FIELD_sub_
510 | fn field_add_sub_nvidia<F, L>() -> Result<String, std::fmt::Error>
511 | where
512 |     F: GpuField,
513 |     L: Limb,
514 | {
515 |     let mut result = String::new();
516 |     let (ptx_type, ptx_reg) = L::ptx_info();
517 | 
518 |     writeln!(result, "#if defined(OPENCL_NVIDIA) || defined(CUDA)\n")?;
519 |     for op in &["sub", "add"] {
520 |         let len = L::one_limbs::<F>().len();
521 | 
522 |         writeln!(
523 |             result,
524 |             "DEVICE FIELD FIELD_{}_nvidia(FIELD a, FIELD b) {{",
525 |             op
526 |         )?;
527 |         if len > 1 {
528 |             write!(result, "asm(")?;
529 |             writeln!(result, "\"{}.cc.{} %0, %0, %{};\\r\\n\"", op, ptx_type, len)?;
530 | 
531 |             for i in 1..len - 1 {
532 |                 writeln!(
533 |                     result,
534 |                     "\"{}c.cc.{} %{}, %{}, %{};\\r\\n\"",
535 |                     op,
536 |                     ptx_type,
537 |                     i,
538 |                     i,
539 |                     len + i
540 |                 )?;
541 |             }
542 |             writeln!(
543 |                 result,
544 |                 "\"{}c.{} %{}, %{}, %{};\\r\\n\"",
545 |                 op,
546 |                 ptx_type,
547 |                 len - 1,
548 |                 len - 1,
549 |                 2 * len - 1
550 |             )?;
551 | 
552 |             write!(result, ":")?;
553 |             for n in 0..len {
554 |                 write!(result, "\"+{}\"(a.val[{}])", ptx_reg, n)?;
555 |                 if n != len - 1 {
556 |                     write!(result, ", ")?;
557 |                 }
558 |             }
559 | 
560 |             write!(result, "\n:")?;
561 |             for n in 0..len {
562 |                 write!(result, "\"{}\"(b.val[{}])", ptx_reg, n)?;
563 |                 if n != len - 1 {
564 |                     write!(result, ", ")?;
565 |                 }
566 |             }
567 |             writeln!(result, ");")?;
568 |         }
569 |         writeln!(result, "return a;\n}}")?;
570 |     }
571 |     writeln!(result, "#endif")?;
572 | 
573 |     Ok(result)
574 | }
575 | 
576 | /// Convenience function to generate a kernel/source based on a source builder.
577 | ///
578 | /// When the `cuda` feature is enabled it will compile a CUDA fatbin. The path to the file is
579 | /// stored in the `_EC_GPU_CUDA_KERNEL_FATBIN` environment variable, that will automatically be
580 | /// used by the `ec-gpu-gen` functionality that needs a kernel.
581 | ///
582 | ///
583 | /// When the `opencl` feature is enabled it will generate the source code for OpenCL. The path to
584 | /// the source file is stored in the `_EC_GPU_OPENCL_KERNEL_SOURCE` environment variable, that will
585 | /// automatically be used by the `ec-gpu-gen` functionality that needs a kernel. OpenCL compiles
586 | /// the source at run time).
587 | #[allow(unused_variables)]
588 | pub fn generate(source_builder: &SourceBuilder) {
589 |     #[cfg(feature = "cuda")]
590 |     generate_cuda(source_builder);
591 |     #[cfg(feature = "opencl")]
592 |     generate_opencl(source_builder);
593 | }
594 | 
595 | #[cfg(feature = "cuda")]
596 | fn generate_cuda(source_builder: &SourceBuilder) -> PathBuf {
597 |     use sha2::{Digest, Sha256};
598 | 
599 |     // This is a hack when no properly compiled kernel is needed. That's the case when the
600 |     // documentation is built on docs.rs and when Clippy is run. We can use arbitrary bytes as
601 |     // input then.
602 |     if env::var("DOCS_RS").is_ok() || cfg!(clippy) {
603 |         println!("cargo:rustc-env=_EC_GPU_CUDA_KERNEL_FATBIN=../build.rs");
604 |         return PathBuf::from("../build.rs");
605 |     }
606 | 
607 |     let kernel_source = source_builder.build_32_bit_limbs();
608 |     let out_dir = env::var("OUT_DIR").expect("OUT_DIR was not set.");
609 | 
610 |     // Make it possible to override the default options. Though the source and output file is
611 |     // always set automatically.
612 |     let mut nvcc = match env::var("EC_GPU_CUDA_NVCC_ARGS") {
613 |         Ok(args) => execute::command(format!("nvcc {}", args)),
614 |         Err(_) => {
615 |             let mut command = std::process::Command::new("nvcc");
616 |             command
617 |                 .arg("--optimize=6")
618 |                 // Compile with as many threads as CPUs are available.
619 |                 .arg("--threads=0")
620 |                 .arg("--fatbin")
621 |                 .arg("--gpu-architecture=sm_86")
622 |                 .arg("--generate-code=arch=compute_86,code=sm_86")
623 |                 .arg("--generate-code=arch=compute_80,code=sm_80")
624 |                 .arg("--generate-code=arch=compute_75,code=sm_75");
625 |             command
626 |         }
627 |     };
628 | 
629 |     // Hash the source and the compile flags. Use that as the filename, so that the kernel is only
630 |     // rebuilt if any of them change.
631 |     let mut hasher = Sha256::new();
632 |     hasher.update(kernel_source.as_bytes());
633 |     hasher.update(format!("{:?}", &nvcc));
634 |     let kernel_digest = hex::encode(hasher.finalize());
635 | 
636 |     let source_path: PathBuf = [&out_dir, &format!("{}.cu", &kernel_digest)]
637 |         .iter()
638 |         .collect();
639 |     let fatbin_path: PathBuf = [&out_dir, &format!("{}.fatbin", &kernel_digest)]
640 |         .iter()
641 |         .collect();
642 | 
643 |     fs::write(&source_path, &kernel_source).unwrap_or_else(|_| {
644 |         panic!(
645 |             "Cannot write kernel source at {}.",
646 |             source_path.to_str().unwrap()
647 |         )
648 |     });
649 | 
650 |     // Only compile if the output doesn't exist yet.
651 |     if !fatbin_path.as_path().exists() {
652 |         let status = nvcc
653 |             .arg("--output-file")
654 |             .arg(&fatbin_path)
655 |             .arg(&source_path)
656 |             .status()
657 |             .expect("Cannot run nvcc. Install the NVIDIA toolkit or disable the `cuda` feature.");
658 | 
659 |         if !status.success() {
660 |             panic!(
661 |                 "nvcc failed. See the kernel source at {}",
662 |                 source_path.to_str().unwrap()
663 |             );
664 |         }
665 |     }
666 | 
667 |     // The idea to put the path to the farbin into a compile-time env variable is from
668 |     // https://github.com/LutzCle/fast-interconnects-demo/blob/b80ea8e04825167f486ab8ac1b5d67cf7dd51d2c/rust-demo/build.rs
669 |     println!(
670 |         "cargo:rustc-env=_EC_GPU_CUDA_KERNEL_FATBIN={}",
671 |         fatbin_path.to_str().unwrap()
672 |     );
673 | 
674 |     fatbin_path
675 | }
676 | 
677 | #[cfg(feature = "opencl")]
678 | fn generate_opencl(source_builder: &SourceBuilder) -> PathBuf {
679 |     let kernel_source = source_builder.build_64_bit_limbs();
680 |     let out_dir = env::var("OUT_DIR").expect("OUT_DIR was not set.");
681 | 
682 |     // Generating the kernel source is cheap, hence use a fixed name and override it on every
683 |     // build.
684 |     let source_path: PathBuf = [&out_dir, "kernel.cl"].iter().collect();
685 | 
686 |     fs::write(&source_path, kernel_source).unwrap_or_else(|_| {
687 |         panic!(
688 |             "Cannot write kernel source at {}.",
689 |             source_path.to_str().unwrap()
690 |         )
691 |     });
692 | 
693 |     // For OpenCL we only need the kernel source, it is compiled at runtime.
694 |     println!(
695 |         "cargo:rustc-env=_EC_GPU_OPENCL_KERNEL_SOURCE={}",
696 |         source_path.to_str().unwrap()
697 |     );
698 | 
699 |     source_path
700 | }
701 | 
702 | #[cfg(all(test, any(feature = "opencl", feature = "cuda")))]
703 | mod tests {
704 |     use super::*;
705 | 
706 |     use std::sync::Mutex;
707 | 
708 |     #[cfg(feature = "cuda")]
709 |     use rust_gpu_tools::cuda;
710 |     #[cfg(feature = "opencl")]
711 |     use rust_gpu_tools::opencl;
712 |     use rust_gpu_tools::{program_closures, Device, GPUError, Program};
713 | 
714 |     use blstrs::Scalar;
715 |     use ff::{Field as _, PrimeField};
716 |     use lazy_static::lazy_static;
717 |     use rand::{thread_rng, Rng};
718 | 
719 |     static TEST_SRC: &str = include_str!("./cl/test.cl");
720 | 
721 |     #[derive(PartialEq, Debug, Clone, Copy)]
722 |     #[repr(transparent)]
723 |     pub struct GpuScalar(pub Scalar);
724 |     impl Default for GpuScalar {
725 |         fn default() -> Self {
726 |             Self(Scalar::ZERO)
727 |         }
728 |     }
729 | 
730 |     #[cfg(feature = "cuda")]
731 |     impl cuda::KernelArgument for GpuScalar {
732 |         fn as_c_void(&self) -> *mut std::ffi::c_void {
733 |             &self.0 as *const _ as _
734 |         }
735 |     }
736 | 
737 |     #[cfg(feature = "opencl")]
738 |     impl opencl::KernelArgument for GpuScalar {
739 |         fn push(&self, kernel: &mut opencl::Kernel) {
740 |             unsafe { kernel.builder.set_arg(&self.0) };
741 |         }
742 |     }
743 | 
744 |     /// The `run` call needs to return a result, use this struct as placeholder.
745 |     #[derive(Debug)]
746 |     struct NoError;
747 |     impl From<GPUError> for NoError {
748 |         fn from(_error: GPUError) -> Self {
749 |             Self
750 |         }
751 |     }
752 | 
753 |     fn test_source() -> SourceBuilder {
754 |         let test_source = String::from(TEST_SRC).replace("FIELD", &Scalar::name());
755 |         SourceBuilder::new()
756 |             .add_field::<Scalar>()
757 |             .append_source(test_source)
758 |     }
759 | 
760 |     #[cfg(feature = "cuda")]
761 |     lazy_static! {
762 |         static ref CUDA_PROGRAM: Mutex<Program> = {
763 |             use std::ffi::CString;
764 | 
765 |             let source = test_source();
766 |             let fatbin_path = generate_cuda(&source);
767 | 
768 |             let device = *Device::all().first().expect("Cannot get a default device.");
769 |             let cuda_device = device.cuda_device().unwrap();
770 |             let fatbin_path_cstring =
771 |                 CString::new(fatbin_path.to_str().expect("path is not valid UTF-8."))
772 |                     .expect("path contains NULL byte.");
773 |             let program =
774 |                 cuda::Program::from_binary(cuda_device, fatbin_path_cstring.as_c_str()).unwrap();
775 |             Mutex::new(Program::Cuda(program))
776 |         };
777 |     }
778 | 
779 |     #[cfg(feature = "opencl")]
780 |     lazy_static! {
781 |         static ref OPENCL_PROGRAM: Mutex<(Program, Program)> = {
782 |             let device = *Device::all().first().expect("Cannot get a default device");
783 |             let opencl_device = device.opencl_device().unwrap();
784 |             let source_32 = test_source().build_32_bit_limbs();
785 |             let program_32 = opencl::Program::from_opencl(opencl_device, &source_32).unwrap();
786 |             let source_64 = test_source().build_64_bit_limbs();
787 |             let program_64 = opencl::Program::from_opencl(opencl_device, &source_64).unwrap();
788 |             Mutex::new((Program::Opencl(program_32), Program::Opencl(program_64)))
789 |         };
790 |     }
791 | 
792 |     fn call_kernel(name: &str, scalars: &[GpuScalar], uints: &[u32]) -> Scalar {
793 |         let closures = program_closures!(|program, _args| -> Result<Scalar, NoError> {
794 |             let mut cpu_buffer = vec![GpuScalar::default()];
795 |             let buffer = program.create_buffer_from_slice(&cpu_buffer).unwrap();
796 | 
797 |             let mut kernel = program.create_kernel(name, 1, 64).unwrap();
798 |             for scalar in scalars {
799 |                 kernel = kernel.arg(scalar);
800 |             }
801 |             for uint in uints {
802 |                 kernel = kernel.arg(uint);
803 |             }
804 |             kernel.arg(&buffer).run().unwrap();
805 | 
806 |             program.read_into_buffer(&buffer, &mut cpu_buffer).unwrap();
807 |             Ok(cpu_buffer[0].0)
808 |         });
809 | 
810 |         // For CUDA we only test 32-bit limbs.
811 |         #[cfg(all(feature = "cuda", not(feature = "opencl")))]
812 |         return CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap();
813 | 
814 |         // For OpenCL we test for 32 and 64-bi limbs.
815 |         #[cfg(all(feature = "opencl", not(feature = "cuda")))]
816 |         {
817 |             let result_32 = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap();
818 |             let result_64 = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap();
819 |             assert_eq!(
820 |                 result_32, result_64,
821 |                 "Results for 32-bit and 64-bit limbs must be the same."
822 |             );
823 |             result_32
824 |         }
825 | 
826 |         // When both features are enabled, check if the results are the same
827 |         #[cfg(all(feature = "cuda", feature = "opencl"))]
828 |         {
829 |             let cuda_result = CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap();
830 |             let opencl_32_result = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap();
831 |             let opencl_64_result = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap();
832 |             assert_eq!(
833 |                 opencl_32_result, opencl_64_result,
834 |                 "Results for 32-bit and 64-bit limbs on OpenCL must be the same."
835 |             );
836 |             assert_eq!(
837 |                 cuda_result, opencl_32_result,
838 |                 "Results for CUDA and OpenCL must be the same."
839 |             );
840 |             cuda_result
841 |         }
842 |     }
843 | 
844 |     #[test]
845 |     fn test_add() {
846 |         let mut rng = thread_rng();
847 |         for _ in 0..10 {
848 |             let a = Scalar::random(&mut rng);
849 |             let b = Scalar::random(&mut rng);
850 |             let c = a + b;
851 | 
852 |             assert_eq!(
853 |                 call_kernel("test_add", &[GpuScalar(a), GpuScalar(b)], &[]),
854 |                 c
855 |             );
856 |         }
857 |     }
858 | 
859 |     #[test]
860 |     fn test_sub() {
861 |         let mut rng = thread_rng();
862 |         for _ in 0..10 {
863 |             let a = Scalar::random(&mut rng);
864 |             let b = Scalar::random(&mut rng);
865 |             let c = a - b;
866 |             assert_eq!(
867 |                 call_kernel("test_sub", &[GpuScalar(a), GpuScalar(b)], &[]),
868 |                 c
869 |             );
870 |         }
871 |     }
872 | 
873 |     #[test]
874 |     fn test_mul() {
875 |         let mut rng = thread_rng();
876 |         for _ in 0..10 {
877 |             let a = Scalar::random(&mut rng);
878 |             let b = Scalar::random(&mut rng);
879 |             let c = a * b;
880 | 
881 |             assert_eq!(
882 |                 call_kernel("test_mul", &[GpuScalar(a), GpuScalar(b)], &[]),
883 |                 c
884 |             );
885 |         }
886 |     }
887 | 
888 |     #[test]
889 |     fn test_pow() {
890 |         let mut rng = thread_rng();
891 |         for _ in 0..10 {
892 |             let a = Scalar::random(&mut rng);
893 |             let b = rng.gen::<u32>();
894 |             let c = a.pow_vartime([b as u64]);
895 |             assert_eq!(call_kernel("test_pow", &[GpuScalar(a)], &[b]), c);
896 |         }
897 |     }
898 | 
899 |     #[test]
900 |     fn test_sqr() {
901 |         let mut rng = thread_rng();
902 |         for _ in 0..10 {
903 |             let a = Scalar::random(&mut rng);
904 |             let b = a.square();
905 | 
906 |             assert_eq!(call_kernel("test_sqr", &[GpuScalar(a)], &[]), b);
907 |         }
908 |     }
909 | 
910 |     #[test]
911 |     fn test_double() {
912 |         let mut rng = thread_rng();
913 |         for _ in 0..10 {
914 |             let a = Scalar::random(&mut rng);
915 |             let b = a.double();
916 | 
917 |             assert_eq!(call_kernel("test_double", &[GpuScalar(a)], &[]), b);
918 |         }
919 |     }
920 | 
921 |     #[test]
922 |     fn test_unmont() {
923 |         let mut rng = thread_rng();
924 |         for _ in 0..10 {
925 |             let a = Scalar::random(&mut rng);
926 |             let b: Scalar = unsafe { std::mem::transmute(a.to_repr()) };
927 |             assert_eq!(call_kernel("test_unmont", &[GpuScalar(a)], &[]), b);
928 |         }
929 |     }
930 | 
931 |     #[test]
932 |     fn test_mont() {
933 |         let mut rng = thread_rng();
934 |         for _ in 0..10 {
935 |             let a_repr = Scalar::random(&mut rng).to_repr();
936 |             let a: Scalar = unsafe { std::mem::transmute(a_repr) };
937 |             let b = Scalar::from_repr(a_repr).unwrap();
938 |             assert_eq!(call_kernel("test_mont", &[GpuScalar(a)], &[]), b);
939 |         }
940 |     }
941 | }
942 | 


--------------------------------------------------------------------------------
/ec-gpu-gen/src/threadpool.rs:
--------------------------------------------------------------------------------
  1 | //! An interface for dealing with the kinds of parallel computations involved.
  2 | use std::env;
  3 | 
  4 | use crossbeam_channel::{bounded, Receiver, SendError};
  5 | use log::trace;
  6 | use once_cell::sync::Lazy;
  7 | use yastl::Pool;
  8 | 
  9 | /// The number of threads the thread pool should use.
 10 | ///
 11 | /// By default it's equal to the number of CPUs, but it can be changed with the
 12 | /// `EC_GPU_NUM_THREADS` environment variable.
 13 | static NUM_THREADS: Lazy<usize> = Lazy::new(read_num_threads);
 14 | 
 15 | /// The thread pool that is used for the computations.
 16 | ///
 17 | /// By default, it's size is equal to the number of CPUs. It can be set to a different value with
 18 | /// the `EC_GPU_NUM_THREADS` environment variable.
 19 | pub static THREAD_POOL: Lazy<Pool> = Lazy::new(|| Pool::new(*NUM_THREADS));
 20 | 
 21 | /// Returns the number of threads.
 22 | ///
 23 | /// The number can be set with the `EC_GPU_NUM_THREADS` environment variable. If it isn't set, it
 24 | /// defaults to the number of CPUs the system has.
 25 | fn read_num_threads() -> usize {
 26 |     env::var("EC_GPU_NUM_THREADS")
 27 |         .ok()
 28 |         .and_then(|num| num.parse::<usize>().ok())
 29 |         .unwrap_or_else(num_cpus::get)
 30 | }
 31 | 
 32 | /// A worker operates on a pool of threads.
 33 | #[derive(Clone, Default)]
 34 | pub struct Worker {}
 35 | 
 36 | impl Worker {
 37 |     /// Returns a new worker.
 38 |     pub fn new() -> Worker {
 39 |         Worker {}
 40 |     }
 41 | 
 42 |     /// Returns binary logarithm (floored) of the number of threads.
 43 |     ///
 44 |     /// This means, the number of threads is `2^log_num_threads()`.
 45 |     pub fn log_num_threads(&self) -> u32 {
 46 |         log2_floor(*NUM_THREADS)
 47 |     }
 48 | 
 49 |     /// Executes a function in a thread and returns a [`Waiter`] immediately.
 50 |     pub fn compute<F, R>(&self, f: F) -> Waiter<R>
 51 |     where
 52 |         F: FnOnce() -> R + Send + 'static,
 53 |         R: Send + 'static,
 54 |     {
 55 |         let (sender, receiver) = bounded(1);
 56 | 
 57 |         THREAD_POOL.spawn(move || {
 58 |             let res = f();
 59 |             // Best effort. We run it in a separate thread, so the receiver might not exist
 60 |             // anymore, but that's OK. It only means that we are not interested in the result.
 61 |             // A message is logged though, as concurrency issues are hard to debug and this might
 62 |             // help in such cases.
 63 |             if let Err(SendError(_)) = sender.send(res) {
 64 |                 trace!("Cannot send result");
 65 |             }
 66 |         });
 67 | 
 68 |         Waiter { receiver }
 69 |     }
 70 | 
 71 |     /// Executes a function and returns the result once it is finished.
 72 |     ///
 73 |     /// The function gets the [`yastl::Scope`] as well as the `chunk_size` as parameters. THe
 74 |     /// `chunk_size` is number of elements per thread.
 75 |     pub fn scope<'a, F, R>(&self, elements: usize, f: F) -> R
 76 |     where
 77 |         F: FnOnce(&yastl::Scope<'a>, usize) -> R,
 78 |     {
 79 |         let chunk_size = if elements < *NUM_THREADS {
 80 |             1
 81 |         } else {
 82 |             elements / *NUM_THREADS
 83 |         };
 84 | 
 85 |         THREAD_POOL.scoped(|scope| f(scope, chunk_size))
 86 |     }
 87 | 
 88 |     /// Executes the passed in function, and returns the result once it is finished.
 89 |     pub fn scoped<'a, F, R>(&self, f: F) -> R
 90 |     where
 91 |         F: FnOnce(&yastl::Scope<'a>) -> R,
 92 |     {
 93 |         let (sender, receiver) = bounded(1);
 94 |         THREAD_POOL.scoped(|s| {
 95 |             let res = f(s);
 96 |             sender.send(res).unwrap();
 97 |         });
 98 | 
 99 |         receiver.recv().unwrap()
100 |     }
101 | }
102 | 
103 | /// A future that is waiting for a result.
104 | pub struct Waiter<T> {
105 |     receiver: Receiver<T>,
106 | }
107 | 
108 | impl<T> Waiter<T> {
109 |     /// Wait for the result.
110 |     pub fn wait(&self) -> T {
111 |         self.receiver.recv().unwrap()
112 |     }
113 | 
114 |     /// One off sending.
115 |     pub fn done(val: T) -> Self {
116 |         let (sender, receiver) = bounded(1);
117 |         sender.send(val).unwrap();
118 | 
119 |         Waiter { receiver }
120 |     }
121 | }
122 | 
123 | fn log2_floor(num: usize) -> u32 {
124 |     assert!(num > 0);
125 | 
126 |     let mut pow = 0;
127 | 
128 |     while (1 << (pow + 1)) <= num {
129 |         pow += 1;
130 |     }
131 | 
132 |     pow
133 | }
134 | 
135 | #[cfg(test)]
136 | mod tests {
137 |     use super::*;
138 | 
139 |     #[test]
140 |     fn test_log2_floor() {
141 |         assert_eq!(log2_floor(1), 0);
142 |         assert_eq!(log2_floor(3), 1);
143 |         assert_eq!(log2_floor(4), 2);
144 |         assert_eq!(log2_floor(5), 2);
145 |         assert_eq!(log2_floor(6), 2);
146 |         assert_eq!(log2_floor(7), 2);
147 |         assert_eq!(log2_floor(8), 3);
148 |     }
149 | 
150 |     #[test]
151 |     fn test_read_num_threads() {
152 |         let num_cpus = num_cpus::get();
153 |         temp_env::with_var("EC_GPU_NUM_THREADS", None::<&str>, || {
154 |             assert_eq!(
155 |                 read_num_threads(),
156 |                 num_cpus,
157 |                 "By default the number of threads matches the number of CPUs."
158 |             );
159 |         });
160 | 
161 |         temp_env::with_var("EC_GPU_NUM_THREADS", Some("1234"), || {
162 |             assert_eq!(
163 |                 read_num_threads(),
164 |                 1234,
165 |                 "Number of threads matches the environment variable."
166 |             );
167 |         });
168 |     }
169 | }
170 | 


--------------------------------------------------------------------------------
/ec-gpu/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ec-gpu"
 3 | version = "0.2.0"
 4 | authors = ["dignifiedquire <me@dignifiedquire.com>"]
 5 | edition = "2021"
 6 | description = "Traits for field and eliptic curve operations on GPUs"
 7 | homepage = "https://github.com/filecoin-project/ff-cl-gen"
 8 | repository = "https://github.com/filecoin-project/ff-cl-gen"
 9 | license = "MIT/Apache-2.0"
10 | 
11 | [dependencies]
12 | 


--------------------------------------------------------------------------------
/ec-gpu/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 | ../LICENSE-APACHE


--------------------------------------------------------------------------------
/ec-gpu/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | ../LICENSE-MIT


--------------------------------------------------------------------------------
/ec-gpu/src/lib.rs:
--------------------------------------------------------------------------------
 1 | /// The name that is used in the GPU source code to identify the item that is used.
 2 | pub trait GpuName {
 3 |     /// A unique name for the item.
 4 |     ///
 5 |     /// To make the uniqueness easier to implement, use the [`name`] macro. It produces a unique
 6 |     /// name, based on the module path and the type of the item itself. That identifier might not
 7 |     /// be stable across different versions of a crate, but this is OK as kernel sources/binaries
 8 |     /// are always bundled with a library and not re-used between versions.
 9 |     ///
10 |     /// # Example
11 |     ///
12 |     /// ```
13 |     /// struct Fp;
14 |     ///
15 |     /// impl ec_gpu::GpuName for Fp {
16 |     ///     fn name() -> String {
17 |     ///         ec_gpu::name!()
18 |     ///     }
19 |     /// }
20 |     /// ```
21 |     fn name() -> String;
22 | }
23 | 
24 | /// A prime field that returns the values in a representation that is suited for the use on a GPU.
25 | pub trait GpuField: GpuName {
26 |     /// Returns `1` as a vector of 32-bit limbs in little-endian non-Montgomery form (least
27 |     /// significant limb first).
28 |     fn one() -> Vec<u32>;
29 | 
30 |     /// Returns `R ^ 2 mod P` as a vector of 32-bit limbs in little-endian non-Montgomery form
31 |     /// (least significant limb first).
32 |     fn r2() -> Vec<u32>;
33 | 
34 |     /// Returns the field modulus as a vector of 32-bit limbs in non-Montgomery form (least
35 |     /// significant limb first).
36 |     fn modulus() -> Vec<u32>;
37 | 
38 |     /// If the field is an extension field, then the name of the sub-field is returned.
39 |     fn sub_field_name() -> Option<String> {
40 |         None
41 |     }
42 | }
43 | 
44 | /// Macro to get a unique name of an item.
45 | ///
46 | /// The name is a string that consists of the module path and the type name. All non-alphanumeric
47 | /// characters are replaced with underscores, so that it's an identifier that doesn't cause any
48 | /// issues with C compilers.
49 | #[macro_export]
50 | macro_rules! name {
51 |     () => {{
52 |         let mod_path = module_path!();
53 |         let type_name = core::any::type_name::<Self>();
54 |         let name = if type_name.starts_with(mod_path) {
55 |             type_name.into()
56 |         } else {
57 |             [mod_path, "__", type_name].concat()
58 |         };
59 |         name.replace(|c: char| !c.is_ascii_alphanumeric(), "_")
60 |     }};
61 | }
62 | 


--------------------------------------------------------------------------------
/gpu-tests/Cargo.toml:
--------------------------------------------------------------------------------
 1 | # NOTE vmx 2022-07-07: Using the `__private_bench` feature of `blstrs` is just temporarily until
 2 | # https://github.com/zkcrypto/group/pull/29 is fixed. Then we won't need the exports of `Fp` and
 3 | # `Fp2` any more.
 4 | [package]
 5 | name = "gpu-tests"
 6 | version = "0.1.0"
 7 | edition = "2021"
 8 | description = "Test for the ec-gpu project"
 9 | homepage = "https://github.com/filecoin-project/ec-gpu"
10 | repository = "https://github.com/filecoin-project/ec-gpu"
11 | license = "MIT/Apache-2.0"
12 | publish = false
13 | 
14 | [dev-dependencies]
15 | blstrs = { version = "0.7.0", features = ["__private_bench"] }
16 | criterion = "0.4"
17 | ec-gpu = "0.2"
18 | ec-gpu-gen = { path = "../ec-gpu-gen", default-features = false }
19 | ff = { version = "0.13.0", default-features = false }
20 | fil_logger = "0.1.6"
21 | group = "0.13.0"
22 | pairing = "0.23.0"
23 | rand = "0.8"
24 | rayon = "1.5.3"
25 | 
26 | [build-dependencies]
27 | blstrs = { version = "0.7.0", features = ["__private_bench"] }
28 | ec-gpu-gen = { path = "../ec-gpu-gen" }
29 | 
30 | [features]
31 | default = ["cuda", "opencl"]
32 | cuda = ["blstrs/gpu", "ec-gpu-gen/cuda"]
33 | opencl = ["blstrs/gpu", "ec-gpu-gen/opencl"]
34 | 
35 | [[bench]]
36 | name = "multiexp"
37 | harness = false
38 | 


--------------------------------------------------------------------------------
/gpu-tests/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 | ../LICENSE-APACHE


--------------------------------------------------------------------------------
/gpu-tests/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | ../LICENSE-MIT


--------------------------------------------------------------------------------
/gpu-tests/README.md:
--------------------------------------------------------------------------------
 1 | # `gpu-tests`
 2 | 
 3 | This crate is for running tests. Usually kernels are created during compile time, hence a `build.rs` is needed. `ec-gpu-gen` is just a toolkit and doesn't provide pre-defined kernels. This crate separates those concerns and also shows how `ec-gpu-gen` can be used.
 4 | 
 5 | ## Usage
 6 | 
 7 | ```console
 8 | cargo test
 9 | ```
10 | 
11 | ## Feature flags
12 | 
13 | By default `cuda` and `opencl` is enabled. If you want to run the tests/benchmarks with either of those, you can do so:
14 | 
15 | ```console
16 | cargo test --no-default-features --features opencl
17 | ```
18 | 
19 | ## License
20 | 
21 | Licensed under either of
22 | 
23 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
24 |    http://www.apache.org/licenses/LICENSE-2.0)
25 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
26 | 
27 | at your option.
28 | 
29 | ### Contribution
30 | 
31 | Unless you explicitly state otherwise, any contribution intentionally
32 | submitted for inclusion in the work by you, as defined in the Apache-2.0
33 | license, shall be dual licensed as above, without any additional terms or
34 | conditions.
35 | 


--------------------------------------------------------------------------------
/gpu-tests/benches/multiexp.rs:
--------------------------------------------------------------------------------
 1 | use std::sync::Arc;
 2 | 
 3 | use blstrs::Bls12;
 4 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
 5 | use ec_gpu_gen::{
 6 |     multiexp::MultiexpKernel, multiexp_cpu::SourceBuilder, rust_gpu_tools::Device,
 7 |     threadpool::Worker,
 8 | };
 9 | use ff::{Field, PrimeField};
10 | use group::{Curve, Group};
11 | use pairing::Engine;
12 | use rayon::iter::{IntoParallelIterator, ParallelIterator};
13 | 
14 | /// The power that will be used to define the maximum number of elements. The number of elements
15 | /// is `2^MAX_ELEMENTS_POWER`.
16 | const MAX_ELEMENTS_POWER: usize = 29;
17 | /// The maximum number of elements for this benchmark.
18 | const MAX_ELEMENTS: usize = 1 << MAX_ELEMENTS_POWER;
19 | 
20 | fn bench_multiexp(crit: &mut Criterion) {
21 |     let mut group = crit.benchmark_group("multiexp");
22 |     // The difference between runs is so little, hence a low sample size is OK.
23 |     group.sample_size(10);
24 | 
25 |     let devices = Device::all();
26 |     let programs = devices
27 |         .iter()
28 |         .map(|device| ec_gpu_gen::program!(device))
29 |         .collect::<Result<_, _>>()
30 |         .expect("Cannot create programs!");
31 |     let mut kern = MultiexpKernel::<<Bls12 as Engine>::G1Affine>::create(programs, &devices)
32 |         .expect("Cannot initialize kernel!");
33 |     let pool = Worker::new();
34 |     let max_bases: Vec<_> = (0..MAX_ELEMENTS)
35 |         .into_par_iter()
36 |         .map(|_| <Bls12 as Engine>::G1::random(rand::thread_rng()).to_affine())
37 |         .collect();
38 |     let max_exponents: Vec<_> = (0..MAX_ELEMENTS)
39 |         .into_par_iter()
40 |         .map(|_| <Bls12 as Engine>::Fr::random(rand::thread_rng()).to_repr())
41 |         .collect();
42 | 
43 |     let num_elements: Vec<_> = (10..MAX_ELEMENTS_POWER).map(|shift| 1 << shift).collect();
44 |     for num in num_elements {
45 |         group.bench_with_input(BenchmarkId::from_parameter(num), &num, |bencher, &num| {
46 |             let (bases, skip) = SourceBuilder::get((Arc::new(max_bases[0..num].to_vec()), 0));
47 |             let exponents = Arc::new(max_exponents[0..num].to_vec());
48 | 
49 |             bencher.iter(|| {
50 |                 black_box(
51 |                     kern.multiexp(&pool, bases.clone(), exponents.clone(), skip)
52 |                         .unwrap(),
53 |                 );
54 |             })
55 |         });
56 |     }
57 |     group.finish();
58 | }
59 | 
60 | criterion_group!(benches, bench_multiexp);
61 | criterion_main!(benches);
62 | 


--------------------------------------------------------------------------------
/gpu-tests/build.rs:
--------------------------------------------------------------------------------
 1 | #[cfg(not(any(feature = "cuda", feature = "opencl")))]
 2 | fn main() {}
 3 | 
 4 | #[cfg(any(feature = "cuda", feature = "opencl"))]
 5 | fn main() {
 6 |     use blstrs::{Fp, Fp2, G1Affine, G2Affine, Scalar};
 7 |     use ec_gpu_gen::SourceBuilder;
 8 | 
 9 |     let source_builder = SourceBuilder::new()
10 |         .add_fft::<Scalar>()
11 |         .add_multiexp::<G1Affine, Fp>()
12 |         .add_multiexp::<G2Affine, Fp2>();
13 |     ec_gpu_gen::generate(&source_builder);
14 | }
15 | 


--------------------------------------------------------------------------------
/gpu-tests/src/lib.rs:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/gpu-tests/tests/fft.rs:
--------------------------------------------------------------------------------
  1 | #![cfg(any(feature = "cuda", feature = "opencl"))]
  2 | 
  3 | use std::time::Instant;
  4 | 
  5 | use blstrs::Scalar as Fr;
  6 | use ec_gpu_gen::{
  7 |     fft::FftKernel,
  8 |     fft_cpu::{parallel_fft, serial_fft},
  9 |     rust_gpu_tools::Device,
 10 |     threadpool::Worker,
 11 | };
 12 | use ff::{Field, PrimeField};
 13 | 
 14 | fn omega<F: PrimeField>(num_coeffs: usize) -> F {
 15 |     // Compute omega, the 2^exp primitive root of unity
 16 |     let exp = (num_coeffs as f32).log2().floor() as u32;
 17 |     let mut omega = F::ROOT_OF_UNITY;
 18 |     for _ in exp..F::S {
 19 |         omega = omega.square();
 20 |     }
 21 |     omega
 22 | }
 23 | 
 24 | #[test]
 25 | pub fn gpu_fft_consistency() {
 26 |     fil_logger::maybe_init();
 27 |     let mut rng = rand::thread_rng();
 28 | 
 29 |     let worker = Worker::new();
 30 |     let log_threads = worker.log_num_threads();
 31 |     let devices = Device::all();
 32 |     let programs = devices
 33 |         .iter()
 34 |         .map(|device| ec_gpu_gen::program!(device))
 35 |         .collect::<Result<_, _>>()
 36 |         .expect("Cannot create programs!");
 37 |     let mut kern = FftKernel::<Fr>::create(programs).expect("Cannot initialize kernel!");
 38 | 
 39 |     for log_d in 1..=20 {
 40 |         let d = 1 << log_d;
 41 | 
 42 |         let mut v1_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
 43 |         let v1_omega = omega::<Fr>(v1_coeffs.len());
 44 |         let mut v2_coeffs = v1_coeffs.clone();
 45 |         let v2_omega = v1_omega;
 46 | 
 47 |         println!("Testing FFT for {} elements...", d);
 48 | 
 49 |         let mut now = Instant::now();
 50 |         kern.radix_fft_many(&mut [&mut v1_coeffs], &[v1_omega], &[log_d])
 51 |             .expect("GPU FFT failed!");
 52 |         let gpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
 53 |         println!("GPU took {}ms.", gpu_dur);
 54 | 
 55 |         now = Instant::now();
 56 |         if log_d <= log_threads {
 57 |             serial_fft::<Fr>(&mut v2_coeffs, &v2_omega, log_d);
 58 |         } else {
 59 |             parallel_fft::<Fr>(&mut v2_coeffs, &worker, &v2_omega, log_d, log_threads);
 60 |         }
 61 |         let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
 62 |         println!("CPU ({} cores) took {}ms.", 1 << log_threads, cpu_dur);
 63 | 
 64 |         println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32);
 65 | 
 66 |         assert!(v1_coeffs == v2_coeffs);
 67 |         println!("============================");
 68 |     }
 69 | }
 70 | 
 71 | #[test]
 72 | pub fn gpu_fft_many_consistency() {
 73 |     fil_logger::maybe_init();
 74 |     let mut rng = rand::thread_rng();
 75 | 
 76 |     let worker = Worker::new();
 77 |     let log_threads = worker.log_num_threads();
 78 |     let devices = Device::all();
 79 |     let programs = devices
 80 |         .iter()
 81 |         .map(|device| ec_gpu_gen::program!(device))
 82 |         .collect::<Result<_, _>>()
 83 |         .expect("Cannot create programs!");
 84 |     let mut kern = FftKernel::<Fr>::create(programs).expect("Cannot initialize kernel!");
 85 | 
 86 |     for log_d in 1..=20 {
 87 |         let d = 1 << log_d;
 88 | 
 89 |         let mut v11_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
 90 |         let mut v12_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
 91 |         let mut v13_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
 92 |         let v11_omega = omega::<Fr>(v11_coeffs.len());
 93 |         let v12_omega = omega::<Fr>(v12_coeffs.len());
 94 |         let v13_omega = omega::<Fr>(v13_coeffs.len());
 95 | 
 96 |         let mut v21_coeffs = v11_coeffs.clone();
 97 |         let mut v22_coeffs = v12_coeffs.clone();
 98 |         let mut v23_coeffs = v13_coeffs.clone();
 99 |         let v21_omega = v11_omega;
100 |         let v22_omega = v12_omega;
101 |         let v23_omega = v13_omega;
102 | 
103 |         println!("Testing FFT3 for {} elements...", d);
104 | 
105 |         let mut now = Instant::now();
106 |         kern.radix_fft_many(
107 |             &mut [&mut v11_coeffs, &mut v12_coeffs, &mut v13_coeffs],
108 |             &[v11_omega, v12_omega, v13_omega],
109 |             &[log_d, log_d, log_d],
110 |         )
111 |         .expect("GPU FFT failed!");
112 |         let gpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
113 |         println!("GPU took {}ms.", gpu_dur);
114 | 
115 |         now = Instant::now();
116 |         if log_d <= log_threads {
117 |             serial_fft::<Fr>(&mut v21_coeffs, &v21_omega, log_d);
118 |             serial_fft::<Fr>(&mut v22_coeffs, &v22_omega, log_d);
119 |             serial_fft::<Fr>(&mut v23_coeffs, &v23_omega, log_d);
120 |         } else {
121 |             parallel_fft::<Fr>(&mut v21_coeffs, &worker, &v21_omega, log_d, log_threads);
122 |             parallel_fft::<Fr>(&mut v22_coeffs, &worker, &v22_omega, log_d, log_threads);
123 |             parallel_fft::<Fr>(&mut v23_coeffs, &worker, &v23_omega, log_d, log_threads);
124 |         }
125 |         let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
126 |         println!("CPU ({} cores) took {}ms.", 1 << log_threads, cpu_dur);
127 | 
128 |         println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32);
129 | 
130 |         assert!(v11_coeffs == v21_coeffs);
131 |         assert!(v12_coeffs == v22_coeffs);
132 |         assert!(v13_coeffs == v23_coeffs);
133 | 
134 |         println!("============================");
135 |     }
136 | }
137 | 


--------------------------------------------------------------------------------
/gpu-tests/tests/multiexp.rs:
--------------------------------------------------------------------------------
 1 | #![cfg(any(feature = "cuda", feature = "opencl"))]
 2 | 
 3 | use std::sync::Arc;
 4 | use std::time::Instant;
 5 | 
 6 | use blstrs::Bls12;
 7 | use ec_gpu::GpuName;
 8 | use ec_gpu_gen::multiexp_cpu::{multiexp_cpu, FullDensity, QueryDensity, SourceBuilder};
 9 | use ec_gpu_gen::{
10 |     multiexp::MultiexpKernel, program, rust_gpu_tools::Device, threadpool::Worker, EcError,
11 | };
12 | use ff::{Field, PrimeField};
13 | use group::Curve;
14 | use group::{prime::PrimeCurveAffine, Group};
15 | use pairing::Engine;
16 | 
17 | fn multiexp_gpu<Q, D, G, S>(
18 |     pool: &Worker,
19 |     bases: S,
20 |     density_map: D,
21 |     exponents: Arc<Vec<<G::Scalar as PrimeField>::Repr>>,
22 |     kern: &mut MultiexpKernel<G>,
23 | ) -> Result<G::Curve, EcError>
24 | where
25 |     for<'a> &'a Q: QueryDensity,
26 |     D: Send + Sync + 'static + Clone + AsRef<Q>,
27 |     G: PrimeCurveAffine + GpuName,
28 |     S: SourceBuilder<G>,
29 | {
30 |     let exps = density_map.as_ref().generate_exps::<G::Scalar>(exponents);
31 |     let (bss, skip) = bases.get();
32 |     kern.multiexp(pool, bss, exps, skip).map_err(Into::into)
33 | }
34 | 
35 | #[test]
36 | fn gpu_multiexp_consistency() {
37 |     fil_logger::maybe_init();
38 |     const MAX_LOG_D: usize = 16;
39 |     const START_LOG_D: usize = 10;
40 |     let devices = Device::all();
41 |     let programs = devices
42 |         .iter()
43 |         .map(|device| crate::program!(device))
44 |         .collect::<Result<_, _>>()
45 |         .expect("Cannot create programs!");
46 |     let mut kern = MultiexpKernel::<<Bls12 as Engine>::G1Affine>::create(programs, &devices)
47 |         .expect("Cannot initialize kernel!");
48 |     let pool = Worker::new();
49 | 
50 |     let mut rng = rand::thread_rng();
51 | 
52 |     let mut bases = (0..(1 << START_LOG_D))
53 |         .map(|_| <Bls12 as Engine>::G1::random(&mut rng).to_affine())
54 |         .collect::<Vec<_>>();
55 | 
56 |     for log_d in START_LOG_D..=MAX_LOG_D {
57 |         let g = Arc::new(bases.clone());
58 | 
59 |         let samples = 1 << log_d;
60 |         println!("Testing Multiexp for {} elements...", samples);
61 | 
62 |         let v = Arc::new(
63 |             (0..samples)
64 |                 .map(|_| <Bls12 as Engine>::Fr::random(&mut rng).to_repr())
65 |                 .collect::<Vec<_>>(),
66 |         );
67 | 
68 |         let mut now = Instant::now();
69 |         let gpu = multiexp_gpu(&pool, (g.clone(), 0), FullDensity, v.clone(), &mut kern).unwrap();
70 |         let gpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
71 |         println!("GPU took {}ms.", gpu_dur);
72 | 
73 |         now = Instant::now();
74 |         let cpu = multiexp_cpu(&pool, (g.clone(), 0), FullDensity, v.clone())
75 |             .wait()
76 |             .unwrap();
77 |         let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
78 |         println!("CPU took {}ms.", cpu_dur);
79 | 
80 |         println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32);
81 | 
82 |         assert_eq!(cpu, gpu);
83 | 
84 |         println!("============================");
85 | 
86 |         bases = [bases.clone(), bases.clone()].concat();
87 |     }
88 | }
89 | 


--------------------------------------------------------------------------------
/release.toml:
--------------------------------------------------------------------------------
1 | consolidate-commits = false
2 | 


--------------------------------------------------------------------------------
/rust-toolchain:
--------------------------------------------------------------------------------
1 | 1.83.0
2 | 


--------------------------------------------------------------------------------