├── .github └── workflows │ └── ci.yml ├── .gitignore ├── COPYRIGHT ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── examples ├── add.cl ├── add.fatbin └── add.rs ├── rust-toolchain └── src ├── cuda ├── mod.rs └── utils.rs ├── device.rs ├── error.rs ├── lib.rs ├── opencl ├── error.rs ├── mod.rs └── utils.rs └── program.rs /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [pull_request, push] 4 | 5 | # Cancel a job if there's a new one started on the same branch. 6 | # Based on https://stackoverflow.com/questions/58895283/stop-already-running-workflow-job-in-github-actions/67223051#67223051 7 | concurrency: 8 | group: ${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | env: 12 | CARGO_INCREMENTAL: 0 13 | RUST_BACKTRACE: 1 14 | # Faster crates.io index checkout. 15 | CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse 16 | RUST_LOG: debug 17 | 18 | jobs: 19 | check_clippy: 20 | runs-on: ubuntu-24.04 21 | name: Clippy 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Run cargo clippy 25 | run: cargo clippy --all-targets --workspace --all-features -- -D warnings 26 | 27 | check_fmt: 28 | runs-on: ubuntu-24.04 29 | name: Checking fmt 30 | steps: 31 | - uses: actions/checkout@v4 32 | - name: Run cargo fmt 33 | run: cargo fmt --all -- --check 34 | 35 | rustdoc: 36 | runs-on: ubuntu-24.04 37 | name: Rustdoc 38 | steps: 39 | - uses: actions/checkout@v4 40 | - name: Run rustdoc 41 | run: cargo rustdoc --all-features -- -D warnings 42 | 43 | build: 44 | runs-on: ubuntu-24.04 45 | name: Release build 46 | steps: 47 | - uses: actions/checkout@v4 48 | - name: Run cargo release build 49 | run: cargo build --release 50 | 51 | # Enable these tests once there's a runner with a GPU. 52 | #test_gpu: 53 | # runs-on: ubuntu-24.04 54 | # name: Test 55 | # steps: 56 | # - uses: actions/checkout@v4 57 | # - name: Install required packages 58 | # run: sudo apt install --no-install-recommends --yes libhwloc-dev nvidia-cuda-toolkit ocl-icd-opencl-dev 59 | # - name: Run tests 60 | # run: cargo test 61 | # - name: Run `add` example 62 | # run: cargo run --example add 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyrights in the "rust-gpu-tools" library are retained by their contributors. No 2 | copyright assignment is required to contribute to the "rust-gpu-tools" library. 3 | 4 | The "rust-gpu-tools" library is licensed under either of 5 | 6 | * Apache License, Version 2.0, (see ./LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0) 7 | * MIT license (see ./LICENSE-MIT or http://opensource.org/licenses/MIT) 8 | 9 | at your option. 10 | 11 | Unless you explicitly state otherwise, any contribution intentionally 12 | submitted for inclusion in the work by you, as defined in the Apache-2.0 13 | license, shall be dual licensed as above, without any additional terms or 14 | conditions.
15 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rust-gpu-tools" 3 | version = "0.7.2" 4 | authors = ["Keyvan Kambakhsh ", "porcuquine "] 5 | description = "Rust OpenCL tools" 6 | edition = "2021" 7 | homepage = "https://github.com/filecoin-project/rust-gpu-tools" 8 | license = "MIT/Apache-2.0" 9 | repository = "https://github.com/filecoin-project/rust-gpu-tools" 10 | rust-version = "1.81.0" 11 | 12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 13 | [features] 14 | default = ["opencl", "cuda"] 15 | opencl = ["opencl3"] 16 | cuda = ["rustacuda"] 17 | 18 | [dependencies] 19 | home = "0.5" 20 | sha2 = "0.10" 21 | thiserror = "2.0.12" 22 | log = "0.4.26" 23 | hex = "0.4.3" 24 | 25 | opencl3 = { version = "0.11.0", default-features = false, features = ["CL_VERSION_1_2"], optional = true } 26 | rustacuda = { package = "fil-rustacuda", version = "0.1.3", optional = true } 27 | once_cell = "1.8.0" 28 | temp-env = "0.3.3" 29 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rust-gpu-tools [![Crates.io](https://img.shields.io/crates/v/rust-gpu-tools.svg)](https://crates.io/crates/rust-gpu-tools) 2 | 3 | An abstraction library to run kernels on both CUDA and OpenCL. 4 | 5 | ## Example 6 | 7 | You need to write the code that interacts with the GPU only once. Below is such code that runs a 8 | kernel on CUDA and/or OpenCL. For a full working example, please see the [`examples`](examples) 9 | directory. You can run it via `cargo run --example add`. 10 | 11 | ```rust 12 | let closures = program_closures!(|program, _args| -> Result<Vec<u32>, GPUError> { 13 | // Make sure the input data has the same length. 14 | assert_eq!(aa.len(), bb.len()); 15 | let length = aa.len(); 16 | 17 | // Copy the data to the GPU. 18 | let aa_buffer = program.create_buffer_from_slice(&aa)?; 19 | let bb_buffer = program.create_buffer_from_slice(&bb)?; 20 | 21 | // The result buffer has the same length as the input buffers. 22 | let result_buffer = unsafe { program.create_buffer::<u32>(length)? }; 23 | 24 | // Get the kernel. 25 | let kernel = program.create_kernel("add", 8, 4)?; 26 | 27 | // Execute the kernel. 28 | kernel 29 | .arg(&(length as u32)) 30 | .arg(&aa_buffer) 31 | .arg(&bb_buffer) 32 | .arg(&result_buffer) 33 | .run()?; 34 | 35 | // Get the resulting data. 36 | let mut result = vec![0u32; length]; 37 | program.read_into_buffer(&result_buffer, &mut result)?; 38 | 39 | Ok(result) 40 | }); 41 | ``` 42 | 43 | 44 | ## License 45 | 46 | Licensed under either of 47 | 48 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or 49 | http://www.apache.org/licenses/LICENSE-2.0) 50 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 51 | 52 | at your option. 53 | 54 | ### Contribution 55 | 56 | Unless you explicitly state otherwise, any contribution intentionally 57 | submitted for inclusion in the work by you, as defined in the Apache-2.0 58 | license, shall be dual licensed as above, without any additional terms or 59 | conditions.
60 | -------------------------------------------------------------------------------- /examples/add.cl: -------------------------------------------------------------------------------- 1 | // CUDA 2 | #ifdef __CUDACC__ 3 | #define GLOBAL 4 | #define KERNEL extern "C" __global__ 5 | // OpenCL 6 | #else 7 | #define GLOBAL __global 8 | #define KERNEL __kernel 9 | #endif 10 | 11 | KERNEL void add(uint num, GLOBAL uint *a, GLOBAL uint *b, GLOBAL uint *result) { 12 | for (uint i = 0; i < num; i++) { 13 | result[i] = a[i] + b[i]; 14 | } 15 | } -------------------------------------------------------------------------------- /examples/add.fatbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/filecoin-project/rust-gpu-tools/ccb3c7ee4b5944ddf6427c488dc82cc20c9626ed/examples/add.fatbin -------------------------------------------------------------------------------- /examples/add.rs: -------------------------------------------------------------------------------- 1 | use rust_gpu_tools::{cuda, opencl, program_closures, Device, GPUError, Program, Vendor}; 2 | 3 | /// Returns a `Program` that runs on CUDA. 4 | fn cuda(device: &Device) -> Program { 5 | // The kernel was compiled with: 6 | // nvcc -fatbin -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_75,code=compute_75 --x cu add.cl 7 | let cuda_kernel = include_bytes!("./add.fatbin"); 8 | let cuda_device = device.cuda_device().unwrap(); 9 | let cuda_program = cuda::Program::from_bytes(cuda_device, cuda_kernel).unwrap(); 10 | Program::Cuda(cuda_program) 11 | } 12 | 13 | /// Returns a `Program` that runs on OpenCL. 14 | fn opencl(device: &Device) -> Program { 15 | let opencl_kernel = include_str!("./add.cl"); 16 | let opencl_device = device.opencl_device().unwrap(); 17 | let opencl_program = opencl::Program::from_opencl(opencl_device, opencl_kernel).unwrap(); 18 | Program::Opencl(opencl_program) 19 | } 20 | 21 | pub fn main() { 22 | // Define some data that should be operated on. 23 | let aa: Vec<u32> = vec![1, 2, 3, 4]; 24 | let bb: Vec<u32> = vec![5, 6, 7, 8]; 25 | 26 | // This is the core. Here we write the interaction with the GPU independent of whether it is 27 | // CUDA or OpenCL. 28 | let closures = program_closures!(|program, _args| -> Result<Vec<u32>, GPUError> { 29 | // Make sure the input data has the same length. 30 | assert_eq!(aa.len(), bb.len()); 31 | let length = aa.len(); 32 | 33 | // Copy the data to the GPU. 34 | let aa_buffer = program.create_buffer_from_slice(&aa)?; 35 | let bb_buffer = program.create_buffer_from_slice(&bb)?; 36 | 37 | // The result buffer has the same length as the input buffers. 38 | let result_buffer = unsafe { program.create_buffer::<u32>(length)? }; 39 | 40 | // Get the kernel. 41 | let kernel = program.create_kernel("add", 1, 1)?; 42 | 43 | // Execute the kernel. 44 | kernel 45 | .arg(&(length as u32)) 46 | .arg(&aa_buffer) 47 | .arg(&bb_buffer) 48 | .arg(&result_buffer) 49 | .run()?; 50 | 51 | // Get the resulting data.
52 | let mut result = vec![0u32; length]; 53 | program.read_into_buffer(&result_buffer, &mut result)?; 54 | 55 | Ok(result) 56 | }); 57 | 58 | // First we run it on CUDA if available 59 | let nv_dev_list = Device::by_vendor(Vendor::Nvidia); 60 | if !nv_dev_list.is_empty() { 61 | // Test NVIDIA CUDA Flow 62 | let cuda_program = cuda(nv_dev_list[0]); 63 | let cuda_result = cuda_program.run(closures, ()).unwrap(); 64 | assert_eq!(cuda_result, [6, 8, 10, 12]); 65 | println!("CUDA result: {:?}", cuda_result); 66 | 67 | // Test NVIDIA OpenCL Flow 68 | let opencl_program = opencl(nv_dev_list[0]); 69 | let opencl_result = opencl_program.run(closures, ()).unwrap(); 70 | assert_eq!(opencl_result, [6, 8, 10, 12]); 71 | println!("OpenCL Nvidia result: {:?}", opencl_result); 72 | } 73 | 74 | // Then we run it on Intel OpenCL if available 75 | let intel_dev_list = Device::by_vendor(Vendor::Intel); 76 | if !intel_dev_list.is_empty() { 77 | let opencl_program = opencl(intel_dev_list[0]); 78 | let opencl_result = opencl_program.run(closures, ()).unwrap(); 79 | assert_eq!(opencl_result, [6, 8, 10, 12]); 80 | println!("OpenCL Intel result: {:?}", opencl_result); 81 | } 82 | 83 | let amd_dev_list = Device::by_vendor(Vendor::Amd); 84 | if !amd_dev_list.is_empty() { 85 | let opencl_program = opencl(amd_dev_list[0]); 86 | let opencl_result = opencl_program.run(closures, ()).unwrap(); 87 | assert_eq!(opencl_result, [6, 8, 10, 12]); 88 | println!("OpenCL Amd result: {:?}", opencl_result); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | 1.81.0 2 | -------------------------------------------------------------------------------- /src/cuda/mod.rs: -------------------------------------------------------------------------------- 1 | //! The CUDA specific implementation of a [`Buffer`], [`Device`], [`Program`] and [`Kernel`]. 2 | //! 3 | //! The current operation mode is synchronous, in order to have higher safety guarantees. All 4 | //! operations happen on a single stream, which is synchronized after each operation. This is a 5 | //! similar behaviour to CUDA's default stream. The default stream isn't used for two reasons: 6 | //! 7 | //! 1. RustaCUDA doesn't expose a higher level function to launch a kernel on the default stream 8 | //! 2. There was a bug when the default stream was used implicitly via RustaCUDA's synchronous 9 | //! copy methods. To prevent such kinds of bugs, be explicit about which stream is used. 10 | 11 | pub(crate) mod utils; 12 | 13 | use std::convert::TryFrom; 14 | use std::ffi::{c_void, CStr, CString}; 15 | use std::fmt; 16 | use std::hash::{Hash, Hasher}; 17 | use std::mem; 18 | 19 | use log::debug; 20 | use rustacuda::memory::{AsyncCopyDestination, DeviceBuffer}; 21 | use rustacuda::stream::{Stream, StreamFlags}; 22 | 23 | use crate::device::{DeviceUuid, PciId, Vendor}; 24 | use crate::error::{GPUError, GPUResult}; 25 | use crate::LocalBuffer; 26 | 27 | /// A Buffer to be used for sending and receiving data to/from the GPU. 28 | #[derive(Debug)] 29 | pub struct Buffer<T> { 30 | buffer: DeviceBuffer<u8>, 31 | /// The number of T-sized elements. 32 | length: usize, 33 | _phantom: std::marker::PhantomData<T>, 34 | } 35 | 36 | /// CUDA specific device. 37 | #[derive(Debug, Clone)] 38 | pub struct Device { 39 | vendor: Vendor, 40 | name: String, 41 | /// The total memory of the GPU in bytes. 42 | memory: u64, 43 | /// Number of streaming multiprocessors.
44 | compute_units: u32, 45 | /// The compute capability of the device, major and minor version. 46 | compute_capability: (u32, u32), 47 | pci_id: PciId, 48 | uuid: Option<DeviceUuid>, 49 | context: rustacuda::context::UnownedContext, 50 | } 51 | 52 | impl Hash for Device { 53 | fn hash<H: Hasher>(&self, state: &mut H) { 54 | self.vendor.hash(state); 55 | self.name.hash(state); 56 | self.memory.hash(state); 57 | self.pci_id.hash(state); 58 | self.uuid.hash(state); 59 | } 60 | } 61 | 62 | impl PartialEq for Device { 63 | fn eq(&self, other: &Self) -> bool { 64 | self.vendor == other.vendor 65 | && self.name == other.name 66 | && self.memory == other.memory 67 | && self.pci_id == other.pci_id 68 | && self.uuid == other.uuid 69 | } 70 | } 71 | 72 | impl Eq for Device {} 73 | 74 | impl Device { 75 | /// Returns the [`Vendor`] of the GPU. 76 | pub fn vendor(&self) -> Vendor { 77 | self.vendor 78 | } 79 | 80 | /// Returns the name of the GPU, e.g. "GeForce RTX 3090". 81 | pub fn name(&self) -> String { 82 | self.name.clone() 83 | } 84 | 85 | /// Returns the memory of the GPU in bytes. 86 | pub fn memory(&self) -> u64 { 87 | self.memory 88 | } 89 | 90 | /// Returns the number of compute units of the GPU. 91 | pub fn compute_units(&self) -> u32 { 92 | self.compute_units 93 | } 94 | 95 | /// Returns the major and minor version of the compute capability of the GPU. 96 | pub fn compute_capability(&self) -> (u32, u32) { 97 | self.compute_capability 98 | } 99 | 100 | /// Returns the PCI-ID of the GPU, see the [`PciId`] type for more information. 101 | pub fn pci_id(&self) -> PciId { 102 | self.pci_id 103 | } 104 | 105 | /// Returns the UUID of the GPU if available, see the [`DeviceUuid`] type for more 106 | /// information. 107 | pub fn uuid(&self) -> Option<DeviceUuid> { 108 | self.uuid 109 | } 110 | } 111 | 112 | /// Abstraction that contains everything to run a CUDA kernel on a GPU. 113 | /// 114 | /// The majority of methods are the same as [`crate::opencl::Program`], so you can write code using this 115 | /// API, which will then work with OpenCL as well as CUDA kernels. 116 | // When compiled without the `opencl` feature, then the intra-doc link above will be broken. 117 | #[allow(rustdoc::broken_intra_doc_links)] 118 | #[derive(Debug)] 119 | pub struct Program { 120 | context: rustacuda::context::UnownedContext, 121 | module: rustacuda::module::Module, 122 | stream: Stream, 123 | device_name: String, 124 | } 125 | 126 | impl Program { 127 | /// Returns the name of the GPU, e.g. "GeForce RTX 3090". 128 | pub fn device_name(&self) -> &str { 129 | &self.device_name 130 | } 131 | 132 | /// Creates a program for a specific device from a compiled CUDA binary file. 133 | pub fn from_binary(device: &Device, filename: &CStr) -> GPUResult<Program> { 134 | debug!("Creating CUDA program from binary file."); 135 | rustacuda::context::CurrentContext::set_current(&device.context)?; 136 | let module = rustacuda::module::Module::load_from_file(filename).inspect_err(|_err| { 137 | Self::pop_context(); 138 | })?; 139 | let stream = Stream::new(StreamFlags::NON_BLOCKING, None).inspect_err(|_err| { 140 | Self::pop_context(); 141 | })?; 142 | let prog = Program { 143 | module, 144 | stream, 145 | device_name: device.name(), 146 | context: device.context.clone(), 147 | }; 148 | Self::pop_context(); 149 | Ok(prog) 150 | } 151 | 152 | /// Creates a program for a specific device from a compiled CUDA binary.
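/// /// A minimal usage sketch (mirroring `examples/add.rs`; the device variable and fatbin path are illustrative): /// /// ```text /// let bytes = include_bytes!("./add.fatbin"); /// let program = cuda::Program::from_bytes(cuda_device, bytes)?; /// ```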
153 | pub fn from_bytes(device: &Device, bytes: &[u8]) -> GPUResult<Program> { 154 | debug!("Creating CUDA program from bytes."); 155 | rustacuda::context::CurrentContext::set_current(&device.context)?; 156 | let module = rustacuda::module::Module::load_from_bytes(bytes).inspect_err(|_err| { 157 | Self::pop_context(); 158 | })?; 159 | let stream = Stream::new(StreamFlags::NON_BLOCKING, None).inspect_err(|_err| { 160 | Self::pop_context(); 161 | })?; 162 | let prog = Program { 163 | module, 164 | stream, 165 | device_name: device.name(), 166 | context: device.context.clone(), 167 | }; 168 | Self::pop_context(); 169 | Ok(prog) 170 | } 171 | 172 | /// Creates a new buffer that can be used for input/output with the GPU. 173 | /// 174 | /// The `length` is the number of elements to create. 175 | /// 176 | /// It is usually used to create buffers that are initialized by the GPU. If you want to 177 | /// directly transfer data from the host to the GPU, you would use the safe 178 | /// [`Program::create_buffer_from_slice`] instead. 179 | /// 180 | /// ### Safety 181 | /// 182 | /// The buffer needs to be initialized (by the host with [`Program::write_from_buffer`] or by 183 | /// the GPU) before it can be read via [`Program::read_into_buffer`]. 184 | pub unsafe fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> { 185 | assert!(length > 0); 186 | // This is the unsafe call, the rest of the function is safe code. 187 | let buffer = DeviceBuffer::<u8>::uninitialized(length * std::mem::size_of::<T>())?; 188 | 189 | Ok(Buffer::<T> { 190 | buffer, 191 | length, 192 | _phantom: std::marker::PhantomData, 193 | }) 194 | } 195 | 196 | /// Creates a new buffer on the GPU and initializes with the given slice. 197 | pub fn create_buffer_from_slice<T>(&self, slice: &[T]) -> GPUResult<Buffer<T>> { 198 | // The number of bytes is used for the allocations. 199 | let bytes_len = mem::size_of_val(slice); 200 | 201 | // Transmuting types is safe as long as the sizes match. 202 | let bytes = unsafe { std::slice::from_raw_parts(slice.as_ptr() as *const u8, bytes_len) }; 203 | 204 | // It is only unsafe as long as the buffer isn't initialized, but that's what we do next. 205 | let mut buffer = unsafe { DeviceBuffer::<u8>::uninitialized(bytes_len)? }; 206 | // It is safe as we synchronize the stream after the call. 207 | unsafe { buffer.async_copy_from(bytes, &self.stream)? }; 208 | self.stream.synchronize()?; 209 | 210 | Ok(Buffer::<T> { 211 | buffer, 212 | length: slice.len(), 213 | _phantom: std::marker::PhantomData, 214 | }) 215 | } 216 | 217 | /// Returns a kernel. 218 | /// 219 | /// The `global_work_size` does *not* follow the OpenCL definition. It is *not* the total 220 | /// number of threads. Instead it follows CUDA's definition and is the number of 221 | /// `local_work_size` sized thread groups. So the total number of threads is 222 | /// `global_work_size * local_work_size`. 223 | pub fn create_kernel(&self, name: &str, gws: usize, lws: usize) -> GPUResult<Kernel> { 224 | let function_name = CString::new(name).expect("Kernel name must not contain nul bytes"); 225 | let function = self.module.get_function(&function_name)?; 226 | 227 | Ok(Kernel { 228 | function, 229 | global_work_size: gws, 230 | local_work_size: lws, 231 | stream: &self.stream, 232 | args: Vec::new(), 233 | }) 234 | } 235 | 236 | /// Puts data from an existing buffer onto the GPU.
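/// /// A small sketch of a full host-to-GPU-to-host round trip (buffer size and data are illustrative): /// /// ```text /// let mut buffer = unsafe { program.create_buffer::<u32>(4)? }; /// program.write_from_buffer(&mut buffer, &[1u32, 2, 3, 4])?; /// let mut out = vec![0u32; 4]; /// program.read_into_buffer(&buffer, &mut out)?; /// assert_eq!(out, [1, 2, 3, 4]); /// ```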
237 | pub fn write_from_buffer<T>(&self, buffer: &mut Buffer<T>, data: &[T]) -> GPUResult<()> { 238 | assert!(data.len() <= buffer.length, "Buffer is too small"); 239 | 240 | // Transmuting types is safe as long as the sizes match. 241 | let bytes = unsafe { 242 | std::slice::from_raw_parts(data.as_ptr() as *const u8, mem::size_of_val(data)) 243 | }; 244 | 245 | // It is safe as we synchronize the stream after the call. 246 | unsafe { buffer.buffer.async_copy_from(bytes, &self.stream)? }; 247 | self.stream.synchronize()?; 248 | 249 | Ok(()) 250 | } 251 | 252 | /// Reads data from the GPU into an existing buffer. 253 | pub fn read_into_buffer<T>(&self, buffer: &Buffer<T>, data: &mut [T]) -> GPUResult<()> { 254 | assert!(data.len() <= buffer.length, "Buffer is too small"); 255 | 256 | // Transmuting types is safe as long as the sizes match. 257 | let bytes = unsafe { 258 | std::slice::from_raw_parts_mut(data.as_mut_ptr() as *mut u8, mem::size_of_val(data)) 259 | }; 260 | 261 | // It is safe as we synchronize the stream after the call. 262 | unsafe { buffer.buffer.async_copy_to(bytes, &self.stream)? }; 263 | self.stream.synchronize()?; 264 | 265 | Ok(()) 266 | } 267 | 268 | /// Run some code in the context of the program. 269 | /// 270 | /// It sets the correct contexts. 271 | /// 272 | /// It takes the program as a parameter, so that we can use the same function body, for both 273 | /// the OpenCL and the CUDA code path. The only difference is the type of the program. 274 | pub fn run<F, A, R, E>(&self, fun: F, arg: A) -> Result<R, E> 275 | where 276 | F: FnOnce(&Self, A) -> Result<R, E>, 277 | E: From<GPUError>, 278 | { 279 | rustacuda::context::CurrentContext::set_current(&self.context).map_err(Into::into)?; 280 | let result = fun(self, arg); 281 | Self::pop_context(); 282 | result 283 | } 284 | 285 | /// Pop the current context. 286 | /// 287 | /// It panics as it's an unrecoverable error. 288 | fn pop_context() { 289 | rustacuda::context::ContextStack::pop().expect("Cannot remove context."); 290 | } 291 | } 292 | 293 | // TODO vmx 2021-07-07: Check if RustaCUDA types used in `Program` can be made `Send`, so that 294 | // this manual `Send` implementation is no longer needed. 295 | unsafe impl Send for Program {} 296 | 297 | /// Abstraction for kernel arguments. 298 | /// 299 | /// Kernel arguments implement this trait, so that they can be converted into the correct 300 | /// pointers needed by the actual kernel call. 301 | pub trait KernelArgument { 302 | /// Converts into a C void pointer. 303 | fn as_c_void(&self) -> *mut c_void; 304 | 305 | /// Returns the shared memory size. This is usually 0, except for [`LocalBuffer`]s. This 306 | /// information is used to allocate the memory correctly. 307 | fn shared_mem(&self) -> u32 { 308 | 0 309 | } 310 | } 311 | 312 | impl<T> KernelArgument for Buffer<T> { 313 | fn as_c_void(&self) -> *mut c_void { 314 | &self.buffer as *const _ as _ 315 | } 316 | } 317 | 318 | impl KernelArgument for i32 { 319 | fn as_c_void(&self) -> *mut c_void { 320 | self as *const _ as _ 321 | } 322 | } 323 | 324 | impl KernelArgument for u32 { 325 | fn as_c_void(&self) -> *mut c_void { 326 | self as *const _ as _ 327 | } 328 | } 329 | 330 | impl<T> KernelArgument for LocalBuffer<T> { 331 | // This is a hack: on CUDA kernels, you cannot have `__shared__` (`__local` in OpenCL lingo) 332 | // kernel parameters. Hence, just pass on an arbitrary valid pointer. It won't be used, so it 333 | // doesn't matter where it actually points to. A null pointer cannot be used as CUDA would 334 | // return an "invalid argument" error.
335 | fn as_c_void(&self) -> *mut c_void { 336 | self as *const _ as _ 337 | } 338 | 339 | fn shared_mem(&self) -> u32 { 340 | u32::try_from(self.length * std::mem::size_of::<T>()) 341 | .expect("__shared__ memory allocation is too big.") 342 | } 343 | } 344 | 345 | /// A kernel that can be executed. 346 | pub struct Kernel<'a> { 347 | function: rustacuda::function::Function<'a>, 348 | global_work_size: usize, 349 | local_work_size: usize, 350 | stream: &'a Stream, 351 | args: Vec<&'a dyn KernelArgument>, 352 | } 353 | 354 | impl fmt::Debug for Kernel<'_> { 355 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 356 | let args = self 357 | .args 358 | .iter() 359 | .map(|arg| (arg.as_c_void(), arg.shared_mem())) 360 | .collect::<Vec<_>>(); 361 | f.debug_struct("Kernel") 362 | .field("function", &self.function) 363 | .field("global_work_size", &self.global_work_size) 364 | .field("local_work_size", &self.local_work_size) 365 | .field("stream", &self.stream) 366 | .field("args", &args) 367 | .finish() 368 | } 369 | } 370 | 371 | impl<'a> Kernel<'a> { 372 | /// Set a kernel argument. 373 | /// 374 | /// The arguments must live as long as the kernel. Hence make sure they are not dropped as 375 | /// long as the kernel is in use. 376 | /// 377 | /// Example where this behaviour is enforced and leads to a compile-time error: 378 | /// 379 | /// ```compile_fail 380 | /// use rust_gpu_tools::cuda::Program; 381 | /// 382 | /// fn would_break(program: &Program) { 383 | /// let data = vec![1, 2, 3, 4]; 384 | /// let buffer = program.create_buffer_from_slice(&data).unwrap(); 385 | /// let kernel = program.create_kernel("my_kernel", 4, 256).unwrap(); 386 | /// let kernel = kernel.arg(&buffer); 387 | /// // This drop wouldn't error if the arguments weren't bound to the kernel's lifetime. 388 | /// drop(buffer); 389 | /// kernel.run().unwrap(); 390 | /// } 391 | /// ``` 392 | pub fn arg<T: KernelArgument>(mut self, t: &'a T) -> Self { 393 | self.args.push(t); 394 | self 395 | } 396 | 397 | /// Actually run the kernel. 398 | /// 399 | /// ### Panics 400 | /// 401 | /// Panics if the wrong number of arguments was provided. 402 | pub fn run(self) -> GPUResult<()> { 403 | // There can only be a single [`LocalBuffer`], due to CUDA restrictions. 404 | let shared_mem = self 405 | .args 406 | .iter() 407 | .try_fold(0, |acc, &arg| -> GPUResult<u32> { 408 | let mem = arg.shared_mem(); 409 | match (mem, acc) { 410 | // No new shared memory needs to be allocated. 411 | (0, _) => Ok(acc), 412 | // Some shared memory needs to be allocated. 413 | (_, 0) => Ok(mem), 414 | // Shared memory would be allocated more than once, which is not supported. 415 | (_, _) => Err(GPUError::Generic( 416 | "There cannot be more than one `LocalBuffer`.".to_string(), 417 | )), 418 | } 419 | })?; 420 | let args = self 421 | .args 422 | .iter() 423 | .map(|arg| arg.as_c_void()) 424 | .collect::<Vec<_>>(); 425 | // It is safe to launch the kernel as the arguments need to live when the kernel is called, 426 | // and the buffers are copied synchronously. At the end of the execution, the underlying 427 | // stream is synchronized. 428 | unsafe { 429 | self.stream.launch( 430 | &self.function, 431 | self.global_work_size as u32, 432 | self.local_work_size as u32, 433 | shared_mem, 434 | &args, 435 | )?; 436 | }; 437 | // Synchronize after the kernel execution, so that the underlying pointers can be 438 | // invalidated/dropped.
439 | self.stream.synchronize()?; 440 | Ok(()) 441 | } 442 | } 443 | -------------------------------------------------------------------------------- /src/cuda/utils.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | 3 | use log::{debug, warn}; 4 | 5 | use crate::cuda::Device; 6 | use crate::device::{PciId, Vendor}; 7 | use crate::error::{GPUError, GPUResult}; 8 | 9 | // NOTE vmx 2021-04-14: This is a hack to make sure contexts stay around. We wrap them, so that 10 | // `Sync` and `Send` can be implemented. `Sync` and `Send` is needed for once_cell. These contexts 11 | // are never used directly, they are only accessed through [`cuda::Device`] which contains an 12 | // `UnownedContext`. A device cannot have its own context itself, as then it couldn't be cloned, 13 | // but that is needed for creating the kernels. 14 | pub(crate) struct CudaContexts(#[allow(unused)] Vec<rustacuda::context::Context>); 15 | unsafe impl Sync for CudaContexts {} 16 | unsafe impl Send for CudaContexts {} 17 | 18 | /// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID. 19 | /// 20 | /// It is the first two identifiers of e.g. `lspci`: 21 | /// 22 | /// ```text 23 | /// 4e:00.0 VGA compatible controller 24 | /// || └└-- Device ID 25 | /// └└-- Bus ID 26 | /// ``` 27 | fn get_pci_id(device: &rustacuda::device::Device) -> Result<PciId, GPUError> { 28 | let bus_id = device.get_attribute(rustacuda::device::DeviceAttribute::PciBusId)? as u16; 29 | let device_id = device.get_attribute(rustacuda::device::DeviceAttribute::PciDeviceId)? as u16; 30 | let pci_id = (bus_id << 8) | device_id; 31 | Ok(pci_id.into()) 32 | } 33 | 34 | fn get_memory(d: &rustacuda::device::Device) -> GPUResult<u64> { 35 | let memory = d.total_memory()?; 36 | Ok(u64::try_from(memory).expect("Platform must be <= 64-bit")) 37 | } 38 | 39 | fn get_compute_units(d: &rustacuda::device::Device) -> GPUResult<u32> { 40 | let compute_units = d.get_attribute(rustacuda::device::DeviceAttribute::MultiprocessorCount)?; 41 | Ok(u32::try_from(compute_units).expect("The number of units is always positive")) 42 | } 43 | 44 | /// Get the major and minor version of the compute capability. 45 | fn get_compute_capability(d: &rustacuda::device::Device) -> GPUResult<(u32, u32)> { 46 | let major = d.get_attribute(rustacuda::device::DeviceAttribute::ComputeCapabilityMajor)?; 47 | let minor = d.get_attribute(rustacuda::device::DeviceAttribute::ComputeCapabilityMinor)?; 48 | Ok(( 49 | u32::try_from(major).expect("The compute capability major version is always positive"), 50 | u32::try_from(minor).expect("The compute capability minor version is always positive"), 51 | )) 52 | } 53 | 54 | /// Get a list of all available and supported devices. 55 | /// 56 | /// If there is a failure initializing CUDA or retrieving a device, it won't lead to a hard error, 57 | /// but an error will be logged and the corresponding device won't be available. 58 | pub(crate) fn build_device_list() -> (Vec<Device>, CudaContexts) { 59 | let mut all_devices = Vec::new(); 60 | let mut devices_without_pci_id = Vec::new(); 61 | let mut contexts = Vec::new(); 62 | 63 | rustacuda::init(rustacuda::CudaFlags::empty()) 64 | .map_err(Into::into) 65 | .and_then(|_| { 66 | for device in rustacuda::device::Device::devices()?
{ 67 | let device = device?; 68 | let owned_context = rustacuda::context::Context::create_and_push( 69 | rustacuda::context::ContextFlags::MAP_HOST 70 | | rustacuda::context::ContextFlags::SCHED_AUTO, 71 | device, 72 | )?; 73 | rustacuda::context::ContextStack::pop()?; 74 | 75 | let vendor = Vendor::Nvidia; 76 | let name = device.name()?; 77 | let memory = get_memory(&device)?; 78 | let compute_units = get_compute_units(&device)?; 79 | let compute_capability = get_compute_capability(&device)?; 80 | let uuid = device.uuid().ok().map(Into::into); 81 | let context = owned_context.get_unowned(); 82 | 83 | contexts.push(owned_context); 84 | 85 | // If a device doesn't have a PCI-ID, add those later to the list of 86 | // devices with a fake PCI-ID. 87 | match get_pci_id(&device) { 88 | Ok(pci_id) => { 89 | all_devices.push(Device { 90 | vendor, 91 | name, 92 | memory, 93 | compute_units, 94 | compute_capability, 95 | pci_id, 96 | uuid, 97 | context, 98 | }); 99 | } 100 | Err(_) => { 101 | // Use a temporary PCI-ID and replace it later with a non-colliding one. 102 | let pci_id = PciId::from(0); 103 | devices_without_pci_id.push(Device { 104 | vendor, 105 | name, 106 | memory, 107 | compute_units, 108 | compute_capability, 109 | pci_id, 110 | uuid, 111 | context, 112 | }); 113 | } 114 | }; 115 | } 116 | 117 | // Laptops might have an integrated GPU. Such devices might have neither a PCI-ID nor a UUID. 118 | // As those devices are used for development and not for production use, it's good enough to 119 | // provide a workaround which doesn't add much complexity to the code. We use a fake PCI-ID 120 | // instead, which is generated by enumerating the available devices. In order to make that 121 | // case easier to spot when debugging issues, a starting number which is pleasant to the human 122 | // eye was chosen, that works in both decimal and hexadecimal (4660 == 0x1234). 123 | let mut enumerated_device: u16 = 4660; 124 | for mut device in devices_without_pci_id.into_iter() { 125 | // Make sure that no device has that actual PCI-ID 126 | while all_devices 127 | .iter() 128 | .any(|d| d.pci_id() == enumerated_device.into()) 129 | { 130 | enumerated_device += 1; 131 | } 132 | device.pci_id = PciId::from(enumerated_device); 133 | enumerated_device += 1; 134 | all_devices.push(device); 135 | } 136 | 137 | let wrapped_contexts = CudaContexts(contexts); 138 | 139 | debug!("Loaded CUDA devices: {:?}", all_devices); 140 | Ok((all_devices, wrapped_contexts)) 141 | }) 142 | .unwrap_or_else(|error: GPUError| { 143 | warn!("Unable to retrieve CUDA devices: {:?}", error); 144 | (Vec::new(), CudaContexts(Vec::new())) 145 | }) 146 | } 147 | -------------------------------------------------------------------------------- /src/device.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | use log::debug; 4 | #[cfg(all(feature = "opencl", feature = "cuda"))] 5 | use log::warn; 6 | use once_cell::sync::Lazy; 7 | 8 | use std::convert::TryFrom; 9 | use std::mem; 10 | 11 | use crate::error::{GPUError, GPUResult}; 12 | 13 | #[cfg(feature = "cuda")] 14 | use crate::cuda; 15 | #[cfg(feature = "opencl")] 16 | use crate::opencl; 17 | 18 | /// The UUIDs of the devices returned by OpenCL as well as CUDA are always 16 bytes long.
19 | const UUID_SIZE: usize = 16; 20 | const AMD_DEVICE_VENDOR_STRING: &str = "Advanced Micro Devices, Inc."; 21 | const AMD_DEVICE_VENDOR_ID: u32 = 0x1002; 22 | 23 | const INTEL_DEVICE_VENDOR_STRING: &str = "Intel(R) Corporation"; 24 | const INTEL_DEVICE_VENDOR_ID: u32 = 0x8086; 25 | 26 | // For some reason integrated AMD cards on Apple don't have the usual vendor name and ID 27 | const AMD_DEVICE_ON_APPLE_VENDOR_STRING: &str = "AMD"; 28 | const AMD_DEVICE_ON_APPLE_VENDOR_ID: u32 = 0x1021d00; 29 | const NVIDIA_DEVICE_VENDOR_STRING: &str = "NVIDIA Corporation"; 30 | const NVIDIA_DEVICE_VENDOR_ID: u32 = 0x10de; 31 | 32 | // The owned CUDA contexts are stored globally. Each device contains an unowned reference, so 33 | // that devices can be cloned. 34 | #[cfg(feature = "cuda")] 35 | static DEVICES: Lazy<(Vec<Device>, cuda::utils::CudaContexts)> = Lazy::new(build_device_list); 36 | 37 | // Keep it as a tuple like in the CUDA case, so that using `DEVICES` is independent of the 38 | // feature set. 39 | #[cfg(all(feature = "opencl", not(feature = "cuda")))] 40 | static DEVICES: Lazy<(Vec<Device>, ())> = Lazy::new(build_device_list); 41 | 42 | /// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID. 43 | /// 44 | /// It is the first two identifiers of e.g. `lspci`: 45 | /// 46 | /// ```text 47 | /// 4e:00.0 VGA compatible controller 48 | /// || └└-- Device ID 49 | /// └└-- Bus ID 50 | /// ``` 51 | #[derive(Copy, Clone, Debug, Default, Eq, Hash, PartialEq)] 52 | pub struct PciId(u16); 53 | 54 | impl From<u16> for PciId { 55 | fn from(id: u16) -> Self { 56 | Self(id) 57 | } 58 | } 59 | 60 | impl From<PciId> for u16 { 61 | fn from(id: PciId) -> Self { 62 | id.0 63 | } 64 | } 65 | 66 | /// Converts a PCI-ID formatted as Bus-ID:Device-ID, e.g. `e3:00`. 67 | impl TryFrom<&str> for PciId { 68 | type Error = GPUError; 69 | 70 | fn try_from(pci_id: &str) -> GPUResult<Self> { 71 | let mut bytes = [0; mem::size_of::<u16>()]; 72 | hex::decode_to_slice(pci_id.replace(':', ""), &mut bytes).map_err(|_| { 73 | GPUError::InvalidId(format!( 74 | "Cannot parse PCI ID, expected hex-encoded string formatted as aa:bb, got {0}.", 75 | pci_id 76 | )) 77 | })?; 78 | let parsed = u16::from_be_bytes(bytes); 79 | Ok(Self(parsed)) 80 | } 81 | } 82 | 83 | /// Formats the PCI-ID like `lspci`, Bus-ID:Device-ID, e.g. `e3:00`. 84 | impl fmt::Display for PciId { 85 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 86 | let bytes = u16::to_be_bytes(self.0); 87 | write!(f, "{:02x}:{:02x}", bytes[0], bytes[1]) 88 | } 89 | } 90 | 91 | /// A unique identifier based on UUID of the device. 92 | #[derive(Copy, Clone, Default, Eq, Hash, PartialEq)] 93 | pub struct DeviceUuid([u8; UUID_SIZE]); 94 | 95 | impl From<[u8; UUID_SIZE]> for DeviceUuid { 96 | fn from(uuid: [u8; UUID_SIZE]) -> Self { 97 | Self(uuid) 98 | } 99 | } 100 | 101 | impl From<DeviceUuid> for [u8; UUID_SIZE] { 102 | fn from(uuid: DeviceUuid) -> Self { 103 | uuid.0 104 | } 105 | } 106 | 107 | /// Converts a UUID formatted as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee, 108 | /// e.g.
46abccd6-022e-b783-572d-833f7104d05f 109 | impl TryFrom<&str> for DeviceUuid { 110 | type Error = GPUError; 111 | 112 | fn try_from(uuid: &str) -> GPUResult<Self> { 113 | let mut bytes = [0; UUID_SIZE]; 114 | hex::decode_to_slice(uuid.replace('-', ""), &mut bytes) 115 | .map_err(|_| { 116 | GPUError::InvalidId(format!("Cannot parse UUID, expected hex-encoded string formatted as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee, got {0}.", uuid)) 117 | })?; 118 | Ok(Self(bytes)) 119 | } 120 | } 121 | 122 | /// Formats the UUID the same way as `clinfo` does, as an example: 123 | /// the output should look like 46abccd6-022e-b783-572d-833f7104d05f 124 | impl fmt::Display for DeviceUuid { 125 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 126 | write!( 127 | f, 128 | "{}-{}-{}-{}-{}", 129 | hex::encode(&self.0[..4]), 130 | hex::encode(&self.0[4..6]), 131 | hex::encode(&self.0[6..8]), 132 | hex::encode(&self.0[8..10]), 133 | hex::encode(&self.0[10..]) 134 | ) 135 | } 136 | } 137 | 138 | impl fmt::Debug for DeviceUuid { 139 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 140 | write!(f, "{}", self) 141 | } 142 | } 143 | 144 | /// Unique identifier that can either be a PCI ID or a UUID. 145 | #[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)] 146 | pub enum UniqueId { 147 | /// ID based on the PCI bus. 148 | PciId(PciId), 149 | /// ID based on a globally unique identifier. 150 | Uuid(DeviceUuid), 151 | } 152 | 153 | /// If the string contains a dash, it's interpreted as UUID, else it's interpreted as PCI ID. 154 | impl TryFrom<&str> for UniqueId { 155 | type Error = GPUError; 156 | 157 | fn try_from(unique_id: &str) -> GPUResult<Self> { 158 | Ok(match unique_id.contains('-') { 159 | true => Self::Uuid(DeviceUuid::try_from(unique_id)?), 160 | false => Self::PciId(PciId::try_from(unique_id)?), 161 | }) 162 | } 163 | } 164 | 165 | impl fmt::Display for UniqueId { 166 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 167 | match self { 168 | Self::PciId(id) => id.fmt(f), 169 | Self::Uuid(id) => id.fmt(f), 170 | } 171 | } 172 | } 173 | 174 | /// Currently supported vendors of this library. 175 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] 176 | pub enum Vendor { 177 | /// GPU by AMD. 178 | Amd, 179 | /// GPU by Intel. 180 | Intel, 181 | /// GPU by NVIDIA.
182 | Nvidia, 183 | } 184 | 185 | impl TryFrom<&str> for Vendor { 186 | type Error = GPUError; 187 | 188 | fn try_from(vendor: &str) -> GPUResult<Self> { 189 | match vendor { 190 | AMD_DEVICE_VENDOR_STRING => Ok(Self::Amd), 191 | AMD_DEVICE_ON_APPLE_VENDOR_STRING => Ok(Self::Amd), 192 | INTEL_DEVICE_VENDOR_STRING => Ok(Self::Intel), 193 | NVIDIA_DEVICE_VENDOR_STRING => Ok(Self::Nvidia), 194 | _ => Err(GPUError::UnsupportedVendor(vendor.to_string())), 195 | } 196 | } 197 | } 198 | 199 | impl TryFrom<u32> for Vendor { 200 | type Error = GPUError; 201 | 202 | fn try_from(vendor: u32) -> GPUResult<Self> { 203 | match vendor { 204 | AMD_DEVICE_VENDOR_ID => Ok(Self::Amd), 205 | AMD_DEVICE_ON_APPLE_VENDOR_ID => Ok(Self::Amd), 206 | INTEL_DEVICE_VENDOR_ID => Ok(Self::Intel), 207 | NVIDIA_DEVICE_VENDOR_ID => Ok(Self::Nvidia), 208 | _ => Err(GPUError::UnsupportedVendor(format!("0x{:x}", vendor))), 209 | } 210 | } 211 | } 212 | 213 | impl fmt::Display for Vendor { 214 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 215 | let vendor = match self { 216 | Self::Amd => AMD_DEVICE_VENDOR_STRING, 217 | Self::Intel => INTEL_DEVICE_VENDOR_STRING, 218 | Self::Nvidia => NVIDIA_DEVICE_VENDOR_STRING, 219 | }; 220 | write!(f, "{}", vendor) 221 | } 222 | } 223 | 224 | /// Which framework to use, CUDA or OpenCL. 225 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] 226 | pub enum Framework { 227 | /// CUDA. 228 | #[cfg(feature = "cuda")] 229 | Cuda, 230 | /// OpenCL. 231 | #[cfg(feature = "opencl")] 232 | Opencl, 233 | } 234 | 235 | /// A device that may have a CUDA and/or OpenCL GPU associated with it. 236 | #[derive(Clone, Debug, Eq, Hash, PartialEq)] 237 | pub struct Device { 238 | vendor: Vendor, 239 | name: String, 240 | memory: u64, 241 | compute_units: u32, 242 | /// Major and minor version of the compute capability (only available on Nvidia GPUs). 243 | compute_capability: Option<(u32, u32)>, 244 | // All devices have a PCI ID. It is used as fallback in case there is no UUID. 245 | pci_id: PciId, 246 | uuid: Option<DeviceUuid>, 247 | #[cfg(feature = "cuda")] 248 | cuda: Option<cuda::Device>, 249 | #[cfg(feature = "opencl")] 250 | opencl: Option<opencl::Device>, 251 | } 252 | 253 | impl Device { 254 | /// Returns the [`Vendor`] of the GPU. 255 | pub fn vendor(&self) -> Vendor { 256 | self.vendor 257 | } 258 | 259 | /// Returns the name of the GPU, e.g. "GeForce RTX 3090". 260 | pub fn name(&self) -> String { 261 | self.name.clone() 262 | } 263 | 264 | /// Returns the memory of the GPU in bytes. 265 | pub fn memory(&self) -> u64 { 266 | self.memory 267 | } 268 | 269 | /// Returns the number of compute units of the GPU. 270 | pub fn compute_units(&self) -> u32 { 271 | self.compute_units 272 | } 273 | 274 | /// Returns the major and minor version of the compute capability (only available on Nvidia 275 | /// GPUs). 276 | pub fn compute_capability(&self) -> Option<(u32, u32)> { 277 | self.compute_capability 278 | } 279 | 280 | /// Returns the best possible unique identifier, a UUID is preferred over a PCI ID. 281 | pub fn unique_id(&self) -> UniqueId { 282 | match self.uuid { 283 | Some(uuid) => UniqueId::Uuid(uuid), 284 | None => UniqueId::PciId(self.pci_id), 285 | } 286 | } 287 | 288 | /// Returns the preferred framework (CUDA or OpenCL) to use. 289 | /// 290 | /// CUDA will be preferred over OpenCL. The returned framework will work on the device. 291 | /// E.g. it won't return `Framework::Cuda` for an AMD device.
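/// /// A sketch of dispatching on the preferred framework (assuming both the `cuda` and `opencl` features are enabled): /// /// ```text /// match device.framework() { ///     Framework::Cuda => { /* build a cuda::Program for this device */ } ///     Framework::Opencl => { /* build an opencl::Program for this device */ } /// } /// ```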
292 | pub fn framework(&self) -> Framework { 293 | #[cfg(all(feature = "opencl", feature = "cuda"))] 294 | if cfg!(feature = "cuda") && self.cuda.is_some() { 295 | Framework::Cuda 296 | } else { 297 | Framework::Opencl 298 | } 299 | 300 | #[cfg(all(feature = "cuda", not(feature = "opencl")))] 301 | { 302 | Framework::Cuda 303 | } 304 | 305 | #[cfg(all(feature = "opencl", not(feature = "cuda")))] 306 | { 307 | Framework::Opencl 308 | } 309 | } 310 | 311 | /// Returns the underlying CUDA device if it is available. 312 | #[cfg(feature = "cuda")] 313 | pub fn cuda_device(&self) -> Option<&cuda::Device> { 314 | self.cuda.as_ref() 315 | } 316 | 317 | /// Returns the underlying OpenCL device if it is available. 318 | #[cfg(feature = "opencl")] 319 | pub fn opencl_device(&self) -> Option<&opencl::Device> { 320 | self.opencl.as_ref() 321 | } 322 | 323 | /// Returns all available GPUs that are supported. 324 | pub fn all() -> Vec<&'static Device> { 325 | Self::all_iter().collect() 326 | } 327 | 328 | /// Returns the device matching the PCI ID if there is one. 329 | pub fn by_pci_id(pci_id: PciId) -> Option<&'static Device> { 330 | Self::all_iter().find(|d| pci_id == d.pci_id) 331 | } 332 | 333 | /// Returns the device matching the UUID if there is one. 334 | pub fn by_uuid(uuid: DeviceUuid) -> Option<&'static Device> { 335 | Self::all_iter().find(|d| Some(uuid) == d.uuid) 336 | } 337 | 338 | /// Returns the device matching the unique ID if there is one. 339 | pub fn by_unique_id(unique_id: UniqueId) -> Option<&'static Device> { 340 | Self::all_iter().find(|d| unique_id == d.unique_id()) 341 | } 342 | 343 | /// Returns the devices matching the Vendor. 344 | pub fn by_vendor(vendor_id: Vendor) -> Vec<&'static Device> { 345 | Self::all_iter() 346 | .filter(|d| vendor_id == d.vendor()) 347 | .collect() 348 | } 349 | 350 | /// Returns an iterator of all available GPUs that are supported. 351 | fn all_iter() -> impl Iterator<Item = &'static Device> { 352 | DEVICES.0.iter() 353 | } 354 | } 355 | 356 | /// Get a list of all available and supported devices. 357 | /// 358 | /// If both the `cuda` and the `opencl` features are enabled, a device supporting both will be 359 | /// combined into a single device. You can then access the underlying CUDA and OpenCL device 360 | /// if needed. 361 | /// 362 | /// If there is a failure retrieving a device, it won't lead to a hard error, but an error will be 363 | /// logged and the corresponding device won't be available.
355 | 
356 | /// Get a list of all available and supported devices.
357 | ///
358 | /// If both the `cuda` and the `opencl` features are enabled, a device supporting both will be
359 | /// combined into a single device. You can then access the underlying CUDA and OpenCL device
360 | /// if needed.
361 | ///
362 | /// If there is a failure retrieving a device, it won't lead to a hard error, but an error will be
363 | /// logged and the corresponding device won't be available.
364 | #[cfg(feature = "cuda")]
365 | fn build_device_list() -> (Vec<Device>, cuda::utils::CudaContexts) {
366 |     let mut all_devices = Vec::new();
367 | 
368 |     #[cfg(feature = "opencl")]
369 |     let opencl_devices = opencl::utils::build_device_list();
370 | 
371 |     #[cfg(all(feature = "cuda", feature = "opencl"))]
372 |     let (mut cuda_devices, cuda_contexts) = cuda::utils::build_device_list();
373 |     #[cfg(all(feature = "cuda", not(feature = "opencl")))]
374 |     let (cuda_devices, cuda_contexts) = cuda::utils::build_device_list();
375 | 
376 |     // Combine OpenCL and CUDA devices into one device if it is the same GPU
377 |     #[cfg(feature = "opencl")]
378 |     for opencl_device in opencl_devices {
379 |         let mut device = Device {
380 |             vendor: opencl_device.vendor(),
381 |             name: opencl_device.name(),
382 |             memory: opencl_device.memory(),
383 |             compute_units: opencl_device.compute_units(),
384 |             compute_capability: opencl_device.compute_capability(),
385 |             pci_id: opencl_device.pci_id(),
386 |             uuid: opencl_device.uuid(),
387 |             opencl: Some(opencl_device),
388 |             cuda: None,
389 |         };
390 | 
391 |         // Only devices from Nvidia can use CUDA
392 |         #[cfg(feature = "cuda")]
393 |         if device.vendor == Vendor::Nvidia {
394 |             for ii in 0..cuda_devices.len() {
395 |                 if (device.uuid.is_some() && cuda_devices[ii].uuid() == device.uuid)
396 |                     || (cuda_devices[ii].pci_id() == device.pci_id)
397 |                 {
398 |                     if device.memory() != cuda_devices[ii].memory() {
399 |                         warn!("OpenCL and CUDA report different amounts of memory for a device with the same identifier");
400 |                         break;
401 |                     }
402 |                     if device.compute_units() != cuda_devices[ii].compute_units() {
403 |                         warn!("OpenCL and CUDA report different numbers of compute units for a device with the same identifier");
404 |                         break;
405 |                     }
406 |                     // Move the CUDA device out of the vector
407 |                     device.cuda = Some(cuda_devices.remove(ii));
408 |                     // Only one device can match
409 |                     break;
410 |                 }
411 |             }
412 |         }
413 | 
414 |         all_devices.push(device)
415 |     }
416 | 
417 |     // Add all CUDA devices that don't have a corresponding OpenCL device.
418 |     for cuda_device in cuda_devices {
419 |         let device = Device {
420 |             vendor: cuda_device.vendor(),
421 |             name: cuda_device.name(),
422 |             memory: cuda_device.memory(),
423 |             compute_units: cuda_device.compute_units(),
424 |             compute_capability: Some(cuda_device.compute_capability()),
425 |             pci_id: cuda_device.pci_id(),
426 |             uuid: cuda_device.uuid(),
427 |             cuda: Some(cuda_device),
428 |             #[cfg(feature = "opencl")]
429 |             opencl: None,
430 |         };
431 |         all_devices.push(device);
432 |     }
433 | 
434 |     debug!("loaded devices: {:?}", all_devices);
435 |     (all_devices, cuda_contexts)
436 | }
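Because the loop above folds the CUDA and OpenCL handles for the same physical GPU into one `Device`, a consumer can inspect both backends through a single entry. A small sketch, assuming both the `cuda` and `opencl` features are enabled:

```rust
use rust_gpu_tools::Device;

fn report_backends() {
    for device in Device::all() {
        // A merged Nvidia device typically reports both backends here;
        // AMD and Intel devices report OpenCL only.
        println!(
            "{}: cuda={}, opencl={}",
            device.name(),
            device.cuda_device().is_some(),
            device.opencl_device().is_some(),
        );
    }
}
```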
437 | 
438 | /// Get a list of all available and supported OpenCL devices.
439 | ///
440 | /// If there is a failure retrieving a device, it won't lead to a hard error, but an error will be
441 | /// logged and the corresponding device won't be available.
442 | #[cfg(all(feature = "opencl", not(feature = "cuda")))]
443 | fn build_device_list() -> (Vec<Device>, ()) {
444 |     let devices = opencl::utils::build_device_list()
445 |         .into_iter()
446 |         .map(|device| Device {
447 |             vendor: device.vendor(),
448 |             name: device.name(),
449 |             memory: device.memory(),
450 |             compute_units: device.compute_units(),
451 |             compute_capability: device.compute_capability(),
452 |             pci_id: device.pci_id(),
453 |             uuid: device.uuid(),
454 |             opencl: Some(device),
455 |         })
456 |         .collect();
457 | 
458 |     debug!("loaded devices: {:?}", devices);
459 |     (devices, ())
460 | }
461 | 
462 | #[cfg(test)]
463 | mod test {
464 |     use super::{
465 |         Device, DeviceUuid, GPUError, PciId, UniqueId, Vendor, AMD_DEVICE_ON_APPLE_VENDOR_ID,
466 |         AMD_DEVICE_ON_APPLE_VENDOR_STRING, AMD_DEVICE_VENDOR_ID, AMD_DEVICE_VENDOR_STRING,
467 |         INTEL_DEVICE_VENDOR_ID, INTEL_DEVICE_VENDOR_STRING, NVIDIA_DEVICE_VENDOR_ID,
468 |         NVIDIA_DEVICE_VENDOR_STRING,
469 |     };
470 |     use std::convert::TryFrom;
471 | 
472 |     #[test]
473 |     fn test_device_all() {
474 |         let devices = Device::all();
475 |         for device in devices.iter() {
476 |             println!("device: {:?}", device);
477 |         }
478 |         assert!(!devices.is_empty(), "No supported GPU found.");
479 |     }
480 | 
481 |     #[test]
482 |     fn test_vendor_from_str() {
483 |         assert_eq!(
484 |             Vendor::try_from(AMD_DEVICE_VENDOR_STRING).unwrap(),
485 |             Vendor::Amd,
486 |             "AMD vendor string can be converted."
487 |         );
488 |         assert_eq!(
489 |             Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_STRING).unwrap(),
490 |             Vendor::Amd,
491 |             "AMD vendor string (on Apple) can be converted."
492 |         );
493 |         assert_eq!(
494 |             Vendor::try_from(INTEL_DEVICE_VENDOR_STRING).unwrap(),
495 |             Vendor::Intel,
496 |             "Intel vendor string can be converted."
497 |         );
498 |         assert_eq!(
499 |             Vendor::try_from(NVIDIA_DEVICE_VENDOR_STRING).unwrap(),
500 |             Vendor::Nvidia,
501 |             "Nvidia vendor string can be converted."
502 |         );
503 |         assert!(matches!(
504 |             Vendor::try_from("unknown vendor"),
505 |             Err(GPUError::UnsupportedVendor(_))
506 |         ));
507 |     }
508 | 
509 |     #[test]
510 |     fn test_vendor_from_u32() {
511 |         assert_eq!(
512 |             Vendor::try_from(AMD_DEVICE_VENDOR_ID).unwrap(),
513 |             Vendor::Amd,
514 |             "AMD vendor ID can be converted."
515 |         );
516 |         assert_eq!(
517 |             Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_ID).unwrap(),
518 |             Vendor::Amd,
519 |             "AMD vendor ID (on Apple) can be converted."
520 |         );
521 |         assert_eq!(
522 |             Vendor::try_from(INTEL_DEVICE_VENDOR_ID).unwrap(),
523 |             Vendor::Intel,
524 |             "Intel vendor ID can be converted."
525 |         );
526 |         assert_eq!(
527 |             Vendor::try_from(NVIDIA_DEVICE_VENDOR_ID).unwrap(),
528 |             Vendor::Nvidia,
529 |             "Nvidia vendor ID can be converted."
530 |         );
531 |         assert!(matches!(
532 |             Vendor::try_from(0x1abc),
533 |             Err(GPUError::UnsupportedVendor(_))
534 |         ));
535 |     }
536 | 
537 |     #[test]
538 |     fn test_vendor_display() {
539 |         assert_eq!(
540 |             Vendor::Amd.to_string(),
541 |             AMD_DEVICE_VENDOR_STRING,
542 |             "AMD vendor can be converted to string."
543 |         );
544 |         assert_eq!(
545 |             Vendor::Intel.to_string(),
546 |             INTEL_DEVICE_VENDOR_STRING,
547 |             "Intel vendor can be converted to string."
548 |         );
549 |         assert_eq!(
550 |             Vendor::Nvidia.to_string(),
551 |             NVIDIA_DEVICE_VENDOR_STRING,
552 |             "Nvidia vendor can be converted to string."
553 |         );
554 |     }
555 | 
556 |     #[test]
557 |     fn test_uuid() {
558 |         let valid_string = "46abccd6-022e-b783-572d-833f7104d05f";
559 |         let valid = DeviceUuid::try_from(valid_string).unwrap();
560 |         assert_eq!(valid_string, &valid.to_string());
561 | 
562 |         let too_short_string = "ccd6-022e-b783-572d-833f7104d05f";
563 |         let too_short = DeviceUuid::try_from(too_short_string);
564 |         assert!(too_short.is_err(), "Parse error when UUID is too short.");
565 | 
566 |         let invalid_hex_string = "46abccd6-022e-b783-572d-833f7104d05h";
567 |         let invalid_hex = DeviceUuid::try_from(invalid_hex_string);
568 |         assert!(
569 |             invalid_hex.is_err(),
570 |             "Parse error when UUID contains non-hex character."
571 |         );
572 |     }
573 | 
574 |     #[test]
575 |     fn test_pci_id() {
576 |         let valid_string = "01:00";
577 |         let valid = PciId::try_from(valid_string).unwrap();
578 |         assert_eq!(valid_string, &valid.to_string());
579 |         assert_eq!(valid, PciId(0x0100));
580 | 
581 |         let too_short_string = "3f";
582 |         let too_short = PciId::try_from(too_short_string);
583 |         assert!(too_short.is_err(), "Parse error when PCI ID is too short.");
584 | 
585 |         let invalid_hex_string = "aaxx";
586 |         let invalid_hex = PciId::try_from(invalid_hex_string);
587 |         assert!(
588 |             invalid_hex.is_err(),
589 |             "Parse error when PCI ID contains non-hex character."
590 |         );
591 |     }
592 | 
593 |     #[test]
594 |     fn test_unique_id() {
595 |         let valid_pci_id_string = "aa:bb";
596 |         let valid_pci_id = UniqueId::try_from(valid_pci_id_string).unwrap();
597 |         assert_eq!(valid_pci_id_string, &valid_pci_id.to_string());
598 |         assert_eq!(valid_pci_id, UniqueId::PciId(PciId(0xaabb)));
599 | 
600 |         let valid_uuid_string = "aabbccdd-eeff-0011-2233-445566778899";
601 |         let valid_uuid = UniqueId::try_from(valid_uuid_string).unwrap();
602 |         assert_eq!(valid_uuid_string, &valid_uuid.to_string());
603 |         assert_eq!(
604 |             valid_uuid,
605 |             UniqueId::Uuid(DeviceUuid([
606 |                 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
607 |                 0x88, 0x99
608 |             ]))
609 |         );
610 | 
611 |         let invalid_string = "aabbccddeeffgg";
612 |         let invalid = UniqueId::try_from(invalid_string);
613 |         assert!(
614 |             invalid.is_err(),
615 |             "Parse error when ID matches neither a PCI ID nor a UUID."
616 |         );
617 |     }
618 | }
619 | 
--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
1 | #[cfg(feature = "opencl")]
2 | use opencl3::error_codes::ClError;
3 | #[cfg(feature = "cuda")]
4 | use rustacuda::error::CudaError;
5 | 
6 | /// Error types of this library.
7 | #[derive(thiserror::Error, Debug)]
8 | #[allow(clippy::upper_case_acronyms)]
9 | pub enum GPUError {
10 |     /// Error from the underlying `opencl3` library, e.g. a memory allocation failure.
11 |     #[cfg(feature = "opencl")]
12 |     #[error("Opencl3 Error: {error}{}", match .message {
13 |         Some(message) => format!(" {}", message),
14 |         None => "".to_string(),
15 |     })]
16 |     Opencl3 {
17 |         /// The error code.
18 |         error: ClError,
19 |         /// The error message.
20 |         message: Option<String>,
21 |     },
22 | 
23 |     /// Error for OpenCL `clGetProgramInfo()` call failures.
24 |     #[cfg(feature = "opencl")]
25 |     #[error("Program info not available!")]
26 |     ProgramInfoNotAvailable(ClError),
27 | 
28 |     /// Error for OpenCL `clGetDeviceInfo()` call failures.
29 |     #[cfg(feature = "opencl")]
30 |     #[error("Device info not available!")]
31 |     DeviceInfoNotAvailable(ClError),
32 | 
33 |     /// Error from the underlying `RustaCUDA` library, e.g. a memory allocation failure.
34 |     #[cfg(feature = "cuda")]
35 |     #[error("Cuda Error: {0}")]
36 |     Cuda(#[from] CudaError),
37 | 
38 |     /// Error when a device cannot be found.
39 |     #[error("Device not found!")]
40 |     DeviceNotFound,
41 | 
42 |     /// Error when a kernel with the given name cannot be found.
43 |     #[error("Kernel with name {0} not found!")]
44 |     KernelNotFound(String),
45 | 
46 |     /// Error when standard I/O fails.
47 |     #[error("IO Error: {0}")]
48 |     IO(#[from] std::io::Error),
49 | 
50 |     /// Error when the device is from an unsupported vendor.
51 |     #[error("Vendor {0} is not supported.")]
52 |     UnsupportedVendor(String),
53 | 
54 |     /// Error when the string representation of a unique identifier (PCI-ID or UUID) cannot be
55 |     /// parsed.
56 |     #[error("{0}")]
57 |     InvalidId(String),
58 | 
59 |     /// Errors that rarely happen and don't deserve their own error type.
60 |     #[error("{0}")]
61 |     Generic(String),
62 | }
63 | 
64 | /// Convenience type alias for [`GPUError`] based [`Result`]s.
65 | #[allow(clippy::upper_case_acronyms)]
66 | pub type GPUResult<T> = std::result::Result<T, GPUError>;
67 | 
68 | #[cfg(feature = "opencl")]
69 | impl From<ClError> for GPUError {
70 |     fn from(error: ClError) -> Self {
71 |         GPUError::Opencl3 {
72 |             error,
73 |             message: None,
74 |         }
75 |     }
76 | }
77 | 
--------------------------------------------------------------------------------
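A brief sketch of how `GPUError` is consumed downstream (an editorial illustration; the lookup shown is just one way to produce an error). Note that the `GPUResult<T>` alias lives in the private `error` module, so external callers spell the `Result` out:

```rust
use rust_gpu_tools::{Device, GPUError};

fn first_device() -> Result<&'static Device, GPUError> {
    Device::all().into_iter().next().ok_or(GPUError::DeviceNotFound)
}

fn main() {
    match first_device() {
        Ok(device) => println!("found: {}", device.name()),
        // Every variant renders a human-readable message via `thiserror`.
        Err(err) => eprintln!("GPU error: {}", err),
    }
}
```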
/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! Abstraction layer for OpenCL and CUDA.
2 | //!
3 | //! Feature flags
4 | //! -------------
5 | //!
6 | //! There are two [feature flags], `cuda` and `opencl`. Both are enabled by default. At least
7 | //! one of them needs to be enabled at any time.
8 | //!
9 | //! [feature flags]: https://doc.rust-lang.org/cargo/reference/manifest.html#the-features-section
10 | 
11 | #![warn(missing_docs)]
12 | 
13 | mod device;
14 | mod error;
15 | #[cfg(any(feature = "cuda", feature = "opencl"))]
16 | mod program;
17 | 
18 | #[cfg(feature = "cuda")]
19 | pub mod cuda;
20 | #[cfg(feature = "opencl")]
21 | pub mod opencl;
22 | 
23 | pub use device::{Device, DeviceUuid, Framework, PciId, UniqueId, Vendor};
24 | pub use error::GPUError;
25 | #[cfg(any(feature = "cuda", feature = "opencl"))]
26 | pub use program::Program;
27 | 
28 | #[cfg(not(any(feature = "cuda", feature = "opencl")))]
29 | compile_error!("At least one of the features `cuda` or `opencl` must be enabled.");
30 | 
31 | /// A buffer on the GPU.
32 | ///
33 | /// The concept of a local buffer is from OpenCL. In CUDA you don't allocate a buffer directly
34 | /// via an API call. Instead you pass in the amount of shared memory that should be used.
35 | ///
36 | /// There can be at most a single local buffer per kernel. On CUDA a null pointer will be passed
37 | /// in, instead of an actual value. The memory that should get allocated is then passed into the
38 | /// kernel call automatically.
39 | #[derive(Debug)]
40 | pub struct LocalBuffer<T> {
41 |     /// The number of T-sized elements.
42 |     length: usize,
43 |     _phantom: std::marker::PhantomData<T>,
44 | }
45 | 
46 | impl<T> LocalBuffer<T> {
47 |     /// Returns a new buffer of the specified `length`.
48 |     pub fn new(length: usize) -> Self {
49 |         LocalBuffer::<T> {
50 |             length,
51 |             _phantom: std::marker::PhantomData,
52 |         }
53 |     }
54 | }
55 | 
--------------------------------------------------------------------------------
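The `LocalBuffer` defined in src/lib.rs above only records a length; it turns into actual memory (OpenCL local memory, CUDA shared memory) when pushed as a kernel argument. A hedged sketch, assuming a program containing a kernel named `my_kernel` that declares one `__local` argument:

```rust
use rust_gpu_tools::opencl::Program;
use rust_gpu_tools::{GPUError, LocalBuffer};

fn launch(program: &Program) -> Result<(), GPUError> {
    // 64 work groups of 256 threads; each group gets 256 u32 of local memory.
    let kernel = program.create_kernel("my_kernel", 64, 256)?;
    let local = LocalBuffer::<u32>::new(256);
    kernel.arg(&local).run()?;
    Ok(())
}
```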
/src/opencl/error.rs:
--------------------------------------------------------------------------------
1 | use opencl3::{device::DeviceInfo, error_codes::ClError, program::ProgramInfo};
2 | 
3 | #[derive(thiserror::Error, Debug)]
4 | #[allow(clippy::upper_case_acronyms)]
5 | pub enum GPUError {
6 |     #[error("Opencl3 Error: {0}{}", match .1 {
7 |         Some(message) => format!(" {}", message),
8 |         None => "".to_string(),
9 |     })]
10 |     Opencl3(ClError, Option<String>),
11 |     #[error("Device not found!")]
12 |     DeviceNotFound,
13 |     #[error("Device info not available!")]
14 |     DeviceInfoNotAvailable(DeviceInfo),
15 |     #[error("Program info not available!")]
16 |     ProgramInfoNotAvailable(ProgramInfo),
17 |     #[error("Kernel with name {0} not found!")]
18 |     KernelNotFound(String),
19 |     #[error("IO Error: {0}")]
20 |     IO(#[from] std::io::Error),
21 |     #[error("Vendor {0} is not supported.")]
22 |     UnsupportedVendor(String),
23 |     #[error("{0}")]
24 |     InvalidId(String),
25 | }
26 | 
27 | #[allow(clippy::upper_case_acronyms)]
28 | #[allow(dead_code)]
29 | pub type GPUResult<T> = std::result::Result<T, GPUError>;
30 | 
31 | impl From<ClError> for GPUError {
32 |     fn from(error: ClError) -> Self {
33 |         GPUError::Opencl3(error, None)
34 |     }
35 | }
36 | 
--------------------------------------------------------------------------------
/src/opencl/mod.rs:
--------------------------------------------------------------------------------
1 | //! The OpenCL specific implementation of a [`Buffer`], [`Device`], [`Program`] and [`Kernel`].
2 | 
3 | pub(crate) mod utils;
4 | 
5 | use std::collections::HashMap;
6 | use std::hash::{Hash, Hasher};
7 | use std::mem;
8 | use std::ptr;
9 | 
10 | use opencl3::command_queue::CommandQueue;
11 | use opencl3::context::Context;
12 | use opencl3::error_codes::ClError;
13 | use opencl3::kernel::ExecuteKernel;
14 | use opencl3::memory::CL_MEM_READ_WRITE;
15 | use opencl3::types::CL_BLOCKING;
16 | 
17 | use log::debug;
18 | 
19 | use crate::device::{DeviceUuid, PciId, Vendor};
20 | use crate::error::{GPUError, GPUResult};
21 | use crate::LocalBuffer;
22 | 
23 | /// The lowest level identifier of an OpenCL device, it changes whenever a device is initialized.
24 | #[allow(non_camel_case_types)]
25 | pub type cl_device_id = opencl3::types::cl_device_id;
26 | 
27 | /// A Buffer to be used for sending and receiving data to/from the GPU.
28 | #[derive(Debug)]
29 | pub struct Buffer<T> {
30 |     buffer: opencl3::memory::Buffer<u8>,
31 |     /// The number of T-sized elements.
32 |     length: usize,
33 |     _phantom: std::marker::PhantomData<T>,
34 | }
35 | 
36 | /// OpenCL specific device.
37 | #[derive(Debug, Clone)]
38 | pub struct Device {
39 |     vendor: Vendor,
40 |     name: String,
41 |     /// The total memory of the GPU in bytes.
42 |     memory: u64,
43 |     /// The number of parallel compute units.
44 |     compute_units: u32,
45 |     /// Major and minor version of the compute capability (only available on Nvidia GPUs).
46 |     compute_capability: Option<(u32, u32)>,
47 |     pci_id: PciId,
48 |     uuid: Option<DeviceUuid>,
49 |     device: opencl3::device::Device,
50 | }
51 | 
52 | impl Hash for Device {
53 |     fn hash<H: Hasher>(&self, state: &mut H) {
54 |         self.vendor.hash(state);
55 |         self.name.hash(state);
56 |         self.memory.hash(state);
57 |         self.pci_id.hash(state);
58 |         self.uuid.hash(state);
59 |     }
60 | }
61 | 
62 | impl PartialEq for Device {
63 |     fn eq(&self, other: &Self) -> bool {
64 |         self.vendor == other.vendor
65 |             && self.name == other.name
66 |             && self.memory == other.memory
67 |             && self.pci_id == other.pci_id
68 |             && self.uuid == other.uuid
69 |     }
70 | }
71 | 
72 | impl Eq for Device {}
73 | 
74 | impl Device {
75 |     /// Returns the [`Vendor`] of the GPU.
76 |     pub fn vendor(&self) -> Vendor {
77 |         self.vendor
78 |     }
79 | 
80 |     /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
81 |     pub fn name(&self) -> String {
82 |         self.name.clone()
83 |     }
84 | 
85 |     /// Returns the memory of the GPU in bytes.
86 |     pub fn memory(&self) -> u64 {
87 |         self.memory
88 |     }
89 | 
90 |     /// Returns the number of compute units of the GPU.
91 |     pub fn compute_units(&self) -> u32 {
92 |         self.compute_units
93 |     }
94 | 
95 |     /// Returns the major and minor version of the compute capability (only available on Nvidia
96 |     /// GPUs).
97 |     pub fn compute_capability(&self) -> Option<(u32, u32)> {
98 |         self.compute_capability
99 |     }
100 | 
101 |     /// Returns the PCI-ID of the GPU, see the [`PciId`] type for more information.
102 |     pub fn pci_id(&self) -> PciId {
103 |         self.pci_id
104 |     }
105 | 
106 |     /// Returns the UUID of the GPU if available, see the [`DeviceUuid`] type for more
107 |     /// information.
108 |     pub fn uuid(&self) -> Option<DeviceUuid> {
109 |         self.uuid
110 |     }
111 | 
112 |     /// Low-level access to the device identifier.
113 |     ///
114 |     /// It changes when the device is initialized and should only be used to interact with other
115 |     /// libraries that work on the lowest OpenCL level.
116 |     pub fn cl_device_id(&self) -> cl_device_id {
117 |         self.device.id()
118 |     }
119 | }
120 | 
121 | /// Abstraction that contains everything to run an OpenCL kernel on a GPU.
122 | ///
123 | /// The majority of methods are the same as [`crate::cuda::Program`], so you can write code using this
124 | /// API, which will then work with OpenCL as well as CUDA kernels.
125 | #[allow(rustdoc::broken_intra_doc_links)]
126 | pub struct Program {
127 |     device_name: String,
128 |     queue: CommandQueue,
129 |     context: Context,
130 |     kernels_by_name: HashMap<String, opencl3::kernel::Kernel>,
131 | }
132 | 
133 | impl Program {
134 |     /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
135 |     pub fn device_name(&self) -> &str {
136 |         &self.device_name
137 |     }
138 | 
139 |     /// Creates a program for a specific device from OpenCL source code.
140 |     pub fn from_opencl(device: &Device, src: &str) -> GPUResult<Program> {
141 |         debug!("Creating OpenCL program from source.");
142 |         let cached = utils::cache_path(device, src)?;
143 |         if std::path::Path::exists(&cached) {
144 |             let bin = std::fs::read(cached)?;
145 |             Program::from_binary(device, bin)
146 |         } else {
147 |             let context = Context::from_device(&device.device)?;
148 |             debug!(
149 |                 "Building kernel ({}) from source…",
150 |                 cached.to_string_lossy()
151 |             );
152 |             let mut program = opencl3::program::Program::create_from_source(&context, src)?;
153 |             if let Err(build_error) = program.build(context.devices(), "") {
154 |                 let log = program.get_build_log(context.devices()[0])?;
155 |                 return Err(GPUError::Opencl3 {
156 |                     error: build_error,
157 |                     message: Some(log),
158 |                 });
159 |             }
160 |             debug!(
161 |                 "Building kernel ({}) from source: done.",
162 |                 cached.to_string_lossy()
163 |             );
164 |             let queue = CommandQueue::create_default(&context, 0)?;
165 |             let kernels = opencl3::kernel::create_program_kernels(&program)?;
166 |             let kernels_by_name = kernels
167 |                 .into_iter()
168 |                 .map(|kernel| {
169 |                     let name = kernel.function_name()?;
170 |                     Ok((name, kernel))
171 |                 })
172 |                 .collect::<Result<_, ClError>>()?;
173 |             let prog = Program {
174 |                 device_name: device.name(),
175 |                 queue,
176 |                 context,
177 |                 kernels_by_name,
178 |             };
179 |             let binaries = program
180 |                 .get_binaries()
181 |                 .map_err(GPUError::ProgramInfoNotAvailable)?;
182 |             std::fs::write(cached, binaries[0].clone())?;
183 |             Ok(prog)
184 |         }
185 |     }
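From the caller's side, the compile-or-load flow above looks like this. A hedged sketch (illustrative only; the kernel source is a stand-in, and `opencl_device()` requires the `opencl` feature). The first call compiles from source and writes the binary cache under `~/.rust-gpu-tools/`; later calls for the same device and source load the cached binary via `from_binary`.

```rust
use rust_gpu_tools::opencl;
use rust_gpu_tools::{Device, GPUError};

fn build_program(device: &Device) -> Result<opencl::Program, GPUError> {
    // A hypothetical kernel, purely for illustration.
    let src = r#"
        __kernel void double_values(__global uint *buf) {
            uint i = get_global_id(0);
            buf[i] *= 2;
        }
    "#;
    let cl_device = device.opencl_device().ok_or(GPUError::DeviceNotFound)?;
    opencl::Program::from_opencl(cl_device, src)
}
```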
186 | 
187 |     /// Creates a program for a specific device from a compiled OpenCL binary.
188 |     pub fn from_binary(device: &Device, bin: Vec<u8>) -> GPUResult<Program> {
189 |         debug!("Creating OpenCL program from binary.");
190 |         let context = Context::from_device(&device.device)?;
191 |         let bins = vec![&bin[..]];
192 |         let mut program = unsafe {
193 |             opencl3::program::Program::create_from_binary(&context, context.devices(), &bins)
194 |         }?;
195 |         if let Err(build_error) = program.build(context.devices(), "") {
196 |             let log = program.get_build_log(context.devices()[0])?;
197 |             return Err(GPUError::Opencl3 {
198 |                 error: build_error,
199 |                 message: Some(log),
200 |             });
201 |         }
202 |         let queue = CommandQueue::create_default(&context, 0)?;
203 |         let kernels = opencl3::kernel::create_program_kernels(&program)?;
204 |         let kernels_by_name = kernels
205 |             .into_iter()
206 |             .map(|kernel| {
207 |                 let name = kernel.function_name()?;
208 |                 Ok((name, kernel))
209 |             })
210 |             .collect::<Result<_, ClError>>()?;
211 |         Ok(Program {
212 |             device_name: device.name(),
213 |             queue,
214 |             context,
215 |             kernels_by_name,
216 |         })
217 |     }
218 | 
219 |     /// Creates a new buffer that can be used for input/output with the GPU.
220 |     ///
221 |     /// The `length` is the number of elements to create.
222 |     ///
223 |     /// It is usually used to create buffers that are initialized by the GPU. If you want to
224 |     /// directly transfer data from the host to the GPU, you would use the safe
225 |     /// [`Program::create_buffer_from_slice`] instead.
226 |     ///
227 |     /// # Safety
228 |     ///
229 |     /// This function isn't actually unsafe, it's marked as `unsafe` due to the CUDA version of it,
230 |     /// where it is unsafe. This is done to have symmetry between both APIs.
231 |     pub unsafe fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> {
232 |         assert!(length > 0);
233 |         let mut buff = opencl3::memory::Buffer::create(
234 |             &self.context,
235 |             CL_MEM_READ_WRITE,
236 |             // The input length is the number of elements, but we create a `u8` buffer. Hence the
237 |             // length needs to be the number of bytes.
238 |             length * std::mem::size_of::<T>(),
239 |             ptr::null_mut(),
240 |         )?;
241 | 
242 |         // Write some data right away. This makes a significant performance difference.
243 |         self.queue
244 |             .enqueue_write_buffer(&mut buff, opencl3::types::CL_BLOCKING, 0, &[0u8], &[])?;
245 | 
246 |         Ok(Buffer::<T> {
247 |             buffer: buff,
248 |             length,
249 |             _phantom: std::marker::PhantomData,
250 |         })
251 |     }
252 | 
253 |     /// Creates a new buffer on the GPU and initializes it with the given slice.
254 |     pub fn create_buffer_from_slice<T>(&self, slice: &[T]) -> GPUResult<Buffer<T>> {
255 |         // The underlying buffer is `u8`, hence we need the number of bytes.
256 |         let bytes_len = mem::size_of_val(slice);
257 | 
258 |         let mut buffer = unsafe {
259 |             opencl3::memory::Buffer::create(
260 |                 &self.context,
261 |                 CL_MEM_READ_WRITE,
262 |                 bytes_len,
263 |                 ptr::null_mut(),
264 |             )?
265 |         };
266 |         // Transmuting types is safe as long as the sizes match.
267 |         let bytes = unsafe { std::slice::from_raw_parts(slice.as_ptr() as *const u8, bytes_len) };
268 |         // Write some data right away. This makes a significant performance difference.
269 |         unsafe {
270 |             self.queue
271 |                 .enqueue_write_buffer(&mut buffer, CL_BLOCKING, 0, &[0u8], &[])?;
272 |             self.queue
273 |                 .enqueue_write_buffer(&mut buffer, CL_BLOCKING, 0, bytes, &[])?;
274 |         };
275 | 
276 |         Ok(Buffer::<T> {
277 |             buffer,
278 |             length: slice.len(),
279 |             _phantom: std::marker::PhantomData,
280 |         })
281 |     }
282 | 
283 |     /// Returns a kernel.
284 |     ///
285 |     /// The `global_work_size` does *not* follow the OpenCL definition. It is *not* the total
286 |     /// number of threads. Instead it follows CUDA's definition and is the number of
287 |     /// `local_work_size` sized thread groups. So the total number of threads is
288 |     /// `global_work_size * local_work_size`.
289 |     pub fn create_kernel(
290 |         &self,
291 |         name: &str,
292 |         global_work_size: usize,
293 |         local_work_size: usize,
294 |     ) -> GPUResult<Kernel> {
295 |         let kernel = self
296 |             .kernels_by_name
297 |             .get(name)
298 |             .ok_or_else(|| GPUError::KernelNotFound(name.to_string()))?;
299 |         let mut builder = ExecuteKernel::new(kernel);
300 |         builder.set_global_work_size(global_work_size * local_work_size);
301 |         builder.set_local_work_size(local_work_size);
302 |         Ok(Kernel {
303 |             builder,
304 |             queue: &self.queue,
305 |             num_local_buffers: 0,
306 |         })
307 |     }
308 | 
309 |     /// Puts data from an existing buffer onto the GPU.
310 |     pub fn write_from_buffer<T>(
311 |         &self,
312 |         // From Rust's perspective, this buffer doesn't need to be mutable. But the sub-buffer is
313 |         // mutating the buffer, so it really should be.
314 |         buffer: &mut Buffer<T>,
315 |         data: &[T],
316 |     ) -> GPUResult<()> {
317 |         assert!(data.len() <= buffer.length, "Buffer is too small");
318 | 
319 |         // It is safe as long as the sizes match.
320 |         let bytes = unsafe {
321 |             std::slice::from_raw_parts(data.as_ptr() as *const u8, mem::size_of_val(data))
322 |         };
323 |         unsafe {
324 |             self.queue
325 |                 .enqueue_write_buffer(&mut buffer.buffer, CL_BLOCKING, 0, bytes, &[])?;
326 |         }
327 |         Ok(())
328 |     }
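A hedged round-trip sketch of the buffer API above (illustrative only): move host data to the GPU, then read it back into a host slice. In real use a kernel launch would sit between the two transfers.

```rust
use rust_gpu_tools::opencl::Program;
use rust_gpu_tools::GPUError;

fn round_trip(program: &Program) -> Result<Vec<u32>, GPUError> {
    let input = vec![1u32, 2, 3, 4];
    // Host -> GPU: allocate and initialize in one step.
    let mut buffer = program.create_buffer_from_slice(&input)?;
    // Overwriting is also possible; the data must fit into the buffer.
    program.write_from_buffer(&mut buffer, &input)?;
    // GPU -> host: the target slice must not exceed the buffer's length.
    let mut output = vec![0u32; input.len()];
    program.read_into_buffer(&buffer, &mut output)?;
    Ok(output)
}
```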
329 | 
330 |     /// Reads data from the GPU into an existing buffer.
331 |     pub fn read_into_buffer<T>(&self, buffer: &Buffer<T>, data: &mut [T]) -> GPUResult<()> {
332 |         assert!(data.len() <= buffer.length, "Buffer is too small");
333 | 
334 |         // It is safe as long as the sizes match.
335 |         let bytes = unsafe {
336 |             std::slice::from_raw_parts_mut(data.as_mut_ptr() as *mut u8, mem::size_of_val(data))
337 |         };
338 |         unsafe {
339 |             self.queue
340 |                 .enqueue_read_buffer(&buffer.buffer, CL_BLOCKING, 0, bytes, &[])?;
341 |         };
342 |         Ok(())
343 |     }
344 | 
345 |     /// Run some code in the context of the program.
346 |     ///
347 |     /// It takes the program as a parameter, so that we can use the same function body for both
348 |     /// the OpenCL and the CUDA code path. The only difference is the type of the program.
349 |     pub fn run<F, R, E, A>(&self, fun: F, arg: A) -> Result<R, E>
350 |     where
351 |         F: FnOnce(&Self, A) -> Result<R, E>,
352 |         E: From<GPUError>,
353 |     {
354 |         fun(self, arg)
355 |     }
356 | }
357 | 
358 | /// Abstraction for kernel arguments.
359 | ///
360 | /// The kernel doesn't support being called with custom types, hence some conversion might be
361 | /// needed. This trait enables automatic conversions, so that any type implementing it can be
362 | /// passed into a [`Kernel`].
363 | pub trait KernelArgument {
364 |     /// Apply the kernel argument to the kernel.
365 |     fn push(&self, kernel: &mut Kernel);
366 | }
367 | 
368 | impl<T> KernelArgument for Buffer<T> {
369 |     fn push(&self, kernel: &mut Kernel) {
370 |         unsafe {
371 |             kernel.builder.set_arg(&self.buffer);
372 |         }
373 |     }
374 | }
375 | 
376 | impl KernelArgument for i32 {
377 |     fn push(&self, kernel: &mut Kernel) {
378 |         unsafe {
379 |             kernel.builder.set_arg(self);
380 |         }
381 |     }
382 | }
383 | 
384 | impl KernelArgument for u32 {
385 |     fn push(&self, kernel: &mut Kernel) {
386 |         unsafe {
387 |             kernel.builder.set_arg(self);
388 |         }
389 |     }
390 | }
391 | 
392 | impl<T> KernelArgument for LocalBuffer<T> {
393 |     fn push(&self, kernel: &mut Kernel) {
394 |         unsafe {
395 |             kernel
396 |                 .builder
397 |                 .set_arg_local_buffer(self.length * std::mem::size_of::<T>());
398 |         }
399 |         kernel.num_local_buffers += 1;
400 |     }
401 | }
402 | 
403 | /// A kernel that can be executed.
404 | #[derive(Debug)]
405 | pub struct Kernel<'a> {
406 |     /// The underlying kernel builder.
407 |     pub builder: ExecuteKernel<'a>,
408 |     queue: &'a CommandQueue,
409 |     /// There can only be a single [`LocalBuffer`] as parameter due to CUDA restrictions. This
410 |     /// counts them, so that there can be an error if there are more `LocalBuffer` arguments.
411 |     num_local_buffers: u8,
412 | }
413 | 
414 | impl<'a> Kernel<'a> {
415 |     /// Set a kernel argument.
416 |     ///
417 |     /// The arguments must live as long as the kernel. Hence make sure they are not dropped as
418 |     /// long as the kernel is in use.
419 |     ///
420 |     /// Example where this behaviour is enforced and leads to a compile-time error:
421 |     ///
422 |     /// ```compile_fail
423 |     /// use rust_gpu_tools::opencl::Program;
424 |     ///
425 |     /// fn would_break(program: &Program) {
426 |     ///     let data = vec![1, 2, 3, 4];
427 |     ///     let buffer = program.create_buffer_from_slice(&data).unwrap();
428 |     ///     let kernel = program.create_kernel("my_kernel", 4, 256).unwrap();
429 |     ///     let kernel = kernel.arg(&buffer);
430 |     ///     // This drop wouldn't error if the arguments weren't bound to the kernel's lifetime.
431 |     ///     drop(buffer);
432 |     ///     kernel.run().unwrap();
433 |     /// }
434 |     /// ```
435 |     pub fn arg<T: KernelArgument>(mut self, t: &'a T) -> Self {
436 |         t.push(&mut self);
437 |         self
438 |     }
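Putting `create_kernel`, `arg`, and `run` together: a hedged launch sketch (illustrative; it assumes the program contains a hypothetical kernel `double_values(__global uint *buf, uint n)`). Arguments are pushed in the order the kernel declares them.

```rust
use rust_gpu_tools::opencl::Program;
use rust_gpu_tools::GPUError;

fn launch_double(program: &Program) -> Result<Vec<u32>, GPUError> {
    let input = vec![1u32, 2, 3, 4];
    let buffer = program.create_buffer_from_slice(&input)?;
    let n = input.len() as u32;
    // One work group of four threads.
    let kernel = program.create_kernel("double_values", 1, 4)?;
    kernel.arg(&buffer).arg(&n).run()?;
    let mut output = vec![0u32; input.len()];
    program.read_into_buffer(&buffer, &mut output)?;
    Ok(output)
}
```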
439 | 
440 |     /// Actually run the kernel.
441 |     pub fn run(mut self) -> GPUResult<()> {
442 |         if self.num_local_buffers > 1 {
443 |             return Err(GPUError::Generic(
444 |                 "There cannot be more than one `LocalBuffer`.".to_string(),
445 |             ));
446 |         }
447 |         unsafe {
448 |             self.builder.enqueue_nd_range(self.queue)?;
449 |         }
450 |         Ok(())
451 |     }
452 | }
453 | 
--------------------------------------------------------------------------------
/src/opencl/utils.rs:
--------------------------------------------------------------------------------
1 | use std::convert::TryFrom;
2 | 
3 | use log::{debug, warn};
4 | use opencl3::device::CL_UUID_SIZE_KHR;
5 | use sha2::{Digest, Sha256};
6 | 
7 | use crate::device::{DeviceUuid, PciId, Vendor};
8 | use crate::error::{GPUError, GPUResult};
9 | use crate::opencl::Device;
10 | 
11 | /// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID.
12 | ///
13 | /// It is the first two identifiers of e.g. `lspci`:
14 | ///
15 | /// ```text
16 | /// 4e:00.0 VGA compatible controller
17 | /// ││ └└-- Device ID
18 | /// └└----- Bus ID
19 | /// ```
20 | fn get_pci_id(device: &opencl3::device::Device) -> GPUResult<PciId> {
21 |     let vendor = Vendor::try_from(device.vendor_id()?)?;
22 |     let id = match vendor {
23 |         Vendor::Amd => {
24 |             let topo = device.topology_amd()?;
25 |             let bus_id = topo.bus as u16;
26 |             let device_id = topo.device as u16;
27 |             (bus_id << 8) | device_id
28 |         }
29 |         Vendor::Intel => {
30 |             let pcibusinfo = device.pcibusinfokhr_intel()?;
31 |             let bus_id = pcibusinfo.pci_bus as u16;
32 |             let device_id = pcibusinfo.pci_device as u16;
33 |             (bus_id << 8) | device_id
34 |         }
35 |         Vendor::Nvidia => {
36 |             let bus_id = device.pci_bus_id_nv()? as u16;
37 |             let device_id = device.pci_slot_id_nv()? as u16;
38 |             (bus_id << 8) | device_id
39 |         }
40 |     };
41 |     Ok(id.into())
42 | }
43 | 
44 | fn get_uuid(device: &opencl3::device::Device) -> GPUResult<DeviceUuid> {
45 |     let uuid = device.uuid_khr()?;
46 |     Ok(uuid.into())
47 | }
48 | 
49 | pub fn cache_path(device: &Device, cl_source: &str) -> std::io::Result<std::path::PathBuf> {
50 |     let path = home::home_dir().unwrap().join(".rust-gpu-tools");
51 |     if !std::path::Path::exists(&path) {
52 |         std::fs::create_dir(&path)?;
53 |     }
54 |     let mut hasher = Sha256::new();
55 |     hasher.update(device.name.as_bytes());
56 |     hasher.update(u16::from(device.pci_id).to_be_bytes());
57 |     hasher.update(<[u8; CL_UUID_SIZE_KHR]>::from(
58 |         device.uuid.unwrap_or_default(),
59 |     ));
60 |     hasher.update(cl_source.as_bytes());
61 |     let filename = format!("{}.bin", hex::encode(hasher.finalize()));
62 |     Ok(path.join(filename))
63 | }
64 | 
65 | fn get_memory(d: &opencl3::device::Device) -> GPUResult<u64> {
66 |     d.global_mem_size()
67 |         .map_err(GPUError::DeviceInfoNotAvailable)
68 | }
69 | 
70 | fn get_compute_units(d: &opencl3::device::Device) -> GPUResult<u32> {
71 |     d.max_compute_units()
72 |         .map_err(GPUError::DeviceInfoNotAvailable)
73 | }
74 | 
75 | /// Get the major and minor version of the compute capability (only available on Nvidia GPUs).
76 | fn get_compute_capability(d: &opencl3::device::Device) -> GPUResult<(u32, u32)> {
77 |     let major = d
78 |         .compute_capability_major_nv()
79 |         .map_err(GPUError::DeviceInfoNotAvailable)?;
80 |     let minor = d
81 |         .compute_capability_minor_nv()
82 |         .map_err(GPUError::DeviceInfoNotAvailable)?;
83 |     Ok((major, minor))
84 | }
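The bus/device packing used by `get_pci_id` above is a plain 16-bit concatenation. A small editorial illustration, mirroring the `lspci` example `4e:00.0`:

```rust
fn pack_pci_id(bus_id: u16, device_id: u16) -> u16 {
    // Bus ID in the high byte, Device ID in the low byte.
    (bus_id << 8) | device_id
}

fn main() {
    // Mirrors the `lspci` example above: bus 0x4e, device 0x00.
    assert_eq!(pack_pci_id(0x4e, 0x00), 0x4e00);
    // And the "01:00" <-> 0x0100 round-trip from the device.rs tests.
    assert_eq!(pack_pci_id(0x01, 0x00), 0x0100);
}
```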
85 | 
86 | /// Get a list of all available and supported devices.
87 | ///
88 | /// If there is a failure retrieving a device, it won't lead to a hard error, but an error will be
89 | /// logged and the corresponding device won't be available.
90 | pub(crate) fn build_device_list() -> Vec<Device> {
91 |     let mut all_devices = Vec::new();
92 |     let platforms: Vec<_> = opencl3::platform::get_platforms().unwrap_or_default();
93 | 
94 |     let mut devices_without_pci_id = Vec::new();
95 | 
96 |     for platform in platforms.iter() {
97 |         let devices = platform
98 |             .get_devices(opencl3::device::CL_DEVICE_TYPE_GPU)
99 |             .map_err(Into::into)
100 |             .and_then(|devices| {
101 |                 devices
102 |                     .into_iter()
103 |                     .map(opencl3::device::Device::new)
104 |                     .filter_map(|device| {
105 |                         if let Ok(vendor_id) = device.vendor_id() {
106 |                             // Only use devices from the accepted vendors ...
107 |                             let vendor = Vendor::try_from(vendor_id).ok()?;
108 |                             // ... which are available.
109 |                             if !device.available().unwrap_or(false) {
110 |                                 return None;
111 |                             }
112 | 
113 |                             // `filter_map()` needs to return errors wrapped in an `Option`, hence
114 |                             // early returns with the question mark operator cannot be used.
115 |                             let name = match device.name() {
116 |                                 Ok(name) => name,
117 |                                 Err(error) => return Some(Err(error.into())),
118 |                             };
119 |                             let memory = match get_memory(&device) {
120 |                                 Ok(memory) => memory,
121 |                                 Err(error) => return Some(Err(error)),
122 |                             };
123 |                             let compute_units = match get_compute_units(&device) {
124 |                                 Ok(units) => units,
125 |                                 Err(error) => return Some(Err(error)),
126 |                             };
127 |                             let compute_capability = get_compute_capability(&device).ok();
128 |                             let uuid = get_uuid(&device).ok();
129 | 
130 |                             // If a device doesn't have a PCI-ID, add it later to the list of
131 |                             // devices with a fake PCI-ID.
132 |                             match get_pci_id(&device) {
133 |                                 Ok(pci_id) => {
134 |                                     return Some(Ok(Device {
135 |                                         vendor,
136 |                                         name,
137 |                                         memory,
138 |                                         compute_units,
139 |                                         compute_capability,
140 |                                         pci_id,
141 |                                         uuid,
142 |                                         device,
143 |                                     }));
144 |                                 }
145 |                                 Err(_) => {
146 |                                     // Use a temporary PCI-ID and replace it later with a
147 |                                     // non-colliding one.
148 |                                     let pci_id = PciId::from(0);
149 |                                     devices_without_pci_id.push(Device {
150 |                                         vendor,
151 |                                         name,
152 |                                         memory,
153 |                                         compute_units,
154 |                                         compute_capability,
155 |                                         pci_id,
156 |                                         uuid,
157 |                                         device,
158 |                                     });
159 |                                     return None;
160 |                                 }
161 |                             };
162 |                         }
163 |                         None
164 |                     })
165 |                     .collect::<GPUResult<Vec<_>>>()
166 |             });
167 |         match devices {
168 |             Ok(mut devices) => {
169 |                 all_devices.append(&mut devices);
170 |             }
171 |             Err(err) => {
172 |                 let platform_name = platform
173 |                     .name()
174 |                     .unwrap_or_else(|_| "<unknown>".to_string());
175 |                 warn!(
176 |                     "Unable to retrieve devices for {}: {:?}",
177 |                     platform_name, err
178 |                 );
179 |             }
180 |         }
181 |     }
182 | 
183 |     // Laptops might have an integrated GPU. Such devices might have neither a PCI-ID, nor a UUID.
184 |     // As those devices are used for development and not for production use, it's good enough to
185 |     // provide a workaround which doesn't add much complexity to the code. We use a fake PCI-ID
186 |     // instead, which is generated by enumerating the available devices. In order to make that
187 |     // case easier to spot when debugging issues, a starting number which is pleasant to the human
188 |     // eye was chosen, one that works in both decimal and hexadecimal (4660 == 0x1234).
189 |     let mut enumerated_device: u16 = 4660;
190 |     for mut device in devices_without_pci_id.into_iter() {
191 |         // Make sure that no device has that actual PCI-ID
192 |         while all_devices
193 |             .iter()
194 |             .any(|d| d.pci_id() == enumerated_device.into())
195 |         {
196 |             enumerated_device += 1;
197 |         }
198 |         device.pci_id = PciId::from(enumerated_device);
199 |         enumerated_device += 1;
200 |         all_devices.push(device);
201 |     }
202 | 
203 |     debug!("loaded devices: {:?}", all_devices);
204 |     all_devices
205 | }
--------------------------------------------------------------------------------
/src/program.rs:
--------------------------------------------------------------------------------
1 | #[cfg(feature = "cuda")]
2 | use crate::cuda;
3 | use crate::error::GPUError;
4 | #[cfg(feature = "opencl")]
5 | use crate::opencl;
6 | 
7 | /// Abstraction for running programs on CUDA or OpenCL.
8 | pub enum Program {
9 |     /// CUDA program.
10 |     #[cfg(feature = "cuda")]
11 |     Cuda(cuda::Program),
12 |     /// OpenCL program.
13 |     #[cfg(feature = "opencl")]
14 |     Opencl(opencl::Program),
15 | }
16 | 
17 | impl Program {
18 |     /// Run some code in the context of the program.
19 |     ///
20 |     /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
21 |     /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
22 |     /// same code can be used to run on either of them. The only difference is the type of the
23 |     /// `Program`.
24 |     ///
25 |     /// You need to pass in two closures, one for OpenCL, one for CUDA, both get their
26 |     /// corresponding program type as parameter. For convenience there is the
27 |     /// [`crate::program_closures`] macro defined, which can help reduce code duplication by
28 |     /// creating two closures out of a single one.
29 |     ///
30 |     /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
31 |     /// one of them is disabled, you still need to pass in two closures. This way the API stays
32 |     /// the same, but you can disable things at compile-time.
33 |     ///
34 |     /// The second parameter is a single arbitrary argument, which will be passed on into the
35 |     /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
36 |     /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
37 |     /// is used at most once.
38 |     #[cfg(all(feature = "cuda", feature = "opencl"))]
39 |     pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
40 |     where
41 |         E: From<GPUError>,
42 |         F1: FnOnce(&cuda::Program, A) -> Result<R, E>,
43 |         F2: FnOnce(&opencl::Program, A) -> Result<R, E>,
44 |     {
45 |         match self {
46 |             Self::Cuda(program) => program.run(fun.0, arg),
47 |             Self::Opencl(program) => program.run(fun.1, arg),
48 |         }
49 |     }
50 | 
51 |     /// Run some code in the context of the program.
52 |     ///
53 |     /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
54 |     /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
55 |     /// same code can be used to run on either of them. The only difference is the type of the
56 |     /// `Program`.
57 |     ///
58 |     /// You need to pass in two closures, one for OpenCL, one for CUDA, both get their
59 |     /// corresponding program type as parameter. For convenience there is the [`program_closures`]
60 |     /// macro defined, which can help reduce code duplication by creating two closures out of
61 |     /// a single one.
62 |     ///
63 |     /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
64 |     /// one of them is disabled, you still need to pass in two closures. This way the API stays
65 |     /// the same, but you can disable things at compile-time.
66 |     ///
67 |     /// The second parameter is a single arbitrary argument, which will be passed on into the
68 |     /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
69 |     /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
70 |     /// is used at most once.
71 |     #[cfg(all(feature = "cuda", not(feature = "opencl")))]
72 |     pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
73 |     where
74 |         E: From<GPUError>,
75 |         F1: FnOnce(&cuda::Program, A) -> Result<R, E>,
76 |     {
77 |         match self {
78 |             Self::Cuda(program) => program.run(fun.0, arg),
79 |         }
80 |     }
81 | 
82 |     /// Run some code in the context of the program.
83 |     ///
84 |     /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
85 |     /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
86 |     /// same code can be used to run on either of them. The only difference is the type of the
87 |     /// `Program`.
88 |     ///
89 |     /// You need to pass in two closures, one for OpenCL, one for CUDA, both get their
90 |     /// corresponding program type as parameter. For convenience there is the [`program_closures`]
91 |     /// macro defined, which can help reduce code duplication by creating two closures out of
92 |     /// a single one.
93 |     ///
94 |     /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
95 |     /// one of them is disabled, you still need to pass in two closures. This way the API stays
96 |     /// the same, but you can disable things at compile-time.
97 |     ///
98 |     /// The second parameter is a single arbitrary argument, which will be passed on into the
99 |     /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
100 |     /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
101 |     /// is used at most once.
102 |     #[cfg(all(not(feature = "cuda"), feature = "opencl"))]
103 |     pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
104 |     where
105 |         E: From<GPUError>,
106 |         F2: FnOnce(&opencl::Program, A) -> Result<R, E>,
107 |     {
108 |         match self {
109 |             Self::Opencl(program) => program.run(fun.1, arg),
110 |         }
111 |     }
112 | 
113 |     /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
114 |     pub fn device_name(&self) -> &str {
115 |         match self {
116 |             #[cfg(feature = "cuda")]
117 |             Self::Cuda(program) => program.device_name(),
118 |             #[cfg(feature = "opencl")]
119 |             Self::Opencl(program) => program.device_name(),
120 |         }
121 |     }
122 | }
123 | 
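A hedged end-to-end sketch of the dispatch above, using the `program_closures!` macro documented next (illustrative; it assumes a `Program` was constructed beforehand). The single closure body is expanded into a CUDA variant and an OpenCL variant, and `run` invokes the one matching the program:

```rust
use rust_gpu_tools::{program_closures, GPUError, Program};

fn describe_run(program: &Program, label: &str) -> Result<String, GPUError> {
    // One closure body, expanded into a (CUDA, OpenCL) closure pair.
    let closures = program_closures!(|program, label: &str| -> Result<String, GPUError> {
        Ok(format!("{}: ran on {}", label, program.device_name()))
    });
    program.run(closures, label)
}
```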
124 | 
125 | /// Creates two closures, one for CUDA and one for OpenCL, from the given one.
126 | ///
127 | /// This macro is used to be able to interact with rust-gpu-tools with unified code for both
128 | /// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program` and
129 | /// it will be mapped to &[`cuda::Program`] and &[`opencl::Program`].
130 | ///
131 | /// The second parameter is a single arbitrary argument, which will be passed on into the closure.
132 | /// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
133 | /// shared between closures, hence we pass it on, so that the compiler knows that it is used at
134 | /// most once.
135 | ///
136 | /// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
137 | /// thing and not specify one of them if it is appropriate.
138 | ///
139 | /// ### Example
140 | ///
141 | /// ```
142 | /// use rust_gpu_tools::{cuda, opencl, program_closures};
143 | ///
144 | /// let closures = program_closures!(|program, arg: u8| -> bool {
145 | ///     true
146 | /// });
147 | ///
148 | /// // Generates
149 | /// let closures = (
150 | ///     |program: &cuda::Program, arg: u8| { true },
151 | ///     |program: &opencl::Program, arg: u8| { true },
152 | /// );
153 | ///
154 | /// // If e.g. the `cuda` feature is disabled, it would generate
155 | /// let closures_without_cuda = (
156 | ///     (),
157 | ///     |program: &opencl::Program, arg: u8| { true },
158 | /// );
159 | /// ```
160 | #[cfg(all(feature = "cuda", feature = "opencl"))]
161 | #[macro_export]
162 | macro_rules! program_closures {
163 |     // Additional argument without a type
164 |     (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
165 |         (
166 |             |$program: &$crate::cuda::Program, $arg| -> $ret { $body },
167 |             |$program: &$crate::opencl::Program, $arg| -> $ret { $body },
168 |         )
169 |     };
170 |     // Additional argument with a type
171 |     (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
172 |         (
173 |             |$program: &$crate::cuda::Program, $arg: $arg_type| -> $ret { $body },
174 |             |$program: &$crate::opencl::Program, $arg: $arg_type| -> $ret { $body },
175 |         )
176 |     };
177 | }
178 | 
179 | /// Creates two closures, one for CUDA and one for OpenCL, from the given one.
180 | ///
181 | /// This macro is used to be able to interact with rust-gpu-tools with unified code for both
182 | /// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program` and
183 | /// it will be mapped to &[`cuda::Program`] and &[`opencl::Program`].
184 | ///
185 | /// The second parameter is a single arbitrary argument, which will be passed on into the closure.
186 | /// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
187 | /// shared between closures, hence we pass it on, so that the compiler knows that it is used at
188 | /// most once.
189 | ///
190 | /// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
191 | /// thing and not specify one of them if it is appropriate.
192 | ///
193 | /// ### Example
194 | ///
195 | /// ```
196 | /// use rust_gpu_tools::{cuda, opencl, program_closures};
197 | ///
198 | /// let closures = program_closures!(|program, arg: u8| -> bool {
199 | ///     true
200 | /// });
201 | ///
202 | /// // Generates
203 | /// let closures = (
204 | ///     |program: &cuda::Program, arg: u8| { true },
205 | ///     |program: &opencl::Program, arg: u8| { true },
206 | /// );
207 | ///
208 | /// // If e.g. the `cuda` feature is disabled, it would generate
209 | /// let closures_without_cuda = (
210 | ///     (),
211 | ///     |program: &opencl::Program, arg: u8| { true },
212 | /// );
213 | /// ```
214 | #[macro_export]
215 | #[cfg(all(feature = "cuda", not(feature = "opencl")))]
216 | macro_rules! program_closures {
217 |     // Additional argument without a type
218 |     (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
219 |         (
220 |             |$program: &$crate::cuda::Program, $arg| -> $ret { $body },
221 |             (),
222 |         )
223 |     };
224 |     // Additional argument with a type
225 |     (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
226 |         (
227 |             |$program: &$crate::cuda::Program, $arg: $arg_type| -> $ret { $body },
228 |             (),
229 |         )
230 |     };
231 | }
232 | 
233 | /// Creates two closures, one for CUDA and one for OpenCL, from the given one.
234 | ///
235 | /// This macro is used to be able to interact with rust-gpu-tools with unified code for both
236 | /// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program` and
237 | /// it will be mapped to &[`cuda::Program`] and &[`opencl::Program`].
238 | ///
239 | /// The second parameter is a single arbitrary argument, which will be passed on into the closure.
240 | /// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
241 | /// shared between closures, hence we pass it on, so that the compiler knows that it is used at
242 | /// most once.
243 | ///
244 | /// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
245 | /// thing and not specify one of them if it is appropriate.
246 | ///
247 | /// ### Example
248 | ///
249 | /// ```
250 | /// use rust_gpu_tools::{cuda, opencl, program_closures};
251 | ///
252 | /// let closures = program_closures!(|program, arg: u8| -> bool {
253 | ///     true
254 | /// });
255 | ///
256 | /// // Generates
257 | /// let closures = (
258 | ///     |program: &cuda::Program, arg: u8| { true },
259 | ///     |program: &opencl::Program, arg: u8| { true },
260 | /// );
261 | ///
262 | /// // If e.g. the `cuda` feature is disabled, it would generate
263 | /// let closures_without_cuda = (
264 | ///     (),
265 | ///     |program: &opencl::Program, arg: u8| { true },
266 | /// );
267 | /// ```
268 | #[macro_export]
269 | #[cfg(all(not(feature = "cuda"), feature = "opencl"))]
270 | macro_rules! program_closures {
271 |     // Additional argument without a type
272 |     (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
273 |         ((), |$program: &$crate::opencl::Program, $arg| -> $ret {
274 |             $body
275 |         })
276 |     };
277 |     // Additional argument with a type
278 |     (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
279 |         (
280 |             (),
281 |             |$program: &$crate::opencl::Program, $arg: $arg_type| -> $ret { $body },
282 |         )
283 |     };
284 | }
--------------------------------------------------------------------------------