├── .github └── workflows │ └── ci.yml ├── .gitignore ├── COPYRIGHT ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── examples ├── add.cl ├── add.fatbin └── add.rs ├── rust-toolchain └── src ├── cuda ├── mod.rs └── utils.rs ├── device.rs ├── error.rs ├── lib.rs ├── opencl ├── error.rs ├── mod.rs └── utils.rs └── program.rs /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [pull_request, push] 4 | 5 | # Cancel a job if there's a new one started on the same branch. 6 | # Based on https://stackoverflow.com/questions/58895283/stop-already-running-workflow-job-in-github-actions/67223051#67223051 7 | concurrency: 8 | group: ${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | env: 12 | CARGO_INCREMENTAL: 0 13 | RUST_BACKTRACE: 1 14 | # Faster crates.io index checkout. 15 | CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse 16 | RUST_LOG: debug 17 | 18 | jobs: 19 | check_clippy: 20 | runs-on: ubuntu-24.04 21 | name: Clippy 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Run cargo clippy 25 | run: cargo clippy --all-targets --workspace --all-features -- -D warnings 26 | 27 | check_fmt: 28 | runs-on: ubuntu-24.04 29 | name: Checking fmt 30 | steps: 31 | - uses: actions/checkout@v4 32 | - name: Run cargo fmt 33 | run: cargo fmt --all -- --check 34 | 35 | rustdoc: 36 | runs-on: ubuntu-24.04 37 | name: Rustdoc 38 | steps: 39 | - uses: actions/checkout@v4 40 | - name: Run rustdoc 41 | run: cargo rustdoc --all-features -- -D warnings 42 | 43 | build: 44 | runs-on: ubuntu-24.04 45 | name: Release build 46 | steps: 47 | - uses: actions/checkout@v4 48 | - name: Run cargo release build 49 | run: cargo build --release 50 | 51 | # Enable these tests once there's a runner with a GPU. 52 | #test_gpu: 53 | # runs-on: ubuntu-24.04 54 | # name: Test 55 | # steps: 56 | # - uses: actions/checkout@v4 57 | # - name: Install required packages 58 | # run: sudo apt install --no-install-recommends --yes libhwloc-dev nvidia-cuda-toolkit ocl-icd-opencl-dev 59 | # - name: Run tests 60 | # run: cargo test 61 | # - name: Run `add` example 62 | # run: cargo run --example add 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyrights in the "rust-gpu-tools" library are retained by their contributors. No 2 | copyright assignment is required to contribute to the "rust-gpu-tools" library. 3 | 4 | The "rust-gpu-tools" library is licensed under either of 5 | 6 | * Apache License, Version 2.0, (see ./LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0) 7 | * MIT license (see ./LICENSE-MIT or http://opensource.org/licenses/MIT) 8 | 9 | at your option. 10 | 11 | Unless you explicitly state otherwise, any contribution intentionally 12 | submitted for inclusion in the work by you, as defined in the Apache-2.0 13 | license, shall be dual licensed as above, without any additional terms or 14 | conditions.
15 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rust-gpu-tools" 3 | version = "0.7.2" 4 | authors = ["Keyvan Kambakhsh ", "porcuquine "] 5 | description = "Rust OpenCL tools" 6 | edition = "2021" 7 | homepage = "https://github.com/filecoin-project/rust-gpu-tools" 8 | license = "MIT/Apache-2.0" 9 | repository = "https://github.com/filecoin-project/rust-gpu-tools" 10 | rust-version = "1.81.0" 11 | 12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 13 | [features] 14 | default = ["opencl", "cuda"] 15 | opencl = ["opencl3"] 16 | cuda = ["rustacuda"] 17 | 18 | [dependencies] 19 | home = "0.5" 20 | sha2 = "0.10" 21 | thiserror = "2.0.12" 22 | log = "0.4.26" 23 | hex = "0.4.3" 24 | 25 | opencl3 = { version = "0.11.0", default-features = false, features = ["CL_VERSION_1_2"], optional = true } 26 | rustacuda = { package = "fil-rustacuda", version = "0.1.3", optional = true } 27 | once_cell = "1.8.0" 28 | temp-env = "0.3.3" 29 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rust-gpu-tools [![Crates.io](https://img.shields.io/crates/v/rust-gpu-tools.svg)](https://crates.io/crates/rust-gpu-tools) 2 | 3 | An abstraction library to run kernels on both CUDA and OpenCL. 4 | 5 | ## Example 6 | 7 | You need to write the code that interacts with the GPU only once. Below is such code that runs a 8 | kernel on CUDA and/or OpenCL. For a full working example, please see the [`examples`](examples) 9 | directory. You can run it via `cargo run --example add`. 10 | 11 | ```rust 12 | let closures = program_closures!(|program, _args| -> Result<Vec<u32>, GPUError> { 13 | // Make sure the input data has the same length. 14 | assert_eq!(aa.len(), bb.len()); 15 | let length = aa.len(); 16 | 17 | // Copy the data to the GPU. 18 | let aa_buffer = program.create_buffer_from_slice(&aa)?; 19 | let bb_buffer = program.create_buffer_from_slice(&bb)?; 20 | 21 | // The result buffer has the same length as the input buffers. 22 | let result_buffer = unsafe { program.create_buffer::<u32>(length)? }; 23 | 24 | // Get the kernel. 25 | let kernel = program.create_kernel("add", 8, 4)?; 26 | 27 | // Execute the kernel. 28 | kernel 29 | .arg(&(length as u32)) 30 | .arg(&aa_buffer) 31 | .arg(&bb_buffer) 32 | .arg(&result_buffer) 33 | .run()?; 34 | 35 | // Get the resulting data. 36 | let mut result = vec![0u32; length]; 37 | program.read_into_buffer(&result_buffer, &mut result)?; 38 | 39 | Ok(result) 40 | }); 41 | ``` 42 | 43 | 44 | ## License 45 | 46 | Licensed under either of 47 | 48 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or 49 | http://www.apache.org/licenses/LICENSE-2.0) 50 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 51 | 52 | at your option. 53 | 54 | ### Contribution 55 | 56 | Unless you explicitly state otherwise, any contribution intentionally 57 | submitted for inclusion in the work by you, as defined in the Apache-2.0 58 | license, shall be dual licensed as above, without any additional terms or 59 | conditions.
60 | -------------------------------------------------------------------------------- /examples/add.cl: -------------------------------------------------------------------------------- 1 | // CUDA 2 | #ifdef __CUDACC__ 3 | #define GLOBAL 4 | #define KERNEL extern "C" __global__ 5 | // OpenCL 6 | #else 7 | #define GLOBAL __global 8 | #define KERNEL __kernel 9 | #endif 10 | 11 | KERNEL void add(uint num, GLOBAL uint *a, GLOBAL uint *b, GLOBAL uint *result) { 12 | for (uint i = 0; i < num; i++) { 13 | result[i] = a[i] + b[i]; 14 | } 15 | } -------------------------------------------------------------------------------- /examples/add.fatbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/filecoin-project/rust-gpu-tools/ccb3c7ee4b5944ddf6427c488dc82cc20c9626ed/examples/add.fatbin -------------------------------------------------------------------------------- /examples/add.rs: -------------------------------------------------------------------------------- 1 | use rust_gpu_tools::{cuda, opencl, program_closures, Device, GPUError, Program, Vendor}; 2 | 3 | /// Returns a `Program` that runs on CUDA. 4 | fn cuda(device: &Device) -> Program { 5 | // The kernel was compiled with: 6 | // nvcc -fatbin -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_75,code=compute_75 --x cu add.cl 7 | let cuda_kernel = include_bytes!("./add.fatbin"); 8 | let cuda_device = device.cuda_device().unwrap(); 9 | let cuda_program = cuda::Program::from_bytes(cuda_device, cuda_kernel).unwrap(); 10 | Program::Cuda(cuda_program) 11 | } 12 | 13 | /// Returns a `Program` that runs on OpenCL. 14 | fn opencl(device: &Device) -> Program { 15 | let opencl_kernel = include_str!("./add.cl"); 16 | let opencl_device = device.opencl_device().unwrap(); 17 | let opencl_program = opencl::Program::from_opencl(opencl_device, opencl_kernel).unwrap(); 18 | Program::Opencl(opencl_program) 19 | } 20 | 21 | pub fn main() { 22 | // Define some data that should be operated on. 23 | let aa: Vec<u32> = vec![1, 2, 3, 4]; 24 | let bb: Vec<u32> = vec![5, 6, 7, 8]; 25 | 26 | // This is the core. Here we write the interaction with the GPU independent of whether it is 27 | // CUDA or OpenCL. 28 | let closures = program_closures!(|program, _args| -> Result<Vec<u32>, GPUError> { 29 | // Make sure the input data has the same length. 30 | assert_eq!(aa.len(), bb.len()); 31 | let length = aa.len(); 32 | 33 | // Copy the data to the GPU. 34 | let aa_buffer = program.create_buffer_from_slice(&aa)?; 35 | let bb_buffer = program.create_buffer_from_slice(&bb)?; 36 | 37 | // The result buffer has the same length as the input buffers. 38 | let result_buffer = unsafe { program.create_buffer::<u32>(length)? }; 39 | 40 | // Get the kernel. 41 | let kernel = program.create_kernel("add", 1, 1)?; 42 | 43 | // Execute the kernel. 44 | kernel 45 | .arg(&(length as u32)) 46 | .arg(&aa_buffer) 47 | .arg(&bb_buffer) 48 | .arg(&result_buffer) 49 | .run()?; 50 | 51 | // Get the resulting data.
52 | let mut result = vec![0u32; length]; 53 | program.read_into_buffer(&result_buffer, &mut result)?; 54 | 55 | Ok(result) 56 | }); 57 | 58 | // First we run it on CUDA if available 59 | let nv_dev_list = Device::by_vendor(Vendor::Nvidia); 60 | if !nv_dev_list.is_empty() { 61 | // Test NVIDIA CUDA Flow 62 | let cuda_program = cuda(nv_dev_list[0]); 63 | let cuda_result = cuda_program.run(closures, ()).unwrap(); 64 | assert_eq!(cuda_result, [6, 8, 10, 12]); 65 | println!("CUDA result: {:?}", cuda_result); 66 | 67 | // Test NVIDIA OpenCL Flow 68 | let opencl_program = opencl(nv_dev_list[0]); 69 | let opencl_result = opencl_program.run(closures, ()).unwrap(); 70 | assert_eq!(opencl_result, [6, 8, 10, 12]); 71 | println!("OpenCL Nvidia result: {:?}", opencl_result); 72 | } 73 | 74 | // Then we run it on Intel OpenCL if available 75 | let intel_dev_list = Device::by_vendor(Vendor::Intel); 76 | if !intel_dev_list.is_empty() { 77 | let opencl_program = opencl(intel_dev_list[0]); 78 | let opencl_result = opencl_program.run(closures, ()).unwrap(); 79 | assert_eq!(opencl_result, [6, 8, 10, 12]); 80 | println!("OpenCL Intel result: {:?}", opencl_result); 81 | } 82 | 83 | let amd_dev_list = Device::by_vendor(Vendor::Amd); 84 | if !amd_dev_list.is_empty() { 85 | let opencl_program = opencl(amd_dev_list[0]); 86 | let opencl_result = opencl_program.run(closures, ()).unwrap(); 87 | assert_eq!(opencl_result, [6, 8, 10, 12]); 88 | println!("OpenCL Amd result: {:?}", opencl_result); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | 1.81.0 2 | -------------------------------------------------------------------------------- /src/cuda/mod.rs: -------------------------------------------------------------------------------- 1 | //! The CUDA specific implementation of a [`Buffer`], [`Device`], [`Program`] and [`Kernel`]. 2 | //! 3 | //! The current operation mode is synchronous, in order to have higher safety guarantees. All 4 | //! operations happen on a single stream, which is synchronized after each operation. This is a 5 | //! similar behaviour to CUDA's default stream. The default stream isn't used for two reasons: 6 | //! 7 | //! 1. RustaCUDA doesn't expose a higher level function to launch a kernel on the default stream 8 | //! 2. There was a bug when the default stream was used implicitly via RustaCUDA's synchronous 9 | //! copy methods. To prevent such kinds of bugs, be explicit about which stream is used. 10 | 11 | pub(crate) mod utils; 12 | 13 | use std::convert::TryFrom; 14 | use std::ffi::{c_void, CStr, CString}; 15 | use std::fmt; 16 | use std::hash::{Hash, Hasher}; 17 | use std::mem; 18 | 19 | use log::debug; 20 | use rustacuda::memory::{AsyncCopyDestination, DeviceBuffer}; 21 | use rustacuda::stream::{Stream, StreamFlags}; 22 | 23 | use crate::device::{DeviceUuid, PciId, Vendor}; 24 | use crate::error::{GPUError, GPUResult}; 25 | use crate::LocalBuffer; 26 | 27 | /// A Buffer to be used for sending and receiving data to/from the GPU. 28 | #[derive(Debug)] 29 | pub struct Buffer<T> { 30 | buffer: DeviceBuffer<u8>, 31 | /// The number of T-sized elements. 32 | length: usize, 33 | _phantom: std::marker::PhantomData<T>, 34 | } 35 | 36 | /// CUDA specific device. 37 | #[derive(Debug, Clone)] 38 | pub struct Device { 39 | vendor: Vendor, 40 | name: String, 41 | /// The total memory of the GPU in bytes. 42 | memory: u64, 43 | /// Number of streaming multiprocessors.
44 | compute_units: u32, 45 | /// The compute capability of the device, major and minor version. 46 | compute_capability: (u32, u32), 47 | pci_id: PciId, 48 | uuid: Option<DeviceUuid>, 49 | context: rustacuda::context::UnownedContext, 50 | } 51 | 52 | impl Hash for Device { 53 | fn hash<H: Hasher>(&self, state: &mut H) { 54 | self.vendor.hash(state); 55 | self.name.hash(state); 56 | self.memory.hash(state); 57 | self.pci_id.hash(state); 58 | self.uuid.hash(state); 59 | } 60 | } 61 | 62 | impl PartialEq for Device { 63 | fn eq(&self, other: &Self) -> bool { 64 | self.vendor == other.vendor 65 | && self.name == other.name 66 | && self.memory == other.memory 67 | && self.pci_id == other.pci_id 68 | && self.uuid == other.uuid 69 | } 70 | } 71 | 72 | impl Eq for Device {} 73 | 74 | impl Device { 75 | /// Returns the [`Vendor`] of the GPU. 76 | pub fn vendor(&self) -> Vendor { 77 | self.vendor 78 | } 79 | 80 | /// Returns the name of the GPU, e.g. "GeForce RTX 3090". 81 | pub fn name(&self) -> String { 82 | self.name.clone() 83 | } 84 | 85 | /// Returns the memory of the GPU in bytes. 86 | pub fn memory(&self) -> u64 { 87 | self.memory 88 | } 89 | 90 | /// Returns the number of compute units of the GPU. 91 | pub fn compute_units(&self) -> u32 { 92 | self.compute_units 93 | } 94 | 95 | /// Returns the major and minor version of the compute capability of the GPU. 96 | pub fn compute_capability(&self) -> (u32, u32) { 97 | self.compute_capability 98 | } 99 | 100 | /// Returns the PCI-ID of the GPU, see the [`PciId`] type for more information. 101 | pub fn pci_id(&self) -> PciId { 102 | self.pci_id 103 | } 104 | 105 | /// Returns the UUID of the GPU if available, see the [`DeviceUuid`] type for more 106 | /// information. 107 | pub fn uuid(&self) -> Option<DeviceUuid> { 108 | self.uuid 109 | } 110 | } 111 | 112 | /// Abstraction that contains everything to run a CUDA kernel on a GPU. 113 | /// 114 | /// The majority of methods are the same as [`crate::opencl::Program`], so you can write code using this 115 | /// API, which will then work with OpenCL as well as CUDA kernels. 116 | // When compiled without the `opencl` feature, then the intra-doc link above will be broken. 117 | #[allow(rustdoc::broken_intra_doc_links)] 118 | #[derive(Debug)] 119 | pub struct Program { 120 | context: rustacuda::context::UnownedContext, 121 | module: rustacuda::module::Module, 122 | stream: Stream, 123 | device_name: String, 124 | } 125 | 126 | impl Program { 127 | /// Returns the name of the GPU, e.g. "GeForce RTX 3090". 128 | pub fn device_name(&self) -> &str { 129 | &self.device_name 130 | } 131 | 132 | /// Creates a program for a specific device from a compiled CUDA binary file. 133 | pub fn from_binary(device: &Device, filename: &CStr) -> GPUResult<Program> { 134 | debug!("Creating CUDA program from binary file."); 135 | rustacuda::context::CurrentContext::set_current(&device.context)?; 136 | let module = rustacuda::module::Module::load_from_file(filename).inspect_err(|_err| { 137 | Self::pop_context(); 138 | })?; 139 | let stream = Stream::new(StreamFlags::NON_BLOCKING, None).inspect_err(|_err| { 140 | Self::pop_context(); 141 | })?; 142 | let prog = Program { 143 | module, 144 | stream, 145 | device_name: device.name(), 146 | context: device.context.clone(), 147 | }; 148 | Self::pop_context(); 149 | Ok(prog) 150 | } 151 | 152 | /// Creates a program for a specific device from a compiled CUDA binary.
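/// /// A minimal usage sketch (mirroring `examples/add.rs`; the device variable and fatbin path are illustrative): /// /// ```text /// let bytes = include_bytes!("./add.fatbin"); /// let program = cuda::Program::from_bytes(cuda_device, bytes)?; /// ```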
153 | pub fn from_bytes(device: &Device, bytes: &[u8]) -> GPUResult<Program> { 154 | debug!("Creating CUDA program from bytes."); 155 | rustacuda::context::CurrentContext::set_current(&device.context)?; 156 | let module = rustacuda::module::Module::load_from_bytes(bytes).inspect_err(|_err| { 157 | Self::pop_context(); 158 | })?; 159 | let stream = Stream::new(StreamFlags::NON_BLOCKING, None).inspect_err(|_err| { 160 | Self::pop_context(); 161 | })?; 162 | let prog = Program { 163 | module, 164 | stream, 165 | device_name: device.name(), 166 | context: device.context.clone(), 167 | }; 168 | Self::pop_context(); 169 | Ok(prog) 170 | } 171 | 172 | /// Creates a new buffer that can be used for input/output with the GPU. 173 | /// 174 | /// The `length` is the number of elements to create. 175 | /// 176 | /// It is usually used to create buffers that are initialized by the GPU. If you want to 177 | /// directly transfer data from the host to the GPU, you would use the safe 178 | /// [`Program::create_buffer_from_slice`] instead. 179 | /// 180 | /// ### Safety 181 | /// 182 | /// The buffer needs to be initialized (by the host with [`Program::write_from_buffer`] or by 183 | /// the GPU) before it can be read via [`Program::read_into_buffer`]. 184 | pub unsafe fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> { 185 | assert!(length > 0); 186 | // This is the unsafe call, the rest of the function is safe code. 187 | let buffer = DeviceBuffer::<u8>::uninitialized(length * std::mem::size_of::<T>())?; 188 | 189 | Ok(Buffer::<T> { 190 | buffer, 191 | length, 192 | _phantom: std::marker::PhantomData, 193 | }) 194 | } 195 | 196 | /// Creates a new buffer on the GPU and initializes with the given slice. 197 | pub fn create_buffer_from_slice<T>(&self, slice: &[T]) -> GPUResult<Buffer<T>> { 198 | // The number of bytes is used for the allocations. 199 | let bytes_len = mem::size_of_val(slice); 200 | 201 | // Transmuting types is safe as long as the sizes match. 202 | let bytes = unsafe { std::slice::from_raw_parts(slice.as_ptr() as *const u8, bytes_len) }; 203 | 204 | // It is only unsafe as long as the buffer isn't initialized, but that's what we do next. 205 | let mut buffer = unsafe { DeviceBuffer::<u8>::uninitialized(bytes_len)? }; 206 | // It is safe as we synchronize the stream after the call. 207 | unsafe { buffer.async_copy_from(bytes, &self.stream)? }; 208 | self.stream.synchronize()?; 209 | 210 | Ok(Buffer::<T> { 211 | buffer, 212 | length: slice.len(), 213 | _phantom: std::marker::PhantomData, 214 | }) 215 | } 216 | 217 | /// Returns a kernel. 218 | /// 219 | /// The `global_work_size` does *not* follow the OpenCL definition. It is *not* the total 220 | /// number of threads. Instead it follows CUDA's definition and is the number of 221 | /// `local_work_size` sized thread groups. So the total number of threads is 222 | /// `global_work_size * local_work_size`. 223 | pub fn create_kernel(&self, name: &str, gws: usize, lws: usize) -> GPUResult<Kernel> { 224 | let function_name = CString::new(name).expect("Kernel name must not contain nul bytes"); 225 | let function = self.module.get_function(&function_name)?; 226 | 227 | Ok(Kernel { 228 | function, 229 | global_work_size: gws, 230 | local_work_size: lws, 231 | stream: &self.stream, 232 | args: Vec::new(), 233 | }) 234 | } 235 | 236 | /// Puts data from an existing buffer onto the GPU.
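/// /// A small sketch of a full host-to-GPU-to-host round trip (buffer size and data are illustrative): /// /// ```text /// let mut buffer = unsafe { program.create_buffer::<u32>(4)? }; /// program.write_from_buffer(&mut buffer, &[1u32, 2, 3, 4])?; /// let mut out = vec![0u32; 4]; /// program.read_into_buffer(&buffer, &mut out)?; /// assert_eq!(out, [1, 2, 3, 4]); /// ```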
237 | pub fn write_from_buffer<T>(&self, buffer: &mut Buffer<T>, data: &[T]) -> GPUResult<()> { 238 | assert!(data.len() <= buffer.length, "Buffer is too small"); 239 | 240 | // Transmuting types is safe as long as the sizes match. 241 | let bytes = unsafe { 242 | std::slice::from_raw_parts(data.as_ptr() as *const u8, mem::size_of_val(data)) 243 | }; 244 | 245 | // It is safe as we synchronize the stream after the call. 246 | unsafe { buffer.buffer.async_copy_from(bytes, &self.stream)? }; 247 | self.stream.synchronize()?; 248 | 249 | Ok(()) 250 | } 251 | 252 | /// Reads data from the GPU into an existing buffer. 253 | pub fn read_into_buffer<T>(&self, buffer: &Buffer<T>, data: &mut [T]) -> GPUResult<()> { 254 | assert!(data.len() <= buffer.length, "Buffer is too small"); 255 | 256 | // Transmuting types is safe as long as the sizes match. 257 | let bytes = unsafe { 258 | std::slice::from_raw_parts_mut(data.as_mut_ptr() as *mut u8, mem::size_of_val(data)) 259 | }; 260 | 261 | // It is safe as we synchronize the stream after the call. 262 | unsafe { buffer.buffer.async_copy_to(bytes, &self.stream)? }; 263 | self.stream.synchronize()?; 264 | 265 | Ok(()) 266 | } 267 | 268 | /// Run some code in the context of the program. 269 | /// 270 | /// It sets the correct contexts. 271 | /// 272 | /// It takes the program as a parameter, so that we can use the same function body, for both 273 | /// the OpenCL and the CUDA code path. The only difference is the type of the program. 274 | pub fn run<F, A, R, E>(&self, fun: F, arg: A) -> Result<R, E> 275 | where 276 | F: FnOnce(&Self, A) -> Result<R, E>, 277 | E: From<GPUError>, 278 | { 279 | rustacuda::context::CurrentContext::set_current(&self.context).map_err(Into::into)?; 280 | let result = fun(self, arg); 281 | Self::pop_context(); 282 | result 283 | } 284 | 285 | /// Pop the current context. 286 | /// 287 | /// It panics as it's an unrecoverable error. 288 | fn pop_context() { 289 | rustacuda::context::ContextStack::pop().expect("Cannot remove context."); 290 | } 291 | } 292 | 293 | // TODO vmx 2021-07-07: Check if RustaCUDA types used in `Program` can be made `Send`, so that 294 | // this manual `Send` implementation is no longer needed. 295 | unsafe impl Send for Program {} 296 | 297 | /// Abstraction for kernel arguments. 298 | /// 299 | /// Kernel arguments implement this trait, so that they can be converted into the correct 300 | /// pointers needed by the actual kernel call. 301 | pub trait KernelArgument { 302 | /// Converts into a C void pointer. 303 | fn as_c_void(&self) -> *mut c_void; 304 | 305 | /// Returns the shared memory size. This is usually 0, except for [`LocalBuffer`]s. This 306 | /// information is used to allocate the memory correctly. 307 | fn shared_mem(&self) -> u32 { 308 | 0 309 | } 310 | } 311 | 312 | impl<T> KernelArgument for Buffer<T> { 313 | fn as_c_void(&self) -> *mut c_void { 314 | &self.buffer as *const _ as _ 315 | } 316 | } 317 | 318 | impl KernelArgument for i32 { 319 | fn as_c_void(&self) -> *mut c_void { 320 | self as *const _ as _ 321 | } 322 | } 323 | 324 | impl KernelArgument for u32 { 325 | fn as_c_void(&self) -> *mut c_void { 326 | self as *const _ as _ 327 | } 328 | } 329 | 330 | impl<T> KernelArgument for LocalBuffer<T> { 331 | // This is a hack: on CUDA kernels, you cannot have `__shared__` (`__local` in OpenCL lingo) 332 | // kernel parameters. Hence, just pass on an arbitrary valid pointer. It won't be used, so it 333 | // doesn't matter where it actually points to. A null pointer cannot be used as CUDA would 334 | // return an "invalid argument" error.
335 | fn as_c_void(&self) -> *mut c_void { 336 | self as *const _ as _ 337 | } 338 | 339 | fn shared_mem(&self) -> u32 { 340 | u32::try_from(self.length * std::mem::size_of::<T>()) 341 | .expect("__shared__ memory allocation is too big.") 342 | } 343 | } 344 | 345 | /// A kernel that can be executed. 346 | pub struct Kernel<'a> { 347 | function: rustacuda::function::Function<'a>, 348 | global_work_size: usize, 349 | local_work_size: usize, 350 | stream: &'a Stream, 351 | args: Vec<&'a dyn KernelArgument>, 352 | } 353 | 354 | impl fmt::Debug for Kernel<'_> { 355 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 356 | let args = self 357 | .args 358 | .iter() 359 | .map(|arg| (arg.as_c_void(), arg.shared_mem())) 360 | .collect::<Vec<_>>(); 361 | f.debug_struct("Kernel") 362 | .field("function", &self.function) 363 | .field("global_work_size", &self.global_work_size) 364 | .field("local_work_size", &self.local_work_size) 365 | .field("stream", &self.stream) 366 | .field("args", &args) 367 | .finish() 368 | } 369 | } 370 | 371 | impl<'a> Kernel<'a> { 372 | /// Set a kernel argument. 373 | /// 374 | /// The arguments must live as long as the kernel. Hence make sure they are not dropped as 375 | /// long as the kernel is in use. 376 | /// 377 | /// Example where this behaviour is enforced and leads to a compile-time error: 378 | /// 379 | /// ```compile_fail 380 | /// use rust_gpu_tools::cuda::Program; 381 | /// 382 | /// fn would_break(program: &Program) { 383 | /// let data = vec![1, 2, 3, 4]; 384 | /// let buffer = program.create_buffer_from_slice(&data).unwrap(); 385 | /// let kernel = program.create_kernel("my_kernel", 4, 256).unwrap(); 386 | /// let kernel = kernel.arg(&buffer); 387 | /// // This drop wouldn't error if the arguments weren't bound to the kernel's lifetime. 388 | /// drop(buffer); 389 | /// kernel.run().unwrap(); 390 | /// } 391 | /// ``` 392 | pub fn arg<T: KernelArgument>(mut self, t: &'a T) -> Self { 393 | self.args.push(t); 394 | self 395 | } 396 | 397 | /// Actually run the kernel. 398 | /// 399 | /// ### Panics 400 | /// 401 | /// Panics if the wrong number of arguments was provided. 402 | pub fn run(self) -> GPUResult<()> { 403 | // There can only be a single [`LocalBuffer`], due to CUDA restrictions. 404 | let shared_mem = self 405 | .args 406 | .iter() 407 | .try_fold(0, |acc, &arg| -> GPUResult<u32> { 408 | let mem = arg.shared_mem(); 409 | match (mem, acc) { 410 | // No new shared memory needs to be allocated. 411 | (0, _) => Ok(acc), 412 | // Some shared memory needs to be allocated. 413 | (_, 0) => Ok(mem), 414 | // Shared memory would be allocated more than once, which is not supported. 415 | (_, _) => Err(GPUError::Generic( 416 | "There cannot be more than one `LocalBuffer`.".to_string(), 417 | )), 418 | } 419 | })?; 420 | let args = self 421 | .args 422 | .iter() 423 | .map(|arg| arg.as_c_void()) 424 | .collect::<Vec<_>>(); 425 | // It is safe to launch the kernel as the arguments need to live when the kernel is called, 426 | // and the buffers are copied synchronously. At the end of the execution, the underlying 427 | // stream is synchronized. 428 | unsafe { 429 | self.stream.launch( 430 | &self.function, 431 | self.global_work_size as u32, 432 | self.local_work_size as u32, 433 | shared_mem, 434 | &args, 435 | )?; 436 | }; 437 | // Synchronize after the kernel execution, so that the underlying pointers can be 438 | // invalidated/dropped.
439 | self.stream.synchronize()?; 440 | Ok(()) 441 | } 442 | } 443 | -------------------------------------------------------------------------------- /src/cuda/utils.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | 3 | use log::{debug, warn}; 4 | 5 | use crate::cuda::Device; 6 | use crate::device::{PciId, Vendor}; 7 | use crate::error::{GPUError, GPUResult}; 8 | 9 | // NOTE vmx 2021-04-14: This is a hack to make sure contexts stay around. We wrap them, so that 10 | // `Sync` and `Send` can be implemented. `Sync` and `Send` is needed for once_cell. These contexts 11 | // are never used directly, they are only accessed through [`cuda::Device`] which contains an 12 | // `UnownedContext`. A device cannot have its own context itself, as then it couldn't be cloned, 13 | // but that is needed for creating the kernels. 14 | pub(crate) struct CudaContexts(#[allow(unused)] Vec<rustacuda::context::Context>); 15 | unsafe impl Sync for CudaContexts {} 16 | unsafe impl Send for CudaContexts {} 17 | 18 | /// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID. 19 | /// 20 | /// It is the first two identifiers of e.g. `lspci`: 21 | /// 22 | /// ```text 23 | /// 4e:00.0 VGA compatible controller 24 | /// || └└-- Device ID 25 | /// └└-- Bus ID 26 | /// ``` 27 | fn get_pci_id(device: &rustacuda::device::Device) -> Result<PciId, GPUError> { 28 | let bus_id = device.get_attribute(rustacuda::device::DeviceAttribute::PciBusId)? as u16; 29 | let device_id = device.get_attribute(rustacuda::device::DeviceAttribute::PciDeviceId)? as u16; 30 | let pci_id = (bus_id << 8) | device_id; 31 | Ok(pci_id.into()) 32 | } 33 | 34 | fn get_memory(d: &rustacuda::device::Device) -> GPUResult<u64> { 35 | let memory = d.total_memory()?; 36 | Ok(u64::try_from(memory).expect("Platform must be <= 64-bit")) 37 | } 38 | 39 | fn get_compute_units(d: &rustacuda::device::Device) -> GPUResult<u32> { 40 | let compute_units = d.get_attribute(rustacuda::device::DeviceAttribute::MultiprocessorCount)?; 41 | Ok(u32::try_from(compute_units).expect("The number of units is always positive")) 42 | } 43 | 44 | /// Get the major and minor version of the compute capability. 45 | fn get_compute_capability(d: &rustacuda::device::Device) -> GPUResult<(u32, u32)> { 46 | let major = d.get_attribute(rustacuda::device::DeviceAttribute::ComputeCapabilityMajor)?; 47 | let minor = d.get_attribute(rustacuda::device::DeviceAttribute::ComputeCapabilityMinor)?; 48 | Ok(( 49 | u32::try_from(major).expect("The compute capability major version is always positive"), 50 | u32::try_from(minor).expect("The compute capability minor version is always positive"), 51 | )) 52 | } 53 | 54 | /// Get a list of all available and supported devices. 55 | /// 56 | /// If there is a failure initializing CUDA or retrieving a device, it won't lead to a hard error, 57 | /// but an error will be logged and the corresponding device won't be available. 58 | pub(crate) fn build_device_list() -> (Vec<Device>, CudaContexts) { 59 | let mut all_devices = Vec::new(); 60 | let mut devices_without_pci_id = Vec::new(); 61 | let mut contexts = Vec::new(); 62 | 63 | rustacuda::init(rustacuda::CudaFlags::empty()) 64 | .map_err(Into::into) 65 | .and_then(|_| { 66 | for device in rustacuda::device::Device::devices()?
{ 67 | let device = device?; 68 | let owned_context = rustacuda::context::Context::create_and_push( 69 | rustacuda::context::ContextFlags::MAP_HOST 70 | | rustacuda::context::ContextFlags::SCHED_AUTO, 71 | device, 72 | )?; 73 | rustacuda::context::ContextStack::pop()?; 74 | 75 | let vendor = Vendor::Nvidia; 76 | let name = device.name()?; 77 | let memory = get_memory(&device)?; 78 | let compute_units = get_compute_units(&device)?; 79 | let compute_capability = get_compute_capability(&device)?; 80 | let uuid = device.uuid().ok().map(Into::into); 81 | let context = owned_context.get_unowned(); 82 | 83 | contexts.push(owned_context); 84 | 85 | // If a device doesn't have a PCI-ID, add those later to the list of 86 | // devices with a fake PCI-ID. 87 | match get_pci_id(&device) { 88 | Ok(pci_id) => { 89 | all_devices.push(Device { 90 | vendor, 91 | name, 92 | memory, 93 | compute_units, 94 | compute_capability, 95 | pci_id, 96 | uuid, 97 | context, 98 | }); 99 | } 100 | Err(_) => { 101 | // Use a temporary PCI-ID and replace it later with a non-colliding one. 102 | let pci_id = PciId::from(0); 103 | devices_without_pci_id.push(Device { 104 | vendor, 105 | name, 106 | memory, 107 | compute_units, 108 | compute_capability, 109 | pci_id, 110 | uuid, 111 | context, 112 | }); 113 | } 114 | }; 115 | } 116 | 117 | // Laptops might have an integrated GPU. Such devices might have neither a PCI-ID nor a UUID. 118 | // As those devices are used for development and not for production use, it's good enough to 119 | // provide a workaround which doesn't add much complexity to the code. We use a fake PCI-ID 120 | // instead, which is generated by enumerating the available devices. In order to make that 121 | // case easier to spot when debugging issues, a starting number which is pleasant to the human 122 | // eye was chosen, that works in both decimal and hexadecimal (4660 == 0x1234). 123 | let mut enumerated_device: u16 = 4660; 124 | for mut device in devices_without_pci_id.into_iter() { 125 | // Make sure that no device has that actual PCI-ID 126 | while all_devices 127 | .iter() 128 | .any(|d| d.pci_id() == enumerated_device.into()) 129 | { 130 | enumerated_device += 1; 131 | } 132 | device.pci_id = PciId::from(enumerated_device); 133 | enumerated_device += 1; 134 | all_devices.push(device); 135 | } 136 | 137 | let wrapped_contexts = CudaContexts(contexts); 138 | 139 | debug!("Loaded CUDA devices: {:?}", all_devices); 140 | Ok((all_devices, wrapped_contexts)) 141 | }) 142 | .unwrap_or_else(|error: GPUError| { 143 | warn!("Unable to retrieve CUDA devices: {:?}", error); 144 | (Vec::new(), CudaContexts(Vec::new())) 145 | }) 146 | } 147 | -------------------------------------------------------------------------------- /src/device.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | use log::debug; 4 | #[cfg(all(feature = "opencl", feature = "cuda"))] 5 | use log::warn; 6 | use once_cell::sync::Lazy; 7 | 8 | use std::convert::TryFrom; 9 | use std::mem; 10 | 11 | use crate::error::{GPUError, GPUResult}; 12 | 13 | #[cfg(feature = "cuda")] 14 | use crate::cuda; 15 | #[cfg(feature = "opencl")] 16 | use crate::opencl; 17 | 18 | /// The UUIDs of the devices returned by OpenCL as well as CUDA are always 16 bytes long.
19 | const UUID_SIZE: usize = 16; 20 | const AMD_DEVICE_VENDOR_STRING: &str = "Advanced Micro Devices, Inc."; 21 | const AMD_DEVICE_VENDOR_ID: u32 = 0x1002; 22 | 23 | const INTEL_DEVICE_VENDOR_STRING: &str = "Intel(R) Corporation"; 24 | const INTEL_DEVICE_VENDOR_ID: u32 = 0x8086; 25 | 26 | // For some reason integrated AMD cards on Apple don't have the usual vendor name and ID 27 | const AMD_DEVICE_ON_APPLE_VENDOR_STRING: &str = "AMD"; 28 | const AMD_DEVICE_ON_APPLE_VENDOR_ID: u32 = 0x1021d00; 29 | const NVIDIA_DEVICE_VENDOR_STRING: &str = "NVIDIA Corporation"; 30 | const NVIDIA_DEVICE_VENDOR_ID: u32 = 0x10de; 31 | 32 | // The owned CUDA contexts are stored globally. Each device contains an unowned reference, so 33 | // that devices can be cloned. 34 | #[cfg(feature = "cuda")] 35 | static DEVICES: Lazy<(Vec<Device>, cuda::utils::CudaContexts)> = Lazy::new(build_device_list); 36 | 37 | // Keep it as a tuple like in the CUDA case, so that using `DEVICES` is independent of the 38 | // feature set. 39 | #[cfg(all(feature = "opencl", not(feature = "cuda")))] 40 | static DEVICES: Lazy<(Vec<Device>, ())> = Lazy::new(build_device_list); 41 | 42 | /// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID. 43 | /// 44 | /// It is the first two identifiers of e.g. `lspci`: 45 | /// 46 | /// ```text 47 | /// 4e:00.0 VGA compatible controller 48 | /// || └└-- Device ID 49 | /// └└-- Bus ID 50 | /// ``` 51 | #[derive(Copy, Clone, Debug, Default, Eq, Hash, PartialEq)] 52 | pub struct PciId(u16); 53 | 54 | impl From<u16> for PciId { 55 | fn from(id: u16) -> Self { 56 | Self(id) 57 | } 58 | } 59 | 60 | impl From<PciId> for u16 { 61 | fn from(id: PciId) -> Self { 62 | id.0 63 | } 64 | } 65 | 66 | /// Converts a PCI-ID formatted as Bus-ID:Device-ID, e.g. `e3:00`. 67 | impl TryFrom<&str> for PciId { 68 | type Error = GPUError; 69 | 70 | fn try_from(pci_id: &str) -> GPUResult<Self> { 71 | let mut bytes = [0; mem::size_of::<u16>()]; 72 | hex::decode_to_slice(pci_id.replace(':', ""), &mut bytes).map_err(|_| { 73 | GPUError::InvalidId(format!( 74 | "Cannot parse PCI ID, expected hex-encoded string formatted as aa:bb, got {0}.", 75 | pci_id 76 | )) 77 | })?; 78 | let parsed = u16::from_be_bytes(bytes); 79 | Ok(Self(parsed)) 80 | } 81 | } 82 | 83 | /// Formats the PCI-ID like `lspci`, Bus-ID:Device-ID, e.g. `e3:00`. 84 | impl fmt::Display for PciId { 85 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 86 | let bytes = u16::to_be_bytes(self.0); 87 | write!(f, "{:02x}:{:02x}", bytes[0], bytes[1]) 88 | } 89 | } 90 | 91 | /// A unique identifier based on UUID of the device. 92 | #[derive(Copy, Clone, Default, Eq, Hash, PartialEq)] 93 | pub struct DeviceUuid([u8; UUID_SIZE]); 94 | 95 | impl From<[u8; UUID_SIZE]> for DeviceUuid { 96 | fn from(uuid: [u8; UUID_SIZE]) -> Self { 97 | Self(uuid) 98 | } 99 | } 100 | 101 | impl From<DeviceUuid> for [u8; UUID_SIZE] { 102 | fn from(uuid: DeviceUuid) -> Self { 103 | uuid.0 104 | } 105 | } 106 | 107 | /// Converts a UUID formatted as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee, 108 | /// e.g.
46abccd6-022e-b783-572d-833f7104d05f 109 | impl TryFrom<&str> for DeviceUuid { 110 | type Error = GPUError; 111 | 112 | fn try_from(uuid: &str) -> GPUResult<Self> { 113 | let mut bytes = [0; UUID_SIZE]; 114 | hex::decode_to_slice(uuid.replace('-', ""), &mut bytes) 115 | .map_err(|_| { 116 | GPUError::InvalidId(format!("Cannot parse UUID, expected hex-encoded string formatted as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee, got {0}.", uuid)) 117 | })?; 118 | Ok(Self(bytes)) 119 | } 120 | } 121 | 122 | /// Formats the UUID the same way as `clinfo` does, as an example: 123 | /// the output should look like 46abccd6-022e-b783-572d-833f7104d05f 124 | impl fmt::Display for DeviceUuid { 125 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 126 | write!( 127 | f, 128 | "{}-{}-{}-{}-{}", 129 | hex::encode(&self.0[..4]), 130 | hex::encode(&self.0[4..6]), 131 | hex::encode(&self.0[6..8]), 132 | hex::encode(&self.0[8..10]), 133 | hex::encode(&self.0[10..]) 134 | ) 135 | } 136 | } 137 | 138 | impl fmt::Debug for DeviceUuid { 139 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 140 | write!(f, "{}", self) 141 | } 142 | } 143 | 144 | /// Unique identifier that can either be a PCI ID or a UUID. 145 | #[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)] 146 | pub enum UniqueId { 147 | /// ID based on the PCI bus. 148 | PciId(PciId), 149 | /// ID based on a globally unique identifier. 150 | Uuid(DeviceUuid), 151 | } 152 | 153 | /// If the string contains a dash, it's interpreted as UUID, else it's interpreted as PCI ID. 154 | impl TryFrom<&str> for UniqueId { 155 | type Error = GPUError; 156 | 157 | fn try_from(unique_id: &str) -> GPUResult<Self> { 158 | Ok(match unique_id.contains('-') { 159 | true => Self::Uuid(DeviceUuid::try_from(unique_id)?), 160 | false => Self::PciId(PciId::try_from(unique_id)?), 161 | }) 162 | } 163 | } 164 | 165 | impl fmt::Display for UniqueId { 166 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 167 | match self { 168 | Self::PciId(id) => id.fmt(f), 169 | Self::Uuid(id) => id.fmt(f), 170 | } 171 | } 172 | } 173 | 174 | /// Currently supported vendors of this library. 175 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] 176 | pub enum Vendor { 177 | /// GPU by AMD. 178 | Amd, 179 | /// GPU by Intel. 180 | Intel, 181 | /// GPU by NVIDIA.
182 | Nvidia, 183 | } 184 | 185 | impl TryFrom<&str> for Vendor { 186 | type Error = GPUError; 187 | 188 | fn try_from(vendor: &str) -> GPUResult<Self> { 189 | match vendor { 190 | AMD_DEVICE_VENDOR_STRING => Ok(Self::Amd), 191 | AMD_DEVICE_ON_APPLE_VENDOR_STRING => Ok(Self::Amd), 192 | INTEL_DEVICE_VENDOR_STRING => Ok(Self::Intel), 193 | NVIDIA_DEVICE_VENDOR_STRING => Ok(Self::Nvidia), 194 | _ => Err(GPUError::UnsupportedVendor(vendor.to_string())), 195 | } 196 | } 197 | } 198 | 199 | impl TryFrom<u32> for Vendor { 200 | type Error = GPUError; 201 | 202 | fn try_from(vendor: u32) -> GPUResult<Self> { 203 | match vendor { 204 | AMD_DEVICE_VENDOR_ID => Ok(Self::Amd), 205 | AMD_DEVICE_ON_APPLE_VENDOR_ID => Ok(Self::Amd), 206 | INTEL_DEVICE_VENDOR_ID => Ok(Self::Intel), 207 | NVIDIA_DEVICE_VENDOR_ID => Ok(Self::Nvidia), 208 | _ => Err(GPUError::UnsupportedVendor(format!("0x{:x}", vendor))), 209 | } 210 | } 211 | } 212 | 213 | impl fmt::Display for Vendor { 214 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 215 | let vendor = match self { 216 | Self::Amd => AMD_DEVICE_VENDOR_STRING, 217 | Self::Intel => INTEL_DEVICE_VENDOR_STRING, 218 | Self::Nvidia => NVIDIA_DEVICE_VENDOR_STRING, 219 | }; 220 | write!(f, "{}", vendor) 221 | } 222 | } 223 | 224 | /// Which framework to use, CUDA or OpenCL. 225 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] 226 | pub enum Framework { 227 | /// CUDA. 228 | #[cfg(feature = "cuda")] 229 | Cuda, 230 | /// OpenCL. 231 | #[cfg(feature = "opencl")] 232 | Opencl, 233 | } 234 | 235 | /// A device that may have a CUDA and/or OpenCL GPU associated with it. 236 | #[derive(Clone, Debug, Eq, Hash, PartialEq)] 237 | pub struct Device { 238 | vendor: Vendor, 239 | name: String, 240 | memory: u64, 241 | compute_units: u32, 242 | /// Major and minor version of the compute capability (only available on Nvidia GPUs). 243 | compute_capability: Option<(u32, u32)>, 244 | // All devices have a PCI ID. It is used as fallback in case there is no UUID. 245 | pci_id: PciId, 246 | uuid: Option<DeviceUuid>, 247 | #[cfg(feature = "cuda")] 248 | cuda: Option<cuda::Device>, 249 | #[cfg(feature = "opencl")] 250 | opencl: Option<opencl::Device>, 251 | } 252 | 253 | impl Device { 254 | /// Returns the [`Vendor`] of the GPU. 255 | pub fn vendor(&self) -> Vendor { 256 | self.vendor 257 | } 258 | 259 | /// Returns the name of the GPU, e.g. "GeForce RTX 3090". 260 | pub fn name(&self) -> String { 261 | self.name.clone() 262 | } 263 | 264 | /// Returns the memory of the GPU in bytes. 265 | pub fn memory(&self) -> u64 { 266 | self.memory 267 | } 268 | 269 | /// Returns the number of compute units of the GPU. 270 | pub fn compute_units(&self) -> u32 { 271 | self.compute_units 272 | } 273 | 274 | /// Returns the major and minor version of the compute capability (only available on Nvidia 275 | /// GPUs). 276 | pub fn compute_capability(&self) -> Option<(u32, u32)> { 277 | self.compute_capability 278 | } 279 | 280 | /// Returns the best possible unique identifier, a UUID is preferred over a PCI ID. 281 | pub fn unique_id(&self) -> UniqueId { 282 | match self.uuid { 283 | Some(uuid) => UniqueId::Uuid(uuid), 284 | None => UniqueId::PciId(self.pci_id), 285 | } 286 | } 287 | 288 | /// Returns the preferred framework (CUDA or OpenCL) to use. 289 | /// 290 | /// CUDA will be preferred over OpenCL. The returned framework will work on the device. 291 | /// E.g. it won't return `Framework::Cuda` for an AMD device.
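/// /// A sketch of dispatching on the preferred framework (assuming both the `cuda` and `opencl` features are enabled): /// /// ```text /// match device.framework() { ///     Framework::Cuda => { /* build a cuda::Program for this device */ } ///     Framework::Opencl => { /* build an opencl::Program for this device */ } /// } /// ```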
292 | pub fn framework(&self) -> Framework { 293 | #[cfg(all(feature = "opencl", feature = "cuda"))] 294 | if cfg!(feature = "cuda") && self.cuda.is_some() { 295 | Framework::Cuda 296 | } else { 297 | Framework::Opencl 298 | } 299 | 300 | #[cfg(all(feature = "cuda", not(feature = "opencl")))] 301 | { 302 | Framework::Cuda 303 | } 304 | 305 | #[cfg(all(feature = "opencl", not(feature = "cuda")))] 306 | { 307 | Framework::Opencl 308 | } 309 | } 310 | 311 | /// Returns the underlying CUDA device if it is available. 312 | #[cfg(feature = "cuda")] 313 | pub fn cuda_device(&self) -> Option<&cuda::Device> { 314 | self.cuda.as_ref() 315 | } 316 | 317 | /// Returns the underlying OpenCL device if it is available. 318 | #[cfg(feature = "opencl")] 319 | pub fn opencl_device(&self) -> Option<&opencl::Device> { 320 | self.opencl.as_ref() 321 | } 322 | 323 | /// Returns all available GPUs that are supported. 324 | pub fn all() -> Vec<&'static Device> { 325 | Self::all_iter().collect() 326 | } 327 | 328 | /// Returns the device matching the PCI ID if there is one. 329 | pub fn by_pci_id(pci_id: PciId) -> Option<&'static Device> { 330 | Self::all_iter().find(|d| pci_id == d.pci_id) 331 | } 332 | 333 | /// Returns the device matching the UUID if there is one. 334 | pub fn by_uuid(uuid: DeviceUuid) -> Option<&'static Device> { 335 | Self::all_iter().find(|d| Some(uuid) == d.uuid) 336 | } 337 | 338 | /// Returns the device matching the unique ID if there is one. 339 | pub fn by_unique_id(unique_id: UniqueId) -> Option<&'static Device> { 340 | Self::all_iter().find(|d| unique_id == d.unique_id()) 341 | } 342 | 343 | /// Returns the devices matching the Vendor. 344 | pub fn by_vendor(vendor_id: Vendor) -> Vec<&'static Device> { 345 | Self::all_iter() 346 | .filter(|d| vendor_id == d.vendor()) 347 | .collect() 348 | } 349 | 350 | /// Returns an iterator of all available GPUs that are supported. 351 | fn all_iter() -> impl Iterator<Item = &'static Device> { 352 | DEVICES.0.iter() 353 | } 354 | } 355 | 356 | /// Get a list of all available and supported devices. 357 | /// 358 | /// If both the `cuda` and the `opencl` features are enabled, a device supporting both will be 359 | /// combined into a single device. You can then access the underlying CUDA and OpenCL device 360 | /// if needed. 361 | /// 362 | /// If there is a failure retrieving a device, it won't lead to a hard error, but an error will be 363 | /// logged and the corresponding device won't be available.
355 | 
356 | /// Get a list of all available and supported devices.
357 | ///
358 | /// If both the `cuda` and the `opencl` features are enabled, a device supporting both will be
359 | /// combined into a single device. You can then access the underlying CUDA and OpenCL device
360 | /// if needed.
361 | ///
362 | /// If there is a failure retrieving a device, it won't lead to a hard error, but an error will be
363 | /// logged and the corresponding device won't be available.
364 | #[cfg(feature = "cuda")]
365 | fn build_device_list() -> (Vec<Device>, cuda::utils::CudaContexts) {
366 |     let mut all_devices = Vec::new();
367 | 
368 |     #[cfg(feature = "opencl")]
369 |     let opencl_devices = opencl::utils::build_device_list();
370 | 
371 |     #[cfg(all(feature = "cuda", feature = "opencl"))]
372 |     let (mut cuda_devices, cuda_contexts) = cuda::utils::build_device_list();
373 |     #[cfg(all(feature = "cuda", not(feature = "opencl")))]
374 |     let (cuda_devices, cuda_contexts) = cuda::utils::build_device_list();
375 | 
376 |     // Combine OpenCL and CUDA devices into one device if it is the same GPU
377 |     #[cfg(feature = "opencl")]
378 |     for opencl_device in opencl_devices {
379 |         let mut device = Device {
380 |             vendor: opencl_device.vendor(),
381 |             name: opencl_device.name(),
382 |             memory: opencl_device.memory(),
383 |             compute_units: opencl_device.compute_units(),
384 |             compute_capability: opencl_device.compute_capability(),
385 |             pci_id: opencl_device.pci_id(),
386 |             uuid: opencl_device.uuid(),
387 |             opencl: Some(opencl_device),
388 |             cuda: None,
389 |         };
390 | 
391 |         // Only devices from Nvidia can use CUDA
392 |         #[cfg(feature = "cuda")]
393 |         if device.vendor == Vendor::Nvidia {
394 |             for ii in 0..cuda_devices.len() {
395 |                 if (device.uuid.is_some() && cuda_devices[ii].uuid() == device.uuid)
396 |                     || (cuda_devices[ii].pci_id() == device.pci_id)
397 |                 {
398 |                     if device.memory() != cuda_devices[ii].memory() {
399 |                         warn!("OpenCL and CUDA report different amounts of memory for a device with the same identifier");
400 |                         break;
401 |                     }
402 |                     if device.compute_units() != cuda_devices[ii].compute_units() {
403 |                         warn!("OpenCL and CUDA report different numbers of compute units for a device with the same identifier");
404 |                         break;
405 |                     }
406 |                     // Move the CUDA device out of the vector
407 |                     device.cuda = Some(cuda_devices.remove(ii));
408 |                     // Only one device can match
409 |                     break;
410 |                 }
411 |             }
412 |         }
413 | 
414 |         all_devices.push(device)
415 |     }
416 | 
417 |     // Add all CUDA devices that don't have a corresponding OpenCL device.
418 |     for cuda_device in cuda_devices {
419 |         let device = Device {
420 |             vendor: cuda_device.vendor(),
421 |             name: cuda_device.name(),
422 |             memory: cuda_device.memory(),
423 |             compute_units: cuda_device.compute_units(),
424 |             compute_capability: Some(cuda_device.compute_capability()),
425 |             pci_id: cuda_device.pci_id(),
426 |             uuid: cuda_device.uuid(),
427 |             cuda: Some(cuda_device),
428 |             #[cfg(feature = "opencl")]
429 |             opencl: None,
430 |         };
431 |         all_devices.push(device);
432 |     }
433 | 
434 |     debug!("loaded devices: {:?}", all_devices);
435 |     (all_devices, cuda_contexts)
436 | }
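Because the loop above folds the CUDA and OpenCL handles for the same physical GPU into one `Device`, a consumer can inspect both backends through a single entry. A small sketch, assuming both the `cuda` and `opencl` features are enabled:

```rust
use rust_gpu_tools::Device;

fn report_backends() {
    for device in Device::all() {
        // A merged Nvidia device typically reports both backends here;
        // AMD and Intel devices report OpenCL only.
        println!(
            "{}: cuda={}, opencl={}",
            device.name(),
            device.cuda_device().is_some(),
            device.opencl_device().is_some(),
        );
    }
}
```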
437 | 
438 | /// Get a list of all available and supported OpenCL devices.
439 | ///
440 | /// If there is a failure retrieving a device, it won't lead to a hard error, but an error will be
441 | /// logged and the corresponding device won't be available.
442 | #[cfg(all(feature = "opencl", not(feature = "cuda")))]
443 | fn build_device_list() -> (Vec<Device>, ()) {
444 |     let devices = opencl::utils::build_device_list()
445 |         .into_iter()
446 |         .map(|device| Device {
447 |             vendor: device.vendor(),
448 |             name: device.name(),
449 |             memory: device.memory(),
450 |             compute_units: device.compute_units(),
451 |             compute_capability: device.compute_capability(),
452 |             pci_id: device.pci_id(),
453 |             uuid: device.uuid(),
454 |             opencl: Some(device),
455 |         })
456 |         .collect();
457 | 
458 |     debug!("loaded devices: {:?}", devices);
459 |     (devices, ())
460 | }
461 | 
462 | #[cfg(test)]
463 | mod test {
464 |     use super::{
465 |         Device, DeviceUuid, GPUError, PciId, UniqueId, Vendor, AMD_DEVICE_ON_APPLE_VENDOR_ID,
466 |         AMD_DEVICE_ON_APPLE_VENDOR_STRING, AMD_DEVICE_VENDOR_ID, AMD_DEVICE_VENDOR_STRING,
467 |         INTEL_DEVICE_VENDOR_ID, INTEL_DEVICE_VENDOR_STRING, NVIDIA_DEVICE_VENDOR_ID,
468 |         NVIDIA_DEVICE_VENDOR_STRING,
469 |     };
470 |     use std::convert::TryFrom;
471 | 
472 |     #[test]
473 |     fn test_device_all() {
474 |         let devices = Device::all();
475 |         for device in devices.iter() {
476 |             println!("device: {:?}", device);
477 |         }
478 |         assert!(!devices.is_empty(), "No supported GPU found.");
479 |     }
480 | 
481 |     #[test]
482 |     fn test_vendor_from_str() {
483 |         assert_eq!(
484 |             Vendor::try_from(AMD_DEVICE_VENDOR_STRING).unwrap(),
485 |             Vendor::Amd,
486 |             "AMD vendor string can be converted."
487 |         );
488 |         assert_eq!(
489 |             Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_STRING).unwrap(),
490 |             Vendor::Amd,
491 |             "AMD vendor string (on Apple) can be converted."
492 |         );
493 |         assert_eq!(
494 |             Vendor::try_from(INTEL_DEVICE_VENDOR_STRING).unwrap(),
495 |             Vendor::Intel,
496 |             "Intel vendor string can be converted."
497 |         );
498 |         assert_eq!(
499 |             Vendor::try_from(NVIDIA_DEVICE_VENDOR_STRING).unwrap(),
500 |             Vendor::Nvidia,
501 |             "Nvidia vendor string can be converted."
502 |         );
503 |         assert!(matches!(
504 |             Vendor::try_from("unknown vendor"),
505 |             Err(GPUError::UnsupportedVendor(_))
506 |         ));
507 |     }
508 | 
509 |     #[test]
510 |     fn test_vendor_from_u32() {
511 |         assert_eq!(
512 |             Vendor::try_from(AMD_DEVICE_VENDOR_ID).unwrap(),
513 |             Vendor::Amd,
514 |             "AMD vendor ID can be converted."
515 |         );
516 |         assert_eq!(
517 |             Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_ID).unwrap(),
518 |             Vendor::Amd,
519 |             "AMD vendor ID (on Apple) can be converted."
520 |         );
521 |         assert_eq!(
522 |             Vendor::try_from(INTEL_DEVICE_VENDOR_ID).unwrap(),
523 |             Vendor::Intel,
524 |             "Intel vendor ID can be converted."
525 |         );
526 |         assert_eq!(
527 |             Vendor::try_from(NVIDIA_DEVICE_VENDOR_ID).unwrap(),
528 |             Vendor::Nvidia,
529 |             "Nvidia vendor ID can be converted."
530 |         );
531 |         assert!(matches!(
532 |             Vendor::try_from(0x1abc),
533 |             Err(GPUError::UnsupportedVendor(_))
534 |         ));
535 |     }
536 | 
537 |     #[test]
538 |     fn test_vendor_display() {
539 |         assert_eq!(
540 |             Vendor::Amd.to_string(),
541 |             AMD_DEVICE_VENDOR_STRING,
542 |             "AMD vendor can be converted to string."
543 |         );
544 |         assert_eq!(
545 |             Vendor::Intel.to_string(),
546 |             INTEL_DEVICE_VENDOR_STRING,
547 |             "Intel vendor can be converted to string."
548 |         );
549 |         assert_eq!(
550 |             Vendor::Nvidia.to_string(),
551 |             NVIDIA_DEVICE_VENDOR_STRING,
552 |             "Nvidia vendor can be converted to string."
553 |         );
554 |     }
555 | 
556 |     #[test]
557 |     fn test_uuid() {
558 |         let valid_string = "46abccd6-022e-b783-572d-833f7104d05f";
559 |         let valid = DeviceUuid::try_from(valid_string).unwrap();
560 |         assert_eq!(valid_string, &valid.to_string());
561 | 
562 |         let too_short_string = "ccd6-022e-b783-572d-833f7104d05f";
563 |         let too_short = DeviceUuid::try_from(too_short_string);
564 |         assert!(too_short.is_err(), "Parse error when UUID is too short.");
565 | 
566 |         let invalid_hex_string = "46abccd6-022e-b783-572d-833f7104d05h";
567 |         let invalid_hex = DeviceUuid::try_from(invalid_hex_string);
568 |         assert!(
569 |             invalid_hex.is_err(),
570 |             "Parse error when UUID contains non-hex character."
571 |         );
572 |     }
573 | 
574 |     #[test]
575 |     fn test_pci_id() {
576 |         let valid_string = "01:00";
577 |         let valid = PciId::try_from(valid_string).unwrap();
578 |         assert_eq!(valid_string, &valid.to_string());
579 |         assert_eq!(valid, PciId(0x0100));
580 | 
581 |         let too_short_string = "3f";
582 |         let too_short = PciId::try_from(too_short_string);
583 |         assert!(too_short.is_err(), "Parse error when PCI ID is too short.");
584 | 
585 |         let invalid_hex_string = "aaxx";
586 |         let invalid_hex = PciId::try_from(invalid_hex_string);
587 |         assert!(
588 |             invalid_hex.is_err(),
589 |             "Parse error when PCI ID contains non-hex character."
590 |         );
591 |     }
592 | 
593 |     #[test]
594 |     fn test_unique_id() {
595 |         let valid_pci_id_string = "aa:bb";
596 |         let valid_pci_id = UniqueId::try_from(valid_pci_id_string).unwrap();
597 |         assert_eq!(valid_pci_id_string, &valid_pci_id.to_string());
598 |         assert_eq!(valid_pci_id, UniqueId::PciId(PciId(0xaabb)));
599 | 
600 |         let valid_uuid_string = "aabbccdd-eeff-0011-2233-445566778899";
601 |         let valid_uuid = UniqueId::try_from(valid_uuid_string).unwrap();
602 |         assert_eq!(valid_uuid_string, &valid_uuid.to_string());
603 |         assert_eq!(
604 |             valid_uuid,
605 |             UniqueId::Uuid(DeviceUuid([
606 |                 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
607 |                 0x88, 0x99
608 |             ]))
609 |         );
610 | 
611 |         let invalid_string = "aabbccddeeffgg";
612 |         let invalid = UniqueId::try_from(invalid_string);
613 |         assert!(
614 |             invalid.is_err(),
615 |             "Parse error when ID matches neither a PCI ID nor a UUID."
616 |         );
617 |     }
618 | }
619 | 
--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
1 | #[cfg(feature = "opencl")]
2 | use opencl3::error_codes::ClError;
3 | #[cfg(feature = "cuda")]
4 | use rustacuda::error::CudaError;
5 | 
6 | /// Error types of this library.
7 | #[derive(thiserror::Error, Debug)]
8 | #[allow(clippy::upper_case_acronyms)]
9 | pub enum GPUError {
10 |     /// Error from the underlying `opencl3` library, e.g. a memory allocation failure.
11 |     #[cfg(feature = "opencl")]
12 |     #[error("Opencl3 Error: {error}{}", match .message {
13 |         Some(message) => format!(" {}", message),
14 |         None => "".to_string(),
15 |     })]
16 |     Opencl3 {
17 |         /// The error code.
18 |         error: ClError,
19 |         /// The error message.
20 |         message: Option<String>,
21 |     },
22 | 
23 |     /// Error for OpenCL `clGetProgramInfo()` call failures.
24 |     #[cfg(feature = "opencl")]
25 |     #[error("Program info not available!")]
26 |     ProgramInfoNotAvailable(ClError),
27 | 
28 |     /// Error for OpenCL `clGetDeviceInfo()` call failures.
29 |     #[cfg(feature = "opencl")]
30 |     #[error("Device info not available!")]
31 |     DeviceInfoNotAvailable(ClError),
32 | 
33 |     /// Error from the underlying `RustaCUDA` library, e.g. a memory allocation failure.
34 |     #[cfg(feature = "cuda")]
35 |     #[error("Cuda Error: {0}")]
36 |     Cuda(#[from] CudaError),
37 | 
38 |     /// Error when a device cannot be found.
39 |     #[error("Device not found!")]
40 |     DeviceNotFound,
41 | 
42 |     /// Error when a kernel with the given name cannot be found.
43 |     #[error("Kernel with name {0} not found!")]
44 |     KernelNotFound(String),
45 | 
46 |     /// Error when standard I/O fails.
47 |     #[error("IO Error: {0}")]
48 |     IO(#[from] std::io::Error),
49 | 
50 |     /// Error when the device is from an unsupported vendor.
51 |     #[error("Vendor {0} is not supported.")]
52 |     UnsupportedVendor(String),
53 | 
54 |     /// Error when the string representation of a unique identifier (PCI-ID or UUID) cannot be
55 |     /// parsed.
56 |     #[error("{0}")]
57 |     InvalidId(String),
58 | 
59 |     /// Errors that rarely happen and don't deserve their own error type.
60 |     #[error("{0}")]
61 |     Generic(String),
62 | }
63 | 
64 | /// Convenience type alias for [`GPUError`] based [`Result`]s.
65 | #[allow(clippy::upper_case_acronyms)]
66 | pub type GPUResult<T> = std::result::Result<T, GPUError>;
67 | 
68 | #[cfg(feature = "opencl")]
69 | impl From<ClError> for GPUError {
70 |     fn from(error: ClError) -> Self {
71 |         GPUError::Opencl3 {
72 |             error,
73 |             message: None,
74 |         }
75 |     }
76 | }
77 | 
--------------------------------------------------------------------------------
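A brief sketch of how `GPUError` is consumed downstream (an editorial illustration; the lookup shown is just one way to produce an error). Note that the `GPUResult<T>` alias lives in the private `error` module, so external callers spell the `Result` out:

```rust
use rust_gpu_tools::{Device, GPUError};

fn first_device() -> Result<&'static Device, GPUError> {
    Device::all().into_iter().next().ok_or(GPUError::DeviceNotFound)
}

fn main() {
    match first_device() {
        Ok(device) => println!("found: {}", device.name()),
        // Every variant renders a human-readable message via `thiserror`.
        Err(err) => eprintln!("GPU error: {}", err),
    }
}
```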
/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! Abstraction layer for OpenCL and CUDA.
2 | //!
3 | //! Feature flags
4 | //! -------------
5 | //!
6 | //! There are two [feature flags], `cuda` and `opencl`. Both are enabled by default. At least
7 | //! one of them needs to be enabled at any time.
8 | //!
9 | //! [feature flags]: https://doc.rust-lang.org/cargo/reference/manifest.html#the-features-section
10 | 
11 | #![warn(missing_docs)]
12 | 
13 | mod device;
14 | mod error;
15 | #[cfg(any(feature = "cuda", feature = "opencl"))]
16 | mod program;
17 | 
18 | #[cfg(feature = "cuda")]
19 | pub mod cuda;
20 | #[cfg(feature = "opencl")]
21 | pub mod opencl;
22 | 
23 | pub use device::{Device, DeviceUuid, Framework, PciId, UniqueId, Vendor};
24 | pub use error::GPUError;
25 | #[cfg(any(feature = "cuda", feature = "opencl"))]
26 | pub use program::Program;
27 | 
28 | #[cfg(not(any(feature = "cuda", feature = "opencl")))]
29 | compile_error!("At least one of the features `cuda` or `opencl` must be enabled.");
30 | 
31 | /// A buffer on the GPU.
32 | ///
33 | /// The concept of a local buffer is from OpenCL. In CUDA you don't allocate a buffer directly
34 | /// via an API call. Instead you pass in the amount of shared memory that should be used.
35 | ///
36 | /// There can be at most a single local buffer per kernel. On CUDA a null pointer will be passed
37 | /// in, instead of an actual value. The memory that should get allocated is then passed into the
38 | /// kernel call automatically.
39 | #[derive(Debug)]
40 | pub struct LocalBuffer<T> {
41 |     /// The number of T-sized elements.
42 |     length: usize,
43 |     _phantom: std::marker::PhantomData<T>,
44 | }
45 | 
46 | impl<T> LocalBuffer<T> {
47 |     /// Returns a new buffer of the specified `length`.
48 |     pub fn new(length: usize) -> Self {
49 |         LocalBuffer::<T> {
50 |             length,
51 |             _phantom: std::marker::PhantomData,
52 |         }
53 |     }
54 | }
55 | 
--------------------------------------------------------------------------------
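The `LocalBuffer` defined in src/lib.rs above only records a length; it turns into actual memory (OpenCL local memory, CUDA shared memory) when pushed as a kernel argument. A hedged sketch, assuming a program containing a kernel named `my_kernel` that declares one `__local` argument:

```rust
use rust_gpu_tools::opencl::Program;
use rust_gpu_tools::{GPUError, LocalBuffer};

fn launch(program: &Program) -> Result<(), GPUError> {
    // 64 work groups of 256 threads; each group gets 256 u32 of local memory.
    let kernel = program.create_kernel("my_kernel", 64, 256)?;
    let local = LocalBuffer::<u32>::new(256);
    kernel.arg(&local).run()?;
    Ok(())
}
```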
/src/opencl/error.rs:
--------------------------------------------------------------------------------
1 | use opencl3::{device::DeviceInfo, error_codes::ClError, program::ProgramInfo};
2 | 
3 | #[derive(thiserror::Error, Debug)]
4 | #[allow(clippy::upper_case_acronyms)]
5 | pub enum GPUError {
6 |     #[error("Opencl3 Error: {0}{}", match .1 {
7 |         Some(message) => format!(" {}", message),
8 |         None => "".to_string(),
9 |     })]
10 |     Opencl3(ClError, Option<String>),
11 |     #[error("Device not found!")]
12 |     DeviceNotFound,
13 |     #[error("Device info not available!")]
14 |     DeviceInfoNotAvailable(DeviceInfo),
15 |     #[error("Program info not available!")]
16 |     ProgramInfoNotAvailable(ProgramInfo),
17 |     #[error("Kernel with name {0} not found!")]
18 |     KernelNotFound(String),
19 |     #[error("IO Error: {0}")]
20 |     IO(#[from] std::io::Error),
21 |     #[error("Vendor {0} is not supported.")]
22 |     UnsupportedVendor(String),
23 |     #[error("{0}")]
24 |     InvalidId(String),
25 | }
26 | 
27 | #[allow(clippy::upper_case_acronyms)]
28 | #[allow(dead_code)]
29 | pub type GPUResult<T> = std::result::Result<T, GPUError>;
30 | 
31 | impl From<ClError> for GPUError {
32 |     fn from(error: ClError) -> Self {
33 |         GPUError::Opencl3(error, None)
34 |     }
35 | }
36 | 
--------------------------------------------------------------------------------
/src/opencl/mod.rs:
--------------------------------------------------------------------------------
1 | //! The OpenCL specific implementation of a [`Buffer`], [`Device`], [`Program`] and [`Kernel`].
2 | 
3 | pub(crate) mod utils;
4 | 
5 | use std::collections::HashMap;
6 | use std::hash::{Hash, Hasher};
7 | use std::mem;
8 | use std::ptr;
9 | 
10 | use opencl3::command_queue::CommandQueue;
11 | use opencl3::context::Context;
12 | use opencl3::error_codes::ClError;
13 | use opencl3::kernel::ExecuteKernel;
14 | use opencl3::memory::CL_MEM_READ_WRITE;
15 | use opencl3::types::CL_BLOCKING;
16 | 
17 | use log::debug;
18 | 
19 | use crate::device::{DeviceUuid, PciId, Vendor};
20 | use crate::error::{GPUError, GPUResult};
21 | use crate::LocalBuffer;
22 | 
23 | /// The lowest level identifier of an OpenCL device, it changes whenever a device is initialized.
24 | #[allow(non_camel_case_types)]
25 | pub type cl_device_id = opencl3::types::cl_device_id;
26 | 
27 | /// A Buffer to be used for sending and receiving data to/from the GPU.
28 | #[derive(Debug)]
29 | pub struct Buffer<T> {
30 |     buffer: opencl3::memory::Buffer<u8>,
31 |     /// The number of T-sized elements.
32 |     length: usize,
33 |     _phantom: std::marker::PhantomData<T>,
34 | }
35 | 
36 | /// OpenCL specific device.
37 | #[derive(Debug, Clone)]
38 | pub struct Device {
39 |     vendor: Vendor,
40 |     name: String,
41 |     /// The total memory of the GPU in bytes.
42 |     memory: u64,
43 |     /// The number of parallel compute units.
44 |     compute_units: u32,
45 |     /// Major and minor version of the compute capability (only available on Nvidia GPUs).
46 |     compute_capability: Option<(u32, u32)>,
47 |     pci_id: PciId,
48 |     uuid: Option<DeviceUuid>,
49 |     device: opencl3::device::Device,
50 | }
51 | 
52 | impl Hash for Device {
53 |     fn hash<H: Hasher>(&self, state: &mut H) {
54 |         self.vendor.hash(state);
55 |         self.name.hash(state);
56 |         self.memory.hash(state);
57 |         self.pci_id.hash(state);
58 |         self.uuid.hash(state);
59 |     }
60 | }
61 | 
62 | impl PartialEq for Device {
63 |     fn eq(&self, other: &Self) -> bool {
64 |         self.vendor == other.vendor
65 |             && self.name == other.name
66 |             && self.memory == other.memory
67 |             && self.pci_id == other.pci_id
68 |             && self.uuid == other.uuid
69 |     }
70 | }
71 | 
72 | impl Eq for Device {}
73 | 
74 | impl Device {
75 |     /// Returns the [`Vendor`] of the GPU.
76 |     pub fn vendor(&self) -> Vendor {
77 |         self.vendor
78 |     }
79 | 
80 |     /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
81 |     pub fn name(&self) -> String {
82 |         self.name.clone()
83 |     }
84 | 
85 |     /// Returns the memory of the GPU in bytes.
86 |     pub fn memory(&self) -> u64 {
87 |         self.memory
88 |     }
89 | 
90 |     /// Returns the number of compute units of the GPU.
91 |     pub fn compute_units(&self) -> u32 {
92 |         self.compute_units
93 |     }
94 | 
95 |     /// Returns the major and minor version of the compute capability (only available on Nvidia
96 |     /// GPUs).
97 |     pub fn compute_capability(&self) -> Option<(u32, u32)> {
98 |         self.compute_capability
99 |     }
100 | 
101 |     /// Returns the PCI-ID of the GPU, see the [`PciId`] type for more information.
102 |     pub fn pci_id(&self) -> PciId {
103 |         self.pci_id
104 |     }
105 | 
106 |     /// Returns the UUID of the GPU if available, see the [`DeviceUuid`] type for more
107 |     /// information.
108 |     pub fn uuid(&self) -> Option<DeviceUuid> {
109 |         self.uuid
110 |     }
111 | 
112 |     /// Low-level access to the device identifier.
113 |     ///
114 |     /// It changes when the device is initialized and should only be used to interact with other
115 |     /// libraries that work on the lowest OpenCL level.
116 |     pub fn cl_device_id(&self) -> cl_device_id {
117 |         self.device.id()
118 |     }
119 | }
120 | 
121 | /// Abstraction that contains everything to run an OpenCL kernel on a GPU.
122 | ///
123 | /// The majority of methods are the same as [`crate::cuda::Program`], so you can write code using this
124 | /// API, which will then work with OpenCL as well as CUDA kernels.
125 | #[allow(rustdoc::broken_intra_doc_links)]
126 | pub struct Program {
127 |     device_name: String,
128 |     queue: CommandQueue,
129 |     context: Context,
130 |     kernels_by_name: HashMap<String, opencl3::kernel::Kernel>,
131 | }
132 | 
133 | impl Program {
134 |     /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
135 |     pub fn device_name(&self) -> &str {
136 |         &self.device_name
137 |     }
138 | 
139 |     /// Creates a program for a specific device from OpenCL source code.
140 |     pub fn from_opencl(device: &Device, src: &str) -> GPUResult<Program> {
141 |         debug!("Creating OpenCL program from source.");
142 |         let cached = utils::cache_path(device, src)?;
143 |         if std::path::Path::exists(&cached) {
144 |             let bin = std::fs::read(cached)?;
145 |             Program::from_binary(device, bin)
146 |         } else {
147 |             let context = Context::from_device(&device.device)?;
148 |             debug!(
149 |                 "Building kernel ({}) from source…",
150 |                 cached.to_string_lossy()
151 |             );
152 |             let mut program = opencl3::program::Program::create_from_source(&context, src)?;
153 |             if let Err(build_error) = program.build(context.devices(), "") {
154 |                 let log = program.get_build_log(context.devices()[0])?;
155 |                 return Err(GPUError::Opencl3 {
156 |                     error: build_error,
157 |                     message: Some(log),
158 |                 });
159 |             }
160 |             debug!(
161 |                 "Building kernel ({}) from source: done.",
162 |                 cached.to_string_lossy()
163 |             );
164 |             let queue = CommandQueue::create_default(&context, 0)?;
165 |             let kernels = opencl3::kernel::create_program_kernels(&program)?;
166 |             let kernels_by_name = kernels
167 |                 .into_iter()
168 |                 .map(|kernel| {
169 |                     let name = kernel.function_name()?;
170 |                     Ok((name, kernel))
171 |                 })
172 |                 .collect::<Result<_, ClError>>()?;
173 |             let prog = Program {
174 |                 device_name: device.name(),
175 |                 queue,
176 |                 context,
177 |                 kernels_by_name,
178 |             };
179 |             let binaries = program
180 |                 .get_binaries()
181 |                 .map_err(GPUError::ProgramInfoNotAvailable)?;
182 |             std::fs::write(cached, binaries[0].clone())?;
183 |             Ok(prog)
184 |         }
185 |     }
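From the caller's side, the compile-or-load flow above looks like this. A hedged sketch (illustrative only; the kernel source is a stand-in, and `opencl_device()` requires the `opencl` feature). The first call compiles from source and writes the binary cache under `~/.rust-gpu-tools/`; later calls for the same device and source load the cached binary via `from_binary`.

```rust
use rust_gpu_tools::opencl;
use rust_gpu_tools::{Device, GPUError};

fn build_program(device: &Device) -> Result<opencl::Program, GPUError> {
    // A hypothetical kernel, purely for illustration.
    let src = r#"
        __kernel void double_values(__global uint *buf) {
            uint i = get_global_id(0);
            buf[i] *= 2;
        }
    "#;
    let cl_device = device.opencl_device().ok_or(GPUError::DeviceNotFound)?;
    opencl::Program::from_opencl(cl_device, src)
}
```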
186 | 
187 |     /// Creates a program for a specific device from a compiled OpenCL binary.
188 |     pub fn from_binary(device: &Device, bin: Vec<u8>) -> GPUResult<Program> {
189 |         debug!("Creating OpenCL program from binary.");
190 |         let context = Context::from_device(&device.device)?;
191 |         let bins = vec![&bin[..]];
192 |         let mut program = unsafe {
193 |             opencl3::program::Program::create_from_binary(&context, context.devices(), &bins)
194 |         }?;
195 |         if let Err(build_error) = program.build(context.devices(), "") {
196 |             let log = program.get_build_log(context.devices()[0])?;
197 |             return Err(GPUError::Opencl3 {
198 |                 error: build_error,
199 |                 message: Some(log),
200 |             });
201 |         }
202 |         let queue = CommandQueue::create_default(&context, 0)?;
203 |         let kernels = opencl3::kernel::create_program_kernels(&program)?;
204 |         let kernels_by_name = kernels
205 |             .into_iter()
206 |             .map(|kernel| {
207 |                 let name = kernel.function_name()?;
208 |                 Ok((name, kernel))
209 |             })
210 |             .collect::<Result<_, ClError>>()?;
211 |         Ok(Program {
212 |             device_name: device.name(),
213 |             queue,
214 |             context,
215 |             kernels_by_name,
216 |         })
217 |     }
218 | 
219 |     /// Creates a new buffer that can be used for input/output with the GPU.
220 |     ///
221 |     /// The `length` is the number of elements to create.
222 |     ///
223 |     /// It is usually used to create buffers that are initialized by the GPU. If you want to
224 |     /// directly transfer data from the host to the GPU, you would use the safe
225 |     /// [`Program::create_buffer_from_slice`] instead.
226 |     ///
227 |     /// # Safety
228 |     ///
229 |     /// This function isn't actually unsafe, it's marked as `unsafe` due to the CUDA version of it,
230 |     /// where it is unsafe. This is done to have symmetry between both APIs.
231 |     pub unsafe fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> {
232 |         assert!(length > 0);
233 |         let mut buff = opencl3::memory::Buffer::create(
234 |             &self.context,
235 |             CL_MEM_READ_WRITE,
236 |             // The input length is the number of elements, but we create a `u8` buffer. Hence the
237 |             // length needs to be the number of bytes.
238 |             length * std::mem::size_of::<T>(),
239 |             ptr::null_mut(),
240 |         )?;
241 | 
242 |         // Write some data right away. This makes a significant performance difference.
243 |         self.queue
244 |             .enqueue_write_buffer(&mut buff, opencl3::types::CL_BLOCKING, 0, &[0u8], &[])?;
245 | 
246 |         Ok(Buffer::<T> {
247 |             buffer: buff,
248 |             length,
249 |             _phantom: std::marker::PhantomData,
250 |         })
251 |     }
252 | 
253 |     /// Creates a new buffer on the GPU and initializes it with the given slice.
254 |     pub fn create_buffer_from_slice<T>(&self, slice: &[T]) -> GPUResult<Buffer<T>> {
255 |         // The underlying buffer is `u8`, hence we need the number of bytes.
256 |         let bytes_len = mem::size_of_val(slice);
257 | 
258 |         let mut buffer = unsafe {
259 |             opencl3::memory::Buffer::create(
260 |                 &self.context,
261 |                 CL_MEM_READ_WRITE,
262 |                 bytes_len,
263 |                 ptr::null_mut(),
264 |             )?
265 |         };
266 |         // Transmuting types is safe as long as the sizes match.
267 |         let bytes = unsafe { std::slice::from_raw_parts(slice.as_ptr() as *const u8, bytes_len) };
268 |         // Write some data right away. This makes a significant performance difference.
269 |         unsafe {
270 |             self.queue
271 |                 .enqueue_write_buffer(&mut buffer, CL_BLOCKING, 0, &[0u8], &[])?;
272 |             self.queue
273 |                 .enqueue_write_buffer(&mut buffer, CL_BLOCKING, 0, bytes, &[])?;
274 |         };
275 | 
276 |         Ok(Buffer::<T> {
277 |             buffer,
278 |             length: slice.len(),
279 |             _phantom: std::marker::PhantomData,
280 |         })
281 |     }
282 | 
283 |     /// Returns a kernel.
284 |     ///
285 |     /// The `global_work_size` does *not* follow the OpenCL definition. It is *not* the total
286 |     /// number of threads. Instead it follows CUDA's definition and is the number of
287 |     /// `local_work_size` sized thread groups. So the total number of threads is
288 |     /// `global_work_size * local_work_size`.
289 |     pub fn create_kernel(
290 |         &self,
291 |         name: &str,
292 |         global_work_size: usize,
293 |         local_work_size: usize,
294 |     ) -> GPUResult<Kernel> {
295 |         let kernel = self
296 |             .kernels_by_name
297 |             .get(name)
298 |             .ok_or_else(|| GPUError::KernelNotFound(name.to_string()))?;
299 |         let mut builder = ExecuteKernel::new(kernel);
300 |         builder.set_global_work_size(global_work_size * local_work_size);
301 |         builder.set_local_work_size(local_work_size);
302 |         Ok(Kernel {
303 |             builder,
304 |             queue: &self.queue,
305 |             num_local_buffers: 0,
306 |         })
307 |     }
308 | 
309 |     /// Puts data from an existing buffer onto the GPU.
310 |     pub fn write_from_buffer<T>(
311 |         &self,
312 |         // From Rust's perspective, this buffer doesn't need to be mutable. But the sub-buffer is
313 |         // mutating the buffer, so it really should be.
314 |         buffer: &mut Buffer<T>,
315 |         data: &[T],
316 |     ) -> GPUResult<()> {
317 |         assert!(data.len() <= buffer.length, "Buffer is too small");
318 | 
319 |         // It is safe as long as the sizes match.
320 |         let bytes = unsafe {
321 |             std::slice::from_raw_parts(data.as_ptr() as *const u8, mem::size_of_val(data))
322 |         };
323 |         unsafe {
324 |             self.queue
325 |                 .enqueue_write_buffer(&mut buffer.buffer, CL_BLOCKING, 0, bytes, &[])?;
326 |         }
327 |         Ok(())
328 |     }
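A hedged round-trip sketch of the buffer API above (illustrative only): move host data to the GPU, then read it back into a host slice. In real use a kernel launch would sit between the two transfers.

```rust
use rust_gpu_tools::opencl::Program;
use rust_gpu_tools::GPUError;

fn round_trip(program: &Program) -> Result<Vec<u32>, GPUError> {
    let input = vec![1u32, 2, 3, 4];
    // Host -> GPU: allocate and initialize in one step.
    let mut buffer = program.create_buffer_from_slice(&input)?;
    // Overwriting is also possible; the data must fit into the buffer.
    program.write_from_buffer(&mut buffer, &input)?;
    // GPU -> host: the target slice must not exceed the buffer's length.
    let mut output = vec![0u32; input.len()];
    program.read_into_buffer(&buffer, &mut output)?;
    Ok(output)
}
```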
329 | 
330 |     /// Reads data from the GPU into an existing buffer.
331 |     pub fn read_into_buffer<T>(&self, buffer: &Buffer<T>, data: &mut [T]) -> GPUResult<()> {
332 |         assert!(data.len() <= buffer.length, "Buffer is too small");
333 | 
334 |         // It is safe as long as the sizes match.
335 |         let bytes = unsafe {
336 |             std::slice::from_raw_parts_mut(data.as_mut_ptr() as *mut u8, mem::size_of_val(data))
337 |         };
338 |         unsafe {
339 |             self.queue
340 |                 .enqueue_read_buffer(&buffer.buffer, CL_BLOCKING, 0, bytes, &[])?;
341 |         };
342 |         Ok(())
343 |     }
344 | 
345 |     /// Run some code in the context of the program.
346 |     ///
347 |     /// It takes the program as a parameter, so that we can use the same function body for both
348 |     /// the OpenCL and the CUDA code path. The only difference is the type of the program.
349 |     pub fn run<F, R, E, A>(&self, fun: F, arg: A) -> Result<R, E>
350 |     where
351 |         F: FnOnce(&Self, A) -> Result<R, E>,
352 |         E: From<GPUError>,
353 |     {
354 |         fun(self, arg)
355 |     }
356 | }
357 | 
358 | /// Abstraction for kernel arguments.
359 | ///
360 | /// The kernel doesn't support being called with custom types, hence some conversion might be
361 | /// needed. This trait enables automatic conversions, so that any type implementing it can be
362 | /// passed into a [`Kernel`].
363 | pub trait KernelArgument {
364 |     /// Apply the kernel argument to the kernel.
365 |     fn push(&self, kernel: &mut Kernel);
366 | }
367 | 
368 | impl<T> KernelArgument for Buffer<T> {
369 |     fn push(&self, kernel: &mut Kernel) {
370 |         unsafe {
371 |             kernel.builder.set_arg(&self.buffer);
372 |         }
373 |     }
374 | }
375 | 
376 | impl KernelArgument for i32 {
377 |     fn push(&self, kernel: &mut Kernel) {
378 |         unsafe {
379 |             kernel.builder.set_arg(self);
380 |         }
381 |     }
382 | }
383 | 
384 | impl KernelArgument for u32 {
385 |     fn push(&self, kernel: &mut Kernel) {
386 |         unsafe {
387 |             kernel.builder.set_arg(self);
388 |         }
389 |     }
390 | }
391 | 
392 | impl<T> KernelArgument for LocalBuffer<T> {
393 |     fn push(&self, kernel: &mut Kernel) {
394 |         unsafe {
395 |             kernel
396 |                 .builder
397 |                 .set_arg_local_buffer(self.length * std::mem::size_of::<T>());
398 |         }
399 |         kernel.num_local_buffers += 1;
400 |     }
401 | }
402 | 
403 | /// A kernel that can be executed.
404 | #[derive(Debug)]
405 | pub struct Kernel<'a> {
406 |     /// The underlying kernel builder.
407 |     pub builder: ExecuteKernel<'a>,
408 |     queue: &'a CommandQueue,
409 |     /// There can only be a single [`LocalBuffer`] as parameter due to CUDA restrictions. This
410 |     /// counts them, so that there can be an error if there are more `LocalBuffer` arguments.
411 |     num_local_buffers: u8,
412 | }
413 | 
414 | impl<'a> Kernel<'a> {
415 |     /// Set a kernel argument.
416 |     ///
417 |     /// The arguments must live as long as the kernel. Hence make sure they are not dropped as
418 |     /// long as the kernel is in use.
419 |     ///
420 |     /// Example where this behaviour is enforced and leads to a compile-time error:
421 |     ///
422 |     /// ```compile_fail
423 |     /// use rust_gpu_tools::opencl::Program;
424 |     ///
425 |     /// fn would_break(program: &Program) {
426 |     ///     let data = vec![1, 2, 3, 4];
427 |     ///     let buffer = program.create_buffer_from_slice(&data).unwrap();
428 |     ///     let kernel = program.create_kernel("my_kernel", 4, 256).unwrap();
429 |     ///     let kernel = kernel.arg(&buffer);
430 |     ///     // This drop wouldn't error if the arguments weren't bound to the kernel's lifetime.
431 |     ///     drop(buffer);
432 |     ///     kernel.run().unwrap();
433 |     /// }
434 |     /// ```
435 |     pub fn arg<T: KernelArgument>(mut self, t: &'a T) -> Self {
436 |         t.push(&mut self);
437 |         self
438 |     }
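Putting `create_kernel`, `arg`, and `run` together: a hedged launch sketch (illustrative; it assumes the program contains a hypothetical kernel `double_values(__global uint *buf, uint n)`). Arguments are pushed in the order the kernel declares them.

```rust
use rust_gpu_tools::opencl::Program;
use rust_gpu_tools::GPUError;

fn launch_double(program: &Program) -> Result<Vec<u32>, GPUError> {
    let input = vec![1u32, 2, 3, 4];
    let buffer = program.create_buffer_from_slice(&input)?;
    let n = input.len() as u32;
    // One work group of four threads.
    let kernel = program.create_kernel("double_values", 1, 4)?;
    kernel.arg(&buffer).arg(&n).run()?;
    let mut output = vec![0u32; input.len()];
    program.read_into_buffer(&buffer, &mut output)?;
    Ok(output)
}
```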
439 | 
440 |     /// Actually run the kernel.
441 |     pub fn run(mut self) -> GPUResult<()> {
442 |         if self.num_local_buffers > 1 {
443 |             return Err(GPUError::Generic(
444 |                 "There cannot be more than one `LocalBuffer`.".to_string(),
445 |             ));
446 |         }
447 |         unsafe {
448 |             self.builder.enqueue_nd_range(self.queue)?;
449 |         }
450 |         Ok(())
451 |     }
452 | }
453 | 
--------------------------------------------------------------------------------
/src/opencl/utils.rs:
--------------------------------------------------------------------------------
1 | use std::convert::TryFrom;
2 | 
3 | use log::{debug, warn};
4 | use opencl3::device::CL_UUID_SIZE_KHR;
5 | use sha2::{Digest, Sha256};
6 | 
7 | use crate::device::{DeviceUuid, PciId, Vendor};
8 | use crate::error::{GPUError, GPUResult};
9 | use crate::opencl::Device;
10 | 
11 | /// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID.
12 | ///
13 | /// It is the first two identifiers of e.g. `lspci`:
14 | ///
15 | /// ```text
16 | /// 4e:00.0 VGA compatible controller
17 | /// ││ └└-- Device ID
18 | /// └└----- Bus ID
19 | /// ```
20 | fn get_pci_id(device: &opencl3::device::Device) -> GPUResult<PciId> {
21 |     let vendor = Vendor::try_from(device.vendor_id()?)?;
22 |     let id = match vendor {
23 |         Vendor::Amd => {
24 |             let topo = device.topology_amd()?;
25 |             let bus_id = topo.bus as u16;
26 |             let device_id = topo.device as u16;
27 |             (bus_id << 8) | device_id
28 |         }
29 |         Vendor::Intel => {
30 |             let pcibusinfo = device.pcibusinfokhr_intel()?;
31 |             let bus_id = pcibusinfo.pci_bus as u16;
32 |             let device_id = pcibusinfo.pci_device as u16;
33 |             (bus_id << 8) | device_id
34 |         }
35 |         Vendor::Nvidia => {
36 |             let bus_id = device.pci_bus_id_nv()? as u16;
37 |             let device_id = device.pci_slot_id_nv()? as u16;
38 |             (bus_id << 8) | device_id
39 |         }
40 |     };
41 |     Ok(id.into())
42 | }
43 | 
44 | fn get_uuid(device: &opencl3::device::Device) -> GPUResult<DeviceUuid> {
45 |     let uuid = device.uuid_khr()?;
46 |     Ok(uuid.into())
47 | }
48 | 
49 | pub fn cache_path(device: &Device, cl_source: &str) -> std::io::Result<std::path::PathBuf> {
50 |     let path = home::home_dir().unwrap().join(".rust-gpu-tools");
51 |     if !std::path::Path::exists(&path) {
52 |         std::fs::create_dir(&path)?;
53 |     }
54 |     let mut hasher = Sha256::new();
55 |     hasher.update(device.name.as_bytes());
56 |     hasher.update(u16::from(device.pci_id).to_be_bytes());
57 |     hasher.update(<[u8; CL_UUID_SIZE_KHR]>::from(
58 |         device.uuid.unwrap_or_default(),
59 |     ));
60 |     hasher.update(cl_source.as_bytes());
61 |     let filename = format!("{}.bin", hex::encode(hasher.finalize()));
62 |     Ok(path.join(filename))
63 | }
64 | 
65 | fn get_memory(d: &opencl3::device::Device) -> GPUResult<u64> {
66 |     d.global_mem_size()
67 |         .map_err(GPUError::DeviceInfoNotAvailable)
68 | }
69 | 
70 | fn get_compute_units(d: &opencl3::device::Device) -> GPUResult<u32> {
71 |     d.max_compute_units()
72 |         .map_err(GPUError::DeviceInfoNotAvailable)
73 | }
74 | 
75 | /// Get the major and minor version of the compute capability (only available on Nvidia GPUs).
76 | fn get_compute_capability(d: &opencl3::device::Device) -> GPUResult<(u32, u32)> {
77 |     let major = d
78 |         .compute_capability_major_nv()
79 |         .map_err(GPUError::DeviceInfoNotAvailable)?;
80 |     let minor = d
81 |         .compute_capability_minor_nv()
82 |         .map_err(GPUError::DeviceInfoNotAvailable)?;
83 |     Ok((major, minor))
84 | }
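The bus/device packing used by `get_pci_id` above is a plain 16-bit concatenation. A small editorial illustration, mirroring the `lspci` example `4e:00.0`:

```rust
fn pack_pci_id(bus_id: u16, device_id: u16) -> u16 {
    // Bus ID in the high byte, Device ID in the low byte.
    (bus_id << 8) | device_id
}

fn main() {
    // Mirrors the `lspci` example above: bus 0x4e, device 0x00.
    assert_eq!(pack_pci_id(0x4e, 0x00), 0x4e00);
    // And the "01:00" <-> 0x0100 round-trip from the device.rs tests.
    assert_eq!(pack_pci_id(0x01, 0x00), 0x0100);
}
```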
85 | 
86 | /// Get a list of all available and supported devices.
87 | ///
88 | /// If there is a failure retrieving a device, it won't lead to a hard error, but an error will be
89 | /// logged and the corresponding device won't be available.
90 | pub(crate) fn build_device_list() -> Vec<Device> {
91 |     let mut all_devices = Vec::new();
92 |     let platforms: Vec<_> = opencl3::platform::get_platforms().unwrap_or_default();
93 | 
94 |     let mut devices_without_pci_id = Vec::new();
95 | 
96 |     for platform in platforms.iter() {
97 |         let devices = platform
98 |             .get_devices(opencl3::device::CL_DEVICE_TYPE_GPU)
99 |             .map_err(Into::into)
100 |             .and_then(|devices| {
101 |                 devices
102 |                     .into_iter()
103 |                     .map(opencl3::device::Device::new)
104 |                     .filter_map(|device| {
105 |                         if let Ok(vendor_id) = device.vendor_id() {
106 |                             // Only use devices from the accepted vendors ...
107 |                             let vendor = Vendor::try_from(vendor_id).ok()?;
108 |                             // ... which are available.
109 |                             if !device.available().unwrap_or(false) {
110 |                                 return None;
111 |                             }
112 | 
113 |                             // `filter_map()` needs to return errors wrapped in an `Option`, hence
114 |                             // early returns with the question mark operator cannot be used.
115 |                             let name = match device.name() {
116 |                                 Ok(name) => name,
117 |                                 Err(error) => return Some(Err(error.into())),
118 |                             };
119 |                             let memory = match get_memory(&device) {
120 |                                 Ok(memory) => memory,
121 |                                 Err(error) => return Some(Err(error)),
122 |                             };
123 |                             let compute_units = match get_compute_units(&device) {
124 |                                 Ok(units) => units,
125 |                                 Err(error) => return Some(Err(error)),
126 |                             };
127 |                             let compute_capability = get_compute_capability(&device).ok();
128 |                             let uuid = get_uuid(&device).ok();
129 | 
130 |                             // If a device doesn't have a PCI-ID, add it later to the list of
131 |                             // devices with a fake PCI-ID.
132 |                             match get_pci_id(&device) {
133 |                                 Ok(pci_id) => {
134 |                                     return Some(Ok(Device {
135 |                                         vendor,
136 |                                         name,
137 |                                         memory,
138 |                                         compute_units,
139 |                                         compute_capability,
140 |                                         pci_id,
141 |                                         uuid,
142 |                                         device,
143 |                                     }));
144 |                                 }
145 |                                 Err(_) => {
146 |                                     // Use a temporary PCI-ID and replace it later with a
147 |                                     // non-colliding one.
148 |                                     let pci_id = PciId::from(0);
149 |                                     devices_without_pci_id.push(Device {
150 |                                         vendor,
151 |                                         name,
152 |                                         memory,
153 |                                         compute_units,
154 |                                         compute_capability,
155 |                                         pci_id,
156 |                                         uuid,
157 |                                         device,
158 |                                     });
159 |                                     return None;
160 |                                 }
161 |                             };
162 |                         }
163 |                         None
164 |                     })
165 |                     .collect::<GPUResult<Vec<_>>>()
166 |             });
167 |         match devices {
168 |             Ok(mut devices) => {
169 |                 all_devices.append(&mut devices);
170 |             }
171 |             Err(err) => {
172 |                 let platform_name = platform
173 |                     .name()
174 |                     .unwrap_or_else(|_| "<unknown>".to_string());
175 |                 warn!(
176 |                     "Unable to retrieve devices for {}: {:?}",
177 |                     platform_name, err
178 |                 );
179 |             }
180 |         }
181 |     }
182 | 
183 |     // Laptops might have an integrated GPU. Such devices might have neither a PCI-ID, nor a UUID.
184 |     // As those devices are used for development and not for production use, it's good enough to
185 |     // provide a workaround which doesn't add much complexity to the code. We use a fake PCI-ID
186 |     // instead, which is generated by enumerating the available devices. In order to make that
187 |     // case easier to spot when debugging issues, a starting number which is pleasant to the human
188 |     // eye was chosen, one that works in both decimal and hexadecimal (4660 == 0x1234).
189 |     let mut enumerated_device: u16 = 4660;
190 |     for mut device in devices_without_pci_id.into_iter() {
191 |         // Make sure that no device has that actual PCI-ID
192 |         while all_devices
193 |             .iter()
194 |             .any(|d| d.pci_id() == enumerated_device.into())
195 |         {
196 |             enumerated_device += 1;
197 |         }
198 |         device.pci_id = PciId::from(enumerated_device);
199 |         enumerated_device += 1;
200 |         all_devices.push(device);
201 |     }
202 | 
203 |     debug!("loaded devices: {:?}", all_devices);
204 |     all_devices
205 | }
--------------------------------------------------------------------------------
/src/program.rs:
--------------------------------------------------------------------------------
1 | #[cfg(feature = "cuda")]
2 | use crate::cuda;
3 | use crate::error::GPUError;
4 | #[cfg(feature = "opencl")]
5 | use crate::opencl;
6 | 
7 | /// Abstraction for running programs on CUDA or OpenCL.
8 | pub enum Program {
9 |     /// CUDA program.
10 |     #[cfg(feature = "cuda")]
11 |     Cuda(cuda::Program),
12 |     /// OpenCL program.
13 |     #[cfg(feature = "opencl")]
14 |     Opencl(opencl::Program),
15 | }
16 | 
17 | impl Program {
18 |     /// Run some code in the context of the program.
19 |     ///
20 |     /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
21 |     /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
22 |     /// same code can be used to run on either of them. The only difference is the type of the
23 |     /// `Program`.
24 |     ///
25 |     /// You need to pass in two closures, one for OpenCL, one for CUDA, both get their
26 |     /// corresponding program type as parameter. For convenience there is the
27 |     /// [`crate::program_closures`] macro defined, which can help reduce code duplication by
28 |     /// creating two closures out of a single one.
29 |     ///
30 |     /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
31 |     /// one of them is disabled, you still need to pass in two closures. This way the API stays
32 |     /// the same, but you can disable things at compile-time.
33 |     ///
34 |     /// The second parameter is a single arbitrary argument, which will be passed on into the
35 |     /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
36 |     /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
37 |     /// is used at most once.
38 |     #[cfg(all(feature = "cuda", feature = "opencl"))]
39 |     pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
40 |     where
41 |         E: From<GPUError>,
42 |         F1: FnOnce(&cuda::Program, A) -> Result<R, E>,
43 |         F2: FnOnce(&opencl::Program, A) -> Result<R, E>,
44 |     {
45 |         match self {
46 |             Self::Cuda(program) => program.run(fun.0, arg),
47 |             Self::Opencl(program) => program.run(fun.1, arg),
48 |         }
49 |     }
50 | 
51 |     /// Run some code in the context of the program.
52 |     ///
53 |     /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
54 |     /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
55 |     /// same code can be used to run on either of them. The only difference is the type of the
56 |     /// `Program`.
57 |     ///
58 |     /// You need to pass in two closures, one for OpenCL, one for CUDA, both get their
59 |     /// corresponding program type as parameter. For convenience there is the [`program_closures`]
60 |     /// macro defined, which can help reduce code duplication by creating two closures out of
61 |     /// a single one.
62 |     ///
63 |     /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
64 |     /// one of them is disabled, you still need to pass in two closures. This way the API stays
65 |     /// the same, but you can disable things at compile-time.
66 |     ///
67 |     /// The second parameter is a single arbitrary argument, which will be passed on into the
68 |     /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
69 |     /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
70 |     /// is used at most once.
71 |     #[cfg(all(feature = "cuda", not(feature = "opencl")))]
72 |     pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
73 |     where
74 |         E: From<GPUError>,
75 |         F1: FnOnce(&cuda::Program, A) -> Result<R, E>,
76 |     {
77 |         match self {
78 |             Self::Cuda(program) => program.run(fun.0, arg),
79 |         }
80 |     }
81 | 
82 |     /// Run some code in the context of the program.
83 |     ///
84 |     /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
85 |     /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
86 |     /// same code can be used to run on either of them. The only difference is the type of the
87 |     /// `Program`.
88 |     ///
89 |     /// You need to pass in two closures, one for OpenCL, one for CUDA, both get their
90 |     /// corresponding program type as parameter. For convenience there is the [`program_closures`]
91 |     /// macro defined, which can help reduce code duplication by creating two closures out of
92 |     /// a single one.
93 |     ///
94 |     /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
95 |     /// one of them is disabled, you still need to pass in two closures. This way the API stays
96 |     /// the same, but you can disable things at compile-time.
97 |     ///
98 |     /// The second parameter is a single arbitrary argument, which will be passed on into the
99 |     /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
100 |     /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
101 |     /// is used at most once.
102 |     #[cfg(all(not(feature = "cuda"), feature = "opencl"))]
103 |     pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
104 |     where
105 |         E: From<GPUError>,
106 |         F2: FnOnce(&opencl::Program, A) -> Result<R, E>,
107 |     {
108 |         match self {
109 |             Self::Opencl(program) => program.run(fun.1, arg),
110 |         }
111 |     }
112 | 
113 |     /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
114 |     pub fn device_name(&self) -> &str {
115 |         match self {
116 |             #[cfg(feature = "cuda")]
117 |             Self::Cuda(program) => program.device_name(),
118 |             #[cfg(feature = "opencl")]
119 |             Self::Opencl(program) => program.device_name(),
120 |         }
121 |     }
122 | }
123 | 
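A hedged end-to-end sketch of the dispatch above, using the `program_closures!` macro documented next (illustrative; it assumes a `Program` was constructed beforehand). The single closure body is expanded into a CUDA variant and an OpenCL variant, and `run` invokes the one matching the program:

```rust
use rust_gpu_tools::{program_closures, GPUError, Program};

fn describe_run(program: &Program, label: &str) -> Result<String, GPUError> {
    // One closure body, expanded into a (CUDA, OpenCL) closure pair.
    let closures = program_closures!(|program, label: &str| -> Result<String, GPUError> {
        Ok(format!("{}: ran on {}", label, program.device_name()))
    });
    program.run(closures, label)
}
```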
124 | 
125 | /// Creates two closures, one for CUDA and one for OpenCL, from the given one.
126 | ///
127 | /// This macro is used to be able to interact with rust-gpu-tools with unified code for both
128 | /// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program` and
129 | /// it will be mapped to &[`cuda::Program`] and &[`opencl::Program`].
130 | ///
131 | /// The second parameter is a single arbitrary argument, which will be passed on into the closure.
132 | /// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
133 | /// shared between closures, hence we pass it on, so that the compiler knows that it is used at
134 | /// most once.
135 | ///
136 | /// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
137 | /// thing and not specify one of them if it is appropriate.
138 | ///
139 | /// ### Example
140 | ///
141 | /// ```
142 | /// use rust_gpu_tools::{cuda, opencl, program_closures};
143 | ///
144 | /// let closures = program_closures!(|program, arg: u8| -> bool {
145 | ///     true
146 | /// });
147 | ///
148 | /// // Generates
149 | /// let closures = (
150 | ///     |program: &cuda::Program, arg: u8| { true },
151 | ///     |program: &opencl::Program, arg: u8| { true },
152 | /// );
153 | ///
154 | /// // If e.g. the `cuda` feature is disabled, it would generate
155 | /// let closures_without_cuda = (
156 | ///     (),
157 | ///     |program: &opencl::Program, arg: u8| { true },
158 | /// );
159 | /// ```
160 | #[cfg(all(feature = "cuda", feature = "opencl"))]
161 | #[macro_export]
162 | macro_rules! program_closures {
163 |     // Additional argument without a type
164 |     (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
165 |         (
166 |             |$program: &$crate::cuda::Program, $arg| -> $ret { $body },
167 |             |$program: &$crate::opencl::Program, $arg| -> $ret { $body },
168 |         )
169 |     };
170 |     // Additional argument with a type
171 |     (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
172 |         (
173 |             |$program: &$crate::cuda::Program, $arg: $arg_type| -> $ret { $body },
174 |             |$program: &$crate::opencl::Program, $arg: $arg_type| -> $ret { $body },
175 |         )
176 |     };
177 | }
178 | 
179 | /// Creates two closures, one for CUDA and one for OpenCL, from the given one.
180 | ///
181 | /// This macro is used to be able to interact with rust-gpu-tools with unified code for both
182 | /// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program` and
183 | /// it will be mapped to &[`cuda::Program`] and &[`opencl::Program`].
184 | ///
185 | /// The second parameter is a single arbitrary argument, which will be passed on into the closure.
186 | /// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
187 | /// shared between closures, hence we pass it on, so that the compiler knows that it is used at
188 | /// most once.
189 | ///
190 | /// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
191 | /// thing and not specify one of them if it is appropriate.
192 | ///
193 | /// ### Example
194 | ///
195 | /// ```
196 | /// use rust_gpu_tools::{cuda, opencl, program_closures};
197 | ///
198 | /// let closures = program_closures!(|program, arg: u8| -> bool {
199 | ///     true
200 | /// });
201 | ///
202 | /// // Generates
203 | /// let closures = (
204 | ///     |program: &cuda::Program, arg: u8| { true },
205 | ///     |program: &opencl::Program, arg: u8| { true },
206 | /// );
207 | ///
208 | /// // If e.g. the `cuda` feature is disabled, it would generate
209 | /// let closures_without_cuda = (
210 | ///     (),
211 | ///     |program: &opencl::Program, arg: u8| { true },
212 | /// );
213 | /// ```
214 | #[macro_export]
215 | #[cfg(all(feature = "cuda", not(feature = "opencl")))]
216 | macro_rules! program_closures {
217 |     // Additional argument without a type
218 |     (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
219 |         (
220 |             |$program: &$crate::cuda::Program, $arg| -> $ret { $body },
221 |             (),
222 |         )
223 |     };
224 |     // Additional argument with a type
225 |     (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
226 |         (
227 |             |$program: &$crate::cuda::Program, $arg: $arg_type| -> $ret { $body },
228 |             (),
229 |         )
230 |     };
231 | }
232 | 
233 | /// Creates two closures, one for CUDA and one for OpenCL, from the given one.
234 | ///
235 | /// This macro is used to be able to interact with rust-gpu-tools with unified code for both
236 | /// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program` and
237 | /// it will be mapped to &[`cuda::Program`] and &[`opencl::Program`].
238 | ///
239 | /// The second parameter is a single arbitrary argument, which will be passed on into the closure.
240 | /// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
241 | /// shared between closures, hence we pass it on, so that the compiler knows that it is used at
242 | /// most once.
243 | ///
244 | /// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
245 | /// thing and not specify one of them if it is appropriate.
246 | ///
247 | /// ### Example
248 | ///
249 | /// ```
250 | /// use rust_gpu_tools::{cuda, opencl, program_closures};
251 | ///
252 | /// let closures = program_closures!(|program, arg: u8| -> bool {
253 | ///     true
254 | /// });
255 | ///
256 | /// // Generates
257 | /// let closures = (
258 | ///     |program: &cuda::Program, arg: u8| { true },
259 | ///     |program: &opencl::Program, arg: u8| { true },
260 | /// );
261 | ///
262 | /// // If e.g. the `cuda` feature is disabled, it would generate
263 | /// let closures_without_cuda = (
264 | ///     (),
265 | ///     |program: &opencl::Program, arg: u8| { true },
266 | /// );
267 | /// ```
268 | #[macro_export]
269 | #[cfg(all(not(feature = "cuda"), feature = "opencl"))]
270 | macro_rules! program_closures {
271 |     // Additional argument without a type
272 |     (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
273 |         ((), |$program: &$crate::opencl::Program, $arg| -> $ret {
274 |             $body
275 |         })
276 |     };
277 |     // Additional argument with a type
278 |     (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
279 |         (
280 |             (),
281 |             |$program: &$crate::opencl::Program, $arg: $arg_type| -> $ret { $body },
282 |         )
283 |     };
284 | }
--------------------------------------------------------------------------------