├── src ├── npp │ ├── tests │ │ ├── sync │ │ │ ├── mod.rs │ │ │ └── memory.rs │ │ ├── mod.rs │ │ ├── memory.rs │ │ └── image.rs │ ├── mod.rs │ ├── error.rs │ ├── constant_border.rs │ ├── stream.rs │ ├── copy_constant_border.rs │ ├── region.rs │ ├── resize.rs │ ├── remap.rs │ └── resize_batch.rs ├── ffi │ ├── includes.rs │ ├── memory │ │ ├── mod.rs │ │ ├── host.rs │ │ ├── device.rs │ │ └── device2d.rs │ ├── npp │ │ ├── includes.rs │ │ ├── mod.rs │ │ ├── copy_constant_border.rs │ │ ├── context.rs │ │ ├── resize.rs │ │ ├── remap.rs │ │ └── resize_batch.rs │ ├── mod.rs │ ├── error.rs │ ├── device.rs │ ├── ptr.rs │ └── stream.rs ├── memory │ ├── mod.rs │ ├── host.rs │ ├── device.rs │ └── device2d.rs ├── runtime │ ├── mod.rs │ ├── work.rs │ ├── future.rs │ ├── thread_local.rs │ └── execution.rs ├── lib.rs ├── error.rs ├── stream.rs └── device.rs ├── .github └── workflows │ └── ci.yaml ├── Cargo.toml ├── LICENSE-MIT ├── tests └── functions_side_effects_test.rs ├── README.md └── LICENSE-APACHE /src/npp/tests/sync/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod memory; 2 | -------------------------------------------------------------------------------- /src/npp/tests/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod image; 2 | pub mod memory; 3 | pub mod sync; 4 | -------------------------------------------------------------------------------- /src/ffi/includes.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | cpp! {{ 4 | #include <cstdint> 5 | }} 6 | 7 | cpp! {{ 8 | #include <cuda_runtime.h> 9 | }} 10 | -------------------------------------------------------------------------------- /src/ffi/memory/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod device; 2 | pub mod device2d; 3 | pub mod host; 4 | 5 | pub use device::DeviceBuffer; 6 | pub use device2d::DeviceBuffer2D; 7 | pub use host::HostBuffer; 8 | -------------------------------------------------------------------------------- /src/memory/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod device; 2 | pub mod device2d; 3 | pub mod host; 4 | 5 | pub use device::DeviceBuffer; 6 | pub use device2d::DeviceBuffer2D; 7 | pub use host::HostBuffer; 8 | -------------------------------------------------------------------------------- /src/runtime/mod.rs: -------------------------------------------------------------------------------- 1 | mod execution; 2 | mod future; 3 | mod thread_local; 4 | mod work; 5 | 6 | pub use future::{Future, SynchronizeFuture}; 7 | pub use thread_local::enqueue_decoupled; 8 | -------------------------------------------------------------------------------- /src/ffi/npp/includes.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | cpp! {{ 4 | #include <nppcore.h> 5 | }} 6 | 7 | cpp! {{ 8 | #include <nppdefs.h> 9 | }} 10 | 11 | cpp!
{{ 12 | #include <nppi_data_exchange_and_initialization.h> 13 | #include <nppi_geometry_transforms.h> 14 | }} 15 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![recursion_limit = "256"] 2 | 3 | pub mod device; 4 | pub mod error; 5 | pub mod ffi; 6 | pub mod memory; 7 | pub mod runtime; 8 | pub mod stream; 9 | 10 | #[cfg(feature = "npp")] 11 | pub mod npp; 12 | 13 | pub use device::{num_devices, Device, DeviceId, MemoryInfo}; 14 | pub use memory::{DeviceBuffer, DeviceBuffer2D, HostBuffer}; 15 | pub use stream::Stream; 16 | 17 | pub use error::Error; 18 | -------------------------------------------------------------------------------- /src/npp/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod constant_border; 2 | pub mod copy_constant_border; 3 | pub mod error; 4 | pub mod region; 5 | pub mod remap; 6 | pub mod resize; 7 | pub mod stream; 8 | 9 | #[cfg(feature = "npp-unstable")] 10 | pub mod resize_batch; 11 | 12 | pub use constant_border::ConstantBorder; 13 | pub use copy_constant_border::copy_constant_border; 14 | pub use error::Error; 15 | pub use region::Region; 16 | pub use remap::remap; 17 | pub use resize::resize; 18 | pub use stream::Stream; 19 | 20 | #[cfg(feature = "npp-unstable")] 21 | pub use resize_batch::resize_batch; 22 | 23 | #[cfg(test)] 24 | pub mod tests; 25 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | 14 | lints: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v3 20 | 21 | - name: Setup Rust 22 | uses: dtolnay/rust-toolchain@stable 23 | with: 24 | toolchain: stable 25 | components: rustfmt, clippy 26 | 27 | - name: Rustfmt 28 | run: cargo fmt --all -- --check 29 | 30 | # - name: Clippy 31 | # run: cargo clippy --locked --all --all-features -- -D warnings 32 | -------------------------------------------------------------------------------- /src/npp/error.rs: -------------------------------------------------------------------------------- 1 | /// An error that occurred in NPP. 2 | #[derive(Debug, Clone)] 3 | pub enum Error { 4 | /// Error code as reported by NPP. 5 | /// 6 | /// [NPP documentation](https://docs.nvidia.com/cuda/npp/group__typedefs__npp.html#ga1105a17b5e76381583c46ecd6a60fe21) 7 | Npp(i32), 8 | /// Error in CUDA backend. 9 | /// 10 | /// Refer to [`crate::Error`].
11 | Cuda(crate::Error), 12 | } 13 | 14 | impl std::fmt::Display for Error { 15 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 16 | match self { 17 | Error::Cuda(err) => write!(f, "{err}"), 18 | Error::Npp(error_code) => write!(f, "error code produced by NPP: {error_code}"), 19 | } 20 | } 21 | } 22 | 23 | impl std::error::Error for Error {} 24 | 25 | impl From<crate::Error> for Error { 26 | #[inline] 27 | fn from(err: crate::Error) -> Self { 28 | Error::Cuda(err) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/ffi/mod.rs: -------------------------------------------------------------------------------- 1 | mod includes; 2 | 3 | pub mod device; 4 | pub mod error; 5 | pub mod memory; 6 | pub mod ptr; 7 | pub mod stream; 8 | 9 | #[cfg(feature = "npp")] 10 | pub mod npp; 11 | 12 | /// Convenience macro for turning a CUDA error code into a `std::result::Result`. 13 | /// 14 | /// # Usage 15 | /// 16 | /// There are two possible uses of the macro: 17 | /// 18 | /// (1) Shorthand to return `Ok(something)` or a CUDA error: 19 | /// 20 | /// ```ignore 21 | /// result!(code, return_value); 22 | /// ``` 23 | /// 24 | /// (2) Shorthand to return `Ok(())` or a CUDA error: 25 | /// 26 | /// ```ignore 27 | /// result!(code) 28 | /// ``` 29 | macro_rules! result { 30 | ($code:expr, $ok:expr) => { 31 | if $code == 0 { 32 | Ok($ok) 33 | } else { 34 | Err($crate::error::Error::Cuda($code)) 35 | } 36 | }; 37 | ($code:expr) => { 38 | result!($code, ()) 39 | }; 40 | } 41 | 42 | use result; 43 | -------------------------------------------------------------------------------- /src/ffi/npp/mod.rs: -------------------------------------------------------------------------------- 1 | mod includes; 2 | 3 | pub mod context; 4 | pub mod copy_constant_border; 5 | pub mod remap; 6 | pub mod resize; 7 | 8 | #[cfg(feature = "npp-unstable")] 9 | pub mod resize_batch; 10 | 11 | /// Convenience macro for turning an NPP error code into a `std::result::Result`. 12 | /// 13 | /// # Usage 14 | /// 15 | /// There are two possible uses of the macro: 16 | /// 17 | /// (1) Shorthand to return `Ok(something)` or an NPP error: 18 | /// 19 | /// ```ignore 20 | /// result!(code, return_value); 21 | /// ``` 22 | /// 23 | /// (2) Shorthand to return `Ok(())` or an NPP error: 24 | /// 25 | /// ```ignore 26 | /// result!(code) 27 | /// ``` 28 | macro_rules! result { 29 | ($code:expr, $ok:expr) => { 30 | if $code == 0 { 31 | Ok($ok) 32 | } else { 33 | Err($crate::npp::error::Error::Npp($code)) 34 | } 35 | }; 36 | ($code:expr) => { 37 | result!($code, ()) 38 | }; 39 | } 40 | 41 | use result; 42 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "async-cuda" 3 | description = "Async CUDA for Rust."
4 | keywords = ["async", "nvidia", "cuda", "gpu", "npp"] 5 | readme = "README.md" 6 | categories = ["asynchronous"] 7 | edition = "2021" 8 | version = "0.6.1" 9 | authors = ["Oddity.ai Developers <hello@oddity.ai>"] 10 | repository = "https://github.com/oddity-ai/async-cuda" 11 | license = "MIT OR Apache-2.0" 12 | 13 | [dependencies] 14 | cpp = "0.5" 15 | ndarray = { version = "0.16", optional = true } 16 | once_cell = "1.17" 17 | 18 | [dev-dependencies] 19 | futures = { version = "0.3", default-features = false, features = ["std"] } 20 | tokio = { version = "1", default-features = false, features = [ 21 | "macros", 22 | "test-util", 23 | "time", 24 | ] } 25 | tokio-test = { version = "0.4" } 26 | 27 | [build-dependencies] 28 | cpp_build = "0.5" 29 | 30 | [features] 31 | npp = [] 32 | npp-unstable = [] 33 | 34 | [package.metadata.docs.rs] 35 | rustc-args = ["--cfg", "feature=\"docs-only\""] 36 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::error::error_description; 2 | 3 | /// An error that occurred during a CUDA operation. 4 | #[derive(Debug, Clone)] 5 | pub enum Error { 6 | /// Error code as reported by the CUDA backend. 7 | /// 8 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html) 9 | Cuda(i32), 10 | /// The runtime backend unexpectedly broke down. This is usually irrecoverable because the 11 | /// entire crate assumes that all backend execution will happen on the runtime thread.
12 | Runtime, 13 | } 14 | 15 | impl std::fmt::Display for Error { 16 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 17 | match self { 18 | Error::Cuda(code) => { 19 | let error_code = *code; 20 | let error_description = error_description(error_code); 21 | write!( 22 | f, 23 | "CUDA error ({}): {}", 24 | error_code, 25 | error_description.as_str(), 26 | ) 27 | } 28 | Error::Runtime => write!(f, "CUDA runtime broken"), 29 | } 30 | } 31 | } 32 | 33 | impl std::error::Error for Error {} 34 | -------------------------------------------------------------------------------- /src/npp/tests/memory.rs: -------------------------------------------------------------------------------- 1 | /// Convenience macro for testing to take a memory slice and put it on the device and return the 2 | /// [`crate::memory::DeviceBuffer2D`] that refers to it. 3 | macro_rules! to_device_2d { 4 | ($slice:expr, $width:expr, $height:expr, $num_channels:expr, $stream:expr) => {{ 5 | let host_buffer = crate::memory::HostBuffer::from_slice($slice).await; 6 | let mut device_buffer = 7 | crate::memory::DeviceBuffer2D::new($width, $height, $num_channels).await; 8 | device_buffer 9 | .copy_from(&host_buffer, $stream) 10 | .await 11 | .unwrap(); 12 | device_buffer 13 | }}; 14 | } 15 | 16 | /// Convenience macro for testing to take a [`crate::memory::DeviceBuffer2D`] and copy it back to 17 | /// the host, then return a [`Vec`] of that memory. 18 | macro_rules! to_host_2d { 19 | ($device_buffer:expr, $stream:expr) => {{ 20 | let mut host_buffer = crate::memory::HostBuffer::new($device_buffer.num_elements()).await; 21 | $device_buffer 22 | .copy_to(&mut host_buffer, $stream) 23 | .await 24 | .unwrap(); 25 | host_buffer.to_vec() 26 | }}; 27 | } 28 | 29 | pub(crate) use to_device_2d; 30 | pub(crate) use to_host_2d; 31 | -------------------------------------------------------------------------------- /src/ffi/error.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | /// Returns the description string for an error code. 4 | /// 5 | /// Note that this function is not executed on the runtime thread, since it is purely a utility 6 | /// function and should have no side-effects with regards to CUDA devices. 7 | /// 8 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html#group__CUDART__ERROR_1g4bc9e35a618dfd0877c29c8ee45148f1) 9 | /// 10 | /// # Arguments 11 | /// 12 | /// * `error_code` - CUDA error code. 13 | pub fn error_description(error_code: i32) -> String { 14 | let error_description = cpp!(unsafe [ 15 | error_code as "std::int32_t" 16 | ] -> *const std::ffi::c_char as "const char*" { 17 | return cudaGetErrorString(static_cast<cudaError_t>(error_code)); 18 | }); 19 | // SAFETY: The pointer returned by `cudaGetErrorString` actually has a static lifetime so this 20 | // is safe for sure. We even copy inside the unsafe block so we just need it to remain for a 21 | // little bit. 22 | unsafe { 23 | std::ffi::CStr::from_ptr(error_description) 24 | .to_string_lossy() 25 | .to_string() 26 | } 27 | } 28 | 29 | #[cfg(test)] 30 | mod tests { 31 | use super::*; 32 | 33 | #[test] 34 | fn test_correct_description() { 35 | assert_eq!(error_description(1), "invalid argument"); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/runtime/work.rs: -------------------------------------------------------------------------------- 1 | /// Represents a unit of work passed to the runtime.
Holds a closure inside. 2 | /// 3 | /// The closure is explicitly [`Send`] because it will be sent over the thread boundary to be 4 | /// executed in the runtime thread. For the same reason, the closure must be `'static`. 5 | /// 6 | /// # Usage 7 | /// 8 | /// ```ignore 9 | /// let work = Work::new(|| { 10 | /// // ... 11 | /// }); 12 | /// work.run(); 13 | /// ``` 14 | pub struct Work(Box<dyn FnOnce() + Send + 'static>); 15 | 16 | impl Work { 17 | /// Create a new work item. 18 | /// 19 | /// # Arguments 20 | /// 21 | /// * `f` - Closure to execute. 22 | pub fn new(f: impl FnOnce() + Send + 'static) -> Self { 23 | Work(Box::new(f)) 24 | } 25 | 26 | /// Execute work. 27 | pub fn run(self) { 28 | let Work(f) = self; 29 | f(); 30 | } 31 | } 32 | 33 | #[cfg(test)] 34 | mod tests { 35 | use super::*; 36 | 37 | #[test] 38 | fn test_it_runs() { 39 | let make_me_true = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); 40 | let work = Work::new({ 41 | let make_me_true = make_me_true.clone(); 42 | move || { 43 | make_me_true.store(true, std::sync::atomic::Ordering::SeqCst); 44 | } 45 | }); 46 | work.run(); 47 | assert!(make_me_true.load(std::sync::atomic::Ordering::SeqCst)); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/npp/tests/sync/memory.rs: -------------------------------------------------------------------------------- 1 | /// Convenience macro for testing to take a memory slice and put it on the device and return the 2 | /// [`crate::ffi::memory::DeviceBuffer2D`] that refers to it. 3 | macro_rules! to_device_2d { 4 | ($slice:expr, $width:expr, $height:expr, $num_channels:expr, $context:expr) => {{ 5 | let host_buffer = crate::ffi::memory::HostBuffer::from_slice($slice); 6 | let mut device_buffer = 7 | crate::ffi::memory::DeviceBuffer2D::new($width, $height, $num_channels); 8 | // SAFETY: Stream is synchronized right after this. 9 | unsafe { 10 | device_buffer 11 | .copy_from_async(&host_buffer, &$context.stream.inner()) 12 | .unwrap(); 13 | } 14 | $context.stream.inner().synchronize().unwrap(); 15 | device_buffer 16 | }}; 17 | } 18 | 19 | /// Convenience macro for testing to take a [`crate::ffi::memory::DeviceBuffer2D`] and copy it back 20 | /// to the host, then return a [`Vec`] of that memory. 21 | macro_rules! to_host_2d { 22 | ($device_buffer:expr, $context:expr) => {{ 23 | let mut host_buffer = crate::ffi::memory::HostBuffer::new($device_buffer.num_elements()); 24 | // SAFETY: Stream is synchronized right after this. 25 | unsafe { 26 | $device_buffer 27 | .copy_to_async(&mut host_buffer, &$context.stream.inner()) 28 | .unwrap(); 29 | } 30 | $context.stream.inner().synchronize().unwrap(); 31 | host_buffer.to_vec() 32 | }}; 33 | } 34 | 35 | pub(crate) use to_device_2d; 36 | pub(crate) use to_host_2d; 37 | -------------------------------------------------------------------------------- /src/npp/tests/image.rs: -------------------------------------------------------------------------------- 1 | pub type Pixel = [u8; 3]; 2 | pub type Image2x2 = [[Pixel; 2]; 2]; 3 | pub type Image4x4 = [[Pixel; 4]; 4]; 4 | 5 | pub const R: Pixel = [255_u8, 0_u8, 0_u8]; 6 | pub const G: Pixel = [0_u8, 255_u8, 0_u8]; 7 | pub const B: Pixel = [0_u8, 0_u8, 255_u8]; 8 | 9 | /// This is a 4 by 4 testing image that represents the hypothetical RGB flag, which looks something 10 | /// like this: 11 | /// 12 | /// ```text 13 | /// RR RR RR RR 14 | /// RR GG GG RR 15 | /// RR BB BB RR 16 | /// RR RR RR RR
17 | /// ``` 18 | /// 19 | /// Where `RR` represents a red pixel, `GG` a green one and `BB` a blue one. (It consists of a 20 | /// green and a blue two-pixel band, wrapped in a one-pixel red border.) 21 | pub const RGB_FLAG_RAW: Image4x4 = [ 22 | [R, R, R, R], // Red border 23 | [R, G, G, R], // Green band with red border 24 | [R, B, B, R], // Blue band with red border 25 | [R, R, R, R], // Red border 26 | ]; 27 | 28 | /// This is the [`RGB_FLAG_RAW`] image with contiguous memory layout so that it can be easily put 29 | /// into a host or device buffer. 30 | pub const RGB_FLAG: [u8; 4 * 4 * 3] = flatten!(RGB_FLAG_RAW, 4 * 4 * 3); 31 | 32 | /// Convenience macro to flatten a nested array to a flat array. 33 | /// 34 | /// # Usage 35 | /// 36 | /// ```ignore 37 | /// let array = [ 38 | ///     [1, 2, 3], 39 | ///     [4, 5, 6], 40 | ///     [7, 8, 9], 41 | /// ]; 42 | /// assert_eq!( 43 | ///     &flatten!(array, 9), 44 | ///     &[1, 2, 3, 4, 5, 6, 7, 8, 9], 45 | /// ); 46 | /// ``` 47 | macro_rules! flatten { 48 | ($array:expr, $size:expr) => { 49 | unsafe { std::mem::transmute::<_, [_; $size]>($array) } 50 | }; 51 | } 52 | 53 | pub(crate) use flatten; 54 | -------------------------------------------------------------------------------- /src/npp/constant_border.rs: -------------------------------------------------------------------------------- 1 | /// Represents a constant border around an image. 2 | /// 3 | /// This is used to specify the border around an image when copying a constant border around it for 4 | /// the purposes of letterbox resizing. 5 | #[derive(Debug, Clone, PartialEq)] 6 | pub struct ConstantBorder { 7 | pub left: u32, 8 | pub top: u32, 9 | pub color: [u8; 3], 10 | } 11 | 12 | impl ConstantBorder { 13 | /// New constant border. 14 | /// 15 | /// # Arguments 16 | /// 17 | /// * `left` - Size of border on the left and right sides of the image in number of pixels. 18 | /// * `top` - Size of border on the top and bottom sides of the image in number of pixels. 19 | /// * `color` - Color of border (RGB). 20 | pub fn new(left: u32, top: u32, color: [u8; 3]) -> Self { 21 | Self { left, top, color } 22 | } 23 | 24 | /// New constant border with white color. 25 | /// 26 | /// # Arguments 27 | /// 28 | /// * `left` - Size of border on the left and right sides of the image in number of pixels. 29 | /// * `top` - Size of border on the top and bottom sides of the image in number of pixels. 30 | pub fn white(left: u32, top: u32) -> Self { 31 | Self::new(left, top, [255, 255, 255]) 32 | } 33 | 34 | /// New constant border with black color. 35 | /// 36 | /// # Arguments 37 | /// 38 | /// * `left` - Size of border on the left and right sides of the image in number of pixels. 39 | /// * `top` - Size of border on the top and bottom sides of the image in number of pixels.
40 | pub fn black(left: u32, top: u32) -> Self { 41 | Self::new(left, top, [0, 0, 0]) 42 | } 43 | } 44 | 45 | #[cfg(test)] 46 | mod tests { 47 | use super::*; 48 | 49 | #[test] 50 | fn test_new() { 51 | let border = ConstantBorder::new(1, 2, [3, 4, 5]); 52 | assert_eq!(border.left, 1); 53 | assert_eq!(border.top, 2); 54 | assert_eq!(border.color, [3, 4, 5]); 55 | } 56 | 57 | #[test] 58 | fn test_white() { 59 | let border = ConstantBorder::white(1, 2); 60 | assert_eq!(border.left, 1); 61 | assert_eq!(border.top, 2); 62 | assert_eq!(border.color, [255, 255, 255]); 63 | } 64 | 65 | #[test] 66 | fn test_black() { 67 | let border = ConstantBorder::black(1, 2); 68 | assert_eq!(border.left, 1); 69 | assert_eq!(border.top, 2); 70 | assert_eq!(border.color, [0, 0, 0]); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /tests/functions_side_effects_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "npp")] 2 | use async_cuda::ffi::device::Device; 3 | #[cfg(feature = "npp")] 4 | use async_cuda::stream::Stream; 5 | 6 | #[cfg(feature = "npp")] 7 | use async_cuda::ffi::npp::context::Context; 8 | 9 | /// This integration test helps determine which ffi functions affect the GPU state, or local thread 10 | /// state. 11 | /// 12 | /// This information is important to determine which functions need to be executed on the runtime 13 | /// thread, and which functions can be executed directly by the caller (and don't need to be async). 14 | /// 15 | /// We only test functions where it is not immediately apparent whether or not the function has 16 | /// side-effects. All wrappers for NPP operations aren't tested since it is evident that they affect 17 | /// the GPU state. 18 | /// 19 | /// # Find GPU side-effects 20 | /// 21 | /// Run this integration test under the Nsight profiler with the following command: 22 | /// 23 | /// ```bash 24 | /// nsys profile --output /tmp/side_effects_trace --force-overwrite true cargo test --release --test functions_side_effects_test 25 | /// ``` 26 | /// 27 | /// Use the `nsys-ui` utility to inspect the report produced in `/tmp/side_effects_trace.qdstrm` and 28 | /// determine for each function call if one or more CUDA API functions were invoked, and if the GPU 29 | /// was affected in any way. Function calls are separated by device synchronization markers in the 30 | /// trace. 31 | /// 32 | /// # Find thread-local side-effects 33 | /// 34 | /// These need to be inferred from documentation or usage (or an educated guess). 35 | /// 36 | /// # Results 37 | /// 38 | /// | Function | Side-effect: GPU | Side-effect: thread-local | 39 | /// | ----------------------------- | ---------------- | ------------------------- | 40 | /// | `Context::from_null_stream` | ❌ | ✅ | 41 | /// | `Context::from_stream` | ❌ | ✅ | 42 | #[cfg(feature = "npp")] 43 | #[tokio::test] 44 | async fn test_side_effects() { 45 | // First block contains stuff we are not interested in measuring... 46 | let stream = Stream::new().await.unwrap(); 47 | 48 | // A sequence of CUDA calls that is easy to find in the trace.
49 | Device::synchronize().unwrap(); 50 | let _mem_info_1 = Device::memory_info().unwrap(); 51 | let _mem_info_2 = Device::memory_info().unwrap(); 52 | let _mem_info_3 = Device::memory_info().unwrap(); 53 | let _mem_info_4 = Device::memory_info().unwrap(); 54 | Device::synchronize().unwrap(); 55 | 56 | let _context_null = Context::from_null_stream(); 57 | Device::synchronize().unwrap(); 58 | 59 | let _context_new = Context::from_stream(stream); 60 | Device::synchronize().unwrap(); 61 | } 62 | -------------------------------------------------------------------------------- /src/stream.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi; 2 | use crate::runtime::{Future, SynchronizeFuture}; 3 | 4 | type Result<T> = std::result::Result<T, crate::error::Error>; 5 | 6 | /// CUDA stream. 7 | pub struct Stream { 8 | inner: ffi::stream::Stream, 9 | } 10 | 11 | impl Stream { 12 | /// Create a [`Stream`] object that represents the default stream, also known as the null stream. 13 | /// 14 | /// Refer to the [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html) 15 | /// for more information regarding the default ("null") stream. 16 | /// 17 | /// # Prefer owned streams 18 | /// 19 | /// It is recommended to use owned streams as much as possible, for two reasons: 20 | /// 21 | /// * Using streams to separate semantically unrelated streams of operations allows the GPU to 22 | /// overlap operations and improves parallelism. 23 | /// * Using the default stream can incur implicit synchronization, even on other streams, which 24 | /// causes their performance to degrade. 25 | /// 26 | /// Note that it is not enforced that there is only one [`Stream`] object that represents the 27 | /// default stream. This is safe because all operations are serialized anyway. 28 | pub fn null() -> Self { 29 | Self { 30 | inner: ffi::stream::Stream::null(), 31 | } 32 | } 33 | 34 | /// Create an asynchronous stream. 35 | /// 36 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g6a3c4b819e6a994c26d0c4824a4c80da) 37 | pub async fn new() -> Result<Self> { 38 | let inner = Future::new(ffi::stream::Stream::new).await?; 39 | Ok(Self { inner }) 40 | } 41 | 42 | /// Synchronize stream. This future will only return once all currently enqueued work on the 43 | /// stream is done. 44 | /// 45 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g74aa9f4b1c2f12d994bf13876a5a2498) 46 | /// 47 | /// # Behavior 48 | /// 49 | /// In contrast to most of the API, this future does not become ready eagerly. Instead, a 50 | /// callback is pushed onto the given stream that will be invoked to make the future ready once 51 | /// all work on the stream that was previously queued asynchronously is completed. 52 | /// 53 | /// Internally, the future uses `cudaStreamAddCallback` to schedule the callback on the stream. 54 | pub async fn synchronize(&self) -> Result<()> { 55 | SynchronizeFuture::new(self).await 56 | } 57 | 58 | /// Access the inner synchronous implementation of [`Stream`].
59 | #[inline(always)] 60 | pub fn inner(&self) -> &ffi::stream::Stream { 61 | &self.inner 62 | } 63 | } 64 | 65 | #[cfg(test)] 66 | mod tests { 67 | use super::*; 68 | 69 | #[tokio::test] 70 | async fn test_new() { 71 | assert!(Stream::new().await.is_ok()); 72 | } 73 | 74 | #[tokio::test] 75 | async fn test_synchronize() { 76 | let stream = Stream::new().await.unwrap(); 77 | assert!(stream.synchronize().await.is_ok()); 78 | } 79 | 80 | #[tokio::test] 81 | async fn test_synchronize_null_stream() { 82 | let stream = Stream::null(); 83 | assert!(stream.synchronize().await.is_ok()); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
# async-cuda 2 | 3 | Asynchronous CUDA for Rust. 4 | 5 |
6 | 7 | [![version](https://img.shields.io/crates/v/async-cuda)](https://crates.io/crates/async-cuda) 8 | [![license](https://img.shields.io/crates/l/async-cuda)](#license) 9 | [![docs](https://img.shields.io/docsrs/async-cuda)](https://docs.rs/async-cuda) 10 | 11 |
12 | 13 | ## ℹ️ Introduction 14 | 15 | `async-cuda` is an experimental library for interacting with the GPU asynchronously. Since the GPU 16 | is just another I/O device (from the point of view of your program), the async model actually fits 17 | surprisingly well. The way it is implemented in `async-cuda` is that all operations are scheduled on 18 | a single runtime thread that drives the GPU. The interface of this library enforces that 19 | synchronization happens when it is necessary (and synchronization itself is also asynchronous). 20 | 21 | On top of common CUDA primitives, this library also includes async wrappers for 22 | [NVIDIA's NPP library](https://developer.nvidia.com/npp). 23 | 24 | The async wrappers for TensorRT have been moved to a separate repository here: 25 | [`async-tensorrt`](https://github.com/oddity-ai/async-tensorrt). 26 | 27 | ## 🛠 Status 28 | 29 | This project is still a work-in-progress, and will contain bugs. Some parts of the API have not 30 | been fleshed out yet. Use with caution. 31 | 32 | ## 📦 Setup 33 | 34 | Make sure you have the necessary dependencies installed: 35 | 36 | * CUDA toolkit 11 or later. 37 | 38 | Then, add the following to your dependencies in `Cargo.toml`: 39 | 40 | ```toml 41 | async-cuda = "0.6" 42 | ``` 43 | 44 | To enable the NPP functions: 45 | 46 | ```toml 47 | async-cuda = { version = "0.6", features = ["npp"] } 48 | ```
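To get a feel for the API, here is a minimal sketch that round-trips an image through device memory. It is not taken from the crate's own examples: the buffer shape and contents are made up for illustration, and a Tokio runtime is assumed.

```rust
use async_cuda::{DeviceBuffer2D, HostBuffer, Stream};

#[tokio::main]
async fn main() -> Result<(), async_cuda::Error> {
    let stream = Stream::new().await?;

    // A made-up 4x4 RGB image in host memory.
    let image = [128_u8; 4 * 4 * 3];
    let host_input = HostBuffer::from_slice(&image).await;
    let mut host_output = HostBuffer::<u8>::new(4 * 4 * 3).await;

    // Copy the image to the device and back again. Both copies are
    // stream-ordered, so a single synchronize at the end suffices.
    let mut device_image = DeviceBuffer2D::<u8>::new(4, 4, 3).await;
    device_image.copy_from(&host_input, &stream).await?;
    device_image.copy_to(&mut host_output, &stream).await?;
    stream.synchronize().await?;

    assert_eq!(host_output.to_vec(), image);
    Ok(())
}
```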
49 | 50 | ## ⚠️ Safety warning 51 | 52 | This crate is **intentionally unsafe**. Due to the limitations of how async Rust currently works, 53 | usage of the async interface of this crate can cause undefined behavior in some rare cases. It is up 54 | to the user of this crate to prevent this from happening by following these rules: 55 | 56 | * No futures produced by functions in this crate may be leaked (either by `std::mem::forget` or 57 | otherwise). 58 | * Use a well-behaved runtime (one that will not forget your future) like Tokio or async-std. 59 | 60 | Internally, the `Future` type in this crate schedules a CUDA call on a separate runtime thread. To 61 | make the API as ergonomic as possible, the lifetime bounds of the closure (that is sent to the 62 | runtime) are tied to the future object. To enforce this bound, the future will block and wait if it 63 | is dropped. This mechanism relies on the future being driven to completion, and not forgotten. This 64 | is not necessarily guaranteed. Unsafety may arise if either the runtime gives up on or forgets the 65 | future, or the caller manually polls the future, then forgets it. 66 | 67 | ## License 68 | 69 | Licensed under either of 70 | 71 | * Apache License, Version 2.0 72 | ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 73 | * MIT license 74 | ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 75 | 76 | at your option. 77 | 78 | ## Contribution 79 | 80 | Unless you explicitly state otherwise, any contribution intentionally submitted 81 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be 82 | dual licensed as above, without any additional terms or conditions. 83 | -------------------------------------------------------------------------------- /src/runtime/thread_local.rs: -------------------------------------------------------------------------------- 1 | use std::sync::mpsc::Sender; 2 | 3 | use once_cell::sync::Lazy; 4 | 5 | use crate::error::Error; 6 | use crate::runtime::execution::RUNTIME; 7 | use crate::runtime::work::Work; 8 | 9 | thread_local! { 10 | /// Thread-local runtime delegate. 11 | /// 12 | /// This object serves as the per-thread reference to the [`RUNTIME`] that can be used to 13 | /// enqueue work on the runtime thread. 14 | /// 15 | /// # Usage 16 | /// 17 | /// ```ignore 18 | /// assert!( 19 | /// RUNTIME_THREAD_LOCAL.with(|runtime| 20 | /// runtime.enqueue(Work::new(|| ())) 21 | /// ).is_ok() 22 | /// ) 23 | /// ``` 24 | pub(super) static RUNTIME_THREAD_LOCAL: Lazy<RuntimeThreadLocal> = Lazy::new(|| { 25 | RUNTIME.lock().unwrap().thread_local() 26 | }); 27 | } 28 | 29 | /// Per-thread delegate for global runtime. 30 | pub struct RuntimeThreadLocal(Sender<Work>); 31 | 32 | impl RuntimeThreadLocal { 33 | /// Initialize [`RuntimeThreadLocal`] from [`Sender`] that allows the delegate to send work to 34 | /// the actual [`crate::runtime::execution::Runtime`]. 35 | /// 36 | /// # Arguments 37 | /// 38 | /// * `sender` - Sender through which work can be sent to runtime. 39 | pub(super) fn from_sender(sender: Sender<Work>) -> Self { 40 | RuntimeThreadLocal(sender) 41 | } 42 | 43 | /// Enqueue work on runtime. 44 | /// 45 | /// # Arguments 46 | /// 47 | /// * `function` - Unit of work in function closure to enqueue. 48 | pub(super) fn enqueue(&self, function: Work) -> Result<(), Error> { 49 | self.0.send(function).map_err(|_| Error::Runtime) 50 | } 51 | } 52 | 53 | /// Enqueue work on the runtime without caring about the return value. This is useful in situations 54 | /// where work must be performed but the result does not matter. For example, when destroying a CUDA 55 | /// object as part of dropping an object. 56 | /// 57 | /// # Arguments 58 | /// 59 | /// * `f` - Function closure to execute on runtime. 60 | /// 61 | /// # Example 62 | /// 63 | /// ```ignore 64 | /// enqueue_decoupled(move || { 65 | /// // ... 66 | /// }); 67 | /// ``` 68 | #[inline] 69 | pub fn enqueue_decoupled(f: impl FnOnce() + Send + 'static) { 70 | let f = Box::new(f); 71 | RUNTIME_THREAD_LOCAL 72 | .with(|runtime| runtime.enqueue(Work::new(f))) 73 | .expect("runtime broken") 74 | } 75 | 76 | #[cfg(test)] 77 | mod tests { 78 | use super::*; 79 | 80 | #[test] 81 | fn test_enqueue_works() { 82 | let (tx, rx) = std::sync::mpsc::channel(); 83 | assert!(RUNTIME_THREAD_LOCAL 84 | .with(|runtime| { 85 | runtime.enqueue(Work::new(move || { 86 | assert!(tx.send(true).is_ok()); 87 | })) 88 | }) 89 | .is_ok()); 90 | assert!(matches!( 91 | rx.recv_timeout(std::time::Duration::from_millis(100)), 92 | Ok(true), 93 | )); 94 | } 95 | 96 | #[test] 97 | fn test_enqueue_decoupled_works() { 98 | let (tx, rx) = std::sync::mpsc::channel(); 99 | enqueue_decoupled(move || { 100 | assert!(tx.send(true).is_ok()); 101 | }); 102 | assert!(matches!( 103 | rx.recv_timeout(std::time::Duration::from_millis(100)), 104 | Ok(true), 105 | )); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/ffi/device.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::device::DeviceId; 4 | use crate::device::MemoryInfo; 5 | use crate::ffi::result; 6 | 7 | type Result<T> = std::result::Result<T, crate::error::Error>; 8 | 9 | /// Synchronous implementation of [`crate::num_devices`]. 10 | /// 11 | /// Refer to [`crate::num_devices`] for documentation.
12 | pub fn num_devices() -> Result<usize> { 13 | let mut num = 0_i32; 14 | let num_ptr = std::ptr::addr_of_mut!(num); 15 | let ret = cpp!(unsafe [ 16 | num_ptr as "std::int32_t*" 17 | ] -> i32 as "std::int32_t" { 18 | return cudaGetDeviceCount(num_ptr); 19 | }); 20 | 21 | result!(ret, num as usize) 22 | } 23 | 24 | /// Synchronous implementation of [`crate::Device`]. 25 | /// 26 | /// Refer to [`crate::Device`] for documentation. 27 | pub struct Device; 28 | 29 | impl Device { 30 | #[inline] 31 | pub fn get() -> Result<DeviceId> { 32 | let mut id: i32 = 0; 33 | let id_ptr = std::ptr::addr_of_mut!(id); 34 | let ret = cpp!(unsafe [ 35 | id_ptr as "int*" 36 | ] -> i32 as "int" { 37 | return cudaGetDevice(id_ptr); 38 | }); 39 | result!(ret, id) 40 | } 41 | 42 | #[inline(always)] 43 | pub fn get_or_panic() -> DeviceId { 44 | Device::get().unwrap_or_else(|err| panic!("failed to get device: {err}")) 45 | } 46 | 47 | #[inline] 48 | pub fn set(id: DeviceId) -> Result<()> { 49 | let ret = cpp!(unsafe [ 50 | id as "int" 51 | ] -> i32 as "int" { 52 | return cudaSetDevice(id); 53 | }); 54 | result!(ret) 55 | } 56 | 57 | #[inline(always)] 58 | pub fn set_or_panic(id: DeviceId) { 59 | Device::set(id).unwrap_or_else(|err| panic!("failed to set device {id}: {err}")); 60 | } 61 | 62 | pub fn synchronize() -> Result<()> { 63 | let ret = cpp!(unsafe [] -> i32 as "std::int32_t" { 64 | return cudaDeviceSynchronize(); 65 | }); 66 | result!(ret) 67 | } 68 | 69 | pub fn memory_info() -> Result<MemoryInfo> { 70 | let mut free: usize = 0; 71 | let free_ptr = std::ptr::addr_of_mut!(free); 72 | let mut total: usize = 0; 73 | let total_ptr = std::ptr::addr_of_mut!(total); 74 | 75 | let ret = cpp!(unsafe [ 76 | free_ptr as "std::size_t*", 77 | total_ptr as "std::size_t*" 78 | ] -> i32 as "std::int32_t" { 79 | return cudaMemGetInfo(free_ptr, total_ptr); 80 | }); 81 | result!(ret, MemoryInfo { free, total }) 82 | } 83 | } 84 | 85 | #[cfg(test)] 86 | mod tests { 87 | use super::*; 88 | 89 | #[test] 90 | fn test_num_devices() { 91 | assert!(matches!(num_devices(), Ok(num) if num > 0)); 92 | } 93 | 94 | #[test] 95 | fn test_get_device() { 96 | assert!(matches!(Device::get(), Ok(0))); 97 | } 98 | 99 | #[test] 100 | fn test_set_device() { 101 | assert!(Device::set(0).is_ok()); 102 | assert!(matches!(Device::get(), Ok(0))); 103 | } 104 | 105 | #[test] 106 | fn test_synchronize() { 107 | assert!(Device::synchronize().is_ok()); 108 | } 109 | 110 | #[test] 111 | fn test_memory_info() { 112 | let memory_info = Device::memory_info().unwrap(); 113 | assert!(memory_info.free > 0); 114 | assert!(memory_info.total > 0); 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/npp/stream.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use crate::ffi::npp::context::Context; 4 | use crate::runtime::Future; 5 | 6 | /// Represents an NPP stream. 7 | /// 8 | /// An NPP stream is a thin wrapper around a normal CUDA stream ([`crate::Stream`]). It manages some 9 | /// additional context information required in NPP to statelessly execute on a user-provided stream. 10 | /// 11 | /// This struct implements `Deref` such that it can be used as a normal [`crate::Stream`] as well. 12 | /// 13 | /// # Usage 14 | /// 15 | /// If the caller wants to use a stream context for mixed NPP and non-NPP operations, they should 16 | /// create an NPP stream and pass it as CUDA stream when desired. This should work out-of-the-box 17 | /// since [`Stream`] dereferences to [`crate::Stream`], as the sketch below illustrates.
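/// /// # Example /// /// A sketch of mixing an NPP operation with a plain CUDA transfer on the same stream (the buffer setup is omitted here; see [`crate::npp::copy_constant_border`] for a real call): /// /// ```ignore /// let stream = Stream::new().await?; /// copy_constant_border(&input, &mut output, &border, &stream).await?; /// // The same stream doubles as a regular CUDA stream through deref: /// output.copy_to(&mut host_output, &stream).await?; /// stream.synchronize().await?; /// ```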
18 | pub struct Stream { 19 | context: Arc<Context>, 20 | } 21 | 22 | impl Stream { 23 | /// Create an NPP [`Stream`] that represents the default stream, also known as the null stream. 24 | /// 25 | /// This type is a wrapper around the actual CUDA stream type: [`crate::Stream`]. 26 | #[inline] 27 | pub async fn null() -> Self { 28 | let context = Future::new(Context::from_null_stream).await; 29 | Self { 30 | context: Arc::new(context), 31 | } 32 | } 33 | 34 | /// Create a new [`Stream`] for use with NPP. 35 | /// 36 | /// This type is a wrapper around the actual CUDA stream type: [`crate::Stream`]. 37 | #[inline] 38 | pub async fn new() -> std::result::Result<Self, crate::Error> { 39 | let stream = crate::Stream::new().await?; 40 | let context = Future::new(move || Context::from_stream(stream)).await; 41 | Ok(Self { 42 | context: Arc::new(context), 43 | }) 44 | } 45 | 46 | /// Acquire shared access to the underlying NPP context object. 47 | /// 48 | /// This NPP object can be safely sent to the runtime thread so it can be used as a context. 49 | /// 50 | /// # Safety 51 | /// 52 | /// The [`Context`] object may only be *used* from the runtime thread. 53 | pub(crate) fn to_context(&self) -> Arc<Context> { 54 | self.context.clone() 55 | } 56 | } 57 | 58 | impl std::ops::Deref for Stream { 59 | type Target = crate::Stream; 60 | 61 | fn deref(&self) -> &Self::Target { 62 | &self.context.stream 63 | } 64 | } 65 | 66 | #[cfg(test)] 67 | mod tests { 68 | use super::*; 69 | 70 | #[tokio::test] 71 | async fn test_new() { 72 | let stream = Stream::new().await.unwrap(); 73 | assert!(!stream.to_context().as_ptr().is_null()); 74 | // SAFETY: This works because we know that the first field of the underlying 75 | // `NppStreamContext` struct used internally is `hStream`, which should refer to the wrapped 76 | // stream or it was not initialized correctly. 77 | assert_eq!( 78 | unsafe { *(stream.to_context().as_ptr() as *const *const std::ffi::c_void) }, 79 | stream.inner().as_internal().as_ptr(), 80 | ); 81 | } 82 | 83 | #[tokio::test] 84 | async fn test_null() { 85 | let stream = Stream::null().await; 86 | assert!(!stream.to_context().as_ptr().is_null()); 87 | // SAFETY: This works because we know that the first field of the underlying 88 | // `NppStreamContext` struct used internally is `hStream`, which should refer to the wrapped 89 | // stream, which is the null stream in this case. 90 | assert!( 91 | unsafe { *(stream.to_context().as_ptr() as *const *const std::ffi::c_void) }.is_null() 92 | ); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/device.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi; 2 | use crate::runtime::Future; 3 | 4 | type Result<T> = std::result::Result<T, crate::error::Error>; 5 | 6 | /// Returns the number of compute-capable devices. 7 | /// 8 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g18808e54893cfcaafefeab31a73cc55f) 9 | /// 10 | /// # Return value 11 | /// 12 | /// Number of CUDA devices or error in case of failure. 13 | pub async fn num_devices() -> Result<usize> { 14 | Future::new(ffi::device::num_devices).await 15 | } 16 | 17 | /// CUDA device ID. 18 | pub type DeviceId = i32; 19 | 20 | /// CUDA device. 21 | pub struct Device; 22 | 23 | impl Device { 24 | /// Returns the [`DeviceId`] of the device that is currently being used.
25 | /// 26 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g80861db2ce7c29b6e8055af8ae01bc78) 27 | pub async fn get() -> Result<DeviceId> { 28 | Future::new(ffi::device::Device::get).await 29 | } 30 | 31 | /// Set device to be used for GPU executions. 32 | /// 33 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g159587909ffa0791bbe4b40187a4c6bb) 34 | /// 35 | /// # Arguments 36 | /// 37 | /// * `id` - Device ID to use. 38 | pub async fn set(id: DeviceId) -> Result<()> { 39 | Future::new(move || ffi::device::Device::set(id)).await 40 | } 41 | 42 | /// Synchronize the current CUDA device. 43 | /// 44 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g10e20b05a95f638a4071a655503df25d) 45 | /// 46 | /// # Warning 47 | /// 48 | /// Note that this operation will block all device operations, even from other processes while 49 | /// running. Use this operation sparingly. 50 | pub async fn synchronize() -> Result<()> { 51 | Future::new(ffi::device::Device::synchronize).await 52 | } 53 | 54 | /// Gets free and total device memory. 55 | /// 56 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g376b97f5ab20321ca46f7cfa9511b978) 57 | /// 58 | /// # Return value 59 | /// 60 | /// Total amount of memory and free memory in bytes. 61 | pub async fn memory_info() -> Result<MemoryInfo> { 62 | Future::new(ffi::device::Device::memory_info).await 63 | } 64 | } 65 | 66 | /// CUDA device memory information. 67 | #[derive(Debug, Clone, Copy, PartialEq)] 68 | pub struct MemoryInfo { 69 | /// Amount of free device memory in bytes. 70 | pub free: usize, 71 | /// Total amount of device memory in bytes. 72 | pub total: usize, 73 | } 74 | 75 | #[cfg(test)] 76 | mod tests { 77 | use super::*; 78 | 79 | #[tokio::test] 80 | async fn test_num_devices() { 81 | assert!(matches!(num_devices().await, Ok(num) if num > 0)); 82 | } 83 | 84 | #[tokio::test] 85 | async fn test_get_device() { 86 | assert!(matches!(Device::get().await, Ok(0))); 87 | } 88 | 89 | #[tokio::test] 90 | async fn test_set_device() { 91 | assert!(Device::set(0).await.is_ok()); 92 | assert!(matches!(Device::get().await, Ok(0))); 93 | } 94 | 95 | #[tokio::test] 96 | async fn test_synchronize() { 97 | assert!(Device::synchronize().await.is_ok()); 98 | } 99 | 100 | #[tokio::test] 101 | async fn test_memory_info() { 102 | let memory_info = Device::memory_info().await.unwrap(); 103 | assert!(memory_info.free > 0); 104 | assert!(memory_info.total > 0); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/runtime/execution.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicBool, Ordering}; 2 | use std::sync::mpsc::{channel, Receiver, Sender}; 3 | use std::sync::{Arc, Mutex}; 4 | 5 | use once_cell::sync::Lazy; 6 | 7 | use crate::runtime::thread_local::RuntimeThreadLocal; 8 | use crate::runtime::work::Work; 9 | 10 | /// Refers to the global runtime. The runtime is responsible for running all CUDA operations in a 11 | /// dedicated thread. 12 | /// 13 | /// Note that this object should not be used by callers because each thread gets its own delegate 14 | /// object to communicate with the runtime.
15 | /// 16 | /// # Usage 17 | /// 18 | /// Each thread should get its own [`RuntimeThreadLocal`] object, which acts as a delegate object. 19 | /// 20 | /// Use `Runtime::thread_local` to get the thread local object: 21 | /// 22 | /// ```ignore 23 | /// let runtime = RUNTIME.lock().unwrap().thread_local(); 24 | /// ``` 25 | pub(super) static RUNTIME: Lazy<Mutex<Runtime>> = Lazy::new(|| Mutex::new(Runtime::new())); 26 | 27 | /// Runtime object that holds the runtime thread and a channel 28 | /// to send jobs onto the worker queue. 29 | pub struct Runtime { 30 | join_handle: Option<std::thread::JoinHandle<()>>, 31 | run_flag: Arc<AtomicBool>, 32 | work_tx: Sender<Work>, 33 | } 34 | 35 | impl Runtime { 36 | /// Acquire a thread local delegate for the runtime. 37 | pub(super) fn thread_local(&self) -> RuntimeThreadLocal { 38 | RuntimeThreadLocal::from_sender(self.work_tx.clone()) 39 | } 40 | 41 | /// Create runtime. 42 | fn new() -> Self { 43 | let run_flag = Arc::new(AtomicBool::new(true)); 44 | let (work_tx, work_rx) = channel::<Work>(); 45 | 46 | let join_handle = std::thread::spawn({ 47 | let run_flag = run_flag.clone(); 48 | move || Self::worker(run_flag, work_rx) 49 | }); 50 | 51 | Runtime { 52 | join_handle: Some(join_handle), 53 | run_flag, 54 | work_tx, 55 | } 56 | } 57 | 58 | /// Worker loop. Receives jobs from the worker queue and executes them until [`run_flag`] 59 | /// becomes `false`. 60 | /// 61 | /// # Arguments 62 | /// 63 | /// * `run_flag` - Atomic flag that indicates whether the worker should continue running. 64 | /// * `work_rx` - Receives work to execute. 65 | fn worker(run_flag: Arc<AtomicBool>, work_rx: Receiver<Work>) { 66 | while run_flag.load(Ordering::Relaxed) { 67 | match work_rx.recv() { 68 | Ok(work) => work.run(), 69 | Err(_) => break, 70 | } 71 | } 72 | } 73 | } 74 | 75 | impl Drop for Runtime { 76 | fn drop(&mut self) { 77 | self.run_flag.store(false, Ordering::Relaxed); 78 | 79 | // Put dummy workload into the queue to trigger the loop to continue and encounter the 80 | // `run_flag` that is now false, then stop. Note that if this fails, it means the underlying 81 | // channel is broken. It is not a problem, since that must mean the worker already quit 82 | // before, and it will join immediately. 83 | let _ = self.work_tx.send(Work::new(|| {})); 84 | 85 | if let Some(join_handle) = self.join_handle.take() { 86 | join_handle 87 | .join() 88 | .expect("failed to join on runtime thread"); 89 | } 90 | } 91 | } 92 | 93 | #[cfg(test)] 94 | mod tests { 95 | use super::*; 96 | 97 | #[test] 98 | fn test_drop() { 99 | let runtime = Runtime::new(); 100 | std::thread::sleep(std::time::Duration::from_millis(10)); 101 | drop(runtime); 102 | } 103 | 104 | #[test] 105 | fn test_it_does_work() { 106 | let runtime = Runtime::new(); 107 | let (tx, rx) = std::sync::mpsc::channel(); 108 | assert!(runtime 109 | .thread_local() 110 | .enqueue(Work::new(move || { 111 | assert!(tx.send(true).is_ok()); 112 | })) 113 | .is_ok()); 114 | assert!(matches!( 115 | rx.recv_timeout(std::time::Duration::from_millis(100)), 116 | Ok(true), 117 | )); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/npp/copy_constant_border.rs: -------------------------------------------------------------------------------- 1 | use crate::memory::DeviceBuffer2D; 2 | use crate::npp::constant_border::ConstantBorder; 3 | use crate::npp::stream::Stream; 4 | use crate::runtime::Future; 5 | 6 | type Result<T> = std::result::Result<T, crate::npp::error::Error>; 7 | 8 | /// Copy an image with a constant border. This function expects a reference to a device image for 9 | /// input, and a mutable reference to a device image to place the output in. 10 | /// 11 | /// This function assumes the following about the input and output images: 12 | /// * Images are in RGB format. 13 | /// * Images are in standard memory order, i.e. HWC. 14 | /// 15 | /// # Stream ordered semantics 16 | /// 17 | /// This function uses stream ordered semantics. It can only be guaranteed to complete sequentially 18 | /// relative to operations scheduled on the same stream or the default stream. 19 | /// 20 | /// # Arguments 21 | /// 22 | /// * `input` - The on-device input image. 23 | /// * `output` - The on-device output image. 24 | /// * `constant_border` - The constant border parameters to apply. 25 | /// * `stream` - Stream to use.
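/// /// # Example /// /// A sketch (buffer setup omitted) that pads `input` into a larger `output` with a black border of 10 pixels on the left and right and 20 pixels on the top and bottom: /// /// ```ignore /// copy_constant_border(&input, &mut output, &ConstantBorder::black(10, 20), &stream).await?; /// ```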
26 | pub async fn copy_constant_border( 27 | input: &DeviceBuffer2D<u8>, 28 | output: &mut DeviceBuffer2D<u8>, 29 | constant_border: &ConstantBorder, 30 | stream: &Stream, 31 | ) -> Result<()> { 32 | assert_eq!(input.num_channels(), 3, "input image must be in RGB format"); 33 | assert_eq!( 34 | output.num_channels(), 35 | 3, 36 | "output image must be in RGB format" 37 | ); 38 | 39 | let context = stream.to_context(); 40 | Future::new(move || { 41 | crate::ffi::npp::copy_constant_border::copy_constant_border( 42 | input.inner(), 43 | output.inner_mut(), 44 | constant_border, 45 | &context, 46 | ) 47 | }) 48 | .await 49 | } 50 | 51 | #[cfg(test)] 52 | mod tests { 53 | use super::*; 54 | 55 | use crate::memory::DeviceBuffer2D; 56 | use crate::npp::stream::Stream; 57 | use crate::npp::tests::image::*; 58 | use crate::npp::tests::memory::*; 59 | 60 | #[tokio::test] 61 | async fn test_copy_constant_border() { 62 | // Input image is 1x2 and just contains one red and one green pixel. 63 | const INPUT: [[Pixel; 2]; 1] = [[R, G]]; 64 | const INPUT_FLAT: [u8; 6] = flatten!(INPUT, 6); 65 | 66 | // Expected output of copy constant border with left border of 1 and top border of 2, if 67 | // the border color is blue.
68 | const OUTPUT: [[Pixel; 4]; 5] = [ 69 | [B, B, B, B], 70 | [B, B, B, B], 71 | [B, R, G, B], 72 | [B, B, B, B], 73 | [B, B, B, B], 74 | ]; 75 | const OUTPUT_FLAT: [u8; 4 * 5 * 3] = flatten!(OUTPUT, 4 * 5 * 3); 76 | 77 | let stream = Stream::new().await.unwrap(); 78 | 79 | let image = to_device_2d!(&INPUT_FLAT, 2, 1, 3, &stream); 80 | let mut output = DeviceBuffer2D::<u8>::new(4, 5, 3).await; 81 | copy_constant_border(&image, &mut output, &ConstantBorder::new(1, 2, B), &stream) 82 | .await 83 | .unwrap(); 84 | 85 | let output = to_host_2d!(output, &stream); 86 | assert_eq!(&output, &OUTPUT_FLAT); 87 | } 88 | 89 | #[tokio::test] 90 | #[should_panic] 91 | async fn test_it_panics_when_input_num_channels_incorrect() { 92 | let input = DeviceBuffer2D::<u8>::new(100, 100, 2).await; 93 | let mut output = DeviceBuffer2D::<u8>::new(200, 200, 3).await; 94 | copy_constant_border( 95 | &input, 96 | &mut output, 97 | &ConstantBorder::black(10, 20), 98 | &Stream::null().await, 99 | ) 100 | .await 101 | .unwrap(); 102 | } 103 | 104 | #[tokio::test] 105 | #[should_panic] 106 | async fn test_it_panics_when_output_num_channels_incorrect() { 107 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 108 | let mut output = DeviceBuffer2D::<u8>::new(200, 200, 2).await; 109 | copy_constant_border( 110 | &input, 111 | &mut output, 112 | &ConstantBorder::black(10, 20), 113 | &Stream::null().await, 114 | ) 115 | .await 116 | .unwrap(); 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/npp/region.rs: -------------------------------------------------------------------------------- 1 | /// Represents a subregion of an image. 2 | #[derive(Debug, Clone, Copy, PartialEq, Default)] 3 | pub enum Region { 4 | #[default] 5 | Full, 6 | Rectangle { 7 | x: usize, 8 | y: usize, 9 | width: usize, 10 | height: usize, 11 | }, 12 | } 13 | 14 | impl Region { 15 | /// Create a new [`Region`] that covers the whole image. 16 | #[inline] 17 | pub fn full() -> Self { 18 | Region::Full 19 | } 20 | 21 | /// Create a new partial [`Region`] with normalized width and height. 22 | /// 23 | /// If the `width` or `height` is less than 2, it will be set to 2 to produce a region that 24 | /// is valid when used with the NPP API. 25 | /// 26 | /// # Arguments 27 | /// 28 | /// * `topleft` - Coordinates of top left corner of the region. 29 | /// * `dims` - Dimensions of the region. 30 | #[inline] 31 | pub fn rectangle_normalized(topleft: (usize, usize), dims: (usize, usize)) -> Self { 32 | let (x, y) = topleft; 33 | let (width, height) = dims; 34 | Self::Rectangle { 35 | x, 36 | y, 37 | width: width.max(2), 38 | height: height.max(2), 39 | } 40 | } 41 | 42 | /// Resolve the actual values for `x`, `y`, `width` and `height` of the box, even when it is 43 | /// `Region::Full`. To compute these, the outer `width` and `height` are required. 44 | /// 45 | /// # Arguments 46 | /// 47 | /// * `width` - Outer width. 48 | /// * `height` - Outer height. 49 | /// 50 | /// # Return value 51 | /// 52 | /// Region coordinates `x`, `y`, `width` and `height`. 53 | pub fn resolve_to_xywh(&self, width: usize, height: usize) -> (usize, usize, usize, usize) { 54 | match self { 55 | Region::Full => (0, 0, width, height), 56 | Region::Rectangle { 57 | x, 58 | y, 59 | width, 60 | height, 61 | } => (*x, *y, *width, *height), 62 | } 63 | } 64 | 65 | /// Whether or not the region is of type `Region::Full`.
66 | pub fn is_full(&self) -> bool { 67 | matches!(self, Region::Full) 68 | } 69 | } 70 | 71 | impl std::fmt::Display for Region { 72 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 73 | match self { 74 | Region::Full => write!(f, "[full]"), 75 | // This formats to something like this: 76 | // 77 | // ``` 78 | // [x: 10, y: 10, width: 80, height: 40] 79 | // ``` 80 | Region::Rectangle { 81 | x, 82 | y, 83 | width, 84 | height, 85 | } => write!(f, "[x: {x}, y: {y}, width: {width}, height: {height}]",), 86 | } 87 | } 88 | } 89 | 90 | #[cfg(test)] 91 | mod tests { 92 | use super::*; 93 | 94 | #[test] 95 | fn test_new_full() { 96 | assert_eq!(Region::full(), Region::Full); 97 | assert!(Region::full().is_full()); 98 | } 99 | 100 | #[test] 101 | fn test_new_rectangle_normalized() { 102 | assert_eq!( 103 | Region::rectangle_normalized((1, 2), (3, 4)), 104 | Region::Rectangle { 105 | x: 1, 106 | y: 2, 107 | width: 3, 108 | height: 4 109 | } 110 | ); 111 | assert_eq!( 112 | Region::rectangle_normalized((1, 2), (0, 1)), 113 | Region::Rectangle { 114 | x: 1, 115 | y: 2, 116 | width: 2, 117 | height: 2 118 | } 119 | ); 120 | assert!(!Region::rectangle_normalized((1, 2), (3, 4)).is_full()); 121 | } 122 | 123 | #[test] 124 | fn test_resolve_region() { 125 | let region = Region::Rectangle { 126 | x: 8, 127 | y: 10, 128 | width: 12, 129 | height: 16, 130 | }; 131 | assert_eq!(region.resolve_to_xywh(20, 20), (8, 10, 12, 16)); 132 | } 133 | 134 | #[test] 135 | fn test_resolve_full() { 136 | let region = Region::Full; 137 | assert_eq!(region.resolve_to_xywh(10, 20), (0, 0, 10, 20)); 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/ffi/ptr.rs: -------------------------------------------------------------------------------- 1 | /// Represents a device-local pointer. Pointers qualify as device-local if they refer to memory that 2 | /// lives on the device, and not on the host. 3 | /// 4 | /// # Safety 5 | /// 6 | /// ## Null 7 | /// 8 | /// Creating a null pointer is always unsafe, because any CUDA operations on null pointers can cause 9 | /// undefined behavior. 10 | /// 11 | /// Use the `unsafe` function `DevicePtr::null` to create a null pointer in cases where usage is safe. 12 | pub struct DevicePtr { 13 | addr: *mut std::ffi::c_void, 14 | } 15 | 16 | impl DevicePtr { 17 | /// Create from device address. 18 | /// 19 | /// # Arguments 20 | /// 21 | /// * `addr` - Address of pointer. 22 | #[inline] 23 | pub fn from_addr(addr: *mut std::ffi::c_void) -> Self { 24 | if !addr.is_null() { 25 | DevicePtr { addr } 26 | } else { 27 | panic!("unexpected null pointer"); 28 | } 29 | } 30 | 31 | /// Create null pointer. 32 | /// 33 | /// # Safety 34 | /// 35 | /// This is unsafe because operating on a `null` pointer in CUDA code can cause crashes. In some 36 | /// cases it is allowed though, for example, a `null` pointer can designate the default stream 37 | /// in stream-related operations. 38 | #[inline] 39 | pub unsafe fn null() -> Self { 40 | DevicePtr { 41 | addr: std::ptr::null_mut(), 42 | } 43 | } 44 | 45 | /// Whether or not the device pointer is a null pointer. 46 | #[inline] 47 | pub fn is_null(&self) -> bool { 48 | self.addr.is_null() 49 | } 50 | 51 | /// Get the readonly pointer value. 52 | #[inline(always)] 53 | pub fn as_ptr(&self) -> *const std::ffi::c_void { 54 | self.addr as *const std::ffi::c_void 55 | } 56 | 57 | /// Get the mutable pointer value.
58 | #[inline(always)] 59 | pub fn as_mut_ptr(&mut self) -> *mut std::ffi::c_void { 60 | self.addr 61 | } 62 | 63 | /// Take the pointer from this wrapper and replace it with a null pointer. 64 | /// 65 | /// # Safety 66 | /// 67 | /// This operation is unsafe because it creates a null pointer. 68 | /// 69 | /// # Usage 70 | /// 71 | /// This function can be used inside [`Drop`] if it is known that the pointer object will not be 72 | /// used for the remainder of the function scope, and the object is to be dropped. 73 | /// 74 | /// # Example 75 | /// 76 | /// ```ignore 77 | /// # use async_cuda::ffi::DevicePtr; 78 | /// pub struct Object { 79 | /// internal: DevicePtr, 80 | /// } 81 | /// 82 | /// impl Drop for Object { 83 | /// fn drop(&mut self) { 84 | /// // SAFETY: This is safe because `self` and `self.internal` 85 | /// // are not used beyond this unsafe block. 86 | /// let ptr = unsafe { 87 | /// self.internal.take() 88 | /// }; 89 | /// // Properly deallocate the pointer here and do *NOT* 90 | /// // use `self` for anything! 91 | /// } 92 | /// } 93 | /// ``` 94 | #[inline] 95 | pub unsafe fn take(&mut self) -> DevicePtr { 96 | DevicePtr { 97 | // sets `self.addr` to NULL, puts addr in new device ptr 98 | addr: std::mem::replace(&mut self.addr, std::ptr::null_mut()), 99 | } 100 | } 101 | } 102 | 103 | impl std::fmt::Display for DevicePtr { 104 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 105 | write!(f, "{:?}", self.addr) 106 | } 107 | } 108 | 109 | #[cfg(test)] 110 | mod tests { 111 | use super::*; 112 | 113 | #[test] 114 | fn test_it_holds_on() { 115 | let fake = 0xffffffff as *mut std::ffi::c_void; 116 | let ptr = DevicePtr::from_addr(fake); 117 | assert_eq!(ptr.as_ptr(), 0xffffffff as *const std::ffi::c_void); 118 | } 119 | 120 | #[test] 121 | #[should_panic] 122 | fn test_it_panics_when_null() { 123 | let _ = DevicePtr::from_addr(std::ptr::null_mut()); 124 | } 125 | 126 | #[test] 127 | fn test_null() { 128 | let ptr = unsafe { DevicePtr::null() }; 129 | assert!(ptr.is_null()); 130 | assert_eq!(ptr.as_ptr(), std::ptr::null_mut()); 131 | } 132 | 133 | #[test] 134 | fn test_take() { 135 | let fake = 0xffffffff as *mut std::ffi::c_void; 136 | let mut ptr = DevicePtr::from_addr(fake); 137 | assert_eq!( 138 | unsafe { ptr.take().as_ptr() }, 139 | 0xffffffff as *const std::ffi::c_void, 140 | ); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/ffi/npp/copy_constant_border.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::ffi::npp::context::Context; 4 | use crate::ffi::npp::result; 5 | use crate::npp::constant_border::ConstantBorder; 6 | 7 | type Result<T> = std::result::Result<T, crate::npp::Error>; 8 | 9 | /// Synchronous implementation of [`crate::copy_constant_border()`]. 10 | /// 11 | /// Refer to [`crate::copy_constant_border()`] for documentation.
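///
/// # Example
///
/// A minimal sketch of the synchronous call (hedged: buffer contents and error handling are elided; it assumes freshly allocated 3-channel `u8` device buffers and the null-stream context, mirroring the tests below):
///
/// ```ignore
/// let context = Context::from_null_stream();
/// let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(2, 1, 3);
/// let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(4, 5, 3);
/// // Pad `input` with a black border: one pixel left/right, two pixels top/bottom.
/// copy_constant_border(&input, &mut output, &ConstantBorder::black(1, 2), &context).unwrap();
/// ```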
12 | pub fn copy_constant_border( 13 | input: &crate::ffi::memory::DeviceBuffer2D<u8>, 14 | output: &mut crate::ffi::memory::DeviceBuffer2D<u8>, 15 | border: &ConstantBorder, 16 | context: &Context, 17 | ) -> Result<()> { 18 | assert_eq!(input.num_channels, 3, "input image must be in RGB format"); 19 | assert_eq!(output.num_channels, 3, "output image must be in RGB format"); 20 | 21 | let (src_pitch, src_width, src_height) = (input.pitch, input.width as i32, input.height as i32); 22 | let (dst_pitch, dst_width, dst_height) = 23 | (output.pitch, output.width as i32, output.height as i32); 24 | 25 | let (border_left, border_top) = (border.left as i32, border.top as i32); 26 | let border_color_ptr = border.color.as_ptr(); 27 | 28 | let src_ptr = input.as_internal().as_ptr(); 29 | let dst_ptr = output.as_mut_internal().as_mut_ptr(); 30 | let context_ptr = context.as_ptr(); 31 | let ret = cpp!(unsafe [ 32 | src_ptr as "const void*", 33 | src_pitch as "std::size_t", 34 | src_width as "std::int32_t", 35 | src_height as "std::int32_t", 36 | dst_ptr as "void*", 37 | dst_pitch as "std::size_t", 38 | dst_width as "std::int32_t", 39 | dst_height as "std::int32_t", 40 | border_left as "std::int32_t", 41 | border_top as "std::int32_t", 42 | border_color_ptr as "const std::uint8_t*", 43 | context_ptr as "void*" 44 | ] -> i32 as "std::int32_t" { 45 | NppiSize src_size = { src_width, src_height }; 46 | NppiSize dst_size = { dst_width, dst_height }; 47 | return nppiCopyConstBorder_8u_C3R_Ctx( 48 | (const Npp8u*) src_ptr, 49 | src_pitch, 50 | src_size, 51 | (Npp8u*) dst_ptr, 52 | dst_pitch, 53 | dst_size, 54 | border_top, 55 | border_left, 56 | border_color_ptr, 57 | *((NppStreamContext*) context_ptr) 58 | ); 59 | }); 60 | result!(ret) 61 | } 62 | 63 | #[cfg(test)] 64 | mod tests { 65 | use super::*; 66 | 67 | use crate::ffi::npp::context::Context; 68 | use crate::npp::tests::image::*; 69 | use crate::npp::tests::sync::memory::*; 70 | 71 | #[test] 72 | fn test_copy_constant_border() { 73 | // Input image is 1x2 and just contains one red and one green pixel. 74 | const INPUT: [[Pixel; 2]; 1] = [[R, G]]; 75 | const INPUT_FLAT: [u8; 6] = flatten!(INPUT, 6); 76 | 77 | // Expected output of copy constant border with left border of 1 and top border of 2, if 78 | // the border color is blue.
79 | const OUTPUT: [[Pixel; 4]; 5] = [ 80 | [B, B, B, B], 81 | [B, B, B, B], 82 | [B, R, G, B], 83 | [B, B, B, B], 84 | [B, B, B, B], 85 | ]; 86 | const OUTPUT_FLAT: [u8; 4 * 5 * 3] = flatten!(OUTPUT, 4 * 5 * 3); 87 | 88 | let context = Context::from_null_stream(); 89 | 90 | let image = to_device_2d!(&INPUT_FLAT, 2, 1, 3, &context); 91 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(4, 5, 3); 92 | copy_constant_border(&image, &mut output, &ConstantBorder::new(1, 2, B), &context).unwrap(); 93 | 94 | let output = to_host_2d!(output, &context); 95 | assert_eq!(&output, &OUTPUT_FLAT); 96 | } 97 | 98 | #[test] 99 | #[should_panic] 100 | fn test_it_panics_when_input_num_channels_incorrect() { 101 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2); 102 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 3); 103 | copy_constant_border( 104 | &input, 105 | &mut output, 106 | &ConstantBorder::black(10, 20), 107 | &Context::from_null_stream(), 108 | ) 109 | .unwrap(); 110 | } 111 | 112 | #[test] 113 | #[should_panic] 114 | fn test_it_panics_when_output_num_channels_incorrect() { 115 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 116 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 2); 117 | copy_constant_border( 118 | &input, 119 | &mut output, 120 | &ConstantBorder::black(10, 20), 121 | &Context::from_null_stream(), 122 | ) 123 | .unwrap(); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/ffi/npp/context.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::ffi::device::Device; 4 | use crate::ffi::npp::result; 5 | use crate::stream::Stream; 6 | 7 | /// NPP stream context structure. 8 | /// 9 | /// [NPP documentation](https://docs.nvidia.com/cuda/npp/struct_npp_stream_context.html) 10 | pub struct Context { 11 | raw: *mut std::ffi::c_void, 12 | pub stream: Stream, 13 | } 14 | 15 | /// Implements [`Send`] for [`Context`]. 16 | /// 17 | /// # Safety 18 | /// 19 | /// This is safe because the way we use the underlying `NppStreamContext` object is thread-safe. 20 | unsafe impl Send for Context {} 21 | 22 | /// Implements [`Sync`] for [`Context`]. 23 | /// 24 | /// # Safety 25 | /// 26 | /// This is safe because the way we use the underlying `NppStreamContext` object is thread-safe. 27 | unsafe impl Sync for Context {} 28 | 29 | impl Context { 30 | /// Create context on null stream. 31 | /// 32 | /// This creates a context that can be passed to NPP functions. Any functions using this context 33 | /// will be executed on the null stream. 34 | pub fn from_null_stream() -> Self { 35 | let mut raw = std::ptr::null_mut(); 36 | let raw_ptr = std::ptr::addr_of_mut!(raw); 37 | // SAFETY: 38 | // * Must call this function on runtime since `nppGetStreamContext` needs the correct thread 39 | // locals to determine current device and other context settings. 40 | // * We can store a reference to the stream in `NppStreamContext` as long as we make sure 41 | // `NppStreamContext` cannot outlive the stream, which we can guarantee because we take 42 | // ownership of the stream.
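// * The `NppStreamContext` allocated in the C++ block below is owned by this `Context` and freed again in `Context::delete`.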
43 | let ret = cpp!(unsafe [ 44 | raw_ptr as "void**" 45 | ] -> i32 as "std::int32_t" { 46 | NppStreamContext* stream_context = new NppStreamContext(); 47 | NppStatus ret = nppGetStreamContext(stream_context); 48 | if (ret == NPP_SUCCESS) { 49 | stream_context->hStream = nullptr; 50 | *raw_ptr = (void*) stream_context; 51 | } 52 | return ret; 53 | }); 54 | match result!(ret) { 55 | Ok(()) => Self { 56 | raw, 57 | stream: Stream::null(), 58 | }, 59 | Err(err) => { 60 | panic!("failed to get current NPP stream context: {err}") 61 | } 62 | } 63 | } 64 | 65 | /// Create context. 66 | /// 67 | /// This creates an NPP context object. It can be passed to NPP functions, and they will execute 68 | /// on the associated stream. 69 | /// 70 | /// # Arguments 71 | /// 72 | /// * `stream` - Stream to associate with context. 73 | pub fn from_stream(stream: Stream) -> Self { 74 | let (ret, raw) = { 75 | let mut raw = std::ptr::null_mut(); 76 | let raw_ptr = std::ptr::addr_of_mut!(raw); 77 | let stream_ptr = stream.inner().as_internal().as_ptr(); 78 | let device_id = stream.inner().device(); 79 | // SAFETY: 80 | // * Must call this function on runtime since `nppGetStreamContext` needs the correct 81 | // thread locals to determine current device and other context settings. 82 | // * We can store a reference to the stream in `NppStreamContext` as long as we make 83 | // sure `NppStreamContext` cannot outlive the stream, which we can guarantee because 84 | // we take ownership of the stream. 85 | let ret = cpp!(unsafe [ 86 | raw_ptr as "void**", 87 | stream_ptr as "void*", 88 | device_id as "int" 89 | ] -> i32 as "std::int32_t" { 90 | NppStreamContext* stream_context = new NppStreamContext(); 91 | NppStatus ret = nppGetStreamContext(stream_context); 92 | if (ret == NPP_SUCCESS) { 93 | stream_context->hStream = (cudaStream_t) stream_ptr; 94 | stream_context->nCudaDeviceId = device_id; 95 | *raw_ptr = (void*) stream_context; 96 | } 97 | return ret; 98 | }); 99 | (ret, raw) 100 | }; 101 | match result!(ret) { 102 | Ok(()) => Self { raw, stream }, 103 | Err(err) => { 104 | panic!("failed to get current NPP stream context: {err}") 105 | } 106 | } 107 | } 108 | 109 | /// Get internal readonly pointer. 110 | #[inline] 111 | pub(crate) fn as_ptr(&self) -> *const std::ffi::c_void { 112 | self.raw 113 | } 114 | 115 | /// Delete the context. 116 | /// 117 | /// # Panics 118 | /// 119 | /// This function panics if binding to the corresponding device fails. 120 | /// 121 | /// # Safety 122 | /// 123 | /// The context may not be used after this function is called, except for being dropped. 124 | pub unsafe fn delete(&mut self) { 125 | if self.raw.is_null() { 126 | return; 127 | } 128 | 129 | Device::set_or_panic(self.stream.inner().device()); 130 | 131 | let raw = self.raw; 132 | self.raw = std::ptr::null_mut(); 133 | 134 | cpp!(unsafe [raw as "void*"] { 135 | delete ((NppStreamContext*) raw); 136 | }); 137 | } 138 | } 139 | 140 | impl Drop for Context { 141 | #[inline] 142 | fn drop(&mut self) { 143 | // SAFETY: This is safe since the context cannot be used after this.
144 | unsafe { 145 | self.delete(); 146 | } 147 | } 148 | } 149 | 150 | #[cfg(test)] 151 | mod tests { 152 | use super::*; 153 | 154 | #[tokio::test] 155 | async fn test_from_stream() { 156 | let stream = Stream::new().await.unwrap(); 157 | let context = Context::from_stream(stream); 158 | assert!(!context.as_ptr().is_null()); 159 | assert!(!context.stream.inner().as_internal().as_ptr().is_null()); 160 | } 161 | 162 | #[test] 163 | fn test_from_null_stream() { 164 | let context = Context::from_null_stream(); 165 | assert!(!context.as_ptr().is_null()); 166 | assert!(context.stream.inner().as_internal().as_ptr().is_null()); 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/npp/resize.rs: -------------------------------------------------------------------------------- 1 | use crate::memory::DeviceBuffer2D; 2 | use crate::npp::region::Region; 3 | use crate::npp::stream::Stream; 4 | use crate::runtime::Future; 5 | 6 | type Result<T> = std::result::Result<T, crate::npp::Error>; 7 | 8 | /// Resize an image using bilinear interpolation. This function expects a reference to a device 9 | /// image for input, and a mutable reference to a device image to place the output in. 10 | /// 11 | /// This function assumes the following about the input and output images: 12 | /// * Images are in RGB format. 13 | /// * Images are in standard memory order, i.e. HWC. 14 | /// 15 | /// # Stream ordered semantics 16 | /// 17 | /// This function uses stream ordered semantics. It can only be guaranteed to complete sequentially 18 | /// relative to operations scheduled on the same stream or the default stream. 19 | /// 20 | /// # Arguments 21 | /// 22 | /// * `input` - The on-device input image. 23 | /// * `input_region` - Specify region of interest in input image. This can be used to combine crop 24 | /// and resize in a single operation. 25 | /// * `output_region` - Specify region of interest in output image. 26 | /// * `output` - The on-device output image. 27 | /// * `stream` - Stream to use. 28 | pub async fn resize( 29 | input: &DeviceBuffer2D<u8>, 30 | input_region: Region, 31 | output: &mut DeviceBuffer2D<u8>, 32 | output_region: Region, 33 | stream: &Stream, 34 | ) -> Result<()> { 35 | assert_eq!(input.num_channels(), 3, "input image must be in RGB format"); 36 | assert_eq!( 37 | output.num_channels(), 38 | 3, 39 | "output image must be in RGB format" 40 | ); 41 | 42 | let context = stream.to_context(); 43 | Future::new(move || { 44 | crate::ffi::npp::resize::resize( 45 | input.inner(), 46 | input_region, 47 | output.inner_mut(), 48 | output_region, 49 | &context, 50 | ) 51 | }) 52 | .await 53 | } 54 | 55 | #[cfg(test)] 56 | mod tests { 57 | use super::*; 58 | 59 | use crate::memory::DeviceBuffer2D; 60 | use crate::npp::stream::Stream; 61 | use crate::npp::tests::image::*; 62 | use crate::npp::tests::memory::*; 63 | 64 | #[tokio::test] 65 | async fn test_resize() { 66 | // This is the expected result when resizing the RGB flag to 2 by 2 with bilinear 67 | // interpolation.
68 | const OUTPUT: Image2x2 = [[R, R], [R, B]]; 69 | const OUTPUT_FLAT: [u8; 2 * 2 * 3] = flatten!(OUTPUT, 2 * 2 * 3); 70 | 71 | let stream = Stream::new().await.unwrap(); 72 | 73 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &stream); 74 | let mut output = DeviceBuffer2D::<u8>::new(2, 2, 3).await; 75 | resize(&image, Region::Full, &mut output, Region::Full, &stream) 76 | .await 77 | .unwrap(); 78 | 79 | let output = to_host_2d!(output, &stream); 80 | assert_eq!(&output, &OUTPUT_FLAT); 81 | } 82 | 83 | #[tokio::test] 84 | async fn test_resize_with_input_region() { 85 | // This is the raw expected result when resizing the center part of the RGB flag from two by 86 | // two to four by four. 87 | #[rustfmt::skip] 88 | #[allow(clippy::zero_prefixed_literal)] 89 | const OUTPUT: [u8; 4 * 4 * 3] = [ 90 | 000, 255, 000, 000, 255, 000, 000, 255, 000, 064, 191, 000, 91 | 000, 191, 064, 000, 191, 064, 000, 191, 064, 064, 143, 048, 92 | 000, 064, 191, 000, 064, 191, 000, 064, 191, 064, 048, 143, 93 | 064, 000, 191, 064, 000, 191, 064, 000, 191, 112, 000, 143, 94 | ]; 95 | 96 | let stream = Stream::new().await.unwrap(); 97 | 98 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &stream); 99 | let center = Region::Rectangle { 100 | x: 1, 101 | y: 1, 102 | width: 2, 103 | height: 2, 104 | }; 105 | let mut output = DeviceBuffer2D::<u8>::new(4, 4, 3).await; 106 | resize(&image, center, &mut output, Region::Full, &stream) 107 | .await 108 | .unwrap(); 109 | 110 | let output = to_host_2d!(output, &stream); 111 | assert_eq!(&output, &OUTPUT); 112 | } 113 | 114 | #[tokio::test] 115 | async fn test_resize_with_output_region() { 116 | #[rustfmt::skip] 117 | const INPUT: [u8; 2 * 2 * 3] = [ 118 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 119 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 120 | ]; 121 | #[rustfmt::skip] 122 | const EXPECTED_OUTPUT: [u8; 2 * 2 * 3] = [ 123 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 124 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 125 | ]; 126 | 127 | let stream = Stream::new().await.unwrap(); 128 | let bottom_half = Region::Rectangle { 129 | x: 0, 130 | y: 1, 131 | width: 2, 132 | height: 1, 133 | }; 134 | 135 | let image = to_device_2d!(&INPUT, 2, 2, 3, &stream); 136 | let mut output = DeviceBuffer2D::<u8>::new(2, 2, 3).await; 137 | output.fill_with_byte(0x00, &stream).await.unwrap(); 138 | resize(&image, Region::Full, &mut output, bottom_half, &stream) 139 | .await 140 | .unwrap(); 141 | 142 | let output = to_host_2d!(output, &stream); 143 | assert_eq!(&output, &EXPECTED_OUTPUT); 144 | } 145 | 146 | #[tokio::test] 147 | #[should_panic] 148 | async fn test_it_panics_when_input_num_channels_incorrect() { 149 | let input = DeviceBuffer2D::<u8>::new(100, 100, 2).await; 150 | let mut output = DeviceBuffer2D::<u8>::new(200, 200, 3).await; 151 | resize( 152 | &input, 153 | Region::Full, 154 | &mut output, 155 | Region::Full, 156 | &Stream::null().await, 157 | ) 158 | .await 159 | .unwrap(); 160 | } 161 | 162 | #[tokio::test] 163 | #[should_panic] 164 | async fn test_it_panics_when_output_num_channels_incorrect() { 165 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 166 | let mut output = DeviceBuffer2D::<u8>::new(200, 200, 2).await; 167 | resize( 168 | &input, 169 | Region::Full, 170 | &mut output, 171 | Region::Full, 172 | &Stream::null().await, 173 | ) 174 | .await 175 | .unwrap(); 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/npp/remap.rs: -------------------------------------------------------------------------------- 1 | use crate::memory::DeviceBuffer2D; 2 | use
crate::npp::stream::Stream; 3 | use crate::runtime::Future; 4 | 5 | type Result<T> = std::result::Result<T, crate::npp::Error>; 6 | 7 | /// Remaps an image using bilinear interpolation. This function expects references to device 8 | /// buffers as inputs, and a mutable reference to a device buffer to store the output of the 9 | /// operation in. 10 | /// 11 | /// This function assumes the following about the input and output images: 12 | /// * Images are in RGB format. 13 | /// * Images are in standard memory order, i.e. HWC. 14 | /// 15 | /// # Stream ordered semantics 16 | /// 17 | /// This function uses stream ordered semantics. It can only be guaranteed to complete sequentially 18 | /// relative to operations scheduled on the same stream or the default stream. 19 | /// 20 | /// # Arguments 21 | /// 22 | /// * `input` - The on-device input image. 23 | /// * `output` - The on-device output image. 24 | /// * `map_x` - On-device X pixel map. 25 | /// * `map_y` - On-device Y pixel map. 26 | /// * `stream` - Stream to use. 27 | pub async fn remap( 28 | input: &DeviceBuffer2D<u8>, 29 | output: &mut DeviceBuffer2D<u8>, 30 | map_x: &DeviceBuffer2D<f32>, 31 | map_y: &DeviceBuffer2D<f32>, 32 | stream: &Stream, 33 | ) -> Result<()> { 34 | assert_eq!(input.num_channels(), 3, "input image must be in RGB format"); 35 | assert_eq!( 36 | output.num_channels(), 37 | 3, 38 | "output image must be in RGB format" 39 | ); 40 | assert_eq!(map_x.num_channels(), 1, "map must have one channel"); 41 | assert_eq!(map_y.num_channels(), 1, "map must have one channel"); 42 | assert_eq!( 43 | output.width(), 44 | map_x.width(), 45 | "map x must have same width as output image" 46 | ); 47 | assert_eq!( 48 | output.height(), 49 | map_x.height(), 50 | "map x must have same height as output image" 51 | ); 52 | assert_eq!( 53 | output.width(), 54 | map_y.width(), 55 | "map y must have same width as output image" 56 | ); 57 | assert_eq!( 58 | output.height(), 59 | map_y.height(), 60 | "map y must have same height as output image" 61 | ); 62 | 63 | let context = stream.to_context(); 64 | Future::new(move || { 65 | crate::ffi::npp::remap::remap( 66 | input.inner(), 67 | output.inner_mut(), 68 | map_x.inner(), 69 | map_y.inner(), 70 | &context, 71 | ) 72 | }) 73 | .await 74 | } 75 | 76 | #[cfg(test)] 77 | mod tests { 78 | use super::*; 79 | 80 | use crate::memory::DeviceBuffer2D; 81 | use crate::npp::stream::Stream; 82 | use crate::npp::tests::image::*; 83 | use crate::npp::tests::memory::*; 84 | 85 | #[tokio::test] 86 | async fn test_remap() { 87 | const MAP_X: &[f32; 16] = &[ 88 | 0.0, 1.0, 2.0, 3.0, // No mapping at all 89 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 90 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 91 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 92 | ]; 93 | const MAP_Y: &[f32; 16] = &[ 94 | 0.0, 0.0, 0.0, 0.0, // No mapping at all 95 | 1.0, 1.0, 1.0, 1.0, // Take from green band 96 | 1.0, 1.0, 1.0, 1.0, // Take from green band 97 | 2.0, 2.0, 2.0, 2.0, // Take from blue band 98 | ]; 99 | const OUTPUT: Image4x4 = [ 100 | [R, R, R, R], // Red band 101 | [G, G, G, G], // Green band 102 | [G, G, G, G], // Green band 103 | [B, B, B, B], // Blue band 104 | ]; 105 | const OUTPUT_FLAT: [u8; 4 * 4 * 3] = flatten!(OUTPUT, 4 * 4 * 3); 106 | 107 | let stream = Stream::new().await.unwrap(); 108 | 109 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &stream); 110 | let map_x = to_device_2d!(MAP_X, 4, 4, 1, &stream); 111 | let map_y = to_device_2d!(MAP_Y, 4, 4, 1, &stream); 112 | let mut output = DeviceBuffer2D::<u8>::new(4, 4, 3).await; 113 |
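// Per the maps above: row 0 is passed through unchanged, rows 1 and 2 sample the green band while skipping the red border column, and row 3 samples the blue band.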
assert!(remap(&image, &mut output, &map_x, &map_y, &stream) 114 | .await 115 | .is_ok()); 116 | 117 | let output = to_host_2d!(output, &stream); 118 | assert_eq!(&output, &OUTPUT_FLAT); 119 | } 120 | 121 | #[tokio::test] 122 | #[should_panic] 123 | async fn test_it_panics_when_input_num_channels_incorrect() { 124 | let input = DeviceBuffer2D::<u8>::new(100, 100, 2).await; 125 | let map_x = DeviceBuffer2D::<f32>::new(100, 100, 1).await; 126 | let map_y = DeviceBuffer2D::<f32>::new(100, 100, 1).await; 127 | let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 128 | remap(&input, &mut output, &map_x, &map_y, &Stream::null().await) 129 | .await 130 | .unwrap(); 131 | } 132 | 133 | #[tokio::test] 134 | #[should_panic] 135 | async fn test_it_panics_when_output_num_channels_incorrect() { 136 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 137 | let map_x = DeviceBuffer2D::<f32>::new(100, 100, 1).await; 138 | let map_y = DeviceBuffer2D::<f32>::new(100, 100, 1).await; 139 | let mut output = DeviceBuffer2D::<u8>::new(100, 100, 2).await; 140 | remap(&input, &mut output, &map_x, &map_y, &Stream::null().await) 141 | .await 142 | .unwrap(); 143 | } 144 | 145 | #[tokio::test] 146 | #[should_panic] 147 | async fn test_it_panics_when_map_num_channels_incorrect() { 148 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 149 | let map_x = DeviceBuffer2D::<f32>::new(100, 100, 2).await; 150 | let map_y = DeviceBuffer2D::<f32>::new(100, 100, 3).await; 151 | let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 152 | remap(&input, &mut output, &map_x, &map_y, &Stream::null().await) 153 | .await 154 | .unwrap(); 155 | } 156 | 157 | #[tokio::test] 158 | #[should_panic] 159 | async fn test_it_panics_when_map_width_incorrect() { 160 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 161 | let map_x = DeviceBuffer2D::<f32>::new(120, 100, 1).await; 162 | let map_y = DeviceBuffer2D::<f32>::new(120, 100, 1).await; 163 | let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 164 | remap(&input, &mut output, &map_x, &map_y, &Stream::null().await) 165 | .await 166 | .unwrap(); 167 | } 168 | 169 | #[tokio::test] 170 | #[should_panic] 171 | async fn test_it_panics_when_map_height_incorrect() { 172 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 173 | let map_x = DeviceBuffer2D::<f32>::new(100, 120, 1).await; 174 | let map_y = DeviceBuffer2D::<f32>::new(100, 120, 1).await; 175 | let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 176 | remap(&input, &mut output, &map_x, &map_y, &Stream::null().await) 177 | .await 178 | .unwrap(); 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /src/ffi/stream.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::device::DeviceId; 4 | use crate::ffi::device::Device; 5 | use crate::ffi::ptr::DevicePtr; 6 | use crate::ffi::result; 7 | 8 | type Result<T> = std::result::Result<T, crate::error::Error>; 9 | 10 | /// Synchronous implementation of [`crate::Stream`]. 11 | /// 12 | /// Refer to [`crate::Stream`] for documentation. 13 | pub struct Stream { 14 | internal: DevicePtr, 15 | device: DeviceId, 16 | } 17 | 18 | /// Implements [`Send`] for [`Stream`]. 19 | /// 20 | /// # Safety 21 | /// 22 | /// This property is inherited from the CUDA API, which is thread-safe. 23 | unsafe impl Send for Stream {} 24 | 25 | /// Implements [`Sync`] for [`Stream`]. 26 | /// 27 | /// # Safety 28 | /// 29 | /// This property is inherited from the CUDA API, which is thread-safe.
30 | unsafe impl Sync for Stream {} 31 | 32 | impl Stream { 33 | pub fn null() -> Self { 34 | Self { 35 | // SAFETY: This is safe because a null pointer for stream indicates the default 36 | // stream in CUDA and all functions accept this. 37 | internal: unsafe { DevicePtr::null() }, 38 | device: Device::get_or_panic(), 39 | } 40 | } 41 | 42 | pub fn new() -> Result<Self> { 43 | let device = Device::get()?; 44 | let mut ptr: *mut std::ffi::c_void = std::ptr::null_mut(); 45 | let ptr_ptr = std::ptr::addr_of_mut!(ptr); 46 | let ret = cpp!(unsafe [ 47 | ptr_ptr as "void**" 48 | ] -> i32 as "std::int32_t" { 49 | return cudaStreamCreate((cudaStream_t*) ptr_ptr); 50 | }); 51 | result!( 52 | ret, 53 | Stream { 54 | internal: DevicePtr::from_addr(ptr), 55 | device, 56 | } 57 | ) 58 | } 59 | 60 | pub fn synchronize(&self) -> Result<()> { 61 | Device::set(self.device)?; 62 | let ptr = self.internal.as_ptr(); 63 | let ret = cpp!(unsafe [ 64 | ptr as "void*" 65 | ] -> i32 as "std::int32_t" { 66 | return cudaStreamSynchronize((cudaStream_t) ptr); 67 | }); 68 | result!(ret) 69 | } 70 | 71 | pub fn add_callback(&self, f: impl FnOnce() + Send) -> Result<()> { 72 | Device::set(self.device)?; 73 | let ptr = self.internal.as_ptr(); 74 | let f_boxed = Box::new(f) as Box<dyn FnOnce() + Send>; 75 | let f_boxed2 = Box::new(f_boxed); 76 | let f_boxed2_ptr = Box::into_raw(f_boxed2); 77 | let user_data = f_boxed2_ptr as *mut std::ffi::c_void; 78 | let ret = cpp!(unsafe [ 79 | ptr as "void*", 80 | user_data as "void*" 81 | ] -> i32 as "std::int32_t" { 82 | return cudaStreamAddCallback( 83 | (cudaStream_t) ptr, 84 | cuda_ffi_Callback, 85 | user_data, 86 | 0 87 | ); 88 | }); 89 | result!(ret) 90 | } 91 | 92 | /// Get readonly reference to internal [`DevicePtr`]. 93 | #[inline(always)] 94 | pub fn as_internal(&self) -> &DevicePtr { 95 | &self.internal 96 | } 97 | 98 | /// Get mutable reference to internal [`DevicePtr`]. 99 | #[inline(always)] 100 | pub fn as_mut_internal(&mut self) -> &mut DevicePtr { 101 | &mut self.internal 102 | } 103 | 104 | /// Get corresponding device as [`DeviceId`]. 105 | #[inline(always)] 106 | pub fn device(&self) -> DeviceId { 107 | self.device 108 | } 109 | 110 | /// Destroy stream. 111 | /// 112 | /// # Panics 113 | /// 114 | /// This function panics if binding to the corresponding device fails. 115 | /// 116 | /// # Safety 117 | /// 118 | /// The object may not be used after this function is called, except for being dropped. 119 | pub unsafe fn destroy(&mut self) { 120 | if self.internal.is_null() { 121 | return; 122 | } 123 | 124 | Device::set_or_panic(self.device); 125 | 126 | // SAFETY: This will cause `self` to hold a null pointer. It is safe here because we don't 127 | // use the object after this. 128 | let mut internal = unsafe { self.internal.take() }; 129 | let ptr = internal.as_mut_ptr(); 130 | 131 | // SAFETY: We must synchronize the stream before destroying it to make sure we are not 132 | // dropping a stream that still has operations pending. 133 | let _ret = cpp!(unsafe [ 134 | ptr as "void*" 135 | ] -> i32 as "std::int32_t" { 136 | return cudaStreamSynchronize((cudaStream_t) ptr); 137 | }); 138 | 139 | let _ret = cpp!(unsafe [ 140 | ptr as "void*" 141 | ] -> i32 as "std::int32_t" { 142 | return cudaStreamDestroy((cudaStream_t) ptr); 143 | }); 144 | } 145 | } 146 | 147 | impl Drop for Stream { 148 | #[inline] 149 | fn drop(&mut self) { 150 | // SAFETY: This is safe since the object cannot be used after this. 151 | unsafe { 152 | self.destroy(); 153 | } 154 | } 155 | } 156 | 157 | cpp!
{{ 158 | /// Holds the C++ code that makes up the native part required to get our CUDA callback to work 159 | /// over the FFI. 160 | /// 161 | /// # Arguments 162 | /// 163 | /// * `stream` - The CUDA stream on which the callback was scheduled. 164 | /// * `status` - The CUDA status value (this could represent an error from an earlier async CUDA 165 | /// call). 166 | /// * `user_data` - The user data pointer provided when adding the callback. 167 | /// 168 | /// # Example 169 | /// 170 | /// It can be used like so: 171 | /// 172 | /// ```cpp 173 | /// return cudaStreamAddCallback( 174 | /// stream, 175 | /// cuda_ffi_Callback, 176 | /// user_data, 177 | /// 0 178 | /// ); 179 | /// ``` 180 | static void cuda_ffi_Callback( 181 | __attribute__((unused)) cudaStream_t stream, 182 | cudaError_t status, 183 | void* user_data 184 | ) { 185 | rust!(cuda_ffi_Callback_internal [ 186 | status : i32 as "std::int32_t", 187 | user_data : *mut std::ffi::c_void as "void*" 188 | ] { 189 | // SAFETY: We boxed the closure ourselves and did `Box::into_raw`, which allows us to 190 | // reinstate the box here and use it accordingly. It will be dropped here after use. 191 | unsafe { 192 | let user_data = std::mem::transmute(user_data); 193 | let function = Box::<Box<dyn FnOnce() + Send>>::from_raw(user_data); 194 | function() 195 | } 196 | }); 197 | } 198 | }} 199 | 200 | #[cfg(test)] 201 | mod tests { 202 | use super::*; 203 | 204 | #[test] 205 | fn test_new() { 206 | assert!(Stream::new().is_ok()); 207 | } 208 | 209 | #[test] 210 | fn test_synchronize() { 211 | let stream = Stream::new().unwrap(); 212 | assert!(stream.synchronize().is_ok()); 213 | } 214 | 215 | #[test] 216 | fn test_synchronize_null_stream() { 217 | let stream = Stream::null(); 218 | assert!(stream.synchronize().is_ok()); 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/ffi/npp/resize.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::ffi::npp::context::Context; 4 | use crate::ffi::npp::result; 5 | use crate::npp::region::Region; 6 | 7 | type Result<T> = std::result::Result<T, crate::npp::Error>; 8 | 9 | /// Synchronous implementation of [`crate::resize()`]. 10 | /// 11 | /// Refer to [`crate::resize()`] for documentation.
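///
/// # Example
///
/// A minimal sketch of a full-frame synchronous resize on the null stream (hedged: buffer contents are elided; this mirrors the tests below):
///
/// ```ignore
/// let context = Context::from_null_stream();
/// let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(4, 4, 3);
/// let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(2, 2, 3);
/// // Downscale the whole input frame into the whole output frame.
/// resize(&input, Region::Full, &mut output, Region::Full, &context).unwrap();
/// ```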
12 | pub fn resize( 13 | input: &crate::ffi::memory::DeviceBuffer2D<u8>, 14 | input_region: Region, 15 | output: &mut crate::ffi::memory::DeviceBuffer2D<u8>, 16 | output_region: Region, 17 | context: &Context, 18 | ) -> Result<()> { 19 | assert_eq!(input.num_channels, 3, "input image must be in RGB format"); 20 | assert_eq!(output.num_channels, 3, "output image must be in RGB format"); 21 | 22 | let (src_pitch, src_width, src_height) = (input.pitch, input.width as i32, input.height as i32); 23 | let (src_rect_x, src_rect_y, src_rect_width, src_rect_height) = 24 | input_region.resolve_to_xywh(src_width as usize, src_height as usize); 25 | let (src_rect_x, src_rect_y, src_rect_width, src_rect_height) = ( 26 | src_rect_x as i32, 27 | src_rect_y as i32, 28 | src_rect_width as i32, 29 | src_rect_height as i32, 30 | ); 31 | 32 | let (dst_pitch, dst_width, dst_height) = 33 | (output.pitch, output.width as i32, output.height as i32); 34 | let (dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height) = 35 | output_region.resolve_to_xywh(dst_width as usize, dst_height as usize); 36 | let (dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height) = ( 37 | dst_rect_x as i32, 38 | dst_rect_y as i32, 39 | dst_rect_width as i32, 40 | dst_rect_height as i32, 41 | ); 42 | 43 | let src_ptr = input.as_internal().as_ptr(); 44 | let dst_ptr = output.as_mut_internal().as_mut_ptr(); 45 | let context_ptr = context.as_ptr(); 46 | let ret = cpp!(unsafe [ 47 | src_ptr as "const void*", 48 | src_pitch as "std::size_t", 49 | src_width as "std::int32_t", 50 | src_height as "std::int32_t", 51 | src_rect_x as "std::int32_t", 52 | src_rect_y as "std::int32_t", 53 | src_rect_width as "std::int32_t", 54 | src_rect_height as "std::int32_t", 55 | dst_ptr as "void*", 56 | dst_pitch as "std::size_t", 57 | dst_width as "std::int32_t", 58 | dst_height as "std::int32_t", 59 | dst_rect_x as "std::int32_t", 60 | dst_rect_y as "std::int32_t", 61 | dst_rect_width as "std::int32_t", 62 | dst_rect_height as "std::int32_t", 63 | context_ptr as "void*" 64 | ] -> i32 as "std::int32_t" { 65 | NppiSize src_size = { src_width, src_height }; 66 | NppiSize dst_size = { dst_width, dst_height }; 67 | NppiRect src_rect = { src_rect_x, src_rect_y, src_rect_width, src_rect_height }; 68 | NppiRect dst_rect = { dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height }; 69 | return nppiResize_8u_C3R_Ctx( 70 | (const Npp8u*) src_ptr, 71 | src_pitch, 72 | src_size, 73 | src_rect, 74 | (Npp8u*) dst_ptr, 75 | dst_pitch, 76 | dst_size, 77 | dst_rect, 78 | // We use bilinear interpolation, which is the fastest resize method that does not 79 | // produce messed up quality. 80 | NPPI_INTER_LINEAR, 81 | *((NppStreamContext*) context_ptr) 82 | ); 83 | }); 84 | result!(ret) 85 | } 86 | 87 | #[cfg(test)] 88 | mod tests { 89 | use super::*; 90 | 91 | use crate::ffi::npp::context::Context; 92 | use crate::npp::tests::image::*; 93 | use crate::npp::tests::sync::memory::*; 94 | 95 | #[test] 96 | fn test_resize() { 97 | // This is the expected result when resizing the RGB flag to 2 by 2 with bilinear 98 | // interpolation.
99 | const OUTPUT: Image2x2 = [[R, R], [R, B]]; 100 | const OUTPUT_FLAT: [u8; 2 * 2 * 3] = flatten!(OUTPUT, 2 * 2 * 3); 101 | 102 | let context = Context::from_null_stream(); 103 | 104 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &context); 105 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(2, 2, 3); 106 | resize(&image, Region::Full, &mut output, Region::Full, &context).unwrap(); 107 | 108 | let output = to_host_2d!(output, &context); 109 | assert_eq!(&output, &OUTPUT_FLAT); 110 | } 111 | 112 | #[test] 113 | fn test_resize_with_input_region() { 114 | // This is the raw expected result when resizing the center part of the RGB flag from two by 115 | // two to four by four. 116 | #[rustfmt::skip] 117 | #[allow(clippy::zero_prefixed_literal)] 118 | const OUTPUT: [u8; 4 * 4 * 3] = [ 119 | 000, 255, 000, 000, 255, 000, 000, 255, 000, 064, 191, 000, 120 | 000, 191, 064, 000, 191, 064, 000, 191, 064, 064, 143, 048, 121 | 000, 064, 191, 000, 064, 191, 000, 064, 191, 064, 048, 143, 122 | 064, 000, 191, 064, 000, 191, 064, 000, 191, 112, 000, 143, 123 | ]; 124 | 125 | let context = Context::from_null_stream(); 126 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &context); 127 | let center = Region::Rectangle { 128 | x: 1, 129 | y: 1, 130 | width: 2, 131 | height: 2, 132 | }; 133 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(4, 4, 3); 134 | resize(&image, center, &mut output, Region::Full, &context).unwrap(); 135 | 136 | let output = to_host_2d!(output, &context); 137 | assert_eq!(&output, &OUTPUT); 138 | } 139 | 140 | #[test] 141 | fn test_resize_with_output_region() { 142 | #[rustfmt::skip] 143 | const INPUT: [u8; 2 * 2 * 3] = [ 144 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 145 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 146 | ]; 147 | #[rustfmt::skip] 148 | const EXPECTED_OUTPUT: [u8; 2 * 2 * 3] = [ 149 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 150 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 151 | ]; 152 | 153 | let context = Context::from_null_stream(); 154 | let bottom_half = Region::Rectangle { 155 | x: 0, 156 | y: 1, 157 | width: 2, 158 | height: 1, 159 | }; 160 | 161 | let image = to_device_2d!(&INPUT, 2, 2, 3, &context); 162 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(2, 2, 3); 163 | output.fill_with_byte(0x00, context.stream.inner()).unwrap(); 164 | resize(&image, Region::Full, &mut output, bottom_half, &context).unwrap(); 165 | 166 | let output = to_host_2d!(output, &context); 167 | assert_eq!(&output, &EXPECTED_OUTPUT); 168 | } 169 | 170 | #[test] 171 | #[should_panic] 172 | fn test_it_panics_when_input_num_channels_incorrect() { 173 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2); 174 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 3); 175 | resize( 176 | &input, 177 | Region::Full, 178 | &mut output, 179 | Region::Full, 180 | &Context::from_null_stream(), 181 | ) 182 | .unwrap(); 183 | } 184 | 185 | #[test] 186 | #[should_panic] 187 | fn test_it_panics_when_output_num_channels_incorrect() { 188 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 189 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 2); 190 | resize( 191 | &input, 192 | Region::Full, 193 | &mut output, 194 | Region::Full, 195 | &Context::from_null_stream(), 196 | ) 197 | .unwrap(); 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /src/ffi/npp/remap.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 |
use crate::ffi::npp::context::Context; 4 | use crate::ffi::npp::result; 5 | 6 | type Result<T> = std::result::Result<T, crate::npp::Error>; 7 | 8 | /// Synchronous implementation of [`crate::remap()`]. 9 | /// 10 | /// Refer to [`crate::remap()`] for documentation. 11 | pub fn remap( 12 | input: &crate::ffi::memory::DeviceBuffer2D<u8>, 13 | output: &mut crate::ffi::memory::DeviceBuffer2D<u8>, 14 | map_x: &crate::ffi::memory::DeviceBuffer2D<f32>, 15 | map_y: &crate::ffi::memory::DeviceBuffer2D<f32>, 16 | context: &Context, 17 | ) -> Result<()> { 18 | assert_eq!(input.num_channels, 3, "input image must be in RGB format"); 19 | assert_eq!(output.num_channels, 3, "output image must be in RGB format"); 20 | assert_eq!(map_x.num_channels, 1, "map must have one channel"); 21 | assert_eq!(map_y.num_channels, 1, "map must have one channel"); 22 | assert_eq!( 23 | output.width, map_x.width, 24 | "map x must have same width as output image" 25 | ); 26 | assert_eq!( 27 | output.height, map_x.height, 28 | "map x must have same height as output image" 29 | ); 30 | assert_eq!( 31 | output.width, map_y.width, 32 | "map y must have same width as output image" 33 | ); 34 | assert_eq!( 35 | output.height, map_y.height, 36 | "map y must have same height as output image" 37 | ); 38 | 39 | let (src_width, src_height, src_pitch) = (input.width as i32, input.height as i32, input.pitch); 40 | let (dst_width, dst_height, dst_pitch) = 41 | (output.width as i32, output.height as i32, output.pitch); 42 | 43 | let map_x_pitch = map_x.pitch; 44 | let map_y_pitch = map_y.pitch; 45 | 46 | let src_ptr = input.as_internal().as_ptr(); 47 | let dst_ptr = output.as_mut_internal().as_mut_ptr(); 48 | let map_x_ptr = map_x.as_internal().as_ptr(); 49 | let map_y_ptr = map_y.as_internal().as_ptr(); 50 | let context_ptr = context.as_ptr(); 51 | let ret = cpp!(unsafe [ 52 | src_ptr as "const std::uint8_t*", 53 | src_width as "std::int32_t", 54 | src_height as "std::int32_t", 55 | src_pitch as "std::size_t", 56 | map_x_ptr as "const float*", 57 | map_x_pitch as "std::size_t", 58 | map_y_ptr as "const float*", 59 | map_y_pitch as "std::size_t", 60 | dst_ptr as "std::uint8_t*", 61 | dst_width as "std::int32_t", 62 | dst_height as "std::int32_t", 63 | dst_pitch as "std::size_t", 64 | context_ptr as "void*" 65 | ] -> i32 as "std::int32_t" { 66 | NppiSize src_size = { src_width, src_height }; 67 | NppiSize dst_size = { dst_width, dst_height }; 68 | NppiRect src_rect = { 0, 0, src_width, src_height }; 69 | return nppiRemap_8u_C3R_Ctx( 70 | (const Npp8u*) src_ptr, 71 | src_size, 72 | src_pitch, 73 | src_rect, 74 | (const Npp32f*) map_x_ptr, 75 | map_x_pitch, 76 | (const Npp32f*) map_y_ptr, 77 | map_y_pitch, 78 | (Npp8u*) dst_ptr, 79 | dst_pitch, 80 | dst_size, 81 | // We use bilinear interpolation, which is the fastest resize method that does not 82 | // produce messed up quality.
83 | NPPI_INTER_LINEAR, 84 | *((NppStreamContext*) context_ptr) 85 | ); 86 | }); 87 | result!(ret) 88 | } 89 | 90 | #[cfg(test)] 91 | mod tests { 92 | use super::*; 93 | 94 | use crate::ffi::npp::context::Context; 95 | use crate::npp::tests::image::*; 96 | use crate::npp::tests::sync::memory::*; 97 | 98 | #[test] 99 | fn test_remap() { 100 | const MAP_X: &[f32; 16] = &[ 101 | 0.0, 1.0, 2.0, 3.0, // No mapping at all 102 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 103 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 104 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 105 | ]; 106 | const MAP_Y: &[f32; 16] = &[ 107 | 0.0, 0.0, 0.0, 0.0, // No mapping at all 108 | 1.0, 1.0, 1.0, 1.0, // Take from green band 109 | 1.0, 1.0, 1.0, 1.0, // Take from green band 110 | 2.0, 2.0, 2.0, 2.0, // Take from blue band 111 | ]; 112 | const OUTPUT: Image4x4 = [ 113 | [R, R, R, R], // Red band 114 | [G, G, G, G], // Green band 115 | [G, G, G, G], // Green band 116 | [B, B, B, B], // Blue band 117 | ]; 118 | const OUTPUT_FLAT: [u8; 4 * 4 * 3] = flatten!(OUTPUT, 4 * 4 * 3); 119 | 120 | let context = Context::from_null_stream(); 121 | 122 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &context); 123 | let map_x = to_device_2d!(MAP_X, 4, 4, 1, &context); 124 | let map_y = to_device_2d!(MAP_Y, 4, 4, 1, &context); 125 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(4, 4, 3); 126 | assert!(remap(&image, &mut output, &map_x, &map_y, &context).is_ok()); 127 | 128 | let output = to_host_2d!(output, &context); 129 | assert_eq!(&output, &OUTPUT_FLAT); 130 | } 131 | 132 | #[test] 133 | #[should_panic] 134 | fn test_it_panics_when_input_num_channels_incorrect() { 135 | let context = Context::from_null_stream(); 136 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2); 137 | let map_x = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 1); 138 | let map_y = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 1); 139 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 140 | remap(&input, &mut output, &map_x, &map_y, &context).unwrap(); 141 | } 142 | 143 | #[test] 144 | #[should_panic] 145 | fn test_it_panics_when_output_num_channels_incorrect() { 146 | let context = Context::from_null_stream(); 147 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 148 | let map_x = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 1); 149 | let map_y = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 1); 150 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2); 151 | remap(&input, &mut output, &map_x, &map_y, &context).unwrap(); 152 | } 153 | 154 | #[test] 155 | #[should_panic] 156 | fn test_it_panics_when_map_num_channels_incorrect() { 157 | let context = Context::from_null_stream(); 158 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 159 | let map_x = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 2); 160 | let map_y = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 3); 161 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 162 | remap(&input, &mut output, &map_x, &map_y, &context).unwrap(); 163 | } 164 | 165 | #[test] 166 | #[should_panic] 167 | fn test_it_panics_when_map_width_incorrect() { 168 | let context = Context::from_null_stream(); 169 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 170 | let map_x = crate::ffi::memory::DeviceBuffer2D::<f32>::new(120, 100, 1); 171 | let map_y = crate::ffi::memory::DeviceBuffer2D::<f32>::new(120, 100, 1); 172 | let mut output =
crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 173 | remap(&input, &mut output, &map_x, &map_y, &context).unwrap(); 174 | } 175 | 176 | #[test] 177 | #[should_panic] 178 | fn test_it_panics_when_map_height_incorrect() { 179 | let context = Context::from_null_stream(); 180 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 181 | let map_x = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 120, 1); 182 | let map_y = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 120, 1); 183 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 184 | remap(&input, &mut output, &map_x, &map_y, &context).unwrap(); 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/ffi/memory/host.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::device::DeviceId; 4 | use crate::ffi::device::Device; 5 | use crate::ffi::memory::device::DeviceBuffer; 6 | use crate::ffi::ptr::DevicePtr; 7 | use crate::ffi::result; 8 | use crate::ffi::stream::Stream; 9 | 10 | type Result<T> = std::result::Result<T, crate::error::Error>; 11 | 12 | /// Synchronous implementation of [`crate::HostBuffer`]. 13 | /// 14 | /// Refer to [`crate::HostBuffer`] for documentation. 15 | pub struct HostBuffer<T: Copy> { 16 | pub num_elements: usize, 17 | internal: DevicePtr, 18 | device: DeviceId, 19 | _phantom: std::marker::PhantomData<T>, 20 | } 21 | 22 | /// Implements [`Send`] for [`HostBuffer`]. 23 | /// 24 | /// # Safety 25 | /// 26 | /// This property is inherited from the CUDA API, which is thread-safe. 27 | unsafe impl<T: Copy> Send for HostBuffer<T> {} 28 | 29 | /// Implements [`Sync`] for [`HostBuffer`]. 30 | /// 31 | /// # Safety 32 | /// 33 | /// This property is inherited from the CUDA API, which is thread-safe. 34 | unsafe impl<T: Copy> Sync for HostBuffer<T> {} 35 | 36 | impl<T: Copy> HostBuffer<T> { 37 | pub fn new(num_elements: usize) -> Self { 38 | let device = Device::get_or_panic(); 39 | let mut ptr: *mut std::ffi::c_void = std::ptr::null_mut(); 40 | let ptr_ptr = std::ptr::addr_of_mut!(ptr); 41 | let size = num_elements * std::mem::size_of::<T>(); 42 | let ret = cpp!(unsafe [ 43 | ptr_ptr as "void**", 44 | size as "std::size_t" 45 | ] -> i32 as "std::int32_t" { 46 | return cudaMallocHost(ptr_ptr, size); 47 | }); 48 | match result!(ret, DevicePtr::from_addr(ptr)) { 49 | Ok(internal) => Self { 50 | internal, 51 | device, 52 | num_elements, 53 | _phantom: Default::default(), 54 | }, 55 | Err(err) => { 56 | panic!("failed to allocate host memory: {err}"); 57 | } 58 | } 59 | } 60 | 61 | pub fn from_slice(slice: &[T]) -> Self { 62 | let mut this = Self::new(slice.len()); 63 | this.copy_from_slice(slice); 64 | this 65 | } 66 | 67 | #[cfg(feature = "ndarray")] 68 | pub fn from_array<D: ndarray::Dimension>(array: &ndarray::ArrayView<T, D>) -> Self { 69 | let mut this = Self::new(array.len()); 70 | this.copy_from_array(array); 71 | this 72 | } 73 | 74 | /// Copy from device buffer. 75 | /// 76 | /// # Safety 77 | /// 78 | /// This function is marked unsafe because it does not synchronize and the operation might not 79 | /// have completed when it returns. 80 | #[inline] 81 | pub unsafe fn copy_from_async( 82 | &mut self, 83 | other: &DeviceBuffer<T>, 84 | stream: &Stream, 85 | ) -> Result<()> { 86 | other.copy_to_async(self, stream) 87 | } 88 | 89 | /// Copy to device buffer. 90 | /// 91 | /// # Safety 92 | /// 93 | /// This function is marked unsafe because it does not synchronize and the operation might not 94 | /// have completed when it returns.
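///
/// # Example
///
/// A minimal sketch (hedged: it mirrors the `test_copy` test below; the stream must be synchronized before the destination buffer may be read):
///
/// ```ignore
/// let stream = Stream::new().unwrap();
/// let host_buffer = HostBuffer::from_slice(&[1_u32; 100]);
/// let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream);
/// // SAFETY: the stream is synchronized before the device buffer is used.
/// unsafe { host_buffer.copy_to_async(&mut device_buffer, &stream).unwrap() };
/// stream.synchronize().unwrap();
/// ```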
95 | #[inline] 96 | pub unsafe fn copy_to_async(&self, other: &mut DeviceBuffer<T>, stream: &Stream) -> Result<()> { 97 | other.copy_from_async(self, stream) 98 | } 99 | 100 | pub fn copy_from_slice(&mut self, slice: &[T]) { 101 | // SAFETY: This is safe because we only instantiate the slice temporarily whilst having 102 | // exclusive mutable access to it to copy the data into it. 103 | let target = unsafe { 104 | std::slice::from_raw_parts_mut(self.internal.as_mut_ptr() as *mut T, self.num_elements) 105 | }; 106 | target.copy_from_slice(slice); 107 | } 108 | 109 | #[cfg(feature = "ndarray")] 110 | pub fn copy_from_array<D: ndarray::Dimension>(&mut self, array: &ndarray::ArrayView<T, D>) { 111 | assert!( 112 | array.is_standard_layout(), 113 | "array must be in standard layout" 114 | ); 115 | // SAFETY: This is safe because we only instantiate the slice temporarily whilst having 116 | // exclusive mutable access to it to copy the data into it. 117 | let target = unsafe { 118 | std::slice::from_raw_parts_mut(self.internal.as_mut_ptr() as *mut T, self.num_elements) 119 | }; 120 | target.copy_from_slice(array.as_slice().unwrap()); 121 | } 122 | 123 | #[inline] 124 | pub fn to_vec(&self) -> Vec<T> { 125 | // SAFETY: This is safe because we only instantiate the slice temporarily to copy the data 126 | // to a safe Rust [`Vec`]. 127 | let source = unsafe { 128 | std::slice::from_raw_parts(self.internal.as_ptr() as *const T, self.num_elements) 129 | }; 130 | source.to_vec() 131 | } 132 | 133 | #[cfg(feature = "ndarray")] 134 | pub fn to_array_with_shape<D: ndarray::Dimension>( 135 | &self, 136 | shape: impl Into<ndarray::StrideShape<D>>, 137 | ) -> ndarray::Array<T, D> { 138 | let shape = shape.into(); 139 | assert_eq!( 140 | self.num_elements, 141 | shape.size(), 142 | "provided shape does not match number of elements in buffer" 143 | ); 144 | ndarray::Array::from_shape_vec(shape, self.to_vec()).unwrap() 145 | } 146 | 147 | /// Get readonly reference to internal [`DevicePtr`]. 148 | #[inline(always)] 149 | pub fn as_internal(&self) -> &DevicePtr { 150 | &self.internal 151 | } 152 | 153 | /// Get mutable reference to internal [`DevicePtr`]. 154 | #[inline(always)] 155 | pub fn as_mut_internal(&mut self) -> &mut DevicePtr { 156 | &mut self.internal 157 | } 158 | 159 | /// Release the buffer memory. 160 | /// 161 | /// # Panics 162 | /// 163 | /// This function panics if binding to the corresponding device fails. 164 | /// 165 | /// # Safety 166 | /// 167 | /// The buffer may not be used after this function is called, except for being dropped. 168 | pub unsafe fn free(&mut self) { 169 | if self.internal.is_null() { 170 | return; 171 | } 172 | 173 | Device::set_or_panic(self.device); 174 | 175 | // SAFETY: Safe because we won't use the pointer after this. 176 | let mut internal = unsafe { self.internal.take() }; 177 | let ptr = internal.as_mut_ptr(); 178 | let _ret = cpp!(unsafe [ 179 | ptr as "void*" 180 | ] -> i32 as "std::int32_t" { 181 | return cudaFreeHost(ptr); 182 | }); 183 | } 184 | } 185 | 186 | impl<T: Copy> Drop for HostBuffer<T> { 187 | #[inline] 188 | fn drop(&mut self) { 189 | // SAFETY: This is safe since the buffer cannot be used after this.
190 | unsafe { 191 | self.free(); 192 | } 193 | } 194 | } 195 | 196 | #[cfg(test)] 197 | mod tests { 198 | use super::*; 199 | 200 | #[test] 201 | fn test_new() { 202 | let buffer = HostBuffer::<u32>::new(100); 203 | assert_eq!(buffer.num_elements, 100); 204 | assert_eq!(buffer.to_vec().len(), 100); 205 | } 206 | 207 | #[test] 208 | fn test_from_slice() { 209 | let all_ones = vec![1_u32; 200]; 210 | let buffer = HostBuffer::from_slice(all_ones.as_slice()); 211 | assert_eq!(buffer.num_elements, 200); 212 | let data = buffer.to_vec(); 213 | assert_eq!(data.len(), 200); 214 | assert!(data.into_iter().all(|v| v == 1_u32)); 215 | } 216 | 217 | #[test] 218 | fn test_copy() { 219 | let stream = Stream::new().unwrap(); 220 | let all_ones = vec![1_u32; 100]; 221 | let host_buffer = HostBuffer::from_slice(all_ones.as_slice()); 222 | 223 | let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream); 224 | unsafe { 225 | host_buffer 226 | .copy_to_async(&mut device_buffer, &stream) 227 | .unwrap(); 228 | } 229 | 230 | let mut return_host_buffer = HostBuffer::<u32>::new(100); 231 | unsafe { 232 | return_host_buffer 233 | .copy_from_async(&device_buffer, &stream) 234 | .unwrap(); 235 | } 236 | 237 | stream.synchronize().unwrap(); 238 | 239 | assert_eq!(return_host_buffer.num_elements, 100); 240 | let return_data = return_host_buffer.to_vec(); 241 | assert_eq!(return_data.len(), 100); 242 | assert!(return_data.into_iter().all(|v| v == 1_u32)); 243 | } 244 | 245 | #[test] 246 | #[should_panic] 247 | fn test_it_panics_when_copying_invalid_size() { 248 | let stream = Stream::new().unwrap(); 249 | let host_buffer = HostBuffer::<u32>::new(100); 250 | let mut device_buffer = DeviceBuffer::<u32>::new(101, &Stream::null()); 251 | let _ = unsafe { host_buffer.copy_to_async(&mut device_buffer, &stream) }; 252 | } 253 | } 254 | -------------------------------------------------------------------------------- /src/npp/resize_batch.rs: -------------------------------------------------------------------------------- 1 | use crate::memory::DeviceBuffer2D; 2 | use crate::npp::region::Region; 3 | use crate::npp::stream::Stream; 4 | use crate::runtime::Future; 5 | 6 | type Result<T> = std::result::Result<T, crate::npp::Error>; 7 | 8 | /// Resize a batch of images using bilinear interpolation. This function expects a batch with 9 | /// on-device input and output buffers. 10 | /// 11 | /// This function assumes the following about the input and output images: 12 | /// * Images are in RGB format. 13 | /// * Images are in standard memory order, i.e. HWC. 14 | /// 15 | /// This is the batched version of [`crate::resize()`]. 16 | /// 17 | /// # Stability 18 | /// 19 | /// This function is only available when the `npp-unstable` feature is enabled. Testing shows that 20 | /// the batched version can be imprecise when the input image dimensions are small. 21 | /// 22 | /// Currently identified suspicious behavior: 23 | /// * It does not necessarily produce the same output over a batch of images as would have been 24 | /// produced if the non-batched version of resize were used on each image individually. 25 | /// * When invoking batched resize to resize to the same dimensions as the input, it might not 26 | /// reproduce the input image exactly. 27 | /// 28 | /// # Stream ordered semantics 29 | /// 30 | /// This function uses stream ordered semantics. It can only be guaranteed to complete sequentially 31 | /// relative to operations scheduled on the same stream or the default stream.
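///
/// # Example
///
/// A minimal sketch of batching two resizes (hedged: buffer setup is elided and `input_a`/`output_a` etc. are placeholder names; all inputs must share one size and all outputs another):
///
/// ```ignore
/// let mut pairs = vec![(&input_a, &mut output_a), (&input_b, &mut output_b)];
/// resize_batch(&mut pairs, Region::Full, Region::Full, &stream).await?;
/// ```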
32 | /// 33 | /// # Arguments 34 | /// 35 | /// * `inputs_and_outputs` - The on-device input and output images as batch. 36 | /// * `input_region` - Specify region of interest in input image. This can be used to combine crop 37 | /// and resize in a single operation. 38 | /// * `output_region` - Specify region of interest in output image. 39 | /// * `stream` - Stream to use. 40 | pub async fn resize_batch( 41 | inputs_and_outputs: &mut [(&DeviceBuffer2D<u8>, &mut DeviceBuffer2D<u8>)], 42 | input_region: Region, 43 | output_region: Region, 44 | stream: &Stream, 45 | ) -> Result<()> { 46 | assert!( 47 | !inputs_and_outputs.is_empty(), 48 | "batch must have at least one item" 49 | ); 50 | 51 | let (first_input, first_output) = &inputs_and_outputs[0]; 52 | let first_input_width = first_input.width(); 53 | let first_input_height = first_input.height(); 54 | let first_output_width = first_output.width(); 55 | let first_output_height = first_output.height(); 56 | for (input, output) in inputs_and_outputs.iter() { 57 | assert_eq!( 58 | input.width(), 59 | first_input_width, 60 | "all inputs in batch must have the same width", 61 | ); 62 | assert_eq!( 63 | input.height(), 64 | first_input_height, 65 | "all inputs in batch must have the same height", 66 | ); 67 | assert_eq!( 68 | output.width(), 69 | first_output_width, 70 | "all outputs in batch must have the same width", 71 | ); 72 | assert_eq!( 73 | output.height(), 74 | first_output_height, 75 | "all outputs in batch must have the same height", 76 | ); 77 | assert_eq!( 78 | input.num_channels(), 79 | 3, 80 | "all inputs and outputs must be in RGB format" 81 | ); 82 | assert_eq!( 83 | output.num_channels(), 84 | 3, 85 | "all inputs and outputs must be in RGB format" 86 | ); 87 | } 88 | 89 | let context = stream.to_context(); 90 | Future::new(move || { 91 | let mut inputs_and_outputs_inner = inputs_and_outputs 92 | .iter_mut() 93 | .map(|(input, output)| (input.inner(), output.inner_mut())) 94 | .collect::<Vec<_>>(); 95 | crate::ffi::npp::resize_batch::resize_batch( 96 | inputs_and_outputs_inner.as_mut_slice(), 97 | input_region, 98 | output_region, 99 | &context, 100 | ) 101 | }) 102 | .await 103 | } 104 | 105 | #[cfg(test)] 106 | mod tests { 107 | use super::*; 108 | 109 | use crate::memory::DeviceBuffer2D; 110 | use crate::npp::stream::Stream; 111 | use crate::npp::tests::memory::*; 112 | 113 | use futures::future; 114 | 115 | #[tokio::test] 116 | async fn test_resize_batch() { 117 | #[rustfmt::skip] 118 | const INPUT: [u8; 12] = [ 119 | 10, 10, 10, 20, 20, 20, 120 | 30, 30, 30, 40, 40, 40, 121 | ]; 122 | #[rustfmt::skip] 123 | const EXPECTED_OUTPUT: [u8; 27] = [ 124 | 10, 10, 10, 14, 14, 14, 20, 20, 20, 125 | 18, 18, 18, 23, 23, 23, 28, 28, 28, 126 | 30, 30, 30, 34, 34, 34, 40, 40, 40, 127 | ]; 128 | 129 | let stream = Stream::new().await.unwrap(); 130 | 131 | let mut inputs_and_outputs = future::join_all((0..10).map(|_| async { 132 | let image = to_device_2d!(&INPUT, 2, 2, 3, &stream); 133 | let output = DeviceBuffer2D::<u8>::new(3, 3, 3).await; 134 | (image, output) 135 | })) 136 | .await; 137 | let mut inputs_and_outputs_ref = inputs_and_outputs 138 | .iter_mut() 139 | .map(|(input, output)| (&*input, output)) 140 | .collect::<Vec<_>>(); 141 | resize_batch( 142 | &mut inputs_and_outputs_ref, 143 | Region::Full, 144 | Region::Full, 145 | &stream, 146 | ) 147 | .await 148 | .unwrap(); 149 | 150 | for (_, output) in inputs_and_outputs { 151 | let output = to_host_2d!(output, &stream); 152 | assert_eq!(&output, &EXPECTED_OUTPUT); 153 | } 154 | } 155 | 156 | #[tokio::test] 157 |
158 |         #[rustfmt::skip]
159 |         const INPUT: [u8; 27] = [
160 |             99, 99, 99, 10, 10, 10, 20, 20, 20,
161 |             99, 99, 99, 30, 30, 30, 40, 40, 40,
162 |             99, 99, 99, 99, 99, 99, 99, 99, 99,
163 |         ];
164 |         #[rustfmt::skip]
165 |         const EXPECTED_OUTPUT: [u8; 27] = [
166 |             32, 32, 32, 14, 14, 14, 20, 20, 20,
167 |             39, 39, 39, 23, 23, 23, 28, 28, 28,
168 |             52, 52, 52, 40, 40, 40, 45, 45, 45,
169 |         ];
170 | 
171 |         let stream = Stream::new().await.unwrap();
172 | 
173 |         let center = Region::Rectangle {
174 |             x: 1,
175 |             y: 0,
176 |             width: 2,
177 |             height: 2,
178 |         };
179 | 
180 |         let mut inputs_and_outputs = future::join_all((0..10).map(|_| async {
181 |             let image = to_device_2d!(&INPUT, 3, 3, 3, &stream);
182 |             let output = DeviceBuffer2D::<u8>::new(3, 3, 3).await;
183 |             (image, output)
184 |         }))
185 |         .await;
186 |         let mut inputs_and_outputs_ref = inputs_and_outputs
187 |             .iter_mut()
188 |             .map(|(input, output)| (&*input, output))
189 |             .collect::<Vec<_>>();
190 |         resize_batch(&mut inputs_and_outputs_ref, center, Region::Full, &stream)
191 |             .await
192 |             .unwrap();
193 | 
194 |         for (_, output) in inputs_and_outputs {
195 |             let output = to_host_2d!(output, &stream);
196 |             assert_eq!(&output, &EXPECTED_OUTPUT);
197 |         }
198 |     }
199 | 
200 |     #[tokio::test]
201 |     async fn test_resize_batch_with_output_region() {
202 |         #[rustfmt::skip]
203 |         const INPUT: [u8; 2 * 2 * 3] = [
204 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
205 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
206 |         ];
207 |         #[rustfmt::skip]
208 |         const EXPECTED_OUTPUT: [u8; 2 * 2 * 3] = [
209 |             0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
210 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
211 |         ];
212 | 
213 |         let stream = Stream::new().await.unwrap();
214 |         let bottom_half = Region::Rectangle {
215 |             x: 0,
216 |             y: 1,
217 |             width: 2,
218 |             height: 1,
219 |         };
220 | 
221 |         let mut inputs_and_outputs = future::join_all((0..10).map(|_| async {
222 |             let image = to_device_2d!(&INPUT, 2, 2, 3, &stream);
223 |             let mut output = DeviceBuffer2D::<u8>::new(2, 2, 3).await;
224 |             output.fill_with_byte(0x00, &stream).await.unwrap();
225 |             (image, output)
226 |         }))
227 |         .await;
228 |         let mut inputs_and_outputs_ref = inputs_and_outputs
229 |             .iter_mut()
230 |             .map(|(input, output)| (&*input, output))
231 |             .collect::<Vec<_>>();
232 |         resize_batch(
233 |             &mut inputs_and_outputs_ref,
234 |             Region::Full,
235 |             bottom_half,
236 |             &stream,
237 |         )
238 |         .await
239 |         .unwrap();
240 | 
241 |         for (_, output) in inputs_and_outputs {
242 |             let output = to_host_2d!(output, &stream);
243 |             assert_eq!(&output, &EXPECTED_OUTPUT);
244 |         }
245 |     }
246 | 
247 |     #[tokio::test]
248 |     #[should_panic]
249 |     async fn test_it_panics_when_input_num_channels_incorrect() {
250 |         let mut inputs_and_outputs = vec![
251 |             (
252 |                 DeviceBuffer2D::<u8>::new(100, 100, 2).await,
253 |                 DeviceBuffer2D::<u8>::new(200, 200, 2).await,
254 |             ),
255 |             (
256 |                 DeviceBuffer2D::<u8>::new(100, 100, 2).await,
257 |                 DeviceBuffer2D::<u8>::new(200, 200, 2).await,
258 |             ),
259 |         ];
260 |         let mut inputs_and_outputs_ref = inputs_and_outputs
261 |             .iter_mut()
262 |             .map(|(input, output)| (&*input, output))
263 |             .collect::<Vec<_>>();
264 |         resize_batch(
265 |             &mut inputs_and_outputs_ref,
266 |             Region::Full,
267 |             Region::Full,
268 |             &Stream::null().await,
269 |         )
270 |         .await
271 |         .unwrap();
272 |     }
273 | }
274 | 
--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 |                              Apache License
2 |                        Version 2.0, January 2004
3 |
http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
--------------------------------------------------------------------------------
/src/ffi/memory/device.rs:
--------------------------------------------------------------------------------
1 | use cpp::cpp;
2 | 
3 | use crate::device::DeviceId;
4 | use crate::ffi::device::Device;
5 | use crate::ffi::memory::host::HostBuffer;
6 | use crate::ffi::ptr::DevicePtr;
7 | use crate::ffi::result;
8 | use crate::ffi::stream::Stream;
9 | 
10 | type Result<T> = std::result::Result<T, crate::error::Error>;
11 | 
12 | /// Synchronous implementation of [`crate::DeviceBuffer`].
13 | ///
14 | /// Refer to [`crate::DeviceBuffer`] for documentation.
15 | pub struct DeviceBuffer<T: Copy> {
16 |     pub num_elements: usize,
17 |     internal: DevicePtr,
18 |     device: DeviceId,
19 |     _phantom: std::marker::PhantomData<T>,
20 | }
21 | 
22 | /// Implements [`Send`] for [`DeviceBuffer`].
23 | ///
24 | /// # Safety
25 | ///
26 | /// This property is inherited from the CUDA API, which is thread-safe.
27 | unsafe impl<T: Copy> Send for DeviceBuffer<T> {}
28 | 
29 | /// Implements [`Sync`] for [`DeviceBuffer`].
30 | ///
31 | /// # Safety
32 | ///
33 | /// This property is inherited from the CUDA API, which is thread-safe.
34 | unsafe impl<T: Copy> Sync for DeviceBuffer<T> {}
35 | 
36 | impl<T: Copy> DeviceBuffer<T> {
37 |     pub fn new(num_elements: usize, stream: &Stream) -> Self {
38 |         let device = Device::get_or_panic();
39 |         let mut ptr: *mut std::ffi::c_void = std::ptr::null_mut();
40 |         let ptr_ptr = std::ptr::addr_of_mut!(ptr);
41 |         let size = num_elements * std::mem::size_of::<T>();
42 |         let stream_ptr = stream.as_internal().as_ptr();
43 |         let ret = cpp!(unsafe [
44 |             ptr_ptr as "void**",
45 |             size as "std::size_t",
46 |             stream_ptr as "const void*"
47 |         ] -> i32 as "std::int32_t" {
48 |             return cudaMallocAsync(ptr_ptr, size, (cudaStream_t) stream_ptr);
49 |         });
50 |         match result!(ret, DevicePtr::from_addr(ptr)) {
51 |             Ok(internal) => Self {
52 |                 internal,
53 |                 device,
54 |                 num_elements,
55 |                 _phantom: Default::default(),
56 |             },
57 |             Err(err) => {
58 |                 panic!("failed to allocate device memory: {err}");
59 |             }
60 |         }
61 |     }
62 | 
63 |     pub fn from_slice(slice: &[T], stream: &Stream) -> Result<Self> {
64 |         let host_buffer = HostBuffer::from_slice(slice);
65 |         let mut this = Self::new(slice.len(), stream);
66 |         // SAFETY: Safe because the stream is synchronized after this.
67 |         unsafe {
68 |             this.copy_from_async(&host_buffer, stream)?;
69 |         }
70 |         stream.synchronize()?;
71 |         Ok(this)
72 |     }
73 | 
74 |     #[cfg(feature = "ndarray")]
75 |     pub fn from_array<D: ndarray::Dimension>(
76 |         array: &ndarray::ArrayView<T, D>,
77 |         stream: &Stream,
78 |     ) -> Result<Self> {
79 |         let host_buffer = HostBuffer::from_array(array);
80 |         let mut this = Self::new(array.len(), stream);
81 |         // SAFETY: Safe because the stream is synchronized after this.
82 |         unsafe {
83 |             this.copy_from_async(&host_buffer, stream)?;
84 |         }
85 |         stream.synchronize()?;
86 |         Ok(this)
87 |     }
88 | 
89 |     /// Copy from host buffer.
90 |     ///
91 |     /// # Safety
92 |     ///
93 |     /// This function is marked unsafe because it does not synchronize and the operation might not
94 |     /// have completed when it returns.
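    ///
    /// A typical pattern (sketch, not from the original docs; `device_buffer` and `host_buffer`
    /// are hypothetical) is to schedule the copy and then synchronize the stream before the host
    /// buffer is reused or dropped:
    ///
    /// ```ignore
    /// // SAFETY: the stream is synchronized immediately after the copy is scheduled.
    /// unsafe { device_buffer.copy_from_async(&host_buffer, &stream)? };
    /// stream.synchronize()?;
    /// ```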
95 |     pub unsafe fn copy_from_async(&mut self, other: &HostBuffer<T>, stream: &Stream) -> Result<()> {
96 |         assert_eq!(self.num_elements, other.num_elements);
97 |         let ptr_to = self.as_mut_internal().as_mut_ptr();
98 |         let ptr_from = other.as_internal().as_ptr();
99 |         let stream_ptr = stream.as_internal().as_ptr();
100 |         let size = self.num_elements * std::mem::size_of::<T>();
101 |         let ret = cpp!(unsafe [
102 |             ptr_from as "void*",
103 |             ptr_to as "void*",
104 |             size as "std::size_t",
105 |             stream_ptr as "const void*"
106 |         ] -> i32 as "std::int32_t" {
107 |             return cudaMemcpyAsync(
108 |                 ptr_to,
109 |                 ptr_from,
110 |                 size,
111 |                 cudaMemcpyHostToDevice,
112 |                 (cudaStream_t) stream_ptr
113 |             );
114 |         });
115 |         result!(ret)
116 |     }
117 | 
118 |     /// Copy to host buffer.
119 |     ///
120 |     /// # Safety
121 |     ///
122 |     /// This function is marked unsafe because it does not synchronize and the operation might not
123 |     /// have completed when it returns.
124 |     pub unsafe fn copy_to_async(&self, other: &mut HostBuffer<T>, stream: &Stream) -> Result<()> {
125 |         assert_eq!(self.num_elements, other.num_elements);
126 |         let ptr_from = self.as_internal().as_ptr();
127 |         let ptr_to = other.as_mut_internal().as_mut_ptr();
128 |         let size = self.num_elements * std::mem::size_of::<T>();
129 |         let stream_ptr = stream.as_internal().as_ptr();
130 |         let ret = cpp!(unsafe [
131 |             ptr_from as "void*",
132 |             ptr_to as "void*",
133 |             size as "std::size_t",
134 |             stream_ptr as "const void*"
135 |         ] -> i32 as "std::int32_t" {
136 |             return cudaMemcpyAsync(
137 |                 ptr_to,
138 |                 ptr_from,
139 |                 size,
140 |                 cudaMemcpyDeviceToHost,
141 |                 (cudaStream_t) stream_ptr
142 |             );
143 |         });
144 |         result!(ret)
145 |     }
146 | 
147 |     /// Fill buffer with byte value.
148 |     pub fn fill_with_byte(&mut self, value: u8, stream: &Stream) -> Result<()> {
149 |         let ptr = self.as_internal().as_ptr();
150 |         let value = value as std::ffi::c_int;
151 |         let size = self.num_elements * std::mem::size_of::<T>();
152 |         let stream_ptr = stream.as_internal().as_ptr();
153 |         let ret = cpp!(unsafe [
154 |             ptr as "void*",
155 |             value as "int",
156 |             size as "std::size_t",
157 |             stream_ptr as "const void*"
158 |         ] -> i32 as "std::int32_t" {
159 |             return cudaMemsetAsync(
160 |                 ptr,
161 |                 value,
162 |                 size,
163 |                 (cudaStream_t) stream_ptr
164 |             );
165 |         });
166 |         result!(ret)
167 |     }
168 | 
169 |     /// Get readonly reference to internal [`DevicePtr`].
170 |     #[inline(always)]
171 |     pub fn as_internal(&self) -> &DevicePtr {
172 |         &self.internal
173 |     }
174 | 
175 |     /// Get mutable reference to internal [`DevicePtr`].
176 |     #[inline(always)]
177 |     pub fn as_mut_internal(&mut self) -> &mut DevicePtr {
178 |         &mut self.internal
179 |     }
180 | 
181 |     /// Release the buffer memory.
182 |     ///
183 |     /// # Panics
184 |     ///
185 |     /// This function panics if binding to the corresponding device fails.
186 |     ///
187 |     /// # Safety
188 |     ///
189 |     /// The buffer may not be used after this function is called, except for being dropped.
190 |     pub unsafe fn free(&mut self) {
191 |         if self.internal.is_null() {
192 |             return;
193 |         }
194 | 
195 |         Device::set_or_panic(self.device);
196 | 
197 |         // SAFETY: Safe because we won't use pointer after this.
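        // Taking the pointer leaves the internal `DevicePtr` null, so a subsequent call to
        // `free` (including the one from `Drop`) returns early instead of double-freeing.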
198 |         let mut internal = unsafe { self.internal.take() };
199 |         let ptr = internal.as_mut_ptr();
200 |         let _ret = cpp!(unsafe [
201 |             ptr as "void*"
202 |         ] -> i32 as "std::int32_t" {
203 |             return cudaFree(ptr);
204 |         });
205 |     }
206 | }
207 | 
208 | impl<T: Copy> Drop for DeviceBuffer<T> {
209 |     #[inline]
210 |     fn drop(&mut self) {
211 |         // SAFETY: This is safe since the buffer cannot be used after this.
212 |         unsafe {
213 |             self.free();
214 |         }
215 |     }
216 | }
217 | 
218 | #[cfg(test)]
219 | mod tests {
220 |     use super::*;
221 | 
222 |     #[test]
223 |     fn test_new() {
224 |         let buffer = DeviceBuffer::<u32>::new(100, &Stream::null());
225 |         assert_eq!(buffer.num_elements, 100);
226 |     }
227 | 
228 |     #[test]
229 |     fn test_copy() {
230 |         let stream = Stream::new().unwrap();
231 |         let all_ones = vec![1_u32; 100];
232 |         let host_buffer_all_ones = HostBuffer::from_slice(all_ones.as_slice());
233 | 
234 |         let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream);
235 |         unsafe {
236 |             device_buffer
237 |                 .copy_from_async(&host_buffer_all_ones, &stream)
238 |                 .unwrap();
239 |         }
240 | 
241 |         let mut host_buffer = HostBuffer::<u32>::new(100);
242 |         unsafe {
243 |             device_buffer
244 |                 .copy_to_async(&mut host_buffer, &stream)
245 |                 .unwrap();
246 |         }
247 | 
248 |         let mut another_device_buffer = DeviceBuffer::<u32>::new(100, &stream);
249 |         unsafe {
250 |             another_device_buffer
251 |                 .copy_from_async(&host_buffer, &stream)
252 |                 .unwrap();
253 |         }
254 | 
255 |         let mut return_host_buffer = HostBuffer::<u32>::new(100);
256 |         unsafe {
257 |             another_device_buffer
258 |                 .copy_to_async(&mut return_host_buffer, &stream)
259 |                 .unwrap();
260 |         }
261 | 
262 |         stream.synchronize().unwrap();
263 | 
264 |         assert_eq!(return_host_buffer.num_elements, 100);
265 |         let return_data = return_host_buffer.to_vec();
266 |         assert_eq!(return_data.len(), 100);
267 |         assert!(return_data.into_iter().all(|v| v == 1_u32));
268 |     }
269 | 
270 |     #[test]
271 |     fn test_fill_with_byte() {
272 |         let stream = Stream::new().unwrap();
273 |         let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream);
274 |         let mut host_buffer = HostBuffer::<u8>::new(4);
275 |         device_buffer.fill_with_byte(0xab, &stream).unwrap();
276 |         unsafe {
277 |             device_buffer
278 |                 .copy_to_async(&mut host_buffer, &stream)
279 |                 .unwrap();
280 |         }
281 |         stream.synchronize().unwrap();
282 |         assert_eq!(host_buffer.to_vec(), &[0xab, 0xab, 0xab, 0xab]);
283 |     }
284 | 
285 |     #[test]
286 |     #[should_panic]
287 |     fn test_it_panics_when_copying_invalid_size() {
288 |         let stream = Stream::new().unwrap();
289 |         let device_buffer = DeviceBuffer::<u32>::new(101, &stream);
290 |         let mut host_buffer = HostBuffer::<u32>::new(100);
291 |         let _ = unsafe { device_buffer.copy_to_async(&mut host_buffer, &stream) };
292 |     }
293 | }
294 | 
--------------------------------------------------------------------------------
/src/ffi/memory/device2d.rs:
--------------------------------------------------------------------------------
1 | use cpp::cpp;
2 | 
3 | use crate::device::DeviceId;
4 | use crate::ffi::device::Device;
5 | use crate::ffi::memory::host::HostBuffer;
6 | use crate::ffi::ptr::DevicePtr;
7 | use crate::ffi::result;
8 | use crate::ffi::stream::Stream;
9 | 
10 | type Result<T> = std::result::Result<T, crate::error::Error>;
11 | 
12 | /// Synchronous implementation of [`crate::DeviceBuffer2D`].
13 | ///
14 | /// Refer to [`crate::DeviceBuffer2D`] for documentation.
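///
/// Note that the 2D buffer is allocated with `cudaMallocPitch`, so rows may be padded for
/// alignment: `pitch` holds the actual row stride in bytes and may exceed
/// `width * num_channels * size_of::<T>()`.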
15 | pub struct DeviceBuffer2D<T: Copy> {
16 |     pub width: usize,
17 |     pub height: usize,
18 |     pub num_channels: usize,
19 |     pub pitch: usize,
20 |     internal: DevicePtr,
21 |     device: DeviceId,
22 |     _phantom: std::marker::PhantomData<T>,
23 | }
24 | 
25 | /// Implements [`Send`] for [`DeviceBuffer2D`].
26 | ///
27 | /// # Safety
28 | ///
29 | /// This property is inherited from the CUDA API, which is thread-safe.
30 | unsafe impl<T: Copy> Send for DeviceBuffer2D<T> {}
31 | 
32 | /// Implements [`Sync`] for [`DeviceBuffer2D`].
33 | ///
34 | /// # Safety
35 | ///
36 | /// This property is inherited from the CUDA API, which is thread-safe.
37 | unsafe impl<T: Copy> Sync for DeviceBuffer2D<T> {}
38 | 
39 | impl<T: Copy> DeviceBuffer2D<T> {
40 |     pub fn new(width: usize, height: usize, num_channels: usize) -> Self {
41 |         let device = Device::get_or_panic();
42 |         let mut ptr: *mut std::ffi::c_void = std::ptr::null_mut();
43 |         let ptr_ptr = std::ptr::addr_of_mut!(ptr);
44 |         let mut pitch = 0_usize;
45 |         let pitch_ptr = std::ptr::addr_of_mut!(pitch);
46 |         let line_size = width * num_channels * std::mem::size_of::<T>();
47 |         let ret = cpp!(unsafe [
48 |             ptr_ptr as "void**",
49 |             pitch_ptr as "std::size_t*",
50 |             line_size as "std::size_t",
51 |             height as "std::size_t"
52 |         ] -> i32 as "std::int32_t" {
53 |             return cudaMallocPitch(
54 |                 ptr_ptr,
55 |                 pitch_ptr,
56 |                 line_size,
57 |                 height
58 |             );
59 |         });
60 |         match result!(ret, DevicePtr::from_addr(ptr)) {
61 |             Ok(internal) => Self {
62 |                 width,
63 |                 height,
64 |                 num_channels,
65 |                 pitch,
66 |                 internal,
67 |                 device,
68 |                 _phantom: Default::default(),
69 |             },
70 |             Err(err) => {
71 |                 panic!("failed to allocate device memory: {err}");
72 |             }
73 |         }
74 |     }
75 | 
76 |     #[cfg(feature = "ndarray")]
77 |     pub fn from_array(array: &ndarray::ArrayView3<T>, stream: &Stream) -> Result<Self> {
78 |         let host_buffer = HostBuffer::from_array(array);
79 |         let (height, width, num_channels) = array.dim();
80 |         let mut this = Self::new(width, height, num_channels);
81 |         // SAFETY: Safe because the stream is synchronized after this.
82 |         unsafe {
83 |             this.copy_from_async(&host_buffer, stream)?;
84 |         }
85 |         stream.synchronize()?;
86 |         Ok(this)
87 |     }
88 | 
89 |     /// Copy from host buffer.
90 |     ///
91 |     /// # Safety
92 |     ///
93 |     /// This function is marked unsafe because it does not synchronize and the operation might not
94 |     /// have completed when it returns.
95 |     pub unsafe fn copy_from_async(&mut self, other: &HostBuffer<T>, stream: &Stream) -> Result<()> {
96 |         assert_eq!(self.num_elements(), other.num_elements);
97 |         let ptr_from = other.as_internal().as_ptr();
98 |         let ptr_to = self.as_mut_internal().as_mut_ptr();
99 |         let line_size = self.width * self.num_channels * std::mem::size_of::<T>();
100 |         let height = self.height;
101 |         let pitch = self.pitch;
102 |         let stream_ptr = stream.as_internal().as_ptr();
103 |         let ret = cpp!(unsafe [
104 |             ptr_from as "void*",
105 |             ptr_to as "void*",
106 |             pitch as "std::size_t",
107 |             line_size as "std::size_t",
108 |             height as "std::size_t",
109 |             stream_ptr as "const void*"
110 |         ] -> i32 as "std::int32_t" {
111 |             return cudaMemcpy2DAsync(
112 |                 ptr_to,
113 |                 pitch,
114 |                 ptr_from,
115 |                 line_size,
116 |                 line_size,
117 |                 height,
118 |                 cudaMemcpyHostToDevice,
119 |                 (cudaStream_t) stream_ptr
120 |             );
121 |         });
122 |         result!(ret)
123 |     }
124 | 
125 |     /// Copy to host buffer.
126 |     ///
127 |     /// # Safety
128 |     ///
129 |     /// This function is marked unsafe because it does not synchronize and the operation might not
130 |     /// have completed when it returns.
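    ///
    /// Note that the copy un-pads the rows: rows on the device are `pitch` bytes apart, whilst
    /// rows in the destination host buffer are packed contiguously
    /// (`width * num_channels` elements per row).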
131 |     pub unsafe fn copy_to_async(&self, other: &mut HostBuffer<T>, stream: &Stream) -> Result<()> {
132 |         assert_eq!(self.num_elements(), other.num_elements);
133 |         let ptr_from = self.as_internal().as_ptr();
134 |         let ptr_to = other.as_mut_internal().as_mut_ptr();
135 |         let line_size = self.width * self.num_channels * std::mem::size_of::<T>();
136 |         let height = self.height;
137 |         let pitch = self.pitch;
138 |         let stream_ptr = stream.as_internal().as_ptr();
139 |         let ret = cpp!(unsafe [
140 |             ptr_from as "void*",
141 |             ptr_to as "void*",
142 |             pitch as "std::size_t",
143 |             line_size as "std::size_t",
144 |             height as "std::size_t",
145 |             stream_ptr as "const void*"
146 |         ] -> i32 as "std::int32_t" {
147 |             return cudaMemcpy2DAsync(
148 |                 ptr_to,
149 |                 line_size,
150 |                 ptr_from,
151 |                 pitch,
152 |                 line_size,
153 |                 height,
154 |                 cudaMemcpyDeviceToHost,
155 |                 (cudaStream_t) stream_ptr
156 |             );
157 |         });
158 |         result!(ret)
159 |     }
160 | 
161 |     /// Fill buffer with byte value.
162 |     pub fn fill_with_byte(&mut self, value: u8, stream: &Stream) -> Result<()> {
163 |         let ptr = self.as_internal().as_ptr();
164 |         let value = value as std::ffi::c_int;
165 |         let line_size = self.width * self.num_channels * std::mem::size_of::<T>();
166 |         let height = self.height;
167 |         let pitch = self.pitch;
168 |         let stream_ptr = stream.as_internal().as_ptr();
169 |         let ret = cpp!(unsafe [
170 |             ptr as "void*",
171 |             value as "int",
172 |             pitch as "std::size_t",
173 |             line_size as "std::size_t",
174 |             height as "std::size_t",
175 |             stream_ptr as "const void*"
176 |         ] -> i32 as "std::int32_t" {
177 |             return cudaMemset2DAsync(
178 |                 ptr,
179 |                 pitch,
180 |                 value,
181 |                 line_size,
182 |                 height,
183 |                 (cudaStream_t) stream_ptr
184 |             );
185 |         });
186 |         result!(ret)
187 |     }
188 | 
189 |     #[inline(always)]
190 |     pub fn num_elements(&self) -> usize {
191 |         self.width * self.height * self.num_channels
192 |     }
193 | 
194 |     /// Get readonly reference to internal [`DevicePtr`].
195 |     #[inline(always)]
196 |     pub fn as_internal(&self) -> &DevicePtr {
197 |         &self.internal
198 |     }
199 | 
200 |     /// Get mutable reference to internal [`DevicePtr`].
201 |     #[inline(always)]
202 |     pub fn as_mut_internal(&mut self) -> &mut DevicePtr {
203 |         &mut self.internal
204 |     }
205 | 
206 |     /// Release the buffer memory.
207 |     ///
208 |     /// # Panics
209 |     ///
210 |     /// This function panics if binding to the corresponding device fails.
211 |     ///
212 |     /// # Safety
213 |     ///
214 |     /// The buffer may not be used after this function is called, except for being dropped.
215 |     pub unsafe fn free(&mut self) {
216 |         if self.internal.is_null() {
217 |             return;
218 |         }
219 | 
220 |         Device::set_or_panic(self.device);
221 | 
222 |         // SAFETY: Safe because we won't use pointer after this.
223 |         let mut internal = unsafe { self.internal.take() };
224 |         let ptr = internal.as_mut_ptr();
225 |         let _ret = cpp!(unsafe [
226 |             ptr as "void*"
227 |         ] -> i32 as "std::int32_t" {
228 |             return cudaFree(ptr);
229 |         });
230 |     }
231 | }
232 | 
233 | impl<T: Copy> Drop for DeviceBuffer2D<T> {
234 |     #[inline]
235 |     fn drop(&mut self) {
236 |         // SAFETY: This is safe since the buffer cannot be used after this.
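        // `free` is idempotent: it returns early if the internal pointer was already taken.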
237 |         unsafe {
238 |             self.free();
239 |         }
240 |     }
241 | }
242 | 
243 | #[cfg(test)]
244 | mod tests {
245 |     use super::*;
246 | 
247 |     #[test]
248 |     fn test_new() {
249 |         let buffer = DeviceBuffer2D::<u8>::new(120, 80, 3);
250 |         assert_eq!(buffer.width, 120);
251 |         assert_eq!(buffer.height, 80);
252 |         assert_eq!(buffer.num_channels, 3);
253 |         assert_eq!(buffer.num_elements(), 120 * 80 * 3);
254 |         assert!(buffer.pitch >= 360);
255 |     }
256 | 
257 |     #[test]
258 |     fn test_copy() {
259 |         let stream = Stream::new().unwrap();
260 |         let all_ones = vec![1_u32; 150];
261 |         let host_buffer_all_ones = HostBuffer::from_slice(all_ones.as_slice());
262 | 
263 |         let mut device_buffer = DeviceBuffer2D::<u32>::new(10, 5, 3);
264 |         unsafe {
265 |             device_buffer
266 |                 .copy_from_async(&host_buffer_all_ones, &stream)
267 |                 .unwrap();
268 |         }
269 | 
270 |         let mut host_buffer = HostBuffer::<u32>::new(150);
271 |         unsafe {
272 |             device_buffer
273 |                 .copy_to_async(&mut host_buffer, &stream)
274 |                 .unwrap();
275 |         }
276 | 
277 |         let mut another_device_buffer = DeviceBuffer2D::<u32>::new(10, 5, 3);
278 |         unsafe {
279 |             another_device_buffer
280 |                 .copy_from_async(&host_buffer, &stream)
281 |                 .unwrap();
282 |         }
283 | 
284 |         let mut return_host_buffer = HostBuffer::<u32>::new(150);
285 |         unsafe {
286 |             another_device_buffer
287 |                 .copy_to_async(&mut return_host_buffer, &stream)
288 |                 .unwrap();
289 |         }
290 | 
291 |         stream.synchronize().unwrap();
292 | 
293 |         assert_eq!(return_host_buffer.num_elements, 150);
294 |         let return_data = return_host_buffer.to_vec();
295 |         assert_eq!(return_data.len(), 150);
296 |         assert!(return_data.into_iter().all(|v| v == 1_u32));
297 |     }
298 | 
299 |     #[test]
300 |     fn test_copy_2d() {
301 |         let stream = Stream::new().unwrap();
302 |         let image: [u8; 12] = [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4];
303 |         let host_buffer = HostBuffer::from_slice(&image);
304 |         let mut device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3);
305 |         unsafe {
306 |             device_buffer
307 |                 .copy_from_async(&host_buffer, &stream)
308 |                 .unwrap();
309 |         }
310 |         let mut return_host_buffer = HostBuffer::<u8>::new(12);
311 |         unsafe {
312 |             device_buffer
313 |                 .copy_to_async(&mut return_host_buffer, &stream)
314 |                 .unwrap();
315 |         }
316 |         stream.synchronize().unwrap();
317 |         assert_eq!(
318 |             &return_host_buffer.to_vec(),
319 |             &[1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
320 |         );
321 |     }
322 | 
323 |     #[test]
324 |     fn test_fill_with_byte() {
325 |         let stream = Stream::new().unwrap();
326 |         let mut device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3);
327 |         let mut host_buffer = HostBuffer::<u8>::new(2 * 2 * 3);
328 |         device_buffer.fill_with_byte(0xab, &stream).unwrap();
329 |         unsafe {
330 |             device_buffer
331 |                 .copy_to_async(&mut host_buffer, &stream)
332 |                 .unwrap();
333 |         }
334 |         stream.synchronize().unwrap();
335 |         assert_eq!(
336 |             host_buffer.to_vec(),
337 |             &[0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab]
338 |         );
339 |     }
340 | 
341 |     #[test]
342 |     #[should_panic]
343 |     fn test_it_panics_when_copying_invalid_size() {
344 |         let stream = Stream::new().unwrap();
345 |         let device_buffer = DeviceBuffer2D::<u8>::new(5, 5, 3);
346 |         let mut host_buffer = HostBuffer::<u8>::new(80);
347 |         let _ = unsafe { device_buffer.copy_to_async(&mut host_buffer, &stream) };
348 |     }
349 | }
350 | 
--------------------------------------------------------------------------------
/src/memory/host.rs:
--------------------------------------------------------------------------------
1 | use crate::ffi;
2 | use crate::memory::DeviceBuffer;
3 | use crate::runtime::Future;
4 | use crate::stream::Stream;
5 | 
6 | type Result<T> = std::result::Result<T, crate::error::Error>;
7 | 
8 | /// A host buffer.
9 | ///
10 | /// # Performance
11 | ///
12 | /// Host buffers are managed by CUDA and can be used for pinned memory transfer. Pinned memory
13 | /// transfer speeds are usually higher compared to paged memory transfers. Pinned memory buffers are
14 | /// especially important for this crate because the runtime thread must do the least amount of CPU
15 | /// work possible. Paged transfers do require the host to move data into a CUDA managed buffer first
16 | /// (an extra memory copy) whilst pinned transfers do not.
17 | pub struct HostBuffer<T: Copy> {
18 |     inner: ffi::memory::HostBuffer<T>,
19 | }
20 | 
21 | impl<T: Copy> HostBuffer<T> {
22 |     /// Allocates memory on the host. This creates a pinned buffer. Any transfers to and from this
23 |     /// buffer automatically become pinned transfers, and will be much faster.
24 |     ///
25 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g32bd7a39135594788a542ae72217775c)
26 |     ///
27 |     /// # Arguments
28 |     ///
29 |     /// * `num_elements` - Number of elements to allocate.
30 |     pub async fn new(num_elements: usize) -> Self {
31 |         let inner = Future::new(move || ffi::memory::HostBuffer::<T>::new(num_elements)).await;
32 |         Self { inner }
33 |     }
34 | 
35 |     /// Allocates memory on the host and copies the provided data into it.
36 |     ///
37 |     /// This creates a pinned buffer. Any transfers to and from this buffer automatically become
38 |     /// pinned transfers, and will be much faster.
39 |     ///
40 |     /// This is a convenience function that allows the caller to quickly put data into a host
41 |     /// buffer. It is roughly similar to `buffer.copy_from_slice(slice)`.
42 |     ///
43 |     /// # Arguments
44 |     ///
45 |     /// * `slice` - Data to copy into the new host buffer.
46 |     pub async fn from_slice(slice: &[T]) -> Self {
47 |         let mut this = Self::new(slice.len()).await;
48 |         this.copy_from_slice(slice);
49 |         this
50 |     }
51 | 
52 |     /// Allocates memory on the host and copies the provided array into it.
53 |     ///
54 |     /// This creates a pinned buffer. Any transfers to and from this buffer automatically become
55 |     /// pinned transfers, and will be much faster.
56 |     ///
57 |     /// This is a convenience function that allows the caller to quickly put data into a host
58 |     /// buffer. It is roughly similar to `buffer.copy_from_array(array)`.
59 |     ///
60 |     /// # Arguments
61 |     ///
62 |     /// * `array` - Array to copy into the new host buffer.
63 |     #[cfg(feature = "ndarray")]
64 |     pub async fn from_array<D: ndarray::Dimension>(array: &ndarray::ArrayView<'_, T, D>) -> Self {
65 |         let mut this = Self::new(array.len()).await;
66 |         this.copy_from_array(array);
67 |         this
68 |     }
69 | 
70 |     /// Copies memory from the provided device buffer to this buffer.
71 |     ///
72 |     /// This function synchronizes the stream implicitly.
73 |     ///
74 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
75 |     ///
76 |     /// # Pinned transfer
77 |     ///
78 |     /// This function is guaranteed to produce a pinned transfer on the runtime thread.
79 |     ///
80 |     /// # Stream ordered semantics
81 |     ///
82 |     /// This function uses stream ordered semantics. It can only be guaranteed to complete
83 |     /// sequentially relative to operations scheduled on the same stream or the default stream.
84 |     ///
85 |     /// # Arguments
86 |     ///
87 |     /// * `other` - Device buffer to copy from.
88 |     /// * `stream` - Stream to use.
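    ///
    /// # Example
    ///
    /// A round-trip sketch (hypothetical setup, not part of the original docs):
    ///
    /// ```no_run
    /// # use async_cuda::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let device_buffer = DeviceBuffer::<u8>::new(100, &stream).await;
    /// let mut host_buffer = HostBuffer::<u8>::new(100).await;
    /// host_buffer.copy_from(&device_buffer, &stream).await.unwrap();
    /// # })
    /// ```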
89 |     #[inline(always)]
90 |     pub async fn copy_from(&mut self, other: &DeviceBuffer<T>, stream: &Stream) -> Result<()> {
91 |         other.copy_to(self, stream).await
92 |     }
93 | 
94 |     /// Copies memory from the provided device buffer to this buffer.
95 |     ///
96 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
97 |     ///
98 |     /// # Pinned transfer
99 |     ///
100 |     /// This function is guaranteed to produce a pinned transfer on the runtime thread.
101 |     ///
102 |     /// # Stream ordered semantics
103 |     ///
104 |     /// This function uses stream ordered semantics. It can only be guaranteed to complete
105 |     /// sequentially relative to operations scheduled on the same stream or the default stream.
106 |     ///
107 |     /// # Safety
108 |     ///
109 |     /// This function is unsafe because the operation might not have completed when the function
110 |     /// returns, and thus the state of the buffer is undefined.
111 |     ///
112 |     /// # Arguments
113 |     ///
114 |     /// * `other` - Device buffer to copy from.
115 |     /// * `stream` - Stream to use.
116 |     #[inline(always)]
117 |     pub async unsafe fn copy_from_async(
118 |         &mut self,
119 |         other: &DeviceBuffer<T>,
120 |         stream: &Stream,
121 |     ) -> Result<()> {
122 |         other.copy_to_async(self, stream).await
123 |     }
124 | 
125 |     /// Copies memory from this buffer to the provided device buffer.
126 |     ///
127 |     /// This function synchronizes the stream implicitly.
128 |     ///
129 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
130 |     ///
131 |     /// # Pinned transfer
132 |     ///
133 |     /// This function is guaranteed to produce a pinned transfer on the runtime thread.
134 |     ///
135 |     /// # Stream ordered semantics
136 |     ///
137 |     /// This function uses stream ordered semantics. It can only be guaranteed to complete
138 |     /// sequentially relative to operations scheduled on the same stream or the default stream.
139 |     ///
140 |     /// # Arguments
141 |     ///
142 |     /// * `other` - Device buffer to copy to.
143 |     /// * `stream` - Stream to use.
144 |     #[inline(always)]
145 |     pub async fn copy_to(&self, other: &mut DeviceBuffer<T>, stream: &Stream) -> Result<()> {
146 |         other.copy_from(self, stream).await
147 |     }
148 | 
149 |     /// Copies memory from this buffer to the provided device buffer.
150 |     ///
151 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
152 |     ///
153 |     /// # Pinned transfer
154 |     ///
155 |     /// This function is guaranteed to produce a pinned transfer on the runtime thread.
156 |     ///
157 |     /// # Stream ordered semantics
158 |     ///
159 |     /// This function uses stream ordered semantics. It can only be guaranteed to complete
160 |     /// sequentially relative to operations scheduled on the same stream or the default stream.
161 |     ///
162 |     /// # Safety
163 |     ///
164 |     /// This function is unsafe because the operation might not have completed when the function
165 |     /// returns, and thus the state of the buffer is undefined.
166 |     ///
167 |     /// # Arguments
168 |     ///
169 |     /// * `other` - Device buffer to copy to.
170 |     /// * `stream` - Stream to use.
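    ///
    /// A sketch of the intended pattern (assumed setup; not from the original docs): schedule
    /// the copy asynchronously, then synchronize the stream before touching either buffer:
    ///
    /// ```no_run
    /// # use async_cuda::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let host_buffer = HostBuffer::from_slice(&[1_u32; 100]).await;
    /// let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
    /// // SAFETY: the stream is synchronized before the buffers are used again.
    /// unsafe {
    ///     host_buffer
    ///         .copy_to_async(&mut device_buffer, &stream)
    ///         .await
    ///         .unwrap();
    /// }
    /// stream.synchronize().await.unwrap();
    /// # })
    /// ```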
171 |     #[inline(always)]
172 |     pub async unsafe fn copy_to_async(
173 |         &self,
174 |         other: &mut DeviceBuffer<T>,
175 |         stream: &Stream,
176 |     ) -> Result<()> {
177 |         other.copy_from_async(self, stream).await
178 |     }
179 | 
180 |     /// Copy data into the host buffer from a slice.
181 |     ///
182 |     /// # Synchronization safety
183 |     ///
184 |     /// This call is only synchronization-safe if all streams that have previously been used for
185 |     /// copy operations either from or to this host buffer have been synchronized, and no operations
186 |     /// have been scheduled since.
187 |     ///
188 |     /// # Arguments
189 |     ///
190 |     /// * `slice` - Data to copy into the new host buffer.
191 |     ///
192 |     /// # Example
193 |     ///
194 |     /// ```
195 |     /// # use async_cuda::HostBuffer;
196 |     /// # tokio_test::block_on(async {
197 |     /// let mut host_buffer = HostBuffer::<u8>::new(100).await;
198 |     /// let some_data = vec![10; 100];
199 |     /// host_buffer.copy_from_slice(&some_data);
200 |     /// # })
201 |     /// ```
202 |     #[inline(always)]
203 |     pub fn copy_from_slice(&mut self, slice: &[T]) {
204 |         self.inner.copy_from_slice(slice);
205 |     }
206 | 
207 |     /// Copy data into the host buffer from an array.
208 |     ///
209 |     /// # Synchronization safety
210 |     ///
211 |     /// This call is only synchronization-safe if all streams that have previously been used for
212 |     /// copy operations either from or to this host buffer have been synchronized, and no operations
213 |     /// have been scheduled since.
214 |     ///
215 |     /// # Arguments
216 |     ///
217 |     /// * `array` - Array to copy into the new host buffer.
218 |     #[cfg(feature = "ndarray")]
219 |     #[inline(always)]
220 |     pub fn copy_from_array<D: ndarray::Dimension>(&mut self, array: &ndarray::ArrayView<T, D>) {
221 |         self.inner.copy_from_array(array)
222 |     }
223 | 
224 |     /// Copy the data to a [`Vec`] and return it.
225 |     #[inline(always)]
226 |     pub fn to_vec(&self) -> Vec<T> {
227 |         self.inner.to_vec()
228 |     }
229 | 
230 |     /// Copy the data to an [`ndarray::Array`] and return it.
231 |     ///
232 |     /// This function panics if the provided shape does not match the size of the buffer.
233 |     ///
234 |     /// # Arguments
235 |     ///
236 |     /// * `shape` - Shape for array.
237 |     #[cfg(feature = "ndarray")]
238 |     #[inline(always)]
239 |     pub fn to_array_with_shape<D: ndarray::Dimension>(
240 |         &self,
241 |         shape: impl Into<ndarray::StrideShape<D>>,
242 |     ) -> ndarray::Array<T, D> {
243 |         self.inner.to_array_with_shape::<D>(shape)
244 |     }
245 | 
246 |     /// Get number of elements in buffer.
247 |     #[inline(always)]
248 |     pub fn num_elements(&self) -> usize {
249 |         self.inner.num_elements
250 |     }
251 | 
252 |     /// Access the inner synchronous implementation of [`HostBuffer`].
253 |     #[inline(always)]
254 |     pub fn inner(&self) -> &ffi::memory::HostBuffer<T> {
255 |         &self.inner
256 |     }
257 | 
258 |     /// Access the inner synchronous implementation of [`HostBuffer`].
259 |     #[inline(always)]
260 |     pub fn inner_mut(&mut self) -> &mut ffi::memory::HostBuffer<T> {
261 |         &mut self.inner
262 |     }
263 | }
264 | 
265 | #[cfg(test)]
266 | mod tests {
267 |     use super::*;
268 | 
269 |     #[tokio::test]
270 |     async fn test_new() {
271 |         let buffer = HostBuffer::<u32>::new(100).await;
272 |         assert_eq!(buffer.num_elements(), 100);
273 |         assert_eq!(buffer.to_vec().len(), 100);
274 |     }
275 | 
276 |     #[tokio::test]
277 |     async fn test_from_slice() {
278 |         let all_ones = vec![1_u32; 200];
279 |         let buffer = HostBuffer::from_slice(all_ones.as_slice()).await;
280 |         assert_eq!(buffer.num_elements(), 200);
281 |         let data = buffer.to_vec();
282 |         assert_eq!(data.len(), 200);
283 |         assert!(data.into_iter().all(|v| v == 1_u32));
284 |     }
285 | 
286 |     #[tokio::test]
287 |     async fn test_copy() {
288 |         let stream = Stream::new().await.unwrap();
289 |         let all_ones = vec![1_u32; 100];
290 |         let host_buffer = HostBuffer::from_slice(all_ones.as_slice()).await;
291 | 
292 |         let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
293 |         unsafe {
294 |             host_buffer
295 |                 .copy_to_async(&mut device_buffer, &stream)
296 |                 .await
297 |                 .unwrap();
298 |         }
299 | 
300 |         let mut return_host_buffer = HostBuffer::<u32>::new(100).await;
301 |         unsafe {
302 |             return_host_buffer
303 |                 .copy_from_async(&device_buffer, &stream)
304 |                 .await
305 |                 .unwrap();
306 |         }
307 | 
308 |         stream.synchronize().await.unwrap();
309 | 
310 |         assert_eq!(return_host_buffer.num_elements(), 100);
311 |         let return_data = return_host_buffer.to_vec();
312 |         assert_eq!(return_data.len(), 100);
313 |         assert!(return_data.into_iter().all(|v| v == 1_u32));
314 |     }
315 | 
316 |     #[tokio::test]
317 |     #[should_panic]
318 |     async fn test_it_panics_when_copying_invalid_size() {
319 |         let stream = Stream::new().await.unwrap();
320 |         let host_buffer = HostBuffer::<u32>::new(100).await;
321 |         let mut device_buffer = DeviceBuffer::<u32>::new(101, &Stream::null()).await;
322 |         let _ = unsafe { host_buffer.copy_to_async(&mut device_buffer, &stream).await };
323 |     }
324 | }
325 | 
--------------------------------------------------------------------------------
/src/ffi/npp/resize_batch.rs:
--------------------------------------------------------------------------------
1 | use cpp::cpp;
2 | 
3 | use crate::ffi::npp::context::Context;
4 | use crate::ffi::npp::result;
5 | use crate::npp::region::Region;
6 | 
7 | type Result<T> = std::result::Result<T, crate::npp::Error>;
8 | 
9 | /// Synchronous implementation of [`crate::npp::resize_batch()`].
10 | ///
11 | /// Refer to [`crate::npp::resize_batch()`] for documentation.
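///
/// Internally, this marshals the batch into an array of `NppiResizeBatchCXR` descriptors,
/// uploads the descriptor array to the device, and then invokes `nppiResizeBatch_8u_C3R_Ctx`
/// with bilinear interpolation on the stream associated with the given context.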
12 | pub fn resize_batch(
13 |     inputs_and_outputs: &mut [(
14 |         &crate::ffi::memory::DeviceBuffer2D<u8>,
15 |         &mut crate::ffi::memory::DeviceBuffer2D<u8>,
16 |     )],
17 |     input_region: Region,
18 |     output_region: Region,
19 |     context: &Context,
20 | ) -> Result<()> {
21 |     assert!(
22 |         !inputs_and_outputs.is_empty(),
23 |         "batch must have at least one item"
24 |     );
25 | 
26 |     let (first_input, first_output) = &inputs_and_outputs[0];
27 |     let first_input_width = first_input.width;
28 |     let first_input_height = first_input.height;
29 |     let first_output_width = first_output.width;
30 |     let first_output_height = first_output.height;
31 |     for (input, output) in inputs_and_outputs.iter() {
32 |         assert_eq!(
33 |             input.width, first_input_width,
34 |             "all inputs in batch must have the same width",
35 |         );
36 |         assert_eq!(
37 |             input.height, first_input_height,
38 |             "all inputs in batch must have the same height",
39 |         );
40 |         assert_eq!(
41 |             output.width, first_output_width,
42 |             "all outputs in batch must have the same width",
43 |         );
44 |         assert_eq!(
45 |             output.height, first_output_height,
46 |             "all outputs in batch must have the same height",
47 |         );
48 |         assert_eq!(
49 |             input.num_channels, 3,
50 |             "all inputs and outputs must be in RGB format"
51 |         );
52 |         assert_eq!(
53 |             output.num_channels, 3,
54 |             "all inputs and outputs must be in RGB format"
55 |         );
56 |     }
57 | 
58 |     let batch_size = inputs_and_outputs.len();
59 | 
60 |     let (src_width, src_height) = (first_input_width as i32, first_input_height as i32);
61 |     let (src_rect_x, src_rect_y, src_rect_width, src_rect_height) =
62 |         input_region.resolve_to_xywh(src_width as usize, src_height as usize);
63 |     let (src_rect_x, src_rect_y, src_rect_width, src_rect_height) = (
64 |         src_rect_x as i32,
65 |         src_rect_y as i32,
66 |         src_rect_width as i32,
67 |         src_rect_height as i32,
68 |     );
69 | 
70 |     let (dst_width, dst_height) = (first_output_width as i32, first_output_height as i32);
71 |     let (dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height) =
72 |         output_region.resolve_to_xywh(dst_width as usize, dst_height as usize);
73 |     let (dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height) = (
74 |         dst_rect_x as i32,
75 |         dst_rect_y as i32,
76 |         dst_rect_width as i32,
77 |         dst_rect_height as i32,
78 |     );
79 | 
80 |     let srcs = inputs_and_outputs
81 |         .iter()
82 |         // SAFETY: This is safe because we keep the original input and output device buffers around
83 |         // for the duration of this call.
84 |         .map(|(input, _)| input.as_internal().as_ptr())
85 |         .collect::<Vec<_>>();
86 |     let src_pitches = inputs_and_outputs
87 |         .iter()
88 |         .map(|(input, _)| input.pitch)
89 |         .collect::<Vec<_>>();
90 |     let dsts = inputs_and_outputs
91 |         .iter_mut()
92 |         // SAFETY: This is safe because we keep the original input and output device buffers around
93 |         // for the duration of this call.
94 |         .map(|(_, output)| output.as_mut_internal().as_mut_ptr())
95 |         .collect::<Vec<_>>();
96 |     let dst_pitches = inputs_and_outputs
97 |         .iter()
98 |         .map(|(_, output)| output.pitch)
99 |         .collect::<Vec<_>>();
100 | 
101 |     let src_array = srcs.as_ptr();
102 |     let src_pitches_array = src_pitches.as_ptr();
103 |     let dst_array = dsts.as_ptr();
104 |     let dst_pitches_array = dst_pitches.as_ptr();
105 | 
106 |     let context_ptr = context.as_ptr();
107 |     let ret = cpp!(unsafe [
108 |         src_array as "const void* const*",
109 |         src_pitches_array as "const std::size_t*",
110 |         src_width as "std::int32_t",
111 |         src_height as "std::int32_t",
112 |         src_rect_x as "std::int32_t",
113 |         src_rect_y as "std::int32_t",
114 |         src_rect_width as "std::int32_t",
115 |         src_rect_height as "std::int32_t",
116 |         dst_array as "void* const*",
117 |         dst_pitches_array as "const std::size_t*",
118 |         dst_width as "std::int32_t",
119 |         dst_height as "std::int32_t",
120 |         dst_rect_x as "std::int32_t",
121 |         dst_rect_y as "std::int32_t",
122 |         dst_rect_width as "std::int32_t",
123 |         dst_rect_height as "std::int32_t",
124 |         batch_size as "std::size_t",
125 |         context_ptr as "void*"
126 |     ] -> i32 as "std::int32_t" {
127 |         NppStatus ret {};
128 |         cudaError_t ret_cuda {};
129 | 
130 |         NppiSize src_size = { src_width, src_height };
131 |         NppiSize dst_size = { dst_width, dst_height };
132 |         NppiRect src_rect = { src_rect_x, src_rect_y, src_rect_width, src_rect_height };
133 |         NppiRect dst_rect = { dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height };
134 | 
135 |         NppiResizeBatchCXR* batch_host = new NppiResizeBatchCXR[batch_size];
136 |         for (std::size_t i = 0; i < batch_size; i++) {
137 |             batch_host[i].pSrc = src_array[i];
138 |             batch_host[i].nSrcStep = src_pitches_array[i];
139 |             batch_host[i].pDst = dst_array[i];
140 |             batch_host[i].nDstStep = dst_pitches_array[i];
141 |         }
142 | 
143 |         NppiResizeBatchCXR* batch = nullptr;
144 |         ret_cuda = cudaMallocAsync(
145 |             &batch,
146 |             batch_size * sizeof(NppiResizeBatchCXR),
147 |             ((NppStreamContext*) context_ptr)->hStream
148 |         );
149 |         if (ret_cuda != cudaSuccess)
150 |             goto cleanup;
151 |         ret_cuda = cudaMemcpyAsync(
152 |             batch,
153 |             batch_host,
154 |             batch_size * sizeof(NppiResizeBatchCXR),
155 |             cudaMemcpyHostToDevice,
156 |             ((NppStreamContext*) context_ptr)->hStream
157 |         );
158 |         if (ret_cuda != cudaSuccess)
159 |             goto cleanup;
160 | 
161 |         ret = nppiResizeBatch_8u_C3R_Ctx(
162 |             src_size,
163 |             src_rect,
164 |             dst_size,
165 |             dst_rect,
166 |             // We use bilinear interpolation, which is the fastest resize method that does not
167 |             // visibly degrade image quality.
168 |             NPPI_INTER_LINEAR,
169 |             batch,
170 |             batch_size,
171 |             *((NppStreamContext*) context_ptr)
172 |         );
173 | 
174 |     cleanup:
175 |         if (batch != nullptr)
176 |             cudaFreeAsync(
177 |                 batch,
178 |                 ((NppStreamContext*) context_ptr)->hStream
179 |             );
180 |         if (batch_host != nullptr)
181 |             delete[] batch_host;
182 | 
183 |         return ret;
184 |     });
185 |     result!(ret)
186 | }
187 | 
188 | #[cfg(test)]
189 | mod tests {
190 |     use super::*;
191 | 
192 |     use crate::ffi::npp::context::Context;
193 |     use crate::npp::tests::sync::memory::*;
194 | 
195 |     #[test]
196 |     fn test_resize_batch() {
197 |         #[rustfmt::skip]
198 |         const INPUT: [u8; 12] = [
199 |             10, 10, 10, 20, 20, 20,
200 |             30, 30, 30, 40, 40, 40,
201 |         ];
202 |         #[rustfmt::skip]
203 |         const EXPECTED_OUTPUT: [u8; 27] = [
204 |             10, 10, 10, 14, 14, 14, 20, 20, 20,
205 |             18, 18, 18, 23, 23, 23, 28, 28, 28,
206 |             30, 30, 30, 34, 34, 34, 40, 40, 40,
207 |         ];
208 | 
209 |         let context = Context::from_null_stream();
210 | 
211 |         let mut inputs_and_outputs = (0..10)
212 |             .map(|_| {
213 |                 let image = to_device_2d!(&INPUT, 2, 2, 3, &context);
214 |                 let output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(3, 3, 3);
215 |                 (image, output)
216 |             })
217 |             .collect::<Vec<_>>();
218 |         let mut inputs_and_outputs_ref = inputs_and_outputs
219 |             .iter_mut()
220 |             .map(|(input, output)| (&*input, output))
221 |             .collect::<Vec<_>>();
222 |         resize_batch(
223 |             &mut inputs_and_outputs_ref,
224 |             Region::Full,
225 |             Region::Full,
226 |             &context,
227 |         )
228 |         .unwrap();
229 | 
230 |         for (_, output) in inputs_and_outputs {
231 |             let output = to_host_2d!(output, &context);
232 |             assert_eq!(&output, &EXPECTED_OUTPUT);
233 |         }
234 |     }
235 | 
236 |     #[test]
237 |     fn test_resize_batch_with_input_region() {
238 |         #[rustfmt::skip]
239 |         const INPUT: [u8; 27] = [
240 |             99, 99, 99, 10, 10, 10, 20, 20, 20,
241 |             99, 99, 99, 30, 30, 30, 40, 40, 40,
242 |             99, 99, 99, 99, 99, 99, 99, 99, 99,
243 |         ];
244 |         #[rustfmt::skip]
245 |         const EXPECTED_OUTPUT: [u8; 27] = [
246 |             32, 32, 32, 14, 14, 14, 20, 20, 20,
247 |             39, 39, 39, 23, 23, 23, 28, 28, 28,
248 |             52, 52, 52, 40, 40, 40, 45, 45, 45,
249 |         ];
250 | 
251 |         let context = Context::from_null_stream();
252 |         let center = Region::Rectangle {
253 |             x: 1,
254 |             y: 0,
255 |             width: 2,
256 |             height: 2,
257 |         };
258 |         let mut inputs_and_outputs = (0..10)
259 |             .map(|_| {
260 |                 let image = to_device_2d!(&INPUT, 3, 3, 3, &context);
261 |                 let output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(3, 3, 3);
262 |                 (image, output)
263 |             })
264 |             .collect::<Vec<_>>();
265 |         let mut inputs_and_outputs_ref = inputs_and_outputs
266 |             .iter_mut()
267 |             .map(|(input, output)| (&*input, output))
268 |             .collect::<Vec<_>>();
269 |         resize_batch(&mut inputs_and_outputs_ref, center, Region::Full, &context).unwrap();
270 | 
271 |         for (_, output) in inputs_and_outputs {
272 |             let output = to_host_2d!(output, &context);
273 |             assert_eq!(&output, &EXPECTED_OUTPUT);
274 |         }
275 |     }
276 | 
277 |     #[test]
278 |     fn test_resize_batch_with_output_region() {
279 |         #[rustfmt::skip]
280 |         const INPUT: [u8; 2 * 2 * 3] = [
281 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
282 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
283 |         ];
284 |         #[rustfmt::skip]
285 |         const EXPECTED_OUTPUT: [u8; 2 * 2 * 3] = [
286 |             0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
287 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
288 |         ];
289 | 
290 |         let context = Context::from_null_stream();
291 |         let bottom_half = Region::Rectangle {
292 |             x: 0,
293 |             y: 1,
294 |             width: 2,
295 |             height: 1,
296 |         };
297 | 
298 |         let mut inputs_and_outputs = (0..10)
299 |             .map(|_| {
300 |                 let image = to_device_2d!(&INPUT, 2, 2, 3, &context);
301 |                 let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(2, 2, 3);
302 |                 output.fill_with_byte(0x00, context.stream.inner()).unwrap();
303 |                 (image, output)
304 |             })
305 |             .collect::<Vec<_>>();
306 |         let mut inputs_and_outputs_ref = inputs_and_outputs
307 |             .iter_mut()
308 |             .map(|(input, output)| (&*input, output))
309 |             .collect::<Vec<_>>();
310 |         resize_batch(
311 |             &mut inputs_and_outputs_ref,
312 |             Region::Full,
313 |             bottom_half,
314 |             &context,
315 |         )
316 |         .unwrap();
317 | 
318 |         for (_, output) in inputs_and_outputs {
319 |             let output = to_host_2d!(output, &context);
320 |             assert_eq!(&output, &EXPECTED_OUTPUT);
321 |         }
322 |     }
323 | 
324 |     #[test]
325 |     #[should_panic]
326 |     fn test_it_panics_when_input_num_channels_incorrect() {
327 |         let mut inputs_and_outputs = vec![
328 |             (
329 |                 crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2),
330 |                 crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 2),
331 |             ),
332 |             (
333 |                 crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2),
334 |                 crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 2),
335 |             ),
336 |         ];
337 |         let mut inputs_and_outputs_ref = inputs_and_outputs
338 |             .iter_mut()
339 |             .map(|(input, output)| (&*input, output))
340 |             .collect::<Vec<_>>();
341 |         resize_batch(
342 |             &mut inputs_and_outputs_ref,
343 |             Region::Full,
344 |             Region::Full,
345 |             &Context::from_null_stream(),
346 |         )
347 |         .unwrap();
348 |     }
349 | }
350 | 
--------------------------------------------------------------------------------
/src/memory/device.rs:
--------------------------------------------------------------------------------
1 | use crate::ffi;
2 | use crate::memory::HostBuffer;
3 | use crate::runtime::Future;
4 | use crate::stream::Stream;
5 | 
6 | type Result<T> = std::result::Result<T, crate::error::Error>;
7 | 
8 | /// A buffer on the device.
9 | ///
10 | /// # Example
11 | ///
12 | /// Copying data from a [`HostBuffer`] to a [`DeviceBuffer`]:
13 | ///
14 | /// ```
15 | /// # use async_cuda::{DeviceBuffer, HostBuffer, Stream};
16 | /// # tokio_test::block_on(async {
17 | /// let stream = Stream::new().await.unwrap();
18 | /// let all_ones = vec![1_u8; 100];
19 | /// let host_buffer = HostBuffer::<u8>::from_slice(&all_ones).await;
20 | /// let mut device_buffer = DeviceBuffer::<u8>::new(100, &stream).await;
21 | /// device_buffer.copy_from(&host_buffer, &stream).await.unwrap();
22 | /// # })
23 | /// ```
24 | pub struct DeviceBuffer<T: Copy> {
25 |     inner: ffi::memory::DeviceBuffer<T>,
26 | }
27 | 
28 | impl<T: Copy> DeviceBuffer<T> {
29 |     /// Allocates memory on the device.
30 |     ///
31 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html#group__CUDART__MEMORY__POOLS_1gbbf70065888d61853c047513baa14081)
32 |     ///
33 |     /// # Stream ordered semantics
34 |     ///
35 |     /// This function uses stream ordered semantics. It can only be guaranteed to complete
36 |     /// sequentially relative to operations scheduled on the same stream or the default stream.
37 |     ///
38 |     /// # Arguments
39 |     ///
40 |     /// * `num_elements` - Number of elements to allocate.
41 |     /// * `stream` - Stream to use.
42 |     pub async fn new(num_elements: usize, stream: &Stream) -> Self {
43 |         let inner =
44 |             Future::new(move || ffi::memory::DeviceBuffer::<T>::new(num_elements, stream.inner()))
45 |                 .await;
46 |         Self { inner }
47 |     }
48 | 
49 |     /// Allocate memory on the device, and copy data from host into it.
    pub async fn from_slice(slice: &[T], stream: &Stream) -> Result<Self> {
        let host_buffer = HostBuffer::from_slice(slice).await;
        let mut this = Self::new(slice.len(), stream).await;
        this.copy_from(&host_buffer, stream).await?;
        Ok(this)
    }

    /// Allocates memory on the device, and copies an array from host into it.
    ///
    /// This function creates a temporary [`HostBuffer`], copies the array into it, then finally
    /// copies the data from the host buffer to the [`DeviceBuffer`].
    ///
    /// The given stream is automatically synchronized, since the temporary host buffer might
    /// otherwise be dropped before the copy can complete.
    ///
    /// # Arguments
    ///
    /// * `array` - Array to copy into the buffer.
    /// * `stream` - Stream to use.
    #[cfg(feature = "ndarray")]
    pub async fn from_array<D: ndarray::Dimension>(
        array: &ndarray::ArrayView<'_, T, D>,
        stream: &Stream,
    ) -> Result<Self> {
        let host_buffer = HostBuffer::from_array(array).await;
        let mut this = Self::new(array.len(), stream).await;
        this.copy_from(&host_buffer, stream).await?;
        Ok(this)
    }

    /// Copies memory from the provided pinned host buffer to this buffer.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
    #[inline]
    pub async fn copy_from(&mut self, other: &HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_from_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from the provided pinned host buffer to this buffer.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
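    ///
    /// # Example
    ///
    /// An illustrative sketch, mirroring the pattern in the tests below: note the explicit
    /// synchronization before the host buffer may be dropped or reused.
    ///
    /// ```
    /// # use async_cuda::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let host_buffer = HostBuffer::from_slice(&[1_u8; 100]).await;
    /// let mut device_buffer = DeviceBuffer::<u8>::new(100, &stream).await;
    /// // SAFETY: the stream is synchronized below, before the host buffer is dropped.
    /// unsafe {
    ///     device_buffer
    ///         .copy_from_async(&host_buffer, &stream)
    ///         .await
    ///         .unwrap();
    /// }
    /// stream.synchronize().await.unwrap();
    /// # })
    /// ```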
    pub async unsafe fn copy_from_async(
        &mut self,
        other: &HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_from_async(other.inner(), stream.inner())).await
    }

    /// Copies memory from this buffer to the provided pinned host buffer.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
    #[inline]
    pub async fn copy_to(&self, other: &mut HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_to_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from this buffer to the provided pinned host buffer.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
    pub async unsafe fn copy_to_async(
        &self,
        other: &mut HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_to_async(other.inner_mut(), stream.inner())).await
    }

    /// Fills the entire buffer with the given byte.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g7c9761e21d9f0999fd136c51e7b9b2a0)
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `value` - Byte value to fill buffer with.
    /// * `stream` - Stream to use.
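    ///
    /// # Example
    ///
    /// A small sketch, following the same pattern as the `fill_with_byte` test below: fill a
    /// buffer of four bytes with `0xab`.
    ///
    /// ```
    /// # use async_cuda::{DeviceBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
    /// device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
    /// # })
    /// ```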
    pub async fn fill_with_byte(&mut self, value: u8, stream: &Stream) -> Result<()> {
        Future::new(move || self.inner.fill_with_byte(value, stream.inner())).await
    }

    /// Get number of elements in buffer.
    #[inline(always)]
    pub fn num_elements(&self) -> usize {
        self.inner.num_elements
    }

    /// Access the inner synchronous implementation of [`DeviceBuffer`].
    #[inline(always)]
    pub fn inner(&self) -> &ffi::memory::DeviceBuffer<T> {
        &self.inner
    }

    /// Access the inner synchronous implementation of [`DeviceBuffer`].
    #[inline(always)]
    pub fn inner_mut(&mut self) -> &mut ffi::memory::DeviceBuffer<T> {
        &mut self.inner
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_new() {
        let buffer = DeviceBuffer::<u32>::new(100, &Stream::null()).await;
        assert_eq!(buffer.num_elements(), 100);
    }

    #[tokio::test]
    async fn test_copy() {
        let stream = Stream::new().await.unwrap();
        let all_ones = vec![1_u32; 100];
        let host_buffer_all_ones = HostBuffer::from_slice(all_ones.as_slice()).await;

        let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
        unsafe {
            device_buffer
                .copy_from_async(&host_buffer_all_ones, &stream)
                .await
                .unwrap();
        }

        let mut host_buffer = HostBuffer::<u32>::new(100).await;
        unsafe {
            device_buffer
                .copy_to_async(&mut host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut another_device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
        unsafe {
            another_device_buffer
                .copy_from_async(&host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut return_host_buffer = HostBuffer::<u32>::new(100).await;
        unsafe {
            another_device_buffer
                .copy_to_async(&mut return_host_buffer, &stream)
                .await
                .unwrap();
        }

        stream.synchronize().await.unwrap();

        assert_eq!(return_host_buffer.num_elements(), 100);
        let return_data = return_host_buffer.to_vec();
        assert_eq!(return_data.len(), 100);
        assert!(return_data.into_iter().all(|v| v == 1_u32));
    }

    #[tokio::test]
    async fn test_fill_with_byte() {
        let stream = Stream::new().await.unwrap();
        let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
        let mut host_buffer = HostBuffer::<u8>::new(4).await;
        device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
        device_buffer
            .copy_to(&mut host_buffer, &stream)
            .await
            .unwrap();
        assert_eq!(host_buffer.to_vec(), &[0xab, 0xab, 0xab, 0xab]);
    }

    #[tokio::test]
    #[should_panic]
    async fn test_it_panics_when_copying_invalid_size() {
        let stream = Stream::new().await.unwrap();
        let device_buffer = DeviceBuffer::<u32>::new(101, &stream).await;
        let mut host_buffer = HostBuffer::<u32>::new(100).await;
        let _ = unsafe { device_buffer.copy_to_async(&mut host_buffer, &stream).await };
    }
}
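
// What follows is an editorial sketch, not part of the original test suite: it exercises
// `from_array` under the optional `ndarray` feature, assuming `ndarray` and `tokio` are
// available when that feature and the test profile are enabled.
#[cfg(all(test, feature = "ndarray"))]
mod ndarray_tests {
    use super::*;

    #[tokio::test]
    async fn test_from_array() {
        let stream = Stream::new().await.unwrap();
        // A 10x10 array of ones; `from_array` flattens it into 100 elements on the device.
        let array = ndarray::Array2::from_elem((10, 10), 1_u32);
        let device_buffer = DeviceBuffer::from_array(&array.view(), &stream)
            .await
            .unwrap();
        assert_eq!(device_buffer.num_elements(), 100);
    }
}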
--------------------------------------------------------------------------------
/src/memory/device2d.rs:
--------------------------------------------------------------------------------
use crate::ffi;
use crate::memory::HostBuffer;
use crate::runtime::Future;
use crate::stream::Stream;

type Result<T> = std::result::Result<T, crate::error::Error>;

/// A 2-dimensional buffer on the device.
///
/// # Example
///
/// Copying data from a [`HostBuffer`] to a [`DeviceBuffer2D`]:
///
/// ```
/// # use async_cuda::{DeviceBuffer2D, HostBuffer, Stream};
/// # tokio_test::block_on(async {
/// let stream = Stream::new().await.unwrap();
/// let all_ones = vec![1_u8; 300];
/// let host_buffer = HostBuffer::<u8>::from_slice(&all_ones).await;
/// let mut device_buffer = DeviceBuffer2D::<u8>::new(10, 10, 3).await;
/// device_buffer.copy_from(&host_buffer, &stream).await.unwrap();
/// # })
/// ```
pub struct DeviceBuffer2D<T: Copy> {
    inner: ffi::memory::DeviceBuffer2D<T>,
}

impl<T: Copy> DeviceBuffer2D<T> {
    /// Allocates 2D memory on the device.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g32bd7a39135594788a542ae72217775c)
    ///
    /// # Arguments
    ///
    /// * `width` - Width of 2-dimensional buffer.
    /// * `height` - Height of 2-dimensional buffer.
    /// * `num_channels` - Number of channels per item.
    pub async fn new(width: usize, height: usize, num_channels: usize) -> Self {
        let inner =
            Future::new(move || ffi::memory::DeviceBuffer2D::<T>::new(width, height, num_channels))
                .await;
        Self { inner }
    }

    /// Allocates memory on the device, and copies a 3D array from host into it.
    ///
    /// This function creates a temporary [`HostBuffer`], copies the array into it, then finally
    /// copies the data from the host buffer to the [`DeviceBuffer2D`].
    ///
    /// The given stream is automatically synchronized, since the temporary host buffer might
    /// otherwise be dropped before the copy can complete.
    ///
    /// # Arguments
    ///
    /// * `array` - 3-dimensional array to copy into the buffer. The first and second dimensions
    ///   are equivalent to the height and width of the 2D buffer (respectively), and the third
    ///   dimension is the number of channels.
    /// * `stream` - Stream to use.
    #[cfg(feature = "ndarray")]
    pub async fn from_array(array: &ndarray::ArrayView3<'_, T>, stream: &Stream) -> Result<Self> {
        let host_buffer = HostBuffer::from_array(array).await;
        let (height, width, num_channels) = array.dim();
        let mut this = Self::new(width, height, num_channels).await;
        this.copy_from(&host_buffer, stream).await?;
        Ok(this)
    }

    /// Copies memory from the provided pinned host buffer to this 2D buffer.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// The host buffer must be of the same size. For the 2D buffer, the total number of elements
    /// is `width` times `height` times `num_channels`.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge529b926e8fb574c2666a9a1d58b0dc1)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
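    ///
    /// # Example
    ///
    /// A usage sketch along the lines of the struct-level example: copy a 2x2 RGB image
    /// (12 elements in total) into a 2D device buffer.
    ///
    /// ```
    /// # use async_cuda::{DeviceBuffer2D, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let image = [1_u8, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4];
    /// let host_buffer = HostBuffer::from_slice(&image).await;
    /// let mut device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3).await;
    /// device_buffer.copy_from(&host_buffer, &stream).await.unwrap();
    /// # })
    /// ```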
    #[inline]
    pub async fn copy_from(&mut self, other: &HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_from_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from the provided pinned host buffer to this 2D buffer.
    ///
    /// The host buffer must be of the same size. For the 2D buffer, the total number of elements
    /// is `width` times `height` times `num_channels`.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge529b926e8fb574c2666a9a1d58b0dc1)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
    pub async unsafe fn copy_from_async(
        &mut self,
        other: &HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_from_async(other.inner(), stream.inner())).await
    }

    /// Copies memory from this 2D buffer to the provided pinned host buffer.
    ///
    /// The host buffer must be of the same size. For the 2D buffer, the total number of elements
    /// is `width` times `height` times `num_channels`.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge529b926e8fb574c2666a9a1d58b0dc1)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
    #[inline]
    pub async fn copy_to(&self, other: &mut HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_to_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from this 2D buffer to the provided pinned host buffer.
    ///
    /// The host buffer must be of the same size. For the 2D buffer, the total number of elements
    /// is `width` times `height` times `num_channels`.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge529b926e8fb574c2666a9a1d58b0dc1)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
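    ///
    /// # Example
    ///
    /// An illustrative sketch (the same pattern appears in the tests below): the stream must
    /// be synchronized explicitly before the host buffer is read.
    ///
    /// ```
    /// # use async_cuda::{DeviceBuffer2D, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3).await;
    /// let mut host_buffer = HostBuffer::<u8>::new(2 * 2 * 3).await;
    /// // SAFETY: the stream is synchronized below, before the host buffer is read.
    /// unsafe {
    ///     device_buffer
    ///         .copy_to_async(&mut host_buffer, &stream)
    ///         .await
    ///         .unwrap();
    /// }
    /// stream.synchronize().await.unwrap();
    /// # })
    /// ```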
    pub async unsafe fn copy_to_async(
        &self,
        other: &mut HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_to_async(other.inner_mut(), stream.inner())).await
    }

    /// Fills the entire buffer with the given byte.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g8fdcc53996ff49c570f4b5ead0256ef0)
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `value` - Byte value to fill buffer with.
    /// * `stream` - Stream to use.
    pub async fn fill_with_byte(&mut self, value: u8, stream: &Stream) -> Result<()> {
        Future::new(move || self.inner.fill_with_byte(value, stream.inner())).await
    }

    /// Get 2D buffer width.
    #[inline(always)]
    pub fn width(&self) -> usize {
        self.inner.width
    }

    /// Get 2D buffer height.
    #[inline(always)]
    pub fn height(&self) -> usize {
        self.inner.height
    }

    /// Get 2D buffer number of channels.
    #[inline(always)]
    pub fn num_channels(&self) -> usize {
        self.inner.num_channels
    }

    /// Get the total number of elements in buffer.
    ///
    /// This is equal to: `width` times `height` times `num_channels`.
    #[inline(always)]
    pub fn num_elements(&self) -> usize {
        self.inner.num_elements()
    }

    /// Access the inner synchronous implementation of [`DeviceBuffer2D`].
    #[inline(always)]
    pub fn inner(&self) -> &ffi::memory::DeviceBuffer2D<T> {
        &self.inner
    }

    /// Access the inner synchronous implementation of [`DeviceBuffer2D`].
    #[inline(always)]
    pub fn inner_mut(&mut self) -> &mut ffi::memory::DeviceBuffer2D<T> {
        &mut self.inner
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_new() {
        let buffer = DeviceBuffer2D::<u8>::new(120, 80, 3).await;
        assert_eq!(buffer.width(), 120);
        assert_eq!(buffer.height(), 80);
        assert_eq!(buffer.num_channels(), 3);
        assert_eq!(buffer.num_elements(), 120 * 80 * 3);
        assert!(buffer.inner().pitch >= 360);
    }

    #[tokio::test]
    async fn test_copy() {
        let stream = Stream::new().await.unwrap();
        let all_ones = vec![1_u32; 150];
        let host_buffer_all_ones = HostBuffer::from_slice(all_ones.as_slice()).await;

        let mut device_buffer = DeviceBuffer2D::<u32>::new(10, 5, 3).await;
        unsafe {
            device_buffer
                .copy_from_async(&host_buffer_all_ones, &stream)
                .await
                .unwrap();
        }

        let mut host_buffer = HostBuffer::<u32>::new(150).await;
        unsafe {
            device_buffer
                .copy_to_async(&mut host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut another_device_buffer = DeviceBuffer2D::<u32>::new(10, 5, 3).await;
        unsafe {
            another_device_buffer
                .copy_from_async(&host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut return_host_buffer = HostBuffer::<u32>::new(150).await;
        unsafe {
            another_device_buffer
                .copy_to_async(&mut return_host_buffer, &stream)
                .await
                .unwrap();
        }

        stream.synchronize().await.unwrap();

        assert_eq!(return_host_buffer.num_elements(), 150);
        let return_data = return_host_buffer.to_vec();
        assert_eq!(return_data.len(), 150);
        assert!(return_data.into_iter().all(|v| v == 1_u32));
    }

    #[tokio::test]
    async fn test_copy_2d() {
        let stream = Stream::new().await.unwrap();
        let image: [u8; 12] = [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4];
        let host_buffer = HostBuffer::from_slice(&image).await;
        let mut device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3).await;
        unsafe {
            device_buffer
                .copy_from_async(&host_buffer, &stream)
                .await
                .unwrap();
        }
        let mut return_host_buffer = HostBuffer::<u8>::new(12).await;
        unsafe {
            device_buffer
                .copy_to_async(&mut return_host_buffer, &stream)
                .await
                .unwrap();
        }
        stream.synchronize().await.unwrap();
        assert_eq!(
            &return_host_buffer.to_vec(),
            &[1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
        );
    }

    #[tokio::test]
    async fn test_fill_with_byte() {
        let stream = Stream::new().await.unwrap();
        let mut device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3).await;
        let mut host_buffer = HostBuffer::<u8>::new(2 * 2 * 3).await;
        device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
        device_buffer
            .copy_to(&mut host_buffer, &stream)
            .await
            .unwrap();
        assert_eq!(
            host_buffer.to_vec(),
            &[0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab]
        );
    }

    #[tokio::test]
    #[should_panic]
    async fn test_it_panics_when_copying_invalid_size() {
        let stream = Stream::new().await.unwrap();
        let device_buffer = DeviceBuffer2D::<u8>::new(5, 5, 3).await;
        let mut host_buffer = HostBuffer::<u8>::new(80).await;
        let _ = unsafe { device_buffer.copy_to_async(&mut host_buffer, &stream).await };
    }
}
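
// What follows is an editorial sketch, not part of the original test suite: it exercises
// `from_array` under the optional `ndarray` feature, assuming `ndarray` and `tokio` are
// available when that feature and the test profile are enabled.
#[cfg(all(test, feature = "ndarray"))]
mod ndarray_tests {
    use super::*;

    #[tokio::test]
    async fn test_from_array() {
        let stream = Stream::new().await.unwrap();
        // The first axis maps to height, the second to width, the third to channels.
        let array = ndarray::Array3::from_elem((4, 2, 3), 1_u8);
        let device_buffer = DeviceBuffer2D::from_array(&array.view(), &stream)
            .await
            .unwrap();
        assert_eq!(device_buffer.width(), 2);
        assert_eq!(device_buffer.height(), 4);
        assert_eq!(device_buffer.num_channels(), 3);
    }
}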
--------------------------------------------------------------------------------