├── src ├── npp │ ├── tests │ │ ├── sync │ │ │ ├── mod.rs │ │ │ └── memory.rs │ │ ├── mod.rs │ │ ├── memory.rs │ │ └── image.rs │ ├── mod.rs │ ├── error.rs │ ├── constant_border.rs │ ├── stream.rs │ ├── copy_constant_border.rs │ ├── region.rs │ ├── resize.rs │ ├── remap.rs │ └── resize_batch.rs ├── ffi │ ├── includes.rs │ ├── memory │ │ ├── mod.rs │ │ ├── host.rs │ │ ├── device.rs │ │ └── device2d.rs │ ├── npp │ │ ├── includes.rs │ │ ├── mod.rs │ │ ├── copy_constant_border.rs │ │ ├── context.rs │ │ ├── resize.rs │ │ ├── remap.rs │ │ └── resize_batch.rs │ ├── mod.rs │ ├── error.rs │ ├── device.rs │ ├── ptr.rs │ └── stream.rs ├── memory │ ├── mod.rs │ ├── host.rs │ ├── device.rs │ └── device2d.rs ├── runtime │ ├── mod.rs │ ├── work.rs │ ├── future.rs │ ├── thread_local.rs │ └── execution.rs ├── lib.rs ├── error.rs ├── stream.rs └── device.rs ├── .github └── workflows │ └── ci.yaml ├── Cargo.toml ├── LICENSE-MIT ├── tests └── functions_side_effects_test.rs ├── README.md └── LICENSE-APACHE /src/npp/tests/sync/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod memory; 2 | -------------------------------------------------------------------------------- /src/npp/tests/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod image; 2 | pub mod memory; 3 | pub mod sync; 4 | -------------------------------------------------------------------------------- /src/ffi/includes.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | cpp! {{ 4 | #include <cstdint> 5 | }} 6 | 7 | cpp! {{ 8 | #include <cuda_runtime.h> 9 | }} 10 | -------------------------------------------------------------------------------- /src/ffi/memory/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod device; 2 | pub mod device2d; 3 | pub mod host; 4 | 5 | pub use device::DeviceBuffer; 6 | pub use device2d::DeviceBuffer2D; 7 | pub use host::HostBuffer; 8 | -------------------------------------------------------------------------------- /src/memory/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod device; 2 | pub mod device2d; 3 | pub mod host; 4 | 5 | pub use device::DeviceBuffer; 6 | pub use device2d::DeviceBuffer2D; 7 | pub use host::HostBuffer; 8 | -------------------------------------------------------------------------------- /src/runtime/mod.rs: -------------------------------------------------------------------------------- 1 | mod execution; 2 | mod future; 3 | mod thread_local; 4 | mod work; 5 | 6 | pub use future::{Future, SynchronizeFuture}; 7 | pub use thread_local::enqueue_decoupled; 8 | -------------------------------------------------------------------------------- /src/ffi/npp/includes.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | cpp! {{ 4 | #include <nppcore.h> 5 | }} 6 | 7 | cpp! {{ 8 | #include <nppdefs.h> 9 | }} 10 | 11 | cpp!
{{ 12 | #include <nppi_data_exchange_and_initialization.h> 13 | #include <nppi_geometry_transforms.h> 14 | }} 15 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![recursion_limit = "256"] 2 | 3 | pub mod device; 4 | pub mod error; 5 | pub mod ffi; 6 | pub mod memory; 7 | pub mod runtime; 8 | pub mod stream; 9 | 10 | #[cfg(feature = "npp")] 11 | pub mod npp; 12 | 13 | pub use device::{num_devices, Device, DeviceId, MemoryInfo}; 14 | pub use memory::{DeviceBuffer, DeviceBuffer2D, HostBuffer}; 15 | pub use stream::Stream; 16 | 17 | pub use error::Error; 18 | -------------------------------------------------------------------------------- /src/npp/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod constant_border; 2 | pub mod copy_constant_border; 3 | pub mod error; 4 | pub mod region; 5 | pub mod remap; 6 | pub mod resize; 7 | pub mod stream; 8 | 9 | #[cfg(feature = "npp-unstable")] 10 | pub mod resize_batch; 11 | 12 | pub use constant_border::ConstantBorder; 13 | pub use copy_constant_border::copy_constant_border; 14 | pub use error::Error; 15 | pub use region::Region; 16 | pub use remap::remap; 17 | pub use resize::resize; 18 | pub use stream::Stream; 19 | 20 | #[cfg(feature = "npp-unstable")] 21 | pub use resize_batch::resize_batch; 22 | 23 | #[cfg(test)] 24 | pub mod tests; 25 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | 14 | lints: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v3 20 | 21 | - name: Setup Rust 22 | uses: dtolnay/rust-toolchain@stable 23 | with: 24 | toolchain: stable 25 | components: rustfmt, clippy 26 | 27 | - name: Rustfmt 28 | run: cargo fmt --all -- --check 29 | 30 | # - name: Clippy 31 | # run: cargo clippy --locked --all --all-features -- -D warnings 32 | -------------------------------------------------------------------------------- /src/npp/error.rs: -------------------------------------------------------------------------------- 1 | /// An error that occurred in NPP. 2 | #[derive(Debug, Clone)] 3 | pub enum Error { 4 | /// Error code as reported by NPP. 5 | /// 6 | /// [NPP documentation](https://docs.nvidia.com/cuda/npp/group__typedefs__npp.html#ga1105a17b5e76381583c46ecd6a60fe21) 7 | Npp(i32), 8 | /// Error in CUDA backend. 9 | /// 10 | /// Refer to [`crate::Error`].
11 | Cuda(crate::Error), 12 | } 13 | 14 | impl std::fmt::Display for Error { 15 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 16 | match self { 17 | Error::Cuda(err) => write!(f, "{err}"), 18 | Error::Npp(error_code) => write!(f, "error code produced by NPP: {error_code}"), 19 | } 20 | } 21 | } 22 | 23 | impl std::error::Error for Error {} 24 | 25 | impl From<crate::Error> for Error { 26 | #[inline] 27 | fn from(err: crate::Error) -> Self { 28 | Error::Cuda(err) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/ffi/mod.rs: -------------------------------------------------------------------------------- 1 | mod includes; 2 | 3 | pub mod device; 4 | pub mod error; 5 | pub mod memory; 6 | pub mod ptr; 7 | pub mod stream; 8 | 9 | #[cfg(feature = "npp")] 10 | pub mod npp; 11 | 12 | /// Convenience macro for turning a CUDA error code into a `std::result::Result`. 13 | /// 14 | /// # Usage 15 | /// 16 | /// There are two possible uses of the macro: 17 | /// 18 | /// (1) Shorthand to return `Ok(something)` or a CUDA error: 19 | /// 20 | /// ```ignore 21 | /// result!(code, return_value); 22 | /// ``` 23 | /// 24 | /// (2) Shorthand to return `Ok(())` or a CUDA error: 25 | /// 26 | /// ```ignore 27 | /// result!(code) 28 | /// ``` 29 | macro_rules! result { 30 | ($code:expr, $ok:expr) => { 31 | if $code == 0 { 32 | Ok($ok) 33 | } else { 34 | Err($crate::error::Error::Cuda($code)) 35 | } 36 | }; 37 | ($code:expr) => { 38 | result!($code, ()) 39 | }; 40 | } 41 | 42 | use result; 43 | -------------------------------------------------------------------------------- /src/ffi/npp/mod.rs: -------------------------------------------------------------------------------- 1 | mod includes; 2 | 3 | pub mod context; 4 | pub mod copy_constant_border; 5 | pub mod remap; 6 | pub mod resize; 7 | 8 | #[cfg(feature = "npp-unstable")] 9 | pub mod resize_batch; 10 | 11 | /// Convenience macro for turning an NPP error code into a `std::result::Result`. 12 | /// 13 | /// # Usage 14 | /// 15 | /// There are two possible uses of the macro: 16 | /// 17 | /// (1) Shorthand to return `Ok(something)` or an NPP error: 18 | /// 19 | /// ```ignore 20 | /// result!(code, return_value); 21 | /// ``` 22 | /// 23 | /// (2) Shorthand to return `Ok(())` or an NPP error: 24 | /// 25 | /// ```ignore 26 | /// result!(code) 27 | /// ``` 28 | macro_rules! result { 29 | ($code:expr, $ok:expr) => { 30 | if $code == 0 { 31 | Ok($ok) 32 | } else { 33 | Err($crate::npp::error::Error::Npp($code)) 34 | } 35 | }; 36 | ($code:expr) => { 37 | result!($code, ()) 38 | }; 39 | } 40 | 41 | use result; 42 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "async-cuda" 3 | description = "Async CUDA for Rust."
4 | keywords = ["async", "nvidia", "cuda", "gpu", "npp"] 5 | readme = "README.md" 6 | categories = ["asynchronous"] 7 | edition = "2021" 8 | version = "0.6.1" 9 | authors = ["Oddity.ai Developers <hello@oddity.ai>"] 10 | repository = "https://github.com/oddity-ai/async-cuda" 11 | license = "MIT OR Apache-2.0" 12 | 13 | [dependencies] 14 | cpp = "0.5" 15 | ndarray = { version = "0.16", optional = true } 16 | once_cell = "1.17" 17 | 18 | [dev-dependencies] 19 | futures = { version = "0.3", default-features = false, features = ["std"] } 20 | tokio = { version = "1", default-features = false, features = [ 21 | "macros", 22 | "test-util", 23 | "time", 24 | ] } 25 | tokio-test = { version = "0.4" } 26 | 27 | [build-dependencies] 28 | cpp_build = "0.5" 29 | 30 | [features] 31 | npp = [] 32 | npp-unstable = [] 33 | 34 | [package.metadata.docs.rs] 35 | rustc-args = ["--cfg", "feature=\"docs-only\""] 36 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::error::error_description; 2 | 3 | /// An error that occurred during a CUDA operation. 4 | #[derive(Debug, Clone)] 5 | pub enum Error { 6 | /// Error code as reported by the CUDA backend. 7 | /// 8 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html) 9 | Cuda(i32), 10 | /// The runtime backend unexpectedly broke down. This is usually irrecoverable because the 11 | /// entire crate assumes that all backend execution will happen on the runtime thread.
12 | Runtime, 13 | } 14 | 15 | impl std::fmt::Display for Error { 16 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 17 | match self { 18 | Error::Cuda(code) => { 19 | let error_code = *code; 20 | let error_description = error_description(error_code); 21 | write!( 22 | f, 23 | "CUDA error ({}): {}", 24 | error_code, 25 | error_description.as_str(), 26 | ) 27 | } 28 | Error::Runtime => write!(f, "CUDA runtime broken"), 29 | } 30 | } 31 | } 32 | 33 | impl std::error::Error for Error {} 34 | -------------------------------------------------------------------------------- /src/npp/tests/memory.rs: -------------------------------------------------------------------------------- 1 | /// Convenience macro for testing to take a memory slice and put it on the device and return the 2 | /// [`crate::memory::DeviceBuffer2D`] that refers to it. 3 | macro_rules! to_device_2d { 4 | ($slice:expr, $width:expr, $height:expr, $num_channels:expr, $stream:expr) => {{ 5 | let host_buffer = crate::memory::HostBuffer::from_slice($slice).await; 6 | let mut device_buffer = 7 | crate::memory::DeviceBuffer2D::new($width, $height, $num_channels).await; 8 | device_buffer 9 | .copy_from(&host_buffer, $stream) 10 | .await 11 | .unwrap(); 12 | device_buffer 13 | }}; 14 | } 15 | 16 | /// Convenience macro for testing to take a [`crate::memory::DeviceBuffer2D`] and copy it back to 17 | /// the host, then return a [`Vec`] of that memory. 18 | macro_rules! to_host_2d { 19 | ($device_buffer:expr, $stream:expr) => {{ 20 | let mut host_buffer = crate::memory::HostBuffer::new($device_buffer.num_elements()).await; 21 | $device_buffer 22 | .copy_to(&mut host_buffer, $stream) 23 | .await 24 | .unwrap(); 25 | host_buffer.to_vec() 26 | }}; 27 | } 28 | 29 | pub(crate) use to_device_2d; 30 | pub(crate) use to_host_2d; 31 | -------------------------------------------------------------------------------- /src/ffi/error.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | /// Returns the description string for an error code. 4 | /// 5 | /// Note that this function is not executed on the runtime thread, since it is purely a utility 6 | /// function and should have no side-effects with regards to CUDA devices. 7 | /// 8 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html#group__CUDART__ERROR_1g4bc9e35a618dfd0877c29c8ee45148f1) 9 | /// 10 | /// # Arguments 11 | /// 12 | /// * `error_code` - CUDA error code. 13 | pub fn error_description(error_code: i32) -> String { 14 | let error_description = cpp!(unsafe [ 15 | error_code as "std::int32_t" 16 | ] -> *const std::ffi::c_char as "const char*" { 17 | return cudaGetErrorString(static_cast<cudaError_t>(error_code)); 18 | }); 19 | // SAFETY: The pointer returned by `cudaGetErrorString` actually has a static lifetime so this 20 | // is safe for sure. We even copy inside the unsafe block so we just need it to remain for a 21 | // little bit. 22 | unsafe { 23 | std::ffi::CStr::from_ptr(error_description) 24 | .to_string_lossy() 25 | .to_string() 26 | } 27 | } 28 | 29 | #[cfg(test)] 30 | mod tests { 31 | use super::*; 32 | 33 | #[test] 34 | fn test_correct_description() { 35 | assert_eq!(error_description(1), "invalid argument"); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/runtime/work.rs: -------------------------------------------------------------------------------- 1 | /// Represents a unit of work passed to the runtime.
Holds a closure inside. 2 | /// 3 | /// The closure is explicitly [`Send`] because it will be sent over the thread boundary to be 4 | /// executed in the runtime thread. For the same reason, the closure must be `'static`. 5 | /// 6 | /// # Usage 7 | /// 8 | /// ```ignore 9 | /// let work = Work::new(|| { 10 | /// // ... 11 | /// }); 12 | /// work.run(); 13 | /// ``` 14 | pub struct Work(Box<dyn FnOnce() + Send + 'static>); 15 | 16 | impl Work { 17 | /// Create a new work item. 18 | /// 19 | /// # Arguments 20 | /// 21 | /// * `f` - Closure to execute. 22 | pub fn new(f: impl FnOnce() + Send + 'static) -> Self { 23 | Work(Box::new(f)) 24 | } 25 | 26 | /// Execute work. 27 | pub fn run(self) { 28 | let Work(f) = self; 29 | f(); 30 | } 31 | } 32 | 33 | #[cfg(test)] 34 | mod tests { 35 | use super::*; 36 | 37 | #[test] 38 | fn test_it_runs() { 39 | let make_me_true = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); 40 | let work = Work::new({ 41 | let make_me_true = make_me_true.clone(); 42 | move || { 43 | make_me_true.store(true, std::sync::atomic::Ordering::SeqCst); 44 | } 45 | }); 46 | work.run(); 47 | assert!(make_me_true.load(std::sync::atomic::Ordering::SeqCst)); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/npp/tests/sync/memory.rs: -------------------------------------------------------------------------------- 1 | /// Convenience macro for testing to take a memory slice and put it on the device and return the 2 | /// [`crate::ffi::memory::DeviceBuffer2D`] that refers to it. 3 | macro_rules! to_device_2d { 4 | ($slice:expr, $width:expr, $height:expr, $num_channels:expr, $context:expr) => {{ 5 | let host_buffer = crate::ffi::memory::HostBuffer::from_slice($slice); 6 | let mut device_buffer = 7 | crate::ffi::memory::DeviceBuffer2D::new($width, $height, $num_channels); 8 | // SAFETY: Stream is synchronized right after this. 9 | unsafe { 10 | device_buffer 11 | .copy_from_async(&host_buffer, &$context.stream.inner()) 12 | .unwrap(); 13 | } 14 | $context.stream.inner().synchronize().unwrap(); 15 | device_buffer 16 | }}; 17 | } 18 | 19 | /// Convenience macro for testing to take a [`crate::ffi::memory::DeviceBuffer2D`] and copy it back 20 | /// to the host, then return a [`Vec`] of that memory. 21 | macro_rules! to_host_2d { 22 | ($device_buffer:expr, $context:expr) => {{ 23 | let mut host_buffer = crate::ffi::memory::HostBuffer::new($device_buffer.num_elements()); 24 | // SAFETY: Stream is synchronized right after this. 25 | unsafe { 26 | $device_buffer 27 | .copy_to_async(&mut host_buffer, &$context.stream.inner()) 28 | .unwrap(); 29 | } 30 | $context.stream.inner().synchronize().unwrap(); 31 | host_buffer.to_vec() 32 | }}; 33 | } 34 | 35 | pub(crate) use to_device_2d; 36 | pub(crate) use to_host_2d; 37 | -------------------------------------------------------------------------------- /src/npp/tests/image.rs: -------------------------------------------------------------------------------- 1 | pub type Pixel = [u8; 3]; 2 | pub type Image2x2 = [[Pixel; 2]; 2]; 3 | pub type Image4x4 = [[Pixel; 4]; 4]; 4 | 5 | pub const R: Pixel = [255_u8, 0_u8, 0_u8]; 6 | pub const G: Pixel = [0_u8, 255_u8, 0_u8]; 7 | pub const B: Pixel = [0_u8, 0_u8, 255_u8]; 8 | 9 | /// This is a 4 by 4 testing image that represents the hypothetical RGB flag, which looks something 10 | /// like this: 11 | /// 12 | /// ```text 13 | /// RR RR RR RR 14 | /// RR GG GG RR 15 | /// RR BB BB RR 16 | /// RR RR RR RR
17 | /// ``` 18 | /// 19 | /// Where `RR` represents a red pixel, `GG` a green one and `BB` a blue one. (It consists of a 20 | /// green and a blue two-pixel band, wrapped in a one-pixel red border.) 21 | pub const RGB_FLAG_RAW: Image4x4 = [ 22 | [R, R, R, R], // Red border 23 | [R, G, G, R], // Green band with red border 24 | [R, B, B, R], // Blue band with red border 25 | [R, R, R, R], // Red border 26 | ]; 27 | 28 | /// This is the [`RGB_FLAG_RAW`] image with contiguous memory layout so that it can be easily put 29 | /// into a host or device buffer. 30 | pub const RGB_FLAG: [u8; 4 * 4 * 3] = flatten!(RGB_FLAG_RAW, 4 * 4 * 3); 31 | 32 | /// Convenience macro to flatten a nested array to a flat array. 33 | /// 34 | /// # Usage 35 | /// 36 | /// ```ignore 37 | /// let array = [ 38 | ///     [1, 2, 3], 39 | ///     [4, 5, 6], 40 | ///     [7, 8, 9], 41 | /// ]; 42 | /// assert_eq!( 43 | ///     &flatten!(array, 9), 44 | ///     &[1, 2, 3, 4, 5, 6, 7, 8, 9], 45 | /// ); 46 | /// ``` 47 | macro_rules! flatten { 48 | ($array:expr, $size:expr) => { 49 | unsafe { std::mem::transmute::<_, [_; $size]>($array) } 50 | }; 51 | } 52 | 53 | pub(crate) use flatten; 54 | -------------------------------------------------------------------------------- /src/npp/constant_border.rs: -------------------------------------------------------------------------------- 1 | /// Represents a constant border around an image. 2 | /// 3 | /// This is used to specify the border around an image when copying a constant border around it for 4 | /// the purposes of letterbox resizing. 5 | #[derive(Debug, Clone, PartialEq)] 6 | pub struct ConstantBorder { 7 | pub left: u32, 8 | pub top: u32, 9 | pub color: [u8; 3], 10 | } 11 | 12 | impl ConstantBorder { 13 | /// New constant border. 14 | /// 15 | /// # Arguments 16 | /// 17 | /// * `left` - Size of border on the left and right sides of the image in number of pixels. 18 | /// * `top` - Size of border on the top and bottom sides of the image in number of pixels. 19 | /// * `color` - Color of border (RGB). 20 | pub fn new(left: u32, top: u32, color: [u8; 3]) -> Self { 21 | Self { left, top, color } 22 | } 23 | 24 | /// New constant border with white color. 25 | /// 26 | /// # Arguments 27 | /// 28 | /// * `left` - Size of border on the left and right sides of the image in number of pixels. 29 | /// * `top` - Size of border on the top and bottom sides of the image in number of pixels. 30 | pub fn white(left: u32, top: u32) -> Self { 31 | Self::new(left, top, [255, 255, 255]) 32 | } 33 | 34 | /// New constant border with black color. 35 | /// 36 | /// # Arguments 37 | /// 38 | /// * `left` - Size of border on the left and right sides of the image in number of pixels. 39 | /// * `top` - Size of border on the top and bottom sides of the image in number of pixels.
40 | pub fn black(left: u32, top: u32) -> Self { 41 | Self::new(left, top, [0, 0, 0]) 42 | } 43 | } 44 | 45 | #[cfg(test)] 46 | mod tests { 47 | use super::*; 48 | 49 | #[test] 50 | fn test_new() { 51 | let border = ConstantBorder::new(1, 2, [3, 4, 5]); 52 | assert_eq!(border.left, 1); 53 | assert_eq!(border.top, 2); 54 | assert_eq!(border.color, [3, 4, 5]); 55 | } 56 | 57 | #[test] 58 | fn test_white() { 59 | let border = ConstantBorder::white(1, 2); 60 | assert_eq!(border.left, 1); 61 | assert_eq!(border.top, 2); 62 | assert_eq!(border.color, [255, 255, 255]); 63 | } 64 | 65 | #[test] 66 | fn test_black() { 67 | let border = ConstantBorder::black(1, 2); 68 | assert_eq!(border.left, 1); 69 | assert_eq!(border.top, 2); 70 | assert_eq!(border.color, [0, 0, 0]); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /tests/functions_side_effects_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "npp")] 2 | use async_cuda::ffi::device::Device; 3 | #[cfg(feature = "npp")] 4 | use async_cuda::stream::Stream; 5 | 6 | #[cfg(feature = "npp")] 7 | use async_cuda::ffi::npp::context::Context; 8 | 9 | /// This integration test helps determine which ffi functions affect the GPU state, or local thread 10 | /// state. 11 | /// 12 | /// This information is important to determine which functions need to be executed on the runtime 13 | /// thread, and which functions can be executed directly by the caller (and don't need to be async). 14 | /// 15 | /// We only test functions where it is not immediately apparent whether or not the function has 16 | /// side-effects. All wrappers for NPP operations aren't tested since it is evident that they affect 17 | /// the GPU state. 18 | /// 19 | /// # Find GPU side-effects 20 | /// 21 | /// Run this integration test under the Nsight profiler with the following command: 22 | /// 23 | /// ```bash 24 | /// nsys profile --output /tmp/side_effects_trace --force-overwrite true cargo test --release --test functions_side_effects_test 25 | /// ``` 26 | /// 27 | /// Use the `nsys-ui` utility to inspect the report produced in `/tmp/side_effects_trace.qdstrm` and 28 | /// determine for each function call if one or more CUDA API functions were invoked, and if the GPU 29 | /// was affected in any way. Function calls are separated by device synchronization markers in the 30 | /// trace. 31 | /// 32 | /// # Find thread-local side-effects 33 | /// 34 | /// These need to be inferred from documentation or usage (or an educated guess). 35 | /// 36 | /// # Results 37 | /// 38 | /// | Function | Side-effect: GPU | Side-effect: thread-local | 39 | /// | ----------------------------- | ---------------- | ------------------------- | 40 | /// | `Context::from_null_stream` | ❌ | ✅ | 41 | /// | `Context::from_stream` | ❌ | ✅ | 42 | #[cfg(feature = "npp")] 43 | #[tokio::test] 44 | async fn test_side_effects() { 45 | // First block contains stuff we are not interested in measuring... 46 | let stream = Stream::new().await.unwrap(); 47 | 48 | // A sequence of CUDA calls that is easy to find in the trace.
49 | Device::synchronize().unwrap(); 50 | let _mem_info_1 = Device::memory_info().unwrap(); 51 | let _mem_info_2 = Device::memory_info().unwrap(); 52 | let _mem_info_3 = Device::memory_info().unwrap(); 53 | let _mem_info_4 = Device::memory_info().unwrap(); 54 | Device::synchronize().unwrap(); 55 | 56 | let _context_null = Context::from_null_stream(); 57 | Device::synchronize().unwrap(); 58 | 59 | let _context_new = Context::from_stream(stream); 60 | Device::synchronize().unwrap(); 61 | } 62 | -------------------------------------------------------------------------------- /src/stream.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi; 2 | use crate::runtime::{Future, SynchronizeFuture}; 3 | 4 | type Result<T> = std::result::Result<T, crate::error::Error>; 5 | 6 | /// CUDA stream. 7 | pub struct Stream { 8 | inner: ffi::stream::Stream, 9 | } 10 | 11 | impl Stream { 12 | /// Create a [`Stream`] object that represents the default stream, also known as the null stream. 13 | /// 14 | /// Refer to the [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html) 15 | /// for more information regarding the default ("null") stream. 16 | /// 17 | /// # Prefer owned streams 18 | /// 19 | /// It is recommended to use owned streams as much as possible, for two reasons: 20 | /// 21 | /// * Using streams to separate semantically unrelated streams of operations allows the GPU to 22 | /// overlap operations and improves parallelism. 23 | /// * Using the default stream can incur implicit synchronization, even on other streams, which 24 | /// causes their performance to degrade. 25 | /// 26 | /// Note that it is not enforced that there is only one [`Stream`] object that represents the 27 | /// default stream. This is safe because all operations are serialized anyway. 28 | pub fn null() -> Self { 29 | Self { 30 | inner: ffi::stream::Stream::null(), 31 | } 32 | } 33 | 34 | /// Create an asynchronous stream. 35 | /// 36 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g6a3c4b819e6a994c26d0c4824a4c80da) 37 | pub async fn new() -> Result<Self> { 38 | let inner = Future::new(ffi::stream::Stream::new).await?; 39 | Ok(Self { inner }) 40 | } 41 | 42 | /// Synchronize stream. This future will only return once all currently enqueued work on the 43 | /// stream is done. 44 | /// 45 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g74aa9f4b1c2f12d994bf13876a5a2498) 46 | /// 47 | /// # Behavior 48 | /// 49 | /// In contrast to most of the API, this future does not become ready eagerly. Instead, a 50 | /// callback is pushed onto the given stream that will be invoked to make the future ready once 51 | /// all work on the stream that was previously queued asynchronously is completed. 52 | /// 53 | /// Internally, the future uses `cudaStreamAddCallback` to schedule the callback on the stream. 54 | pub async fn synchronize(&self) -> Result<()> { 55 | SynchronizeFuture::new(self).await 56 | } 57 | 58 | /// Access the inner synchronous implementation of [`Stream`].
59 | #[inline(always)] 60 | pub fn inner(&self) -> &ffi::stream::Stream { 61 | &self.inner 62 | } 63 | } 64 | 65 | #[cfg(test)] 66 | mod tests { 67 | use super::*; 68 | 69 | #[tokio::test] 70 | async fn test_new() { 71 | assert!(Stream::new().await.is_ok()); 72 | } 73 | 74 | #[tokio::test] 75 | async fn test_synchronize() { 76 | let stream = Stream::new().await.unwrap(); 77 | assert!(stream.synchronize().await.is_ok()); 78 | } 79 | 80 | #[tokio::test] 81 | async fn test_synchronize_null_stream() { 82 | let stream = Stream::null(); 83 | assert!(stream.synchronize().await.is_ok()); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
# async-cuda 2 | 3 | Asynchronous CUDA for Rust. 4 | 5 |
6 | 7 | [![version](https://img.shields.io/crates/v/async-cuda)](https://crates.io/crates/async-cuda) 8 | [![license](https://img.shields.io/crates/l/async-cuda)](#license) 9 | [![docs](https://img.shields.io/docsrs/async-cuda)](https://docs.rs/async-cuda) 10 | 11 |
12 | 13 | ## ℹ️ Introduction 14 | 15 | `async-cuda` is an experimental library for interacting with the GPU asynchronously. Since the GPU 16 | is just another I/O device (from the point of view of your program), the async model actually fits 17 | surprisingly well. The way it is implemented in `async-cuda` is that all operations are scheduled on 18 | a single runtime thread that drives the GPU. The interface of this library enforces that 19 | synchronization happens when it is necessary (and synchronization itself is also asynchronous). 20 | 21 | On top of common CUDA primitives, this library also includes async wrappers for 22 | [NVIDIA's NPP library](https://developer.nvidia.com/npp). 23 | 24 | The async wrappers for TensorRT have been moved to a separate repository here: 25 | [`async-tensorrt`](https://github.com/oddity-ai/async-tensorrt). 26 | 27 | ## 🛠 Status 28 | 29 | This project is still a work-in-progress, and will contain bugs. Some parts of the API have not 30 | been fleshed out yet. Use with caution. 31 | 32 | ## 📦 Setup 33 | 34 | Make sure you have the necessary dependencies installed: 35 | 36 | * CUDA toolkit 11 or later. 37 | 38 | Then, add the following to your dependencies in `Cargo.toml`: 39 | 40 | ```toml 41 | async-cuda = "0.6" 42 | ``` 43 | 44 | To enable the NPP functions: 45 | 46 | ```toml 47 | async-cuda = { version = "0.6", features = ["npp"] } 48 | ```
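To get a feel for the API, here is a minimal sketch that round-trips an image through device memory. It is not taken from the crate's own examples: the buffer shape and contents are made up for illustration, and a Tokio runtime is assumed.

```rust
use async_cuda::{DeviceBuffer2D, HostBuffer, Stream};

#[tokio::main]
async fn main() -> Result<(), async_cuda::Error> {
    let stream = Stream::new().await?;

    // A made-up 4x4 RGB image in host memory.
    let image = [128_u8; 4 * 4 * 3];
    let host_input = HostBuffer::from_slice(&image).await;
    let mut host_output = HostBuffer::<u8>::new(4 * 4 * 3).await;

    // Copy the image to the device and back again. Both copies are
    // stream-ordered, so a single synchronize at the end suffices.
    let mut device_image = DeviceBuffer2D::<u8>::new(4, 4, 3).await;
    device_image.copy_from(&host_input, &stream).await?;
    device_image.copy_to(&mut host_output, &stream).await?;
    stream.synchronize().await?;

    assert_eq!(host_output.to_vec(), image);
    Ok(())
}
```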
49 | 50 | ## ⚠️ Safety warning 51 | 52 | This crate is **intentionally unsafe**. Due to the limitations of how async Rust currently works, 53 | usage of the async interface of this crate can cause undefined behavior in some rare cases. It is up 54 | to the user of this crate to prevent this from happening by following these rules: 55 | 56 | * No futures produced by functions in this crate may be leaked (either by `std::mem::forget` or 57 | otherwise). 58 | * Use a well-behaved runtime (one that will not forget your future) like Tokio or async-std. 59 | 60 | Internally, the `Future` type in this crate schedules a CUDA call on a separate runtime thread. To 61 | make the API as ergonomic as possible, the lifetime bounds of the closure (that is sent to the 62 | runtime) are tied to the future object. To enforce this bound, the future will block and wait if it 63 | is dropped. This mechanism relies on the future being driven to completion, and not forgotten. This 64 | is not necessarily guaranteed. Unsafety may arise if either the runtime gives up on or forgets the 65 | future, or the caller manually polls the future, then forgets it. 66 | 67 | ## License 68 | 69 | Licensed under either of 70 | 71 | * Apache License, Version 2.0 72 | ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 73 | * MIT license 74 | ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 75 | 76 | at your option. 77 | 78 | ## Contribution 79 | 80 | Unless you explicitly state otherwise, any contribution intentionally submitted 81 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be 82 | dual licensed as above, without any additional terms or conditions. 83 | -------------------------------------------------------------------------------- /src/runtime/thread_local.rs: -------------------------------------------------------------------------------- 1 | use std::sync::mpsc::Sender; 2 | 3 | use once_cell::sync::Lazy; 4 | 5 | use crate::error::Error; 6 | use crate::runtime::execution::RUNTIME; 7 | use crate::runtime::work::Work; 8 | 9 | thread_local! { 10 | /// Thread-local runtime delegate. 11 | /// 12 | /// This object serves as the per-thread reference to the [`RUNTIME`] that can be used to 13 | /// enqueue work on the runtime thread. 14 | /// 15 | /// # Usage 16 | /// 17 | /// ```ignore 18 | /// assert!( 19 | /// RUNTIME_THREAD_LOCAL.with(|runtime| 20 | /// runtime.enqueue(Work::new(|| ())) 21 | /// ).is_ok() 22 | /// ) 23 | /// ``` 24 | pub(super) static RUNTIME_THREAD_LOCAL: Lazy<RuntimeThreadLocal> = Lazy::new(|| { 25 | RUNTIME.lock().unwrap().thread_local() 26 | }); 27 | } 28 | 29 | /// Per-thread delegate for global runtime. 30 | pub struct RuntimeThreadLocal(Sender<Work>); 31 | 32 | impl RuntimeThreadLocal { 33 | /// Initialize [`RuntimeThreadLocal`] from [`Sender`] that allows the delegate to send work to 34 | /// the actual [`crate::runtime::execution::Runtime`]. 35 | /// 36 | /// # Arguments 37 | /// 38 | /// * `sender` - Sender through which work can be sent to runtime. 39 | pub(super) fn from_sender(sender: Sender<Work>) -> Self { 40 | RuntimeThreadLocal(sender) 41 | } 42 | 43 | /// Enqueue work on runtime. 44 | /// 45 | /// # Arguments 46 | /// 47 | /// * `function` - Unit of work in function closure to enqueue. 48 | pub(super) fn enqueue(&self, function: Work) -> Result<(), Error> { 49 | self.0.send(function).map_err(|_| Error::Runtime) 50 | } 51 | } 52 | 53 | /// Enqueue work on the runtime without caring about the return value. This is useful in situations 54 | /// where work must be performed but the result does not matter. For example, when destroying a CUDA 55 | /// object as part of dropping an object. 56 | /// 57 | /// # Arguments 58 | /// 59 | /// * `f` - Function closure to execute on runtime. 60 | /// 61 | /// # Example 62 | /// 63 | /// ```ignore 64 | /// enqueue_decoupled(move || { 65 | /// // ... 66 | /// }); 67 | /// ``` 68 | #[inline] 69 | pub fn enqueue_decoupled(f: impl FnOnce() + Send + 'static) { 70 | let f = Box::new(f); 71 | RUNTIME_THREAD_LOCAL 72 | .with(|runtime| runtime.enqueue(Work::new(f))) 73 | .expect("runtime broken") 74 | } 75 | 76 | #[cfg(test)] 77 | mod tests { 78 | use super::*; 79 | 80 | #[test] 81 | fn test_enqueue_works() { 82 | let (tx, rx) = std::sync::mpsc::channel(); 83 | assert!(RUNTIME_THREAD_LOCAL 84 | .with(|runtime| { 85 | runtime.enqueue(Work::new(move || { 86 | assert!(tx.send(true).is_ok()); 87 | })) 88 | }) 89 | .is_ok()); 90 | assert!(matches!( 91 | rx.recv_timeout(std::time::Duration::from_millis(100)), 92 | Ok(true), 93 | )); 94 | } 95 | 96 | #[test] 97 | fn test_enqueue_decoupled_works() { 98 | let (tx, rx) = std::sync::mpsc::channel(); 99 | enqueue_decoupled(move || { 100 | assert!(tx.send(true).is_ok()); 101 | }); 102 | assert!(matches!( 103 | rx.recv_timeout(std::time::Duration::from_millis(100)), 104 | Ok(true), 105 | )); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/ffi/device.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::device::DeviceId; 4 | use crate::device::MemoryInfo; 5 | use crate::ffi::result; 6 | 7 | type Result<T> = std::result::Result<T, crate::error::Error>; 8 | 9 | /// Synchronous implementation of [`crate::num_devices`]. 10 | /// 11 | /// Refer to [`crate::num_devices`] for documentation.
12 | pub fn num_devices() -> Result<usize> { 13 | let mut num = 0_i32; 14 | let num_ptr = std::ptr::addr_of_mut!(num); 15 | let ret = cpp!(unsafe [ 16 | num_ptr as "std::int32_t*" 17 | ] -> i32 as "std::int32_t" { 18 | return cudaGetDeviceCount(num_ptr); 19 | }); 20 | 21 | result!(ret, num as usize) 22 | } 23 | 24 | /// Synchronous implementation of [`crate::Device`]. 25 | /// 26 | /// Refer to [`crate::Device`] for documentation. 27 | pub struct Device; 28 | 29 | impl Device { 30 | #[inline] 31 | pub fn get() -> Result<DeviceId> { 32 | let mut id: i32 = 0; 33 | let id_ptr = std::ptr::addr_of_mut!(id); 34 | let ret = cpp!(unsafe [ 35 | id_ptr as "int*" 36 | ] -> i32 as "int" { 37 | return cudaGetDevice(id_ptr); 38 | }); 39 | result!(ret, id) 40 | } 41 | 42 | #[inline(always)] 43 | pub fn get_or_panic() -> DeviceId { 44 | Device::get().unwrap_or_else(|err| panic!("failed to get device: {err}")) 45 | } 46 | 47 | #[inline] 48 | pub fn set(id: DeviceId) -> Result<()> { 49 | let ret = cpp!(unsafe [ 50 | id as "int" 51 | ] -> i32 as "int" { 52 | return cudaSetDevice(id); 53 | }); 54 | result!(ret) 55 | } 56 | 57 | #[inline(always)] 58 | pub fn set_or_panic(id: DeviceId) { 59 | Device::set(id).unwrap_or_else(|err| panic!("failed to set device {id}: {err}")); 60 | } 61 | 62 | pub fn synchronize() -> Result<()> { 63 | let ret = cpp!(unsafe [] -> i32 as "std::int32_t" { 64 | return cudaDeviceSynchronize(); 65 | }); 66 | result!(ret) 67 | } 68 | 69 | pub fn memory_info() -> Result<MemoryInfo> { 70 | let mut free: usize = 0; 71 | let free_ptr = std::ptr::addr_of_mut!(free); 72 | let mut total: usize = 0; 73 | let total_ptr = std::ptr::addr_of_mut!(total); 74 | 75 | let ret = cpp!(unsafe [ 76 | free_ptr as "std::size_t*", 77 | total_ptr as "std::size_t*" 78 | ] -> i32 as "std::int32_t" { 79 | return cudaMemGetInfo(free_ptr, total_ptr); 80 | }); 81 | result!(ret, MemoryInfo { free, total }) 82 | } 83 | } 84 | 85 | #[cfg(test)] 86 | mod tests { 87 | use super::*; 88 | 89 | #[test] 90 | fn test_num_devices() { 91 | assert!(matches!(num_devices(), Ok(num) if num > 0)); 92 | } 93 | 94 | #[test] 95 | fn test_get_device() { 96 | assert!(matches!(Device::get(), Ok(0))); 97 | } 98 | 99 | #[test] 100 | fn test_set_device() { 101 | assert!(Device::set(0).is_ok()); 102 | assert!(matches!(Device::get(), Ok(0))); 103 | } 104 | 105 | #[test] 106 | fn test_synchronize() { 107 | assert!(Device::synchronize().is_ok()); 108 | } 109 | 110 | #[test] 111 | fn test_memory_info() { 112 | let memory_info = Device::memory_info().unwrap(); 113 | assert!(memory_info.free > 0); 114 | assert!(memory_info.total > 0); 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/npp/stream.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use crate::ffi::npp::context::Context; 4 | use crate::runtime::Future; 5 | 6 | /// Represents an NPP stream. 7 | /// 8 | /// An NPP stream is a thin wrapper around a normal CUDA stream ([`crate::Stream`]). It manages some 9 | /// additional context information required in NPP to statelessly execute on a user-provided stream. 10 | /// 11 | /// This struct implements `Deref` such that it can be used as a normal [`crate::Stream`] as well. 12 | /// 13 | /// # Usage 14 | /// 15 | /// If the caller wants to use a stream context for mixed NPP and non-NPP operations, they should 16 | /// create an NPP stream and pass it as CUDA stream when desired. This should work out-of-the-box 17 | /// since [`Stream`] dereferences to [`crate::Stream`], as the sketch below illustrates.
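/// /// # Example /// /// A sketch of mixing an NPP operation with a plain CUDA transfer on the same stream (the buffer setup is omitted here; see [`crate::npp::copy_constant_border`] for a real call): /// /// ```ignore /// let stream = Stream::new().await?; /// copy_constant_border(&input, &mut output, &border, &stream).await?; /// // The same stream doubles as a regular CUDA stream through deref: /// output.copy_to(&mut host_output, &stream).await?; /// stream.synchronize().await?; /// ```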
18 | pub struct Stream { 19 | context: Arc<Context>, 20 | } 21 | 22 | impl Stream { 23 | /// Create an NPP [`Stream`] that represents the default stream, also known as the null stream. 24 | /// 25 | /// This type is a wrapper around the actual CUDA stream type: [`crate::Stream`]. 26 | #[inline] 27 | pub async fn null() -> Self { 28 | let context = Future::new(Context::from_null_stream).await; 29 | Self { 30 | context: Arc::new(context), 31 | } 32 | } 33 | 34 | /// Create a new [`Stream`] for use with NPP. 35 | /// 36 | /// This type is a wrapper around the actual CUDA stream type: [`crate::Stream`]. 37 | #[inline] 38 | pub async fn new() -> std::result::Result<Self, crate::Error> { 39 | let stream = crate::Stream::new().await?; 40 | let context = Future::new(move || Context::from_stream(stream)).await; 41 | Ok(Self { 42 | context: Arc::new(context), 43 | }) 44 | } 45 | 46 | /// Acquire shared access to the underlying NPP context object. 47 | /// 48 | /// This NPP object can be safely sent to the runtime thread so it can be used as a context. 49 | /// 50 | /// # Safety 51 | /// 52 | /// The [`Context`] object may only be *used* from the runtime thread. 53 | pub(crate) fn to_context(&self) -> Arc<Context> { 54 | self.context.clone() 55 | } 56 | } 57 | 58 | impl std::ops::Deref for Stream { 59 | type Target = crate::Stream; 60 | 61 | fn deref(&self) -> &Self::Target { 62 | &self.context.stream 63 | } 64 | } 65 | 66 | #[cfg(test)] 67 | mod tests { 68 | use super::*; 69 | 70 | #[tokio::test] 71 | async fn test_new() { 72 | let stream = Stream::new().await.unwrap(); 73 | assert!(!stream.to_context().as_ptr().is_null()); 74 | // SAFETY: This works because we know that the first field of the underlying 75 | // `NppStreamContext` struct used internally is `hStream`, which should refer to the wrapped 76 | // stream or it was not initialized correctly. 77 | assert_eq!( 78 | unsafe { *(stream.to_context().as_ptr() as *const *const std::ffi::c_void) }, 79 | stream.inner().as_internal().as_ptr(), 80 | ); 81 | } 82 | 83 | #[tokio::test] 84 | async fn test_null() { 85 | let stream = Stream::null().await; 86 | assert!(!stream.to_context().as_ptr().is_null()); 87 | // SAFETY: This works because we know that the first field of the underlying 88 | // `NppStreamContext` struct used internally is `hStream`, which should refer to the wrapped 89 | // stream, which is the null stream in this case. 90 | assert!( 91 | unsafe { *(stream.to_context().as_ptr() as *const *const std::ffi::c_void) }.is_null() 92 | ); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/device.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi; 2 | use crate::runtime::Future; 3 | 4 | type Result<T> = std::result::Result<T, crate::error::Error>; 5 | 6 | /// Returns the number of compute-capable devices. 7 | /// 8 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g18808e54893cfcaafefeab31a73cc55f) 9 | /// 10 | /// # Return value 11 | /// 12 | /// Number of CUDA devices or error in case of failure. 13 | pub async fn num_devices() -> Result<usize> { 14 | Future::new(ffi::device::num_devices).await 15 | } 16 | 17 | /// CUDA device ID. 18 | pub type DeviceId = i32; 19 | 20 | /// CUDA device. 21 | pub struct Device; 22 | 23 | impl Device { 24 | /// Returns the [`DeviceId`] of the device that is currently being used.
25 | /// 26 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g80861db2ce7c29b6e8055af8ae01bc78) 27 | pub async fn get() -> Result<DeviceId> { 28 | Future::new(ffi::device::Device::get).await 29 | } 30 | 31 | /// Set device to be used for GPU executions. 32 | /// 33 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g159587909ffa0791bbe4b40187a4c6bb) 34 | /// 35 | /// # Arguments 36 | /// 37 | /// * `id` - Device ID to use. 38 | pub async fn set(id: DeviceId) -> Result<()> { 39 | Future::new(move || ffi::device::Device::set(id)).await 40 | } 41 | 42 | /// Synchronize the current CUDA device. 43 | /// 44 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g10e20b05a95f638a4071a655503df25d) 45 | /// 46 | /// # Warning 47 | /// 48 | /// Note that this operation will block all device operations, even from other processes while 49 | /// running. Use this operation sparingly. 50 | pub async fn synchronize() -> Result<()> { 51 | Future::new(ffi::device::Device::synchronize).await 52 | } 53 | 54 | /// Gets free and total device memory. 55 | /// 56 | /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g376b97f5ab20321ca46f7cfa9511b978) 57 | /// 58 | /// # Return value 59 | /// 60 | /// Total amount of memory and free memory in bytes. 61 | pub async fn memory_info() -> Result<MemoryInfo> { 62 | Future::new(ffi::device::Device::memory_info).await 63 | } 64 | } 65 | 66 | /// CUDA device memory information. 67 | #[derive(Debug, Clone, Copy, PartialEq)] 68 | pub struct MemoryInfo { 69 | /// Amount of free device memory in bytes. 70 | pub free: usize, 71 | /// Total amount of device memory in bytes. 72 | pub total: usize, 73 | } 74 | 75 | #[cfg(test)] 76 | mod tests { 77 | use super::*; 78 | 79 | #[tokio::test] 80 | async fn test_num_devices() { 81 | assert!(matches!(num_devices().await, Ok(num) if num > 0)); 82 | } 83 | 84 | #[tokio::test] 85 | async fn test_get_device() { 86 | assert!(matches!(Device::get().await, Ok(0))); 87 | } 88 | 89 | #[tokio::test] 90 | async fn test_set_device() { 91 | assert!(Device::set(0).await.is_ok()); 92 | assert!(matches!(Device::get().await, Ok(0))); 93 | } 94 | 95 | #[tokio::test] 96 | async fn test_synchronize() { 97 | assert!(Device::synchronize().await.is_ok()); 98 | } 99 | 100 | #[tokio::test] 101 | async fn test_memory_info() { 102 | let memory_info = Device::memory_info().await.unwrap(); 103 | assert!(memory_info.free > 0); 104 | assert!(memory_info.total > 0); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/runtime/execution.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicBool, Ordering}; 2 | use std::sync::mpsc::{channel, Receiver, Sender}; 3 | use std::sync::{Arc, Mutex}; 4 | 5 | use once_cell::sync::Lazy; 6 | 7 | use crate::runtime::thread_local::RuntimeThreadLocal; 8 | use crate::runtime::work::Work; 9 | 10 | /// Refers to the global runtime. The runtime is responsible for running all CUDA operations in a 11 | /// dedicated thread. 12 | /// 13 | /// Note that this object should not be used by callers because each thread gets its own delegate 14 | /// object to communicate with the runtime.
15 | /// 16 | /// # Usage 17 | /// 18 | /// Each thread should get its own [`RuntimeThreadLocal`] object, which acts as a delegate object. 19 | /// 20 | /// Use `Runtime::thread_local` to get the thread local object: 21 | /// 22 | /// ```ignore 23 | /// let runtime = RUNTIME.lock().unwrap().thread_local(); 24 | /// ``` 25 | pub(super) static RUNTIME: Lazy<Mutex<Runtime>> = Lazy::new(|| Mutex::new(Runtime::new())); 26 | 27 | /// Runtime object that holds the runtime thread and a channel 28 | /// to send jobs onto the worker queue. 29 | pub struct Runtime { 30 | join_handle: Option<std::thread::JoinHandle<()>>, 31 | run_flag: Arc<AtomicBool>, 32 | work_tx: Sender<Work>, 33 | } 34 | 35 | impl Runtime { 36 | /// Acquire a thread local delegate for the runtime. 37 | pub(super) fn thread_local(&self) -> RuntimeThreadLocal { 38 | RuntimeThreadLocal::from_sender(self.work_tx.clone()) 39 | } 40 | 41 | /// Create runtime. 42 | fn new() -> Self { 43 | let run_flag = Arc::new(AtomicBool::new(true)); 44 | let (work_tx, work_rx) = channel::<Work>(); 45 | 46 | let join_handle = std::thread::spawn({ 47 | let run_flag = run_flag.clone(); 48 | move || Self::worker(run_flag, work_rx) 49 | }); 50 | 51 | Runtime { 52 | join_handle: Some(join_handle), 53 | run_flag, 54 | work_tx, 55 | } 56 | } 57 | 58 | /// Worker loop. Receives jobs from the worker queue and executes them until [`run_flag`] 59 | /// becomes `false`. 60 | /// 61 | /// # Arguments 62 | /// 63 | /// * `run_flag` - Atomic flag that indicates whether the worker should continue running. 64 | /// * `work_rx` - Receives work to execute. 65 | fn worker(run_flag: Arc<AtomicBool>, work_rx: Receiver<Work>) { 66 | while run_flag.load(Ordering::Relaxed) { 67 | match work_rx.recv() { 68 | Ok(work) => work.run(), 69 | Err(_) => break, 70 | } 71 | } 72 | } 73 | } 74 | 75 | impl Drop for Runtime { 76 | fn drop(&mut self) { 77 | self.run_flag.store(false, Ordering::Relaxed); 78 | 79 | // Put dummy workload into the queue to trigger the loop to continue and encounter the 80 | // `run_flag` that is now false, then stop. Note that if this fails, it means the underlying 81 | // channel is broken. It is not a problem, since that must mean the worker already quit 82 | // before, and it will join immediately. 83 | let _ = self.work_tx.send(Work::new(|| {})); 84 | 85 | if let Some(join_handle) = self.join_handle.take() { 86 | join_handle 87 | .join() 88 | .expect("failed to join on runtime thread"); 89 | } 90 | } 91 | } 92 | 93 | #[cfg(test)] 94 | mod tests { 95 | use super::*; 96 | 97 | #[test] 98 | fn test_drop() { 99 | let runtime = Runtime::new(); 100 | std::thread::sleep(std::time::Duration::from_millis(10)); 101 | drop(runtime); 102 | } 103 | 104 | #[test] 105 | fn test_it_does_work() { 106 | let runtime = Runtime::new(); 107 | let (tx, rx) = std::sync::mpsc::channel(); 108 | assert!(runtime 109 | .thread_local() 110 | .enqueue(Work::new(move || { 111 | assert!(tx.send(true).is_ok()); 112 | })) 113 | .is_ok()); 114 | assert!(matches!( 115 | rx.recv_timeout(std::time::Duration::from_millis(100)), 116 | Ok(true), 117 | )); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/npp/copy_constant_border.rs: -------------------------------------------------------------------------------- 1 | use crate::memory::DeviceBuffer2D; 2 | use crate::npp::constant_border::ConstantBorder; 3 | use crate::npp::stream::Stream; 4 | use crate::runtime::Future; 5 | 6 | type Result<T> = std::result::Result<T, crate::npp::error::Error>; 7 | 8 | /// Copy an image with a constant border. This function expects a reference to a device image for 9 | /// input, and a mutable reference to a device image to place the output in. 10 | /// 11 | /// This function assumes the following about the input and output images: 12 | /// * Images are in RGB format. 13 | /// * Images are in standard memory order, i.e. HWC. 14 | /// 15 | /// # Stream ordered semantics 16 | /// 17 | /// This function uses stream ordered semantics. It can only be guaranteed to complete sequentially 18 | /// relative to operations scheduled on the same stream or the default stream. 19 | /// 20 | /// # Arguments 21 | /// 22 | /// * `input` - The on-device input image. 23 | /// * `output` - The on-device output image. 24 | /// * `constant_border` - The constant border parameters to apply. 25 | /// * `stream` - Stream to use.
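/// /// # Example /// /// A sketch (buffer setup omitted) that pads `input` into a larger `output` with a black border of 10 pixels on the left and right and 20 pixels on the top and bottom: /// /// ```ignore /// copy_constant_border(&input, &mut output, &ConstantBorder::black(10, 20), &stream).await?; /// ```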
26 | pub async fn copy_constant_border( 27 | input: &DeviceBuffer2D<u8>, 28 | output: &mut DeviceBuffer2D<u8>, 29 | constant_border: &ConstantBorder, 30 | stream: &Stream, 31 | ) -> Result<()> { 32 | assert_eq!(input.num_channels(), 3, "input image must be in RGB format"); 33 | assert_eq!( 34 | output.num_channels(), 35 | 3, 36 | "output image must be in RGB format" 37 | ); 38 | 39 | let context = stream.to_context(); 40 | Future::new(move || { 41 | crate::ffi::npp::copy_constant_border::copy_constant_border( 42 | input.inner(), 43 | output.inner_mut(), 44 | constant_border, 45 | &context, 46 | ) 47 | }) 48 | .await 49 | } 50 | 51 | #[cfg(test)] 52 | mod tests { 53 | use super::*; 54 | 55 | use crate::memory::DeviceBuffer2D; 56 | use crate::npp::stream::Stream; 57 | use crate::npp::tests::image::*; 58 | use crate::npp::tests::memory::*; 59 | 60 | #[tokio::test] 61 | async fn test_copy_constant_border() { 62 | // Input image is 1x2 and just contains one red and one green pixel. 63 | const INPUT: [[Pixel; 2]; 1] = [[R, G]]; 64 | const INPUT_FLAT: [u8; 6] = flatten!(INPUT, 6); 65 | 66 | // Expected output of copy constant border with left border of 1 and top border of 2, if 67 | // the border color is blue.
68 | const OUTPUT: [[Pixel; 4]; 5] = [ 69 | [B, B, B, B], 70 | [B, B, B, B], 71 | [B, R, G, B], 72 | [B, B, B, B], 73 | [B, B, B, B], 74 | ]; 75 | const OUTPUT_FLAT: [u8; 4 * 5 * 3] = flatten!(OUTPUT, 4 * 5 * 3); 76 | 77 | let stream = Stream::new().await.unwrap(); 78 | 79 | let image = to_device_2d!(&INPUT_FLAT, 2, 1, 3, &stream); 80 | let mut output = DeviceBuffer2D::<u8>::new(4, 5, 3).await; 81 | copy_constant_border(&image, &mut output, &ConstantBorder::new(1, 2, B), &stream) 82 | .await 83 | .unwrap(); 84 | 85 | let output = to_host_2d!(output, &stream); 86 | assert_eq!(&output, &OUTPUT_FLAT); 87 | } 88 | 89 | #[tokio::test] 90 | #[should_panic] 91 | async fn test_it_panics_when_input_num_channels_incorrect() { 92 | let input = DeviceBuffer2D::<u8>::new(100, 100, 2).await; 93 | let mut output = DeviceBuffer2D::<u8>::new(200, 200, 3).await; 94 | copy_constant_border( 95 | &input, 96 | &mut output, 97 | &ConstantBorder::black(10, 20), 98 | &Stream::null().await, 99 | ) 100 | .await 101 | .unwrap(); 102 | } 103 | 104 | #[tokio::test] 105 | #[should_panic] 106 | async fn test_it_panics_when_output_num_channels_incorrect() { 107 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 108 | let mut output = DeviceBuffer2D::<u8>::new(200, 200, 2).await; 109 | copy_constant_border( 110 | &input, 111 | &mut output, 112 | &ConstantBorder::black(10, 20), 113 | &Stream::null().await, 114 | ) 115 | .await 116 | .unwrap(); 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/npp/region.rs: -------------------------------------------------------------------------------- 1 | /// Represents a subregion of an image. 2 | #[derive(Debug, Clone, Copy, PartialEq, Default)] 3 | pub enum Region { 4 | #[default] 5 | Full, 6 | Rectangle { 7 | x: usize, 8 | y: usize, 9 | width: usize, 10 | height: usize, 11 | }, 12 | } 13 | 14 | impl Region { 15 | /// Create a new [`Region`] that covers the whole image. 16 | #[inline] 17 | pub fn full() -> Self { 18 | Region::Full 19 | } 20 | 21 | /// Create a new partial [`Region`] with normalized width and height. 22 | /// 23 | /// If the `width` or `height` is less than 2, it will be set to 2 to produce a region that 24 | /// is valid when used with the NPP API. 25 | /// 26 | /// # Arguments 27 | /// 28 | /// * `topleft` - Coordinates of top left corner of the region. 29 | /// * `dims` - Dimensions of the region. 30 | #[inline] 31 | pub fn rectangle_normalized(topleft: (usize, usize), dims: (usize, usize)) -> Self { 32 | let (x, y) = topleft; 33 | let (width, height) = dims; 34 | Self::Rectangle { 35 | x, 36 | y, 37 | width: width.max(2), 38 | height: height.max(2), 39 | } 40 | } 41 | 42 | /// Resolve the actual values for `x`, `y`, `width` and `height` of the box, even when it is 43 | /// `Region::Full`. To compute these, the outer `width` and `height` are required. 44 | /// 45 | /// # Arguments 46 | /// 47 | /// * `width` - Outer width. 48 | /// * `height` - Outer height. 49 | /// 50 | /// # Return value 51 | /// 52 | /// Region coordinates `x`, `y`, `width` and `height`. 53 | pub fn resolve_to_xywh(&self, width: usize, height: usize) -> (usize, usize, usize, usize) { 54 | match self { 55 | Region::Full => (0, 0, width, height), 56 | Region::Rectangle { 57 | x, 58 | y, 59 | width, 60 | height, 61 | } => (*x, *y, *width, *height), 62 | } 63 | } 64 | 65 | /// Whether or not the region is of type `Region::Full`.
66 | pub fn is_full(&self) -> bool { 67 | matches!(self, Region::Full) 68 | } 69 | } 70 | 71 | impl std::fmt::Display for Region { 72 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 73 | match self { 74 | Region::Full => write!(f, "[full]"), 75 | // This formats to something like this: 76 | // 77 | // ``` 78 | // [x: 10, y: 10, width: 80, height: 40] 79 | // ``` 80 | Region::Rectangle { 81 | x, 82 | y, 83 | width, 84 | height, 85 | } => write!(f, "[x: {x}, y: {y}, width: {width}, height: {height}]",), 86 | } 87 | } 88 | } 89 | 90 | #[cfg(test)] 91 | mod tests { 92 | use super::*; 93 | 94 | #[test] 95 | fn test_new_full() { 96 | assert_eq!(Region::full(), Region::Full); 97 | assert!(Region::full().is_full()); 98 | } 99 | 100 | #[test] 101 | fn test_new_rectangle_normalized() { 102 | assert_eq!( 103 | Region::rectangle_normalized((1, 2), (3, 4)), 104 | Region::Rectangle { 105 | x: 1, 106 | y: 2, 107 | width: 3, 108 | height: 4 109 | } 110 | ); 111 | assert_eq!( 112 | Region::rectangle_normalized((1, 2), (0, 1)), 113 | Region::Rectangle { 114 | x: 1, 115 | y: 2, 116 | width: 2, 117 | height: 2 118 | } 119 | ); 120 | assert!(!Region::rectangle_normalized((1, 2), (3, 4)).is_full()); 121 | } 122 | 123 | #[test] 124 | fn test_resolve_region() { 125 | let region = Region::Rectangle { 126 | x: 8, 127 | y: 10, 128 | width: 12, 129 | height: 16, 130 | }; 131 | assert_eq!(region.resolve_to_xywh(20, 20), (8, 10, 12, 16)); 132 | } 133 | 134 | #[test] 135 | fn test_resolve_full() { 136 | let region = Region::Full; 137 | assert_eq!(region.resolve_to_xywh(10, 20), (0, 0, 10, 20)); 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/ffi/ptr.rs: -------------------------------------------------------------------------------- 1 | /// Represents a device-local pointer. Pointers qualify as device-local if they refer to memory that 2 | /// lives on the device, and not on the host. 3 | /// 4 | /// # Safety 5 | /// 6 | /// ## Null 7 | /// 8 | /// Creating a null pointer is always unsafe, because any CUDA operations on null pointers can cause 9 | /// undefined behavior. 10 | /// 11 | /// Use the `unsafe` function `DevicePtr::null` to create a null pointer in cases where usage is safe. 12 | pub struct DevicePtr { 13 | addr: *mut std::ffi::c_void, 14 | } 15 | 16 | impl DevicePtr { 17 | /// Create from device address. 18 | /// 19 | /// # Arguments 20 | /// 21 | /// * `addr` - Address of pointer. 22 | #[inline] 23 | pub fn from_addr(addr: *mut std::ffi::c_void) -> Self { 24 | if !addr.is_null() { 25 | DevicePtr { addr } 26 | } else { 27 | panic!("unexpected null pointer"); 28 | } 29 | } 30 | 31 | /// Create null pointer. 32 | /// 33 | /// # Safety 34 | /// 35 | /// This is unsafe because operating on a `null` pointer in CUDA code can cause crashes. In some 36 | /// cases it is allowed though, for example, a `null` pointer can designate the default stream 37 | /// in stream-related operations. 38 | #[inline] 39 | pub unsafe fn null() -> Self { 40 | DevicePtr { 41 | addr: std::ptr::null_mut(), 42 | } 43 | } 44 | 45 | /// Whether or not the device pointer is a null pointer. 46 | #[inline] 47 | pub fn is_null(&self) -> bool { 48 | self.addr.is_null() 49 | } 50 | 51 | /// Get the readonly pointer value. 52 | #[inline(always)] 53 | pub fn as_ptr(&self) -> *const std::ffi::c_void { 54 | self.addr as *const std::ffi::c_void 55 | } 56 | 57 | /// Get the mutable pointer value.
58 | #[inline(always)] 59 | pub fn as_mut_ptr(&mut self) -> *mut std::ffi::c_void { 60 | self.addr 61 | } 62 | 63 | /// Take the pointer from this wrapper and replace it with a null pointer. 64 | /// 65 | /// # Safety 66 | /// 67 | /// This operation is unsafe because it creates a null pointer. 68 | /// 69 | /// # Usage 70 | /// 71 | /// This function can be used inside [`Drop`] if it is known that the pointer object will not be 72 | /// used for the remainder of the function scope, and the object is to be dropped. 73 | /// 74 | /// # Example 75 | /// 76 | /// ```ignore 77 | /// # use async_cuda::ffi::DevicePtr; 78 | /// pub struct Object { 79 | /// internal: DevicePtr, 80 | /// } 81 | /// 82 | /// impl Drop for Object { 83 | /// fn drop(&mut self) { 84 | /// // SAFETY: This is safe because `self` and `self.internal` 85 | /// // are not used beyond this unsafe block. 86 | /// let ptr = unsafe { 87 | /// self.internal.take() 88 | /// }; 89 | /// // Properly deallocate the pointer here and do *NOT* 90 | /// // use `self` for anything! 91 | /// } 92 | /// } 93 | /// ``` 94 | #[inline] 95 | pub unsafe fn take(&mut self) -> DevicePtr { 96 | DevicePtr { 97 | // sets `self.addr` to NULL, puts addr in new device ptr 98 | addr: std::mem::replace(&mut self.addr, std::ptr::null_mut()), 99 | } 100 | } 101 | } 102 | 103 | impl std::fmt::Display for DevicePtr { 104 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 105 | write!(f, "{:?}", self.addr) 106 | } 107 | } 108 | 109 | #[cfg(test)] 110 | mod tests { 111 | use super::*; 112 | 113 | #[test] 114 | fn test_it_holds_on() { 115 | let fake = 0xffffffff as *mut std::ffi::c_void; 116 | let ptr = DevicePtr::from_addr(fake); 117 | assert_eq!(ptr.as_ptr(), 0xffffffff as *const std::ffi::c_void); 118 | } 119 | 120 | #[test] 121 | #[should_panic] 122 | fn test_it_panics_when_null() { 123 | let _ = DevicePtr::from_addr(std::ptr::null_mut()); 124 | } 125 | 126 | #[test] 127 | fn test_null() { 128 | let ptr = unsafe { DevicePtr::null() }; 129 | assert!(ptr.is_null()); 130 | assert_eq!(ptr.as_ptr(), std::ptr::null_mut()); 131 | } 132 | 133 | #[test] 134 | fn test_take() { 135 | let fake = 0xffffffff as *mut std::ffi::c_void; 136 | let mut ptr = DevicePtr::from_addr(fake); 137 | assert_eq!( 138 | unsafe { ptr.take().as_ptr() }, 139 | 0xffffffff as *const std::ffi::c_void, 140 | ); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/ffi/npp/copy_constant_border.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::ffi::npp::context::Context; 4 | use crate::ffi::npp::result; 5 | use crate::npp::constant_border::ConstantBorder; 6 | 7 | type Result<T> = std::result::Result<T, crate::npp::Error>; 8 | 9 | /// Synchronous implementation of [`crate::copy_constant_border()`]. 10 | /// 11 | /// Refer to [`crate::copy_constant_border()`] for documentation.
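///
/// # Example
///
/// A minimal sketch of the synchronous call (hedged: buffer contents and error handling are elided; it assumes freshly allocated 3-channel `u8` device buffers and the null-stream context, mirroring the tests below):
///
/// ```ignore
/// let context = Context::from_null_stream();
/// let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(2, 1, 3);
/// let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(4, 5, 3);
/// // Pad `input` with a black border: one pixel left/right, two pixels top/bottom.
/// copy_constant_border(&input, &mut output, &ConstantBorder::black(1, 2), &context).unwrap();
/// ```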
12 | pub fn copy_constant_border( 13 | input: &crate::ffi::memory::DeviceBuffer2D<u8>, 14 | output: &mut crate::ffi::memory::DeviceBuffer2D<u8>, 15 | border: &ConstantBorder, 16 | context: &Context, 17 | ) -> Result<()> { 18 | assert_eq!(input.num_channels, 3, "input image must be in RGB format"); 19 | assert_eq!(output.num_channels, 3, "output image must be in RGB format"); 20 | 21 | let (src_pitch, src_width, src_height) = (input.pitch, input.width as i32, input.height as i32); 22 | let (dst_pitch, dst_width, dst_height) = 23 | (output.pitch, output.width as i32, output.height as i32); 24 | 25 | let (border_left, border_top) = (border.left as i32, border.top as i32); 26 | let border_color_ptr = border.color.as_ptr(); 27 | 28 | let src_ptr = input.as_internal().as_ptr(); 29 | let dst_ptr = output.as_mut_internal().as_mut_ptr(); 30 | let context_ptr = context.as_ptr(); 31 | let ret = cpp!(unsafe [ 32 | src_ptr as "const void*", 33 | src_pitch as "std::size_t", 34 | src_width as "std::int32_t", 35 | src_height as "std::int32_t", 36 | dst_ptr as "void*", 37 | dst_pitch as "std::size_t", 38 | dst_width as "std::int32_t", 39 | dst_height as "std::int32_t", 40 | border_left as "std::int32_t", 41 | border_top as "std::int32_t", 42 | border_color_ptr as "const std::uint8_t*", 43 | context_ptr as "void*" 44 | ] -> i32 as "std::int32_t" { 45 | NppiSize src_size = { src_width, src_height }; 46 | NppiSize dst_size = { dst_width, dst_height }; 47 | return nppiCopyConstBorder_8u_C3R_Ctx( 48 | (const Npp8u*) src_ptr, 49 | src_pitch, 50 | src_size, 51 | (Npp8u*) dst_ptr, 52 | dst_pitch, 53 | dst_size, 54 | border_top, 55 | border_left, 56 | border_color_ptr, 57 | *((NppStreamContext*) context_ptr) 58 | ); 59 | }); 60 | result!(ret) 61 | } 62 | 63 | #[cfg(test)] 64 | mod tests { 65 | use super::*; 66 | 67 | use crate::ffi::npp::context::Context; 68 | use crate::npp::tests::image::*; 69 | use crate::npp::tests::sync::memory::*; 70 | 71 | #[test] 72 | fn test_copy_constant_border() { 73 | // Input image is 1x2 and just contains one red and one green pixel. 74 | const INPUT: [[Pixel; 2]; 1] = [[R, G]]; 75 | const INPUT_FLAT: [u8; 6] = flatten!(INPUT, 6); 76 | 77 | // Expected output of copy constant border with left border of 1 and top border of 2, if 78 | // the border color is blue.
79 | const OUTPUT: [[Pixel; 4]; 5] = [ 80 | [B, B, B, B], 81 | [B, B, B, B], 82 | [B, R, G, B], 83 | [B, B, B, B], 84 | [B, B, B, B], 85 | ]; 86 | const OUTPUT_FLAT: [u8; 4 * 5 * 3] = flatten!(OUTPUT, 4 * 5 * 3); 87 | 88 | let context = Context::from_null_stream(); 89 | 90 | let image = to_device_2d!(&INPUT_FLAT, 2, 1, 3, &context); 91 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(4, 5, 3); 92 | copy_constant_border(&image, &mut output, &ConstantBorder::new(1, 2, B), &context).unwrap(); 93 | 94 | let output = to_host_2d!(output, &context); 95 | assert_eq!(&output, &OUTPUT_FLAT); 96 | } 97 | 98 | #[test] 99 | #[should_panic] 100 | fn test_it_panics_when_input_num_channels_incorrect() { 101 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2); 102 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 3); 103 | copy_constant_border( 104 | &input, 105 | &mut output, 106 | &ConstantBorder::black(10, 20), 107 | &Context::from_null_stream(), 108 | ) 109 | .unwrap(); 110 | } 111 | 112 | #[test] 113 | #[should_panic] 114 | fn test_it_panics_when_output_num_channels_incorrect() { 115 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 116 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 2); 117 | copy_constant_border( 118 | &input, 119 | &mut output, 120 | &ConstantBorder::black(10, 20), 121 | &Context::from_null_stream(), 122 | ) 123 | .unwrap(); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/ffi/npp/context.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::ffi::device::Device; 4 | use crate::ffi::npp::result; 5 | use crate::stream::Stream; 6 | 7 | /// NPP stream context structure. 8 | /// 9 | /// [NPP documentation](https://docs.nvidia.com/cuda/npp/struct_npp_stream_context.html) 10 | pub struct Context { 11 | raw: *mut std::ffi::c_void, 12 | pub stream: Stream, 13 | } 14 | 15 | /// Implements [`Send`] for [`Context`]. 16 | /// 17 | /// # Safety 18 | /// 19 | /// This is safe because the way we use the underlying `NppStreamContext` object is thread-safe. 20 | unsafe impl Send for Context {} 21 | 22 | /// Implements [`Sync`] for [`Context`]. 23 | /// 24 | /// # Safety 25 | /// 26 | /// This is safe because the way we use the underlying `NppStreamContext` object is thread-safe. 27 | unsafe impl Sync for Context {} 28 | 29 | impl Context { 30 | /// Create context on null stream. 31 | /// 32 | /// This creates a context that can be passed to NPP functions. Any functions using this context 33 | /// will be executed on the null stream. 34 | pub fn from_null_stream() -> Self { 35 | let mut raw = std::ptr::null_mut(); 36 | let raw_ptr = std::ptr::addr_of_mut!(raw); 37 | // SAFETY: 38 | // * Must call this function on runtime since `nppGetStreamContext` needs the correct thread 39 | // locals to determine current device and other context settings. 40 | // * We can store a reference to the stream in `NppStreamContext` as long as we make sure 41 | // `NppStreamContext` cannot outlive the stream, which we can guarantee because we take 42 | // ownership of the stream.
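// * The `NppStreamContext` allocated in the C++ block below is owned by this `Context` and freed again in `Context::delete`.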
43 | let ret = cpp!(unsafe [ 44 | raw_ptr as "void**" 45 | ] -> i32 as "std::int32_t" { 46 | NppStreamContext* stream_context = new NppStreamContext(); 47 | NppStatus ret = nppGetStreamContext(stream_context); 48 | if (ret == NPP_SUCCESS) { 49 | stream_context->hStream = nullptr; 50 | *raw_ptr = (void*) stream_context; 51 | } 52 | return ret; 53 | }); 54 | match result!(ret) { 55 | Ok(()) => Self { 56 | raw, 57 | stream: Stream::null(), 58 | }, 59 | Err(err) => { 60 | panic!("failed to get current NPP stream context: {err}") 61 | } 62 | } 63 | } 64 | 65 | /// Create context. 66 | /// 67 | /// This creates an NPP context object. It can be passed to NPP functions, and they will execute 68 | /// on the associated stream. 69 | /// 70 | /// # Arguments 71 | /// 72 | /// * `stream` - Stream to associate with context. 73 | pub fn from_stream(stream: Stream) -> Self { 74 | let (ret, raw) = { 75 | let mut raw = std::ptr::null_mut(); 76 | let raw_ptr = std::ptr::addr_of_mut!(raw); 77 | let stream_ptr = stream.inner().as_internal().as_ptr(); 78 | let device_id = stream.inner().device(); 79 | // SAFETY: 80 | // * Must call this function on runtime since `nppGetStreamContext` needs the correct 81 | // thread locals to determine current device and other context settings. 82 | // * We can store a reference to the stream in `NppStreamContext` as long as we make 83 | // sure `NppStreamContext` cannot outlive the stream, which we can guarantee because 84 | // we take ownership of the stream. 85 | let ret = cpp!(unsafe [ 86 | raw_ptr as "void**", 87 | stream_ptr as "void*", 88 | device_id as "int" 89 | ] -> i32 as "std::int32_t" { 90 | NppStreamContext* stream_context = new NppStreamContext(); 91 | NppStatus ret = nppGetStreamContext(stream_context); 92 | if (ret == NPP_SUCCESS) { 93 | stream_context->hStream = (cudaStream_t) stream_ptr; 94 | stream_context->nCudaDeviceId = device_id; 95 | *raw_ptr = (void*) stream_context; 96 | } 97 | return ret; 98 | }); 99 | (ret, raw) 100 | }; 101 | match result!(ret) { 102 | Ok(()) => Self { raw, stream }, 103 | Err(err) => { 104 | panic!("failed to get current NPP stream context: {err}") 105 | } 106 | } 107 | } 108 | 109 | /// Get internal readonly pointer. 110 | #[inline] 111 | pub(crate) fn as_ptr(&self) -> *const std::ffi::c_void { 112 | self.raw 113 | } 114 | 115 | /// Delete the context. 116 | /// 117 | /// # Panics 118 | /// 119 | /// This function panics if binding to the corresponding device fails. 120 | /// 121 | /// # Safety 122 | /// 123 | /// The context may not be used after this function is called, except for being dropped. 124 | pub unsafe fn delete(&mut self) { 125 | if self.raw.is_null() { 126 | return; 127 | } 128 | 129 | Device::set_or_panic(self.stream.inner().device()); 130 | 131 | let raw = self.raw; 132 | self.raw = std::ptr::null_mut(); 133 | 134 | cpp!(unsafe [raw as "void*"] { 135 | delete ((NppStreamContext*) raw); 136 | }); 137 | } 138 | } 139 | 140 | impl Drop for Context { 141 | #[inline] 142 | fn drop(&mut self) { 143 | // SAFETY: This is safe since the context cannot be used after this.
144 | unsafe { 145 | self.delete(); 146 | } 147 | } 148 | } 149 | 150 | #[cfg(test)] 151 | mod tests { 152 | use super::*; 153 | 154 | #[tokio::test] 155 | async fn test_from_stream() { 156 | let stream = Stream::new().await.unwrap(); 157 | let context = Context::from_stream(stream); 158 | assert!(!context.as_ptr().is_null()); 159 | assert!(!context.stream.inner().as_internal().as_ptr().is_null()); 160 | } 161 | 162 | #[test] 163 | fn test_from_null_stream() { 164 | let context = Context::from_null_stream(); 165 | assert!(!context.as_ptr().is_null()); 166 | assert!(context.stream.inner().as_internal().as_ptr().is_null()); 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/npp/resize.rs: -------------------------------------------------------------------------------- 1 | use crate::memory::DeviceBuffer2D; 2 | use crate::npp::region::Region; 3 | use crate::npp::stream::Stream; 4 | use crate::runtime::Future; 5 | 6 | type Result<T> = std::result::Result<T, crate::npp::Error>; 7 | 8 | /// Resize an image using bilinear interpolation. This function expects a reference to a device 9 | /// image for input, and a mutable reference to a device image to place the output in. 10 | /// 11 | /// This function assumes the following about the input and output images: 12 | /// * Images are in RGB format. 13 | /// * Images are in standard memory order, i.e. HWC. 14 | /// 15 | /// # Stream ordered semantics 16 | /// 17 | /// This function uses stream ordered semantics. It can only be guaranteed to complete sequentially 18 | /// relative to operations scheduled on the same stream or the default stream. 19 | /// 20 | /// # Arguments 21 | /// 22 | /// * `input` - The on-device input image. 23 | /// * `input_region` - Specify region of interest in input image. This can be used to combine crop 24 | /// and resize in a single operation. 25 | /// * `output_region` - Specify region of interest in output image. 26 | /// * `output` - The on-device output image. 27 | /// * `stream` - Stream to use. 28 | pub async fn resize( 29 | input: &DeviceBuffer2D<u8>, 30 | input_region: Region, 31 | output: &mut DeviceBuffer2D<u8>, 32 | output_region: Region, 33 | stream: &Stream, 34 | ) -> Result<()> { 35 | assert_eq!(input.num_channels(), 3, "input image must be in RGB format"); 36 | assert_eq!( 37 | output.num_channels(), 38 | 3, 39 | "output image must be in RGB format" 40 | ); 41 | 42 | let context = stream.to_context(); 43 | Future::new(move || { 44 | crate::ffi::npp::resize::resize( 45 | input.inner(), 46 | input_region, 47 | output.inner_mut(), 48 | output_region, 49 | &context, 50 | ) 51 | }) 52 | .await 53 | } 54 | 55 | #[cfg(test)] 56 | mod tests { 57 | use super::*; 58 | 59 | use crate::memory::DeviceBuffer2D; 60 | use crate::npp::stream::Stream; 61 | use crate::npp::tests::image::*; 62 | use crate::npp::tests::memory::*; 63 | 64 | #[tokio::test] 65 | async fn test_resize() { 66 | // This is the expected result when resizing the RGB flag to 2 by 2 with bilinear 67 | // interpolation.
68 | const OUTPUT: Image2x2 = [[R, R], [R, B]]; 69 | const OUTPUT_FLAT: [u8; 2 * 2 * 3] = flatten!(OUTPUT, 2 * 2 * 3); 70 | 71 | let stream = Stream::new().await.unwrap(); 72 | 73 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &stream); 74 | let mut output = DeviceBuffer2D::<u8>::new(2, 2, 3).await; 75 | resize(&image, Region::Full, &mut output, Region::Full, &stream) 76 | .await 77 | .unwrap(); 78 | 79 | let output = to_host_2d!(output, &stream); 80 | assert_eq!(&output, &OUTPUT_FLAT); 81 | } 82 | 83 | #[tokio::test] 84 | async fn test_resize_with_input_region() { 85 | // This is the raw expected result when resizing the center part of the RGB flag from two by 86 | // two to four by four. 87 | #[rustfmt::skip] 88 | #[allow(clippy::zero_prefixed_literal)] 89 | const OUTPUT: [u8; 4 * 4 * 3] = [ 90 | 000, 255, 000, 000, 255, 000, 000, 255, 000, 064, 191, 000, 91 | 000, 191, 064, 000, 191, 064, 000, 191, 064, 064, 143, 048, 92 | 000, 064, 191, 000, 064, 191, 000, 064, 191, 064, 048, 143, 93 | 064, 000, 191, 064, 000, 191, 064, 000, 191, 112, 000, 143, 94 | ]; 95 | 96 | let stream = Stream::new().await.unwrap(); 97 | 98 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &stream); 99 | let center = Region::Rectangle { 100 | x: 1, 101 | y: 1, 102 | width: 2, 103 | height: 2, 104 | }; 105 | let mut output = DeviceBuffer2D::<u8>::new(4, 4, 3).await; 106 | resize(&image, center, &mut output, Region::Full, &stream) 107 | .await 108 | .unwrap(); 109 | 110 | let output = to_host_2d!(output, &stream); 111 | assert_eq!(&output, &OUTPUT); 112 | } 113 | 114 | #[tokio::test] 115 | async fn test_resize_with_output_region() { 116 | #[rustfmt::skip] 117 | const INPUT: [u8; 2 * 2 * 3] = [ 118 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 119 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 120 | ]; 121 | #[rustfmt::skip] 122 | const EXPECTED_OUTPUT: [u8; 2 * 2 * 3] = [ 123 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 124 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 125 | ]; 126 | 127 | let stream = Stream::new().await.unwrap(); 128 | let bottom_half = Region::Rectangle { 129 | x: 0, 130 | y: 1, 131 | width: 2, 132 | height: 1, 133 | }; 134 | 135 | let image = to_device_2d!(&INPUT, 2, 2, 3, &stream); 136 | let mut output = DeviceBuffer2D::<u8>::new(2, 2, 3).await; 137 | output.fill_with_byte(0x00, &stream).await.unwrap(); 138 | resize(&image, Region::Full, &mut output, bottom_half, &stream) 139 | .await 140 | .unwrap(); 141 | 142 | let output = to_host_2d!(output, &stream); 143 | assert_eq!(&output, &EXPECTED_OUTPUT); 144 | } 145 | 146 | #[tokio::test] 147 | #[should_panic] 148 | async fn test_it_panics_when_input_num_channels_incorrect() { 149 | let input = DeviceBuffer2D::<u8>::new(100, 100, 2).await; 150 | let mut output = DeviceBuffer2D::<u8>::new(200, 200, 3).await; 151 | resize( 152 | &input, 153 | Region::Full, 154 | &mut output, 155 | Region::Full, 156 | &Stream::null().await, 157 | ) 158 | .await 159 | .unwrap(); 160 | } 161 | 162 | #[tokio::test] 163 | #[should_panic] 164 | async fn test_it_panics_when_output_num_channels_incorrect() { 165 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 166 | let mut output = DeviceBuffer2D::<u8>::new(200, 200, 2).await; 167 | resize( 168 | &input, 169 | Region::Full, 170 | &mut output, 171 | Region::Full, 172 | &Stream::null().await, 173 | ) 174 | .await 175 | .unwrap(); 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/npp/remap.rs: -------------------------------------------------------------------------------- 1 | use crate::memory::DeviceBuffer2D; 2 | use
crate::npp::stream::Stream; 3 | use crate::runtime::Future; 4 | 5 | type Result<T> = std::result::Result<T, crate::npp::Error>; 6 | 7 | /// Remaps an image using bilinear interpolation. This function expects references to device 8 | /// buffers as inputs, and a mutable reference to a device buffer to store the output of the 9 | /// operation in. 10 | /// 11 | /// This function assumes the following about the input and output images: 12 | /// * Images are in RGB format. 13 | /// * Images are in standard memory order, i.e. HWC. 14 | /// 15 | /// # Stream ordered semantics 16 | /// 17 | /// This function uses stream ordered semantics. It can only be guaranteed to complete sequentially 18 | /// relative to operations scheduled on the same stream or the default stream. 19 | /// 20 | /// # Arguments 21 | /// 22 | /// * `input` - The on-device input image. 23 | /// * `output` - The on-device output image. 24 | /// * `map_x` - On-device X pixel map. 25 | /// * `map_y` - On-device Y pixel map. 26 | /// * `stream` - Stream to use. 27 | pub async fn remap( 28 | input: &DeviceBuffer2D<u8>, 29 | output: &mut DeviceBuffer2D<u8>, 30 | map_x: &DeviceBuffer2D<f32>, 31 | map_y: &DeviceBuffer2D<f32>, 32 | stream: &Stream, 33 | ) -> Result<()> { 34 | assert_eq!(input.num_channels(), 3, "input image must be in RGB format"); 35 | assert_eq!( 36 | output.num_channels(), 37 | 3, 38 | "output image must be in RGB format" 39 | ); 40 | assert_eq!(map_x.num_channels(), 1, "map must have one channel"); 41 | assert_eq!(map_y.num_channels(), 1, "map must have one channel"); 42 | assert_eq!( 43 | output.width(), 44 | map_x.width(), 45 | "map x must have same width as output image" 46 | ); 47 | assert_eq!( 48 | output.height(), 49 | map_x.height(), 50 | "map x must have same height as output image" 51 | ); 52 | assert_eq!( 53 | output.width(), 54 | map_y.width(), 55 | "map y must have same width as output image" 56 | ); 57 | assert_eq!( 58 | output.height(), 59 | map_y.height(), 60 | "map y must have same height as output image" 61 | ); 62 | 63 | let context = stream.to_context(); 64 | Future::new(move || { 65 | crate::ffi::npp::remap::remap( 66 | input.inner(), 67 | output.inner_mut(), 68 | map_x.inner(), 69 | map_y.inner(), 70 | &context, 71 | ) 72 | }) 73 | .await 74 | } 75 | 76 | #[cfg(test)] 77 | mod tests { 78 | use super::*; 79 | 80 | use crate::memory::DeviceBuffer2D; 81 | use crate::npp::stream::Stream; 82 | use crate::npp::tests::image::*; 83 | use crate::npp::tests::memory::*; 84 | 85 | #[tokio::test] 86 | async fn test_remap() { 87 | const MAP_X: &[f32; 16] = &[ 88 | 0.0, 1.0, 2.0, 3.0, // No mapping at all 89 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 90 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 91 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 92 | ]; 93 | const MAP_Y: &[f32; 16] = &[ 94 | 0.0, 0.0, 0.0, 0.0, // No mapping at all 95 | 1.0, 1.0, 1.0, 1.0, // Take from green band 96 | 1.0, 1.0, 1.0, 1.0, // Take from green band 97 | 2.0, 2.0, 2.0, 2.0, // Take from blue band 98 | ]; 99 | const OUTPUT: Image4x4 = [ 100 | [R, R, R, R], // Red band 101 | [G, G, G, G], // Green band 102 | [G, G, G, G], // Green band 103 | [B, B, B, B], // Blue band 104 | ]; 105 | const OUTPUT_FLAT: [u8; 4 * 4 * 3] = flatten!(OUTPUT, 4 * 4 * 3); 106 | 107 | let stream = Stream::new().await.unwrap(); 108 | 109 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &stream); 110 | let map_x = to_device_2d!(MAP_X, 4, 4, 1, &stream); 111 | let map_y = to_device_2d!(MAP_Y, 4, 4, 1, &stream); 112 | let mut output = DeviceBuffer2D::<u8>::new(4, 4, 3).await; 113 |
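// Per the maps above: row 0 is passed through unchanged, rows 1 and 2 sample the green band while skipping the red border column, and row 3 samples the blue band.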
assert!(remap(&image, &mut output, &map_x, &map_y, &stream) 114 | .await 115 | .is_ok()); 116 | 117 | let output = to_host_2d!(output, &stream); 118 | assert_eq!(&output, &OUTPUT_FLAT); 119 | } 120 | 121 | #[tokio::test] 122 | #[should_panic] 123 | async fn test_it_panics_when_input_num_channels_incorrect() { 124 | let input = DeviceBuffer2D::<u8>::new(100, 100, 2).await; 125 | let map_x = DeviceBuffer2D::<f32>::new(100, 100, 1).await; 126 | let map_y = DeviceBuffer2D::<f32>::new(100, 100, 1).await; 127 | let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 128 | remap(&input, &mut output, &map_x, &map_y, &Stream::null().await) 129 | .await 130 | .unwrap(); 131 | } 132 | 133 | #[tokio::test] 134 | #[should_panic] 135 | async fn test_it_panics_when_output_num_channels_incorrect() { 136 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 137 | let map_x = DeviceBuffer2D::<f32>::new(100, 100, 1).await; 138 | let map_y = DeviceBuffer2D::<f32>::new(100, 100, 1).await; 139 | let mut output = DeviceBuffer2D::<u8>::new(100, 100, 2).await; 140 | remap(&input, &mut output, &map_x, &map_y, &Stream::null().await) 141 | .await 142 | .unwrap(); 143 | } 144 | 145 | #[tokio::test] 146 | #[should_panic] 147 | async fn test_it_panics_when_map_num_channels_incorrect() { 148 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 149 | let map_x = DeviceBuffer2D::<f32>::new(100, 100, 2).await; 150 | let map_y = DeviceBuffer2D::<f32>::new(100, 100, 3).await; 151 | let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 152 | remap(&input, &mut output, &map_x, &map_y, &Stream::null().await) 153 | .await 154 | .unwrap(); 155 | } 156 | 157 | #[tokio::test] 158 | #[should_panic] 159 | async fn test_it_panics_when_map_width_incorrect() { 160 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 161 | let map_x = DeviceBuffer2D::<f32>::new(120, 100, 1).await; 162 | let map_y = DeviceBuffer2D::<f32>::new(120, 100, 1).await; 163 | let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 164 | remap(&input, &mut output, &map_x, &map_y, &Stream::null().await) 165 | .await 166 | .unwrap(); 167 | } 168 | 169 | #[tokio::test] 170 | #[should_panic] 171 | async fn test_it_panics_when_map_height_incorrect() { 172 | let input = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 173 | let map_x = DeviceBuffer2D::<f32>::new(100, 120, 1).await; 174 | let map_y = DeviceBuffer2D::<f32>::new(100, 120, 1).await; 175 | let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await; 176 | remap(&input, &mut output, &map_x, &map_y, &Stream::null().await) 177 | .await 178 | .unwrap(); 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /src/ffi/stream.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::device::DeviceId; 4 | use crate::ffi::device::Device; 5 | use crate::ffi::ptr::DevicePtr; 6 | use crate::ffi::result; 7 | 8 | type Result<T> = std::result::Result<T, crate::error::Error>; 9 | 10 | /// Synchronous implementation of [`crate::Stream`]. 11 | /// 12 | /// Refer to [`crate::Stream`] for documentation. 13 | pub struct Stream { 14 | internal: DevicePtr, 15 | device: DeviceId, 16 | } 17 | 18 | /// Implements [`Send`] for [`Stream`]. 19 | /// 20 | /// # Safety 21 | /// 22 | /// This property is inherited from the CUDA API, which is thread-safe. 23 | unsafe impl Send for Stream {} 24 | 25 | /// Implements [`Sync`] for [`Stream`]. 26 | /// 27 | /// # Safety 28 | /// 29 | /// This property is inherited from the CUDA API, which is thread-safe.
30 | unsafe impl Sync for Stream {} 31 | 32 | impl Stream { 33 | pub fn null() -> Self { 34 | Self { 35 | // SAFETY: This is safe because a null pointer for stream indicates the default 36 | // stream in CUDA and all functions accept this. 37 | internal: unsafe { DevicePtr::null() }, 38 | device: Device::get_or_panic(), 39 | } 40 | } 41 | 42 | pub fn new() -> Result<Self> { 43 | let device = Device::get()?; 44 | let mut ptr: *mut std::ffi::c_void = std::ptr::null_mut(); 45 | let ptr_ptr = std::ptr::addr_of_mut!(ptr); 46 | let ret = cpp!(unsafe [ 47 | ptr_ptr as "void**" 48 | ] -> i32 as "std::int32_t" { 49 | return cudaStreamCreate((cudaStream_t*) ptr_ptr); 50 | }); 51 | result!( 52 | ret, 53 | Stream { 54 | internal: DevicePtr::from_addr(ptr), 55 | device, 56 | } 57 | ) 58 | } 59 | 60 | pub fn synchronize(&self) -> Result<()> { 61 | Device::set(self.device)?; 62 | let ptr = self.internal.as_ptr(); 63 | let ret = cpp!(unsafe [ 64 | ptr as "void*" 65 | ] -> i32 as "std::int32_t" { 66 | return cudaStreamSynchronize((cudaStream_t) ptr); 67 | }); 68 | result!(ret) 69 | } 70 | 71 | pub fn add_callback(&self, f: impl FnOnce() + Send) -> Result<()> { 72 | Device::set(self.device)?; 73 | let ptr = self.internal.as_ptr(); 74 | let f_boxed = Box::new(f) as Box<dyn FnOnce() + Send>; 75 | let f_boxed2 = Box::new(f_boxed); 76 | let f_boxed2_ptr = Box::into_raw(f_boxed2); 77 | let user_data = f_boxed2_ptr as *mut std::ffi::c_void; 78 | let ret = cpp!(unsafe [ 79 | ptr as "void*", 80 | user_data as "void*" 81 | ] -> i32 as "std::int32_t" { 82 | return cudaStreamAddCallback( 83 | (cudaStream_t) ptr, 84 | cuda_ffi_Callback, 85 | user_data, 86 | 0 87 | ); 88 | }); 89 | result!(ret) 90 | } 91 | 92 | /// Get readonly reference to internal [`DevicePtr`]. 93 | #[inline(always)] 94 | pub fn as_internal(&self) -> &DevicePtr { 95 | &self.internal 96 | } 97 | 98 | /// Get mutable reference to internal [`DevicePtr`]. 99 | #[inline(always)] 100 | pub fn as_mut_internal(&mut self) -> &mut DevicePtr { 101 | &mut self.internal 102 | } 103 | 104 | /// Get corresponding device as [`DeviceId`]. 105 | #[inline(always)] 106 | pub fn device(&self) -> DeviceId { 107 | self.device 108 | } 109 | 110 | /// Destroy stream. 111 | /// 112 | /// # Panics 113 | /// 114 | /// This function panics if binding to the corresponding device fails. 115 | /// 116 | /// # Safety 117 | /// 118 | /// The object may not be used after this function is called, except for being dropped. 119 | pub unsafe fn destroy(&mut self) { 120 | if self.internal.is_null() { 121 | return; 122 | } 123 | 124 | Device::set_or_panic(self.device); 125 | 126 | // SAFETY: This will cause `self` to hold a null pointer. It is safe here because we don't 127 | // use the object after this. 128 | let mut internal = unsafe { self.internal.take() }; 129 | let ptr = internal.as_mut_ptr(); 130 | 131 | // SAFETY: We must synchronize the stream before destroying it to make sure we are not 132 | // dropping a stream that still has operations pending. 133 | let _ret = cpp!(unsafe [ 134 | ptr as "void*" 135 | ] -> i32 as "std::int32_t" { 136 | return cudaStreamSynchronize((cudaStream_t) ptr); 137 | }); 138 | 139 | let _ret = cpp!(unsafe [ 140 | ptr as "void*" 141 | ] -> i32 as "std::int32_t" { 142 | return cudaStreamDestroy((cudaStream_t) ptr); 143 | }); 144 | } 145 | } 146 | 147 | impl Drop for Stream { 148 | #[inline] 149 | fn drop(&mut self) { 150 | // SAFETY: This is safe since the object cannot be used after this. 151 | unsafe { 152 | self.destroy(); 153 | } 154 | } 155 | } 156 | 157 | cpp!
{{ 158 | /// Holds the C++ code that makes up the native part required to get our CUDA callback to work 159 | /// over the FFI. 160 | /// 161 | /// # Arguments 162 | /// 163 | /// * `stream` - The CUDA stream on which the callback was scheduled. 164 | /// * `status` - The CUDA status value (this could represent an error from an earlier async CUDA 165 | /// call). 166 | /// * `user_data` - The user data pointer provided when adding the callback. 167 | /// 168 | /// # Example 169 | /// 170 | /// It can be used like so: 171 | /// 172 | /// ```cpp 173 | /// return cudaStreamAddCallback( 174 | /// stream, 175 | /// cuda_ffi_Callback, 176 | /// user_data, 177 | /// 0 178 | /// ); 179 | /// ``` 180 | static void cuda_ffi_Callback( 181 | __attribute__((unused)) cudaStream_t stream, 182 | cudaError_t status, 183 | void* user_data 184 | ) { 185 | rust!(cuda_ffi_Callback_internal [ 186 | status : i32 as "std::int32_t", 187 | user_data : *mut std::ffi::c_void as "void*" 188 | ] { 189 | // SAFETY: We boxed the closure ourselves and did `Box::into_raw`, which allows us to 190 | // reinstate the box here and use it accordingly. It will be dropped here after use. 191 | unsafe { 192 | let user_data = std::mem::transmute(user_data); 193 | let function = Box::<Box<dyn FnOnce() + Send>>::from_raw(user_data); 194 | function() 195 | } 196 | }); 197 | } 198 | }} 199 | 200 | #[cfg(test)] 201 | mod tests { 202 | use super::*; 203 | 204 | #[test] 205 | fn test_new() { 206 | assert!(Stream::new().is_ok()); 207 | } 208 | 209 | #[test] 210 | fn test_synchronize() { 211 | let stream = Stream::new().unwrap(); 212 | assert!(stream.synchronize().is_ok()); 213 | } 214 | 215 | #[test] 216 | fn test_synchronize_null_stream() { 217 | let stream = Stream::null(); 218 | assert!(stream.synchronize().is_ok()); 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/ffi/npp/resize.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::ffi::npp::context::Context; 4 | use crate::ffi::npp::result; 5 | use crate::npp::region::Region; 6 | 7 | type Result<T> = std::result::Result<T, crate::npp::Error>; 8 | 9 | /// Synchronous implementation of [`crate::resize()`]. 10 | /// 11 | /// Refer to [`crate::resize()`] for documentation.
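///
/// # Example
///
/// A minimal sketch of a full-frame synchronous resize on the null stream (hedged: buffer contents are elided; this mirrors the tests below):
///
/// ```ignore
/// let context = Context::from_null_stream();
/// let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(4, 4, 3);
/// let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(2, 2, 3);
/// // Downscale the whole input frame into the whole output frame.
/// resize(&input, Region::Full, &mut output, Region::Full, &context).unwrap();
/// ```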
12 | pub fn resize( 13 | input: &crate::ffi::memory::DeviceBuffer2D<u8>, 14 | input_region: Region, 15 | output: &mut crate::ffi::memory::DeviceBuffer2D<u8>, 16 | output_region: Region, 17 | context: &Context, 18 | ) -> Result<()> { 19 | assert_eq!(input.num_channels, 3, "input image must be in RGB format"); 20 | assert_eq!(output.num_channels, 3, "output image must be in RGB format"); 21 | 22 | let (src_pitch, src_width, src_height) = (input.pitch, input.width as i32, input.height as i32); 23 | let (src_rect_x, src_rect_y, src_rect_width, src_rect_height) = 24 | input_region.resolve_to_xywh(src_width as usize, src_height as usize); 25 | let (src_rect_x, src_rect_y, src_rect_width, src_rect_height) = ( 26 | src_rect_x as i32, 27 | src_rect_y as i32, 28 | src_rect_width as i32, 29 | src_rect_height as i32, 30 | ); 31 | 32 | let (dst_pitch, dst_width, dst_height) = 33 | (output.pitch, output.width as i32, output.height as i32); 34 | let (dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height) = 35 | output_region.resolve_to_xywh(dst_width as usize, dst_height as usize); 36 | let (dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height) = ( 37 | dst_rect_x as i32, 38 | dst_rect_y as i32, 39 | dst_rect_width as i32, 40 | dst_rect_height as i32, 41 | ); 42 | 43 | let src_ptr = input.as_internal().as_ptr(); 44 | let dst_ptr = output.as_mut_internal().as_mut_ptr(); 45 | let context_ptr = context.as_ptr(); 46 | let ret = cpp!(unsafe [ 47 | src_ptr as "const void*", 48 | src_pitch as "std::size_t", 49 | src_width as "std::int32_t", 50 | src_height as "std::int32_t", 51 | src_rect_x as "std::int32_t", 52 | src_rect_y as "std::int32_t", 53 | src_rect_width as "std::int32_t", 54 | src_rect_height as "std::int32_t", 55 | dst_ptr as "void*", 56 | dst_pitch as "std::size_t", 57 | dst_width as "std::int32_t", 58 | dst_height as "std::int32_t", 59 | dst_rect_x as "std::int32_t", 60 | dst_rect_y as "std::int32_t", 61 | dst_rect_width as "std::int32_t", 62 | dst_rect_height as "std::int32_t", 63 | context_ptr as "void*" 64 | ] -> i32 as "std::int32_t" { 65 | NppiSize src_size = { src_width, src_height }; 66 | NppiSize dst_size = { dst_width, dst_height }; 67 | NppiRect src_rect = { src_rect_x, src_rect_y, src_rect_width, src_rect_height }; 68 | NppiRect dst_rect = { dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height }; 69 | return nppiResize_8u_C3R_Ctx( 70 | (const Npp8u*) src_ptr, 71 | src_pitch, 72 | src_size, 73 | src_rect, 74 | (Npp8u*) dst_ptr, 75 | dst_pitch, 76 | dst_size, 77 | dst_rect, 78 | // We use bilinear interpolation, which is the fastest resize method that does not 79 | // produce messed up quality. 80 | NPPI_INTER_LINEAR, 81 | *((NppStreamContext*) context_ptr) 82 | ); 83 | }); 84 | result!(ret) 85 | } 86 | 87 | #[cfg(test)] 88 | mod tests { 89 | use super::*; 90 | 91 | use crate::ffi::npp::context::Context; 92 | use crate::npp::tests::image::*; 93 | use crate::npp::tests::sync::memory::*; 94 | 95 | #[test] 96 | fn test_resize() { 97 | // This is the expected result when resizing the RGB flag to 2 by 2 with bilinear 98 | // interpolation.
99 | const OUTPUT: Image2x2 = [[R, R], [R, B]]; 100 | const OUTPUT_FLAT: [u8; 2 * 2 * 3] = flatten!(OUTPUT, 2 * 2 * 3); 101 | 102 | let context = Context::from_null_stream(); 103 | 104 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &context); 105 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(2, 2, 3); 106 | resize(&image, Region::Full, &mut output, Region::Full, &context).unwrap(); 107 | 108 | let output = to_host_2d!(output, &context); 109 | assert_eq!(&output, &OUTPUT_FLAT); 110 | } 111 | 112 | #[test] 113 | fn test_resize_with_input_region() { 114 | // This is the raw expected result when resizing the center part of the RGB flag from two by 115 | // two to four by four. 116 | #[rustfmt::skip] 117 | #[allow(clippy::zero_prefixed_literal)] 118 | const OUTPUT: [u8; 4 * 4 * 3] = [ 119 | 000, 255, 000, 000, 255, 000, 000, 255, 000, 064, 191, 000, 120 | 000, 191, 064, 000, 191, 064, 000, 191, 064, 064, 143, 048, 121 | 000, 064, 191, 000, 064, 191, 000, 064, 191, 064, 048, 143, 122 | 064, 000, 191, 064, 000, 191, 064, 000, 191, 112, 000, 143, 123 | ]; 124 | 125 | let context = Context::from_null_stream(); 126 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &context); 127 | let center = Region::Rectangle { 128 | x: 1, 129 | y: 1, 130 | width: 2, 131 | height: 2, 132 | }; 133 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(4, 4, 3); 134 | resize(&image, center, &mut output, Region::Full, &context).unwrap(); 135 | 136 | let output = to_host_2d!(output, &context); 137 | assert_eq!(&output, &OUTPUT); 138 | } 139 | 140 | #[test] 141 | fn test_resize_with_output_region() { 142 | #[rustfmt::skip] 143 | const INPUT: [u8; 2 * 2 * 3] = [ 144 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 145 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 146 | ]; 147 | #[rustfmt::skip] 148 | const EXPECTED_OUTPUT: [u8; 2 * 2 * 3] = [ 149 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 150 | 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 151 | ]; 152 | 153 | let context = Context::from_null_stream(); 154 | let bottom_half = Region::Rectangle { 155 | x: 0, 156 | y: 1, 157 | width: 2, 158 | height: 1, 159 | }; 160 | 161 | let image = to_device_2d!(&INPUT, 2, 2, 3, &context); 162 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(2, 2, 3); 163 | output.fill_with_byte(0x00, context.stream.inner()).unwrap(); 164 | resize(&image, Region::Full, &mut output, bottom_half, &context).unwrap(); 165 | 166 | let output = to_host_2d!(output, &context); 167 | assert_eq!(&output, &EXPECTED_OUTPUT); 168 | } 169 | 170 | #[test] 171 | #[should_panic] 172 | fn test_it_panics_when_input_num_channels_incorrect() { 173 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2); 174 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 3); 175 | resize( 176 | &input, 177 | Region::Full, 178 | &mut output, 179 | Region::Full, 180 | &Context::from_null_stream(), 181 | ) 182 | .unwrap(); 183 | } 184 | 185 | #[test] 186 | #[should_panic] 187 | fn test_it_panics_when_output_num_channels_incorrect() { 188 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 189 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 2); 190 | resize( 191 | &input, 192 | Region::Full, 193 | &mut output, 194 | Region::Full, 195 | &Context::from_null_stream(), 196 | ) 197 | .unwrap(); 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /src/ffi/npp/remap.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 |
use crate::ffi::npp::context::Context; 4 | use crate::ffi::npp::result; 5 | 6 | type Result<T> = std::result::Result<T, crate::npp::Error>; 7 | 8 | /// Synchronous implementation of [`crate::remap()`]. 9 | /// 10 | /// Refer to [`crate::remap()`] for documentation. 11 | pub fn remap( 12 | input: &crate::ffi::memory::DeviceBuffer2D<u8>, 13 | output: &mut crate::ffi::memory::DeviceBuffer2D<u8>, 14 | map_x: &crate::ffi::memory::DeviceBuffer2D<f32>, 15 | map_y: &crate::ffi::memory::DeviceBuffer2D<f32>, 16 | context: &Context, 17 | ) -> Result<()> { 18 | assert_eq!(input.num_channels, 3, "input image must be in RGB format"); 19 | assert_eq!(output.num_channels, 3, "output image must be in RGB format"); 20 | assert_eq!(map_x.num_channels, 1, "map must have one channel"); 21 | assert_eq!(map_y.num_channels, 1, "map must have one channel"); 22 | assert_eq!( 23 | output.width, map_x.width, 24 | "map x must have same width as output image" 25 | ); 26 | assert_eq!( 27 | output.height, map_x.height, 28 | "map x must have same height as output image" 29 | ); 30 | assert_eq!( 31 | output.width, map_y.width, 32 | "map y must have same width as output image" 33 | ); 34 | assert_eq!( 35 | output.height, map_y.height, 36 | "map y must have same height as output image" 37 | ); 38 | 39 | let (src_width, src_height, src_pitch) = (input.width as i32, input.height as i32, input.pitch); 40 | let (dst_width, dst_height, dst_pitch) = 41 | (output.width as i32, output.height as i32, output.pitch); 42 | 43 | let map_x_pitch = map_x.pitch; 44 | let map_y_pitch = map_y.pitch; 45 | 46 | let src_ptr = input.as_internal().as_ptr(); 47 | let dst_ptr = output.as_mut_internal().as_mut_ptr(); 48 | let map_x_ptr = map_x.as_internal().as_ptr(); 49 | let map_y_ptr = map_y.as_internal().as_ptr(); 50 | let context_ptr = context.as_ptr(); 51 | let ret = cpp!(unsafe [ 52 | src_ptr as "const std::uint8_t*", 53 | src_width as "std::int32_t", 54 | src_height as "std::int32_t", 55 | src_pitch as "std::size_t", 56 | map_x_ptr as "const float*", 57 | map_x_pitch as "std::size_t", 58 | map_y_ptr as "const float*", 59 | map_y_pitch as "std::size_t", 60 | dst_ptr as "std::uint8_t*", 61 | dst_width as "std::int32_t", 62 | dst_height as "std::int32_t", 63 | dst_pitch as "std::size_t", 64 | context_ptr as "void*" 65 | ] -> i32 as "std::int32_t" { 66 | NppiSize src_size = { src_width, src_height }; 67 | NppiSize dst_size = { dst_width, dst_height }; 68 | NppiRect src_rect = { 0, 0, src_width, src_height }; 69 | return nppiRemap_8u_C3R_Ctx( 70 | (const Npp8u*) src_ptr, 71 | src_size, 72 | src_pitch, 73 | src_rect, 74 | (const Npp32f*) map_x_ptr, 75 | map_x_pitch, 76 | (const Npp32f*) map_y_ptr, 77 | map_y_pitch, 78 | (Npp8u*) dst_ptr, 79 | dst_pitch, 80 | dst_size, 81 | // We use bilinear interpolation, which is the fastest resize method that does not 82 | // produce messed up quality.
83 | NPPI_INTER_LINEAR, 84 | *((NppStreamContext*) context_ptr) 85 | ); 86 | }); 87 | result!(ret) 88 | } 89 | 90 | #[cfg(test)] 91 | mod tests { 92 | use super::*; 93 | 94 | use crate::ffi::npp::context::Context; 95 | use crate::npp::tests::image::*; 96 | use crate::npp::tests::sync::memory::*; 97 | 98 | #[test] 99 | fn test_remap() { 100 | const MAP_X: &[f32; 16] = &[ 101 | 0.0, 1.0, 2.0, 3.0, // No mapping at all 102 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 103 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 104 | 1.0, 1.0, 2.0, 2.0, // Ignore the red border 105 | ]; 106 | const MAP_Y: &[f32; 16] = &[ 107 | 0.0, 0.0, 0.0, 0.0, // No mapping at all 108 | 1.0, 1.0, 1.0, 1.0, // Take from green band 109 | 1.0, 1.0, 1.0, 1.0, // Take from green band 110 | 2.0, 2.0, 2.0, 2.0, // Take from blue band 111 | ]; 112 | const OUTPUT: Image4x4 = [ 113 | [R, R, R, R], // Red band 114 | [G, G, G, G], // Green band 115 | [G, G, G, G], // Green band 116 | [B, B, B, B], // Blue band 117 | ]; 118 | const OUTPUT_FLAT: [u8; 4 * 4 * 3] = flatten!(OUTPUT, 4 * 4 * 3); 119 | 120 | let context = Context::from_null_stream(); 121 | 122 | let image = to_device_2d!(&RGB_FLAG, 4, 4, 3, &context); 123 | let map_x = to_device_2d!(MAP_X, 4, 4, 1, &context); 124 | let map_y = to_device_2d!(MAP_Y, 4, 4, 1, &context); 125 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(4, 4, 3); 126 | assert!(remap(&image, &mut output, &map_x, &map_y, &context).is_ok()); 127 | 128 | let output = to_host_2d!(output, &context); 129 | assert_eq!(&output, &OUTPUT_FLAT); 130 | } 131 | 132 | #[test] 133 | #[should_panic] 134 | fn test_it_panics_when_input_num_channels_incorrect() { 135 | let context = Context::from_null_stream(); 136 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2); 137 | let map_x = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 1); 138 | let map_y = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 1); 139 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 140 | remap(&input, &mut output, &map_x, &map_y, &context).unwrap(); 141 | } 142 | 143 | #[test] 144 | #[should_panic] 145 | fn test_it_panics_when_output_num_channels_incorrect() { 146 | let context = Context::from_null_stream(); 147 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 148 | let map_x = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 1); 149 | let map_y = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 1); 150 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2); 151 | remap(&input, &mut output, &map_x, &map_y, &context).unwrap(); 152 | } 153 | 154 | #[test] 155 | #[should_panic] 156 | fn test_it_panics_when_map_num_channels_incorrect() { 157 | let context = Context::from_null_stream(); 158 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 159 | let map_x = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 2); 160 | let map_y = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 100, 3); 161 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 162 | remap(&input, &mut output, &map_x, &map_y, &context).unwrap(); 163 | } 164 | 165 | #[test] 166 | #[should_panic] 167 | fn test_it_panics_when_map_width_incorrect() { 168 | let context = Context::from_null_stream(); 169 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 170 | let map_x = crate::ffi::memory::DeviceBuffer2D::<f32>::new(120, 100, 1); 171 | let map_y = crate::ffi::memory::DeviceBuffer2D::<f32>::new(120, 100, 1); 172 | let mut output =
crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 173 | remap(&input, &mut output, &map_x, &map_y, &context).unwrap(); 174 | } 175 | 176 | #[test] 177 | #[should_panic] 178 | fn test_it_panics_when_map_height_incorrect() { 179 | let context = Context::from_null_stream(); 180 | let input = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 181 | let map_x = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 120, 1); 182 | let map_y = crate::ffi::memory::DeviceBuffer2D::<f32>::new(100, 120, 1); 183 | let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 3); 184 | remap(&input, &mut output, &map_x, &map_y, &context).unwrap(); 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/ffi/memory/host.rs: -------------------------------------------------------------------------------- 1 | use cpp::cpp; 2 | 3 | use crate::device::DeviceId; 4 | use crate::ffi::device::Device; 5 | use crate::ffi::memory::device::DeviceBuffer; 6 | use crate::ffi::ptr::DevicePtr; 7 | use crate::ffi::result; 8 | use crate::ffi::stream::Stream; 9 | 10 | type Result<T> = std::result::Result<T, crate::error::Error>; 11 | 12 | /// Synchronous implementation of [`crate::HostBuffer`]. 13 | /// 14 | /// Refer to [`crate::HostBuffer`] for documentation. 15 | pub struct HostBuffer<T: Copy> { 16 | pub num_elements: usize, 17 | internal: DevicePtr, 18 | device: DeviceId, 19 | _phantom: std::marker::PhantomData<T>, 20 | } 21 | 22 | /// Implements [`Send`] for [`HostBuffer`]. 23 | /// 24 | /// # Safety 25 | /// 26 | /// This property is inherited from the CUDA API, which is thread-safe. 27 | unsafe impl<T: Copy> Send for HostBuffer<T> {} 28 | 29 | /// Implements [`Sync`] for [`HostBuffer`]. 30 | /// 31 | /// # Safety 32 | /// 33 | /// This property is inherited from the CUDA API, which is thread-safe. 34 | unsafe impl<T: Copy> Sync for HostBuffer<T> {} 35 | 36 | impl<T: Copy> HostBuffer<T> { 37 | pub fn new(num_elements: usize) -> Self { 38 | let device = Device::get_or_panic(); 39 | let mut ptr: *mut std::ffi::c_void = std::ptr::null_mut(); 40 | let ptr_ptr = std::ptr::addr_of_mut!(ptr); 41 | let size = num_elements * std::mem::size_of::<T>(); 42 | let ret = cpp!(unsafe [ 43 | ptr_ptr as "void**", 44 | size as "std::size_t" 45 | ] -> i32 as "std::int32_t" { 46 | return cudaMallocHost(ptr_ptr, size); 47 | }); 48 | match result!(ret, DevicePtr::from_addr(ptr)) { 49 | Ok(internal) => Self { 50 | internal, 51 | device, 52 | num_elements, 53 | _phantom: Default::default(), 54 | }, 55 | Err(err) => { 56 | panic!("failed to allocate host memory: {err}"); 57 | } 58 | } 59 | } 60 | 61 | pub fn from_slice(slice: &[T]) -> Self { 62 | let mut this = Self::new(slice.len()); 63 | this.copy_from_slice(slice); 64 | this 65 | } 66 | 67 | #[cfg(feature = "ndarray")] 68 | pub fn from_array<D: ndarray::Dimension>(array: &ndarray::ArrayView<T, D>) -> Self { 69 | let mut this = Self::new(array.len()); 70 | this.copy_from_array(array); 71 | this 72 | } 73 | 74 | /// Copy from device buffer. 75 | /// 76 | /// # Safety 77 | /// 78 | /// This function is marked unsafe because it does not synchronize and the operation might not 79 | /// have completed when it returns. 80 | #[inline] 81 | pub unsafe fn copy_from_async( 82 | &mut self, 83 | other: &DeviceBuffer<T>, 84 | stream: &Stream, 85 | ) -> Result<()> { 86 | other.copy_to_async(self, stream) 87 | } 88 | 89 | /// Copy to device buffer. 90 | /// 91 | /// # Safety 92 | /// 93 | /// This function is marked unsafe because it does not synchronize and the operation might not 94 | /// have completed when it returns.
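///
/// # Example
///
/// A minimal sketch (hedged: it mirrors the `test_copy` test below; the stream must be synchronized before the destination buffer may be read):
///
/// ```ignore
/// let stream = Stream::new().unwrap();
/// let host_buffer = HostBuffer::from_slice(&[1_u32; 100]);
/// let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream);
/// // SAFETY: the stream is synchronized before the device buffer is used.
/// unsafe { host_buffer.copy_to_async(&mut device_buffer, &stream).unwrap() };
/// stream.synchronize().unwrap();
/// ```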
95 | #[inline] 96 | pub unsafe fn copy_to_async(&self, other: &mut DeviceBuffer<T>, stream: &Stream) -> Result<()> { 97 | other.copy_from_async(self, stream) 98 | } 99 | 100 | pub fn copy_from_slice(&mut self, slice: &[T]) { 101 | // SAFETY: This is safe because we only instantiate the slice temporarily whilst having 102 | // exclusive mutable access to it to copy the data into it. 103 | let target = unsafe { 104 | std::slice::from_raw_parts_mut(self.internal.as_mut_ptr() as *mut T, self.num_elements) 105 | }; 106 | target.copy_from_slice(slice); 107 | } 108 | 109 | #[cfg(feature = "ndarray")] 110 | pub fn copy_from_array<D: ndarray::Dimension>(&mut self, array: &ndarray::ArrayView<T, D>) { 111 | assert!( 112 | array.is_standard_layout(), 113 | "array must be in standard layout" 114 | ); 115 | // SAFETY: This is safe because we only instantiate the slice temporarily whilst having 116 | // exclusive mutable access to it to copy the data into it. 117 | let target = unsafe { 118 | std::slice::from_raw_parts_mut(self.internal.as_mut_ptr() as *mut T, self.num_elements) 119 | }; 120 | target.copy_from_slice(array.as_slice().unwrap()); 121 | } 122 | 123 | #[inline] 124 | pub fn to_vec(&self) -> Vec<T> { 125 | // SAFETY: This is safe because we only instantiate the slice temporarily to copy the data 126 | // to a safe Rust [`Vec`]. 127 | let source = unsafe { 128 | std::slice::from_raw_parts(self.internal.as_ptr() as *const T, self.num_elements) 129 | }; 130 | source.to_vec() 131 | } 132 | 133 | #[cfg(feature = "ndarray")] 134 | pub fn to_array_with_shape<D: ndarray::Dimension>( 135 | &self, 136 | shape: impl Into<ndarray::StrideShape<D>>, 137 | ) -> ndarray::Array<T, D> { 138 | let shape = shape.into(); 139 | assert_eq!( 140 | self.num_elements, 141 | shape.size(), 142 | "provided shape does not match number of elements in buffer" 143 | ); 144 | ndarray::Array::from_shape_vec(shape, self.to_vec()).unwrap() 145 | } 146 | 147 | /// Get readonly reference to internal [`DevicePtr`]. 148 | #[inline(always)] 149 | pub fn as_internal(&self) -> &DevicePtr { 150 | &self.internal 151 | } 152 | 153 | /// Get mutable reference to internal [`DevicePtr`]. 154 | #[inline(always)] 155 | pub fn as_mut_internal(&mut self) -> &mut DevicePtr { 156 | &mut self.internal 157 | } 158 | 159 | /// Release the buffer memory. 160 | /// 161 | /// # Panics 162 | /// 163 | /// This function panics if binding to the corresponding device fails. 164 | /// 165 | /// # Safety 166 | /// 167 | /// The buffer may not be used after this function is called, except for being dropped. 168 | pub unsafe fn free(&mut self) { 169 | if self.internal.is_null() { 170 | return; 171 | } 172 | 173 | Device::set_or_panic(self.device); 174 | 175 | // SAFETY: Safe because we won't use the pointer after this. 176 | let mut internal = unsafe { self.internal.take() }; 177 | let ptr = internal.as_mut_ptr(); 178 | let _ret = cpp!(unsafe [ 179 | ptr as "void*" 180 | ] -> i32 as "std::int32_t" { 181 | return cudaFreeHost(ptr); 182 | }); 183 | } 184 | } 185 | 186 | impl<T: Copy> Drop for HostBuffer<T> { 187 | #[inline] 188 | fn drop(&mut self) { 189 | // SAFETY: This is safe since the buffer cannot be used after this.
190 | unsafe { 191 | self.free(); 192 | } 193 | } 194 | } 195 | 196 | #[cfg(test)] 197 | mod tests { 198 | use super::*; 199 | 200 | #[test] 201 | fn test_new() { 202 | let buffer = HostBuffer::<u32>::new(100); 203 | assert_eq!(buffer.num_elements, 100); 204 | assert_eq!(buffer.to_vec().len(), 100); 205 | } 206 | 207 | #[test] 208 | fn test_from_slice() { 209 | let all_ones = vec![1_u32; 200]; 210 | let buffer = HostBuffer::from_slice(all_ones.as_slice()); 211 | assert_eq!(buffer.num_elements, 200); 212 | let data = buffer.to_vec(); 213 | assert_eq!(data.len(), 200); 214 | assert!(data.into_iter().all(|v| v == 1_u32)); 215 | } 216 | 217 | #[test] 218 | fn test_copy() { 219 | let stream = Stream::new().unwrap(); 220 | let all_ones = vec![1_u32; 100]; 221 | let host_buffer = HostBuffer::from_slice(all_ones.as_slice()); 222 | 223 | let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream); 224 | unsafe { 225 | host_buffer 226 | .copy_to_async(&mut device_buffer, &stream) 227 | .unwrap(); 228 | } 229 | 230 | let mut return_host_buffer = HostBuffer::<u32>::new(100); 231 | unsafe { 232 | return_host_buffer 233 | .copy_from_async(&device_buffer, &stream) 234 | .unwrap(); 235 | } 236 | 237 | stream.synchronize().unwrap(); 238 | 239 | assert_eq!(return_host_buffer.num_elements, 100); 240 | let return_data = return_host_buffer.to_vec(); 241 | assert_eq!(return_data.len(), 100); 242 | assert!(return_data.into_iter().all(|v| v == 1_u32)); 243 | } 244 | 245 | #[test] 246 | #[should_panic] 247 | fn test_it_panics_when_copying_invalid_size() { 248 | let stream = Stream::new().unwrap(); 249 | let host_buffer = HostBuffer::<u32>::new(100); 250 | let mut device_buffer = DeviceBuffer::<u32>::new(101, &Stream::null()); 251 | let _ = unsafe { host_buffer.copy_to_async(&mut device_buffer, &stream) }; 252 | } 253 | } 254 | -------------------------------------------------------------------------------- /src/npp/resize_batch.rs: -------------------------------------------------------------------------------- 1 | use crate::memory::DeviceBuffer2D; 2 | use crate::npp::region::Region; 3 | use crate::npp::stream::Stream; 4 | use crate::runtime::Future; 5 | 6 | type Result<T> = std::result::Result<T, crate::npp::Error>; 7 | 8 | /// Resize a batch of images using bilinear interpolation. This function expects a batch with 9 | /// on-device input and output buffers. 10 | /// 11 | /// This function assumes the following about the input and output images: 12 | /// * Images are in RGB format. 13 | /// * Images are in standard memory order, i.e. HWC. 14 | /// 15 | /// This is the batched version of [`crate::resize()`]. 16 | /// 17 | /// # Stability 18 | /// 19 | /// This function is only available when the `npp-unstable` feature is enabled. Testing shows that 20 | /// the batched version can be imprecise when the input image dimensions are small. 21 | /// 22 | /// Currently identified suspicious behavior: 23 | /// * It does not necessarily produce the same output over a batch of images as would have been 24 | /// produced if the non-batched version of resize were used on each image individually. 25 | /// * When invoking batched resize to resize to the same dimensions as the input, it might not 26 | /// reproduce the input image exactly. 27 | /// 28 | /// # Stream ordered semantics 29 | /// 30 | /// This function uses stream ordered semantics. It can only be guaranteed to complete sequentially 31 | /// relative to operations scheduled on the same stream or the default stream.
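///
/// # Example
///
/// A minimal sketch of batching two resizes (hedged: buffer setup is elided and `input_a`/`output_a` etc. are placeholder names; all inputs must share one size and all outputs another):
///
/// ```ignore
/// let mut pairs = vec![(&input_a, &mut output_a), (&input_b, &mut output_b)];
/// resize_batch(&mut pairs, Region::Full, Region::Full, &stream).await?;
/// ```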
32 | /// 33 | /// # Arguments 34 | /// 35 | /// * `inputs_and_outputs` - The on-device input and output images as batch. 36 | /// * `input_region` - Specify region of interest in input image. This can be used to combine crop 37 | /// and resize in a single operation. 38 | /// * `output_region` - Specify region of interest in output image. 39 | /// * `stream` - Stream to use. 40 | pub async fn resize_batch( 41 | inputs_and_outputs: &mut [(&DeviceBuffer2D<u8>, &mut DeviceBuffer2D<u8>)], 42 | input_region: Region, 43 | output_region: Region, 44 | stream: &Stream, 45 | ) -> Result<()> { 46 | assert!( 47 | !inputs_and_outputs.is_empty(), 48 | "batch must have at least one item" 49 | ); 50 | 51 | let (first_input, first_output) = &inputs_and_outputs[0]; 52 | let first_input_width = first_input.width(); 53 | let first_input_height = first_input.height(); 54 | let first_output_width = first_output.width(); 55 | let first_output_height = first_output.height(); 56 | for (input, output) in inputs_and_outputs.iter() { 57 | assert_eq!( 58 | input.width(), 59 | first_input_width, 60 | "all inputs in batch must have the same width", 61 | ); 62 | assert_eq!( 63 | input.height(), 64 | first_input_height, 65 | "all inputs in batch must have the same height", 66 | ); 67 | assert_eq!( 68 | output.width(), 69 | first_output_width, 70 | "all outputs in batch must have the same width", 71 | ); 72 | assert_eq!( 73 | output.height(), 74 | first_output_height, 75 | "all outputs in batch must have the same height", 76 | ); 77 | assert_eq!( 78 | input.num_channels(), 79 | 3, 80 | "all inputs and outputs must be in RGB format" 81 | ); 82 | assert_eq!( 83 | output.num_channels(), 84 | 3, 85 | "all inputs and outputs must be in RGB format" 86 | ); 87 | } 88 | 89 | let context = stream.to_context(); 90 | Future::new(move || { 91 | let mut inputs_and_outputs_inner = inputs_and_outputs 92 | .iter_mut() 93 | .map(|(input, output)| (input.inner(), output.inner_mut())) 94 | .collect::<Vec<_>>(); 95 | crate::ffi::npp::resize_batch::resize_batch( 96 | inputs_and_outputs_inner.as_mut_slice(), 97 | input_region, 98 | output_region, 99 | &context, 100 | ) 101 | }) 102 | .await 103 | } 104 | 105 | #[cfg(test)] 106 | mod tests { 107 | use super::*; 108 | 109 | use crate::memory::DeviceBuffer2D; 110 | use crate::npp::stream::Stream; 111 | use crate::npp::tests::memory::*; 112 | 113 | use futures::future; 114 | 115 | #[tokio::test] 116 | async fn test_resize_batch() { 117 | #[rustfmt::skip] 118 | const INPUT: [u8; 12] = [ 119 | 10, 10, 10, 20, 20, 20, 120 | 30, 30, 30, 40, 40, 40, 121 | ]; 122 | #[rustfmt::skip] 123 | const EXPECTED_OUTPUT: [u8; 27] = [ 124 | 10, 10, 10, 14, 14, 14, 20, 20, 20, 125 | 18, 18, 18, 23, 23, 23, 28, 28, 28, 126 | 30, 30, 30, 34, 34, 34, 40, 40, 40, 127 | ]; 128 | 129 | let stream = Stream::new().await.unwrap(); 130 | 131 | let mut inputs_and_outputs = future::join_all((0..10).map(|_| async { 132 | let image = to_device_2d!(&INPUT, 2, 2, 3, &stream); 133 | let output = DeviceBuffer2D::<u8>::new(3, 3, 3).await; 134 | (image, output) 135 | })) 136 | .await; 137 | let mut inputs_and_outputs_ref = inputs_and_outputs 138 | .iter_mut() 139 | .map(|(input, output)| (&*input, output)) 140 | .collect::<Vec<_>>(); 141 | resize_batch( 142 | &mut inputs_and_outputs_ref, 143 | Region::Full, 144 | Region::Full, 145 | &stream, 146 | ) 147 | .await 148 | .unwrap(); 149 | 150 | for (_, output) in inputs_and_outputs { 151 | let output = to_host_2d!(output, &stream); 152 | assert_eq!(&output, &EXPECTED_OUTPUT); 153 | } 154 | } 155 | 156 | #[tokio::test] 157 |
158 |         #[rustfmt::skip]
159 |         const INPUT: [u8; 27] = [
160 |             99, 99, 99, 10, 10, 10, 20, 20, 20,
161 |             99, 99, 99, 30, 30, 30, 40, 40, 40,
162 |             99, 99, 99, 99, 99, 99, 99, 99, 99,
163 |         ];
164 |         #[rustfmt::skip]
165 |         const EXPECTED_OUTPUT: [u8; 27] = [
166 |             32, 32, 32, 14, 14, 14, 20, 20, 20,
167 |             39, 39, 39, 23, 23, 23, 28, 28, 28,
168 |             52, 52, 52, 40, 40, 40, 45, 45, 45,
169 |         ];
170 | 
171 |         let stream = Stream::new().await.unwrap();
172 | 
173 |         let center = Region::Rectangle {
174 |             x: 1,
175 |             y: 0,
176 |             width: 2,
177 |             height: 2,
178 |         };
179 | 
180 |         let mut inputs_and_outputs = future::join_all((0..10).map(|_| async {
181 |             let image = to_device_2d!(&INPUT, 3, 3, 3, &stream);
182 |             let output = DeviceBuffer2D::<u8>::new(3, 3, 3).await;
183 |             (image, output)
184 |         }))
185 |         .await;
186 |         let mut inputs_and_outputs_ref = inputs_and_outputs
187 |             .iter_mut()
188 |             .map(|(input, output)| (&*input, output))
189 |             .collect::<Vec<_>>();
190 |         resize_batch(&mut inputs_and_outputs_ref, center, Region::Full, &stream)
191 |             .await
192 |             .unwrap();
193 | 
194 |         for (_, output) in inputs_and_outputs {
195 |             let output = to_host_2d!(output, &stream);
196 |             assert_eq!(&output, &EXPECTED_OUTPUT);
197 |         }
198 |     }
199 | 
200 |     #[tokio::test]
201 |     async fn test_resize_batch_with_output_region() {
202 |         #[rustfmt::skip]
203 |         const INPUT: [u8; 2 * 2 * 3] = [
204 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
205 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
206 |         ];
207 |         #[rustfmt::skip]
208 |         const EXPECTED_OUTPUT: [u8; 2 * 2 * 3] = [
209 |             0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
210 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
211 |         ];
212 | 
213 |         let stream = Stream::new().await.unwrap();
214 |         let bottom_half = Region::Rectangle {
215 |             x: 0,
216 |             y: 1,
217 |             width: 2,
218 |             height: 1,
219 |         };
220 | 
221 |         let mut inputs_and_outputs = future::join_all((0..10).map(|_| async {
222 |             let image = to_device_2d!(&INPUT, 2, 2, 3, &stream);
223 |             let mut output = DeviceBuffer2D::<u8>::new(2, 2, 3).await;
224 |             output.fill_with_byte(0x00, &stream).await.unwrap();
225 |             (image, output)
226 |         }))
227 |         .await;
228 |         let mut inputs_and_outputs_ref = inputs_and_outputs
229 |             .iter_mut()
230 |             .map(|(input, output)| (&*input, output))
231 |             .collect::<Vec<_>>();
232 |         resize_batch(
233 |             &mut inputs_and_outputs_ref,
234 |             Region::Full,
235 |             bottom_half,
236 |             &stream,
237 |         )
238 |         .await
239 |         .unwrap();
240 | 
241 |         for (_, output) in inputs_and_outputs {
242 |             let output = to_host_2d!(output, &stream);
243 |             assert_eq!(&output, &EXPECTED_OUTPUT);
244 |         }
245 |     }
246 | 
247 |     #[tokio::test]
248 |     #[should_panic]
249 |     async fn test_it_panics_when_input_num_channels_incorrect() {
250 |         let mut inputs_and_outputs = vec![
251 |             (
252 |                 DeviceBuffer2D::<u8>::new(100, 100, 2).await,
253 |                 DeviceBuffer2D::<u8>::new(200, 200, 2).await,
254 |             ),
255 |             (
256 |                 DeviceBuffer2D::<u8>::new(100, 100, 2).await,
257 |                 DeviceBuffer2D::<u8>::new(200, 200, 2).await,
258 |             ),
259 |         ];
260 |         let mut inputs_and_outputs_ref = inputs_and_outputs
261 |             .iter_mut()
262 |             .map(|(input, output)| (&*input, output))
263 |             .collect::<Vec<_>>();
264 |         resize_batch(
265 |             &mut inputs_and_outputs_ref,
266 |             Region::Full,
267 |             Region::Full,
268 |             &Stream::null().await,
269 |         )
270 |         .await
271 |         .unwrap();
272 |     }
273 | }
274 | 
--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 |                              Apache License
2 |                        Version 2.0, January 2004
3 |
http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
--------------------------------------------------------------------------------
/src/ffi/memory/device.rs:
--------------------------------------------------------------------------------
1 | use cpp::cpp;
2 | 
3 | use crate::device::DeviceId;
4 | use crate::ffi::device::Device;
5 | use crate::ffi::memory::host::HostBuffer;
6 | use crate::ffi::ptr::DevicePtr;
7 | use crate::ffi::result;
8 | use crate::ffi::stream::Stream;
9 | 
10 | type Result<T> = std::result::Result<T, crate::error::Error>;
11 | 
12 | /// Synchronous implementation of [`crate::DeviceBuffer`].
13 | ///
14 | /// Refer to [`crate::DeviceBuffer`] for documentation.
15 | pub struct DeviceBuffer<T: Copy> {
16 |     pub num_elements: usize,
17 |     internal: DevicePtr,
18 |     device: DeviceId,
19 |     _phantom: std::marker::PhantomData<T>,
20 | }
21 | 
22 | /// Implements [`Send`] for [`DeviceBuffer`].
23 | ///
24 | /// # Safety
25 | ///
26 | /// This property is inherited from the CUDA API, which is thread-safe.
27 | unsafe impl<T: Copy> Send for DeviceBuffer<T> {}
28 | 
29 | /// Implements [`Sync`] for [`DeviceBuffer`].
30 | ///
31 | /// # Safety
32 | ///
33 | /// This property is inherited from the CUDA API, which is thread-safe.
34 | unsafe impl<T: Copy> Sync for DeviceBuffer<T> {}
35 | 
36 | impl<T: Copy> DeviceBuffer<T> {
37 |     pub fn new(num_elements: usize, stream: &Stream) -> Self {
38 |         let device = Device::get_or_panic();
39 |         let mut ptr: *mut std::ffi::c_void = std::ptr::null_mut();
40 |         let ptr_ptr = std::ptr::addr_of_mut!(ptr);
41 |         let size = num_elements * std::mem::size_of::<T>();
42 |         let stream_ptr = stream.as_internal().as_ptr();
43 |         let ret = cpp!(unsafe [
44 |             ptr_ptr as "void**",
45 |             size as "std::size_t",
46 |             stream_ptr as "const void*"
47 |         ] -> i32 as "std::int32_t" {
48 |             return cudaMallocAsync(ptr_ptr, size, (cudaStream_t) stream_ptr);
49 |         });
50 |         match result!(ret, DevicePtr::from_addr(ptr)) {
51 |             Ok(internal) => Self {
52 |                 internal,
53 |                 device,
54 |                 num_elements,
55 |                 _phantom: Default::default(),
56 |             },
57 |             Err(err) => {
58 |                 panic!("failed to allocate device memory: {err}");
59 |             }
60 |         }
61 |     }
62 | 
63 |     pub fn from_slice(slice: &[T], stream: &Stream) -> Result<Self> {
64 |         let host_buffer = HostBuffer::from_slice(slice);
65 |         let mut this = Self::new(slice.len(), stream);
66 |         // SAFETY: Safe because the stream is synchronized after this.
67 |         unsafe {
68 |             this.copy_from_async(&host_buffer, stream)?;
69 |         }
70 |         stream.synchronize()?;
71 |         Ok(this)
72 |     }
73 | 
74 |     #[cfg(feature = "ndarray")]
75 |     pub fn from_array<D: ndarray::Dimension>(
76 |         array: &ndarray::ArrayView<T, D>,
77 |         stream: &Stream,
78 |     ) -> Result<Self> {
79 |         let host_buffer = HostBuffer::from_array(array);
80 |         let mut this = Self::new(array.len(), stream);
81 |         // SAFETY: Safe because the stream is synchronized after this.
82 |         unsafe {
83 |             this.copy_from_async(&host_buffer, stream)?;
84 |         }
85 |         stream.synchronize()?;
86 |         Ok(this)
87 |     }
88 | 
89 |     /// Copy from host buffer.
90 |     ///
91 |     /// # Safety
92 |     ///
93 |     /// This function is marked unsafe because it does not synchronize and the operation might not
94 |     /// have completed when it returns.
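    ///
    /// A typical pattern (sketch, not from the original docs; `device_buffer` and `host_buffer`
    /// are hypothetical) is to schedule the copy and then synchronize the stream before the host
    /// buffer is reused or dropped:
    ///
    /// ```ignore
    /// // SAFETY: the stream is synchronized immediately after the copy is scheduled.
    /// unsafe { device_buffer.copy_from_async(&host_buffer, &stream)? };
    /// stream.synchronize()?;
    /// ```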
95 |     pub unsafe fn copy_from_async(&mut self, other: &HostBuffer<T>, stream: &Stream) -> Result<()> {
96 |         assert_eq!(self.num_elements, other.num_elements);
97 |         let ptr_to = self.as_mut_internal().as_mut_ptr();
98 |         let ptr_from = other.as_internal().as_ptr();
99 |         let stream_ptr = stream.as_internal().as_ptr();
100 |         let size = self.num_elements * std::mem::size_of::<T>();
101 |         let ret = cpp!(unsafe [
102 |             ptr_from as "void*",
103 |             ptr_to as "void*",
104 |             size as "std::size_t",
105 |             stream_ptr as "const void*"
106 |         ] -> i32 as "std::int32_t" {
107 |             return cudaMemcpyAsync(
108 |                 ptr_to,
109 |                 ptr_from,
110 |                 size,
111 |                 cudaMemcpyHostToDevice,
112 |                 (cudaStream_t) stream_ptr
113 |             );
114 |         });
115 |         result!(ret)
116 |     }
117 | 
118 |     /// Copy to host buffer.
119 |     ///
120 |     /// # Safety
121 |     ///
122 |     /// This function is marked unsafe because it does not synchronize and the operation might not
123 |     /// have completed when it returns.
124 |     pub unsafe fn copy_to_async(&self, other: &mut HostBuffer<T>, stream: &Stream) -> Result<()> {
125 |         assert_eq!(self.num_elements, other.num_elements);
126 |         let ptr_from = self.as_internal().as_ptr();
127 |         let ptr_to = other.as_mut_internal().as_mut_ptr();
128 |         let size = self.num_elements * std::mem::size_of::<T>();
129 |         let stream_ptr = stream.as_internal().as_ptr();
130 |         let ret = cpp!(unsafe [
131 |             ptr_from as "void*",
132 |             ptr_to as "void*",
133 |             size as "std::size_t",
134 |             stream_ptr as "const void*"
135 |         ] -> i32 as "std::int32_t" {
136 |             return cudaMemcpyAsync(
137 |                 ptr_to,
138 |                 ptr_from,
139 |                 size,
140 |                 cudaMemcpyDeviceToHost,
141 |                 (cudaStream_t) stream_ptr
142 |             );
143 |         });
144 |         result!(ret)
145 |     }
146 | 
147 |     /// Fill buffer with byte value.
148 |     pub fn fill_with_byte(&mut self, value: u8, stream: &Stream) -> Result<()> {
149 |         let ptr = self.as_internal().as_ptr();
150 |         let value = value as std::ffi::c_int;
151 |         let size = self.num_elements * std::mem::size_of::<T>();
152 |         let stream_ptr = stream.as_internal().as_ptr();
153 |         let ret = cpp!(unsafe [
154 |             ptr as "void*",
155 |             value as "int",
156 |             size as "std::size_t",
157 |             stream_ptr as "const void*"
158 |         ] -> i32 as "std::int32_t" {
159 |             return cudaMemsetAsync(
160 |                 ptr,
161 |                 value,
162 |                 size,
163 |                 (cudaStream_t) stream_ptr
164 |             );
165 |         });
166 |         result!(ret)
167 |     }
168 | 
169 |     /// Get readonly reference to internal [`DevicePtr`].
170 |     #[inline(always)]
171 |     pub fn as_internal(&self) -> &DevicePtr {
172 |         &self.internal
173 |     }
174 | 
175 |     /// Get mutable reference to internal [`DevicePtr`].
176 |     #[inline(always)]
177 |     pub fn as_mut_internal(&mut self) -> &mut DevicePtr {
178 |         &mut self.internal
179 |     }
180 | 
181 |     /// Release the buffer memory.
182 |     ///
183 |     /// # Panics
184 |     ///
185 |     /// This function panics if binding to the corresponding device fails.
186 |     ///
187 |     /// # Safety
188 |     ///
189 |     /// The buffer may not be used after this function is called, except for being dropped.
190 |     pub unsafe fn free(&mut self) {
191 |         if self.internal.is_null() {
192 |             return;
193 |         }
194 | 
195 |         Device::set_or_panic(self.device);
196 | 
197 |         // SAFETY: Safe because we won't use pointer after this.
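        // Taking the pointer leaves the internal `DevicePtr` null, so a subsequent call to
        // `free` (including the one from `Drop`) returns early instead of double-freeing.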
198 |         let mut internal = unsafe { self.internal.take() };
199 |         let ptr = internal.as_mut_ptr();
200 |         let _ret = cpp!(unsafe [
201 |             ptr as "void*"
202 |         ] -> i32 as "std::int32_t" {
203 |             return cudaFree(ptr);
204 |         });
205 |     }
206 | }
207 | 
208 | impl<T: Copy> Drop for DeviceBuffer<T> {
209 |     #[inline]
210 |     fn drop(&mut self) {
211 |         // SAFETY: This is safe since the buffer cannot be used after this.
212 |         unsafe {
213 |             self.free();
214 |         }
215 |     }
216 | }
217 | 
218 | #[cfg(test)]
219 | mod tests {
220 |     use super::*;
221 | 
222 |     #[test]
223 |     fn test_new() {
224 |         let buffer = DeviceBuffer::<u32>::new(100, &Stream::null());
225 |         assert_eq!(buffer.num_elements, 100);
226 |     }
227 | 
228 |     #[test]
229 |     fn test_copy() {
230 |         let stream = Stream::new().unwrap();
231 |         let all_ones = vec![1_u32; 100];
232 |         let host_buffer_all_ones = HostBuffer::from_slice(all_ones.as_slice());
233 | 
234 |         let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream);
235 |         unsafe {
236 |             device_buffer
237 |                 .copy_from_async(&host_buffer_all_ones, &stream)
238 |                 .unwrap();
239 |         }
240 | 
241 |         let mut host_buffer = HostBuffer::<u32>::new(100);
242 |         unsafe {
243 |             device_buffer
244 |                 .copy_to_async(&mut host_buffer, &stream)
245 |                 .unwrap();
246 |         }
247 | 
248 |         let mut another_device_buffer = DeviceBuffer::<u32>::new(100, &stream);
249 |         unsafe {
250 |             another_device_buffer
251 |                 .copy_from_async(&host_buffer, &stream)
252 |                 .unwrap();
253 |         }
254 | 
255 |         let mut return_host_buffer = HostBuffer::<u32>::new(100);
256 |         unsafe {
257 |             another_device_buffer
258 |                 .copy_to_async(&mut return_host_buffer, &stream)
259 |                 .unwrap();
260 |         }
261 | 
262 |         stream.synchronize().unwrap();
263 | 
264 |         assert_eq!(return_host_buffer.num_elements, 100);
265 |         let return_data = return_host_buffer.to_vec();
266 |         assert_eq!(return_data.len(), 100);
267 |         assert!(return_data.into_iter().all(|v| v == 1_u32));
268 |     }
269 | 
270 |     #[test]
271 |     fn test_fill_with_byte() {
272 |         let stream = Stream::new().unwrap();
273 |         let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream);
274 |         let mut host_buffer = HostBuffer::<u8>::new(4);
275 |         device_buffer.fill_with_byte(0xab, &stream).unwrap();
276 |         unsafe {
277 |             device_buffer
278 |                 .copy_to_async(&mut host_buffer, &stream)
279 |                 .unwrap();
280 |         }
281 |         stream.synchronize().unwrap();
282 |         assert_eq!(host_buffer.to_vec(), &[0xab, 0xab, 0xab, 0xab]);
283 |     }
284 | 
285 |     #[test]
286 |     #[should_panic]
287 |     fn test_it_panics_when_copying_invalid_size() {
288 |         let stream = Stream::new().unwrap();
289 |         let device_buffer = DeviceBuffer::<u32>::new(101, &stream);
290 |         let mut host_buffer = HostBuffer::<u32>::new(100);
291 |         let _ = unsafe { device_buffer.copy_to_async(&mut host_buffer, &stream) };
292 |     }
293 | }
294 | 
--------------------------------------------------------------------------------
/src/ffi/memory/device2d.rs:
--------------------------------------------------------------------------------
1 | use cpp::cpp;
2 | 
3 | use crate::device::DeviceId;
4 | use crate::ffi::device::Device;
5 | use crate::ffi::memory::host::HostBuffer;
6 | use crate::ffi::ptr::DevicePtr;
7 | use crate::ffi::result;
8 | use crate::ffi::stream::Stream;
9 | 
10 | type Result<T> = std::result::Result<T, crate::error::Error>;
11 | 
12 | /// Synchronous implementation of [`crate::DeviceBuffer2D`].
13 | ///
14 | /// Refer to [`crate::DeviceBuffer2D`] for documentation.
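///
/// Note that the 2D buffer is allocated with `cudaMallocPitch`, so rows may be padded for
/// alignment: `pitch` holds the actual row stride in bytes and may exceed
/// `width * num_channels * size_of::<T>()`.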
15 | pub struct DeviceBuffer2D<T: Copy> {
16 |     pub width: usize,
17 |     pub height: usize,
18 |     pub num_channels: usize,
19 |     pub pitch: usize,
20 |     internal: DevicePtr,
21 |     device: DeviceId,
22 |     _phantom: std::marker::PhantomData<T>,
23 | }
24 | 
25 | /// Implements [`Send`] for [`DeviceBuffer2D`].
26 | ///
27 | /// # Safety
28 | ///
29 | /// This property is inherited from the CUDA API, which is thread-safe.
30 | unsafe impl<T: Copy> Send for DeviceBuffer2D<T> {}
31 | 
32 | /// Implements [`Sync`] for [`DeviceBuffer2D`].
33 | ///
34 | /// # Safety
35 | ///
36 | /// This property is inherited from the CUDA API, which is thread-safe.
37 | unsafe impl<T: Copy> Sync for DeviceBuffer2D<T> {}
38 | 
39 | impl<T: Copy> DeviceBuffer2D<T> {
40 |     pub fn new(width: usize, height: usize, num_channels: usize) -> Self {
41 |         let device = Device::get_or_panic();
42 |         let mut ptr: *mut std::ffi::c_void = std::ptr::null_mut();
43 |         let ptr_ptr = std::ptr::addr_of_mut!(ptr);
44 |         let mut pitch = 0_usize;
45 |         let pitch_ptr = std::ptr::addr_of_mut!(pitch);
46 |         let line_size = width * num_channels * std::mem::size_of::<T>();
47 |         let ret = cpp!(unsafe [
48 |             ptr_ptr as "void**",
49 |             pitch_ptr as "std::size_t*",
50 |             line_size as "std::size_t",
51 |             height as "std::size_t"
52 |         ] -> i32 as "std::int32_t" {
53 |             return cudaMallocPitch(
54 |                 ptr_ptr,
55 |                 pitch_ptr,
56 |                 line_size,
57 |                 height
58 |             );
59 |         });
60 |         match result!(ret, DevicePtr::from_addr(ptr)) {
61 |             Ok(internal) => Self {
62 |                 width,
63 |                 height,
64 |                 num_channels,
65 |                 pitch,
66 |                 internal,
67 |                 device,
68 |                 _phantom: Default::default(),
69 |             },
70 |             Err(err) => {
71 |                 panic!("failed to allocate device memory: {err}");
72 |             }
73 |         }
74 |     }
75 | 
76 |     #[cfg(feature = "ndarray")]
77 |     pub fn from_array(array: &ndarray::ArrayView3<T>, stream: &Stream) -> Result<Self> {
78 |         let host_buffer = HostBuffer::from_array(array);
79 |         let (height, width, num_channels) = array.dim();
80 |         let mut this = Self::new(width, height, num_channels);
81 |         // SAFETY: Safe because the stream is synchronized after this.
82 |         unsafe {
83 |             this.copy_from_async(&host_buffer, stream)?;
84 |         }
85 |         stream.synchronize()?;
86 |         Ok(this)
87 |     }
88 | 
89 |     /// Copy from host buffer.
90 |     ///
91 |     /// # Safety
92 |     ///
93 |     /// This function is marked unsafe because it does not synchronize and the operation might not
94 |     /// have completed when it returns.
95 |     pub unsafe fn copy_from_async(&mut self, other: &HostBuffer<T>, stream: &Stream) -> Result<()> {
96 |         assert_eq!(self.num_elements(), other.num_elements);
97 |         let ptr_from = other.as_internal().as_ptr();
98 |         let ptr_to = self.as_mut_internal().as_mut_ptr();
99 |         let line_size = self.width * self.num_channels * std::mem::size_of::<T>();
100 |         let height = self.height;
101 |         let pitch = self.pitch;
102 |         let stream_ptr = stream.as_internal().as_ptr();
103 |         let ret = cpp!(unsafe [
104 |             ptr_from as "void*",
105 |             ptr_to as "void*",
106 |             pitch as "std::size_t",
107 |             line_size as "std::size_t",
108 |             height as "std::size_t",
109 |             stream_ptr as "const void*"
110 |         ] -> i32 as "std::int32_t" {
111 |             return cudaMemcpy2DAsync(
112 |                 ptr_to,
113 |                 pitch,
114 |                 ptr_from,
115 |                 line_size,
116 |                 line_size,
117 |                 height,
118 |                 cudaMemcpyHostToDevice,
119 |                 (cudaStream_t) stream_ptr
120 |             );
121 |         });
122 |         result!(ret)
123 |     }
124 | 
125 |     /// Copy to host buffer.
126 |     ///
127 |     /// # Safety
128 |     ///
129 |     /// This function is marked unsafe because it does not synchronize and the operation might not
130 |     /// have completed when it returns.
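    ///
    /// Note that the copy un-pads the rows: rows on the device are `pitch` bytes apart, whilst
    /// rows in the destination host buffer are packed contiguously
    /// (`width * num_channels` elements per row).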
131 |     pub unsafe fn copy_to_async(&self, other: &mut HostBuffer<T>, stream: &Stream) -> Result<()> {
132 |         assert_eq!(self.num_elements(), other.num_elements);
133 |         let ptr_from = self.as_internal().as_ptr();
134 |         let ptr_to = other.as_mut_internal().as_mut_ptr();
135 |         let line_size = self.width * self.num_channels * std::mem::size_of::<T>();
136 |         let height = self.height;
137 |         let pitch = self.pitch;
138 |         let stream_ptr = stream.as_internal().as_ptr();
139 |         let ret = cpp!(unsafe [
140 |             ptr_from as "void*",
141 |             ptr_to as "void*",
142 |             pitch as "std::size_t",
143 |             line_size as "std::size_t",
144 |             height as "std::size_t",
145 |             stream_ptr as "const void*"
146 |         ] -> i32 as "std::int32_t" {
147 |             return cudaMemcpy2DAsync(
148 |                 ptr_to,
149 |                 line_size,
150 |                 ptr_from,
151 |                 pitch,
152 |                 line_size,
153 |                 height,
154 |                 cudaMemcpyDeviceToHost,
155 |                 (cudaStream_t) stream_ptr
156 |             );
157 |         });
158 |         result!(ret)
159 |     }
160 | 
161 |     /// Fill buffer with byte value.
162 |     pub fn fill_with_byte(&mut self, value: u8, stream: &Stream) -> Result<()> {
163 |         let ptr = self.as_internal().as_ptr();
164 |         let value = value as std::ffi::c_int;
165 |         let line_size = self.width * self.num_channels * std::mem::size_of::<T>();
166 |         let height = self.height;
167 |         let pitch = self.pitch;
168 |         let stream_ptr = stream.as_internal().as_ptr();
169 |         let ret = cpp!(unsafe [
170 |             ptr as "void*",
171 |             value as "int",
172 |             pitch as "std::size_t",
173 |             line_size as "std::size_t",
174 |             height as "std::size_t",
175 |             stream_ptr as "const void*"
176 |         ] -> i32 as "std::int32_t" {
177 |             return cudaMemset2DAsync(
178 |                 ptr,
179 |                 pitch,
180 |                 value,
181 |                 line_size,
182 |                 height,
183 |                 (cudaStream_t) stream_ptr
184 |             );
185 |         });
186 |         result!(ret)
187 |     }
188 | 
189 |     #[inline(always)]
190 |     pub fn num_elements(&self) -> usize {
191 |         self.width * self.height * self.num_channels
192 |     }
193 | 
194 |     /// Get readonly reference to internal [`DevicePtr`].
195 |     #[inline(always)]
196 |     pub fn as_internal(&self) -> &DevicePtr {
197 |         &self.internal
198 |     }
199 | 
200 |     /// Get mutable reference to internal [`DevicePtr`].
201 |     #[inline(always)]
202 |     pub fn as_mut_internal(&mut self) -> &mut DevicePtr {
203 |         &mut self.internal
204 |     }
205 | 
206 |     /// Release the buffer memory.
207 |     ///
208 |     /// # Panics
209 |     ///
210 |     /// This function panics if binding to the corresponding device fails.
211 |     ///
212 |     /// # Safety
213 |     ///
214 |     /// The buffer may not be used after this function is called, except for being dropped.
215 |     pub unsafe fn free(&mut self) {
216 |         if self.internal.is_null() {
217 |             return;
218 |         }
219 | 
220 |         Device::set_or_panic(self.device);
221 | 
222 |         // SAFETY: Safe because we won't use pointer after this.
223 |         let mut internal = unsafe { self.internal.take() };
224 |         let ptr = internal.as_mut_ptr();
225 |         let _ret = cpp!(unsafe [
226 |             ptr as "void*"
227 |         ] -> i32 as "std::int32_t" {
228 |             return cudaFree(ptr);
229 |         });
230 |     }
231 | }
232 | 
233 | impl<T: Copy> Drop for DeviceBuffer2D<T> {
234 |     #[inline]
235 |     fn drop(&mut self) {
236 |         // SAFETY: This is safe since the buffer cannot be used after this.
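        // `free` is idempotent: it returns early if the internal pointer was already taken.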
237 |         unsafe {
238 |             self.free();
239 |         }
240 |     }
241 | }
242 | 
243 | #[cfg(test)]
244 | mod tests {
245 |     use super::*;
246 | 
247 |     #[test]
248 |     fn test_new() {
249 |         let buffer = DeviceBuffer2D::<u8>::new(120, 80, 3);
250 |         assert_eq!(buffer.width, 120);
251 |         assert_eq!(buffer.height, 80);
252 |         assert_eq!(buffer.num_channels, 3);
253 |         assert_eq!(buffer.num_elements(), 120 * 80 * 3);
254 |         assert!(buffer.pitch >= 360);
255 |     }
256 | 
257 |     #[test]
258 |     fn test_copy() {
259 |         let stream = Stream::new().unwrap();
260 |         let all_ones = vec![1_u32; 150];
261 |         let host_buffer_all_ones = HostBuffer::from_slice(all_ones.as_slice());
262 | 
263 |         let mut device_buffer = DeviceBuffer2D::<u32>::new(10, 5, 3);
264 |         unsafe {
265 |             device_buffer
266 |                 .copy_from_async(&host_buffer_all_ones, &stream)
267 |                 .unwrap();
268 |         }
269 | 
270 |         let mut host_buffer = HostBuffer::<u32>::new(150);
271 |         unsafe {
272 |             device_buffer
273 |                 .copy_to_async(&mut host_buffer, &stream)
274 |                 .unwrap();
275 |         }
276 | 
277 |         let mut another_device_buffer = DeviceBuffer2D::<u32>::new(10, 5, 3);
278 |         unsafe {
279 |             another_device_buffer
280 |                 .copy_from_async(&host_buffer, &stream)
281 |                 .unwrap();
282 |         }
283 | 
284 |         let mut return_host_buffer = HostBuffer::<u32>::new(150);
285 |         unsafe {
286 |             another_device_buffer
287 |                 .copy_to_async(&mut return_host_buffer, &stream)
288 |                 .unwrap();
289 |         }
290 | 
291 |         stream.synchronize().unwrap();
292 | 
293 |         assert_eq!(return_host_buffer.num_elements, 150);
294 |         let return_data = return_host_buffer.to_vec();
295 |         assert_eq!(return_data.len(), 150);
296 |         assert!(return_data.into_iter().all(|v| v == 1_u32));
297 |     }
298 | 
299 |     #[test]
300 |     fn test_copy_2d() {
301 |         let stream = Stream::new().unwrap();
302 |         let image: [u8; 12] = [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4];
303 |         let host_buffer = HostBuffer::from_slice(&image);
304 |         let mut device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3);
305 |         unsafe {
306 |             device_buffer
307 |                 .copy_from_async(&host_buffer, &stream)
308 |                 .unwrap();
309 |         }
310 |         let mut return_host_buffer = HostBuffer::<u8>::new(12);
311 |         unsafe {
312 |             device_buffer
313 |                 .copy_to_async(&mut return_host_buffer, &stream)
314 |                 .unwrap();
315 |         }
316 |         stream.synchronize().unwrap();
317 |         assert_eq!(
318 |             &return_host_buffer.to_vec(),
319 |             &[1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
320 |         );
321 |     }
322 | 
323 |     #[test]
324 |     fn test_fill_with_byte() {
325 |         let stream = Stream::new().unwrap();
326 |         let mut device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3);
327 |         let mut host_buffer = HostBuffer::<u8>::new(2 * 2 * 3);
328 |         device_buffer.fill_with_byte(0xab, &stream).unwrap();
329 |         unsafe {
330 |             device_buffer
331 |                 .copy_to_async(&mut host_buffer, &stream)
332 |                 .unwrap();
333 |         }
334 |         stream.synchronize().unwrap();
335 |         assert_eq!(
336 |             host_buffer.to_vec(),
337 |             &[0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab]
338 |         );
339 |     }
340 | 
341 |     #[test]
342 |     #[should_panic]
343 |     fn test_it_panics_when_copying_invalid_size() {
344 |         let stream = Stream::new().unwrap();
345 |         let device_buffer = DeviceBuffer2D::<u8>::new(5, 5, 3);
346 |         let mut host_buffer = HostBuffer::<u8>::new(80);
347 |         let _ = unsafe { device_buffer.copy_to_async(&mut host_buffer, &stream) };
348 |     }
349 | }
350 | 
--------------------------------------------------------------------------------
/src/memory/host.rs:
--------------------------------------------------------------------------------
1 | use crate::ffi;
2 | use crate::memory::DeviceBuffer;
3 | use crate::runtime::Future;
4 | use crate::stream::Stream;
5 | 
6 | type Result<T> = std::result::Result<T, crate::error::Error>;
7 | 
8 | /// A host buffer.
9 | ///
10 | /// # Performance
11 | ///
12 | /// Host buffers are managed by CUDA and can be used for pinned memory transfer. Pinned memory
13 | /// transfer speeds are usually higher compared to paged memory transfers. Pinned memory buffers are
14 | /// especially important for this crate because the runtime thread must do the least amount of CPU
15 | /// work possible. Paged transfers do require the host to move data into a CUDA managed buffer first
16 | /// (an extra memory copy) whilst pinned transfers do not.
17 | pub struct HostBuffer<T: Copy> {
18 |     inner: ffi::memory::HostBuffer<T>,
19 | }
20 | 
21 | impl<T: Copy> HostBuffer<T> {
22 |     /// Allocates memory on the host. This creates a pinned buffer. Any transfers to and from this
23 |     /// buffer automatically become pinned transfers, and will be much faster.
24 |     ///
25 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g32bd7a39135594788a542ae72217775c)
26 |     ///
27 |     /// # Arguments
28 |     ///
29 |     /// * `num_elements` - Number of elements to allocate.
30 |     pub async fn new(num_elements: usize) -> Self {
31 |         let inner = Future::new(move || ffi::memory::HostBuffer::<T>::new(num_elements)).await;
32 |         Self { inner }
33 |     }
34 | 
35 |     /// Allocates memory on the host and copies the provided data into it.
36 |     ///
37 |     /// This creates a pinned buffer. Any transfers to and from this buffer automatically become
38 |     /// pinned transfers, and will be much faster.
39 |     ///
40 |     /// This is a convenience function that allows the caller to quickly put data into a host
41 |     /// buffer. It is roughly similar to `buffer.copy_from_slice(slice)`.
42 |     ///
43 |     /// # Arguments
44 |     ///
45 |     /// * `slice` - Data to copy into the new host buffer.
46 |     pub async fn from_slice(slice: &[T]) -> Self {
47 |         let mut this = Self::new(slice.len()).await;
48 |         this.copy_from_slice(slice);
49 |         this
50 |     }
51 | 
52 |     /// Allocates memory on the host and copies the provided array into it.
53 |     ///
54 |     /// This creates a pinned buffer. Any transfers to and from this buffer automatically become
55 |     /// pinned transfers, and will be much faster.
56 |     ///
57 |     /// This is a convenience function that allows the caller to quickly put data into a host
58 |     /// buffer. It is roughly similar to `buffer.copy_from_array(array)`.
59 |     ///
60 |     /// # Arguments
61 |     ///
62 |     /// * `array` - Array to copy into the new host buffer.
63 |     #[cfg(feature = "ndarray")]
64 |     pub async fn from_array<D: ndarray::Dimension>(array: &ndarray::ArrayView<'_, T, D>) -> Self {
65 |         let mut this = Self::new(array.len()).await;
66 |         this.copy_from_array(array);
67 |         this
68 |     }
69 | 
70 |     /// Copies memory from the provided device buffer to this buffer.
71 |     ///
72 |     /// This function synchronizes the stream implicitly.
73 |     ///
74 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
75 |     ///
76 |     /// # Pinned transfer
77 |     ///
78 |     /// This function is guaranteed to produce a pinned transfer on the runtime thread.
79 |     ///
80 |     /// # Stream ordered semantics
81 |     ///
82 |     /// This function uses stream ordered semantics. It can only be guaranteed to complete
83 |     /// sequentially relative to operations scheduled on the same stream or the default stream.
84 |     ///
85 |     /// # Arguments
86 |     ///
87 |     /// * `other` - Device buffer to copy from.
88 |     /// * `stream` - Stream to use.
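    ///
    /// # Example
    ///
    /// A round-trip sketch (hypothetical setup, not part of the original docs):
    ///
    /// ```no_run
    /// # use async_cuda::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let device_buffer = DeviceBuffer::<u8>::new(100, &stream).await;
    /// let mut host_buffer = HostBuffer::<u8>::new(100).await;
    /// host_buffer.copy_from(&device_buffer, &stream).await.unwrap();
    /// # })
    /// ```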
89 |     #[inline(always)]
90 |     pub async fn copy_from(&mut self, other: &DeviceBuffer<T>, stream: &Stream) -> Result<()> {
91 |         other.copy_to(self, stream).await
92 |     }
93 | 
94 |     /// Copies memory from the provided device buffer to this buffer.
95 |     ///
96 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
97 |     ///
98 |     /// # Pinned transfer
99 |     ///
100 |     /// This function is guaranteed to produce a pinned transfer on the runtime thread.
101 |     ///
102 |     /// # Stream ordered semantics
103 |     ///
104 |     /// This function uses stream ordered semantics. It can only be guaranteed to complete
105 |     /// sequentially relative to operations scheduled on the same stream or the default stream.
106 |     ///
107 |     /// # Safety
108 |     ///
109 |     /// This function is unsafe because the operation might not have completed when the function
110 |     /// returns, and thus the state of the buffer is undefined.
111 |     ///
112 |     /// # Arguments
113 |     ///
114 |     /// * `other` - Device buffer to copy from.
115 |     /// * `stream` - Stream to use.
116 |     #[inline(always)]
117 |     pub async unsafe fn copy_from_async(
118 |         &mut self,
119 |         other: &DeviceBuffer<T>,
120 |         stream: &Stream,
121 |     ) -> Result<()> {
122 |         other.copy_to_async(self, stream).await
123 |     }
124 | 
125 |     /// Copies memory from this buffer to the provided device buffer.
126 |     ///
127 |     /// This function synchronizes the stream implicitly.
128 |     ///
129 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
130 |     ///
131 |     /// # Pinned transfer
132 |     ///
133 |     /// This function is guaranteed to produce a pinned transfer on the runtime thread.
134 |     ///
135 |     /// # Stream ordered semantics
136 |     ///
137 |     /// This function uses stream ordered semantics. It can only be guaranteed to complete
138 |     /// sequentially relative to operations scheduled on the same stream or the default stream.
139 |     ///
140 |     /// # Arguments
141 |     ///
142 |     /// * `other` - Device buffer to copy to.
143 |     /// * `stream` - Stream to use.
144 |     #[inline(always)]
145 |     pub async fn copy_to(&self, other: &mut DeviceBuffer<T>, stream: &Stream) -> Result<()> {
146 |         other.copy_from(self, stream).await
147 |     }
148 | 
149 |     /// Copies memory from this buffer to the provided device buffer.
150 |     ///
151 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
152 |     ///
153 |     /// # Pinned transfer
154 |     ///
155 |     /// This function is guaranteed to produce a pinned transfer on the runtime thread.
156 |     ///
157 |     /// # Stream ordered semantics
158 |     ///
159 |     /// This function uses stream ordered semantics. It can only be guaranteed to complete
160 |     /// sequentially relative to operations scheduled on the same stream or the default stream.
161 |     ///
162 |     /// # Safety
163 |     ///
164 |     /// This function is unsafe because the operation might not have completed when the function
165 |     /// returns, and thus the state of the buffer is undefined.
166 |     ///
167 |     /// # Arguments
168 |     ///
169 |     /// * `other` - Device buffer to copy to.
170 |     /// * `stream` - Stream to use.
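    ///
    /// A sketch of the intended pattern (assumed setup; not from the original docs): schedule
    /// the copy asynchronously, then synchronize the stream before touching either buffer:
    ///
    /// ```no_run
    /// # use async_cuda::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let host_buffer = HostBuffer::from_slice(&[1_u32; 100]).await;
    /// let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
    /// // SAFETY: the stream is synchronized before the buffers are used again.
    /// unsafe {
    ///     host_buffer
    ///         .copy_to_async(&mut device_buffer, &stream)
    ///         .await
    ///         .unwrap();
    /// }
    /// stream.synchronize().await.unwrap();
    /// # })
    /// ```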
171 |     #[inline(always)]
172 |     pub async unsafe fn copy_to_async(
173 |         &self,
174 |         other: &mut DeviceBuffer<T>,
175 |         stream: &Stream,
176 |     ) -> Result<()> {
177 |         other.copy_from_async(self, stream).await
178 |     }
179 | 
180 |     /// Copy data into the host buffer from a slice.
181 |     ///
182 |     /// # Synchronization safety
183 |     ///
184 |     /// This call is only synchronization-safe if all streams that have previously been used for
185 |     /// copy operations either from or to this host buffer have been synchronized, and no operations
186 |     /// have been scheduled since.
187 |     ///
188 |     /// # Arguments
189 |     ///
190 |     /// * `slice` - Data to copy into the new host buffer.
191 |     ///
192 |     /// # Example
193 |     ///
194 |     /// ```
195 |     /// # use async_cuda::HostBuffer;
196 |     /// # tokio_test::block_on(async {
197 |     /// let mut host_buffer = HostBuffer::<u8>::new(100).await;
198 |     /// let some_data = vec![10; 100];
199 |     /// host_buffer.copy_from_slice(&some_data);
200 |     /// # })
201 |     /// ```
202 |     #[inline(always)]
203 |     pub fn copy_from_slice(&mut self, slice: &[T]) {
204 |         self.inner.copy_from_slice(slice);
205 |     }
206 | 
207 |     /// Copy data into the host buffer from an array.
208 |     ///
209 |     /// # Synchronization safety
210 |     ///
211 |     /// This call is only synchronization-safe if all streams that have previously been used for
212 |     /// copy operations either from or to this host buffer have been synchronized, and no operations
213 |     /// have been scheduled since.
214 |     ///
215 |     /// # Arguments
216 |     ///
217 |     /// * `array` - Array to copy into the new host buffer.
218 |     #[cfg(feature = "ndarray")]
219 |     #[inline(always)]
220 |     pub fn copy_from_array<D: ndarray::Dimension>(&mut self, array: &ndarray::ArrayView<T, D>) {
221 |         self.inner.copy_from_array(array)
222 |     }
223 | 
224 |     /// Copy the data to a [`Vec`] and return it.
225 |     #[inline(always)]
226 |     pub fn to_vec(&self) -> Vec<T> {
227 |         self.inner.to_vec()
228 |     }
229 | 
230 |     /// Copy the data to an [`ndarray::Array`] and return it.
231 |     ///
232 |     /// This function panics if the provided shape does not match the size of the buffer.
233 |     ///
234 |     /// # Arguments
235 |     ///
236 |     /// * `shape` - Shape for array.
237 |     #[cfg(feature = "ndarray")]
238 |     #[inline(always)]
239 |     pub fn to_array_with_shape<D: ndarray::Dimension>(
240 |         &self,
241 |         shape: impl Into<ndarray::StrideShape<D>>,
242 |     ) -> ndarray::Array<T, D> {
243 |         self.inner.to_array_with_shape::<D>(shape)
244 |     }
245 | 
246 |     /// Get number of elements in buffer.
247 |     #[inline(always)]
248 |     pub fn num_elements(&self) -> usize {
249 |         self.inner.num_elements
250 |     }
251 | 
252 |     /// Access the inner synchronous implementation of [`HostBuffer`].
253 |     #[inline(always)]
254 |     pub fn inner(&self) -> &ffi::memory::HostBuffer<T> {
255 |         &self.inner
256 |     }
257 | 
258 |     /// Access the inner synchronous implementation of [`HostBuffer`].
259 |     #[inline(always)]
260 |     pub fn inner_mut(&mut self) -> &mut ffi::memory::HostBuffer<T> {
261 |         &mut self.inner
262 |     }
263 | }
264 | 
265 | #[cfg(test)]
266 | mod tests {
267 |     use super::*;
268 | 
269 |     #[tokio::test]
270 |     async fn test_new() {
271 |         let buffer = HostBuffer::<u32>::new(100).await;
272 |         assert_eq!(buffer.num_elements(), 100);
273 |         assert_eq!(buffer.to_vec().len(), 100);
274 |     }
275 | 
276 |     #[tokio::test]
277 |     async fn test_from_slice() {
278 |         let all_ones = vec![1_u32; 200];
279 |         let buffer = HostBuffer::from_slice(all_ones.as_slice()).await;
280 |         assert_eq!(buffer.num_elements(), 200);
281 |         let data = buffer.to_vec();
282 |         assert_eq!(data.len(), 200);
283 |         assert!(data.into_iter().all(|v| v == 1_u32));
284 |     }
285 | 
286 |     #[tokio::test]
287 |     async fn test_copy() {
288 |         let stream = Stream::new().await.unwrap();
289 |         let all_ones = vec![1_u32; 100];
290 |         let host_buffer = HostBuffer::from_slice(all_ones.as_slice()).await;
291 | 
292 |         let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
293 |         unsafe {
294 |             host_buffer
295 |                 .copy_to_async(&mut device_buffer, &stream)
296 |                 .await
297 |                 .unwrap();
298 |         }
299 | 
300 |         let mut return_host_buffer = HostBuffer::<u32>::new(100).await;
301 |         unsafe {
302 |             return_host_buffer
303 |                 .copy_from_async(&device_buffer, &stream)
304 |                 .await
305 |                 .unwrap();
306 |         }
307 | 
308 |         stream.synchronize().await.unwrap();
309 | 
310 |         assert_eq!(return_host_buffer.num_elements(), 100);
311 |         let return_data = return_host_buffer.to_vec();
312 |         assert_eq!(return_data.len(), 100);
313 |         assert!(return_data.into_iter().all(|v| v == 1_u32));
314 |     }
315 | 
316 |     #[tokio::test]
317 |     #[should_panic]
318 |     async fn test_it_panics_when_copying_invalid_size() {
319 |         let stream = Stream::new().await.unwrap();
320 |         let host_buffer = HostBuffer::<u32>::new(100).await;
321 |         let mut device_buffer = DeviceBuffer::<u32>::new(101, &Stream::null()).await;
322 |         let _ = unsafe { host_buffer.copy_to_async(&mut device_buffer, &stream).await };
323 |     }
324 | }
325 | 
--------------------------------------------------------------------------------
/src/ffi/npp/resize_batch.rs:
--------------------------------------------------------------------------------
1 | use cpp::cpp;
2 | 
3 | use crate::ffi::npp::context::Context;
4 | use crate::ffi::npp::result;
5 | use crate::npp::region::Region;
6 | 
7 | type Result<T> = std::result::Result<T, crate::npp::Error>;
8 | 
9 | /// Synchronous implementation of [`crate::npp::resize_batch()`].
10 | ///
11 | /// Refer to [`crate::npp::resize_batch()`] for documentation.
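///
/// Internally, this marshals the batch into an array of `NppiResizeBatchCXR` descriptors,
/// uploads the descriptor array to the device, and then invokes `nppiResizeBatch_8u_C3R_Ctx`
/// with bilinear interpolation on the stream associated with the given context.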
12 | pub fn resize_batch(
13 |     inputs_and_outputs: &mut [(
14 |         &crate::ffi::memory::DeviceBuffer2D<u8>,
15 |         &mut crate::ffi::memory::DeviceBuffer2D<u8>,
16 |     )],
17 |     input_region: Region,
18 |     output_region: Region,
19 |     context: &Context,
20 | ) -> Result<()> {
21 |     assert!(
22 |         !inputs_and_outputs.is_empty(),
23 |         "batch must have at least one item"
24 |     );
25 | 
26 |     let (first_input, first_output) = &inputs_and_outputs[0];
27 |     let first_input_width = first_input.width;
28 |     let first_input_height = first_input.height;
29 |     let first_output_width = first_output.width;
30 |     let first_output_height = first_output.height;
31 |     for (input, output) in inputs_and_outputs.iter() {
32 |         assert_eq!(
33 |             input.width, first_input_width,
34 |             "all inputs in batch must have the same width",
35 |         );
36 |         assert_eq!(
37 |             input.height, first_input_height,
38 |             "all inputs in batch must have the same height",
39 |         );
40 |         assert_eq!(
41 |             output.width, first_output_width,
42 |             "all outputs in batch must have the same width",
43 |         );
44 |         assert_eq!(
45 |             output.height, first_output_height,
46 |             "all outputs in batch must have the same height",
47 |         );
48 |         assert_eq!(
49 |             input.num_channels, 3,
50 |             "all inputs and outputs must be in RGB format"
51 |         );
52 |         assert_eq!(
53 |             output.num_channels, 3,
54 |             "all inputs and outputs must be in RGB format"
55 |         );
56 |     }
57 | 
58 |     let batch_size = inputs_and_outputs.len();
59 | 
60 |     let (src_width, src_height) = (first_input_width as i32, first_input_height as i32);
61 |     let (src_rect_x, src_rect_y, src_rect_width, src_rect_height) =
62 |         input_region.resolve_to_xywh(src_width as usize, src_height as usize);
63 |     let (src_rect_x, src_rect_y, src_rect_width, src_rect_height) = (
64 |         src_rect_x as i32,
65 |         src_rect_y as i32,
66 |         src_rect_width as i32,
67 |         src_rect_height as i32,
68 |     );
69 | 
70 |     let (dst_width, dst_height) = (first_output_width as i32, first_output_height as i32);
71 |     let (dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height) =
72 |         output_region.resolve_to_xywh(dst_width as usize, dst_height as usize);
73 |     let (dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height) = (
74 |         dst_rect_x as i32,
75 |         dst_rect_y as i32,
76 |         dst_rect_width as i32,
77 |         dst_rect_height as i32,
78 |     );
79 | 
80 |     let srcs = inputs_and_outputs
81 |         .iter()
82 |         // SAFETY: This is safe because we keep the original input and output device buffers around
83 |         // for the duration of this call.
84 |         .map(|(input, _)| input.as_internal().as_ptr())
85 |         .collect::<Vec<_>>();
86 |     let src_pitches = inputs_and_outputs
87 |         .iter()
88 |         .map(|(input, _)| input.pitch)
89 |         .collect::<Vec<_>>();
90 |     let dsts = inputs_and_outputs
91 |         .iter_mut()
92 |         // SAFETY: This is safe because we keep the original input and output device buffers around
93 |         // for the duration of this call.
94 |         .map(|(_, output)| output.as_mut_internal().as_mut_ptr())
95 |         .collect::<Vec<_>>();
96 |     let dst_pitches = inputs_and_outputs
97 |         .iter()
98 |         .map(|(_, output)| output.pitch)
99 |         .collect::<Vec<_>>();
100 | 
101 |     let src_array = srcs.as_ptr();
102 |     let src_pitches_array = src_pitches.as_ptr();
103 |     let dst_array = dsts.as_ptr();
104 |     let dst_pitches_array = dst_pitches.as_ptr();
105 | 
106 |     let context_ptr = context.as_ptr();
107 |     let ret = cpp!(unsafe [
108 |         src_array as "const void* const*",
109 |         src_pitches_array as "const std::size_t*",
110 |         src_width as "std::int32_t",
111 |         src_height as "std::int32_t",
112 |         src_rect_x as "std::int32_t",
113 |         src_rect_y as "std::int32_t",
114 |         src_rect_width as "std::int32_t",
115 |         src_rect_height as "std::int32_t",
116 |         dst_array as "void* const*",
117 |         dst_pitches_array as "const std::size_t*",
118 |         dst_width as "std::int32_t",
119 |         dst_height as "std::int32_t",
120 |         dst_rect_x as "std::int32_t",
121 |         dst_rect_y as "std::int32_t",
122 |         dst_rect_width as "std::int32_t",
123 |         dst_rect_height as "std::int32_t",
124 |         batch_size as "std::size_t",
125 |         context_ptr as "void*"
126 |     ] -> i32 as "std::int32_t" {
127 |         NppStatus ret {};
128 |         cudaError_t ret_cuda {};
129 | 
130 |         NppiSize src_size = { src_width, src_height };
131 |         NppiSize dst_size = { dst_width, dst_height };
132 |         NppiRect src_rect = { src_rect_x, src_rect_y, src_rect_width, src_rect_height };
133 |         NppiRect dst_rect = { dst_rect_x, dst_rect_y, dst_rect_width, dst_rect_height };
134 | 
135 |         NppiResizeBatchCXR* batch_host = new NppiResizeBatchCXR[batch_size];
136 |         for (std::size_t i = 0; i < batch_size; i++) {
137 |             batch_host[i].pSrc = src_array[i];
138 |             batch_host[i].nSrcStep = src_pitches_array[i];
139 |             batch_host[i].pDst = dst_array[i];
140 |             batch_host[i].nDstStep = dst_pitches_array[i];
141 |         }
142 | 
143 |         NppiResizeBatchCXR* batch = nullptr;
144 |         ret_cuda = cudaMallocAsync(
145 |             &batch,
146 |             batch_size * sizeof(NppiResizeBatchCXR),
147 |             ((NppStreamContext*) context_ptr)->hStream
148 |         );
149 |         if (ret_cuda != cudaSuccess)
150 |             goto cleanup;
151 |         ret_cuda = cudaMemcpyAsync(
152 |             batch,
153 |             batch_host,
154 |             batch_size * sizeof(NppiResizeBatchCXR),
155 |             cudaMemcpyHostToDevice,
156 |             ((NppStreamContext*) context_ptr)->hStream
157 |         );
158 |         if (ret_cuda != cudaSuccess)
159 |             goto cleanup;
160 | 
161 |         ret = nppiResizeBatch_8u_C3R_Ctx(
162 |             src_size,
163 |             src_rect,
164 |             dst_size,
165 |             dst_rect,
166 |             // We use bilinear interpolation, which is the fastest resize method that does not
167 |             // visibly degrade image quality.
168 |             NPPI_INTER_LINEAR,
169 |             batch,
170 |             batch_size,
171 |             *((NppStreamContext*) context_ptr)
172 |         );
173 | 
174 |     cleanup:
175 |         if (batch != nullptr)
176 |             cudaFreeAsync(
177 |                 batch,
178 |                 ((NppStreamContext*) context_ptr)->hStream
179 |             );
180 |         if (batch_host != nullptr)
181 |             delete[] batch_host;
182 | 
183 |         return ret;
184 |     });
185 |     result!(ret)
186 | }
187 | 
188 | #[cfg(test)]
189 | mod tests {
190 |     use super::*;
191 | 
192 |     use crate::ffi::npp::context::Context;
193 |     use crate::npp::tests::sync::memory::*;
194 | 
195 |     #[test]
196 |     fn test_resize_batch() {
197 |         #[rustfmt::skip]
198 |         const INPUT: [u8; 12] = [
199 |             10, 10, 10, 20, 20, 20,
200 |             30, 30, 30, 40, 40, 40,
201 |         ];
202 |         #[rustfmt::skip]
203 |         const EXPECTED_OUTPUT: [u8; 27] = [
204 |             10, 10, 10, 14, 14, 14, 20, 20, 20,
205 |             18, 18, 18, 23, 23, 23, 28, 28, 28,
206 |             30, 30, 30, 34, 34, 34, 40, 40, 40,
207 |         ];
208 | 
209 |         let context = Context::from_null_stream();
210 | 
211 |         let mut inputs_and_outputs = (0..10)
212 |             .map(|_| {
213 |                 let image = to_device_2d!(&INPUT, 2, 2, 3, &context);
214 |                 let output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(3, 3, 3);
215 |                 (image, output)
216 |             })
217 |             .collect::<Vec<_>>();
218 |         let mut inputs_and_outputs_ref = inputs_and_outputs
219 |             .iter_mut()
220 |             .map(|(input, output)| (&*input, output))
221 |             .collect::<Vec<_>>();
222 |         resize_batch(
223 |             &mut inputs_and_outputs_ref,
224 |             Region::Full,
225 |             Region::Full,
226 |             &context,
227 |         )
228 |         .unwrap();
229 | 
230 |         for (_, output) in inputs_and_outputs {
231 |             let output = to_host_2d!(output, &context);
232 |             assert_eq!(&output, &EXPECTED_OUTPUT);
233 |         }
234 |     }
235 | 
236 |     #[test]
237 |     fn test_resize_batch_with_input_region() {
238 |         #[rustfmt::skip]
239 |         const INPUT: [u8; 27] = [
240 |             99, 99, 99, 10, 10, 10, 20, 20, 20,
241 |             99, 99, 99, 30, 30, 30, 40, 40, 40,
242 |             99, 99, 99, 99, 99, 99, 99, 99, 99,
243 |         ];
244 |         #[rustfmt::skip]
245 |         const EXPECTED_OUTPUT: [u8; 27] = [
246 |             32, 32, 32, 14, 14, 14, 20, 20, 20,
247 |             39, 39, 39, 23, 23, 23, 28, 28, 28,
248 |             52, 52, 52, 40, 40, 40, 45, 45, 45,
249 |         ];
250 | 
251 |         let context = Context::from_null_stream();
252 |         let center = Region::Rectangle {
253 |             x: 1,
254 |             y: 0,
255 |             width: 2,
256 |             height: 2,
257 |         };
258 |         let mut inputs_and_outputs = (0..10)
259 |             .map(|_| {
260 |                 let image = to_device_2d!(&INPUT, 3, 3, 3, &context);
261 |                 let output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(3, 3, 3);
262 |                 (image, output)
263 |             })
264 |             .collect::<Vec<_>>();
265 |         let mut inputs_and_outputs_ref = inputs_and_outputs
266 |             .iter_mut()
267 |             .map(|(input, output)| (&*input, output))
268 |             .collect::<Vec<_>>();
269 |         resize_batch(&mut inputs_and_outputs_ref, center, Region::Full, &context).unwrap();
270 | 
271 |         for (_, output) in inputs_and_outputs {
272 |             let output = to_host_2d!(output, &context);
273 |             assert_eq!(&output, &EXPECTED_OUTPUT);
274 |         }
275 |     }
276 | 
277 |     #[test]
278 |     fn test_resize_batch_with_output_region() {
279 |         #[rustfmt::skip]
280 |         const INPUT: [u8; 2 * 2 * 3] = [
281 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
282 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
283 |         ];
284 |         #[rustfmt::skip]
285 |         const EXPECTED_OUTPUT: [u8; 2 * 2 * 3] = [
286 |             0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
287 |             0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
288 |         ];
289 | 
290 |         let context = Context::from_null_stream();
291 |         let bottom_half = Region::Rectangle {
292 |             x: 0,
293 |             y: 1,
294 |             width: 2,
295 |             height: 1,
296 |         };
297 | 
298 |         let mut inputs_and_outputs = (0..10)
299 |             .map(|_| {
300 |                 let image = to_device_2d!(&INPUT, 2, 2, 3, &context);
301 |                 let mut output = crate::ffi::memory::DeviceBuffer2D::<u8>::new(2, 2, 3);
302 |                 output.fill_with_byte(0x00, context.stream.inner()).unwrap();
303 |                 (image, output)
304 |             })
305 |             .collect::<Vec<_>>();
306 |         let mut inputs_and_outputs_ref = inputs_and_outputs
307 |             .iter_mut()
308 |             .map(|(input, output)| (&*input, output))
309 |             .collect::<Vec<_>>();
310 |         resize_batch(
311 |             &mut inputs_and_outputs_ref,
312 |             Region::Full,
313 |             bottom_half,
314 |             &context,
315 |         )
316 |         .unwrap();
317 | 
318 |         for (_, output) in inputs_and_outputs {
319 |             let output = to_host_2d!(output, &context);
320 |             assert_eq!(&output, &EXPECTED_OUTPUT);
321 |         }
322 |     }
323 | 
324 |     #[test]
325 |     #[should_panic]
326 |     fn test_it_panics_when_input_num_channels_incorrect() {
327 |         let mut inputs_and_outputs = vec![
328 |             (
329 |                 crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2),
330 |                 crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 2),
331 |             ),
332 |             (
333 |                 crate::ffi::memory::DeviceBuffer2D::<u8>::new(100, 100, 2),
334 |                 crate::ffi::memory::DeviceBuffer2D::<u8>::new(200, 200, 2),
335 |             ),
336 |         ];
337 |         let mut inputs_and_outputs_ref = inputs_and_outputs
338 |             .iter_mut()
339 |             .map(|(input, output)| (&*input, output))
340 |             .collect::<Vec<_>>();
341 |         resize_batch(
342 |             &mut inputs_and_outputs_ref,
343 |             Region::Full,
344 |             Region::Full,
345 |             &Context::from_null_stream(),
346 |         )
347 |         .unwrap();
348 |     }
349 | }
350 | 
--------------------------------------------------------------------------------
/src/memory/device.rs:
--------------------------------------------------------------------------------
1 | use crate::ffi;
2 | use crate::memory::HostBuffer;
3 | use crate::runtime::Future;
4 | use crate::stream::Stream;
5 | 
6 | type Result<T> = std::result::Result<T, crate::error::Error>;
7 | 
8 | /// A buffer on the device.
9 | ///
10 | /// # Example
11 | ///
12 | /// Copying data from a [`HostBuffer`] to a [`DeviceBuffer`]:
13 | ///
14 | /// ```
15 | /// # use async_cuda::{DeviceBuffer, HostBuffer, Stream};
16 | /// # tokio_test::block_on(async {
17 | /// let stream = Stream::new().await.unwrap();
18 | /// let all_ones = vec![1_u8; 100];
19 | /// let host_buffer = HostBuffer::<u8>::from_slice(&all_ones).await;
20 | /// let mut device_buffer = DeviceBuffer::<u8>::new(100, &stream).await;
21 | /// device_buffer.copy_from(&host_buffer, &stream).await.unwrap();
22 | /// # })
23 | /// ```
24 | pub struct DeviceBuffer<T: Copy> {
25 |     inner: ffi::memory::DeviceBuffer<T>,
26 | }
27 | 
28 | impl<T: Copy> DeviceBuffer<T> {
29 |     /// Allocates memory on the device.
30 |     ///
31 |     /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html#group__CUDART__MEMORY__POOLS_1gbbf70065888d61853c047513baa14081)
32 |     ///
33 |     /// # Stream ordered semantics
34 |     ///
35 |     /// This function uses stream ordered semantics. It can only be guaranteed to complete
36 |     /// sequentially relative to operations scheduled on the same stream or the default stream.
37 |     ///
38 |     /// # Arguments
39 |     ///
40 |     /// * `num_elements` - Number of elements to allocate.
41 |     /// * `stream` - Stream to use.
42 |     pub async fn new(num_elements: usize, stream: &Stream) -> Self {
43 |         let inner =
44 |             Future::new(move || ffi::memory::DeviceBuffer::<T>::new(num_elements, stream.inner()))
45 |                 .await;
46 |         Self { inner }
47 |     }
48 | 
49 |     /// Allocate memory on the device, and copy data from host into it.
    pub async fn from_slice(slice: &[T], stream: &Stream) -> Result<Self> {
        let host_buffer = HostBuffer::from_slice(slice).await;
        let mut this = Self::new(slice.len(), stream).await;
        this.copy_from(&host_buffer, stream).await?;
        Ok(this)
    }

    /// Allocates memory on the device, and copies an array from host into it.
    ///
    /// This function creates a temporary [`HostBuffer`], copies the array into it, then finally
    /// copies the data from the host buffer to the [`DeviceBuffer`].
    ///
    /// The given stream is automatically synchronized, since the temporary host buffer might
    /// otherwise be dropped before the copy can complete.
    ///
    /// # Arguments
    ///
    /// * `array` - Array to copy into the buffer.
    /// * `stream` - Stream to use.
    #[cfg(feature = "ndarray")]
    pub async fn from_array<D: ndarray::Dimension>(
        array: &ndarray::ArrayView<'_, T, D>,
        stream: &Stream,
    ) -> Result<Self> {
        let host_buffer = HostBuffer::from_array(array).await;
        let mut this = Self::new(array.len(), stream).await;
        this.copy_from(&host_buffer, stream).await?;
        Ok(this)
    }

    /// Copies memory from the provided pinned host buffer to this buffer.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
    #[inline]
    pub async fn copy_from(&mut self, other: &HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_from_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from the provided pinned host buffer to this buffer.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
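    ///
    /// # Example
    ///
    /// An illustrative sketch, mirroring the pattern in the tests below: note the explicit
    /// synchronization before the host buffer may be dropped or reused.
    ///
    /// ```
    /// # use async_cuda::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let host_buffer = HostBuffer::from_slice(&[1_u8; 100]).await;
    /// let mut device_buffer = DeviceBuffer::<u8>::new(100, &stream).await;
    /// // SAFETY: the stream is synchronized below, before the host buffer is dropped.
    /// unsafe {
    ///     device_buffer
    ///         .copy_from_async(&host_buffer, &stream)
    ///         .await
    ///         .unwrap();
    /// }
    /// stream.synchronize().await.unwrap();
    /// # })
    /// ```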
    pub async unsafe fn copy_from_async(
        &mut self,
        other: &HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_from_async(other.inner(), stream.inner())).await
    }

    /// Copies memory from this buffer to the provided pinned host buffer.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
    #[inline]
    pub async fn copy_to(&self, other: &mut HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_to_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from this buffer to the provided pinned host buffer.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
    pub async unsafe fn copy_to_async(
        &self,
        other: &mut HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_to_async(other.inner_mut(), stream.inner())).await
    }

    /// Fills the entire buffer with the given byte.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g7c9761e21d9f0999fd136c51e7b9b2a0)
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `value` - Byte value to fill buffer with.
    /// * `stream` - Stream to use.
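    ///
    /// # Example
    ///
    /// A small sketch, following the same pattern as the `fill_with_byte` test below: fill a
    /// buffer of four bytes with `0xab`.
    ///
    /// ```
    /// # use async_cuda::{DeviceBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
    /// device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
    /// # })
    /// ```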
    pub async fn fill_with_byte(&mut self, value: u8, stream: &Stream) -> Result<()> {
        Future::new(move || self.inner.fill_with_byte(value, stream.inner())).await
    }

    /// Get number of elements in buffer.
    #[inline(always)]
    pub fn num_elements(&self) -> usize {
        self.inner.num_elements
    }

    /// Access the inner synchronous implementation of [`DeviceBuffer`].
    #[inline(always)]
    pub fn inner(&self) -> &ffi::memory::DeviceBuffer<T> {
        &self.inner
    }

    /// Access the inner synchronous implementation of [`DeviceBuffer`].
    #[inline(always)]
    pub fn inner_mut(&mut self) -> &mut ffi::memory::DeviceBuffer<T> {
        &mut self.inner
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_new() {
        let buffer = DeviceBuffer::<u32>::new(100, &Stream::null()).await;
        assert_eq!(buffer.num_elements(), 100);
    }

    #[tokio::test]
    async fn test_copy() {
        let stream = Stream::new().await.unwrap();
        let all_ones = vec![1_u32; 100];
        let host_buffer_all_ones = HostBuffer::from_slice(all_ones.as_slice()).await;

        let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
        unsafe {
            device_buffer
                .copy_from_async(&host_buffer_all_ones, &stream)
                .await
                .unwrap();
        }

        let mut host_buffer = HostBuffer::<u32>::new(100).await;
        unsafe {
            device_buffer
                .copy_to_async(&mut host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut another_device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
        unsafe {
            another_device_buffer
                .copy_from_async(&host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut return_host_buffer = HostBuffer::<u32>::new(100).await;
        unsafe {
            another_device_buffer
                .copy_to_async(&mut return_host_buffer, &stream)
                .await
                .unwrap();
        }

        stream.synchronize().await.unwrap();

        assert_eq!(return_host_buffer.num_elements(), 100);
        let return_data = return_host_buffer.to_vec();
        assert_eq!(return_data.len(), 100);
        assert!(return_data.into_iter().all(|v| v == 1_u32));
    }

    #[tokio::test]
    async fn test_fill_with_byte() {
        let stream = Stream::new().await.unwrap();
        let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
        let mut host_buffer = HostBuffer::<u8>::new(4).await;
        device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
        device_buffer
            .copy_to(&mut host_buffer, &stream)
            .await
            .unwrap();
        assert_eq!(host_buffer.to_vec(), &[0xab, 0xab, 0xab, 0xab]);
    }

    #[tokio::test]
    #[should_panic]
    async fn test_it_panics_when_copying_invalid_size() {
        let stream = Stream::new().await.unwrap();
        let device_buffer = DeviceBuffer::<u32>::new(101, &stream).await;
        let mut host_buffer = HostBuffer::<u32>::new(100).await;
        let _ = unsafe { device_buffer.copy_to_async(&mut host_buffer, &stream).await };
    }
}
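
// What follows is an editorial sketch, not part of the original test suite: it exercises
// `from_array` under the optional `ndarray` feature, assuming `ndarray` and `tokio` are
// available when that feature and the test profile are enabled.
#[cfg(all(test, feature = "ndarray"))]
mod ndarray_tests {
    use super::*;

    #[tokio::test]
    async fn test_from_array() {
        let stream = Stream::new().await.unwrap();
        // A 10x10 array of ones; `from_array` flattens it into 100 elements on the device.
        let array = ndarray::Array2::from_elem((10, 10), 1_u32);
        let device_buffer = DeviceBuffer::from_array(&array.view(), &stream)
            .await
            .unwrap();
        assert_eq!(device_buffer.num_elements(), 100);
    }
}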
--------------------------------------------------------------------------------
/src/memory/device2d.rs:
--------------------------------------------------------------------------------
use crate::ffi;
use crate::memory::HostBuffer;
use crate::runtime::Future;
use crate::stream::Stream;

type Result<T> = std::result::Result<T, crate::error::Error>;

/// A 2-dimensional buffer on the device.
///
/// # Example
///
/// Copying data from a [`HostBuffer`] to a [`DeviceBuffer2D`]:
///
/// ```
/// # use async_cuda::{DeviceBuffer2D, HostBuffer, Stream};
/// # tokio_test::block_on(async {
/// let stream = Stream::new().await.unwrap();
/// let all_ones = vec![1_u8; 300];
/// let host_buffer = HostBuffer::<u8>::from_slice(&all_ones).await;
/// let mut device_buffer = DeviceBuffer2D::<u8>::new(10, 10, 3).await;
/// device_buffer.copy_from(&host_buffer, &stream).await.unwrap();
/// # })
/// ```
pub struct DeviceBuffer2D<T: Copy> {
    inner: ffi::memory::DeviceBuffer2D<T>,
}

impl<T: Copy> DeviceBuffer2D<T> {
    /// Allocates 2D memory on the device.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g32bd7a39135594788a542ae72217775c)
    ///
    /// # Arguments
    ///
    /// * `width` - Width of 2-dimensional buffer.
    /// * `height` - Height of 2-dimensional buffer.
    /// * `num_channels` - Number of channels per item.
    pub async fn new(width: usize, height: usize, num_channels: usize) -> Self {
        let inner =
            Future::new(move || ffi::memory::DeviceBuffer2D::<T>::new(width, height, num_channels))
                .await;
        Self { inner }
    }

    /// Allocates memory on the device, and copies a 3D array from host into it.
    ///
    /// This function creates a temporary [`HostBuffer`], copies the array into it, then finally
    /// copies the data from the host buffer to the [`DeviceBuffer2D`].
    ///
    /// The given stream is automatically synchronized, since the temporary host buffer might
    /// otherwise be dropped before the copy can complete.
    ///
    /// # Arguments
    ///
    /// * `array` - 3-dimensional array to copy into the buffer. The first and second dimensions
    ///   are equivalent to the height and width of the 2D buffer (respectively), and the third
    ///   dimension is the number of channels.
    /// * `stream` - Stream to use.
    #[cfg(feature = "ndarray")]
    pub async fn from_array(array: &ndarray::ArrayView3<'_, T>, stream: &Stream) -> Result<Self> {
        let host_buffer = HostBuffer::from_array(array).await;
        let (height, width, num_channels) = array.dim();
        let mut this = Self::new(width, height, num_channels).await;
        this.copy_from(&host_buffer, stream).await?;
        Ok(this)
    }

    /// Copies memory from the provided pinned host buffer to this 2D buffer.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// The host buffer must be of the same size. For the 2D buffer, the total number of elements
    /// is `width` times `height` times `num_channels`.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge529b926e8fb574c2666a9a1d58b0dc1)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
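    ///
    /// # Example
    ///
    /// A usage sketch along the lines of the struct-level example: copy a 2x2 RGB image
    /// (12 elements in total) into a 2D device buffer.
    ///
    /// ```
    /// # use async_cuda::{DeviceBuffer2D, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let image = [1_u8, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4];
    /// let host_buffer = HostBuffer::from_slice(&image).await;
    /// let mut device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3).await;
    /// device_buffer.copy_from(&host_buffer, &stream).await.unwrap();
    /// # })
    /// ```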
    #[inline]
    pub async fn copy_from(&mut self, other: &HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_from_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from the provided pinned host buffer to this 2D buffer.
    ///
    /// The host buffer must be of the same size. For the 2D buffer, the total number of elements
    /// is `width` times `height` times `num_channels`.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge529b926e8fb574c2666a9a1d58b0dc1)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
    pub async unsafe fn copy_from_async(
        &mut self,
        other: &HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_from_async(other.inner(), stream.inner())).await
    }

    /// Copies memory from this 2D buffer to the provided pinned host buffer.
    ///
    /// The host buffer must be of the same size. For the 2D buffer, the total number of elements
    /// is `width` times `height` times `num_channels`.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge529b926e8fb574c2666a9a1d58b0dc1)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
    #[inline]
    pub async fn copy_to(&self, other: &mut HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_to_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from this 2D buffer to the provided pinned host buffer.
    ///
    /// The host buffer must be of the same size. For the 2D buffer, the total number of elements
    /// is `width` times `height` times `num_channels`.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge529b926e8fb574c2666a9a1d58b0dc1)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
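    ///
    /// # Example
    ///
    /// An illustrative sketch (the same pattern appears in the tests below): the stream must
    /// be synchronized explicitly before the host buffer is read.
    ///
    /// ```
    /// # use async_cuda::{DeviceBuffer2D, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3).await;
    /// let mut host_buffer = HostBuffer::<u8>::new(2 * 2 * 3).await;
    /// // SAFETY: the stream is synchronized below, before the host buffer is read.
    /// unsafe {
    ///     device_buffer
    ///         .copy_to_async(&mut host_buffer, &stream)
    ///         .await
    ///         .unwrap();
    /// }
    /// stream.synchronize().await.unwrap();
    /// # })
    /// ```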
    pub async unsafe fn copy_to_async(
        &self,
        other: &mut HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_to_async(other.inner_mut(), stream.inner())).await
    }

    /// Fills the entire buffer with the given byte.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g8fdcc53996ff49c570f4b5ead0256ef0)
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `value` - Byte value to fill buffer with.
    /// * `stream` - Stream to use.
    pub async fn fill_with_byte(&mut self, value: u8, stream: &Stream) -> Result<()> {
        Future::new(move || self.inner.fill_with_byte(value, stream.inner())).await
    }

    /// Get 2D buffer width.
    #[inline(always)]
    pub fn width(&self) -> usize {
        self.inner.width
    }

    /// Get 2D buffer height.
    #[inline(always)]
    pub fn height(&self) -> usize {
        self.inner.height
    }

    /// Get 2D buffer number of channels.
    #[inline(always)]
    pub fn num_channels(&self) -> usize {
        self.inner.num_channels
    }

    /// Get the total number of elements in buffer.
    ///
    /// This is equal to: `width` times `height` times `num_channels`.
    #[inline(always)]
    pub fn num_elements(&self) -> usize {
        self.inner.num_elements()
    }

    /// Access the inner synchronous implementation of [`DeviceBuffer2D`].
    #[inline(always)]
    pub fn inner(&self) -> &ffi::memory::DeviceBuffer2D<T> {
        &self.inner
    }

    /// Access the inner synchronous implementation of [`DeviceBuffer2D`].
    #[inline(always)]
    pub fn inner_mut(&mut self) -> &mut ffi::memory::DeviceBuffer2D<T> {
        &mut self.inner
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_new() {
        let buffer = DeviceBuffer2D::<u8>::new(120, 80, 3).await;
        assert_eq!(buffer.width(), 120);
        assert_eq!(buffer.height(), 80);
        assert_eq!(buffer.num_channels(), 3);
        assert_eq!(buffer.num_elements(), 120 * 80 * 3);
        assert!(buffer.inner().pitch >= 360);
    }

    #[tokio::test]
    async fn test_copy() {
        let stream = Stream::new().await.unwrap();
        let all_ones = vec![1_u32; 150];
        let host_buffer_all_ones = HostBuffer::from_slice(all_ones.as_slice()).await;

        let mut device_buffer = DeviceBuffer2D::<u32>::new(10, 5, 3).await;
        unsafe {
            device_buffer
                .copy_from_async(&host_buffer_all_ones, &stream)
                .await
                .unwrap();
        }

        let mut host_buffer = HostBuffer::<u32>::new(150).await;
        unsafe {
            device_buffer
                .copy_to_async(&mut host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut another_device_buffer = DeviceBuffer2D::<u32>::new(10, 5, 3).await;
        unsafe {
            another_device_buffer
                .copy_from_async(&host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut return_host_buffer = HostBuffer::<u32>::new(150).await;
        unsafe {
            another_device_buffer
                .copy_to_async(&mut return_host_buffer, &stream)
                .await
                .unwrap();
        }

        stream.synchronize().await.unwrap();

        assert_eq!(return_host_buffer.num_elements(), 150);
        let return_data = return_host_buffer.to_vec();
        assert_eq!(return_data.len(), 150);
        assert!(return_data.into_iter().all(|v| v == 1_u32));
    }

    #[tokio::test]
    async fn test_copy_2d() {
        let stream = Stream::new().await.unwrap();
        let image: [u8; 12] = [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4];
        let host_buffer = HostBuffer::from_slice(&image).await;
        let mut device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3).await;
        unsafe {
            device_buffer
                .copy_from_async(&host_buffer, &stream)
                .await
                .unwrap();
        }
        let mut return_host_buffer = HostBuffer::<u8>::new(12).await;
        unsafe {
            device_buffer
                .copy_to_async(&mut return_host_buffer, &stream)
                .await
                .unwrap();
        }
        stream.synchronize().await.unwrap();
        assert_eq!(
            &return_host_buffer.to_vec(),
            &[1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
        );
    }

    #[tokio::test]
    async fn test_fill_with_byte() {
        let stream = Stream::new().await.unwrap();
        let mut device_buffer = DeviceBuffer2D::<u8>::new(2, 2, 3).await;
        let mut host_buffer = HostBuffer::<u8>::new(2 * 2 * 3).await;
        device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
        device_buffer
            .copy_to(&mut host_buffer, &stream)
            .await
            .unwrap();
        assert_eq!(
            host_buffer.to_vec(),
            &[0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab, 0xab]
        );
    }

    #[tokio::test]
    #[should_panic]
    async fn test_it_panics_when_copying_invalid_size() {
        let stream = Stream::new().await.unwrap();
        let device_buffer = DeviceBuffer2D::<u8>::new(5, 5, 3).await;
        let mut host_buffer = HostBuffer::<u8>::new(80).await;
        let _ = unsafe { device_buffer.copy_to_async(&mut host_buffer, &stream).await };
    }
}
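
// What follows is an editorial sketch, not part of the original test suite: it exercises
// `from_array` under the optional `ndarray` feature, assuming `ndarray` and `tokio` are
// available when that feature and the test profile are enabled.
#[cfg(all(test, feature = "ndarray"))]
mod ndarray_tests {
    use super::*;

    #[tokio::test]
    async fn test_from_array() {
        let stream = Stream::new().await.unwrap();
        // The first axis maps to height, the second to width, the third to channels.
        let array = ndarray::Array3::from_elem((4, 2, 3), 1_u8);
        let device_buffer = DeviceBuffer2D::from_array(&array.view(), &stream)
            .await
            .unwrap();
        assert_eq!(device_buffer.width(), 2);
        assert_eq!(device_buffer.height(), 4);
        assert_eq!(device_buffer.num_channels(), 3);
    }
}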
--------------------------------------------------------------------------------