├── .cargo
    └── config.toml
├── .devcontainer.json
├── .github
    └── workflows
    │   ├── build_guide.yml
    │   ├── ci_linux.yml
    │   ├── ci_windows.yml
    │   └── container_images.yml
├── .gitignore
├── CODEOWNERS
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
├── container
    ├── rockylinux9-cuda12
    │   └── Dockerfile
    ├── ubuntu22-cuda11
    │   └── Dockerfile
    ├── ubuntu22-cuda12
    │   └── Dockerfile
    └── ubuntu24-cuda12
    │   └── Dockerfile
├── crates
    ├── blastoff
    │   ├── Cargo.toml
    │   └── src
    │   │   ├── context.rs
    │   │   ├── error.rs
    │   │   ├── level1.rs
    │   │   ├── level3.rs
    │   │   ├── lib.rs
    │   │   └── raw
    │   │       ├── level1.rs
    │   │       ├── level3.rs
    │   │       └── mod.rs
    ├── cuda_builder
    │   ├── Cargo.toml
    │   └── src
    │   │   └── lib.rs
    ├── cuda_std
    │   ├── CHANGELOG.md
    │   ├── Cargo.toml
    │   ├── assets
    │   │   └── diagrams_xml
    │   │   │   ├── streams.drawio
    │   │   │   └── thread.drawio
    │   └── src
    │   │   ├── atomic.rs
    │   │   ├── atomic
    │   │       ├── intrinsics.rs
    │   │       └── mid.rs
    │   │   ├── cfg.rs
    │   │   ├── float.rs
    │   │   ├── float_ext.rs
    │   │   ├── intrinsics.rs
    │   │   ├── io.rs
    │   │   ├── lib.rs
    │   │   ├── mem.rs
    │   │   ├── misc.rs
    │   │   ├── ptr.rs
    │   │   ├── rt
    │   │       ├── driver_types_sys.rs
    │   │       ├── error.rs
    │   │       ├── mod.rs
    │   │       └── sys.rs
    │   │   ├── shared.rs
    │   │   ├── thread.rs
    │   │   └── warp.rs
    ├── cuda_std_macros
    │   ├── Cargo.toml
    │   └── src
    │   │   └── lib.rs
    ├── cudnn-sys
    │   ├── Cargo.toml
    │   ├── build
    │   │   ├── cudnn_sdk.rs
    │   │   ├── main.rs
    │   │   └── wrapper.h
    │   └── src
    │   │   └── lib.rs
    ├── cudnn
    │   ├── Cargo.toml
    │   ├── README.md
    │   ├── build.rs
    │   └── src
    │   │   ├── activation
    │   │       ├── activation_descriptor.rs
    │   │       ├── activation_mode.rs
    │   │       └── mod.rs
    │   │   ├── attention
    │   │       ├── attention_descriptor.rs
    │   │       ├── attention_weights_kind.rs
    │   │       ├── mod.rs
    │   │       ├── seq_data_axis.rs
    │   │       └── seq_data_descriptor.rs
    │   │   ├── backend
    │   │       ├── conv_bwd_data.rs
    │   │       ├── conv_bwd_filter.rs
    │   │       ├── conv_cfg.rs
    │   │       ├── conv_fwd.rs
    │   │       ├── descriptor.rs
    │   │       ├── engine.rs
    │   │       ├── engine_cfg.rs
    │   │       ├── engine_heuristic.rs
    │   │       ├── execution_plan.rs
    │   │       ├── graph.rs
    │   │       ├── matmul.rs
    │   │       ├── matmul_cfg.rs
    │   │       ├── mod.rs
    │   │       ├── operation.rs
    │   │       ├── pointwise.rs
    │   │       ├── pointwise_cfg.rs
    │   │       ├── pointwise_mode.rs
    │   │       ├── reduction.rs
    │   │       ├── reduction_cfg.rs
    │   │       ├── reduction_mode.rs
    │   │       └── tensor.rs
    │   │   ├── context.rs
    │   │   ├── convolution
    │   │       ├── convolution_algo.rs
    │   │       ├── convolution_config.rs
    │   │       ├── convolution_descriptor.rs
    │   │       ├── convolution_mode.rs
    │   │       ├── filter_descriptor.rs
    │   │       └── mod.rs
    │   │   ├── data_type.rs
    │   │   ├── determinism.rs
    │   │   ├── dropout
    │   │       ├── dropout_descriptor.rs
    │   │       └── mod.rs
    │   │   ├── error.rs
    │   │   ├── lib.rs
    │   │   ├── math_type.rs
    │   │   ├── nan_propagation.rs
    │   │   ├── op
    │   │       ├── mod.rs
    │   │       ├── op_tensor_descriptor.rs
    │   │       └── op_tensor_op.rs
    │   │   ├── pooling
    │   │       ├── mod.rs
    │   │       ├── pooling_descriptor.rs
    │   │       └── pooling_mode.rs
    │   │   ├── reduction
    │   │       ├── indices_type.rs
    │   │       ├── mod.rs
    │   │       ├── reduce_indices.rs
    │   │       ├── reduce_op.rs
    │   │       └── reduction_descriptor.rs
    │   │   ├── rnn
    │   │       ├── forward_mode.rs
    │   │       ├── mod.rs
    │   │       ├── rnn_algo.rs
    │   │       ├── rnn_bias_mode.rs
    │   │       ├── rnn_clip_mode.rs
    │   │       ├── rnn_data_descriptor.rs
    │   │       ├── rnn_data_layout.rs
    │   │       ├── rnn_descriptor.rs
    │   │       ├── rnn_direction_mode.rs
    │   │       ├── rnn_input_mode.rs
    │   │       └── rnn_mode.rs
    │   │   ├── softmax
    │   │       ├── mod.rs
    │   │       ├── softmax_algo.rs
    │   │       └── softmax_mode.rs
    │   │   ├── tensor
    │   │       ├── mod.rs
    │   │       ├── tensor_descriptor.rs
    │   │       └── tensor_format.rs
    │   │   └── w_grad_mode.rs
    ├── cust
    │   ├── CHANGELOG.md
    │   ├── Cargo.toml
    │   ├── README.md
    │   ├── build.rs
    │   ├── resources
    │   │   ├── add.cu
    │   │   ├── add.cubin
    │   │   ├── add.fatbin
    │   │   └── add.ptx
    │   └── src
    │   │   ├── compile.rs
    │   │   ├── context
    │   │       ├── legacy.rs
    │   │       └── mod.rs
    │   │   ├── device.rs
    │   │   ├── error.rs
    │   │   ├── event.rs
    │   │   ├── external.rs
    │   │   ├── function.rs
    │   │   ├── graph.rs
    │   │   ├── graph_dotfile.rs
    │   │   ├── legacy.rs
    │   │   ├── lib.rs
    │   │   ├── link.rs
    │   │   ├── memory
    │   │       ├── array.rs
    │   │       ├── device
    │   │       │   ├── device_box.rs
    │   │       │   ├── device_buffer.rs
    │   │       │   ├── device_slice.rs
    │   │       │   ├── device_variable.rs
    │   │       │   └── mod.rs
    │   │       ├── locked.rs
    │   │       ├── locked
    │   │       │   ├── locked_box.rs
    │   │       │   └── locked_buffer.rs
    │   │       ├── malloc.rs
    │   │       ├── mod.rs
    │   │       ├── pointer.rs
    │   │       └── unified.rs
    │   │   ├── module.rs
    │   │   ├── nvtx.rs
    │   │   ├── prelude.rs
    │   │   ├── stream.rs
    │   │   ├── surface.rs
    │   │   ├── texture.rs
    │   │   └── util.rs
    ├── cust_core
    │   ├── Cargo.toml
    │   └── src
    │   │   └── lib.rs
    ├── cust_derive
    │   ├── .gitignore
    │   ├── Cargo.toml
    │   ├── README.md
    │   └── src
    │   │   └── lib.rs
    ├── cust_raw
    │   ├── Cargo.toml
    │   ├── build
    │   │   ├── callbacks.rs
    │   │   ├── cublasLt_wrapper.h
    │   │   ├── cublasXt_wrapper.h
    │   │   ├── cublas_wrapper.h
    │   │   ├── cuda_sdk.rs
    │   │   ├── driver_wrapper.h
    │   │   ├── main.rs
    │   │   ├── nvptx_compiler_wrapper.h
    │   │   ├── nvvm_wrapper.h
    │   │   └── runtime_wrapper.h
    │   └── src
    │   │   ├── cublas_sys.rs
    │   │   ├── cublaslt_sys.rs
    │   │   ├── cublasxt_sys.rs
    │   │   ├── driver_sys.rs
    │   │   ├── lib.rs
    │   │   ├── nvptx_compiler_sys.rs
    │   │   ├── nvvm_sys.rs
    │   │   └── runtime_sys.rs
    ├── gpu_rand
    │   ├── Cargo.toml
    │   ├── LICENSE-RAND
    │   ├── README.md
    │   └── src
    │   │   ├── default.rs
    │   │   ├── gpurng.rs
    │   │   ├── lib.rs
    │   │   └── xoroshiro
    │   │       ├── common.rs
    │   │       ├── mod.rs
    │   │       ├── splitmix64.rs
    │   │       ├── xoroshiro128plus.rs
    │   │       ├── xoroshiro128plusplus.rs
    │   │       ├── xoroshiro128starstar.rs
    │   │       ├── xoroshiro64star.rs
    │   │       ├── xoroshiro64starstar.rs
    │   │       ├── xoshiro128plus.rs
    │   │       ├── xoshiro128plusplus.rs
    │   │       ├── xoshiro128starstar.rs
    │   │       ├── xoshiro256plus.rs
    │   │       ├── xoshiro256plusplus.rs
    │   │       ├── xoshiro256starstar.rs
    │   │       ├── xoshiro512plus.rs
    │   │       ├── xoshiro512plusplus.rs
    │   │       └── xoshiro512starstar.rs
    ├── nvvm
    │   ├── Cargo.toml
    │   └── src
    │   │   └── lib.rs
    ├── optix-sys
    │   ├── Cargo.toml
    │   ├── build
    │   │   ├── main.rs
    │   │   ├── optix_sdk.rs
    │   │   ├── optix_stubs.c
    │   │   └── wrapper.h
    │   └── src
    │   │   ├── lib.rs
    │   │   ├── optix_sys.rs
    │   │   └── stub.rs
    ├── optix
    │   ├── Cargo.toml
    │   ├── build.rs
    │   ├── examples
    │   │   ├── common
    │   │   │   └── gdt
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   ├── cmake
    │   │   │   │       ├── FindOptiX.cmake
    │   │   │   │       ├── FindTBB.cmake
    │   │   │   │       ├── configure_build_type.cmake
    │   │   │   │       ├── configure_glut.cmake
    │   │   │   │       ├── configure_optix.cmake
    │   │   │   │       └── configure_tbb.cmake
    │   │   │   │   └── gdt
    │   │   │   │       ├── gdt.cpp
    │   │   │   │       ├── gdt.h
    │   │   │   │       ├── math
    │   │   │   │           ├── AffineSpace.h
    │   │   │   │           ├── LinearSpace.h
    │   │   │   │           ├── Quaternion.h
    │   │   │   │           ├── box.h
    │   │   │   │           ├── constants.h
    │   │   │   │           ├── fixedpoint.h
    │   │   │   │           ├── vec.h
    │   │   │   │           └── vec
    │   │   │   │           │   ├── compare.h
    │   │   │   │           │   ├── functors.h
    │   │   │   │           │   └── rotate.h
    │   │   │   │       └── random
    │   │   │   │           └── random.h
    │   │   ├── ex02_pipeline
    │   │   │   ├── Cargo.toml
    │   │   │   ├── build.rs
    │   │   │   ├── device
    │   │   │   │   ├── Cargo.toml
    │   │   │   │   └── src
    │   │   │   │   │   └── lib.rs
    │   │   │   └── src
    │   │   │   │   ├── ex02_pipeline.cu
    │   │   │   │   ├── launch_params.h
    │   │   │   │   ├── main.rs
    │   │   │   │   └── renderer.rs
    │   │   ├── ex03_window
    │   │   │   ├── Cargo.toml
    │   │   │   ├── build.rs
    │   │   │   └── src
    │   │   │   │   ├── ex03_window.cu
    │   │   │   │   ├── gl_util.rs
    │   │   │   │   ├── main.rs
    │   │   │   │   ├── renderer.rs
    │   │   │   │   └── vector.rs
    │   │   ├── ex04_mesh
    │   │   │   ├── Cargo.toml
    │   │   │   ├── build.rs
    │   │   │   └── src
    │   │   │   │   ├── gl_util.rs
    │   │   │   │   ├── main.rs
    │   │   │   │   ├── renderer.rs
    │   │   │   │   └── vector.rs
    │   │   ├── resources
    │   │   │   └── .gitignore
    │   │   └── rust
    │   │   │   └── ex04_mesh_gpu
    │   │   │       ├── Cargo.toml
    │   │   │       └── src
    │   │   │           └── lib.rs
    │   ├── images
    │   │   ├── example_sbt.jpg
    │   │   ├── example_sbt.png
    │   │   ├── optix_programs.jpg
    │   │   ├── scene_graph.jpg
    │   │   ├── scene_graph.png
    │   │   └── traversables_graph.jpg
    │   └── src
    │   │   ├── acceleration.md
    │   │   ├── acceleration.rs
    │   │   ├── context.md
    │   │   ├── context.rs
    │   │   ├── denoiser.md
    │   │   ├── denoiser.rs
    │   │   ├── error.rs
    │   │   ├── impl_glam.rs
    │   │   ├── introduction.md
    │   │   ├── lib.rs
    │   │   ├── pipeline.md
    │   │   ├── pipeline.rs
    │   │   ├── prelude.rs
    │   │   ├── shader_binding_table.md
    │   │   └── shader_binding_table.rs
    ├── optix_device
    │   ├── Cargo.toml
    │   └── src
    │   │   ├── hit.rs
    │   │   ├── intersect.rs
    │   │   ├── lib.rs
    │   │   ├── misc.rs
    │   │   ├── payload.rs
    │   │   ├── ray.rs
    │   │   ├── sys.rs
    │   │   ├── trace.rs
    │   │   ├── transform.rs
    │   │   └── util.rs
    ├── optix_device_macros
    │   ├── Cargo.toml
    │   └── src
    │   │   └── lib.rs
    ├── ptx
    │   ├── Cargo.toml
    │   └── src
    │   │   ├── lexer.rs
    │   │   ├── lexer_tests.rs
    │   │   ├── lib.rs
    │   │   ├── parser.rs
    │   │   ├── parser
    │   │       ├── directive.rs
    │   │       └── mod.rs
    │   │   └── types.rs
    ├── ptx_compiler
    │   ├── Cargo.toml
    │   └── src
    │   │   └── lib.rs
    ├── rustc_codegen_nvvm
    │   ├── CHANGELOG.md
    │   ├── Cargo.toml
    │   ├── build.rs
    │   ├── libintrinsics.bc
    │   ├── libintrinsics.ll
    │   ├── rustc_llvm_wrapper
    │   │   ├── .editorconfig
    │   │   ├── PassWrapper.cpp
    │   │   ├── RustWrapper.cpp
    │   │   └── rustllvm.h
    │   └── src
    │   │   ├── abi.rs
    │   │   ├── allocator.rs
    │   │   ├── asm.rs
    │   │   ├── attributes.rs
    │   │   ├── back.rs
    │   │   ├── builder.rs
    │   │   ├── common.rs
    │   │   ├── const_ty.rs
    │   │   ├── consts.rs
    │   │   ├── context.rs
    │   │   ├── ctx_intrinsics.rs
    │   │   ├── debug_info
    │   │       ├── create_scope_map.rs
    │   │       ├── dwarf_const.rs
    │   │       ├── metadata.rs
    │   │       ├── metadata
    │   │       │   ├── enums.rs
    │   │       │   └── type_map.rs
    │   │       ├── mod.rs
    │   │       ├── namespace.rs
    │   │       └── util.rs
    │   │   ├── init.rs
    │   │   ├── int_replace.rs
    │   │   ├── intrinsic.rs
    │   │   ├── lib.rs
    │   │   ├── link.rs
    │   │   ├── llvm.rs
    │   │   ├── lto.rs
    │   │   ├── mono_item.rs
    │   │   ├── nvvm.rs
    │   │   ├── override_fns.rs
    │   │   ├── ptxgen.rs
    │   │   ├── target.rs
    │   │   └── ty.rs
    └── rustc_codegen_nvvm_macros
    │   ├── Cargo.toml
    │   └── src
    │       └── lib.rs
├── examples
    ├── cuda
    │   ├── README.md
    │   ├── assets
    │   │   └── path_tracer.png
    │   ├── gemm
    │   │   ├── Cargo.toml
    │   │   ├── build.rs
    │   │   ├── kernels
    │   │   │   ├── Cargo.toml
    │   │   │   └── src
    │   │   │   │   ├── gemm_naive.rs
    │   │   │   │   ├── gemm_tiled.rs
    │   │   │   │   └── lib.rs
    │   │   └── src
    │   │   │   └── main.rs
    │   ├── path_tracer
    │   │   ├── Cargo.toml
    │   │   ├── build.rs
    │   │   ├── kernels
    │   │   │   ├── Cargo.toml
    │   │   │   └── src
    │   │   │   │   ├── hittable.rs
    │   │   │   │   ├── lib.rs
    │   │   │   │   ├── material.rs
    │   │   │   │   ├── math.rs
    │   │   │   │   ├── optix.rs
    │   │   │   │   ├── render.rs
    │   │   │   │   ├── render_kernels.rs
    │   │   │   │   ├── scene.rs
    │   │   │   │   └── sphere.rs
    │   │   ├── shaders
    │   │   │   ├── image.frag
    │   │   │   └── image.vert
    │   │   └── src
    │   │   │   ├── common.rs
    │   │   │   ├── cpu
    │   │   │       └── mod.rs
    │   │   │   ├── cuda
    │   │   │       ├── data.rs
    │   │   │       └── mod.rs
    │   │   │   ├── main.rs
    │   │   │   ├── optix
    │   │   │       └── mod.rs
    │   │   │   ├── renderer.rs
    │   │   │   └── viewer.rs
    │   └── vecadd
    │   │   ├── Cargo.toml
    │   │   ├── build.rs
    │   │   ├── kernels
    │   │       ├── Cargo.toml
    │   │       └── src
    │   │       │   └── lib.rs
    │   │   └── src
    │   │       └── main.rs
    └── optix
    │   └── denoiser
    │       ├── Cargo.toml
    │       ├── README.md
    │       ├── noisy.png
    │       └── src
    │           └── main.rs
├── guide
    ├── assets
    │   ├── nsight.png
    │   └── streams.svg
    ├── book.toml
    └── src
    │   ├── README.md
    │   ├── SUMMARY.md
    │   ├── cuda
    │       ├── README.md
    │       ├── gpu_computing.md
    │       └── pipeline.md
    │   ├── faq.md
    │   ├── features.md
    │   ├── guide
    │       ├── README.md
    │       ├── getting_started.md
    │       ├── kernel_abi.md
    │       ├── safety.md
    │       └── tips.md
    │   └── nvvm
    │       ├── README.md
    │       └── technical
    │           ├── README.md
    │           ├── backends.md
    │           ├── debugging.md
    │           ├── nvvm.md
    │           ├── ptxgen.md
    │           └── types.md
├── katex-header.html
├── rust-toolchain.toml
├── rustfmt.toml
├── scripts
    ├── data
    │   ├── libdevice.json
    │   ├── libdevice.pdf
    │   ├── libdevice.txt
    │   └── std_intrinsics.rs
    ├── download_ci_optix.bash
    ├── gen_intrinsics.py
    └── gen_libdevice_json.py
└── xtask
    ├── Cargo.toml
    └── src
        ├── extract_llfns.rs
        └── main.rs


/.cargo/config.toml:
--------------------------------------------------------------------------------
1 | [alias]
2 | xtask = "run -p xtask --bin xtask --"
3 | 


--------------------------------------------------------------------------------
/.devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "image" : "ghcr.io/rust-gpu/rust-cuda-ubuntu24-cuda12:latest",
 3 |   // Uncoment the following lines to use a Dockerfile instead of a prebuilt image.
 4 |   // "build": {
 5 |   //   "dockerfile": "${localWorkspaceFolder}/container/ubuntu24-cuda12/Dockerfile",
 6 |   //   "context": "${localWorkspaceFolder}"
 7 |   // },
 8 |   "containerEnv": {
 9 |     "NVIDIA_DRIVER_CAPABILITIES": "all"
10 |   },
11 |   "runArgs": ["--runtime=nvidia", "--gpus", "all"],
12 |   "customizations": {
13 |     "vscode": {
14 |       "extensions": ["rust-lang.rust-analyzer"]
15 |     }
16 |   },
17 |   "features": {
18 |     "ghcr.io/devcontainers/features/git:1": {}
19 |   }
20 | }
21 | 


--------------------------------------------------------------------------------
/.github/workflows/build_guide.yml:
--------------------------------------------------------------------------------
 1 | name: build mdbook for github pages
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 | 
 8 | jobs:
 9 |   deploy:
10 |     runs-on: ubuntu-20.04
11 |     steps:
12 |       - uses: actions/checkout@v2
13 | 
14 |       - name: Setup mdBook
15 |         uses: peaceiris/actions-mdbook@v1
16 |         with:
17 |           mdbook-version: 'latest'
18 | 
19 |       - run: mdbook build guide/ -d ../book
20 | 
21 |       - name: Deploy
22 |         uses: peaceiris/actions-gh-pages@v3
23 |         with:
24 |           github_token: ${{ secrets.GITHUB_TOKEN }}
25 |           publish_dir: ./book
26 | 


--------------------------------------------------------------------------------
/.github/workflows/ci_windows.yml:
--------------------------------------------------------------------------------
 1 | name: CI on Windows
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     paths-ignore:
 6 |       - "**.md"
 7 |   push:
 8 |     paths-ignore:
 9 |       - "**.md"
10 | 
11 | env:
12 |   RUST_LOG: info
13 |   RUST_BACKTRACE: 1
14 | 
15 | jobs:
16 |   rust:
17 |     name: Build / ${{ matrix.os }} / CUDA-${{ matrix.cuda }}
18 |     runs-on: ${{ matrix.os }}
19 |     env:
20 |       LLVM_LINK_STATIC: 1
21 |     strategy:
22 |       fail-fast: false
23 |       matrix:
24 |         include:
25 |           - os: windows-latest
26 |             target: x86_64-pc-windows-msvc
27 |             cuda: "12.8.1"
28 |             linux-local-args: []
29 |             sub-packages: ["nvcc", "nvrtc", "nvrtc_dev", "cuda_profiler_api", "cudart", "cublas", "cublas_dev", "curand", "curand_dev"]
30 | 
31 |     steps:
32 |       - name: Checkout repository
33 |         uses: actions/checkout@v2
34 | 
35 |       - name: Install CUDA
36 |         uses: Jimver/cuda-toolkit@v0.2.22
37 |         id: cuda-toolkit
38 |         with:
39 |           cuda: ${{ matrix.cuda }}
40 |           method: network
41 |           linux-local-args: ${{ toJson(matrix.linux-local-args) }}
42 |           use-local-cache: false
43 |           sub-packages: ${{ toJson(matrix.sub-packages) }}
44 |           log-file-suffix: '${{matrix.os}}-${{matrix.cuda}}'
45 | 
46 |       - name: Verify CUDA installation
47 |         run: nvcc --version
48 | 
49 |       - name: List CUDA_PATH files
50 |         shell: pwsh
51 |         run: Get-ChildItem -Path $env:CUDA_PATH -Recurse | ForEach-Object { $_.FullName }
52 | 
53 |       # random command that forces rustup to install stuff in rust-toolchain
54 |       - name: Install rust-toolchain
55 |         run: cargo version
56 | 
57 |       - name: Add rustup components
58 |         run: rustup component add rustfmt clippy
59 | 
60 |       - name: Load Rust Cache
61 |         uses: Swatinem/rust-cache@v2.7.7
62 |         with:
63 |           key: ${{ matrix.os }}-${{ matrix.target }}-${{ matrix.cuda }}
64 | 
65 |       - name: Build all bindings
66 |         run: cargo build --all-features -p cust_raw
67 | 
68 |       - name: Build
69 |         run: cargo build --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*"
70 | 
71 |       # Don't currently test because many tests rely on the system having a CUDA GPU
72 |       # - name: Test
73 |       #   run: cargo test --workspace
74 | 
75 |       - name: Check documentation
76 |         env:
77 |           RUSTDOCFLAGS: -Dwarnings
78 |         run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
79 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | book
2 | /target
3 | Cargo.lock
4 | **/.vscode
5 | .devcontainer


--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # cudnn crate
2 | /crates/cudnn/ @frjnn
3 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [workspace]
 2 | resolver = "2"
 3 | members = [
 4 |   "crates/*",
 5 |   "crates/optix/examples/ex*",
 6 |   "crates/optix/examples/ex*/device",
 7 |   "crates/optix/examples/rust/ex*",
 8 | 
 9 |   "xtask",
10 | 
11 |   "examples/cuda/vecadd",
12 |   "examples/cuda/vecadd/kernels",
13 |   "examples/cuda/gemm",
14 |   "examples/cuda/gemm/kernels",
15 |   "examples/cuda/path_tracer",
16 |   "examples/cuda/path_tracer/kernels",
17 | 
18 |   "examples/optix/*",
19 | ]
20 | 
21 | exclude = [
22 |     "crates/optix/examples/common",
23 | ]
24 | 
25 | [profile.dev.package.rustc_codegen_nvvm]
26 | opt-level = 3
27 | 


--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2021 Riccardo D'Ambrosio
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/container/rockylinux9-cuda12/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:12.8.1-cudnn-devel-rockylinux9
 2 | 
 3 | RUN dnf -y install \
 4 |     clang \
 5 |     openssl-devel \
 6 |     pkgconfig \
 7 |     redhat-rpm-config \
 8 |     which \
 9 |     xz \
10 |     zlib-devel && \
11 |     dnf clean all
12 | 
13 | # Needed to build `path_tracer`, `optix/ex03_window` example
14 | RUN dnf -y install \
15 |     cmake \
16 |     fontconfig-devel \
17 |     libX11-devel  \
18 |     libXcursor-devel \
19 |     libXi-devel \
20 |     libXrandr-devel && \
21 |     dnf clean all
22 | 
23 | # Get LLVM 7 & libffi.so.6
24 | WORKDIR /data/llvm7
25 | RUN curl -sSf -L -O https://dl.fedoraproject.org/pub/epel/9/Everything/x86_64/Packages/l/libffi3.1-3.1-36.el9.x86_64.rpm && \
26 |     curl -sSf -L -O https://dl.fedoraproject.org/pub/epel/8/Everything/x86_64/Packages/l/llvm7.0-7.0.1-7.el8.x86_64.rpm && \
27 |     curl -sSf -L -O https://dl.fedoraproject.org/pub/epel/8/Everything/x86_64/Packages/l/llvm7.0-devel-7.0.1-7.el8.x86_64.rpm && \
28 |     curl -sSf -L -O https://dl.fedoraproject.org/pub/epel/8/Everything/x86_64/Packages/l/llvm7.0-libs-7.0.1-7.el8.x86_64.rpm && \
29 |     curl -sSf -L -O https://dl.fedoraproject.org/pub/epel/8/Everything/x86_64/Packages/l/llvm7.0-static-7.0.1-7.el8.x86_64.rpm && \
30 |     dnf -y install ./*.rpm && \
31 |     ln -s /usr/bin/llvm-config-7-64 /usr/bin/llvm-config && \
32 |     rm -rf ./*.rpm && \
33 |     dnf clean all
34 | 
35 | # Get Rust
36 | RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
37 | ENV PATH="/root/.cargo/bin:${PATH}"
38 | 
39 | # Setup the workspace
40 | WORKDIR /data/Rust-CUDA
41 | RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
42 |     rustup show
43 | 
44 | # Add nvvm to LD_LIBRARY_PATH.
45 | ENV LD_LIBRARY_PATH="/usr/local/cuda/nvvm/lib64:${LD_LIBRARY_PATH}"
46 | ENV LLVM_LINK_STATIC=1
47 | ENV RUST_LOG=info


--------------------------------------------------------------------------------
/container/ubuntu22-cuda11/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
 2 | 
 3 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \
 4 |     build-essential \
 5 |     curl \
 6 |     clang \
 7 |     libssl-dev \
 8 |     libtinfo-dev \
 9 |     pkg-config \
10 |     xz-utils \
11 |     zlib1g-dev && \
12 |     rm -rf /var/lib/apt/lists/*
13 | 
14 | # Needed to build `path_tracer`, `optix/ex03_window` example
15 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \
16 |     cmake \
17 |     libfontconfig-dev \
18 |     libx11-xcb-dev \
19 |     libxcursor-dev \
20 |     libxi-dev \
21 |     libxinerama-dev \
22 |     libxrandr-dev && \
23 |     rm -rf /var/lib/apt/lists/*
24 | 
25 | # Get LLVM 7
26 | WORKDIR /data/llvm7
27 | RUN curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/llvm-7_7.0.1-12_amd64.deb && \
28 |     curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/llvm-7-dev_7.0.1-12_amd64.deb && \
29 |     curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/libllvm7_7.0.1-12_amd64.deb && \
30 |     curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/llvm-7-runtime_7.0.1-12_amd64.deb && \
31 |     apt-get update && apt-get install -y ./*.deb && \
32 |     ln -s /usr/bin/llvm-config-7 /usr/bin/llvm-config && \
33 |     rm -rf ./*.deb && \
34 |     rm -rf /var/lib/apt/lists/*
35 | 
36 | # Get Rust
37 | RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
38 | ENV PATH="/root/.cargo/bin:${PATH}"
39 | 
40 | # Setup the workspace
41 | WORKDIR /data/Rust-CUDA
42 | RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
43 |     rustup show
44 | 
45 | # Add nvvm to LD_LIBRARY_PATH.
46 | ENV LD_LIBRARY_PATH="/usr/local/cuda/nvvm/lib64:${LD_LIBRARY_PATH}"
47 | ENV LLVM_LINK_STATIC=1
48 | ENV RUST_LOG=info


--------------------------------------------------------------------------------
/container/ubuntu22-cuda12/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
 2 | 
 3 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \
 4 |     build-essential \
 5 |     curl \
 6 |     clang \
 7 |     libssl-dev \
 8 |     libtinfo-dev \
 9 |     pkg-config \
10 |     xz-utils \
11 |     zlib1g-dev && \
12 |     rm -rf /var/lib/apt/lists/*
13 | 
14 | # Needed to build `path_tracer`, `optix/ex03_window` example
15 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \
16 |     cmake \
17 |     libfontconfig-dev \
18 |     libx11-xcb-dev \
19 |     libxcursor-dev \
20 |     libxi-dev \
21 |     libxinerama-dev \
22 |     libxrandr-dev && \
23 |     rm -rf /var/lib/apt/lists/*
24 | 
25 | # Get LLVM 7
26 | WORKDIR /data/llvm7
27 | RUN curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/llvm-7_7.0.1-12_amd64.deb && \
28 |     curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/llvm-7-dev_7.0.1-12_amd64.deb && \
29 |     curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/libllvm7_7.0.1-12_amd64.deb && \
30 |     curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/llvm-7-runtime_7.0.1-12_amd64.deb && \
31 |     apt-get update && apt-get install -y ./*.deb && \
32 |     ln -s /usr/bin/llvm-config-7 /usr/bin/llvm-config && \
33 |     rm -rf ./*.deb && \
34 |     rm -rf /var/lib/apt/lists/*
35 | 
36 | # Get Rust
37 | RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
38 | ENV PATH="/root/.cargo/bin:${PATH}"
39 | 
40 | # Setup the workspace
41 | WORKDIR /data/Rust-CUDA
42 | RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
43 |     rustup show
44 | 
45 | # Add nvvm to LD_LIBRARY_PATH.
46 | ENV LD_LIBRARY_PATH="/usr/local/cuda/nvvm/lib64:${LD_LIBRARY_PATH}"
47 | ENV LLVM_LINK_STATIC=1
48 | ENV RUST_LOG=info


--------------------------------------------------------------------------------
/container/ubuntu24-cuda12/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04
 2 | 
 3 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \
 4 |     build-essential \
 5 |     clang \
 6 |     curl \
 7 |     libssl-dev \
 8 |     libtinfo-dev \
 9 |     pkg-config \
10 |     xz-utils \
11 |     zlib1g-dev && \
12 |     rm -rf /var/lib/apt/lists/*
13 | 
14 | # Needed to build `path_tracer`, `optix/ex03_window` example
15 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \
16 |     cmake \
17 |     libfontconfig-dev \
18 |     libx11-xcb-dev \
19 |     libxcursor-dev \
20 |     libxi-dev \
21 |     libxinerama-dev \
22 |     libxrandr-dev && \
23 |     rm -rf /var/lib/apt/lists/*
24 | 
25 | # Get LLVM 7 & libffi7
26 | WORKDIR /data/llvm7
27 | RUN curl -sSf -L -O http://security.ubuntu.com/ubuntu/pool/universe/libf/libffi7/libffi7_3.3-5ubuntu1_amd64.deb && \
28 |     curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/llvm-7_7.0.1-12_amd64.deb && \
29 |     curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/llvm-7-dev_7.0.1-12_amd64.deb && \
30 |     curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/libllvm7_7.0.1-12_amd64.deb && \
31 |     curl -sSf -L -O http://mirrors.kernel.org/ubuntu/pool/universe/l/llvm-toolchain-7/llvm-7-runtime_7.0.1-12_amd64.deb && \
32 |     apt-get install -y ./*.deb && \
33 |     ln -s /usr/bin/llvm-config-7 /usr/bin/llvm-config && \
34 |     rm -rf ./*.deb && \
35 |     rm -rf /var/lib/apt/lists/*
36 | 
37 | # Get Rust
38 | RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
39 | ENV PATH="/root/.cargo/bin:${PATH}"
40 | 
41 | # Setup the workspace
42 | WORKDIR /data/Rust-CUDA
43 | RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
44 |     rustup show
45 | 
46 | # Add nvvm to LD_LIBRARY_PATH.
47 | ENV LD_LIBRARY_PATH="/usr/local/cuda/nvvm/lib64:${LD_LIBRARY_PATH}"
48 | ENV LLVM_LINK_STATIC=1
49 | ENV RUST_LOG=info


--------------------------------------------------------------------------------
/crates/blastoff/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "blastoff"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | authors = ["Riccardo D'Ambrosio <rdambrosio016@gmail.com>"]
 6 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 7 | 
 8 | [dependencies]
 9 | bitflags = "2.8"
10 | cust = { version = "0.3", path = "../cust", features = ["impl_num_complex"] }
11 | cust_raw = { path = "../cust_raw", features = ["cublas"] }
12 | num-complex = "0.4.6"
13 | half = { version = "2.4.1", optional = true }
14 | 
15 | [package.metadata.docs.rs]
16 | rustdoc-args = ["--html-in-header", "katex-header.html", "--cfg", "docsrs"]
17 | 


--------------------------------------------------------------------------------
/crates/blastoff/src/raw/mod.rs:
--------------------------------------------------------------------------------
 1 | //! Generic traits over raw FFI functions for floats, doubles, complex numbers, and double complex numbers.
 2 | //!
 3 | //! The functions are still very unsafe and do nothing except dispatch to the correct FFI function.
 4 | 
 5 | #![allow(clippy::missing_safety_doc, clippy::too_many_arguments)]
 6 | 
 7 | mod level1;
 8 | mod level3;
 9 | 
10 | pub use level1::*;
11 | pub use level3::*;
12 | 


--------------------------------------------------------------------------------
/crates/cuda_builder/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cuda_builder"
 3 | version = "0.3.0"
 4 | edition = "2021"
 5 | authors = ["Riccardo D'Ambrosio <rdambrosio016@gmail.com>", "The rust-gpu Authors"]
 6 | license = "MIT OR Apache-2.0"
 7 | description = "Builder for easily building rustc_codegen_nvvm crates"
 8 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 9 | readme = "../../README.md"
10 | 
11 | [dependencies]
12 | rustc_codegen_nvvm = { version = "0.3", path = "../rustc_codegen_nvvm" }
13 | nvvm = { path = "../nvvm", version = "0.1" }
14 | serde = { version = "1.0.217", features = ["derive"] }
15 | serde_json = "1.0.138"
16 | 


--------------------------------------------------------------------------------
/crates/cuda_std/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | Notable changes to this project will be documented in this file.
 4 | 
 5 | ## Unreleased
 6 | 
 7 | - Added warp shuffles, matches, reductions, and votes in the `warp` module.
 8 | - Added `activemask` in the `warp` module to query a mask of the active threads.
 9 | - Fixed `lane_id` generating invalid ptx.
10 | 
11 | ## 0.2.2 - 2/7/22
12 | 
13 | - Thread/Block/Grid index/dim intrinsics now hint to llvm that their range is in some bound declared by CUDA. Hopefully allowing for more optimizations.
14 | 
15 | ## 0.2.1 - 12/8/21
16 | 
17 | - Fixed `shared_array!` not using fully qualified MaybeUninit.
18 | - Fixed `shared_array!` working on the CPU.
19 | - Added experimental dynamic shared memory support through `shared::dynamic_shared_memory`.
20 | 
21 | ## 0.2.0 - 12/5/21
22 | 
23 | - Added `#[externally_visible]` in conjunction with cg_nvvm dead code elimination changes to mark that
24 | a function is externally visible.
25 | - Added `#[address_space(...)]` in conjunction with cg_nvvm address space changes. Only meant for internal use
26 | and advanced users.
27 | - Added `cuda_std::ptr`.
28 | - Added `is_in_address_space`
29 | - Added `convert_generic_to_specific_address_space`
30 | - Added `convert_specific_address_space_to_generic`
31 | - Added basic static shared memory support with `cuda_std::shared_array`.
32 | 


--------------------------------------------------------------------------------
/crates/cuda_std/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cuda_std"
 3 | version = "0.2.2"
 4 | edition = "2018"
 5 | license = "MIT OR Apache-2.0"
 6 | description = "Standard library for CUDA with rustc_codegen_nvvm"
 7 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 8 | readme = "../../README.md"
 9 | 
10 | [dependencies]
11 | vek = { version = "0.17.1", default-features = false, features = ["libm"] }
12 | cuda_std_macros = { version = "0.2", path = "../cuda_std_macros" }
13 | half = "2.4.1"
14 | bitflags = "2.8"
15 | paste = "1.0.15"
16 | 


--------------------------------------------------------------------------------
/crates/cuda_std/assets/diagrams_xml/streams.drawio:
--------------------------------------------------------------------------------
1 | <mxfile host="app.diagrams.net" modified="2021-10-05T21:35:56.367Z" agent="5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36" etag="QkwMzno3eVJKbGEEY9TI" version="15.2.9" type="device"><diagram id="55urjaCjmfQdbR2H_qAJ" name="Page-1">5Vhbb9sgFP41eayEwbc+Npdu0japUjrt8lIx+8T2gk2EcZP01w8Iju3YrTopS6LFDwnn44Dh+zjHwIhM8s0HQVfpFx4DG2EUb0ZkOsIYIxyqP41sd4jvkx2QiCzeQU4DzLMXsCCyaJXFUHYcJedMZqsuGPGigEh2MCoEX3fdFpx137qiCfSAeURZH/2WxTLdoaGHGvwjZElq3+whW5HT2tcCZUpjvm5BZDYiE8G53JXyzQSY5q6mZdfu/pXa/bgEFPI9DV42X+ePj+lysfzJvLuHpxl6IDe2l2fKKjvfTyAKLaEZstzWNAheFTHortCIjNdpJmG+opGuXSvhFZbKnCnLUcVFxtiEMy5MW+LHYeC7Ci+l4Eto1ZA7zyWmBS9kC1+YR+F2eCAkbF6dt7NnU61C4DlIsVUudQPfCmAXYGjNdaOmU2uUtpSsm1G7gJJ9zw3JqmB5/hvOwx67EKs1Z00uZMoTXlA2a9BxVIlnQ79mqytG0+Az5yvr8huk3NpoopXkXYEUdWL7vW38sJ0ZY7rpWFtr7Yatx/q2FGpqvBIRvMEBtpFMRQLyDT9vWFoBjMrsuTuOowuFTxIcbuCHwWBwOC7x3NMGh+OeOzrIKUhfIEoQHSJ9PA0QQn3SkXmOQzrBl5aRbi8nI6EzZST3nRnJP2dGck8RHBR5GMdDweFPnfuh4DhmRjoMjvNnJO8K9khhl3R8dtL9K/gMoEsjPbiCDc/BSt+LcDbSwyvI6ejSSHf6B9+5FEBzhTk95tXMZZfeLo0FL+CAcwtRliWFMiPFFCh8rHnMIsrubEWexbHZTg3p2VX8GKv/tiuE19fBH5AB/zMZ+kesvQz4/5WBeJclQ91xS4bZBqJKZrwwV1l67L9Ad6P5Y0D1zv8y5UkYLcvjSXVzsB3FtwOpKzipWP3UNa6kObqUy9JopF9rtLGhtM5Mt2A0hdpFprrIRawoN5aetiJG/TJaFVF6LRq7B5cgOBjQ2D3O50mZzZWvqWvdm5PZHw==</diagram></mxfile>


--------------------------------------------------------------------------------
/crates/cuda_std/assets/diagrams_xml/thread.drawio:
--------------------------------------------------------------------------------
1 | <mxfile host="app.diagrams.net" modified="2021-09-02T01:21:18.333Z" agent="5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36" etag="7uE_EoAPnGaUN6n6nVaj" version="15.0.5" type="device"><diagram id="YhH7WYBAxTGbiCbuz2tr" name="Page-1">3ZnLboMwEEW/hmUkDMEhy+bRdtGuSBWpOwdcsAQYGaeQfn1NMeFhRWoXFY5XGV+PPXCMwh1hudusfmKoSF5phFPLsaPacneW4wCw8sVPo1xaBUK3FWJGIpnUCwH5wlK0pXomES5HiZzSlJNiLIY0z3HIRxpijFbjtA+ajqsWKMaKEIQoVdUjiXjSqr5n9/ozJnEiK3u2nMhQlyuFMkERrQaSu7fcLaOUt1FWb3HasOuwtOseb8xer4vhnP9mQfxSr3aH4Hhcv53I6WH/Hi7qxard5ROlZ3m/h4RhFAlNFij5pUPB6DmPcLOdbbmbKiEcBwUKm9lKHL7QEp6lYgREKDfGjOP65hWDKwfx/GCaYc4uIkUuWEpyl/Gw6o8BdHCTwRFAqSF58vF1456OCCSgP8DyFViWA1NRdVMWKBdx3MRDgO2sKDZMmJ0rgJqBXRsC1vE1A9v9id492aWtG1lgCFnP0Y2sYwhZqNv7C7iGkJ2ABfbsZJeGkJ1aAw3QeoagnZoDDdBCQ9BO3YEGaNXG6j7RTu2BBmhNacOm/kADtKY0YlOycG6yXbG7J6sYhPnRmtKJKQZhfrSmtGKKQZgfrSm9mGIQ5kdrSjOmGIR/RCuG/aeIn7nB9xx3/w0=</diagram></mxfile>


--------------------------------------------------------------------------------
/crates/cuda_std/src/cfg.rs:
--------------------------------------------------------------------------------
 1 | //! Utilities for configuring code based on the specified compute capability.
 2 | 
 3 | use cuda_std_macros::gpu_only;
 4 | 
 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 6 | pub enum ComputeCapability {
 7 |     Compute35,
 8 |     Compute37,
 9 |     Compute50,
10 |     Compute52,
11 |     Compute53,
12 |     Compute60,
13 |     Compute61,
14 |     Compute62,
15 |     Compute70,
16 |     Compute72,
17 |     Compute75,
18 |     Compute80,
19 | }
20 | 
21 | impl ComputeCapability {
22 |     /// Parses a compute capability from the `CUDA_ARCH` environment variable set by `cuda_builder`.
23 |     /// This is a compile-time variable so any comparisons of the compute capability should expand to constant
24 |     /// values.
25 |     ///
26 |     /// This allows you to use the current capability to decide what path to take in a function with the incorrect
27 |     /// path being optimized away.
28 |     #[gpu_only]
29 |     #[inline(always)]
30 |     pub fn from_cuda_arch_env() -> Self {
31 |         // set by cuda_builder
32 |         match env!("CUDA_ARCH") {
33 |             "350" => ComputeCapability::Compute35,
34 |             "370" => ComputeCapability::Compute37,
35 |             "500" => ComputeCapability::Compute50,
36 |             "520" => ComputeCapability::Compute52,
37 |             "530" => ComputeCapability::Compute53,
38 |             "600" => ComputeCapability::Compute60,
39 |             "610" => ComputeCapability::Compute61,
40 |             "620" => ComputeCapability::Compute62,
41 |             "700" => ComputeCapability::Compute70,
42 |             "720" => ComputeCapability::Compute72,
43 |             "750" => ComputeCapability::Compute75,
44 |             "800" => ComputeCapability::Compute80,
45 |             _ => panic!("CUDA_ARCH had an invalid value"),
46 |         }
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------
/crates/cuda_std/src/mem.rs:
--------------------------------------------------------------------------------
 1 | //! Support for allocating memory and using `alloc` using CUDA memory allocation system-calls.
 2 | 
 3 | use crate::gpu_only;
 4 | #[cfg(target_arch = "nvptx64")]
 5 | use alloc::alloc::*;
 6 | #[cfg(target_arch = "nvptx64")]
 7 | use core::ffi::c_void;
 8 | 
 9 | #[cfg(target_arch = "nvptx64")]
10 | extern "C" {
11 |     // implicitly defined by cuda.
12 |     pub fn malloc(size: usize) -> *mut c_void;
13 | 
14 |     pub fn free(ptr: *mut c_void);
15 | }
16 | 
17 | pub struct CUDAAllocator;
18 | 
19 | #[cfg(target_arch = "nvptx64")]
20 | unsafe impl GlobalAlloc for CUDAAllocator {
21 |     unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
22 |         malloc(layout.size()) as *mut u8
23 |     }
24 |     unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) {
25 |         free(ptr as *mut _);
26 |     }
27 | }
28 | 
29 | #[cfg(target_arch = "nvptx64")]
30 | #[global_allocator]
31 | pub static GLOBAL_ALLOCATOR: CUDAAllocator = CUDAAllocator;
32 | 
33 | /// Returns the amount of shared memory that has been dynamically allocated
34 | /// by the caller of the kernel for every thread block (CTA).
35 | #[gpu_only]
36 | #[inline(always)]
37 | pub fn dynamic_smem_size() -> u32 {
38 |     let mut out;
39 |     unsafe {
40 |         core::arch::asm!(
41 |             "mov.u32 {}, %dynamic_smem_size",
42 |             out(reg32) out
43 |         )
44 |     }
45 |     out
46 | }
47 | 
48 | /// Returns the amount of total shared memory that has been allocated
49 | /// for every thread block for this kernel. This includes both static and dynamic
50 | /// shared memory. The returned number will be a multiple of static memory allocation unit size:
51 | /// - 128 bytes on sm_2x and sm_8x
52 | /// - 256 bytes on sm_3x, sm_5x, sm_6x, and sm_7x
53 | #[gpu_only]
54 | #[inline(always)]
55 | pub fn total_smem_size() -> u32 {
56 |     let mut out;
57 |     unsafe {
58 |         core::arch::asm!(
59 |             "mov.u32 {}, %total_smem_size",
60 |             out(reg32) out
61 |         )
62 |     }
63 |     out
64 | }
65 | 


--------------------------------------------------------------------------------
/crates/cuda_std/src/misc.rs:
--------------------------------------------------------------------------------
 1 | //! Misc functions that do not exactly fit into other categories.
 2 | 
 3 | use crate::gpu_only;
 4 | #[cfg(target_os = "cuda")]
 5 | use core::arch::asm;
 6 | 
 7 | /// Suspends execution of the kernel, usually to pause at a specific point when debugging in a debugger.
 8 | #[gpu_only]
 9 | #[inline(always)]
10 | pub fn breakpoint() {
11 |     unsafe {
12 |         asm!("brkpt");
13 |     }
14 | }
15 | 
16 | /// Increments a hardware counter between `0` and `7` (inclusive).
17 | /// This function will increment the counter by one per warp.
18 | ///
19 | /// # Panics
20 | ///
21 | /// Panics if `counter` is not in the range of `0..=7`.
22 | #[gpu_only]
23 | #[inline(always)]
24 | pub fn profiler_counter(counter: u32) {
25 |     assert!(
26 |         (0..=7).contains(&counter),
27 |         "Profiler counter value must be in the range of 0..=7"
28 |     );
29 |     unsafe {
30 |         asm!(
31 |             "pmevent {}",
32 |             in(reg32) counter
33 |         )
34 |     }
35 | }
36 | 
37 | /// Returns the value of a per-multiprocessor counter incremented on every clock cycle.
38 | #[gpu_only]
39 | #[inline(always)]
40 | pub fn clock() -> u64 {
41 |     let mut clock;
42 |     unsafe {
43 |         asm!(
44 |             "mov.u64 {}, %clock64",
45 |             out(reg64) clock
46 |         )
47 |     }
48 |     clock
49 | }
50 | 


--------------------------------------------------------------------------------
/crates/cuda_std/src/rt/sys.rs:
--------------------------------------------------------------------------------
 1 | //! Raw bindings to cuda_device_runtime_api functions.
 2 | 
 3 | use core::ffi::c_void;
 4 | 
 5 | #[allow(non_camel_case_types)]
 6 | pub type c_char = i8;
 7 | #[allow(non_camel_case_types)]
 8 | pub type c_int = i32;
 9 | #[allow(non_camel_case_types)]
10 | pub type c_uint = u32;
11 | pub use crate::rt::driver_types_sys::*;
12 | 
13 | // TODO(RDambrosio016): We should probably create a common crate
14 | // to share this stuff with cust.
15 | 
16 | extern "C" {
17 |     pub fn cudaDeviceGetAttribute(
18 |         value: *mut c_int,
19 |         attr: cudaDeviceAttr,
20 |         device: c_int,
21 |     ) -> cudaError_t;
22 |     pub fn cudaDeviceGetLimit(pValue: *mut usize, limit: cudaLimit) -> cudaError_t;
23 |     pub fn cudaDeviceGetSharedMemConfig(pConfig: *mut cudaSharedMemConfig) -> cudaError_t;
24 |     pub fn cudaDeviceSynchronize() -> cudaError_t;
25 |     pub fn cudaGetLastError() -> cudaError_t;
26 |     pub fn cudaPeekAtLastError() -> cudaError_t;
27 |     pub fn cudaGetErrorString(error: cudaError_t) -> *const c_char;
28 |     pub fn cudaGetErrorName(error: cudaError_t) -> *const c_char;
29 |     pub fn cudaGetDeviceCount(count: *mut c_int) -> cudaError_t;
30 |     pub fn cudaGetDevice(device: *mut c_int) -> cudaError_t;
31 |     pub fn cudaStreamCreateWithFlags(pStream: *mut cudaStream_t, flags: c_uint) -> cudaError_t;
32 |     pub fn cudaStreamDestroy(stream: cudaStream_t) -> cudaError_t;
33 |     pub fn cudaStreamWaitEvent(
34 |         stream: cudaStream_t,
35 |         event: cudaEvent_t,
36 |         flags: c_uint,
37 |     ) -> cudaError_t;
38 |     pub fn cudaEventCreateWithFlags(event: *mut cudaEvent_t, flags: c_uint) -> cudaError_t;
39 |     pub fn cudaEventRecord(event: cudaEvent_t, stream: cudaStream_t) -> cudaError_t;
40 |     pub fn cudaEventRecordWithFlags(
41 |         event: cudaEvent_t,
42 |         stream: cudaStream_t,
43 |         flags: c_uint,
44 |     ) -> cudaError_t;
45 |     pub fn cudaEventDestroy(event: cudaEvent_t) -> cudaError_t;
46 | 
47 |     pub fn cudaGetParameterBufferV2(
48 |         func: *const c_void,
49 |         gridDimension: dim3,
50 |         block_dimension: dim3,
51 |         shared_mem_size: c_uint,
52 |     ) -> *mut c_void;
53 |     pub fn cudaLaunchDeviceV2(parameter_buffer: *mut c_void, stream: cudaStream_t) -> cudaError_t;
54 | }
55 | 


--------------------------------------------------------------------------------
/crates/cuda_std_macros/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cuda_std_macros"
 3 | version = "0.2.0"
 4 | edition = "2018"
 5 | license = "MIT OR Apache-2.0"
 6 | description = "Macros for cuda_std"
 7 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 8 | readme = "../../README.md"
 9 | 
10 | [lib]
11 | proc-macro = true
12 | 
13 | [dependencies]
14 | quote = "1.0.38"
15 | syn = { version = "2.0.96", features = ["full"] }
16 | proc-macro2 = "1.0.93"
17 | 


--------------------------------------------------------------------------------
/crates/cudnn-sys/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cudnn-sys"
 3 | version = "0.1.0"
 4 | edition = "2024"
 5 | license = "MIT OR Apache-2.0"
 6 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 7 | readme = "../../README.md"
 8 | links = "cudnn"
 9 | build = "build/main.rs"
10 | 
11 | [dependencies]
12 | cust_raw = { path = "../cust_raw", default-features = false, features = ["driver"] }
13 | 
14 | [build-dependencies]
15 | bindgen = "0.71.1"
16 | 


--------------------------------------------------------------------------------
/crates/cudnn-sys/build/main.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::path;
 3 | 
 4 | pub mod cudnn_sdk;
 5 | 
 6 | fn main() {
 7 |     let sdk = cudnn_sdk::CudnnSdk::new().expect("Cannot create cuDNN SDK instance.");
 8 |     let cuda_include_paths = env::var_os("DEP_CUDA_INCLUDES")
 9 |         .map(|s| env::split_paths(s.as_os_str()).collect::<Vec<_>>())
10 |         .expect("Cannot find transitive metadata 'cuda_include' from cust_raw package.");
11 | 
12 |     println!("cargo::rerun-if-changed=build");
13 |     // Emit metadata for the build script.
14 |     let (version, version_major, version_minor, version_patch) = (
15 |         sdk.cudnn_version(),
16 |         sdk.cudnn_version_major(),
17 |         sdk.cudnn_version_minor(),
18 |         sdk.cudnn_version_patch(),
19 |     );
20 |     let include_dir = sdk.cudnn_include_path().display().to_string();
21 |     println!("cargo::metadata=version={version}");
22 |     println!("cargo::metadata=version_major={version_major}");
23 |     println!("cargo::metadata=version_minor={version_minor}");
24 |     println!("cargo::metadata=version_patch={version_patch}");
25 |     println!("cargo::metadata=include_dir={include_dir}",);
26 | 
27 |     // Generate bindings and link to the library.
28 |     create_cudnn_bindings(&sdk, &cuda_include_paths);
29 |     println!("cargo::rustc-link-lib=dylib=cudnn");
30 | }
31 | 
32 | fn create_cudnn_bindings(sdk: &cudnn_sdk::CudnnSdk, cuda_include_paths: &[path::PathBuf]) {
33 |     println!("cargo::rerun-if-changed=build/wrapper.h");
34 |     let outdir = path::PathBuf::from(
35 |         env::var("OUT_DIR").expect("OUT_DIR environment variable should be set by cargo."),
36 |     );
37 |     let bindgen_path = path::PathBuf::from(format!("{}/cudnn_sys.rs", outdir.display()));
38 |     let bindings = bindgen::Builder::default()
39 |         .header("build/wrapper.h")
40 |         .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
41 |         .clang_arg(format!("-I{}", sdk.cudnn_include_path().display()))
42 |         .clang_args(
43 |             cuda_include_paths
44 |                 .iter()
45 |                 .map(|p| format!("-I{}", p.display())),
46 |         )
47 |         .allowlist_function("^cudnn.*")
48 |         .allowlist_type("^cudnn.*")
49 |         .allowlist_var("^CUDNN.*")
50 |         .default_enum_style(bindgen::EnumVariation::Rust {
51 |             non_exhaustive: false,
52 |         })
53 |         .derive_default(true)
54 |         .derive_eq(true)
55 |         .derive_hash(true)
56 |         .derive_ord(true)
57 |         .size_t_is_usize(true)
58 |         .layout_tests(true)
59 |         .generate()
60 |         .expect("Unable to generate cuDNN bindings.");
61 |     bindings
62 |         .write_to_file(bindgen_path.as_path())
63 |         .expect("Cannot write cuDNN bindgen output to file.");
64 | }
65 | 


--------------------------------------------------------------------------------
/crates/cudnn-sys/build/wrapper.h:
--------------------------------------------------------------------------------
1 | #include "cudnn.h"


--------------------------------------------------------------------------------
/crates/cudnn-sys/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![allow(non_upper_case_globals)]
2 | #![allow(non_camel_case_types)]
3 | #![allow(non_snake_case)]
4 | 
5 | include!(concat!(env!("OUT_DIR"), "/cudnn_sys.rs"));
6 | 


--------------------------------------------------------------------------------
/crates/cudnn/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | authors = ["frjnn <f.iannelli.francesco229@gmail.com>"]
 3 | edition = "2021"
 4 | name = "cudnn"
 5 | version = "0.1.0"
 6 | 
 7 | [dependencies]
 8 | bitflags = "2.8"
 9 | cust = {version = "0.3.2", path = "../cust"}
10 | cudnn-sys = { path = "../cudnn-sys" }
11 | 


--------------------------------------------------------------------------------
/crates/cudnn/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | 
 3 | fn main() {
 4 |     let cudnn_version = env::var("DEP_CUDNN_VERSION")
 5 |         .expect("Cannot find transitive metadata 'version' from cudnn-sys package.")
 6 |         .parse::<u32>()
 7 |         .expect("Failed to parse cuDNN version");
 8 | 
 9 |     println!("cargo::rustc-check-cfg=cfg(cudnn9)");
10 |     if cudnn_version >= 90000 {
11 |         println!("cargo::rustc-cfg=cudnn9");
12 |     }
13 | }
14 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/activation/activation_descriptor.rs:
--------------------------------------------------------------------------------
 1 | use std::mem::MaybeUninit;
 2 | 
 3 | use crate::{ActivationMode, CudnnError, IntoResult, NanPropagation};
 4 | 
 5 | /// The descriptor of a neuron activation operation.
 6 | #[derive(Debug, PartialEq, Eq, Hash)]
 7 | pub struct ActivationDescriptor {
 8 |     pub(crate) raw: cudnn_sys::cudnnActivationDescriptor_t,
 9 | }
10 | 
11 | impl ActivationDescriptor {
12 |     /// Creates a new neuron activation descriptor.
13 |     ///
14 |     /// # Arguments
15 |     ///
16 |     ///   * `mode` - activation function to compute.
17 |     ///   * `nan_opt` - NaN propagation policy for the operation.
18 |     ///   * `coefficient` - optional coefficient for the given function. It specifies
19 |     ///     the clipping threshold for `ActivationMode::ClippedRelu`.
20 |     ///
21 |     /// cuDNN
22 |     /// [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetActivationDescriptor)
23 |     /// may offer additional information about the API behavior.
24 |     ///
25 |     /// # Examples
26 |     ///
27 |     /// ```
28 |     /// # use std::error::Error;
29 |     /// #
30 |     /// # fn main() -> Result<(), Box<dyn Error>> {
31 |     /// use cudnn::{ActivationDescriptor, ActivationMode, CudnnContext, NanPropagation};
32 |     ///
33 |     /// let ctx = CudnnContext::new()?;
34 |     ///
35 |     /// let mode = ActivationMode::Swish;
36 |     /// let nan_opt = NanPropagation::PropagateNaN;
37 |     /// let coefficient = None;
38 |     ///
39 |     /// let desc = ActivationDescriptor::new(mode, nan_opt, coefficient)?;
40 |     /// # Ok(())
41 |     /// # }
42 |     /// ```
43 |     pub fn new(
44 |         mode: ActivationMode,
45 |         nan_opt: NanPropagation,
46 |         coefficient: impl Into<Option<f64>>,
47 |     ) -> Result<Self, CudnnError> {
48 |         let mut raw = MaybeUninit::uninit();
49 | 
50 |         unsafe {
51 |             cudnn_sys::cudnnCreateActivationDescriptor(raw.as_mut_ptr()).into_result()?;
52 | 
53 |             let raw = raw.assume_init();
54 | 
55 |             let coefficient = coefficient.into().unwrap_or(match mode {
56 |                 ActivationMode::ClippedRelu => f64::MAX,
57 |                 _ => 1.0,
58 |             });
59 | 
60 |             cudnn_sys::cudnnSetActivationDescriptor(raw, mode.into(), nan_opt.into(), coefficient)
61 |                 .into_result()?;
62 | 
63 |             Ok(Self { raw })
64 |         }
65 |     }
66 | }
67 | 
68 | impl Drop for ActivationDescriptor {
69 |     fn drop(&mut self) {
70 |         unsafe {
71 |             cudnn_sys::cudnnDestroyActivationDescriptor(self.raw);
72 |         }
73 |     }
74 | }
75 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/activation/activation_mode.rs:
--------------------------------------------------------------------------------
 1 | /// Specifies a neuron activation function.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnActivationMode_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[non_exhaustive]
 6 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 7 | pub enum ActivationMode {
 8 |     /// Selects the sigmoid function.
 9 |     Sigmoid,
10 |     /// Selects the rectified linear function.
11 |     Relu,
12 |     /// Selects the hyperbolic tangent function.
13 |     Tanh,
14 |     /// Selects the clipped rectified linear function.
15 |     ClippedRelu,
16 |     /// Selects the exponential linear function.
17 |     Elu,
18 |     /// Selects the swish function.
19 |     Swish,
20 |     /// Selects no activation.
21 |     ///
22 |     /// **Do note** that this is only valid for an activation descriptor passed to
23 |     /// [`convolution_bias_act_forward()`](crate::CudnnContext::convolution_bias_act_forward).
24 |     Identity,
25 | }
26 | 
27 | impl From<ActivationMode> for cudnn_sys::cudnnActivationMode_t {
28 |     fn from(mode: ActivationMode) -> Self {
29 |         match mode {
30 |             ActivationMode::Sigmoid => Self::CUDNN_ACTIVATION_SIGMOID,
31 |             ActivationMode::Relu => Self::CUDNN_ACTIVATION_RELU,
32 |             ActivationMode::Tanh => Self::CUDNN_ACTIVATION_TANH,
33 |             ActivationMode::ClippedRelu => Self::CUDNN_ACTIVATION_CLIPPED_RELU,
34 |             ActivationMode::Elu => Self::CUDNN_ACTIVATION_ELU,
35 |             ActivationMode::Swish => Self::CUDNN_ACTIVATION_SWISH,
36 |             ActivationMode::Identity => Self::CUDNN_ACTIVATION_IDENTITY,
37 |         }
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/attention/attention_weights_kind.rs:
--------------------------------------------------------------------------------
 1 | /// Specifies a group of weights or biases for the multi-head attention layer.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnMultiHeadAttnWeightKind_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 6 | pub enum AttnWeight {
 7 |     /// Selects the input projection weights for queries.
 8 |     QWeights,
 9 |     /// Selects the input projection weights for keys.
10 |     KWeights,
11 |     /// Selects the input projection weights for values.
12 |     VWeights,
13 |     /// Selects the output projection weights.
14 |     OWeights,
15 |     /// Selects the input projection biases for queries.
16 |     QBiases,
17 |     /// Selects the input projection biases for keys.
18 |     KBiases,
19 |     /// Selects the input projection biases for values.
20 |     VBiases,
21 |     /// Selects the output projection biases.
22 |     OBiases,
23 | }
24 | 
25 | impl From<AttnWeight> for cudnn_sys::cudnnMultiHeadAttnWeightKind_t {
26 |     fn from(kind: AttnWeight) -> Self {
27 |         match kind {
28 |             AttnWeight::QWeights => Self::CUDNN_MH_ATTN_Q_WEIGHTS,
29 |             AttnWeight::KWeights => Self::CUDNN_MH_ATTN_K_WEIGHTS,
30 |             AttnWeight::VWeights => Self::CUDNN_MH_ATTN_V_WEIGHTS,
31 |             AttnWeight::OWeights => Self::CUDNN_MH_ATTN_O_WEIGHTS,
32 |             AttnWeight::QBiases => Self::CUDNN_MH_ATTN_Q_BIASES,
33 |             AttnWeight::KBiases => Self::CUDNN_MH_ATTN_K_BIASES,
34 |             AttnWeight::VBiases => Self::CUDNN_MH_ATTN_V_BIASES,
35 |             AttnWeight::OBiases => Self::CUDNN_MH_ATTN_O_BIASES,
36 |         }
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/attention/seq_data_axis.rs:
--------------------------------------------------------------------------------
 1 | /// Describes and indexes active dimensions in the `SeqDataDescriptor` `dim` field. This enum is
 2 | /// also used in the `axis` argument of the `SeqDataDescriptor` constructor to  define the layout
 3 | /// of the sequence data buffer in memory.
 4 | ///
 5 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSeqDataAxis_t)
 6 | /// may offer additional information about the APi behavior.
 7 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 8 | pub enum SeqDataAxis {
 9 |     /// Identifies the time (sequence length) dimension or specifies the time in the data layout.
10 |     TimeDim,
11 |     /// Identifies the batch dimension or specifies the batch in the data layout.
12 |     BatchDim,
13 |     /// Identifies the beam dimension or specifies the beam in the data layout.
14 |     BeamDim,
15 |     /// Identifies the vect (vector) dimension or specifies the vector in the data layout.
16 |     VectDim,
17 | }
18 | 
19 | impl From<SeqDataAxis> for cudnn_sys::cudnnSeqDataAxis_t {
20 |     fn from(axis: SeqDataAxis) -> Self {
21 |         match axis {
22 |             SeqDataAxis::TimeDim => cudnn_sys::cudnnSeqDataAxis_t::CUDNN_SEQDATA_TIME_DIM,
23 |             SeqDataAxis::BatchDim => cudnn_sys::cudnnSeqDataAxis_t::CUDNN_SEQDATA_BATCH_DIM,
24 |             SeqDataAxis::BeamDim => cudnn_sys::cudnnSeqDataAxis_t::CUDNN_SEQDATA_BEAM_DIM,
25 |             SeqDataAxis::VectDim => cudnn_sys::cudnnSeqDataAxis_t::CUDNN_SEQDATA_VECT_DIM,
26 |         }
27 |     }
28 | }
29 | 
30 | impl<T> std::ops::Index<SeqDataAxis> for [T; 4] {
31 |     type Output = T;
32 | 
33 |     fn index(&self, index: SeqDataAxis) -> &Self::Output {
34 |         let raw: cudnn_sys::cudnnSeqDataAxis_t = index.into();
35 |         self.index(raw as usize)
36 |     }
37 | }
38 | 
39 | impl<T> std::ops::IndexMut<SeqDataAxis> for [T; 4] {
40 |     fn index_mut(&mut self, index: SeqDataAxis) -> &mut Self::Output {
41 |         let raw: cudnn_sys::cudnnSeqDataAxis_t = index.into();
42 |         self.index_mut(raw as usize)
43 |     }
44 | }
45 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/descriptor.rs:
--------------------------------------------------------------------------------
 1 | use std::{mem::MaybeUninit, rc::Rc};
 2 | 
 3 | use crate::{CudnnError, IntoResult};
 4 | 
 5 | #[derive(PartialEq, Eq, Hash, Debug)]
 6 | pub(crate) struct Inner {
 7 |     pub(crate) raw: cudnn_sys::cudnnBackendDescriptor_t,
 8 | }
 9 | 
10 | impl Drop for Inner {
11 |     fn drop(&mut self) {
12 |         unsafe {
13 |             cudnn_sys::cudnnBackendDestroyDescriptor(self.raw);
14 |         }
15 |     }
16 | }
17 | 
18 | #[derive(Clone, PartialEq, Eq, Hash, Debug)]
19 | pub struct Descriptor(Rc<Inner>);
20 | 
21 | impl Descriptor {
22 |     pub(crate) unsafe fn new(
23 |         dtype: cudnn_sys::cudnnBackendDescriptorType_t,
24 |     ) -> Result<Self, CudnnError> {
25 |         let mut raw = MaybeUninit::uninit();
26 | 
27 |         cudnn_sys::cudnnBackendCreateDescriptor(dtype, raw.as_mut_ptr()).into_result()?;
28 | 
29 |         let raw = raw.assume_init();
30 | 
31 |         Ok(Self(Rc::new(Inner { raw })))
32 |     }
33 | 
34 |     pub(crate) unsafe fn finalize(&mut self) -> Result<(), CudnnError> {
35 |         cudnn_sys::cudnnBackendFinalize(self.0.raw).into_result()
36 |     }
37 | 
38 |     pub(crate) unsafe fn set_attribute<T: ?Sized>(
39 |         &mut self,
40 |         aname: cudnn_sys::cudnnBackendAttributeName_t,
41 |         atype: cudnn_sys::cudnnBackendAttributeType_t,
42 |         count: i64,
43 |         val: &T,
44 |     ) -> Result<(), CudnnError> {
45 |         let ptr = val as *const T as *const std::ffi::c_void;
46 | 
47 |         cudnn_sys::cudnnBackendSetAttribute(self.0.raw, aname, atype, count, ptr).into_result()
48 |     }
49 | 
50 |     pub(crate) unsafe fn get_attribute_count(
51 |         &self,
52 |         aname: cudnn_sys::cudnnBackendAttributeName_t,
53 |         atype: cudnn_sys::cudnnBackendAttributeType_t,
54 |     ) -> Result<i64, CudnnError> {
55 |         let mut count = MaybeUninit::<i64>::uninit();
56 | 
57 |         cudnn_sys::cudnnBackendGetAttribute(
58 |             self.0.raw,
59 |             aname,
60 |             atype,
61 |             0,
62 |             count.as_mut_ptr(),
63 |             std::ptr::null_mut(),
64 |         )
65 |         .into_result()?;
66 | 
67 |         Ok(count.assume_init())
68 |     }
69 | 
70 |     pub(crate) fn inner(&self) -> cudnn_sys::cudnnBackendDescriptor_t {
71 |         self.0.raw
72 |     }
73 | }
74 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/engine.rs:
--------------------------------------------------------------------------------
 1 | use crate::{
 2 |     backend::{Descriptor, Graph},
 3 |     CudnnError, IntoResult,
 4 | };
 5 | 
 6 | #[derive(Default, Debug, PartialEq)]
 7 | pub struct EngineBuilder {
 8 |     graph: Option<Graph>,
 9 |     global_index: Option<i64>,
10 | }
11 | 
12 | impl EngineBuilder {
13 |     pub fn set_graph(mut self, graph: Graph) -> Self {
14 |         self.graph = Some(graph);
15 |         self
16 |     }
17 | 
18 |     pub fn set_global_index(mut self, global_index: i64) -> Self {
19 |         self.global_index = Some(global_index);
20 |         self
21 |     }
22 | 
23 |     pub fn build(self) -> Result<Engine, CudnnError> {
24 |         let graph = self.graph.expect("operation graph is required");
25 |         let global_index = self.global_index.expect("global index is required.");
26 | 
27 |         unsafe {
28 |             let mut descriptor = Descriptor::new(
29 |                 cudnn_sys::cudnnBackendDescriptorType_t::CUDNN_BACKEND_ENGINE_DESCRIPTOR,
30 |             )?;
31 | 
32 |             descriptor.set_attribute(
33 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_ENGINE_OPERATION_GRAPH,
34 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_BACKEND_DESCRIPTOR,
35 |                 1,
36 |                 &graph.descriptor.inner(),
37 |             )?;
38 | 
39 |             descriptor.set_attribute(
40 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_ENGINE_GLOBAL_INDEX,
41 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_INT64,
42 |                 1,
43 |                 &global_index,
44 |             )?;
45 | 
46 |             descriptor.finalize()?;
47 | 
48 |             Ok(Engine {
49 |                 descriptor,
50 |                 graph,
51 |                 global_index,
52 |             })
53 |         }
54 |     }
55 | }
56 | 
57 | #[derive(PartialEq, Debug)]
58 | pub struct Engine {
59 |     pub(crate) descriptor: Descriptor,
60 |     graph: Graph,
61 |     global_index: i64,
62 | }
63 | 
64 | impl Engine {
65 |     pub fn get_graph(&self) -> &Graph {
66 |         &self.graph
67 |     }
68 | 
69 |     pub fn get_global_index(&self) -> i64 {
70 |         self.global_index
71 |     }
72 | }
73 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/engine_cfg.rs:
--------------------------------------------------------------------------------
 1 | use crate::{
 2 |     backend::{Descriptor, Engine},
 3 |     CudnnError, IntoResult,
 4 | };
 5 | 
 6 | #[derive(Default, PartialEq, Debug)]
 7 | pub struct EngineCfgBuilder {
 8 |     descriptor: Option<Descriptor>,
 9 |     engine: Option<Engine>,
10 | }
11 | 
12 | impl EngineCfgBuilder {
13 |     pub(crate) fn set_descriptor(mut self, descriptor: Descriptor) -> Self {
14 |         self.descriptor = Some(descriptor);
15 |         self
16 |     }
17 | 
18 |     pub fn set_engine(mut self, engine: Engine) -> Self {
19 |         self.engine = Some(engine);
20 |         self
21 |     }
22 | 
23 |     pub fn build(self) -> Result<EngineCfg, CudnnError> {
24 |         let engine = self.engine.expect("engine is required.");
25 | 
26 |         unsafe {
27 |             let mut descriptor = match self.descriptor {
28 |                 None => Descriptor::new(
29 |                     cudnn_sys::cudnnBackendDescriptorType_t::CUDNN_BACKEND_ENGINECFG_DESCRIPTOR,
30 |                 )?,
31 |                 Some(descriptor) => descriptor,
32 |             };
33 | 
34 |             descriptor.set_attribute(
35 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_ENGINECFG_ENGINE,
36 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_BACKEND_DESCRIPTOR,
37 |                 1,
38 |                 &engine.descriptor.inner(),
39 |             )?;
40 | 
41 |             descriptor.finalize()?;
42 | 
43 |             Ok(EngineCfg { descriptor, engine })
44 |         }
45 |     }
46 | }
47 | 
48 | #[derive(PartialEq, Debug)]
49 | pub struct EngineCfg {
50 |     pub(crate) descriptor: Descriptor,
51 |     engine: Engine,
52 | }
53 | 
54 | impl EngineCfg {
55 |     pub fn get_engine(&self) -> &Engine {
56 |         &self.engine
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/engine_heuristic.rs:
--------------------------------------------------------------------------------
 1 | use crate::{
 2 |     backend::{Descriptor, EngineCfgBuilder, Graph},
 3 |     CudnnContext, CudnnError, IntoResult,
 4 | };
 5 | 
 6 | pub enum HeuristicMode {
 7 |     A,
 8 |     B,
 9 | }
10 | 
11 | impl From<HeuristicMode> for cudnn_sys::cudnnBackendHeurMode_t {
12 |     fn from(mode: HeuristicMode) -> Self {
13 |         match mode {
14 |             HeuristicMode::A => cudnn_sys::cudnnBackendHeurMode_t::CUDNN_HEUR_MODE_A,
15 |             HeuristicMode::B => cudnn_sys::cudnnBackendHeurMode_t::CUDNN_HEUR_MODE_B,
16 |         }
17 |     }
18 | }
19 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/execution_plan.rs:
--------------------------------------------------------------------------------
 1 | use crate::{
 2 |     backend::{Descriptor, EngineCfg},
 3 |     CudnnContext, CudnnError, IntoResult,
 4 | };
 5 | 
 6 | #[derive(Default, PartialEq, Debug)]
 7 | pub struct ExecutionPlanBuilder {
 8 |     engine_cfg: Option<EngineCfg>,
 9 | }
10 | 
11 | impl ExecutionPlanBuilder {
12 |     pub fn set_engine_cfg(mut self, engine_cfg: EngineCfg) -> Self {
13 |         self.engine_cfg = Some(engine_cfg);
14 |         self
15 |     }
16 | 
17 |     pub fn build(self) -> Result<ExecutionPlan, CudnnError> {
18 |         let engine_cfg = self.engine_cfg.expect("engine configuration is required.");
19 | 
20 |         unsafe {
21 |             let mut descriptor = Descriptor::new(
22 |                 cudnn_sys::cudnnBackendDescriptorType_t::CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR,
23 |             )?;
24 | 
25 |             descriptor.set_attribute(
26 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
27 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_BACKEND_DESCRIPTOR,
28 |                 1,
29 |                 &engine_cfg.descriptor.inner(),
30 |             )?;
31 | 
32 |             descriptor.finalize()?;
33 | 
34 |             Ok(ExecutionPlan {
35 |                 descriptor,
36 |                 engine_cfg,
37 |             })
38 |         }
39 |     }
40 | }
41 | 
42 | pub struct ExecutionPlan {
43 |     pub(crate) descriptor: Descriptor,
44 |     engine_cfg: EngineCfg,
45 | }
46 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/matmul.rs:
--------------------------------------------------------------------------------
 1 | use cust::memory::bytemuck::Contiguous;
 2 | 
 3 | use crate::{
 4 |     backend::{Descriptor, MatMulCfg, Operation, Tensor},
 5 |     CudnnError, DataType, IntoResult,
 6 | };
 7 | 
 8 | #[derive(Clone, Default, PartialEq, Eq, Hash, Debug)]
 9 | pub struct MatMulBuilder {
10 |     cfg: Option<MatMulCfg>,
11 |     a: Option<Tensor>,
12 |     b: Option<Tensor>,
13 |     c: Option<Tensor>,
14 | }
15 | 
16 | impl MatMulBuilder {
17 |     pub fn set_cfg(mut self, cfg: MatMulCfg) -> Self {
18 |         self.cfg = Some(cfg);
19 |         self
20 |     }
21 | 
22 |     pub fn set_a(mut self, a: Tensor) -> Self {
23 |         self.a = Some(a);
24 |         self
25 |     }
26 | 
27 |     pub fn set_b(mut self, b: Tensor) -> Self {
28 |         self.b = Some(b);
29 |         self
30 |     }
31 | 
32 |     pub fn set_c(mut self, c: Tensor) -> Self {
33 |         self.c = Some(c);
34 |         self
35 |     }
36 | 
37 |     pub fn build(self) -> Result<Operation, CudnnError> {
38 |         let a = self.a.expect("a matrix is required.");
39 |         let b = self.b.expect("b matrix is required");
40 |         let c = self.c.expect("c matrix is required");
41 |         let cfg = self.cfg.expect("matmul configuration is required.");
42 | 
43 |         unsafe {
44 |             let mut raw = Descriptor::new(
45 |                 cudnn_sys::cudnnBackendDescriptorType_t::CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR,
46 |             )?;
47 | 
48 |             raw.set_attribute(
49 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_OPERATION_MATMUL_ADESC,
50 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_BACKEND_DESCRIPTOR,
51 |                 1,
52 |                 &a.raw.inner(),
53 |             )?;
54 | 
55 |             raw.set_attribute(
56 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_OPERATION_MATMUL_BDESC,
57 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_BACKEND_DESCRIPTOR,
58 |                 1,
59 |                 &b.raw.inner(),
60 |             )?;
61 | 
62 |             raw.set_attribute(
63 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_OPERATION_MATMUL_CDESC,
64 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_BACKEND_DESCRIPTOR,
65 |                 1,
66 |                 &c.raw.inner(),
67 |             )?;
68 | 
69 |             raw.set_attribute(
70 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_OPERATION_MATMUL_DESC,
71 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_BACKEND_DESCRIPTOR,
72 |                 1,
73 |                 &cfg.raw.inner(),
74 |             )?;
75 | 
76 |             raw.finalize()?;
77 | 
78 |             Ok(Operation::MatMul { raw, cfg, a, b, c })
79 |         }
80 |     }
81 | }
82 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/matmul_cfg.rs:
--------------------------------------------------------------------------------
 1 | use crate::{backend::Descriptor, CudnnError, DataType, IntoResult};
 2 | 
 3 | #[derive(Clone, Default, PartialEq, Eq, Hash, Debug)]
 4 | pub struct MatMulCfgBuilder {
 5 |     compt_type: Option<cudnn_sys::cudnnDataType_t>,
 6 | }
 7 | 
 8 | impl MatMulCfgBuilder {
 9 |     pub fn set_comp_type<T>(mut self) -> Self
10 |     where
11 |         T: DataType,
12 |     {
13 |         self.compt_type = Some(T::into_raw());
14 |         self
15 |     }
16 | 
17 |     pub fn build(self) -> Result<MatMulCfg, CudnnError> {
18 |         let compt_type = self.compt_type.expect("computation type is rquired");
19 | 
20 |         unsafe {
21 |             let mut raw = Descriptor::new(
22 |                 cudnn_sys::cudnnBackendDescriptorType_t::CUDNN_BACKEND_MATMUL_DESCRIPTOR,
23 |             )?;
24 | 
25 |             raw.set_attribute(
26 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_MATMUL_COMP_TYPE,
27 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_DATA_TYPE,
28 |                 1,
29 |                 &compt_type,
30 |             )?;
31 | 
32 |             raw.finalize()?;
33 | 
34 |             Ok(MatMulCfg { raw })
35 |         }
36 |     }
37 | }
38 | 
39 | #[derive(Clone, PartialEq, Eq, Hash, Debug)]
40 | pub struct MatMulCfg {
41 |     pub(crate) raw: Descriptor,
42 | }
43 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/mod.rs:
--------------------------------------------------------------------------------
 1 | #![allow(warnings)]
 2 | 
 3 | mod conv_bwd_data;
 4 | mod conv_bwd_filter;
 5 | mod conv_cfg;
 6 | mod conv_fwd;
 7 | mod descriptor;
 8 | mod engine;
 9 | mod engine_cfg;
10 | mod engine_heuristic;
11 | mod execution_plan;
12 | mod graph;
13 | mod matmul;
14 | mod matmul_cfg;
15 | mod operation;
16 | mod pointwise;
17 | mod pointwise_cfg;
18 | mod pointwise_mode;
19 | mod reduction;
20 | mod reduction_cfg;
21 | mod reduction_mode;
22 | mod tensor;
23 | 
24 | pub use conv_bwd_data::*;
25 | pub use conv_bwd_filter::*;
26 | pub use conv_cfg::*;
27 | pub use conv_fwd::*;
28 | pub use descriptor::*;
29 | pub use engine::*;
30 | pub use engine_cfg::*;
31 | pub use engine_heuristic::*;
32 | pub use execution_plan::*;
33 | pub use graph::*;
34 | pub use matmul::*;
35 | pub use matmul_cfg::*;
36 | pub use operation::*;
37 | pub use pointwise::*;
38 | pub use pointwise_cfg::*;
39 | pub use pointwise_mode::*;
40 | pub use reduction::*;
41 | pub use reduction_cfg::*;
42 | pub use reduction_mode::*;
43 | pub use tensor::*;
44 | 
45 | pub trait FloatDataType: crate::DataType {
46 |     fn wrap(self) -> Real;
47 | }
48 | 
49 | #[non_exhaustive]
50 | #[derive(Debug, Clone, Copy, PartialEq)]
51 | pub enum Real {
52 |     Float(f32),
53 |     Double(f64),
54 | }
55 | 
56 | impl FloatDataType for f32 {
57 |     fn wrap(self) -> Real {
58 |         Real::Float(self)
59 |     }
60 | }
61 | 
62 | impl FloatDataType for f64 {
63 |     fn wrap(self) -> Real {
64 |         Real::Double(self)
65 |     }
66 | }
67 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/operation.rs:
--------------------------------------------------------------------------------
 1 | use crate::backend::{ConvCfg, Descriptor, MatMulCfg, PointwiseCfg, Real, ReductionCfg, Tensor};
 2 | 
 3 | #[non_exhaustive]
 4 | #[derive(Clone, PartialEq, Debug)]
 5 | pub enum Operation {
 6 |     Pointwise {
 7 |         raw: Descriptor,
 8 |         cfg: PointwiseCfg,
 9 |         x: Tensor,
10 |         y: Tensor,
11 |         b: Option<Tensor>,
12 |         alpha: Option<Real>,
13 |         beta: Option<Real>,
14 |     },
15 |     ConvFwd {
16 |         raw: Descriptor,
17 |         cfg: ConvCfg,
18 |         alpha: Real,
19 |         beta: Real,
20 |         w: Tensor,
21 |         x: Tensor,
22 |         y: Tensor,
23 |     },
24 |     ConvBwdData {
25 |         raw: Descriptor,
26 |         cfg: ConvCfg,
27 |         alpha: Real,
28 |         beta: Real,
29 |         w: Tensor,
30 |         dx: Tensor,
31 |         dy: Tensor,
32 |     },
33 |     ConvBwdFilter {
34 |         raw: Descriptor,
35 |         cfg: ConvCfg,
36 |         alpha: Real,
37 |         beta: Real,
38 |         dw: Tensor,
39 |         x: Tensor,
40 |         dy: Tensor,
41 |     },
42 |     MatMul {
43 |         raw: Descriptor,
44 |         cfg: MatMulCfg,
45 |         a: Tensor,
46 |         b: Tensor,
47 |         c: Tensor,
48 |     },
49 |     Reduction {
50 |         raw: Descriptor,
51 |         cfg: ReductionCfg,
52 |         x: Tensor,
53 |         y: Tensor,
54 |     },
55 | }
56 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/reduction.rs:
--------------------------------------------------------------------------------
 1 | use crate::{
 2 |     backend::{Descriptor, Operation, ReductionCfg, Tensor},
 3 |     CudnnError, IntoResult,
 4 | };
 5 | 
 6 | #[derive(Default, Clone, Debug, PartialEq, Eq, Hash)]
 7 | pub struct ReductionBuilder {
 8 |     cfg: Option<ReductionCfg>,
 9 |     x: Option<Tensor>,
10 |     y: Option<Tensor>,
11 | }
12 | 
13 | impl ReductionBuilder {
14 |     pub fn set_cfg(mut self, cfg: ReductionCfg) -> Self {
15 |         self.cfg = Some(cfg);
16 |         self
17 |     }
18 | 
19 |     pub fn set_x(mut self, x: Tensor) -> Self {
20 |         self.x = Some(x);
21 |         self
22 |     }
23 | 
24 |     pub fn set_y(mut self, y: Tensor) -> Self {
25 |         self.y = Some(y);
26 |         self
27 |     }
28 | 
29 |     pub fn build(self) -> Result<Operation, CudnnError> {
30 |         let cfg = self.cfg.expect("reduce configuration is required.");
31 |         let x = self.x.expect("x tensor is required.");
32 |         let y = self.y.expect("y tensor is required");
33 | 
34 |         unsafe {
35 |             let mut raw = Descriptor::new(
36 |                 cudnn_sys::cudnnBackendDescriptorType_t::CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR,
37 |             )?;
38 | 
39 |             raw.set_attribute(
40 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_OPERATION_REDUCTION_DESC,
41 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_BACKEND_DESCRIPTOR,
42 |                 1,
43 |                 &cfg.raw.inner(),
44 |             )?;
45 | 
46 |             raw.set_attribute(
47 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_OPERATION_REDUCTION_XDESC,
48 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_BACKEND_DESCRIPTOR,
49 |                 1,
50 |                 &x.raw.inner(),
51 |             )?;
52 | 
53 |             raw.set_attribute(
54 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_OPERATION_REDUCTION_YDESC,
55 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_BACKEND_DESCRIPTOR,
56 |                 1,
57 |                 &y.raw.inner(),
58 |             )?;
59 | 
60 |             raw.finalize()?;
61 | 
62 |             Ok(Operation::Reduction { raw, cfg, x, y })
63 |         }
64 |     }
65 | }
66 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/reduction_cfg.rs:
--------------------------------------------------------------------------------
 1 | use crate::{
 2 |     backend::{Descriptor, ReductionMode},
 3 |     CudnnError, DataType, IntoResult,
 4 | };
 5 | 
 6 | #[derive(Clone, Default, PartialEq, Eq, Hash, Debug)]
 7 | pub struct ReductionCfgBuilder {
 8 |     math_precision: Option<cudnn_sys::cudnnDataType_t>,
 9 |     mode: Option<ReductionMode>,
10 | }
11 | 
12 | impl ReductionCfgBuilder {
13 |     pub fn set_math_precision<T>(mut self) -> Self
14 |     where
15 |         T: DataType,
16 |     {
17 |         self.math_precision = Some(T::into_raw());
18 |         self
19 |     }
20 | 
21 |     pub fn set_mode(mut self, mode: ReductionMode) -> Self {
22 |         self.mode = Some(mode);
23 |         self
24 |     }
25 | 
26 |     pub fn build(self) -> Result<ReductionCfg, CudnnError> {
27 |         let math_precision = self
28 |             .math_precision
29 |             .unwrap_or(cudnn_sys::cudnnDataType_t::CUDNN_DATA_FLOAT);
30 | 
31 |         let mode: cudnn_sys::cudnnReduceTensorOp_t =
32 |             self.mode.expect("reduction mode is required.").into();
33 | 
34 |         unsafe {
35 |             let mut raw = Descriptor::new(
36 |                 cudnn_sys::cudnnBackendDescriptorType_t::CUDNN_BACKEND_REDUCTION_DESCRIPTOR,
37 |             )?;
38 | 
39 |             raw.set_attribute(
40 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_REDUCTION_COMP_TYPE,
41 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_DATA_TYPE,
42 |                 1,
43 |                 &math_precision,
44 |             )?;
45 | 
46 |             raw.set_attribute(
47 |                 cudnn_sys::cudnnBackendAttributeName_t::CUDNN_ATTR_REDUCTION_OPERATOR,
48 |                 cudnn_sys::cudnnBackendAttributeType_t::CUDNN_TYPE_REDUCTION_OPERATOR_TYPE,
49 |                 1,
50 |                 &mode,
51 |             )?;
52 | 
53 |             raw.finalize()?;
54 | 
55 |             Ok(ReductionCfg { raw })
56 |         }
57 |     }
58 | }
59 | 
60 | #[derive(Clone, Debug, PartialEq, Eq, Hash)]
61 | pub struct ReductionCfg {
62 |     pub(crate) raw: Descriptor,
63 | }
64 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/backend/reduction_mode.rs:
--------------------------------------------------------------------------------
 1 | #[non_exhaustive]
 2 | #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
 3 | pub enum ReductionMode {
 4 |     Add,
 5 |     Amax,
 6 |     Avg,
 7 |     Max,
 8 |     Min,
 9 |     Mul,
10 |     MulNoZeros,
11 |     Norm1,
12 |     Norm2,
13 | }
14 | 
15 | impl From<ReductionMode> for cudnn_sys::cudnnReduceTensorOp_t {
16 |     fn from(mode: ReductionMode) -> Self {
17 |         match mode {
18 |             ReductionMode::Add => cudnn_sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_ADD,
19 |             ReductionMode::Amax => cudnn_sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_AMAX,
20 |             ReductionMode::Avg => cudnn_sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_AVG,
21 |             ReductionMode::Max => cudnn_sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_MAX,
22 |             ReductionMode::Min => cudnn_sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_MIN,
23 |             ReductionMode::Mul => cudnn_sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_MUL,
24 |             ReductionMode::MulNoZeros => {
25 |                 cudnn_sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS
26 |             }
27 |             ReductionMode::Norm1 => cudnn_sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_NORM1,
28 |             ReductionMode::Norm2 => cudnn_sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_NORM2,
29 |         }
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/convolution/convolution_config.rs:
--------------------------------------------------------------------------------
 1 | use crate::{private, DataType};
 2 | 
 3 | /// Supported data types configurations for convolution operations.
 4 | ///
 5 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnConvolutionForward)
 6 | /// may offer additional information about the APi behavior.
 7 | pub trait SupportedConv<X, W, Y>: private::Sealed + DataType
 8 | where
 9 |     X: DataType,
10 |     W: DataType,
11 |     Y: DataType,
12 | {
13 | }
14 | 
15 | impl SupportedConv<f32, f32, f32> for f32 {}
16 | impl SupportedConv<f64, f64, f64> for f64 {}
17 | impl SupportedConv<i8, i8, i8> for i32 {}
18 | impl SupportedConv<i8, i8, f32> for i32 {}
19 | impl SupportedConv<u8, i8, i8> for i32 {}
20 | impl SupportedConv<u8, i8, f32> for i32 {}
21 | impl SupportedConv<i32, i32, i32> for i32 {}
22 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/convolution/convolution_mode.rs:
--------------------------------------------------------------------------------
 1 | /// Enum used to configure a convolution descriptor.
 2 | ///
 3 | /// The filter used for the convolution can be applied in two different ways, corresponding
 4 | /// mathematically to a convolution or to a cross-correlation.
 5 | ///
 6 | /// A cross-correlation is equivalent to a convolution with its filter rotated by 180 degrees.
 7 | ///
 8 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnConvolutionMode_t)
 9 | /// may offer additional information about the APi behavior.
10 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
11 | pub enum ConvMode {
12 |     /// Convolution operation.
13 |     Convolution,
14 |     /// Cross Correlation operation.
15 |     CrossCorrelation,
16 | }
17 | 
18 | impl From<ConvMode> for cudnn_sys::cudnnConvolutionMode_t {
19 |     fn from(convolution_mode: ConvMode) -> cudnn_sys::cudnnConvolutionMode_t {
20 |         match convolution_mode {
21 |             ConvMode::Convolution => cudnn_sys::cudnnConvolutionMode_t::CUDNN_CONVOLUTION,
22 |             ConvMode::CrossCorrelation => {
23 |                 cudnn_sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION
24 |             }
25 |         }
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/data_type.rs:
--------------------------------------------------------------------------------
 1 | use crate::private;
 2 | 
 3 | pub trait DataType: private::Sealed + cust::memory::DeviceCopy {
 4 |     /// Returns the corresponding raw cuDNN data type.
 5 |     fn into_raw() -> cudnn_sys::cudnnDataType_t;
 6 | }
 7 | 
 8 | macro_rules! impl_cudnn_data_type {
 9 |     ($safe_type:ident, $raw_type:ident) => {
10 |         impl private::Sealed for $safe_type {}
11 | 
12 |         impl DataType for $safe_type {
13 |             fn into_raw() -> cudnn_sys::cudnnDataType_t {
14 |                 cudnn_sys::cudnnDataType_t::$raw_type
15 |             }
16 |         }
17 |     };
18 | }
19 | 
20 | impl_cudnn_data_type!(f32, CUDNN_DATA_FLOAT);
21 | impl_cudnn_data_type!(f64, CUDNN_DATA_DOUBLE);
22 | impl_cudnn_data_type!(i8, CUDNN_DATA_INT8);
23 | impl_cudnn_data_type!(u8, CUDNN_DATA_UINT8);
24 | impl_cudnn_data_type!(i32, CUDNN_DATA_INT32);
25 | impl_cudnn_data_type!(i64, CUDNN_DATA_INT64);
26 | 
27 | #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
28 | pub struct Vec4;
29 | 
30 | #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
31 | pub struct Vec32;
32 | 
33 | /// Vectorized data type. Vectorization size can be either 4 or 32 elements.
34 | pub trait VecType<T>: private::Sealed
35 | where
36 |     T: DataType,
37 | {
38 |     /// Return the corresponding raw cuDNN data type.
39 |     fn into_raw() -> cudnn_sys::cudnnDataType_t;
40 | }
41 | 
42 | impl private::Sealed for Vec4 {}
43 | 
44 | impl private::Sealed for Vec32 {}
45 | 
46 | macro_rules! impl_cudnn_vec_type {
47 |     ($type:ident, $safe_type:ident, $raw_type:ident) => {
48 |         impl VecType<$safe_type> for $type {
49 |             fn into_raw() -> cudnn_sys::cudnnDataType_t {
50 |                 cudnn_sys::cudnnDataType_t::$raw_type
51 |             }
52 |         }
53 |     };
54 | }
55 | 
56 | impl_cudnn_vec_type!(Vec4, i8, CUDNN_DATA_INT8x4);
57 | impl_cudnn_vec_type!(Vec32, i8, CUDNN_DATA_INT8x32);
58 | impl_cudnn_vec_type!(Vec4, u8, CUDNN_DATA_UINT8x4);
59 | 
60 | /// Admissible data types for scaling parameters.
61 | pub trait ScalingDataType<T>: DataType + private::Sealed
62 | where
63 |     T: DataType,
64 | {
65 | }
66 | 
67 | impl ScalingDataType<i8> for f32 {}
68 | impl ScalingDataType<u8> for f32 {}
69 | impl ScalingDataType<i32> for f32 {}
70 | impl ScalingDataType<i64> for f32 {}
71 | impl ScalingDataType<f32> for f32 {}
72 | 
73 | impl ScalingDataType<f64> for f64 {}
74 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/determinism.rs:
--------------------------------------------------------------------------------
 1 | /// Enum stating whether or not the computed results are deterministic (reproducible).
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnDeterminism_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 6 | pub enum Determinism {
 7 |     /// Results are guaranteed to be reproducible.
 8 |     Deterministic,
 9 |     /// Results are not guaranteed to be reproducible.
10 |     NonDeterministic,
11 | }
12 | 
13 | impl From<cudnn_sys::cudnnDeterminism_t> for Determinism {
14 |     fn from(raw: cudnn_sys::cudnnDeterminism_t) -> Self {
15 |         use cudnn_sys::cudnnDeterminism_t::*;
16 |         match raw {
17 |             CUDNN_DETERMINISTIC => Self::Deterministic,
18 |             CUDNN_NON_DETERMINISTIC => Self::NonDeterministic,
19 |         }
20 |     }
21 | }
22 | 
23 | impl From<Determinism> for cudnn_sys::cudnnDeterminism_t {
24 |     fn from(determinism: Determinism) -> Self {
25 |         use cudnn_sys::cudnnDeterminism_t::*;
26 |         match determinism {
27 |             Determinism::Deterministic => CUDNN_DETERMINISTIC,
28 |             Determinism::NonDeterministic => CUDNN_NON_DETERMINISTIC,
29 |         }
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/dropout/dropout_descriptor.rs:
--------------------------------------------------------------------------------
 1 | use cust::memory::GpuBuffer;
 2 | 
 3 | /// The descriptor of a dropout operation.
 4 | #[derive(Debug, PartialEq, Eq, Hash)]
 5 | pub struct DropoutDescriptor<T>
 6 | where
 7 |     T: GpuBuffer<u8>,
 8 | {
 9 |     pub(crate) raw: cudnn_sys::cudnnDropoutDescriptor_t,
10 |     pub(crate) states: T,
11 | }
12 | 
13 | impl<T> Drop for DropoutDescriptor<T>
14 | where
15 |     T: GpuBuffer<u8>,
16 | {
17 |     fn drop(&mut self) {
18 |         unsafe {
19 |             cudnn_sys::cudnnDestroyDropoutDescriptor(self.raw);
20 |         }
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![deny(rustdoc::broken_intra_doc_links)]
 2 | #[doc = include_str!("../README.md")]
 3 | mod activation;
 4 | mod attention;
 5 | mod backend;
 6 | mod context;
 7 | mod convolution;
 8 | mod data_type;
 9 | mod determinism;
10 | mod dropout;
11 | mod error;
12 | mod math_type;
13 | mod nan_propagation;
14 | mod op;
15 | mod pooling;
16 | mod reduction;
17 | mod rnn;
18 | mod softmax;
19 | mod tensor;
20 | mod w_grad_mode;
21 | 
22 | pub use activation::*;
23 | pub use attention::*;
24 | pub use context::*;
25 | pub use convolution::*;
26 | pub use data_type::*;
27 | pub use determinism::*;
28 | pub use dropout::*;
29 | pub use error::*;
30 | pub use math_type::*;
31 | pub use nan_propagation::*;
32 | pub use op::*;
33 | pub use pooling::*;
34 | pub use reduction::*;
35 | pub use rnn::*;
36 | pub use softmax::*;
37 | pub use tensor::*;
38 | pub use w_grad_mode::*;
39 | 
40 | pub(crate) mod private {
41 |     pub trait Sealed {}
42 | }
43 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/math_type.rs:
--------------------------------------------------------------------------------
 1 | /// Enum stating whether the use of tensor core operations is permitted in a given library routine.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnMathType_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[non_exhaustive]
 6 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 7 | pub enum MathType {
 8 |     /// Tensor Core operations are not used on pre-NVIDIA A100 GPU devices. On A100 GPU
 9 |     /// architecture devices, Tensor Core TF32 operation is permitted.
10 |     Default,
11 |     /// The use of Tensor Core operations is permitted but will not actively perform datatype
12 |     /// down conversion on tensors in order to utilize Tensor Cores.
13 |     TensorOp,
14 |     /// The use of Tensor Core operations is permitted and will actively perform datatype down
15 |     /// conversion on tensors in order to utilize Tensor Cores.
16 |     TensorOpAllowConversion,
17 |     /// Restricted to only kernels that use FMA instructions.
18 |     Fma,
19 | }
20 | 
21 | impl From<cudnn_sys::cudnnMathType_t> for MathType {
22 |     fn from(raw: cudnn_sys::cudnnMathType_t) -> Self {
23 |         use cudnn_sys::cudnnMathType_t::*;
24 |         match raw {
25 |             CUDNN_DEFAULT_MATH => Self::Default,
26 |             CUDNN_TENSOR_OP_MATH => Self::TensorOp,
27 |             CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION => Self::TensorOpAllowConversion,
28 |             CUDNN_FMA_MATH => Self::Fma,
29 |         }
30 |     }
31 | }
32 | 
33 | impl From<MathType> for cudnn_sys::cudnnMathType_t {
34 |     fn from(math_type: MathType) -> Self {
35 |         match math_type {
36 |             MathType::Default => Self::CUDNN_DEFAULT_MATH,
37 |             MathType::TensorOp => Self::CUDNN_TENSOR_OP_MATH,
38 |             MathType::TensorOpAllowConversion => Self::CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION,
39 |             MathType::Fma => Self::CUDNN_FMA_MATH,
40 |         }
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/nan_propagation.rs:
--------------------------------------------------------------------------------
 1 | /// Indicates whether a given cuDNN routine should propagate Nan numbers.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnNanPropagation_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 6 | pub enum NanPropagation {
 7 |     /// NaN numbers are not propagated.
 8 |     NotPropagateNaN,
 9 |     /// Nan numbers are propagated.
10 |     PropagateNaN,
11 | }
12 | 
13 | impl From<NanPropagation> for cudnn_sys::cudnnNanPropagation_t {
14 |     fn from(nan_propagation: NanPropagation) -> cudnn_sys::cudnnNanPropagation_t {
15 |         use cudnn_sys::cudnnNanPropagation_t::*;
16 |         match nan_propagation {
17 |             NanPropagation::NotPropagateNaN => CUDNN_NOT_PROPAGATE_NAN,
18 |             NanPropagation::PropagateNaN => CUDNN_PROPAGATE_NAN,
19 |         }
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/op/op_tensor_op.rs:
--------------------------------------------------------------------------------
 1 | /// A unary tensor core operation.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnOpTensorOp_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[non_exhaustive]
 6 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 7 | pub enum UnaryOp {
 8 |     Sqrt,
 9 |     Not,
10 | }
11 | 
12 | impl From<UnaryOp> for cudnn_sys::cudnnOpTensorOp_t {
13 |     fn from(op: UnaryOp) -> Self {
14 |         match op {
15 |             UnaryOp::Sqrt => Self::CUDNN_OP_TENSOR_SQRT,
16 |             UnaryOp::Not => Self::CUDNN_OP_TENSOR_NOT,
17 |         }
18 |     }
19 | }
20 | 
21 | /// A binary tensor core operation.
22 | ///
23 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnOpTensorOp_t)
24 | /// may offer additional information about the APi behavior.
25 | #[non_exhaustive]
26 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
27 | pub enum BinaryOp {
28 |     Add,
29 |     Mul,
30 |     Min,
31 |     Max,
32 | }
33 | 
34 | impl From<BinaryOp> for cudnn_sys::cudnnOpTensorOp_t {
35 |     fn from(op: BinaryOp) -> Self {
36 |         match op {
37 |             BinaryOp::Add => Self::CUDNN_OP_TENSOR_ADD,
38 |             BinaryOp::Mul => Self::CUDNN_OP_TENSOR_MUL,
39 |             BinaryOp::Min => Self::CUDNN_OP_TENSOR_MIN,
40 |             BinaryOp::Max => Self::CUDNN_OP_TENSOR_MAX,
41 |         }
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/pooling/pooling_mode.rs:
--------------------------------------------------------------------------------
 1 | /// Specifies the pooling method.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnPoolingMode_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 6 | pub enum PoolingMode {
 7 |     /// The maximum value inside the pooling window is used.
 8 |     Max,
 9 |     /// Values inside the pooling window are averaged. The number of elements used to calculate
10 |     /// the average includes spatial locations falling in the padding region.
11 |     AvgIncludePadding,
12 |     /// Values inside the pooling window are averaged. The number of elements used to calculate
13 |     /// the average excludes spatial locations falling in the padding region.
14 |     AvgExcludePadding,
15 |     /// The maximum value inside the pooling window is used. The algorithm used is deterministic.
16 |     MaxDeterministic,
17 | }
18 | 
19 | impl From<PoolingMode> for cudnn_sys::cudnnPoolingMode_t {
20 |     fn from(mode: PoolingMode) -> Self {
21 |         match mode {
22 |             PoolingMode::Max => Self::CUDNN_POOLING_MAX,
23 |             PoolingMode::AvgExcludePadding => Self::CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
24 |             PoolingMode::AvgIncludePadding => Self::CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING,
25 |             PoolingMode::MaxDeterministic => Self::CUDNN_POOLING_MAX_DETERMINISTIC,
26 |         }
27 |     }
28 | }
29 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/reduction/indices_type.rs:
--------------------------------------------------------------------------------
 1 | /// Indicates the data type of the indices computed by a reduction operation.
 2 | #[non_exhaustive]
 3 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 4 | pub enum IndicesType {
 5 |     U8,
 6 |     U16,
 7 |     U32,
 8 |     U64,
 9 | }
10 | 
11 | impl From<IndicesType> for cudnn_sys::cudnnIndicesType_t {
12 |     fn from(mode: IndicesType) -> Self {
13 |         match mode {
14 |             IndicesType::U8 => Self::CUDNN_8BIT_INDICES,
15 |             IndicesType::U16 => Self::CUDNN_16BIT_INDICES,
16 |             IndicesType::U32 => Self::CUDNN_32BIT_INDICES,
17 |             IndicesType::U64 => Self::CUDNN_64BIT_INDICES,
18 |         }
19 |     }
20 | }
21 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/reduction/reduce_indices.rs:
--------------------------------------------------------------------------------
 1 | /// Indicates whether a reduction operation should compute indices or not.
 2 | #[non_exhaustive]
 3 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 4 | pub enum ReduceIndices {
 5 |     /// Do not compute indices.
 6 |     None,
 7 |     /// Compute indices. The resulting indices are relative to the dimensions being reduced, and
 8 |     /// flattened.
 9 |     Flattened,
10 | }
11 | 
12 | impl From<ReduceIndices> for cudnn_sys::cudnnReduceTensorIndices_t {
13 |     fn from(mode: ReduceIndices) -> Self {
14 |         match mode {
15 |             ReduceIndices::None => Self::CUDNN_REDUCE_TENSOR_NO_INDICES,
16 |             ReduceIndices::Flattened => Self::CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
17 |         }
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/reduction/reduce_op.rs:
--------------------------------------------------------------------------------
 1 | /// Tensor reduction operation.
 2 | #[non_exhaustive]
 3 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 4 | pub enum ReduceOp {
 5 |     Add,
 6 |     Mul,
 7 |     Min,
 8 |     Max,
 9 |     Amax,
10 |     Avg,
11 |     Norm1,
12 |     Norm2,
13 |     MulNoZeros,
14 | }
15 | 
16 | impl From<ReduceOp> for cudnn_sys::cudnnReduceTensorOp_t {
17 |     fn from(op: ReduceOp) -> Self {
18 |         match op {
19 |             ReduceOp::Add => Self::CUDNN_REDUCE_TENSOR_ADD,
20 |             ReduceOp::Mul => Self::CUDNN_REDUCE_TENSOR_MUL,
21 |             ReduceOp::Min => Self::CUDNN_REDUCE_TENSOR_MIN,
22 |             ReduceOp::Max => Self::CUDNN_REDUCE_TENSOR_MAX,
23 |             ReduceOp::Amax => Self::CUDNN_REDUCE_TENSOR_AMAX,
24 |             ReduceOp::Avg => Self::CUDNN_REDUCE_TENSOR_AVG,
25 |             ReduceOp::Norm1 => Self::CUDNN_REDUCE_TENSOR_NORM1,
26 |             ReduceOp::Norm2 => Self::CUDNN_REDUCE_TENSOR_NORM2,
27 |             ReduceOp::MulNoZeros => Self::CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS,
28 |         }
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/rnn/forward_mode.rs:
--------------------------------------------------------------------------------
 1 | /// Specifies inference or training mode in RNN API.
 2 | ///
 3 | /// This parameter allows the cuDNN library to tune more precisely the size of the workspace buffer
 4 | /// that could be different in inference and training regimes.
 5 | ///
 6 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnForwardMode_t)
 7 | /// may offer additional information about the APi behavior.
 8 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 9 | pub enum ForwardMode {
10 |     /// Selects the inference mode.
11 |     Inference,
12 |     /// Selects the training mode.
13 |     Training,
14 | }
15 | 
16 | impl From<ForwardMode> for cudnn_sys::cudnnForwardMode_t {
17 |     fn from(mode: ForwardMode) -> Self {
18 |         use cudnn_sys::cudnnForwardMode_t::*;
19 |         match mode {
20 |             ForwardMode::Training => CUDNN_FWD_MODE_TRAINING,
21 |             ForwardMode::Inference => CUDNN_FWD_MODE_INFERENCE,
22 |         }
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/rnn/rnn_algo.rs:
--------------------------------------------------------------------------------
 1 | /// A recurrent neural network algorithm.
 2 | ///
 3 | /// **Do note** that double precision is only supported by `RnnAlgo::Standard`.
 4 | ///
 5 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNAlgo_t)
 6 | /// may offer additional information about the APi behavior.
 7 | #[non_exhaustive]
 8 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 9 | pub enum RnnAlgo {
10 |     Standard,
11 |     PersistStatic,
12 |     PersistDynamic,
13 |     PersistStaticSmallH,
14 | }
15 | 
16 | impl From<RnnAlgo> for cudnn_sys::cudnnRNNAlgo_t {
17 |     fn from(algo: RnnAlgo) -> Self {
18 |         use cudnn_sys::cudnnRNNAlgo_t::*;
19 |         match algo {
20 |             RnnAlgo::Standard => CUDNN_RNN_ALGO_STANDARD,
21 |             RnnAlgo::PersistStatic => CUDNN_RNN_ALGO_PERSIST_STATIC,
22 |             RnnAlgo::PersistDynamic => CUDNN_RNN_ALGO_PERSIST_DYNAMIC,
23 |             RnnAlgo::PersistStaticSmallH => CUDNN_RNN_ALGO_PERSIST_STATIC,
24 |         }
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/rnn/rnn_bias_mode.rs:
--------------------------------------------------------------------------------
 1 | /// Specifies the number of bias vectors for a recurrent neural network function.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNBiasMode_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[non_exhaustive]
 6 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 7 | pub enum RnnBiasMode {
 8 |     /// Applies a recurrent neural network cell formula that does not use biases.
 9 |     NoBias,
10 |     /// Applies a recurrent neural network cell formula that uses one input bias vector in the
11 |     /// input GEMM.
12 |     SingleInpBias,
13 |     /// Applies a recurrent neural network cell formula that uses uses two bias vectors.
14 |     DoubleBias,
15 |     /// Applies a recurrent neural network cell formula that uses one recurrent bias vector in the
16 |     /// recurrent GEMM.
17 |     SingleRecurrentBias,
18 | }
19 | 
20 | impl From<RnnBiasMode> for cudnn_sys::cudnnRNNBiasMode_t {
21 |     fn from(mode: RnnBiasMode) -> Self {
22 |         use cudnn_sys::cudnnRNNBiasMode_t::*;
23 |         match mode {
24 |             RnnBiasMode::NoBias => CUDNN_RNN_NO_BIAS,
25 |             RnnBiasMode::SingleInpBias => CUDNN_RNN_SINGLE_INP_BIAS,
26 |             RnnBiasMode::DoubleBias => CUDNN_RNN_DOUBLE_BIAS,
27 |             RnnBiasMode::SingleRecurrentBias => CUDNN_RNN_SINGLE_REC_BIAS,
28 |         }
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/rnn/rnn_clip_mode.rs:
--------------------------------------------------------------------------------
 1 | /// Selects the LSTM cell clipping mode.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNClipMode_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[non_exhaustive]
 6 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 7 | pub enum RnnClipMode {
 8 |     /// Disables LSTM cell clipping.
 9 |     ClipNone,
10 |     /// Enables LSTM cell clipping.
11 |     ClipMinMax,
12 | }
13 | 
14 | impl From<RnnClipMode> for cudnn_sys::cudnnRNNClipMode_t {
15 |     fn from(mode: RnnClipMode) -> Self {
16 |         use cudnn_sys::cudnnRNNClipMode_t::*;
17 |         match mode {
18 |             RnnClipMode::ClipNone => CUDNN_RNN_CLIP_NONE,
19 |             RnnClipMode::ClipMinMax => CUDNN_RNN_CLIP_MINMAX,
20 |         }
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/rnn/rnn_data_layout.rs:
--------------------------------------------------------------------------------
 1 | /// The data layout for input and output of a recurrent neural network.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNDataLayout_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 6 | pub enum RnnDataLayout {
 7 |     /// Data layout is padded, with outer stride from one time-step to the next.
 8 |     SeqMajorUnpacked,
 9 |     /// The sequence length is sorted and packed as in the basic RNN API.
10 |     SeqMajorPacked,
11 |     /// Data layout is padded, with outer stride from one batch to the next.
12 |     BatchMajorUnpacked,
13 | }
14 | 
15 | impl From<RnnDataLayout> for cudnn_sys::cudnnRNNDataLayout_t {
16 |     fn from(rnn_data_layout: RnnDataLayout) -> Self {
17 |         use cudnn_sys::cudnnRNNDataLayout_t::*;
18 |         match rnn_data_layout {
19 |             RnnDataLayout::SeqMajorUnpacked => CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED,
20 |             RnnDataLayout::SeqMajorPacked => CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED,
21 |             RnnDataLayout::BatchMajorUnpacked => CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED,
22 |         }
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/rnn/rnn_direction_mode.rs:
--------------------------------------------------------------------------------
 1 | /// Specifies the recurrence pattern for a recurrent neural network.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnDirectionMode_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 6 | pub enum RnnDirectionMode {
 7 |     /// The network iterates recurrently from the first input to the last.
 8 |     Unidirectional,
 9 |     /// Each layer of the network iterates recurrently from the first input to the last and
10 |     /// separately from the last input to the first. The outputs of the two are concatenated at
11 |     /// each iteration giving the output of the layer.
12 |     Bidirectional,
13 | }
14 | 
15 | impl From<RnnDirectionMode> for cudnn_sys::cudnnDirectionMode_t {
16 |     fn from(mode: RnnDirectionMode) -> Self {
17 |         match mode {
18 |             RnnDirectionMode::Unidirectional => Self::CUDNN_UNIDIRECTIONAL,
19 |             RnnDirectionMode::Bidirectional => Self::CUDNN_BIDIRECTIONAL,
20 |         }
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/rnn/rnn_input_mode.rs:
--------------------------------------------------------------------------------
 1 | /// Specifies the behavior of the first layer in a recurrent neural network.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNInputMode_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 6 | pub enum RnnInputMode {
 7 |     /// A biased matrix multiplication is performed at the input of the first recurrent layer.
 8 |     LinearInput,
 9 |     /// No operation is performed at the input of the first recurrent layer. If `SkipInput` is used
10 |     /// the leading dimension of the input tensor must be equal to the hidden state size of the
11 |     /// network.
12 |     SkipInput,
13 | }
14 | 
15 | impl From<RnnInputMode> for cudnn_sys::cudnnRNNInputMode_t {
16 |     fn from(mode: RnnInputMode) -> Self {
17 |         use cudnn_sys::cudnnRNNInputMode_t::*;
18 |         match mode {
19 |             RnnInputMode::LinearInput => CUDNN_LINEAR_INPUT,
20 |             RnnInputMode::SkipInput => CUDNN_SKIP_INPUT,
21 |         }
22 |     }
23 | }
24 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/rnn/rnn_mode.rs:
--------------------------------------------------------------------------------
 1 | /// Specifies the type of recurrent neural network used.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNMode_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 6 | pub enum RnnMode {
 7 |     /// A single-gate recurrent neural network with a ReLU activation function.
 8 |     RnnReLu,
 9 |     /// A single-gate recurrent neural network with a tanh activation function.
10 |     RnnTanh,
11 |     /// A four-gate Long Short-Term Memory (LSTM) network with no peephole connections.
12 |     Lstm,
13 |     /// A three-gate network consisting of Gated Recurrent Units.
14 |     Gru,
15 | }
16 | 
17 | impl From<RnnMode> for cudnn_sys::cudnnRNNMode_t {
18 |     fn from(mode: RnnMode) -> Self {
19 |         use cudnn_sys::cudnnRNNMode_t::*;
20 |         match mode {
21 |             RnnMode::RnnReLu => CUDNN_RNN_RELU,
22 |             RnnMode::RnnTanh => CUDNN_RNN_TANH,
23 |             RnnMode::Lstm => CUDNN_LSTM,
24 |             RnnMode::Gru => CUDNN_GRU,
25 |         }
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/softmax/softmax_algo.rs:
--------------------------------------------------------------------------------
 1 | /// Specifies the implementation of the softmax function.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSoftmaxAlgorithm_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 6 | pub enum SoftmaxAlgo {
 7 |     /// This implementation applies the straightforward softmax operation.
 8 |     Fast,
 9 |     /// This implementation scales each point of the softmax input domain by its maximum value
10 |     /// to avoid potential floating point overflows in the softmax evaluation.
11 |     Accurate,
12 |     /// This entry performs the log softmax operation, avoiding overflows by scaling each point in
13 |     /// the input domain as in the accurate version.
14 |     Log,
15 | }
16 | 
17 | impl From<SoftmaxAlgo> for cudnn_sys::cudnnSoftmaxAlgorithm_t {
18 |     fn from(algo: SoftmaxAlgo) -> Self {
19 |         match algo {
20 |             SoftmaxAlgo::Fast => Self::CUDNN_SOFTMAX_FAST,
21 |             SoftmaxAlgo::Accurate => Self::CUDNN_SOFTMAX_ACCURATE,
22 |             SoftmaxAlgo::Log => Self::CUDNN_SOFTMAX_ACCURATE,
23 |         }
24 |     }
25 | }
26 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/softmax/softmax_mode.rs:
--------------------------------------------------------------------------------
 1 | /// Specifies how the softmax input must be processed.
 2 | ///
 3 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSoftmaxMode_t)
 4 | /// may offer additional information about the APi behavior.
 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 6 | pub enum SoftmaxMode {
 7 |     /// The softmax operation is computed per image (N) across the dimensions C,H,W.
 8 |     Instance,
 9 |     /// The softmax operation is computed per spatial location (H,W) per image (N) across
10 |     /// dimension C.
11 |     Channel,
12 | }
13 | 
14 | impl From<SoftmaxMode> for cudnn_sys::cudnnSoftmaxMode_t {
15 |     fn from(mode: SoftmaxMode) -> Self {
16 |         match mode {
17 |             SoftmaxMode::Channel => Self::CUDNN_SOFTMAX_MODE_CHANNEL,
18 |             SoftmaxMode::Instance => Self::CUDNN_SOFTMAX_MODE_INSTANCE,
19 |         }
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/tensor/mod.rs:
--------------------------------------------------------------------------------
1 | mod tensor_descriptor;
2 | mod tensor_format;
3 | 
4 | pub use tensor_descriptor::*;
5 | pub use tensor_format::*;
6 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/tensor/tensor_format.rs:
--------------------------------------------------------------------------------
 1 | /// Tensor formats in which each element of the tensor has scalar value.
 2 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 3 | pub enum ScalarC {
 4 |     /// This tensor format specifies that the data is laid out in the following order: batch size,
 5 |     /// feature maps, rows, columns. The strides are implicitly defined in such a way that the data
 6 |     /// are contiguous in memory with no padding between images, feature maps, rows, and columns;
 7 |     /// the columns are the inner dimension and the images are the outermost dimension.
 8 |     Nchw,
 9 |     /// This tensor format specifies that the data is laid out in the following order: batch size,
10 |     /// rows, columns, feature maps. The strides are implicitly defined in such a way that the data
11 |     /// are contiguous in memory with no padding between images, rows, columns, and feature maps; the
12 |     /// feature maps are the inner dimension and the images are the outermost dimension.
13 |     Nhwc,
14 | }
15 | 
16 | impl From<ScalarC> for cudnn_sys::cudnnTensorFormat_t {
17 |     fn from(tensor_format: ScalarC) -> Self {
18 |         match tensor_format {
19 |             ScalarC::Nchw => cudnn_sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
20 |             ScalarC::Nhwc => cudnn_sys::cudnnTensorFormat_t::CUDNN_TENSOR_NHWC,
21 |         }
22 |     }
23 | }
24 | 
25 | /// Predefined layouts for tensors.
26 | ///
27 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t)
28 | /// may offer additional information about the APi behavior.
29 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
30 | pub enum TensorFormat {
31 |     /// Scalar valued formats.
32 |     ///
33 |     /// * `Nchw` format.
34 |     ///
35 |     /// * `Nhwc` format.
36 |     ScalarC(ScalarC),
37 |     /// Vector valued formats.
38 |     ///
39 |     /// This tensor format specifies that the data is laid out in the following order: batch size,
40 |     /// feature maps, rows, columns. However, each element of the tensor is a vector of multiple
41 |     /// feature maps.
42 |     NchwVectC,
43 | }
44 | 
45 | impl From<ScalarC> for TensorFormat {
46 |     fn from(tensor_format: ScalarC) -> Self {
47 |         Self::ScalarC(tensor_format)
48 |     }
49 | }
50 | 
51 | impl From<TensorFormat> for cudnn_sys::cudnnTensorFormat_t {
52 |     fn from(tensor_format: TensorFormat) -> Self {
53 |         match tensor_format {
54 |             TensorFormat::ScalarC(fmt) => fmt.into(),
55 |             TensorFormat::NchwVectC => cudnn_sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW_VECT_C,
56 |         }
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/crates/cudnn/src/w_grad_mode.rs:
--------------------------------------------------------------------------------
 1 | /// Selects how buffers holding gradients of the loss function, computed with respect to trainable
 2 | /// parameters, are updated.
 3 | ///
 4 | /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnWgradMode_t)
 5 | /// may offer additional information about the APi behavior.
 6 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 7 | pub enum WGradMode {
 8 |     /// A weight gradient component, corresponding to a new batch of inputs, overwrites previously
 9 |     /// stored weight gradients in the output buffer.
10 |     Set,
11 |     /// A weight gradient component corresponding to a new batch of inputs is added to previously
12 |     /// evaluated weight gradients. Before using this mode, the buffer holding weight gradients
13 |     /// should be initialized to zero. Alternatively, the first API call outputting to an
14 |     /// uninitialized buffer should use the `WGradMode::Set` variant.
15 |     Add,
16 | }
17 | 
18 | impl From<WGradMode> for cudnn_sys::cudnnWgradMode_t {
19 |     fn from(mode: WGradMode) -> Self {
20 |         match mode {
21 |             WGradMode::Set => Self::CUDNN_WGRAD_MODE_SET,
22 |             WGradMode::Add => Self::CUDNN_WGRAD_MODE_ADD,
23 |         }
24 |     }
25 | }
26 | 


--------------------------------------------------------------------------------
/crates/cust/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cust"
 3 | version = "0.3.2"
 4 | # Big thanks to the original author of rustacuda <3
 5 | authors = [
 6 |     "Riccardo D'Ambrosio <rdambrosio016@gmail.com>",
 7 |     "Brook Heisler <brookheisler@gmail.com>",
 8 | ]
 9 | edition = "2021"
10 | license = "MIT OR Apache-2.0"
11 | description = "High level bindings to the CUDA Driver API"
12 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
13 | readme = "../../README.md"
14 | 
15 | [dependencies]
16 | cust_core = { path = "../cust_core", version = "0.1.0"}
17 | cust_raw = { path = "../cust_raw", default-features = false, features = ["driver"] }
18 | bitflags = "2.8"
19 | cust_derive = { path = "../cust_derive", version = "0.2" }
20 | glam = { version = "0.29.2", features=["cuda"], optional = true }
21 | mint = { version = "^0.5", optional = true }
22 | num-complex = { version = "0.4.6", optional = true }
23 | vek = { version = "0.17.1", optional = true, default-features = false }
24 | bytemuck = { version = "1.21", optional = true }
25 | 
26 | [features]
27 | default= ["bytemuck", "impl_glam", "impl_mint", "impl_vek"]
28 | impl_glam = ["cust_core/glam", "glam"]
29 | impl_mint = ["cust_core/mint", "mint"]
30 | impl_vek = ["cust_core/vek", "vek"]
31 | impl_half = ["cust_core/half"]
32 | impl_num_complex = ["cust_core/num-complex", "num-complex"]
33 | 
34 | [build-dependencies]
35 | serde_json = "1.0.140"
36 | 
37 | [dev-dependencies]
38 | image = "0.25.5"
39 | 
40 | [package.metadata.docs.rs]
41 | rustdoc-args = ["--cfg", "docsrs"]
42 | 


--------------------------------------------------------------------------------
/crates/cust/README.md:
--------------------------------------------------------------------------------
1 | # Cust 
2 | 
3 | Featureful, Safe, and Fast CUDA Driver API wrapper for the Rust CUDA Project.
4 | 
5 | Cust is a fork of rustacuda with a lot of API changes, added functions, etc. Big thanks to everyone who worked on RustaCUDA!
6 | 


--------------------------------------------------------------------------------
/crates/cust/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | 
 3 | fn main() {
 4 |     let driver_version = env::var("DEP_CUDA_DRIVER_VERSION")
 5 |         .expect("Cannot find transitive metadata 'driver_version' from cust_raw package.")
 6 |         .parse::<u32>()
 7 |         .expect("Failed to parse CUDA driver version");
 8 | 
 9 |     println!("cargo::rustc-check-cfg=cfg(conditional_node)");
10 |     if driver_version >= 12030 {
11 |         println!("cargo::rustc-cfg=conditional_node");
12 |     }
13 | }
14 | 


--------------------------------------------------------------------------------
/crates/cust/resources/add.cu:
--------------------------------------------------------------------------------
1 | extern "C" __constant__ int my_constant = 314;
2 | 
3 | extern "C" __global__ void sum(const float *x, const float *y, float *out, int count)
4 | {
5 |   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x)
6 |   {
7 |     out[i] = x[i] + y[i];
8 |   }
9 | }


--------------------------------------------------------------------------------
/crates/cust/resources/add.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/crates/cust/resources/add.cubin


--------------------------------------------------------------------------------
/crates/cust/resources/add.fatbin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/crates/cust/resources/add.fatbin


--------------------------------------------------------------------------------
/crates/cust/resources/add.ptx:
--------------------------------------------------------------------------------
 1 | //
 2 | // Generated by NVIDIA NVVM Compiler
 3 | //
 4 | // Compiler Build ID: CL-24817639
 5 | // Cuda compilation tools, release 10.0, V10.0.130
 6 | // Based on LLVM 3.4svn
 7 | //
 8 | 
 9 | .version 3.2
10 | .target sm_20
11 | .address_size 64
12 | 
13 |         // .globl       sum
14 | .const .align 4 .u32 my_constant = 314;
15 | 
16 | .visible .entry sum(
17 |         .param .u64 sum_param_0,
18 |         .param .u64 sum_param_1,
19 |         .param .u64 sum_param_2,
20 |         .param .u32 sum_param_3
21 | )
22 | {
23 |         .reg .pred      %p<3>;
24 |         .reg .f32       %f<4>;
25 |         .reg .b32       %r<11>;
26 |         .reg .b64       %rd<11>;
27 | 
28 | 
29 |         ld.param.u64    %rd4, [sum_param_0];
30 |         ld.param.u64    %rd5, [sum_param_1];
31 |         ld.param.u64    %rd6, [sum_param_2];
32 |         ld.param.u32    %r6, [sum_param_3];
33 |         mov.u32         %r1, %ntid.x;
34 |         mov.u32         %r7, %ctaid.x;
35 |         mov.u32         %r8, %tid.x;
36 |         mad.lo.s32      %r10, %r1, %r7, %r8;
37 |         setp.ge.s32     %p1, %r10, %r6;
38 |         @%p1 bra        BB0_3;
39 | 
40 |         cvta.to.global.u64      %rd1, %rd6;
41 |         cvta.to.global.u64      %rd2, %rd5;
42 |         cvta.to.global.u64      %rd3, %rd4;
43 |         mov.u32         %r9, %nctaid.x;
44 |         mul.lo.s32      %r3, %r9, %r1;
45 | 
46 | BB0_2:
47 |         mul.wide.s32    %rd7, %r10, 4;
48 |         add.s64         %rd8, %rd3, %rd7;
49 |         add.s64         %rd9, %rd2, %rd7;
50 |         ld.global.f32   %f1, [%rd9];
51 |         ld.global.f32   %f2, [%rd8];
52 |         add.f32         %f3, %f2, %f1;
53 |         add.s64         %rd10, %rd1, %rd7;
54 |         st.global.f32   [%rd10], %f3;
55 |         add.s32         %r10, %r3, %r10;
56 |         setp.lt.s32     %p2, %r10, %r6;
57 |         @%p2 bra        BB0_2;
58 | 
59 | BB0_3:
60 |         ret;
61 | }


--------------------------------------------------------------------------------
/crates/cust/src/compile.rs:
--------------------------------------------------------------------------------
1 | //! Utilities for compiling PTX strings to cubin.
2 | 
3 | 


--------------------------------------------------------------------------------
/crates/cust/src/external.rs:
--------------------------------------------------------------------------------
 1 | //! External memory and synchronization resources
 2 | 
 3 | use cust_raw::driver_sys;
 4 | 
 5 | use crate::error::{CudaResult, ToResult};
 6 | use crate::memory::{DeviceCopy, DevicePointer};
 7 | 
 8 | #[repr(transparent)]
 9 | pub struct ExternalMemory(driver_sys::CUexternalMemory);
10 | 
11 | impl ExternalMemory {
12 |     // Import an external memory referenced by `fd` with `size`
13 |     #[allow(clippy::missing_safety_doc)]
14 |     pub unsafe fn import(fd: i32, size: usize) -> CudaResult<ExternalMemory> {
15 |         let desc = driver_sys::CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
16 |             type_: driver_sys::CUexternalMemoryHandleType_enum::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
17 |             handle: driver_sys::CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1 { fd },
18 |             size: size as u64,
19 |             flags: 0,
20 |             reserved: Default::default(),
21 |         };
22 | 
23 |         let mut memory: driver_sys::CUexternalMemory = std::ptr::null_mut();
24 | 
25 |         driver_sys::cuImportExternalMemory(&mut memory, &desc)
26 |             .to_result()
27 |             .map(|_| ExternalMemory(memory))
28 |     }
29 | 
30 |     #[allow(clippy::missing_safety_doc)]
31 |     pub unsafe fn reimport(&mut self, fd: i32, size: usize) -> CudaResult<()> {
32 |         // import new memory - this will call drop to destroy the old one
33 |         *self = ExternalMemory::import(fd, size)?;
34 | 
35 |         Ok(())
36 |     }
37 | 
38 |     // Map a buffer from this memory with `size` and `offset`
39 |     pub fn mapped_buffer<T: DeviceCopy>(
40 |         &self,
41 |         size_in_bytes: usize,
42 |         offset_in_bytes: usize,
43 |     ) -> CudaResult<DevicePointer<T>> {
44 |         let buffer_desc = driver_sys::CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
45 |             flags: 0,
46 |             size: size_in_bytes as u64,
47 |             offset: offset_in_bytes as u64,
48 |             reserved: Default::default(),
49 |         };
50 | 
51 |         let mut dptr = 0;
52 |         unsafe {
53 |             driver_sys::cuExternalMemoryGetMappedBuffer(&mut dptr, self.0, &buffer_desc)
54 |                 .to_result()
55 |                 .map(|_| DevicePointer::from_raw(dptr))
56 |         }
57 |     }
58 | }
59 | 
60 | impl Drop for ExternalMemory {
61 |     fn drop(&mut self) {
62 |         unsafe {
63 |             driver_sys::cuDestroyExternalMemory(self.0)
64 |                 .to_result()
65 |                 .unwrap();
66 |         }
67 |     }
68 | }
69 | 


--------------------------------------------------------------------------------
/crates/cust/src/graph_dotfile.rs:
--------------------------------------------------------------------------------
 1 | /// Implementation of turning a Graph into a dotfile for debugging and visualization.
 2 | use crate::{
 3 |     error::CudaResult,
 4 |     graph::{Graph, GraphNode, GraphNodeType},
 5 | };
 6 | use std::fmt::Write;
 7 | 
 8 | // CUDA has a function exactly for this, but it has a couple issues:
 9 | // - it includes useless info users dont really need
10 | // - it can only dump it to a file
11 | // - it takes a cstring for the
12 | 
13 | // #[allow(unused_must_use)]
14 | // pub(crate) fn graph_to_dot(graph: &mut Graph) -> CudaResult<String> {
15 | //     let mut dot = String::new();
16 | 
17 | //     writeln!(dot, "digraph dot {{");
18 | //     writeln!(dot, "  subgraph cluster_1 {{");
19 | //     writeln!(dot, "    label = \"Graph 1\" graph[style = \"dashed\"];");
20 | //     writeln!(dot, "    ")
21 | 
22 | //     Ok(dot)
23 | // }
24 | 
25 | // fn node_to_dot(graph: &mut Graph, node: GraphNode, graph_num: u32, node_num: u32) -> CudaResult<String> {
26 | //     let kind = graph.node_type(node)?;
27 | 
28 | //     let name = format!("graph_{}_node_{}", graph_num, node_num);
29 | //     let style = "Mrecord";
30 | //     let contents = match graph.node_type(node)? {
31 | //         GraphNodeType::KernelInvocation => {
32 | //             let params = graph.kernel_node_params(node)?;
33 | 
34 | //         }
35 | //     }
36 | // }
37 | 


--------------------------------------------------------------------------------
/crates/cust/src/memory/device/device_variable.rs:
--------------------------------------------------------------------------------
 1 | use crate::error::CudaResult;
 2 | use crate::memory::device::CopyDestination;
 3 | use crate::memory::DeviceCopy;
 4 | use crate::memory::{DeviceBox, DevicePointer};
 5 | use std::ops::{Deref, DerefMut};
 6 | 
 7 | /// Wrapper around a variable on the host and a [`DeviceBox`] holding the
 8 | /// variable on the device, allowing for easy synchronization and storage.
 9 | #[derive(Debug)]
10 | pub struct DeviceVariable<T: DeviceCopy> {
11 |     mem: DeviceBox<T>,
12 |     var: T,
13 | }
14 | 
15 | impl<T: DeviceCopy> DeviceVariable<T> {
16 |     /// Create a new `DeviceVariable` wrapping `var`.
17 |     ///
18 |     /// Allocates storage on the device and copies `var` to the device.
19 |     pub fn new(var: T) -> CudaResult<Self> {
20 |         let mem = DeviceBox::new(&var)?;
21 |         Ok(Self { mem, var })
22 |     }
23 | 
24 |     /// Copy the host copy of the variable to the device
25 |     pub fn copy_htod(&mut self) -> CudaResult<()> {
26 |         self.mem.copy_from(&self.var)
27 |     }
28 | 
29 |     /// Copy the device copy of the variable to the host
30 |     pub fn copy_dtoh(&mut self) -> CudaResult<()> {
31 |         self.mem.copy_to(&mut self.var)
32 |     }
33 | 
34 |     pub fn as_device_ptr(&self) -> DevicePointer<T> {
35 |         self.mem.as_device_ptr()
36 |     }
37 | }
38 | 
39 | impl<T: DeviceCopy> Deref for DeviceVariable<T> {
40 |     type Target = T;
41 | 
42 |     fn deref(&self) -> &Self::Target {
43 |         &self.var
44 |     }
45 | }
46 | 
47 | impl<T: DeviceCopy> DerefMut for DeviceVariable<T> {
48 |     fn deref_mut(&mut self) -> &mut Self::Target {
49 |         &mut self.var
50 |     }
51 | }
52 | 


--------------------------------------------------------------------------------
/crates/cust/src/memory/locked.rs:
--------------------------------------------------------------------------------
1 | mod locked_box;
2 | mod locked_buffer;
3 | 
4 | pub use locked_box::*;
5 | pub use locked_buffer::*;
6 | 


--------------------------------------------------------------------------------
/crates/cust/src/nvtx.rs:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/crates/cust/src/prelude.rs:
--------------------------------------------------------------------------------
 1 | //! This module re-exports a number of commonly-used types for working with cust.
 2 | //!
 3 | //! This allows the user to `use cust::prelude::*;` and have the most commonly-used types
 4 | //! available quickly.
 5 | 
 6 | pub use crate::context::{Context, ContextFlags};
 7 | pub use crate::device::Device;
 8 | pub use crate::event::{Event, EventFlags, EventStatus};
 9 | pub use crate::external::*;
10 | pub use crate::function::Function;
11 | pub use crate::launch;
12 | pub use crate::memory::{
13 |     CopyDestination, DeviceBuffer, DevicePointer, DeviceSlice, DeviceVariable, UnifiedBuffer,
14 | };
15 | pub use crate::module::Module;
16 | pub use crate::stream::{Stream, StreamFlags};
17 | pub use crate::util::*;
18 | pub use crate::CudaFlags;
19 | 


--------------------------------------------------------------------------------
/crates/cust_core/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cust_core"
 3 | version = "0.1.1"
 4 | edition = "2021"
 5 | license = "MIT OR Apache-2.0"
 6 | description = "Core library for cust that can be shared across CPU and GPU"
 7 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 8 | readme = "../../README.md"
 9 | 
10 | [dependencies]
11 | vek = { version = "0.17.1", default-features=false, features=["libm"], optional = true }
12 | glam = { version = "0.29.2", features=["cuda", "libm"], default-features=false, optional=true }
13 | mint = { version = "^0.5", optional = true }
14 | half = { version = "2.4.1", optional = true }
15 | num-complex = { version = "0.4.6", optional = true }
16 | cust_derive = { path = "../cust_derive", version = "0.2" }
17 | 


--------------------------------------------------------------------------------
/crates/cust_derive/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | **/*.rs.bk
3 | Cargo.lock
4 | .env


--------------------------------------------------------------------------------
/crates/cust_derive/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cust_derive"
 3 | version = "0.2.0"
 4 | authors = ["Brook Heisler <brookheisler@gmail.com>", "Riccardo D'Ambrosio <rdambrosio016@gmail.com>"]
 5 | edition = "2018"
 6 | license = "MIT OR Apache-2.0"
 7 | description = "Macros for cust"
 8 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 9 | readme = "../../README.md"
10 | 
11 | [lib]
12 | proc-macro = true
13 | 
14 | [dependencies]
15 | syn = "2.0.96"
16 | quote = "1.0.38"
17 | proc-macro2 = "1.0.93"
18 | 


--------------------------------------------------------------------------------
/crates/cust_derive/README.md:
--------------------------------------------------------------------------------
1 | Custom derive macro crate for [RustaCUDA](https://github.com/bheisler/RustaCUDA).


--------------------------------------------------------------------------------
/crates/cust_raw/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cust_raw"
 3 | version = "0.11.3"
 4 | edition = "2021"
 5 | license = "MIT OR Apache-2.0"
 6 | description = "Low level bindings to the CUDA Driver API"
 7 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 8 | readme = "../../README.md"
 9 | links = "cuda"
10 | build = "build/main.rs"
11 | 
12 | [build-dependencies]
13 | bindgen = "0.71.1"
14 | bimap = "0.6.3"
15 | cc = "1.2.17"
16 | 
17 | [package.metadata.docs.rs]
18 | features = [
19 |     "driver",
20 |     "runtime",
21 |     "cublas",
22 |     "cublaslt",
23 |     "cublasxt",
24 |     "cudnn",
25 |     "nvptx-compiler",
26 |     "nvvm",
27 | ]
28 | 
29 | [features]
30 | default = ["driver"]
31 | driver = []
32 | runtime = []
33 | cublas = []
34 | cublaslt = []
35 | cublasxt = []
36 | cudnn = []
37 | nvptx-compiler = []
38 | nvvm = []
39 | 


--------------------------------------------------------------------------------
/crates/cust_raw/build/cublasLt_wrapper.h:
--------------------------------------------------------------------------------
1 | #include "cublasLt.h"


--------------------------------------------------------------------------------
/crates/cust_raw/build/cublasXt_wrapper.h:
--------------------------------------------------------------------------------
1 | #include "cublasXt.h"


--------------------------------------------------------------------------------
/crates/cust_raw/build/cublas_wrapper.h:
--------------------------------------------------------------------------------
1 | #include "cublas_v2.h"


--------------------------------------------------------------------------------
/crates/cust_raw/build/driver_wrapper.h:
--------------------------------------------------------------------------------
1 | #include "cuComplex.h"
2 | #include "cuda.h"
3 | #include "cudaProfiler.h"
4 | #include "vector_types.h"


--------------------------------------------------------------------------------
/crates/cust_raw/build/nvptx_compiler_wrapper.h:
--------------------------------------------------------------------------------
1 | #include "nvPTXCompiler.h"


--------------------------------------------------------------------------------
/crates/cust_raw/build/nvvm_wrapper.h:
--------------------------------------------------------------------------------
1 | #include "nvvm.h"


--------------------------------------------------------------------------------
/crates/cust_raw/build/runtime_wrapper.h:
--------------------------------------------------------------------------------
1 | #include "cuda_runtime.h"
2 | #include "cuda_runtime_api.h"
3 | #include "cuda_profiler_api.h"


--------------------------------------------------------------------------------
/crates/cust_raw/src/cublas_sys.rs:
--------------------------------------------------------------------------------
1 | #![allow(non_upper_case_globals)]
2 | #![allow(non_camel_case_types)]
3 | #![allow(non_snake_case)]
4 | 
5 | include!(concat!(env!("OUT_DIR"), "/cublas_sys.rs"));
6 | 


--------------------------------------------------------------------------------
/crates/cust_raw/src/cublaslt_sys.rs:
--------------------------------------------------------------------------------
1 | #![allow(non_upper_case_globals)]
2 | #![allow(non_camel_case_types)]
3 | #![allow(non_snake_case)]
4 | 
5 | include!(concat!(env!("OUT_DIR"), "/cublasLt_sys.rs"));
6 | 


--------------------------------------------------------------------------------
/crates/cust_raw/src/cublasxt_sys.rs:
--------------------------------------------------------------------------------
1 | #![allow(non_upper_case_globals)]
2 | #![allow(non_camel_case_types)]
3 | #![allow(non_snake_case)]
4 | 
5 | include!(concat!(env!("OUT_DIR"), "/cublasXt_sys.rs"));
6 | 


--------------------------------------------------------------------------------
/crates/cust_raw/src/driver_sys.rs:
--------------------------------------------------------------------------------
1 | #![allow(non_upper_case_globals)]
2 | #![allow(non_camel_case_types)]
3 | #![allow(non_snake_case)]
4 | 
5 | include!(concat!(env!("OUT_DIR"), "/driver_sys.rs"));
6 | 


--------------------------------------------------------------------------------
/crates/cust_raw/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #[cfg(feature = "driver")]
 2 | pub mod driver_sys;
 3 | #[cfg(feature = "runtime")]
 4 | pub mod runtime_sys;
 5 | 
 6 | #[cfg(feature = "cublas")]
 7 | pub mod cublas_sys;
 8 | #[cfg(feature = "cublaslt")]
 9 | pub mod cublaslt_sys;
10 | #[cfg(feature = "cublasxt")]
11 | pub mod cublasxt_sys;
12 | 
13 | #[cfg(feature = "nvptx-compiler")]
14 | pub mod nvptx_compiler_sys;
15 | #[cfg(feature = "nvvm")]
16 | pub mod nvvm_sys;
17 | 


--------------------------------------------------------------------------------
/crates/cust_raw/src/nvptx_compiler_sys.rs:
--------------------------------------------------------------------------------
1 | #![allow(non_upper_case_globals)]
2 | #![allow(non_camel_case_types)]
3 | #![allow(non_snake_case)]
4 | 
5 | include!(concat!(env!("OUT_DIR"), "/nvptx_compiler_sys.rs"));
6 | 


--------------------------------------------------------------------------------
/crates/cust_raw/src/nvvm_sys.rs:
--------------------------------------------------------------------------------
1 | #![allow(non_upper_case_globals)]
2 | #![allow(non_camel_case_types)]
3 | #![allow(non_snake_case)]
4 | 
5 | pub const LIBDEVICE_BITCODE: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libdevice.bc"));
6 | 
7 | include!(concat!(env!("OUT_DIR"), "/nvvm_sys.rs"));
8 | 


--------------------------------------------------------------------------------
/crates/cust_raw/src/runtime_sys.rs:
--------------------------------------------------------------------------------
1 | #![allow(non_upper_case_globals)]
2 | #![allow(non_camel_case_types)]
3 | #![allow(non_snake_case)]
4 | 
5 | include!(concat!(env!("OUT_DIR"), "/runtime_sys.rs"));
6 | 


--------------------------------------------------------------------------------
/crates/gpu_rand/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "gpu_rand"
 3 | version = "0.1.3"
 4 | authors = ["The Rand Project Developers", "The Rust CUDA Project Developers"]
 5 | license = "MIT OR Apache-2.0"
 6 | edition = "2021"
 7 | description = "GPU-friendly random number generators for the Rust CUDA Project"
 8 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 9 | readme = "../../README.md"
10 | 
11 | [dependencies]
12 | rand_core = { version = "0.9" }
13 | cust_core = { version = "0.1.0", path = "../cust_core" }
14 | 
15 | [target.'cfg(target_os = "cuda")'.dependencies]
16 | cuda_std = { version = "0.2", path = "../cuda_std" }
17 | 
18 | [package.metadata.docs.rs]
19 | rustdoc-args = ["--cfg", "docsrs"]
20 | 


--------------------------------------------------------------------------------
/crates/gpu_rand/LICENSE-RAND:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2018 Developers of the Rand project
 2 | 
 3 | Permission is hereby granted, free of charge, to any
 4 | person obtaining a copy of this software and associated
 5 | documentation files (the "Software"), to deal in the
 6 | Software without restriction, including without
 7 | limitation the rights to use, copy, modify, merge,
 8 | publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software
10 | is furnished to do so, subject to the following
11 | conditions:
12 | 
13 | The above copyright notice and this permission notice
14 | shall be included in all copies or substantial portions
15 | of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | DEALINGS IN THE SOFTWARE.
26 | 


--------------------------------------------------------------------------------
/crates/gpu_rand/README.md:
--------------------------------------------------------------------------------
 1 | # gpu_rand
 2 | 
 3 | gpu_rand is the Rust CUDA Project's equivalent of cuRAND. cuRAND unfortunately does not work with
 4 | the CUDA Driver API, therefore, we reimplement (and extend) some of its algorithms and provide them in this crate.
 5 | 
 6 | This crate is meant to be gpu-centric, which means it may special-case certain things to run faster on the GPU by using PTX 
 7 | assembly. However, it is supposed to also work on the CPU, allowing you to reuse the same random states across CPU and GPU.
 8 | 
 9 | A lot of the initial code is taken from the [rust-random project](https://github.com/rust-random) and modified to make it able to
10 | pass to the GPU, as well as cleaning up certain things and updating it to edition 2021.
11 | 
12 | The random generators currently implemented are:
13 | 
14 | 32-bit:
15 | - Xoroshiro64**
16 | - Xoroshiro64*
17 | - Xoroshiro128+
18 | - Xoroshiro128++
19 | - Xoroshiro128**
20 | 
21 | 64-bit:
22 | - Xoroshiro128+
23 | - Xoroshiro128++
24 | - Xoroshiro128**
25 | - Xoroshiro256+
26 | - Xoroshiro256++
27 | - Xoroshiro256**
28 | - Xoroshiro512+
29 | - Xoroshiro512++
30 | - Xoroshiro512**
31 | 
32 | - SplitMix64
33 | 
34 | We also provide a default 64-bit generator which should be more than enough for most applications. The default
35 | currently uses Xoroshiro128** but that is subject to change in the future.
36 | 


--------------------------------------------------------------------------------
/crates/gpu_rand/src/default.rs:
--------------------------------------------------------------------------------
 1 | use crate::xoroshiro::Xoroshiro128StarStar;
 2 | use rand_core::{RngCore, SeedableRng};
 3 | 
 4 | /// Default random generator which is good for most applications.
 5 | ///
 6 | /// This currently uses [`Xoroshiro128StarStar`], but that may be changed in the future (with a major version bump).
 7 | #[cfg_attr(not(target_os = "cuda"), derive(Copy, cust_core::DeviceCopy))]
 8 | #[derive(Debug, Clone, PartialEq, Eq)]
 9 | #[repr(transparent)]
10 | pub struct DefaultRand {
11 |     inner: Xoroshiro128StarStar,
12 | }
13 | 
14 | impl DefaultRand {
15 |     /// Initializes many states such that each state is offset in the main sequence by at least
16 |     /// `2**64` elements (based on the current default generator). Such that every state is independent
17 |     /// from the others as long as no state requests more than `2**64` random numbers.
18 |     #[cfg_attr(docsrs, doc(cfg(not(target_os = "cuda"))))]
19 |     #[cfg(not(target_os = "cuda"))]
20 |     pub fn initialize_states(seed: u64, num_states: usize) -> Vec<Self> {
21 |         Xoroshiro128StarStar::initialize_states(seed, num_states)
22 |             .into_iter()
23 |             .map(|inner| Self { inner })
24 |             .collect()
25 |     }
26 | }
27 | 
28 | impl RngCore for DefaultRand {
29 |     fn next_u32(&mut self) -> u32 {
30 |         self.inner.next_u32()
31 |     }
32 | 
33 |     fn next_u64(&mut self) -> u64 {
34 |         self.inner.next_u64()
35 |     }
36 | 
37 |     fn fill_bytes(&mut self, dest: &mut [u8]) {
38 |         self.inner.fill_bytes(dest)
39 |     }
40 | }
41 | 
42 | impl SeedableRng for DefaultRand {
43 |     type Seed = <Xoroshiro128StarStar as SeedableRng>::Seed;
44 | 
45 |     fn seed_from_u64(state: u64) -> Self {
46 |         Self {
47 |             inner: Xoroshiro128StarStar::seed_from_u64(state),
48 |         }
49 |     }
50 | 
51 |     fn from_seed(seed: Self::Seed) -> Self {
52 |         Self {
53 |             inner: Xoroshiro128StarStar::from_seed(seed),
54 |         }
55 |     }
56 | }
57 | 


--------------------------------------------------------------------------------
/crates/gpu_rand/src/lib.rs:
--------------------------------------------------------------------------------
 1 | //! gpu_rand is the Rust CUDA Project's equivalent of cuRAND. cuRAND unfortunately does not work with
 2 | //! the CUDA Driver API, therefore, we reimplement (and extend) some of its algorithms and provide them in this crate.
 3 | //!
 4 | //! This crate is meant to be gpu-centric, which means it may special-case certain things to run faster on the GPU by using PTX
 5 | //! assembly. However, it is supposed to also work on the CPU, allowing you to reuse the same random states across CPU and GPU.
 6 | //!
 7 | //! A lot of the initial code is taken from the [rust-random project](https://github.com/rust-random) and modified to make it able to
 8 | //! pass to the GPU, as well as cleaning up certain things and updating it to edition 2021.
 9 | //! The following generators are implemented:
10 | //!
11 | 
12 | #![deny(missing_docs)]
13 | #![deny(missing_debug_implementations)]
14 | #![allow(clippy::unreadable_literal)]
15 | #![cfg_attr(target_os = "cuda", no_std)]
16 | #![feature(doc_cfg)]
17 | 
18 | pub mod xoroshiro;
19 | 
20 | mod default;
21 | mod gpurng;
22 | 
23 | pub use default::*;
24 | pub use gpurng::*;
25 | 


--------------------------------------------------------------------------------
/crates/gpu_rand/src/xoroshiro/xoroshiro64star.rs:
--------------------------------------------------------------------------------
 1 | use rand_core::impls::{fill_bytes_via_next, next_u64_via_u32};
 2 | use rand_core::le::read_u32_into;
 3 | use rand_core::{RngCore, SeedableRng};
 4 | 
 5 | /// A xoroshiro64* random number generator.
 6 | ///
 7 | /// The xoroshiro64* algorithm is not suitable for cryptographic purposes, but
 8 | /// is very fast and has good statistical properties, besides a low linear
 9 | /// complexity in the lowest bits.
10 | ///
11 | /// The algorithm used here is translated from [the `xoroshiro64star.c`
12 | /// reference source code](http://xoshiro.di.unimi.it/xoroshiro64star.c) by
13 | /// David Blackman and Sebastiano Vigna.
14 | #[allow(missing_copy_implementations)]
15 | #[cfg_attr(not(target_os = "cuda"), derive(Copy, cust_core::DeviceCopy))]
16 | #[derive(Debug, Clone, PartialEq, Eq)]
17 | #[repr(C)]
18 | pub struct Xoroshiro64Star {
19 |     s0: u32,
20 |     s1: u32,
21 | }
22 | 
23 | impl RngCore for Xoroshiro64Star {
24 |     #[inline]
25 |     fn next_u32(&mut self) -> u32 {
26 |         let r = self.s0.wrapping_mul(0x9E3779BB);
27 |         impl_xoroshiro_u32!(self);
28 |         r
29 |     }
30 | 
31 |     #[inline]
32 |     fn next_u64(&mut self) -> u64 {
33 |         next_u64_via_u32(self)
34 |     }
35 | 
36 |     #[inline]
37 |     fn fill_bytes(&mut self, dest: &mut [u8]) {
38 |         fill_bytes_via_next(self, dest);
39 |     }
40 | }
41 | 
42 | impl SeedableRng for Xoroshiro64Star {
43 |     type Seed = [u8; 8];
44 | 
45 |     /// Create a new `Xoroshiro64Star`.  If `seed` is entirely 0, it will be
46 |     /// mapped to a different seed.
47 |     fn from_seed(seed: [u8; 8]) -> Xoroshiro64Star {
48 |         deal_with_zero_seed!(seed, Self);
49 |         let mut s = [0; 2];
50 |         read_u32_into(&seed, &mut s);
51 | 
52 |         Xoroshiro64Star { s0: s[0], s1: s[1] }
53 |     }
54 | 
55 |     /// Seed a `Xoroshiro64Star` from a `u64` using `SplitMix64`.
56 |     fn seed_from_u64(seed: u64) -> Xoroshiro64Star {
57 |         from_splitmix!(seed)
58 |     }
59 | }
60 | 
61 | #[cfg(test)]
62 | mod tests {
63 |     use super::*;
64 | 
65 |     #[test]
66 |     fn reference() {
67 |         let mut rng = Xoroshiro64Star::from_seed([1, 0, 0, 0, 2, 0, 0, 0]);
68 |         // These values were produced with the reference implementation:
69 |         // http://xoshiro.di.unimi.it/xoshiro64star.c
70 |         let expected = [
71 |             2654435771, 327208753, 4063491769, 4259754937, 261922412, 168123673, 552743735,
72 |             1672597395, 1031040050, 2755315674,
73 |         ];
74 |         for &e in &expected {
75 |             assert_eq!(rng.next_u32(), e);
76 |         }
77 |     }
78 | 
79 |     #[test]
80 |     fn zero_seed() {
81 |         let mut rng = Xoroshiro64Star::seed_from_u64(0);
82 |         assert_ne!(rng.next_u64(), 0);
83 |     }
84 | }
85 | 


--------------------------------------------------------------------------------
/crates/gpu_rand/src/xoroshiro/xoroshiro64starstar.rs:
--------------------------------------------------------------------------------
 1 | use rand_core::impls::{fill_bytes_via_next, next_u64_via_u32};
 2 | use rand_core::le::read_u32_into;
 3 | use rand_core::{RngCore, SeedableRng};
 4 | 
 5 | /// A xoroshiro64** random number generator.
 6 | ///
 7 | /// The xoshiro64** algorithm is not suitable for cryptographic purposes, but
 8 | /// is very fast and has excellent statistical properties.
 9 | ///
10 | /// The algorithm used here is translated from [the `xoroshiro64starstar.c`
11 | /// reference source code](http://xoshiro.di.unimi.it/xoroshiro64starstar.c) by
12 | /// David Blackman and Sebastiano Vigna.
13 | #[allow(missing_copy_implementations)]
14 | #[cfg_attr(not(target_os = "cuda"), derive(Copy, cust_core::DeviceCopy))]
15 | #[derive(Debug, Clone, PartialEq, Eq)]
16 | #[repr(C)]
17 | pub struct Xoroshiro64StarStar {
18 |     s0: u32,
19 |     s1: u32,
20 | }
21 | 
22 | impl RngCore for Xoroshiro64StarStar {
23 |     #[inline]
24 |     fn next_u32(&mut self) -> u32 {
25 |         let r = starstar_u32!(self.s0);
26 |         impl_xoroshiro_u32!(self);
27 |         r
28 |     }
29 | 
30 |     #[inline]
31 |     fn next_u64(&mut self) -> u64 {
32 |         next_u64_via_u32(self)
33 |     }
34 | 
35 |     #[inline]
36 |     fn fill_bytes(&mut self, dest: &mut [u8]) {
37 |         fill_bytes_via_next(self, dest);
38 |     }
39 | }
40 | 
41 | impl SeedableRng for Xoroshiro64StarStar {
42 |     type Seed = [u8; 8];
43 | 
44 |     /// Create a new `Xoroshiro64StarStar`.  If `seed` is entirely 0, it will be
45 |     /// mapped to a different seed.
46 |     fn from_seed(seed: [u8; 8]) -> Xoroshiro64StarStar {
47 |         deal_with_zero_seed!(seed, Self);
48 |         let mut s = [0; 2];
49 |         read_u32_into(&seed, &mut s);
50 | 
51 |         Xoroshiro64StarStar { s0: s[0], s1: s[1] }
52 |     }
53 | 
54 |     /// Seed a `Xoroshiro64StarStar` from a `u64` using `SplitMix64`.
55 |     fn seed_from_u64(seed: u64) -> Xoroshiro64StarStar {
56 |         from_splitmix!(seed)
57 |     }
58 | }
59 | 
60 | #[cfg(test)]
61 | mod tests {
62 |     use super::*;
63 | 
64 |     #[test]
65 |     fn reference() {
66 |         let mut rng = Xoroshiro64StarStar::from_seed([1, 0, 0, 0, 2, 0, 0, 0]);
67 |         // These values were produced with the reference implementation:
68 |         // http://xoshiro.di.unimi.it/xoshiro64starstar.c
69 |         let expected = [
70 |             3802928447, 813792938, 1618621494, 2955957307, 3252880261, 1129983909, 2539651700,
71 |             1327610908, 1757650787, 2763843748,
72 |         ];
73 |         for &e in &expected {
74 |             assert_eq!(rng.next_u32(), e);
75 |         }
76 |     }
77 | 
78 |     #[test]
79 |     fn zero_seed() {
80 |         let mut rng = Xoroshiro64StarStar::seed_from_u64(0);
81 |         assert_ne!(rng.next_u64(), 0);
82 |     }
83 | }
84 | 


--------------------------------------------------------------------------------
/crates/nvvm/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "nvvm"
 3 | version = "0.1.1"
 4 | authors = ["Riccardo D'Ambrosio <rdambrosio016@gmail.com>"]
 5 | edition = "2018"
 6 | license = "MIT OR Apache-2.0"
 7 | description = "High level bindings to libnvvm"
 8 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 9 | readme = "../../README.md"
10 | 
11 | [dependencies]
12 | cust_raw = { path = "../cust_raw", default-features = false, features = ["nvvm"] }
13 | 


--------------------------------------------------------------------------------
/crates/optix-sys/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "optix-sys"
 3 | version = "0.1.0"
 4 | edition = "2024"
 5 | license = "MIT OR Apache-2.0"
 6 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 7 | readme = "../../README.md"
 8 | links = "optix"
 9 | build = "build/main.rs"
10 | 
11 | [dependencies]
12 | cust_raw = { path = "../cust_raw", default-features = false, features = ["driver"] }
13 | 
14 | [build-dependencies]
15 | bindgen = "0.71.1"
16 | cc = "1.0.71"
17 | 


--------------------------------------------------------------------------------
/crates/optix-sys/build/optix_stubs.c:
--------------------------------------------------------------------------------
 1 | // Optix uses a function table approach to load its api at runtime. The table is defined
 2 | // by including optix_function_table_definition.h here.
 3 | #include "optix_function_table_definition.h"
 4 | 
 5 | // These are copied from optix_stubs.h so they do not get affected by the redefinition
 6 | // of inline. See below.
 7 | #ifdef _WIN32
 8 | #ifndef WIN32_LEAN_AND_MEAN
 9 | #define WIN32_LEAN_AND_MEAN 1
10 | #endif
11 | #include <windows.h>
12 | // The cfgmgr32 header is necessary for interrogating driver information in the registry.
13 | // For convenience the library is also linked in automatically using the #pragma command.
14 | #include <cfgmgr32.h>
15 | #pragma comment(lib, "Cfgmgr32.lib")
16 | #include <string.h>
17 | #else
18 | #include <dlfcn.h>
19 | #endif
20 | 
21 | // optix_stubs.h contains the functions needed to load the library and provides stubs
22 | // to call the functions in the function table. However, the stubs are defined as
23 | // `inline`, and won't be included in the final binary as is. We work around this by
24 | // redefining `inline` to do nothing before including the header.
25 | #define inline
26 | #include "optix_stubs.h"
27 | #undef inline
28 | 


--------------------------------------------------------------------------------
/crates/optix-sys/build/wrapper.h:
--------------------------------------------------------------------------------
 1 | #include <optix.h>
 2 | #include <optix_host.h>
 3 | 
 4 | static const size_t OptixSbtRecordHeaderSize = OPTIX_SBT_RECORD_HEADER_SIZE;
 5 | static const size_t OptixSbtRecordAlignment = OPTIX_SBT_RECORD_ALIGNMENT;
 6 | static const size_t OptixAccelBufferByteAlignment =
 7 |     OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT;
 8 | static const size_t OptixInstanceByteAlignment = OPTIX_INSTANCE_BYTE_ALIGNMENT;
 9 | static const size_t OptixAabbBufferByteAlignment =
10 |     OPTIX_AABB_BUFFER_BYTE_ALIGNMENT;
11 | static const size_t OptixGeometryTransformByteAlignment =
12 |     OPTIX_GEOMETRY_TRANSFORM_BYTE_ALIGNMENT;
13 | static const size_t OptixTransformByteAlignment =
14 |     OPTIX_TRANSFORM_BYTE_ALIGNMENT;
15 | 
16 | static const size_t OptixVersion = OPTIX_VERSION;
17 | 
18 | static const size_t OptixBuildInputSize = sizeof(OptixBuildInput);
19 | static const size_t OptixShaderBindingTableSize = sizeof(OptixShaderBindingTable);
20 | 
21 | /**
22 |  * <div rustbindgen replaces="OptixGeometryFlags"></div>
23 |  */
24 | enum GeometryFlags
25 | {
26 |     None = OPTIX_GEOMETRY_FLAG_NONE,
27 |     DisableAnyHit = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT,
28 |     RequireSingleAnyHitCall = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL
29 | };
30 | 


--------------------------------------------------------------------------------
/crates/optix-sys/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![allow(non_upper_case_globals)]
 2 | #![allow(non_camel_case_types)]
 3 | #![allow(non_snake_case)]
 4 | #![allow(unsafe_op_in_unsafe_fn)]
 5 | 
 6 | mod optix_sys;
 7 | mod stub;
 8 | 
 9 | pub use crate::optix_sys::*;
10 | pub use crate::stub::*;
11 | 


--------------------------------------------------------------------------------
/crates/optix-sys/src/optix_sys.rs:
--------------------------------------------------------------------------------
1 | #![allow(non_upper_case_globals)]
2 | #![allow(non_camel_case_types)]
3 | #![allow(non_snake_case)]
4 | 
5 | include!(concat!(env!("OUT_DIR"), "/optix_sys.rs"));
6 | 


--------------------------------------------------------------------------------
/crates/optix-sys/src/stub.rs:
--------------------------------------------------------------------------------
1 | use crate::optix_sys::OptixResult;
2 | 
3 | unsafe extern "C" {
4 |     pub fn optixInit() -> OptixResult;
5 | }
6 | 


--------------------------------------------------------------------------------
/crates/optix/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "optix"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | license = "MIT OR Apache-2.0"
 6 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 7 | readme = "../../README.md"
 8 | authors = ["Anders Langlands <anderslanglands@gmail.com>", "Riccardo D'Ambrosio <rdambrosio016@gmail.com>"]
 9 | 
10 | [dependencies]
11 | cust = { version = "0.3", path = "../cust", features=["impl_mint"] }
12 | cust_raw = { path = "../cust_raw", features=["driver"] }
13 | cfg-if = "1.0.0"
14 | bitflags = "2.9.0"
15 | glam = { version = "0.29", features=["cuda", "libm"], default-features=false, optional=true }
16 | half = { version = "2.4.1", optional = true }
17 | memoffset = "0.9.1"
18 | mint = "0.5.9"
19 | embed-doc-image = {version = "0.1.4"}
20 | optix-sys = { path = "../optix-sys", default-features = false }
21 | 
22 | [features]
23 | default=["impl_glam"]
24 | impl_glam=["cust/impl_glam", "glam"]
25 | impl_half=["cust/impl_half", "half"]
26 | 
27 | [package.metadata.docs.rs]
28 | rustdoc-args = [ "--html-in-header", "katex-header.html" ]
29 | 


--------------------------------------------------------------------------------
/crates/optix/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | 
 3 | fn main() {
 4 |     let optix_version = env::var("DEP_OPTIX_VERSION")
 5 |         .expect("Cannot find transitive metadata 'version' from optix-sys package.")
 6 |         .parse::<u32>()
 7 |         .expect("Failed to parse OptiX version");
 8 | 
 9 |     println!("cargo::rustc-check-cfg=cfg(optix_build_input_instance_array_aabbs)");
10 |     println!("cargo::rustc-check-cfg=cfg(optix_module_compile_options_bound_values)");
11 |     println!("cargo::rustc-check-cfg=cfg(optix_pipeline_compile_options_reserved)");
12 |     println!("cargo::rustc-check-cfg=cfg(optix_program_group_options_reserved)");
13 | 
14 |     if optix_version < 70200 {
15 |         println!("cargo::rustc-cfg=optix_build_input_instance_array_aabbs");
16 |     }
17 |     if optix_version >= 70200 {
18 |         println!("cargo::rustc-cfg=optix_module_compile_options_bound_values");
19 |     }
20 |     if optix_version >= 70300 {
21 |         println!("cargo::rustc-cfg=optix_pipeline_compile_options_reserved");
22 |         println!("cargo::rustc-cfg=optix_program_group_options_reserved");
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/crates/optix/examples/common/gdt/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # ======================================================================== #
 2 | # Copyright 2018-2019 Ingo Wald                                            #
 3 | #                                                                          #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");          #
 5 | # you may not use this file except in compliance with the License.         #
 6 | # You may obtain a copy of the License at                                  #
 7 | #                                                                          #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0                           #
 9 | #                                                                          #
10 | # Unless required by applicable law or agreed to in writing, software      #
11 | # distributed under the License is distributed on an "AS IS" BASIS,        #
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
13 | # See the License for the specific language governing permissions and      #
14 | # limitations under the License.                                           #
15 | # ======================================================================== #
16 | 
17 | project(GPU_Development_Tools)
18 | cmake_minimum_required(VERSION 3.5)
19 | 
20 | set(CMAKE_CXX_STANDARD 11)
21 | 
22 | include_directories(${CMAKE_CURRENT_SOURCE_DIR})
23 | 
24 | add_library(gdt 
25 |   cmake/configure_build_type.cmake
26 |   cmake/configure_optix.cmake
27 |   cmake/FindOptiX.cmake
28 |   
29 |   gdt/gdt.h
30 |   gdt/math/LinearSpace.h
31 |   gdt/math/AffineSpace.h
32 |   
33 |   gdt/gdt.cpp
34 |   )
35 | 
36 | 


--------------------------------------------------------------------------------
/crates/optix/examples/common/gdt/cmake/configure_build_type.cmake:
--------------------------------------------------------------------------------
 1 | # ======================================================================== #
 2 | # Copyright 2018-2020 Ingo Wald                                            #
 3 | #                                                                          #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");          #
 5 | # you may not use this file except in compliance with the License.         #
 6 | # You may obtain a copy of the License at                                  #
 7 | #                                                                          #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0                           #
 9 | #                                                                          #
10 | # Unless required by applicable law or agreed to in writing, software      #
11 | # distributed under the License is distributed on an "AS IS" BASIS,        #
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
13 | # See the License for the specific language governing permissions and      #
14 | # limitations under the License.                                           #
15 | # ======================================================================== #
16 | 
17 | # This helper script sets up default build targets for Release/Debug, etc,
18 | # something which each project I worked on seems to need, eventually, so
19 | # having it in one place arguably makes sense.
20 | 
21 | if(NOT SET_UP_CONFIGURATIONS_DONE)
22 |     set(SET_UP_CONFIGURATIONS_DONE 1)
23 | 
24 |     # No reason to set CMAKE_CONFIGURATION_TYPES if it's not a multiconfig generator
25 |     # Also no reason mess with CMAKE_BUILD_TYPE if it's a multiconfig generator.
26 |     if(CMAKE_CONFIGURATION_TYPES) # multiconfig generator?
27 |         set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE) 
28 |     else()
29 |         if(NOT CMAKE_BUILD_TYPE)
30 | #            message("Defaulting to release build.")
31 |             set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
32 |         endif()
33 |         set_property(CACHE CMAKE_BUILD_TYPE PROPERTY HELPSTRING "Choose the type of build")
34 |         # set the valid options for cmake-gui drop-down list
35 |         set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug;Release")
36 |     endif()
37 | endif()
38 | 
39 | SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
40 | SET(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
41 | SET(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})


--------------------------------------------------------------------------------
/crates/optix/examples/common/gdt/cmake/configure_glut.cmake:
--------------------------------------------------------------------------------
 1 | # helper script that finds GLUT, either from the system install (linux), or from the included, precompiled binaries (windows)
 2 | # Note we *intentionally* do not use the file name of "FindGLUT.cmake" because we want to call the system-provided FindGLUT later on, we just set up some paths, where required
 3 | 
 4 | # legacy gl vs glvnd/glx
 5 | if (POLICY CMP0072)
 6 |   cmake_policy(SET CMP0072 NEW)
 7 | endif()
 8 | 
 9 | if (WIN32)
10 |    # The default cmake-FindGLUT.cmake script will automatically search in 
11 |    # - ${GLUT_ROOT_PATH}/Release (fro the lib)
12 |    # - ${GLUT_ROOT_PATH}/include 
13 |    # ... ie, setting this search path _should_ make the default script find the
14 |    # right stuff, and set the right variables
15 |    set(GLUT_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/../3rdParty/freeglut")
16 | endif()
17 | 
18 | 
19 | find_package(OpenGL REQUIRED)
20 | find_package(GLUT REQUIRED)
21 | 


--------------------------------------------------------------------------------
/crates/optix/examples/common/gdt/cmake/configure_tbb.cmake:
--------------------------------------------------------------------------------
 1 | # ======================================================================== #
 2 | # Copyright 2018-2019 Ingo Wald                                            #
 3 | #                                                                          #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");          #
 5 | # you may not use this file except in compliance with the License.         #
 6 | # You may obtain a copy of the License at                                  #
 7 | #                                                                          #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0                           #
 9 | #                                                                          #
10 | # Unless required by applicable law or agreed to in writing, software      #
11 | # distributed under the License is distributed on an "AS IS" BASIS,        #
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
13 | # See the License for the specific language governing permissions and      #
14 | # limitations under the License.                                           #
15 | # ======================================================================== #
16 | 
17 | find_package(TBB REQUIRED)
18 | if (TBB_FOUND)
19 |     include_directories(${TBB_INCLUDE_DIR})
20 | endif()
21 | 
22 | 


--------------------------------------------------------------------------------
/crates/optix/examples/common/gdt/gdt/gdt.cpp:
--------------------------------------------------------------------------------
 1 | // ======================================================================== //
 2 | // Copyright 2018-2019 Ingo Wald                                            //
 3 | //                                                                          //
 4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
 5 | // you may not use this file except in compliance with the License.         //
 6 | // You may obtain a copy of the License at                                  //
 7 | //                                                                          //
 8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
 9 | //                                                                          //
10 | // Unless required by applicable law or agreed to in writing, software      //
11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
13 | // See the License for the specific language governing permissions and      //
14 | // limitations under the License.                                           //
15 | // ======================================================================== //
16 | 
17 | #include "gdt.h"
18 | #include "math/LinearSpace.h"
19 | #include "math/AffineSpace.h"
20 | 
21 | 


--------------------------------------------------------------------------------
/crates/optix/examples/common/gdt/gdt/math/fixedpoint.h:
--------------------------------------------------------------------------------
 1 | // ======================================================================== //
 2 | // Copyright 2018 Ingo Wald                                                 //
 3 | //                                                                          //
 4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
 5 | // you may not use this file except in compliance with the License.         //
 6 | // You may obtain a copy of the License at                                  //
 7 | //                                                                          //
 8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
 9 | //                                                                          //
10 | // Unless required by applicable law or agreed to in writing, software      //
11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
13 | // See the License for the specific language governing permissions and      //
14 | // limitations under the License.                                           //
15 | // ======================================================================== //
16 | 
17 | #pragma once
18 | 
19 | #include "gdt/gdt.h"
20 | #include "gdt/math/constants.h"
21 | #include <iostream>
22 | 
23 | namespace gdt {
24 | 
25 |   /*! a n-bit fixed-point float in the [0..1] region */
26 |   template<typename storageT, int Nbits, int is_signed>
27 |   struct FixedPoint {
28 |     FixedPoint();
29 | 
30 |     float operator float() const {
31 |       return bits / float((1ULL << Nbits)-1);
32 |     }
33 |     storageT bits;
34 |   };
35 | }
36 |  
37 | 


--------------------------------------------------------------------------------
/crates/optix/examples/common/gdt/gdt/math/vec/rotate.h:
--------------------------------------------------------------------------------
 1 | // ======================================================================== //
 2 | // Copyright 2018 Ingo Wald                                                 //
 3 | //                                                                          //
 4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
 5 | // you may not use this file except in compliance with the License.         //
 6 | // You may obtain a copy of the License at                                  //
 7 | //                                                                          //
 8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
 9 | //                                                                          //
10 | // Unless required by applicable law or agreed to in writing, software      //
11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
13 | // See the License for the specific language governing permissions and      //
14 | // limitations under the License.                                           //
15 | // ======================================================================== //
16 | 
17 | #pragma once
18 | 
19 | namespace gdt {
20 | 
21 |   /*! perform 'rotation' of float a by amount b. Both a and b must be
22 |       in 0,1 range, the result will be (a+1) clamped to that same
23 |       range (ie, it is the value a shifted by the amount b to the
24 |       right, and re-entering the [0,1) range on the left if it
25 |       "rotates" out on the right */
26 |   inline __both__ float rotate(const float a, const float b)
27 |   {
28 |     float sum = a+b;
29 |     return (sum-1.f)<0.f?sum:(sum-1.f);
30 |   }
31 | 
32 |   /*! perform 'rotation' of float a by amount b. Both a and b must be
33 |       in 0,1 range, the result will be (a+1) clamped to that same
34 |       range (ie, it is the value a shifted by the amount b to the
35 |       right, and re-entering the [0,1) range on the left if it
36 |       "rotates" out on the right */
37 |   inline __both__ vec2f rotate(const vec2f a, const vec2f b) 
38 |   { return vec2f(rotate(a.x,b.x),rotate(a.y,b.y)); }
39 |   
40 | } // ::gdt
41 | 


--------------------------------------------------------------------------------
/crates/optix/examples/ex02_pipeline/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ex02_pipeline"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | optix = { path = "../../" }
10 | optix-sys = { path = "../../../optix-sys" }
11 | cust = { path = "../../../cust" }
12 | anyhow = "1.0.44"
13 | device = { path = "./device" }
14 | 
15 | [build-dependencies]
16 | cuda_builder = { version = "0.3", path = "../../../cuda_builder" }
17 | 


--------------------------------------------------------------------------------
/crates/optix/examples/ex02_pipeline/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::iter;
 3 | 
 4 | use cuda_builder::CudaBuilder;
 5 | 
 6 | fn main() {
 7 |     println!("cargo::rerun-if-changed=build.rs");
 8 | 
 9 |     let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
10 |     let optix_include_paths = env::var_os("DEP_OPTIX_INCLUDE_DIR")
11 |         .map(|s| env::split_paths(s.as_os_str()).collect::<Vec<_>>())
12 |         .expect("Cannot find transitive metadata 'optix_include' from optix-sys package.");
13 | 
14 |     let args = optix_include_paths
15 |         .iter()
16 |         .map(|p| format!("-I{}", p.display()))
17 |         .chain(iter::once(format!("-I{}/../common/gdt", manifest_dir)))
18 |         .collect::<Vec<_>>();
19 |     compile_to_ptx("src/ex02_pipeline.cu", &args);
20 | 
21 |     let ptx_path = std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap()).join("device.ptx");
22 | 
23 |     CudaBuilder::new("device")
24 |         .copy_to(ptx_path)
25 |         .arch(cuda_builder::NvvmArch::Compute75)
26 |         .optix(true)
27 |         .build()
28 |         .unwrap();
29 | }
30 | 
31 | fn compile_to_ptx(cu_path: &str, args: &[String]) {
32 |     println!("cargo::rerun-if-changed={}", cu_path);
33 | 
34 |     let full_path =
35 |         std::path::PathBuf::from(std::env::var("CARGO_MANIFEST_DIR").unwrap()).join(cu_path);
36 | 
37 |     let mut ptx_path = std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap()).join(cu_path);
38 |     ptx_path.set_extension("ptx");
39 |     std::fs::create_dir_all(ptx_path.parent().unwrap()).unwrap();
40 | 
41 |     let output = std::process::Command::new("nvcc")
42 |         .arg("-ptx")
43 |         .arg(&full_path)
44 |         .arg("-o")
45 |         .arg(&ptx_path)
46 |         .args(args)
47 |         .output()
48 |         .expect("failed to fun nvcc");
49 | 
50 |     if !output.status.success() {
51 |         panic!("{}", unsafe { String::from_utf8_unchecked(output.stderr) });
52 |     }
53 | }
54 | 


--------------------------------------------------------------------------------
/crates/optix/examples/ex02_pipeline/device/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "device"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | cuda_std = { version = "0.2", path = "../../../../cuda_std" }
10 | cust_core = { version = "0.1", path = "../../../../cust_core" }
11 | optix_device = { path = "../../../../optix_device" }
12 | 
13 | [lib]
14 | crate-type = ["cdylib", "rlib"]
15 | 
16 | 


--------------------------------------------------------------------------------
/crates/optix/examples/ex02_pipeline/device/src/lib.rs:
--------------------------------------------------------------------------------
 1 | // #![deny(warnings)]
 2 | #![allow(clippy::missing_safety_doc)]
 3 | 
 4 | use cuda_std::*;
 5 | use cust_core::DeviceCopy;
 6 | use optix_device as optix;
 7 | 
 8 | extern crate alloc;
 9 | 
10 | #[repr(C)]
11 | #[derive(Copy, Clone)]
12 | pub struct LaunchParams {
13 |     pub frame_id: i32,
14 |     pub fb_size: [u32; 2],
15 |     pub color_buffer: u64,
16 | }
17 | 
18 | unsafe impl DeviceCopy for LaunchParams {}
19 | 
20 | #[no_mangle]
21 | static PARAMS: LaunchParams = LaunchParams {
22 |     frame_id: 88,
23 |     fb_size: [1, 1],
24 |     color_buffer: 0,
25 | };
26 | 
27 | extern "C" {
28 |     pub fn vprintf(format: *const u8, valist: *const core::ffi::c_void) -> i32;
29 | }
30 | 
31 | #[kernel]
32 | pub unsafe fn __closesthit__radiance() {}
33 | 
34 | #[kernel]
35 | pub unsafe fn __anyhit__radiance() {}
36 | 
37 | #[kernel]
38 | pub unsafe fn __miss__radiance() {}
39 | 
40 | #[kernel]
41 | pub unsafe fn __raygen__renderFrame() {
42 |     // let ix = _optix_get_launch_index_x();
43 |     // let iy = _optix_get_launch_index_y();
44 | 
45 |     let idx = optix::get_launch_index();
46 | 
47 |     if idx[0] == 3 && idx[1] == 4 {
48 |         vprintf(
49 |             c"Hello from Rust kernel!\n".as_ptr().cast(),
50 |             core::ptr::null::<core::ffi::c_void>(),
51 |         );
52 | 
53 |         #[repr(C)]
54 |         struct PrintArgs(i32);
55 | 
56 |         vprintf(
57 |             c"frame id is %d\n".as_ptr().cast(),
58 |             &PrintArgs(core::ptr::read_volatile(&PARAMS.frame_id)) as *const PrintArgs
59 |                 as *const core::ffi::c_void,
60 |         );
61 |     }
62 | }
63 | 
64 | // #[kernel]
65 | // pub unsafe fn render(fb: *mut Vec3<f32>, view: &Viewport) {}
66 | 


--------------------------------------------------------------------------------
/crates/optix/examples/ex02_pipeline/src/launch_params.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <gdt/math/vec.h>
4 | 
5 | namespace osc {
6 | } // namespace osc
7 | 


--------------------------------------------------------------------------------
/crates/optix/examples/ex02_pipeline/src/main.rs:
--------------------------------------------------------------------------------
1 | mod renderer;
2 | use renderer::Renderer;
3 | 
4 | fn main() -> Result<(), Box<dyn std::error::Error>> {
5 |     let mut renderer = Renderer::new(256, 128)?;
6 |     renderer.render()?;
7 |     Ok(())
8 | }
9 | 


--------------------------------------------------------------------------------
/crates/optix/examples/ex03_window/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ex03_window"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | optix = { path = "../../" }
10 | optix-sys = { path = "../../../optix-sys" }
11 | cust = { path = "../../../cust" }
12 | anyhow = "1.0.44"
13 | glfw = "0.42.0"
14 | gl = "0.14.0"
15 | num-traits = "0.2.14"
16 | 


--------------------------------------------------------------------------------
/crates/optix/examples/ex03_window/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::iter;
 3 | 
 4 | fn main() {
 5 |     println!("cargo::rerun-if-changed=build.rs");
 6 | 
 7 |     let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
 8 |     let optix_include_paths = env::var_os("DEP_OPTIX_INCLUDE_DIR")
 9 |         .map(|s| env::split_paths(s.as_os_str()).collect::<Vec<_>>())
10 |         .expect("Cannot find transitive metadata 'optix_include' from optix-sys package.");
11 | 
12 |     let args = optix_include_paths
13 |         .iter()
14 |         .map(|p| format!("-I{}", p.display()))
15 |         .chain(iter::once(format!("-I{}/../common/gdt", manifest_dir)))
16 |         .collect::<Vec<_>>();
17 |     compile_to_ptx("src/ex03_window.cu", &args);
18 | }
19 | 
20 | fn compile_to_ptx(cu_path: &str, args: &[String]) {
21 |     println!("cargo:rerun-if-changed={}", cu_path);
22 | 
23 |     let full_path =
24 |         std::path::PathBuf::from(std::env::var("CARGO_MANIFEST_DIR").unwrap()).join(cu_path);
25 | 
26 |     let mut ptx_path = std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap()).join(cu_path);
27 |     ptx_path.set_extension("ptx");
28 |     std::fs::create_dir_all(ptx_path.parent().unwrap()).unwrap();
29 | 
30 |     let output = std::process::Command::new("nvcc")
31 |         .arg("-ptx")
32 |         .arg(&full_path)
33 |         .arg("-o")
34 |         .arg(&ptx_path)
35 |         .args(args)
36 |         .output()
37 |         .expect("failed to fun nvcc");
38 | 
39 |     if !output.status.success() {
40 |         panic!("{}", unsafe { String::from_utf8_unchecked(output.stderr) });
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/crates/optix/examples/ex04_mesh/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ex04_mesh"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | optix = {path = "../../"}
10 | cust = {path = "../../../cust", features=["impl_glam"]}
11 | anyhow = "1.0.44"
12 | glfw = "0.42.0"
13 | gl = "0.14.0"
14 | num-traits = "0.2.14"
15 | glam = { version = "0.29.2", features=["cuda"] }
16 | 
17 | [build-dependencies]
18 | cuda_builder = { version = "0.3", path = "../../../cuda_builder" }
19 | 


--------------------------------------------------------------------------------
/crates/optix/examples/ex04_mesh/build.rs:
--------------------------------------------------------------------------------
1 | use cuda_builder::CudaBuilder;
2 | 
3 | fn main() {
4 |     CudaBuilder::new("../rust/ex04_mesh_gpu")
5 |         .copy_to("../resources/ex04_mesh.ptx")
6 |         .build()
7 |         .unwrap();
8 | }
9 | 


--------------------------------------------------------------------------------
/crates/optix/examples/resources/.gitignore:
--------------------------------------------------------------------------------
1 | *.ptx


--------------------------------------------------------------------------------
/crates/optix/examples/rust/ex04_mesh_gpu/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ex04_mesh_gpu"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | cuda_std = { version = "0.2.0", path = "../../../../cuda_std" }
 8 | optix_device = { version = "0.1.0", path = "../../../../optix_device" }
 9 | 
10 | [lib]
11 | crate-type = ["cdylib", "rlib"]
12 | 


--------------------------------------------------------------------------------
/crates/optix/images/example_sbt.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/crates/optix/images/example_sbt.jpg


--------------------------------------------------------------------------------
/crates/optix/images/example_sbt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/crates/optix/images/example_sbt.png


--------------------------------------------------------------------------------
/crates/optix/images/optix_programs.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/crates/optix/images/optix_programs.jpg


--------------------------------------------------------------------------------
/crates/optix/images/scene_graph.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/crates/optix/images/scene_graph.jpg


--------------------------------------------------------------------------------
/crates/optix/images/scene_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/crates/optix/images/scene_graph.png


--------------------------------------------------------------------------------
/crates/optix/images/traversables_graph.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/crates/optix/images/traversables_graph.jpg


--------------------------------------------------------------------------------
/crates/optix/src/denoiser.md:
--------------------------------------------------------------------------------
 1 | # NVIDIA AI Denoiser
 2 | 
 3 | Image areas that have not yet fully converged during rendering will often exhibit pixel-scale noise due to the insufficient amount of information gathered by the renderer. This grainy appearance in an image may be caused by low iteration counts, especially in scenes with complex lighting environments and material calculations.
 4 | 
 5 | The NVIDIA AI Denoiser can estimate the converged image from a partially converged image. Instead of improving image quality through a larger number of path tracing iterations, the denoiser can produce images of acceptable quality with far fewer iterations by post-processing the image.
 6 | 
 7 | The denoiser is based on statistical data sets that guide the denoising process. These data, represented by a binary blob called a training model, are produced from a large number of rendered images in different stages of convergence. The images are used as input to an underlying deep learning system. (See the NVIDIA Developer article “Deep Learning” for more information about deep-learning systems.)
 8 | 
 9 | Because deep-learning training needs significant computational resources—even obtaining a sufficient number of partially converged images can be difficult—a general-purpose model is included with the OptiX software. This model is suitable for many renderers. However, the model may not yield optimal results when applied to images produced by renderers with very different noise characteristics compared to those used in the original training data.
10 | 
11 | Post-processing rendered images includes image filters, such as blurring or sharpening, or reconstruction filters, such as box, triangle, or Gaussian filters. Custom post-processing performed on a noisy image can lead to unsatisfactory denoising results. During post-processing, the original high-frequency, per-pixel noise may become smeared across multiple pixels, making it more difficult to detect and be handled by the model. Therefore, post-processing operations should be done after the denoising process, while reconstruction filters should be implemented by using filter importance-sampling.
12 | 
13 | In general, the pixel color space of an image that is used as input for the denoiser should match the color space of the images on which the denoiser was trained. However, slight variations, such as substituting sRGB with a simple gamma curve, should not have a noticeable impact. Images used for the training model included with the NVIDIA AI Denoiser distribution were output directly as HDR data.
14 | 
15 | 


--------------------------------------------------------------------------------
/crates/optix/src/impl_glam.rs:
--------------------------------------------------------------------------------
 1 | use crate::acceleration::{IndexTriple, IndicesFormat, Vertex, VertexFormat};
 2 | 
 3 | impl Vertex for glam::Vec3 {
 4 |     const FORMAT: VertexFormat = VertexFormat::Float3;
 5 | }
 6 | 
 7 | impl IndexTriple for glam::IVec3 {
 8 |     const FORMAT: IndicesFormat = IndicesFormat::Int3;
 9 | }
10 | 


--------------------------------------------------------------------------------
/crates/optix/src/prelude.rs:
--------------------------------------------------------------------------------
 1 | pub use crate::{
 2 |     acceleration::{
 3 |         Aabb, Accel, AccelBufferSizes, AccelBuildOptions, AccelEmitDesc, AccelRelocationInfo,
 4 |         BuildFlags, BuildOperation, CurveArray, CurveType, CustomPrimitiveArray, DynamicAccel,
 5 |         GeometryFlags, IndexTriple, IndexedTriangleArray, Instance, InstanceArray, InstanceFlags,
 6 |         InstancePointerArray, TraversableHandle, TriangleArray, Vertex,
 7 |     },
 8 |     context::{DeviceContext, DeviceProperty},
 9 |     init, launch,
10 |     pipeline::{
11 |         CompileDebugLevel, CompileOptimizationLevel, ExceptionFlags, Module, ModuleCompileOptions,
12 |         Pipeline, PipelineCompileOptions, PipelineLinkOptions, PrimitiveType, PrimitiveTypeFlags,
13 |         ProgramGroup, ProgramGroupDesc, ProgramGroupModule, StackSizes, TraversableGraphFlags,
14 |     },
15 |     shader_binding_table::{SbtRecord, ShaderBindingTable},
16 | };
17 | 


--------------------------------------------------------------------------------
/crates/optix_device/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "optix_device"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | authors = ["Anders Langlands <anderslanglands@gmail.com>", "Riccardo D'Ambrosio <rdambrosio016@gmail.com>"]
 6 | 
 7 | [dependencies]
 8 | bitflags = "2.8"
 9 | cuda_std = { version = "0.2", path = "../cuda_std" }
10 | glam = { version = "0.29", features=["cuda", "libm"], default-features=false }
11 | paste = "1.0.15"
12 | seq-macro = "0.3.5"
13 | cust_core = { version = "0.1", path = "../cust_core" }
14 | 


--------------------------------------------------------------------------------
/crates/optix_device/src/misc.rs:
--------------------------------------------------------------------------------
 1 | #[cfg(target_os = "cuda")]
 2 | use core::arch::asm;
 3 | use cuda_std::gpu_only;
 4 | 
 5 | /// Retrieves the data past the SBT header for this particular program
 6 | ///
 7 | /// # Safety
 8 | ///
 9 | /// The type requested must match with what is stored in the SBT.
10 | #[gpu_only]
11 | pub unsafe fn sbt_data<T>() -> &'static T {
12 |     let ptr: *const T;
13 |     asm!("call ({}), _optix_get_sbt_data_ptr_64, ();", out(reg64) ptr);
14 |     &*ptr
15 | }
16 | 


--------------------------------------------------------------------------------
/crates/optix_device/src/payload.rs:
--------------------------------------------------------------------------------
 1 | use crate::sys;
 2 | #[cfg(target_os = "cuda")]
 3 | use core::arch::asm;
 4 | use cuda_std::gpu_only;
 5 | 
 6 | /// Overrides the payload for the given register to a value.
 7 | ///
 8 | /// # Panics
 9 | ///
10 | /// Panics if the index is higher than 7.
11 | pub fn set_payload(idx: u8, p: u32) {
12 |     // SAFETY: setting the payload cannot cause UB, only getting the value of an unset
13 |     // register can.
14 |     unsafe { sys::set_payload(idx, p) }
15 | }
16 | 
17 | /// Returns the payload in the given register.
18 | ///
19 | /// # Safety
20 | ///
21 | /// The payload must have been set by the ray dispatch or by a program which came before this.
22 | /// Moreover, the payload must have not been cleared with [`clear_register`].
23 | ///
24 | /// # Panics
25 | ///
26 | /// Panics if the index is higher than `7`.
27 | pub unsafe fn get_payload(idx: u8) -> u32 {
28 |     sys::get_payload(idx)
29 | }
30 | 
31 | /// Clears the payload in the given register with an undefined value to ease register usage for programs
32 | /// down the line.
33 | ///
34 | /// The register is overriden with an undefined value and it must not be read until it is set again.
35 | ///
36 | /// # Panics
37 | ///
38 | /// Panics if the index is higher than `7`.
39 | #[gpu_only]
40 | pub fn clear_register(idx: u8) {
41 |     assert!(idx <= 7, "Register index must be in range 0..=7");
42 |     let idx = idx as u32;
43 |     // its unclear whether optix_undef_value is just a compiler
44 |     // hint for the optix jit compiler or if it returns a random
45 |     // unspecified value. Uninit values are not allowed to exist
46 |     // without MaybeUninit in rust, so just to be safe we do not
47 |     // expose the value, and instead directly use inline asm to
48 |     // create the value and then set it as a payload to avoid this.
49 |     unsafe {
50 |         asm!(
51 |            "call ({tmp}), _optix_undef_value, ();",
52 |            "call _optix_set_payload, ({}, {tmp});",
53 |            in(reg32) idx,
54 |            tmp = out(reg32) _,
55 |         );
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/crates/optix_device/src/util.rs:
--------------------------------------------------------------------------------
 1 | use crate::{intersect::get_attribute, payload::*};
 2 | use glam::Vec3;
 3 | 
 4 | pub fn pack_pointer<T>(ptr: *mut T) -> (u32, u32) {
 5 |     let x = ptr as u32;
 6 |     let y = (ptr as u64 >> 32) as u32;
 7 |     (x, y)
 8 | }
 9 | 
10 | pub fn unpack_pointer<T>(x: u32, y: u32) -> *mut T {
11 |     (((y as u64) << 32) | x as u64) as *mut T
12 | }
13 | 
14 | pub fn store_vec3_payload(vec: Vec3, start_reg: u8) {
15 |     set_payload(start_reg, vec.x.to_bits());
16 |     set_payload(start_reg + 1, vec.y.to_bits());
17 |     set_payload(start_reg + 2, vec.z.to_bits());
18 | }
19 | 
20 | /// Retrieves a vector from the passed payload. This uses 3 payload
21 | /// registers in total, starting from `start_reg`.
22 | ///
23 | /// # Safety
24 | ///
25 | /// `start_reg..(start_reg + 3)` payload slots must have all been set.
26 | pub unsafe fn get_vec3_payload(start_reg: u8) -> Vec3 {
27 |     let x = f32::from_bits(get_payload(start_reg));
28 |     let y = f32::from_bits(get_payload(start_reg + 1));
29 |     let z = f32::from_bits(get_payload(start_reg + 2));
30 |     Vec3::new(x, y, z)
31 | }
32 | 
33 | /// Retrieves a vector from the passed attributes. This uses 3 attribute
34 | /// registers in total, starting from `start_reg`.
35 | ///
36 | /// # Safety
37 | ///
38 | /// `start_reg..(start_reg + 3)` attribute slots must have all been set.
39 | pub unsafe fn get_vec3_attributes(start_reg: u8) -> Vec3 {
40 |     let x = f32::from_bits(get_attribute(start_reg));
41 |     let y = f32::from_bits(get_attribute(start_reg + 1));
42 |     let z = f32::from_bits(get_attribute(start_reg + 2));
43 |     Vec3::new(x, y, z)
44 | }
45 | 


--------------------------------------------------------------------------------
/crates/optix_device_macros/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "optix_device_macros"
 3 | version = "0.1.0"
 4 | edition = "2018"
 5 | license = "MIT OR Apache-2.0"
 6 | description = "Macros for optix_device"
 7 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 8 | readme = "../../README.md"
 9 | 
10 | [lib]
11 | proc-macro = true
12 | 
13 | [dependencies]
14 | quote = "1.0.38"
15 | syn = { version = "2.0.96", features = ["full"] }
16 | proc-macro2 = "1.0.93"
17 | 


--------------------------------------------------------------------------------
/crates/optix_device_macros/src/lib.rs:
--------------------------------------------------------------------------------
 1 | // use quote::quote;
 2 | // use syn::*;
 3 | 
 4 | // #[proc_macro_attribute]
 5 | // pub fn launch_params(_input: proc_macro::TokenStream, item: proc_macro::TokenStream) -> proc_macro::TokenStream {
 6 | //     let statik = parse_macro_input!(item as ItemStatic);
 7 | //     let ty = statik.ty;
 8 | //     let name = statik.ident;
 9 | //     let vis = statik.vis;
10 | //     let tokens = quote! {
11 | //         extern "C" {
12 | 
13 | //         }
14 | //     }
15 | // }
16 | 


--------------------------------------------------------------------------------
/crates/ptx/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ptx"
 3 | version = "0.1.0"
 4 | edition = "2018"
 5 | license = "MIT OR Apache-2.0"
 6 | description = "PTX parser and analyzer"
 7 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 8 | readme = "../../README.md"
 9 | 
10 | [dependencies]
11 | ascii = "1"
12 | # used for lexing/serializing very large enums of instructions/opcodes
13 | strum = { version = "0.26.3", features = ["derive"] }
14 | smallvec = "1.13.2"
15 | 


--------------------------------------------------------------------------------
/crates/ptx/src/lib.rs:
--------------------------------------------------------------------------------
 1 | pub mod lexer;
 2 | // #[cfg(test)]
 3 | // mod lexer_tests;
 4 | mod types;
 5 | // pub mod parser;
 6 | 
 7 | pub use types::*;
 8 | 
 9 | pub use ascii;
10 | 


--------------------------------------------------------------------------------
/crates/ptx/src/parser.rs:
--------------------------------------------------------------------------------
 1 | use ascii::AsciiStr;
 2 | 
 3 | use crate::{
 4 |     lexer::{Lexer, Token},
 5 |     types::*,
 6 | };
 7 | use std::iter::Peekable;
 8 | 
 9 | #[derive(Debug)]
10 | pub(crate) struct TokenManager<'src> {
11 |     pub(crate) token_pos: usize,
12 |     pub(crate) lexer: Peekable<Lexer<'src>>,
13 | }
14 | 
15 | pub type ParserResult<T> = Result<T, String>;
16 | 
17 | impl<'src> TokenManager<'src> {
18 |     pub(crate) fn next(&mut self) -> Option<ParserResult<Token>> {
19 |         let next = self.lexer.next()??;
20 |         self.token_pos += 1;
21 |         Some(Ok(next))
22 |     }
23 | 
24 |     pub(crate) fn expect(&mut self, kind: TokenKind) -> ParserResult<Token> {
25 |         match self.next() {
26 |             None => Err(format!("Expected `{:?}`, but instead the file ended", kind)),
27 |             Some(Ok(token)) if token.kind == kind => Ok(token),
28 |             Some(Ok(token)) => Err(format!(
29 |                 "Expected `{:?}`, but instead found `{:?}`",
30 |                 kind, token.kind
31 |             )),
32 |             Some(Err(err)) => Err(err),
33 |         }
34 |     }
35 | 
36 |     pub(crate) fn cur_value(&mut self) -> TokenValue {
37 |         self.lexer.values[self.token_pos]
38 |     }
39 | }
40 | 
41 | pub struct Parser<'src> {
42 |     tokens: TokenManager<'src>,
43 | }
44 | 
45 | impl<'src> Parser<'src> {
46 |     pub fn new(src: &'src AsciiStr) -> Self {
47 |         Self {
48 |             tokens: TokenManager {
49 |                 lexer: Lexer::new(src),
50 |                 token_pos: 0,
51 |             },
52 |         }
53 |     }
54 | 
55 |     fn directive(&mut self) -> ParserResult<>
56 | }
57 | 


--------------------------------------------------------------------------------
/crates/ptx/src/parser/directive.rs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/crates/ptx/src/parser/directive.rs


--------------------------------------------------------------------------------
/crates/ptx/src/parser/mod.rs:
--------------------------------------------------------------------------------
1 | mod directive;
2 | 


--------------------------------------------------------------------------------
/crates/ptx_compiler/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ptx_compiler"
 3 | version = "0.1.1"
 4 | edition = "2021"
 5 | license = "MIT OR Apache-2.0"
 6 | description = "High level bindings to CUDA's ptx compilation APIs"
 7 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 8 | readme = "../../README.md"
 9 | 
10 | [dependencies]
11 | cust_raw = { path = "../cust_raw", default-features = false, features = ["nvptx-compiler"] }
12 | 


--------------------------------------------------------------------------------
/crates/rustc_codegen_nvvm/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "rustc_codegen_nvvm"
 3 | version = "0.3.0"
 4 | authors = [
 5 |     "Riccardo D'Ambrosio <rdambrosio016@gmail.com>",
 6 |     "The Rust Project Developers",
 7 | ]
 8 | edition = "2024"
 9 | license = "MIT OR Apache-2.0"
10 | description = "A codegen backend for Rustc which targets the libnvvm CUDA library"
11 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
12 | readme = "../../README.md"
13 | 
14 | [lib]
15 | crate-type = ["dylib"]
16 | 
17 | [dependencies]
18 | nvvm = { version = "0.1", path = "../nvvm" }
19 | rustc-demangle = "0.1.24"
20 | libc = "0.2.169"
21 | libloading = "0.8.0"
22 | tar = "0.4.43"
23 | object = "0.36.7"
24 | bitflags = "2.8.0"
25 | # To avoid duplicate dependencies, this should match the version of gimli used
26 | # by `rustc_codegen_ssa` via its `thorin-dwp` dependency.
27 | gimli = "0.30"
28 | tracing = { version = "0.1.41", features = ["release_max_level_debug"] }
29 | tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }
30 | rustc_codegen_nvvm_macros = { version = "0.1", path = "../rustc_codegen_nvvm_macros" }
31 | smallvec = { version = "1.14.0", features = ["union", "may_dangle"] }
32 | itertools = "0.14.0"
33 | 
34 | [build-dependencies]
35 | build-helper = "0.1.1"
36 | cc = { version = "1.0", features = ["parallel"] }
37 | xz = "0.1.0"
38 | tar = "0.4.37"
39 | curl = "0.4.40"
40 | 
41 | [package.metadata.rust-analyzer]
42 | rustc_private = true
43 | 


--------------------------------------------------------------------------------
/crates/rustc_codegen_nvvm/libintrinsics.bc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/crates/rustc_codegen_nvvm/libintrinsics.bc


--------------------------------------------------------------------------------
/crates/rustc_codegen_nvvm/rustc_llvm_wrapper/.editorconfig:
--------------------------------------------------------------------------------
1 | [*.{h,cpp}]
2 | end_of_line = lf
3 | insert_final_newline = true
4 | charset = utf-8
5 | indent_style = space
6 | indent_size = 2
7 | 


--------------------------------------------------------------------------------
/crates/rustc_codegen_nvvm/src/common.rs:
--------------------------------------------------------------------------------
 1 | use libc::c_char;
 2 | 
 3 | /// Extension trait for explicit casts to `*const c_char`.
 4 | pub(crate) trait AsCCharPtr {
 5 |     /// Equivalent to `self.as_ptr().cast()`, but only casts to `*const c_char`.
 6 |     fn as_c_char_ptr(&self) -> *const c_char;
 7 | }
 8 | 
 9 | impl AsCCharPtr for str {
10 |     fn as_c_char_ptr(&self) -> *const c_char {
11 |         self.as_ptr().cast()
12 |     }
13 | }
14 | 
15 | impl AsCCharPtr for [u8] {
16 |     fn as_c_char_ptr(&self) -> *const c_char {
17 |         self.as_ptr().cast()
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/crates/rustc_codegen_nvvm/src/debug_info/dwarf_const.rs:
--------------------------------------------------------------------------------
 1 | //! Definitions of various DWARF-related constants.
 2 | 
 3 | use libc::c_uint;
 4 | 
 5 | /// Helper macro to let us redeclare gimli's constants as our own constants
 6 | /// with a different type, with less risk of copy-paste errors.
 7 | macro_rules! declare_constant {
 8 |     (
 9 |         $name:ident : $type:ty
10 |     ) => {
11 |         #[allow(non_upper_case_globals)]
12 |         pub(crate) const $name: $type = ::gimli::constants::$name.0 as $type;
13 | 
14 |         // Assert that as-cast probably hasn't changed the value.
15 |         const _: () = assert!($name as i128 == ::gimli::constants::$name.0 as i128);
16 |     };
17 | }
18 | 
19 | // DWARF languages.
20 | declare_constant!(DW_LANG_Rust: c_uint);
21 | 
22 | // DWARF attribute type encodings.
23 | declare_constant!(DW_ATE_boolean: c_uint);
24 | declare_constant!(DW_ATE_float: c_uint);
25 | declare_constant!(DW_ATE_signed: c_uint);
26 | declare_constant!(DW_ATE_unsigned: c_uint);
27 | declare_constant!(DW_ATE_UTF: c_uint);
28 | 
29 | // DWARF expression operators.
30 | declare_constant!(DW_OP_deref: i64);
31 | declare_constant!(DW_OP_plus_uconst: i64);
32 | /// Defined by LLVM in `llvm/include/llvm/BinaryFormat/Dwarf.h`.
33 | /// Double-checked by a static assertion in `RustWrapper.cpp`.
34 | #[allow(non_upper_case_globals)]
35 | pub(crate) const DW_OP_LLVM_fragment: i64 = 0x1000;
36 | 


--------------------------------------------------------------------------------
/crates/rustc_codegen_nvvm/src/debug_info/namespace.rs:
--------------------------------------------------------------------------------
 1 | // Namespace Handling.
 2 | 
 3 | use rustc_codegen_ssa::debuginfo::type_names;
 4 | use rustc_middle::ty::{self, Instance};
 5 | 
 6 | use crate::common::AsCCharPtr;
 7 | use crate::context::CodegenCx;
 8 | use crate::llvm;
 9 | use crate::llvm::debuginfo::DIScope;
10 | use rustc_hir::def_id::DefId;
11 | 
12 | use super::util::{DIB, debug_context};
13 | 
14 | pub(crate) fn mangled_name_of_instance<'tcx>(
15 |     cx: &CodegenCx<'_, 'tcx>,
16 |     instance: Instance<'tcx>,
17 | ) -> ty::SymbolName<'tcx> {
18 |     let tcx = cx.tcx;
19 |     tcx.symbol_name(instance)
20 | }
21 | 
22 | pub(crate) fn item_namespace<'ll>(cx: &CodegenCx<'ll, '_>, def_id: DefId) -> &'ll DIScope {
23 |     if let Some(&scope) = debug_context(cx).namespace_map.borrow().get(&def_id) {
24 |         return scope;
25 |     }
26 | 
27 |     let def_key = cx.tcx.def_key(def_id);
28 |     let parent_scope = def_key.parent.map(|parent| {
29 |         item_namespace(
30 |             cx,
31 |             DefId {
32 |                 krate: def_id.krate,
33 |                 index: parent,
34 |             },
35 |         )
36 |     });
37 | 
38 |     let namespace_name_string = {
39 |         let mut output = String::new();
40 |         type_names::push_item_name(cx.tcx, def_id, false, &mut output);
41 |         output
42 |     };
43 | 
44 |     let scope = unsafe {
45 |         llvm::LLVMRustDIBuilderCreateNameSpace(
46 |             DIB(cx),
47 |             parent_scope,
48 |             namespace_name_string.as_c_char_ptr(),
49 |             namespace_name_string.len(),
50 |         )
51 |     };
52 | 
53 |     debug_context(cx)
54 |         .namespace_map
55 |         .borrow_mut()
56 |         .insert(def_id, scope);
57 |     scope
58 | }
59 | 


--------------------------------------------------------------------------------
/crates/rustc_codegen_nvvm/src/ptxgen.rs:
--------------------------------------------------------------------------------
1 | //! The final stages of compilation, gathering the llvm bitcode from rlibs and generating a ptx file.
2 | 
3 | //
4 | 


--------------------------------------------------------------------------------
/crates/rustc_codegen_nvvm/src/target.rs:
--------------------------------------------------------------------------------
 1 | use crate::llvm::{self, Type};
 2 | use rustc_target::spec::{
 3 |     LinkerFlavor, MergeFunctions, PanicStrategy, Target, TargetMetadata, TargetOptions,
 4 | };
 5 | 
 6 | pub const DATA_LAYOUT: &str = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
 7 | pub const TARGET_TRIPLE: &str = "nvptx64-nvidia-cuda";
 8 | pub const POINTER_WIDTH: u32 = 64;
 9 | 
10 | /// The pointer width of the current target
11 | pub(crate) unsafe fn usize_ty(llcx: &'_ llvm::Context) -> &'_ Type {
12 |     unsafe { llvm::LLVMInt64TypeInContext(llcx) }
13 | }
14 | 
15 | pub fn target() -> Target {
16 |     let mut options = TargetOptions::default();
17 | 
18 |     options.os = "cuda".into();
19 |     options.vendor = "nvidia".into();
20 |     options.linker_flavor = LinkerFlavor::Ptx;
21 |     // nvvm does all the linking for us, but technically its not a linker
22 |     options.linker = None;
23 |     options.cpu = "sm_30".into();
24 |     options.max_atomic_width = Some(64);
25 |     // Unwinding on CUDA is neither feasible nor useful.
26 |     options.panic_strategy = PanicStrategy::Abort;
27 |     // Needed to use `dylib` and `bin` crate types and the linker.
28 |     options.dynamic_linking = true;
29 |     options.executables = true;
30 |     options.only_cdylib = true;
31 | 
32 |     // nvvm does all the work of turning the bitcode into ptx
33 |     options.obj_is_bitcode = true;
34 | 
35 |     options.dll_prefix = "".into();
36 |     options.dll_suffix = ".ptx".into();
37 |     options.exe_suffix = ".ptx".into();
38 | 
39 |     // Disable MergeFunctions LLVM optimisation pass because it can
40 |     // produce kernel functions that call other kernel functions.
41 |     // This behavior is not supported by PTX ISA.
42 |     options.merge_functions = MergeFunctions::Disabled;
43 | 
44 |     Target {
45 |         arch: "nvptx".into(),
46 |         data_layout: DATA_LAYOUT.into(),
47 |         llvm_target: "nvptx64-nvidia-cuda".into(),
48 |         pointer_width: POINTER_WIDTH,
49 |         options,
50 |         metadata: TargetMetadata {
51 |             description: Some("NVIDIA CUDA".into()),
52 |             ..Default::default()
53 |         },
54 |     }
55 | }
56 | 


--------------------------------------------------------------------------------
/crates/rustc_codegen_nvvm_macros/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "rustc_codegen_nvvm_macros"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | license = "MIT OR Apache-2.0"
 6 | description = "Macros for rustc_codegen_nvvm"
 7 | repository = "https://github.com/Rust-GPU/Rust-CUDA"
 8 | readme = "../../README.md"
 9 | 
10 | [lib]
11 | proc-macro = true
12 | 
13 | [dependencies]
14 | syn = { version = "2.0.96", features = ["full"] }
15 | quote = "1.0.38"
16 | proc-macro2 = "1.0.93"
17 | 


--------------------------------------------------------------------------------
/crates/rustc_codegen_nvvm_macros/src/lib.rs:
--------------------------------------------------------------------------------
 1 | use proc_macro::TokenStream;
 2 | use quote::quote;
 3 | use syn::parse_macro_input;
 4 | use syn::punctuated::Punctuated;
 5 | use syn::token::Comma;
 6 | use syn::*;
 7 | 
 8 | #[proc_macro_attribute]
 9 | pub fn trace_ffi_calls(_attr: TokenStream, item: TokenStream) -> TokenStream {
10 |     let item = parse_macro_input!(item as ItemForeignMod);
11 |     let clone = item.clone();
12 |     let priv_module: ItemMod = parse_quote! {
13 |         mod private {
14 |             use super::*;
15 |             #clone
16 |         }
17 |     };
18 |     let mut module: ItemMod = parse_quote!(
19 |         pub(crate) mod public {
20 |             use super::*;
21 |         }
22 |     );
23 | 
24 |     for foreign in item.items {
25 |         if let ForeignItem::Fn(func) = foreign {
26 |             let contents = &mut module.content.as_mut().unwrap().1;
27 |             let Signature {
28 |                 ident,
29 |                 generics,
30 |                 inputs,
31 |                 output,
32 |                 ..
33 |             } = &func.sig;
34 | 
35 |             let args = inputs
36 |                 .into_iter()
37 |                 .map(|arg| match arg {
38 |                     FnArg::Typed(ty) => (*ty.pat).clone(),
39 |                     _ => unreachable!(),
40 |                 })
41 |                 .collect::<Punctuated<Pat, Comma>>();
42 | 
43 |             let new_func = parse_quote! {
44 |                 pub(crate) unsafe fn #ident #generics(#inputs) #output {
45 |                     tracing::trace!(stringify!(#ident));
46 |                     super::private::#ident(#args)
47 |                 }
48 |             };
49 | 
50 |             contents.push(Item::Fn(new_func));
51 |         }
52 |     }
53 | 
54 |     let tokens = quote! {
55 |         #priv_module
56 |         #module
57 |         pub(crate) use public::*;
58 |     };
59 | 
60 |     tokens.into()
61 | }
62 | 


--------------------------------------------------------------------------------
/examples/cuda/README.md:
--------------------------------------------------------------------------------
 1 | # CUDA + Rust examples
 2 | 
 3 | The examples in here showcase both the GPU side and the CPU side of writing a tool which uses the GPU.
 4 | 
 5 | ## [Interactive Path Tracer](cpu/path_tracer)
 6 | 
 7 | This example showcases a very simple interactive Path Tracer inspired by [Ray Tracing In One Weekend](https://raytracing.github.io/books/RayTracingInOneWeekend.html)
 8 | which runs on CPU or GPU, with the additional option of running OptiX denoising.
 9 | 
10 | ![Path Tracer](assets/path_tracer.png)
11 | 
12 | The Path Tracer uses cuda_builder to compile the core path tracer for the GPU and GPU (hardware raytracing), and uses the core path tracer as a normal crate
13 | for CPU rendering and sharing structures.
14 | 


--------------------------------------------------------------------------------
/examples/cuda/assets/path_tracer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/examples/cuda/assets/path_tracer.png


--------------------------------------------------------------------------------
/examples/cuda/gemm/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "gemm"
 3 | version = "0.1.0"
 4 | edition = "2024"
 5 | 
 6 | [dependencies]
 7 | blastoff = { path = "../../../crates/blastoff" }
 8 | cuda_std = { path = "../../../crates/cuda_std" }
 9 | cust = { path = "../../../crates/cust" }
10 | cust_raw = { path = "../../../crates/cust_raw", features = ["driver"] }
11 | ndarray = { version = "0.16", features = ["approx"] }
12 | ndarray-rand = "0.15.0"
13 | rand = "0.9"
14 | 
15 | [build-dependencies]
16 | cuda_builder = { path = "../../../crates/cuda_builder" }
17 | 


--------------------------------------------------------------------------------
/examples/cuda/gemm/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::path;
 3 | 
 4 | use cuda_builder::CudaBuilder;
 5 | 
 6 | fn main() {
 7 |     println!("cargo::rerun-if-changed=build.rs");
 8 |     println!("cargo::rerun-if-changed=kernels");
 9 | 
10 |     let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
11 |     let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
12 | 
13 |     CudaBuilder::new(manifest_dir.join("kernels"))
14 |         .copy_to(out_path.join("kernels.ptx"))
15 |         .build()
16 |         .unwrap();
17 | }
18 | 


--------------------------------------------------------------------------------
/examples/cuda/gemm/kernels/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "gemm-kernels"
 3 | version = "0.1.0"
 4 | edition = "2024"
 5 | 
 6 | [dependencies]
 7 | cuda_std = { path = "../../../../crates/cuda_std" }
 8 | glam = { version = "0.30.1", default-features = false, features = ["cuda", "nostd-libm"] }
 9 | 
10 | [lib]
11 | crate-type = ["cdylib", "rlib"]
12 | 


--------------------------------------------------------------------------------
/examples/cuda/gemm/kernels/src/gemm_naive.rs:
--------------------------------------------------------------------------------
 1 | use cuda_std::kernel;
 2 | use cuda_std::thread;
 3 | 
 4 | #[kernel]
 5 | #[allow(improper_ctypes_definitions)]
 6 | /// Naive GEMM kernel for C = alpha * A * B + beta * C.
 7 | ///
 8 | /// This kernel computes each element of the output matrix C independently, without any memory coalescing or tiling optimizations.
 9 | ///
10 | /// # Safety
11 | /// CUDA kernel requires unsafe.
12 | ///
13 | /// # Parameters
14 | /// - `mat_a`: Input matrix A, shape (m x k), row-major order.
15 | /// - `mat_b`: Input matrix B, shape (k x n), row-major order.
16 | /// - `mat_c`: Output matrix C, shape (m x n), row-major order. Must be valid for writes.
17 | /// - `m`: Number of rows in A and C.
18 | /// - `n`: Number of columns in B and C.
19 | /// - `k`: Number of columns in A and rows in B.
20 | /// - `alpha`: Scalar multiplier for A * B.
21 | /// - `beta`: Scalar multiplier for C.
22 | ///
23 | /// # Thread Mapping
24 | /// Each thread computes one element of C at (row, col).
25 | pub unsafe fn gemm_naive(
26 |     mat_a: &[f32],
27 |     mat_b: &[f32],
28 |     mat_c: *mut f32,
29 |     m: usize,
30 |     n: usize,
31 |     k: usize,
32 |     alpha: f32,
33 |     beta: f32,
34 | ) {
35 |     let row = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as usize;
36 |     let col = (thread::block_dim_y() * thread::block_idx_y() + thread::thread_idx_y()) as usize;
37 | 
38 |     if row < m && col < n {
39 |         let mut sum = 0.0f32;
40 |         for i in 0..k {
41 |             sum += mat_a[row * k + i] * mat_b[i * n + col];
42 |         }
43 |         let elem = unsafe { &mut *mat_c.add((row * n + col) as usize) };
44 |         *elem = alpha * sum + beta * *elem;
45 |     }
46 | }
47 | 


--------------------------------------------------------------------------------
/examples/cuda/gemm/kernels/src/lib.rs:
--------------------------------------------------------------------------------
1 | mod gemm_naive;
2 | mod gemm_tiled;
3 | 
4 | pub use crate::gemm_naive::gemm_naive;
5 | pub use crate::gemm_tiled::gemm_tiled;
6 | 


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "path-tracer"
 3 | version = "0.1.0"
 4 | edition = "2018"
 5 | 
 6 | [dependencies]
 7 | vek = { version = "0.17.1", features = ["bytemuck", "mint"] }
 8 | bytemuck = { version = "1.21", features = ["derive"] }
 9 | cust = { version = "0.3", path = "../../../crates/cust", features = [
10 |     "impl_vek",
11 | ] }
12 | image = "0.25.5"
13 | path-tracer-kernels = { path = "kernels" }
14 | gpu_rand = { version = "0.1", path = "../../../crates/gpu_rand" }
15 | optix = { version = "0.1", path = "../../../crates/optix" }
16 | glium = "0.32.0"
17 | glutin = "0.28.0"
18 | imgui = "0.9.0"
19 | imgui-glium-renderer = "0.9.0"
20 | imgui-winit-support = "0.9.0"
21 | rayon = "1.10.0"
22 | sysinfo = "0.33.1"
23 | anyhow = "1.0.53"
24 | 
25 | [build-dependencies]
26 | cuda_builder = { version = "0.3", path = "../../../crates/cuda_builder" }
27 | 


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::path;
 3 | 
 4 | use cuda_builder::CudaBuilder;
 5 | 
 6 | fn main() {
 7 |     println!("cargo::rerun-if-changed=build.rs");
 8 |     println!("cargo::rerun-if-changed=kernels");
 9 | 
10 |     let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
11 |     let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
12 | 
13 |     CudaBuilder::new(manifest_dir.join("kernels"))
14 |         .copy_to(out_path.join("kernels.ptx"))
15 |         .build()
16 |         .unwrap();
17 |     CudaBuilder::new(manifest_dir.join("kernels"))
18 |         .copy_to(out_path.join("kernels_optix.ptx"))
19 |         .build_args(&["--features", "optix"])
20 |         .build()
21 |         .unwrap();
22 | }
23 | 


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/kernels/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "path-tracer-kernels"
 3 | version = "0.1.0"
 4 | edition = "2018"
 5 | 
 6 | [dependencies]
 7 | cuda_std = { version = "0.2", path = "../../../../crates/cuda_std" }
 8 | enum_dispatch = "0.3.13"
 9 | gpu_rand = { version = "0.1", path = "../../../../crates/gpu_rand" }
10 | cust_core = { path = "../../../../crates/cust_core", features=["vek"] }
11 | optix_device = { path = "../../../../crates/optix_device" }
12 | 
13 | [lib]
14 | crate-type = ["cdylib", "rlib"]
15 | 
16 | [features]
17 | optix = []
18 | 


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/kernels/src/hittable.rs:
--------------------------------------------------------------------------------
 1 | use crate::{Ray, Vec3};
 2 | use enum_dispatch::enum_dispatch;
 3 | 
 4 | #[derive(Clone, Copy, PartialEq)]
 5 | pub struct HitRecord {
 6 |     pub material_handle: usize,
 7 |     pub t: f32,
 8 |     pub point: Vec3,
 9 |     pub normal: Vec3,
10 | }
11 | 
12 | #[enum_dispatch]
13 | pub trait Hittable {
14 |     fn material(&self) -> usize;
15 |     fn hit(&self, ray: Ray, t_min: f32, t_max: f32) -> Option<HitRecord>;
16 | }
17 | 


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/kernels/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![allow(clippy::missing_safety_doc)]
 2 | 
 3 | extern crate alloc;
 4 | 
 5 | pub mod hittable;
 6 | pub mod material;
 7 | pub mod math;
 8 | pub mod optix;
 9 | pub mod render;
10 | pub mod render_kernels;
11 | pub mod scene;
12 | pub mod sphere;
13 | 
14 | pub use cuda_std::vek;
15 | use cust_core::DeviceCopy;
16 | use enum_dispatch::enum_dispatch;
17 | use hittable::{HitRecord, Hittable};
18 | use sphere::Sphere;
19 | 
20 | pub type Vec3<T = f32> = vek::Vec3<T>;
21 | pub type Point<T = f32> = vek::Vec3<T>;
22 | pub type Vec2<T = f32> = vek::Vec2<T>;
23 | 
24 | #[derive(Default, Clone, Copy, DeviceCopy)]
25 | #[repr(C)]
26 | pub struct Viewport {
27 |     pub bounds: vek::Vec2<usize>,
28 |     pub lower_left: Vec3,
29 |     pub horizontal: Vec3,
30 |     pub vertical: Vec3,
31 |     pub origin: Vec3,
32 | }
33 | 
34 | #[repr(C)]
35 | #[derive(Clone, Copy, DeviceCopy)]
36 | #[enum_dispatch(Hittable)]
37 | pub enum Object {
38 |     Sphere(Sphere),
39 | }
40 | 
41 | #[derive(Clone, Copy, PartialEq)]
42 | pub struct Ray {
43 |     pub dir: Vec3,
44 |     pub origin: Point,
45 | }
46 | 
47 | impl Ray {
48 |     pub fn new(dir: Vec3, origin: Point) -> Self {
49 |         Self { dir, origin }
50 |     }
51 | 
52 |     pub fn at(&self, t: f32) -> Point {
53 |         self.origin + t * self.dir
54 |     }
55 | 
56 |     pub fn from_optix() -> Self {
57 |         use optix_device::intersection;
58 | 
59 |         Self {
60 |             dir: Vec3::from(intersection::ray_world_direction().to_array()),
61 |             origin: Vec3::from(intersection::ray_world_origin().to_array()),
62 |         }
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/kernels/src/math.rs:
--------------------------------------------------------------------------------
 1 | //! Generic math utilities.
 2 | 
 3 | use crate::Vec3;
 4 | #[cfg(target_os = "cuda")]
 5 | use cuda_std::GpuFloat;
 6 | use gpu_rand::{DefaultRand, GpuRand};
 7 | 
 8 | /// Converts a float in the range of [0.0, 1.0] to a range of [-1.0, 1.0].
 9 | pub fn norm_f32_to_snorm(x: f32) -> f32 {
10 |     x * 2.0 - 1.0
11 | }
12 | 
13 | pub fn random_unit_vec(state: &mut DefaultRand) -> Vec3 {
14 |     let [x, y] = state.normal_f32_2();
15 |     let z = state.normal_f32();
16 |     Vec3::new(x, y, z)
17 | }
18 | 
19 | /// Creates a random vector with each element being in the range of [-1.0, 1.0] (signed normalized).
20 | pub fn random_snorm_vec(state: &mut DefaultRand) -> Vec3 {
21 |     random_unit_vec(state).map(norm_f32_to_snorm)
22 | }
23 | 
24 | pub fn random_in_unit_sphere(state: &mut DefaultRand) -> Vec3 {
25 |     loop {
26 |         let p = random_snorm_vec(state);
27 |         if p.magnitude_squared() >= 1.0 {
28 |             continue;
29 |         }
30 |         return p;
31 |     }
32 | }
33 | 
34 | pub fn reflect(v: Vec3, n: Vec3) -> Vec3 {
35 |     v - 2.0 * v.dot(n) * n
36 | }
37 | 
38 | pub fn refract(v: Vec3, n: Vec3, ni_over_nt: f32) -> Option<Vec3> {
39 |     let uv = v.normalized();
40 |     let dt = uv.dot(n);
41 |     let discriminant = 1.0 - ni_over_nt * ni_over_nt * (1.0 - dt * dt);
42 |     if discriminant > 0.0 {
43 |         Some(ni_over_nt * (uv - n * dt) - discriminant.sqrt() * n)
44 |     } else {
45 |         None
46 |     }
47 | }
48 | 
49 | pub fn schlick(cos: f32, ref_idx: f32) -> f32 {
50 |     let r0 = (1.0 - ref_idx) / (1.0 + ref_idx);
51 |     let r0sq = r0 * r0;
52 |     r0sq + (1.0 - r0sq) * (1.0 - cos).powf(5.0)
53 | }
54 | 


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/kernels/src/render.rs:
--------------------------------------------------------------------------------
 1 | use crate::*;
 2 | 
 3 | const BACKGROUND_BLUE_MULTIPLIER: f32 = 0.7;
 4 | 
 5 | pub fn color(ray: Ray) -> Vec3 {
 6 |     let unit = ray.dir.normalized();
 7 |     let t = BACKGROUND_BLUE_MULTIPLIER * (unit.y + 1.0);
 8 |     (1.0 - t) * Vec3::one() + t * Vec3::new(0.5, 0.7, 1.0)
 9 | }
10 | 
11 | pub fn generate_ray(idx: vek::Vec2<u32>, view: &Viewport, offset: Vec2) -> Ray {
12 |     let uv = (idx.numcast::<f32>().unwrap() + offset) / view.bounds.numcast().unwrap();
13 |     Ray {
14 |         origin: view.origin,
15 |         dir: view.lower_left + uv.x * view.horizontal + uv.y * view.vertical - view.origin,
16 |     }
17 | }
18 | 


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/kernels/src/render_kernels.rs:
--------------------------------------------------------------------------------
 1 | use crate::{render::*, scene::Scene, *};
 2 | use cuda_std::{vek::Clamp, *};
 3 | use gpu_rand::{DefaultRand, GpuRand};
 4 | 
 5 | #[kernel]
 6 | pub unsafe fn render(fb: *mut Vec3, view: Viewport, scene: &Scene, rand_states: *mut DefaultRand) {
 7 |     let idx = thread::index_2d();
 8 |     if idx.x >= view.bounds.x as u32 || idx.y >= view.bounds.y as u32 {
 9 |         return;
10 |     }
11 |     let px_idx = idx.y as usize * view.bounds.x + idx.x as usize;
12 | 
13 |     // generate a tiny offset for the ray for antialiasing
14 |     let rng = &mut *rand_states.add(px_idx);
15 |     let offset = Vec2::from(rng.normal_f32_2());
16 | 
17 |     let ray = generate_ray(idx, &view, offset);
18 | 
19 |     let color = scene.ray_color(ray, rng);
20 |     *fb.add(px_idx) += color;
21 | }
22 | 
23 | /// Scales an accumulated buffer by the sample count, storing each pixel in the corresponding `out` pixel.
24 | #[kernel]
25 | pub unsafe fn scale_buffer(fb: *const Vec3, out: *mut Vec3, samples: u32, view: Viewport) {
26 |     let idx_2d = thread::index_2d();
27 |     if idx_2d.x >= view.bounds.x as u32 || idx_2d.y >= view.bounds.y as u32 {
28 |         return;
29 |     }
30 |     let idx = idx_2d.y as usize * view.bounds.x + idx_2d.x as usize;
31 |     let original = &*fb.add(idx);
32 |     let out = &mut *out.add(idx);
33 | 
34 |     let scale = 1.0 / samples as f32;
35 |     let scaled = original * scale;
36 |     *out = scaled;
37 | }
38 | 
39 | /// Postprocesses a (scaled) buffer into a final u8 buffer.
40 | #[kernel]
41 | pub unsafe fn postprocess(fb: *const Vec3, out: *mut vek::Vec3<u8>, view: Viewport) {
42 |     let idx_2d = thread::index_2d();
43 |     if idx_2d.x >= view.bounds.x as u32 || idx_2d.y >= view.bounds.y as u32 {
44 |         return;
45 |     }
46 |     let idx = idx_2d.y as usize * view.bounds.x + idx_2d.x as usize;
47 |     let original = &*fb.add(idx);
48 |     let out = &mut *out.add(idx);
49 |     // gamma=2.0
50 |     let gamma_corrected = original.sqrt();
51 | 
52 |     *out = (gamma_corrected * 255.0)
53 |         .clamped(Vec3::zero(), Vec3::broadcast(255.0))
54 |         .numcast()
55 |         .unwrap();
56 | }
57 | 


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/kernels/src/scene.rs:
--------------------------------------------------------------------------------
 1 | use crate::material::*;
 2 | use crate::*;
 3 | use cust_core::DeviceCopy;
 4 | use gpu_rand::DefaultRand;
 5 | 
 6 | pub const MAX_BOUNCES: u32 = 5;
 7 | 
 8 | #[repr(C)]
 9 | #[derive(Clone, Copy)]
10 | pub struct Scene<'a> {
11 |     pub objects: &'a [Object],
12 |     pub materials: &'a [MaterialKind],
13 | }
14 | 
15 | /// SAFETY: the slice is created from unified memory so it works on the GPU too.
16 | unsafe impl DeviceCopy for Scene<'_> {}
17 | 
18 | impl Scene<'_> {
19 |     pub fn hit(&self, ray: Ray, t_min: f32, t_max: f32) -> Option<HitRecord> {
20 |         let mut hit = None;
21 |         let mut closest_so_far = t_max;
22 | 
23 |         for obj in self.objects {
24 |             if let Some(rec) = obj.hit(ray, t_min, closest_so_far) {
25 |                 hit = Some(rec);
26 |                 closest_so_far = rec.t;
27 |             }
28 |         }
29 |         hit
30 |     }
31 | 
32 |     /// Casts a ray into the scene and returns the object hit by the ray.
33 |     pub fn raycast(&self, ray: Ray) -> Option<&Object> {
34 |         let mut hit = None;
35 |         let mut closest_so_far = f32::INFINITY;
36 | 
37 |         for obj in self.objects {
38 |             if let Some(rec) = obj.hit(ray, 0.001, closest_so_far) {
39 |                 hit = Some(obj);
40 |                 closest_so_far = rec.t;
41 |             }
42 |         }
43 |         hit
44 |     }
45 | 
46 |     pub fn ray_color(&self, ray: Ray, rng: &mut DefaultRand) -> Vec3 {
47 |         let mut cur_ray = ray;
48 |         let mut attenuation = Vec3::one();
49 | 
50 |         for _ in 0..MAX_BOUNCES {
51 |             if let Some(hit) = self.hit(cur_ray, 0.001, f32::INFINITY) {
52 |                 let material = self.materials[hit.material_handle];
53 |                 let (hit_attenuation, scattered) = material.scatter(cur_ray, hit, rng);
54 |                 if let Some(scattered) = scattered {
55 |                     attenuation *= hit_attenuation;
56 |                     cur_ray = scattered;
57 |                 } else {
58 |                     return Vec3::zero();
59 |                 }
60 |             } else {
61 |                 return attenuation * render::color(cur_ray);
62 |             }
63 |         }
64 |         Vec3::zero()
65 |     }
66 | }
67 | 


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/kernels/src/sphere.rs:
--------------------------------------------------------------------------------
 1 | use crate::hittable::{HitRecord, Hittable};
 2 | use crate::*;
 3 | #[cfg(target_os = "cuda")]
 4 | use cuda_std::GpuFloat;
 5 | use cust_core::DeviceCopy;
 6 | 
 7 | #[derive(Clone, Copy, PartialEq, DeviceCopy)]
 8 | pub struct Sphere {
 9 |     pub center: Point,
10 |     pub radius: f32,
11 |     pub mat: usize,
12 | }
13 | 
14 | impl Sphere {
15 |     pub fn new(center: Point, radius: f32, mat: usize) -> Self {
16 |         Self {
17 |             center,
18 |             radius,
19 |             mat,
20 |         }
21 |     }
22 | }
23 | 
24 | impl Hittable for Sphere {
25 |     fn material(&self) -> usize {
26 |         self.mat
27 |     }
28 | 
29 |     fn hit(&self, ray: Ray, t_min: f32, t_max: f32) -> Option<HitRecord> {
30 |         let oc = ray.origin - self.center;
31 |         let a = ray.dir.dot(ray.dir);
32 |         let b = oc.dot(ray.dir);
33 |         let c = oc.dot(oc) - self.radius * self.radius;
34 |         let discriminant = b * b - a * c;
35 | 
36 |         if discriminant > 0.0 {
37 |             let temp = (-b - discriminant.sqrt()) / a;
38 |             if temp < t_max && temp > t_min {
39 |                 return Some(HitRecord {
40 |                     t: temp,
41 |                     point: ray.at(temp),
42 |                     normal: (ray.at(temp) - self.center) / self.radius,
43 |                     material_handle: self.mat,
44 |                 });
45 |             }
46 |             let temp = (-b + discriminant.sqrt()) / a;
47 |             if temp < t_max && temp > t_min {
48 |                 return Some(HitRecord {
49 |                     t: temp,
50 |                     point: ray.at(temp),
51 |                     normal: (ray.at(temp) - self.center) / self.radius,
52 |                     material_handle: self.mat,
53 |                 });
54 |             }
55 |         }
56 |         None
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/shaders/image.frag:
--------------------------------------------------------------------------------
 1 | #version 450 
 2 | 
 3 | out vec4 color;
 4 | in vec2 tex_coords;
 5 | 
 6 | uniform sampler2D tex;
 7 | 
 8 | void main() {
 9 |   color = texture(tex, tex_coords);
10 | }


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/shaders/image.vert:
--------------------------------------------------------------------------------
1 | #version 450
2 | 
3 | in vec3 pos;
4 | out vec2 tex_coords;
5 | 
6 | void main() {
7 |   gl_Position = vec4(pos, 1.0);
8 |   tex_coords = (vec2(pos) / 2.0) + 0.5;
9 | }


--------------------------------------------------------------------------------
/examples/cuda/path_tracer/src/main.rs:
--------------------------------------------------------------------------------
 1 | pub mod common;
 2 | pub mod cpu;
 3 | pub mod cuda;
 4 | pub mod optix;
 5 | pub mod renderer;
 6 | pub mod viewer;
 7 | 
 8 | use common::Camera;
 9 | use path_tracer_kernels::{
10 |     material::{DielectricMaterial, DiffuseMaterial, MaterialKind, MetallicMaterial},
11 |     scene::Scene,
12 |     sphere::Sphere,
13 |     Object,
14 | };
15 | use std::error::Error;
16 | use vek::Vec3;
17 | 
18 | pub const WIDTH: u32 = 1920;
19 | pub const HEIGHT: u32 = 1080;
20 | 
21 | fn main() -> Result<(), Box<dyn Error>> {
22 |     let camera = Camera {
23 |         origin: Vec3::new(0.0, 0.5, 2.0),
24 |         lookat: Vec3::new(0.0, 0.0, -0.5),
25 |         vup: Vec3::unit_y(),
26 |         fov: 70.0,
27 |         aspect_ratio: (WIDTH as f32) / (HEIGHT as f32),
28 |     };
29 | 
30 |     let materials = vec![
31 |         MaterialKind::Metallic(MetallicMaterial {
32 |             color: Vec3::new(1.0, 0.85, 0.45),
33 |             roughness: 0.0,
34 |         }),
35 |         MaterialKind::Diffuse(DiffuseMaterial {
36 |             color: Vec3::new(0.5, 0.5, 1.0),
37 |         }),
38 |         MaterialKind::Metallic(MetallicMaterial {
39 |             color: Vec3::new(1.0, 0.7, 0.7),
40 |             roughness: 0.2,
41 |         }),
42 |         MaterialKind::Dielectric(DielectricMaterial {
43 |             ior: 1.0,
44 |             color: Vec3::new(1.0, 1.0, 1.0),
45 |         }),
46 |     ];
47 | 
48 |     let objects = vec![
49 |         Object::Sphere(Sphere::new(Vec3::new(0.0, 0.0, -1.0), 0.5, 0)),
50 |         Object::Sphere(Sphere::new(Vec3::new(1.1, 0.2, -0.7), 0.2, 2)),
51 |         Object::Sphere(Sphere::new(Vec3::new(-0.5, 0.4, -0.9), 0.3, 3)),
52 |         Object::Sphere(Sphere::new(Vec3::new(0.7, 0.0, -0.2), 0.2, 1)),
53 |         // Object::Sphere(Sphere::new(Vec3::new(0.0, -200.5, -1.0), 200.0, 1)),
54 |     ];
55 |     let cpu_scene = Scene {
56 |         objects: &objects,
57 |         materials: &materials,
58 |     };
59 | 
60 |     viewer::run(&camera, &cpu_scene);
61 | }
62 | 


--------------------------------------------------------------------------------
/examples/cuda/vecadd/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "vecadd"
 3 | version = "0.1.0"
 4 | edition = "2024"
 5 | 
 6 | [dependencies]
 7 | cust = { path = "../../../crates/cust" }
 8 | nanorand = "0.7"
 9 | 
10 | [build-dependencies]
11 | cuda_builder = { path = "../../../crates/cuda_builder" }
12 | 


--------------------------------------------------------------------------------
/examples/cuda/vecadd/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::path;
 3 | 
 4 | use cuda_builder::CudaBuilder;
 5 | 
 6 | fn main() {
 7 |     println!("cargo::rerun-if-changed=build.rs");
 8 |     println!("cargo::rerun-if-changed=kernels");
 9 | 
10 |     let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
11 |     let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
12 | 
13 |     CudaBuilder::new(manifest_dir.join("kernels"))
14 |         .copy_to(out_path.join("kernels.ptx"))
15 |         .build()
16 |         .unwrap();
17 | }
18 | 


--------------------------------------------------------------------------------
/examples/cuda/vecadd/kernels/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "vecadd-kernels"
 3 | version = "0.1.0"
 4 | edition = "2024"
 5 | 
 6 | [dependencies]
 7 | cuda_std = { path = "../../../../crates/cuda_std" }
 8 | 
 9 | [lib]
10 | crate-type = ["cdylib", "rlib"]
11 | 


--------------------------------------------------------------------------------
/examples/cuda/vecadd/kernels/src/lib.rs:
--------------------------------------------------------------------------------
 1 | use cuda_std::prelude::*;
 2 | 
 3 | #[kernel]
 4 | #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
 5 | pub unsafe fn vecadd(a: &[f32], b: &[f32], c: *mut f32) {
 6 |     let idx = thread::index_1d() as usize;
 7 |     if idx < a.len() {
 8 |         let elem = unsafe { &mut *c.add(idx) };
 9 |         *elem = a[idx] + b[idx];
10 |     }
11 | }
12 | 


--------------------------------------------------------------------------------
/examples/optix/denoiser/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "denoiser"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | optix = { version = "0.1", path = "../../../crates/optix" }
 8 | structopt = "0.3"
 9 | cust = { version = "0.3", path = "../../../crates/cust", features = ["impl_vek", "bytemuck"] }
10 | image = "0.25.5"
11 | vek = { version = "0.17.1", features = ["bytemuck"] }
12 | 


--------------------------------------------------------------------------------
/examples/optix/denoiser/README.md:
--------------------------------------------------------------------------------
1 | # Denoiser
2 | 
3 | Example of a very simple binary which loads an image from a path and runs it through the OptiX 
4 | AI denoiser to denoise it and output it in the same directory.
5 | 
6 | Note that this is just about the worst way to run OptiX because the input is not Hdr, and it provides
7 | no albedo and normal guide images, so results may be pretty bad around object edges and such.
8 | 


--------------------------------------------------------------------------------
/examples/optix/denoiser/noisy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/examples/optix/denoiser/noisy.png


--------------------------------------------------------------------------------
/guide/assets/nsight.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/guide/assets/nsight.png


--------------------------------------------------------------------------------
/guide/book.toml:
--------------------------------------------------------------------------------
1 | [book]
2 | authors = ["Riccardo D'Ambrosio<rdambrosio016@gmail.com>"]
3 | language = "en"
4 | multilingual = false
5 | src = "src"
6 | title = "GPU Computing with Rust using CUDA"
7 | description = "Writing extremely fast GPU Computing code with rust using rustc_codegen_nvvm and CUDA"
8 | 


--------------------------------------------------------------------------------
/guide/src/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/guide/src/README.md


--------------------------------------------------------------------------------
/guide/src/SUMMARY.md:
--------------------------------------------------------------------------------
 1 | # Summary 
 2 | 
 3 | - [Introduction](README.md)
 4 | - [Supported Features](features.md)
 5 | - [Frequently Asked Questions](faq.md)
 6 | - [Guide](guide/README.md)
 7 |   - [Getting Started](guide/getting_started.md)
 8 |   - [Tips](guide/tips.md)
 9 |   - [Kernel ABI](guide/kernel_abi.md)
10 |   - [Safety](guide/safety.md)
11 | - [The CUDA Toolkit](cuda/README.md)
12 |   - [GPU Computing](cuda/gpu_computing.md)
13 |   - [The CUDA Pipeline](cuda/pipeline.md)
14 | - [rustc_codegen_nvvm](nvvm/README.md)
15 |   - [Technical](nvvm/technical/README.md)
16 |     - [Custom Rustc Backends](nvvm/technical/backends.md)
17 |     - [rustc_codegen_nvvm](nvvm/technical/nvvm.md)
18 |     - [Types](nvvm/technical/types.md)
19 | 


--------------------------------------------------------------------------------
/guide/src/cuda/README.md:
--------------------------------------------------------------------------------
 1 | # The CUDA Toolkit
 2 | 
 3 | The CUDA Toolkit is an ecosystem for executing extremely fast code on NVIDIA GPUs for the purpose of general computing.
 4 | 
 5 | CUDA includes many libraries for this purpose, including the Driver API, Runtime API, the PTX ISA, libnvvm, etc. CUDA
 6 | is currently the best option for computing in terms of libraries and control available, however, it unfortunately only works
 7 | on NVIDIA GPUs.
 8 | 
 9 | This section will cover some of the general uses of GPU computing, why use CUDA, and general CUDA principles. 
10 | This section will not cover everything about CUDA and it is not meant to. You can check out the [official CUDA guide](https://docs.nvidia.com/cuda/)
11 | for a complete overview.
12 | 


--------------------------------------------------------------------------------
/guide/src/guide/README.md:
--------------------------------------------------------------------------------
1 | # Guide
2 | 


--------------------------------------------------------------------------------
/guide/src/guide/tips.md:
--------------------------------------------------------------------------------
 1 | # Tips 
 2 | 
 3 | This section contains some tips on what to do and what not to do using the project.
 4 | 
 5 | ## GPU kernels
 6 | 
 7 | - Generally don't derive `Debug` for structs in GPU crates. The codegen currently does not do much global
 8 | DCE (dead code elimination) so debug can really slow down compile times and make the PTX gigantic. This
 9 | will get much better in the future but currently it will cause some undesirable effects.
10 | 
11 | - Don't use recursion, CUDA allows it but threads have very limited stacks (local memory) and stack overflows
12 | yield confusing `InvalidAddress` errors. If you are getting such an error, run the executable in cuda-memcheck,
13 | it should yield a write failure to `Local` memory at an address of about 16mb. You can also put the ptx file through
14 | `cuobjdump` and it should yield ptxas warnings for functions without a statically known stack usage.
15 | 


--------------------------------------------------------------------------------
/guide/src/nvvm/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/guide/src/nvvm/README.md


--------------------------------------------------------------------------------
/guide/src/nvvm/technical/README.md:
--------------------------------------------------------------------------------
 1 | # Technical 
 2 | 
 3 | This section will cover the more technical details of how rustc_codegen_nvvm works 
 4 | as well as the issues that came with it.
 5 | 
 6 | It will also explain some technical details about CUDA/PTX/etc, it is not necessarily 
 7 | limited to rustc_codegen_nvvm.
 8 | 
 9 | Basic knowledge of how rustc and LLVM work and what they do is assumed. You can find
10 | info about rustc in the [rustc dev guide](https://rustc-dev-guide.rust-lang.org/).
11 | 


--------------------------------------------------------------------------------
/katex-header.html:
--------------------------------------------------------------------------------
 1 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/katex.min.css"
 2 |     integrity="sha384-9eLZqc9ds8eNjO3TmqPeYcDj8n+Qfa4nuSiGYa6DjLNcv9BtN69ZIulL9+8CqC9Y" crossorigin="anonymous">
 3 | <script src="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/katex.min.js"
 4 |     integrity="sha384-K3vbOmF2BtaVai+Qk37uypf7VrgBubhQreNQe9aGsz9lB63dIFiQVlJbr92dw2Lx"
 5 |     crossorigin="anonymous"></script>
 6 | <script src="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/contrib/auto-render.min.js"
 7 |     integrity="sha384-kmZOZB5ObwgQnS/DuDg6TScgOiWWBiVt0plIRkZCmE6rDZGrEOQeHM5PcHi+nyqe"
 8 |     crossorigin="anonymous"></script>
 9 | <script>
10 |     document.addEventListener("DOMContentLoaded", function () {
11 |         renderMathInElement(document.body, {
12 |             delimiters: [
13 |                 { left: "$$", right: "$$", display: true },
14 |                 { left: "\\(", right: "\\)", display: false },
15 |                 { left: "$", right: "$", display: false },
16 |                 { left: "\\[", right: "\\]", display: true }
17 |             ]
18 |         });
19 |     });
20 | </script>


--------------------------------------------------------------------------------
/rust-toolchain.toml:
--------------------------------------------------------------------------------
1 | [toolchain]
2 | channel = "nightly-2025-03-02"
3 | components = ["clippy", "llvm-tools-preview", "rust-src", "rustc-dev", "rustfmt", "rust-analyzer"]


--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/rustfmt.toml


--------------------------------------------------------------------------------
/scripts/data/libdevice.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rust-GPU/Rust-CUDA/afb147ed51fbb14b758e10a0a24dbc2311a52b82/scripts/data/libdevice.pdf


--------------------------------------------------------------------------------
/scripts/download_ci_optix.bash:
--------------------------------------------------------------------------------
 1 | DEPS_DIR="$HOME/deps"
 2 | OPTIX_VERSION="7.0"
 3 | 
 4 | echo "Used OptiX version: ${OPTIX_VERSION}"
 5 | mkdir -p ${DEPS_DIR}/optix/include
 6 | OPTIX_URL=https://developer.download.nvidia.com/redist/optix/v${OPTIX_VERSION}
 7 | 
 8 | for f in optix.h optix_device.h optix_function_table.h \
 9 |           optix_function_table_definition.h optix_host.h \
10 |           optix_stack_size.h optix_stubs.h optix_types.h optix_7_device.h \
11 |           optix_7_host.h optix_7_types.h \
12 |           internal/optix_7_device_impl.h \
13 |           internal/optix_7_device_impl_exception.h \
14 |           internal/optix_7_device_impl_transformations.h
15 |     do
16 |     curl --retry 100 -m 120 --connect-timeout 30 \
17 |         $OPTIX_URL/include/$f > $DEPS_DIR/optix/include/$f
18 | done
19 | OPTIX_ROOT=${DEPS_DIR}/optix
20 | echo "OPTIX_ROOT=${OPTIX_ROOT}" >> $GITHUB_ENV
21 | 


--------------------------------------------------------------------------------
/scripts/gen_intrinsics.py:
--------------------------------------------------------------------------------
 1 | # Generates rust code for every intrinsic from libdevice.json made in gen_libdevice_json.py
 2 | 
 3 | import os
 4 | import json
 5 | import inspect
 6 | 
 7 | dirname = os.path.dirname(__file__)
 8 | filename = os.path.join(dirname, 'data/libdevice.json')
 9 | out_filename = os.path.join(dirname, 'data/std_intrinsics.rs')
10 | 
11 | txt = open(filename, "r", encoding="utf8").read()
12 | raw = json.loads(txt)
13 | 
14 | header = '''//! Raw libdevice math intrinsics.
15 | //! 
16 | //! Note that this file was autogenerated using crude text analysis
17 | //! from the libdevice PDF file, therefore many of the descriptions have broken
18 | //! text, especially for math symbols. The link to the libdevice website is provided
19 | //! for every intrinsic so you can view the non-broken description. 
20 | //! 
21 | //! Most of the intrinsics here have "proper" functions, corresponding f32/f64 functions
22 | //! are already codegenned to libdevice intrinsics by the codegen automatically. This module
23 | //! is mostly for exotic intrinsics that have not been added as proper functions yet.
24 | //!
25 | //! The underlying intrinsic functions have a prefix of `__nv_`, however this prefix
26 | //! is stripped from the functions in this module just for convenience.
27 | 
28 | // Generated file, do not edit by hand, see scripts/gen_intrinsics.py
29 | 
30 | '''
31 | out = ""
32 | 
33 | for intrinsic in raw:
34 |     raw_name = intrinsic["name"]
35 |     out += f"#[link_name = \"{raw_name}\"]\n"
36 | 
37 |     name = raw_name.removeprefix("__nv_")
38 |     description = intrinsic["description"]
39 |     returns = intrinsic["returns"]
40 | 
41 |     # There isnt actually any availability which is `No` instead of `Yes`, so including it is useless for now
42 |     hyperlink = f"[Nvidia docs](https://docs.nvidia.com/cuda/libdevice-users-guide/{raw_name}.html#{raw_name})"
43 |     full_desc = inspect.cleandoc(
44 |         f"{description}\n\n{hyperlink}\n\n# Returns\n\n{returns}")
45 |     out += f"#[doc = \"{full_desc}\"]\n"
46 |     out += f"pub fn {name}("
47 |     sig = intrinsic["sig"]
48 |     return_ty = sig["returns"]
49 | 
50 |     params = []
51 |     for param in sig["params"]:
52 |         param_name = param["name"]
53 |         param_ty = param["type"]
54 |         if param_name == "in":
55 |             param_name = "in_"
56 | 
57 |         params.append(f"{param_name}: {param_ty}")
58 | 
59 |     out += ", ".join(params)
60 |     out += f") -> {return_ty};\n\n"
61 | 
62 | out = f"{header}extern \"C\" {{\n{out}}}\n"
63 | 
64 | open(out_filename, "w", encoding="utf8").write(out)
65 | 


--------------------------------------------------------------------------------
/xtask/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "xtask"
 3 | version = "0.0.0"
 4 | edition = "2021"
 5 | license = "MIT"
 6 | 
 7 | [dependencies]
 8 | pico-args = "0.4.2"
 9 | rayon = "1.10"
10 | regex = "1.11.1"
11 | rustc_codegen_nvvm = { path = "../crates/rustc_codegen_nvvm" }
12 | 


--------------------------------------------------------------------------------
/xtask/src/extract_llfns.rs:
--------------------------------------------------------------------------------
 1 | //! Uses llvm-extract to extract every function from an LLVM IR file
 2 | //! and dumps them as individual standalone LLVM IR files in a directory.
 3 | 
 4 | use rayon::prelude::*;
 5 | use regex::Regex;
 6 | use std::{path::Path, process::Command};
 7 | 
 8 | // change this if you want it to print a status before running the command.
 9 | // this is useful if running the command causes segfaults.
10 | const PRINT_EVERY_EXECUTION: bool = false;
11 | 
12 | #[allow(unused_variables)]
13 | fn run_command_for_each_fn(func: &str, contents: &str) -> bool {
14 |     // put any code you want to use for debugging in here. For example,
15 |     // running nvvm on every function to see what panics.
16 |     // `true` will keep running the command, `false` will stop running.
17 |     //
18 |     // !!!!! Make sure to delete any changes to this function if committing !!!!!
19 |     true
20 | }
21 | 
22 | // ------------------------------------------------
23 | 
24 | pub(crate) fn extract_llfns(file: &Path, dir: &Path) {
25 |     let contents = std::fs::read_to_string(file).unwrap();
26 |     let re = Regex::new(r#"define .*(_Z.*?)(\(|")"#).unwrap();
27 |     let names = re
28 |         .captures_iter(&contents)
29 |         .map(|x| x.get(1).unwrap().as_str())
30 |         .collect::<Vec<_>>();
31 | 
32 |     let mut contents = names
33 |         .par_iter()
34 |         .filter_map(|name| {
35 |             let out_file = format!("{}/{}.ll", dir.display(), name);
36 |             let _ = Command::new("llvm-extract")
37 |                 .arg(file)
38 |                 .arg(format!("--func={}", name))
39 |                 .arg("-S")
40 |                 .arg("--recursive")
41 |                 .arg("-o")
42 |                 .arg(&out_file)
43 |                 .spawn();
44 | 
45 |             let read = std::fs::read_to_string(format!("{}/{}.ll", dir.display(), name)).ok()?;
46 |             Some((name, read, true))
47 |         })
48 |         .collect::<Vec<_>>();
49 | 
50 |     // sort things by length so that we try the shortest functions first.
51 |     contents.sort_by_key(|x| x.1.len());
52 | 
53 |     for (name, content, failed) in &mut contents {
54 |         if PRINT_EVERY_EXECUTION {
55 |             println!("Running command over `{}.ll`", name);
56 |         }
57 | 
58 |         *failed = !run_command_for_each_fn(name, content);
59 |     }
60 | 
61 |     for (name, _, _) in contents.into_iter().filter(|x| !x.2).take(30) {
62 |         println!("Err: {}", name);
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/xtask/src/main.rs:
--------------------------------------------------------------------------------
 1 | mod extract_llfns;
 2 | 
 3 | use pico_args::Arguments;
 4 | use std::{error::Error, path::Path};
 5 | 
 6 | use crate::extract_llfns::extract_llfns;
 7 | 
 8 | fn main() -> Result<(), Box<dyn Error>> {
 9 |     let mut args = Arguments::from_env();
10 |     let sub = args.subcommand()?.unwrap_or_default();
11 | 
12 |     match sub.as_str() {
13 |         "extract_llfns" => {
14 |             let arg1 = args.free_from_str::<String>()?;
15 |             let file = Path::new(&arg1);
16 |             let arg2 = args.free_from_str::<String>()?;
17 |             let dir = Path::new(&arg2);
18 |             args.finish();
19 |             extract_llfns(file, dir);
20 |             Ok(())
21 |         }
22 |         _ => panic!("Unknown command, available: `extract_llfns`"),
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------