├── .gitattributes
├── .gitignore
├── .gitmodules
├── CHANGELOG.md
├── CMakeLists.txt
├── LICENSE.MD
├── README.md
├── assets
    └── data
    │   ├── disney.ns.bin
    │   ├── nvidia-logo.png
    │   └── slangpy-weights.json
├── docs
    ├── LibraryGuide.md
    ├── QuickStart.md
    ├── ShaderTraining.md
    ├── SimpleInferencing.md
    ├── SimpleTraining.md
    ├── SlangpyTraining.md
    ├── Tutorial.md
    ├── shader_training.png
    ├── simple_inferencing.png
    ├── simple_training.png
    ├── simple_training_trained.png
    └── slangpy_training.jpg
├── samples
    ├── CMakeLists.txt
    ├── ShaderTraining
    │   ├── CMakeLists.txt
    │   ├── Disney.slang
    │   ├── DisneyMLP.slang
    │   ├── NetworkConfig.h
    │   ├── ShaderTraining.cpp
    │   ├── computeOptimizer.slang
    │   ├── computeTraining.slang
    │   ├── renderDifference.slang
    │   ├── renderDisney.slang
    │   ├── renderInference.slang
    │   └── shaders.cfg
    ├── SimpleInferencing
    │   ├── CMakeLists.txt
    │   ├── NetworkConfig.h
    │   ├── SimpleInferencing.cpp
    │   ├── SimpleInferencing.slang
    │   └── shaders.cfg
    ├── SimpleTraining
    │   ├── CMakeLists.txt
    │   ├── NetworkConfig.h
    │   ├── SimpleTraining.cpp
    │   ├── SimpleTraining_Inference.slang
    │   ├── SimpleTraining_Optimizer.slang
    │   ├── SimpleTraining_Training.slang
    │   └── shaders.cfg
    └── SlangpyTraining
    │   ├── CMakeLists.txt
    │   ├── Helpers.py
    │   ├── NetworkConfig.h
    │   ├── NeuralModules.py
    │   ├── NeuralModules.slang
    │   ├── SlangpyInference.cpp
    │   ├── SlangpyInference.slang
    │   ├── SlangpyTraining.py
    │   ├── SlangpyTraining.slang
    │   ├── requirements.txt
    │   └── shaders.cfg
├── src
    ├── CMakeLists.txt
    ├── NeuralShading
    │   ├── CMakeLists.txt
    │   ├── CoopVector.cpp
    │   ├── CoopVector.h
    │   ├── Float16.cpp
    │   ├── Float16.h
    │   ├── GraphicsResources.cpp
    │   ├── GraphicsResources.h
    │   ├── NeuralNetwork.cpp
    │   ├── NeuralNetwork.h
    │   └── NeuralNetworkTypes.h
    ├── NeuralShading_Shaders
    │   ├── Activation.slang
    │   ├── CMakeLists.txt
    │   ├── CooperativeVectorAutoDiff.slang
    │   ├── CooperativeVectorDerivatives.slang
    │   ├── CooperativeVectorFunctions.slang
    │   ├── LinearOps.slang
    │   ├── Loss.slang
    │   ├── MLP.slang
    │   ├── Optimizers.slang
    │   ├── PCG32.slang
    │   └── Utils.slang
    └── Utils
    │   ├── CMakeLists.txt
    │   ├── DeviceUtils.cpp
    │   ├── DeviceUtils.h
    │   ├── DirectoryHelper.cpp
    │   ├── DirectoryHelper.h
    │   ├── GeometryUtils.cpp
    │   └── GeometryUtils.h
└── support
    └── cmake
        ├── ConfigureAgilitySDK.cmake
        ├── FetchDXCPreview.cmake
        └── FetchPrebuildBinary.cmake


/.gitattributes:
--------------------------------------------------------------------------------
 1 | external/slang/windows-x64/release/dxcompiler.dll filter=lfs diff=lfs merge=lfs -text
 2 | external/slang/windows-x64/release/slang.dll filter=lfs diff=lfs merge=lfs -text
 3 | external/slang/windows-x64/release/slangc.exe filter=lfs diff=lfs merge=lfs -text
 4 | external/slang/windows-x64/release/slangd.exe filter=lfs diff=lfs merge=lfs -text
 5 | external/slang/windows-x64/release/slang-glslang.dll filter=lfs diff=lfs merge=lfs -text
 6 | *.exe filter=lfs diff=lfs merge=lfs -text
 7 | *.pdb filter=lfs diff=lfs merge=lfs -text
 8 | *.dll filter=lfs diff=lfs merge=lfs -text
 9 | *.zip filter=lfs diff=lfs merge=lfs -text
10 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | build/*
 2 | /bin
 3 | /out
 4 | /.vscode
 5 | /.vs
 6 | /*.zip
 7 | external/slang/windows-x64/release/slang-stdlib.bin
 8 | /external/dx12-agility-sdk
 9 | /external/nvapi
10 | 
11 | 
12 | # Temp files from running Python sample.
13 | /.temp
14 | __pycache__/
15 | 
16 | # Generated shaders+weights from SlangPyTraining.
17 | samples/SlangpyTraining/trained_shaders.cfg
18 | samples/SlangpyTraining/weights.json
19 | /external/dx12-agility-sdk
20 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "donut"]
2 | 	path = external/donut
3 | 	url = https://github.com/NVIDIAGameWorks/donut
4 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # RTX Neural Shading Change Log
2 | 
3 | ## 1.1.0
4 | - Added DX12 cooperative vector support using Preview Agility SDK.
5 | - Moved matrix conversion to GPU.
6 | 
7 | ## 1.0.0
8 | 
9 | Initial release.


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
  3 | # 
  4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
  5 | # and proprietary rights in and to this software, related documentation
  6 | # and any modifications thereto. Any use, reproduction, disclosure or
  7 | # distribution of this software and related documentation without an express
  8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 | 
 10 | cmake_minimum_required(VERSION 3.10)
 11 | 
 12 | project(
 13 |     RtxNeuralShading
 14 |     DESCRIPTION "RTX Neural Shading"
 15 |     LANGUAGES CXX
 16 | )
 17 | 
 18 | set(CMAKE_CXX_STANDARD 20)
 19 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
 20 | set(CMAKE_CXX_EXTENSIONS ON)
 21 | 
 22 | option(ENABLE_DX12_COOP_VECTOR_PREVIEW "" OFF)
 23 | option(DONUT_WITH_DX11 "Not supported in this SDK" OFF)
 24 | option(DONUT_WITH_DX12 "DX12 is only supported with DX12_COOP_VECTOR_PREVIEW ON" OFF)
 25 | option(DONUT_WITH_VULKAN "" ON)
 26 | option(DONUT_WITH_STATIC_SHADERS "" ON)
 27 | 
 28 | # Register our path for CMake modules
 29 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/support/cmake")
 30 | 
 31 | if (MSVC)
 32 |     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /D_ITERATOR_DEBUG_LEVEL=1")
 33 | endif()
 34 | 
 35 | option(DONUT_WITH_ASSIMP "" OFF)
 36 | 
 37 | if(WIN32)
 38 | 	set(RTXNS_BINARY_DIR "${CMAKE_SOURCE_DIR}/bin/windows-x64" CACHE PATH "Output directory for the RTXNS build")
 39 | else()
 40 | 	set(RTXNS_BINARY_DIR "${CMAKE_SOURCE_DIR}/bin/linux-x64" CACHE PATH "Output directory for the RTXNS build")
 41 | endif()
 42 | 
 43 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG "${RTXNS_BINARY_DIR}")
 44 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL "${RTXNS_BINARY_DIR}")
 45 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE "${RTXNS_BINARY_DIR}")
 46 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO "${RTXNS_BINARY_DIR}")
 47 | 
 48 | set(SHADERMAKE_BIN_OUTPUT_PATH "${RTXNS_BINARY_DIR}/bin" CACHE STRING "Output directory for the ShaderMake executable")
 49 | set(DONUT_SHADERS_OUTPUT_DIR "${RTXNS_BINARY_DIR}/bin/shaders/framework")
 50 | 
 51 | # Get Slang
 52 | set(SLANG_VERSION "2025.10")
 53 | set(SLANG_URL_BASE "https://github.com/shader-slang/slang/releases/download/v${SLANG_VERSION}")
 54 | if(WIN32)
 55 |     set(SLANG_URL "${SLANG_URL_BASE}/slang-${SLANG_VERSION}-windows-x86_64.zip")
 56 | else()
 57 |     set(SLANG_URL "${SLANG_URL_BASE}/slang-${SLANG_VERSION}-linux-x86_64-glibc-2.17.tar.gz")
 58 | endif()
 59 | 
 60 | include("${CMAKE_CURRENT_SOURCE_DIR}/support/cmake/FetchPrebuildBinary.cmake")
 61 | download_package(slang ${SLANG_URL})
 62 | 
 63 | if (WIN32)
 64 | 	set(SLANGC_PATH "${slang_SOURCE_DIR}/bin/slangc.exe")
 65 | 	if (ENABLE_DX12_COOP_VECTOR_PREVIEW)
 66 | 		set(DONUT_WITH_DX12 ON)
 67 | 		set(NVRHI WITH_DX12 ON)
 68 | 	else()
 69 | 		# DX12 is only supported with DX12_COOP_VECTOR_PREVIEW
 70 | 		set(DONUT_WITH_DX12 OFF)
 71 | 		set(NVRHI WITH_DX12 OFF)
 72 | 	endif()
 73 | else()
 74 | 	set(SLANGC_PATH "${slang_SOURCE_DIR}/bin/slangc")
 75 | endif()
 76 | 
 77 | if (NOT SLANGC_PATH)
 78 |     message(FATAL_ERROR "Slang compiler not found - this is required for CoopVec support.")
 79 | else()
 80 |     message("Slang compiler found: ${SLANGC_PATH}")
 81 | endif()
 82 | 
 83 | if (DONUT_WITH_DX12)
 84 | 	# Get D3D Agility SDK Preview for Coop Vector support
 85 | 	set(D3D_AGILITY_SDK_PREVIEW_VERSION "1.717.0-preview")
 86 | 	set(DONUT_D3D_AGILITY_SDK_URL "https://www.nuget.org/api/v2/package/Microsoft.Direct3D.D3D12/${D3D_AGILITY_SDK_PREVIEW_VERSION}")
 87 | 	set(DONUT_D3D_AGILITY_SDK_FETCH_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/dx12-agility-sdk" CACHE STRING "" FORCE)
 88 | 	include("${CMAKE_CURRENT_SOURCE_DIR}/external/donut/cmake/FetchAgilitySDK.cmake")
 89 | 	include("${CMAKE_CURRENT_SOURCE_DIR}/support/cmake/ConfigureAgilitySDK.cmake")
 90 | 	
 91 | 	# Get DXC preview for SM6.9 support
 92 | 	set(DXC_PREVIEW_VERSION "1.8.2505.28")
 93 | 	set(DXC_PREVIEW_PATH "" CACHE STRING "Directory to fetch the DXC to, empty uses build directory default")
 94 | 	include("${CMAKE_CURRENT_SOURCE_DIR}/support/cmake/FetchDXCPreview.cmake")
 95 | 	
 96 | 	set(DXC_PATH "${DXC_PREVIEW_PATH}")
 97 | 	
 98 | 	# copy dxc to Slang
 99 | 	foreach(file_name IN ITEMS dxc.exe dxcompiler.dll dxil.dll)
100 | 		set(src "${DXC_PREVIEW_BIN_PATH}/${file_name}")
101 | 		set(dst "${slang_SOURCE_DIR}/bin/")
102 | 		if(EXISTS "${src}")
103 | 			configure_file("${src}" "${dst}" COPYONLY)
104 | 		else()
105 | 			message(WARNING "DXC binary not found: ${src}")
106 | 		endif()
107 | 	endforeach()
108 | endif()
109 | 
110 | add_subdirectory(external/donut)
111 | add_subdirectory(src)
112 | add_subdirectory(samples)
113 | set_property (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT SimpleInferencing)
114 | 
115 | file(WRITE "${CMAKE_SOURCE_DIR}/bin/slangc.bat" "${SLANGC_PATH} %*")
116 |     
117 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # RTX Neural Shading
 2 | 
 3 | RTX Neural Shading (RTXNS) also known as RTX Neural Shaders, is intended as a starting point for developers interested in bringing Machine Learning (ML) to their graphics applications. It provides a number of examples to help the reader understand how to train their own neural networks and then use those models to perform inference alongside their normal graphics rendering. 
 4 | 
 5 | RTXNS uses the [Slang](https://shader-slang.com) shading language and it utilizes either the DirectX Preview Agility SDK or the Vulkan Cooperative Vectors extension to provide access to the GPUs ML acceleration.
 6 | 
 7 | A number of examples are included which build upon each other from a simple inference example to more complex examples showing how to train a neural network to represent a shader or a texture. Helper functions to facilitate building your own neural networks are also included. 
 8 | 
 9 | Alongside the core samples is a SlangPy sample to demonstrate how to use python and SlangPy for fast iteration and development of neural networks which can then be integrated into RTXNS for inference. 
10 | 
11 | When exploring RTXNS, it is assumed that the reader is already familiar with ML and neural networks.
12 | 
13 | ## Requirements
14 | 
15 | ### General
16 | [CMake v3.24.3][CMake] **|** [VS 2022][VS22] **|** [Slang v2025.10](https://shader-slang.com/tools/)
17 | 
18 | ### DirectX
19 | [DirectX Preview Agility SDK 1.717.0-preview](https://www.nuget.org/packages/Microsoft.Direct3D.D3D12/1.717.0-preview) **|** [Microsoft DXC 1.8.2505.28](https://www.nuget.org/packages/Microsoft.Direct3D.DXC/1.8.2505.28) **|** [Shader Model 6-9-Preview Driver](https://developer.nvidia.com/downloads/shadermodel6-9-preview-driver) 
20 | 
21 | ### Vulkan
22 | GPU must support the Vulkan `VK_NV_cooperative_vector` extension (minimum NVIDIA RTX 20XX) **|** [Vulkan SDK 1.3.296.0](https://vulkan.lunarg.com/sdk/home) **|** Public Driver ≥ 572.16
23 | 
24 | ## Known Issues
25 | 05/30/2025: When updating from v1.0.0 to v1.1.0 is it recommended to delete the cmake cache to avoid build errors.
26 | 
27 | ## Project structure
28 | 
29 | | Directory                         | Details                                |
30 | | --------------------------------- | -------------------------------------- |
31 | | [/assets](assets)                 | _Asset files for samples_              |
32 | | [/docs](docs)                     | _Documentation for showcased tech_     |
33 | | [/samples](samples)               | _Samples showcasing usage of MLPs_     |
34 | | [/external/donut](external/donut) | _Framework used for the examples_      |
35 | | [/external](external)             | _Helper dependencies for the examples_ |
36 | | [/src](src)                       | _Helper and utility functions_         |
37 | 
38 | ## Getting started
39 | 
40 | - [Quick start guide](docs/QuickStart.md) for building and running the neural shading samples.
41 | - [Library usage guide](docs/LibraryGuide.md) for using helper functions
42 | 
43 | ### External Resources
44 | 
45 | This project uses [Slang](https://shader-slang.com) and the Vulkan CoopVector extensions. The following links provide more detail on these, and other technologies which may help the reader to better understand the relevant technologies, or just to provide further reading.
46 | 
47 | * [Slang User Guide](https://shader-slang.com/slang/user-guide/)
48 |   
49 |   * [Automatic Differentiation](https://shader-slang.com/slang/user-guide/autodiff.html)
50 | 
51 | * [SlangPy](https://slangpy.readthedocs.io/en/latest/) 
52 | 
53 | * [Vulkan `VK_NV_cooperative_vector` extension](https://registry.khronos.org/vulkan/specs/latest/man/html/VK_NV_cooperative_vector.html)
54 | 
55 | * [Donut](https://github.com/NVIDIAGameWorks/donut)
56 | 
57 | ## Contact
58 | 
59 | RTXNS is actively being developed. Please report any issues directly through the GitHub issue tracker, and for any information or suggestions contact us at rtxns-sdk-support@nvidia.com
60 | 
61 | ## Citation
62 | 
63 | Use the following BibTex entry to cite the usage of RTXNS in published research:
64 | 
65 | ```bibtex
66 | @online{RTXNS,
67 |    title   = {{{NVIDIA}}\textregistered{} {RTXNS}},
68 |    author  = {{NVIDIA}},
69 |    year    = 2025,
70 |    url     = {https://github.com/NVIDIA-RTX/RTXNS},
71 |    urldate = {2025-02-03},
72 | }
73 | ```
74 | 
75 | ## License
76 | 
77 | See [LICENSE.md](LICENSE.MD)
78 | 
79 | [VS22]: https://visualstudio.microsoft.com/thank-you-downloading-visual-studio/?sku=Community&channel=Release&version=VS2022&source=VSLandingPage&passive=false&cid=2030
80 | 
81 | [CMake]: https://github.com/Kitware/CMake/releases/download/v3.24.3/cmake-3.24.3-windows-x86_64.msi
82 | 


--------------------------------------------------------------------------------
/assets/data/disney.ns.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/assets/data/disney.ns.bin


--------------------------------------------------------------------------------
/assets/data/nvidia-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/assets/data/nvidia-logo.png


--------------------------------------------------------------------------------
/docs/QuickStart.md:
--------------------------------------------------------------------------------
 1 | # RTX Neural Shading: Quick Start Guide
 2 | 
 3 | ## Build steps
 4 | 
 5 | 1. Clone the project recursively:
 6 |    
 7 |    ```
 8 |    git clone --recursive https://github.com/NVIDIA-RTX/RTXNS
 9 |    ```
10 | 
11 | 2. Configure and then generate the solution using CMake GUI (or the CLI) by setting the repository root as _source_ and specifying a new _build_ directory in the root.
12 |    
13 |    ```
14 |    cd Rtxns
15 |    mkdir build
16 |    cd build
17 |    cmake ..
18 |    ```
19 |    To enable DX12 Cooperative Vector set the option `ENABLE_DX12_COOP_VECTOR_PREVIEW` on.
20 |    ```
21 |    cmake -DENABLE_DX12_COOP_VECTOR_PREVIEW=ON
22 |    ```
23 | 
24 | 3. Open build/RtxNeuralShading.sln in Visual Studio and build all projects, or build using the CMake CLI
25 |    
26 |    ```
27 |    cmake --build .
28 |    ```
29 | 
30 | 4. All of the binaries can be found in `\bin` such as
31 |    
32 |    ```
33 |    bin\Debug\SimpleInferencing.exe
34 | 
35 | 5. Each of the samples can be build and launched as either DX12 or Vulkan with the respective commandline: `-dx12` or `-vk` 
36 | 
37 | ## About
38 | 
39 | All of the samples are built using Slang and can be compiled to either DX12 or Vulkan using DirectX Preview Agility SDK or Vulkan Cooperative Vector extension respectively. 
40 | 
41 | - [DirectX Preview Agility SDK](https://devblogs.microsoft.com/directx/directx12agility/).
42 | - [Vulkan Cooperative Vector extension](https://registry.khronos.org/vulkan/specs/latest/man/html/VK_NV_cooperative_vector.html).
43 | 
44 | ## Driver Requirements
45 | - Use the DirectX Preview Agility SDK requires a shader model 6.9 preview [driver](https://developer.nvidia.com/downloads/shadermodel6-9-preview-driver)  
46 | - Vulkan Cooperative Vector extension requires a release [driver](https://www.nvidia.com/en-gb/geforce/drivers) from R570 onwards
47 | 
48 | ### Samples
49 | 
50 | | Sample Name                                | Output                                                                   | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                |
51 | | ------------------------------------------ | ------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
52 | | [Simple Inferencing](SimpleInferencing.md) | [<img src="simple_inferencing.png" width="800">](simple_inferencing.png) | This sample demonstrates how to implement an inference shader using some of the low-level building blocks from RTXNS. The sample loads a trained network from a file and uses the network to approximate a Disney BRDF shader. The sample is interactive; the light source can be rotated and various material parameters can be modified at runtime.                                                                                                      |
53 | | [Simple Training](SimpleTraining.md)       | [<img src="simple_training.png" width="800">](simple_training.png)       | This sample builds on the Simple Inferencing sample to provide an introduction to training a neural network for use in a shader. The network replicates a transformed texture.                                                                                                                                                                                                                                                                             |
54 | | [Shader Training](ShaderTraining.md)       | [<img src="shader_training.png" width="800">](shader_training.png)       | This sample extends the techniques shown in the Simple Training example and introduces Slangs AutoDiff functionality, via a full MLP (Multi Layered Perceptron) abstraction. The MLP is implemented using the `CoopVector` training code previously introduced and provides a simple interface for training networks with Slang. The sample creates a network and trains a model on the Disney BRDF shader that was used in the Simple Inferencing sample. |
55 | | [SlangPy Training](SlangpyTraining.md)     | [<img src="slangpy_training.jpg" width="800">](slangpy_training.jpg)     | This sample shows how to create and train network architectures in python using SlangPy. This lets you experiment with different networks, encodings and more using the building blocks from RTXNS, but without needing to change or rebuild C++ code. As a demonstration this sample instantiates multiple different network architectures and trains them side-by-side on the same data. It also shows one possible approach of exporting the network parameters and architecture to disk so it can be loaded in C++. |
56 | 
57 | ### Tutorial
58 | 
59 | * [Tutorial](Tutorial.md) 
60 |   A tutorial to help guide you to create your own neural shader based on the [Shader Training](ShaderTraining.md) example.
61 | 
62 | ### Library
63 | 
64 | * [Library](LibraryGuide.md) 
65 |   A guide to using the library / helper functions to create and manage your neural networks.
66 | 


--------------------------------------------------------------------------------
/docs/ShaderTraining.md:
--------------------------------------------------------------------------------
  1 | # RTX Neural Shading: Shader Training Example
  2 | 
  3 | ## Purpose
  4 | 
  5 | This sample extends on the techniques shown in the [Simple Training](SimpleTraining.md) example and introduces Slangs AutoDiff functionality, via a full MLP (Multi Layered Perceptron) abstraction. The MLP is implemented using the `CoopVector` training code previously introduced and provides a simple interface for training networks with Slang. The sample creates a network and trains a model on the Disney BRDF shader that was used in the [Simple Inferencing](SimpleInferencing.md) sample. 
  6 | 
  7 | ![Shader Training Output](shader_training.png)
  8 | 
  9 | When the executable is built and run, the output shows 3 images where the image on the left is sphere lit with the full Disney BRDF shader. the middle image is the same sphere lit with the trained neural network and the final image on the right shows the loss delta. There is a UI which allows some control of the material properties as well as buttons to pause and reset the training as well as to save/load the current network.
 10 | 
 11 | ## Training Flow
 12 | 
 13 | To create and train a neural network with RTXNS, several stages are needed which will be described in more detail below. This differs from the previous [Simple Training](SimpleTraining.md) example which had a specific compute shader pass for training and another for inference. In this example, the training and optimization passes are still compute based, but the inference is integrated into an existing pixel shader.
 14 | 
 15 | 1. Create the host side neural network storage and initialize it
 16 | 
 17 | 2. Create a device optimal layout and GPU buffer
 18 | 
 19 | 3. Convert the host layout network to the device optimal layout on the GPU
 20 | 
 21 | 4. Create auxiliary buffers for loss gradients and the optimizer pass
 22 | 
 23 | 5. Run batches of the training shader followed by the optimizer shader on random inputs adjusting for the loss at each epoch
 24 | 
 25 | 6. Render the sphere with the inference pixel shader to generate the output image
 26 | 
 27 | ## Network Configuration
 28 | 
 29 | The network details can be found in [NetworkConfig.h](../samples/ShaderTraining/NetworkConfig.h) and are configured as follows :
 30 | 
 31 | | Property       | Value   | Notes                                       |
 32 | | -------------- | ------- | ------------------------------------------- |
 33 | | Input Features | 5       | 5 input parameters                          |
 34 | | Input Neurons  | 30      | 5 input parameters encoded to 6 inputs each |
 35 | | Output Neurons | 4       | 4 BRDF values                               |
 36 | | Hidden Neurons | 32      |                                             |
 37 | | Hidden Layers  | 3       |                                             |
 38 | | Precision      | float16 |                                             |
 39 | 
 40 | ## Application Code
 41 | 
 42 | On the host, the setup of the neural network is quite simple and broadly similar to [Simple Training](SimpleTraining.md) so we shall only highlight the differences in this document.
 43 | 
 44 | ### Training Loop
 45 | 
 46 | After creating the appropriate pipelines and allocating the GPU buffers, the training loop is similar to the Simple Training example. The training and optimization passes are executed multiple times per frame (`g_trainingStepsPerFrame = 100`) to speed up the training time whilst also running the inference pass at a reasonable rate to see the model convergence. 
 47 | 
 48 | ```
 49 | for (int i = 0; i < g_trainingStepsPerFrame; ++i)
 50 | {
 51 |     nvrhi::ComputeState state;
 52 |     ...
 53 |     // Training pass
 54 |     state.bindings = { m_trainingPass.bindingSet };
 55 |     state.pipeline = m_trainingPass.pipeline;
 56 |     ...
 57 |     m_commandList->setComputeState(state);
 58 |     m_commandList->dispatch(m_batchSize / 64, 1, 1);
 59 |     ...
 60 |     // Optimizer pass
 61 |     state.bindings = { m_optimizerPass.bindingSet };
 62 |     state.pipeline = m_optimizerPass.pipeline;
 63 |     ...
 64 |     m_commandList->setComputeState(state);
 65 |     m_commandList->dispatch(div_ceil(m_totalParameterCount, 32), 1, 1);
 66 |     ...
 67 | }
 68 | ```
 69 | 
 70 | Some of the timer related queries have been removed from the code for ease of understanding.
 71 | 
 72 | After the training pass, the 2 spheres are rendered as expected, but using 2 different pipelines: `m_directPass` for the native Disney BRDF shader and `m_inferencePass` for the trained neural model.
 73 | 
 74 | ## Shader Code
 75 | 
 76 | The neural network in this sample is trying to encode the following :
 77 | 
 78 | ```
 79 | Disney(NdotL, NdotV, NdotH, LdotH, roughness);
 80 | ```
 81 | 
 82 | The shader code extends the concepts shown in the [Simple Training](SimpleTraining.md) example by using Slangs [AutoDiff](https://shader-slang.org/slang/user-guide/autodiff.html) feature to create a templated training class `TrainingMLP`, implemented in [MLP.slang](../src/NeuralShading_Shaders/MLP.slang), that can be used to help train your own models. Using the Autodiff features means we don't need to implement a full backwards pass containing all of the derivative activation functions as it is automatically derived for you.
 83 | 
 84 | The main 3 shaders are: [training](../samples/ShaderTraining/computeTraining.slang), [optimizer](../samples/ShaderTraining/computeOptimizer.slang) and [inference](../samples/ShaderTraining/renderInference.slang).
 85 | 
 86 | ### Training
 87 | 
 88 | The training shader starts by generating the random inputs and encoding them ready for passing to the neural network.
 89 | 
 90 | ```
 91 | //----------- Training step
 92 | float params[INPUT_FEATURES] = {NdotL, NdotV, NdotH, LdotH, roughness};
 93 | var inputParams = rtxns::EncodeFrequency<half, INPUT_FEATURES>(params);
 94 | ```
 95 | 
 96 | Next, the model is created and the inputs are passed to the model for the forwards pass.
 97 | 
 98 | ```
 99 | var model = rtxns::mlp::TrainingMLP<half, 
100 |     NUM_HIDDEN_LAYERS, INPUT_NEURONS, HIDDEN_NEURONS, OUTPUT_NEURONS, 
101 |     CoopVecMatrixLayout::TrainingOptimal, CoopVecComponentType::Float16>(
102 |     gMLPParams, 
103 |     gMLPParamsGradients, 
104 |     rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.weightOffsets),
105 |     rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.biasOffsets));
106 | ```
107 | 
108 | The `TrainingMLP` is heavily templated with lots of parameters, but the templated parameters consist of :
109 | 
110 | * Number of hidden layers
111 | * Number of input neurons
112 | * Number of neurons per hidden layer
113 | * Number of output neurons
114 | * Matrix layout
115 | * Precision
116 | 
117 | The non templated parameters consist of :
118 | 
119 | * Weight/Bias buffer
120 | * Gradient buffer
121 | * Weight offsets per layer
122 | * Bias offsets per layer
123 | 
124 | Once the model has been created, executing the forward pass is trivial and involves assigning templated activation functions to the forward pass before passing the input parameters in. The detailed implementation is described in the [Library Guide](LibraryGuide.md)
125 | 
126 | ```
127 | var hiddenActivation = rtxns::mlp::ReLUAct<half, HIDDEN_NEURONS>();
128 | var finalActivation = rtxns::mlp::ExponentialAct<half, OUTPUT_NEURONS>();
129 | 
130 | var outputParams = model.forward(inputParams, hiddenActivation, finalActivation);
131 | ```
132 | 
133 | To generate the loss gradient, this example uses the `L2Relative` derivative function of the output of the actual disney BRDF shader and the output of the forward pass.
134 | 
135 | ```
136 | float4 predictedDisney = { outputParams[0], outputParams[1], outputParams[2], outputParams[3] };
137 | 
138 | float4 lossGradient = rtxns::mlp::L2Relative<float, 4>.deriv(actualDisney, predictedDisney, float4(LOSS_SCALE / (gConst.batchSize * 4)) * COMPONENT_WEIGHTS);
139 | ```
140 | 
141 | Finally, the loss gradient along with the input vector are passed through the models backward propagation function to update the gradient parameters.
142 | 
143 | ```
144 | model.backward(inputParams, hiddenActivation, finalActivation, rtxns::HCoopVec<OUTPUT_NEURONS>(lossGradient[0], lossGradient[1], lossGradient[2], lossGradient[3]));
145 | ```
146 | 
147 | ### Optimizer
148 | 
149 | The optimizer shader is no different to the one used in the [Simple Training](SimpleTraining.md) example.
150 | 
151 | ```
152 | void adam_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
153 | {
154 |     uint i = dispatchThreadID.x;
155 |     if (i >= maxParamSize)
156 |         return;
157 | 
158 |     float gradient = (float)gMLPParamsGradients[i];
159 |     gMLPParamsGradients[i] = half(0.0);
160 | 
161 |     float weightbias = gMLPParams32[i];
162 | 
163 |     optimizers::Adam optimizer = optimizers::Adam(gMoments1, gMoments2, learningRate, LOSS_SCALE);
164 | 
165 |     float adjustedWeightbias = optimizer.step(weightbias, i, gradient, currentStep);
166 | 
167 |     gMLPParams32[i] = adjustedWeightbias;
168 |     gMLPParams[i] = (half)adjustedWeightbias;
169 | }
170 | ```
171 | 
172 | ### Inference
173 | 
174 | The inference pass is nearly identical to the forward pass of the training shader. It currently uses `CoopVecMatrixLayout::TrainingOptimal` layout as it is run directly after a batch of training without converting the weights to an inference layout, but the default layout for inference should be `CoopVecMatrixLayout::InferencingOptimal`.
175 | 
176 | ```
177 | float4 DisneyMLP<let HIDDEN_LAYERS : int, let HIDDEN_NEURONS : int>(
178 |     float NdotL, float NdotV, float NdotH, float LdotH, float roughness, ByteAddressBuffer mlpBuffer,
179 |     uint weightOffsets[HIDDEN_LAYERS+1], uint biasOffsets[HIDDEN_LAYERS+1])
180 | {
181 |     // Calculate approximated core shader part using MLP
182 |     float params[INPUT_FEATURES] = { NdotL, NdotV, NdotH, LdotH, roughness };
183 | 
184 |     var inputParams = rtxns::EncodeFrequency<half, INPUT_FEATURES>(params);
185 | 
186 |     var model = rtxns::mlp::InferenceMLP<half, 
187 |         HIDDEN_LAYERS, 
188 |         INPUT_FEATURES * FREQUENCY_EXPANSION, 
189 |         HIDDEN_NEURONS, 
190 |         OUTPUT_NEURONS, 
191 |         CoopVecMatrixLayout::TrainingOptimal, 
192 |         CoopVecComponentType::Float16>
193 |         (mlpBuffer, weightOffsets, biasOffsets);
194 | 
195 |     var outputParams = model.forward(inputParams, rtxns::mlp::ReLUAct<half, HIDDEN_NEURONS>(), rtxns::mlp::ExponentialAct<half, OUTPUT_NEURONS>());
196 |     return float4(outputParams[0], outputParams[1], outputParams[2], outputParams[3]);
197 | }
198 | ```
199 | 


--------------------------------------------------------------------------------
/docs/Tutorial.md:
--------------------------------------------------------------------------------
 1 | # RTX Neural Shading: How to Write Your First Neural Shader
 2 | 
 3 | ## Purpose
 4 | 
 5 | Using  [Shader Training](ShaderTraining.md) as the basis of this tutorial, we will briefly discuss an approach to writing your first neural shader.
 6 | 
 7 | The main areas we will focus on are :
 8 | 
 9 | 1. Extracting the key features from the shader to be trained
10 | 
11 | 2. Modifying the network configuration
12 | 
13 | 3. Modifying the activation and loss functions
14 | 
15 | It is outside the scope of this document to discuss how AI training and optimization works and instead we will focus on modifying the existing sample to configure and train the network with different content.
16 | 
17 | ## Extracting the Key Features for Training Input
18 | 
19 | When implementing the Disney BRDF for use in the [Shader Training](ShaderTraining.md) example, the first task was feature extraction. Which features from the shader should be inferred from the network and which should be calculated to ensure the network is not over specialized or overly complex. The network for the Disney BRDF takes inputs such as the `view`, `light` and `normal` vectors as well as  `material roughness`. Other variables, such as `light intensity`, `material metallicness` and various `material color` components have been left as part of the shader. This is a balancing act which may require some experimentation.
20 | 
21 | Once the key features are identified as potential training inputs, look to optimize them where possible by reducing their form and scaling them to be in the range `0-1` or `-1 - 1` which is preferred by networks. In the Disney BRDF, this was done by recognizing that the input vectors where always normalized and used in their dot product form, so the inputs were reduced from 3 `float3` vectors, to 4 `float` dot products.
22 | 
23 | Next, the network inputs may benefit from encoding which research has shown to improve the performance of the network. The library provides 2 encoders, `EncodeFrequency` and `EncodeTriangle` which encode the inputs into some form of wave function. The shader training example uses the frequency encoder which increases the number of inputs by a factor of 6 but provides a better network as a result. You should experiment with encoders to find the one suitable for your dataset.
24 | 
25 | At this point, you should know the number of (encoded) input parameters and output parameters, so it is time to configure the network.
26 | 
27 | ## Modifying the Network Configuration
28 | 
29 | The network configuration is stored in [NetworkConfig.h](../samples/ShaderTraining/NetworkConfig.h), and may require modification. Some elements are fixed for your dataset, like the input and output neuron counts and others are available for configuration. In the provided samples, the configuration is hard-coded for ease of understanding, but in a production system they are fully expected to be a configurable part of the training pipeline.
30 | 
31 | These are fixed configuration parameters that are directly tied to the shader you are trying to train from :
32 | 
33 | - `INPUT_NEURONS` should equal the number of encoded input parameters from above that are directly passed into the network.
34 | 
35 | - `OUTPUT_NEURONS` should equal the output parameters that the network generates. This may be an RGB triple, or just a number of unconnected outputs like for the DisneyBRDF.
36 |   
37 | The following parameters are available for experimentation and should be modified to find suitable settings for the network you are trying to train :
38 | 
39 | - `NUM_HIDDEN_LAYERS` - The number of hidden layers that make up the network.
40 | 
41 | - `HIDDEN_NEURONS` - The number of neurons in the hidden layers of the network. Changing this can make significant differences to the accuracy and cost of your network.
42 | 
43 | - `LEARNING_RATE` - This should be tuned to improve convergence of your model.
44 |   
45 | In future versions of the library, precision of the neurons may be alterable which could change the quality and performance of the network. The current version is fixed to `float16`.
46 | 
47 | Changing any of these parameters should not require any further code changes as the defines are shared amongst the C++ and shader code; they will just require a re-compile.  The exception may be when changing the size of input/output `CoopVecs`  and any code that dereferences their elements directly, such as :
48 | 
49 | ```
50 | float4 predictedDisney = { outputParams[0], outputParams[1], outputParams[2], outputParams[3] };
51 | ```
52 | 
53 | As always, experimentation will be required to find the right set of configuration parameters for the optimal training of your network.
54 | 
55 | ## Modifying the Activation and Loss Functions
56 | 
57 | The Simple Shading example uses the `TrainingMLP` which abstracts much of the training shader code for the user :
58 | 
59 | ```
60 | var model = rtxns::mlp::TrainingMLP<half, 
61 |     NUM_HIDDEN_LAYERS, INPUT_NEURONS, HIDDEN_NEURONS, OUTPUT_NEURONS, 
62 |     CoopVecMatrixLayout::TrainingOptimal, CoopVecComponentType::Float16>(
63 |     gMLPParams, 
64 |     gMLPParamsGradients, 
65 |     rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.weightOffsets),
66 |     rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.biasOffsets));
67 | 
68 | var hiddenActivation = rtxns::mlp::ReLUAct<half, HIDDEN_NEURONS>();
69 | var finalActivation = rtxns::mlp::ExponentialAct<half, OUTPUT_NEURONS>();
70 | 
71 | var outputParams = model.forward(inputParams, hiddenActivation, finalActivation);
72 | ```
73 | 
74 | The activation functions are passed into the models forward and backward pass (`ReLUAct` and `ExponentialAct`) for use with the `TrainingMLP` and `InferenceMLP`. These can be found in [CooperativeVectorFunctions.slang](../src/NeuralShading_Shaders/CooperativeVectorFunctions.slang) and extended as necessary. The current version of RTXNS provides a limited set of activation functions, but these can be examined and modified to support more activation functions as required.
75 | 
76 | The choice of loss function to use will be dependent on your dataset. The Simple Training example uses a simple L2 loss function whereas the Shader Training example uses a more complex L2 relative loss function. Any loss function can be trivially coded in slang to help tune your shader.
77 | 
78 | ## Hyper Parameters
79 | 
80 | These are some of the hyper parameters that are available for tuning for your dataset.
81 | 
82 | | Parameter                   | Value            |
83 | | --------------------------- | ---------------- |
84 | | HIDDEN_NEURONS              | 32               |
85 | | NUM_HIDDEN_LAYERS           | 3                |
86 | | LEARNING_RATE               | 1e-2f            |
87 | | BATCH_SIZE                  | (1 << 16)        |
88 | | BATCH_COUNT                 | 100              |
89 | | Hidden Activation Functions | ReLUAct()        |
90 | | Final Activation Functions  | ExponentialAct() |
91 | | Loss Function               | L2Relative()     |
92 | 
93 | ## Summary
94 | 
95 | The Shader Training sample is a good place to start to train your own neural shader. It will require some thought as to how to decompose your shader into network inputs and shader inputs and then the network can be re-configured through experimentation to find the suitable model that can handle your dataset.
96 | 


--------------------------------------------------------------------------------
/docs/shader_training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/shader_training.png


--------------------------------------------------------------------------------
/docs/simple_inferencing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/simple_inferencing.png


--------------------------------------------------------------------------------
/docs/simple_training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/simple_training.png


--------------------------------------------------------------------------------
/docs/simple_training_trained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/simple_training_trained.png


--------------------------------------------------------------------------------
/docs/slangpy_training.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/slangpy_training.jpg


--------------------------------------------------------------------------------
/samples/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | 
10 | add_subdirectory(SimpleInferencing)
11 | add_subdirectory(ShaderTraining)
12 | add_subdirectory(SimpleTraining)
13 | add_subdirectory(SlangpyTraining)
14 | 


--------------------------------------------------------------------------------
/samples/ShaderTraining/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | 
10 | include(../../external/donut/compileshaders.cmake)
11 | 
12 | set(shader_includes
13 | 	${SAMPLES_SHADER_INCLUDE_DIR}
14 | 	${CMAKE_CURRENT_LIST_DIR}
15 | )
16 | 
17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" )
18 | 
19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " )
20 | 
21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " )
22 | 
23 | set(project ShaderTraining)
24 | set(folder "Samples/ShaderTraining")
25 | 
26 | file(GLOB ${project}_shaders "*.slang")
27 | file(GLOB ${project}_sources "*.cpp" "*.h")
28 | 
29 | donut_compile_shaders_all_platforms(
30 |     TARGET ${project}_shaders
31 |     CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg
32 | 	INCLUDES ${shader_includes}
33 |     FOLDER ${folder}
34 |     OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project}
35 | 	SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS}
36 |     SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV}
37 | 	SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL}
38 |     SOURCES ${${project}_shaders}
39 |     SLANG
40 | )
41 | 
42 | add_executable(${project} WIN32 ${${project}_sources})
43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils)
44 | add_dependencies(${project} ${project}_shaders)
45 | set_target_properties(${project} PROPERTIES FOLDER ${folder})
46 | 
47 | if (MSVC)
48 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP")
49 | endif()


--------------------------------------------------------------------------------
/samples/ShaderTraining/Disney.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | //----------- Core part of the shader
12 | 
13 | const static float PI = 3.14159265358979323846;
14 | 
15 | float SchlickFresnel(float u)
16 | {
17 |     float m = clamp(1 - u, 0, 1);
18 |     float m2 = m * m;
19 |     return m2 * m2 * m; // pow(m,5)
20 | }
21 | 
22 | float Gtr1(float NdotH, float a)
23 | {
24 |     if (a >= 1)
25 |     {
26 |         return 1 / PI;
27 |     }
28 |     float a2 = a * a;
29 |     float t = 1 + (a2 - 1) * NdotH * NdotH;
30 |     return (a2 - 1) / (PI * log(a2) * t);
31 | }
32 | 
33 | float Gtr2(float NdotH, float ax)
34 | {
35 |     float a = ax * (1 / ax / ax * (1 - NdotH * NdotH) + NdotH * NdotH);
36 |     return 1 / (PI * a * a);
37 | }
38 | 
39 | float SmithGGX(float NdotV, float alphaG)
40 | {
41 |     float a = alphaG * alphaG;
42 |     float b = NdotV * NdotV;
43 |     return 1 / (NdotV + sqrt(a + b - a * b));
44 | }
45 | 
46 | float SmithGGXAnisotropy(float NdotV, float ax)
47 | {
48 |     return 1 / (NdotV + sqrt(ax * ax * (1 - NdotV * NdotV) + NdotV * NdotV));
49 | }
50 | 
51 | float4 Disney(float NdotL, float NdotV, float NdotH, float LdotH, float roughness)
52 | {
53 |     float FL = SchlickFresnel(NdotL), FV = SchlickFresnel(NdotV);
54 |     float Fss90 = LdotH * LdotH * roughness;
55 |     float Fss = lerp(1.0f, Fss90, FL) * lerp(1.0f, Fss90, FV);
56 |     float ss = 1.25f * (Fss * (1.f / (NdotL + NdotV) - .5f) + .5f);
57 | 
58 |     // specular
59 |     float ax = max(.001f, roughness * roughness);
60 |     float Ds = Gtr2(NdotH, ax);
61 |     float FH = SchlickFresnel(LdotH);
62 |     float Gs = SmithGGXAnisotropy(NdotL, ax);
63 |     Gs *= SmithGGXAnisotropy(NdotV, ax);
64 | 
65 |     // clearcoat (ior = 1.5 -> F0 = 0.04)
66 |     float Dr = Gtr1(NdotH, .01f);
67 |     float Fr = lerp(.04f, 1.0f, FH);
68 |     float Gr = SmithGGX(NdotL, .25f) * SmithGGX(NdotV, .25f);
69 | 
70 |     return float4((1 / PI) * ss, Gs * Ds, FH, .25 * Gr * Fr * Dr);
71 | }
72 | 


--------------------------------------------------------------------------------
/samples/ShaderTraining/DisneyMLP.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | import MLP;
12 | import CooperativeVectorFunctions;
13 | import Activation;
14 | import Utils;
15 | 
16 | // 5 inputs are passed into this function; NdotL, NdotV, NdotH, LdotH, roughness
17 | #define INPUT_FEATURES 5 
18 | 
19 | // The output is float4
20 | #define OUTPUT_NEURONS 4
21 | 
22 | // EncodeFrequency expands the input by 6 per input feature
23 | #define FREQUENCY_EXPANSION 6
24 | 
25 | float4 DisneyMLP<let HIDDEN_LAYERS : int, let HIDDEN_NEURONS : int>(
26 |     float NdotL, float NdotV, float NdotH, float LdotH, float roughness, ByteAddressBuffer mlpBuffer,
27 |     uint weightOffsets[HIDDEN_LAYERS+1], uint biasOffsets[HIDDEN_LAYERS+1])
28 | {
29 |     // Calculate approximated core shader part using MLP
30 |     float params[INPUT_FEATURES] = { NdotL, NdotV, NdotH, LdotH, roughness };
31 | 
32 |     var inputParams = rtxns::EncodeFrequency<half, INPUT_FEATURES>(params);
33 | 
34 |     var model = rtxns::mlp::InferenceMLP<half, 
35 |         HIDDEN_LAYERS, 
36 |         INPUT_FEATURES * FREQUENCY_EXPANSION, 
37 |         HIDDEN_NEURONS, 
38 |         OUTPUT_NEURONS, 
39 |         CoopVecMatrixLayout::TrainingOptimal, 
40 |         CoopVecComponentType::Float16>
41 |         (mlpBuffer, weightOffsets, biasOffsets);
42 | 
43 |     var outputParams = model.forward(inputParams, rtxns::mlp::ReLUAct<half, HIDDEN_NEURONS>(), rtxns::mlp::ExponentialAct<half, OUTPUT_NEURONS>());
44 |     return float4(outputParams[0], outputParams[1], outputParams[2], outputParams[3]);
45 | }
46 | 


--------------------------------------------------------------------------------
/samples/ShaderTraining/NetworkConfig.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #define INPUT_FEATURES 5
12 | #define INPUT_NEURONS (INPUT_FEATURES * 6) // 6* from Frequency Encoding
13 | #define OUTPUT_NEURONS 4
14 | 
15 | #define HIDDEN_NEURONS 32
16 | #define NUM_HIDDEN_LAYERS 3
17 | #define BATCH_SIZE (1 << 16)
18 | #define BATCH_COUNT 100
19 | 
20 | #define LEARNING_RATE 1e-2f
21 | #define COMPONENT_WEIGHTS float4(1.f, 10.f, 1.f, 5.f)
22 | 
23 | #define NUM_TRANSITIONS (NUM_HIDDEN_LAYERS + 1)
24 | #define NUM_TRANSITIONS_ALIGN4 ((NUM_TRANSITIONS + 3) / 4)
25 | #define LOSS_SCALE 128.0
26 | 
27 | struct DirectConstantBufferEntry
28 | {
29 |     // Scene setup
30 |     float4x4 viewProject;
31 |     float4x4 view;
32 |     float4 cameraPos;
33 | 
34 |     // Light setup
35 |     float4 lightDir;
36 |     float4 lightIntensity;
37 | 
38 |     // Material props
39 |     float4 baseColor;
40 |     float specular = 0;
41 |     float roughness = 0;
42 |     float metallic = 0;
43 | 
44 |     // Alignment
45 |     float pad = 0;
46 | };
47 | 
48 | struct InferenceConstantBufferEntry : DirectConstantBufferEntry
49 | {
50 |     // Neural weight & bias offsets
51 |     uint4 weightOffsets[NUM_TRANSITIONS_ALIGN4];
52 |     uint4 biasOffsets[NUM_TRANSITIONS_ALIGN4];
53 | };
54 | 
55 | struct TrainingConstantBufferEntry
56 | {
57 |     uint4 weightOffsets[NUM_TRANSITIONS_ALIGN4];
58 |     uint4 biasOffsets[NUM_TRANSITIONS_ALIGN4];
59 |     uint32_t maxParamSize;
60 |     float learningRate;
61 |     float currentStep;
62 |     uint32_t batchSize;
63 |     uint64_t seed;
64 | };
65 | 


--------------------------------------------------------------------------------
/samples/ShaderTraining/computeOptimizer.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | import Optimizers;
15 | 
16 | DECLARE_CBUFFER(TrainingConstantBufferEntry, gConst, 0, 0);
17 | RWBuffer<half> gMLPParams             : REGISTER_UAV(0, 0);
18 | RWBuffer<float> gMLPParams32          : REGISTER_UAV(1, 0);
19 | RWBuffer<half> gMLPParamsGradients    : REGISTER_UAV(2, 0);
20 | RWBuffer<float> gMoments1             : REGISTER_UAV(3, 0);
21 | RWBuffer<float> gMoments2             : REGISTER_UAV(4, 0);
22 | 
23 | [numthreads(32, 1, 1)]
24 | void adam_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
25 | {
26 |     uint i = dispatchThreadID.x;
27 |     if (i >= gConst.maxParamSize)
28 |         return;
29 | 
30 |     float gradient = (float)gMLPParamsGradients[i];
31 |     gMLPParamsGradients[i] = half(0.0);
32 | 
33 |     if (isfinite(gradient))
34 |     {
35 |         float weightbias = gMLPParams32[i];
36 | 
37 |         optimizers::Adam optimizer = optimizers::Adam(gMoments1, gMoments2, gConst.learningRate, LOSS_SCALE);
38 | 
39 |         float adjustedWeightbias = optimizer.step(weightbias, i, gradient, gConst.currentStep);
40 | 
41 |         gMLPParams32[i] = adjustedWeightbias;
42 |         gMLPParams[i] = (half) adjustedWeightbias;
43 |     }
44 | }
45 | 


--------------------------------------------------------------------------------
/samples/ShaderTraining/computeTraining.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | import CooperativeVectorAutoDiff;
15 | import CooperativeVectorFunctions;
16 | import Utils;
17 | import Activation;
18 | import MLP;
19 | import Loss;
20 | import PCG32;
21 | import Disney;
22 | 
23 | DECLARE_CBUFFER(TrainingConstantBufferEntry, gConst, 0, 0);
24 | ByteAddressBuffer gMLPParams               : REGISTER_SRV(0, 0);
25 | RWByteAddressBuffer gMLPParamsGradients    : REGISTER_UAV(0, 0);
26 | 
27 | [shader("compute")]
28 | [numthreads(64, 1, 1)] 
29 | void main_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
30 | {
31 |     //----------- Randomly generate input parameters
32 |     uint idx = dispatchThreadID.x;
33 |     PCG32 rng = PCG32(gConst.seed, idx);
34 | 
35 |     // Using tangent coordinate system. N = (0,0,1)
36 |     // L is arbitrary, but (N,L) >= 0 => L.z > 0, so generate random L in XZ plane's first quadrant
37 |     float3 L;
38 |     L.y = 0.f;
39 |     sincos(rng.nextFloat()*PI/2, L.z, L.x);
40 | 
41 |     // V is random direction, but (N,V) >= 0 => V.z > 0
42 |     float sa, ca; // Azimuth [-PI/2,PI/2]
43 |     sincos(-PI + 2 * PI * rng.nextFloat(), sa, ca);
44 |     float se, ce; // Elevation [0,PI/2]
45 |     sincos(PI/2 * rng.nextFloat(), se, ce);
46 |     float3 V = float3(ce*ca, ce*sa, se);
47 | 
48 |     float NdotL = L.z;
49 |     float NdotV = V.z;
50 | 
51 |     float3 H = normalize(L+V);
52 |     float NdotH = H.z;
53 |     float LdotH = dot(L,H);
54 | 
55 |     float roughness = rng.nextFloat()*0.7f+0.3f;
56 | 
57 |     //----------- Calculate core shader part DIRECTLY
58 |     float4 actualDisney = Disney(NdotL, NdotV, NdotH, LdotH, roughness);
59 | 
60 |     //----------- Training step
61 |     float params[INPUT_FEATURES] = {NdotL, NdotV, NdotH, LdotH, roughness};
62 |     var inputParams = rtxns::EncodeFrequency<half, INPUT_FEATURES>(params);
63 | 
64 |     var model = rtxns::mlp::TrainingMLP<half, 
65 |         NUM_HIDDEN_LAYERS, INPUT_NEURONS, HIDDEN_NEURONS, OUTPUT_NEURONS, 
66 |         CoopVecMatrixLayout::TrainingOptimal, CoopVecComponentType::Float16>(
67 |         gMLPParams, 
68 |         gMLPParamsGradients, 
69 |         rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.weightOffsets), 
70 |         rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.biasOffsets));
71 | 
72 |     var hiddenActivation = rtxns::mlp::ReLUAct<half, HIDDEN_NEURONS>();
73 |     var finalActivation = rtxns::mlp::ExponentialAct<half, OUTPUT_NEURONS>();
74 | 
75 |     var outputParams = model.forward(inputParams, hiddenActivation, finalActivation);
76 | 
77 |     float4 predictedDisney = { outputParams[0], outputParams[1], outputParams[2], outputParams[3] };
78 | 
79 |     float4 lossGradient = rtxns::mlp::L2Relative<float, 4>.deriv(actualDisney, predictedDisney, float4(LOSS_SCALE / (gConst.batchSize * 4)) * COMPONENT_WEIGHTS);
80 |    
81 |     model.backward(inputParams, hiddenActivation, finalActivation, rtxns::HCoopVec<OUTPUT_NEURONS>(lossGradient[0], lossGradient[1], lossGradient[2], lossGradient[3]));
82 | }
83 | 


--------------------------------------------------------------------------------
/samples/ShaderTraining/renderDifference.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | import CooperativeVectorAutoDiff;
15 | import CooperativeVectorFunctions;
16 | import Utils;
17 | import Activation;
18 | import MLP;
19 | import Disney;
20 | import DisneyMLP;
21 | 
22 | DECLARE_CBUFFER(InferenceConstantBufferEntry, gConst, 0, 0);
23 | ByteAddressBuffer gMLPParams : REGISTER_SRV(0, 0);
24 | 
25 | struct PS_INPUT
26 | {
27 |     float4 pos : SV_Position;
28 |     float3 norm : NORMAL;
29 |     float3 view : VIEW;
30 | }
31 | 
32 | [shader("vertex")]
33 | void main_vs(
34 |     float3 i_pos : POSITION, 
35 |     float3 i_norm : NORMAL, 
36 |     out PS_INPUT output)
37 | {
38 |     output.pos = mul(float4(i_pos, 1), gConst.viewProject);
39 |     output.norm = i_norm;
40 |     output.view = gConst.cameraPos.xyz - i_pos;
41 | }
42 | 
43 | float3 calcColor(float4 params)
44 | {
45 |     float3 Cdlin = pow(gConst.baseColor.rgb, 2.2);
46 |     float3 Cspec0 = lerp(gConst.specular * float3(.08f), Cdlin, gConst.metallic);
47 |     float3 brdfn = params.x * Cdlin * (1 - gConst.metallic) + params.y * lerp(Cspec0, float3(1), params.z) + params.w;
48 |     return clamp(brdfn * gConst.lightIntensity.rgb, 0, 1);
49 | }
50 | 
51 | [shader("fragment")]
52 | void main_ps(
53 |     PS_INPUT input,
54 |     out float4 o_color : SV_Target0)
55 | {
56 |     // Prepare input parameters
57 |     float3 view = normalize(input.view);
58 |     float3 norm = normalize(input.norm);
59 |     float3 h = normalize(-gConst.lightDir.xyz + view);
60 | 
61 |     float NdotL = max(0.f, dot(norm, -gConst.lightDir.xyz));
62 |     float NdotV = max(0.f, dot(norm, view));
63 |     float NdotH = max(0.f, dot(norm, h));
64 |     float LdotH = max(0.f, dot(h, -gConst.lightDir.xyz));
65 | 
66 |     //----------- Calculate core shader part DIRECTLY
67 |     float4 actualDisney = Disney(NdotL, NdotV, NdotH, LdotH, gConst.roughness);
68 | 
69 |     // Calculate approximated core shader
70 |     float4 outParams = DisneyMLP<NUM_HIDDEN_LAYERS, HIDDEN_NEURONS>(
71 |         NdotL, NdotV, NdotH, LdotH, gConst.roughness,
72 |         gMLPParams,
73 |         rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.weightOffsets),
74 |         rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.biasOffsets)
75 |     );
76 | 
77 |     o_color = float4((calcColor(actualDisney) - calcColor(outParams)) * NdotL * 4 + 0.5, 1.f);
78 | }
79 | 


--------------------------------------------------------------------------------
/samples/ShaderTraining/renderDisney.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | DECLARE_CBUFFER(DirectConstantBufferEntry, gConst, 0, 0);
15 | 
16 | struct PS_INPUT
17 | {
18 |     float4 pos : SV_Position;
19 |     float3 norm : NORMAL;
20 |     float3 view : VIEW;
21 | }
22 | 
23 | [shader("vertex")]
24 | void main_vs(
25 |     float3 i_pos : POSITION, 
26 |     float3 i_norm : NORMAL, 
27 |     out PS_INPUT output)
28 | {
29 |     output.pos = mul(float4(i_pos, 1), gConst.viewProject);
30 |     output.norm = i_norm;
31 |     output.view = gConst.cameraPos.xyz - i_pos;
32 | }
33 | 
34 | import Disney;
35 | 
36 | [shader("fragment")]
37 | void main_ps(
38 |     PS_INPUT input,
39 |     out float4 o_color : SV_Target0)
40 | {
41 |     //----------- Prepare input parameters
42 |     float3 view = normalize(input.view);
43 |     float3 norm = normalize(input.norm);
44 |     float3 h = normalize(-gConst.lightDir.xyz + view);
45 | 
46 |     float NdotL = max(0.f, dot(norm, -gConst.lightDir.xyz));
47 |     float NdotV = max(0.f, dot(norm, view));
48 |     float NdotH = max(0.f, dot(norm, h));
49 |     float LdotH = max(0.f, dot(h, -gConst.lightDir.xyz));
50 | 
51 |     //----------- Calculate core shader part DIRECTLY
52 |     float4 outParams = Disney(NdotL, NdotV, NdotH, LdotH, gConst.roughness);
53 | 
54 |     //----------- Calculate final color
55 |     float3 Cdlin = float3(pow(gConst.baseColor[0], 2.2), pow(gConst.baseColor[1], 2.2), pow(gConst.baseColor[2], 2.2));
56 |     float3 Cspec0 = lerp(gConst.specular * .08 * float3(1), Cdlin, gConst.metallic);
57 |     float3 brdfn = outParams.x * Cdlin * (1 - gConst.metallic) + outParams.y * lerp(Cspec0, float3(1), outParams.z) + outParams.w;
58 |     float3 colorh = brdfn * float3(NdotL) * gConst.lightIntensity.rgb;
59 | 
60 |     o_color = float4(colorh, 1.f);
61 | }
62 | 


--------------------------------------------------------------------------------
/samples/ShaderTraining/renderInference.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | import CooperativeVectorAutoDiff;
15 | import CooperativeVectorFunctions;
16 | import Utils;
17 | import Activation;
18 | import MLP;
19 | import DisneyMLP;
20 | 
21 | DECLARE_CBUFFER(InferenceConstantBufferEntry, gConst, 0, 0);
22 | ByteAddressBuffer gMLPParams : REGISTER_SRV(0, 0);
23 | 
24 | struct PS_INPUT
25 | {
26 |     float4 pos : SV_Position;
27 |     float3 norm : NORMAL;
28 |     float3 view : VIEW;
29 | }
30 | 
31 | [shader("vertex")]
32 | void main_vs(
33 |     float3 i_pos : POSITION, 
34 |     float3 i_norm : NORMAL, 
35 |     out PS_INPUT output)
36 | {
37 |     output.pos = mul(float4(i_pos, 1), gConst.viewProject);
38 |     output.norm = i_norm;
39 |     output.view = gConst.cameraPos.xyz - i_pos;
40 | }
41 | 
42 | [shader("fragment")]
43 | void main_ps(
44 |     PS_INPUT input,
45 |     out float4 o_color : SV_Target0)
46 | {
47 |     // Prepare input parameters
48 |     float3 view = normalize(input.view);
49 |     float3 norm = normalize(input.norm);
50 |     float3 h = normalize(-gConst.lightDir.xyz + view);
51 | 
52 |     float NdotL = max(0.f, dot(norm, -gConst.lightDir.xyz));
53 |     float NdotV = max(0.f, dot(norm, view));
54 |     float NdotH = max(0.f, dot(norm, h));
55 |     float LdotH = max(0.f, dot(h, -gConst.lightDir.xyz));
56 | 
57 |     // Calculate approximated core shader
58 |     float4 outParams = DisneyMLP<NUM_HIDDEN_LAYERS, HIDDEN_NEURONS>(
59 |         NdotL, NdotV, NdotH, LdotH, gConst.roughness,
60 |         gMLPParams,
61 |         rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.weightOffsets),
62 |         rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.biasOffsets)
63 |     );
64 | 
65 |     // Calculate final color
66 |     float3 Cdlin = pow(gConst.baseColor.rgb, 2.2);
67 |     float3 Cspec0 = lerp(gConst.specular * float3(.08f), Cdlin, gConst.metallic);
68 |     float3 brdfn = outParams.x * Cdlin * (1 - gConst.metallic) + outParams.y * lerp(Cspec0, float3(1), outParams.z) + outParams.w;
69 |     float3 colorh = brdfn * NdotL * gConst.lightIntensity.rgb;
70 | 
71 |     o_color = float4(colorh, 1.f);
72 | }
73 | 


--------------------------------------------------------------------------------
/samples/ShaderTraining/shaders.cfg:
--------------------------------------------------------------------------------
1 | renderDisney.slang -E main_vs -T vs
2 | renderDisney.slang -E main_ps -T ps
3 | renderInference.slang -E main_vs -T vs
4 | renderInference.slang -E main_ps -T ps
5 | renderDifference.slang -E main_vs -T vs 
6 | renderDifference.slang -E main_ps -T ps
7 | computeTraining.slang -E main_cs -T cs
8 | computeOptimizer.slang -E adam_cs -T cs
9 | 


--------------------------------------------------------------------------------
/samples/SimpleInferencing/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | 
10 | include(../../external/donut/compileshaders.cmake)
11 | 
12 | set(shader_includes
13 | 	${SAMPLES_SHADER_INCLUDE_DIR}
14 | 	${CMAKE_CURRENT_LIST_DIR}
15 | )
16 | 
17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" )
18 | 
19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " )
20 | 
21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " )
22 | 
23 | set(project SimpleInferencing)
24 | set(folder "Samples/SimpleInferencing")
25 | 
26 | file(GLOB_RECURSE ${project}_shaders "*.hlsl" "*.hlsli" "*.slang")
27 | file(GLOB_RECURSE ${project}_sources "*.cpp" "*.h" "*.md")
28 | 
29 | donut_compile_shaders_all_platforms(
30 |     TARGET ${project}_shaders
31 |     CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg
32 | 	INCLUDES ${shader_includes}
33 |     FOLDER ${folder}
34 |     OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project}
35 | 	SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS}
36 |     SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV}
37 | 	SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL}
38 |     SOURCES ${${project}_shaders}
39 |     SLANG
40 | )
41 | 
42 | add_executable(${project} WIN32 ${${project}_sources})
43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils)
44 | add_dependencies(${project} ${project}_shaders)
45 | set_target_properties(${project} PROPERTIES FOLDER ${folder})
46 | 
47 | if (MSVC)
48 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP")
49 | endif()


--------------------------------------------------------------------------------
/samples/SimpleInferencing/NetworkConfig.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #ifndef __NETWORK_CONFIG_H__
12 | #define __NETWORK_CONFIG_H__
13 | 
14 | #define VECTOR_FORMAT half
15 | #define TYPE_INTERPRETATION CoopVecComponentType::Float16
16 | 
17 | // When loading a model from file, these parameters must match
18 | #define INPUT_FEATURES 5
19 | #define INPUT_NEURONS (INPUT_FEATURES * 6) // Frequency encoding increases the input by 6 for each input
20 | #define OUTPUT_NEURONS 4
21 | #define HIDDEN_NEURONS 32
22 | 
23 | struct NeuralConstants
24 | {
25 |     // Scene setup
26 |     float4x4 viewProject;
27 |     float4x4 view;
28 |     float4 cameraPos;
29 | 
30 |     // Light setup
31 |     float4 lightDir;
32 |     float4 lightIntensity;
33 | 
34 |     // Material props
35 |     float4 baseColor;
36 |     float specular;
37 |     float roughness;
38 |     float metallic;
39 |     float padding;
40 | 
41 |     // Neural weight & bias offsets
42 |     uint4 weightOffsets; // Offsets to weight matrices in bytes.
43 |     uint4 biasOffsets; // Offsets to bias vectors in bytes.
44 | };
45 | 
46 | #endif //__NETWORK_CONFIG_H__


--------------------------------------------------------------------------------
/samples/SimpleInferencing/SimpleInferencing.slang:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | import CooperativeVectorFunctions;
 12 | import Utils;
 13 | import LinearOps;
 14 | 
 15 | #include "NetworkConfig.h"
 16 | #include <donut/shaders/binding_helpers.hlsli>
 17 | 
 18 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
 19 | ByteAddressBuffer gMLPParams            :REGISTER_SRV(0, 0);
 20 | 
 21 | struct VertexIn
 22 | {
 23 |     float3 pos : POSITION; 
 24 |     float3 norm : NORMAL; 
 25 | };
 26 | 
 27 | struct VertexOut
 28 | {
 29 |     float4 pos : SV_Position;
 30 |     float3 norm : NORMAL;
 31 |     float3 view : VIEW;
 32 | }
 33 | 
 34 | [shader("vertex")]
 35 | void main_vs(
 36 |     VertexIn vIn,
 37 |     out VertexOut vOut)
 38 | {
 39 |     vOut.pos = mul(float4(vIn.pos, 1), gConst.viewProject);
 40 |     vOut.norm = vIn.norm;
 41 |     vOut.view = gConst.cameraPos.xyz - vIn.pos;
 42 | }
 43 | 
 44 | float4 DisneyMLP(float NdotL, float NdotV, float NdotH, float LdotH, float roughness)
 45 | {   
 46 |     uint4 weightOffsets = gConst.weightOffsets; 
 47 |     uint4 biasOffsets = gConst.biasOffsets;  
 48 | 
 49 |    CoopVec<VECTOR_FORMAT, INPUT_NEURONS> inputParams;
 50 |    CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenParams;
 51 |    CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputParams;
 52 | 
 53 |    // Encode input parameters, 5 inputs to 30 parameters 
 54 |    float params[INPUT_FEATURES] = { NdotL, NdotV, NdotH, LdotH, roughness };
 55 |    inputParams = rtxns::EncodeFrequency<half, INPUT_FEATURES>(params);
 56 |    
 57 |    // Forward propagation through the neural network
 58 |    // Input to hidden layer, then apply activation function
 59 |    hiddenParams = rtxns::LinearOp<VECTOR_FORMAT, HIDDEN_NEURONS, INPUT_NEURONS>(
 60 |        inputParams, gMLPParams, weightOffsets[0], biasOffsets[0], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION);
 61 |    hiddenParams = rtxns::relu(hiddenParams);
 62 |    
 63 |    // Hidden layer to hidden layer, then apply activation function 
 64 |    hiddenParams = rtxns::LinearOp<VECTOR_FORMAT, HIDDEN_NEURONS, HIDDEN_NEURONS>(
 65 |        hiddenParams, gMLPParams, weightOffsets[1], biasOffsets[1], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION);
 66 |    hiddenParams = rtxns::relu(hiddenParams);
 67 |    
 68 |    // Hidden layer to hidden layer, then apply activation function    
 69 |    hiddenParams = rtxns::LinearOp<VECTOR_FORMAT, HIDDEN_NEURONS, HIDDEN_NEURONS>(
 70 |        hiddenParams, gMLPParams, weightOffsets[2], biasOffsets[2], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION);
 71 |    hiddenParams = rtxns::relu(hiddenParams);
 72 |    
 73 |    // Hidden layer to output layer, then apply final activation function
 74 |    outputParams = rtxns::LinearOp<VECTOR_FORMAT, OUTPUT_NEURONS, HIDDEN_NEURONS>(
 75 |        hiddenParams, gMLPParams, weightOffsets[3], biasOffsets[3], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION);
 76 |    outputParams = exp(outputParams);
 77 | 
 78 |     // Take the output from the neural network as the output color
 79 |     return float4(outputParams[0], outputParams[1], outputParams[2], outputParams[3]);
 80 | }
 81 | 
 82 | [shader("fragment")]
 83 | void main_ps( 
 84 |     VertexOut vOut,
 85 |     out float4 o_color : SV_Target0)
 86 | {
 87 |     float4 lightIntensity = gConst.lightIntensity;
 88 |     float4 lightDir =  gConst.lightDir;
 89 |     float4 baseColor = gConst.baseColor;
 90 |     float specular = gConst.specular;
 91 |     float roughness = gConst.roughness;
 92 |     float metallic = gConst.metallic;
 93 | 
 94 |     // Prepare input parameters
 95 |     float3 view = normalize(vOut.view);
 96 |     float3 norm = normalize(vOut.norm);
 97 |     float3 h = normalize(-lightDir.xyz + view);
 98 | 
 99 |     float NdotL = max(0.f, dot(norm, -lightDir.xyz));
100 |     float NdotV = max(0.f, dot(norm, view));
101 |     float NdotH = max(0.f, dot(norm, h));
102 |     float LdotH = max(0.f, dot(h, -lightDir.xyz));
103 | 
104 |     // Calculate approximated core shader part using MLP
105 |     float4 outParams = DisneyMLP(NdotL, NdotV, NdotH, LdotH, roughness);
106 | 
107 |     // Calculate final color
108 |     float3 Cdlin = float3(pow(baseColor.r, 2.2), pow(baseColor.g, 2.2), pow(baseColor.b, 2.2));
109 |     float3 Cspec0 = lerp(specular * .08f * float3(1,1,1), Cdlin, metallic);
110 |     float3 brdfn = outParams.x * Cdlin * (1 - metallic) + outParams.y * lerp(Cspec0, float3(1), outParams.z) + outParams.w;
111 |     float3 colorh = brdfn * float3(NdotL) * lightIntensity.rgb;
112 | 
113 |     o_color = float4(colorh, 1.f);
114 |  }
115 | 


--------------------------------------------------------------------------------
/samples/SimpleInferencing/shaders.cfg:
--------------------------------------------------------------------------------
1 | SimpleInferencing.slang -T vs -E main_vs 
2 | SimpleInferencing.slang -T ps -E main_ps
3 | 


--------------------------------------------------------------------------------
/samples/SimpleTraining/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | 
10 | include(../../external/donut/compileshaders.cmake)
11 | 
12 | set(shader_includes
13 | 	${SAMPLES_SHADER_INCLUDE_DIR}
14 | 	${CMAKE_CURRENT_LIST_DIR}
15 | )
16 | 
17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" )
18 | 
19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " )
20 | 
21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " )
22 | 
23 | set(project SimpleTraining)
24 | set(folder "Samples/SimpleTraining")
25 | 
26 | file(GLOB_RECURSE ${project}_shaders "*.hlsl" "*.hlsli" "*.slang")
27 | file(GLOB_RECURSE ${project}_sources "*.cpp" "*.h" "*.md")
28 | 
29 | donut_compile_shaders_all_platforms(
30 |     TARGET ${project}_shaders
31 |     CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg
32 | 	INCLUDES ${shader_includes}
33 |     FOLDER ${folder}
34 |     OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project}
35 | 	SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS}
36 |     SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV}
37 | 	SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL}
38 |     SOURCES ${${project}_shaders}
39 |     SLANG
40 | )
41 | 
42 | add_executable(${project} WIN32 ${${project}_sources})
43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils)
44 | add_dependencies(${project} ${project}_shaders)
45 | set_target_properties(${project} PROPERTIES FOLDER ${folder})
46 | 
47 | if (MSVC)
48 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP")
49 | endif()


--------------------------------------------------------------------------------
/samples/SimpleTraining/NetworkConfig.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #define INPUT_FEATURES 2
12 | #define INPUT_NEURONS (INPUT_FEATURES * 6) // Frequency encoding increases the input by 6 for each input
13 | #define OUTPUT_NEURONS 3
14 | 
15 | #define HIDDEN_NEURONS 64
16 | #define NUM_HIDDEN_LAYERS 4
17 | 
18 | #define LEARNING_RATE 0.001f
19 | 
20 | #define NUM_TRANSITIONS (NUM_HIDDEN_LAYERS + 1)
21 | #define NUM_TRANSITIONS_ALIGN4 ((NUM_TRANSITIONS + 3) / 4)
22 | #define LOSS_SCALE 128.0
23 | #define RELU_LEAK 0.01h
24 | 
25 | #define VECTOR_FORMAT half
26 | #define TYPE_INTERPRETATION CoopVecComponentType::Float16
27 | #define NETWORK_PRECISION rtxns::Precision::F16
28 | 
29 | #define MATRIX_LAYOUT CoopVecMatrixLayout::TrainingOptimal
30 | 
31 | #define BATCH_COUNT 128
32 | #define BATCH_SIZE_X 32
33 | #define BATCH_SIZE_Y 32
34 | 
35 | enum class NetworkTransform
36 | {
37 |     Identity,
38 |     Zoom,
39 |     Flip
40 | };
41 | 
42 | struct NeuralConstants
43 | {
44 |     uint4 weightOffsets[NUM_TRANSITIONS_ALIGN4];
45 |     uint4 biasOffsets[NUM_TRANSITIONS_ALIGN4];
46 | 
47 |     uint32_t imageWidth;
48 |     uint32_t imageHeight;
49 |     uint32_t maxParamSize;
50 |     float learningRate;
51 | 
52 |     uint32_t currentStep;
53 |     uint32_t batchSizeX;
54 |     uint32_t batchSizeY;
55 |     NetworkTransform networkTransform;
56 | };
57 | 


--------------------------------------------------------------------------------
/samples/SimpleTraining/SimpleTraining_Inference.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | import CooperativeVectorFunctions;
15 | import Utils;
16 | import LinearOps;
17 | 
18 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
19 | ByteAddressBuffer gMLPParams            :REGISTER_SRV(0, 0);
20 | Texture2D<float4> inputTexture          :REGISTER_SRV(1, 0);
21 | RWTexture2D<float4> outputTexture       :REGISTER_UAV(0, 0);
22 | 
23 | [shader("compute")]
24 | [numthreads(8, 8, 1)] 
25 | void inference_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
26 | {
27 |     // Set the input ID as the uv coordinate and frequency encode it for the network
28 |     float2 inputUV = float2(dispatchThreadID.x / float(gConst.imageWidth), dispatchThreadID.y / float(gConst.imageHeight));
29 |     CoopVec<VECTOR_FORMAT, INPUT_NEURONS> inputParams = rtxns::EncodeFrequency<half, 2>({inputUV.x, inputUV.y});
30 | 
31 |     // Load offsets
32 |     uint weightOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.weightOffsets);
33 |     uint biasOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.biasOffsets);
34 | 
35 |     CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenParams;
36 |     CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputParams;
37 |    
38 |     // Forward propagation through the neural network
39 |     // Input to hidden layer, then apply activation function
40 |     hiddenParams = rtxns::LinearOp<VECTOR_FORMAT, HIDDEN_NEURONS, INPUT_NEURONS>(
41 |         inputParams, gMLPParams, weightOffsets[0], biasOffsets[0],
42 |         MATRIX_LAYOUT, TYPE_INTERPRETATION);
43 |     hiddenParams = rtxns::leakyReLU(hiddenParams, RELU_LEAK);
44 | 
45 |     // Hidden layers to hidden layers, then apply activation function 
46 |     [ForceUnroll]
47 |     for (uint layer = 1; layer < NUM_HIDDEN_LAYERS; layer++)
48 |     {
49 |         hiddenParams = rtxns::LinearOp<VECTOR_FORMAT, HIDDEN_NEURONS, HIDDEN_NEURONS>(
50 |             hiddenParams, gMLPParams, weightOffsets[layer], 
51 |             biasOffsets[layer], 
52 |             MATRIX_LAYOUT, TYPE_INTERPRETATION);
53 |         hiddenParams = rtxns::leakyReLU(hiddenParams, RELU_LEAK);
54 |     }
55 | 
56 |     // Hidden layer to output layer, then apply final activation function
57 |     outputParams = rtxns::LinearOp<VECTOR_FORMAT, OUTPUT_NEURONS, HIDDEN_NEURONS>(
58 |         hiddenParams, gMLPParams, weightOffsets[NUM_HIDDEN_LAYERS], biasOffsets[NUM_HIDDEN_LAYERS],
59 |         MATRIX_LAYOUT, TYPE_INTERPRETATION);
60 |     outputParams = rtxns::sigmoid(outputParams);
61 | 
62 |     // Take the output from the neural network as the output color
63 |     float4 color = {outputParams[0], outputParams[1], outputParams[2], 1.f};
64 |     outputTexture[dispatchThreadID.xy] = color;
65 | }


--------------------------------------------------------------------------------
/samples/SimpleTraining/SimpleTraining_Optimizer.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | import Optimizers;
15 | 
16 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
17 | RWBuffer<half> gMLPParams             :REGISTER_UAV(0, 0);
18 | RWBuffer<float> gMLPParamsf           :REGISTER_UAV(1, 0);
19 | RWBuffer<half> gMLPParamsGradients    :REGISTER_UAV(2, 0);
20 | RWBuffer<float> gMoments1             :REGISTER_UAV(3, 0);
21 | RWBuffer<float> gMoments2             :REGISTER_UAV(4, 0);
22 | 
23 | [numthreads(32, 1, 1)]
24 | void adam_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
25 | {
26 |     uint i = dispatchThreadID.x;
27 |     if (i >= gConst.maxParamSize)
28 |         return;
29 | 
30 |     float gradient = (float)gMLPParamsGradients[i];
31 |     gMLPParamsGradients[i] = half(0.0);
32 | 
33 |     // Get the floating point params, not float16
34 |     float weightbias = gMLPParamsf[i];
35 |     
36 |     optimizers::Adam optimizer = optimizers::Adam(gMoments1, gMoments2, gConst.learningRate, LOSS_SCALE);
37 |     
38 |     float adjustedWeightbias = optimizer.step(weightbias, i, gradient, gConst.currentStep);
39 | 
40 |     gMLPParamsf[i] = adjustedWeightbias;
41 |     gMLPParams[i] = (half)adjustedWeightbias;
42 | }
43 | 
44 | [numthreads(32, 1, 1)]
45 | void convert_weights_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
46 | {
47 |     uint i = dispatchThreadID.x;
48 |     if (i >= gConst.maxParamSize)
49 |         return;
50 |     
51 |     half param = gMLPParams[i];
52 |     gMLPParamsf[i] = float(param);
53 | }


--------------------------------------------------------------------------------
/samples/SimpleTraining/SimpleTraining_Training.slang:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | #include "NetworkConfig.h"
 12 | #include <donut/shaders/binding_helpers.hlsli>
 13 | 
 14 | import CooperativeVectorDerivatives;
 15 | import CooperativeVectorFunctions;
 16 | import Utils;
 17 | import LinearOps;
 18 | 
 19 | 
 20 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
 21 | ByteAddressBuffer gMLPParams                    :REGISTER_SRV(0, 0);
 22 | Texture2D<float4> inputTexture                  :REGISTER_SRV(1, 0);
 23 | RWByteAddressBuffer gMLPParamsGradients         :REGISTER_UAV(0, 0);
 24 | RWStructuredBuffer<uint> gRandState             :REGISTER_UAV(1, 0);
 25 | RWTexture2D<float4> outputTexture               :REGISTER_UAV(2, 0);
 26 | RWTexture2D<float4> lossTexture                 :REGISTER_UAV(3, 0);
 27 | 
 28 | struct RNG
 29 | {
 30 |     uint state;
 31 | 
 32 |     __init(uint state) { this.state = state; }
 33 | 
 34 |     [mutating]
 35 |     float next()
 36 |     {
 37 |         float r = (state >> 8) * 0x1p-24;
 38 |         state = state * 2739110765U + 2739110765U;
 39 |         return r;
 40 |     }
 41 | }
 42 | 
 43 | [shader("compute")]
 44 | [numthreads(8, 8, 1)] 
 45 | void training_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
 46 | {
 47 |     uint2 batchSize = uint2(gConst.batchSizeX, gConst.batchSizeY);
 48 | 
 49 |     uint dispatchThreadIdxy = dispatchThreadID.y * batchSize.x + dispatchThreadID.x;
 50 | 
 51 |     RNG rng = RNG(gRandState[dispatchThreadIdxy]);
 52 | 
 53 |     // Get a random uv coordinate for the input and frequency encode it for improved convergance
 54 |     float2 inputUV = clamp(float2(rng.next(), rng.next()), 0.0, 1.0);
 55 |     CoopVec<VECTOR_FORMAT, INPUT_NEURONS> inputParams = rtxns::EncodeFrequency<half, 2>({inputUV.x, inputUV.y});
 56 | 
 57 |      // Load offsets
 58 |     uint weightOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.weightOffsets);
 59 |     uint biasOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray<NUM_TRANSITIONS_ALIGN4, NUM_TRANSITIONS>(gConst.biasOffsets);
 60 | 
 61 |     // Create variables to cache the results from each stage
 62 |     CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenParams[NUM_HIDDEN_LAYERS];
 63 |     CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenActivated[NUM_HIDDEN_LAYERS];
 64 |     CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputParams;
 65 |     CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputActivated;
 66 | 
 67 |     // Forward propagation through the neural network
 68 |     // Input to hidden layer, then apply activation function
 69 |     hiddenParams[0] = rtxns::LinearOp<VECTOR_FORMAT, HIDDEN_NEURONS, INPUT_NEURONS>(
 70 |         inputParams, gMLPParams, weightOffsets[0], biasOffsets[0], MATRIX_LAYOUT, TYPE_INTERPRETATION);
 71 |     hiddenActivated[0] = rtxns::leakyReLU(hiddenParams[0], RELU_LEAK);
 72 | 
 73 |     // Hidden layers to hidden layers, then apply activation function 
 74 |     [ForceUnroll]
 75 |     for (uint layer = 1; layer < NUM_HIDDEN_LAYERS; layer++)
 76 |     {
 77 |         hiddenParams[layer] = rtxns::LinearOp<VECTOR_FORMAT, HIDDEN_NEURONS, HIDDEN_NEURONS>(
 78 |             hiddenActivated[layer - 1], gMLPParams, weightOffsets[layer], biasOffsets[layer], 
 79 |             MATRIX_LAYOUT, TYPE_INTERPRETATION);
 80 |         hiddenActivated[layer] = rtxns::leakyReLU(hiddenParams[layer], RELU_LEAK);
 81 |     }
 82 | 
 83 |     // Hidden layer to output layer, then apply final activation function    
 84 |     outputParams = rtxns::LinearOp<VECTOR_FORMAT, OUTPUT_NEURONS, HIDDEN_NEURONS>(
 85 |         hiddenActivated[NUM_HIDDEN_LAYERS - 1], gMLPParams, weightOffsets[NUM_HIDDEN_LAYERS],
 86 |         biasOffsets[NUM_HIDDEN_LAYERS], MATRIX_LAYOUT, TYPE_INTERPRETATION);
 87 |     outputActivated = rtxns::sigmoid(outputParams);
 88 | 
 89 |     // Take the output from the neural network as the output color
 90 |     float3 predictedRGB = {outputActivated[0], outputActivated[1], outputActivated[2]};
 91 | 
 92 |     // Now transform the input UVs according to the NetworkModel enum.
 93 |     // This can easily be extended to try many different transforms.
 94 |     uint2 actualUV;
 95 |     if (gConst.networkTransform == NetworkTransform.Flip)
 96 |     {
 97 |         float2 flipUV = inputUV.yx;
 98 |         actualUV = uint2(flipUV.xy * float2(gConst.imageHeight, gConst.imageWidth));
 99 |     }
100 |     else if (gConst.networkTransform == NetworkTransform.Zoom)
101 |     {
102 |         float2 zoomUV = inputUV * 0.5 + 0.25;
103 |         actualUV = uint2(zoomUV.xy * float2(gConst.imageWidth, gConst.imageHeight));
104 |     }
105 |     else
106 |     {
107 |         actualUV = uint2(inputUV.xy * float2(gConst.imageWidth, gConst.imageHeight));
108 |     }
109 | 
110 |     // Load the texture according to the transformed input UVs. This will
111 |     // provide the RGB that the model is trying to train towards.
112 |     float3 actualRGB = inputTexture[actualUV].rgb;
113 | 
114 |     // Output the loss, scaled to greyscale for output
115 |     uint2 lossUV = uint2(inputUV.xy * float2(gConst.imageWidth, gConst.imageHeight));
116 |     const float lossScaleFactor = 10.0f; // scale it up for better vis
117 |     lossTexture[lossUV] = float4((predictedRGB - actualRGB) * lossScaleFactor + 0.5, 1);  
118 | 
119 |     // Compute the L2 loss gradient
120 |     // L2Loss = (a-b)^2
121 |     // L2Loss Derivative = 2(a-b)
122 |     float3 lossGradient = 2.0 * (predictedRGB - actualRGB);
123 |    
124 |     // Scale by batch size 
125 |     lossGradient /= (batchSize.x * batchSize.y);
126 | 
127 |     // Apply the LOSS_SCALE factor to retain precision. Remove it in the optimizer pass before use.
128 |     lossGradient *= LOSS_SCALE;
129 | 
130 |     CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> lossGradientCV = CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS>(VECTOR_FORMAT(lossGradient[0]), VECTOR_FORMAT(lossGradient[1]), VECTOR_FORMAT(lossGradient[2]));
131 | 
132 |     // Back-propogation pass, generate the gradients and accumulate the results into memory to be applied in the optimisation pass.
133 |     CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputGradient;
134 |     CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenGradient;
135 | 
136 |     // Output layer (loss gradient) to final hidden layer
137 |     outputGradient = rtxns::sigmoid_Derivative(outputParams, lossGradientCV);
138 |     hiddenGradient = rtxns::LinearOp_Backward<VECTOR_FORMAT, OUTPUT_NEURONS, HIDDEN_NEURONS>(
139 |        hiddenActivated[NUM_HIDDEN_LAYERS - 1], outputGradient, gMLPParams, gMLPParamsGradients, 
140 |        weightOffsets[NUM_HIDDEN_LAYERS], biasOffsets[NUM_HIDDEN_LAYERS], MATRIX_LAYOUT, TYPE_INTERPRETATION);
141 | 
142 |     // Hidden layer to hidden layer 
143 |     for(int layer = NUM_HIDDEN_LAYERS - 1; layer >= 1; layer--)
144 |     {
145 |         hiddenGradient = rtxns::leakyReLU_Derivative(hiddenParams[layer], RELU_LEAK, hiddenGradient);
146 |         hiddenGradient = rtxns::LinearOp_Backward<VECTOR_FORMAT, HIDDEN_NEURONS, HIDDEN_NEURONS>
147 |             (hiddenActivated[layer - 1], hiddenGradient, gMLPParams, gMLPParamsGradients, 
148 |             weightOffsets[layer], biasOffsets[layer], MATRIX_LAYOUT, TYPE_INTERPRETATION);
149 |     }
150 | 
151 |     // First hidden layer to input layer
152 |     hiddenGradient = rtxns::leakyReLU_Derivative(hiddenParams[0], RELU_LEAK, hiddenGradient);
153 |     rtxns::LinearOp_Backward<VECTOR_FORMAT, HIDDEN_NEURONS, INPUT_NEURONS>(
154 |         inputParams, hiddenGradient, gMLPParams, gMLPParamsGradients, weightOffsets[0], 
155 |         biasOffsets[0], MATRIX_LAYOUT, TYPE_INTERPRETATION);
156 | 
157 |     // Store the random state to continue iterating next time.
158 |     gRandState[dispatchThreadIdxy] = rng.state;
159 | }


--------------------------------------------------------------------------------
/samples/SimpleTraining/shaders.cfg:
--------------------------------------------------------------------------------
1 | SimpleTraining_Inference.slang -E inference_cs -T cs
2 | SimpleTraining_Training.slang -E training_cs -T cs
3 | SimpleTraining_Optimizer.slang -E adam_cs -T cs
4 | SimpleTraining_Optimizer.slang -E convert_weights_cs -T cs
5 | 


--------------------------------------------------------------------------------
/samples/SlangpyTraining/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | 
10 | include(../../external/donut/compileshaders.cmake)
11 | 
12 | set(shader_includes
13 | 	${SAMPLES_SHADER_INCLUDE_DIR}
14 | 	${CMAKE_CURRENT_LIST_DIR}
15 | )
16 | 
17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" )
18 | 
19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " )
20 | 
21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " )
22 | 
23 | set(project SlangpyTraining)
24 | set(folder "Samples/SlangpyTraining")
25 | 
26 | file(GLOB_RECURSE ${project}_shaders "*.hlsl" "*.hlsli" "*.slang")
27 | file(GLOB_RECURSE ${project}_sources "*.cpp" "*.h" "*.md")
28 | 
29 | donut_compile_shaders_all_platforms(
30 |     TARGET ${project}_shaders
31 |     CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg
32 | 	INCLUDES ${shader_includes}
33 |     FOLDER ${folder}
34 |     OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project}
35 | 	SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS}
36 |     SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV}
37 | 	SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL}
38 |     SOURCES ${${project}_shaders}
39 |     SLANG
40 | )
41 | 
42 | add_executable(${project} WIN32 ${${project}_sources})
43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils)
44 | add_dependencies(${project} ${project}_shaders)
45 | set_target_properties(${project} PROPERTIES FOLDER ${folder})
46 | 
47 | if (MSVC)
48 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP")
49 | endif()


--------------------------------------------------------------------------------
/samples/SlangpyTraining/Helpers.py:
--------------------------------------------------------------------------------
  1 | # SPDX-License-Identifier: Apache-2.0
  2 | #
  3 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
  4 | # 
  5 | # NVIDIA CORPORATION and its licensors retain all intellectual property
  6 | # and proprietary rights in and to this software, related documentation
  7 | # and any modifications thereto. Any use, reproduction, disclosure or
  8 | # distribution of this software and related documentation without an express
  9 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 10 | #
 11 | from slangpy.backend import Device, DeviceType, TextureLoader, Bitmap, SlangCompilerOptions
 12 | import slangpy as spy
 13 | from pathlib import Path
 14 | from typing import Any, Union
 15 | import subprocess
 16 | import os
 17 | 
 18 | from NeuralModules import CoopVecModule
 19 | 
 20 | class SDKSample:
 21 |     def __init__(self, args: list[str]):
 22 |         super().__init__()
 23 | 
 24 |         # Set up directories to find includes and executables
 25 |         self.spy_dir = Path(spy.__file__).parent / "slang"
 26 |         self.sdk_root = Path(__file__).parent.parent.parent
 27 |         self.sdk_data_dir = self.sdk_root / "assets/data"
 28 |         self.rtxns_dir = self.sdk_root / "src/NeuralShading_Shaders"
 29 |         self.spy_sample_dir = self.sdk_root / "samples/SlangpyTraining"
 30 |         self.donut_dir = self.sdk_root / "external/donut/include"
 31 |         self.slang_compiler = self.sdk_root / "bin/slangc.bat"
 32 | 
 33 |         search_root = self.sdk_root / "bin"
 34 |         bin_ext = ".exe" if os.name == "nt" else ""
 35 |         inference_candidates = [f for f in search_root.glob(f"**/SlangpyTraining{bin_ext}") if f.is_file()]
 36 |         shadermake_candidates = [f for f in search_root.glob(f"**/ShaderMake{bin_ext}") if f.is_file()]
 37 |         
 38 |         if len(inference_candidates) == 0:
 39 |             print(f"Warning: Could not find SlangpyTraining executable within {search_root}. "
 40 |                   "C++ sample will not be launched after training.")
 41 |             self.inference_sample_path = None
 42 |         else:
 43 |             self.inference_sample_path = inference_candidates[0]
 44 |             if len(inference_candidates) > 1:
 45 |                 print(f"Warning: Found multiple possible SlangpyTraining executables. Picking {self.inference_sample_path}")
 46 |             else:
 47 |                 print(f"Found SlangpyTraining executable at {self.inference_sample_path}")
 48 | 
 49 |         if len(shadermake_candidates) == 0:
 50 |             print(f"Warning: Could not find ShaderMake executable within {search_root}. "
 51 |                   "C++ sample will not be launched after training.")
 52 |             self.shadermake_path = None
 53 |         else:
 54 |             self.shadermake_path = shadermake_candidates[0]
 55 |             if len(shadermake_candidates) > 1:
 56 |                 print(f"Warning: Found multiple possible ShaderMake executables. Picking {self.shadermake_path}")
 57 |             else:
 58 |                 print(f"Found ShaderMake executable at {self.shadermake_path}")
 59 | 
 60 |         self.include_dirs = [
 61 |             self.rtxns_dir,
 62 |             self.spy_dir,
 63 |             self.spy_sample_dir
 64 |         ]
 65 | 
 66 |         for field in ("spy_dir", "sdk_root", "sdk_data_dir", "rtxns_dir", "spy_sample_dir", "donut_dir", "slang_compiler"):
 67 |             path: Path = getattr(self, field)
 68 |             if not path.exists():
 69 |                 print(f"Warning: Can't find path {field} at {path}. This may cause errors.")
 70 |         
 71 |         self.device = self._create_device()
 72 |     
 73 |     # Create an sgl device and setup default include directories
 74 |     def _create_device(self):
 75 |         device = Device(
 76 |             type=DeviceType.vulkan,
 77 |             compiler_options=SlangCompilerOptions({
 78 |                 "include_paths": self.include_dirs,
 79 |                 "disable_warnings": [
 80 |                     "41018", # Overzealous uninitialized-out-parameter warning
 81 |                     "41012"  # Coop vec capability warning
 82 |                 ]
 83 |             }),
 84 |         )
 85 | 
 86 |         print("Selected adapter", device.info.adapter_name)
 87 | 
 88 |         return device
 89 |     
 90 |     def load_texture(self, path: Union[str,Path]):
 91 |         bmp = Bitmap(self.sdk_data_dir / path)
 92 |         loader = TextureLoader(self.device)
 93 |         target_tex = loader.load_texture(bmp, {"load_as_normalized": True})
 94 |         return target_tex
 95 |     
 96 |     # Take a trained model and distill it to defines and compile it
 97 |     def compile_inference_shader(self, model: CoopVecModule):
 98 |         if self.inference_sample_path is None or self.shadermake_path is None:
 99 |             print("Missing executables, skipping compilation.")
100 |             return
101 |         
102 |         if len(model.parameters()) > 1:
103 |             raise ValueError("Shader generation only supports a single parameter buffer")
104 | 
105 |         defines = [
106 |             ("MODEL_TYPE", f'"{model.inference_type_name}"'),
107 |             ("MODEL_INITIALIZER", f'"{model.get_initializer()}"'),
108 |             ("VECTOR_FORMAT", model.elem_name),
109 |         ]
110 | 
111 |         self.compile_shader("SlangpyInference.slang", defines)
112 | 
113 |     def compile_shader(self, shader_path: str, defines: list[Union[str,tuple[str, Any]]]):
114 |         config_path = self.spy_sample_dir / "trained_shaders.cfg"
115 |         with open(config_path, "w") as file:
116 |             file.write(f"{shader_path} -E main_cs -T cs")
117 | 
118 |         output_path = self.inference_sample_path.parent / "shaders/SlangpyTraining/spirv"
119 |         
120 |         args = [
121 |             self.shadermake_path,
122 |             "--config", config_path,
123 |             "-o", output_path,
124 |             "--compiler", self.slang_compiler,
125 |             "--platform", "SPIRV",
126 |             "--flatten",
127 |             "--binaryBlob",
128 |             "--outputExt", ".bin",
129 |             "--slang",
130 |             "--tRegShift", "0",
131 |             "--sRegShift", "128",
132 |             "--bRegShift", "256",
133 |             "--uRegShift", "384",
134 |             "--vulkanVersion", "1.2",
135 |             "--matrixRowMajor",
136 |             "--force",
137 |             "-X", "-capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV",
138 |         ]
139 |         for d in defines + ["SPIRV", "TARGET_VULKAN"]:
140 |             if isinstance(d, str):
141 |                 args.extend(("-D", d))
142 |             else:
143 |                 args.extend(("-D", f"{d[0]}={d[1]}"))
144 |         
145 |         for include_dir in self.include_dirs + [self.donut_dir]:
146 |             args.extend(("-I", include_dir))
147 | 
148 |         result = subprocess.run(args, text=True, capture_output=True)
149 |         if result.stderr:
150 |             raise RuntimeError(f"ShaderMake exited with errors: {result.stderr}")
151 |         stdout = str(result.stdout)
152 |         if stdout.find(": error") != -1:
153 |             raise RuntimeError(f"slang compiler exited with errors: {stdout}")
154 |     
155 |     def run_sdk_inference(self, model_weights: Path):
156 |         if self.inference_sample_path is None or self.shadermake_path is None:
157 |             print("Missing executables, skipping C++ sample.")
158 |             return
159 |         
160 |         subprocess.run([self.inference_sample_path, model_weights])
161 | 


--------------------------------------------------------------------------------
/samples/SlangpyTraining/NetworkConfig.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #define MAX_LAYER_COUNT 8
12 | #define MAX_LAYER_COUNT_ALIGN4 ((MAX_LAYER_COUNT + 3) / 4)
13 | 
14 | // These defines will be overriden by texture-training.py with the
15 | // chosen network architecture. However, if we compile this file
16 | // from scratch, we provide a default architexture here so the sample
17 | // runs. We provide the trained weights for this network under
18 | // assets/data/slangpy-weights.json
19 | #ifndef MODEL_TYPE
20 | #define MODEL_TYPE                                                                                                                                                                 \
21 |     rtxns::ModuleChain<half, 2, 12, 3, rtxns::FrequencyEncoding<half, 2, 3>,                                                                                                       \
22 |                        rtxns::InferenceMLPModule<half, 4, 12, 32, 3, CoopVecComponentType::Float16, rtxns::mlp::LeakyReLUAct<half, 32>, rtxns::mlp::SigmoidAct<half, 3>>>
23 | 
24 | #define MODEL_INITIALIZER                                                                                                                                                          \
25 |     {                                                                                                                                                                              \
26 |         {},                                                                                                                                                                        \
27 |         {                                                                                                                                                                          \
28 |             weights, { wo[0], wo[1], wo[2], wo[3], wo[4] }, { bo[0], bo[1], bo[2], bo[3], bo[4] }, { 0.01h },                                                                      \
29 |             {                                                                                                                                                                      \
30 |             }                                                                                                                                                                      \
31 |         }                                                                                                                                                                          \
32 |     }
33 | #define VECTOR_FORMAT half
34 | #endif
35 | 
36 | struct NeuralConstants
37 | {
38 |     uint4 weightOffsets[MAX_LAYER_COUNT_ALIGN4];
39 |     uint4 biasOffsets[MAX_LAYER_COUNT_ALIGN4];
40 | 
41 |     uint32_t imageWidth;
42 |     uint32_t imageHeight;
43 | };
44 | 


--------------------------------------------------------------------------------
/samples/SlangpyTraining/NeuralModules.slang:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | __exported import CooperativeVectorDerivatives;
 11 | __exported import CooperativeVectorFunctions;
 12 | __exported import CooperativeVectorAutoDiff;
 13 | __exported import Optimizers;
 14 | __exported import Utils;
 15 | __exported import LinearOps;
 16 | __exported import MLP;
 17 | __exported import Activation;
 18 | 
 19 | namespace rtxns
 20 | {
 21 |     ////////////////////////
 22 |     //
 23 |     // Root interface for neural modules and implementations of several
 24 |     // Takes a CoopVec of type T with NumInput elements and returns NumOutputs elements
 25 |     // Several RTXNS classes and functions are wrapped here to conform to the IModule interface
 26 |     // This lets you build network architectures with generic types
 27 |     //
 28 |     ////////////////////////
 29 |     interface IModule<T : __BuiltinFloatingPointType, let NumInputs : int, let NumOutputs : int>
 30 |     {
 31 |         [BackwardDifferentiable]
 32 |         CoopVec<T, NumOutputs> forward(CoopVec<T, NumInputs> inputParams);
 33 |     }
 34 | 
 35 |     // Chain two modules together, i.e. pass the output of the first to the second
 36 |     // Can be nested arbitrarily
 37 |     struct ModuleChain<
 38 |         T : __BuiltinFloatingPointType,
 39 |         let NumInputs : int,
 40 |         let NumHidden : int,
 41 |         let NumOutputs : int,
 42 |         First : IModule<T, NumInputs, NumHidden>,
 43 |         Second : IModule<T, NumHidden, NumOutputs>
 44 |     > : IModule<T, NumInputs, NumOutputs>
 45 |     {
 46 |         First first;
 47 |         Second second;
 48 | 
 49 |         [BackwardDifferentiable]
 50 |         CoopVec<T, NumOutputs> forward(CoopVec<T, NumInputs> inputParams)
 51 |         {
 52 |             CoopVec<T, NumHidden> middle = first.forward(inputParams);
 53 |             return second.forward(middle);
 54 |         }
 55 |     }
 56 | 
 57 |     struct TrainableMLPModule<
 58 |         T : __BuiltinFloatingPointType,
 59 |         let NumHiddenLayers : int,
 60 |         let InputNeurons : int,
 61 |         let HiddenNeurons : int,
 62 |         let OutputNeurons : int,
 63 |         let ComponentType : CoopVecComponentType,
 64 |         HiddenAct : mlp::IActivation<T, HiddenNeurons>,
 65 |         OutputAct : mlp::IActivation<T, OutputNeurons>
 66 |     > : IModule<T, InputNeurons, OutputNeurons>
 67 |     {
 68 |         ByteAddressBuffer parameters;
 69 |         RWByteAddressBuffer derivatives; 
 70 |         uint matrixOffsets[NumHiddenLayers + 1];
 71 |         uint biasOffsets[NumHiddenLayers + 1];
 72 | 
 73 |         HiddenAct hiddenAct;
 74 |         OutputAct outputAct;
 75 | 
 76 |         [BackwardDerivative(backward)]
 77 |         CoopVec<T, OutputNeurons> forward(CoopVec<T, InputNeurons> inputParams)
 78 |         {
 79 |             var mlp = mlp::TrainingMLP<
 80 |                 T, 
 81 |                 NumHiddenLayers, 
 82 |                 InputNeurons, 
 83 |                 HiddenNeurons, 
 84 |                 OutputNeurons, 
 85 |                 CoopVecMatrixLayout::TrainingOptimal, 
 86 |                 ComponentType
 87 |             >(parameters, derivatives, matrixOffsets, biasOffsets);
 88 |             return mlp.forward(inputParams, hiddenAct, outputAct);
 89 |         }
 90 | 
 91 |         void backward(inout DifferentialPair<CoopVec<T, InputNeurons>> inputParams, const CoopVec<T, OutputNeurons> dOutputActivated)
 92 |         {
 93 |             var mlp = mlp::TrainingMLP<
 94 |                 T, 
 95 |                 NumHiddenLayers, 
 96 |                 InputNeurons, 
 97 |                 HiddenNeurons, 
 98 |                 OutputNeurons, 
 99 |                 CoopVecMatrixLayout::TrainingOptimal, 
100 |                 ComponentType
101 |             >(parameters, derivatives, matrixOffsets, biasOffsets);
102 |             
103 |             mlp.backward(inputParams, hiddenAct, outputAct, dOutputActivated);
104 |         }
105 |     }
106 | 
107 |     struct InferenceMLPModule<
108 |         T : __BuiltinFloatingPointType,
109 |         let NumHiddenLayers : int,
110 |         let InputNeurons : int,
111 |         let HiddenNeurons : int,
112 |         let OutputNeurons : int,
113 |         let ComponentType : CoopVecComponentType,
114 |         HiddenAct : mlp::IActivation<T, HiddenNeurons>,
115 |         OutputAct : mlp::IActivation<T, OutputNeurons>
116 |     > : IModule<T, InputNeurons, OutputNeurons>
117 |     {
118 |         ByteAddressBuffer parameters;
119 |         uint matrixOffsets[NumHiddenLayers + 1];
120 |         uint biasOffsets[NumHiddenLayers + 1];
121 | 
122 |         HiddenAct hiddenAct;
123 |         OutputAct outputAct;
124 | 
125 |         [TreatAsDifferentiable]
126 |         CoopVec<T, OutputNeurons> forward(CoopVec<T, InputNeurons> inputParams)
127 |         {
128 |             var mlp = mlp::InferenceMLP<
129 |                 T, 
130 |                 NumHiddenLayers, 
131 |                 InputNeurons, 
132 |                 HiddenNeurons, 
133 |                 OutputNeurons, 
134 |                 CoopVecMatrixLayout::InferencingOptimal, 
135 |                 ComponentType
136 |             >(parameters, matrixOffsets, biasOffsets);
137 |             return mlp.forward(inputParams, hiddenAct, outputAct);
138 |         }
139 |     }
140 | 
141 |     struct FrequencyEncoding<T : __BuiltinFloatingPointType, let NumInputs : int, let NumScales : int> : IModule<T, NumInputs, NumScales * NumInputs * 2>
142 |     {
143 |         [BackwardDifferentiable]
144 |         CoopVec<T, NumScales * NumInputs * 2> forward(CoopVec<T, NumInputs> inputParams)
145 |         {
146 |             return rtxns::EncodeFrequencyN<T, NumInputs, NumScales>(inputParams);
147 |         }
148 |     }
149 | }
150 | 


--------------------------------------------------------------------------------
/samples/SlangpyTraining/SlangpyInference.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | import SlangpyTraining;
15 | 
16 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
17 | ByteAddressBuffer gMLPParams        :REGISTER_SRV(0, 0);
18 | Texture2D<float4> inputTexture      :REGISTER_SRV(1, 0);
19 | RWTexture2D<float4> outputTexture   :REGISTER_UAV(0, 0);
20 | 
21 | float3 evalModel(ByteAddressBuffer weights, uint wo[MAX_LAYER_COUNT], uint bo[MAX_LAYER_COUNT], float2 uv)
22 | {
23 |     // Auto-generated defines from texture-training.py
24 |     MODEL_TYPE model = MODEL_INITIALIZER;
25 | 
26 |     let inputParams = rtxns::CoopVecFromVector<VECTOR_FORMAT>(uv);
27 | 
28 |     let result = model.forward(inputParams);
29 | 
30 |     return rtxns::VectorFromCoopVec(result);
31 | }
32 | 
33 | [shader("compute")]
34 | [numthreads(8, 8, 1)] 
35 | void main_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
36 | {
37 |     // Get the UV coordinate from the thread ID
38 |     float2 inputUV = float2(dispatchThreadID.x / float(gConst.imageWidth), dispatchThreadID.y / float(gConst.imageHeight));
39 | 
40 |     // Load offsets
41 |     uint weightOffsets[MAX_LAYER_COUNT] = rtxns::UnpackArray<MAX_LAYER_COUNT_ALIGN4, MAX_LAYER_COUNT>(gConst.weightOffsets);
42 |     uint biasOffsets[MAX_LAYER_COUNT] = rtxns::UnpackArray<MAX_LAYER_COUNT_ALIGN4, MAX_LAYER_COUNT>(gConst.biasOffsets);
43 | 
44 |     // Run the model
45 |     float3 modelOutput = evalModel(gMLPParams, weightOffsets, biasOffsets, inputUV);
46 | 
47 |     // Write to output
48 |     outputTexture[dispatchThreadID.xy] = float4(modelOutput, 1.0f);
49 | }


--------------------------------------------------------------------------------
/samples/SlangpyTraining/SlangpyTraining.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
  3 | # 
  4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
  5 | # and proprietary rights in and to this software, related documentation
  6 | # and any modifications thereto. Any use, reproduction, disclosure or
  7 | # distribution of this software and related documentation without an express
  8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 | 
 10 | from slangpy.backend import DataType
 11 | from slangpy.core.module import Module
 12 | from slangpy.types import NDBuffer, call_id
 13 | import numpy as np
 14 | import json
 15 | import math
 16 | import time
 17 | import sys
 18 | 
 19 | from Helpers import SDKSample
 20 | from NeuralModules import CoopVecModule, TrainableMLP, FrequencyEncoding, ModuleChain
 21 | from NeuralModules import Activation, NoneAct, LinearAct, ExponentialAct, ShiftedExponentialAct, ReLUAct, LeakyReLUAct, SigmoidAct, SwishAct, TanhAct
 22 | 
 23 | # Set to true for an interactive training. This can be helpful
 24 | # but slows down training quite a bit
 25 | INTERACTIVE = True
 26 | if INTERACTIVE:
 27 |     import matplotlib.pyplot as plt
 28 | 
 29 | def training_main():
 30 |     ##
 31 |     ## Setup window, device and file paths
 32 |     ##
 33 |     sample = SDKSample(sys.argv[1:])
 34 |     device = sample.device
 35 | 
 36 |     ##
 37 |     ## Set up training constants.
 38 |     ## When we train interactively, choose smaller batches
 39 |     ## for faster feedback.
 40 |     ##
 41 |     batch_shape = (256, 256)
 42 |     learning_rate = 0.005
 43 |     grad_scale = 128.0
 44 |     loss_scale = grad_scale / math.prod(batch_shape)
 45 | 
 46 |     sample_target = 1000000000
 47 |     num_batches_per_epoch = 1000 if INTERACTIVE else 5000
 48 |     num_epochs = sample_target // (num_batches_per_epoch * math.prod(batch_shape))
 49 | 
 50 |     ##
 51 |     ## Set up models
 52 |     ##
 53 | 
 54 |     # A basic MLP with ReLU activations and a linear output that maps a 2D UV input
 55 |     # to an RGB color. This is a good baseline, but it won't achieve state-of-the-art
 56 |     basic_mlp = TrainableMLP(device, DataType.float16,
 57 |                              num_hidden_layers=3,
 58 |                              input_width=2,
 59 |                              hidden_width=32,
 60 |                              output_width=3,
 61 |                              hidden_act=ReLUAct(),
 62 |                              output_act=NoneAct())
 63 | 
 64 |     # Replacing ReLU with LeakyReLU makes training more stable for small networks,
 65 |     # and a Sigmoid activation at the output helps bring the network into the right range
 66 |     better_activations = TrainableMLP(device, DataType.float16,
 67 |                                       num_hidden_layers=3,
 68 |                                       input_width=2,
 69 |                                       hidden_width=32,
 70 |                                       output_width=3,
 71 |                                       hidden_act=LeakyReLUAct(),
 72 |                                       output_act=SigmoidAct())
 73 | 
 74 |     # For 2D or 3D inputs, we can do even better with an input encoding
 75 |     # We need to adjust the input width of the MLP to take the additional
 76 |     # outputs from the encoding
 77 |     encoding = FrequencyEncoding(DataType.float16, 2, 3)
 78 |     mlp_with_encoding = ModuleChain(
 79 |         encoding,
 80 |         TrainableMLP(device, DataType.float16,
 81 |                      num_hidden_layers=3,
 82 |                      input_width=encoding.fan_out,
 83 |                      hidden_width=32,
 84 |                      output_width=3,
 85 |                      hidden_act=LeakyReLUAct(),
 86 |                      output_act=SigmoidAct())
 87 |     )
 88 | 
 89 |     # We're not limited to predefined modules - for example, try using the custom
 90 |     # activation from the slang file:
 91 |     activation = SigmoidAct()
 92 |     #activation = Activation("SiLUActivation")
 93 | 
 94 |     # Now take the working model and scale up the number of weights by adding another layer
 95 |     larger_mlp = ModuleChain(
 96 |         encoding,
 97 |         TrainableMLP(device, DataType.float16,
 98 |                      num_hidden_layers=4,
 99 |                      input_width=encoding.fan_out,
100 |                      hidden_width=32,
101 |                      output_width=3,
102 |                      hidden_act=LeakyReLUAct(),
103 |                      output_act=activation)
104 |     )
105 | 
106 |     # Make a list of models to be optimized so we can compare them
107 |     models = [
108 |         ("Basic MLP", basic_mlp),
109 |         ("+Better activations", better_activations),
110 |         ("+Frequency encoding", mlp_with_encoding),
111 |         ("+More Weights", larger_mlp),
112 |     ]
113 | 
114 |     # You can also play with different losses. For images, L2 is not a bad default
115 |     loss_name = "rtxns::mlp::L2<float, 3>"
116 | 
117 |     ##
118 |     ## Load training data and slang code
119 |     ##
120 |     target_tex = sample.load_texture("nvidia-logo.png")
121 | 
122 |     module = Module.load_from_file(device, "SlangpyTraining.slang")
123 | 
124 |     # Instantiate the slang RNG from the loaded module,
125 |     # seeded with a random buffer of uints
126 |     pcg = np.random.PCG64(seed=12345)
127 |     seeds = pcg.random_raw(batch_shape).astype(np.uint32)
128 |     rng = module.RNG(seeds)
129 | 
130 |     # Fill a buffer with UVs for later evaluating the model during training
131 |     vis_resolution = 256
132 |     span = np.linspace(0, 1, vis_resolution, dtype=np.float32)
133 |     vis_uvs_np = np.stack(np.broadcast_arrays(span[None, :], span[:, None]), axis=2)
134 |     vis_uvs = NDBuffer(device, module.float2.struct, shape=(vis_resolution, vis_resolution))
135 |     vis_uvs.copy_from_numpy(vis_uvs_np)
136 | 
137 |     # Create a figure to fill out as we go
138 |     if INTERACTIVE:
139 |         n = len(models)
140 |         fig, axes = plt.subplots(2, n, dpi=200, figsize=(2.4 * n, 4.8), squeeze=False)
141 |         plt.ion()
142 |         plt.show()
143 | 
144 |         black = np.zeros((vis_resolution, vis_resolution, 3), dtype=np.uint8)
145 |         canvases = []
146 |         for i, (model_name, _) in enumerate(models):
147 |             axes[0, i].text(0.5, 1.05, f"{model_name}", horizontalalignment='center', size=8)
148 |             top = axes[0, i].imshow(black, extent=(0, 1, 0, 1), vmin=0, vmax=1)
149 |             bot = axes[1, i].imshow(black, extent=(0, 1, 0, 1), vmin=0, vmax=1)
150 |             canvases.append([top, bot])
151 |             axes[0, i].set_axis_off()
152 |             axes[1, i].set_axis_off()
153 |             fig.tight_layout(h_pad=-1, w_pad=0.5)
154 | 
155 | 
156 |     for i, (model_name, model) in enumerate(models):
157 |         print(f"Training model {model_name}")
158 | 
159 |         assert len(model.parameters()) == 1, "Only one set of parameters is supported in this sample"
160 |         assert model.fan_in == 2 and model.fan_out == 3, "Model must have 2 inputs (UV) and 3 outputs (RGB)"
161 | 
162 |         ##
163 |         ## Set up optimizer and specialize the slang functions to our model
164 |         ##
165 |         grads = model.gradients()[0]
166 |         parameters = model.parameters()[0]
167 | 
168 |         parametersF = module.ConvertToFloat(parameters)
169 | 
170 |         # These match up with the argument names of optimizerStep in texture-training.slang
171 |         optimizer_state = {
172 |             "moments1": NDBuffer.zeros_like(parametersF),
173 |             "moments2": NDBuffer.zeros_like(parametersF),
174 |             "paramF": parametersF,
175 |             "paramH": parameters,
176 |             "grad": grads,
177 |             "learningRate": learning_rate,
178 |             "gradScale": grad_scale
179 |         }
180 |         num_params = parameters.shape[0]
181 | 
182 |         # Specialize slang functions by substituting generic parameters
183 |         optimizer_step = module.OptimizerStep
184 |         train_texture = module[f"TrainTexture<{model.type_name}, {loss_name} >"]
185 |         eval_model = module[f"EvalModel<{model.type_name} >"]
186 |         eval_loss = module[f"EvalLoss<{loss_name} >"]
187 | 
188 |         # Begin main training loop
189 |         iteration = 1
190 |         for epoch in range(num_epochs):
191 |             start = time.time()
192 | 
193 |             cmd = device.create_command_buffer()
194 |             cmd.open()
195 |             # Each batch is submitted to a command buffer
196 |             for batch in range(num_batches_per_epoch):
197 |                 # Compute gradients
198 |                 train_texture.append_to(cmd, model, rng, target_tex, loss_scale)
199 |                 # Do one parameter optimization step using those gradients
200 |                 optimizer_step.append_to(cmd, idx=call_id((num_params, )), iteration=iteration, **optimizer_state)
201 |                 iteration += 1
202 |             cmd.close()
203 |             device.submit_command_buffer(cmd)
204 |             device.wait()
205 |             end = time.time()
206 | 
207 |             device.run_garbage_collection()
208 |             
209 |             # Print out progress info
210 |             elapsed = end - start
211 |             num_samples_per_epoch = math.prod(batch_shape) * num_batches_per_epoch
212 |             progress = (num_samples_per_epoch * (epoch + 1)) // 1000000
213 |             info = (f"Epoch {epoch + 1} complete, "
214 |                     f"{progress}/{sample_target // 1000000} MSamples: "
215 |                     f"Time: {elapsed:.3f}s "
216 |                     f"Throughput: {num_samples_per_epoch / elapsed * 1e-6:.2f} MSamples/s")
217 | 
218 |             # In the interactive case, draw updates to window and compute loss. This goes
219 |             # through the CPU, so this is quite slow
220 |             if INTERACTIVE:
221 |                 current_prediction = eval_model(model, vis_uvs, _result=np.ndarray)
222 |                 loss_val = np.mean(eval_loss(vis_uvs, current_prediction, target_tex, _result=np.ndarray))
223 |                 diff = module.TextureDifference(vis_uvs, current_prediction, target_tex, 10.0, _result=np.ndarray)
224 | 
225 |                 info += f" Loss: {loss_val:.3f}"
226 | 
227 |                 current_prediction = np.clip(current_prediction, 0, 1)
228 |                 diff = np.clip(diff, 0, 1)
229 | 
230 |                 canvases[i][0].set_data(current_prediction)
231 |                 canvases[i][1].set_data(diff)
232 |                 fig.canvas.draw()
233 |                 fig.canvas.flush_events()
234 |             
235 |             print(info)
236 | 
237 |     print("Training complete!")
238 | 
239 |     best_model = models[-1][1]
240 | 
241 |     weight_path = sample.spy_sample_dir / "weights.json"
242 |     print(f"Writing trained weights of best model to {weight_path}")
243 |     param_dict = best_model.serialize()
244 |     open(weight_path, "w").write(json.dumps(param_dict, indent=4))
245 | 
246 |     print(f"Compiling inference shader...")
247 |     sample.compile_inference_shader(best_model)
248 | 
249 |     print(f"Running RTXNS inference...")
250 |     if INTERACTIVE:
251 |         plt.close()
252 |     sample.run_sdk_inference(weight_path)
253 | 
254 | if __name__ == "__main__":
255 |     training_main()
256 | 


--------------------------------------------------------------------------------
/samples/SlangpyTraining/SlangpyTraining.slang:
--------------------------------------------------------------------------------
  1 | // SPDX-License-Identifier: Apache-2.0
  2 | // clang-format off
  3 | 
  4 | /*
  5 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  6 |  *
  7 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  8 |  * and proprietary rights in and to this software, related documentation
  9 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 10 |  * distribution of this software and related documentation without an express
 11 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 12 |  */
 13 | 
 14 | __exported import NeuralModules;
 15 | __exported import Loss;
 16 | __exported import Optimizers;
 17 | 
 18 | struct RNG
 19 | {
 20 |     uint state;
 21 | 
 22 |     __init(uint state) { this.state = state; }
 23 | 
 24 |     [mutating]
 25 |     float next()
 26 |     {
 27 |         float r = (state >> 8) * 0x1p-24;
 28 |         state = state * 2739110765U + 2739110765U;
 29 |         return r;
 30 |     }
 31 | }
 32 | 
 33 | // An example of adding a custom activation to your network
 34 | // This implements the Sigmoid Linear Unit (SiLU)
 35 | struct SiLUActivation<T : __BuiltinFloatingPointType, let K : int> : rtxns::mlp::IActivation<T, K>
 36 | {
 37 |     [Differentiable]
 38 |     CoopVec<T, K> eval(CoopVec<T, K> x)
 39 |     {
 40 |         return x * no_diff CoopVec<T, K>(T(1.)) / (no_diff CoopVec<T, K>(T(1.)) + exp(-x));
 41 |     }
 42 | }
 43 | 
 44 | // Utility function for nearest-neighbor sampling of texture
 45 | T SampleTexture<T : ITexelElement>(Texture2D<T> tex, float2 uv)
 46 | {
 47 |     float2 size;
 48 |     tex.GetDimensions(size[0], size[1]);
 49 |     uint2 xy = uint2(uv * size);
 50 |     return tex[xy];
 51 | }
 52 | 
 53 | // Take one step with the adam optimizer
 54 | void OptimizerStep(
 55 |     RWBuffer<float> moments1,
 56 |     RWBuffer<float> moments2,
 57 |     RWBuffer<float> paramF,
 58 |     RWBuffer<half> paramH,
 59 |     RWBuffer<half> grad,
 60 |     uint idx,
 61 |     float learningRate,
 62 |     float gradScale,
 63 |     int iteration)
 64 | {
 65 |     var optimizer = optimizers::Adam(moments1, moments2, learningRate, gradScale);
 66 | 
 67 |     // Parameters are converted to FP16 for computing gradients,
 68 |     // but we keep the FP32 originals around so we don't accumulate
 69 |     // rounding errors
 70 |     float parameter = paramF[idx];
 71 |     float gradient = (float)grad[idx];
 72 | 
 73 |     parameter = optimizer.step(parameter, idx, gradient, iteration);
 74 | 
 75 |     // Update the reference FP32 parameter, and convert the new value back to FP16
 76 |     paramF[idx] = parameter;
 77 |     paramH[idx] = (half)parameter;
 78 |     // Zero out gradients
 79 |     grad[idx] = 0.0h;
 80 | }
 81 | 
 82 | void TrainTexture<Model : rtxns::IModule<half, 2, 3>, Loss : rtxns::mlp::ILoss<float, 3>>(Model model, inout RNG rng, Texture2D<float4> targetTex, float lossScale)
 83 | {
 84 |     // Get a random uv coordinate for the input
 85 |     float2 inputUV = clamp(float2(rng.next(), rng.next()), 0.0, 1.0);
 86 | 
 87 |     // Sample the target texture at the generated UV
 88 |     float3 targetRGB = SampleTexture(targetTex, inputUV).rgb;
 89 | 
 90 |     // Evaluate the current output of the model
 91 |     float3 predictedRGB = EvalModel(model, inputUV);
 92 | 
 93 |     // Evaluate the loss gradient
 94 |     float3 lossGradient = Loss.deriv(targetRGB, predictedRGB, lossScale);
 95 | 
 96 |     // Backpropragate gradient through network parameters
 97 |     bwd_diff(EvalModel)(model, inputUV, lossGradient);
 98 | }
 99 | 
100 | // Convenience functions for evaluating the model from vector inputs
101 | // Converts to/from CoopVec internally
102 | [Differentiable]
103 | float3 EvalModel<Model: rtxns::IModule<half, 2, 3>>(Model model, no_diff float2 inputUV)
104 | {
105 |     var inputVec = rtxns::CoopVecFromVector<half>(inputUV);
106 | 
107 |     var result = model.forward(inputVec);
108 | 
109 |     return rtxns::VectorFromCoopVec(result);
110 | }
111 | 
112 | // Computes the loss between the predicted RGB at a given UV coordinate and a reference texture
113 | float3 EvalLoss<Loss : rtxns::mlp::ILoss<float, 3>>(float2 inputUV, float3 predictedRGB, Texture2D<float4> targetTex)
114 | {
115 |     float3 targetRGB = SampleTexture(targetTex, inputUV).rgb;
116 | 
117 |     return Loss.value(targetRGB, predictedRGB, 1.0f);
118 | }
119 | 
120 | // Computes the difference between the predicted RGB at a given UV coordinate and a reference texture
121 | // for visualization
122 | float3 TextureDifference(float2 inputUV, float3 predictedRGB, Texture2D<float4> targetTex, float scale)
123 | {
124 |     float3 targetRGB = SampleTexture(targetTex, inputUV).rgb;
125 | 
126 |     return (predictedRGB - targetRGB) * scale + 0.5f;
127 | }
128 | 
129 | // Convenience function to convert from half to float params
130 | float ConvertToFloat(half paramH)
131 | {
132 |     return (float)paramH;
133 | }
134 | 


--------------------------------------------------------------------------------
/samples/SlangpyTraining/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib>=3.0,<4.0
2 | numpy>=2.0,<3.0
3 | slangpy==0.19.4
4 | 


--------------------------------------------------------------------------------
/samples/SlangpyTraining/shaders.cfg:
--------------------------------------------------------------------------------
1 | SlangpyInference.slang -E main_cs -T cs


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | 
10 | set(LIBRARY_FILTER src)
11 | add_subdirectory(NeuralShading)
12 | add_subdirectory(Utils)
13 | add_subdirectory(NeuralShading_Shaders)
14 | 
15 | 


--------------------------------------------------------------------------------
/src/NeuralShading/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | 
10 | 
11 | file(GLOB sources "*.cpp" "*.h")
12 | 
13 | set(project NeuralShading)
14 | set(folder "${LIBRARY_FILTER}/NeuralShading")
15 | 
16 | add_library(${project} STATIC EXCLUDE_FROM_ALL ${sources})
17 | target_include_directories(${project} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
18 | target_link_libraries(${project} donut_app donut_engine)
19 | 
20 | set_target_properties(${project} PROPERTIES 
21 | 	FOLDER ${folder}
22 | )
23 | 


--------------------------------------------------------------------------------
/src/NeuralShading/CoopVector.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | #include "CoopVector.h"
 12 | #include <algorithm>
 13 | 
 14 | #if DONUT_WITH_VULKAN
 15 | #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
 16 | #include <vulkan/vulkan.hpp>
 17 | #endif
 18 | 
 19 | #if DONUT_WITH_DX12
 20 | #include <dxgi1_4.h>
 21 | #include <wrl.h>
 22 | #endif
 23 | 
 24 | using namespace rtxns;
 25 | 
 26 | namespace
 27 | {
 28 | /**
 29 |  * Bytes between a consecutive row or column (if row/column-major layout).
 30 |  * The stride is only used for row/column major layouts
 31 |  **/
 32 | size_t GetStride(const MatrixLayout layout, const uint32_t rows, const uint32_t cols, const size_t precision)
 33 | {
 34 |     size_t stride = 0;
 35 |     if (layout == MatrixLayout::RowMajor)
 36 |     {
 37 |         stride = cols * precision;
 38 |     }
 39 |     else if (layout == MatrixLayout::ColumnMajor)
 40 |     {
 41 |         stride = rows * precision;
 42 |     }
 43 |     return stride;
 44 | }
 45 | } // namespace
 46 | 
 47 | #if DONUT_WITH_VULKAN
 48 | namespace
 49 | {
 50 | 
 51 | VkComponentTypeKHR GetVkComponentType(rtxns::Precision precision)
 52 | {
 53 |     return precision == rtxns::Precision::F16 ? VK_COMPONENT_TYPE_FLOAT16_NV : VK_COMPONENT_TYPE_FLOAT32_NV;
 54 | }
 55 | 
 56 | VkCooperativeVectorMatrixLayoutNV GetVkLayout(const MatrixLayout layout)
 57 | {
 58 |     switch (layout)
 59 |     {
 60 |     case MatrixLayout::RowMajor:
 61 |         return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_ROW_MAJOR_NV;
 62 |     case MatrixLayout::ColumnMajor:
 63 |         return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_COLUMN_MAJOR_NV;
 64 |     case MatrixLayout::InferencingOptimal:
 65 |         return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_INFERENCING_OPTIMAL_NV;
 66 |     case MatrixLayout::TrainingOptimal:
 67 |         return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_TRAINING_OPTIMAL_NV;
 68 |     default:
 69 |         return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_MAX_ENUM_NV;
 70 |     }
 71 | }
 72 | 
 73 | VkConvertCooperativeVectorMatrixInfoNV GetVkConvertLayerDesc(
 74 |     int rows, int columns, Precision precision, MatrixLayout srcLayout, MatrixLayout dstLayout, size_t srcSize, size_t* dstSize, uint64_t srcData = 0, uint64_t dstData = 0)
 75 | {
 76 |     VkConvertCooperativeVectorMatrixInfoNV info{};
 77 |     info.sType = VK_STRUCTURE_TYPE_CONVERT_COOPERATIVE_VECTOR_MATRIX_INFO_NV;
 78 |     info.pNext = nullptr;
 79 |     info.numRows = rows;
 80 |     info.numColumns = columns;
 81 |     info.srcComponentType = GetVkComponentType(precision);
 82 |     info.srcLayout = GetVkLayout(srcLayout);
 83 |     info.srcStride = GetStride(MatrixLayout::RowMajor, rows, columns, GetSize(precision));
 84 |     info.srcSize = srcSize;
 85 |     info.srcData.deviceAddress = srcData;
 86 |     info.dstComponentType = GetVkComponentType(precision);
 87 |     info.dstLayout = GetVkLayout(dstLayout);
 88 |     info.dstStride = GetStride(dstLayout, rows, columns, GetSize(precision));
 89 |     info.pDstSize = dstSize;
 90 |     info.dstData.deviceAddress = dstData;
 91 |     return info;
 92 | }
 93 | 
 94 | } // namespace
 95 | 
 96 | CoopVectorUtils_VK::CoopVectorUtils_VK(VkDevice vkDevice)
 97 | {
 98 |     m_vkDevice = vkDevice;
 99 |     assert(m_vkDevice != VK_NULL_HANDLE && "Failed to get Vulkan device handle from GFX.");
100 | 
101 |     m_vkConvertCooperativeVectorMatrixNV =
102 |         (PFN_vkConvertCooperativeVectorMatrixNV)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkConvertCooperativeVectorMatrixNV");
103 |     assert(m_vkConvertCooperativeVectorMatrixNV != nullptr && "Failed to get Vulkan function 'vkConvertCooperativeVectorMatrixNV'.");
104 | 
105 |     m_vkCmdConvertCooperativeVectorMatrixNV =
106 |         (PFN_vkCmdConvertCooperativeVectorMatrixNV)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkCmdConvertCooperativeVectorMatrixNV");
107 |     assert(m_vkCmdConvertCooperativeVectorMatrixNV != nullptr && "Failed to get Vulkan function 'vkCmdConvertCooperativeVectorMatrixNV'.");
108 | 
109 |     m_vkCmdCopyBuffer = (PFN_vkCmdCopyBuffer)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkCmdCopyBuffer");
110 |     assert(m_vkCmdCopyBuffer != nullptr && "Failed to get Vulkan function 'vkCmdCopyBuffer'.");
111 | 
112 |     m_vkGetBufferDeviceAddress = (PFN_vkGetBufferDeviceAddress)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkGetBufferDeviceAddress");
113 |     assert(m_vkGetBufferDeviceAddress != nullptr && "Failed to get Vulkan function 'vkGetBufferDeviceAddress'.");
114 | }
115 | 
116 | size_t CoopVectorUtils_VK::QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision)
117 | {
118 |     assert(m_vkDevice);
119 |     assert(m_vkConvertCooperativeVectorMatrixNV);
120 |     assert(rows > 0 && rows <= 128 && "Number of rows must be 1..128.");
121 |     assert(cols > 0 && cols <= 128 && "Number of columns must be 1..128.");
122 | 
123 |     size_t requiredSize = 0;
124 | 
125 |     VkConvertCooperativeVectorMatrixInfoNV info = GetVkConvertLayerDesc(rows, cols, precision, MatrixLayout::RowMajor, layout, 0, &requiredSize);
126 | 
127 |     VkResult res = m_vkConvertCooperativeVectorMatrixNV(m_vkDevice, &info);
128 |     assert(res == VK_SUCCESS && "Call to vkConvertCooperativeVectorMatrixNV failed");
129 |     assert(requiredSize > 0 && "Expected matrix size to be larger than zero.");
130 | 
131 |     return requiredSize;
132 | }
133 | 
134 | void CoopVectorUtils_VK::ConvertDeviceMatrixLayout(
135 |     NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const
136 | {
137 |     VkCommandBuffer vkCmdBuf = static_cast<VkCommandBuffer>(commandList);
138 |     VkBuffer vkSrcBuffer = static_cast<VkBuffer>(srcBuffer);
139 |     VkBuffer vkDstBuffer = static_cast<VkBuffer>(dstBuffer);
140 | 
141 |     // Obtain the device addresses of the buffers for the conversion functions
142 |     VkBufferDeviceAddressInfo bufferDeviceAddressInfo{};
143 |     bufferDeviceAddressInfo.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO;
144 |     bufferDeviceAddressInfo.buffer = vkSrcBuffer;
145 |     VkDeviceAddress const srcBufferVA = m_vkGetBufferDeviceAddress(m_vkDevice, &bufferDeviceAddressInfo);
146 |     bufferDeviceAddressInfo.buffer = vkDstBuffer;
147 |     VkDeviceAddress const dstBufferVA = m_vkGetBufferDeviceAddress(m_vkDevice, &bufferDeviceAddressInfo);
148 | 
149 |     // Convert weights
150 |     std::vector<VkConvertCooperativeVectorMatrixInfoNV> convertInfos(srcLayout.networkLayers.size());
151 |     for (int i = 0; i < srcLayout.networkLayers.size(); i++)
152 |     {
153 |         // Weights
154 |         size_t dstLayerSize = dstLayout.networkLayers[i].weightSize;
155 |         convertInfos[i] =
156 |             GetVkConvertLayerDesc(srcLayout.networkLayers[i].outputs, srcLayout.networkLayers[i].inputs, srcLayout.matrixPrecision, srcLayout.matrixLayout, dstLayout.matrixLayout,
157 |                                   srcLayout.networkLayers[i].weightSize, &dstLayerSize, srcBufferVA + srcBufferOffset + srcLayout.networkLayers[i].weightOffset,
158 |                                   dstBufferVA + dstBufferOffset + dstLayout.networkLayers[i].weightOffset);
159 |     }
160 |     m_vkCmdConvertCooperativeVectorMatrixNV(vkCmdBuf, (uint32_t)convertInfos.size(), convertInfos.data());
161 | 
162 |     // Copy the bias
163 |     std::vector<VkBufferCopy> copyRegions(srcLayout.networkLayers.size());
164 |     for (int i = 0; i < srcLayout.networkLayers.size(); i++)
165 |     {
166 |         copyRegions[i].srcOffset = srcBufferOffset + srcLayout.networkLayers[i].biasOffset;
167 |         copyRegions[i].dstOffset = dstBufferOffset + dstLayout.networkLayers[i].biasOffset;
168 |         copyRegions[i].size = srcLayout.networkLayers[i].biasSize;
169 |     }
170 |     m_vkCmdCopyBuffer(vkCmdBuf, vkSrcBuffer, vkDstBuffer, (uint32_t)copyRegions.size(), copyRegions.data());
171 | }
172 | #endif
173 | 
174 | #if DONUT_WITH_DX12
175 | 
176 | namespace
177 | {
178 | D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT GetDX12MatrixLayout(const MatrixLayout layout)
179 | {
180 |     switch (layout)
181 |     {
182 |     case MatrixLayout::RowMajor:
183 |         return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR;
184 |     case MatrixLayout::ColumnMajor:
185 |         return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR;
186 |     case MatrixLayout::InferencingOptimal:
187 |         return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL;
188 |     case MatrixLayout::TrainingOptimal:
189 |         return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL;
190 |     }
191 | }
192 | 
193 | D3D12_LINEAR_ALGEBRA_DATATYPE GetDX12ComponentType(rtxns::Precision precision)
194 | {
195 |     return precision == rtxns::Precision::F16 ? D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 : D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32;
196 | }
197 | 
198 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_DEST_INFO GetDX12ConvertLayerDestInfo(int rows, int columns, MatrixLayout layout, Precision precision)
199 | {
200 |     D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_DEST_INFO info{};
201 |     info.DestLayout = GetDX12MatrixLayout(layout);
202 |     info.NumRows = rows;
203 |     info.NumColumns = columns;
204 |     info.DestStride = UINT(GetStride(layout, rows, columns, GetSize(precision)));
205 |     info.DestDataType = GetDX12ComponentType(precision);
206 |     return info;
207 | }
208 | 
209 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO GetDX12ConvertLayerDesc(
210 |     int rows, int columns, Precision precision, MatrixLayout srcLayout, MatrixLayout dstLayout, size_t srcSize, size_t dstSize, uint64_t srcData, uint64_t dstData)
211 | {
212 |     D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO info{};
213 |     info.DestInfo = GetDX12ConvertLayerDestInfo(rows, columns, dstLayout, precision);
214 |     info.DestInfo.DestSize = UINT(dstSize);
215 |     info.SrcInfo.SrcSize = UINT(srcSize);
216 |     info.SrcInfo.SrcDataType = GetDX12ComponentType(precision);
217 |     info.SrcInfo.SrcLayout = GetDX12MatrixLayout(srcLayout);
218 |     info.SrcInfo.SrcStride = UINT(GetStride(MatrixLayout::RowMajor, rows, columns, GetSize(precision)));
219 |     info.DataDesc.SrcVA = srcData;
220 |     info.DataDesc.DestVA = dstData;
221 |     return info;
222 | }
223 | 
224 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO GetDX12CopyScaleBiasDesc(size_t biasSize, Precision precision, uint64_t srcData, uint64_t dstData)
225 | {
226 |     D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO info{};
227 |     info.DestInfo.DestSize = UINT(biasSize);
228 |     info.DestInfo.DestLayout = D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR;
229 |     info.DestInfo.DestStride = info.DestInfo.DestSize;
230 |     info.DestInfo.NumRows = 1;
231 |     info.DestInfo.NumColumns = UINT(biasSize / GetSize(precision));
232 |     info.DestInfo.DestDataType = GetDX12ComponentType(precision);
233 |     info.SrcInfo.SrcSize = info.DestInfo.DestSize;
234 |     info.SrcInfo.SrcDataType = info.DestInfo.DestDataType;
235 |     info.SrcInfo.SrcLayout = info.DestInfo.DestLayout;
236 |     info.SrcInfo.SrcStride = info.DestInfo.DestStride;
237 |     info.DataDesc.SrcVA = srcData;
238 |     info.DataDesc.DestVA = dstData;
239 |     return info;
240 | }
241 | } // namespace
242 | 
243 | CoopVectorUtils_DX12::CoopVectorUtils_DX12(ID3D12Device* d3d12Device)
244 | {
245 |     m_d3d12Device = d3d12Device;
246 |     assert(m_d3d12Device != nullptr && "Failed to get D3D12 device from GFX.");
247 | }
248 | 
249 | /**
250 |  * Query the size of a matrix in bytes.
251 |  * @return Size of matrix in bytes.
252 |  */
253 | size_t CoopVectorUtils_DX12::QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision /*= Precision::F16*/)
254 | {
255 |     assert(m_d3d12Device);
256 |     assert(rows > 0 && rows <= 128 && "Number of rows must be 1..128.");
257 |     assert(cols > 0 && cols <= 128 && "Number of columns must be 1..128.");
258 | 
259 |     Microsoft::WRL::ComPtr<ID3D12DevicePreview> devicePreview;
260 |     assert(m_d3d12Device->QueryInterface(IID_PPV_ARGS(&devicePreview)) == S_OK && "Failed to get device preview");
261 | 
262 |     D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_DEST_INFO info = GetDX12ConvertLayerDestInfo(rows, cols, layout, precision);
263 | 
264 |     devicePreview->GetLinearAlgebraMatrixConversionDestinationInfo(&info);
265 | 
266 |     assert(info.DestSize > 0 && "Expected matrix size to be larger than zero.");
267 |     return info.DestSize;
268 | }
269 | 
270 | void rtxns::CoopVectorUtils_DX12::ConvertDeviceMatrixLayout(
271 |     NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const
272 | {
273 |     ID3D12GraphicsCommandList* d3dCmdList = static_cast<ID3D12GraphicsCommandList*>(commandList);
274 |     ID3D12Resource* d3dSrcBuffer = static_cast<ID3D12Resource*>(srcBuffer);
275 |     ID3D12Resource* d3dDstBuffer = static_cast<ID3D12Resource*>(dstBuffer);
276 | 
277 |     Microsoft::WRL::ComPtr<ID3D12GraphicsCommandListPreview> commandListPreview;
278 |     assert(d3dCmdList->QueryInterface(IID_PPV_ARGS(&commandListPreview)) == S_OK && "Command list provided does not support matrix conversion");
279 | 
280 |     D3D12_GPU_VIRTUAL_ADDRESS const srcBufferVA = d3dSrcBuffer->GetGPUVirtualAddress();
281 |     D3D12_GPU_VIRTUAL_ADDRESS const dstBufferVA = d3dDstBuffer->GetGPUVirtualAddress();
282 | 
283 |     // We need conversion data for each of the weights and bias separately so we need two entry for each layer
284 |     std::vector<D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO> convertInfos(srcLayout.networkLayers.size() * 2);
285 | 
286 |     // Convert weights
287 |     for (int i = 0; i < srcLayout.networkLayers.size(); i++)
288 |     {
289 |         // Weights
290 |         convertInfos[i] = GetDX12ConvertLayerDesc(srcLayout.networkLayers[i].outputs, srcLayout.networkLayers[i].inputs, srcLayout.matrixPrecision, srcLayout.matrixLayout,
291 |                                                   dstLayout.matrixLayout, srcLayout.networkLayers[i].weightSize, dstLayout.networkLayers[i].weightSize,
292 |                                                   srcBufferVA + srcBufferOffset + srcLayout.networkLayers[i].weightOffset,
293 |                                                   dstBufferVA + dstBufferOffset + dstLayout.networkLayers[i].weightOffset);
294 |     }
295 | 
296 |     // Convert bias
297 |     // D3D's CopyBufferRegion requires resource states incompatible with the conversion ops.
298 |     // Use a degenerate form of a matrix conversion to copy the extra data to avoid placing a barrier.
299 |     int infoOffset = int(srcLayout.networkLayers.size());
300 |     for (int ii = 0; ii < srcLayout.networkLayers.size(); ii++)
301 |     {
302 |         convertInfos[ii + infoOffset] =
303 |             GetDX12CopyScaleBiasDesc(srcLayout.networkLayers[ii].biasSize, srcLayout.matrixPrecision, srcBufferVA + srcBufferOffset + srcLayout.networkLayers[ii].biasOffset,
304 |                                      dstBufferVA + dstBufferOffset + dstLayout.networkLayers[ii].biasOffset);
305 |     }
306 |     commandListPreview->ConvertLinearAlgebraMatrix(convertInfos.data(), UINT(convertInfos.size()));
307 | }
308 | #endif
309 | 


--------------------------------------------------------------------------------
/src/NeuralShading/CoopVector.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | #pragma once
 12 | 
 13 | #if DONUT_WITH_DX12
 14 | #include "../../external/dx12-agility-sdk/build/native/include/d3d12.h"
 15 | #endif
 16 | 
 17 | #include <vector>
 18 | #include <donut/app/DeviceManager.h>
 19 | 
 20 | 
 21 | #include "Float16.h"
 22 | #include "NeuralNetworkTypes.h"
 23 | 
 24 | namespace rtxns
 25 | {
 26 | 
 27 | class ICoopVectorUtils
 28 | {
 29 | public:
 30 |     size_t GetMatrixAlignment()
 31 |     {
 32 |         return s_matrixAlignment;
 33 |     }
 34 |     size_t GetVectorAlignment()
 35 |     {
 36 |         return s_vectorAlignment;
 37 |     }
 38 | 
 39 |     /**
 40 |      * Query the size of a matrix in bytes.
 41 |      * @return Size of matrix in bytes.
 42 |      */
 43 |     virtual size_t QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision = Precision::F16) = 0;
 44 | 
 45 |     /**
 46 |      * Convert matrix on the device between any layouts.
 47 |      * The Precision must currently be the same.
 48 |      * @return Size of matrix in bytes.
 49 |      */
 50 |     virtual void ConvertDeviceMatrixLayout(NetworkLayout const& srcLayout,
 51 |                                            NetworkLayout const& dstLayout,
 52 |                                            void* srcBuffer,
 53 |                                            uint64_t srcBufferOffset,
 54 |                                            void* dstBuffer,
 55 |                                            uint64_t dstBufferOffset,
 56 |                                            void* commandList) const = 0;
 57 | 
 58 | protected:
 59 |     static const size_t s_matrixAlignment = 64; ///< Minimum byte alignment according to spec.
 60 |     static const size_t s_vectorAlignment = 16; ///< Minimum byte alignment according to spec.
 61 | };
 62 | 
 63 | #if DONUT_WITH_VULKAN
 64 | class CoopVectorUtils_VK : public ICoopVectorUtils
 65 | {
 66 | public:
 67 |     CoopVectorUtils_VK(VkDevice vkDevice);
 68 | 
 69 |     /**
 70 |      * Query the size of a matrix in bytes.
 71 |      * @return Size of matrix in bytes.
 72 |      */
 73 |     size_t QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision = Precision::F16);
 74 | 
 75 |     /**
 76 |      * Convert matrix on the device between any layouts.
 77 |      * The Precision must currently be the same.
 78 |      * @return Size of matrix in bytes.
 79 |      */
 80 |     void ConvertDeviceMatrixLayout(
 81 |         NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const;
 82 | 
 83 | private:
 84 |     VkDevice m_vkDevice = nullptr;
 85 |     PFN_vkConvertCooperativeVectorMatrixNV m_vkConvertCooperativeVectorMatrixNV = nullptr;
 86 |     PFN_vkCmdConvertCooperativeVectorMatrixNV m_vkCmdConvertCooperativeVectorMatrixNV = nullptr;
 87 |     PFN_vkCmdCopyBuffer m_vkCmdCopyBuffer = nullptr;
 88 |     PFN_vkGetBufferDeviceAddress m_vkGetBufferDeviceAddress = nullptr;
 89 | };
 90 | #endif
 91 | 
 92 | #if DONUT_WITH_DX12
 93 | class CoopVectorUtils_DX12 : public ICoopVectorUtils
 94 | {
 95 | public:
 96 |     CoopVectorUtils_DX12(ID3D12Device* d3d12Device);
 97 | 
 98 |     /**
 99 |      * Query the size of a matrix in bytes.
100 |      * @return Size of matrix in bytes.
101 |      */
102 |     size_t QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision = Precision::F16);
103 | 
104 |     /**
105 |      * Convert matrix on the device between any layouts.
106 |      * The Precision must currently be the same.
107 |      * @return Size of matrix in bytes.
108 |      */
109 |     void ConvertDeviceMatrixLayout(
110 |         NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const;
111 | 
112 | private:
113 |     ID3D12Device* m_d3d12Device = nullptr;
114 | };
115 | #endif
116 | } // namespace rtxns


--------------------------------------------------------------------------------
/src/NeuralShading/Float16.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | /**
 12 |  * Most of this code is derived from the GLM library at https://github.com/g-truc/glm
 13 |  *
 14 |  * License: https://github.com/g-truc/glm/blob/master/copying.txt
 15 |  */
 16 | 
 17 | #include "Float16.h"
 18 | 
 19 | namespace rtxns
 20 | {
 21 | 
 22 | static float overflow()
 23 | {
 24 |     volatile float f = 1e10;
 25 |     for (int i = 0; i < 10; ++i)
 26 |     {
 27 |         f *= f; // this will overflow before the for loop terminates
 28 |     }
 29 |     return f;
 30 | }
 31 | 
 32 | union uif32
 33 | {
 34 |     float f;
 35 |     unsigned int i;
 36 | };
 37 | 
 38 | uint16_t float32ToFloat16(float value)
 39 | {
 40 |     uif32 entry;
 41 |     entry.f = value;
 42 |     int i = static_cast<int>(entry.i);
 43 | 
 44 |     //
 45 |     // Our floating point number, f, is represented by the bit
 46 |     // pattern in integer i.  Disassemble that bit pattern into
 47 |     // the sign, s, the exponent, e, and the significand, m.
 48 |     // Shift s into the position where it will go in the
 49 |     // resulting half number.
 50 |     // Adjust e, accounting for the different exponent bias
 51 |     // of float and half (127 versus 15).
 52 |     //
 53 | 
 54 |     int s = (i >> 16) & 0x00008000;
 55 |     int e = ((i >> 23) & 0x000000ff) - (127 - 15);
 56 |     int m = i & 0x007fffff;
 57 | 
 58 |     //
 59 |     // Now reassemble s, e and m into a half:
 60 |     //
 61 | 
 62 |     if (e <= 0)
 63 |     {
 64 |         if (e < -10)
 65 |         {
 66 |             //
 67 |             // E is less than -10.  The absolute value of f is
 68 |             // less than half_MIN (f may be a small normalized
 69 |             // float, a denormalized float or a zero).
 70 |             //
 71 |             // We convert f to a half zero.
 72 |             //
 73 | 
 74 |             return uint16_t(s);
 75 |         }
 76 | 
 77 |         //
 78 |         // E is between -10 and 0.  F is a normalized float,
 79 |         // whose magnitude is less than __half_NRM_MIN.
 80 |         //
 81 |         // We convert f to a denormalized half.
 82 |         //
 83 | 
 84 |         m = (m | 0x00800000) >> (1 - e);
 85 | 
 86 |         //
 87 |         // Round to nearest, round "0.5" up.
 88 |         //
 89 |         // Rounding may cause the significand to overflow and make
 90 |         // our number normalized.  Because of the way a half's bits
 91 |         // are laid out, we don't have to treat this case separately;
 92 |         // the code below will handle it correctly.
 93 |         //
 94 | 
 95 |         if (m & 0x00001000)
 96 |         {
 97 |             m += 0x00002000;
 98 |         }
 99 | 
100 |         //
101 |         // Assemble the half from s, e (zero) and m.
102 |         //
103 | 
104 |         return uint16_t(s | (m >> 13));
105 |     }
106 |     else if (e == 0xff - (127 - 15))
107 |     {
108 |         if (m == 0)
109 |         {
110 |             //
111 |             // F is an infinity; convert f to a half
112 |             // infinity with the same sign as f.
113 |             //
114 | 
115 |             return uint16_t(s | 0x7c00);
116 |         }
117 |         else
118 |         {
119 |             //
120 |             // F is a NAN; we produce a half NAN that preserves
121 |             // the sign bit and the 10 leftmost bits of the
122 |             // significand of f, with one exception: If the 10
123 |             // leftmost bits are all zero, the NAN would turn
124 |             // into an infinity, so we have to set at least one
125 |             // bit in the significand.
126 |             //
127 | 
128 |             m >>= 13;
129 | 
130 |             return uint16_t(s | 0x7c00 | m | (m == 0));
131 |         }
132 |     }
133 |     else
134 |     {
135 |         //
136 |         // E is greater than zero.  F is a normalized float.
137 |         // We try to convert f to a normalized half.
138 |         //
139 | 
140 |         //
141 |         // Round to nearest, round "0.5" up
142 |         //
143 | 
144 |         if (m & 0x00001000)
145 |         {
146 |             m += 0x00002000;
147 | 
148 |             if (m & 0x00800000)
149 |             {
150 |                 m = 0; // overflow in significand,
151 |                 e += 1; // adjust exponent
152 |             }
153 |         }
154 | 
155 |         //
156 |         // Handle exponent overflow
157 |         //
158 | 
159 |         if (e > 30)
160 |         {
161 |             overflow(); // Cause a hardware floating point overflow;
162 | 
163 |             return uint16_t(s | 0x7c00); // Return infinity with same sign as f.
164 |         }
165 | 
166 |         //
167 |         // Assemble the half from s, e and m.
168 |         //
169 | 
170 |         return uint16_t(s | (e << 10) | (m >> 13));
171 |     }
172 | }
173 | 
174 | float float16ToFloat32(uint16_t value)
175 | {
176 |     int s = (value >> 15) & 0x00000001;
177 |     int e = (value >> 10) & 0x0000001f;
178 |     int m = value & 0x000003ff;
179 | 
180 |     if (e == 0)
181 |     {
182 |         if (m == 0)
183 |         {
184 |             //
185 |             // Plus or minus zero
186 |             //
187 | 
188 |             uif32 result;
189 |             result.i = static_cast<unsigned int>(s << 31);
190 |             return result.f;
191 |         }
192 |         else
193 |         {
194 |             //
195 |             // Denormalized number -- renormalize it
196 |             //
197 | 
198 |             while (!(m & 0x00000400))
199 |             {
200 |                 m <<= 1;
201 |                 e -= 1;
202 |             }
203 | 
204 |             e += 1;
205 |             m &= ~0x00000400;
206 |         }
207 |     }
208 |     else if (e == 31)
209 |     {
210 |         if (m == 0)
211 |         {
212 |             //
213 |             // Positive or negative infinity
214 |             //
215 | 
216 |             uif32 result;
217 |             result.i = static_cast<unsigned int>((s << 31) | 0x7f800000);
218 |             return result.f;
219 |         }
220 |         else
221 |         {
222 |             //
223 |             // Nan -- preserve sign and significand bits
224 |             //
225 | 
226 |             uif32 result;
227 |             result.i = static_cast<unsigned int>((s << 31) | 0x7f800000 | (m << 13));
228 |             return result.f;
229 |         }
230 |     }
231 | 
232 |     //
233 |     // Normalized number
234 |     //
235 | 
236 |     e = e + (127 - 15);
237 |     m = m << 13;
238 | 
239 |     //
240 |     // Assemble s, e and m.
241 |     //
242 | 
243 |     uif32 result;
244 |     result.i = static_cast<unsigned int>((s << 31) | (e << 23) | m);
245 |     return result.f;
246 | }
247 | 
248 | } // namespace rtxns
249 | 


--------------------------------------------------------------------------------
/src/NeuralShading/Float16.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #pragma once
12 | 
13 | #include <cstdint>
14 | #include <limits>
15 | 
16 | namespace rtxns
17 | {
18 | 
19 | uint16_t float32ToFloat16(float value);
20 | float float16ToFloat32(uint16_t value);
21 | 
22 | } // namespace rtxns


--------------------------------------------------------------------------------
/src/NeuralShading/GraphicsResources.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | #pragma once
 12 | 
 13 | #if DONUT_WITH_DX12
 14 | #include "../../external/dx12-agility-sdk/build/native/include/d3d12.h"
 15 | #include <dxgi1_4.h>
 16 | #include <wrl/client.h>
 17 | #endif
 18 | 
 19 | #if DONUT_WITH_VULKAN
 20 | #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
 21 | #include <vulkan/vulkan.hpp>
 22 | #endif
 23 | 
 24 | #include "GraphicsResources.h"
 25 | #include <donut/app/DeviceManager.h>
 26 | #include <donut/core/log.h>
 27 | 
 28 | namespace rtxns
 29 | {
 30 | 
 31 | GraphicsResources::GraphicsResources(nvrhi::DeviceHandle device)
 32 | {
 33 | #if DONUT_WITH_VULKAN
 34 |     if (device->getGraphicsAPI() == nvrhi::GraphicsAPI::VULKAN)
 35 |     {
 36 |         VkInstance vkInstance = device->getNativeObject(nvrhi::ObjectTypes::VK_Instance);
 37 |         VkPhysicalDevice vkPhysicalDevice = device->getNativeObject(nvrhi::ObjectTypes::VK_PhysicalDevice);
 38 | 
 39 |         m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV = (PFN_vkGetPhysicalDeviceCooperativeVectorPropertiesNV)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetInstanceProcAddr(
 40 |             vkInstance, "vkGetPhysicalDeviceCooperativeVectorPropertiesNV");
 41 |         assert(m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV != nullptr && "Failed to get Vulkan function 'vkGetPhysicalDeviceCooperativeVectorPropertiesNV'.");
 42 | 
 43 |         // Get the property count
 44 |         uint32_t propertyCount = 0;
 45 |         if (m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV(vkPhysicalDevice, &propertyCount, nullptr) != VK_SUCCESS)
 46 |         {
 47 |             return;
 48 |         }
 49 | 
 50 |         // If we vkGetPhysicalDeviceCooperativeVectorPropertiesNV returns we have inference and training support
 51 |         m_coopVectorFeatures.inferenceSupported = true;
 52 |         m_coopVectorFeatures.trainingSupported = true;
 53 | 
 54 |         std::vector<VkCooperativeVectorPropertiesNV> properties(propertyCount);
 55 |         // Init the sType fields
 56 |         for (auto& property : properties)
 57 |         {
 58 |             property.sType = VK_STRUCTURE_TYPE_COOPERATIVE_VECTOR_PROPERTIES_NV;
 59 |         }
 60 | 
 61 |         // Get the actual properties
 62 |         if (m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV(vkPhysicalDevice, &propertyCount, properties.data()) != VK_SUCCESS)
 63 |         {
 64 |             return;
 65 |         }
 66 | 
 67 |         for (const auto& property : properties)
 68 |         {
 69 |             if (property.sType == VK_STRUCTURE_TYPE_COOPERATIVE_VECTOR_PROPERTIES_NV && property.inputType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
 70 |                 property.inputInterpretation == VK_COMPONENT_TYPE_FLOAT16_KHR && property.matrixInterpretation == VK_COMPONENT_TYPE_FLOAT16_KHR &&
 71 |                 property.resultType == VK_COMPONENT_TYPE_FLOAT16_KHR)
 72 |             {
 73 |                 m_coopVectorFeatures.fp16InferencingSupported = true;
 74 |                 m_coopVectorFeatures.fp16TrainingSupported = true;
 75 |             }
 76 |         }
 77 |     }
 78 | #endif
 79 | 
 80 | #if DONUT_WITH_DX12
 81 |     if (device->getGraphicsAPI() == nvrhi::GraphicsAPI::D3D12)
 82 |     {
 83 |         ID3D12Device* d3d12Device = device->getNativeObject(nvrhi::ObjectTypes::D3D12_Device);
 84 | 
 85 |         // Check experimental features are enabled
 86 |         D3D12_FEATURE_DATA_D3D12_OPTIONS_EXPERIMENTAL experimentalOptions{};
 87 |         auto hr = d3d12Device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS_EXPERIMENTAL, &experimentalOptions, sizeof(experimentalOptions));
 88 |         if (hr != S_OK)
 89 |         {
 90 |             donut::log::error("Coop vector is not supported.");
 91 |             return;
 92 |         }
 93 | 
 94 |         // Mute preview shader model (6.9) validation warning.
 95 |         Microsoft::WRL::ComPtr<ID3D12InfoQueue> infoQueue;
 96 |         if (d3d12Device->QueryInterface(IID_PPV_ARGS(&infoQueue)) == S_OK)
 97 |         {
 98 |             D3D12_MESSAGE_ID denyIds[] = { D3D12_MESSAGE_ID_NON_RETAIL_SHADER_MODEL_WONT_VALIDATE };
 99 | 
100 |             D3D12_INFO_QUEUE_FILTER filter = {};
101 |             filter.DenyList.NumIDs = _countof(denyIds);
102 |             filter.DenyList.pIDList = denyIds;
103 | 
104 |             infoQueue->AddStorageFilterEntries(&filter);
105 |         }
106 | 
107 |         // Check coop vector is supported
108 |         if (experimentalOptions.CooperativeVectorTier >= D3D12_COOPERATIVE_VECTOR_TIER_1_0)
109 |         {
110 |             m_coopVectorFeatures.inferenceSupported = true;
111 |         }
112 |         else
113 |         {
114 |             return;
115 |         }
116 |         if (experimentalOptions.CooperativeVectorTier >= D3D12_COOPERATIVE_VECTOR_TIER_1_1)
117 |         {
118 |             m_coopVectorFeatures.trainingSupported = true;
119 |         }
120 | 
121 |         // Get supported coop vector formats
122 |         D3D12_FEATURE_DATA_COOPERATIVE_VECTOR coopVecData{};
123 |         hr = d3d12Device->CheckFeatureSupport(D3D12_FEATURE_COOPERATIVE_VECTOR, &coopVecData, sizeof(coopVecData));
124 |         if (hr != S_OK)
125 |         {
126 |             return;
127 |         }
128 | 
129 |         std::vector<D3D12_COOPERATIVE_VECTOR_PROPERTIES_MUL> mulProperties(coopVecData.MatrixVectorMulAddPropCount);
130 |         std::vector<D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE> outerProductProperties;
131 |         std::vector<D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE> vectorAccumlateProperties;
132 | 
133 |         coopVecData.pMatrixVectorMulAddProperties = mulProperties.data();
134 | 
135 |         if (experimentalOptions.CooperativeVectorTier >= D3D12_COOPERATIVE_VECTOR_TIER_1_1)
136 |         {
137 |             outerProductProperties.resize(coopVecData.OuterProductAccumulatePropCount);
138 |             coopVecData.pOuterProductAccumulateProperties = outerProductProperties.data();
139 |             vectorAccumlateProperties.resize(coopVecData.VectorAccumulatePropCount);
140 |             coopVecData.pVectorAccumulateProperties = vectorAccumlateProperties.data();
141 |         }
142 |         else
143 |         {
144 |             coopVecData.OuterProductAccumulatePropCount = 0;
145 |             coopVecData.VectorAccumulatePropCount = 0;
146 |         }
147 | 
148 |         if (d3d12Device->CheckFeatureSupport(D3D12_FEATURE_COOPERATIVE_VECTOR, &coopVecData, sizeof(coopVecData)) != S_OK)
149 |         {
150 |             return;
151 |         }
152 | 
153 |         for (const auto& properties : mulProperties)
154 |         {
155 |             if (properties.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 &&
156 |                 properties.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.OutputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16)
157 |             {
158 |                 m_coopVectorFeatures.fp16InferencingSupported = true;
159 |             }
160 |         }
161 | 
162 |         bool opSupported = false;
163 |         for (const auto& properties : outerProductProperties)
164 |         {
165 |             if (properties.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16)
166 |             {
167 |                 opSupported = true;
168 |             }
169 |         }
170 | 
171 |         bool vaSupported = false;
172 |         for (const auto& properties : vectorAccumlateProperties)
173 |         {
174 |             if (properties.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16)
175 |             {
176 |                 vaSupported = true;
177 |             }
178 |         }
179 |         m_coopVectorFeatures.fp16TrainingSupported = opSupported && vaSupported;
180 |     }
181 | #endif
182 | }
183 | 
184 | GraphicsResources::~GraphicsResources()
185 | {
186 | }
187 | 
188 | } // namespace rtxns
189 | 


--------------------------------------------------------------------------------
/src/NeuralShading/GraphicsResources.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #pragma once
12 | 
13 | #include <nvrhi/nvrhi.h>
14 | 
15 | namespace rtxns
16 | {
17 | 
18 | struct CoopVectorFeatures
19 | {
20 |     bool inferenceSupported = false;
21 |     bool trainingSupported = false;
22 |     bool fp16InferencingSupported = false;
23 |     bool fp16TrainingSupported = false;
24 | };
25 | 
26 | class GraphicsResources
27 | {
28 | public:
29 |     GraphicsResources(nvrhi::DeviceHandle device);
30 |     ~GraphicsResources();
31 |     CoopVectorFeatures GetCoopVectorFeatures() const
32 |     {
33 |         return m_coopVectorFeatures;
34 |     }
35 | 
36 | private:
37 |     CoopVectorFeatures m_coopVectorFeatures;
38 | #if DONUT_WITH_VULKAN
39 |     PFN_vkGetPhysicalDeviceCooperativeVectorPropertiesNV m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV = nullptr;
40 | #endif
41 | };
42 | } // namespace rtxns
43 | 


--------------------------------------------------------------------------------
/src/NeuralShading/NeuralNetwork.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | #pragma once
 12 | 
 13 | #include "CoopVector.h"
 14 | #include <vector>
 15 | #include <filesystem>
 16 | #include <nvrhi/utils.h>
 17 | 
 18 | #include "NeuralNetworkTypes.h"
 19 | 
 20 | namespace rtxns
 21 | {
 22 | 
 23 | class NetworkUtilities
 24 | {
 25 | public:
 26 |     NetworkUtilities(nvrhi::DeviceHandle device);
 27 |     ~NetworkUtilities()
 28 |     {
 29 |     }
 30 | 
 31 |     bool ValidateNetworkArchitecture(NetworkArchitecture const& netArch);
 32 | 
 33 |     // Create host side network layout.
 34 |     NetworkLayout CreateHostNetworkLayout(NetworkArchitecture const& netArch);
 35 | 
 36 |     // Set the weights and bias size / offsets for each layer in the network.
 37 |     void SetNetworkLayerSizes(NetworkLayout& layout);
 38 | 
 39 |     // Returns a updated network layout where the weights and bias size / offsets have been update
 40 |     // for the new matrix layout
 41 |     // Can be device optimal matrix layout
 42 |     NetworkLayout GetNewMatrixLayout(NetworkLayout const& srcLayout, MatrixLayout newMatrixLayout);
 43 | 
 44 |     // Converts weights and bias buffers from src layout to the dst layout.
 45 |     // Both buffers must be device side.
 46 |     // Both networks must be of the same network layout, only differing in MatrixLayout
 47 |     void ConvertWeights(NetworkLayout const& srcLayout,
 48 |                         NetworkLayout const& dstLayout,
 49 |                         nvrhi::BufferHandle srcBuffer,
 50 |                         uint64_t srcBufferOffset,
 51 |                         nvrhi::BufferHandle dstBuffer,
 52 |                         uint64_t dstBufferOffset,
 53 |                         nvrhi::DeviceHandle device,
 54 |                         nvrhi::CommandListHandle commandList);
 55 | 
 56 | private:
 57 |     std::unique_ptr<ICoopVectorUtils> m_coopVecUtils;
 58 | };
 59 | 
 60 | // Represent a host side neural network.
 61 | // Stores the network layout and parameters.
 62 | // Functionality to initialize a network to starting values or load from file.
 63 | // Also write parameters back to file
 64 | class HostNetwork
 65 | {
 66 | public:
 67 |     HostNetwork(std::shared_ptr<NetworkUtilities> networkUtils);
 68 |     ~HostNetwork(){};
 69 | 
 70 |     // Create host side network from provided architecture with initial values.
 71 |     bool Initialise(const NetworkArchitecture& netArch);
 72 | 
 73 |     // Create host side network of provided architecture and initial values from a json file.
 74 |     bool InitialiseFromJson(donut::vfs::IFileSystem& fs, const std::string& fileName);
 75 |     // Create host side network of provided architecture and initial values from a file.
 76 |     bool InitialiseFromFile(const std::string& fileName);
 77 |     // Create host side network from an existing network.
 78 |     bool InitialiseFromNetwork(HostNetwork const& network);
 79 |     // Write the current network and parameters to file.
 80 |     bool WriteToFile(const std::string& fileName);
 81 |     // Convert device layout to host layout and update the host side parameters.
 82 |     void UpdateFromBufferToFile(nvrhi::BufferHandle hostLayoutBuffer,
 83 |                                 nvrhi::BufferHandle deviceLayoutBuffer,
 84 |                                 NetworkLayout const& hostLayout,
 85 |                                 NetworkLayout const& deviceLayout,
 86 |                                 const std::string& fileName,
 87 |                                 nvrhi::DeviceHandle device,
 88 |                                 nvrhi::CommandListHandle commandList);
 89 | 
 90 |     const NetworkArchitecture& GetNetworkArchitecture() const
 91 |     {
 92 |         return m_networkArchitecture;
 93 |     }
 94 | 
 95 |     const std::vector<uint8_t>& GetNetworkParams() const
 96 |     {
 97 |         return m_networkParams;
 98 |     }
 99 | 
100 |     const NetworkLayout& GetNetworkLayout() const
101 |     {
102 |         return m_networkLayout;
103 |     }
104 | 
105 | private:
106 |     std::shared_ptr<NetworkUtilities> m_networkUtils;
107 |     NetworkArchitecture m_networkArchitecture;
108 |     std::vector<uint8_t> m_networkParams;
109 |     NetworkLayout m_networkLayout;
110 | };
111 | }; // namespace rtxns


--------------------------------------------------------------------------------
/src/NeuralShading/NeuralNetworkTypes.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #pragma once
12 | #include <cstdint>
13 | 
14 | namespace rtxns
15 | {
16 | 
17 | enum class MatrixLayout
18 | {
19 |     RowMajor,
20 |     ColumnMajor,
21 |     InferencingOptimal,
22 |     TrainingOptimal,
23 | };
24 | 
25 | enum class Precision
26 | {
27 |     F16,
28 |     F32
29 | };
30 | 
31 | struct NetworkArchitecture
32 | {
33 |     uint32_t numHiddenLayers = 0;
34 |     uint32_t inputNeurons = 0;
35 |     uint32_t hiddenNeurons = 0;
36 |     uint32_t outputNeurons = 0;
37 |     Precision weightPrecision = Precision::F16;
38 |     Precision biasPrecision = Precision::F16;
39 | };
40 | 
41 | struct NetworkLayer
42 | {
43 |     uint32_t inputs = 0; ///< Columns in the weight matrix.
44 |     uint32_t outputs = 0; ///< Rows in the weight matrix.
45 |     size_t weightSize = 0; ///< Size of the weight matrix in bytes.
46 |     size_t biasSize = 0; ///< Size of the bias vector in bytes.
47 |     uint32_t weightOffset = 0; ///< Offset to the weights in bytes.
48 |     uint32_t biasOffset = 0; ///< Offset to the biases in bytes.
49 | };
50 | 
51 | struct NetworkLayout
52 | {
53 |     MatrixLayout matrixLayout = MatrixLayout::RowMajor;
54 |     Precision matrixPrecision = Precision::F16;
55 |     size_t networkSize = 0;
56 |     std::vector<NetworkLayer> networkLayers;
57 | };
58 | 
59 | constexpr size_t GetSize(Precision precision)
60 | {
61 |     switch (precision)
62 |     {
63 |     case Precision::F16:
64 |         return sizeof(uint16_t); // 2 bytes
65 |     case Precision::F32:
66 |         return sizeof(float);
67 |     default:
68 |         return 0; // Should not get here
69 |     }
70 | }
71 | 
72 | } // namespace rtxns


--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/Activation.slang:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | import CooperativeVectorAutoDiff;
 12 | import CooperativeVectorFunctions;
 13 | 
 14 | namespace rtxns
 15 | {
 16 | namespace mlp
 17 | {
 18 |     ////////////////////////
 19 |     //
 20 |     // Activation function interface and implementation for several activation functions
 21 |     // for using with classes in MLP module
 22 |     //
 23 |     ////////////////////////
 24 | 
 25 |     // Base interface for activation functions
 26 |     interface IActivation<T : __BuiltinFloatingPointType, let K : int>
 27 |     {
 28 |         [Differentiable]
 29 |         CoopVec<T, K> eval(CoopVec<T, K> x);
 30 |     };
 31 | 
 32 |     // None activation function
 33 |     struct NoneAct<T : __BuiltinFloatingPointType, let K : int> : IActivation<T, K>
 34 |     {
 35 |         [Differentiable]
 36 |         CoopVec<T, K> eval(CoopVec<T, K> x)
 37 |         {
 38 |             return x;
 39 |         }
 40 |     };
 41 | 
 42 |     // Linear activation function
 43 |     struct LinearAct<T : __BuiltinFloatingPointType, let K : int> : IActivation<T, K>
 44 |     {
 45 |         T a;
 46 | 
 47 |         __init(T a)
 48 |         {
 49 |             this.a = a;
 50 |         }
 51 | 
 52 |         [Differentiable]
 53 |         CoopVec<T, K> eval(CoopVec<T, K> x)
 54 |         {
 55 |             return no_diff CoopVec<T, K>(a) * x;
 56 |         }
 57 |     };
 58 | 
 59 |     // Exponential activation function
 60 |     struct ExponentialAct<T : __BuiltinFloatingPointType, let K : int> : IActivation<T, K>
 61 |     {
 62 |         [Differentiable]
 63 |         CoopVec<T, K> eval(CoopVec<T, K> x)
 64 |         {
 65 |             // Exponent is builtin function.
 66 |             return exp(x); 
 67 |         }
 68 |     };
 69 | 
 70 |     // Shifted exponential activation function
 71 |     struct ShiftedExponentialAct<T : __BuiltinFloatingPointType, let K : int> : IActivation<T, K>
 72 |     {
 73 |         [Differentiable]
 74 |         CoopVec<T, K> eval(CoopVec<T, K> x)
 75 |         {
 76 |             return exp(x) - no_diff CoopVec<T, K>(T(1.));
 77 |         }
 78 |     };
 79 | 
 80 |     // ReLU activation function
 81 |     struct ReLUAct<T : __BuiltinFloatingPointType, let K : int> : IActivation<T, K>
 82 |     {
 83 |         [Differentiable]
 84 |         CoopVec<T, K> eval(CoopVec<T, K> x)
 85 |         {
 86 |             return relu(x); 
 87 |         }
 88 |     };
 89 | 
 90 |     // Leaky ReLU activation function
 91 |     struct LeakyReLUAct<T : __BuiltinFloatingPointType, let K : int> : IActivation<T, K>
 92 |     {
 93 |         T a;
 94 |         
 95 |         __init(T a) 
 96 |         { 
 97 |             this.a = a; 
 98 |         }
 99 | 
100 |         [Differentiable]
101 |         CoopVec<T, K> eval(CoopVec<T, K> x)
102 |         {
103 |             return leakyReLU(x, a);
104 |         }
105 |     };
106 | 
107 |     // Sigmoid activation function
108 |     struct SigmoidAct<T : __BuiltinFloatingPointType, let K : int> : IActivation<T, K>
109 |     {
110 |         [Differentiable]
111 |         CoopVec<T, K> eval(CoopVec<T, K> x)
112 |         {
113 |             // Sigmoid function calculation. Compiler will infer the derivative automatically (autodiff)
114 |             return sigmoid(x);
115 |         }
116 |     };
117 | 
118 |     // Swish activation function
119 |     struct SwishAct<T : __BuiltinFloatingPointType, let K : int> : IActivation<T, K>
120 |     {
121 |         [Differentiable]
122 |         CoopVec<T, K> eval(CoopVec<T, K> x)
123 |         {
124 |             return x / (no_diff CoopVec<T, K>(T(1.)) + exp(no_diff CoopVec<T, K>(T(-1.)) * x));
125 |         }
126 |     };
127 | 
128 |     // Tanh activation function
129 |     struct TanhAct<T : __BuiltinFloatingPointType, let K : int> : IActivation<T, K>
130 |     {
131 |         [Differentiable]
132 |         CoopVec<T, K> eval(CoopVec<T, K> x)
133 |         {
134 |             var c1 = no_diff CoopVec<T, K>(T(1.));
135 |             return no_diff CoopVec<T, K>(T(2.)) / (c1 + exp(no_diff CoopVec<T, K>(T(-2.)) * x)) - c1;
136 |         }
137 |     };
138 | }
139 | }
140 | 


--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | 
10 | 
11 | set(project NeuralShading_Shaders)
12 | set(folder "${LIBRARY_FILTER}/NeuralShading_Shaders")
13 | 
14 | file(GLOB shaders "*.slang")
15 | 
16 | set_source_files_properties(${shaders} PROPERTIES VS_TOOL_OVERRIDE "None") 
17 | add_custom_target(${project}
18 | 	DEPENDS ShaderMake
19 |     SOURCES ${shaders})
20 | 	set_target_properties(${project} PROPERTIES 
21 | 	FOLDER ${folder}
22 | )
23 | 
24 | set(SAMPLES_SHADER_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR} CACHE PATH "" FORCE)


--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/CooperativeVectorAutoDiff.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | import CooperativeVectorFunctions;
12 | import CooperativeVectorDerivatives;
13 | 
14 | // Implementation to extend CoopVec to make it automatically differentiable (autodiff)
15 | 
16 | namespace rtxns
17 | {
18 | 
19 | // Extension for builtin type CoopVec<T, K> to make it automatically differentiable (autodiff)
20 | extension<T : __BuiltinFloatingPointType, let K : int> CoopVec<T, K> : IDifferentiable
21 | {
22 |     typealias Differential = CoopVec<T, K>;
23 | };
24 | 
25 | typealias HCoopVec<let K : int> = CoopVec<half, K>;
26 | 
27 | ////////////////////////
28 | //
29 | // Additional functions and their derivatives for using in actvation functions
30 | // To support Slang autodiff, for each function its derivative should be defined
31 | //
32 | ////////////////////////
33 | 
34 | // exp is builtin function, so we just need to define derivative for autodiff support
35 | [BackwardDerivativeOf(exp)]
36 | void exp_BackwardAutoDiff<T : __BuiltinFloatingPointType, let K : int>(inout DifferentialPair<CoopVec<T, K>> p0, CoopVec<T, K>.Differential dResult)
37 | {
38 |     p0 = diffPair(p0.p, dResult * exp(p0.p));
39 | }
40 | 
41 | // Relu backward derivative
42 | [BackwardDerivativeOf(relu)]
43 | void relu_BackwardAutoDiff<T : __BuiltinFloatingPointType, let K : int>(inout DifferentialPair<CoopVec<T, K>> p0, CoopVec<T, K>.Differential dResult)
44 | {
45 |     CoopVec<T, K> d;
46 | 
47 |     [ForceUnroll]
48 |     for (int i = 0; i < K; ++i)
49 |     {
50 |         d[i] = p0.p[i] > T(0.) ? dResult[i] : T(0.);
51 |     }
52 | 
53 |     p0 = diffPair(p0.p, d);
54 | }
55 | 
56 | // LeakyRelu backward derivative
57 | [BackwardDerivativeOf(leakyReLU)]
58 | void leakyRelu_BackwardAutoDiff<T : __BuiltinFloatingPointType, let K : int>(inout DifferentialPair<CoopVec<T, K>> p0, T a, CoopVec<T, K>.Differential dResult)
59 | {
60 |     p0 = diffPair(p0.p, leakyReLU_Derivative(p0.p, a, dResult));
61 | }
62 | 
63 | // Sigmoid backward derivative
64 | [BackwardDerivativeOf(sigmoid)]
65 | void sigmoid_BackwardAutoDiff<T : __BuiltinFloatingPointType, let K : int>(inout DifferentialPair<CoopVec<T, K>> p0, CoopVec<T, K>.Differential dResult)
66 | {
67 |     p0 = diffPair(p0.p, sigmoid_Derivative(p0.p, dResult));
68 | }
69 | 
70 | }


--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/CooperativeVectorDerivatives.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | import CooperativeVectorFunctions;
12 | 
13 | // Derivatives of the functions in the CooperativeVectorFunction module
14 | 
15 | namespace rtxns
16 | {
17 | 
18 | // Derivative of leaky relu
19 | CoopVec<T, K> leakyReLU_Derivative<T : __BuiltinFloatingPointType, let K : int>(CoopVec<T, K> p, T a, CoopVec<T, K> dResult)
20 | {
21 |     [ForceUnroll]
22 |     for (int i = 0; i < K; ++i)
23 |     {
24 |         p[i] = p[i] > T(0.) ? dResult[i] : a * dResult[i];
25 |     }
26 |     return p;
27 | }
28 | 
29 | // Derivative of sigmoid 
30 | CoopVec<T, K> sigmoid_Derivative<T : __BuiltinFloatingPointType, let K : int>(CoopVec<T, K> p, CoopVec<T, K> dResult)
31 | {
32 |     var sigmoidOut = sigmoid(p);
33 |     return dResult * sigmoidOut * (CoopVec<T, K>(T(1.)) - sigmoidOut);
34 | }
35 | 
36 | }


--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/CooperativeVectorFunctions.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | // Additional functions for use with the CoopVec type
12 | 
13 | namespace rtxns
14 | {
15 | 
16 | // Relu function implementation
17 | CoopVec<T, K> relu<T : __BuiltinFloatingPointType, let K : int>(CoopVec<T, K> v)
18 | {
19 |     return max(v, CoopVec<T, K>(T(0.)));
20 | }
21 | 
22 | // Leaky relu function implementation
23 | CoopVec<T, K> leakyReLU<T : __BuiltinFloatingPointType, let K : int>(CoopVec<T, K> p, no_diff T a)
24 | {
25 |     [ForceUnroll]
26 |     for (int i = 0; i < K; ++i)
27 |     {
28 |         p[i] = p[i] < T(0.) ? a * p[i] : p[i];
29 |     }
30 |     return p;
31 | }
32 | 
33 | // Sigmoid function implementation
34 | CoopVec<T, K> sigmoid<T : __BuiltinFloatingPointType, let K : int>(CoopVec<T, K> v)
35 | {
36 |     var c1 = CoopVec<T, K>(T(1.));
37 |     return c1 / (c1 + exp(CoopVec<T, K>(T(-1.)) * v));
38 | }
39 | 
40 | 
41 | 
42 | }


--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/LinearOps.slang:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | import CooperativeVectorAutoDiff;
 12 | import CooperativeVectorFunctions;
 13 | 
 14 | namespace rtxns
 15 | {
 16 |     // Linear (without activation function) forward step of MLP using Cooperative Vector extension functions
 17 |     // Weights matrix and biases vector are stored in a byteaddress buffer at offsets matrixOffset and biasOffset
 18 |     CoopVec<T, M> LinearOp<T : __BuiltinFloatingPointType, let M : int, let K : int>( 
 19 |         CoopVec<T, K> ip, 
 20 |         ByteAddressBuffer matrixBiasBuffer, 
 21 |         uint matrixOffset, 
 22 |         uint biasOffset, 
 23 |         constexpr CoopVecMatrixLayout matrixLayout, 
 24 |         constexpr CoopVecComponentType componentType)
 25 |     {
 26 |         return coopVecMatMulAdd<T, M>(
 27 |             ip, 
 28 |             componentType, 
 29 |             matrixBiasBuffer, 
 30 |             matrixOffset, 
 31 |             componentType, 
 32 |             matrixBiasBuffer, 
 33 |             biasOffset, 
 34 |             componentType, 
 35 |             matrixLayout, 
 36 |             false, 
 37 |             0
 38 |         );
 39 |     }
 40 | 
 41 |     // One linear backward step of MLP using Cooperative Vector extension functions
 42 |     // Weights matrix and biases vector are stored in byteaddress buffer at offsets matrixOffset and biasOffset
 43 |     // Derivates of weights matrix and derivatives of biases vector are stored in read write byteaddress buffer at offsets matrixOffset and biasOffset
 44 |     CoopVec<T, K> LinearOp_Backward<T : __BuiltinFloatingPointType, let M : int, let K : int>(
 45 |         CoopVec<T, K> ip, 
 46 |         CoopVec<T, M> grad, 
 47 |         ByteAddressBuffer matrixBiasBuffer, 
 48 |         RWByteAddressBuffer matrixBiasBufferDerivative, 
 49 |         uint matrixOffset, 
 50 |         uint biasOffset, 
 51 |         constexpr CoopVecMatrixLayout matrixLayout, 
 52 |         constexpr CoopVecComponentType componentType)
 53 |     {
 54 |         coopVecOuterProductAccumulate(grad, ip, matrixBiasBufferDerivative, matrixOffset, 0, matrixLayout, componentType);
 55 |         coopVecReduceSumAccumulate(grad, matrixBiasBufferDerivative, biasOffset);
 56 | 
 57 |         return coopVecMatMul<T, K>(grad, componentType, matrixBiasBuffer, matrixOffset, componentType, matrixLayout, true, 0);
 58 |     }
 59 | }
 60 | 
 61 | namespace rtxns
 62 | {
 63 | namespace mlp
 64 | {
 65 |     // Structure to store derivatives of weights matrix and derivatives of biases vector
 66 |     // Extends IDifferentiablePtrType interface to support autodiff
 67 |     struct MatrixBiasBufferDifferential : IDifferentiablePtrType
 68 |     {
 69 |         typealias Differential = MatrixBiasBufferDifferential;
 70 | 
 71 |         __init(RWByteAddressBuffer buf) 
 72 |         { 
 73 |             buffer = buf;
 74 |         }
 75 | 
 76 |         RWByteAddressBuffer buffer;
 77 |     };
 78 | 
 79 |     // Structure to store weights matrix and biases vector
 80 |     // Extends IDifferentiablePtrType interface to support autodiff
 81 |     struct MatrixBiasBuffer : IDifferentiablePtrType
 82 |     {
 83 |         typealias Differential = MatrixBiasBufferDifferential;
 84 | 
 85 |         __init(ByteAddressBuffer buf) 
 86 |         { 
 87 |             buffer = buf;
 88 |         }
 89 | 
 90 |         ByteAddressBuffer buffer;
 91 |     };
 92 | 
 93 |     // Linear forward step of MLP using MatrixBiasBuffer structure to store weights and biases
 94 |     CoopVec<T, M> LinearOp<T : __BuiltinFloatingPointType, let M : int, let K : int>( 
 95 |         CoopVec<T, K> ip, 
 96 |         MatrixBiasBuffer matrixBiasBuffer, 
 97 |         uint2 offsets,
 98 |         constexpr CoopVecMatrixLayout matrixLayout, 
 99 |         constexpr CoopVecComponentType componentType)
100 |     {
101 |         return LinearOp<T, M, K>(
102 |             ip, 
103 |             matrixBiasBuffer.buffer, 
104 |             offsets[0], 
105 |             offsets[1], 
106 |             matrixLayout, 
107 |             componentType
108 |         );
109 |     }
110 | 
111 |     // Linear backward step of MLP using MatrixBiasBuffer and MatrixBiasBufferDifferential
112 |     [BackwardDerivativeOf(LinearOp)]
113 |     void LinearOp_BackwardAutoDiff<T : __BuiltinFloatingPointType, let M : int, let K : int>( 
114 |         inout DifferentialPair<CoopVec<T, K>> ip, 
115 |         DifferentialPtrPair<MatrixBiasBuffer> matrixBiasBuffer, 
116 |         uint2 offsets,
117 |         constexpr CoopVecMatrixLayout matrixLayout, 
118 |         constexpr CoopVecComponentType componentType, CoopVec<T, M>.Differential grad)
119 |     {
120 |         ip = diffPair(
121 |             ip.p, 
122 |             LinearOp_Backward<T, M, K>(
123 |                 ip.p, 
124 |                 grad, 
125 |                 matrixBiasBuffer.p.buffer, 
126 |                 matrixBiasBuffer.d.buffer, 
127 |                 offsets[0], 
128 |                 offsets[1], 
129 |                 matrixLayout, 
130 |                 componentType
131 |             )
132 |         );
133 |     }
134 | }
135 | }


--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/Loss.slang:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | import CooperativeVectorAutoDiff;
 12 | import CooperativeVectorFunctions;
 13 | 
 14 | namespace rtxns
 15 | {
 16 | namespace mlp
 17 | {
 18 |     ////////////////////////
 19 |     //
 20 |     // Loss function interface and implementation for several loss functions
 21 |     // for using with classes in MLP module
 22 |     //
 23 |     ////////////////////////
 24 | 
 25 |     // Base interface for activation functions
 26 |     interface ILoss<T : __BuiltinFloatingPointType, let K : int>
 27 |     {
 28 |         static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale);
 29 |         static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale);
 30 |     };
 31 | 
 32 |     // L1
 33 |     struct L1<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
 34 |     {
 35 |         static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
 36 |         {
 37 |             return scale * abs(predicted - target);
 38 |         }
 39 | 
 40 |         static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
 41 |         {
 42 |             return copysign(scale, predicted - target);
 43 |         }
 44 |     };
 45 | 
 46 |     // Relative L1
 47 |     struct L1Relative<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
 48 |     {
 49 |         static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
 50 |         {
 51 |             return scale * abs(predicted - target) / (abs(predicted) + vector<T, K>(0.01f));
 52 |         }
 53 | 
 54 |         static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
 55 |         {
 56 |             return copysign(scale, predicted - target) / (abs(predicted) + vector<T, K>(0.01f));
 57 |         }
 58 |     };
 59 | 
 60 |     // Mean absolute percentage error (MAPE)
 61 |     struct MAPE<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
 62 |     {
 63 |         static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
 64 |         {
 65 |             return scale * abs(predicted - target) / (abs(target) + vector<T, K>(0.01f));
 66 |         }
 67 | 
 68 |         static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
 69 |         {
 70 |             return copysign(scale, predicted - target) / (abs(target) + vector<T, K>(0.01f));
 71 |         }
 72 |     };
 73 | 
 74 |     // Symmetric mean absolute percentage error (SMAPE)
 75 |     struct SMAPE<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
 76 |     {
 77 |         static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
 78 |         {
 79 |             return scale * abs(predicted - target) / ((abs(target) + abs(predicted)) * T(0.5) + vector<T, K>(0.01f));
 80 |         }
 81 | 
 82 |         static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
 83 |         {
 84 |             return copysign(scale, predicted - target) / ((abs(target) + abs(predicted))*T(0.5) + vector<T, K>(0.01f));
 85 |         }
 86 |     };
 87 | 
 88 |     // L2 
 89 |     struct L2<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
 90 |     {
 91 |         static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
 92 |         {
 93 |             return scale * (predicted - target) * (predicted - target);
 94 |         }
 95 | 
 96 |         static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
 97 |         {
 98 |             return T(2) * scale * (predicted - target);
 99 |         }
100 |     };
101 | 
102 |     // Relative L2
103 |     struct L2Relative<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
104 |     {
105 |         static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
106 |         {
107 |             return scale * (predicted - target) * (predicted - target) / (predicted * predicted + vector<T, K>(0.01f));
108 |         }
109 | 
110 |         static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
111 |         {
112 |             return T(2) * scale * (predicted - target) / (predicted * predicted + vector<T, K>(0.01f));
113 |         }
114 |     };
115 | }
116 | }
117 | 


--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/MLP.slang:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | import CooperativeVectorAutoDiff;
 12 | import CooperativeVectorFunctions;
 13 | import LinearOps;
 14 | import Activation;
 15 | 
 16 | namespace rtxns
 17 | {
 18 | namespace mlp
 19 | {
 20 |     // Structure to store MLP layers. Implements full forward step for inference
 21 |     // MLP is defined by number of hidden layers and number of inputs, outputs and elements in hidden layers
 22 |     struct InferenceMLP<
 23 |         T : __BuiltinFloatingPointType, 
 24 |         let HIDDEN_LAYERS : int, 
 25 |         let INPUTS : int, 
 26 |         let HIDDEN : int, 
 27 |         let OUTPUTS : int, 
 28 |         let matrixLayout : CoopVecMatrixLayout, 
 29 |         let componentType : CoopVecComponentType
 30 |     >
 31 |     {
 32 |         // Initialized from buffer with weights and biases and two vectors of offsets
 33 |         __init(ByteAddressBuffer buf, uint matrixOffset[HIDDEN_LAYERS+1], uint biasOffset[HIDDEN_LAYERS+1]) 
 34 |         {
 35 |             parameters = MatrixBiasBuffer(buf);
 36 | 
 37 |             [ForceUnroll]
 38 |             for (int i = 0; i <= HIDDEN_LAYERS; ++i)
 39 |                 layerOffsets[i] = uint2(matrixOffset[i], biasOffset[i]);
 40 |         }
 41 | 
 42 |         // Full MLP forward step using one activation function for input and hidden layers and another for output
 43 |         // Returns MLP output
 44 |         CoopVec<T, OUTPUTS> forward<Act : IActivation<T, HIDDEN>, FinalAct : IActivation<T, OUTPUTS>>(CoopVec<T, INPUTS> inputParams, Act act, FinalAct finalAct)
 45 |         {
 46 |             var params = act.eval(LinearOp<T, HIDDEN, INPUTS>(inputParams, parameters, layerOffsets[0], matrixLayout, componentType));
 47 |             
 48 |             [ForceUnroll]
 49 |             for(int i = 1; i < HIDDEN_LAYERS; ++i)
 50 |                 params = act.eval(LinearOp<T, HIDDEN, HIDDEN>(params, parameters, layerOffsets[i], matrixLayout, componentType));
 51 | 
 52 |             return finalAct.eval(LinearOp<T, OUTPUTS, HIDDEN>(params, parameters, layerOffsets[HIDDEN_LAYERS], matrixLayout, componentType));
 53 |         }
 54 | 
 55 |         MatrixBiasBuffer parameters;
 56 |         uint2 layerOffsets[HIDDEN_LAYERS+1];
 57 |     }
 58 | 
 59 |     // Structure to store MLP layers and derivatives. Implements full forward step and backward steps
 60 |     // MLP is defined by number of hidden layers and number of inputs, outputs and elements in hidden layers
 61 |     struct TrainingMLP<
 62 |         T : __BuiltinFloatingPointType, 
 63 |         let HIDDEN_LAYERS : int, 
 64 |         let INPUTS : int, 
 65 |         let HIDDEN : int, 
 66 |         let OUTPUTS : int, 
 67 |         let matrixLayout : CoopVecMatrixLayout, 
 68 |         let componentType : CoopVecComponentType
 69 |     >
 70 |     {
 71 |         // Initialized from buffer with weights and biases, buffer to store derivatives and two vectors of offsets
 72 |         __init(
 73 |             ByteAddressBuffer matrixBuffer, 
 74 |             RWByteAddressBuffer derivativeBuffer, 
 75 |             uint matrixOffset[HIDDEN_LAYERS+1], 
 76 |             uint biasOffset[HIDDEN_LAYERS+1]
 77 |         ) 
 78 |         {
 79 |             parameters = MatrixBiasBuffer(matrixBuffer);
 80 |             derivatives = MatrixBiasBufferDifferential(derivativeBuffer);
 81 |             
 82 |             [ForceUnroll]
 83 |             for (int i = 0; i <= HIDDEN_LAYERS; ++i)
 84 |                 layerOffsets[i] = uint2(matrixOffset[i], biasOffset[i]);
 85 |         }
 86 | 
 87 |         // Full MLP forward step using one activation function for input and hidden layers and another for output
 88 |         // Implemented as static function to support autodiff
 89 |         // Input parameter inputParams is no_diff to skip derivative calculation for inputs as we interested in weights and biases derivatives only
 90 |         // Returns MLP output
 91 |         [Differentiable]
 92 |         static CoopVec<T, OUTPUTS> forward_s<Act : IActivation<T, HIDDEN>, FinalAct : IActivation<T, OUTPUTS>>(
 93 |             CoopVec<T, INPUTS> inputParams, 
 94 |             MatrixBiasBuffer parameters,
 95 |             uint2 layerOffsets[HIDDEN_LAYERS + 1],
 96 |             no_diff Act act,
 97 |             no_diff FinalAct finalAct
 98 |         )
 99 |         {
100 |             var params = act.eval(LinearOp<T, HIDDEN, INPUTS>(inputParams, parameters, layerOffsets[0], matrixLayout, componentType));
101 |             
102 |             [ForceUnroll]
103 |             for(int i = 1; i < HIDDEN_LAYERS; ++i)
104 |                 params = act.eval(LinearOp<T, HIDDEN, HIDDEN>(params, parameters, layerOffsets[i], matrixLayout, componentType));
105 | 
106 |             return finalAct.eval(LinearOp<T, OUTPUTS, HIDDEN>(params, parameters, layerOffsets[HIDDEN_LAYERS], matrixLayout, componentType));
107 |         }
108 | 
109 |         // Forward step member function
110 |         CoopVec<T, OUTPUTS> forward<Act : IActivation<T, HIDDEN>, FinalAct : IActivation<T, OUTPUTS>>(CoopVec<T, INPUTS> inputParams, Act act, FinalAct finalAct)
111 |         {
112 |             return forward_s<Act, FinalAct>(inputParams, parameters, layerOffsets, act, finalAct);
113 |         }
114 | 
115 |         // Full MLP backward step calculation is infered automatically by Slang autodiff from forward function
116 |         void backward<Act : IActivation<T, HIDDEN>, FinalAct : IActivation<T, OUTPUTS>>(DifferentialPair<CoopVec<T, INPUTS>> dInputParams, Act act, FinalAct finalAct, CoopVec<T, OUTPUTS> loss)
117 |         {
118 |             bwd_diff(forward_s<Act, FinalAct>)( dInputParams, DifferentialPtrPair<MatrixBiasBuffer>(parameters, derivatives), layerOffsets, act, finalAct, loss);
119 |         }
120 |         void backward<Act : IActivation<T, HIDDEN>, FinalAct : IActivation<T, OUTPUTS>>(CoopVec<T, INPUTS> inputParams, Act act, FinalAct finalAct, CoopVec<T, OUTPUTS> loss)
121 |         {
122 |             var dInputParams = diffPair(inputParams);
123 |             backward(dInputParams, act, finalAct, loss);
124 |         }
125 | 
126 |         MatrixBiasBuffer parameters;
127 |         MatrixBiasBufferDifferential derivatives;
128 |         uint2 layerOffsets[HIDDEN_LAYERS+1];
129 |     }
130 | }
131 | }


--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/Optimizers.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #define ADAM_BETA1                               0.9f
12 | #define ADAM_BETA2                               0.999f
13 | #define ADAM_EPSILON                             1E-8f
14 | 
15 | namespace optimizers
16 | {
17 |     // Common interface for optimizers
18 |     interface IOptimizer
19 |     {
20 |         float step(float weightBias, uint parameterID, float gradient, const float currentStep);
21 |     };
22 | 
23 |     // Adam optimizer 
24 |     struct Adam : IOptimizer
25 |     {
26 |         RWBuffer<float> m_moments1;
27 |         RWBuffer<float> m_moments2;
28 |         float m_learningRate;
29 |         float m_lossScale;
30 |         float m_beta1;
31 |         float m_beta2;
32 |         float m_epsilon;
33 | 
34 |         // Initializes from two moments buffers and optimizations parameters
35 |         __init(
36 |             RWBuffer<float> moments1, 
37 |             RWBuffer<float> moments2,
38 |             float learningRate, 
39 |             float lossScale,
40 |             float beta1 = ADAM_BETA1,
41 |             float beta2 = ADAM_BETA2,
42 |             float epsilon = ADAM_EPSILON)
43 |         {
44 |             m_moments1 = moments1;
45 |             m_moments2 = moments2;
46 |             m_learningRate = learningRate;
47 |             m_lossScale = lossScale;
48 |             m_beta1 = beta1;
49 |             m_beta2 = beta2;
50 |             m_epsilon = epsilon;
51 |         }
52 | 
53 |         // Optimization step for one MLP parameter
54 |         float step(in float weightBias, uint parameterID, float gradient, const float currentStep)
55 |         {
56 |             gradient /= m_lossScale;
57 |             float gradient_sq = gradient * gradient;
58 |             float moment1 = m_moments1[parameterID] * m_beta1 + gradient * (1 - m_beta1);
59 |             float moment2 = m_moments2[parameterID] * m_beta2 + gradient_sq * (1 - m_beta2);
60 | 
61 |             float bias_correction1 = 1 - pow(m_beta1, (float) currentStep);
62 |             float bias_correction2 = 1 - pow(m_beta2, (float) currentStep);
63 | 
64 |             float denom = sqrt(moment2) * rsqrt(bias_correction2) + m_epsilon;
65 |             float step_size = m_learningRate / bias_correction1;
66 | 
67 |             float adjustedWeightbias = weightBias - (moment1 / denom) * step_size;
68 | 
69 |             m_moments1[parameterID] = moment1;
70 |             m_moments2[parameterID] = moment2;
71 |             return adjustedWeightbias;
72 |         }
73 |     };
74 | };


--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/PCG32.slang:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | /*
12 |  * The PCG random number generator was developed by Melissa O'Neill
13 |  * <oneill@pcg-random.org>
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License");
16 |  * you may not use this file except in compliance with the License.
17 |  * You may obtain a copy of the License at
18 |  *
19 |  *     http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software
22 |  * distributed under the License is distributed on an "AS IS" BASIS,
23 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24 |  * See the License for the specific language governing permissions and
25 |  * limitations under the License.
26 |  *
27 |  * For additional information about the PCG random number generation scheme,
28 |  * including its license and other licensing options, visit
29 |  *
30 |  *     http://www.pcg-random.org
31 |  */
32 | 
33 | #pragma once
34 | 
35 | struct PCG32
36 | {
37 |     uint64_t state, inc;
38 | 
39 |     // Initialize from initial state value and stream index
40 | 	__init(uint64_t init_state, uint64_t init_seq) 
41 |     { 
42 | 		state = 0;
43 | 		inc = (init_seq << 1u) | 1u;
44 | 		nextUInt();
45 | 		state += init_state;
46 | 		nextUInt();
47 |     }
48 | 
49 |     // Generates random 32 bit unsigned integer
50 |     [mutating]
51 |     uint nextUInt()
52 |     {
53 | 		uint64_t oldstate = state;
54 | 		state = oldstate * 0x5851f42d4c957f2dULL + inc;
55 | 		uint xorshifted = (uint) (((oldstate >> 18u) ^ oldstate) >> 27u);
56 | 		uint rot = (uint) (oldstate >> 59u);
57 | 		return (xorshifted >> rot) | (xorshifted << ((~rot + 1u) & 31));
58 |     }
59 | 
60 | 	// Generates random float
61 |     [mutating]
62 | 	float nextFloat() 
63 |     {
64 | 		uint x = (nextUInt() >> 9) | 0x3f800000u;
65 | 		return reinterpret<float>(x) - 1.0f;
66 | 	}
67 | }
68 | 


--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/Utils.slang:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | import CooperativeVectorAutoDiff;
 12 | 
 13 | namespace rtxns
 14 | {
 15 |     ////////////////////////
 16 |     //
 17 |     // Encoders for input parameters
 18 |     // Using encoding of input parameters often significantly improves MLP convergence speed and results quality
 19 |     //
 20 |     ////////////////////////
 21 | 
 22 |     // Frequency encoding expands input count 
 23 |     static const int FREQUENCY_ENCODING_COUNT = 6;  
 24 | 
 25 |     // Encode every input as sequence sin(pi*x), cos(pi*x), sin(2*pi*x), cos(2*pi*x), sin(4*pi*x), cos(4*pi*x), ...
 26 |     // The output parameter count must equal 6 * PARAMS_COUNT
 27 |     CoopVec<T, PARAMS_COUNT * FREQUENCY_ENCODING_COUNT> EncodeFrequency<T : __BuiltinFloatingPointType, let PARAMS_COUNT : int>(float parameters[PARAMS_COUNT])
 28 |     {
 29 |         var output = CoopVec<T, PARAMS_COUNT * FREQUENCY_ENCODING_COUNT>(T(1.));
 30 | 
 31 |         [ForceUnroll] 
 32 |         for (int i = 0; i < PARAMS_COUNT; ++i)
 33 |         {
 34 |             float sn, cn;
 35 |             sincos(parameters[i] * 3.14159265358979323846, sn, cn);
 36 | 
 37 |             int i1 = i * FREQUENCY_ENCODING_COUNT;
 38 |             output[i1 + 0] = T(sn);
 39 |             output[i1 + 1] = T(cn);
 40 |             output[i1 + 2] = T(2.) * output[i1 + 0] * output[i1 + 1];
 41 |             output[i1 + 3] = T(2.) * output[i1 + 1] * output[i1 + 1] - T(1.);
 42 |             output[i1 + 4] = T(2.) * output[i1 + 2] * output[i1 + 3];
 43 |             output[i1 + 5] = T(2.) * output[i1 + 3] * output[i1 + 3] - T(1.);
 44 |         }
 45 | 
 46 |         return output;
 47 |     }
 48 | 
 49 |     // Generic version that allows arbitrary number of scales
 50 |     [Differentiable]
 51 |     CoopVec<T, NUM_SCALES * PARAMS_COUNT * 2> EncodeFrequencyN<T : __BuiltinFloatingPointType, let PARAMS_COUNT : int, let NUM_SCALES : int>(CoopVec<T, PARAMS_COUNT> inputParams)
 52 |     {
 53 |         var inputVec = VectorFromCoopVec(inputParams);
 54 |         float[NUM_SCALES * PARAMS_COUNT * 2] outputVec;
 55 | 
 56 |         [ForceUnroll]
 57 |         for (int i = 0; i < PARAMS_COUNT; ++i)
 58 |         {
 59 |             const int base = i * NUM_SCALES * 2;
 60 | 
 61 |             float sn, cn;
 62 |             sincos(inputVec[i] * 3.141592653589793238f, sn, cn);
 63 |             outputVec[base + 0] = sn;
 64 |             outputVec[base + 1] = cn;
 65 |             [ForceUnroll]
 66 |             for (int j = 1; j < NUM_SCALES; ++j) {
 67 |                 sn = 2.0f * sn * cn;
 68 |                 cn = 2.0f * cn * cn - 1.0f;
 69 |                 outputVec[base + j * 2 + 0] = sn;
 70 |                 outputVec[base + j * 2 + 1] = cn;
 71 |             }
 72 |         }
 73 | 
 74 |         return CoopVecFromArray<T, NUM_SCALES * PARAMS_COUNT * 2>(outputVec);
 75 |     }
 76 | 
 77 |     // Triangle encoding expands input count 
 78 |     static const int TRIANGLE_ENCODING_COUNT = 6;  
 79 | 
 80 |     // Similar to frequence encoder, but use triangle waves to improve performance
 81 |     // The output parameter count must equal 6 * PARAMS_COUNT
 82 |     CoopVec<T, PARAMS_COUNT * TRIANGLE_ENCODING_COUNT> EncodeTriangle<T : __BuiltinFloatingPointType, let PARAMS_COUNT : int>(float parameters[PARAMS_COUNT])
 83 |     {
 84 |         var output = CoopVec<T, PARAMS_COUNT * TRIANGLE_ENCODING_COUNT>(T(1.));
 85 | 
 86 |         [ForceUnroll] 
 87 |         for (int i = 0; i < PARAMS_COUNT; ++i)
 88 |         {
 89 |             int i1 = i * TRIANGLE_ENCODING_COUNT;
 90 |             float p = parameters[i];
 91 |             float s = 0.5f, k = 0.f;
 92 |             [ForceUnroll] for (int j = 0; j < TRIANGLE_ENCODING_COUNT; ++j)
 93 |             {
 94 |                 float r = p * s + k;
 95 |                 output[i1 + j] = T(abs(r - floor(r) - 0.5f) * 4.f - 1.f);
 96 |                 s *= 2.f;
 97 |                 k += 0.25f;
 98 |             }
 99 |         }
100 | 
101 |         return output;
102 |     }
103 | 
104 |     ////////////////////////
105 |     //
106 |     // Packing functions 
107 |     //
108 |     ////////////////////////
109 | 
110 |     // Unpacks a uint4 array to a uint array 
111 |     // Used for storing weight and bias buffer offsets 
112 |     uint[NUM_UNPACKED] UnpackArray<let NUM_PACKED4 : int, let NUM_UNPACKED : int>(uint4 ps[NUM_PACKED4])
113 |     {
114 |         uint res[NUM_UNPACKED];
115 |         
116 |         [ForceUnroll]
117 |         for(int i = 0; i < NUM_UNPACKED; ++i)
118 |             res[i] = ps[i/4][i%4];
119 |         
120 |         return res;
121 |     }
122 | 
123 |     ////////////////////////
124 |     //
125 |     // Conversion functions 
126 |     //
127 |     ////////////////////////
128 | 
129 |     // Construct a CoopVec from a float array
130 |     [BackwardDerivative(CoopVecFromArray_Backward)]
131 |     CoopVec<T, PARAMS_COUNT> CoopVecFromArray<T : __BuiltinFloatingPointType, let PARAMS_COUNT : int>(float parameters[PARAMS_COUNT])
132 |     {
133 |         var output = CoopVec<T, PARAMS_COUNT>(T(0.));
134 |         
135 |         [ForceUnroll] 
136 |         for(int i = 0; i < PARAMS_COUNT; i++)
137 |         {
138 |             output[i] = T(parameters[i]);
139 |         }
140 |         return output;
141 |     }
142 |     
143 |     void CoopVecFromArray_Backward<T : __BuiltinFloatingPointType, let PARAMS_COUNT : int>(inout DifferentialPair<float[PARAMS_COUNT]> parameters, CoopVec<T, PARAMS_COUNT> grad)
144 |     {
145 |         float gradArray[PARAMS_COUNT];
146 | 
147 |         [ForceUnroll]
148 |         for (int i = 0; i < PARAMS_COUNT; ++i)
149 |         {
150 |             gradArray[i] = grad[i].toFloat();
151 |         }
152 |         parameters = diffPair(parameters.p, gradArray);
153 |     }
154 | 
155 |     // Construct a CoopVec from a float vector of up to 4 parameters
156 |     [BackwardDerivative(CoopVecFromVector_Backward)]
157 |     CoopVec<T, PARAMS_COUNT> CoopVecFromVector<T : __BuiltinFloatingPointType, let PARAMS_COUNT : int>(vector<float, PARAMS_COUNT> parameters)
158 |     {
159 |         var output = CoopVec<T, PARAMS_COUNT>(T(0.));
160 |         
161 |         [ForceUnroll] 
162 |         for(int i = 0; i < PARAMS_COUNT; i++)
163 |         {
164 |             output[i] = T(parameters[i]);
165 |         }
166 |         return output;
167 |     }
168 |     
169 |     void CoopVecFromVector_Backward<T : __BuiltinFloatingPointType, let PARAMS_COUNT : int>(inout DifferentialPair<vector<float, PARAMS_COUNT>> parameters, CoopVec<T, PARAMS_COUNT> grad)
170 |     {
171 |         vector<float, PARAMS_COUNT> gradVec;
172 | 
173 |         [ForceUnroll]
174 |         for (int i = 0; i < PARAMS_COUNT; ++i)
175 |         {
176 |             gradVec[i] = grad[i].toFloat();
177 |         }
178 |         parameters = diffPair(parameters.p, gradVec);
179 |     }
180 | 
181 |     // Convert a CoopVec to a float vector
182 |     [BackwardDerivative(VectorFromCoopVec_Backward)]
183 |     vector<float, PARAMS_COUNT> VectorFromCoopVec<T : __BuiltinFloatingPointType, let PARAMS_COUNT : int>(CoopVec<T, PARAMS_COUNT> parameters)
184 |     {
185 |         vector<float, PARAMS_COUNT> result;
186 | 
187 |         [ForceUnroll]
188 |         for (int i = 0; i < PARAMS_COUNT; ++i)
189 |         {
190 |             result[i] = parameters[i].toFloat();
191 |         }
192 |         return result;
193 |     }
194 |     void VectorFromCoopVec_Backward<T : __BuiltinFloatingPointType, let PARAMS_COUNT : int>(inout DifferentialPair<CoopVec<T, PARAMS_COUNT>> parameters, vector<float, PARAMS_COUNT> grad)
195 |     {
196 |         CoopVec<T, PARAMS_COUNT> gradParams;
197 | 
198 |         [ForceUnroll]
199 |         for (int i = 0; i < PARAMS_COUNT; ++i)
200 |         {
201 |             gradParams[i] = T(grad[i]);
202 |         }
203 | 
204 |         parameters = diffPair(parameters.p, gradParams);
205 |     }
206 | }
207 | 


--------------------------------------------------------------------------------
/src/Utils/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | 
10 | 
11 | file(GLOB sources "*.cpp" "*.h")
12 | 
13 | set(project Utils)
14 | set(folder "${LIBRARY_FILTER}/Utils")
15 | 
16 | add_library(${project} STATIC EXCLUDE_FROM_ALL ${sources})
17 | target_include_directories(${project} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
18 | target_link_libraries(${project} donut_app donut_engine NeuralShading)
19 | 
20 | 
21 | if (DONUT_WITH_DX12)
22 |     add_dependencies(${project} dx12-agility-sdk)
23 |     target_compile_definitions(
24 | 		${project} PRIVATE DONUT_D3D_AGILITY_SDK_VERSION=${DONUT_D3D_AGILITY_SDK_VERSION}
25 | 		${project} PRIVATE DONUT_D3D_AGILITY_PREVIEW_SDK_VERSION=${DONUT_D3D_AGILITY_PREVIEW_SDK_VERSION}
26 | 	)
27 | endif()
28 | 
29 | set_target_properties(${project} PROPERTIES 
30 | 	FOLDER ${folder}
31 | )
32 | 


--------------------------------------------------------------------------------
/src/Utils/DeviceUtils.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
  5 |  * and proprietary rights in and to this software, related documentation
  6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
  7 |  * distribution of this software and related documentation without an express
  8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  9 |  */
 10 | 
 11 | #if DONUT_WITH_DX12
 12 | #include "../../../external/dx12-agility-sdk/build/native/include/d3d12.h"
 13 | #endif
 14 | 
 15 | #include "DeviceUtils.h"
 16 | 
 17 | #include <donut/app/DeviceManager.h>
 18 | #include <donut/core/log.h>
 19 | 
 20 | #if DONUT_WITH_DX12
 21 | extern "C"
 22 | {
 23 |     _declspec(dllexport) extern const uint32_t D3D12SDKVersion = DONUT_D3D_AGILITY_PREVIEW_SDK_VERSION;
 24 |     _declspec(dllexport) extern const char* D3D12SDKPath = ".\\d3d12\\";
 25 | }
 26 | 
 27 | static bool g_dx12DeveloperModeEnabled = false;
 28 | #endif
 29 | 
 30 | #if DONUT_WITH_VULKAN
 31 | static bool g_vulkanStorageBuffer16BitAccess = false;
 32 | static bool g_vulkanFloat16Supported = false;
 33 | #endif
 34 | 
 35 | // For the purposes of the SDK, when using Vulkan VK_NV_COOPERATIVE_VECTOR_EXTENSION_NAME is set as a required device extension.
 36 | void SetCoopVectorExtensionParameters(donut::app::DeviceCreationParameters& deviceParams, nvrhi::GraphicsAPI graphicsApi, bool enableSharedMemory, char const* windowTitle)
 37 | {
 38 | #if DONUT_WITH_VULKAN
 39 |     if (graphicsApi == nvrhi::GraphicsAPI::VULKAN)
 40 |     {
 41 |         if (enableSharedMemory)
 42 |         {
 43 | #ifdef _WIN32
 44 |             deviceParams.requiredVulkanDeviceExtensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME);
 45 | #else
 46 |             deviceParams.requiredVulkanDeviceExtensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME);
 47 | #endif
 48 |         }
 49 | 
 50 |         deviceParams.requiredVulkanDeviceExtensions.push_back(VK_EXT_SHADER_REPLICATED_COMPOSITES_EXTENSION_NAME);
 51 |         deviceParams.requiredVulkanDeviceExtensions.push_back(VK_NV_COOPERATIVE_VECTOR_EXTENSION_NAME);
 52 | 
 53 |         // vkCmdCopyImage: Dest image pRegion[0] x-dimension offset [0] + extent [4] exceeds subResource width [2]
 54 |         // vkCmdCopyImage: Dest image pRegion[0] y-dimension offset [0] + extent [4] exceeds subResource height [2]
 55 |         // These errors happen during copies from block textures to BCn textures at the last 2 mips, no way around it.
 56 |         deviceParams.ignoredVulkanValidationMessageLocations.push_back(0x38b5face);
 57 |         deviceParams.ignoredVulkanValidationMessageLocations.push_back(0x4bb17a0e);
 58 | 
 59 |         // The following warnings are related to the Cooperative Vector extension that the validation layers don't know.
 60 |         // SPIR-V module not valid: Invalid capability operand: 5394
 61 |         deviceParams.ignoredVulkanValidationMessageLocations.push_back(0x6bbb14);
 62 | 
 63 |         deviceParams.ignoredVulkanValidationMessageLocations.push_back(0xa5625282);
 64 |         deviceParams.ignoredVulkanValidationMessageLocations.push_back(0x79de34d4);
 65 |         deviceParams.ignoredVulkanValidationMessageLocations.push_back(0x901f59ec);
 66 | 
 67 |         deviceParams.ignoredVulkanValidationMessageLocations.push_back(0x605314fa);
 68 |         deviceParams.ignoredVulkanValidationMessageLocations.push_back(0x938b32);
 69 | 
 70 |         // vkCreateShaderModule(): A SPIR-V Capability (Unhandled OpCapability) was declared that is not supported by Vulkan.
 71 |         deviceParams.ignoredVulkanValidationMessageLocations.push_back(0x2c00a3d6);
 72 |         // A SPIR-V Extension (SPV_NV_cooperative_vector) was declared that is not supported by Vulkan.
 73 |         deviceParams.ignoredVulkanValidationMessageLocations.push_back(0xffffffffd80a42ae);
 74 | 
 75 |         // fragment shader writes to output location 1 with no matching attachment
 76 |         // This happens in the forward shading pass for transmissive materials. Difficult to work around.
 77 |         deviceParams.ignoredVulkanValidationMessageLocations.push_back(0x609a13b);
 78 | 
 79 |         // Add feature structures querying for cooperative vector support and DP4a support
 80 |         static VkPhysicalDeviceCooperativeVectorFeaturesNV cooperativeVectorFeatures{};
 81 |         cooperativeVectorFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_VECTOR_FEATURES_NV;
 82 | 
 83 |         static VkPhysicalDeviceShaderReplicatedCompositesFeaturesEXT shaderReplicatedCompositesFeatures{};
 84 |         shaderReplicatedCompositesFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_REPLICATED_COMPOSITES_FEATURES_EXT;
 85 |         shaderReplicatedCompositesFeatures.pNext = &cooperativeVectorFeatures;
 86 | 
 87 |         static VkPhysicalDeviceVulkan11Features vulkan11Features{};
 88 |         vulkan11Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
 89 |         vulkan11Features.pNext = &shaderReplicatedCompositesFeatures;
 90 | 
 91 |         static VkPhysicalDeviceVulkan12Features vulkan12Features{};
 92 |         vulkan12Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
 93 |         vulkan12Features.pNext = &vulkan11Features;
 94 |         deviceParams.physicalDeviceFeatures2Extensions = &vulkan12Features;
 95 | 
 96 |         // Set the callback to modify some bits in VkDeviceCreateInfo before creating the device
 97 |         deviceParams.deviceCreateInfoCallback = [](VkDeviceCreateInfo& info) {
 98 |             const_cast<VkPhysicalDeviceFeatures*>(info.pEnabledFeatures)->shaderInt16 = true;
 99 |             const_cast<VkPhysicalDeviceFeatures*>(info.pEnabledFeatures)->fragmentStoresAndAtomics = true;
100 |             const_cast<VkPhysicalDeviceFeatures*>(info.pEnabledFeatures)->shaderInt64 = true;
101 | 
102 |             // Iterate through the structure chain and find the structures to patch
103 |             VkBaseOutStructure* pCurrent = reinterpret_cast<VkBaseOutStructure*>(&info);
104 |             VkBaseOutStructure* pLast = nullptr;
105 |             while (pCurrent)
106 |             {
107 |                 if (pCurrent->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES)
108 |                 {
109 |                     g_vulkanStorageBuffer16BitAccess = vulkan11Features.storageBuffer16BitAccess;
110 |                     reinterpret_cast<VkPhysicalDeviceVulkan11Features*>(pCurrent)->storageBuffer16BitAccess = g_vulkanStorageBuffer16BitAccess;
111 |                 }
112 | 
113 |                 if (pCurrent->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES)
114 |                 {
115 |                     g_vulkanFloat16Supported = vulkan12Features.shaderFloat16;
116 |                     reinterpret_cast<VkPhysicalDeviceVulkan12Features*>(pCurrent)->shaderFloat16 = g_vulkanFloat16Supported;
117 |                     reinterpret_cast<VkPhysicalDeviceVulkan12Features*>(pCurrent)->vulkanMemoryModel = true;
118 |                     reinterpret_cast<VkPhysicalDeviceVulkan12Features*>(pCurrent)->vulkanMemoryModelDeviceScope = true;
119 |                 }
120 | 
121 |                 pLast = pCurrent;
122 |                 pCurrent = pCurrent->pNext;
123 |             }
124 | 
125 |             if (pLast && shaderReplicatedCompositesFeatures.shaderReplicatedComposites)
126 |             {
127 |                 pLast->pNext = reinterpret_cast<VkBaseOutStructure*>(&shaderReplicatedCompositesFeatures);
128 |                 shaderReplicatedCompositesFeatures.pNext = nullptr;
129 |                 pLast = pLast->pNext;
130 |             }
131 | 
132 |             // If cooperative vector is supported, add a feature structure enabling it on the device
133 |             if (pLast && cooperativeVectorFeatures.cooperativeVector)
134 |             {
135 |                 pLast->pNext = reinterpret_cast<VkBaseOutStructure*>(&cooperativeVectorFeatures);
136 |                 cooperativeVectorFeatures.pNext = nullptr;
137 |             }
138 |         };
139 |     }
140 | #endif
141 | 
142 | #if DONUT_WITH_DX12
143 |     if (graphicsApi == nvrhi::GraphicsAPI::D3D12)
144 |     {
145 |         UUID Features[] = { D3D12ExperimentalShaderModels, D3D12CooperativeVectorExperiment };
146 |         HRESULT hr = D3D12EnableExperimentalFeatures(_countof(Features), Features, nullptr, nullptr);
147 | 
148 |         if (FAILED(hr))
149 |         {
150 |             char const* messageText =
151 |                 "Couldn't enable D3D12 experimental shader models. Cooperative Vector features will not be available.\n"
152 |                 "Please make sure that Developer Mode is enabled in the Windows system settings.";
153 | 
154 |             if (windowTitle)
155 |             {
156 |                 MessageBoxA(NULL, messageText, windowTitle, MB_ICONWARNING);
157 |             }
158 |             else
159 |             {
160 |                 donut::log::warning("%s", messageText);
161 |             }
162 |         }
163 |         else
164 |         {
165 |             g_dx12DeveloperModeEnabled = true;
166 |         }
167 |     }
168 | #endif
169 | }
170 | 
171 | // Call after device creation to verify the extension has been enabled
172 | bool CoopVectorExtensionSupported(donut::app::DeviceManager* deviceManager)
173 | {
174 |     std::vector<std::string> extensions;
175 | 
176 |     deviceManager->GetEnabledVulkanDeviceExtensions(extensions);
177 |     for (std::string extension : extensions)
178 |     {
179 |         if (extension == "VK_NV_cooperative_vector")
180 |         {
181 |             return true;
182 |         }
183 |     }
184 |     return false;
185 | }


--------------------------------------------------------------------------------
/src/Utils/DeviceUtils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #pragma once
12 | 
13 | #include <nvrhi/nvrhi.h>
14 | 
15 | namespace donut::app
16 | {
17 | struct DeviceCreationParameters;
18 | class DeviceManager;
19 | } // namespace donut::app
20 | 
21 | void SetCoopVectorExtensionParameters(donut::app::DeviceCreationParameters& deviceParams, nvrhi::GraphicsAPI graphicsApi, bool enableSharedMemory, char const* windowTitle);
22 | 
23 | // Call after device creation to verify the extension has been enabled
24 | bool CoopVectorExtensionSupported(donut::app::DeviceManager* deviceManager);


--------------------------------------------------------------------------------
/src/Utils/DirectoryHelper.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #include "DirectoryHelper.h"
12 | 
13 | // Get local path for subfolder, used for creating a standalone binary package
14 | std::filesystem::path GetLocalPath(std::string subfolder)
15 | {
16 |     // Repository path
17 |     std::filesystem::path candidateA = donut::app::GetDirectoryWithExecutable().parent_path() / subfolder;
18 |     // Binary path, assuming the folder is under bin/
19 |     std::filesystem::path candidateB = donut::app::GetDirectoryWithExecutable().parent_path().parent_path() / subfolder;
20 | 
21 |     if (std::filesystem::exists(candidateA))
22 |     {
23 |         return candidateA;
24 |     }
25 |     else
26 |     {
27 |         return candidateB;
28 |     }
29 | }


--------------------------------------------------------------------------------
/src/Utils/DirectoryHelper.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #pragma once
12 | 
13 | #include <donut/app/ApplicationBase.h>
14 | #include <filesystem>
15 | 
16 | // Get local path for subfolder, used for creating a standalone binary package
17 | std::filesystem::path GetLocalPath(std::string subfolder);


--------------------------------------------------------------------------------
/src/Utils/GeometryUtils.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #include "GeometryUtils.h"
12 | 
13 | using namespace dm;
14 | 
15 | std::pair<std::vector<Vertex>, std::vector<uint32_t>> GenerateSphere(float radius, uint32_t segmentsU, uint32_t segmentsV)
16 | {
17 |     std::vector<Vertex> vs;
18 |     std::vector<uint32_t> indices;
19 | 
20 |     // Create vertices.
21 |     for (uint32_t v = 0; v <= segmentsV; ++v)
22 |     {
23 |         for (uint32_t u = 0; u <= segmentsU; ++u)
24 |         {
25 |             float2 uv = float2(u / float(segmentsU), v / float(segmentsV));
26 |             float theta = uv.x * 2.f * PI_f;
27 |             float phi = uv.y * PI_f;
28 |             float3 dir = float3(std::cos(theta) * std::sin(phi), std::cos(phi), std::sin(theta) * std::sin(phi));
29 |             vs.push_back({ dir * radius, dir });
30 |         }
31 |     }
32 | 
33 |     // Create indices.
34 |     for (uint32_t v = 0; v < segmentsV; ++v)
35 |     {
36 |         for (uint32_t u = 0; u < segmentsU; ++u)
37 |         {
38 |             uint32_t i0 = v * (segmentsU + 1) + u;
39 |             uint32_t i1 = v * (segmentsU + 1) + (u + 1) % (segmentsU + 1);
40 |             uint32_t i2 = (v + 1) * (segmentsU + 1) + u;
41 |             uint32_t i3 = (v + 1) * (segmentsU + 1) + (u + 1) % (segmentsU + 1);
42 | 
43 |             indices.emplace_back(i0);
44 |             indices.emplace_back(i1);
45 |             indices.emplace_back(i2);
46 | 
47 |             indices.emplace_back(i2);
48 |             indices.emplace_back(i1);
49 |             indices.emplace_back(i3);
50 |         }
51 |     }
52 | 
53 |     return { vs, indices };
54 | }
55 | 


--------------------------------------------------------------------------------
/src/Utils/GeometryUtils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
 5 |  * and proprietary rights in and to this software, related documentation
 6 |  * and any modifications thereto.  Any use, reproduction, disclosure or
 7 |  * distribution of this software and related documentation without an express
 8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 |  */
10 | 
11 | #pragma once
12 | 
13 | #include <donut/core/math/math.h>
14 | 
15 | #include <vector>
16 | 
17 | struct Vertex
18 | {
19 |     dm::float3 position;
20 |     dm::float3 normal;
21 | };
22 | 
23 | std::pair<std::vector<Vertex>, std::vector<uint32_t>> GenerateSphere(float radius, uint32_t segmentsU, uint32_t segmentsV);
24 | 


--------------------------------------------------------------------------------
/support/cmake/ConfigureAgilitySDK.cmake:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
 3 | #
 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 5 | # property and proprietary rights in and to this material, related
 6 | # documentation and any modifications thereto. Any use, reproduction,
 7 | # disclosure or distribution of this material and related documentation
 8 | # without an express license agreement from NVIDIA CORPORATION or
 9 | # its affiliates is strictly prohibited.
10 | 
11 | set(sdk_files
12 |     D3D12Core.dll
13 |     D3D12Core.pdb
14 |     d3d12SDKLayers.dll
15 |     d3d12SDKLayers.pdb)
16 | 
17 | set(target_path "${RTXNS_BINARY_DIR}/d3d12/")
18 | 
19 | # Find the Agility Preview SDK version number
20 | if(_d3d_agility_include)
21 |     
22 |     #set(DONUT_D3D_AGILITY_SDK_INCLUDE_DIR "${_d3d_agility_include}")
23 |     
24 |     # find the SDK version number
25 |     file(READ "${_d3d_agility_include}/d3d12.idl" _d3d12_idl)
26 |     string(REGEX MATCH "const UINT D3D12_PREVIEW_SDK_VERSION = ([0-9]+)" _match ${_d3d12_idl})
27 |     if(_match AND CMAKE_MATCH_1)
28 |         set(DONUT_D3D_AGILITY_PREVIEW_SDK_VERSION ${CMAKE_MATCH_1})
29 |         message(STATUS "Found D3D12 Agility Preview SDK: ${DONUT_D3D_AGILITY_SDK_INCLUDE_DIR} (version ${DONUT_D3D_AGILITY_PREVIEW_SDK_VERSION})")
30 |     else()
31 |         message(FATAL_ERROR "Cannot resolve D3D12 Agility Preview SDK version number")
32 |     endif()    
33 | endif()
34 | 
35 | if (NOT DONUT_D3D_AGILITY_PREVIEW_SDK_VERSION OR NOT DONUT_D3D_AGILITY_SDK_LIBRARIES)
36 |     message(SEND_ERROR "Agility SDK variables were not configured, please re-configure the project to download it.")
37 | endif()
38 | 
39 | add_custom_target(dx12-agility-sdk)
40 | set_property (TARGET dx12-agility-sdk PROPERTY FOLDER "Third-Party Libraries")
41 | 
42 | file(MAKE_DIRECTORY ${target_path})
43 | 
44 | foreach (filename ${DONUT_D3D_AGILITY_SDK_LIBRARIES})
45 |     add_custom_command(TARGET dx12-agility-sdk POST_BUILD
46 |         COMMAND ${CMAKE_COMMAND} -E copy_if_different "${filename}" "${target_path}")
47 | 
48 | endforeach()
49 | 


--------------------------------------------------------------------------------
/support/cmake/FetchDXCPreview.cmake:
--------------------------------------------------------------------------------
 1 | include(FetchContent)
 2 | 
 3 | if(!DXC_PREVIEW_VERSION)
 4 | 	message(FATAL_ERROR "DXC_PREVIEW_VERSION not set")
 5 | endif()
 6 | 
 7 | FetchContent_Declare(
 8 |   dxc_preview
 9 |   URL https://www.nuget.org/api/v2/package/Microsoft.Direct3D.DXC/${DXC_PREVIEW_VERSION}
10 |   SOURCE_DIR "${DXC_PREVIEW_PATH}"
11 |   DOWNLOAD_EXTRACT_TIMESTAMP TRUE)
12 |   
13 | FetchContent_MakeAvailable(dxc_preview)
14 |  
15 | set(DXC_PREVIEW_BIN_PATH "${dxc_preview_SOURCE_DIR}/build/native/bin/x64")
16 | set(DXC_PREVIEW_PATH "${DXC_PREVIEW_BIN_PATH}/dxc.exe")
17 | 


--------------------------------------------------------------------------------
/support/cmake/FetchPrebuildBinary.cmake:
--------------------------------------------------------------------------------
 1 | include(FetchContent)
 2 | # This is used for downloading prebuilt external binaries.
 3 | macro(download_package name url)
 4 |     FetchContent_Declare(
 5 |         ${name}
 6 |         URL ${url}
 7 |         DOWNLOAD_EXTRACT_TIMESTAMP TRUE
 8 |     )
 9 |     FetchContent_GetProperties(${name})
10 |     if(NOT ${name}_POPULATED)
11 |         message(STATUS "Populating ${name} ...")
12 |         FetchContent_Populate(${name})
13 |     endif()
14 | endmacro()


--------------------------------------------------------------------------------