├── .gitattributes
├── .gitignore
├── .gitmodules
├── CHANGELOG.md
├── CMakeLists.txt
├── LICENSE.MD
├── README.md
├── assets
│   └── data
│       ├── disney.ns.bin
│       ├── nvidia-logo.png
│       └── slangpy-weights.json
├── docs
│   ├── LibraryGuide.md
│   ├── QuickStart.md
│   ├── ShaderTraining.md
│   ├── SimpleInferencing.md
│   ├── SimpleTraining.md
│   ├── SlangpyTraining.md
│   ├── Tutorial.md
│   ├── shader_training.png
│   ├── simple_inferencing.png
│   ├── simple_training.png
│   ├── simple_training_trained.png
│   └── slangpy_training.jpg
├── samples
│   ├── CMakeLists.txt
│   ├── ShaderTraining
│   │   ├── CMakeLists.txt
│   │   ├── Disney.slang
│   │   ├── DisneyMLP.slang
│   │   ├── NetworkConfig.h
│   │   ├── ShaderTraining.cpp
│   │   ├── computeOptimizer.slang
│   │   ├── computeTraining.slang
│   │   ├── renderDifference.slang
│   │   ├── renderDisney.slang
│   │   ├── renderInference.slang
│   │   └── shaders.cfg
│   ├── SimpleInferencing
│   │   ├── CMakeLists.txt
│   │   ├── NetworkConfig.h
│   │   ├── SimpleInferencing.cpp
│   │   ├── SimpleInferencing.slang
│   │   └── shaders.cfg
│   ├── SimpleTraining
│   │   ├── CMakeLists.txt
│   │   ├── NetworkConfig.h
│   │   ├── SimpleTraining.cpp
│   │   ├── SimpleTraining_Inference.slang
│   │   ├── SimpleTraining_Optimizer.slang
│   │   ├── SimpleTraining_Training.slang
│   │   └── shaders.cfg
│   └── SlangpyTraining
│       ├── CMakeLists.txt
│       ├── Helpers.py
│       ├── NetworkConfig.h
│       ├── NeuralModules.py
│       ├── NeuralModules.slang
│       ├── SlangpyInference.cpp
│       ├── SlangpyInference.slang
│       ├── SlangpyTraining.py
│       ├── SlangpyTraining.slang
│       ├── requirements.txt
│       └── shaders.cfg
├── src
│   ├── CMakeLists.txt
│   ├── NeuralShading
│   │   ├── CMakeLists.txt
│   │   ├── CoopVector.cpp
│   │   ├── CoopVector.h
│   │   ├── Float16.cpp
│   │   ├── Float16.h
│   │   ├── GraphicsResources.cpp
│   │   ├── GraphicsResources.h
│   │   ├── NeuralNetwork.cpp
│   │   ├── NeuralNetwork.h
│   │   └── NeuralNetworkTypes.h
│   ├── NeuralShading_Shaders
│   │   ├── Activation.slang
│   │   ├── CMakeLists.txt
│   │   ├── CooperativeVectorAutoDiff.slang
│   │   ├── CooperativeVectorDerivatives.slang
│   │   ├── CooperativeVectorFunctions.slang
│   │   ├── LinearOps.slang
│   │   ├── Loss.slang
│   │   ├── MLP.slang
│   │   ├── Optimizers.slang
│   │   ├── PCG32.slang
│   │   └── Utils.slang
│   └── Utils
│       ├── CMakeLists.txt
│       ├── DeviceUtils.cpp
│       ├── DeviceUtils.h
│       ├── DirectoryHelper.cpp
│       ├── DirectoryHelper.h
│       ├── GeometryUtils.cpp
│       └── GeometryUtils.h
└── support
    └── cmake
        ├── ConfigureAgilitySDK.cmake
        ├── FetchDXCPreview.cmake
        └── FetchPrebuildBinary.cmake
/.gitattributes:
--------------------------------------------------------------------------------
1 | external/slang/windows-x64/release/dxcompiler.dll filter=lfs diff=lfs merge=lfs -text
2 | external/slang/windows-x64/release/slang.dll filter=lfs diff=lfs merge=lfs -text
3 | external/slang/windows-x64/release/slangc.exe filter=lfs diff=lfs merge=lfs -text
4 | external/slang/windows-x64/release/slangd.exe filter=lfs diff=lfs merge=lfs -text
5 | external/slang/windows-x64/release/slang-glslang.dll filter=lfs diff=lfs merge=lfs -text
6 | *.exe filter=lfs diff=lfs merge=lfs -text
7 | *.pdb filter=lfs diff=lfs merge=lfs -text
8 | *.dll filter=lfs diff=lfs merge=lfs -text
9 | *.zip filter=lfs diff=lfs merge=lfs -text
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/*
2 | /bin
3 | /out
4 | /.vscode
5 | /.vs
6 | /*.zip
7 | external/slang/windows-x64/release/slang-stdlib.bin
8 | /external/dx12-agility-sdk
9 | /external/nvapi
10 |
11 |
12 | # Temp files from running Python sample.
13 | /.temp
14 | __pycache__/
15 |
16 | # Generated shaders+weights from SlangPyTraining.
17 | samples/SlangpyTraining/trained_shaders.cfg
18 | samples/SlangpyTraining/weights.json
19 | /external/dx12-agility-sdk
20 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "donut"]
2 | path = external/donut
3 | url = https://github.com/NVIDIAGameWorks/donut
4 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # RTX Neural Shading Change Log
2 |
3 | ## 1.1.0
4 | - Added DX12 cooperative vector support using Preview Agility SDK.
5 | - Moved matrix conversion to GPU.
6 |
7 | ## 1.0.0
8 |
9 | Initial release.
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |
10 | cmake_minimum_required(VERSION 3.10)
11 |
12 | project(
13 | RtxNeuralShading
14 | DESCRIPTION "RTX Neural Shading"
15 | LANGUAGES CXX
16 | )
17 |
18 | set(CMAKE_CXX_STANDARD 20)
19 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
20 | set(CMAKE_CXX_EXTENSIONS ON)
21 |
22 | option(ENABLE_DX12_COOP_VECTOR_PREVIEW "" OFF)
23 | option(DONUT_WITH_DX11 "Not supported in this SDK" OFF)
24 | option(DONUT_WITH_DX12 "DX12 is only supported with DX12_COOP_VECTOR_PREVIEW ON" OFF)
25 | option(DONUT_WITH_VULKAN "" ON)
26 | option(DONUT_WITH_STATIC_SHADERS "" ON)
27 |
28 | # Register our path for CMake modules
29 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/support/cmake")
30 |
31 | if (MSVC)
32 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /D_ITERATOR_DEBUG_LEVEL=1")
33 | endif()
34 |
35 | option(DONUT_WITH_ASSIMP "" OFF)
36 |
37 | if(WIN32)
38 | set(RTXNS_BINARY_DIR "${CMAKE_SOURCE_DIR}/bin/windows-x64" CACHE PATH "Output directory for the RTXNS build")
39 | else()
40 | set(RTXNS_BINARY_DIR "${CMAKE_SOURCE_DIR}/bin/linux-x64" CACHE PATH "Output directory for the RTXNS build")
41 | endif()
42 |
43 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG "${RTXNS_BINARY_DIR}")
44 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL "${RTXNS_BINARY_DIR}")
45 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE "${RTXNS_BINARY_DIR}")
46 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO "${RTXNS_BINARY_DIR}")
47 |
48 | set(SHADERMAKE_BIN_OUTPUT_PATH "${RTXNS_BINARY_DIR}/bin" CACHE STRING "Output directory for the ShaderMake executable")
49 | set(DONUT_SHADERS_OUTPUT_DIR "${RTXNS_BINARY_DIR}/bin/shaders/framework")
50 |
51 | # Get Slang
52 | set(SLANG_VERSION "2025.10")
53 | set(SLANG_URL_BASE "https://github.com/shader-slang/slang/releases/download/v${SLANG_VERSION}")
54 | if(WIN32)
55 | set(SLANG_URL "${SLANG_URL_BASE}/slang-${SLANG_VERSION}-windows-x86_64.zip")
56 | else()
57 | set(SLANG_URL "${SLANG_URL_BASE}/slang-${SLANG_VERSION}-linux-x86_64-glibc-2.17.tar.gz")
58 | endif()
59 |
60 | include("${CMAKE_CURRENT_SOURCE_DIR}/support/cmake/FetchPrebuildBinary.cmake")
61 | download_package(slang ${SLANG_URL})
62 |
63 | if (WIN32)
64 | set(SLANGC_PATH "${slang_SOURCE_DIR}/bin/slangc.exe")
65 | if (ENABLE_DX12_COOP_VECTOR_PREVIEW)
66 | set(DONUT_WITH_DX12 ON)
67 |         set(NVRHI_WITH_DX12 ON)
68 | else()
69 | # DX12 is only supported with DX12_COOP_VECTOR_PREVIEW
70 | set(DONUT_WITH_DX12 OFF)
71 |         set(NVRHI_WITH_DX12 OFF)
72 | endif()
73 | else()
74 | set(SLANGC_PATH "${slang_SOURCE_DIR}/bin/slangc")
75 | endif()
76 |
77 | if (NOT SLANGC_PATH)
78 | message(FATAL_ERROR "Slang compiler not found - this is required for CoopVec support.")
79 | else()
80 | message("Slang compiler found: ${SLANGC_PATH}")
81 | endif()
82 |
83 | if (DONUT_WITH_DX12)
84 | # Get D3D Agility SDK Preview for Coop Vector support
85 | set(D3D_AGILITY_SDK_PREVIEW_VERSION "1.717.0-preview")
86 | set(DONUT_D3D_AGILITY_SDK_URL "https://www.nuget.org/api/v2/package/Microsoft.Direct3D.D3D12/${D3D_AGILITY_SDK_PREVIEW_VERSION}")
87 | set(DONUT_D3D_AGILITY_SDK_FETCH_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/dx12-agility-sdk" CACHE STRING "" FORCE)
88 | include("${CMAKE_CURRENT_SOURCE_DIR}/external/donut/cmake/FetchAgilitySDK.cmake")
89 | include("${CMAKE_CURRENT_SOURCE_DIR}/support/cmake/ConfigureAgilitySDK.cmake")
90 |
91 | # Get DXC preview for SM6.9 support
92 | set(DXC_PREVIEW_VERSION "1.8.2505.28")
93 | set(DXC_PREVIEW_PATH "" CACHE STRING "Directory to fetch the DXC to, empty uses build directory default")
94 | include("${CMAKE_CURRENT_SOURCE_DIR}/support/cmake/FetchDXCPreview.cmake")
95 |
96 | set(DXC_PATH "${DXC_PREVIEW_PATH}")
97 |
98 | # copy dxc to Slang
99 | foreach(file_name IN ITEMS dxc.exe dxcompiler.dll dxil.dll)
100 | set(src "${DXC_PREVIEW_BIN_PATH}/${file_name}")
101 | set(dst "${slang_SOURCE_DIR}/bin/")
102 | if(EXISTS "${src}")
103 | configure_file("${src}" "${dst}" COPYONLY)
104 | else()
105 | message(WARNING "DXC binary not found: ${src}")
106 | endif()
107 | endforeach()
108 | endif()
109 |
110 | add_subdirectory(external/donut)
111 | add_subdirectory(src)
112 | add_subdirectory(samples)
113 | set_property (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT SimpleInferencing)
114 |
115 | file(WRITE "${CMAKE_SOURCE_DIR}/bin/slangc.bat" "${SLANGC_PATH} %*")
116 |
117 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RTX Neural Shading
2 |
3 | RTX Neural Shading (RTXNS), also known as RTX Neural Shaders, is intended as a starting point for developers interested in bringing Machine Learning (ML) to their graphics applications. It provides a number of examples to help the reader understand how to train their own neural networks and then use those models to perform inference alongside their normal graphics rendering.
4 |
5 | RTXNS uses the [Slang](https://shader-slang.com) shading language, and it utilizes either the DirectX Preview Agility SDK or the Vulkan Cooperative Vectors extension to provide access to the GPU's ML acceleration.
6 |
7 | A number of examples are included which build upon each other from a simple inference example to more complex examples showing how to train a neural network to represent a shader or a texture. Helper functions to facilitate building your own neural networks are also included.
8 |
9 | Alongside the core samples is a SlangPy sample to demonstrate how to use Python and SlangPy for fast iteration and development of neural networks, which can then be integrated into RTXNS for inference.
10 |
11 | When exploring RTXNS, it is assumed that the reader is already familiar with ML and neural networks.
12 |
13 | ## Requirements
14 |
15 | ### General
16 | [CMake v3.24.3][CMake] **|** [VS 2022][VS22] **|** [Slang v2025.10](https://shader-slang.com/tools/)
17 |
18 | ### DirectX
19 | [DirectX Preview Agility SDK 1.717.0-preview](https://www.nuget.org/packages/Microsoft.Direct3D.D3D12/1.717.0-preview) **|** [Microsoft DXC 1.8.2505.28](https://www.nuget.org/packages/Microsoft.Direct3D.DXC/1.8.2505.28) **|** [Shader Model 6-9-Preview Driver](https://developer.nvidia.com/downloads/shadermodel6-9-preview-driver)
20 |
21 | ### Vulkan
22 | GPU must support the Vulkan `VK_NV_cooperative_vector` extension (minimum NVIDIA RTX 20XX) **|** [Vulkan SDK 1.3.296.0](https://vulkan.lunarg.com/sdk/home) **|** Public Driver ≥ 572.16
23 |
24 | ## Known Issues
25 | 05/30/2025: When updating from v1.0.0 to v1.1.0, it is recommended to delete the CMake cache to avoid build errors.
26 |
27 | ## Project structure
28 |
29 | | Directory | Details |
30 | | --------------------------------- | -------------------------------------- |
31 | | [/assets](assets) | _Asset files for samples_ |
32 | | [/docs](docs) | _Documentation for showcased tech_ |
33 | | [/samples](samples) | _Samples showcasing usage of MLPs_ |
34 | | [/external/donut](external/donut) | _Framework used for the examples_ |
35 | | [/external](external) | _Helper dependencies for the examples_ |
36 | | [/src](src) | _Helper and utility functions_ |
37 |
38 | ## Getting started
39 |
40 | - [Quick start guide](docs/QuickStart.md) for building and running the neural shading samples.
41 | - [Library usage guide](docs/LibraryGuide.md) for using helper functions
42 |
43 | ### External Resources
44 |
45 | This project uses [Slang](https://shader-slang.com) and the Vulkan CoopVector extension. The following links provide more detail on these and other related technologies, to help the reader better understand them or simply as further reading.
46 |
47 | * [Slang User Guide](https://shader-slang.com/slang/user-guide/)
48 |
49 | * [Automatic Differentiation](https://shader-slang.com/slang/user-guide/autodiff.html)
50 |
51 | * [SlangPy](https://slangpy.readthedocs.io/en/latest/)
52 |
53 | * [Vulkan `VK_NV_cooperative_vector` extension](https://registry.khronos.org/vulkan/specs/latest/man/html/VK_NV_cooperative_vector.html)
54 |
55 | * [Donut](https://github.com/NVIDIAGameWorks/donut)
56 |
57 | ## Contact
58 |
59 | RTXNS is actively being developed. Please report any issues directly through the GitHub issue tracker, and for any information or suggestions contact us at rtxns-sdk-support@nvidia.com
60 |
61 | ## Citation
62 |
63 | Use the following BibTex entry to cite the usage of RTXNS in published research:
64 |
65 | ```bibtex
66 | @online{RTXNS,
67 | title = {{{NVIDIA}}\textregistered{} {RTXNS}},
68 | author = {{NVIDIA}},
69 | year = 2025,
70 | url = {https://github.com/NVIDIA-RTX/RTXNS},
71 | urldate = {2025-02-03},
72 | }
73 | ```
74 |
75 | ## License
76 |
77 | See [LICENSE.md](LICENSE.MD)
78 |
79 | [VS22]: https://visualstudio.microsoft.com/thank-you-downloading-visual-studio/?sku=Community&channel=Release&version=VS2022&source=VSLandingPage&passive=false&cid=2030
80 |
81 | [CMake]: https://github.com/Kitware/CMake/releases/download/v3.24.3/cmake-3.24.3-windows-x86_64.msi
82 |
--------------------------------------------------------------------------------
/assets/data/disney.ns.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/assets/data/disney.ns.bin
--------------------------------------------------------------------------------
/assets/data/nvidia-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/assets/data/nvidia-logo.png
--------------------------------------------------------------------------------
/docs/QuickStart.md:
--------------------------------------------------------------------------------
1 | # RTX Neural Shading: Quick Start Guide
2 |
3 | ## Build steps
4 |
5 | 1. Clone the project recursively:
6 |
7 | ```
8 | git clone --recursive https://github.com/NVIDIA-RTX/RTXNS
9 | ```
10 |
11 | 2. Configure and then generate the solution using the CMake GUI (or the CLI) by setting the repository root as _source_ and specifying a new _build_ directory in the root.
12 |
13 | ```
14 | cd RTXNS
15 | mkdir build
16 | cd build
17 | cmake ..
18 | ```
19 | To enable DX12 Cooperative Vector support, set the `ENABLE_DX12_COOP_VECTOR_PREVIEW` option to ON:
20 | ```
21 | cmake -DENABLE_DX12_COOP_VECTOR_PREVIEW=ON ..
22 | ```
23 |
24 | 3. Open `build/RtxNeuralShading.sln` in Visual Studio and build all projects, or build using the CMake CLI
25 |
26 | ```
27 | cmake --build .
28 | ```
29 |
30 | 4. All of the binaries can be found in `\bin`, such as:
31 |
32 | ```
33 | bin\Debug\SimpleInferencing.exe
34 | ```
35 | 5. Each of the samples can be built and launched as either DX12 or Vulkan with the respective command line option: `-dx12` or `-vk`.
36 |
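For example, to launch the SimpleInferencing sample with Vulkan:

```
bin\Debug\SimpleInferencing.exe -vk
```
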
37 | ## About
38 |
39 | All of the samples are built using Slang and can be compiled for either DX12 or Vulkan, using the DirectX Preview Agility SDK or the Vulkan Cooperative Vector extension respectively.
40 |
41 | - [DirectX Preview Agility SDK](https://devblogs.microsoft.com/directx/directx12agility/).
42 | - [Vulkan Cooperative Vector extension](https://registry.khronos.org/vulkan/specs/latest/man/html/VK_NV_cooperative_vector.html).
43 |
44 | ## Driver Requirements
45 | - Using the DirectX Preview Agility SDK requires a Shader Model 6.9 preview [driver](https://developer.nvidia.com/downloads/shadermodel6-9-preview-driver)
46 | - The Vulkan Cooperative Vector extension requires a release [driver](https://www.nvidia.com/en-gb/geforce/drivers) from R570 onwards
47 |
48 | ### Samples
49 |
50 | | Sample Name | Output | Description |
51 | | ------------------------------------------ | --------------------------------------------- | ----------- |
52 | | [Simple Inferencing](SimpleInferencing.md) | ![Simple Inferencing](simple_inferencing.png) | This sample demonstrates how to implement an inference shader using some of the low-level building blocks from RTXNS. The sample loads a trained network from a file and uses the network to approximate a Disney BRDF shader. The sample is interactive; the light source can be rotated and various material parameters can be modified at runtime. |
53 | | [Simple Training](SimpleTraining.md) | ![Simple Training](simple_training.png) | This sample builds on the Simple Inferencing sample to provide an introduction to training a neural network for use in a shader. The network replicates a transformed texture. |
54 | | [Shader Training](ShaderTraining.md) | ![Shader Training](shader_training.png) | This sample extends the techniques shown in the Simple Training example and introduces Slang's AutoDiff functionality via a full MLP (Multi-Layer Perceptron) abstraction. The MLP is implemented using the `CoopVector` training code previously introduced and provides a simple interface for training networks with Slang. The sample creates a network and trains a model on the Disney BRDF shader that was used in the Simple Inferencing sample. |
55 | | [SlangPy Training](SlangpyTraining.md) | ![SlangPy Training](slangpy_training.jpg) | This sample shows how to create and train network architectures in Python using SlangPy. This lets you experiment with different networks, encodings and more using the building blocks from RTXNS, but without needing to change or rebuild C++ code. As a demonstration, this sample instantiates multiple different network architectures and trains them side-by-side on the same data. It also shows one possible approach to exporting the network parameters and architecture to disk so they can be loaded in C++. |
56 |
57 | ### Tutorial
58 |
59 | * [Tutorial](Tutorial.md)
60 | A tutorial to help guide you to create your own neural shader based on the [Shader Training](ShaderTraining.md) example.
61 |
62 | ### Library
63 |
64 | * [Library](LibraryGuide.md)
65 | A guide to using the library / helper functions to create and manage your neural networks.
66 |
--------------------------------------------------------------------------------
/docs/ShaderTraining.md:
--------------------------------------------------------------------------------
1 | # RTX Neural Shading: Shader Training Example
2 |
3 | ## Purpose
4 |
5 | This sample extends the techniques shown in the [Simple Training](SimpleTraining.md) example and introduces Slang's AutoDiff functionality via a full MLP (Multi-Layer Perceptron) abstraction. The MLP is implemented using the `CoopVector` training code previously introduced and provides a simple interface for training networks with Slang. The sample creates a network and trains a model on the Disney BRDF shader that was used in the [Simple Inferencing](SimpleInferencing.md) sample.
6 |
7 | 
8 |
9 | When the executable is built and run, the output shows 3 images: the image on the left is a sphere lit with the full Disney BRDF shader, the middle image is the same sphere lit with the trained neural network, and the final image on the right shows the loss delta. The UI allows some control of the material properties and provides buttons to pause and reset the training, as well as to save/load the current network.
10 |
11 | ## Training Flow
12 |
13 | To create and train a neural network with RTXNS, several stages are needed, which are described in more detail below. This differs from the previous [Simple Training](SimpleTraining.md) example, which had a specific compute shader pass for training and another for inference. In this example, the training and optimization passes are still compute based, but the inference is integrated into an existing pixel shader.
14 |
15 | 1. Create the host side neural network storage and initialize it
16 |
17 | 2. Create a device optimal layout and GPU buffer
18 |
19 | 3. Convert the host layout network to the device optimal layout on the GPU
20 |
21 | 4. Create auxiliary buffers for loss gradients and the optimizer pass
22 |
23 | 5. Run batches of the training shader followed by the optimizer shader on random inputs adjusting for the loss at each epoch
24 |
25 | 6. Render the sphere with the inference pixel shader to generate the output image
26 |
27 | ## Network Configuration
28 |
29 | The network details can be found in [NetworkConfig.h](../samples/ShaderTraining/NetworkConfig.h) and are configured as follows:
30 |
31 | | Property | Value | Notes |
32 | | -------------- | ------- | ------------------------------------------- |
33 | | Input Features | 5 | 5 input parameters |
34 | | Input Neurons | 30 | 5 input parameters encoded to 6 inputs each |
35 | | Output Neurons | 4 | 4 BRDF values |
36 | | Hidden Neurons | 32 | |
37 | | Hidden Layers | 3 | |
38 | | Precision | float16 | |
39 |
40 | ## Application Code
41 |
42 | On the host, the setup of the neural network is quite simple and broadly similar to [Simple Training](SimpleTraining.md) so we shall only highlight the differences in this document.
43 |
44 | ### Training Loop
45 |
46 | After creating the appropriate pipelines and allocating the GPU buffers, the training loop is similar to the Simple Training example. The training and optimization passes are executed multiple times per frame (`g_trainingStepsPerFrame = 100`) to speed up training whilst still running the inference pass at a reasonable rate to observe the model converging.
47 |
48 | ```
49 | for (int i = 0; i < g_trainingStepsPerFrame; ++i)
50 | {
51 | nvrhi::ComputeState state;
52 | ...
53 | // Training pass
54 | state.bindings = { m_trainingPass.bindingSet };
55 | state.pipeline = m_trainingPass.pipeline;
56 | ...
57 | m_commandList->setComputeState(state);
58 | m_commandList->dispatch(m_batchSize / 64, 1, 1);
59 | ...
60 | // Optimizer pass
61 | state.bindings = { m_optimizerPass.bindingSet };
62 | state.pipeline = m_optimizerPass.pipeline;
63 | ...
64 | m_commandList->setComputeState(state);
65 | m_commandList->dispatch(div_ceil(m_totalParameterCount, 32), 1, 1);
66 | ...
67 | }
68 | ```
69 |
70 | Some of the timer related queries have been removed from the code for ease of understanding.
71 |
72 | After the training pass, the 2 spheres are rendered as expected, but using 2 different pipelines: `m_directPass` for the native Disney BRDF shader and `m_inferencePass` for the trained neural model.
73 |
74 | ## Shader Code
75 |
76 | The neural network in this sample is trying to encode the following:
77 |
78 | ```
79 | Disney(NdotL, NdotV, NdotH, LdotH, roughness);
80 | ```
81 |
82 | The shader code extends the concepts shown in the [Simple Training](SimpleTraining.md) example by using Slang's [AutoDiff](https://shader-slang.org/slang/user-guide/autodiff.html) feature to create a templated training class, `TrainingMLP`, implemented in [MLP.slang](../src/NeuralShading_Shaders/MLP.slang), that can be used to help train your own models. Using the AutoDiff features means we don't need to implement a full backwards pass containing all of the derivative activation functions, as it is automatically derived for us.
83 |
84 | The 3 main shaders are: [training](../samples/ShaderTraining/computeTraining.slang), [optimizer](../samples/ShaderTraining/computeOptimizer.slang) and [inference](../samples/ShaderTraining/renderInference.slang).
85 |
86 | ### Training
87 |
88 | The training shader starts by generating the random inputs and encoding them ready for passing to the neural network.
89 |
90 | ```
91 | //----------- Training step
92 | float params[INPUT_FEATURES] = {NdotL, NdotV, NdotH, LdotH, roughness};
93 | var inputParams = rtxns::EncodeFrequency(params);
94 | ```
95 |
96 | Next, the model is created and the inputs are passed to the model for the forwards pass.
97 |
98 | ```
99 | var model = rtxns::mlp::TrainingMLP<NUM_HIDDEN_LAYERS,
100 |     INPUT_NEURONS, HIDDEN_NEURONS, OUTPUT_NEURONS,
101 |     CoopVecMatrixLayout::TrainingOptimal, half>(
102 | gMLPParams,
103 | gMLPParamsGradients,
104 | rtxns::UnpackArray(gConst.weightOffsets),
105 | rtxns::UnpackArray(gConst.biasOffsets));
106 | ```
107 |
108 | The `TrainingMLP` is heavily templated; its template parameters consist of:
109 |
110 | * Number of hidden layers
111 | * Number of input neurons
112 | * Number of neurons per hidden layer
113 | * Number of output neurons
114 | * Matrix layout
115 | * Precision
116 |
117 | The non-templated parameters consist of:
118 |
119 | * Weight/Bias buffer
120 | * Gradient buffer
121 | * Weight offsets per layer
122 | * Bias offsets per layer
123 |
124 | Once the model has been created, executing the forward pass is trivial and involves assigning templated activation functions to the forward pass before passing the input parameters in. The detailed implementation is described in the [Library Guide](LibraryGuide.md).
125 |
126 | ```
127 | var hiddenActivation = rtxns::mlp::ReLUAct();
128 | var finalActivation = rtxns::mlp::ExponentialAct();
129 |
130 | var outputParams = model.forward(inputParams, hiddenActivation, finalActivation);
131 | ```
132 |
133 | To generate the loss gradient, this example uses the `L2Relative` derivative function applied to the output of the actual Disney BRDF shader and the output of the forward pass. The gradient is scaled by `LOSS_SCALE / (batchSize * 4)`, which averages over the batch and the 4 output components; the `LOSS_SCALE` factor keeps the half-precision gradients from underflowing and is divided back out by the optimizer.
134 |
135 | ```
136 | float4 predictedDisney = { outputParams[0], outputParams[1], outputParams[2], outputParams[3] };
137 |
138 | float4 lossGradient = rtxns::mlp::L2Relative.deriv(actualDisney, predictedDisney, float4(LOSS_SCALE / (gConst.batchSize * 4)) * COMPONENT_WEIGHTS);
139 | ```
140 |
141 | Finally, the loss gradient, along with the input vector, is passed through the model's backward propagation function to update the gradient parameters.
142 |
143 | ```
144 | model.backward(inputParams, hiddenActivation, finalActivation, rtxns::HCoopVec(lossGradient[0], lossGradient[1], lossGradient[2], lossGradient[3]));
145 | ```
146 |
147 | ### Optimizer
148 |
149 | The optimizer shader is no different from the one used in the [Simple Training](SimpleTraining.md) example.
150 |
151 | ```
152 | void adam_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
153 | {
154 | uint i = dispatchThreadID.x;
155 | if (i >= maxParamSize)
156 | return;
157 |
158 | float gradient = (float)gMLPParamsGradients[i];
159 | gMLPParamsGradients[i] = half(0.0);
160 |
161 | float weightbias = gMLPParams32[i];
162 |
163 | optimizers::Adam optimizer = optimizers::Adam(gMoments1, gMoments2, learningRate, LOSS_SCALE);
164 |
165 | float adjustedWeightbias = optimizer.step(weightbias, i, gradient, currentStep);
166 |
167 | gMLPParams32[i] = adjustedWeightbias;
168 | gMLPParams[i] = (half)adjustedWeightbias;
169 | }
170 | ```
171 |
172 | ### Inference
173 |
174 | The inference pass is nearly identical to the forward pass of the training shader. It currently uses the `CoopVecMatrixLayout::TrainingOptimal` layout because it runs directly after a batch of training, without converting the weights to an inference layout; for a deployed inference shader the default layout should be `CoopVecMatrixLayout::InferencingOptimal`.
175 |
176 | ```
177 | float4 DisneyMLP(
178 | float NdotL, float NdotV, float NdotH, float LdotH, float roughness, ByteAddressBuffer mlpBuffer,
179 | uint weightOffsets[HIDDEN_LAYERS+1], uint biasOffsets[HIDDEN_LAYERS+1])
180 | {
181 | // Calculate approximated core shader part using MLP
182 | float params[INPUT_FEATURES] = { NdotL, NdotV, NdotH, LdotH, roughness };
183 |
184 | var inputParams = rtxns::EncodeFrequency(params);
185 |
186 |     var model = rtxns::mlp::InferenceMLP<
187 |         HIDDEN_LAYERS,
188 |         INPUT_FEATURES * FREQUENCY_EXPANSION,
189 |         HIDDEN_NEURONS,
190 |         OUTPUT_NEURONS,
191 |         CoopVecMatrixLayout::TrainingOptimal,
192 |         half>
193 | (mlpBuffer, weightOffsets, biasOffsets);
194 |
195 | var outputParams = model.forward(inputParams, rtxns::mlp::ReLUAct(), rtxns::mlp::ExponentialAct());
196 | return float4(outputParams[0], outputParams[1], outputParams[2], outputParams[3]);
197 | }
198 | ```
199 |
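If the weights are converted to the inference-optimal layout on the host, only the layout template argument changes. A minimal sketch, assuming the template argument order documented above (hidden layers, input/hidden/output neurons, matrix layout, precision):

```
var model = rtxns::mlp::InferenceMLP<
    HIDDEN_LAYERS,
    INPUT_FEATURES * FREQUENCY_EXPANSION,
    HIDDEN_NEURONS,
    OUTPUT_NEURONS,
    CoopVecMatrixLayout::InferencingOptimal,
    half>
    (mlpBuffer, weightOffsets, biasOffsets);
```
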
--------------------------------------------------------------------------------
/docs/Tutorial.md:
--------------------------------------------------------------------------------
1 | # RTX Neural Shading: How to Write Your First Neural Shader
2 |
3 | ## Purpose
4 |
5 | Using [Shader Training](ShaderTraining.md) as the basis of this tutorial, we will briefly discuss an approach to writing your first neural shader.
6 |
7 | The main areas we will focus on are:
8 |
9 | 1. Extracting the key features from the shader to be trained
10 |
11 | 2. Modifying the network configuration
12 |
13 | 3. Modifying the activation and loss functions
14 |
15 | It is outside the scope of this document to discuss how AI training and optimization work; instead, we will focus on modifying the existing sample to configure and train the network with different content.
16 |
17 | ## Extracting the Key Features for Training Input
18 |
19 | When implementing the Disney BRDF for use in the [Shader Training](ShaderTraining.md) example, the first task was feature extraction: deciding which features from the shader should be inferred by the network and which should be calculated directly, to ensure the network is not over-specialized or overly complex. The network for the Disney BRDF takes inputs such as the `view`, `light` and `normal` vectors as well as `material roughness`. Other variables, such as `light intensity`, `material metallicness` and various `material color` components, have been left as part of the shader. This is a balancing act which may require some experimentation.
20 |
21 | Once the key features are identified as potential training inputs, look to optimize them where possible by reducing their form and scaling them to the range `0` to `1` or `-1` to `1`, which is preferred by networks. In the Disney BRDF, this was done by recognizing that the input vectors were always normalized and used in their dot product form, so the inputs were reduced from 3 `float3` vectors to 4 `float` dot products.
22 |
23 | Next, the network inputs may benefit from encoding, which research has shown can improve the performance of the network. The library provides 2 encoders, `EncodeFrequency` and `EncodeTriangle`, which encode the inputs into some form of wave function. The shader training example uses the frequency encoder, which increases the number of inputs by a factor of 6 but provides a better network as a result. You should experiment with the encoders to find the one suitable for your dataset; a short sketch of the resulting input pipeline follows.
24 |
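Concretely, these two steps look like the following in the sample's shaders; `norm`, `view`, `h`, `lightDir` and `roughness` are assumed to be set up as in the Shader Training inference shader:

```
// Reduce the three normalized float3 vectors to four scalar dot products in [0,1]
float NdotL = max(0.f, dot(norm, -lightDir));
float NdotV = max(0.f, dot(norm, view));
float NdotH = max(0.f, dot(norm, h));
float LdotH = max(0.f, dot(h, -lightDir));

// Gather the key features and frequency-encode them: 5 inputs become 5 * 6 = 30 network inputs
float params[INPUT_FEATURES] = { NdotL, NdotV, NdotH, LdotH, roughness };
var inputParams = rtxns::EncodeFrequency(params);
```
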
25 | At this point, you should know the number of (encoded) input parameters and output parameters, so it is time to configure the network.
26 |
27 | ## Modifying the Network Configuration
28 |
29 | The network configuration is stored in [NetworkConfig.h](../samples/ShaderTraining/NetworkConfig.h) and may require modification. Some elements, like the input and output neuron counts, are fixed by your dataset, and others are available for configuration. In the provided samples the configuration is hard-coded for ease of understanding, but in a production system it would be expected to be a configurable part of the training pipeline.
30 |
31 | These are fixed configuration parameters that are directly tied to the shader you are trying to train from:
32 |
33 | - `INPUT_NEURONS` should equal the number of encoded input parameters from above that are directly passed into the network.
34 |
35 | - `OUTPUT_NEURONS` should equal the output parameters that the network generates. This may be an RGB triple, or just a number of unconnected outputs like for the DisneyBRDF.
36 |
37 | The following parameters are available for experimentation and should be modified to find suitable settings for the network you are trying to train:
38 |
39 | - `NUM_HIDDEN_LAYERS` - The number of hidden layers that make up the network.
40 |
41 | - `HIDDEN_NEURONS` - The number of neurons in the hidden layers of the network. Changing this can make significant differences to the accuracy and cost of your network.
42 |
43 | - `LEARNING_RATE` - This should be tuned to improve convergence of your model.
44 |
45 | In future versions of the library, the precision of the neurons may be alterable, which could change the quality and performance of the network. The current version is fixed to `float16`.
46 |
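For example, a wider but shallower variant of the sample network could be tried by editing nothing but the defines; the values below are purely illustrative:

```
#define HIDDEN_NEURONS 64     // wider hidden layers...
#define NUM_HIDDEN_LAYERS 2   // ...but fewer of them
#define LEARNING_RATE 5e-3f   // typically re-tuned alongside the architecture
```
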
47 | Changing any of these parameters should not require any further code changes, as the defines are shared between the C++ and shader code; they will just require a re-compile. The exception may be when changing the size of the input/output `CoopVec`s, where any code that dereferences their elements directly will need updating, such as:
48 |
49 | ```
50 | float4 predictedDisney = { outputParams[0], outputParams[1], outputParams[2], outputParams[3] };
51 | ```
52 |
53 | As always, experimentation will be required to find the right set of configuration parameters for the optimal training of your network.
54 |
55 | ## Modifying the Activation and Loss Functions
56 |
57 | The Shader Training example uses the `TrainingMLP`, which abstracts much of the training shader code for the user:
58 |
59 | ```
60 | var model = rtxns::mlp::TrainingMLP<NUM_HIDDEN_LAYERS,
61 |     INPUT_NEURONS, HIDDEN_NEURONS, OUTPUT_NEURONS,
62 |     CoopVecMatrixLayout::TrainingOptimal, half>(
63 | gMLPParams,
64 | gMLPParamsGradients,
65 | rtxns::UnpackArray(gConst.weightOffsets),
66 | rtxns::UnpackArray(gConst.biasOffsets));
67 |
68 | var hiddenActivation = rtxns::mlp::ReLUAct();
69 | var finalActivation = rtxns::mlp::ExponentialAct();
70 |
71 | var outputParams = model.forward(inputParams, hiddenActivation, finalActivation);
72 | ```
73 |
74 | The activation functions (`ReLUAct` and `ExponentialAct`) are passed into the model's forward and backward passes for use with the `TrainingMLP` and `InferenceMLP`. These can be found in [CooperativeVectorFunctions.slang](../src/NeuralShading_Shaders/CooperativeVectorFunctions.slang) and extended as necessary. The current version of RTXNS provides a limited set of activation functions, but these can be examined and modified to support more as required.
75 |
76 | The choice of loss function will depend on your dataset. The Simple Training example uses a simple L2 loss function, whereas the Shader Training example uses a more complex L2 relative loss function. Any loss function can be trivially coded in Slang to help tune your shader, as the sketch below shows.
77 |
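As a sketch of what adding your own loss might look like, here is a hypothetical L1 loss following the value/derivative pattern of the built-in losses; the struct name and exact interface are illustrative, modeled on the `L2Relative.deriv(...)` usage in the Shader Training example:

```
// Hypothetical L1 loss. deriv() returns dLoss/dPredicted, pre-multiplied by a
// caller-supplied scale (e.g. LOSS_SCALE / batch size), matching how the
// built-in losses are invoked in the training shader.
struct L1Loss
{
    static float4 value(float4 target, float4 predicted, float4 scale)
    {
        return abs(predicted - target) * scale;
    }

    static float4 deriv(float4 target, float4 predicted, float4 scale)
    {
        return sign(predicted - target) * scale;
    }
}
```
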
78 | ## Hyperparameters
79 |
80 | These are some of the hyperparameters that are available for tuning for your dataset.
81 |
82 | | Parameter | Value |
83 | | --------------------------- | ---------------- |
84 | | HIDDEN_NEURONS | 32 |
85 | | NUM_HIDDEN_LAYERS | 3 |
86 | | LEARNING_RATE | 1e-2f |
87 | | BATCH_SIZE | (1 << 16) |
88 | | BATCH_COUNT | 100 |
89 | | Hidden Activation Functions | ReLUAct() |
90 | | Final Activation Functions | ExponentialAct() |
91 | | Loss Function | L2Relative() |
92 |
93 | ## Summary
94 |
95 | The Shader Training sample is a good place to start to train your own neural shader. It will require some thought as to how to decompose your shader into network inputs and shader inputs and then the network can be re-configured through experimentation to find the suitable model that can handle your dataset.
96 |
--------------------------------------------------------------------------------
/docs/shader_training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/shader_training.png
--------------------------------------------------------------------------------
/docs/simple_inferencing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/simple_inferencing.png
--------------------------------------------------------------------------------
/docs/simple_training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/simple_training.png
--------------------------------------------------------------------------------
/docs/simple_training_trained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/simple_training_trained.png
--------------------------------------------------------------------------------
/docs/slangpy_training.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/slangpy_training.jpg
--------------------------------------------------------------------------------
/samples/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |
10 | add_subdirectory(SimpleInferencing)
11 | add_subdirectory(ShaderTraining)
12 | add_subdirectory(SimpleTraining)
13 | add_subdirectory(SlangpyTraining)
14 |
--------------------------------------------------------------------------------
/samples/ShaderTraining/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |
10 | include(../../external/donut/compileshaders.cmake)
11 |
12 | set(shader_includes
13 | ${SAMPLES_SHADER_INCLUDE_DIR}
14 | ${CMAKE_CURRENT_LIST_DIR}
15 | )
16 |
17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" )
18 |
19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " )
20 |
21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " )
22 |
23 | set(project ShaderTraining)
24 | set(folder "Samples/ShaderTraining")
25 |
26 | file(GLOB ${project}_shaders "*.slang")
27 | file(GLOB ${project}_sources "*.cpp" "*.h")
28 |
29 | donut_compile_shaders_all_platforms(
30 | TARGET ${project}_shaders
31 | CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg
32 | INCLUDES ${shader_includes}
33 | FOLDER ${folder}
34 | OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project}
35 | SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS}
36 | SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV}
37 | SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL}
38 | SOURCES ${${project}_shaders}
39 | SLANG
40 | )
41 |
42 | add_executable(${project} WIN32 ${${project}_sources})
43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils)
44 | add_dependencies(${project} ${project}_shaders)
45 | set_target_properties(${project} PROPERTIES FOLDER ${folder})
46 |
47 | if (MSVC)
48 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP")
49 | endif()
--------------------------------------------------------------------------------
/samples/ShaderTraining/Disney.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | //----------- Core part of the shader
12 |
13 | const static float PI = 3.14159265358979323846;
14 |
15 | float SchlickFresnel(float u)
16 | {
17 | float m = clamp(1 - u, 0, 1);
18 | float m2 = m * m;
19 | return m2 * m2 * m; // pow(m,5)
20 | }
21 |
22 | float Gtr1(float NdotH, float a)
23 | {
24 | if (a >= 1)
25 | {
26 | return 1 / PI;
27 | }
28 | float a2 = a * a;
29 | float t = 1 + (a2 - 1) * NdotH * NdotH;
30 | return (a2 - 1) / (PI * log(a2) * t);
31 | }
32 |
33 | float Gtr2(float NdotH, float ax)
34 | {
35 | float a = ax * (1 / ax / ax * (1 - NdotH * NdotH) + NdotH * NdotH);
36 | return 1 / (PI * a * a);
37 | }
38 |
39 | float SmithGGX(float NdotV, float alphaG)
40 | {
41 | float a = alphaG * alphaG;
42 | float b = NdotV * NdotV;
43 | return 1 / (NdotV + sqrt(a + b - a * b));
44 | }
45 |
46 | float SmithGGXAnisotropy(float NdotV, float ax)
47 | {
48 | return 1 / (NdotV + sqrt(ax * ax * (1 - NdotV * NdotV) + NdotV * NdotV));
49 | }
50 |
51 | float4 Disney(float NdotL, float NdotV, float NdotH, float LdotH, float roughness)
52 | {
53 | float FL = SchlickFresnel(NdotL), FV = SchlickFresnel(NdotV);
54 | float Fss90 = LdotH * LdotH * roughness;
55 | float Fss = lerp(1.0f, Fss90, FL) * lerp(1.0f, Fss90, FV);
56 | float ss = 1.25f * (Fss * (1.f / (NdotL + NdotV) - .5f) + .5f);
57 |
58 | // specular
59 | float ax = max(.001f, roughness * roughness);
60 | float Ds = Gtr2(NdotH, ax);
61 | float FH = SchlickFresnel(LdotH);
62 | float Gs = SmithGGXAnisotropy(NdotL, ax);
63 | Gs *= SmithGGXAnisotropy(NdotV, ax);
64 |
65 | // clearcoat (ior = 1.5 -> F0 = 0.04)
66 | float Dr = Gtr1(NdotH, .01f);
67 | float Fr = lerp(.04f, 1.0f, FH);
68 | float Gr = SmithGGX(NdotL, .25f) * SmithGGX(NdotV, .25f);
69 |
70 | return float4((1 / PI) * ss, Gs * Ds, FH, .25 * Gr * Fr * Dr);
71 | }
72 |
--------------------------------------------------------------------------------
/samples/ShaderTraining/DisneyMLP.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | import MLP;
12 | import CooperativeVectorFunctions;
13 | import Activation;
14 | import Utils;
15 |
16 | // 5 inputs are passed into this function; NdotL, NdotV, NdotH, LdotH, roughness
17 | #define INPUT_FEATURES 5
18 |
19 | // The output is float4
20 | #define OUTPUT_NEURONS 4
21 |
22 | // EncodeFrequency expands the input by 6 per input feature
23 | #define FREQUENCY_EXPANSION 6
24 |
25 | float4 DisneyMLP(
26 | float NdotL, float NdotV, float NdotH, float LdotH, float roughness, ByteAddressBuffer mlpBuffer,
27 | uint weightOffsets[HIDDEN_LAYERS+1], uint biasOffsets[HIDDEN_LAYERS+1])
28 | {
29 | // Calculate approximated core shader part using MLP
30 | float params[INPUT_FEATURES] = { NdotL, NdotV, NdotH, LdotH, roughness };
31 |
32 | var inputParams = rtxns::EncodeFrequency(params);
33 |
34 |     var model = rtxns::mlp::InferenceMLP<
35 |         HIDDEN_LAYERS,
36 |         INPUT_FEATURES * FREQUENCY_EXPANSION,
37 |         HIDDEN_NEURONS,
38 |         OUTPUT_NEURONS,
39 |         CoopVecMatrixLayout::TrainingOptimal,
40 |         half>
41 | (mlpBuffer, weightOffsets, biasOffsets);
42 |
43 | var outputParams = model.forward(inputParams, rtxns::mlp::ReLUAct(), rtxns::mlp::ExponentialAct());
44 | return float4(outputParams[0], outputParams[1], outputParams[2], outputParams[3]);
45 | }
46 |
--------------------------------------------------------------------------------
/samples/ShaderTraining/NetworkConfig.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #define INPUT_FEATURES 5
12 | #define INPUT_NEURONS (INPUT_FEATURES * 6) // 6* from Frequency Encoding
13 | #define OUTPUT_NEURONS 4
14 |
15 | #define HIDDEN_NEURONS 32
16 | #define NUM_HIDDEN_LAYERS 3
17 | #define BATCH_SIZE (1 << 16)
18 | #define BATCH_COUNT 100
19 |
20 | #define LEARNING_RATE 1e-2f
21 | #define COMPONENT_WEIGHTS float4(1.f, 10.f, 1.f, 5.f)
22 |
23 | #define NUM_TRANSITIONS (NUM_HIDDEN_LAYERS + 1)
24 | #define NUM_TRANSITIONS_ALIGN4 ((NUM_TRANSITIONS + 3) / 4)
25 | #define LOSS_SCALE 128.0
26 |
27 | struct DirectConstantBufferEntry
28 | {
29 | // Scene setup
30 | float4x4 viewProject;
31 | float4x4 view;
32 | float4 cameraPos;
33 |
34 | // Light setup
35 | float4 lightDir;
36 | float4 lightIntensity;
37 |
38 | // Material props
39 | float4 baseColor;
40 | float specular = 0;
41 | float roughness = 0;
42 | float metallic = 0;
43 |
44 | // Alignment
45 | float pad = 0;
46 | };
47 |
48 | struct InferenceConstantBufferEntry : DirectConstantBufferEntry
49 | {
50 | // Neural weight & bias offsets
51 | uint4 weightOffsets[NUM_TRANSITIONS_ALIGN4];
52 | uint4 biasOffsets[NUM_TRANSITIONS_ALIGN4];
53 | };
54 |
55 | struct TrainingConstantBufferEntry
56 | {
57 | uint4 weightOffsets[NUM_TRANSITIONS_ALIGN4];
58 | uint4 biasOffsets[NUM_TRANSITIONS_ALIGN4];
59 | uint32_t maxParamSize;
60 | float learningRate;
61 | float currentStep;
62 | uint32_t batchSize;
63 | uint64_t seed;
64 | };
65 |
--------------------------------------------------------------------------------
/samples/ShaderTraining/computeOptimizer.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include
13 |
14 | import Optimizers;
15 |
16 | DECLARE_CBUFFER(TrainingConstantBufferEntry, gConst, 0, 0);
17 | RWBuffer<half> gMLPParams : REGISTER_UAV(0, 0);
18 | RWBuffer<float> gMLPParams32 : REGISTER_UAV(1, 0);
19 | RWBuffer<half> gMLPParamsGradients : REGISTER_UAV(2, 0);
20 | RWBuffer<float> gMoments1 : REGISTER_UAV(3, 0);
21 | RWBuffer<float> gMoments2 : REGISTER_UAV(4, 0);
22 |
23 | [numthreads(32, 1, 1)]
24 | void adam_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
25 | {
26 | uint i = dispatchThreadID.x;
27 | if (i >= gConst.maxParamSize)
28 | return;
29 |
30 | float gradient = (float)gMLPParamsGradients[i];
31 | gMLPParamsGradients[i] = half(0.0);
32 |
33 | if (isfinite(gradient))
34 | {
35 | float weightbias = gMLPParams32[i];
36 |
37 | optimizers::Adam optimizer = optimizers::Adam(gMoments1, gMoments2, gConst.learningRate, LOSS_SCALE);
38 |
39 | float adjustedWeightbias = optimizer.step(weightbias, i, gradient, gConst.currentStep);
40 |
41 | gMLPParams32[i] = adjustedWeightbias;
42 | gMLPParams[i] = (half) adjustedWeightbias;
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/samples/ShaderTraining/computeTraining.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include
13 |
14 | import CooperativeVectorAutoDiff;
15 | import CooperativeVectorFunctions;
16 | import Utils;
17 | import Activation;
18 | import MLP;
19 | import Loss;
20 | import PCG32;
21 | import Disney;
22 |
23 | DECLARE_CBUFFER(TrainingConstantBufferEntry, gConst, 0, 0);
24 | ByteAddressBuffer gMLPParams : REGISTER_SRV(0, 0);
25 | RWByteAddressBuffer gMLPParamsGradients : REGISTER_UAV(0, 0);
26 |
27 | [shader("compute")]
28 | [numthreads(64, 1, 1)]
29 | void main_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
30 | {
31 | //----------- Randomly generate input parameters
32 | uint idx = dispatchThreadID.x;
33 | PCG32 rng = PCG32(gConst.seed, idx);
34 |
35 | // Using tangent coordinate system. N = (0,0,1)
36 | // L is arbitrary, but (N,L) >= 0 => L.z > 0, so generate random L in XZ plane's first quadrant
37 | float3 L;
38 | L.y = 0.f;
39 | sincos(rng.nextFloat()*PI/2, L.z, L.x);
40 |
41 | // V is random direction, but (N,V) >= 0 => V.z > 0
42 |     float sa, ca; // Azimuth [-PI,PI]
43 | sincos(-PI + 2 * PI * rng.nextFloat(), sa, ca);
44 | float se, ce; // Elevation [0,PI/2]
45 | sincos(PI/2 * rng.nextFloat(), se, ce);
46 | float3 V = float3(ce*ca, ce*sa, se);
47 |
48 | float NdotL = L.z;
49 | float NdotV = V.z;
50 |
51 | float3 H = normalize(L+V);
52 | float NdotH = H.z;
53 | float LdotH = dot(L,H);
54 |
55 | float roughness = rng.nextFloat()*0.7f+0.3f;
56 |
57 | //----------- Calculate core shader part DIRECTLY
58 | float4 actualDisney = Disney(NdotL, NdotV, NdotH, LdotH, roughness);
59 |
60 | //----------- Training step
61 | float params[INPUT_FEATURES] = {NdotL, NdotV, NdotH, LdotH, roughness};
62 | var inputParams = rtxns::EncodeFrequency(params);
63 |
64 |     var model = rtxns::mlp::TrainingMLP<NUM_HIDDEN_LAYERS,
65 |         INPUT_NEURONS, HIDDEN_NEURONS, OUTPUT_NEURONS,
66 |         CoopVecMatrixLayout::TrainingOptimal, half>(
67 | gMLPParams,
68 | gMLPParamsGradients,
69 | rtxns::UnpackArray(gConst.weightOffsets),
70 | rtxns::UnpackArray(gConst.biasOffsets));
71 |
72 | var hiddenActivation = rtxns::mlp::ReLUAct();
73 | var finalActivation = rtxns::mlp::ExponentialAct();
74 |
75 | var outputParams = model.forward(inputParams, hiddenActivation, finalActivation);
76 |
77 | float4 predictedDisney = { outputParams[0], outputParams[1], outputParams[2], outputParams[3] };
78 |
79 | float4 lossGradient = rtxns::mlp::L2Relative.deriv(actualDisney, predictedDisney, float4(LOSS_SCALE / (gConst.batchSize * 4)) * COMPONENT_WEIGHTS);
80 |
81 | model.backward(inputParams, hiddenActivation, finalActivation, rtxns::HCoopVec(lossGradient[0], lossGradient[1], lossGradient[2], lossGradient[3]));
82 | }
83 |
--------------------------------------------------------------------------------
/samples/ShaderTraining/renderDifference.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include
13 |
14 | import CooperativeVectorAutoDiff;
15 | import CooperativeVectorFunctions;
16 | import Utils;
17 | import Activation;
18 | import MLP;
19 | import Disney;
20 | import DisneyMLP;
21 |
22 | DECLARE_CBUFFER(InferenceConstantBufferEntry, gConst, 0, 0);
23 | ByteAddressBuffer gMLPParams : REGISTER_SRV(0, 0);
24 |
25 | struct PS_INPUT
26 | {
27 | float4 pos : SV_Position;
28 | float3 norm : NORMAL;
29 | float3 view : VIEW;
30 | }
31 |
32 | [shader("vertex")]
33 | void main_vs(
34 | float3 i_pos : POSITION,
35 | float3 i_norm : NORMAL,
36 | out PS_INPUT output)
37 | {
38 | output.pos = mul(float4(i_pos, 1), gConst.viewProject);
39 | output.norm = i_norm;
40 | output.view = gConst.cameraPos.xyz - i_pos;
41 | }
42 |
43 | float3 calcColor(float4 params)
44 | {
45 | float3 Cdlin = pow(gConst.baseColor.rgb, 2.2);
46 | float3 Cspec0 = lerp(gConst.specular * float3(.08f), Cdlin, gConst.metallic);
47 | float3 brdfn = params.x * Cdlin * (1 - gConst.metallic) + params.y * lerp(Cspec0, float3(1), params.z) + params.w;
48 | return clamp(brdfn * gConst.lightIntensity.rgb, 0, 1);
49 | }
50 |
51 | [shader("fragment")]
52 | void main_ps(
53 | PS_INPUT input,
54 | out float4 o_color : SV_Target0)
55 | {
56 | // Prepare input parameters
57 | float3 view = normalize(input.view);
58 | float3 norm = normalize(input.norm);
59 | float3 h = normalize(-gConst.lightDir.xyz + view);
60 |
61 | float NdotL = max(0.f, dot(norm, -gConst.lightDir.xyz));
62 | float NdotV = max(0.f, dot(norm, view));
63 | float NdotH = max(0.f, dot(norm, h));
64 | float LdotH = max(0.f, dot(h, -gConst.lightDir.xyz));
65 |
66 | //----------- Calculate core shader part DIRECTLY
67 | float4 actualDisney = Disney(NdotL, NdotV, NdotH, LdotH, gConst.roughness);
68 |
69 | // Calculate approximated core shader
70 | float4 outParams = DisneyMLP(
71 | NdotL, NdotV, NdotH, LdotH, gConst.roughness,
72 | gMLPParams,
73 | rtxns::UnpackArray(gConst.weightOffsets),
74 | rtxns::UnpackArray(gConst.biasOffsets)
75 | );
76 |
77 | o_color = float4((calcColor(actualDisney) - calcColor(outParams)) * NdotL * 4 + 0.5, 1.f);
78 | }
79 |
--------------------------------------------------------------------------------
/samples/ShaderTraining/renderDisney.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include
13 |
14 | DECLARE_CBUFFER(DirectConstantBufferEntry, gConst, 0, 0);
15 |
16 | struct PS_INPUT
17 | {
18 | float4 pos : SV_Position;
19 | float3 norm : NORMAL;
20 | float3 view : VIEW;
21 | }
22 |
23 | [shader("vertex")]
24 | void main_vs(
25 | float3 i_pos : POSITION,
26 | float3 i_norm : NORMAL,
27 | out PS_INPUT output)
28 | {
29 | output.pos = mul(float4(i_pos, 1), gConst.viewProject);
30 | output.norm = i_norm;
31 | output.view = gConst.cameraPos.xyz - i_pos;
32 | }
33 |
34 | import Disney;
35 |
36 | [shader("fragment")]
37 | void main_ps(
38 | PS_INPUT input,
39 | out float4 o_color : SV_Target0)
40 | {
41 | //----------- Prepare input parameters
42 | float3 view = normalize(input.view);
43 | float3 norm = normalize(input.norm);
44 | float3 h = normalize(-gConst.lightDir.xyz + view);
45 |
46 | float NdotL = max(0.f, dot(norm, -gConst.lightDir.xyz));
47 | float NdotV = max(0.f, dot(norm, view));
48 | float NdotH = max(0.f, dot(norm, h));
49 | float LdotH = max(0.f, dot(h, -gConst.lightDir.xyz));
50 |
51 | //----------- Calculate core shader part DIRECTLY
52 | float4 outParams = Disney(NdotL, NdotV, NdotH, LdotH, gConst.roughness);
53 |
54 | //----------- Calculate final color
55 | float3 Cdlin = float3(pow(gConst.baseColor[0], 2.2), pow(gConst.baseColor[1], 2.2), pow(gConst.baseColor[2], 2.2));
56 | float3 Cspec0 = lerp(gConst.specular * .08 * float3(1), Cdlin, gConst.metallic);
57 | float3 brdfn = outParams.x * Cdlin * (1 - gConst.metallic) + outParams.y * lerp(Cspec0, float3(1), outParams.z) + outParams.w;
58 | float3 colorh = brdfn * float3(NdotL) * gConst.lightIntensity.rgb;
59 |
60 | o_color = float4(colorh, 1.f);
61 | }
62 |
--------------------------------------------------------------------------------
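
Both render shaders share the same final-color composition: the four lobe terms returned by `Disney()` (or its MLP approximation) are combined with the material constants. A NumPy transcription for reference, assuming scalar material parameters and an RGB light color:

```python
import numpy as np

def compose_disney_color(params, base_color, specular, metallic, ndotl, light_rgb):
    """params[0..3] are the four lobe terms from Disney()/DisneyMLP().
    lerp(a, b, t) is written out as (1 - t) * a + t * b."""
    cdlin = np.power(base_color, 2.2)                 # gamma -> linear base color
    cspec0 = (1 - metallic) * (specular * 0.08) + metallic * cdlin
    brdf = (params[0] * cdlin * (1 - metallic)
            + params[1] * ((1 - params[2]) * cspec0 + params[2])
            + params[3])
    return brdf * ndotl * light_rgb
```
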
/samples/ShaderTraining/renderInference.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 |
14 | import CooperativeVectorAutoDiff;
15 | import CooperativeVectorFunctions;
16 | import Utils;
17 | import Activation;
18 | import MLP;
19 | import DisneyMLP;
20 |
21 | DECLARE_CBUFFER(InferenceConstantBufferEntry, gConst, 0, 0);
22 | ByteAddressBuffer gMLPParams : REGISTER_SRV(0, 0);
23 |
24 | struct PS_INPUT
25 | {
26 | float4 pos : SV_Position;
27 | float3 norm : NORMAL;
28 | float3 view : VIEW;
29 | }
30 |
31 | [shader("vertex")]
32 | void main_vs(
33 | float3 i_pos : POSITION,
34 | float3 i_norm : NORMAL,
35 | out PS_INPUT output)
36 | {
37 | output.pos = mul(float4(i_pos, 1), gConst.viewProject);
38 | output.norm = i_norm;
39 | output.view = gConst.cameraPos.xyz - i_pos;
40 | }
41 |
42 | [shader("fragment")]
43 | void main_ps(
44 | PS_INPUT input,
45 | out float4 o_color : SV_Target0)
46 | {
47 | // Prepare input parameters
48 | float3 view = normalize(input.view);
49 | float3 norm = normalize(input.norm);
50 | float3 h = normalize(-gConst.lightDir.xyz + view);
51 |
52 | float NdotL = max(0.f, dot(norm, -gConst.lightDir.xyz));
53 | float NdotV = max(0.f, dot(norm, view));
54 | float NdotH = max(0.f, dot(norm, h));
55 | float LdotH = max(0.f, dot(h, -gConst.lightDir.xyz));
56 |
57 | // Calculate approximated core shader
58 | float4 outParams = DisneyMLP(
59 | NdotL, NdotV, NdotH, LdotH, gConst.roughness,
60 | gMLPParams,
61 | rtxns::UnpackArray(gConst.weightOffsets),
62 | rtxns::UnpackArray(gConst.biasOffsets)
63 | );
64 |
65 | // Calculate final color
66 | float3 Cdlin = pow(gConst.baseColor.rgb, 2.2);
67 | float3 Cspec0 = lerp(gConst.specular * float3(.08f), Cdlin, gConst.metallic);
68 | float3 brdfn = outParams.x * Cdlin * (1 - gConst.metallic) + outParams.y * lerp(Cspec0, float3(1), outParams.z) + outParams.w;
69 | float3 colorh = brdfn * NdotL * gConst.lightIntensity.rgb;
70 |
71 | o_color = float4(colorh, 1.f);
72 | }
73 |
--------------------------------------------------------------------------------
/samples/ShaderTraining/shaders.cfg:
--------------------------------------------------------------------------------
1 | renderDisney.slang -E main_vs -T vs
2 | renderDisney.slang -E main_ps -T ps
3 | renderInference.slang -E main_vs -T vs
4 | renderInference.slang -E main_ps -T ps
5 | renderDifference.slang -E main_vs -T vs
6 | renderDifference.slang -E main_ps -T ps
7 | computeTraining.slang -E main_cs -T cs
8 | computeOptimizer.slang -E adam_cs -T cs
9 |
--------------------------------------------------------------------------------
/samples/SimpleInferencing/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |
10 | include(../../external/donut/compileshaders.cmake)
11 |
12 | set(shader_includes
13 | ${SAMPLES_SHADER_INCLUDE_DIR}
14 | ${CMAKE_CURRENT_LIST_DIR}
15 | )
16 |
17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" )
18 |
19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " )
20 |
21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " )
22 |
23 | set(project SimpleInferencing)
24 | set(folder "Samples/SimpleInferencing")
25 |
26 | file(GLOB_RECURSE ${project}_shaders "*.hlsl" "*.hlsli" "*.slang")
27 | file(GLOB_RECURSE ${project}_sources "*.cpp" "*.h" "*.md")
28 |
29 | donut_compile_shaders_all_platforms(
30 | TARGET ${project}_shaders
31 | CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg
32 | INCLUDES ${shader_includes}
33 | FOLDER ${folder}
34 | OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project}
35 | SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS}
36 | SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV}
37 | SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL}
38 | SOURCES ${${project}_shaders}
39 | SLANG
40 | )
41 |
42 | add_executable(${project} WIN32 ${${project}_sources})
43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils)
44 | add_dependencies(${project} ${project}_shaders)
45 | set_target_properties(${project} PROPERTIES FOLDER ${folder})
46 |
47 | if (MSVC)
48 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP")
49 | endif()
--------------------------------------------------------------------------------
/samples/SimpleInferencing/NetworkConfig.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #ifndef __NETWORK_CONFIG_H__
12 | #define __NETWORK_CONFIG_H__
13 |
14 | #define VECTOR_FORMAT half
15 | #define TYPE_INTERPRETATION CoopVecComponentType::Float16
16 |
17 | // When loading a model from file, these parameters must match
18 | #define INPUT_FEATURES 5
19 | #define INPUT_NEURONS (INPUT_FEATURES * 6) // Frequency encoding expands each input into 6 encoded values
20 | #define OUTPUT_NEURONS 4
21 | #define HIDDEN_NEURONS 32
22 |
23 | struct NeuralConstants
24 | {
25 | // Scene setup
26 | float4x4 viewProject;
27 | float4x4 view;
28 | float4 cameraPos;
29 |
30 | // Light setup
31 | float4 lightDir;
32 | float4 lightIntensity;
33 |
34 | // Material props
35 | float4 baseColor;
36 | float specular;
37 | float roughness;
38 | float metallic;
39 | float padding;
40 |
41 | // Neural weight & bias offsets
42 | uint4 weightOffsets; // Offsets to weight matrices in bytes.
43 | uint4 biasOffsets; // Offsets to bias vectors in bytes.
44 | };
45 |
46 | #endif //__NETWORK_CONFIG_H__
--------------------------------------------------------------------------------
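
INPUT_NEURONS above relies on each raw input expanding into 6 encoded values. The actual basis lives in CooperativeVectorFunctions.slang and is not shown in this section; a common construction that yields 6 values per input is sin/cos at three octaves, sketched here as an assumption:

```python
import numpy as np

def encode_frequency(x, octaves=3):
    """Hypothetical frequency encoding: each scalar becomes
    [sin(2^k * pi * x), cos(2^k * pi * x)] for k = 0..2, i.e. 6 values."""
    x = np.asarray(x, dtype=np.float32)
    feats = []
    for k in range(octaves):
        feats += [np.sin((2.0 ** k) * np.pi * x), np.cos((2.0 ** k) * np.pi * x)]
    return np.stack(feats, axis=-1).ravel()

# 5 raw features -> 30 network inputs, matching INPUT_NEURONS = INPUT_FEATURES * 6
assert encode_frequency(np.zeros(5)).shape == (30,)
```
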
/samples/SimpleInferencing/SimpleInferencing.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | import CooperativeVectorFunctions;
12 | import Utils;
13 | import LinearOps;
14 |
15 | #include "NetworkConfig.h"
16 | #include <donut/shaders/binding_helpers.hlsli>
17 |
18 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
19 | ByteAddressBuffer gMLPParams :REGISTER_SRV(0, 0);
20 |
21 | struct VertexIn
22 | {
23 | float3 pos : POSITION;
24 | float3 norm : NORMAL;
25 | };
26 |
27 | struct VertexOut
28 | {
29 | float4 pos : SV_Position;
30 | float3 norm : NORMAL;
31 | float3 view : VIEW;
32 | }
33 |
34 | [shader("vertex")]
35 | void main_vs(
36 | VertexIn vIn,
37 | out VertexOut vOut)
38 | {
39 | vOut.pos = mul(float4(vIn.pos, 1), gConst.viewProject);
40 | vOut.norm = vIn.norm;
41 | vOut.view = gConst.cameraPos.xyz - vIn.pos;
42 | }
43 |
44 | float4 DisneyMLP(float NdotL, float NdotV, float NdotH, float LdotH, float roughness)
45 | {
46 | uint4 weightOffsets = gConst.weightOffsets;
47 | uint4 biasOffsets = gConst.biasOffsets;
48 |
49 | CoopVec<VECTOR_FORMAT, INPUT_NEURONS> inputParams;
50 | CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenParams;
51 | CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputParams;
52 |
53 | // Encode input parameters, 5 inputs to 30 parameters
54 | float params[INPUT_FEATURES] = { NdotL, NdotV, NdotH, LdotH, roughness };
55 | inputParams = rtxns::EncodeFrequency(params);
56 |
57 | // Forward propagation through the neural network
58 | // Input to hidden layer, then apply activation function
59 | hiddenParams = rtxns::LinearOp(
60 | inputParams, gMLPParams, weightOffsets[0], biasOffsets[0], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION);
61 | hiddenParams = rtxns::relu(hiddenParams);
62 |
63 | // Hidden layer to hidden layer, then apply activation function
64 | hiddenParams = rtxns::LinearOp(
65 | hiddenParams, gMLPParams, weightOffsets[1], biasOffsets[1], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION);
66 | hiddenParams = rtxns::relu(hiddenParams);
67 |
68 | // Hidden layer to hidden layer, then apply activation function
69 | hiddenParams = rtxns::LinearOp(
70 | hiddenParams, gMLPParams, weightOffsets[2], biasOffsets[2], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION);
71 | hiddenParams = rtxns::relu(hiddenParams);
72 |
73 | // Hidden layer to output layer, then apply final activation function
74 | outputParams = rtxns::LinearOp(
75 | hiddenParams, gMLPParams, weightOffsets[3], biasOffsets[3], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION);
76 | outputParams = exp(outputParams);
77 |
78 | // Take the output from the neural network as the output color
79 | return float4(outputParams[0], outputParams[1], outputParams[2], outputParams[3]);
80 | }
81 |
82 | [shader("fragment")]
83 | void main_ps(
84 | VertexOut vOut,
85 | out float4 o_color : SV_Target0)
86 | {
87 | float4 lightIntensity = gConst.lightIntensity;
88 | float4 lightDir = gConst.lightDir;
89 | float4 baseColor = gConst.baseColor;
90 | float specular = gConst.specular;
91 | float roughness = gConst.roughness;
92 | float metallic = gConst.metallic;
93 |
94 | // Prepare input parameters
95 | float3 view = normalize(vOut.view);
96 | float3 norm = normalize(vOut.norm);
97 | float3 h = normalize(-lightDir.xyz + view);
98 |
99 | float NdotL = max(0.f, dot(norm, -lightDir.xyz));
100 | float NdotV = max(0.f, dot(norm, view));
101 | float NdotH = max(0.f, dot(norm, h));
102 | float LdotH = max(0.f, dot(h, -lightDir.xyz));
103 |
104 | // Calculate approximated core shader part using MLP
105 | float4 outParams = DisneyMLP(NdotL, NdotV, NdotH, LdotH, roughness);
106 |
107 | // Calculate final color
108 | float3 Cdlin = float3(pow(baseColor.r, 2.2), pow(baseColor.g, 2.2), pow(baseColor.b, 2.2));
109 | float3 Cspec0 = lerp(specular * .08f * float3(1,1,1), Cdlin, metallic);
110 | float3 brdfn = outParams.x * Cdlin * (1 - metallic) + outParams.y * lerp(Cspec0, float3(1), outParams.z) + outParams.w;
111 | float3 colorh = brdfn * float3(NdotL) * lightIntensity.rgb;
112 |
113 | o_color = float4(colorh, 1.f);
114 | }
115 |
--------------------------------------------------------------------------------
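
DisneyMLP above is a plain four-layer forward pass: three ReLU hidden layers followed by an `exp` on the output. Stripped of cooperative vectors and byte offsets, the same computation in NumPy looks like this (the dense weight/bias arrays are stand-ins for the packed parameter buffer):

```python
import numpy as np

def disney_mlp_forward(x, weights, biases):
    """30 encoded inputs -> 32 -> 32 -> 32 -> 4 outputs, mirroring the
    rtxns::LinearOp + rtxns::relu chain with a final exp activation."""
    h = x
    for W, b in zip(weights[:-1], biases[:-1]):
        h = np.maximum(W @ h + b, 0.0)               # hidden layer + ReLU
    return np.exp(weights[-1] @ h + biases[-1])      # output layer + exp

rng = np.random.default_rng(0)
shapes = [(32, 30), (32, 32), (32, 32), (4, 32)]
Ws = [rng.normal(scale=0.1, size=s) for s in shapes]
bs = [np.zeros(s[0]) for s in shapes]
out = disney_mlp_forward(rng.random(30), Ws, bs)     # shape (4,)
```
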
/samples/SimpleInferencing/shaders.cfg:
--------------------------------------------------------------------------------
1 | SimpleInferencing.slang -T vs -E main_vs
2 | SimpleInferencing.slang -T ps -E main_ps
3 |
--------------------------------------------------------------------------------
/samples/SimpleTraining/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |
10 | include(../../external/donut/compileshaders.cmake)
11 |
12 | set(shader_includes
13 | ${SAMPLES_SHADER_INCLUDE_DIR}
14 | ${CMAKE_CURRENT_LIST_DIR}
15 | )
16 |
17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" )
18 |
19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " )
20 |
21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " )
22 |
23 | set(project SimpleTraining)
24 | set(folder "Samples/SimpleTraining")
25 |
26 | file(GLOB_RECURSE ${project}_shaders "*.hlsl" "*.hlsli" "*.slang")
27 | file(GLOB_RECURSE ${project}_sources "*.cpp" "*.h" "*.md")
28 |
29 | donut_compile_shaders_all_platforms(
30 | TARGET ${project}_shaders
31 | CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg
32 | INCLUDES ${shader_includes}
33 | FOLDER ${folder}
34 | OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project}
35 | SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS}
36 | SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV}
37 | SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL}
38 | SOURCES ${${project}_shaders}
39 | SLANG
40 | )
41 |
42 | add_executable(${project} WIN32 ${${project}_sources})
43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils)
44 | add_dependencies(${project} ${project}_shaders)
45 | set_target_properties(${project} PROPERTIES FOLDER ${folder})
46 |
47 | if (MSVC)
48 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP")
49 | endif()
--------------------------------------------------------------------------------
/samples/SimpleTraining/NetworkConfig.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #define INPUT_FEATURES 2
12 | #define INPUT_NEURONS (INPUT_FEATURES * 6) // Frequency encoding expands each input into 6 encoded values
13 | #define OUTPUT_NEURONS 3
14 |
15 | #define HIDDEN_NEURONS 64
16 | #define NUM_HIDDEN_LAYERS 4
17 |
18 | #define LEARNING_RATE 0.001f
19 |
20 | #define NUM_TRANSITIONS (NUM_HIDDEN_LAYERS + 1)
21 | #define NUM_TRANSITIONS_ALIGN4 ((NUM_TRANSITIONS + 3) / 4)
22 | #define LOSS_SCALE 128.0
23 | #define RELU_LEAK 0.01h
24 |
25 | #define VECTOR_FORMAT half
26 | #define TYPE_INTERPRETATION CoopVecComponentType::Float16
27 | #define NETWORK_PRECISION rtxns::Precision::F16
28 |
29 | #define MATRIX_LAYOUT CoopVecMatrixLayout::TrainingOptimal
30 |
31 | #define BATCH_COUNT 128
32 | #define BATCH_SIZE_X 32
33 | #define BATCH_SIZE_Y 32
34 |
35 | enum class NetworkTransform
36 | {
37 | Identity,
38 | Zoom,
39 | Flip
40 | };
41 |
42 | struct NeuralConstants
43 | {
44 | uint4 weightOffsets[NUM_TRANSITIONS_ALIGN4];
45 | uint4 biasOffsets[NUM_TRANSITIONS_ALIGN4];
46 |
47 | uint32_t imageWidth;
48 | uint32_t imageHeight;
49 | uint32_t maxParamSize;
50 | float learningRate;
51 |
52 | uint32_t currentStep;
53 | uint32_t batchSizeX;
54 | uint32_t batchSizeY;
55 | NetworkTransform networkTransform;
56 | };
57 |
--------------------------------------------------------------------------------
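
The derived constants are straightforward but worth spelling out: 4 hidden layers imply 5 weight/bias transitions, which pack into 2 uint4s per offset array.

```python
# Worked arithmetic for the defines above.
NUM_HIDDEN_LAYERS = 4
NUM_TRANSITIONS = NUM_HIDDEN_LAYERS + 1              # input->h0, h0->h1, ..., h3->out
NUM_TRANSITIONS_ALIGN4 = (NUM_TRANSITIONS + 3) // 4  # ceil(5 / 4) uint4s
assert (NUM_TRANSITIONS, NUM_TRANSITIONS_ALIGN4) == (5, 2)
```
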
/samples/SimpleTraining/SimpleTraining_Inference.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 |
14 | import CooperativeVectorFunctions;
15 | import Utils;
16 | import LinearOps;
17 |
18 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
19 | ByteAddressBuffer gMLPParams :REGISTER_SRV(0, 0);
20 | Texture2D inputTexture :REGISTER_SRV(1, 0);
21 | RWTexture2D<float4> outputTexture :REGISTER_UAV(0, 0);
22 |
23 | [shader("compute")]
24 | [numthreads(8, 8, 1)]
25 | void inference_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
26 | {
27 | // Set the input ID as the uv coordinate and frequency encode it for the network
28 | float2 inputUV = float2(dispatchThreadID.x / float(gConst.imageWidth), dispatchThreadID.y / float(gConst.imageHeight));
29 | CoopVec<VECTOR_FORMAT, INPUT_NEURONS> inputParams = rtxns::EncodeFrequency({inputUV.x, inputUV.y});
30 |
31 | // Load offsets
32 | uint weightOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray(gConst.weightOffsets);
33 | uint biasOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray(gConst.biasOffsets);
34 |
35 | CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenParams;
36 | CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputParams;
37 |
38 | // Forward propagation through the neural network
39 | // Input to hidden layer, then apply activation function
40 | hiddenParams = rtxns::LinearOp(
41 | inputParams, gMLPParams, weightOffsets[0], biasOffsets[0],
42 | MATRIX_LAYOUT, TYPE_INTERPRETATION);
43 | hiddenParams = rtxns::leakyReLU(hiddenParams, RELU_LEAK);
44 |
45 | // Hidden layers to hidden layers, then apply activation function
46 | [ForceUnroll]
47 | for (uint layer = 1; layer < NUM_HIDDEN_LAYERS; layer++)
48 | {
49 | hiddenParams = rtxns::LinearOp(
50 | hiddenParams, gMLPParams, weightOffsets[layer],
51 | biasOffsets[layer],
52 | MATRIX_LAYOUT, TYPE_INTERPRETATION);
53 | hiddenParams = rtxns::leakyReLU(hiddenParams, RELU_LEAK);
54 | }
55 |
56 | // Hidden layer to output layer, then apply final activation function
57 | outputParams = rtxns::LinearOp(
58 | hiddenParams, gMLPParams, weightOffsets[NUM_HIDDEN_LAYERS], biasOffsets[NUM_HIDDEN_LAYERS],
59 | MATRIX_LAYOUT, TYPE_INTERPRETATION);
60 | outputParams = rtxns::sigmoid(outputParams);
61 |
62 | // Take the output from the neural network as the output color
63 | float4 color = {outputParams[0], outputParams[1], outputParams[2], 1.f};
64 | outputTexture[dispatchThreadID.xy] = color;
65 | }
--------------------------------------------------------------------------------
/samples/SimpleTraining/SimpleTraining_Optimizer.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 |
14 | import Optimizers;
15 |
16 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
17 | RWBuffer<half> gMLPParams :REGISTER_UAV(0, 0);
18 | RWBuffer<float> gMLPParamsf :REGISTER_UAV(1, 0);
19 | RWBuffer<half> gMLPParamsGradients :REGISTER_UAV(2, 0);
20 | RWBuffer<float> gMoments1 :REGISTER_UAV(3, 0);
21 | RWBuffer<float> gMoments2 :REGISTER_UAV(4, 0);
22 |
23 | [numthreads(32, 1, 1)]
24 | void adam_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
25 | {
26 | uint i = dispatchThreadID.x;
27 | if (i >= gConst.maxParamSize)
28 | return;
29 |
30 | float gradient = (float)gMLPParamsGradients[i];
31 | gMLPParamsGradients[i] = half(0.0);
32 |
33 | // Get the floating point params, not float16
34 | float weightbias = gMLPParamsf[i];
35 |
36 | optimizers::Adam optimizer = optimizers::Adam(gMoments1, gMoments2, gConst.learningRate, LOSS_SCALE);
37 |
38 | float adjustedWeightbias = optimizer.step(weightbias, i, gradient, gConst.currentStep);
39 |
40 | gMLPParamsf[i] = adjustedWeightbias;
41 | gMLPParams[i] = (half)adjustedWeightbias;
42 | }
43 |
44 | [numthreads(32, 1, 1)]
45 | void convert_weights_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
46 | {
47 | uint i = dispatchThreadID.x;
48 | if (i >= gConst.maxParamSize)
49 | return;
50 |
51 | half param = gMLPParams[i];
52 | gMLPParamsf[i] = float(param);
53 | }
--------------------------------------------------------------------------------
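
adam_cs keeps an FP32 master copy of every parameter and writes an FP16 working copy back for the next training pass, with the LOSS_SCALE factor divided out of each gradient. A per-parameter sketch of that update (the Adam hyperparameter defaults here are assumptions; the real ones live in Optimizers.slang):

```python
import numpy as np

def adam_step(param_f32, grad_f16, m1, m2, i, t,
              lr=0.001, loss_scale=128.0, b1=0.9, b2=0.999, eps=1e-8):
    """One Adam update for parameter i at iteration t (t starts at 1).
    Returns the updated FP32 master weight and its FP16 working copy."""
    g = float(grad_f16) / loss_scale          # undo the LOSS_SCALE boost
    m1[i] = b1 * m1[i] + (1 - b1) * g
    m2[i] = b2 * m2[i] + (1 - b2) * g * g
    m1_hat = m1[i] / (1 - b1 ** t)            # bias-corrected moments
    m2_hat = m2[i] / (1 - b2 ** t)
    new_f32 = param_f32 - lr * m1_hat / (np.sqrt(m2_hat) + eps)
    return new_f32, np.float16(new_f32)
```
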
/samples/SimpleTraining/SimpleTraining_Training.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 |
14 | import CooperativeVectorDerivatives;
15 | import CooperativeVectorFunctions;
16 | import Utils;
17 | import LinearOps;
18 |
19 |
20 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
21 | ByteAddressBuffer gMLPParams :REGISTER_SRV(0, 0);
22 | Texture2D inputTexture :REGISTER_SRV(1, 0);
23 | RWByteAddressBuffer gMLPParamsGradients :REGISTER_UAV(0, 0);
24 | RWStructuredBuffer<uint> gRandState :REGISTER_UAV(1, 0);
25 | RWTexture2D<float4> outputTexture :REGISTER_UAV(2, 0);
26 | RWTexture2D<float4> lossTexture :REGISTER_UAV(3, 0);
27 |
28 | struct RNG
29 | {
30 | uint state;
31 |
32 | __init(uint state) { this.state = state; }
33 |
34 | [mutating]
35 | float next()
36 | {
37 | float r = (state >> 8) * 0x1p-24;
38 | state = state * 2739110765U + 2739110765U;
39 | return r;
40 | }
41 | }
42 |
43 | [shader("compute")]
44 | [numthreads(8, 8, 1)]
45 | void training_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
46 | {
47 | uint2 batchSize = uint2(gConst.batchSizeX, gConst.batchSizeY);
48 |
49 | uint dispatchThreadIdxy = dispatchThreadID.y * batchSize.x + dispatchThreadID.x;
50 |
51 | RNG rng = RNG(gRandState[dispatchThreadIdxy]);
52 |
53 | // Get a random uv coordinate for the input and frequency encode it for improved convergence
54 | float2 inputUV = clamp(float2(rng.next(), rng.next()), 0.0, 1.0);
55 | CoopVec<VECTOR_FORMAT, INPUT_NEURONS> inputParams = rtxns::EncodeFrequency({inputUV.x, inputUV.y});
56 |
57 | // Load offsets
58 | uint weightOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray(gConst.weightOffsets);
59 | uint biasOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray(gConst.biasOffsets);
60 |
61 | // Create variables to cache the results from each stage
62 | CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenParams[NUM_HIDDEN_LAYERS];
63 | CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenActivated[NUM_HIDDEN_LAYERS];
64 | CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputParams;
65 | CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputActivated;
66 |
67 | // Forward propagation through the neural network
68 | // Input to hidden layer, then apply activation function
69 | hiddenParams[0] = rtxns::LinearOp(
70 | inputParams, gMLPParams, weightOffsets[0], biasOffsets[0], MATRIX_LAYOUT, TYPE_INTERPRETATION);
71 | hiddenActivated[0] = rtxns::leakyReLU(hiddenParams[0], RELU_LEAK);
72 |
73 | // Hidden layers to hidden layers, then apply activation function
74 | [ForceUnroll]
75 | for (uint layer = 1; layer < NUM_HIDDEN_LAYERS; layer++)
76 | {
77 | hiddenParams[layer] = rtxns::LinearOp(
78 | hiddenActivated[layer - 1], gMLPParams, weightOffsets[layer], biasOffsets[layer],
79 | MATRIX_LAYOUT, TYPE_INTERPRETATION);
80 | hiddenActivated[layer] = rtxns::leakyReLU(hiddenParams[layer], RELU_LEAK);
81 | }
82 |
83 | // Hidden layer to output layer, then apply final activation function
84 | outputParams = rtxns::LinearOp(
85 | hiddenActivated[NUM_HIDDEN_LAYERS - 1], gMLPParams, weightOffsets[NUM_HIDDEN_LAYERS],
86 | biasOffsets[NUM_HIDDEN_LAYERS], MATRIX_LAYOUT, TYPE_INTERPRETATION);
87 | outputActivated = rtxns::sigmoid(outputParams);
88 |
89 | // Take the output from the neural network as the output color
90 | float3 predictedRGB = {outputActivated[0], outputActivated[1], outputActivated[2]};
91 |
92 | // Now transform the input UVs according to the NetworkTransform enum.
93 | // This can easily be extended to try many different transforms.
94 | uint2 actualUV;
95 | if (gConst.networkTransform == NetworkTransform.Flip)
96 | {
97 | float2 flipUV = inputUV.yx;
98 | actualUV = uint2(flipUV.xy * float2(gConst.imageHeight, gConst.imageWidth));
99 | }
100 | else if (gConst.networkTransform == NetworkTransform.Zoom)
101 | {
102 | float2 zoomUV = inputUV * 0.5 + 0.25;
103 | actualUV = uint2(zoomUV.xy * float2(gConst.imageWidth, gConst.imageHeight));
104 | }
105 | else
106 | {
107 | actualUV = uint2(inputUV.xy * float2(gConst.imageWidth, gConst.imageHeight));
108 | }
109 |
110 | // Load the texture according to the transformed input UVs. This will
111 | // provide the RGB that the model is trying to train towards.
112 | float3 actualRGB = inputTexture[actualUV].rgb;
113 |
114 | // Output the loss, scaled to greyscale for output
115 | uint2 lossUV = uint2(inputUV.xy * float2(gConst.imageWidth, gConst.imageHeight));
116 | const float lossScaleFactor = 10.0f; // scale it up for better vis
117 | lossTexture[lossUV] = float4((predictedRGB - actualRGB) * lossScaleFactor + 0.5, 1);
118 |
119 | // Compute the L2 loss gradient
120 | // L2Loss = (a-b)^2
121 | // L2Loss Derivative = 2(a-b)
122 | float3 lossGradient = 2.0 * (predictedRGB - actualRGB);
123 |
124 | // Scale by batch size
125 | lossGradient /= (batchSize.x * batchSize.y);
126 |
127 | // Apply the LOSS_SCALE factor to retain precision. Remove it in the optimizer pass before use.
128 | lossGradient *= LOSS_SCALE;
129 |
130 | CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> lossGradientCV = CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS>(VECTOR_FORMAT(lossGradient[0]), VECTOR_FORMAT(lossGradient[1]), VECTOR_FORMAT(lossGradient[2]));
131 |
132 | // Back-propagation pass: generate the gradients and accumulate the results into memory to be applied in the optimization pass.
133 | CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputGradient;
134 | CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenGradient;
135 |
136 | // Output layer (loss gradient) to final hidden layer
137 | outputGradient = rtxns::sigmoid_Derivative(outputParams, lossGradientCV);
138 | hiddenGradient = rtxns::LinearOp_Backward(
139 | hiddenActivated[NUM_HIDDEN_LAYERS - 1], outputGradient, gMLPParams, gMLPParamsGradients,
140 | weightOffsets[NUM_HIDDEN_LAYERS], biasOffsets[NUM_HIDDEN_LAYERS], MATRIX_LAYOUT, TYPE_INTERPRETATION);
141 |
142 | // Hidden layer to hidden layer
143 | for(int layer = NUM_HIDDEN_LAYERS - 1; layer >= 1; layer--)
144 | {
145 | hiddenGradient = rtxns::leakyReLU_Derivative(hiddenParams[layer], RELU_LEAK, hiddenGradient);
146 | hiddenGradient = rtxns::LinearOp_Backward
147 | (hiddenActivated[layer - 1], hiddenGradient, gMLPParams, gMLPParamsGradients,
148 | weightOffsets[layer], biasOffsets[layer], MATRIX_LAYOUT, TYPE_INTERPRETATION);
149 | }
150 |
151 | // First hidden layer to input layer
152 | hiddenGradient = rtxns::leakyReLU_Derivative(hiddenParams[0], RELU_LEAK, hiddenGradient);
153 | rtxns::LinearOp_Backward(
154 | inputParams, hiddenGradient, gMLPParams, gMLPParamsGradients, weightOffsets[0],
155 | biasOffsets[0], MATRIX_LAYOUT, TYPE_INTERPRETATION);
156 |
157 | // Store the random state to continue iterating next time.
158 | gRandState[dispatchThreadIdxy] = rng.state;
159 | }
--------------------------------------------------------------------------------
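
training_cs ends by pushing the loss gradient back through the network, and the gradient it starts from is simple: the L2 derivative 2(a - b), averaged over the batch and boosted by LOSS_SCALE so FP16 accumulation does not underflow. In NumPy:

```python
import numpy as np

def l2_loss_gradient(predicted_rgb, actual_rgb, batch=(32, 32), loss_scale=128.0):
    """d/da (a - b)^2 = 2 (a - b), divided by the batch size and scaled by
    LOSS_SCALE; the optimizer pass removes the scale before the update."""
    grad = 2.0 * (np.asarray(predicted_rgb) - np.asarray(actual_rgb))
    grad /= batch[0] * batch[1]
    return grad * loss_scale

g = l2_loss_gradient([0.9, 0.2, 0.4], [1.0, 0.0, 0.5])   # per-channel gradient
```
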
/samples/SimpleTraining/shaders.cfg:
--------------------------------------------------------------------------------
1 | SimpleTraining_Inference.slang -E inference_cs -T cs
2 | SimpleTraining_Training.slang -E training_cs -T cs
3 | SimpleTraining_Optimizer.slang -E adam_cs -T cs
4 | SimpleTraining_Optimizer.slang -E convert_weights_cs -T cs
5 |
--------------------------------------------------------------------------------
/samples/SlangpyTraining/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |
10 | include(../../external/donut/compileshaders.cmake)
11 |
12 | set(shader_includes
13 | ${SAMPLES_SHADER_INCLUDE_DIR}
14 | ${CMAKE_CURRENT_LIST_DIR}
15 | )
16 |
17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" )
18 |
19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " )
20 |
21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " )
22 |
23 | set(project SlangpyTraining)
24 | set(folder "Samples/SlangpyTraining")
25 |
26 | file(GLOB_RECURSE ${project}_shaders "*.hlsl" "*.hlsli" "*.slang")
27 | file(GLOB_RECURSE ${project}_sources "*.cpp" "*.h" "*.md")
28 |
29 | donut_compile_shaders_all_platforms(
30 | TARGET ${project}_shaders
31 | CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg
32 | INCLUDES ${shader_includes}
33 | FOLDER ${folder}
34 | OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project}
35 | SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS}
36 | SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV}
37 | SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL}
38 | SOURCES ${${project}_shaders}
39 | SLANG
40 | )
41 |
42 | add_executable(${project} WIN32 ${${project}_sources})
43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils)
44 | add_dependencies(${project} ${project}_shaders)
45 | set_target_properties(${project} PROPERTIES FOLDER ${folder})
46 |
47 | if (MSVC)
48 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP")
49 | endif()
--------------------------------------------------------------------------------
/samples/SlangpyTraining/Helpers.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | #
3 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # NVIDIA CORPORATION and its licensors retain all intellectual property
6 | # and proprietary rights in and to this software, related documentation
7 | # and any modifications thereto. Any use, reproduction, disclosure or
8 | # distribution of this software and related documentation without an express
9 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
10 | #
11 | from slangpy.backend import Device, DeviceType, TextureLoader, Bitmap, SlangCompilerOptions
12 | import slangpy as spy
13 | from pathlib import Path
14 | from typing import Any, Union
15 | import subprocess
16 | import os
17 |
18 | from NeuralModules import CoopVecModule
19 |
20 | class SDKSample:
21 | def __init__(self, args: list[str]):
22 | super().__init__()
23 |
24 | # Set up directories to find includes and executables
25 | self.spy_dir = Path(spy.__file__).parent / "slang"
26 | self.sdk_root = Path(__file__).parent.parent.parent
27 | self.sdk_data_dir = self.sdk_root / "assets/data"
28 | self.rtxns_dir = self.sdk_root / "src/NeuralShading_Shaders"
29 | self.spy_sample_dir = self.sdk_root / "samples/SlangpyTraining"
30 | self.donut_dir = self.sdk_root / "external/donut/include"
31 | self.slang_compiler = self.sdk_root / "bin/slangc.bat"
32 |
33 | search_root = self.sdk_root / "bin"
34 | bin_ext = ".exe" if os.name == "nt" else ""
35 | inference_candidates = [f for f in search_root.glob(f"**/SlangpyTraining{bin_ext}") if f.is_file()]
36 | shadermake_candidates = [f for f in search_root.glob(f"**/ShaderMake{bin_ext}") if f.is_file()]
37 |
38 | if len(inference_candidates) == 0:
39 | print(f"Warning: Could not find SlangpyTraining executable within {search_root}. "
40 | "C++ sample will not be launched after training.")
41 | self.inference_sample_path = None
42 | else:
43 | self.inference_sample_path = inference_candidates[0]
44 | if len(inference_candidates) > 1:
45 | print(f"Warning: Found multiple possible SlangpyTraining executables. Picking {self.inference_sample_path}")
46 | else:
47 | print(f"Found SlangpyTraining executable at {self.inference_sample_path}")
48 |
49 | if len(shadermake_candidates) == 0:
50 | print(f"Warning: Could not find ShaderMake executable within {search_root}. "
51 | "C++ sample will not be launched after training.")
52 | self.shadermake_path = None
53 | else:
54 | self.shadermake_path = shadermake_candidates[0]
55 | if len(shadermake_candidates) > 1:
56 | print(f"Warning: Found multiple possible ShaderMake executables. Picking {self.shadermake_path}")
57 | else:
58 | print(f"Found ShaderMake executable at {self.shadermake_path}")
59 |
60 | self.include_dirs = [
61 | self.rtxns_dir,
62 | self.spy_dir,
63 | self.spy_sample_dir
64 | ]
65 |
66 | for field in ("spy_dir", "sdk_root", "sdk_data_dir", "rtxns_dir", "spy_sample_dir", "donut_dir", "slang_compiler"):
67 | path: Path = getattr(self, field)
68 | if not path.exists():
69 | print(f"Warning: Can't find path {field} at {path}. This may cause errors.")
70 |
71 | self.device = self._create_device()
72 |
73 | # Create an sgl device and set up default include directories
74 | def _create_device(self):
75 | device = Device(
76 | type=DeviceType.vulkan,
77 | compiler_options=SlangCompilerOptions({
78 | "include_paths": self.include_dirs,
79 | "disable_warnings": [
80 | "41018", # Overzealous uninitialized-out-parameter warning
81 | "41012" # Coop vec capability warning
82 | ]
83 | }),
84 | )
85 |
86 | print("Selected adapter", device.info.adapter_name)
87 |
88 | return device
89 |
90 | def load_texture(self, path: Union[str,Path]):
91 | bmp = Bitmap(self.sdk_data_dir / path)
92 | loader = TextureLoader(self.device)
93 | target_tex = loader.load_texture(bmp, {"load_as_normalized": True})
94 | return target_tex
95 |
96 | # Take a trained model, distill it into shader defines, and compile the inference shader
97 | def compile_inference_shader(self, model: CoopVecModule):
98 | if self.inference_sample_path is None or self.shadermake_path is None:
99 | print("Missing executables, skipping compilation.")
100 | return
101 |
102 | if len(model.parameters()) > 1:
103 | raise ValueError("Shader generation only supports a single parameter buffer")
104 |
105 | defines = [
106 | ("MODEL_TYPE", f'"{model.inference_type_name}"'),
107 | ("MODEL_INITIALIZER", f'"{model.get_initializer()}"'),
108 | ("VECTOR_FORMAT", model.elem_name),
109 | ]
110 |
111 | self.compile_shader("SlangpyInference.slang", defines)
112 |
113 | def compile_shader(self, shader_path: str, defines: list[Union[str,tuple[str, Any]]]):
114 | config_path = self.spy_sample_dir / "trained_shaders.cfg"
115 | with open(config_path, "w") as file:
116 | file.write(f"{shader_path} -E main_cs -T cs")
117 |
118 | output_path = self.inference_sample_path.parent / "shaders/SlangpyTraining/spirv"
119 |
120 | args = [
121 | self.shadermake_path,
122 | "--config", config_path,
123 | "-o", output_path,
124 | "--compiler", self.slang_compiler,
125 | "--platform", "SPIRV",
126 | "--flatten",
127 | "--binaryBlob",
128 | "--outputExt", ".bin",
129 | "--slang",
130 | "--tRegShift", "0",
131 | "--sRegShift", "128",
132 | "--bRegShift", "256",
133 | "--uRegShift", "384",
134 | "--vulkanVersion", "1.2",
135 | "--matrixRowMajor",
136 | "--force",
137 | "-X", "-capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV",
138 | ]
139 | for d in defines + ["SPIRV", "TARGET_VULKAN"]:
140 | if isinstance(d, str):
141 | args.extend(("-D", d))
142 | else:
143 | args.extend(("-D", f"{d[0]}={d[1]}"))
144 |
145 | for include_dir in self.include_dirs + [self.donut_dir]:
146 | args.extend(("-I", include_dir))
147 |
148 | result = subprocess.run(args, text=True, capture_output=True)
149 | if result.stderr:
150 | raise RuntimeError(f"ShaderMake exited with errors: {result.stderr}")
151 | stdout = str(result.stdout)
152 | if stdout.find(": error") != -1:
153 | raise RuntimeError(f"slang compiler exited with errors: {stdout}")
154 |
155 | def run_sdk_inference(self, model_weights: Path):
156 | if self.inference_sample_path is None or self.shadermake_path is None:
157 | print("Missing executables, skipping C++ sample.")
158 | return
159 |
160 | subprocess.run([self.inference_sample_path, model_weights])
161 |
--------------------------------------------------------------------------------
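
For context, the helper above is consumed by SlangpyTraining.py roughly like this (a usage sketch, not additional sample code):

```python
import sys
from Helpers import SDKSample

sample = SDKSample(sys.argv[1:])                 # locates SDK paths, creates device
target = sample.load_texture("nvidia-logo.png")  # resolved under assets/data
# ... train a model, then:
# sample.compile_inference_shader(best_model)
# sample.run_sdk_inference(weight_path)
```
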
/samples/SlangpyTraining/NetworkConfig.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #define MAX_LAYER_COUNT 8
12 | #define MAX_LAYER_COUNT_ALIGN4 ((MAX_LAYER_COUNT + 3) / 4)
13 |
14 | // These defines will be overridden by SlangpyTraining.py with the
15 | // chosen network architecture. However, if we compile this file
16 | // from scratch, we provide a default architecture here so the sample
17 | // runs. We provide the trained weights for this network under
18 | // assets/data/slangpy-weights.json
19 | #ifndef MODEL_TYPE
20 | #define MODEL_TYPE \
21 | rtxns::ModuleChain, \
22 | rtxns::InferenceMLPModule, rtxns::mlp::SigmoidAct>>
23 |
24 | #define MODEL_INITIALIZER \
25 | { \
26 | {}, \
27 | { \
28 | weights, { wo[0], wo[1], wo[2], wo[3], wo[4] }, { bo[0], bo[1], bo[2], bo[3], bo[4] }, { 0.01h }, \
29 | { \
30 | } \
31 | } \
32 | }
33 | #define VECTOR_FORMAT half
34 | #endif
35 |
36 | struct NeuralConstants
37 | {
38 | uint4 weightOffsets[MAX_LAYER_COUNT_ALIGN4];
39 | uint4 biasOffsets[MAX_LAYER_COUNT_ALIGN4];
40 |
41 | uint32_t imageWidth;
42 | uint32_t imageHeight;
43 | };
44 |
--------------------------------------------------------------------------------
/samples/SlangpyTraining/NeuralModules.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 | __exported import CooperativeVectorDerivatives;
11 | __exported import CooperativeVectorFunctions;
12 | __exported import CooperativeVectorAutoDiff;
13 | __exported import Optimizers;
14 | __exported import Utils;
15 | __exported import LinearOps;
16 | __exported import MLP;
17 | __exported import Activation;
18 |
19 | namespace rtxns
20 | {
21 | ////////////////////////
22 | //
23 | // Root interface for neural modules and implementations of several
24 | // Takes a CoopVec of type T with NumInputs elements and returns NumOutputs elements
25 | // Several RTXNS classes and functions are wrapped here to conform to the IModule interface
26 | // This lets you build network architectures with generic types
27 | //
28 | ////////////////////////
29 | interface IModule<T : __BuiltinFloatingPointType, let NumInputs : int, let NumOutputs : int>
30 | {
31 | [BackwardDifferentiable]
32 | CoopVec<T, NumOutputs> forward(CoopVec<T, NumInputs> inputParams);
33 | }
34 |
35 | // Chain two modules together, i.e. pass the output of the first to the second
36 | // Can be nested arbitrarily
37 | struct ModuleChain<
38 | T : __BuiltinFloatingPointType,
39 | let NumInputs : int,
40 | let NumHidden : int,
41 | let NumOutputs : int,
42 | First : IModule<T, NumInputs, NumHidden>,
43 | Second : IModule<T, NumHidden, NumOutputs>
44 | > : IModule
45 | {
46 | First first;
47 | Second second;
48 |
49 | [BackwardDifferentiable]
50 | CoopVec<T, NumOutputs> forward(CoopVec<T, NumInputs> inputParams)
51 | {
52 | CoopVec<T, NumHidden> middle = first.forward(inputParams);
53 | return second.forward(middle);
54 | }
55 | }
56 |
57 | struct TrainableMLPModule<
58 | T : __BuiltinFloatingPointType,
59 | let NumHiddenLayers : int,
60 | let InputNeurons : int,
61 | let HiddenNeurons : int,
62 | let OutputNeurons : int,
63 | let ComponentType : CoopVecComponentType,
64 | HiddenAct : mlp::IActivation<T, HiddenNeurons>,
65 | OutputAct : mlp::IActivation<T, OutputNeurons>
66 | > : IModule
67 | {
68 | ByteAddressBuffer parameters;
69 | RWByteAddressBuffer derivatives;
70 | uint matrixOffsets[NumHiddenLayers + 1];
71 | uint biasOffsets[NumHiddenLayers + 1];
72 |
73 | HiddenAct hiddenAct;
74 | OutputAct outputAct;
75 |
76 | [BackwardDerivative(backward)]
77 | CoopVec<T, OutputNeurons> forward(CoopVec<T, InputNeurons> inputParams)
78 | {
79 | var mlp = mlp::TrainingMLP<
80 | T,
81 | NumHiddenLayers,
82 | InputNeurons,
83 | HiddenNeurons,
84 | OutputNeurons,
85 | CoopVecMatrixLayout::TrainingOptimal,
86 | ComponentType
87 | >(parameters, derivatives, matrixOffsets, biasOffsets);
88 | return mlp.forward(inputParams, hiddenAct, outputAct);
89 | }
90 |
91 | void backward(inout DifferentialPair<CoopVec<T, InputNeurons>> inputParams, const CoopVec<T, OutputNeurons> dOutputActivated)
92 | {
93 | var mlp = mlp::TrainingMLP<
94 | T,
95 | NumHiddenLayers,
96 | InputNeurons,
97 | HiddenNeurons,
98 | OutputNeurons,
99 | CoopVecMatrixLayout::TrainingOptimal,
100 | ComponentType
101 | >(parameters, derivatives, matrixOffsets, biasOffsets);
102 |
103 | mlp.backward(inputParams, hiddenAct, outputAct, dOutputActivated);
104 | }
105 | }
106 |
107 | struct InferenceMLPModule<
108 | T : __BuiltinFloatingPointType,
109 | let NumHiddenLayers : int,
110 | let InputNeurons : int,
111 | let HiddenNeurons : int,
112 | let OutputNeurons : int,
113 | let ComponentType : CoopVecComponentType,
114 | HiddenAct : mlp::IActivation<T, HiddenNeurons>,
115 | OutputAct : mlp::IActivation<T, OutputNeurons>
116 | > : IModule
117 | {
118 | ByteAddressBuffer parameters;
119 | uint matrixOffsets[NumHiddenLayers + 1];
120 | uint biasOffsets[NumHiddenLayers + 1];
121 |
122 | HiddenAct hiddenAct;
123 | OutputAct outputAct;
124 |
125 | [TreatAsDifferentiable]
126 | CoopVec<T, OutputNeurons> forward(CoopVec<T, InputNeurons> inputParams)
127 | {
128 | var mlp = mlp::InferenceMLP<
129 | T,
130 | NumHiddenLayers,
131 | InputNeurons,
132 | HiddenNeurons,
133 | OutputNeurons,
134 | CoopVecMatrixLayout::InferencingOptimal,
135 | ComponentType
136 | >(parameters, matrixOffsets, biasOffsets);
137 | return mlp.forward(inputParams, hiddenAct, outputAct);
138 | }
139 | }
140 |
141 | struct FrequencyEncoding<T : __BuiltinFloatingPointType, let NumInputs : int, let NumOutputs : int> : IModule<T, NumInputs, NumOutputs>
142 | {
143 | [BackwardDifferentiable]
144 | CoopVec<T, NumOutputs> forward(CoopVec<T, NumInputs> inputParams)
145 | {
146 | return rtxns::EncodeFrequencyN(inputParams);
147 | }
148 | }
149 | }
150 |
--------------------------------------------------------------------------------
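
The IModule/ModuleChain pattern above is ordinary functional composition over CoopVec widths. A Python analogue of the same idea (illustrative only; the repo's real Python counterpart is NeuralModules.py, which is not shown in this section):

```python
class Module:
    """Maps a vector of fan_in elements to one of fan_out elements."""
    fan_in: int
    fan_out: int
    def forward(self, x):
        raise NotImplementedError

class Chain(Module):
    """Python analogue of rtxns::ModuleChain: feed the first module's
    output into the second. Chains nest arbitrarily."""
    def __init__(self, first: Module, second: Module):
        assert first.fan_out == second.fan_in, "module widths must agree"
        self.first, self.second = first, second
        self.fan_in, self.fan_out = first.fan_in, second.fan_out
    def forward(self, x):
        return self.second.forward(self.first.forward(x))
```
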
/samples/SlangpyTraining/SlangpyInference.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 |
14 | import SlangpyTraining;
15 |
16 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
17 | ByteAddressBuffer gMLPParams :REGISTER_SRV(0, 0);
18 | Texture2D inputTexture :REGISTER_SRV(1, 0);
19 | RWTexture2D outputTexture :REGISTER_UAV(0, 0);
20 |
21 | float3 evalModel(ByteAddressBuffer weights, uint wo[MAX_LAYER_COUNT], uint bo[MAX_LAYER_COUNT], float2 uv)
22 | {
23 | // Auto-generated defines from SlangpyTraining.py
24 | MODEL_TYPE model = MODEL_INITIALIZER;
25 |
26 | let inputParams = rtxns::CoopVecFromVector(uv);
27 |
28 | let result = model.forward(inputParams);
29 |
30 | return rtxns::VectorFromCoopVec(result);
31 | }
32 |
33 | [shader("compute")]
34 | [numthreads(8, 8, 1)]
35 | void main_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
36 | {
37 | // Get the UV coordinate from the thread ID
38 | float2 inputUV = float2(dispatchThreadID.x / float(gConst.imageWidth), dispatchThreadID.y / float(gConst.imageHeight));
39 |
40 | // Load offsets
41 | uint weightOffsets[MAX_LAYER_COUNT] = rtxns::UnpackArray(gConst.weightOffsets);
42 | uint biasOffsets[MAX_LAYER_COUNT] = rtxns::UnpackArray(gConst.biasOffsets);
43 |
44 | // Run the model
45 | float3 modelOutput = evalModel(gMLPParams, weightOffsets, biasOffsets, inputUV);
46 |
47 | // Write to output
48 | outputTexture[dispatchThreadID.xy] = float4(modelOutput, 1.0f);
49 | }
--------------------------------------------------------------------------------
/samples/SlangpyTraining/SlangpyTraining.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |
10 | from slangpy.backend import DataType
11 | from slangpy.core.module import Module
12 | from slangpy.types import NDBuffer, call_id
13 | import numpy as np
14 | import json
15 | import math
16 | import time
17 | import sys
18 |
19 | from Helpers import SDKSample
20 | from NeuralModules import CoopVecModule, TrainableMLP, FrequencyEncoding, ModuleChain
21 | from NeuralModules import Activation, NoneAct, LinearAct, ExponentialAct, ShiftedExponentialAct, ReLUAct, LeakyReLUAct, SigmoidAct, SwishAct, TanhAct
22 |
23 | # Set to True for interactive training. This can be helpful,
24 | # but it slows down training quite a bit.
25 | INTERACTIVE = True
26 | if INTERACTIVE:
27 | import matplotlib.pyplot as plt
28 |
29 | def training_main():
30 | ##
31 | ## Setup window, device and file paths
32 | ##
33 | sample = SDKSample(sys.argv[1:])
34 | device = sample.device
35 |
36 | ##
37 | ## Set up training constants.
38 | ## When we train interactively, choose smaller batches
39 | ## for faster feedback.
40 | ##
41 | batch_shape = (256, 256)
42 | learning_rate = 0.005
43 | grad_scale = 128.0
44 | loss_scale = grad_scale / math.prod(batch_shape)
45 |
46 | sample_target = 1000000000
47 | num_batches_per_epoch = 1000 if INTERACTIVE else 5000
48 | num_epochs = sample_target // (num_batches_per_epoch * math.prod(batch_shape))
49 |
50 | ##
51 | ## Set up models
52 | ##
53 |
54 | # A basic MLP with ReLU activations and a linear output that maps a 2D UV input
55 | # to an RGB color. This is a good baseline, but it won't achieve state-of-the-art
56 | basic_mlp = TrainableMLP(device, DataType.float16,
57 | num_hidden_layers=3,
58 | input_width=2,
59 | hidden_width=32,
60 | output_width=3,
61 | hidden_act=ReLUAct(),
62 | output_act=NoneAct())
63 |
64 | # Replacing ReLU with LeakyReLU makes training more stable for small networks,
65 | # and a Sigmoid activation at the output helps bring the network into the right range
66 | better_activations = TrainableMLP(device, DataType.float16,
67 | num_hidden_layers=3,
68 | input_width=2,
69 | hidden_width=32,
70 | output_width=3,
71 | hidden_act=LeakyReLUAct(),
72 | output_act=SigmoidAct())
73 |
74 | # For 2D or 3D inputs, we can do even better with an input encoding
75 | # We need to adjust the input width of the MLP to take the additional
76 | # outputs from the encoding
77 | encoding = FrequencyEncoding(DataType.float16, 2, 3)
78 | mlp_with_encoding = ModuleChain(
79 | encoding,
80 | TrainableMLP(device, DataType.float16,
81 | num_hidden_layers=3,
82 | input_width=encoding.fan_out,
83 | hidden_width=32,
84 | output_width=3,
85 | hidden_act=LeakyReLUAct(),
86 | output_act=SigmoidAct())
87 | )
88 |
89 | # We're not limited to predefined modules - for example, try using the custom
90 | # activation from the slang file:
91 | activation = SigmoidAct()
92 | #activation = Activation("SiLUActivation")
93 |
94 | # Now take the working model and scale up the number of weights by adding another layer
95 | larger_mlp = ModuleChain(
96 | encoding,
97 | TrainableMLP(device, DataType.float16,
98 | num_hidden_layers=4,
99 | input_width=encoding.fan_out,
100 | hidden_width=32,
101 | output_width=3,
102 | hidden_act=LeakyReLUAct(),
103 | output_act=activation)
104 | )
105 |
106 | # Make a list of models to be optimized so we can compare them
107 | models = [
108 | ("Basic MLP", basic_mlp),
109 | ("+Better activations", better_activations),
110 | ("+Frequency encoding", mlp_with_encoding),
111 | ("+More Weights", larger_mlp),
112 | ]
113 |
114 | # You can also play with different losses. For images, L2 is not a bad default
115 | loss_name = "rtxns::mlp::L2"
116 |
117 | ##
118 | ## Load training data and slang code
119 | ##
120 | target_tex = sample.load_texture("nvidia-logo.png")
121 |
122 | module = Module.load_from_file(device, "SlangpyTraining.slang")
123 |
124 | # Instantiate the slang RNG from the loaded module,
125 | # seeded with a random buffer of uints
126 | pcg = np.random.PCG64(seed=12345)
127 | seeds = pcg.random_raw(batch_shape).astype(np.uint32)
128 | rng = module.RNG(seeds)
129 |
130 | # Fill a buffer with UVs for later evaluating the model during training
131 | vis_resolution = 256
132 | span = np.linspace(0, 1, vis_resolution, dtype=np.float32)
133 | vis_uvs_np = np.stack(np.broadcast_arrays(span[None, :], span[:, None]), axis=2)
134 | vis_uvs = NDBuffer(device, module.float2.struct, shape=(vis_resolution, vis_resolution))
135 | vis_uvs.copy_from_numpy(vis_uvs_np)
136 |
137 | # Create a figure to fill out as we go
138 | if INTERACTIVE:
139 | n = len(models)
140 | fig, axes = plt.subplots(2, n, dpi=200, figsize=(2.4 * n, 4.8), squeeze=False)
141 | plt.ion()
142 | plt.show()
143 |
144 | black = np.zeros((vis_resolution, vis_resolution, 3), dtype=np.uint8)
145 | canvases = []
146 | for i, (model_name, _) in enumerate(models):
147 | axes[0, i].text(0.5, 1.05, f"{model_name}", horizontalalignment='center', size=8)
148 | top = axes[0, i].imshow(black, extent=(0, 1, 0, 1), vmin=0, vmax=1)
149 | bot = axes[1, i].imshow(black, extent=(0, 1, 0, 1), vmin=0, vmax=1)
150 | canvases.append([top, bot])
151 | axes[0, i].set_axis_off()
152 | axes[1, i].set_axis_off()
153 | fig.tight_layout(h_pad=-1, w_pad=0.5)
154 |
155 |
156 | for i, (model_name, model) in enumerate(models):
157 | print(f"Training model {model_name}")
158 |
159 | assert len(model.parameters()) == 1, "Only one set of parameters is supported in this sample"
160 | assert model.fan_in == 2 and model.fan_out == 3, "Model must have 2 inputs (UV) and 3 outputs (RGB)"
161 |
162 | ##
163 | ## Set up optimizer and specialize the slang functions to our model
164 | ##
165 | grads = model.gradients()[0]
166 | parameters = model.parameters()[0]
167 |
168 | parametersF = module.ConvertToFloat(parameters)
169 |
170 | # These match up with the argument names of OptimizerStep in SlangpyTraining.slang
171 | optimizer_state = {
172 | "moments1": NDBuffer.zeros_like(parametersF),
173 | "moments2": NDBuffer.zeros_like(parametersF),
174 | "paramF": parametersF,
175 | "paramH": parameters,
176 | "grad": grads,
177 | "learningRate": learning_rate,
178 | "gradScale": grad_scale
179 | }
180 | num_params = parameters.shape[0]
181 |
182 | # Specialize slang functions by substituting generic parameters
183 | optimizer_step = module.OptimizerStep
184 | train_texture = module[f"TrainTexture<{model.type_name}, {loss_name} >"]
185 | eval_model = module[f"EvalModel<{model.type_name} >"]
186 | eval_loss = module[f"EvalLoss<{loss_name} >"]
187 |
188 | # Begin main training loop
189 | iteration = 1
190 | for epoch in range(num_epochs):
191 | start = time.time()
192 |
193 | cmd = device.create_command_buffer()
194 | cmd.open()
195 | # Each batch is submitted to a command buffer
196 | for batch in range(num_batches_per_epoch):
197 | # Compute gradients
198 | train_texture.append_to(cmd, model, rng, target_tex, loss_scale)
199 | # Do one parameter optimization step using those gradients
200 | optimizer_step.append_to(cmd, idx=call_id((num_params, )), iteration=iteration, **optimizer_state)
201 | iteration += 1
202 | cmd.close()
203 | device.submit_command_buffer(cmd)
204 | device.wait()
205 | end = time.time()
206 |
207 | device.run_garbage_collection()
208 |
209 | # Print out progress info
210 | elapsed = end - start
211 | num_samples_per_epoch = math.prod(batch_shape) * num_batches_per_epoch
212 | progress = (num_samples_per_epoch * (epoch + 1)) // 1000000
213 | info = (f"Epoch {epoch + 1} complete, "
214 | f"{progress}/{sample_target // 1000000} MSamples: "
215 | f"Time: {elapsed:.3f}s "
216 | f"Throughput: {num_samples_per_epoch / elapsed * 1e-6:.2f} MSamples/s")
217 |
218 | # In the interactive case, draw updates to window and compute loss. This goes
219 | # through the CPU, so this is quite slow
220 | if INTERACTIVE:
221 | current_prediction = eval_model(model, vis_uvs, _result=np.ndarray)
222 | loss_val = np.mean(eval_loss(vis_uvs, current_prediction, target_tex, _result=np.ndarray))
223 | diff = module.TextureDifference(vis_uvs, current_prediction, target_tex, 10.0, _result=np.ndarray)
224 |
225 | info += f" Loss: {loss_val:.3f}"
226 |
227 | current_prediction = np.clip(current_prediction, 0, 1)
228 | diff = np.clip(diff, 0, 1)
229 |
230 | canvases[i][0].set_data(current_prediction)
231 | canvases[i][1].set_data(diff)
232 | fig.canvas.draw()
233 | fig.canvas.flush_events()
234 |
235 | print(info)
236 |
237 | print("Training complete!")
238 |
239 | best_model = models[-1][1]
240 |
241 | weight_path = sample.spy_sample_dir / "weights.json"
242 | print(f"Writing trained weights of best model to {weight_path}")
243 | param_dict = best_model.serialize()
244 | open(weight_path, "w").write(json.dumps(param_dict, indent=4))
245 |
246 | print(f"Compiling inference shader...")
247 | sample.compile_inference_shader(best_model)
248 |
249 | print(f"Running RTXNS inference...")
250 | if INTERACTIVE:
251 | plt.close()
252 | sample.run_sdk_inference(weight_path)
253 |
254 | if __name__ == "__main__":
255 | training_main()
256 |
--------------------------------------------------------------------------------
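
The epoch arithmetic at the top of training_main is worth a quick sanity check for the interactive configuration:

```python
import math

batch_shape = (256, 256)                      # 65,536 samples per batch
num_batches_per_epoch = 1000                  # interactive setting
sample_target = 1_000_000_000
loss_scale = 128.0 / math.prod(batch_shape)   # grad_scale spread over the batch

samples_per_epoch = num_batches_per_epoch * math.prod(batch_shape)
num_epochs = sample_target // samples_per_epoch
assert samples_per_epoch == 65_536_000 and num_epochs == 15
```
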
/samples/SlangpyTraining/SlangpyTraining.slang:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: Apache-2.0
2 | // clang-format off
3 |
4 | /*
5 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
6 | *
7 | * NVIDIA CORPORATION and its licensors retain all intellectual property
8 | * and proprietary rights in and to this software, related documentation
9 | * and any modifications thereto. Any use, reproduction, disclosure or
10 | * distribution of this software and related documentation without an express
11 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
12 | */
13 |
14 | __exported import NeuralModules;
15 | __exported import Loss;
16 | __exported import Optimizers;
17 |
18 | struct RNG
19 | {
20 | uint state;
21 |
22 | __init(uint state) { this.state = state; }
23 |
24 | [mutating]
25 | float next()
26 | {
27 | float r = (state >> 8) * 0x1p-24;
28 | state = state * 2739110765U + 2739110765U;
29 | return r;
30 | }
31 | }
32 |
33 | // An example of adding a custom activation to your network
34 | // This implements the Sigmoid Linear Unit (SiLU)
35 | struct SiLUActivation<T : __BuiltinFloatingPointType, let N : int> : rtxns::mlp::IActivation<T, N>
36 | {
37 | [Differentiable]
38 | CoopVec<T, N> eval(CoopVec<T, N> x)
39 | {
40 | return x * no_diff CoopVec<T, N>(T(1.)) / (no_diff CoopVec<T, N>(T(1.)) + exp(-x));
41 | }
42 | }
43 |
44 | // Utility function for nearest-neighbor sampling of texture
45 | T SampleTexture<T>(Texture2D<T> tex, float2 uv)
46 | {
47 | float2 size;
48 | tex.GetDimensions(size[0], size[1]);
49 | uint2 xy = uint2(uv * size);
50 | return tex[xy];
51 | }
52 |
53 | // Take one step with the adam optimizer
54 | void OptimizerStep(
55 | RWBuffer<float> moments1,
56 | RWBuffer<float> moments2,
57 | RWBuffer<float> paramF,
58 | RWBuffer<half> paramH,
59 | RWBuffer<half> grad,
60 | uint idx,
61 | float learningRate,
62 | float gradScale,
63 | int iteration)
64 | {
65 | var optimizer = optimizers::Adam(moments1, moments2, learningRate, gradScale);
66 |
67 | // Parameters are converted to FP16 for computing gradients,
68 | // but we keep the FP32 originals around so we don't accumulate
69 | // rounding errors
70 | float parameter = paramF[idx];
71 | float gradient = (float)grad[idx];
72 |
73 | parameter = optimizer.step(parameter, idx, gradient, iteration);
74 |
75 | // Update the reference FP32 parameter, and convert the new value back to FP16
76 | paramF[idx] = parameter;
77 | paramH[idx] = (half)parameter;
78 | // Zero out gradients
79 | grad[idx] = 0.0h;
80 | }
81 |
82 | void TrainTexture<Model : IModule<half, 2, 3>, Loss : rtxns::mlp::ILoss<float, 3>>(Model model, inout RNG rng, Texture2D<float4> targetTex, float lossScale)
83 | {
84 | // Get a random uv coordinate for the input
85 | float2 inputUV = clamp(float2(rng.next(), rng.next()), 0.0, 1.0);
86 |
87 | // Sample the target texture at the generated UV
88 | float3 targetRGB = SampleTexture(targetTex, inputUV).rgb;
89 |
90 | // Evaluate the current output of the model
91 | float3 predictedRGB = EvalModel(model, inputUV);
92 |
93 | // Evaluate the loss gradient
94 | float3 lossGradient = Loss.deriv(targetRGB, predictedRGB, lossScale);
95 |
96 | // Backpropagate the gradient through the network parameters
97 | bwd_diff(EvalModel)(model, inputUV, lossGradient);
98 | }
99 |
100 | // Convenience functions for evaluating the model from vector inputs
101 | // Converts to/from CoopVec internally
102 | [Differentiable]
103 | float3 EvalModel<Model : IModule<half, 2, 3>>(Model model, no_diff float2 inputUV)
104 | {
105 | var inputVec = rtxns::CoopVecFromVector<half>(inputUV);
106 |
107 | var result = model.forward(inputVec);
108 |
109 | return rtxns::VectorFromCoopVec(result);
110 | }
111 |
112 | // Computes the loss between the predicted RGB at a given UV coordinate and a reference texture
113 | float3 EvalLoss<Loss : rtxns::mlp::ILoss<float, 3>>(float2 inputUV, float3 predictedRGB, Texture2D<float4> targetTex)
114 | {
115 | float3 targetRGB = SampleTexture(targetTex, inputUV).rgb;
116 |
117 | return Loss.value(targetRGB, predictedRGB, 1.0f);
118 | }
119 |
120 | // Computes the difference between the predicted RGB at a given UV coordinate and a reference texture
121 | // for visualization
122 | float3 TextureDifference(float2 inputUV, float3 predictedRGB, Texture2D<float4> targetTex, float scale)
123 | {
124 | float3 targetRGB = SampleTexture(targetTex, inputUV).rgb;
125 |
126 | return (predictedRGB - targetRGB) * scale + 0.5f;
127 | }
128 |
129 | // Convenience function to convert from half to float params
130 | float ConvertToFloat(half paramH)
131 | {
132 | return (float)paramH;
133 | }
134 |
--------------------------------------------------------------------------------
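
OptimizerStep above illustrates the mixed-precision pattern used throughout these samples: gradients and network weights live in FP16, but an FP32 master copy of each parameter is what the optimizer actually updates, so rounding error does not accumulate step over step. A rough NumPy sketch of that idea using a textbook Adam update (the hyperparameters are illustrative and not read from Optimizers.slang):

import numpy as np

def adam_step(param_f32, grad_f16, m1, m2, it, lr=1e-3, b1=0.9, b2=0.999, eps=1e-8):
    g = grad_f16.astype(np.float32)       # promote the FP16 gradient
    m1[:] = b1 * m1 + (1 - b1) * g        # first-moment estimate
    m2[:] = b2 * m2 + (1 - b2) * g * g    # second-moment estimate
    m1_hat = m1 / (1 - b1 ** it)          # bias correction, it >= 1
    m2_hat = m2 / (1 - b2 ** it)
    param_f32 -= lr * m1_hat / (np.sqrt(m2_hat) + eps)  # update FP32 master
    return param_f32.astype(np.float16)   # FP16 mirror for the network

params = np.zeros(8, dtype=np.float32)
m1, m2 = np.zeros_like(params), np.zeros_like(params)
params_h = adam_step(params, np.full(8, 0.25, np.float16), m1, m2, it=1)
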
/samples/SlangpyTraining/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib>=3.0,<4.0
2 | numpy>=2.0,<3.0
3 | slangpy==0.19.4
4 |
--------------------------------------------------------------------------------
/samples/SlangpyTraining/shaders.cfg:
--------------------------------------------------------------------------------
1 | SlangpyInference.slang -E main_cs -T cs
--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |
10 | set(LIBRARY_FILTER src)
11 | add_subdirectory(NeuralShading)
12 | add_subdirectory(Utils)
13 | add_subdirectory(NeuralShading_Shaders)
14 |
15 |
--------------------------------------------------------------------------------
/src/NeuralShading/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |
10 |
11 | file(GLOB sources "*.cpp" "*.h")
12 |
13 | set(project NeuralShading)
14 | set(folder "${LIBRARY_FILTER}/NeuralShading")
15 |
16 | add_library(${project} STATIC EXCLUDE_FROM_ALL ${sources})
17 | target_include_directories(${project} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
18 | target_link_libraries(${project} donut_app donut_engine)
19 |
20 | set_target_properties(${project} PROPERTIES
21 | FOLDER ${folder}
22 | )
23 |
--------------------------------------------------------------------------------
/src/NeuralShading/CoopVector.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "CoopVector.h"
12 | #include <cassert>
13 |
14 | #if DONUT_WITH_VULKAN
15 | #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
16 | #include <nvrhi/vulkan.h>
17 | #endif
18 |
19 | #if DONUT_WITH_DX12
20 | #include <nvrhi/d3d12.h>
21 | #include <wrl.h>
22 | #endif
23 |
24 | using namespace rtxns;
25 |
26 | namespace
27 | {
28 | /**
29 | * Bytes between consecutive rows (row-major) or columns (column-major).
30 | * The stride is only meaningful for row/column-major layouts.
31 | **/
32 | size_t GetStride(const MatrixLayout layout, const uint32_t rows, const uint32_t cols, const size_t precision)
33 | {
34 | size_t stride = 0;
35 | if (layout == MatrixLayout::RowMajor)
36 | {
37 | stride = cols * precision;
38 | }
39 | else if (layout == MatrixLayout::ColumnMajor)
40 | {
41 | stride = rows * precision;
42 | }
43 | return stride;
44 | }
45 | } // namespace
46 |
47 | #if DONUT_WITH_VULKAN
48 | namespace
49 | {
50 |
51 | VkComponentTypeKHR GetVkComponentType(rtxns::Precision precision)
52 | {
53 | return precision == rtxns::Precision::F16 ? VK_COMPONENT_TYPE_FLOAT16_NV : VK_COMPONENT_TYPE_FLOAT32_NV;
54 | }
55 |
56 | VkCooperativeVectorMatrixLayoutNV GetVkLayout(const MatrixLayout layout)
57 | {
58 | switch (layout)
59 | {
60 | case MatrixLayout::RowMajor:
61 | return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_ROW_MAJOR_NV;
62 | case MatrixLayout::ColumnMajor:
63 | return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_COLUMN_MAJOR_NV;
64 | case MatrixLayout::InferencingOptimal:
65 | return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_INFERENCING_OPTIMAL_NV;
66 | case MatrixLayout::TrainingOptimal:
67 | return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_TRAINING_OPTIMAL_NV;
68 | default:
69 | return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_MAX_ENUM_NV;
70 | }
71 | }
72 |
73 | VkConvertCooperativeVectorMatrixInfoNV GetVkConvertLayerDesc(
74 | int rows, int columns, Precision precision, MatrixLayout srcLayout, MatrixLayout dstLayout, size_t srcSize, size_t* dstSize, uint64_t srcData = 0, uint64_t dstData = 0)
75 | {
76 | VkConvertCooperativeVectorMatrixInfoNV info{};
77 | info.sType = VK_STRUCTURE_TYPE_CONVERT_COOPERATIVE_VECTOR_MATRIX_INFO_NV;
78 | info.pNext = nullptr;
79 | info.numRows = rows;
80 | info.numColumns = columns;
81 | info.srcComponentType = GetVkComponentType(precision);
82 | info.srcLayout = GetVkLayout(srcLayout);
83 | info.srcStride = GetStride(MatrixLayout::RowMajor, rows, columns, GetSize(precision));
84 | info.srcSize = srcSize;
85 | info.srcData.deviceAddress = srcData;
86 | info.dstComponentType = GetVkComponentType(precision);
87 | info.dstLayout = GetVkLayout(dstLayout);
88 | info.dstStride = GetStride(dstLayout, rows, columns, GetSize(precision));
89 | info.pDstSize = dstSize;
90 | info.dstData.deviceAddress = dstData;
91 | return info;
92 | }
93 |
94 | } // namespace
95 |
96 | CoopVectorUtils_VK::CoopVectorUtils_VK(VkDevice vkDevice)
97 | {
98 | m_vkDevice = vkDevice;
99 | assert(m_vkDevice != VK_NULL_HANDLE && "Failed to get Vulkan device handle from GFX.");
100 |
101 | m_vkConvertCooperativeVectorMatrixNV =
102 | (PFN_vkConvertCooperativeVectorMatrixNV)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkConvertCooperativeVectorMatrixNV");
103 | assert(m_vkConvertCooperativeVectorMatrixNV != nullptr && "Failed to get Vulkan function 'vkConvertCooperativeVectorMatrixNV'.");
104 |
105 | m_vkCmdConvertCooperativeVectorMatrixNV =
106 | (PFN_vkCmdConvertCooperativeVectorMatrixNV)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkCmdConvertCooperativeVectorMatrixNV");
107 | assert(m_vkCmdConvertCooperativeVectorMatrixNV != nullptr && "Failed to get Vulkan function 'vkCmdConvertCooperativeVectorMatrixNV'.");
108 |
109 | m_vkCmdCopyBuffer = (PFN_vkCmdCopyBuffer)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkCmdCopyBuffer");
110 | assert(m_vkCmdCopyBuffer != nullptr && "Failed to get Vulkan function 'vkCmdCopyBuffer'.");
111 |
112 | m_vkGetBufferDeviceAddress = (PFN_vkGetBufferDeviceAddress)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkGetBufferDeviceAddress");
113 | assert(m_vkGetBufferDeviceAddress != nullptr && "Failed to get Vulkan function 'vkGetBufferDeviceAddress'.");
114 | }
115 |
116 | size_t CoopVectorUtils_VK::QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision)
117 | {
118 | assert(m_vkDevice);
119 | assert(m_vkConvertCooperativeVectorMatrixNV);
120 | assert(rows > 0 && rows <= 128 && "Number of rows must be 1..128.");
121 | assert(cols > 0 && cols <= 128 && "Number of columns must be 1..128.");
122 |
123 | size_t requiredSize = 0;
124 |
125 | VkConvertCooperativeVectorMatrixInfoNV info = GetVkConvertLayerDesc(rows, cols, precision, MatrixLayout::RowMajor, layout, 0, &requiredSize);
126 |
127 | VkResult res = m_vkConvertCooperativeVectorMatrixNV(m_vkDevice, &info);
128 | assert(res == VK_SUCCESS && "Call to vkConvertCooperativeVectorMatrixNV failed");
129 | assert(requiredSize > 0 && "Expected matrix size to be larger than zero.");
130 |
131 | return requiredSize;
132 | }
133 |
134 | void CoopVectorUtils_VK::ConvertDeviceMatrixLayout(
135 | NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const
136 | {
137 | VkCommandBuffer vkCmdBuf = static_cast<VkCommandBuffer>(commandList);
138 | VkBuffer vkSrcBuffer = static_cast<VkBuffer>(srcBuffer);
139 | VkBuffer vkDstBuffer = static_cast<VkBuffer>(dstBuffer);
140 |
141 | // Obtain the device addresses of the buffers for the conversion functions
142 | VkBufferDeviceAddressInfo bufferDeviceAddressInfo{};
143 | bufferDeviceAddressInfo.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO;
144 | bufferDeviceAddressInfo.buffer = vkSrcBuffer;
145 | VkDeviceAddress const srcBufferVA = m_vkGetBufferDeviceAddress(m_vkDevice, &bufferDeviceAddressInfo);
146 | bufferDeviceAddressInfo.buffer = vkDstBuffer;
147 | VkDeviceAddress const dstBufferVA = m_vkGetBufferDeviceAddress(m_vkDevice, &bufferDeviceAddressInfo);
148 |
149 | // Convert weights
150 | std::vector<VkConvertCooperativeVectorMatrixInfoNV> convertInfos(srcLayout.networkLayers.size());
151 | for (int i = 0; i < srcLayout.networkLayers.size(); i++)
152 | {
153 | // Weights
154 | size_t dstLayerSize = dstLayout.networkLayers[i].weightSize;
155 | convertInfos[i] =
156 | GetVkConvertLayerDesc(srcLayout.networkLayers[i].outputs, srcLayout.networkLayers[i].inputs, srcLayout.matrixPrecision, srcLayout.matrixLayout, dstLayout.matrixLayout,
157 | srcLayout.networkLayers[i].weightSize, &dstLayerSize, srcBufferVA + srcBufferOffset + srcLayout.networkLayers[i].weightOffset,
158 | dstBufferVA + dstBufferOffset + dstLayout.networkLayers[i].weightOffset);
159 | }
160 | m_vkCmdConvertCooperativeVectorMatrixNV(vkCmdBuf, (uint32_t)convertInfos.size(), convertInfos.data());
161 |
162 | // Copy the bias
163 | std::vector<VkBufferCopy> copyRegions(srcLayout.networkLayers.size());
164 | for (int i = 0; i < srcLayout.networkLayers.size(); i++)
165 | {
166 | copyRegions[i].srcOffset = srcBufferOffset + srcLayout.networkLayers[i].biasOffset;
167 | copyRegions[i].dstOffset = dstBufferOffset + dstLayout.networkLayers[i].biasOffset;
168 | copyRegions[i].size = srcLayout.networkLayers[i].biasSize;
169 | }
170 | m_vkCmdCopyBuffer(vkCmdBuf, vkSrcBuffer, vkDstBuffer, (uint32_t)copyRegions.size(), copyRegions.data());
171 | }
172 | #endif
173 |
174 | #if DONUT_WITH_DX12
175 |
176 | namespace
177 | {
178 | D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT GetDX12MatrixLayout(const MatrixLayout layout)
179 | {
180 | switch (layout)
181 | {
182 | case MatrixLayout::RowMajor:
183 | return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR;
184 | case MatrixLayout::ColumnMajor:
185 | return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR;
186 | case MatrixLayout::InferencingOptimal:
187 | return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL;
188 | case MatrixLayout::TrainingOptimal:
189 | return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL;
190 | }
191 | }
192 |
193 | D3D12_LINEAR_ALGEBRA_DATATYPE GetDX12ComponentType(rtxns::Precision precision)
194 | {
195 | return precision == rtxns::Precision::F16 ? D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 : D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32;
196 | }
197 |
198 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_DEST_INFO GetDX12ConvertLayerDestInfo(int rows, int columns, MatrixLayout layout, Precision precision)
199 | {
200 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_DEST_INFO info{};
201 | info.DestLayout = GetDX12MatrixLayout(layout);
202 | info.NumRows = rows;
203 | info.NumColumns = columns;
204 | info.DestStride = UINT(GetStride(layout, rows, columns, GetSize(precision)));
205 | info.DestDataType = GetDX12ComponentType(precision);
206 | return info;
207 | }
208 |
209 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO GetDX12ConvertLayerDesc(
210 | int rows, int columns, Precision precision, MatrixLayout srcLayout, MatrixLayout dstLayout, size_t srcSize, size_t dstSize, uint64_t srcData, uint64_t dstData)
211 | {
212 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO info{};
213 | info.DestInfo = GetDX12ConvertLayerDestInfo(rows, columns, dstLayout, precision);
214 | info.DestInfo.DestSize = UINT(dstSize);
215 | info.SrcInfo.SrcSize = UINT(srcSize);
216 | info.SrcInfo.SrcDataType = GetDX12ComponentType(precision);
217 | info.SrcInfo.SrcLayout = GetDX12MatrixLayout(srcLayout);
218 | info.SrcInfo.SrcStride = UINT(GetStride(MatrixLayout::RowMajor, rows, columns, GetSize(precision)));
219 | info.DataDesc.SrcVA = srcData;
220 | info.DataDesc.DestVA = dstData;
221 | return info;
222 | }
223 |
224 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO GetDX12CopyScaleBiasDesc(size_t biasSize, Precision precision, uint64_t srcData, uint64_t dstData)
225 | {
226 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO info{};
227 | info.DestInfo.DestSize = UINT(biasSize);
228 | info.DestInfo.DestLayout = D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR;
229 | info.DestInfo.DestStride = info.DestInfo.DestSize;
230 | info.DestInfo.NumRows = 1;
231 | info.DestInfo.NumColumns = UINT(biasSize / GetSize(precision));
232 | info.DestInfo.DestDataType = GetDX12ComponentType(precision);
233 | info.SrcInfo.SrcSize = info.DestInfo.DestSize;
234 | info.SrcInfo.SrcDataType = info.DestInfo.DestDataType;
235 | info.SrcInfo.SrcLayout = info.DestInfo.DestLayout;
236 | info.SrcInfo.SrcStride = info.DestInfo.DestStride;
237 | info.DataDesc.SrcVA = srcData;
238 | info.DataDesc.DestVA = dstData;
239 | return info;
240 | }
241 | } // namespace
242 |
243 | CoopVectorUtils_DX12::CoopVectorUtils_DX12(ID3D12Device* d3d12Device)
244 | {
245 | m_d3d12Device = d3d12Device;
246 | assert(m_d3d12Device != nullptr && "Failed to get D3D12 device from GFX.");
247 | }
248 |
249 | /**
250 | * Query the size of a matrix in bytes.
251 | * @return Size of matrix in bytes.
252 | */
253 | size_t CoopVectorUtils_DX12::QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision /*= Precision::F16*/)
254 | {
255 | assert(m_d3d12Device);
256 | assert(rows > 0 && rows <= 128 && "Number of rows must be 1..128.");
257 | assert(cols > 0 && cols <= 128 && "Number of columns must be 1..128.");
258 |
259 | Microsoft::WRL::ComPtr<ID3D12DevicePreview> devicePreview;
260 | assert(m_d3d12Device->QueryInterface(IID_PPV_ARGS(&devicePreview)) == S_OK && "Failed to get device preview");
261 |
262 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_DEST_INFO info = GetDX12ConvertLayerDestInfo(rows, cols, layout, precision);
263 |
264 | devicePreview->GetLinearAlgebraMatrixConversionDestinationInfo(&info);
265 |
266 | assert(info.DestSize > 0 && "Expected matrix size to be larger than zero.");
267 | return info.DestSize;
268 | }
269 |
270 | void rtxns::CoopVectorUtils_DX12::ConvertDeviceMatrixLayout(
271 | NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const
272 | {
273 | ID3D12GraphicsCommandList* d3dCmdList = static_cast<ID3D12GraphicsCommandList*>(commandList);
274 | ID3D12Resource* d3dSrcBuffer = static_cast<ID3D12Resource*>(srcBuffer);
275 | ID3D12Resource* d3dDstBuffer = static_cast<ID3D12Resource*>(dstBuffer);
276 |
277 | Microsoft::WRL::ComPtr<ID3D12GraphicsCommandListPreview> commandListPreview;
278 | assert(d3dCmdList->QueryInterface(IID_PPV_ARGS(&commandListPreview)) == S_OK && "Command list provided does not support matrix conversion");
279 |
280 | D3D12_GPU_VIRTUAL_ADDRESS const srcBufferVA = d3dSrcBuffer->GetGPUVirtualAddress();
281 | D3D12_GPU_VIRTUAL_ADDRESS const dstBufferVA = d3dDstBuffer->GetGPUVirtualAddress();
282 |
283 | // We need conversion data for the weights and the bias separately, so we need two entries per layer
284 | std::vector<D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO> convertInfos(srcLayout.networkLayers.size() * 2);
285 |
286 | // Convert weights
287 | for (int i = 0; i < srcLayout.networkLayers.size(); i++)
288 | {
289 | // Weights
290 | convertInfos[i] = GetDX12ConvertLayerDesc(srcLayout.networkLayers[i].outputs, srcLayout.networkLayers[i].inputs, srcLayout.matrixPrecision, srcLayout.matrixLayout,
291 | dstLayout.matrixLayout, srcLayout.networkLayers[i].weightSize, dstLayout.networkLayers[i].weightSize,
292 | srcBufferVA + srcBufferOffset + srcLayout.networkLayers[i].weightOffset,
293 | dstBufferVA + dstBufferOffset + dstLayout.networkLayers[i].weightOffset);
294 | }
295 |
296 | // Convert bias
297 | // D3D's CopyBufferRegion requires resource states incompatible with the conversion ops.
298 | // Use a degenerate form of a matrix conversion to copy the extra data to avoid placing a barrier.
299 | int infoOffset = int(srcLayout.networkLayers.size());
300 | for (int ii = 0; ii < srcLayout.networkLayers.size(); ii++)
301 | {
302 | convertInfos[ii + infoOffset] =
303 | GetDX12CopyScaleBiasDesc(srcLayout.networkLayers[ii].biasSize, srcLayout.matrixPrecision, srcBufferVA + srcBufferOffset + srcLayout.networkLayers[ii].biasOffset,
304 | dstBufferVA + dstBufferOffset + dstLayout.networkLayers[ii].biasOffset);
305 | }
306 | commandListPreview->ConvertLinearAlgebraMatrix(convertInfos.data(), UINT(convertInfos.size()));
307 | }
308 | #endif
309 |
--------------------------------------------------------------------------------
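
GetStride above is only meaningful for the two linear layouts; the *Optimal layouts are opaque, which is why both back ends ask the driver (vkConvertCooperativeVectorMatrixNV on Vulkan, GetLinearAlgebraMatrixConversionDestinationInfo on D3D12) for destination sizes rather than computing them. A small sketch of the linear-layout arithmetic:

def stride_bytes(layout, rows, cols, elem_size):
    # Row-major: consecutive rows are cols elements apart.
    # Column-major: consecutive columns are rows elements apart.
    if layout == "row_major":
        return cols * elem_size
    if layout == "column_major":
        return rows * elem_size
    return 0  # optimal layouts: size/stride comes from the driver

assert stride_bytes("row_major", 64, 32, 2) == 64      # FP16, 32 columns
assert stride_bytes("column_major", 64, 32, 2) == 128  # FP16, 64 rows
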
/src/NeuralShading/CoopVector.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #pragma once
12 |
13 | #if DONUT_WITH_DX12
14 | #include "../../external/dx12-agility-sdk/build/native/include/d3d12.h"
15 | #endif
16 |
17 | #include <nvrhi/nvrhi.h>
18 | #include <nvrhi/vulkan.h>
19 |
20 |
21 | #include "Float16.h"
22 | #include "NeuralNetworkTypes.h"
23 |
24 | namespace rtxns
25 | {
26 |
27 | class ICoopVectorUtils
28 | {
29 | public:
30 | size_t GetMatrixAlignment()
31 | {
32 | return s_matrixAlignment;
33 | }
34 | size_t GetVectorAlignment()
35 | {
36 | return s_vectorAlignment;
37 | }
38 |
39 | /**
40 | * Query the size of a matrix in bytes.
41 | * @return Size of matrix in bytes.
42 | */
43 | virtual size_t QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision = Precision::F16) = 0;
44 |
45 | /**
46 | * Convert matrix on the device between any layouts.
47 | * Source and destination must currently use the same precision.
48 | * Records the conversion onto the provided command list.
49 | */
50 | virtual void ConvertDeviceMatrixLayout(NetworkLayout const& srcLayout,
51 | NetworkLayout const& dstLayout,
52 | void* srcBuffer,
53 | uint64_t srcBufferOffset,
54 | void* dstBuffer,
55 | uint64_t dstBufferOffset,
56 | void* commandList) const = 0;
57 |
58 | protected:
59 | static const size_t s_matrixAlignment = 64; ///< Minimum byte alignment according to spec.
60 | static const size_t s_vectorAlignment = 16; ///< Minimum byte alignment according to spec.
61 | };
62 |
63 | #if DONUT_WITH_VULKAN
64 | class CoopVectorUtils_VK : public ICoopVectorUtils
65 | {
66 | public:
67 | CoopVectorUtils_VK(VkDevice vkDevice);
68 |
69 | /**
70 | * Query the size of a matrix in bytes.
71 | * @return Size of matrix in bytes.
72 | */
73 | size_t QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision = Precision::F16);
74 |
75 | /**
76 | * Convert matrix on the device between any layouts.
77 | * Source and destination must currently use the same precision.
78 | * Records the conversion onto the provided command list.
79 | */
80 | void ConvertDeviceMatrixLayout(
81 | NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const;
82 |
83 | private:
84 | VkDevice m_vkDevice = nullptr;
85 | PFN_vkConvertCooperativeVectorMatrixNV m_vkConvertCooperativeVectorMatrixNV = nullptr;
86 | PFN_vkCmdConvertCooperativeVectorMatrixNV m_vkCmdConvertCooperativeVectorMatrixNV = nullptr;
87 | PFN_vkCmdCopyBuffer m_vkCmdCopyBuffer = nullptr;
88 | PFN_vkGetBufferDeviceAddress m_vkGetBufferDeviceAddress = nullptr;
89 | };
90 | #endif
91 |
92 | #if DONUT_WITH_DX12
93 | class CoopVectorUtils_DX12 : public ICoopVectorUtils
94 | {
95 | public:
96 | CoopVectorUtils_DX12(ID3D12Device* d3d12Device);
97 |
98 | /**
99 | * Query the size of a matrix in bytes.
100 | * @return Size of matrix in bytes.
101 | */
102 | size_t QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision = Precision::F16);
103 |
104 | /**
105 | * Convert matrix on the device between any layouts.
106 | * Source and destination must currently use the same precision.
107 | * Records the conversion onto the provided command list.
108 | */
109 | void ConvertDeviceMatrixLayout(
110 | NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const;
111 |
112 | private:
113 | ID3D12Device* m_d3d12Device = nullptr;
114 | };
115 | #endif
116 | } // namespace rtxns
--------------------------------------------------------------------------------
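
The s_matrixAlignment / s_vectorAlignment constants above are what layer packing has to respect when weights and biases share one parameter buffer. A sketch of offset packing under those minimums (the layer shapes are hypothetical, and the real byte sizes for optimal layouts must come from QueryMatrixByteSize, not from rows * cols):

MATRIX_ALIGN = 64  # minimum matrix alignment per spec
VECTOR_ALIGN = 16  # minimum vector alignment per spec

def align_up(offset, alignment):
    return (offset + alignment - 1) // alignment * alignment

def pack_layers(layers, elem_size=2):  # FP16 elements
    offset, packed = 0, []
    for rows, cols in layers:
        w_off = align_up(offset, MATRIX_ALIGN)
        b_off = align_up(w_off + rows * cols * elem_size, VECTOR_ALIGN)
        offset = b_off + rows * elem_size  # one bias element per output row
        packed.append({"weightOffset": w_off, "biasOffset": b_off})
    return packed, offset

offsets, total_bytes = pack_layers([(32, 2), (32, 32), (3, 32)])
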
/src/NeuralShading/Float16.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | /**
12 | * Most of this code is derived from the GLM library at https://github.com/g-truc/glm
13 | *
14 | * License: https://github.com/g-truc/glm/blob/master/copying.txt
15 | */
16 |
17 | #include "Float16.h"
18 |
19 | namespace rtxns
20 | {
21 |
22 | static float overflow()
23 | {
24 | volatile float f = 1e10;
25 | for (int i = 0; i < 10; ++i)
26 | {
27 | f *= f; // this will overflow before the for loop terminates
28 | }
29 | return f;
30 | }
31 |
32 | union uif32
33 | {
34 | float f;
35 | unsigned int i;
36 | };
37 |
38 | uint16_t float32ToFloat16(float value)
39 | {
40 | uif32 entry;
41 | entry.f = value;
42 | int i = static_cast<int>(entry.i);
43 |
44 | //
45 | // Our floating point number, f, is represented by the bit
46 | // pattern in integer i. Disassemble that bit pattern into
47 | // the sign, s, the exponent, e, and the significand, m.
48 | // Shift s into the position where it will go in the
49 | // resulting half number.
50 | // Adjust e, accounting for the different exponent bias
51 | // of float and half (127 versus 15).
52 | //
53 |
54 | int s = (i >> 16) & 0x00008000;
55 | int e = ((i >> 23) & 0x000000ff) - (127 - 15);
56 | int m = i & 0x007fffff;
57 |
58 | //
59 | // Now reassemble s, e and m into a half:
60 | //
61 |
62 | if (e <= 0)
63 | {
64 | if (e < -10)
65 | {
66 | //
67 | // E is less than -10. The absolute value of f is
68 | // less than half_MIN (f may be a small normalized
69 | // float, a denormalized float or a zero).
70 | //
71 | // We convert f to a half zero.
72 | //
73 |
74 | return uint16_t(s);
75 | }
76 |
77 | //
78 | // E is between -10 and 0. F is a normalized float,
79 | // whose magnitude is less than __half_NRM_MIN.
80 | //
81 | // We convert f to a denormalized half.
82 | //
83 |
84 | m = (m | 0x00800000) >> (1 - e);
85 |
86 | //
87 | // Round to nearest, round "0.5" up.
88 | //
89 | // Rounding may cause the significand to overflow and make
90 | // our number normalized. Because of the way a half's bits
91 | // are laid out, we don't have to treat this case separately;
92 | // the code below will handle it correctly.
93 | //
94 |
95 | if (m & 0x00001000)
96 | {
97 | m += 0x00002000;
98 | }
99 |
100 | //
101 | // Assemble the half from s, e (zero) and m.
102 | //
103 |
104 | return uint16_t(s | (m >> 13));
105 | }
106 | else if (e == 0xff - (127 - 15))
107 | {
108 | if (m == 0)
109 | {
110 | //
111 | // F is an infinity; convert f to a half
112 | // infinity with the same sign as f.
113 | //
114 |
115 | return uint16_t(s | 0x7c00);
116 | }
117 | else
118 | {
119 | //
120 | // F is a NAN; we produce a half NAN that preserves
121 | // the sign bit and the 10 leftmost bits of the
122 | // significand of f, with one exception: If the 10
123 | // leftmost bits are all zero, the NAN would turn
124 | // into an infinity, so we have to set at least one
125 | // bit in the significand.
126 | //
127 |
128 | m >>= 13;
129 |
130 | return uint16_t(s | 0x7c00 | m | (m == 0));
131 | }
132 | }
133 | else
134 | {
135 | //
136 | // E is greater than zero. F is a normalized float.
137 | // We try to convert f to a normalized half.
138 | //
139 |
140 | //
141 | // Round to nearest, round "0.5" up
142 | //
143 |
144 | if (m & 0x00001000)
145 | {
146 | m += 0x00002000;
147 |
148 | if (m & 0x00800000)
149 | {
150 | m = 0; // overflow in significand,
151 | e += 1; // adjust exponent
152 | }
153 | }
154 |
155 | //
156 | // Handle exponent overflow
157 | //
158 |
159 | if (e > 30)
160 | {
161 | overflow(); // Cause a hardware floating point overflow;
162 |
163 | return uint16_t(s | 0x7c00); // Return infinity with same sign as f.
164 | }
165 |
166 | //
167 | // Assemble the half from s, e and m.
168 | //
169 |
170 | return uint16_t(s | (e << 10) | (m >> 13));
171 | }
172 | }
173 |
174 | float float16ToFloat32(uint16_t value)
175 | {
176 | int s = (value >> 15) & 0x00000001;
177 | int e = (value >> 10) & 0x0000001f;
178 | int m = value & 0x000003ff;
179 |
180 | if (e == 0)
181 | {
182 | if (m == 0)
183 | {
184 | //
185 | // Plus or minus zero
186 | //
187 |
188 | uif32 result;
189 | result.i = static_cast<unsigned int>(s << 31);
190 | return result.f;
191 | }
192 | else
193 | {
194 | //
195 | // Denormalized number -- renormalize it
196 | //
197 |
198 | while (!(m & 0x00000400))
199 | {
200 | m <<= 1;
201 | e -= 1;
202 | }
203 |
204 | e += 1;
205 | m &= ~0x00000400;
206 | }
207 | }
208 | else if (e == 31)
209 | {
210 | if (m == 0)
211 | {
212 | //
213 | // Positive or negative infinity
214 | //
215 |
216 | uif32 result;
217 | result.i = static_cast<unsigned int>((s << 31) | 0x7f800000);
218 | return result.f;
219 | }
220 | else
221 | {
222 | //
223 | // NaN -- preserve sign and significand bits
224 | //
225 |
226 | uif32 result;
227 | result.i = static_cast<unsigned int>((s << 31) | 0x7f800000 | (m << 13));
228 | return result.f;
229 | }
230 | }
231 |
232 | //
233 | // Normalized number
234 | //
235 |
236 | e = e + (127 - 15);
237 | m = m << 13;
238 |
239 | //
240 | // Assemble s, e and m.
241 | //
242 |
243 | uif32 result;
244 | result.i = static_cast<unsigned int>((s << 31) | (e << 23) | m);
245 | return result.f;
246 | }
247 |
248 | } // namespace rtxns
249 |
--------------------------------------------------------------------------------
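
For values that land in the normalized half range, the conversion above reduces to re-biasing the exponent from 127 to 15 and keeping the top 10 mantissa bits. A NumPy cross-check of that core path (rounding and the denormal/overflow branches are deliberately omitted here):

import struct
import numpy as np

def float32_to_float16_bits(value):
    # Same s/e/m field extraction as float32ToFloat16 above.
    i = struct.unpack("<I", struct.pack("<f", value))[0]
    s = (i >> 16) & 0x8000
    e = ((i >> 23) & 0xFF) - (127 - 15)
    m = i & 0x007FFFFF
    if 0 < e <= 30:  # normalized half only
        return s | (e << 10) | (m >> 13)
    raise NotImplementedError("denormals/overflow need the full branches")

for v in (1.0, -2.5, 0.15625):
    assert float32_to_float16_bits(v) == np.float16(v).view(np.uint16)
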
/src/NeuralShading/Float16.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #pragma once
12 |
13 | #include <cstdint>
14 | #include <cstddef>
15 |
16 | namespace rtxns
17 | {
18 |
19 | uint16_t float32ToFloat16(float value);
20 | float float16ToFloat32(uint16_t value);
21 |
22 | } // namespace rtxns
--------------------------------------------------------------------------------
/src/NeuralShading/GraphicsResources.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 |
12 |
13 | #if DONUT_WITH_DX12
14 | #include "../../external/dx12-agility-sdk/build/native/include/d3d12.h"
15 | #include <nvrhi/d3d12.h>
16 | #include <wrl.h>
17 | #endif
18 |
19 | #if DONUT_WITH_VULKAN
20 | #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
21 | #include <nvrhi/vulkan.h>
22 | #endif
23 |
24 | #include "GraphicsResources.h"
25 | #include <donut/core/log.h>
26 | #include <vector>
27 |
28 | namespace rtxns
29 | {
30 |
31 | GraphicsResources::GraphicsResources(nvrhi::DeviceHandle device)
32 | {
33 | #if DONUT_WITH_VULKAN
34 | if (device->getGraphicsAPI() == nvrhi::GraphicsAPI::VULKAN)
35 | {
36 | VkInstance vkInstance = device->getNativeObject(nvrhi::ObjectTypes::VK_Instance);
37 | VkPhysicalDevice vkPhysicalDevice = device->getNativeObject(nvrhi::ObjectTypes::VK_PhysicalDevice);
38 |
39 | m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV = (PFN_vkGetPhysicalDeviceCooperativeVectorPropertiesNV)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetInstanceProcAddr(
40 | vkInstance, "vkGetPhysicalDeviceCooperativeVectorPropertiesNV");
41 | assert(m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV != nullptr && "Failed to get Vulkan function 'vkGetPhysicalDeviceCooperativeVectorPropertiesNV'.");
42 |
43 | // Get the property count
44 | uint32_t propertyCount = 0;
45 | if (m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV(vkPhysicalDevice, &propertyCount, nullptr) != VK_SUCCESS)
46 | {
47 | return;
48 | }
49 |
50 | // If vkGetPhysicalDeviceCooperativeVectorPropertiesNV succeeds, both inference and training are supported
51 | m_coopVectorFeatures.inferenceSupported = true;
52 | m_coopVectorFeatures.trainingSupported = true;
53 |
54 | std::vector<VkCooperativeVectorPropertiesNV> properties(propertyCount);
55 | // Init the sType fields
56 | for (auto& property : properties)
57 | {
58 | property.sType = VK_STRUCTURE_TYPE_COOPERATIVE_VECTOR_PROPERTIES_NV;
59 | }
60 |
61 | // Get the actual properties
62 | if (m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV(vkPhysicalDevice, &propertyCount, properties.data()) != VK_SUCCESS)
63 | {
64 | return;
65 | }
66 |
67 | for (const auto& property : properties)
68 | {
69 | if (property.sType == VK_STRUCTURE_TYPE_COOPERATIVE_VECTOR_PROPERTIES_NV && property.inputType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
70 | property.inputInterpretation == VK_COMPONENT_TYPE_FLOAT16_KHR && property.matrixInterpretation == VK_COMPONENT_TYPE_FLOAT16_KHR &&
71 | property.resultType == VK_COMPONENT_TYPE_FLOAT16_KHR)
72 | {
73 | m_coopVectorFeatures.fp16InferencingSupported = true;
74 | m_coopVectorFeatures.fp16TrainingSupported = true;
75 | }
76 | }
77 | }
78 | #endif
79 |
80 | #if DONUT_WITH_DX12
81 | if (device->getGraphicsAPI() == nvrhi::GraphicsAPI::D3D12)
82 | {
83 | ID3D12Device* d3d12Device = device->getNativeObject(nvrhi::ObjectTypes::D3D12_Device);
84 |
85 | // Check experimental features are enabled
86 | D3D12_FEATURE_DATA_D3D12_OPTIONS_EXPERIMENTAL experimentalOptions{};
87 | auto hr = d3d12Device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS_EXPERIMENTAL, &experimentalOptions, sizeof(experimentalOptions));
88 | if (hr != S_OK)
89 | {
90 | donut::log::error("Coop vector is not supported.");
91 | return;
92 | }
93 |
94 | // Mute preview shader model (6.9) validation warning.
95 | Microsoft::WRL::ComPtr<ID3D12InfoQueue> infoQueue;
96 | if (d3d12Device->QueryInterface(IID_PPV_ARGS(&infoQueue)) == S_OK)
97 | {
98 | D3D12_MESSAGE_ID denyIds[] = { D3D12_MESSAGE_ID_NON_RETAIL_SHADER_MODEL_WONT_VALIDATE };
99 |
100 | D3D12_INFO_QUEUE_FILTER filter = {};
101 | filter.DenyList.NumIDs = _countof(denyIds);
102 | filter.DenyList.pIDList = denyIds;
103 |
104 | infoQueue->AddStorageFilterEntries(&filter);
105 | }
106 |
107 | // Check coop vector is supported
108 | if (experimentalOptions.CooperativeVectorTier >= D3D12_COOPERATIVE_VECTOR_TIER_1_0)
109 | {
110 | m_coopVectorFeatures.inferenceSupported = true;
111 | }
112 | else
113 | {
114 | return;
115 | }
116 | if (experimentalOptions.CooperativeVectorTier >= D3D12_COOPERATIVE_VECTOR_TIER_1_1)
117 | {
118 | m_coopVectorFeatures.trainingSupported = true;
119 | }
120 |
121 | // Get supported coop vector formats
122 | D3D12_FEATURE_DATA_COOPERATIVE_VECTOR coopVecData{};
123 | hr = d3d12Device->CheckFeatureSupport(D3D12_FEATURE_COOPERATIVE_VECTOR, &coopVecData, sizeof(coopVecData));
124 | if (hr != S_OK)
125 | {
126 | return;
127 | }
128 |
129 | std::vector<D3D12_COOPERATIVE_VECTOR_PROPERTIES_MUL> mulProperties(coopVecData.MatrixVectorMulAddPropCount);
130 | std::vector<D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE> outerProductProperties;
131 | std::vector<D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE> vectorAccumlateProperties;
132 |
133 | coopVecData.pMatrixVectorMulAddProperties = mulProperties.data();
134 |
135 | if (experimentalOptions.CooperativeVectorTier >= D3D12_COOPERATIVE_VECTOR_TIER_1_1)
136 | {
137 | outerProductProperties.resize(coopVecData.OuterProductAccumulatePropCount);
138 | coopVecData.pOuterProductAccumulateProperties = outerProductProperties.data();
139 | vectorAccumlateProperties.resize(coopVecData.VectorAccumulatePropCount);
140 | coopVecData.pVectorAccumulateProperties = vectorAccumlateProperties.data();
141 | }
142 | else
143 | {
144 | coopVecData.OuterProductAccumulatePropCount = 0;
145 | coopVecData.VectorAccumulatePropCount = 0;
146 | }
147 |
148 | if (d3d12Device->CheckFeatureSupport(D3D12_FEATURE_COOPERATIVE_VECTOR, &coopVecData, sizeof(coopVecData)) != S_OK)
149 | {
150 | return;
151 | }
152 |
153 | for (const auto& properties : mulProperties)
154 | {
155 | if (properties.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 &&
156 | properties.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.OutputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16)
157 | {
158 | m_coopVectorFeatures.fp16InferencingSupported = true;
159 | }
160 | }
161 |
162 | bool opSupported = false;
163 | for (const auto& properties : outerProductProperties)
164 | {
165 | if (properties.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16)
166 | {
167 | opSupported = true;
168 | }
169 | }
170 |
171 | bool vaSupported = false;
172 | for (const auto& properties : vectorAccumlateProperties)
173 | {
174 | if (properties.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16)
175 | {
176 | vaSupported = true;
177 | }
178 | }
179 | m_coopVectorFeatures.fp16TrainingSupported = opSupported && vaSupported;
180 | }
181 | #endif
182 | }
183 |
184 | GraphicsResources::~GraphicsResources()
185 | {
186 | }
187 |
188 | } // namespace rtxns
189 |
--------------------------------------------------------------------------------
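
A summary of the D3D12 gating above: cooperative vector tier 1.0 implies inference support, tier 1.1 additionally implies training, and the FP16 flags further require matching FP16 entries in the reported property lists. A compact sketch of the tier logic alone:

from dataclasses import dataclass

@dataclass
class CoopVectorFeatures:
    inference_supported: bool = False
    training_supported: bool = False

def features_from_tier(tier):
    # tier as (major, minor): >= (1, 0) -> inference, >= (1, 1) -> training
    f = CoopVectorFeatures()
    f.inference_supported = tier >= (1, 0)
    f.training_supported = tier >= (1, 1)
    return f

assert features_from_tier((1, 0)) == CoopVectorFeatures(True, False)
assert features_from_tier((1, 1)) == CoopVectorFeatures(True, True)
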
/src/NeuralShading/GraphicsResources.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #pragma once
12 |
13 | #include <nvrhi/vulkan.h>
14 |
15 | namespace rtxns
16 | {
17 |
18 | struct CoopVectorFeatures
19 | {
20 | bool inferenceSupported = false;
21 | bool trainingSupported = false;
22 | bool fp16InferencingSupported = false;
23 | bool fp16TrainingSupported = false;
24 | };
25 |
26 | class GraphicsResources
27 | {
28 | public:
29 | GraphicsResources(nvrhi::DeviceHandle device);
30 | ~GraphicsResources();
31 | CoopVectorFeatures GetCoopVectorFeatures() const
32 | {
33 | return m_coopVectorFeatures;
34 | }
35 |
36 | private:
37 | CoopVectorFeatures m_coopVectorFeatures;
38 | #if DONUT_WITH_VULKAN
39 | PFN_vkGetPhysicalDeviceCooperativeVectorPropertiesNV m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV = nullptr;
40 | #endif
41 | };
42 | } // namespace rtxns
43 |
--------------------------------------------------------------------------------
/src/NeuralShading/NeuralNetwork.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #pragma once
12 |
13 | #include "CoopVector.h"
14 | #include <donut/core/vfs/VFS.h>
15 | #include <memory>
16 | #include <string>
17 |
18 | #include "NeuralNetworkTypes.h"
19 |
20 | namespace rtxns
21 | {
22 |
23 | class NetworkUtilities
24 | {
25 | public:
26 | NetworkUtilities(nvrhi::DeviceHandle device);
27 | ~NetworkUtilities()
28 | {
29 | }
30 |
31 | bool ValidateNetworkArchitecture(NetworkArchitecture const& netArch);
32 |
33 | // Create host side network layout.
34 | NetworkLayout CreateHostNetworkLayout(NetworkArchitecture const& netArch);
35 |
36 | // Set the weights and bias size / offsets for each layer in the network.
37 | void SetNetworkLayerSizes(NetworkLayout& layout);
38 |
39 | // Returns an updated network layout where the weight and bias sizes / offsets have been updated
40 | // for the new matrix layout.
41 | // Can be a device-optimal matrix layout.
42 | NetworkLayout GetNewMatrixLayout(NetworkLayout const& srcLayout, MatrixLayout newMatrixLayout);
43 |
44 | // Converts weights and bias buffers from src layout to the dst layout.
45 | // Both buffers must be device side.
46 | // Both networks must be of the same network layout, only differing in MatrixLayout
47 | void ConvertWeights(NetworkLayout const& srcLayout,
48 | NetworkLayout const& dstLayout,
49 | nvrhi::BufferHandle srcBuffer,
50 | uint64_t srcBufferOffset,
51 | nvrhi::BufferHandle dstBuffer,
52 | uint64_t dstBufferOffset,
53 | nvrhi::DeviceHandle device,
54 | nvrhi::CommandListHandle commandList);
55 |
56 | private:
57 | std::unique_ptr m_coopVecUtils;
58 | };
59 |
60 | // Represent a host side neural network.
61 | // Stores the network layout and parameters.
62 | // Functionality to initialize a network to starting values or load from file.
63 | // Also write parameters back to file
64 | class HostNetwork
65 | {
66 | public:
67 | HostNetwork(std::shared_ptr networkUtils);
68 | ~HostNetwork(){};
69 |
70 | // Create host side network from provided architecture with initial values.
71 | bool Initialise(const NetworkArchitecture& netArch);
72 |
73 | // Create host side network of provided architecture and initial values from a json file.
74 | bool InitialiseFromJson(donut::vfs::IFileSystem& fs, const std::string& fileName);
75 | // Create host side network of provided architecture and initial values from a file.
76 | bool InitialiseFromFile(const std::string& fileName);
77 | // Create host side network from an existing network.
78 | bool InitialiseFromNetwork(HostNetwork const& network);
79 | // Write the current network and parameters to file.
80 | bool WriteToFile(const std::string& fileName);
81 | // Convert device layout to host layout and update the host side parameters.
82 | void UpdateFromBufferToFile(nvrhi::BufferHandle hostLayoutBuffer,
83 | nvrhi::BufferHandle deviceLayoutBuffer,
84 | NetworkLayout const& hostLayout,
85 | NetworkLayout const& deviceLayout,
86 | const std::string& fileName,
87 | nvrhi::DeviceHandle device,
88 | nvrhi::CommandListHandle commandList);
89 |
90 | const NetworkArchitecture& GetNetworkArchitecture() const
91 | {
92 | return m_networkArchitecture;
93 | }
94 |
95 | const std::vector& GetNetworkParams() const
96 | {
97 | return m_networkParams;
98 | }
99 |
100 | const NetworkLayout& GetNetworkLayout() const
101 | {
102 | return m_networkLayout;
103 | }
104 |
105 | private:
106 | std::shared_ptr m_networkUtils;
107 | NetworkArchitecture m_networkArchitecture;
108 | std::vector m_networkParams;
109 | NetworkLayout m_networkLayout;
110 | };
111 | }; // namespace rtxns
--------------------------------------------------------------------------------
/src/NeuralShading/NeuralNetworkTypes.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #pragma once
12 | #include <vector>
13 |
14 | namespace rtxns
15 | {
16 |
17 | enum class MatrixLayout
18 | {
19 | RowMajor,
20 | ColumnMajor,
21 | InferencingOptimal,
22 | TrainingOptimal,
23 | };
24 |
25 | enum class Precision
26 | {
27 | F16,
28 | F32
29 | };
30 |
31 | struct NetworkArchitecture
32 | {
33 | uint32_t numHiddenLayers = 0;
34 | uint32_t inputNeurons = 0;
35 | uint32_t hiddenNeurons = 0;
36 | uint32_t outputNeurons = 0;
37 | Precision weightPrecision = Precision::F16;
38 | Precision biasPrecision = Precision::F16;
39 | };
40 |
41 | struct NetworkLayer
42 | {
43 | uint32_t inputs = 0; ///< Columns in the weight matrix.
44 | uint32_t outputs = 0; ///< Rows in the weight matrix.
45 | size_t weightSize = 0; ///< Size of the weight matrix in bytes.
46 | size_t biasSize = 0; ///< Size of the bias vector in bytes.
47 | uint32_t weightOffset = 0; ///< Offset to the weights in bytes.
48 | uint32_t biasOffset = 0; ///< Offset to the biases in bytes.
49 | };
50 |
51 | struct NetworkLayout
52 | {
53 | MatrixLayout matrixLayout = MatrixLayout::RowMajor;
54 | Precision matrixPrecision = Precision::F16;
55 | size_t networkSize = 0;
56 | std::vector networkLayers;
57 | };
58 |
59 | constexpr size_t GetSize(Precision precision)
60 | {
61 | switch (precision)
62 | {
63 | case Precision::F16:
64 | return sizeof(uint16_t); // 2 bytes
65 | case Precision::F32:
66 | return sizeof(float);
67 | default:
68 | return 0; // Should not get here
69 | }
70 | }
71 |
72 | } // namespace rtxns
--------------------------------------------------------------------------------
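
NetworkLayer documents inputs as columns and outputs as rows of the weight matrix, so a NetworkArchitecture expands into per-layer (rows, cols) shapes. A sketch under one plausible reading of the fields (numHiddenLayers counts the hidden weight stages, so the total layer count is numHiddenLayers + 1):

def layer_shapes(num_hidden_layers, inputs, hidden, outputs):
    shapes = [(hidden, inputs)]                              # input -> hidden
    shapes += [(hidden, hidden)] * (num_hidden_layers - 1)   # hidden -> hidden
    shapes.append((outputs, hidden))                         # hidden -> output
    return shapes  # (rows, cols), i.e. (outputs, inputs) per NetworkLayer

# Hypothetical 2 -> 32 -> 32 -> 3 MLP with two hidden stages:
print(layer_shapes(2, 2, 32, 3))  # [(32, 2), (32, 32), (3, 32)]
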
/src/NeuralShading_Shaders/Activation.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | import CooperativeVectorAutoDiff;
12 | import CooperativeVectorFunctions;
13 |
14 | namespace rtxns
15 | {
16 | namespace mlp
17 | {
18 | ////////////////////////
19 | //
20 | // Activation function interface and implementations of several activation functions
21 | // for use with the classes in the MLP module
22 | //
23 | ////////////////////////
24 |
25 | // Base interface for activation functions
26 | interface IActivation<T : __BuiltinFloatingPointType, int N>
27 | {
28 | [Differentiable]
29 | CoopVec<T, N> eval(CoopVec<T, N> x);
30 | };
31 |
32 | // None activation function
33 | struct NoneAct<T : __BuiltinFloatingPointType, int N> : IActivation<T, N>
34 | {
35 | [Differentiable]
36 | CoopVec<T, N> eval(CoopVec<T, N> x)
37 | {
38 | return x;
39 | }
40 | };
41 |
42 | // Linear activation function
43 | struct LinearAct<T : __BuiltinFloatingPointType, int N> : IActivation<T, N>
44 | {
45 | T a;
46 |
47 | __init(T a)
48 | {
49 | this.a = a;
50 | }
51 |
52 | [Differentiable]
53 | CoopVec<T, N> eval(CoopVec<T, N> x)
54 | {
55 | return no_diff CoopVec<T, N>(a) * x;
56 | }
57 | };
58 |
59 | // Exponential activation function
60 | struct ExponentialAct<T : __BuiltinFloatingPointType, int N> : IActivation<T, N>
61 | {
62 | [Differentiable]
63 | CoopVec<T, N> eval(CoopVec<T, N> x)
64 | {
65 | // Exponent is builtin function.
66 | return exp(x);
67 | }
68 | };
69 |
70 | // Shifted exponential activation function
71 | struct ShiftedExponentialAct<T : __BuiltinFloatingPointType, int N> : IActivation<T, N>
72 | {
73 | [Differentiable]
74 | CoopVec<T, N> eval(CoopVec<T, N> x)
75 | {
76 | return exp(x) - no_diff CoopVec<T, N>(T(1.));
77 | }
78 | };
79 |
80 | // ReLU activation function
81 | struct ReLUAct<T : __BuiltinFloatingPointType, int N> : IActivation<T, N>
82 | {
83 | [Differentiable]
84 | CoopVec<T, N> eval(CoopVec<T, N> x)
85 | {
86 | return relu(x);
87 | }
88 | };
89 |
90 | // Leaky ReLU activation function
91 | struct LeakyReLUAct<T : __BuiltinFloatingPointType, int N> : IActivation<T, N>
92 | {
93 | T a;
94 |
95 | __init(T a)
96 | {
97 | this.a = a;
98 | }
99 |
100 | [Differentiable]
101 | CoopVec<T, N> eval(CoopVec<T, N> x)
102 | {
103 | return leakyReLU(x, a);
104 | }
105 | };
106 |
107 | // Sigmoid activation function
108 | struct SigmoidAct<T : __BuiltinFloatingPointType, int N> : IActivation<T, N>
109 | {
110 | [Differentiable]
111 | CoopVec<T, N> eval(CoopVec<T, N> x)
112 | {
113 | // Sigmoid function calculation. Compiler will infer the derivative automatically (autodiff)
114 | return sigmoid(x);
115 | }
116 | };
117 |
118 | // Swish activation function
119 | struct SwishAct<T : __BuiltinFloatingPointType, int N> : IActivation<T, N>
120 | {
121 | [Differentiable]
122 | CoopVec<T, N> eval(CoopVec<T, N> x)
123 | {
124 | return x / (no_diff CoopVec<T, N>(T(1.)) + exp(no_diff CoopVec<T, N>(T(-1.)) * x));
125 | }
126 | };
127 |
128 | // Tanh activation function
129 | struct TanhAct<T : __BuiltinFloatingPointType, int N> : IActivation<T, N>
130 | {
131 | [Differentiable]
132 | CoopVec<T, N> eval(CoopVec<T, N> x)
133 | {
134 | var c1 = no_diff CoopVec<T, N>(T(1.));
135 | return no_diff CoopVec<T, N>(T(2.)) / (c1 + exp(no_diff CoopVec<T, N>(T(-2.)) * x)) - c1;
136 | }
137 | };
138 | }
139 | }
140 |
--------------------------------------------------------------------------------
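
SwishAct and TanhAct above are expressed purely in terms of exp, because exp is the CoopVec building block with a registered backward derivative (see CooperativeVectorAutoDiff.slang below). The identities they rely on, checked with NumPy:

import numpy as np

x = np.linspace(-4, 4, 9)

sigmoid = 1 / (1 + np.exp(-x))
swish = x / (1 + np.exp(-x))                  # SwishAct: x * sigmoid(x)
tanh_via_exp = 2 / (1 + np.exp(-2 * x)) - 1   # TanhAct

assert np.allclose(swish, x * sigmoid)
assert np.allclose(tanh_via_exp, np.tanh(x))
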
/src/NeuralShading_Shaders/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |
10 |
11 | set(project NeuralShading_Shaders)
12 | set(folder "${LIBRARY_FILTER}/NeuralShading_Shaders")
13 |
14 | file(GLOB shaders "*.slang")
15 |
16 | set_source_files_properties(${shaders} PROPERTIES VS_TOOL_OVERRIDE "None")
17 | add_custom_target(${project}
18 | DEPENDS ShaderMake
19 | SOURCES ${shaders})
20 | set_target_properties(${project} PROPERTIES
21 | FOLDER ${folder}
22 | )
23 |
24 | set(SAMPLES_SHADER_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR} CACHE PATH "" FORCE)
--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/CooperativeVectorAutoDiff.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | import CooperativeVectorFunctions;
12 | import CooperativeVectorDerivatives;
13 |
14 | // Implementation to extend CoopVec to make it automatically differentiable (autodiff)
15 |
16 | namespace rtxns
17 | {
18 |
19 | // Extension for builtin type CoopVec to make it automatically differentiable (autodiff)
20 | extension<T : __BuiltinFloatingPointType, int N> CoopVec<T, N> : IDifferentiable
21 | {
22 | typealias Differential = CoopVec<T, N>;
23 | };
24 | 
25 | typealias HCoopVec<int N> = CoopVec<half, N>;
26 |
27 | ////////////////////////
28 | //
29 | // Additional functions and their derivatives for use in activation functions
30 | // To support Slang autodiff, for each function its derivative should be defined
31 | //
32 | ////////////////////////
33 |
34 | // exp is builtin function, so we just need to define derivative for autodiff support
35 | [BackwardDerivativeOf(exp)]
36 | void exp_BackwardAutoDiff<T : __BuiltinFloatingPointType, int N>(inout DifferentialPair<CoopVec<T, N>> p0, CoopVec<T, N>.Differential dResult)
37 | {
38 | p0 = diffPair(p0.p, dResult * exp(p0.p));
39 | }
40 |
41 | // Relu backward derivative
42 | [BackwardDerivativeOf(relu)]
43 | void relu_BackwardAutoDiff