├── .gitattributes ├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── CMakeLists.txt ├── LICENSE.MD ├── README.md ├── assets └── data │ ├── disney.ns.bin │ ├── nvidia-logo.png │ └── slangpy-weights.json ├── docs ├── LibraryGuide.md ├── QuickStart.md ├── ShaderTraining.md ├── SimpleInferencing.md ├── SimpleTraining.md ├── SlangpyTraining.md ├── Tutorial.md ├── shader_training.png ├── simple_inferencing.png ├── simple_training.png ├── simple_training_trained.png └── slangpy_training.jpg ├── samples ├── CMakeLists.txt ├── ShaderTraining │ ├── CMakeLists.txt │ ├── Disney.slang │ ├── DisneyMLP.slang │ ├── NetworkConfig.h │ ├── ShaderTraining.cpp │ ├── computeOptimizer.slang │ ├── computeTraining.slang │ ├── renderDifference.slang │ ├── renderDisney.slang │ ├── renderInference.slang │ └── shaders.cfg ├── SimpleInferencing │ ├── CMakeLists.txt │ ├── NetworkConfig.h │ ├── SimpleInferencing.cpp │ ├── SimpleInferencing.slang │ └── shaders.cfg ├── SimpleTraining │ ├── CMakeLists.txt │ ├── NetworkConfig.h │ ├── SimpleTraining.cpp │ ├── SimpleTraining_Inference.slang │ ├── SimpleTraining_Optimizer.slang │ ├── SimpleTraining_Training.slang │ └── shaders.cfg └── SlangpyTraining │ ├── CMakeLists.txt │ ├── Helpers.py │ ├── NetworkConfig.h │ ├── NeuralModules.py │ ├── NeuralModules.slang │ ├── SlangpyInference.cpp │ ├── SlangpyInference.slang │ ├── SlangpyTraining.py │ ├── SlangpyTraining.slang │ ├── requirements.txt │ └── shaders.cfg ├── src ├── CMakeLists.txt ├── NeuralShading │ ├── CMakeLists.txt │ ├── CoopVector.cpp │ ├── CoopVector.h │ ├── Float16.cpp │ ├── Float16.h │ ├── GraphicsResources.cpp │ ├── GraphicsResources.h │ ├── NeuralNetwork.cpp │ ├── NeuralNetwork.h │ └── NeuralNetworkTypes.h ├── NeuralShading_Shaders │ ├── Activation.slang │ ├── CMakeLists.txt │ ├── CooperativeVectorAutoDiff.slang │ ├── CooperativeVectorDerivatives.slang │ ├── CooperativeVectorFunctions.slang │ ├── LinearOps.slang │ ├── Loss.slang │ ├── MLP.slang │ ├── Optimizers.slang │ ├── PCG32.slang │ └── Utils.slang └── Utils │ ├── CMakeLists.txt │ ├── DeviceUtils.cpp │ ├── DeviceUtils.h │ ├── DirectoryHelper.cpp │ ├── DirectoryHelper.h │ ├── GeometryUtils.cpp │ └── GeometryUtils.h └── support └── cmake ├── ConfigureAgilitySDK.cmake ├── FetchDXCPreview.cmake └── FetchPrebuildBinary.cmake /.gitattributes: -------------------------------------------------------------------------------- 1 | external/slang/windows-x64/release/dxcompiler.dll filter=lfs diff=lfs merge=lfs -text 2 | external/slang/windows-x64/release/slang.dll filter=lfs diff=lfs merge=lfs -text 3 | external/slang/windows-x64/release/slangc.exe filter=lfs diff=lfs merge=lfs -text 4 | external/slang/windows-x64/release/slangd.exe filter=lfs diff=lfs merge=lfs -text 5 | external/slang/windows-x64/release/slang-glslang.dll filter=lfs diff=lfs merge=lfs -text 6 | *.exe filter=lfs diff=lfs merge=lfs -text 7 | *.pdb filter=lfs diff=lfs merge=lfs -text 8 | *.dll filter=lfs diff=lfs merge=lfs -text 9 | *.zip filter=lfs diff=lfs merge=lfs -text 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | /bin 3 | /out 4 | /.vscode 5 | /.vs 6 | /*.zip 7 | external/slang/windows-x64/release/slang-stdlib.bin 8 | /external/dx12-agility-sdk 9 | /external/nvapi 10 | 11 | 12 | # Temp files from running Python sample. 13 | /.temp 14 | __pycache__/ 15 | 16 | # Generated shaders+weights from SlangPyTraining. 
17 | samples/SlangpyTraining/trained_shaders.cfg 18 | samples/SlangpyTraining/weights.json 19 | /external/dx12-agility-sdk 20 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "donut"] 2 | path = external/donut 3 | url = https://github.com/NVIDIAGameWorks/donut 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # RTX Neural Shading Change Log 2 | 3 | ## 1.1.0 4 | - Added DX12 cooperative vector support using Preview Agility SDK. 5 | - Moved matrix conversion to GPU. 6 | 7 | ## 1.0.0 8 | 9 | Initial release. -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | 10 | cmake_minimum_required(VERSION 3.10) 11 | 12 | project( 13 | RtxNeuralShading 14 | DESCRIPTION "RTX Neural Shading" 15 | LANGUAGES CXX 16 | ) 17 | 18 | set(CMAKE_CXX_STANDARD 20) 19 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 20 | set(CMAKE_CXX_EXTENSIONS ON) 21 | 22 | option(ENABLE_DX12_COOP_VECTOR_PREVIEW "" OFF) 23 | option(DONUT_WITH_DX11 "Not supported in this SDK" OFF) 24 | option(DONUT_WITH_DX12 "DX12 is only supported with DX12_COOP_VECTOR_PREVIEW ON" OFF) 25 | option(DONUT_WITH_VULKAN "" ON) 26 | option(DONUT_WITH_STATIC_SHADERS "" ON) 27 | 28 | # Register our path for CMake modules 29 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/support/cmake") 30 | 31 | if (MSVC) 32 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /D_ITERATOR_DEBUG_LEVEL=1") 33 | endif() 34 | 35 | option(DONUT_WITH_ASSIMP "" OFF) 36 | 37 | if(WIN32) 38 | set(RTXNS_BINARY_DIR "${CMAKE_SOURCE_DIR}/bin/windows-x64" CACHE PATH "Output directory for the RTXNS build") 39 | else() 40 | set(RTXNS_BINARY_DIR "${CMAKE_SOURCE_DIR}/bin/linux-x64" CACHE PATH "Output directory for the RTXNS build") 41 | endif() 42 | 43 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG "${RTXNS_BINARY_DIR}") 44 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL "${RTXNS_BINARY_DIR}") 45 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE "${RTXNS_BINARY_DIR}") 46 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO "${RTXNS_BINARY_DIR}") 47 | 48 | set(SHADERMAKE_BIN_OUTPUT_PATH "${RTXNS_BINARY_DIR}/bin" CACHE STRING "Output directory for the ShaderMake executable") 49 | set(DONUT_SHADERS_OUTPUT_DIR "${RTXNS_BINARY_DIR}/bin/shaders/framework") 50 | 51 | # Get Slang 52 | set(SLANG_VERSION "2025.10") 53 | set(SLANG_URL_BASE "https://github.com/shader-slang/slang/releases/download/v${SLANG_VERSION}") 54 | if(WIN32) 55 | set(SLANG_URL "${SLANG_URL_BASE}/slang-${SLANG_VERSION}-windows-x86_64.zip") 56 | else() 57 | set(SLANG_URL "${SLANG_URL_BASE}/slang-${SLANG_VERSION}-linux-x86_64-glibc-2.17.tar.gz") 58 | endif() 59 | 60 | include("${CMAKE_CURRENT_SOURCE_DIR}/support/cmake/FetchPrebuildBinary.cmake") 61 | download_package(slang ${SLANG_URL}) 
62 | 
63 | if (WIN32)
64 | set(SLANGC_PATH "${slang_SOURCE_DIR}/bin/slangc.exe")
65 | if (ENABLE_DX12_COOP_VECTOR_PREVIEW)
66 | set(DONUT_WITH_DX12 ON)
67 | set(NVRHI_WITH_DX12 ON)
68 | else()
69 | # DX12 is only supported with DX12_COOP_VECTOR_PREVIEW
70 | set(DONUT_WITH_DX12 OFF)
71 | set(NVRHI_WITH_DX12 OFF)
72 | endif()
73 | else()
74 | set(SLANGC_PATH "${slang_SOURCE_DIR}/bin/slangc")
75 | endif()
76 | 
77 | if (NOT SLANGC_PATH)
78 | message(FATAL_ERROR "Slang compiler not found - this is required for CoopVec support.")
79 | else()
80 | message("Slang compiler found: ${SLANGC_PATH}")
81 | endif()
82 | 
83 | if (DONUT_WITH_DX12)
84 | # Get D3D Agility SDK Preview for Coop Vector support
85 | set(D3D_AGILITY_SDK_PREVIEW_VERSION "1.717.0-preview")
86 | set(DONUT_D3D_AGILITY_SDK_URL "https://www.nuget.org/api/v2/package/Microsoft.Direct3D.D3D12/${D3D_AGILITY_SDK_PREVIEW_VERSION}")
87 | set(DONUT_D3D_AGILITY_SDK_FETCH_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/dx12-agility-sdk" CACHE STRING "" FORCE)
88 | include("${CMAKE_CURRENT_SOURCE_DIR}/external/donut/cmake/FetchAgilitySDK.cmake")
89 | include("${CMAKE_CURRENT_SOURCE_DIR}/support/cmake/ConfigureAgilitySDK.cmake")
90 | 
91 | # Get DXC preview for SM6.9 support
92 | set(DXC_PREVIEW_VERSION "1.8.2505.28")
93 | set(DXC_PREVIEW_PATH "" CACHE STRING "Directory to fetch the DXC to, empty uses build directory default")
94 | include("${CMAKE_CURRENT_SOURCE_DIR}/support/cmake/FetchDXCPreview.cmake")
95 | 
96 | set(DXC_PATH "${DXC_PREVIEW_PATH}")
97 | 
98 | # Copy the preview DXC binaries next to the Slang compiler
99 | foreach(file_name IN ITEMS dxc.exe dxcompiler.dll dxil.dll)
100 | set(src "${DXC_PREVIEW_BIN_PATH}/${file_name}")
101 | set(dst "${slang_SOURCE_DIR}/bin/")
102 | if(EXISTS "${src}")
103 | configure_file("${src}" "${dst}" COPYONLY)
104 | else()
105 | message(WARNING "DXC binary not found: ${src}")
106 | endif()
107 | endforeach()
108 | endif()
109 | 
110 | add_subdirectory(external/donut)
111 | add_subdirectory(src)
112 | add_subdirectory(samples)
113 | set_property (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT SimpleInferencing)
114 | 
115 | file(WRITE "${CMAKE_SOURCE_DIR}/bin/slangc.bat" "${SLANGC_PATH} %*")
116 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RTX Neural Shading
2 | 
3 | RTX Neural Shading (RTXNS), also known as RTX Neural Shaders, is intended as a starting point for developers interested in bringing Machine Learning (ML) to their graphics applications. It provides a number of examples to help the reader understand how to train their own neural networks and then use those models to perform inference alongside their normal graphics rendering.
4 | 
5 | RTXNS uses the [Slang](https://shader-slang.com) shading language and utilizes either the DirectX Preview Agility SDK or the Vulkan Cooperative Vectors extension to provide access to the GPU's ML acceleration.
6 | 
7 | A number of examples are included which build upon each other, from a simple inference example to more complex examples showing how to train a neural network to represent a shader or a texture. Helper functions to facilitate building your own neural networks are also included.
8 | 
9 | Alongside the core samples is a SlangPy sample to demonstrate how to use Python and SlangPy for fast iteration and development of neural networks, which can then be integrated into RTXNS for inference.
10 | 
11 | When exploring RTXNS, it is assumed that the reader is already familiar with ML and neural networks.
12 | 
13 | ## Requirements
14 | 
15 | ### General
16 | [CMake v3.24.3][CMake] **|** [VS 2022][VS22] **|** [Slang v2025.10](https://shader-slang.com/tools/)
17 | 
18 | ### DirectX
19 | [DirectX Preview Agility SDK 1.717.0-preview](https://www.nuget.org/packages/Microsoft.Direct3D.D3D12/1.717.0-preview) **|** [Microsoft DXC 1.8.2505.28](https://www.nuget.org/packages/Microsoft.Direct3D.DXC/1.8.2505.28) **|** [Shader Model 6-9-Preview Driver](https://developer.nvidia.com/downloads/shadermodel6-9-preview-driver)
20 | 
21 | ### Vulkan
22 | GPU must support the Vulkan `VK_NV_cooperative_vector` extension (minimum NVIDIA RTX 20XX) **|** [Vulkan SDK 1.3.296.0](https://vulkan.lunarg.com/sdk/home) **|** Public Driver ≥ 572.16
23 | 
24 | ## Known Issues
25 | 05/30/2025: When updating from v1.0.0 to v1.1.0, it is recommended to delete the CMake cache to avoid build errors.
26 | 
27 | ## Project structure
28 | 
29 | | Directory                          | Details                                 |
30 | | ---------------------------------- | --------------------------------------- |
31 | | [/assets](assets)                  | _Asset files for samples_               |
32 | | [/docs](docs)                      | _Documentation for showcased tech_      |
33 | | [/samples](samples)                | _Samples showcasing usage of MLPs_      |
34 | | [/external/donut](external/donut)  | _Framework used for the examples_       |
35 | | [/external](external)              | _Helper dependencies for the examples_  |
36 | | [/src](src)                        | _Helper and utility functions_          |
37 | 
38 | ## Getting started
39 | 
40 | - [Quick start guide](docs/QuickStart.md) for building and running the neural shading samples.
41 | - [Library usage guide](docs/LibraryGuide.md) for using the helper functions.
42 | 
43 | ### External Resources
44 | 
45 | This project uses [Slang](https://shader-slang.com) and the Vulkan CoopVector extension. The following links provide more detail on these and other related technologies, for readers who want to dig deeper.
46 | 
47 | * [Slang User Guide](https://shader-slang.com/slang/user-guide/)
48 | 
49 | * [Automatic Differentiation](https://shader-slang.com/slang/user-guide/autodiff.html)
50 | 
51 | * [SlangPy](https://slangpy.readthedocs.io/en/latest/)
52 | 
53 | * [Vulkan `VK_NV_cooperative_vector` extension](https://registry.khronos.org/vulkan/specs/latest/man/html/VK_NV_cooperative_vector.html)
54 | 
55 | * [Donut](https://github.com/NVIDIAGameWorks/donut)
56 | 
57 | ## Contact
58 | 
59 | RTXNS is actively being developed.
Please report any issues directly through the GitHub issue tracker, and for any information or suggestions contact us at rtxns-sdk-support@nvidia.com.
60 | 
61 | ## Citation
62 | 
63 | Use the following BibTeX entry to cite the usage of RTXNS in published research:
64 | 
65 | ```bibtex
66 | @online{RTXNS,
67 |    title = {{{NVIDIA}}\textregistered{} {RTXNS}},
68 |    author = {{NVIDIA}},
69 |    year = 2025,
70 |    url = {https://github.com/NVIDIA-RTX/RTXNS},
71 |    urldate = {2025-02-03},
72 | }
73 | ```
74 | 
75 | ## License
76 | 
77 | See [LICENSE.md](LICENSE.MD)
78 | 
79 | [VS22]: https://visualstudio.microsoft.com/thank-you-downloading-visual-studio/?sku=Community&channel=Release&version=VS2022&source=VSLandingPage&passive=false&cid=2030
80 | 
81 | [CMake]: https://github.com/Kitware/CMake/releases/download/v3.24.3/cmake-3.24.3-windows-x86_64.msi
82 | 
--------------------------------------------------------------------------------
/assets/data/disney.ns.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/assets/data/disney.ns.bin
--------------------------------------------------------------------------------
/assets/data/nvidia-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/assets/data/nvidia-logo.png
--------------------------------------------------------------------------------
/docs/QuickStart.md:
--------------------------------------------------------------------------------
1 | # RTX Neural Shading: Quick Start Guide
2 | 
3 | ## Build steps
4 | 
5 | 1. Clone the project recursively:
6 | 
7 | ```
8 | git clone --recursive https://github.com/NVIDIA-RTX/RTXNS
9 | ```
10 | 
11 | 2. Configure and then generate the solution using the CMake GUI (or the CLI) by setting the repository root as _source_ and specifying a new _build_ directory in the root.
12 | 
13 | ```
14 | cd RTXNS
15 | mkdir build
16 | cd build
17 | cmake ..
18 | ```
19 | To enable DX12 Cooperative Vector support, set the `ENABLE_DX12_COOP_VECTOR_PREVIEW` option to ON:
20 | ```
21 | cmake -DENABLE_DX12_COOP_VECTOR_PREVIEW=ON
22 | ```
23 | 
24 | 3. Open build/RtxNeuralShading.sln in Visual Studio and build all projects, or build using the CMake CLI:
25 | 
26 | ```
27 | cmake --build .
28 | ```
29 | 
30 | 4. All of the binaries can be found in `\bin`, such as:
31 | 
32 | ```
33 | bin\Debug\SimpleInferencing.exe
34 | ```
35 | 5. Each of the samples can be built and launched as either DX12 or Vulkan with the respective command line switch: `-dx12` or `-vk`
36 | 
37 | ## About
38 | 
39 | All of the samples are built using Slang and can be compiled to either DX12 or Vulkan, using the DirectX Preview Agility SDK or the Vulkan Cooperative Vector extension respectively.
40 | 
41 | - [DirectX Preview Agility SDK](https://devblogs.microsoft.com/directx/directx12agility/).
42 | - [Vulkan Cooperative Vector extension](https://registry.khronos.org/vulkan/specs/latest/man/html/VK_NV_cooperative_vector.html).
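
Each sample drives the same small set of Slang building blocks from `src/NeuralShading_Shaders` through the `rtxns` namespace. As a taste of what that looks like, the sketch below condenses the inference path from the Shader Training sample's `DisneyMLP.slang`. The dimensions (3 hidden layers of 32 neurons, 30 encoded inputs, 4 outputs) are the ones that sample uses, but the function name and the `InferenceMLP` template argument order here are assumptions based on the sample documentation; consult `MLP.slang` for the authoritative signature.

```
// A condensed sketch of CoopVec inference, based on DisneyMLP.slang.
// Dimensions and the InferenceMLP template argument order are illustrative
// assumptions taken from the Shader Training sample and its documentation.
float4 EvaluateDisneyMLP(float params[5], ByteAddressBuffer mlpBuffer,
                         uint weightOffsets[4], uint biasOffsets[4])
{
    var inputParams = rtxns::EncodeFrequency(params); // 5 features -> 30 inputs

    var model = rtxns::mlp::InferenceMLP<
        3,                                       // hidden layers
        30,                                      // input neurons
        32,                                      // neurons per hidden layer
        4,                                       // output neurons
        CoopVecMatrixLayout::InferencingOptimal,
        CoopVecComponentType::Float16>
        (mlpBuffer, weightOffsets, biasOffsets);

    var out = model.forward(inputParams, rtxns::mlp::ReLUAct(), rtxns::mlp::ExponentialAct());
    return float4(out[0], out[1], out[2], out[3]);
}
```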
43 | 
44 | ## Driver Requirements
45 | - Using the DirectX Preview Agility SDK requires a Shader Model 6.9 preview [driver](https://developer.nvidia.com/downloads/shadermodel6-9-preview-driver)
46 | - The Vulkan Cooperative Vector extension requires a release [driver](https://www.nvidia.com/en-gb/geforce/drivers) from R570 onwards
47 | 
48 | ### Samples
49 | 
50 | | Sample Name | Output | Description |
51 | | ----------- | ------ | ----------- |
52 | | [Simple Inferencing](SimpleInferencing.md) | ![](simple_inferencing.png) | This sample demonstrates how to implement an inference shader using some of the low-level building blocks from RTXNS. The sample loads a trained network from a file and uses the network to approximate a Disney BRDF shader. The sample is interactive; the light source can be rotated and various material parameters can be modified at runtime. |
53 | | [Simple Training](SimpleTraining.md) | ![](simple_training.png) | This sample builds on the Simple Inferencing sample to provide an introduction to training a neural network for use in a shader. The network replicates a transformed texture. |
54 | | [Shader Training](ShaderTraining.md) | ![](shader_training.png) | This sample extends the techniques shown in the Simple Training example and introduces Slang's AutoDiff functionality, via a full MLP (Multi Layered Perceptron) abstraction. The MLP is implemented using the `CoopVector` training code previously introduced and provides a simple interface for training networks with Slang. The sample creates a network and trains a model on the Disney BRDF shader that was used in the Simple Inferencing sample. |
55 | | [SlangPy Training](SlangpyTraining.md) | ![](slangpy_training.jpg) | This sample shows how to create and train network architectures in Python using SlangPy. This lets you experiment with different networks, encodings and more using the building blocks from RTXNS, but without needing to change or rebuild C++ code. As a demonstration, this sample instantiates multiple different network architectures and trains them side-by-side on the same data. It also shows one possible approach to exporting the network parameters and architecture to disk so they can be loaded in C++. |
56 | 
57 | ### Tutorial
58 | 
59 | * [Tutorial](Tutorial.md)
60 |   A tutorial to help guide you to create your own neural shader, based on the [Shader Training](ShaderTraining.md) example.
61 | 
62 | ### Library
63 | 
64 | * [Library](LibraryGuide.md)
65 |   A guide to using the library / helper functions to create and manage your neural networks.
66 | 
--------------------------------------------------------------------------------
/docs/ShaderTraining.md:
--------------------------------------------------------------------------------
1 | # RTX Neural Shading: Shader Training Example
2 | 
3 | ## Purpose
4 | 
5 | This sample extends the techniques shown in the [Simple Training](SimpleTraining.md) example and introduces Slang's AutoDiff functionality, via a full MLP (Multi Layered Perceptron) abstraction.
The MLP is implemented using the `CoopVector` training code previously introduced and provides a simple interface for training networks with Slang. The sample creates a network and trains a model on the Disney BRDF shader that was used in the [Simple Inferencing](SimpleInferencing.md) sample.
6 | 
7 | ![Shader Training Output](shader_training.png)
8 | 
9 | When the executable is built and run, the output shows three images: the image on the left is a sphere lit with the full Disney BRDF shader, the middle image is the same sphere lit with the trained neural network, and the image on the right shows the loss delta. A UI allows some control of the material properties and provides buttons to pause and reset the training, as well as to save/load the current network.
10 | 
11 | ## Training Flow
12 | 
13 | To create and train a neural network with RTXNS, several stages are needed, which are described in more detail below. This differs from the previous [Simple Training](SimpleTraining.md) example, which had a specific compute shader pass for training and another for inference. In this example, the training and optimization passes are still compute based, but the inference is integrated into an existing pixel shader.
14 | 
15 | 1. Create the host-side neural network storage and initialize it
16 | 
17 | 2. Create a device-optimal layout and GPU buffer
18 | 
19 | 3. Convert the host layout network to the device-optimal layout on the GPU
20 | 
21 | 4. Create auxiliary buffers for loss gradients and the optimizer pass
22 | 
23 | 5. Run batches of the training shader followed by the optimizer shader on random inputs, adjusting for the loss at each epoch
24 | 
25 | 6. Render the sphere with the inference pixel shader to generate the output image
26 | 
27 | ## Network Configuration
28 | 
29 | The network details can be found in [NetworkConfig.h](../samples/ShaderTraining/NetworkConfig.h) and are configured as follows:
30 | 
31 | | Property       | Value   | Notes                                       |
32 | | -------------- | ------- | ------------------------------------------- |
33 | | Input Features | 5       | 5 input parameters                          |
34 | | Input Neurons  | 30      | 5 input parameters encoded to 6 inputs each |
35 | | Output Neurons | 4       | 4 BRDF values                               |
36 | | Hidden Neurons | 32      |                                             |
37 | | Hidden Layers  | 3       |                                             |
38 | | Precision      | float16 |                                             |
39 | 
40 | ## Application Code
41 | 
42 | On the host, the setup of the neural network is quite simple and broadly similar to [Simple Training](SimpleTraining.md), so we shall only highlight the differences in this document.
43 | 
44 | ### Training Loop
45 | 
46 | After creating the appropriate pipelines and allocating the GPU buffers, the training loop is similar to the Simple Training example. The training and optimization passes are executed multiple times per frame (`g_trainingStepsPerFrame = 100`) to speed up the training time whilst also running the inference pass at a reasonable rate to see the model convergence.
47 | 
48 | ```
49 | for (int i = 0; i < g_trainingStepsPerFrame; ++i)
50 | {
51 |     nvrhi::ComputeState state;
52 |     ...
53 |     // Training pass
54 |     state.bindings = { m_trainingPass.bindingSet };
55 |     state.pipeline = m_trainingPass.pipeline;
56 |     ...
57 |     m_commandList->setComputeState(state);
58 |     m_commandList->dispatch(m_batchSize / 64, 1, 1);
59 |     ...
60 |     // Optimizer pass
61 |     state.bindings = { m_optimizerPass.bindingSet };
62 |     state.pipeline = m_optimizerPass.pipeline;
63 |     ...
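    // One optimizer thread updates one network parameter: computeOptimizer.slang
    // declares [numthreads(32, 1, 1)], hence the div_ceil(m_totalParameterCount, 32)
    // thread-group count in the dispatch below.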
64 |     m_commandList->setComputeState(state);
65 |     m_commandList->dispatch(div_ceil(m_totalParameterCount, 32), 1, 1);
66 |     ...
67 | }
68 | ```
69 | 
70 | Some of the timer-related queries have been removed from the code for ease of understanding.
71 | 
72 | After the training pass, the 2 spheres are rendered as expected, but using 2 different pipelines: `m_directPass` for the native Disney BRDF shader and `m_inferencePass` for the trained neural model.
73 | 
74 | ## Shader Code
75 | 
76 | The neural network in this sample is trying to encode the following:
77 | 
78 | ```
79 | Disney(NdotL, NdotV, NdotH, LdotH, roughness);
80 | ```
81 | 
82 | The shader code extends the concepts shown in the [Simple Training](SimpleTraining.md) example by using Slang's [AutoDiff](https://shader-slang.org/slang/user-guide/autodiff.html) feature to create a templated training class `TrainingMLP`, implemented in [MLP.slang](../src/NeuralShading_Shaders/MLP.slang), that can be used to help train your own models. Using the AutoDiff feature means we don't need to implement a full backwards pass containing all of the derivative activation functions, as it is automatically derived for you.
83 | 
84 | The 3 main shaders are: [training](../samples/ShaderTraining/computeTraining.slang), [optimizer](../samples/ShaderTraining/computeOptimizer.slang) and [inference](../samples/ShaderTraining/renderInference.slang).
85 | 
86 | ### Training
87 | 
88 | The training shader starts by generating the random inputs and encoding them, ready for passing to the neural network.
89 | 
90 | ```
91 | //----------- Training step
92 | float params[INPUT_FEATURES] = {NdotL, NdotV, NdotH, LdotH, roughness};
93 | var inputParams = rtxns::EncodeFrequency(params);
94 | ```
95 | 
96 | Next, the model is created and the inputs are passed to the model for the forward pass.
97 | 
98 | ```
99 | var model = rtxns::mlp::TrainingMLP<NUM_HIDDEN_LAYERS,
100 |     INPUT_NEURONS, HIDDEN_NEURONS, OUTPUT_NEURONS,
101 |     CoopVecMatrixLayout::TrainingOptimal, CoopVecComponentType::Float16>(
102 |     gMLPParams,
103 |     gMLPParamsGradients,
104 |     rtxns::UnpackArray(gConst.weightOffsets),
105 |     rtxns::UnpackArray(gConst.biasOffsets));
106 | ```
107 | 
108 | The `TrainingMLP` is heavily templated, but the templated parameters consist of:
109 | 
110 | * Number of hidden layers
111 | * Number of input neurons
112 | * Number of neurons per hidden layer
113 | * Number of output neurons
114 | * Matrix layout
115 | * Precision
116 | 
117 | The non-templated parameters consist of:
118 | 
119 | * Weight/Bias buffer
120 | * Gradient buffer
121 | * Weight offsets per layer
122 | * Bias offsets per layer
123 | 
124 | Once the model has been created, executing the forward pass is trivial and involves assigning templated activation functions to the forward pass before passing the input parameters in. The detailed implementation is described in the [Library Guide](LibraryGuide.md).
125 | 
126 | ```
127 | var hiddenActivation = rtxns::mlp::ReLUAct();
128 | var finalActivation = rtxns::mlp::ExponentialAct();
129 | 
130 | var outputParams = model.forward(inputParams, hiddenActivation, finalActivation);
131 | ```
132 | 
133 | To generate the loss gradient, this example evaluates the derivative of the `L2Relative` loss between the output of the actual Disney BRDF shader and the output of the forward pass.
134 | 
135 | ```
136 | float4 predictedDisney = { outputParams[0], outputParams[1], outputParams[2], outputParams[3] };
137 | 
138 | float4 lossGradient = rtxns::mlp::L2Relative.deriv(actualDisney, predictedDisney, float4(LOSS_SCALE / (gConst.batchSize * 4)) * COMPONENT_WEIGHTS);
139 | ```
140 | 
141 | Finally, the loss gradient and the input vector are passed through the model's backward-propagation function to update the gradient parameters.
142 | 
143 | ```
144 | model.backward(inputParams, hiddenActivation, finalActivation, rtxns::HCoopVec(lossGradient[0], lossGradient[1], lossGradient[2], lossGradient[3]));
145 | ```
146 | 
147 | ### Optimizer
148 | 
149 | The optimizer shader is no different from the one used in the [Simple Training](SimpleTraining.md) example.
150 | 
151 | ```
152 | void adam_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
153 | {
154 |     uint i = dispatchThreadID.x;
155 |     if (i >= maxParamSize)
156 |         return;
157 | 
158 |     float gradient = (float)gMLPParamsGradients[i];
159 |     gMLPParamsGradients[i] = half(0.0);
160 | 
161 |     float weightbias = gMLPParams32[i];
162 | 
163 |     optimizers::Adam optimizer = optimizers::Adam(gMoments1, gMoments2, learningRate, LOSS_SCALE);
164 | 
165 |     float adjustedWeightbias = optimizer.step(weightbias, i, gradient, currentStep);
166 | 
167 |     gMLPParams32[i] = adjustedWeightbias;
168 |     gMLPParams[i] = (half)adjustedWeightbias;
169 | }
170 | ```
171 | 
172 | ### Inference
173 | 
174 | The inference pass is nearly identical to the forward pass of the training shader. It currently uses the `CoopVecMatrixLayout::TrainingOptimal` layout, as it is run directly after a batch of training without converting the weights to an inference layout, but the default layout for inference should be `CoopVecMatrixLayout::InferencingOptimal`.
175 | 
176 | ```
177 | float4 DisneyMLP(
178 |     float NdotL, float NdotV, float NdotH, float LdotH, float roughness, ByteAddressBuffer mlpBuffer,
179 |     uint weightOffsets[HIDDEN_LAYERS+1], uint biasOffsets[HIDDEN_LAYERS+1])
180 | {
181 |     // Calculate approximated core shader part using MLP
182 |     float params[INPUT_FEATURES] = { NdotL, NdotV, NdotH, LdotH, roughness };
183 | 
184 |     var inputParams = rtxns::EncodeFrequency(params);
185 | 
186 |     var model = rtxns::mlp::InferenceMLP<
187 |         HIDDEN_LAYERS,
188 |         INPUT_FEATURES * FREQUENCY_EXPANSION,
189 |         HIDDEN_NEURONS,
190 |         OUTPUT_NEURONS,
191 |         CoopVecMatrixLayout::TrainingOptimal,
192 |         CoopVecComponentType::Float16>
193 |         (mlpBuffer, weightOffsets, biasOffsets);
194 | 
195 |     var outputParams = model.forward(inputParams, rtxns::mlp::ReLUAct(), rtxns::mlp::ExponentialAct());
196 |     return float4(outputParams[0], outputParams[1], outputParams[2], outputParams[3]);
197 | }
198 | ```
199 | 
--------------------------------------------------------------------------------
/docs/Tutorial.md:
--------------------------------------------------------------------------------
1 | # RTX Neural Shading: How to Write Your First Neural Shader
2 | 
3 | ## Purpose
4 | 
5 | Using [Shader Training](ShaderTraining.md) as the basis of this tutorial, we will briefly discuss an approach to writing your first neural shader.
6 | 
7 | The main areas we will focus on are:
8 | 
9 | 1. Extracting the key features from the shader to be trained
10 | 
11 | 2. Modifying the network configuration
12 | 
13 | 3. Modifying the activation and loss functions
14 | 
15 | It is outside the scope of this document to discuss how AI training and optimization works; instead we will focus on modifying the existing sample to configure and train the network with different content.
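
For orientation, the core of the training step that the following sections modify, condensed from the Shader Training sample's [computeTraining.slang](../samples/ShaderTraining/computeTraining.slang), has this shape (the model construction and the exact scale arguments are elided here; see [ShaderTraining.md](ShaderTraining.md) for the full listing):

```
// Condensed from computeTraining.slang: one training step per thread.
float params[INPUT_FEATURES] = { NdotL, NdotV, NdotH, LdotH, roughness };
var inputParams = rtxns::EncodeFrequency(params);    // encode the key features

// Forward pass through the MLP (model construction elided).
var outputParams = model.forward(inputParams, hiddenActivation, finalActivation);

// Loss derivative against the ground-truth shader output, then back-propagation.
float4 lossGradient = rtxns::mlp::L2Relative.deriv(actualDisney, predictedDisney, /* scale */);
model.backward(inputParams, hiddenActivation, finalActivation, /* loss gradient */);
```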
16 | 
17 | ## Extracting the Key Features for Training Input
18 | 
19 | When implementing the Disney BRDF for use in the [Shader Training](ShaderTraining.md) example, the first task was feature extraction: deciding which features from the shader should be inferred by the network and which should be calculated directly, to ensure the network is not over-specialized or overly complex. The network for the Disney BRDF takes inputs such as the `view`, `light` and `normal` vectors as well as `material roughness`. Other variables, such as `light intensity`, `material metallicness` and various `material color` components, have been left as part of the shader. This is a balancing act which may require some experimentation.
20 | 
21 | Once the key features are identified as potential training inputs, look to optimize them where possible by reducing their form and scaling them to be in the range `0 - 1` or `-1 - 1`, which networks prefer. In the Disney BRDF, this was done by recognizing that the input vectors were always normalized and used in their dot-product form, so the inputs were reduced from 3 `float3` vectors to 4 `float` dot products.
22 | 
23 | Next, the network inputs may benefit from encoding, which research has shown to improve the performance of the network. The library provides 2 encoders, `EncodeFrequency` and `EncodeTriangle`, which encode the inputs into some form of wave function. The shader training example uses the frequency encoder, which increases the number of inputs by a factor of 6 but provides a better network as a result. You should experiment with encoders to find the one suitable for your dataset.
24 | 
25 | At this point, you should know the number of (encoded) input parameters and output parameters, so it is time to configure the network.
26 | 
27 | ## Modifying the Network Configuration
28 | 
29 | The network configuration is stored in [NetworkConfig.h](../samples/ShaderTraining/NetworkConfig.h) and may require modification. Some elements are fixed for your dataset, like the input and output neuron counts, and others are available for configuration. In the provided samples, the configuration is hard-coded for ease of understanding, but in a production system it would be expected to be a configurable part of the training pipeline.
30 | 
31 | These are the fixed configuration parameters that are directly tied to the shader you are trying to train from:
32 | 
33 | - `INPUT_NEURONS` should equal the number of encoded input parameters from above that are directly passed into the network.
34 | 
35 | - `OUTPUT_NEURONS` should equal the number of output parameters that the network generates. This may be an RGB triple, or just a number of unconnected outputs like for the Disney BRDF.
36 | 
37 | The following parameters are available for experimentation and should be modified to find suitable settings for the network you are trying to train:
38 | 
39 | - `NUM_HIDDEN_LAYERS` - The number of hidden layers that make up the network.
40 | 
41 | - `HIDDEN_NEURONS` - The number of neurons in the hidden layers of the network. Changing this can make significant differences to the accuracy and cost of your network.
42 | 
43 | - `LEARNING_RATE` - This should be tuned to improve convergence of your model.
44 | 
45 | In future versions of the library, the precision of the neurons may be alterable, which could change the quality and performance of the network. The current version is fixed to `float16`.
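
To make the shape of such a change concrete, here is a hypothetical configuration for a network that learns an RGB texture from a 2D UV input. The values are illustrative assumptions for this example, not settings taken from any of the samples:

```
// Hypothetical NetworkConfig.h values for a UV -> RGB texture network.
#define INPUT_FEATURES 2                   // (u, v) texture coordinate
#define INPUT_NEURONS (INPUT_FEATURES * 6) // 6x from frequency encoding
#define OUTPUT_NEURONS 3                   // one neuron per RGB channel

#define HIDDEN_NEURONS 64                  // wider layers to capture texture detail
#define NUM_HIDDEN_LAYERS 4
#define LEARNING_RATE 1e-3f
```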
46 | 
47 | Changing any of these parameters should not require any further code changes, as the defines are shared amongst the C++ and shader code; they will just require a re-compile. The exception may be when changing the size of the input/output `CoopVec`s, as any code that dereferences their elements directly will need updating, such as:
48 | 
49 | ```
50 | float4 predictedDisney = { outputParams[0], outputParams[1], outputParams[2], outputParams[3] };
51 | ```
52 | 
53 | As always, experimentation will be required to find the right set of configuration parameters for the optimal training of your network.
54 | 
55 | ## Modifying the Activation and Loss Functions
56 | 
57 | The Shader Training example uses the `TrainingMLP`, which abstracts much of the training shader code for the user:
58 | 
59 | ```
60 | var model = rtxns::mlp::TrainingMLP<NUM_HIDDEN_LAYERS,
61 |     INPUT_NEURONS, HIDDEN_NEURONS, OUTPUT_NEURONS,
62 |     CoopVecMatrixLayout::TrainingOptimal, CoopVecComponentType::Float16>(
63 |     gMLPParams,
64 |     gMLPParamsGradients,
65 |     rtxns::UnpackArray(gConst.weightOffsets),
66 |     rtxns::UnpackArray(gConst.biasOffsets));
67 | 
68 | var hiddenActivation = rtxns::mlp::ReLUAct();
69 | var finalActivation = rtxns::mlp::ExponentialAct();
70 | 
71 | var outputParams = model.forward(inputParams, hiddenActivation, finalActivation);
72 | ```
73 | 
74 | The activation functions (`ReLUAct` and `ExponentialAct`) are passed into the model's forward and backward passes for use with the `TrainingMLP` and `InferenceMLP`. These can be found in [CooperativeVectorFunctions.slang](../src/NeuralShading_Shaders/CooperativeVectorFunctions.slang) and extended as necessary. The current version of RTXNS provides a limited set of activation functions, but these can be examined and modified to support more activation functions as required.
75 | 
76 | The choice of loss function will depend on your dataset. The Simple Training example uses a simple L2 loss function, whereas the Shader Training example uses a more complex L2 relative loss function. Any loss function can be trivially coded in Slang to help tune your shader; a sketch of a custom loss is shown at the end of this tutorial.
77 | 
78 | ## Hyper Parameters
79 | 
80 | These are some of the hyper parameters that are available for tuning for your dataset.
81 | 
82 | | Parameter                   | Value            |
83 | | --------------------------- | ---------------- |
84 | | HIDDEN_NEURONS              | 32               |
85 | | NUM_HIDDEN_LAYERS           | 3                |
86 | | LEARNING_RATE               | 1e-2f            |
87 | | BATCH_SIZE                  | (1 << 16)        |
88 | | BATCH_COUNT                 | 100              |
89 | | Hidden Activation Functions | ReLUAct()        |
90 | | Final Activation Functions  | ExponentialAct() |
91 | | Loss Function               | L2Relative()     |
92 | 
93 | ## Summary
94 | 
95 | The Shader Training sample is a good place to start when training your own neural shader. It will require some thought as to how to decompose your shader into network inputs and shader inputs, and then the network can be re-configured through experimentation to find the model that best handles your dataset.
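
As referenced above, a custom loss can be written alongside the existing ones. The sketch below is an illustrative L1 loss derivative that mirrors how `L2Relative.deriv` is called from `computeTraining.slang`; the struct name and the exact `deriv` signature are assumptions for this example, not part of the RTXNS library:

```
// Sketch of a custom L1 loss derivative (illustrative; not in the library).
// d/dPredicted |predicted - target| = sign(predicted - target), scaled the
// same way as the L2Relative call in computeTraining.slang.
struct L1Loss
{
    static float4 deriv(float4 target, float4 predicted, float4 scale)
    {
        return sign(predicted - target) * scale;
    }
}
```

It could then be swapped in for `rtxns::mlp::L2Relative.deriv(...)` in the training shader without any other changes.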
96 | -------------------------------------------------------------------------------- /docs/shader_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/shader_training.png -------------------------------------------------------------------------------- /docs/simple_inferencing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/simple_inferencing.png -------------------------------------------------------------------------------- /docs/simple_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/simple_training.png -------------------------------------------------------------------------------- /docs/simple_training_trained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/simple_training_trained.png -------------------------------------------------------------------------------- /docs/slangpy_training.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-RTX/RTXNS/d8d2d956d11b1c1a0b043df1a1bf003f82b36c1c/docs/slangpy_training.jpg -------------------------------------------------------------------------------- /samples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | 10 | add_subdirectory(SimpleInferencing) 11 | add_subdirectory(ShaderTraining) 12 | add_subdirectory(SimpleTraining) 13 | add_subdirectory(SlangpyTraining) 14 | -------------------------------------------------------------------------------- /samples/ShaderTraining/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | 10 | include(../../external/donut/compileshaders.cmake) 11 | 12 | set(shader_includes 13 | ${SAMPLES_SHADER_INCLUDE_DIR} 14 | ${CMAKE_CURRENT_LIST_DIR} 15 | ) 16 | 17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" ) 18 | 19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " ) 20 | 21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " ) 22 | 23 | set(project ShaderTraining) 24 | set(folder "Samples/ShaderTraining") 25 | 26 | file(GLOB ${project}_shaders "*.slang") 27 | file(GLOB ${project}_sources "*.cpp" "*.h") 28 | 29 | donut_compile_shaders_all_platforms( 30 | TARGET ${project}_shaders 31 | CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg 32 | INCLUDES ${shader_includes} 33 | FOLDER ${folder} 34 | OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project} 35 | SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS} 36 | SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV} 37 | SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL} 38 | SOURCES ${${project}_shaders} 39 | SLANG 40 | ) 41 | 42 | add_executable(${project} WIN32 ${${project}_sources}) 43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils) 44 | add_dependencies(${project} ${project}_shaders) 45 | set_target_properties(${project} PROPERTIES FOLDER ${folder}) 46 | 47 | if (MSVC) 48 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP") 49 | endif() -------------------------------------------------------------------------------- /samples/ShaderTraining/Disney.slang: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | */ 10 | 11 | //----------- Core part of the shader 12 | 13 | const static float PI = 3.14159265358979323846; 14 | 15 | float SchlickFresnel(float u) 16 | { 17 | float m = clamp(1 - u, 0, 1); 18 | float m2 = m * m; 19 | return m2 * m2 * m; // pow(m,5) 20 | } 21 | 22 | float Gtr1(float NdotH, float a) 23 | { 24 | if (a >= 1) 25 | { 26 | return 1 / PI; 27 | } 28 | float a2 = a * a; 29 | float t = 1 + (a2 - 1) * NdotH * NdotH; 30 | return (a2 - 1) / (PI * log(a2) * t); 31 | } 32 | 33 | float Gtr2(float NdotH, float ax) 34 | { 35 | float a = ax * (1 / ax / ax * (1 - NdotH * NdotH) + NdotH * NdotH); 36 | return 1 / (PI * a * a); 37 | } 38 | 39 | float SmithGGX(float NdotV, float alphaG) 40 | { 41 | float a = alphaG * alphaG; 42 | float b = NdotV * NdotV; 43 | return 1 / (NdotV + sqrt(a + b - a * b)); 44 | } 45 | 46 | float SmithGGXAnisotropy(float NdotV, float ax) 47 | { 48 | return 1 / (NdotV + sqrt(ax * ax * (1 - NdotV * NdotV) + NdotV * NdotV)); 49 | } 50 | 51 | float4 Disney(float NdotL, float NdotV, float NdotH, float LdotH, float roughness) 52 | { 53 | float FL = SchlickFresnel(NdotL), FV = SchlickFresnel(NdotV); 54 | float Fss90 = LdotH * LdotH * roughness; 55 | float Fss = lerp(1.0f, Fss90, FL) * lerp(1.0f, Fss90, FV); 56 | float ss = 1.25f * (Fss * (1.f / (NdotL + NdotV) - .5f) + .5f); 57 | 58 | // specular 59 | float ax = max(.001f, roughness * roughness); 60 | float Ds = Gtr2(NdotH, ax); 61 | float FH = SchlickFresnel(LdotH); 62 | float Gs = SmithGGXAnisotropy(NdotL, ax); 63 | Gs *= SmithGGXAnisotropy(NdotV, ax); 64 | 65 | // clearcoat (ior = 1.5 -> F0 = 0.04) 66 | float Dr = Gtr1(NdotH, .01f); 67 | float Fr = lerp(.04f, 1.0f, FH); 68 | float Gr = SmithGGX(NdotL, .25f) * SmithGGX(NdotV, .25f); 69 | 70 | return float4((1 / PI) * ss, Gs * Ds, FH, .25 * Gr * Fr * Dr); 71 | } 72 | -------------------------------------------------------------------------------- /samples/ShaderTraining/DisneyMLP.slang: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | */
10 | 
11 | import MLP;
12 | import CooperativeVectorFunctions;
13 | import Activation;
14 | import Utils;
15 | 
16 | // 5 inputs are passed into this function; NdotL, NdotV, NdotH, LdotH, roughness
17 | #define INPUT_FEATURES 5
18 | 
19 | // The output is float4
20 | #define OUTPUT_NEURONS 4
21 | 
22 | // EncodeFrequency expands the input by 6 per input feature
23 | #define FREQUENCY_EXPANSION 6
24 | 
25 | float4 DisneyMLP(
26 |     float NdotL, float NdotV, float NdotH, float LdotH, float roughness, ByteAddressBuffer mlpBuffer,
27 |     uint weightOffsets[HIDDEN_LAYERS+1], uint biasOffsets[HIDDEN_LAYERS+1])
28 | {
29 |     // Calculate approximated core shader part using MLP
30 |     float params[INPUT_FEATURES] = { NdotL, NdotV, NdotH, LdotH, roughness };
31 | 
32 |     var inputParams = rtxns::EncodeFrequency(params);
33 | 
34 |     var model = rtxns::mlp::InferenceMLP<
35 |         HIDDEN_LAYERS,
36 |         INPUT_FEATURES * FREQUENCY_EXPANSION,
37 |         HIDDEN_NEURONS,
38 |         OUTPUT_NEURONS,
39 |         CoopVecMatrixLayout::TrainingOptimal,
40 |         CoopVecComponentType::Float16>
41 |         (mlpBuffer, weightOffsets, biasOffsets);
42 | 
43 |     var outputParams = model.forward(inputParams, rtxns::mlp::ReLUAct(), rtxns::mlp::ExponentialAct());
44 |     return float4(outputParams[0], outputParams[1], outputParams[2], outputParams[3]);
45 | }
46 | 
--------------------------------------------------------------------------------
/samples/ShaderTraining/NetworkConfig.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 | 
11 | #define INPUT_FEATURES 5
12 | #define INPUT_NEURONS (INPUT_FEATURES * 6) // 6* from Frequency Encoding
13 | #define OUTPUT_NEURONS 4
14 | 
15 | #define HIDDEN_NEURONS 32
16 | #define NUM_HIDDEN_LAYERS 3
17 | #define BATCH_SIZE (1 << 16)
18 | #define BATCH_COUNT 100
19 | 
20 | #define LEARNING_RATE 1e-2f
21 | #define COMPONENT_WEIGHTS float4(1.f, 10.f, 1.f, 5.f)
22 | 
23 | #define NUM_TRANSITIONS (NUM_HIDDEN_LAYERS + 1)
24 | #define NUM_TRANSITIONS_ALIGN4 ((NUM_TRANSITIONS + 3) / 4)
25 | #define LOSS_SCALE 128.0
26 | 
27 | struct DirectConstantBufferEntry
28 | {
29 |     // Scene setup
30 |     float4x4 viewProject;
31 |     float4x4 view;
32 |     float4 cameraPos;
33 | 
34 |     // Light setup
35 |     float4 lightDir;
36 |     float4 lightIntensity;
37 | 
38 |     // Material props
39 |     float4 baseColor;
40 |     float specular = 0;
41 |     float roughness = 0;
42 |     float metallic = 0;
43 | 
44 |     // Alignment
45 |     float pad = 0;
46 | };
47 | 
48 | struct InferenceConstantBufferEntry : DirectConstantBufferEntry
49 | {
50 |     // Neural weight & bias offsets
51 |     uint4 weightOffsets[NUM_TRANSITIONS_ALIGN4];
52 |     uint4 biasOffsets[NUM_TRANSITIONS_ALIGN4];
53 | };
54 | 
55 | struct TrainingConstantBufferEntry
56 | {
57 |     uint4 weightOffsets[NUM_TRANSITIONS_ALIGN4];
58 |     uint4 biasOffsets[NUM_TRANSITIONS_ALIGN4];
59 |     uint32_t maxParamSize;
60 |     float learningRate;
61 |     float currentStep;
62 |     uint32_t batchSize;
63 |     uint64_t seed;
64 | };
65 | 
--------------------------------------------------------------------------------
/samples/ShaderTraining/computeOptimizer.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | import Optimizers;
15 | 
16 | DECLARE_CBUFFER(TrainingConstantBufferEntry, gConst, 0, 0);
17 | RWBuffer<half> gMLPParams : REGISTER_UAV(0, 0);
18 | RWBuffer<float> gMLPParams32 : REGISTER_UAV(1, 0);
19 | RWBuffer<half> gMLPParamsGradients : REGISTER_UAV(2, 0);
20 | RWBuffer<float> gMoments1 : REGISTER_UAV(3, 0);
21 | RWBuffer<float> gMoments2 : REGISTER_UAV(4, 0);
22 | 
23 | [numthreads(32, 1, 1)]
24 | void adam_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
25 | {
26 |     uint i = dispatchThreadID.x;
27 |     if (i >= gConst.maxParamSize)
28 |         return;
29 | 
30 |     float gradient = (float)gMLPParamsGradients[i];
31 |     gMLPParamsGradients[i] = half(0.0);
32 | 
33 |     if (isfinite(gradient))
34 |     {
35 |         float weightbias = gMLPParams32[i];
36 | 
37 |         optimizers::Adam optimizer = optimizers::Adam(gMoments1, gMoments2, gConst.learningRate, LOSS_SCALE);
38 | 
39 |         float adjustedWeightbias = optimizer.step(weightbias, i, gradient, gConst.currentStep);
40 | 
41 |         gMLPParams32[i] = adjustedWeightbias;
42 |         gMLPParams[i] = (half) adjustedWeightbias;
43 |     }
44 | }
45 | 
--------------------------------------------------------------------------------
/samples/ShaderTraining/computeTraining.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | import CooperativeVectorAutoDiff;
15 | import CooperativeVectorFunctions;
16 | import Utils;
17 | import Activation;
18 | import MLP;
19 | import Loss;
20 | import PCG32;
21 | import Disney;
22 | 
23 | DECLARE_CBUFFER(TrainingConstantBufferEntry, gConst, 0, 0);
24 | ByteAddressBuffer gMLPParams : REGISTER_SRV(0, 0);
25 | RWByteAddressBuffer gMLPParamsGradients : REGISTER_UAV(0, 0);
26 | 
27 | [shader("compute")]
28 | [numthreads(64, 1, 1)]
29 | void main_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
30 | {
31 |     //----------- Randomly generate input parameters
32 |     uint idx = dispatchThreadID.x;
33 |     PCG32 rng = PCG32(gConst.seed, idx);
34 | 
35 |     // Using tangent coordinate system. N = (0,0,1)
36 |     // L is arbitrary, but (N,L) >= 0 => L.z > 0, so generate random L in XZ plane's first quadrant
37 |     float3 L;
38 |     L.y = 0.f;
39 |     sincos(rng.nextFloat()*PI/2, L.z, L.x);
40 | 
41 |     // V is random direction, but (N,V) >= 0 => V.z > 0
42 |     float sa, ca; // Azimuth [-PI,PI]
43 |     sincos(-PI + 2 * PI * rng.nextFloat(), sa, ca);
44 |     float se, ce; // Elevation [0,PI/2]
45 |     sincos(PI/2 * rng.nextFloat(), se, ce);
46 |     float3 V = float3(ce*ca, ce*sa, se);
47 | 
48 |     float NdotL = L.z;
49 |     float NdotV = V.z;
50 | 
51 |     float3 H = normalize(L+V);
52 |     float NdotH = H.z;
53 |     float LdotH = dot(L,H);
54 | 
55 |     float roughness = rng.nextFloat()*0.7f+0.3f;
56 | 
57 |     //----------- Calculate core shader part DIRECTLY
58 |     float4 actualDisney = Disney(NdotL, NdotV, NdotH, LdotH, roughness);
59 | 
60 |     //----------- Training step
61 |     float params[INPUT_FEATURES] = {NdotL, NdotV, NdotH, LdotH, roughness};
62 |     var inputParams = rtxns::EncodeFrequency(params);
63 | 
64 |     var model = rtxns::mlp::TrainingMLP<NUM_HIDDEN_LAYERS,
65 |         INPUT_NEURONS, HIDDEN_NEURONS, OUTPUT_NEURONS,
66 |         CoopVecMatrixLayout::TrainingOptimal, CoopVecComponentType::Float16>(
67 |         gMLPParams,
68 |         gMLPParamsGradients,
69 |         rtxns::UnpackArray(gConst.weightOffsets),
70 |         rtxns::UnpackArray(gConst.biasOffsets));
71 | 
72 |     var hiddenActivation = rtxns::mlp::ReLUAct();
73 |     var finalActivation = rtxns::mlp::ExponentialAct();
74 | 
75 |     var outputParams = model.forward(inputParams, hiddenActivation, finalActivation);
76 | 
77 |     float4 predictedDisney = { outputParams[0], outputParams[1], outputParams[2], outputParams[3] };
78 | 
79 |     float4 lossGradient = rtxns::mlp::L2Relative.deriv(actualDisney, predictedDisney, float4(LOSS_SCALE / (gConst.batchSize * 4)) * COMPONENT_WEIGHTS);
80 | 
81 |     model.backward(inputParams, hiddenActivation, finalActivation, rtxns::HCoopVec(lossGradient[0], lossGradient[1], lossGradient[2], lossGradient[3]));
82 | }
83 | 
--------------------------------------------------------------------------------
/samples/ShaderTraining/renderDifference.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | import CooperativeVectorAutoDiff;
15 | import CooperativeVectorFunctions;
16 | import Utils;
17 | import Activation;
18 | import MLP;
19 | import Disney;
20 | import DisneyMLP;
21 | 
22 | DECLARE_CBUFFER(InferenceConstantBufferEntry, gConst, 0, 0);
23 | ByteAddressBuffer gMLPParams : REGISTER_SRV(0, 0);
24 | 
25 | struct PS_INPUT
26 | {
27 |     float4 pos : SV_Position;
28 |     float3 norm : NORMAL;
29 |     float3 view : VIEW;
30 | }
31 | 
32 | [shader("vertex")]
33 | void main_vs(
34 |     float3 i_pos : POSITION,
35 |     float3 i_norm : NORMAL,
36 |     out PS_INPUT output)
37 | {
38 |     output.pos = mul(float4(i_pos, 1), gConst.viewProject);
39 |     output.norm = i_norm;
40 |     output.view = gConst.cameraPos.xyz - i_pos;
41 | }
42 | 
43 | float3 calcColor(float4 params)
44 | {
45 |     float3 Cdlin = pow(gConst.baseColor.rgb, 2.2);
46 |     float3 Cspec0 = lerp(gConst.specular * float3(.08f), Cdlin, gConst.metallic);
47 |     float3 brdfn = params.x * Cdlin * (1 - gConst.metallic) + params.y * lerp(Cspec0, float3(1), params.z) + params.w;
48 |     return clamp(brdfn * gConst.lightIntensity.rgb, 0, 1);
49 | }
50 | 
51 | [shader("fragment")]
52 | void main_ps(
53 |     PS_INPUT input,
54 |     out float4 o_color : SV_Target0)
55 | {
56 |     // Prepare input parameters
57 |     float3 view = normalize(input.view);
58 |     float3 norm = normalize(input.norm);
59 |     float3 h = normalize(-gConst.lightDir.xyz + view);
60 | 
61 |     float NdotL = max(0.f, dot(norm, -gConst.lightDir.xyz));
62 |     float NdotV = max(0.f, dot(norm, view));
63 |     float NdotH = max(0.f, dot(norm, h));
64 |     float LdotH = max(0.f, dot(h, -gConst.lightDir.xyz));
65 | 
66 |     //----------- Calculate core shader part DIRECTLY
67 |     float4 actualDisney = Disney(NdotL, NdotV, NdotH, LdotH, gConst.roughness);
68 | 
69 |     // Calculate approximated core shader
70 |     float4 outParams = DisneyMLP(
71 |         NdotL, NdotV, NdotH, LdotH, gConst.roughness,
72 |         gMLPParams,
73 |         rtxns::UnpackArray(gConst.weightOffsets),
74 |         rtxns::UnpackArray(gConst.biasOffsets)
75 |     );
76 | 
77 |     o_color = float4((calcColor(actualDisney) - calcColor(outParams)) * NdotL * 4 + 0.5, 1.f);
78 | }
79 | 
--------------------------------------------------------------------------------
/samples/ShaderTraining/renderDisney.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 | 
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 | 
14 | DECLARE_CBUFFER(DirectConstantBufferEntry, gConst, 0, 0);
15 | 
16 | struct PS_INPUT
17 | {
18 |     float4 pos : SV_Position;
19 |     float3 norm : NORMAL;
20 |     float3 view : VIEW;
21 | }
22 | 
23 | [shader("vertex")]
24 | void main_vs(
25 |     float3 i_pos : POSITION,
26 |     float3 i_norm : NORMAL,
27 |     out PS_INPUT output)
28 | {
29 |     output.pos = mul(float4(i_pos, 1), gConst.viewProject);
30 |     output.norm = i_norm;
31 |     output.view = gConst.cameraPos.xyz - i_pos;
32 | }
33 | 
34 | import Disney;
35 | 
36 | [shader("fragment")]
37 | void main_ps(
38 |     PS_INPUT input,
39 |     out float4 o_color : SV_Target0)
40 | {
41 |     //----------- Prepare input parameters
42 |     float3 view = normalize(input.view);
43 |     float3 norm = normalize(input.norm);
44 |     float3 h = normalize(-gConst.lightDir.xyz + view);
45 | 
46 |     float NdotL = max(0.f, dot(norm, -gConst.lightDir.xyz));
47 |     float NdotV = max(0.f, dot(norm, view));
48 |     float NdotH = max(0.f, dot(norm, h));
49 |     float LdotH = max(0.f, dot(h, -gConst.lightDir.xyz));
50 | 
51 |     //----------- Calculate core shader part DIRECTLY
52 |     float4 outParams = Disney(NdotL, NdotV, NdotH, LdotH, gConst.roughness);
53 | 
54 |     //----------- Calculate final color
55 |     float3 Cdlin = float3(pow(gConst.baseColor[0], 2.2), pow(gConst.baseColor[1], 2.2), pow(gConst.baseColor[2], 2.2));
56 |     float3 Cspec0 = lerp(gConst.specular * .08 * float3(1), Cdlin, gConst.metallic);
57 |     float3 brdfn = outParams.x * Cdlin * (1 - gConst.metallic) + outParams.y * lerp(Cspec0, float3(1), outParams.z) + outParams.w;
58 |     float3 colorh = brdfn * float3(NdotL) * gConst.lightIntensity.rgb;
59 | 
60 |     o_color = float4(colorh, 1.f);
61 | }
62 | 
--------------------------------------------------------------------------------
/samples/ShaderTraining/renderInference.slang:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */ 10 | 11 | #include "NetworkConfig.h" 12 | #include 13 | 14 | import CooperativeVectorAutoDiff; 15 | import CooperativeVectorFunctions; 16 | import Utils; 17 | import Activation; 18 | import MLP; 19 | import DisneyMLP; 20 | 21 | DECLARE_CBUFFER(InferenceConstantBufferEntry, gConst, 0, 0); 22 | ByteAddressBuffer gMLPParams : REGISTER_SRV(0, 0); 23 | 24 | struct PS_INPUT 25 | { 26 | float4 pos : SV_Position; 27 | float3 norm : NORMAL; 28 | float3 view : VIEW; 29 | } 30 | 31 | [shader("vertex")] 32 | void main_vs( 33 | float3 i_pos : POSITION, 34 | float3 i_norm : NORMAL, 35 | out PS_INPUT output) 36 | { 37 | output.pos = mul(float4(i_pos, 1), gConst.viewProject); 38 | output.norm = i_norm; 39 | output.view = gConst.cameraPos.xyz - i_pos; 40 | } 41 | 42 | [shader("fragment")] 43 | void main_ps( 44 | PS_INPUT input, 45 | out float4 o_color : SV_Target0) 46 | { 47 | // Prepare input parameters 48 | float3 view = normalize(input.view); 49 | float3 norm = normalize(input.norm); 50 | float3 h = normalize(-gConst.lightDir.xyz + view); 51 | 52 | float NdotL = max(0.f, dot(norm, -gConst.lightDir.xyz)); 53 | float NdotV = max(0.f, dot(norm, view)); 54 | float NdotH = max(0.f, dot(norm, h)); 55 | float LdotH = max(0.f, dot(h, -gConst.lightDir.xyz)); 56 | 57 | // Calculate approximated core shader 58 | float4 outParams = DisneyMLP( 59 | NdotL, NdotV, NdotH, LdotH, gConst.roughness, 60 | gMLPParams, 61 | rtxns::UnpackArray(gConst.weightOffsets), 62 | rtxns::UnpackArray(gConst.biasOffsets) 63 | ); 64 | 65 | // Calculate final color 66 | float3 Cdlin = pow(gConst.baseColor.rgb, 2.2); 67 | float3 Cspec0 = lerp(gConst.specular * float3(.08f), Cdlin, gConst.metallic); 68 | float3 brdfn = outParams.x * Cdlin * (1 - gConst.metallic) + outParams.y * lerp(Cspec0, float3(1), outParams.z) + outParams.w; 69 | float3 colorh = brdfn * NdotL * gConst.lightIntensity.rgb; 70 | 71 | o_color = float4(colorh, 1.f); 72 | } 73 | -------------------------------------------------------------------------------- /samples/ShaderTraining/shaders.cfg: -------------------------------------------------------------------------------- 1 | renderDisney.slang -E main_vs -T vs 2 | renderDisney.slang -E main_ps -T ps 3 | renderInference.slang -E main_vs -T vs 4 | renderInference.slang -E main_ps -T ps 5 | renderDifference.slang -E main_vs -T vs 6 | renderDifference.slang -E main_ps -T ps 7 | computeTraining.slang -E main_cs -T cs 8 | computeOptimizer.slang -E adam_cs -T cs 9 | -------------------------------------------------------------------------------- /samples/SimpleInferencing/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | 10 | include(../../external/donut/compileshaders.cmake) 11 | 12 | set(shader_includes 13 | ${SAMPLES_SHADER_INCLUDE_DIR} 14 | ${CMAKE_CURRENT_LIST_DIR} 15 | ) 16 | 17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" ) 18 | 19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " ) 20 | 21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " ) 22 | 23 | set(project SimpleInferencing) 24 | set(folder "Samples/SimpleInferencing") 25 | 26 | file(GLOB_RECURSE ${project}_shaders "*.hlsl" "*.hlsli" "*.slang") 27 | file(GLOB_RECURSE ${project}_sources "*.cpp" "*.h" "*.md") 28 | 29 | donut_compile_shaders_all_platforms( 30 | TARGET ${project}_shaders 31 | CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg 32 | INCLUDES ${shader_includes} 33 | FOLDER ${folder} 34 | OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project} 35 | SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS} 36 | SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV} 37 | SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL} 38 | SOURCES ${${project}_shaders} 39 | SLANG 40 | ) 41 | 42 | add_executable(${project} WIN32 ${${project}_sources}) 43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils) 44 | add_dependencies(${project} ${project}_shaders) 45 | set_target_properties(${project} PROPERTIES FOLDER ${folder}) 46 | 47 | if (MSVC) 48 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP") 49 | endif() -------------------------------------------------------------------------------- /samples/SimpleInferencing/NetworkConfig.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | #ifndef __NETWORK_CONFIG_H__ 12 | #define __NETWORK_CONFIG_H__ 13 | 14 | #define VECTOR_FORMAT half 15 | #define TYPE_INTERPRETATION CoopVecComponentType::Float16 16 | 17 | // When loading a model from file, these parameters must match 18 | #define INPUT_FEATURES 5 19 | #define INPUT_NEURONS (INPUT_FEATURES * 6) // Frequency encoding increases the input by 6 for each input 20 | #define OUTPUT_NEURONS 4 21 | #define HIDDEN_NEURONS 32 22 | 23 | struct NeuralConstants 24 | { 25 | // Scene setup 26 | float4x4 viewProject; 27 | float4x4 view; 28 | float4 cameraPos; 29 | 30 | // Light setup 31 | float4 lightDir; 32 | float4 lightIntensity; 33 | 34 | // Material props 35 | float4 baseColor; 36 | float specular; 37 | float roughness; 38 | float metallic; 39 | float padding; 40 | 41 | // Neural weight & bias offsets 42 | uint4 weightOffsets; // Offsets to weight matrices in bytes. 43 | uint4 biasOffsets; // Offsets to bias vectors in bytes. 44 | }; 45 | 46 | #endif //__NETWORK_CONFIG_H__ -------------------------------------------------------------------------------- /samples/SimpleInferencing/SimpleInferencing.slang: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | import CooperativeVectorFunctions; 12 | import Utils; 13 | import LinearOps; 14 | 15 | #include "NetworkConfig.h" 16 | #include 17 | 18 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0); 19 | ByteAddressBuffer gMLPParams :REGISTER_SRV(0, 0); 20 | 21 | struct VertexIn 22 | { 23 | float3 pos : POSITION; 24 | float3 norm : NORMAL; 25 | }; 26 | 27 | struct VertexOut 28 | { 29 | float4 pos : SV_Position; 30 | float3 norm : NORMAL; 31 | float3 view : VIEW; 32 | } 33 | 34 | [shader("vertex")] 35 | void main_vs( 36 | VertexIn vIn, 37 | out VertexOut vOut) 38 | { 39 | vOut.pos = mul(float4(vIn.pos, 1), gConst.viewProject); 40 | vOut.norm = vIn.norm; 41 | vOut.view = gConst.cameraPos.xyz - vIn.pos; 42 | } 43 | 44 | float4 DisneyMLP(float NdotL, float NdotV, float NdotH, float LdotH, float roughness) 45 | { 46 | uint4 weightOffsets = gConst.weightOffsets; 47 | uint4 biasOffsets = gConst.biasOffsets; 48 | 49 | CoopVec inputParams; 50 | CoopVec hiddenParams; 51 | CoopVec outputParams; 52 | 53 | // Encode input parameters, 5 inputs to 30 parameters 54 | float params[INPUT_FEATURES] = { NdotL, NdotV, NdotH, LdotH, roughness }; 55 | inputParams = rtxns::EncodeFrequency(params); 56 | 57 | // Forward propagation through the neural network 58 | // Input to hidden layer, then apply activation function 59 | hiddenParams = rtxns::LinearOp( 60 | inputParams, gMLPParams, weightOffsets[0], biasOffsets[0], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION); 61 | hiddenParams = rtxns::relu(hiddenParams); 62 | 63 | // Hidden layer to hidden layer, then apply activation function 64 | hiddenParams = rtxns::LinearOp( 65 | hiddenParams, gMLPParams, weightOffsets[1], biasOffsets[1], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION); 66 | hiddenParams = rtxns::relu(hiddenParams); 67 | 68 | // Hidden layer to hidden layer, then apply activation function 69 | hiddenParams = rtxns::LinearOp( 70 | hiddenParams, gMLPParams, weightOffsets[2], biasOffsets[2], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION); 71 | hiddenParams = rtxns::relu(hiddenParams); 72 | 73 | // Hidden layer to output layer, then apply final activation function 74 | outputParams = rtxns::LinearOp( 75 | hiddenParams, gMLPParams, weightOffsets[3], biasOffsets[3], CoopVecMatrixLayout::InferencingOptimal, TYPE_INTERPRETATION); 76 | outputParams = exp(outputParams); 77 | 78 | // Take the output from the neural network as the output color 79 | return float4(outputParams[0], outputParams[1], outputParams[2], outputParams[3]); 80 | } 81 | 82 | [shader("fragment")] 83 | void main_ps( 84 | VertexOut vOut, 85 | out float4 o_color : SV_Target0) 86 | { 87 | float4 lightIntensity = gConst.lightIntensity; 88 | float4 lightDir = gConst.lightDir; 89 | float4 baseColor = gConst.baseColor; 90 | float specular = gConst.specular; 91 | float roughness = gConst.roughness; 92 | float metallic = gConst.metallic; 93 | 94 | // Prepare input parameters 95 | float3 view = normalize(vOut.view); 96 | float3 norm = normalize(vOut.norm); 97 | float3 h = normalize(-lightDir.xyz + view); 98 | 99 | float NdotL = max(0.f, dot(norm, -lightDir.xyz)); 100 | float NdotV 
= max(0.f, dot(norm, view)); 101 | float NdotH = max(0.f, dot(norm, h)); 102 | float LdotH = max(0.f, dot(h, -lightDir.xyz)); 103 | 104 | // Calculate approximated core shader part using MLP 105 | float4 outParams = DisneyMLP(NdotL, NdotV, NdotH, LdotH, roughness); 106 | 107 | // Calculate final color 108 | float3 Cdlin = float3(pow(baseColor.r, 2.2), pow(baseColor.g, 2.2), pow(baseColor.b, 2.2)); 109 | float3 Cspec0 = lerp(specular * .08f * float3(1,1,1), Cdlin, metallic); 110 | float3 brdfn = outParams.x * Cdlin * (1 - metallic) + outParams.y * lerp(Cspec0, float3(1), outParams.z) + outParams.w; 111 | float3 colorh = brdfn * float3(NdotL) * lightIntensity.rgb; 112 | 113 | o_color = float4(colorh, 1.f); 114 | } 115 | -------------------------------------------------------------------------------- /samples/SimpleInferencing/shaders.cfg: -------------------------------------------------------------------------------- 1 | SimpleInferencing.slang -T vs -E main_vs 2 | SimpleInferencing.slang -T ps -E main_ps 3 | -------------------------------------------------------------------------------- /samples/SimpleTraining/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | 10 | include(../../external/donut/compileshaders.cmake) 11 | 12 | set(shader_includes 13 | ${SAMPLES_SHADER_INCLUDE_DIR} 14 | ${CMAKE_CURRENT_LIST_DIR} 15 | ) 16 | 17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" ) 18 | 19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " ) 20 | 21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " ) 22 | 23 | set(project SimpleTraining) 24 | set(folder "Samples/SimpleTraining") 25 | 26 | file(GLOB_RECURSE ${project}_shaders "*.hlsl" "*.hlsli" "*.slang") 27 | file(GLOB_RECURSE ${project}_sources "*.cpp" "*.h" "*.md") 28 | 29 | donut_compile_shaders_all_platforms( 30 | TARGET ${project}_shaders 31 | CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg 32 | INCLUDES ${shader_includes} 33 | FOLDER ${folder} 34 | OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project} 35 | SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS} 36 | SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV} 37 | SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL} 38 | SOURCES ${${project}_shaders} 39 | SLANG 40 | ) 41 | 42 | add_executable(${project} WIN32 ${${project}_sources}) 43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils) 44 | add_dependencies(${project} ${project}_shaders) 45 | set_target_properties(${project} PROPERTIES FOLDER ${folder}) 46 | 47 | if (MSVC) 48 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP") 49 | endif() -------------------------------------------------------------------------------- /samples/SimpleTraining/NetworkConfig.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | #define INPUT_FEATURES 2 12 | #define INPUT_NEURONS (INPUT_FEATURES * 6) // Frequency encoding increases the input by 6 for each input 13 | #define OUTPUT_NEURONS 3 14 | 15 | #define HIDDEN_NEURONS 64 16 | #define NUM_HIDDEN_LAYERS 4 17 | 18 | #define LEARNING_RATE 0.001f 19 | 20 | #define NUM_TRANSITIONS (NUM_HIDDEN_LAYERS + 1) 21 | #define NUM_TRANSITIONS_ALIGN4 ((NUM_TRANSITIONS + 3) / 4) 22 | #define LOSS_SCALE 128.0 23 | #define RELU_LEAK 0.01h 24 | 25 | #define VECTOR_FORMAT half 26 | #define TYPE_INTERPRETATION CoopVecComponentType::Float16 27 | #define NETWORK_PRECISION rtxns::Precision::F16 28 | 29 | #define MATRIX_LAYOUT CoopVecMatrixLayout::TrainingOptimal 30 | 31 | #define BATCH_COUNT 128 32 | #define BATCH_SIZE_X 32 33 | #define BATCH_SIZE_Y 32 34 | 35 | enum class NetworkTransform 36 | { 37 | Identity, 38 | Zoom, 39 | Flip 40 | }; 41 | 42 | struct NeuralConstants 43 | { 44 | uint4 weightOffsets[NUM_TRANSITIONS_ALIGN4]; 45 | uint4 biasOffsets[NUM_TRANSITIONS_ALIGN4]; 46 | 47 | uint32_t imageWidth; 48 | uint32_t imageHeight; 49 | uint32_t maxParamSize; 50 | float learningRate; 51 | 52 | uint32_t currentStep; 53 | uint32_t batchSizeX; 54 | uint32_t batchSizeY; 55 | NetworkTransform networkTransform; 56 | }; 57 | -------------------------------------------------------------------------------- /samples/SimpleTraining/SimpleTraining_Inference.slang: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 |
14 | import CooperativeVectorFunctions;
15 | import Utils;
16 | import LinearOps;
17 |
18 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
19 | ByteAddressBuffer gMLPParams :REGISTER_SRV(0, 0);
20 | Texture2D inputTexture :REGISTER_SRV(1, 0);
21 | RWTexture2D<float4> outputTexture :REGISTER_UAV(0, 0);
22 |
23 | [shader("compute")]
24 | [numthreads(8, 8, 1)]
25 | void inference_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
26 | {
27 | // Use the pixel position as the UV coordinate and frequency-encode it for the network
28 | float2 inputUV = float2(dispatchThreadID.x / float(gConst.imageWidth), dispatchThreadID.y / float(gConst.imageHeight));
29 | CoopVec<VECTOR_FORMAT, INPUT_NEURONS> inputParams = rtxns::EncodeFrequency({inputUV.x, inputUV.y});
30 |
31 | // Load offsets
32 | uint weightOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray(gConst.weightOffsets);
33 | uint biasOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray(gConst.biasOffsets);
34 |
35 | CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenParams;
36 | CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputParams;
37 |
38 | // Forward propagation through the neural network
39 | // Input to hidden layer, then apply activation function
40 | hiddenParams = rtxns::LinearOp(
41 | inputParams, gMLPParams, weightOffsets[0], biasOffsets[0],
42 | MATRIX_LAYOUT, TYPE_INTERPRETATION);
43 | hiddenParams = rtxns::leakyReLU(hiddenParams, RELU_LEAK);
44 |
45 | // Hidden layers to hidden layers, then apply activation function
46 | [ForceUnroll]
47 | for (uint layer = 1; layer < NUM_HIDDEN_LAYERS; layer++)
48 | {
49 | hiddenParams = rtxns::LinearOp(
50 | hiddenParams, gMLPParams, weightOffsets[layer],
51 | biasOffsets[layer],
52 | MATRIX_LAYOUT, TYPE_INTERPRETATION);
53 | hiddenParams = rtxns::leakyReLU(hiddenParams, RELU_LEAK);
54 | }
55 |
56 | // Hidden layer to output layer, then apply final activation function
57 | outputParams = rtxns::LinearOp(
58 | hiddenParams, gMLPParams, weightOffsets[NUM_HIDDEN_LAYERS], biasOffsets[NUM_HIDDEN_LAYERS],
59 | MATRIX_LAYOUT, TYPE_INTERPRETATION);
60 | outputParams = rtxns::sigmoid(outputParams);
61 |
62 | // Take the output from the neural network as the output color
63 | float4 color = {outputParams[0], outputParams[1], outputParams[2], 1.f};
64 | outputTexture[dispatchThreadID.xy] = color;
65 | }
--------------------------------------------------------------------------------
/samples/SimpleTraining/SimpleTraining_Optimizer.slang:
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 |
14 | import Optimizers;
15 |
16 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
17 | RWBuffer<half> gMLPParams :REGISTER_UAV(0, 0);
18 | RWBuffer<float> gMLPParamsf :REGISTER_UAV(1, 0);
19 | RWBuffer<half> gMLPParamsGradients :REGISTER_UAV(2, 0);
20 | RWBuffer<float> gMoments1 :REGISTER_UAV(3, 0);
21 | RWBuffer<float> gMoments2 :REGISTER_UAV(4, 0);
22 |
23 | [numthreads(32, 1, 1)]
24 | void adam_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
25 | {
26 | uint i = dispatchThreadID.x;
27 | if (i >= gConst.maxParamSize)
28 | return;
29 |
30 | float gradient = (float)gMLPParamsGradients[i];
31 | gMLPParamsGradients[i] = half(0.0);
32 |
33 | // Read the full-precision float params, not the float16 copies
34 | float weightbias = gMLPParamsf[i];
35 |
36 | optimizers::Adam optimizer = optimizers::Adam(gMoments1, gMoments2, gConst.learningRate, LOSS_SCALE);
37 |
38 | float adjustedWeightbias = optimizer.step(weightbias, i, gradient, gConst.currentStep);
39 |
40 | gMLPParamsf[i] = adjustedWeightbias;
41 | gMLPParams[i] = (half)adjustedWeightbias;
42 | }
43 |
44 | [numthreads(32, 1, 1)]
45 | void convert_weights_cs(uint3 dispatchThreadID: SV_DispatchThreadID)
46 | {
47 | uint i = dispatchThreadID.x;
48 | if (i >= gConst.maxParamSize)
49 | return;
50 |
51 | half param = gMLPParams[i];
52 | gMLPParamsf[i] = float(param);
53 | }
--------------------------------------------------------------------------------
/samples/SimpleTraining/SimpleTraining_Training.slang:
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 |
14 | import CooperativeVectorDerivatives;
15 | import CooperativeVectorFunctions;
16 | import Utils;
17 | import LinearOps;
18 |
19 |
20 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
21 | ByteAddressBuffer gMLPParams :REGISTER_SRV(0, 0);
22 | Texture2D inputTexture :REGISTER_SRV(1, 0);
23 | RWByteAddressBuffer gMLPParamsGradients :REGISTER_UAV(0, 0);
24 | RWStructuredBuffer<uint> gRandState :REGISTER_UAV(1, 0);
25 | RWTexture2D<float4> outputTexture :REGISTER_UAV(2, 0);
26 | RWTexture2D<float4> lossTexture :REGISTER_UAV(3, 0);
27 |
28 | struct RNG
29 | {
30 | uint state;
31 |
32 | __init(uint state) { this.state = state; }
33 |
34 | [mutating]
35 | float next()
36 | {
37 | float r = (state >> 8) * 0x1p-24;
38 | state = state * 2739110765U + 2739110765U;
39 | return r;
40 | }
41 | }
42 |
43 | [shader("compute")]
44 | [numthreads(8, 8, 1)]
45 | void training_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
46 | {
47 | uint2 batchSize = uint2(gConst.batchSizeX, gConst.batchSizeY);
48 |
49 | uint dispatchThreadIdxy = dispatchThreadID.y * batchSize.x + dispatchThreadID.x;
50 |
51 | RNG rng = RNG(gRandState[dispatchThreadIdxy]);
52 |
53 | // Get a random uv coordinate for the input and frequency encode it for improved convergence
54 | float2 inputUV = clamp(float2(rng.next(), rng.next()), 0.0, 1.0);
55 | CoopVec<VECTOR_FORMAT, INPUT_NEURONS> inputParams = rtxns::EncodeFrequency({inputUV.x, inputUV.y}); // 2 inputs -> 12 encoded values (6 per input)
56 |
57 | // Load offsets
58 | uint weightOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray(gConst.weightOffsets);
59 | uint biasOffsets[NUM_TRANSITIONS] = rtxns::UnpackArray(gConst.biasOffsets);
60 |
61 | // Create variables to cache the results from each stage
62 | CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenParams[NUM_HIDDEN_LAYERS];
63 | CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenActivated[NUM_HIDDEN_LAYERS];
64 | CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputParams;
65 | CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputActivated;
66 |
67 | // Forward propagation through the neural network
68 | // Input to hidden layer, then apply activation function
69 | hiddenParams[0] = rtxns::LinearOp(
70 | inputParams, gMLPParams, weightOffsets[0], biasOffsets[0], MATRIX_LAYOUT, TYPE_INTERPRETATION);
71 | hiddenActivated[0] = rtxns::leakyReLU(hiddenParams[0], RELU_LEAK);
72 |
73 | // Hidden layers to hidden layers, then apply activation function
74 | [ForceUnroll]
75 | for (uint layer = 1; layer < NUM_HIDDEN_LAYERS; layer++)
76 | {
77 | hiddenParams[layer] = rtxns::LinearOp(
78 | hiddenActivated[layer - 1], gMLPParams, weightOffsets[layer], biasOffsets[layer],
79 | MATRIX_LAYOUT, TYPE_INTERPRETATION);
80 | hiddenActivated[layer] = rtxns::leakyReLU(hiddenParams[layer], RELU_LEAK);
81 | }
82 |
83 | // Hidden layer to output layer, then apply final activation function
84 | outputParams = rtxns::LinearOp(
85 | hiddenActivated[NUM_HIDDEN_LAYERS - 1], gMLPParams, weightOffsets[NUM_HIDDEN_LAYERS],
86 | biasOffsets[NUM_HIDDEN_LAYERS], MATRIX_LAYOUT, TYPE_INTERPRETATION);
87 | outputActivated = rtxns::sigmoid(outputParams);
88 |
89 | // Take the output from the neural network as the output color
90 | float3 predictedRGB = {outputActivated[0], outputActivated[1], outputActivated[2]};
91 |
92 | // Now transform the input UVs according to the NetworkTransform enum.
93 | // This can easily be extended to try many different transforms - a sketch of one more case follows.
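// One more case could look like this (a hypothetical sketch, not part of the
// original sample): a 90-degree rotation that remaps (u, v) to (v, 1 - u)
// before scaling by the image dimensions. With a 512x512 target,
// inputUV = (0.25, 0.75) would then train against the texel at (384, 384).
// Each case only changes which texel supplies the training target below;
// the network, the loss, and the backward pass are unchanged.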
94 | uint2 actualUV;
95 | if (gConst.networkTransform == NetworkTransform.Flip)
96 | {
97 | float2 flipUV = inputUV.yx;
98 | actualUV = uint2(flipUV.xy * float2(gConst.imageHeight, gConst.imageWidth));
99 | }
100 | else if (gConst.networkTransform == NetworkTransform.Zoom)
101 | {
102 | float2 zoomUV = inputUV * 0.5 + 0.25;
103 | actualUV = uint2(zoomUV.xy * float2(gConst.imageWidth, gConst.imageHeight));
104 | }
105 | else
106 | {
107 | actualUV = uint2(inputUV.xy * float2(gConst.imageWidth, gConst.imageHeight));
108 | }
109 |
110 | // Load the texture according to the transformed input UVs. This will
111 | // provide the RGB that the model is trying to train towards.
112 | float3 actualRGB = inputTexture[actualUV].rgb;
113 |
114 | // Output the loss, scaled to greyscale for output
115 | uint2 lossUV = uint2(inputUV.xy * float2(gConst.imageWidth, gConst.imageHeight));
116 | const float lossScaleFactor = 10.0f; // scale it up for better vis
117 | lossTexture[lossUV] = float4((predictedRGB - actualRGB) * lossScaleFactor + 0.5, 1);
118 |
119 | // Compute the L2 loss gradient
120 | // L2Loss = (a-b)^2
121 | // L2Loss Derivative = 2(a-b)
122 | float3 lossGradient = 2.0 * (predictedRGB - actualRGB);
123 |
124 | // Scale by batch size
125 | lossGradient /= (batchSize.x * batchSize.y);
126 |
127 | // Apply the LOSS_SCALE factor to retain precision. Remove it in the optimizer pass before use.
128 | lossGradient *= LOSS_SCALE;
129 |
130 | CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> lossGradientCV = CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS>(VECTOR_FORMAT(lossGradient[0]), VECTOR_FORMAT(lossGradient[1]), VECTOR_FORMAT(lossGradient[2]));
131 |
132 | // Back-propagation pass: generate the gradients and accumulate the results into memory to be applied in the optimization pass.
133 | CoopVec<VECTOR_FORMAT, OUTPUT_NEURONS> outputGradient;
134 | CoopVec<VECTOR_FORMAT, HIDDEN_NEURONS> hiddenGradient;
135 |
136 | // Output layer (loss gradient) to final hidden layer
137 | outputGradient = rtxns::sigmoid_Derivative(outputParams, lossGradientCV);
138 | hiddenGradient = rtxns::LinearOp_Backward(
139 | hiddenActivated[NUM_HIDDEN_LAYERS - 1], outputGradient, gMLPParams, gMLPParamsGradients,
140 | weightOffsets[NUM_HIDDEN_LAYERS], biasOffsets[NUM_HIDDEN_LAYERS], MATRIX_LAYOUT, TYPE_INTERPRETATION);
141 |
142 | // Hidden layer to hidden layer
143 | for(int layer = NUM_HIDDEN_LAYERS - 1; layer >= 1; layer--)
144 | {
145 | hiddenGradient = rtxns::leakyReLU_Derivative(hiddenParams[layer], RELU_LEAK, hiddenGradient);
146 | hiddenGradient = rtxns::LinearOp_Backward(
147 | hiddenActivated[layer - 1], hiddenGradient, gMLPParams, gMLPParamsGradients,
148 | weightOffsets[layer], biasOffsets[layer], MATRIX_LAYOUT, TYPE_INTERPRETATION);
149 | }
150 |
151 | // First hidden layer to input layer
152 | hiddenGradient = rtxns::leakyReLU_Derivative(hiddenParams[0], RELU_LEAK, hiddenGradient);
153 | rtxns::LinearOp_Backward(
154 | inputParams, hiddenGradient, gMLPParams, gMLPParamsGradients, weightOffsets[0],
155 | biasOffsets[0], MATRIX_LAYOUT, TYPE_INTERPRETATION);
156 |
157 | // Store the random state to continue iterating next time.
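// Each thread advances a small per-thread LCG (state = state * 2739110765U +
// 2739110765U in RNG.next() above) and persists the final state here, so the
// next training dispatch continues the random sequence instead of replaying
// the same batch of UV samples. If the state were re-seeded to the same value
// every dispatch, every training step would see an identical set of samples
// and the network would only ever fit that fixed batch.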
158 | gRandState[dispatchThreadIdxy] = rng.state; 159 | } -------------------------------------------------------------------------------- /samples/SimpleTraining/shaders.cfg: -------------------------------------------------------------------------------- 1 | SimpleTraining_Inference.slang -E inference_cs -T cs 2 | SimpleTraining_Training.slang -E training_cs -T cs 3 | SimpleTraining_Optimizer.slang -E adam_cs -T cs 4 | SimpleTraining_Optimizer.slang -E convert_weights_cs -T cs 5 | -------------------------------------------------------------------------------- /samples/SlangpyTraining/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | 10 | include(../../external/donut/compileshaders.cmake) 11 | 12 | set(shader_includes 13 | ${SAMPLES_SHADER_INCLUDE_DIR} 14 | ${CMAKE_CURRENT_LIST_DIR} 15 | ) 16 | 17 | set(SHADER_COMPILE_OPTIONS "--matrixRowMajor --hlsl2021" ) 18 | 19 | set(SHADER_COMPILE_OPTIONS_SPIRV " -X \"-Wno-41017 -capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV\" " ) 20 | 21 | set(SHADER_COMPILE_OPTIONS_DXIL " --shaderModel 6_9 --hlsl2021 -X \"-Wno-41012 -Wno-41016 -Wno-41017 -Xdxc -Vd\" " ) 22 | 23 | set(project SlangpyTraining) 24 | set(folder "Samples/SlangpyTraining") 25 | 26 | file(GLOB_RECURSE ${project}_shaders "*.hlsl" "*.hlsli" "*.slang") 27 | file(GLOB_RECURSE ${project}_sources "*.cpp" "*.h" "*.md") 28 | 29 | donut_compile_shaders_all_platforms( 30 | TARGET ${project}_shaders 31 | CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/shaders.cfg 32 | INCLUDES ${shader_includes} 33 | FOLDER ${folder} 34 | OUTPUT_BASE ${RTXNS_BINARY_DIR}/shaders/${project} 35 | SHADERMAKE_OPTIONS ${SHADER_COMPILE_OPTIONS} 36 | SHADERMAKE_OPTIONS_SPIRV ${SHADER_COMPILE_OPTIONS_SPIRV} 37 | SHADERMAKE_OPTIONS_DXIL ${SHADER_COMPILE_OPTIONS_DXIL} 38 | SOURCES ${${project}_shaders} 39 | SLANG 40 | ) 41 | 42 | add_executable(${project} WIN32 ${${project}_sources}) 43 | target_link_libraries(${project} donut_app donut_engine NeuralShading Utils) 44 | add_dependencies(${project} ${project}_shaders) 45 | set_target_properties(${project} PROPERTIES FOLDER ${folder}) 46 | 47 | if (MSVC) 48 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3 /MP") 49 | endif() -------------------------------------------------------------------------------- /samples/SlangpyTraining/Helpers.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # 3 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # NVIDIA CORPORATION and its licensors retain all intellectual property 6 | # and proprietary rights in and to this software, related documentation 7 | # and any modifications thereto. Any use, reproduction, disclosure or 8 | # distribution of this software and related documentation without an express 9 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
10 | # 11 | from slangpy.backend import Device, DeviceType, TextureLoader, Bitmap, SlangCompilerOptions 12 | import slangpy as spy 13 | from pathlib import Path 14 | from typing import Any, Union 15 | import subprocess 16 | import os 17 | 18 | from NeuralModules import CoopVecModule 19 | 20 | class SDKSample: 21 | def __init__(self, args: list[str]): 22 | super().__init__() 23 | 24 | # Set up directories to find includes and executables 25 | self.spy_dir = Path(spy.__file__).parent / "slang" 26 | self.sdk_root = Path(__file__).parent.parent.parent 27 | self.sdk_data_dir = self.sdk_root / "assets/data" 28 | self.rtxns_dir = self.sdk_root / "src/NeuralShading_Shaders" 29 | self.spy_sample_dir = self.sdk_root / "samples/SlangpyTraining" 30 | self.donut_dir = self.sdk_root / "external/donut/include" 31 | self.slang_compiler = self.sdk_root / "bin/slangc.bat" 32 | 33 | search_root = self.sdk_root / "bin" 34 | bin_ext = ".exe" if os.name == "nt" else "" 35 | inference_candidates = [f for f in search_root.glob(f"**/SlangpyTraining{bin_ext}") if f.is_file()] 36 | shadermake_candidates = [f for f in search_root.glob(f"**/ShaderMake{bin_ext}") if f.is_file()] 37 | 38 | if len(inference_candidates) == 0: 39 | print(f"Warning: Could not find SlangpyTraining executable within {search_root}. " 40 | "C++ sample will not be launched after training.") 41 | self.inference_sample_path = None 42 | else: 43 | self.inference_sample_path = inference_candidates[0] 44 | if len(inference_candidates) > 1: 45 | print(f"Warning: Found multiple possible SlangpyTraining executables. Picking {self.inference_sample_path}") 46 | else: 47 | print(f"Found SlangpyTraining executable at {self.inference_sample_path}") 48 | 49 | if len(shadermake_candidates) == 0: 50 | print(f"Warning: Could not find ShaderMake executable within {search_root}. " 51 | "C++ sample will not be launched after training.") 52 | self.shadermake_path = None 53 | else: 54 | self.shadermake_path = shadermake_candidates[0] 55 | if len(shadermake_candidates) > 1: 56 | print(f"Warning: Found multiple possible ShaderMake executables. Picking {self.shadermake_path}") 57 | else: 58 | print(f"Found ShaderMake executable at {self.shadermake_path}") 59 | 60 | self.include_dirs = [ 61 | self.rtxns_dir, 62 | self.spy_dir, 63 | self.spy_sample_dir 64 | ] 65 | 66 | for field in ("spy_dir", "sdk_root", "sdk_data_dir", "rtxns_dir", "spy_sample_dir", "donut_dir", "slang_compiler"): 67 | path: Path = getattr(self, field) 68 | if not path.exists(): 69 | print(f"Warning: Can't find path {field} at {path}. 
This may cause errors.") 70 | 71 | self.device = self._create_device() 72 | 73 | # Create an sgl device and setup default include directories 74 | def _create_device(self): 75 | device = Device( 76 | type=DeviceType.vulkan, 77 | compiler_options=SlangCompilerOptions({ 78 | "include_paths": self.include_dirs, 79 | "disable_warnings": [ 80 | "41018", # Overzealous uninitialized-out-parameter warning 81 | "41012" # Coop vec capability warning 82 | ] 83 | }), 84 | ) 85 | 86 | print("Selected adapter", device.info.adapter_name) 87 | 88 | return device 89 | 90 | def load_texture(self, path: Union[str,Path]): 91 | bmp = Bitmap(self.sdk_data_dir / path) 92 | loader = TextureLoader(self.device) 93 | target_tex = loader.load_texture(bmp, {"load_as_normalized": True}) 94 | return target_tex 95 | 96 | # Take a trained model and distill it to defines and compile it 97 | def compile_inference_shader(self, model: CoopVecModule): 98 | if self.inference_sample_path is None or self.shadermake_path is None: 99 | print("Missing executables, skipping compilation.") 100 | return 101 | 102 | if len(model.parameters()) > 1: 103 | raise ValueError("Shader generation only supports a single parameter buffer") 104 | 105 | defines = [ 106 | ("MODEL_TYPE", f'"{model.inference_type_name}"'), 107 | ("MODEL_INITIALIZER", f'"{model.get_initializer()}"'), 108 | ("VECTOR_FORMAT", model.elem_name), 109 | ] 110 | 111 | self.compile_shader("SlangpyInference.slang", defines) 112 | 113 | def compile_shader(self, shader_path: str, defines: list[Union[str,tuple[str, Any]]]): 114 | config_path = self.spy_sample_dir / "trained_shaders.cfg" 115 | with open(config_path, "w") as file: 116 | file.write(f"{shader_path} -E main_cs -T cs") 117 | 118 | output_path = self.inference_sample_path.parent / "shaders/SlangpyTraining/spirv" 119 | 120 | args = [ 121 | self.shadermake_path, 122 | "--config", config_path, 123 | "-o", output_path, 124 | "--compiler", self.slang_compiler, 125 | "--platform", "SPIRV", 126 | "--flatten", 127 | "--binaryBlob", 128 | "--outputExt", ".bin", 129 | "--slang", 130 | "--tRegShift", "0", 131 | "--sRegShift", "128", 132 | "--bRegShift", "256", 133 | "--uRegShift", "384", 134 | "--vulkanVersion", "1.2", 135 | "--matrixRowMajor", 136 | "--force", 137 | "-X", "-capability spvCooperativeVectorNV -capability spvCooperativeVectorTrainingNV", 138 | ] 139 | for d in defines + ["SPIRV", "TARGET_VULKAN"]: 140 | if isinstance(d, str): 141 | args.extend(("-D", d)) 142 | else: 143 | args.extend(("-D", f"{d[0]}={d[1]}")) 144 | 145 | for include_dir in self.include_dirs + [self.donut_dir]: 146 | args.extend(("-I", include_dir)) 147 | 148 | result = subprocess.run(args, text=True, capture_output=True) 149 | if result.stderr: 150 | raise RuntimeError(f"ShaderMake exited with errors: {result.stderr}") 151 | stdout = str(result.stdout) 152 | if stdout.find(": error") != -1: 153 | raise RuntimeError(f"slang compiler exited with errors: {stdout}") 154 | 155 | def run_sdk_inference(self, model_weights: Path): 156 | if self.inference_sample_path is None or self.shadermake_path is None: 157 | print("Missing executables, skipping C++ sample.") 158 | return 159 | 160 | subprocess.run([self.inference_sample_path, model_weights]) 161 | -------------------------------------------------------------------------------- /samples/SlangpyTraining/NetworkConfig.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #define MAX_LAYER_COUNT 8
12 | #define MAX_LAYER_COUNT_ALIGN4 ((MAX_LAYER_COUNT + 3) / 4)
13 |
14 | // These defines will be overridden by the training script (SlangpyTraining.py)
15 | // with the chosen network architecture. However, if we compile this file
16 | // from scratch, we provide a default architecture here so the sample
17 | // runs. We provide the trained weights for this network under
18 | // assets/data/slangpy-weights.json
19 | #ifndef MODEL_TYPE
20 | #define MODEL_TYPE \
21 | rtxns::ModuleChain, \
22 | rtxns::InferenceMLPModule, rtxns::mlp::SigmoidAct>>
23 |
24 | #define MODEL_INITIALIZER \
25 | { \
26 | {}, \
27 | { \
28 | weights, { wo[0], wo[1], wo[2], wo[3], wo[4] }, { bo[0], bo[1], bo[2], bo[3], bo[4] }, { 0.01h }, \
29 | { \
30 | } \
31 | } \
32 | }
33 | #define VECTOR_FORMAT half
34 | #endif
35 |
36 | struct NeuralConstants
37 | {
38 | uint4 weightOffsets[MAX_LAYER_COUNT_ALIGN4];
39 | uint4 biasOffsets[MAX_LAYER_COUNT_ALIGN4];
40 |
41 | uint32_t imageWidth;
42 | uint32_t imageHeight;
43 | };
44 |
--------------------------------------------------------------------------------
/samples/SlangpyTraining/NeuralModules.slang:
1 | /*
2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 | __exported import CooperativeVectorDerivatives;
11 | __exported import CooperativeVectorFunctions;
12 | __exported import CooperativeVectorAutoDiff;
13 | __exported import Optimizers;
14 | __exported import Utils;
15 | __exported import LinearOps;
16 | __exported import MLP;
17 | __exported import Activation;
18 |
19 | namespace rtxns
20 | {
21 | ////////////////////////
22 | //
23 | // Root interface for neural modules, plus several implementations of it.
24 | // Takes a CoopVec of type T with NumInputs elements and returns NumOutputs elements.
25 | // Several RTXNS classes and functions are wrapped here to conform to the IModule interface.
26 | // This lets you build network architectures with generic types.
27 | //
28 | ////////////////////////
29 | interface IModule<T : __BuiltinFloatingPointType, let NumInputs : int, let NumOutputs : int>
30 | {
31 | [BackwardDifferentiable]
32 | CoopVec<T, NumOutputs> forward(CoopVec<T, NumInputs> inputParams);
33 | }
34 |
35 | // Chain two modules together, i.e.
pass the output of the first to the second 36 | // Can be nested arbitrarily 37 | struct ModuleChain< 38 | T : __BuiltinFloatingPointType, 39 | let NumInputs : int, 40 | let NumHidden : int, 41 | let NumOutputs : int, 42 | First : IModule, 43 | Second : IModule 44 | > : IModule 45 | { 46 | First first; 47 | Second second; 48 | 49 | [BackwardDifferentiable] 50 | CoopVec forward(CoopVec inputParams) 51 | { 52 | CoopVec middle = first.forward(inputParams); 53 | return second.forward(middle); 54 | } 55 | } 56 | 57 | struct TrainableMLPModule< 58 | T : __BuiltinFloatingPointType, 59 | let NumHiddenLayers : int, 60 | let InputNeurons : int, 61 | let HiddenNeurons : int, 62 | let OutputNeurons : int, 63 | let ComponentType : CoopVecComponentType, 64 | HiddenAct : mlp::IActivation, 65 | OutputAct : mlp::IActivation 66 | > : IModule 67 | { 68 | ByteAddressBuffer parameters; 69 | RWByteAddressBuffer derivatives; 70 | uint matrixOffsets[NumHiddenLayers + 1]; 71 | uint biasOffsets[NumHiddenLayers + 1]; 72 | 73 | HiddenAct hiddenAct; 74 | OutputAct outputAct; 75 | 76 | [BackwardDerivative(backward)] 77 | CoopVec forward(CoopVec inputParams) 78 | { 79 | var mlp = mlp::TrainingMLP< 80 | T, 81 | NumHiddenLayers, 82 | InputNeurons, 83 | HiddenNeurons, 84 | OutputNeurons, 85 | CoopVecMatrixLayout::TrainingOptimal, 86 | ComponentType 87 | >(parameters, derivatives, matrixOffsets, biasOffsets); 88 | return mlp.forward(inputParams, hiddenAct, outputAct); 89 | } 90 | 91 | void backward(inout DifferentialPair> inputParams, const CoopVec dOutputActivated) 92 | { 93 | var mlp = mlp::TrainingMLP< 94 | T, 95 | NumHiddenLayers, 96 | InputNeurons, 97 | HiddenNeurons, 98 | OutputNeurons, 99 | CoopVecMatrixLayout::TrainingOptimal, 100 | ComponentType 101 | >(parameters, derivatives, matrixOffsets, biasOffsets); 102 | 103 | mlp.backward(inputParams, hiddenAct, outputAct, dOutputActivated); 104 | } 105 | } 106 | 107 | struct InferenceMLPModule< 108 | T : __BuiltinFloatingPointType, 109 | let NumHiddenLayers : int, 110 | let InputNeurons : int, 111 | let HiddenNeurons : int, 112 | let OutputNeurons : int, 113 | let ComponentType : CoopVecComponentType, 114 | HiddenAct : mlp::IActivation, 115 | OutputAct : mlp::IActivation 116 | > : IModule 117 | { 118 | ByteAddressBuffer parameters; 119 | uint matrixOffsets[NumHiddenLayers + 1]; 120 | uint biasOffsets[NumHiddenLayers + 1]; 121 | 122 | HiddenAct hiddenAct; 123 | OutputAct outputAct; 124 | 125 | [TreatAsDifferentiable] 126 | CoopVec forward(CoopVec inputParams) 127 | { 128 | var mlp = mlp::InferenceMLP< 129 | T, 130 | NumHiddenLayers, 131 | InputNeurons, 132 | HiddenNeurons, 133 | OutputNeurons, 134 | CoopVecMatrixLayout::InferencingOptimal, 135 | ComponentType 136 | >(parameters, matrixOffsets, biasOffsets); 137 | return mlp.forward(inputParams, hiddenAct, outputAct); 138 | } 139 | } 140 | 141 | struct FrequencyEncoding : IModule 142 | { 143 | [BackwardDifferentiable] 144 | CoopVec forward(CoopVec inputParams) 145 | { 146 | return rtxns::EncodeFrequencyN(inputParams); 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /samples/SlangpyTraining/SlangpyInference.slang: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 
3 | *
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */
10 |
11 | #include "NetworkConfig.h"
12 | #include <donut/shaders/binding_helpers.hlsli>
13 |
14 | import SlangpyTraining;
15 |
16 | DECLARE_CBUFFER(NeuralConstants, gConst, 0, 0);
17 | ByteAddressBuffer gMLPParams :REGISTER_SRV(0, 0);
18 | Texture2D inputTexture :REGISTER_SRV(1, 0);
19 | RWTexture2D<float4> outputTexture :REGISTER_UAV(0, 0);
20 |
21 | float3 evalModel(ByteAddressBuffer weights, uint wo[MAX_LAYER_COUNT], uint bo[MAX_LAYER_COUNT], float2 uv)
22 | {
23 | // Auto-generated defines from the training script (SlangpyTraining.py)
24 | MODEL_TYPE model = MODEL_INITIALIZER;
25 |
26 | let inputParams = rtxns::CoopVecFromVector(uv);
27 |
28 | let result = model.forward(inputParams);
29 |
30 | return rtxns::VectorFromCoopVec(result);
31 | }
32 |
33 | [shader("compute")]
34 | [numthreads(8, 8, 1)]
35 | void main_cs(uint3 dispatchThreadID : SV_DispatchThreadID)
36 | {
37 | // Get the UV coordinate from the thread ID
38 | float2 inputUV = float2(dispatchThreadID.x / float(gConst.imageWidth), dispatchThreadID.y / float(gConst.imageHeight));
39 |
40 | // Load offsets
41 | uint weightOffsets[MAX_LAYER_COUNT] = rtxns::UnpackArray(gConst.weightOffsets);
42 | uint biasOffsets[MAX_LAYER_COUNT] = rtxns::UnpackArray(gConst.biasOffsets);
43 |
44 | // Run the model
45 | float3 modelOutput = evalModel(gMLPParams, weightOffsets, biasOffsets, inputUV);
46 |
47 | // Write to output
48 | outputTexture[dispatchThreadID.xy] = float4(modelOutput, 1.0f);
49 | }
--------------------------------------------------------------------------------
/samples/SlangpyTraining/SlangpyTraining.py:
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |
10 | from slangpy.backend import DataType
11 | from slangpy.core.module import Module
12 | from slangpy.types import NDBuffer, call_id
13 | import numpy as np
14 | import json
15 | import math
16 | import time
17 | import sys
18 |
19 | from Helpers import SDKSample
20 | from NeuralModules import CoopVecModule, TrainableMLP, FrequencyEncoding, ModuleChain
21 | from NeuralModules import Activation, NoneAct, LinearAct, ExponentialAct, ShiftedExponentialAct, ReLUAct, LeakyReLUAct, SigmoidAct, SwishAct, TanhAct
22 |
23 | # Set to True for interactive training. This can be helpful,
24 | # but it slows down training quite a bit
25 | INTERACTIVE = True
26 | if INTERACTIVE:
27 | import matplotlib.pyplot as plt
28 |
29 | def training_main():
30 | ##
31 | ## Setup window, device and file paths
32 | ##
33 | sample = SDKSample(sys.argv[1:])
34 | device = sample.device
35 |
36 | ##
37 | ## Set up training constants.
38 | ## When we train interactively, use fewer batches per epoch
39 | ## for faster feedback; a quick back-of-envelope sketch follows.
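# Back-of-envelope for the constants defined just below. The _sketch_* names
# are illustrative only (they are not used anywhere else in the script) and
# simply mirror the interactive defaults:
_sketch_samples_per_batch = 256 * 256                 # batch_shape below
_sketch_batches_per_epoch = 1000                      # num_batches_per_epoch when INTERACTIVE
_sketch_samples_per_epoch = _sketch_samples_per_batch * _sketch_batches_per_epoch  # 65,536,000
_sketch_num_epochs = 1_000_000_000 // _sketch_samples_per_epoch                    # 15 epochs for the 1G-sample target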
40 | ## 41 | batch_shape = (256, 256) 42 | learning_rate = 0.005 43 | grad_scale = 128.0 44 | loss_scale = grad_scale / math.prod(batch_shape) 45 | 46 | sample_target = 1000000000 47 | num_batches_per_epoch = 1000 if INTERACTIVE else 5000 48 | num_epochs = sample_target // (num_batches_per_epoch * math.prod(batch_shape)) 49 | 50 | ## 51 | ## Set up models 52 | ## 53 | 54 | # A basic MLP with ReLU activations and a linear output that maps a 2D UV input 55 | # to an RGB color. This is a good baseline, but it won't achieve state-of-the-art 56 | basic_mlp = TrainableMLP(device, DataType.float16, 57 | num_hidden_layers=3, 58 | input_width=2, 59 | hidden_width=32, 60 | output_width=3, 61 | hidden_act=ReLUAct(), 62 | output_act=NoneAct()) 63 | 64 | # Replacing ReLU with LeakyReLU makes training more stable for small networks, 65 | # and a Sigmoid activation at the output helps bring the network into the right range 66 | better_activations = TrainableMLP(device, DataType.float16, 67 | num_hidden_layers=3, 68 | input_width=2, 69 | hidden_width=32, 70 | output_width=3, 71 | hidden_act=LeakyReLUAct(), 72 | output_act=SigmoidAct()) 73 | 74 | # For 2D or 3D inputs, we can do even better with an input encoding 75 | # We need to adjust the input width of the MLP to take the additional 76 | # outputs from the encoding 77 | encoding = FrequencyEncoding(DataType.float16, 2, 3) 78 | mlp_with_encoding = ModuleChain( 79 | encoding, 80 | TrainableMLP(device, DataType.float16, 81 | num_hidden_layers=3, 82 | input_width=encoding.fan_out, 83 | hidden_width=32, 84 | output_width=3, 85 | hidden_act=LeakyReLUAct(), 86 | output_act=SigmoidAct()) 87 | ) 88 | 89 | # We're not limited to predefined modules - for example, try using the custom 90 | # activation from the slang file: 91 | activation = SigmoidAct() 92 | #activation = Activation("SiLUActivation") 93 | 94 | # Now take the working model and scale up the number of weights by adding another layer 95 | larger_mlp = ModuleChain( 96 | encoding, 97 | TrainableMLP(device, DataType.float16, 98 | num_hidden_layers=4, 99 | input_width=encoding.fan_out, 100 | hidden_width=32, 101 | output_width=3, 102 | hidden_act=LeakyReLUAct(), 103 | output_act=activation) 104 | ) 105 | 106 | # Make a list of models to be optimized so we can compare them 107 | models = [ 108 | ("Basic MLP", basic_mlp), 109 | ("+Better activations", better_activations), 110 | ("+Frequency encoding", mlp_with_encoding), 111 | ("+More Weights", larger_mlp), 112 | ] 113 | 114 | # You can also play with different losses. 
For images, L2 is not a bad default.
115 | loss_name = "rtxns::mlp::L2"
116 |
117 | ##
118 | ## Load training data and slang code
119 | ##
120 | target_tex = sample.load_texture("nvidia-logo.png")
121 |
122 | module = Module.load_from_file(device, "SlangpyTraining.slang")
123 |
124 | # Instantiate the slang RNG from the loaded module,
125 | # seeded with a random buffer of uints
126 | pcg = np.random.PCG64(seed=12345)
127 | seeds = pcg.random_raw(batch_shape).astype(np.uint32)
128 | rng = module.RNG(seeds)
129 |
130 | # Fill a buffer with UVs for later evaluating the model during training
131 | vis_resolution = 256
132 | span = np.linspace(0, 1, vis_resolution, dtype=np.float32)
133 | vis_uvs_np = np.stack(np.broadcast_arrays(span[None, :], span[:, None]), axis=2)
134 | vis_uvs = NDBuffer(device, module.float2.struct, shape=(vis_resolution, vis_resolution))
135 | vis_uvs.copy_from_numpy(vis_uvs_np)
136 |
137 | # Create a figure to fill out as we go
138 | if INTERACTIVE:
139 | n = len(models)
140 | fig, axes = plt.subplots(2, n, dpi=200, figsize=(2.4 * n, 4.8), squeeze=False)
141 | plt.ion()
142 | plt.show()
143 |
144 | black = np.zeros((vis_resolution, vis_resolution, 3), dtype=np.uint8)
145 | canvases = []
146 | for i, (model_name, _) in enumerate(models):
147 | axes[0, i].text(0.5, 1.05, f"{model_name}", horizontalalignment='center', size=8)
148 | top = axes[0, i].imshow(black, extent=(0, 1, 0, 1), vmin=0, vmax=1)
149 | bot = axes[1, i].imshow(black, extent=(0, 1, 0, 1), vmin=0, vmax=1)
150 | canvases.append([top, bot])
151 | axes[0, i].set_axis_off()
152 | axes[1, i].set_axis_off()
153 | fig.tight_layout(h_pad=-1, w_pad=0.5)
154 |
155 |
156 | for i, (model_name, model) in enumerate(models):
157 | print(f"Training model {model_name}")
158 |
159 | assert len(model.parameters()) == 1, "Only one set of parameters is supported in this sample"
160 | assert model.fan_in == 2 and model.fan_out == 3, "Model must have 2 inputs (UV) and 3 outputs (RGB)"
161 |
162 | ##
163 | ## Set up optimizer and specialize the slang functions to our model
164 | ##
165 | grads = model.gradients()[0]
166 | parameters = model.parameters()[0]
167 |
168 | parametersF = module.ConvertToFloat(parameters)
169 |
170 | # These match up with the argument names of OptimizerStep in SlangpyTraining.slang
171 | optimizer_state = {
172 | "moments1": NDBuffer.zeros_like(parametersF),
173 | "moments2": NDBuffer.zeros_like(parametersF),
174 | "paramF": parametersF,
175 | "paramH": parameters,
176 | "grad": grads,
177 | "learningRate": learning_rate,
178 | "gradScale": grad_scale
179 | }
180 | num_params = parameters.shape[0]
181 |
182 | # Specialize slang functions by substituting generic parameters
183 | optimizer_step = module.OptimizerStep
184 | train_texture = module[f"TrainTexture<{model.type_name}, {loss_name} >"]
185 | eval_model = module[f"EvalModel<{model.type_name} >"]
186 | eval_loss = module[f"EvalLoss<{loss_name} >"]
187 |
188 | # Begin main training loop
189 | iteration = 1
190 | for epoch in range(num_epochs):
191 | start = time.time()
192 |
193 | cmd = device.create_command_buffer()
194 | cmd.open()
195 | # Each batch is recorded into the same command buffer (see the note after this loop)
196 | for batch in range(num_batches_per_epoch):
197 | # Compute gradients
198 | train_texture.append_to(cmd, model, rng, target_tex, loss_scale)
199 | # Do one parameter optimization step using those gradients
200 | optimizer_step.append_to(cmd, idx=call_id((num_params, )), iteration=iteration, **optimizer_state)
201 | iteration += 1
202 | cmd.close()
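# All batches of an epoch are recorded into this single command buffer and
# submitted once below, avoiding a CPU/GPU round trip per batch. For
# reference, the per-parameter update that the GPU-side OptimizerStep applies
# is standard Adam; a pure-Python sketch follows. The beta/eps defaults are
# the usual Adam constants and are an assumption here - the authoritative
# values live in the SDK's Optimizers.slang, and the GPU version also divides
# the incoming gradient by gradScale before use:
def _adam_step_sketch(param, grad, m1, m2, t, lr=0.005,
                      beta1=0.9, beta2=0.999, eps=1e-8):
    m1 = beta1 * m1 + (1 - beta1) * grad          # first-moment EMA
    m2 = beta2 * m2 + (1 - beta2) * grad * grad   # second-moment EMA
    m1_hat = m1 / (1 - beta1 ** t)                # bias correction (t starts at 1)
    m2_hat = m2 / (1 - beta2 ** t)
    param -= lr * m1_hat / (m2_hat ** 0.5 + eps)
    return param, m1, m2
203 |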
device.submit_command_buffer(cmd) 204 | device.wait() 205 | end = time.time() 206 | 207 | device.run_garbage_collection() 208 | 209 | # Print out progress info 210 | elapsed = end - start 211 | num_samples_per_epoch = math.prod(batch_shape) * num_batches_per_epoch 212 | progress = (num_samples_per_epoch * (epoch + 1)) // 1000000 213 | info = (f"Epoch {epoch + 1} complete, " 214 | f"{progress}/{sample_target // 1000000} MSamples: " 215 | f"Time: {elapsed:.3f}s " 216 | f"Throughput: {num_samples_per_epoch / elapsed * 1e-6:.2f} MSamples/s") 217 | 218 | # In the interactive case, draw updates to window and compute loss. This goes 219 | # through the CPU, so this is quite slow 220 | if INTERACTIVE: 221 | current_prediction = eval_model(model, vis_uvs, _result=np.ndarray) 222 | loss_val = np.mean(eval_loss(vis_uvs, current_prediction, target_tex, _result=np.ndarray)) 223 | diff = module.TextureDifference(vis_uvs, current_prediction, target_tex, 10.0, _result=np.ndarray) 224 | 225 | info += f" Loss: {loss_val:.3f}" 226 | 227 | current_prediction = np.clip(current_prediction, 0, 1) 228 | diff = np.clip(diff, 0, 1) 229 | 230 | canvases[i][0].set_data(current_prediction) 231 | canvases[i][1].set_data(diff) 232 | fig.canvas.draw() 233 | fig.canvas.flush_events() 234 | 235 | print(info) 236 | 237 | print("Training complete!") 238 | 239 | best_model = models[-1][1] 240 | 241 | weight_path = sample.spy_sample_dir / "weights.json" 242 | print(f"Writing trained weights of best model to {weight_path}") 243 | param_dict = best_model.serialize() 244 | open(weight_path, "w").write(json.dumps(param_dict, indent=4)) 245 | 246 | print(f"Compiling inference shader...") 247 | sample.compile_inference_shader(best_model) 248 | 249 | print(f"Running RTXNS inference...") 250 | if INTERACTIVE: 251 | plt.close() 252 | sample.run_sdk_inference(weight_path) 253 | 254 | if __name__ == "__main__": 255 | training_main() 256 | -------------------------------------------------------------------------------- /samples/SlangpyTraining/SlangpyTraining.slang: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: Apache-2.0 2 | // clang-format off 3 | 4 | /* 5 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 6 | * 7 | * NVIDIA CORPORATION and its licensors retain all intellectual property 8 | * and proprietary rights in and to this software, related documentation 9 | * and any modifications thereto. Any use, reproduction, disclosure or 10 | * distribution of this software and related documentation without an express 11 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 
12 | */
13 |
14 | __exported import NeuralModules;
15 | __exported import Loss;
16 | __exported import Optimizers;
17 |
18 | struct RNG
19 | {
20 | uint state;
21 |
22 | __init(uint state) { this.state = state; }
23 |
24 | [mutating]
25 | float next()
26 | {
27 | float r = (state >> 8) * 0x1p-24;
28 | state = state * 2739110765U + 2739110765U;
29 | return r;
30 | }
31 | }
32 |
33 | // An example of adding a custom activation to your network
34 | // This implements the Sigmoid Linear Unit (SiLU)
35 | struct SiLUActivation<T : __BuiltinFloatingPointType> : rtxns::mlp::IActivation<T>
36 | {
37 | [Differentiable]
38 | CoopVec<T, K> eval<let K : int>(CoopVec<T, K> x)
39 | {
40 | return x * no_diff CoopVec<T, K>(T(1.)) / (no_diff CoopVec<T, K>(T(1.)) + exp(-x));
41 | }
42 | }
43 |
44 | // Utility function for nearest-neighbor sampling of a texture
45 | T SampleTexture<T>(Texture2D<T> tex, float2 uv)
46 | {
47 | float2 size;
48 | tex.GetDimensions(size[0], size[1]);
49 | uint2 xy = uint2(uv * size);
50 | return tex[xy];
51 | }
52 |
53 | // Take one step with the Adam optimizer
54 | void OptimizerStep(
55 | RWBuffer<float> moments1,
56 | RWBuffer<float> moments2,
57 | RWBuffer<float> paramF,
58 | RWBuffer<half> paramH,
59 | RWBuffer<half> grad,
60 | uint idx,
61 | float learningRate,
62 | float gradScale,
63 | int iteration)
64 | {
65 | var optimizer = optimizers::Adam(moments1, moments2, learningRate, gradScale);
66 |
67 | // Parameters are converted to FP16 for computing gradients,
68 | // but we keep the FP32 originals around so we don't accumulate
69 | // rounding errors
70 | float parameter = paramF[idx];
71 | float gradient = (float)grad[idx];
72 |
73 | parameter = optimizer.step(parameter, idx, gradient, iteration);
74 |
75 | // Update the reference FP32 parameter, and convert the new value back to FP16
76 | paramF[idx] = parameter;
77 | paramH[idx] = (half)parameter;
78 | // Zero out gradients
79 | grad[idx] = 0.0h;
80 | }
81 |
82 | void TrainTexture<Model : rtxns::IModule<half, 2, 3>, Loss : rtxns::mlp::ILoss<3>>(Model model, inout RNG rng, Texture2D targetTex, float lossScale)
83 | {
84 | // Get a random uv coordinate for the input
85 | float2 inputUV = clamp(float2(rng.next(), rng.next()), 0.0, 1.0);
86 |
87 | // Sample the target texture at the generated UV
88 | float3 targetRGB = SampleTexture(targetTex, inputUV).rgb;
89 |
90 | // Evaluate the current output of the model
91 | float3 predictedRGB = EvalModel(model, inputUV);
92 |
93 | // Evaluate the loss gradient
94 | float3 lossGradient = Loss.deriv(targetRGB, predictedRGB, lossScale);
95 |
96 | // Backpropagate the gradient through the network parameters
97 | bwd_diff(EvalModel)(model, inputUV, lossGradient);
98 | }
99 |
100 | // Convenience functions for evaluating the model from vector inputs
101 | // Converts to/from CoopVec internally
102 | [Differentiable]
103 | float3 EvalModel<Model : rtxns::IModule<half, 2, 3>>(Model model, no_diff float2 inputUV)
104 | {
105 | var inputVec = rtxns::CoopVecFromVector(inputUV);
106 |
107 | var result = model.forward(inputVec);
108 |
109 | return rtxns::VectorFromCoopVec(result);
110 | }
111 |
112 | // Computes the loss between the predicted RGB at a given UV coordinate and a reference texture
113 | float3 EvalLoss<Loss : rtxns::mlp::ILoss<3>>(float2 inputUV, float3 predictedRGB, Texture2D targetTex)
114 | {
115 | float3 targetRGB = SampleTexture(targetTex, inputUV).rgb;
116 |
117 | return Loss.value(targetRGB, predictedRGB, 1.0f);
118 | }
119 |
120 | // Computes the difference between the predicted RGB at a given UV coordinate and a reference texture
121 | // for visualization
122 | float3 TextureDifference(float2 inputUV, float3 predictedRGB, Texture2D targetTex, float scale)
123 | {
124 | float3 targetRGB =
SampleTexture(targetTex, inputUV).rgb; 125 | 126 | return (predictedRGB - targetRGB) * scale + 0.5f; 127 | } 128 | 129 | // Convenience function to convert from half to float params 130 | float ConvertToFloat(half paramH) 131 | { 132 | return (float)paramH; 133 | } 134 | -------------------------------------------------------------------------------- /samples/SlangpyTraining/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib>=3.0,<4.0 2 | numpy>=2.0,<3.0 3 | slangpy==0.19.4 4 | -------------------------------------------------------------------------------- /samples/SlangpyTraining/shaders.cfg: -------------------------------------------------------------------------------- 1 | SlangpyInference.slang -E main_cs -T cs -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | 10 | set(LIBRARY_FILTER src) 11 | add_subdirectory(NeuralShading) 12 | add_subdirectory(Utils) 13 | add_subdirectory(NeuralShading_Shaders) 14 | 15 | -------------------------------------------------------------------------------- /src/NeuralShading/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | 10 | 11 | file(GLOB sources "*.cpp" "*.h") 12 | 13 | set(project NeuralShading) 14 | set(folder "${LIBRARY_FILTER}/NeuralShading") 15 | 16 | add_library(${project} STATIC EXCLUDE_FROM_ALL ${sources}) 17 | target_include_directories(${project} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 18 | target_link_libraries(${project} donut_app donut_engine) 19 | 20 | set_target_properties(${project} PROPERTIES 21 | FOLDER ${folder} 22 | ) 23 | -------------------------------------------------------------------------------- /src/NeuralShading/CoopVector.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | */ 10 | 11 | #include "CoopVector.h" 12 | #include <cassert> 13 | #include <vector> 14 | #if DONUT_WITH_VULKAN 15 | #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 16 | #include <vulkan/vulkan.hpp> 17 | #endif 18 | 19 | #if DONUT_WITH_DX12 20 | #include <wrl.h> 21 | #include <wrl/client.h> 22 | #endif 23 | 24 | using namespace rtxns; 25 | 26 | namespace 27 | { 28 | /** 29 | * Bytes between consecutive rows (row-major) or columns (column-major). 30 | * The stride is only meaningful for row/column-major layouts; optimal layouts return 0. 31 | **/ 32 | size_t GetStride(const MatrixLayout layout, const uint32_t rows, const uint32_t cols, const size_t precision) 33 | { 34 | size_t stride = 0; 35 | if (layout == MatrixLayout::RowMajor) 36 | { 37 | stride = cols * precision; 38 | } 39 | else if (layout == MatrixLayout::ColumnMajor) 40 | { 41 | stride = rows * precision; 42 | } 43 | return stride; 44 | } 45 | } // namespace 46 | 47 | #if DONUT_WITH_VULKAN 48 | namespace 49 | { 50 | 51 | VkComponentTypeKHR GetVkComponentType(rtxns::Precision precision) 52 | { 53 | return precision == rtxns::Precision::F16 ? VK_COMPONENT_TYPE_FLOAT16_NV : VK_COMPONENT_TYPE_FLOAT32_NV; 54 | } 55 | 56 | VkCooperativeVectorMatrixLayoutNV GetVkLayout(const MatrixLayout layout) 57 | { 58 | switch (layout) 59 | { 60 | case MatrixLayout::RowMajor: 61 | return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_ROW_MAJOR_NV; 62 | case MatrixLayout::ColumnMajor: 63 | return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_COLUMN_MAJOR_NV; 64 | case MatrixLayout::InferencingOptimal: 65 | return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_INFERENCING_OPTIMAL_NV; 66 | case MatrixLayout::TrainingOptimal: 67 | return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_TRAINING_OPTIMAL_NV; 68 | default: 69 | return VK_COOPERATIVE_VECTOR_MATRIX_LAYOUT_MAX_ENUM_NV; 70 | } 71 | } 72 | 73 | VkConvertCooperativeVectorMatrixInfoNV GetVkConvertLayerDesc( 74 | int rows, int columns, Precision precision, MatrixLayout srcLayout, MatrixLayout dstLayout, size_t srcSize, size_t* dstSize, uint64_t srcData = 0, uint64_t dstData = 0) 75 | { 76 | VkConvertCooperativeVectorMatrixInfoNV info{}; 77 | info.sType = VK_STRUCTURE_TYPE_CONVERT_COOPERATIVE_VECTOR_MATRIX_INFO_NV; 78 | info.pNext = nullptr; 79 | info.numRows = rows; 80 | info.numColumns = columns; 81 | info.srcComponentType = GetVkComponentType(precision); 82 | info.srcLayout = GetVkLayout(srcLayout); 83 | info.srcStride = GetStride(MatrixLayout::RowMajor, rows, columns, GetSize(precision)); 84 | info.srcSize = srcSize; 85 | info.srcData.deviceAddress = srcData; 86 | info.dstComponentType = GetVkComponentType(precision); 87 | info.dstLayout = GetVkLayout(dstLayout); 88 | info.dstStride = GetStride(dstLayout, rows, columns, GetSize(precision)); 89 | info.pDstSize = dstSize; 90 | info.dstData.deviceAddress = dstData; 91 | return info; 92 | } 93 | 94 | } // namespace 95 | 96 | CoopVectorUtils_VK::CoopVectorUtils_VK(VkDevice vkDevice) 97 | { 98 | m_vkDevice = vkDevice; 99 | assert(m_vkDevice != VK_NULL_HANDLE && "Failed to get Vulkan device handle from GFX."); 100 | 101 | m_vkConvertCooperativeVectorMatrixNV = 102 | (PFN_vkConvertCooperativeVectorMatrixNV)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkConvertCooperativeVectorMatrixNV"); 103 | assert(m_vkConvertCooperativeVectorMatrixNV != nullptr && "Failed to get Vulkan function 'vkConvertCooperativeVectorMatrixNV'."); 104 | 105 | m_vkCmdConvertCooperativeVectorMatrixNV = 106 | (PFN_vkCmdConvertCooperativeVectorMatrixNV)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkCmdConvertCooperativeVectorMatrixNV"); 107 | assert(m_vkCmdConvertCooperativeVectorMatrixNV != nullptr && "Failed to get Vulkan function 'vkCmdConvertCooperativeVectorMatrixNV'."); 108 | 109 | m_vkCmdCopyBuffer = (PFN_vkCmdCopyBuffer)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkCmdCopyBuffer"); 110 | assert(m_vkCmdCopyBuffer != nullptr && "Failed to get Vulkan function 'vkCmdCopyBuffer'."); 111 | 112 | m_vkGetBufferDeviceAddress = (PFN_vkGetBufferDeviceAddress)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr(m_vkDevice, "vkGetBufferDeviceAddress"); 113 | assert(m_vkGetBufferDeviceAddress != nullptr && "Failed to get Vulkan function 'vkGetBufferDeviceAddress'."); 114 | }
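// Note (implementation detail): QueryMatrixByteSize below uses the conversion
// entry point in "query" mode - with a null destination address,
// vkConvertCooperativeVectorMatrixNV only writes the required destination size
// to *pDstSize, which is then used to size the per-layer allocations.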
115 | 116 | size_t CoopVectorUtils_VK::QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision) 117 | { 118 | assert(m_vkDevice); 119 | assert(m_vkConvertCooperativeVectorMatrixNV); 120 | assert(rows > 0 && rows <= 128 && "Number of rows must be 1..128."); 121 | assert(cols > 0 && cols <= 128 && "Number of columns must be 1..128."); 122 | 123 | size_t requiredSize = 0; 124 | 125 | VkConvertCooperativeVectorMatrixInfoNV info = GetVkConvertLayerDesc(rows, cols, precision, MatrixLayout::RowMajor, layout, 0, &requiredSize); 126 | 127 | VkResult res = m_vkConvertCooperativeVectorMatrixNV(m_vkDevice, &info); 128 | assert(res == VK_SUCCESS && "Call to vkConvertCooperativeVectorMatrixNV failed"); 129 | assert(requiredSize > 0 && "Expected matrix size to be larger than zero."); 130 | 131 | return requiredSize; 132 | } 133 | 134 | void CoopVectorUtils_VK::ConvertDeviceMatrixLayout( 135 | NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const 136 | { 137 | VkCommandBuffer vkCmdBuf = static_cast<VkCommandBuffer>(commandList); 138 | VkBuffer vkSrcBuffer = static_cast<VkBuffer>(srcBuffer); 139 | VkBuffer vkDstBuffer = static_cast<VkBuffer>(dstBuffer); 140 | 141 | // Obtain the device addresses of the buffers for the conversion functions 142 | VkBufferDeviceAddressInfo bufferDeviceAddressInfo{}; 143 | bufferDeviceAddressInfo.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO; 144 | bufferDeviceAddressInfo.buffer = vkSrcBuffer; 145 | VkDeviceAddress const srcBufferVA = m_vkGetBufferDeviceAddress(m_vkDevice, &bufferDeviceAddressInfo); 146 | bufferDeviceAddressInfo.buffer = vkDstBuffer; 147 | VkDeviceAddress const dstBufferVA = m_vkGetBufferDeviceAddress(m_vkDevice, &bufferDeviceAddressInfo); 148 | 149 | // Convert weights 150 | std::vector<VkConvertCooperativeVectorMatrixInfoNV> convertInfos(srcLayout.networkLayers.size()); 151 | for (int i = 0; i < srcLayout.networkLayers.size(); i++) 152 | { 153 | // Weights 154 | size_t dstLayerSize = dstLayout.networkLayers[i].weightSize; 155 | convertInfos[i] = 156 | GetVkConvertLayerDesc(srcLayout.networkLayers[i].outputs, srcLayout.networkLayers[i].inputs, srcLayout.matrixPrecision, srcLayout.matrixLayout, dstLayout.matrixLayout, 157 | srcLayout.networkLayers[i].weightSize, &dstLayerSize, srcBufferVA + srcBufferOffset + srcLayout.networkLayers[i].weightOffset, 158 | dstBufferVA + dstBufferOffset + dstLayout.networkLayers[i].weightOffset); 159 | } 160 | m_vkCmdConvertCooperativeVectorMatrixNV(vkCmdBuf, (uint32_t)convertInfos.size(), convertInfos.data()); 161 | 162 | // Copy the bias 163 | std::vector<VkBufferCopy> copyRegions(srcLayout.networkLayers.size()); 164 | for (int i = 0; i < srcLayout.networkLayers.size(); i++) 165 | { 166 | copyRegions[i].srcOffset = srcBufferOffset + srcLayout.networkLayers[i].biasOffset; 167 | copyRegions[i].dstOffset = dstBufferOffset +
dstLayout.networkLayers[i].biasOffset; 168 | copyRegions[i].size = srcLayout.networkLayers[i].biasSize; 169 | } 170 | m_vkCmdCopyBuffer(vkCmdBuf, vkSrcBuffer, vkDstBuffer, (uint32_t)copyRegions.size(), copyRegions.data()); 171 | } 172 | #endif 173 | 174 | #if DONUT_WITH_DX12 175 | 176 | namespace 177 | { 178 | D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT GetDX12MatrixLayout(const MatrixLayout layout) 179 | { 180 | switch (layout) 181 | { 182 | case MatrixLayout::RowMajor: 183 | return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR; 184 | case MatrixLayout::ColumnMajor: 185 | return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR; 186 | case MatrixLayout::InferencingOptimal: 187 | return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL; 188 | case MatrixLayout::TrainingOptimal: 189 | return D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL; 190 | } 191 | } 192 | 193 | D3D12_LINEAR_ALGEBRA_DATATYPE GetDX12ComponentType(rtxns::Precision precision) 194 | { 195 | return precision == rtxns::Precision::F16 ? D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 : D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32; 196 | } 197 | 198 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_DEST_INFO GetDX12ConvertLayerDestInfo(int rows, int columns, MatrixLayout layout, Precision precision) 199 | { 200 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_DEST_INFO info{}; 201 | info.DestLayout = GetDX12MatrixLayout(layout); 202 | info.NumRows = rows; 203 | info.NumColumns = columns; 204 | info.DestStride = UINT(GetStride(layout, rows, columns, GetSize(precision))); 205 | info.DestDataType = GetDX12ComponentType(precision); 206 | return info; 207 | } 208 | 209 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO GetDX12ConvertLayerDesc( 210 | int rows, int columns, Precision precision, MatrixLayout srcLayout, MatrixLayout dstLayout, size_t srcSize, size_t dstSize, uint64_t srcData, uint64_t dstData) 211 | { 212 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO info{}; 213 | info.DestInfo = GetDX12ConvertLayerDestInfo(rows, columns, dstLayout, precision); 214 | info.DestInfo.DestSize = UINT(dstSize); 215 | info.SrcInfo.SrcSize = UINT(srcSize); 216 | info.SrcInfo.SrcDataType = GetDX12ComponentType(precision); 217 | info.SrcInfo.SrcLayout = GetDX12MatrixLayout(srcLayout); 218 | info.SrcInfo.SrcStride = UINT(GetStride(MatrixLayout::RowMajor, rows, columns, GetSize(precision))); 219 | info.DataDesc.SrcVA = srcData; 220 | info.DataDesc.DestVA = dstData; 221 | return info; 222 | } 223 | 224 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO GetDX12CopyScaleBiasDesc(size_t biasSize, Precision precision, uint64_t srcData, uint64_t dstData) 225 | { 226 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO info{}; 227 | info.DestInfo.DestSize = UINT(biasSize); 228 | info.DestInfo.DestLayout = D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR; 229 | info.DestInfo.DestStride = info.DestInfo.DestSize; 230 | info.DestInfo.NumRows = 1; 231 | info.DestInfo.NumColumns = UINT(biasSize / GetSize(precision)); 232 | info.DestInfo.DestDataType = GetDX12ComponentType(precision); 233 | info.SrcInfo.SrcSize = info.DestInfo.DestSize; 234 | info.SrcInfo.SrcDataType = info.DestInfo.DestDataType; 235 | info.SrcInfo.SrcLayout = info.DestInfo.DestLayout; 236 | info.SrcInfo.SrcStride = info.DestInfo.DestStride; 237 | info.DataDesc.SrcVA = srcData; 238 | info.DataDesc.DestVA = dstData; 239 | return info; 240 | } 241 | } // namespace 242 | 243 | CoopVectorUtils_DX12::CoopVectorUtils_DX12(ID3D12Device* d3d12Device) 244 | { 245 | m_d3d12Device = d3d12Device; 246 | assert(m_d3d12Device != nullptr && "Failed to get D3D12 device 
from GFX."); 247 | } 248 | 249 | /** 250 | * Query the size of a matrix in bytes. 251 | * @return Size of matrix in bytes. 252 | */ 253 | size_t CoopVectorUtils_DX12::QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision /*= Precision::F16*/) 254 | { 255 | assert(m_d3d12Device); 256 | assert(rows > 0 && rows <= 128 && "Number of rows must be 1..128."); 257 | assert(cols > 0 && cols <= 128 && "Number of columns must be 1..128."); 258 | 259 | Microsoft::WRL::ComPtr<ID3D12DevicePreview> devicePreview; 260 | HRESULT hr = m_d3d12Device->QueryInterface(IID_PPV_ARGS(&devicePreview)); 261 | assert(hr == S_OK && "Failed to get device preview"); 262 | 263 | D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_DEST_INFO info = GetDX12ConvertLayerDestInfo(rows, cols, layout, precision); 264 | 265 | devicePreview->GetLinearAlgebraMatrixConversionDestinationInfo(&info); 266 | 267 | assert(info.DestSize > 0 && "Expected matrix size to be larger than zero."); 268 | return info.DestSize; 269 | } 270 | 271 | void rtxns::CoopVectorUtils_DX12::ConvertDeviceMatrixLayout( 272 | NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const 273 | { 274 | ID3D12GraphicsCommandList* d3dCmdList = static_cast<ID3D12GraphicsCommandList*>(commandList); 275 | ID3D12Resource* d3dSrcBuffer = static_cast<ID3D12Resource*>(srcBuffer); 276 | ID3D12Resource* d3dDstBuffer = static_cast<ID3D12Resource*>(dstBuffer); 277 | 278 | Microsoft::WRL::ComPtr<ID3D12GraphicsCommandListPreview> commandListPreview; 279 | HRESULT hr = d3dCmdList->QueryInterface(IID_PPV_ARGS(&commandListPreview)); 280 | assert(hr == S_OK && "Command list provided does not support matrix conversion"); 281 | 282 | D3D12_GPU_VIRTUAL_ADDRESS const srcBufferVA = d3dSrcBuffer->GetGPUVirtualAddress(); 283 | D3D12_GPU_VIRTUAL_ADDRESS const dstBufferVA = d3dDstBuffer->GetGPUVirtualAddress(); 284 | 285 | // We need conversion data for the weights and bias of each layer separately, so two entries per layer 286 | std::vector<D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO> convertInfos(srcLayout.networkLayers.size() * 2); 287 | 288 | // Convert weights 289 | for (int i = 0; i < srcLayout.networkLayers.size(); i++) 290 | { 291 | // Weights 292 | convertInfos[i] = GetDX12ConvertLayerDesc(srcLayout.networkLayers[i].outputs, srcLayout.networkLayers[i].inputs, srcLayout.matrixPrecision, srcLayout.matrixLayout, 293 | dstLayout.matrixLayout, srcLayout.networkLayers[i].weightSize, dstLayout.networkLayers[i].weightSize, 294 | srcBufferVA + srcBufferOffset + srcLayout.networkLayers[i].weightOffset, 295 | dstBufferVA + dstBufferOffset + dstLayout.networkLayers[i].weightOffset); 296 | } 297 | 298 | // Convert bias 299 | // D3D's CopyBufferRegion requires resource states incompatible with the conversion ops. 300 | // Use a degenerate form of a matrix conversion to copy the extra data to avoid placing a barrier.
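// For example, a bias vector of 32 FP16 values (64 bytes) is described to the
// API as a 1x32 row-major matrix with a 64-byte stride, so the "conversion"
// recorded below degenerates into a byte-for-byte copy.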
299 | int infoOffset = int(srcLayout.networkLayers.size()); 300 | for (int ii = 0; ii < srcLayout.networkLayers.size(); ii++) 301 | { 302 | convertInfos[ii + infoOffset] = 303 | GetDX12CopyScaleBiasDesc(srcLayout.networkLayers[ii].biasSize, srcLayout.matrixPrecision, srcBufferVA + srcBufferOffset + srcLayout.networkLayers[ii].biasOffset, 304 | dstBufferVA + dstBufferOffset + dstLayout.networkLayers[ii].biasOffset); 305 | } 306 | commandListPreview->ConvertLinearAlgebraMatrix(convertInfos.data(), UINT(convertInfos.size())); 307 | } 308 | #endif 309 | 
-------------------------------------------------------------------------------- /src/NeuralShading/CoopVector.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | #pragma once 12 | 13 | #if DONUT_WITH_DX12 14 | #include "../../external/dx12-agility-sdk/build/native/include/d3d12.h" 15 | #endif 16 | 17 | #include <cstddef> 18 | #include <cstdint> 19 | 20 | 21 | #include "Float16.h" 22 | #include "NeuralNetworkTypes.h" 23 | 24 | namespace rtxns 25 | { 26 | 27 | class ICoopVectorUtils 28 | { 29 | public: 30 | size_t GetMatrixAlignment() 31 | { 32 | return s_matrixAlignment; 33 | } 34 | size_t GetVectorAlignment() 35 | { 36 | return s_vectorAlignment; 37 | } 38 | 39 | /** 40 | * Query the size of a matrix in bytes. 41 | * @return Size of matrix in bytes. 42 | */ 43 | virtual size_t QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision = Precision::F16) = 0; 44 | 45 | /** 46 | * Convert matrix on the device between any layouts. 47 | * The Precision must currently be the same. 48 | * The conversion is recorded on the provided command list. 49 | */ 50 | virtual void ConvertDeviceMatrixLayout(NetworkLayout const& srcLayout, 51 | NetworkLayout const& dstLayout, 52 | void* srcBuffer, 53 | uint64_t srcBufferOffset, 54 | void* dstBuffer, 55 | uint64_t dstBufferOffset, 56 | void* commandList) const = 0; 57 | 58 | protected: 59 | static const size_t s_matrixAlignment = 64; ///< Minimum byte alignment according to spec. 60 | static const size_t s_vectorAlignment = 16; ///< Minimum byte alignment according to spec. 61 | };
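// Typical use (illustrative sketch, not part of this header): query the
// device-optimal size for each layer, allocate the destination buffer with
// suitably aligned layer offsets, then record the conversion:
//
//   size_t bytes = coopVecUtils.QueryMatrixByteSize(rows, cols, MatrixLayout::TrainingOptimal);
//   // ... allocate dstBuffer using 'bytes' and the 64-byte matrix alignment ...
//   coopVecUtils.ConvertDeviceMatrixLayout(hostLayout, deviceLayout, srcBuf, 0, dstBuf, 0, cmdList);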
62 | 63 | #if DONUT_WITH_VULKAN 64 | class CoopVectorUtils_VK : public ICoopVectorUtils 65 | { 66 | public: 67 | CoopVectorUtils_VK(VkDevice vkDevice); 68 | 69 | /** 70 | * Query the size of a matrix in bytes. 71 | * @return Size of matrix in bytes. 72 | */ 73 | size_t QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision = Precision::F16); 74 | 75 | /** 76 | * Convert matrix on the device between any layouts. 77 | * The Precision must currently be the same. 78 | * The conversion is recorded on the provided command list. 79 | */ 80 | void ConvertDeviceMatrixLayout( 81 | NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const; 82 | 83 | private: 84 | VkDevice m_vkDevice = nullptr; 85 | PFN_vkConvertCooperativeVectorMatrixNV m_vkConvertCooperativeVectorMatrixNV = nullptr; 86 | PFN_vkCmdConvertCooperativeVectorMatrixNV m_vkCmdConvertCooperativeVectorMatrixNV = nullptr; 87 | PFN_vkCmdCopyBuffer m_vkCmdCopyBuffer = nullptr; 88 | PFN_vkGetBufferDeviceAddress m_vkGetBufferDeviceAddress = nullptr; 89 | }; 90 | #endif 91 | 92 | #if DONUT_WITH_DX12 93 | class CoopVectorUtils_DX12 : public ICoopVectorUtils 94 | { 95 | public: 96 | CoopVectorUtils_DX12(ID3D12Device* d3d12Device); 97 | 98 | /** 99 | * Query the size of a matrix in bytes. 100 | * @return Size of matrix in bytes. 101 | */ 102 | size_t QueryMatrixByteSize(const uint32_t rows, const uint32_t cols, const MatrixLayout layout, const Precision precision = Precision::F16); 103 | 104 | /** 105 | * Convert matrix on the device between any layouts. 106 | * The Precision must currently be the same. 107 | * The conversion is recorded on the provided command list. 108 | */ 109 | void ConvertDeviceMatrixLayout( 110 | NetworkLayout const& srcLayout, NetworkLayout const& dstLayout, void* srcBuffer, uint64_t srcBufferOffset, void* dstBuffer, uint64_t dstBufferOffset, void* commandList) const; 111 | 112 | private: 113 | ID3D12Device* m_d3d12Device = nullptr; 114 | }; 115 | #endif 116 | } // namespace rtxns
-------------------------------------------------------------------------------- /src/NeuralShading/Float16.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | /** 12 | * Most of this code is derived from the GLM library at https://github.com/g-truc/glm 13 | * 14 | * License: https://github.com/g-truc/glm/blob/master/copying.txt 15 | */ 16 | 17 | #include "Float16.h" 18 | 19 | namespace rtxns 20 | { 21 | 22 | static float overflow() 23 | { 24 | volatile float f = 1e10; 25 | for (int i = 0; i < 10; ++i) 26 | { 27 | f *= f; // this will overflow before the for loop terminates 28 | } 29 | return f; 30 | } 31 | 32 | union uif32 33 | { 34 | float f; 35 | unsigned int i; 36 | }; 37 | 38 | uint16_t float32ToFloat16(float value) 39 | { 40 | uif32 entry; 41 | entry.f = value; 42 | int i = static_cast<int>(entry.i); 43 | 44 | // 45 | // Our floating point number, f, is represented by the bit 46 | // pattern in integer i. Disassemble that bit pattern into 47 | // the sign, s, the exponent, e, and the significand, m. 48 | // Shift s into the position where it will go in the 49 | // resulting half number. 50 | // Adjust e, accounting for the different exponent bias 51 | // of float and half (127 versus 15).
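// (For example, 1.0f is stored with a raw exponent field of 127 in a float
// but 15 in a half, so the exponent is rebiased by 127 - 15 = 112.)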
52 | // 53 | 54 | int s = (i >> 16) & 0x00008000; 55 | int e = ((i >> 23) & 0x000000ff) - (127 - 15); 56 | int m = i & 0x007fffff; 57 | 58 | // 59 | // Now reassemble s, e and m into a half: 60 | // 61 | 62 | if (e <= 0) 63 | { 64 | if (e < -10) 65 | { 66 | // 67 | // E is less than -10. The absolute value of f is 68 | // less than half_MIN (f may be a small normalized 69 | // float, a denormalized float or a zero). 70 | // 71 | // We convert f to a half zero. 72 | // 73 | 74 | return uint16_t(s); 75 | } 76 | 77 | // 78 | // E is between -10 and 0. F is a normalized float, 79 | // whose magnitude is less than __half_NRM_MIN. 80 | // 81 | // We convert f to a denormalized half. 82 | // 83 | 84 | m = (m | 0x00800000) >> (1 - e); 85 | 86 | // 87 | // Round to nearest, round "0.5" up. 88 | // 89 | // Rounding may cause the significand to overflow and make 90 | // our number normalized. Because of the way a half's bits 91 | // are laid out, we don't have to treat this case separately; 92 | // the code below will handle it correctly. 93 | // 94 | 95 | if (m & 0x00001000) 96 | { 97 | m += 0x00002000; 98 | } 99 | 100 | // 101 | // Assemble the half from s, e (zero) and m. 102 | // 103 | 104 | return uint16_t(s | (m >> 13)); 105 | } 106 | else if (e == 0xff - (127 - 15)) 107 | { 108 | if (m == 0) 109 | { 110 | // 111 | // F is an infinity; convert f to a half 112 | // infinity with the same sign as f. 113 | // 114 | 115 | return uint16_t(s | 0x7c00); 116 | } 117 | else 118 | { 119 | // 120 | // F is a NAN; we produce a half NAN that preserves 121 | // the sign bit and the 10 leftmost bits of the 122 | // significand of f, with one exception: If the 10 123 | // leftmost bits are all zero, the NAN would turn 124 | // into an infinity, so we have to set at least one 125 | // bit in the significand. 126 | // 127 | 128 | m >>= 13; 129 | 130 | return uint16_t(s | 0x7c00 | m | (m == 0)); 131 | } 132 | } 133 | else 134 | { 135 | // 136 | // E is greater than zero. F is a normalized float. 137 | // We try to convert f to a normalized half. 138 | // 139 | 140 | // 141 | // Round to nearest, round "0.5" up 142 | // 143 | 144 | if (m & 0x00001000) 145 | { 146 | m += 0x00002000; 147 | 148 | if (m & 0x00800000) 149 | { 150 | m = 0; // overflow in significand, 151 | e += 1; // adjust exponent 152 | } 153 | } 154 | 155 | // 156 | // Handle exponent overflow 157 | // 158 | 159 | if (e > 30) 160 | { 161 | overflow(); // Cause a hardware floating point overflow; 162 | 163 | return uint16_t(s | 0x7c00); // Return infinity with same sign as f. 164 | } 165 | 166 | // 167 | // Assemble the half from s, e and m. 
168 | // 169 | 170 | return uint16_t(s | (e << 10) | (m >> 13)); 171 | } 172 | } 173 | 174 | float float16ToFloat32(uint16_t value) 175 | { 176 | int s = (value >> 15) & 0x00000001; 177 | int e = (value >> 10) & 0x0000001f; 178 | int m = value & 0x000003ff; 179 | 180 | if (e == 0) 181 | { 182 | if (m == 0) 183 | { 184 | // 185 | // Plus or minus zero 186 | // 187 | 188 | uif32 result; 189 | result.i = static_cast<unsigned int>(s << 31); 190 | return result.f; 191 | } 192 | else 193 | { 194 | // 195 | // Denormalized number -- renormalize it 196 | // 197 | 198 | while (!(m & 0x00000400)) 199 | { 200 | m <<= 1; 201 | e -= 1; 202 | } 203 | 204 | e += 1; 205 | m &= ~0x00000400; 206 | } 207 | } 208 | else if (e == 31) 209 | { 210 | if (m == 0) 211 | { 212 | // 213 | // Positive or negative infinity 214 | // 215 | 216 | uif32 result; 217 | result.i = static_cast<unsigned int>((s << 31) | 0x7f800000); 218 | return result.f; 219 | } 220 | else 221 | { 222 | // 223 | // Nan -- preserve sign and significand bits 224 | // 225 | 226 | uif32 result; 227 | result.i = static_cast<unsigned int>((s << 31) | 0x7f800000 | (m << 13)); 228 | return result.f; 229 | } 230 | } 231 | 232 | // 233 | // Normalized number 234 | // 235 | 236 | e = e + (127 - 15); 237 | m = m << 13; 238 | 239 | // 240 | // Assemble s, e and m. 241 | // 242 | 243 | uif32 result; 244 | result.i = static_cast<unsigned int>((s << 31) | (e << 23) | m); 245 | return result.f; 246 | } 247 | 248 | } // namespace rtxns 249 | 
-------------------------------------------------------------------------------- /src/NeuralShading/Float16.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | #pragma once 12 | 13 | #include <cstddef> 14 | #include <cstdint> 15 | 16 | namespace rtxns 17 | { 18 | 19 | uint16_t float32ToFloat16(float value); 20 | float float16ToFloat32(uint16_t value); 21 | 22 | } // namespace rtxns
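// Example round trip (illustrative): 0.5f is exactly representable in FP16,
// so it converts to the half bit pattern 0x3800 and back without loss:
//   uint16_t h = rtxns::float32ToFloat16(0.5f); // 0x3800
//   float f = rtxns::float16ToFloat32(h);       // 0.5f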
-------------------------------------------------------------------------------- /src/NeuralShading/GraphicsResources.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | 12 | 13 | #if DONUT_WITH_DX12 14 | #include "../../external/dx12-agility-sdk/build/native/include/d3d12.h" 15 | #include <d3d12sdklayers.h> 16 | #include <wrl.h> 17 | #endif 18 | 19 | #if DONUT_WITH_VULKAN 20 | #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 21 | #include <vulkan/vulkan.hpp> 22 | #endif 23 | 24 | #include "GraphicsResources.h" 25 | #include <donut/core/log.h> 26 | #include <vector> 27 | 28 | namespace rtxns 29 | { 30 | 31 | GraphicsResources::GraphicsResources(nvrhi::DeviceHandle device) 32 | { 33 | #if DONUT_WITH_VULKAN 34 | if (device->getGraphicsAPI() == nvrhi::GraphicsAPI::VULKAN) 35 | { 36 | VkInstance vkInstance = device->getNativeObject(nvrhi::ObjectTypes::VK_Instance); 37 | VkPhysicalDevice vkPhysicalDevice = device->getNativeObject(nvrhi::ObjectTypes::VK_PhysicalDevice); 38 | 39 | m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV = (PFN_vkGetPhysicalDeviceCooperativeVectorPropertiesNV)VULKAN_HPP_DEFAULT_DISPATCHER.vkGetInstanceProcAddr( 40 | vkInstance, "vkGetPhysicalDeviceCooperativeVectorPropertiesNV"); 41 | assert(m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV != nullptr && "Failed to get Vulkan function 'vkGetPhysicalDeviceCooperativeVectorPropertiesNV'."); 42 | 43 | // Get the property count 44 | uint32_t propertyCount = 0; 45 | if (m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV(vkPhysicalDevice, &propertyCount, nullptr) != VK_SUCCESS) 46 | { 47 | return; 48 | } 49 | 50 | // If vkGetPhysicalDeviceCooperativeVectorPropertiesNV returns successfully, we have inference and training support 51 | m_coopVectorFeatures.inferenceSupported = true; 52 | m_coopVectorFeatures.trainingSupported = true; 53 | 54 | std::vector<VkCooperativeVectorPropertiesNV> properties(propertyCount); 55 | // Init the sType fields 56 | for (auto& property : properties) 57 | { 58 | property.sType = VK_STRUCTURE_TYPE_COOPERATIVE_VECTOR_PROPERTIES_NV; 59 | } 60 | 61 | // Get the actual properties 62 | if (m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV(vkPhysicalDevice, &propertyCount, properties.data()) != VK_SUCCESS) 63 | { 64 | return; 65 | } 66 | 67 | for (const auto& property : properties) 68 | { 69 | if (property.sType == VK_STRUCTURE_TYPE_COOPERATIVE_VECTOR_PROPERTIES_NV && property.inputType == VK_COMPONENT_TYPE_FLOAT16_KHR && 70 | property.inputInterpretation == VK_COMPONENT_TYPE_FLOAT16_KHR && property.matrixInterpretation == VK_COMPONENT_TYPE_FLOAT16_KHR && 71 | property.resultType == VK_COMPONENT_TYPE_FLOAT16_KHR) 72 | { 73 | m_coopVectorFeatures.fp16InferencingSupported = true; 74 | m_coopVectorFeatures.fp16TrainingSupported = true; 75 | } 76 | } 77 | } 78 | #endif 79 | 80 | #if DONUT_WITH_DX12 81 | if (device->getGraphicsAPI() == nvrhi::GraphicsAPI::D3D12) 82 | { 83 | ID3D12Device* d3d12Device = device->getNativeObject(nvrhi::ObjectTypes::D3D12_Device); 84 | 85 | // Check experimental features are enabled 86 | D3D12_FEATURE_DATA_D3D12_OPTIONS_EXPERIMENTAL experimentalOptions{}; 87 | auto hr = d3d12Device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS_EXPERIMENTAL, &experimentalOptions, sizeof(experimentalOptions)); 88 | if (hr != S_OK) 89 | { 90 | donut::log::error("Coop vector is not supported."); 91 | return; 92 | } 93 | 94 | // Mute preview shader model (6.9) validation warning.
95 | Microsoft::WRL::ComPtr<ID3D12InfoQueue> infoQueue; 96 | if (d3d12Device->QueryInterface(IID_PPV_ARGS(&infoQueue)) == S_OK) 97 | { 98 | D3D12_MESSAGE_ID denyIds[] = { D3D12_MESSAGE_ID_NON_RETAIL_SHADER_MODEL_WONT_VALIDATE }; 99 | 100 | D3D12_INFO_QUEUE_FILTER filter = {}; 101 | filter.DenyList.NumIDs = _countof(denyIds); 102 | filter.DenyList.pIDList = denyIds; 103 | 104 | infoQueue->AddStorageFilterEntries(&filter); 105 | } 106 | 107 | // Check coop vector is supported 108 | if (experimentalOptions.CooperativeVectorTier >= D3D12_COOPERATIVE_VECTOR_TIER_1_0) 109 | { 110 | m_coopVectorFeatures.inferenceSupported = true; 111 | } 112 | else 113 | { 114 | return; 115 | } 116 | if (experimentalOptions.CooperativeVectorTier >= D3D12_COOPERATIVE_VECTOR_TIER_1_1) 117 | { 118 | m_coopVectorFeatures.trainingSupported = true; 119 | } 120 | 121 | // Get supported coop vector formats 122 | D3D12_FEATURE_DATA_COOPERATIVE_VECTOR coopVecData{}; 123 | hr = d3d12Device->CheckFeatureSupport(D3D12_FEATURE_COOPERATIVE_VECTOR, &coopVecData, sizeof(coopVecData)); 124 | if (hr != S_OK) 125 | { 126 | return; 127 | } 128 | 129 | std::vector<D3D12_COOPERATIVE_VECTOR_PROPERTIES_MUL> mulProperties(coopVecData.MatrixVectorMulAddPropCount); 130 | std::vector<D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE> outerProductProperties; 131 | std::vector<D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE> vectorAccumulateProperties; 132 | 133 | coopVecData.pMatrixVectorMulAddProperties = mulProperties.data(); 134 | 135 | if (experimentalOptions.CooperativeVectorTier >= D3D12_COOPERATIVE_VECTOR_TIER_1_1) 136 | { 137 | outerProductProperties.resize(coopVecData.OuterProductAccumulatePropCount); 138 | coopVecData.pOuterProductAccumulateProperties = outerProductProperties.data(); 139 | vectorAccumulateProperties.resize(coopVecData.VectorAccumulatePropCount); 140 | coopVecData.pVectorAccumulateProperties = vectorAccumulateProperties.data(); 141 | } 142 | else 143 | { 144 | coopVecData.OuterProductAccumulatePropCount = 0; 145 | coopVecData.VectorAccumulatePropCount = 0; 146 | } 147 | 148 | if (d3d12Device->CheckFeatureSupport(D3D12_FEATURE_COOPERATIVE_VECTOR, &coopVecData, sizeof(coopVecData)) != S_OK) 149 | { 150 | return; 151 | } 152 | 153 | for (const auto& properties : mulProperties) 154 | { 155 | if (properties.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && 156 | properties.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.OutputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) 157 | { 158 | m_coopVectorFeatures.fp16InferencingSupported = true; 159 | } 160 | } 161 | 162 | bool opSupported = false; 163 | for (const auto& properties : outerProductProperties) 164 | { 165 | if (properties.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) 166 | { 167 | opSupported = true; 168 | } 169 | } 170 | 171 | bool vaSupported = false; 172 | for (const auto& properties : vectorAccumulateProperties) 173 | { 174 | if (properties.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 && properties.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) 175 | { 176 | vaSupported = true; 177 | } 178 | } 179 | m_coopVectorFeatures.fp16TrainingSupported = opSupported && vaSupported; 180 | } 181 | #endif 182 | } 183 | 184 | GraphicsResources::~GraphicsResources() 185 | { 186 | } 187 | 188 | } // namespace rtxns 189 | 
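// Typical use (illustrative sketch): construct once per device and gate the
// network configuration on the reported support, e.g.:
//   rtxns::GraphicsResources resources(device);
//   if (!resources.GetCoopVectorFeatures().fp16TrainingSupported)
//       donut::log::error("FP16 cooperative vector training is unavailable.");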
-------------------------------------------------------------------------------- /src/NeuralShading/GraphicsResources.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | #pragma once 12 | 13 | #include <nvrhi/nvrhi.h> 14 | 15 | namespace rtxns 16 | { 17 | 18 | struct CoopVectorFeatures 19 | { 20 | bool inferenceSupported = false; 21 | bool trainingSupported = false; 22 | bool fp16InferencingSupported = false; 23 | bool fp16TrainingSupported = false; 24 | }; 25 | 26 | class GraphicsResources 27 | { 28 | public: 29 | GraphicsResources(nvrhi::DeviceHandle device); 30 | ~GraphicsResources(); 31 | CoopVectorFeatures GetCoopVectorFeatures() const 32 | { 33 | return m_coopVectorFeatures; 34 | } 35 | 36 | private: 37 | CoopVectorFeatures m_coopVectorFeatures; 38 | #if DONUT_WITH_VULKAN 39 | PFN_vkGetPhysicalDeviceCooperativeVectorPropertiesNV m_vkGetPhysicalDeviceCooperativeVectorPropertiesNV = nullptr; 40 | #endif 41 | }; 42 | } // namespace rtxns 43 | 
-------------------------------------------------------------------------------- /src/NeuralShading/NeuralNetwork.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "CoopVector.h" 14 | #include <donut/core/vfs/VFS.h> 15 | #include <memory> 16 | #include <nvrhi/nvrhi.h> 17 | #include <string> 18 | #include "NeuralNetworkTypes.h" 19 | 20 | namespace rtxns 21 | { 22 | 23 | class NetworkUtilities 24 | { 25 | public: 26 | NetworkUtilities(nvrhi::DeviceHandle device); 27 | ~NetworkUtilities() 28 | { 29 | } 30 | 31 | bool ValidateNetworkArchitecture(NetworkArchitecture const& netArch); 32 | 33 | // Create host side network layout. 34 | NetworkLayout CreateHostNetworkLayout(NetworkArchitecture const& netArch); 35 | 36 | // Set the weights and bias size / offsets for each layer in the network. 37 | void SetNetworkLayerSizes(NetworkLayout& layout); 38 | 39 | // Returns an updated network layout where the weights and bias sizes / offsets have been updated 40 | // for the new matrix layout, 41 | // which can be a device-optimal matrix layout. 42 | NetworkLayout GetNewMatrixLayout(NetworkLayout const& srcLayout, MatrixLayout newMatrixLayout); 43 | 44 | // Converts weights and bias buffers from src layout to the dst layout. 45 | // Both buffers must be device side. 46 | // Both networks must be of the same network layout, only differing in MatrixLayout 47 | void ConvertWeights(NetworkLayout const& srcLayout, 48 | NetworkLayout const& dstLayout, 49 | nvrhi::BufferHandle srcBuffer, 50 | uint64_t srcBufferOffset, 51 | nvrhi::BufferHandle dstBuffer, 52 | uint64_t dstBufferOffset, 53 | nvrhi::DeviceHandle device, 54 | nvrhi::CommandListHandle commandList); 55 | 56 | private: 57 | std::unique_ptr<ICoopVectorUtils> m_coopVecUtils; 58 | }; 59 | 
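// Typical use (illustrative sketch): build a host layout, derive the
// device-optimal layout, then convert the uploaded weights, e.g.:
//   auto hostLayout = utils.CreateHostNetworkLayout(arch);
//   auto gpuLayout = utils.GetNewMatrixLayout(hostLayout, MatrixLayout::TrainingOptimal);
//   utils.ConvertWeights(hostLayout, gpuLayout, srcBuf, 0, dstBuf, 0, device, commandList);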
60 | // Represents a host side neural network. 61 | // Stores the network layout and parameters. 62 | // Functionality to initialize a network to starting values or load from file. 63 | // Also writes parameters back to file. 64 | class HostNetwork 65 | { 66 | public: 67 | HostNetwork(std::shared_ptr<NetworkUtilities> networkUtils); 68 | ~HostNetwork() {} 69 | 70 | // Create host side network from provided architecture with initial values. 71 | bool Initialise(const NetworkArchitecture& netArch); 72 | 73 | // Create host side network of provided architecture and initial values from a json file. 74 | bool InitialiseFromJson(donut::vfs::IFileSystem& fs, const std::string& fileName); 75 | // Create host side network of provided architecture and initial values from a file. 76 | bool InitialiseFromFile(const std::string& fileName); 77 | // Create host side network from an existing network. 78 | bool InitialiseFromNetwork(HostNetwork const& network); 79 | // Write the current network and parameters to file. 80 | bool WriteToFile(const std::string& fileName); 81 | // Convert device layout to host layout and update the host side parameters. 82 | void UpdateFromBufferToFile(nvrhi::BufferHandle hostLayoutBuffer, 83 | nvrhi::BufferHandle deviceLayoutBuffer, 84 | NetworkLayout const& hostLayout, 85 | NetworkLayout const& deviceLayout, 86 | const std::string& fileName, 87 | nvrhi::DeviceHandle device, 88 | nvrhi::CommandListHandle commandList); 89 | 90 | const NetworkArchitecture& GetNetworkArchitecture() const 91 | { 92 | return m_networkArchitecture; 93 | } 94 | 95 | const std::vector<uint8_t>& GetNetworkParams() const 96 | { 97 | return m_networkParams; 98 | } 99 | 100 | const NetworkLayout& GetNetworkLayout() const 101 | { 102 | return m_networkLayout; 103 | } 104 | 105 | private: 106 | std::shared_ptr<NetworkUtilities> m_networkUtils; 107 | NetworkArchitecture m_networkArchitecture; 108 | std::vector<uint8_t> m_networkParams; 109 | NetworkLayout m_networkLayout; 110 | }; 111 | } // namespace rtxns
-------------------------------------------------------------------------------- /src/NeuralShading/NeuralNetworkTypes.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | #pragma once 12 | #include <cstdint> 13 | #include <vector> 14 | namespace rtxns 15 | { 16 | 17 | enum class MatrixLayout 18 | { 19 | RowMajor, 20 | ColumnMajor, 21 | InferencingOptimal, 22 | TrainingOptimal, 23 | }; 24 | 25 | enum class Precision 26 | { 27 | F16, 28 | F32 29 | }; 30 | 31 | struct NetworkArchitecture 32 | { 33 | uint32_t numHiddenLayers = 0; 34 | uint32_t inputNeurons = 0; 35 | uint32_t hiddenNeurons = 0; 36 | uint32_t outputNeurons = 0; 37 | Precision weightPrecision = Precision::F16; 38 | Precision biasPrecision = Precision::F16; 39 | }; 40 | 41 | struct NetworkLayer 42 | { 43 | uint32_t inputs = 0; ///< Columns in the weight matrix. 44 | uint32_t outputs = 0; ///< Rows in the weight matrix. 45 | size_t weightSize = 0; ///< Size of the weight matrix in bytes. 46 | size_t biasSize = 0; ///< Size of the bias vector in bytes. 47 | uint32_t weightOffset = 0; ///< Offset to the weights in bytes. 48 | uint32_t biasOffset = 0; ///< Offset to the biases in bytes. 49 | };
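// Worked example (illustrative, F16 row-major): a layer with 16 inputs and
// 16 outputs stores a 16*16*2 = 512-byte weight matrix and a 16*2 = 32-byte
// bias vector; offsets are then padded to the 64-byte matrix / 16-byte vector
// alignment required by the cooperative vector spec.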
50 | 51 | struct NetworkLayout 52 | { 53 | MatrixLayout matrixLayout = MatrixLayout::RowMajor; 54 | Precision matrixPrecision = Precision::F16; 55 | size_t networkSize = 0; 56 | std::vector<NetworkLayer> networkLayers; 57 | }; 58 | 59 | constexpr size_t GetSize(Precision precision) 60 | { 61 | switch (precision) 62 | { 63 | case Precision::F16: 64 | return sizeof(uint16_t); // 2 bytes 65 | case Precision::F32: 66 | return sizeof(float); 67 | default: 68 | return 0; // Should not get here 69 | } 70 | } 71 | 72 | } // namespace rtxns
-------------------------------------------------------------------------------- /src/NeuralShading_Shaders/Activation.slang: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | import CooperativeVectorAutoDiff; 12 | import CooperativeVectorFunctions; 13 | 14 | namespace rtxns 15 | { 16 | namespace mlp 17 | { 18 | //////////////////////// 19 | // 20 | // Activation function interface and implementation for several activation functions 21 | // for use with classes in the MLP module 22 | // 23 | //////////////////////// 24 | 25 | // Base interface for activation functions 26 | interface IActivation<T : __BuiltinFloatingPointType, int K> 27 | { 28 | [Differentiable] 29 | CoopVec<T, K> eval(CoopVec<T, K> x); 30 | }; 31 | 32 | // None activation function 33 | struct NoneAct<T : __BuiltinFloatingPointType, int K> : IActivation<T, K> 34 | { 35 | [Differentiable] 36 | CoopVec<T, K> eval(CoopVec<T, K> x) 37 | { 38 | return x; 39 | } 40 | }; 41 | 42 | // Linear activation function 43 | struct LinearAct<T : __BuiltinFloatingPointType, int K> : IActivation<T, K> 44 | { 45 | T a; 46 | 47 | __init(T a) 48 | { 49 | this.a = a; 50 | } 51 | 52 | [Differentiable] 53 | CoopVec<T, K> eval(CoopVec<T, K> x) 54 | { 55 | return no_diff CoopVec<T, K>(a) * x; 56 | } 57 | }; 58 | 59 | // Exponential activation function 60 | struct ExponentialAct<T : __BuiltinFloatingPointType, int K> : IActivation<T, K> 61 | { 62 | [Differentiable] 63 | CoopVec<T, K> eval(CoopVec<T, K> x) 64 | { 65 | // Exponent is a builtin function. 66 | return exp(x); 67 | } 68 | }; 69 | 70 | // Shifted exponential activation function 71 | struct ShiftedExponentialAct<T : __BuiltinFloatingPointType, int K> : IActivation<T, K> 72 | { 73 | [Differentiable] 74 | CoopVec<T, K> eval(CoopVec<T, K> x) 75 | { 76 | return exp(x) - no_diff CoopVec<T, K>(T(1.)); 77 | } 78 | }; 79 | 80 | // ReLU activation function 81 | struct ReLUAct<T : __BuiltinFloatingPointType, int K> : IActivation<T, K> 82 | { 83 | [Differentiable] 84 | CoopVec<T, K> eval(CoopVec<T, K> x) 85 | { 86 | return relu(x); 87 | } 88 | }; 89 | 90 | // Leaky ReLU activation function 91 | struct LeakyReLUAct<T : __BuiltinFloatingPointType, int K> : IActivation<T, K> 92 | { 93 | T a; 94 | 95 | __init(T a) 96 | { 97 | this.a = a; 98 | } 99 | 100 | [Differentiable] 101 | CoopVec<T, K> eval(CoopVec<T, K> x) 102 | { 103 | return leakyReLU(x, a); 104 | } 105 | };
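// A user-defined activation only has to implement IActivation<T, K> with a
// [Differentiable] eval; for instance (illustrative, not part of the library):
//
// struct SquareAct<T : __BuiltinFloatingPointType, int K> : IActivation<T, K>
// {
//     [Differentiable]
//     CoopVec<T, K> eval(CoopVec<T, K> x) { return x * x; }
// };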
106 | 107 | // Sigmoid activation function 108 | struct SigmoidAct<T : __BuiltinFloatingPointType, int K> : IActivation<T, K> 109 | { 110 | [Differentiable] 111 | CoopVec<T, K> eval(CoopVec<T, K> x) 112 | { 113 | // Sigmoid function calculation. Compiler will infer the derivative automatically (autodiff) 114 | return sigmoid(x); 115 | } 116 | }; 117 | 118 | // Swish activation function 119 | struct SwishAct<T : __BuiltinFloatingPointType, int K> : IActivation<T, K> 120 | { 121 | [Differentiable] 122 | CoopVec<T, K> eval(CoopVec<T, K> x) 123 | { 124 | return x / (no_diff CoopVec<T, K>(T(1.)) + exp(no_diff CoopVec<T, K>(T(-1.)) * x)); 125 | } 126 | }; 127 | 128 | // Tanh activation function 129 | struct TanhAct<T : __BuiltinFloatingPointType, int K> : IActivation<T, K> 130 | { 131 | [Differentiable] 132 | CoopVec<T, K> eval(CoopVec<T, K> x) 133 | { // tanh(x) = 2 / (1 + exp(-2x)) - 1 134 | var c1 = no_diff CoopVec<T, K>(T(1.)); 135 | return no_diff CoopVec<T, K>(T(2.)) / (c1 + exp(no_diff CoopVec<T, K>(T(-2.)) * x)) - c1; 136 | } 137 | }; 138 | } 139 | } 140 | 
-------------------------------------------------------------------------------- /src/NeuralShading_Shaders/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | 10 | 11 | set(project NeuralShading_Shaders) 12 | set(folder "${LIBRARY_FILTER}/NeuralShading_Shaders") 13 | 14 | file(GLOB shaders "*.slang") 15 | 16 | set_source_files_properties(${shaders} PROPERTIES VS_TOOL_OVERRIDE "None") 17 | add_custom_target(${project} 18 | DEPENDS ShaderMake 19 | SOURCES ${shaders}) 20 | set_target_properties(${project} PROPERTIES 21 | FOLDER ${folder} 22 | ) 23 | 24 | set(SAMPLES_SHADER_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR} CACHE PATH "" FORCE)
-------------------------------------------------------------------------------- /src/NeuralShading_Shaders/CooperativeVectorAutoDiff.slang: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | */ 10 | 11 | import CooperativeVectorFunctions; 12 | import CooperativeVectorDerivatives; 13 | 14 | // Implementation to extend CoopVec to make it automatically differentiable (autodiff) 15 | 16 | namespace rtxns 17 | { 18 | 19 | // Extension for builtin type CoopVec to make it automatically differentiable (autodiff) 20 | extension<T : __BuiltinFloatingPointType, int K> CoopVec<T, K> : IDifferentiable 21 | { 22 | typealias Differential = CoopVec<T, K>; 23 | }; 24 | 25 | typealias HCoopVec<let K : int> = CoopVec<half, K>; 26 | 27 | //////////////////////// 28 | // 29 | // Additional functions and their derivatives for use in activation functions 30 | // To support Slang autodiff, for each function its derivative should be defined 31 | // 32 | //////////////////////// 33 | 34 | // exp is a builtin function, so we just need to define a derivative for autodiff support 35 | [BackwardDerivativeOf(exp)] 36 | void exp_BackwardAutoDiff<T : __BuiltinFloatingPointType, int K>(inout DifferentialPair<CoopVec<T, K>> p0, CoopVec<T, K>.Differential dResult) 37 | { 38 | p0 = diffPair(p0.p, dResult * exp(p0.p)); 39 | } 40 | 41 | // Relu backward derivative 42 | [BackwardDerivativeOf(relu)] 43 | void relu_BackwardAutoDiff<T : __BuiltinFloatingPointType, int K>(inout DifferentialPair<CoopVec<T, K>> p0, CoopVec<T, K>.Differential dResult) 44 | { 45 | CoopVec<T, K> d; 46 | 47 | [ForceUnroll] 48 | for (int i = 0; i < K; ++i) 49 | { 50 | d[i] = p0.p[i] > T(0.) ? dResult[i] : T(0.); 51 | } 52 | 53 | p0 = diffPair(p0.p, d); 54 | } 55 | 56 | // LeakyRelu backward derivative 57 | [BackwardDerivativeOf(leakyReLU)] 58 | void leakyRelu_BackwardAutoDiff<T : __BuiltinFloatingPointType, int K>(inout DifferentialPair<CoopVec<T, K>> p0, T a, CoopVec<T, K>.Differential dResult) 59 | { 60 | p0 = diffPair(p0.p, leakyReLU_Derivative(p0.p, a, dResult)); 61 | } 62 | 63 | // Sigmoid backward derivative 64 | [BackwardDerivativeOf(sigmoid)] 65 | void sigmoid_BackwardAutoDiff<T : __BuiltinFloatingPointType, int K>(inout DifferentialPair<CoopVec<T, K>> p0, CoopVec<T, K>.Differential dResult) 66 | { 67 | p0 = diffPair(p0.p, sigmoid_Derivative(p0.p, dResult)); 68 | } 69 | 70 | }
-------------------------------------------------------------------------------- /src/NeuralShading_Shaders/CooperativeVectorDerivatives.slang: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | import CooperativeVectorFunctions; 12 | 13 | // Derivatives of the functions in the CooperativeVectorFunctions module 14 | 15 | namespace rtxns 16 | { 17 | 18 | // Derivative of leaky relu 19 | CoopVec<T, K> leakyReLU_Derivative<T : __BuiltinFloatingPointType, int K>(CoopVec<T, K> p, T a, CoopVec<T, K> dResult) 20 | { 21 | [ForceUnroll] 22 | for (int i = 0; i < K; ++i) 23 | { 24 | p[i] = p[i] > T(0.) ? dResult[i] : a * dResult[i]; 25 | } 26 | return p; 27 | } 28 | 29 | // Derivative of sigmoid 30 | CoopVec<T, K> sigmoid_Derivative<T : __BuiltinFloatingPointType, int K>(CoopVec<T, K> p, CoopVec<T, K> dResult) 31 | { 32 | var sigmoidOut = sigmoid(p); 33 | return dResult * sigmoidOut * (CoopVec<T, K>(T(1.)) - sigmoidOut); 34 | } 35 | 36 | }
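// Note: both helpers above apply the chain rule directly. For sigmoid,
// d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)), so the incoming gradient
// dResult is scaled elementwise by that factor.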
-------------------------------------------------------------------------------- /src/NeuralShading_Shaders/CooperativeVectorFunctions.slang: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | // Additional functions for use with the CoopVec type 12 | 13 | namespace rtxns 14 | { 15 | 16 | // Relu function implementation 17 | CoopVec<T, K> relu<T : __BuiltinFloatingPointType, int K>(CoopVec<T, K> v) 18 | { 19 | return max(v, CoopVec<T, K>(T(0.))); 20 | } 21 | 22 | // Leaky relu function implementation 23 | CoopVec<T, K> leakyReLU<T : __BuiltinFloatingPointType, int K>(CoopVec<T, K> p, no_diff T a) 24 | { 25 | [ForceUnroll] 26 | for (int i = 0; i < K; ++i) 27 | { 28 | p[i] = p[i] < T(0.) ? a * p[i] : p[i]; 29 | } 30 | return p; 31 | } 32 | 33 | // Sigmoid function implementation 34 | CoopVec<T, K> sigmoid<T : __BuiltinFloatingPointType, int K>(CoopVec<T, K> v) 35 | { 36 | var c1 = CoopVec<T, K>(T(1.)); 37 | return c1 / (c1 + exp(CoopVec<T, K>(T(-1.)) * v)); 38 | } 39 | 40 | 41 | 42 | }
-------------------------------------------------------------------------------- /src/NeuralShading_Shaders/LinearOps.slang: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | */ 10 | 11 | import CooperativeVectorAutoDiff; 12 | import CooperativeVectorFunctions; 13 | 14 | namespace rtxns 15 | { 16 | // Linear (without activation function) forward step of MLP using Cooperative Vector extension functions 17 | // Weights matrix and biases vector are stored in a byteaddress buffer at offsets matrixOffset and biasOffset 18 | CoopVec<T, M> LinearOp<T : __BuiltinFloatingPointType, int M, int K>( 19 | CoopVec<T, K> ip, 20 | ByteAddressBuffer matrixBiasBuffer, 21 | uint matrixOffset, 22 | uint biasOffset, 23 | constexpr CoopVecMatrixLayout matrixLayout, 24 | constexpr CoopVecComponentType componentType) 25 | { 26 | return coopVecMatMulAdd<T, M>( 27 | ip, 28 | componentType, 29 | matrixBiasBuffer, 30 | matrixOffset, 31 | componentType, 32 | matrixBiasBuffer, 33 | biasOffset, 34 | componentType, 35 | matrixLayout, 36 | false, 37 | 0 38 | ); 39 | } 40 | 41 | // One linear backward step of MLP using Cooperative Vector extension functions 42 | // Weights matrix and biases vector are stored in a byteaddress buffer at offsets matrixOffset and biasOffset 43 | // Derivatives of the weights matrix and biases vector are stored in a read-write byteaddress buffer at offsets matrixOffset and biasOffset 44 | CoopVec<T, K> LinearOp_Backward<T : __BuiltinFloatingPointType, int M, int K>( 45 | CoopVec<T, K> ip, 46 | CoopVec<T, M> grad, 47 | ByteAddressBuffer matrixBiasBuffer, 48 | RWByteAddressBuffer matrixBiasBufferDerivative, 49 | uint matrixOffset, 50 | uint biasOffset, 51 | constexpr CoopVecMatrixLayout matrixLayout, 52 | constexpr CoopVecComponentType componentType) 53 | { 54 | coopVecOuterProductAccumulate(grad, ip, matrixBiasBufferDerivative, matrixOffset, 0, matrixLayout, componentType); 55 | coopVecReduceSumAccumulate(grad, matrixBiasBufferDerivative, biasOffset); 56 | 57 | return coopVecMatMul<T, K>(grad, componentType, matrixBiasBuffer, matrixOffset, componentType, matrixLayout, true, 0); 58 | } 59 | }
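// The backward step above mirrors the usual MLP gradient equations:
// dL/dW = grad * ip^T (the outer product), dL/db = grad (the reduce-sum
// accumulation), and dL/dip = W^T * grad (the transposed matrix-vector
// product requested via the 'true' transpose flag).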
60 | 61 | namespace rtxns 62 | { 63 | namespace mlp 64 | { 65 | // Structure to store derivatives of the weights matrix and derivatives of the biases vector 66 | // Extends IDifferentiablePtrType interface to support autodiff 67 | struct MatrixBiasBufferDifferential : IDifferentiablePtrType 68 | { 69 | typealias Differential = MatrixBiasBufferDifferential; 70 | 71 | __init(RWByteAddressBuffer buf) 72 | { 73 | buffer = buf; 74 | } 75 | 76 | RWByteAddressBuffer buffer; 77 | }; 78 | 79 | // Structure to store weights matrix and biases vector 80 | // Extends IDifferentiablePtrType interface to support autodiff 81 | struct MatrixBiasBuffer : IDifferentiablePtrType 82 | { 83 | typealias Differential = MatrixBiasBufferDifferential; 84 | 85 | __init(ByteAddressBuffer buf) 86 | { 87 | buffer = buf; 88 | } 89 | 90 | ByteAddressBuffer buffer; 91 | }; 92 | 93 | // Linear forward step of MLP using MatrixBiasBuffer structure to store weights and biases 94 | CoopVec<T, M> LinearOp<T : __BuiltinFloatingPointType, int M, int K>( 95 | CoopVec<T, K> ip, 96 | MatrixBiasBuffer matrixBiasBuffer, 97 | uint2 offsets, 98 | constexpr CoopVecMatrixLayout matrixLayout, 99 | constexpr CoopVecComponentType componentType) 100 | { 101 | return LinearOp<T, M, K>( 102 | ip, 103 | matrixBiasBuffer.buffer, 104 | offsets[0], 105 | offsets[1], 106 | matrixLayout, 107 | componentType 108 | ); 109 | } 110 | 111 | // Linear backward step of MLP using MatrixBiasBuffer and MatrixBiasBufferDifferential 112 | [BackwardDerivativeOf(LinearOp)] 113 | void LinearOp_BackwardAutoDiff<T : __BuiltinFloatingPointType, int M, int K>( 114 | inout DifferentialPair<CoopVec<T, K>> ip, 115 | DifferentialPtrPair<MatrixBiasBuffer> matrixBiasBuffer, 116 | uint2 offsets, 117 | constexpr CoopVecMatrixLayout matrixLayout, 118 | constexpr CoopVecComponentType componentType, CoopVec<T, M>.Differential grad) 119 | { 120 | ip = diffPair( 121 | ip.p, 122 | LinearOp_Backward( 123 | ip.p, 124 | grad, 125 | matrixBiasBuffer.p.buffer, 126 | matrixBiasBuffer.d.buffer, 127 | offsets[0], 128 | offsets[1], 129 | matrixLayout, 130 | componentType 131 | ) 132 | ); 133 | } 134 | } 135 | }
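// Design note: the buffer wrappers above implement IDifferentiablePtrType
// rather than IDifferentiable because parameter gradients are accumulated in
// place into the separate RW derivative buffer during bwd_diff, instead of
// being returned by value through the differential pair.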
--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/Loss.slang:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
5 |  * and proprietary rights in and to this software, related documentation
6 |  * and any modifications thereto. Any use, reproduction, disclosure or
7 |  * distribution of this software and related documentation without an express
8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |  */
10 |
11 | import CooperativeVectorAutoDiff;
12 | import CooperativeVectorFunctions;
13 |
14 | namespace rtxns
15 | {
16 | namespace mlp
17 | {
18 | ////////////////////////
19 | //
20 | // Loss function interface and implementations of several common loss functions
21 | // for use with the classes in the MLP module.
22 | //
23 | ////////////////////////
24 |
25 | // Base interface for loss functions.
26 | interface ILoss<T : __BuiltinFloatingPointType, let K : int>
27 | {
28 |     static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale);
29 |     static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale);
30 | };
31 |
32 | // L1
33 | struct L1<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
34 | {
35 |     static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
36 |     {
37 |         return scale * abs(predicted - target);
38 |     }
39 |
40 |     static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
41 |     {
42 |         return copysign(scale, predicted - target);
43 |     }
44 | };
45 |
46 | // Relative L1
47 | struct L1Relative<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
48 | {
49 |     static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
50 |     {
51 |         return scale * abs(predicted - target) / (abs(predicted) + vector<T, K>(T(0.01f)));
52 |     }
53 |
54 |     static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
55 |     {
56 |         return copysign(scale, predicted - target) / (abs(predicted) + vector<T, K>(T(0.01f)));
57 |     }
58 | };
59 |
60 | // Mean absolute percentage error (MAPE)
61 | struct MAPE<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
62 | {
63 |     static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
64 |     {
65 |         return scale * abs(predicted - target) / (abs(target) + vector<T, K>(T(0.01f)));
66 |     }
67 |
68 |     static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
69 |     {
70 |         return copysign(scale, predicted - target) / (abs(target) + vector<T, K>(T(0.01f)));
71 |     }
72 | };
73 |
74 | // Symmetric mean absolute percentage error (SMAPE)
75 | struct SMAPE<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
76 | {
77 |     static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
78 |     {
79 |         return scale * abs(predicted - target) / ((abs(target) + abs(predicted)) * T(0.5) + vector<T, K>(T(0.01f)));
80 |     }
81 |
82 |     static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
83 |     {
84 |         return copysign(scale, predicted - target) / ((abs(target) + abs(predicted)) * T(0.5) + vector<T, K>(T(0.01f)));
85 |     }
86 | };
87 |
88 | // L2
89 | struct L2<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
90 | {
91 |     static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
92 |     {
93 |         return scale * (predicted - target) * (predicted - target);
94 |     }
95 |
96 |     static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
97 |     {
98 |         return T(2) * scale * (predicted - target);
99 |     }
100 | };
101 |
102 | // Relative L2
103 | struct L2Relative<T : __BuiltinFloatingPointType, let K : int> : ILoss<T, K>
104 | {
105 |     static vector<T, K> value(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
106 |     {
107 |         return scale * (predicted - target) * (predicted - target) / (predicted * predicted + vector<T, K>(T(0.01f)));
108 |     }
109 |
110 |     static vector<T, K> deriv(vector<T, K> target, vector<T, K> predicted, vector<T, K> scale)
111 |     {
112 |         return T(2) * scale * (predicted - target) / (predicted * predicted + vector<T, K>(T(0.01f)));
113 |     }
114 | };
115 | }
116 | }
117 |
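
As a brief, hedged illustration of the interface (the `evaluateLoss` name, the float/4 instantiation, and the uniform scale are assumptions for the example, not SDK code): a training kernel typically calls value for reporting and deriv to seed the backward pass.

// Hypothetical sketch, not part of the SDK sources: L2 loss and gradient for a
// 4-channel prediction, with a uniform 1/4 scale to average over the channels.
void evaluateLoss(float4 predicted, float4 target, out float4 lossValue, out float4 lossGrad)
{
    float4 scale = float4(0.25);
    lossValue = rtxns::mlp::L2<float, 4>.value(target, predicted, scale);
    lossGrad = rtxns::mlp::L2<float, 4>.deriv(target, predicted, scale);
}
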
--------------------------------------------------------------------------------
/src/NeuralShading_Shaders/MLP.slang:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2015 - 2025, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * NVIDIA CORPORATION and its licensors retain all intellectual property
5 |  * and proprietary rights in and to this software, related documentation
6 |  * and any modifications thereto. Any use, reproduction, disclosure or
7 |  * distribution of this software and related documentation without an express
8 |  * license agreement from NVIDIA CORPORATION is strictly prohibited.
9 |  */
10 |
11 | import CooperativeVectorAutoDiff;
12 | import CooperativeVectorFunctions;
13 | import LinearOps;
14 | import Activation;
15 |
16 | namespace rtxns
17 | {
18 | namespace mlp
19 | {
20 | // Structure that stores the MLP layers and implements the full forward step for inference.
21 | // The MLP is defined by the number of hidden layers and by the number of inputs, outputs
22 | // and elements in the hidden layers.
23 | struct InferenceMLP<
24 |     T : __BuiltinFloatingPointType,
25 |     let HIDDEN_LAYERS : int,
26 |     let INPUTS : int,
27 |     let HIDDEN : int,
28 |     let OUTPUTS : int,
29 |     let matrixLayout : CoopVecMatrixLayout,
30 |     let componentType : CoopVecComponentType
31 | >
32 | {
33 |     // Initialized from a buffer holding the weights and biases, plus two arrays of per-layer matrix and bias offsets.
34 |     __init(ByteAddressBuffer buf, uint matrixOffset[HIDDEN_LAYERS + 1], uint biasOffset[HIDDEN_LAYERS + 1])
35 |     {
36 |         parameters = MatrixBiasBuffer(buf);
37 |
38 |         [ForceUnroll]
39 |         for (int i = 0; i <= HIDDEN_LAYERS; ++i)
40 |             layerOffsets[i] = uint2(matrixOffset[i], biasOffset[i]);
41 |     }
42 |
43 |     // Full MLP forward step, using one activation function for the input and hidden layers
44 |     // and another for the output layer. Returns the MLP output.
45 |     CoopVec<T, OUTPUTS> forward