├── .clang-format
├── .editorconfig
├── .gitignore
├── .gitmodules
├── CHANGELOG.md
├── CMakeLists.txt
├── CONTRIBUTING
├── LICENSE
├── README.md
├── docs
    ├── continuous_lod_clusters.png
    ├── lod_allocation.png
    ├── lod_rendering.png
    ├── lod_streaming.png
    └── otherscenes.jpg
├── shaders
    ├── blas_clusters_insert.comp.glsl
    ├── blas_setup_insertion.comp.glsl
    ├── build_setup.comp.glsl
    ├── culling.glsl
    ├── fullscreen.vert.glsl
    ├── fullscreen_write_depth.frag.glsl
    ├── hbao.h
    ├── hbao_blur.comp.glsl
    ├── hbao_blur.glsl
    ├── hbao_blur_apply.comp.glsl
    ├── hbao_calc.comp.glsl
    ├── hbao_deinterleave.comp.glsl
    ├── hbao_depthlinearize.comp.glsl
    ├── hbao_reinterleave.comp.glsl
    ├── hbao_viewnormal.comp.glsl
    ├── nvhiz-update.comp.glsl
    ├── octant_encoding.h
    ├── render_instance_bbox.frag.glsl
    ├── render_instance_bbox.mesh.glsl
    ├── render_raster.frag.glsl
    ├── render_raster_clusters.mesh.glsl
    ├── render_raytrace.rgen.glsl
    ├── render_raytrace.rmiss.glsl
    ├── render_raytrace_clusters.rchit.glsl
    ├── render_shading.glsl
    ├── shaderio.h
    ├── shaderio_building.h
    ├── shaderio_core.h
    ├── shaderio_scene.h
    ├── shaderio_streaming.h
    ├── stream_agefilter_groups.comp.glsl
    ├── stream_allocator_build_freegaps.comp.glsl
    ├── stream_allocator_freegaps_insert.comp.glsl
    ├── stream_allocator_load_groups.comp.glsl
    ├── stream_allocator_setup_insertion.comp.glsl
    ├── stream_allocator_unload_groups.comp.glsl
    ├── stream_compaction_new_clas.comp.glsl
    ├── stream_compaction_old_clas.comp.glsl
    ├── stream_setup.comp.glsl
    ├── stream_update_scene.comp.glsl
    ├── traversal_init.comp.glsl
    ├── traversal_presort.comp.glsl
    └── traversal_run.comp.glsl
├── src
    ├── cgltf.cpp
    ├── hbao_pass.cpp
    ├── hbao_pass.hpp
    ├── lodclusters.cpp
    ├── lodclusters.hpp
    ├── lodclusters_ui.cpp
    ├── main.cpp
    ├── nvhiz_vk.cpp
    ├── nvhiz_vk.hpp
    ├── renderer.cpp
    ├── renderer.hpp
    ├── renderer_raster_clusters_lod.cpp
    ├── renderer_raytrace_clusters_lod.cpp
    ├── resources.cpp
    ├── resources.hpp
    ├── scene.cpp
    ├── scene.hpp
    ├── scene_cache.cpp
    ├── scene_gltf.cpp
    ├── scene_preloaded.cpp
    ├── scene_preloaded.hpp
    ├── scene_streaming.cpp
    ├── scene_streaming.hpp
    ├── scene_streaming_utils.cpp
    ├── scene_streaming_utils.hpp
    ├── vk_nv_cluster_acc.cpp
    └── vk_nv_cluster_acc.h
└── thirdparty
    └── vulkan_radix_sort
        ├── CMakeLists.txt
        ├── LICENSE
        ├── README.md
        ├── include
            └── vk_radix_sort.h
        └── src
            ├── generated
                ├── downsweep_comp.h
                ├── downsweep_key_value_comp.h
                ├── spine_comp.h
                └── upsweep_comp.h
            ├── shader
                ├── downsweep.comp
                ├── spine.comp
                └── upsweep.comp
            └── vk_radix_sort.cc


/.clang-format:
--------------------------------------------------------------------------------
 1 | ﻿BasedOnStyle: LLVM
 2 | AccessModifierOffset: '-2'
 3 | AlignAfterOpenBracket: Align
 4 | AlignConsecutiveAssignments: 'true'
 5 | AlignConsecutiveDeclarations: 'true'
 6 | AlignOperands: 'true'
 7 | AlignTrailingComments: 'true'
 8 | AllowAllParametersOfDeclarationOnNextLine: 'false'
 9 | AllowShortBlocksOnASingleLine: 'false'
10 | AllowShortCaseLabelsOnASingleLine: 'false'
11 | AllowShortFunctionsOnASingleLine: Inline
12 | AllowShortIfStatementsOnASingleLine: 'false'
13 | AllowShortLoopsOnASingleLine: 'false'
14 | AlwaysBreakAfterReturnType: None
15 | AlwaysBreakBeforeMultilineStrings: 'true'
16 | AlwaysBreakTemplateDeclarations: 'true'
17 | BinPackArguments: 'true'
18 | BinPackParameters: 'false'
19 | ExperimentalAutoDetectBinPacking: 'false'
20 | BreakBeforeBinaryOperators: NonAssignment
21 | BreakBeforeBraces: Custom
22 | BreakBeforeTernaryOperators: 'false'
23 | BreakConstructorInitializersBeforeComma: 'true'
24 | ColumnLimit: '120'
25 | ConstructorInitializerAllOnOneLineOrOnePerLine: 'false'
26 | Cpp11BracedListStyle: 'true'
27 | IndentCaseLabels: 'true'
28 | IndentWidth: '2'
29 | KeepEmptyLinesAtTheStartOfBlocks: 'true'
30 | Language: Cpp
31 | MaxEmptyLinesToKeep: '2'
32 | NamespaceIndentation: None
33 | ObjCSpaceBeforeProtocolList: 'true'
34 | PointerAlignment: Left
35 | SpaceAfterCStyleCast: 'false'
36 | SpaceBeforeAssignmentOperators: 'true'
37 | SpaceBeforeParens: Never
38 | SpaceInEmptyParentheses: 'false'
39 | SpacesBeforeTrailingComments: '2'
40 | SpacesInAngles: 'false'
41 | SpacesInCStyleCastParentheses: 'false'
42 | SpacesInParentheses: 'false'
43 | SpacesInSquareBrackets: 'false'
44 | Standard: Cpp11
45 | TabWidth: '2'
46 | UseTab: Never
47 | SortIncludes: 'false'
48 | ReflowComments: 'false'
49 | BraceWrapping: {
50 |  AfterClass: 'true'
51 |  AfterControlStatement: 'true'
52 |  AfterEnum: 'true'
53 |  AfterFunction: 'true'
54 |  AfterNamespace: 'false' 
55 |  AfterStruct: 'true'
56 |  AfterUnion: 'true'
57 |  BeforeCatch: 'true'
58 |  BeforeElse: 'true'
59 |  IndentBraces: 'false'
60 | }
61 | PenaltyExcessCharacter: 1
62 | PenaltyBreakBeforeFirstCallParameter: 40
63 | PenaltyBreakFirstLessLess: 1
64 | PenaltyBreakComment: 30
65 | PenaltyBreakString: 30
66 | PenaltyReturnTypeOnItsOwnLine: 9999
67 | BreakStringLiterals: false


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # This is the top-most editor config file
2 | root = true
3 | 
4 | # Default to 2 space indentation for C/C++ files
5 | [*.{c,cpp,h,hpp,inl}]
6 | indent_size = 2
7 | indent_style = space
8 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | #############################
 2 | # generic
 3 | #############################
 4 | 
 5 | *.bak
 6 | 
 7 | #############################
 8 | # spirv/sass
 9 | #############################
10 | 
11 | *.spv
12 | *.spva
13 | *.sass
14 | *.sassbin
15 | 
16 | #############################
17 | #specific to the project
18 | #############################
19 | 
20 | zbsgfxpack.lua
21 | cmake_built
22 | cmake_build
23 | build
24 | _install
25 | bin_x64
26 | external/downloaded_resources/
27 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "external/nv_cluster_lod_builder"]
2 | 	path = external/nv_cluster_lod_builder
3 | 	url = https://github.com/nvpro-samples/nv_cluster_lod_builder.git
4 | 	branch = main
5 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog for vk_lod_clusters
 2 | * 2025-4-26:
 3 |   * Added "Disable back-face culling" to "Scene Complexity" UI.
 4 | * 2025-4-25:
 5 |   * Added "Instance Sorting" option, which sorts instances by distance to the camera. `-instancesorting 0/1`.
 6 |   * Bugfix for glTF meshes with multiple primitives.
 7 | * 2025-4-23:
 8 |   * Add `-processingthreadpct <float 0-1.0>` to control the percentage of threads doing the geometry processing (number of geometries in parallel). Percentage of what the system supports for concurrency. Default is `0.5`.
 9 |   * Add `-processingonly 0/1` to reduce peak memory consumption during processing and saving the cache file. This always saves a cache file (unless the old one was valid) and terminates the application afterwards.
10 | * 2025-4-11:
11 |   * Interleave geometry processing with loading to reduce peak memory consumption.
12 |   * Add visualization of instance bounding boxes
13 | * 2025-4-7:
14 |   * Bugfix to file cache header detection.
15 |   * The file cache can be used via memory mapping, avoiding a copy into system memory. `-mappedcache 0/1` defaults to true.
16 |   * Use "octant" encoding for vertex normals according to [A Survey of Efficient Representations for Independent Unit Vectors](http://jcgt.org/published/0003/02/01/paper.pdf)
17 | * 2025-4-4: 
18 |   * The file cache format now stores everything geometry-related for rendering. Instance and material information, as well as original vertex/triangle counts, still come from the glTF file. The new file ending is `.nvsngeo`, the old `.nvcllod` files no longer work.
19 |   * Added `-autoloadcache 0/1` option to disable loading from a cache file.
20 |   * Some basic preparation to allow working from memory mapped cache files without loading into system memory.
21 | * 2025-2-7:
22 |   * Added _"File > Save Cache"_ menu entry, as well as `-autosavecache 1` option. This allows storing the results of the LoD cluster mesh processing in a file next to the original model.
23 |     This allows speeding up future load times of the model a lot. See new notes in **Model processing** section of README
24 |   * Improved warnings and some memory statistics.
25 |   * Streaming geometry memory now guaranteed to stay within limit.
26 | * 2025-1-30: Initial release


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.6...3.31)
  2 | 
  3 | get_filename_component(PROJNAME ${CMAKE_CURRENT_SOURCE_DIR} NAME)
  4 | Project(${PROJNAME})
  5 | Message(STATUS "-------------------------------")
  6 | Message(STATUS "Processing Project ${PROJNAME}:")
  7 | 
  8 | #####################################################################################
  9 | # look for nvpro_core 1) as a sub-folder 2) at some other locations
 10 | # this cannot be put anywhere else since we still didn't find setup.cmake yet
 11 | 
 12 | # which nvprocore tag or branch to download if repo not found
 13 | set(NVPRO_GIT_TAG main)
 14 | # Where to decompress nvprocore source code if repo not found
 15 | set(NVPRO_TGT_SRC_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps)
 16 | 
 17 | if(NOT BASE_DIRECTORY)
 18 |   find_path(BASE_DIRECTORY
 19 |     NAMES nvpro_core/cmake/setup.cmake
 20 |     PATHS ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/../.. ${CMAKE_CURRENT_SOURCE_DIR}/external
 21 |     DOC "Directory containing nvpro_core"
 22 |     )
 23 | endif()
 24 | if(EXISTS ${BASE_DIRECTORY}/nvpro_core/cmake/setup.cmake)
 25 |   set(OUTPUT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/bin_x64)
 26 |   include(${BASE_DIRECTORY}/nvpro_core/cmake/setup.cmake)
 27 | else()
 28 |   # nvpro_core not found, will try to download.
 29 |   # first find where the current sample comes from
 30 |   execute_process( 
 31 |     COMMAND git config --get remote.origin.url 
 32 |     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 
 33 |     OUTPUT_VARIABLE GIT_REPO_URL OUTPUT_STRIP_TRAILING_WHITESPACE 
 34 |   ) 
 35 |   # Check if "github.com" is in URL
 36 |   string(FIND "${GIT_REPO_URL}" "github.com" FOUND_INDEX)
 37 |   if (FOUND_INDEX GREATER -1)
 38 |     # Use regex to extract everything up to and including "github.com"
 39 |     string(REGEX MATCH ".*github\\.com" GIT_BASE_URL "${GIT_REPO_URL}")
 40 |     # construct URL
 41 |     string(FIND "${GIT_REPO_URL}" "git@" SSH_FOUND_INDEX)
 42 |     if (SSH_FOUND_INDEX GREATER -1) # ssh
 43 |       set(NVPRO_GIT_URL ${GIT_BASE_URL}:nvpro-samples/nvpro_core.git)
 44 |     else() # https
 45 |       set(NVPRO_GIT_URL ${GIT_BASE_URL}/nvpro-samples/nvpro_core.git)
 46 |     endif()
 47 |     if("${NVPRO_GIT_TAG}" STREQUAL "main" )
 48 |       set(NVPRO_GIT_TAG master)
 49 |     endif()
 50 |     message("Sample comes from github , nvprocore is at " ${NVPRO_GIT_URL} )
 51 |   else ()
 52 |     # reconstruct the path to nvpro_core, preserving the protocol
 53 |     string(REGEX MATCH "^[^/]+//[^/]+/" GIT_BASE_URL "${GIT_REPO_URL}")
 54 |     # construct URL
 55 |     set(NVPRO_GIT_URL ${GIT_BASE_URL}devtechproviz/nvpro-samples/nvpro_core.git)
 56 |     # message("Sample comes from prod server, nvprocore is at " ${NVPRO_GIT_URL})
 57 |   endif()
 58 |   # let's clone the commit we need, depth to 1 so that we do not download the full history
 59 |   execute_process( 
 60 |     COMMAND git clone --depth 1 --branch ${NVPRO_GIT_TAG} ${NVPRO_GIT_URL} ${CMAKE_CURRENT_BINARY_DIR}/_deps/nvpro_core
 61 |     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 
 62 |   ) 
 63 |   # do the search again with the downloaded version, use find to be sure everything runs ok
 64 |   find_path(BASE_DIRECTORY
 65 |     NAMES nvpro_core
 66 |     PATHS ${CMAKE_CURRENT_BINARY_DIR}/_deps
 67 |     REQUIRED
 68 |     DOC "Directory containing nvpro_core"
 69 |     )
 70 |   # invoke the setup
 71 |   if(EXISTS ${BASE_DIRECTORY}/nvpro_core/cmake/setup.cmake)
 72 |     set(OUTPUT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/bin_x64)
 73 |     include(${BASE_DIRECTORY}/nvpro_core/cmake/setup.cmake)
 74 |   else()
 75 |     message(FATAL_ERROR "could not find base directory or download nvpro_core, please set BASE_DIRECTORY to folder containing nvpro_core")
 76 |   endif()
 77 | endif()
 78 | set(NVPRO_CORE_DIR ${BASE_DIRECTORY}/nvpro_core)
 79 | 
 80 | _add_project_definitions(${PROJNAME})
 81 | 
 82 | # Download the default scene
 83 | download_files(FILENAMES bunny_v2.zip EXTRACT)
 84 | 
 85 | #####################################################################################
 86 | # additions from packages needed for this sample
 87 | # add refs  in LIBRARIES_OPTIMIZED
 88 | # add refs  in LIBRARIES_DEBUG
 89 | # add files in PACKAGE_SOURCE_FILES
 90 | 
 91 | _add_package_VulkanSDK()
 92 | _add_package_ShaderC()
 93 | _add_package_IMGUI()
 94 | 
 95 | #_add_package_NVML()
 96 | 
 97 | #####################################################################################
 98 | # process the rest of some cmake code that needs to be done *after* the packages add
 99 | _add_nvpro_core_lib()
100 | 
101 | if(NOT TARGET nv_cluster_lod_builder)
102 |   add_subdirectory(external/nv_cluster_lod_builder)
103 | endif()
104 | 
105 | #####################################################################################
106 | # Source files for this project
107 | #
108 | file(GLOB SOURCE_FILES src/*.*)
109 | file(GLOB SHADER_FILES shaders/*.glsl shaders/*.h)
110 | list(APPEND SHADER_FILES ${NVPRO_CORE_DIR}/nvvkhl/shaders/dh_sky.h)
111 | file(GLOB VK_RADIX_SORT_FILES thirdparty/vulkan_radix_sort/src/vk_radix_sort.cc)
112 | 
113 | include_directories(${CMAKE_CURRENT_SOURCE_DIR})
114 | include_directories(${NVPRO_CORE_DIR}/nvvkhl/shaders)
115 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/vulkan_radix_sort/include)
116 | 
117 | #####################################################################################
118 | # Executable
119 | #
120 | 
121 | if(WIN32 AND NOT GLUT_FOUND)
122 |   add_definitions(/wd4996) #remove printf warning
123 |   add_definitions(/wd4244) #remove double to float conversion warning
124 |   add_definitions(/wd4305) #remove double to float truncation warning
125 | else()
126 |   add_definitions(-fpermissive)
127 | endif()
128 | add_executable(${PROJNAME} ${SOURCE_FILES} ${COMMON_SOURCE_FILES} ${PACKAGE_SOURCE_FILES} ${SHADER_FILES} ${MESHOPT_FILES} ${VK_RADIX_SORT_FILES})
129 | 
130 | set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${PROJNAME})
131 | 
132 | target_compile_definitions(${PROJNAME} PRIVATE NVPRO_CORE_DIR="${NVPRO_CORE_DIR}")
133 | #####################################################################################
134 | # common source code needed for this sample
135 | #
136 | source_group(common FILES 
137 |   ${COMMON_SOURCE_FILES}
138 |   ${PACKAGE_SOURCE_FILES}
139 | )
140 | source_group("Shader Files" FILES ${SHADER_FILES})
141 | source_group("Source Files" FILES ${SOURCE_FILES})
142 | source_group("meshoptimizer" FILES ${MESHOPT_FILES})
143 | 
144 | if(UNIX)
145 |   set(UNIXLINKLIBS dl pthread)
146 | else()
147 |   set(UNIXLINKLIBS)
148 | endif()
149 | 
150 | #####################################################################################
151 | # Linkage
152 | #
153 | 
154 | target_link_libraries(${PROJNAME} ${PLATFORM_LIBRARIES} nvpro_core nv_cluster_lod_builder meshoptimizer)
155 | 
156 | foreach(DEBUGLIB ${LIBRARIES_DEBUG})
157 |   target_link_libraries(${PROJNAME} debug ${DEBUGLIB})
158 | endforeach(DEBUGLIB)
159 | 
160 | foreach(RELEASELIB ${LIBRARIES_OPTIMIZED})
161 |   target_link_libraries(${PROJNAME} optimized ${RELEASELIB})
162 | endforeach(RELEASELIB)
163 | 
164 | #####################################################################################
165 | # copies binaries that need to be put next to the exe files (ZLib, etc.)
166 | #
167 | 
168 | _finalize_target( ${PROJNAME} )
169 | 
170 | install(FILES ${SHADER_FILES} CONFIGURATIONS Release DESTINATION "bin_${ARCH}/GLSL_${PROJNAME}")
171 | install(FILES ${SHADER_FILES} CONFIGURATIONS Debug DESTINATION "bin_${ARCH}_debug/GLSL_${PROJNAME}")
172 | 


--------------------------------------------------------------------------------
/CONTRIBUTING:
--------------------------------------------------------------------------------
 1 | https://developercertificate.org/
 2 | 
 3 | Developer Certificate of Origin
 4 | Version 1.1
 5 | 
 6 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
 7 | 
 8 | Everyone is permitted to copy and distribute verbatim copies of this
 9 | license document, but changing it is not allowed.
10 | 
11 | 
12 | Developer's Certificate of Origin 1.1
13 | 
14 | By making a contribution to this project, I certify that:
15 | 
16 | (a) The contribution was created in whole or in part by me and I
17 |     have the right to submit it under the open source license
18 |     indicated in the file; or
19 | 
20 | (b) The contribution is based upon previous work that, to the best
21 |     of my knowledge, is covered under an appropriate open source
22 |     license and I have the right under that license to submit that
23 |     work with modifications, whether created in whole or in part
24 |     by me, under the same open source license (unless I am
25 |     permitted to submit under a different license), as indicated
26 |     in the file; or
27 | 
28 | (c) The contribution was provided directly to me by some other
29 |     person who certified (a), (b) or (c) and I have not modified
30 |     it.
31 | 
32 | (d) I understand and agree that this project and the contribution
33 |     are public and that a record of the contribution (including all
34 |     personal information I submit with it, including my sign-off) is
35 |     maintained indefinitely and may be redistributed consistent with
36 |     this project or the open source license(s) involved.


--------------------------------------------------------------------------------
/docs/continuous_lod_clusters.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nvpro-samples/vk_lod_clusters/39f62a8d10ddd52c372dbd904316dda5d76e6d2b/docs/continuous_lod_clusters.png


--------------------------------------------------------------------------------
/docs/lod_allocation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nvpro-samples/vk_lod_clusters/39f62a8d10ddd52c372dbd904316dda5d76e6d2b/docs/lod_allocation.png


--------------------------------------------------------------------------------
/docs/lod_rendering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nvpro-samples/vk_lod_clusters/39f62a8d10ddd52c372dbd904316dda5d76e6d2b/docs/lod_rendering.png


--------------------------------------------------------------------------------
/docs/lod_streaming.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nvpro-samples/vk_lod_clusters/39f62a8d10ddd52c372dbd904316dda5d76e6d2b/docs/lod_streaming.png


--------------------------------------------------------------------------------
/docs/otherscenes.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nvpro-samples/vk_lod_clusters/39f62a8d10ddd52c372dbd904316dda5d76e6d2b/docs/otherscenes.jpg


--------------------------------------------------------------------------------
/shaders/blas_clusters_insert.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | /*
 21 |   
 22 |   Shader Description
 23 |   ==================
 24 |   
 25 |   This compute shader inserts the CLAS clusters that should be rendered
 26 |   into the cluster references list for each instance's BLAS.
 27 | 
 28 |   A single thread represents one CLAS
 29 | */
 30 | 
 31 | #version 460
 32 | 
 33 | 
 34 | #extension GL_GOOGLE_include_directive : enable
 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 38 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 39 | #extension GL_EXT_buffer_reference : enable
 40 | #extension GL_EXT_buffer_reference2 : enable
 41 | #extension GL_EXT_scalar_block_layout : enable
 42 | #extension GL_EXT_shader_atomic_int64 : enable
 43 | 
 44 | #extension GL_EXT_control_flow_attributes : require
 45 | #extension GL_KHR_shader_subgroup_vote : require
 46 | #extension GL_KHR_shader_subgroup_ballot : require
 47 | #extension GL_KHR_shader_subgroup_shuffle : require
 48 | #extension GL_KHR_shader_subgroup_basic : require
 49 | #extension GL_KHR_shader_subgroup_clustered : require
 50 | #extension GL_KHR_shader_subgroup_arithmetic : require
 51 | 
 52 | #include "shaderio.h"
 53 | 
 54 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
 55 | {
 56 |   FrameConstants view;
 57 | };
 58 | 
 59 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 60 | {
 61 |   Readback readback;
 62 | };
 63 | 
 64 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer
 65 | {
 66 |   RenderInstance instances[];
 67 | };
 68 | 
 69 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 70 | {
 71 |   Geometry geometries[];
 72 | };
 73 | 
 74 | layout(binding = BINDINGS_HIZ_TEX)  uniform sampler2D texHizFar;
 75 | 
 76 | layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer
 77 | {
 78 |   SceneBuilding build;  
 79 | };
 80 | 
 81 | layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) buffer buildBufferRW
 82 | {
 83 |   SceneBuilding buildRW;  
 84 | };
 85 | 
 86 | #if USE_STREAMING
 87 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer
 88 | {
 89 |   SceneStreaming streaming;
 90 | };
 91 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW
 92 | {
 93 |   SceneStreaming streamingRW;
 94 | };
 95 | #endif
 96 | 
 97 | ////////////////////////////////////////////
 98 | 
 99 | layout(local_size_x=BLAS_INSERT_CLUSTERS_WORKGROUP) in;
100 | 
101 | ////////////////////////////////////////////
102 | 
103 | // Appends the CLAS address of one render cluster to the cluster references
104 | // list of its instance's BLAS. One invocation handles one list entry.
105 | void main()
106 | {
107 |   uint listIndex = gl_GlobalInvocationID.x;
108 | 
109 |   // invocations past the end of the render cluster list have no work
110 |   if (listIndex >= build.renderClusterCounter)
111 |     return;
112 | 
113 |   ClusterInfo info = build.renderClusterInfos.d[listIndex];
114 | 
115 | #if USE_STREAMING
116 |   uint64_t clasAddress = streaming.resident.clasAddresses.d[info.clusterID];
117 | #else
118 |   Geometry geometry    = geometries[instances[info.instanceID].geometryID];
119 |   uint64_t clasAddress = geometry.preloadedClusterClasAddresses.d[info.clusterID];
120 | #endif
121 | 
122 |   // reserve the next free slot in this instance's list and store the address there
123 |   uint slot = atomicAdd(build.blasBuildInfos.d[info.instanceID].clusterReferencesCount, 1);
124 |   uint64s_inout refList = uint64s_inout(build.blasBuildInfos.d[info.instanceID].clusterReferences);
125 |   refList.d[slot] = clasAddress;
126 | 
127 |   // statistics only: accumulate the triangle count of every inserted cluster
128 | #if USE_STREAMING
129 |   uint triangleCount = Cluster_in(streaming.resident.clusters.d[info.clusterID]).d.triangleCountMinusOne + 1;
130 | #else
131 |   uint triangleCount = geometry.preloadedClusters.d[info.clusterID].triangleCountMinusOne + 1;
132 | #endif
133 |   atomicAdd(readback.numRenderedTriangles, triangleCount);
134 | }


--------------------------------------------------------------------------------
/shaders/blas_setup_insertion.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | /*
 20 |   
 21 |   Shader Description
 22 |   ==================
 23 | 
 24 |   This compute shader sets up the per BLAS cluster references list start pointer.
 25 |   It does so by simply adding up the per-blas references counts that were filled during 
 26 |   `traversal_run.comp.glsl`.
 27 |   These count values are also reset, so that the `blas_clusters_insert.comp.glsl` kernel
 28 |   can increment them again when filling the lists.
 29 |   
 30 |   A single thread represents one BLAS
 31 | */
 32 | 
 33 | #version 460
 34 | 
 35 | #extension GL_GOOGLE_include_directive : enable
 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 38 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 39 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 40 | #extension GL_EXT_buffer_reference : enable
 41 | #extension GL_EXT_buffer_reference2 : enable
 42 | #extension GL_EXT_scalar_block_layout : enable
 43 | #extension GL_EXT_shader_atomic_int64 : enable
 44 | 
 45 | #extension GL_EXT_control_flow_attributes : require
 46 | #extension GL_KHR_shader_subgroup_vote : require
 47 | #extension GL_KHR_shader_subgroup_ballot : require
 48 | #extension GL_KHR_shader_subgroup_shuffle : require
 49 | #extension GL_KHR_shader_subgroup_basic : require
 50 | #extension GL_KHR_shader_subgroup_clustered : require
 51 | #extension GL_KHR_shader_subgroup_arithmetic : require
 52 | 
 53 | #include "shaderio.h"
 54 | 
 55 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
 56 | {
 57 |   FrameConstants view;
 58 | };
 59 | 
 60 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 61 | {
 62 |   Readback readback;
 63 | };
 64 | 
 65 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer
 66 | {
 67 |   RenderInstance instances[];
 68 | };
 69 | 
 70 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 71 | {
 72 |   Geometry geometries[];
 73 | };
 74 | 
 75 | layout(binding = BINDINGS_HIZ_TEX)  uniform sampler2D texHizFar;
 76 | 
 77 | layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer
 78 | {
 79 |   SceneBuilding build;  
 80 | };
 81 | 
 82 | layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) buffer buildBufferRW
 83 | {
 84 |   SceneBuilding buildRW;  
 85 | };
 86 | 
 87 | ////////////////////////////////////////////
 88 | 
 89 | layout(local_size_x=BLAS_SETUP_INSERTION_WORKGROUP) in;
 90 | 
 91 | ////////////////////////////////////////////
 92 | 
 93 | 
 94 | // Gives each BLAS (one invocation per render instance) its slice of the shared
 95 | // cluster references array: the per-BLAS counts are summed through an atomic
 96 | // counter to obtain the slice start, then reset so the insertion pass can
 97 | // increment them again while filling the list.
 98 | void main()
 99 | {
100 |   uint instanceID = gl_GlobalInvocationID.x;
101 |   
102 |   if (instanceID < build.numRenderInstances)
103 |   {
104 |     uint referencesCount  = build.blasBuildInfos.d[instanceID].clusterReferencesCount;
105 |     uint referencesOffset = atomicAdd(buildRW.blasClasCounter, referencesCount);
106 |     // reset count for insertion pass
107 |     build.blasBuildInfos.d[instanceID].clusterReferencesCount  = 0;
108 |     build.blasBuildInfos.d[instanceID].clusterReferencesStride = 8;
109 |     // widen to 64-bit before scaling by the 8-byte stride: a 32-bit multiply
110 |     // would wrap for offsets >= 2^29 and yield a wrong start address
111 |     build.blasBuildInfos.d[instanceID].clusterReferences       = uint64_t(buildRW.blasClusterAddresses) + uint64_t(referencesOffset) * uint64_t(8);
112 |     
113 |     // sum up last frame's result for statistics
114 |     atomicAdd(readback.blasActualSizes, uint64_t(build.blasBuildSizes.d[instanceID]));
115 |   }
116 | }


--------------------------------------------------------------------------------
/shaders/build_setup.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | /*
 20 |   
 21 |   Shader Description
 22 |   ==================
 23 |   
 24 |   This compute shader does basic operations on a single thread.
 25 |   For example clamping atomic counters back to their limits or
 26 |   setting up indirect dispatches or draws etc.
 27 |   
 28 |   BUILD_SETUP_... are enums for the various operations
 29 | 
 30 | */
 31 | 
 32 | #version 460
 33 | 
 34 | #extension GL_GOOGLE_include_directive : enable
 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 38 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 39 | #extension GL_EXT_buffer_reference : enable
 40 | #extension GL_EXT_buffer_reference2 : enable
 41 | #extension GL_EXT_scalar_block_layout : enable
 42 | #extension GL_EXT_shader_atomic_int64 : enable
 43 | 
 44 | #extension GL_EXT_control_flow_attributes : require
 45 | #extension GL_KHR_shader_subgroup_ballot : require
 46 | #extension GL_KHR_shader_subgroup_shuffle : require
 47 | #extension GL_KHR_shader_subgroup_basic : require
 48 | #extension GL_KHR_shader_subgroup_clustered : require
 49 | #extension GL_KHR_shader_subgroup_arithmetic : require
 50 | 
 51 | #include "shaderio.h"
 52 | 
 53 | layout(push_constant) uniform pushData
 54 | {
 55 |   uint setup;  // selects which branch of main() runs; compared against the BUILD_SETUP_... values
 56 | } push;
 57 | 
 58 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
 59 | {
 60 |   FrameConstants view;
 61 |   FrameConstants viewLast;
 62 | };
 63 | 
 64 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 65 | {
 66 |   Readback readback;  // main() stores the cluster / traversal statistics here
 67 | };
 68 | 
 69 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer
 70 | {
 71 |   RenderInstance instances[];
 72 | };
 73 | 
 74 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 75 | {
 76 |   Geometry geometries[];
 77 | };
 78 | 
 79 | layout(binding = BINDINGS_HIZ_TEX)  uniform sampler2D texHizFar;
 80 | 
 81 | layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer
 82 | {
 83 |   SceneBuilding build;  // uniform view; main() only reads the max... limits from it
 84 | };
 85 | 
 86 | layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) coherent buffer buildBufferRW
 87 | {
 88 |   SceneBuilding buildRW;  // storage view; main() reads the counters and writes clamped values / indirect arguments
 89 | };
 90 | 
 91 | ////////////////////////////////////////////
 92 | 
 93 | layout(local_size_x=1) in;  // one invocation per workgroup
 94 | 
 95 | ////////////////////////////////////////////
 96 | 
 97 | void main()
 98 | {
 99 |   // a single invocation post-processes counters written by earlier passes:
100 |   // it clamps them to their capacities and fills indirect arguments
101 |   
102 |   if (push.setup == BUILD_SETUP_TRAVERSAL_RUN)
103 |   {
104 |     // traversal_init may have pushed traversalTaskCounter past maxTraversalInfos
105 |     int clampedTasks = min(buildRW.traversalTaskCounter, int(build.maxTraversalInfos));
106 |     // new jobs are enqueued behind the initial tasks, so the write counter starts at the same value
107 |     buildRW.traversalInfoWriteCounter = uint(clampedTasks);
108 |     buildRW.traversalTaskCounter      = clampedTasks;
109 |   }
110 | #if TARGETS_RASTERIZATION
111 |   else if (push.setup == BUILD_SETUP_DRAW)
112 |   {
113 |     // traversal_run may have pushed renderClusterCounter past maxRenderClusters
114 |     uint requested = buildRW.renderClusterCounter;
115 |     uint drawn     = min(requested, build.maxRenderClusters);
116 |     
117 |     // the indirect draw covers only the clusters that fit
118 |     buildRW.indirectDrawClusters.first = 0;
119 |     buildRW.indirectDrawClusters.count = drawn;
120 | 
121 |     // statistics keep both the unclamped and the clamped count
122 |     readback.numRenderClusters    = requested;
123 |     readback.numRenderedClusters  = drawn;
124 |     // plus the current value of the traversal write counter
125 |     readback.numTraversalInfos    = buildRW.traversalInfoWriteCounter;
126 |   }
127 | #endif
128 | #if TARGETS_RAY_TRACING
129 |   else if (push.setup == BUILD_SETUP_BLAS_INSERTION)
130 |   {
131 |     // traversal_run may have pushed renderClusterCounter past maxRenderClusters
132 |     uint requested = buildRW.renderClusterCounter;
133 |     uint inserted  = min(requested, build.maxRenderClusters);
134 |     
135 |     // store the clamped count back and size the indirect dispatch: ceil(inserted / BLAS_INSERT_CLUSTERS_WORKGROUP) groups
136 |     buildRW.renderClusterCounter = inserted;
137 |     buildRW.indirectDispatchBlasInsertion.gridX = (inserted + BLAS_INSERT_CLUSTERS_WORKGROUP-1) / BLAS_INSERT_CLUSTERS_WORKGROUP;
138 |     buildRW.indirectDispatchBlasInsertion.gridY = 1;
139 |     buildRW.indirectDispatchBlasInsertion.gridZ = 1;
140 | 
141 |     // statistics keep both the unclamped and the clamped count
142 |     readback.numRenderClusters    = requested;
143 |     readback.numRenderedClusters  = inserted;
144 |     // plus the current value of the traversal write counter
145 |     readback.numTraversalInfos    = buildRW.traversalInfoWriteCounter;
146 |   }
147 | #endif
148 | }


--------------------------------------------------------------------------------
/shaders/culling.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | /*
 20 | 
 21 |   Utility code for frustum and occlusion culling of 
 22 |   bounding boxes
 23 |   
 24 | */
 25 | 
 26 | const float c_epsilon    = 1.2e-07f;  // getClip() reports w values strictly inside (-c_epsilon, c_epsilon) as invalid
 27 | const float c_depthNudge = 2.0/float(1<<24);  // bias added to the sampled HiZ depth in intersectHiz()
 28 | 
 29 | bool intersectSize(vec4 clipMin, vec4 clipMax)
 30 | {
 31 |   // true when the rect is wider or taller than 2 / last frame's viewport size (one pixel if the inputs are NDC in [-1,1])
 32 |   vec2 minExtent = vec2(2.0) / viewLast.viewportf.xy;
 33 |   return any(greaterThan(clipMax.xy - clipMin.xy, minExtent));
 34 | }
 35 | 
 36 | vec4 getClip(vec4 hPos, out bool valid) {  // returns xyz divided by |w| (the sign of w is not applied) and w unchanged
 37 |   valid = !(-c_epsilon < hPos.w && hPos.w < c_epsilon);  // false only when w lies strictly inside (-c_epsilon, c_epsilon)
 38 |   return vec4(hPos.xyz / abs(hPos.w), hPos.w);  // the division is performed even when valid is false
 39 | }
 40 | 
 41 | uint getCullBits(vec4 hPos)
 42 | {
 43 |   uint outside = 0;                        // one bit per failed clip-volume test
 44 |   if (hPos.x < -hPos.w) outside |=  1u;    // x below -w
 45 |   if (hPos.x >  hPos.w) outside |=  2u;    // x above +w
 46 |   if (hPos.y < -hPos.w) outside |=  4u;    // y below -w
 47 |   if (hPos.y >  hPos.w) outside |=  8u;    // y above +w
 48 |   if (hPos.z <  0)      outside |= 16u;    // z below 0 (the tested z range is [0, w])
 49 |   if (hPos.z >  hPos.w) outside |= 32u;    // z above w
 50 |   if (hPos.w <= 0)      outside |= 64u;    // w is not positive
 51 |   return outside;
 52 | }
 53 | 
 54 | vec4 getBoxCorner(vec3 bboxMin, vec3 bboxMax, int n)
 55 | {
 56 |   // bits 0, 1, 2 of n choose bboxMax (bit set) or bboxMin (bit clear) for x, y, z; w is 1
 57 |   return vec4(mix(bboxMin, bboxMax, bvec3(bool(n & 1), bool(n & 2), bool(n & 4))), 1.0);
 58 | }
 59 | 
 60 | bool intersectFrustum(vec3 bboxMin, vec3 bboxMax, mat4 worldTM, out vec4 oClipmin, out vec4 oClipmax, out bool oClipvalid)
 61 | {
 62 |   // the box corners are projected with last frame's view-projection matrix
 63 |   mat4 boxToClip = viewLast.viewProjMatrix * worldTM;
 64 |   // corner 0 seeds the running bounds, the common outside bits and the validity flag
 65 |   bool cornerValid;
 66 |   vec4 corner     = boxToClip * getBoxCorner(bboxMin, bboxMax, 0);
 67 |   vec4 projected  = getClip(corner, cornerValid);
 68 |   uint commonBits = getCullBits(corner);
 69 |   vec4 boundsLo   = projected;
 70 |   vec4 boundsHi   = projected;
 71 |   bool allValid   = cornerValid;
 72 |   [[unroll]]
 73 |   for (int i = 1; i < 8; i++){
 74 |     corner    = boxToClip * getBoxCorner(bboxMin, bboxMax, i);
 75 |     projected = getClip(corner, cornerValid);
 76 |     // a bit survives only when every corner fails that same test
 77 |     commonBits &= getCullBits(corner);
 78 |     // TODO unroll manually with independent paired min/max to allow instruction parallelism
 79 |     boundsLo = min(boundsLo, projected);
 80 |     boundsHi = max(boundsHi, projected);
 81 | 
 82 |     allValid = allValid && cornerValid;
 83 |   }
 84 |   
 85 |   // x/y of the bounds are clamped to [-1,1]; z/w are passed through unclamped
 86 |   oClipmin   = vec4(clamp(boundsLo.xy, vec2(-1), vec2(1)), boundsLo.zw);
 87 |   oClipmax   = vec4(clamp(boundsHi.xy, vec2(-1), vec2(1)), boundsHi.zw);
 88 |   oClipvalid = allValid;
 89 |   // the box is kept unless one single test rejects all eight corners
 90 |   return commonBits == 0;
 91 | }
 92 | 
 93 | bool intersectHiz(vec4 clipMin, vec4 clipMax)
 94 | {
 95 |   // remap x/y from [-1,1] to [0,1], scale by hizSizeFactors.xy and cap at hizSizeFactors.zw
 96 |   vec2 uvLo = min((clipMin.xy * 0.5 + 0.5) * viewLast.hizSizeFactors.xy, viewLast.hizSizeFactors.zw);
 97 |   vec2 uvHi = min((clipMax.xy * 0.5 + 0.5) * viewLast.hizSizeFactors.xy, viewLast.hizSizeFactors.zw);
 98 |   
 99 |   // the mip level follows from the longer side of the rectangle scaled by hizSizeMax
100 |   // (presumably the side length in texels of the base level - TODO confirm)
101 |   vec2  extent  = uvHi - uvLo;
102 |   float longest = max(extent.x, extent.y) * viewLast.hizSizeMax;
103 |   float lod     = ceil(log2(longest));
104 | 
105 |   // one sample at the center of the rectangle in that mip
106 |   float hizDepth = textureLod(texHizFar, ((uvLo + uvHi)*0.5), lod).r;
107 | 
108 |   // passes when the minimum z of the box does not exceed the sampled depth plus c_depthNudge
109 |   // (texHizFar presumably stores the farthest depth per texel - TODO confirm)
110 |   bool visible = clipMin.z <= hizDepth + c_depthNudge;
111 |   return visible;
112 | }


--------------------------------------------------------------------------------
/shaders/fullscreen.vert.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License, Version 2.0 (the "License");
 5 | * you may not use this file except in compliance with the License.
 6 | * You may obtain a copy of the License at
 7 | *
 8 | *     http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *
16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
17 | * SPDX-License-Identifier: Apache-2.0
18 | */
19 | 
20 | #version 460
21 | 
22 | // Simplistic screen-covering triangle
23 | 
24 | layout(location = 0) out vec2 uv;
25 | 
26 | void main()
27 | {
28 |   // vertex index 0, 1, 2 -> corner (0,0), (0,2), (2,0): a single triangle that covers the screen
29 |   vec2 corner = vec2((gl_VertexIndex == 2) ? 2.0 : 0.0, (gl_VertexIndex == 1) ? 2.0 : 0.0);
30 |   gl_Position = vec4(corner * vec2(2.0, -2.0) + vec2(-1.0, 1.0), 0.0, 1.0);
31 |   uv          = vec2(corner.x, 1.0 - corner.y);  // y of the output uv is flipped after the position was derived
32 | }
33 | 


--------------------------------------------------------------------------------
/shaders/fullscreen_write_depth.frag.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License, Version 2.0 (the "License");
 5 | * you may not use this file except in compliance with the License.
 6 | * You may obtain a copy of the License at
 7 | *
 8 | *     http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *
16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
17 | * SPDX-License-Identifier: Apache-2.0
18 | */
19 | 
20 | /*
21 |   
22 |   Shader Description
23 |   ==================
24 |   
25 |   A fragment shader that writes the ray tracing depth into the
26 |   framebuffer's depth buffer.
27 | 
28 | */
29 | 
30 | #version 460
31 | #extension GL_GOOGLE_include_directive : enable
32 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
33 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
34 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
35 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
36 | #extension GL_EXT_buffer_reference : enable
37 | #extension GL_EXT_buffer_reference2 : enable
38 | #extension GL_EXT_scalar_block_layout : enable
39 | 
40 | #include "shaderio.h"
41 | 
42 | layout(set = 0, binding = BINDINGS_RAYTRACING_DEPTH, r32f) uniform image2D imgRaytracingDepth;
43 | 
44 | void main()
45 | {
46 |   // copy the value stored for this fragment's pixel in imgRaytracingDepth to the depth output
47 |   gl_FragDepth = imageLoad(imgRaytracingDepth, ivec2(gl_FragCoord.xy)).r;
48 | }


--------------------------------------------------------------------------------
/shaders/hbao.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2018-2023, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  *
 16 |  * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
 17 |  * SPDX-License-Identifier: Apache-2.0
 18 |  */
 19 | 
 20 | #ifndef NVHBAO_H_
 21 | #define NVHBAO_H_
 22 | 
 23 | #define NVHBAO_RANDOMTEX_SIZE 4
 24 | #define NVHBAO_NUM_DIRECTIONS 8
 25 | 
 26 | #define NVHBAO_MAIN_UBO 0
 27 | #define NVHBAO_MAIN_TEX_DEPTH 1
 28 | #define NVHBAO_MAIN_TEX_LINDEPTH 2
 29 | #define NVHBAO_MAIN_TEX_VIEWNORMAL 3
 30 | #define NVHBAO_MAIN_TEX_DEPTHARRAY 4
 31 | #define NVHBAO_MAIN_TEX_RESULTARRAY 5
 32 | #define NVHBAO_MAIN_TEX_RESULT 6
 33 | #define NVHBAO_MAIN_TEX_BLUR 7
 34 | #define NVHBAO_MAIN_IMG_LINDEPTH 8
 35 | #define NVHBAO_MAIN_IMG_VIEWNORMAL 9
 36 | #define NVHBAO_MAIN_IMG_DEPTHARRAY 10
 37 | #define NVHBAO_MAIN_IMG_RESULTARRAY 11
 38 | #define NVHBAO_MAIN_IMG_RESULT 12
 39 | #define NVHBAO_MAIN_IMG_BLUR 13
 40 | #define NVHBAO_MAIN_IMG_OUT 14
 41 | 
 42 | #ifndef NVHBAO_BLUR
 43 | #define NVHBAO_BLUR 1
 44 | #endif
 45 | 
 46 | // 1 is slower
 47 | #ifndef NVHBAO_SKIP_INTERPASS
 48 | #define NVHBAO_SKIP_INTERPASS 0
 49 | #endif
 50 | 
 51 | #ifdef __cplusplus
 52 | namespace glsl {
 53 | using namespace glm;
 54 | #endif
 55 | 
 56 | struct NVHBAOData
 57 | {
 58 |   float RadiusToScreen;  // hbao_calc divides this by the view depth (by 1 for ortho) to get the AO radius in pixels
 59 |   float R2;              // presumably radius * radius - TODO confirm against the host code
 60 |   float NegInvR2;        // slope of the falloff d^2 * NegInvR2 + 1 (presumably -1 / R2 - TODO confirm)
 61 |   float NDotVBias;
 62 | 
 63 |   vec2 InvFullResolution;
 64 |   vec2 InvQuarterResolution;
 65 | 
 66 |   ivec2 SourceResolutionScale;
 67 |   float AOMultiplier;
 68 |   float PowExponent;
 69 | 
 70 |   vec4  projReconstruct;
 71 |   vec4  projInfo;  // hbao_calc reconstructs view xy as uv * projInfo.xy + projInfo.zw
 72 |   int   projOrtho;  // non-zero selects the orthographic code paths in hbao_calc
 73 |   int   _pad0;
 74 |   ivec2 _pad1;
 75 | 
 76 |   ivec2 FullResolution;
 77 |   ivec2 QuarterResolution;
 78 | 
 79 |   mat4 InvProjMatrix;
 80 | 
 81 |   vec4 float2Offsets[NVHBAO_RANDOMTEX_SIZE * NVHBAO_RANDOMTEX_SIZE];  // indexed with push.layer in hbao_calc
 82 |   vec4 jitters[NVHBAO_RANDOMTEX_SIZE * NVHBAO_RANDOMTEX_SIZE];  // indexed with push.layer in hbao_calc
 83 | };
 84 | 
 85 | // keep all these equal size
 86 | struct NVHBAOMainPush
 87 | {
 88 |   int   layer;  // hbao_calc uses this as array layer and as index into float2Offsets / jitters
 89 |   int   _pad0;
 90 |   ivec2 _pad1;
 91 | };
 92 | 
 93 | struct NVHBAOBlurPush
 94 | {
 95 |   vec2  invResolutionDirection;  // uv step between neighbouring blur taps (hbao_blur.glsl multiplies it by the tap index)
 96 |   float sharpness;  // scales the difference of the second channel inside the blur weight
 97 |   float _pad;
 98 | };
 99 | 
100 | #ifdef __cplusplus
101 | }
102 | #else
103 | 
104 | layout(std140, binding = NVHBAO_MAIN_UBO) uniform controlBuffer
105 | {
106 |   NVHBAOData control;
107 | };
108 | 
109 | #ifndef NVHABO_GFX
110 | 
111 | layout(local_size_x = 32, local_size_y = 2) in;
112 | 
113 | bool setupCoord(inout ivec2 coord, inout vec2 texCoord, ivec2 res, vec2 invRes)
114 | {
115 |   // the 32x2 invocations of a workgroup are laid out as an 8x8 pixel tile:
116 |   // x 0..15 fill a 2-wide, 8-tall strip, x 16..31 the strip two columns further right,
117 |   // and y moves both strips four columns to the right
118 |   ivec2 lane = ivec2(gl_LocalInvocationID.xy);
119 |   ivec2 cell = ivec2(lane.x & 1, lane.x / 2);
120 |   if (lane.x >= 16) cell += ivec2(2, -8);
121 |   cell.x += lane.y * 4;
122 |   coord = ivec2(gl_WorkGroupID.xy) * 8 + cell;
123 | 
124 |   // returns true for pixels outside res; coord is written either way, texCoord only for pixels inside
125 |   if (any(greaterThanEqual(coord, res))) return true;
126 |   texCoord = (vec2(coord) + vec2(0.5)) * invRes;
127 |   return false;
128 | }
129 | 
130 | bool setupCoordFull(inout ivec2 coord, inout vec2 texCoord)  // setupCoord() against the full-resolution size; true means "outside, skip"
131 | {
132 |   return setupCoord(coord, texCoord, control.FullResolution, control.InvFullResolution);
133 | }
134 | 
135 | bool setupCoordQuarter(inout ivec2 coord, inout vec2 texCoord)  // setupCoord() against the quarter-resolution size; true means "outside, skip"
136 | {
137 |   return setupCoord(coord, texCoord, control.QuarterResolution, control.InvQuarterResolution);
138 | }
139 | 
140 | #endif
141 | 
142 | #endif
143 | #endif
144 | 


--------------------------------------------------------------------------------
/shaders/hbao_blur.comp.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2018-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  *
16 |  * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
17 |  * SPDX-License-Identifier: Apache-2.0
18 |  */
19 | 
20 | #version 460
21 | #extension GL_GOOGLE_include_directive : enable
22 | #extension GL_EXT_control_flow_attributes : require
23 | 
24 | #include "hbao.h"
25 | 
26 | layout(binding=NVHBAO_MAIN_IMG_BLUR, rg16f) uniform image2D   imgBlur;
27 | layout(binding=NVHBAO_MAIN_TEX_RESULT)      uniform sampler2D texSource;
28 | 
29 | #include "hbao_blur.glsl"
30 | 
31 | //-------------------------------------------------------------------------
32 | 
33 | void main()
34 | {
35 |   ivec2 pixel;
36 |   vec2  pixelUV;
37 |   // invocations outside the full-resolution image have nothing to write
38 |   if (setupCoordFull(pixel, pixelUV)) return;
39 |   
40 |   // x: blurred value, y: second channel of the center sample as returned by BlurRun
41 |   imageStore(imgBlur, pixel, vec4(BlurRun(pixelUV), 0, 0));
42 | }
43 | 


--------------------------------------------------------------------------------
/shaders/hbao_blur.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2018-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  *
16 |  * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
17 |  * SPDX-License-Identifier: Apache-2.0
18 |  */
19 | 
20 | layout(push_constant) uniform pushData {
21 |   NVHBAOBlurPush  blur;  // tap step and sharpness used by BlurFunction / BlurRun
22 | };
23 | 
24 | 
25 | const float KERNEL_RADIUS = 3;  // number of taps BlurRun takes on each side of the center sample
26 | 
27 | //-------------------------------------------------------------------------
28 | 
29 | float BlurFunction(vec2 uv, float r, float center_c, float center_d, inout float w_total)
30 | {
31 |   // distance falloff term; sigma is half of KERNEL_RADIUS
32 |   const float sigma   = float(KERNEL_RADIUS) * 0.5;
33 |   const float falloff = 1.0 / (2.0*sigma*sigma);
34 | 
35 |   // x: value being blurred, y: value compared against center_d
36 |   vec2 tap = texture(texSource, uv).xy;
37 | 
38 |   // the weight shrinks with the tap distance r and with the sharpness-scaled difference to center_d
39 |   float delta  = (tap.y - center_d) * blur.sharpness;
40 |   float weight = exp2(-r*r*falloff - delta*delta);
41 |   w_total += weight;  // running weight sum owned by the caller
42 |   return tap.x*weight;  // weighted value; center_c is not used in here
43 | }
44 | 
45 | vec2 BlurRun(vec2 texCoord)
46 | {
47 |   // the center sample enters the sums with weight 1
48 |   vec2  center    = texture(texSource, texCoord).xy;
49 |   float valueSum  = center.x;
50 |   float weightSum = 1.0;
51 | 
52 |   // both loops accumulate into valueSum / weightSum (BlurFunction adds each tap's weight)
53 |   // first: KERNEL_RADIUS taps along +blur.invResolutionDirection, nearest first
54 |   [[unroll]]
55 |   for (float tap = 1; tap <= KERNEL_RADIUS; ++tap)
56 |   {
57 |     vec2 tapUV = texCoord + blur.invResolutionDirection * tap;
58 |     valueSum += BlurFunction(tapUV, tap, center.x, center.y, weightSum);
59 |   }
60 |   // then: the mirrored taps along -blur.invResolutionDirection, nearest first
61 |   [[unroll]]
62 |   for (float tap = 1; tap <= KERNEL_RADIUS; ++tap)
63 |   {
64 |     vec2 tapUV = texCoord - blur.invResolutionDirection * tap;
65 |     valueSum += BlurFunction(tapUV, tap, center.x, center.y, weightSum);
66 |   }
67 |   
68 |   // x: weighted average, y: second channel of the center sample, unchanged
69 |   return vec2(valueSum/weightSum, center.y);
70 | }
71 | 


--------------------------------------------------------------------------------
/shaders/hbao_blur_apply.comp.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2018-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  *
16 |  * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
17 |  * SPDX-License-Identifier: Apache-2.0
18 |  */
19 | 
20 | #version 460
21 | #extension GL_GOOGLE_include_directive : enable
22 | #extension GL_EXT_control_flow_attributes : require
23 | #extension GL_EXT_shader_image_load_formatted : require
24 | 
25 | #include "hbao.h"
26 | 
27 | layout(binding=NVHBAO_MAIN_IMG_OUT)   uniform image2D   imgOut;
28 | layout(binding=NVHBAO_MAIN_TEX_BLUR)  uniform sampler2D texSource;
29 | 
30 | #include "hbao_blur.glsl"
31 | 
32 | //-------------------------------------------------------------------------
33 | 
34 | 
35 | void main()
36 | {
37 |   ivec2 pixel;
38 |   vec2  pixelUV;
39 |   
40 |   if (setupCoordFull(pixel, pixelUV)) return;
41 |   // scale the rgb already stored in imgOut by the x channel of the blur result; alpha is written as 1
42 |   float ao  = BlurRun(pixelUV).x;
43 |   vec3  rgb = imageLoad(imgOut, pixel).xyz;
44 |   imageStore(imgOut, pixel, vec4(rgb * ao, 1));
45 | }
46 | 


--------------------------------------------------------------------------------
/shaders/hbao_calc.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2018-2023, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  *
 16 |  * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
 17 |  * SPDX-License-Identifier: Apache-2.0
 18 |  */
 19 | 
 20 | /* 
 21 | Based on DeinterleavedTexturing sample by Louis Bavoil
 22 | https://github.com/NVIDIAGameWorks/D3DSamples/tree/master/samples/DeinterleavedTexturing
 23 | 
 24 | */
 25 | 
 26 | #version 460
 27 | #extension GL_GOOGLE_include_directive : enable
 28 | #extension GL_EXT_control_flow_attributes : require
 29 | 
 30 | #include "hbao.h"
 31 | 
 32 | layout(push_constant) uniform pushData {
 33 |   NVHBAOMainPush  push;  // push.layer selects the depth array layer, offset and jitter used by this dispatch
 34 | };
 35 | 
 36 | #define M_PI 3.14159265f
 37 | 
 38 | // tweakables
 39 | const float  NUM_STEPS = 12;  // samples taken along each direction in ComputeCoarseAO
 40 | const float  NUM_DIRECTIONS = NVHBAO_NUM_DIRECTIONS; // texRandom/g_Jitter initialization depends on this
 41 | 
 42 | layout(binding=NVHBAO_MAIN_TEX_DEPTHARRAY)   uniform sampler2DArray texLinearDepth;
 43 | layout(binding=NVHBAO_MAIN_TEX_VIEWNORMAL)   uniform sampler2D      texViewNormal;
 44 | 
 45 | 
 46 | #if NVHBAO_SKIP_INTERPASS
 47 |   #if NVHBAO_BLUR
 48 |     layout(binding=NVHBAO_MAIN_IMG_RESULT,rg16f) uniform image2D imgOutput;
 49 |   #else
 50 |     layout(binding=NVHBAO_MAIN_IMG_RESULT,r8)    uniform image2D imgOutput;
 51 |   #endif
 52 |   void outputColor(ivec2 icoord, vec4 color)
 53 |   {
 54 |     // write straight into the 2D result image: push.layer picks the position inside the 4x4 block
 55 |     ivec2 texel  = icoord * 4 + ivec2(push.layer & 3, push.layer / 4);
 56 |     bool  inside = all(lessThan(texel, control.FullResolution));
 57 |     if (inside) imageStore(imgOutput, texel, color);
 58 |   }
 59 | #else
 60 |   #if NVHBAO_BLUR
 61 |     layout(binding=NVHBAO_MAIN_IMG_RESULTARRAY,rg16f) uniform image2DArray imgOutput;
 62 |   #else
 63 |     layout(binding=NVHBAO_MAIN_IMG_RESULTARRAY,r8)    uniform image2DArray imgOutput;
 64 |   #endif
 65 |   void outputColor(ivec2 icoord, vec4 color)
 66 |   {
 67 |     imageStore(imgOutput, ivec3(icoord.x, icoord.y, push.layer), color);  // array layer push.layer of the result array
 68 |   }
 69 | #endif
 70 | 
 71 | 
 72 | vec2 g_Float2Offset = control.float2Offsets[push.layer].xy;  // added to quarter coord * 4 in main(); presumably this layer's offset inside a 4x4 block
 73 | vec4 g_Jitter       = control.jitters[push.layer];  // per-layer jitter: ComputeCoarseAO rotates directions by xy and offsets the first step by z
 74 | 
 75 | vec3 getQuarterCoord(vec2 UV){  // array-texture coordinate: UV plus push.layer as the layer
 76 |   return vec3(UV,float(push.layer));
 77 | }
 78 | 
 79 | 
 80 | //----------------------------------------------------------------------------------
 81 | 
 82 | vec3 UVToView(vec2 uv, float eye_z)  // builds a view-space position from a uv and its view depth
 83 | {
 84 |   return vec3((uv * control.projInfo.xy + control.projInfo.zw) * (control.projOrtho != 0 ? 1. : eye_z), eye_z);  // xy is scaled by eye_z only when projOrtho == 0
 85 | }
 86 | 
 87 | vec3 FetchQuarterResViewPos(vec2 UV)
 88 | {
 89 |   // depth read from mip 0 of the current layer, then turned into a view-space position
 90 |   return UVToView(UV, textureLod(texLinearDepth, getQuarterCoord(UV), 0).x);
 91 | }
 92 | 
 93 | //----------------------------------------------------------------------------------
 94 | float Falloff(float DistanceSquare)
 95 | {
 96 |   // linear in the squared distance: 1 at distance zero, changing at rate NegInvR2 (a single mad)
 97 |   return 1.0 + control.NegInvR2 * DistanceSquare;
 98 | }
 99 | 
100 | //----------------------------------------------------------------------------------
101 | // P = view-space position at the kernel center
102 | // N = view-space normal at the kernel center
103 | // S = view-space position of the current sample
104 | //----------------------------------------------------------------------------------
105 | float ComputeAO(vec3 P, vec3 N, vec3 S)
106 | {
107 |   vec3  toSample = S - P;
108 |   float distSq   = dot(toSample, toSample);
109 |   // cosine of the angle between the normal and the direction towards the sample
110 |   float cosine   = dot(N, toSample) / sqrt(distSq);
111 |   // both factors are saturated with clamp(x,0,1) rather than max(x,0.f), noted as faster on Kepler
112 |   return clamp(cosine - control.NDotVBias,0,1) * clamp(Falloff(distSq),0,1);
113 | }
114 | 
115 | //----------------------------------------------------------------------------------
116 | vec2 RotateDirection(vec2 Dir, vec2 CosSin)  // 2D rotation of Dir; CosSin is used as (cos, sin) of the rotation angle
117 | {
118 |   return vec2(Dir.x*CosSin.x - Dir.y*CosSin.y,
119 |               Dir.x*CosSin.y + Dir.y*CosSin.x);
120 | }
121 | 
122 | //----------------------------------------------------------------------------------
123 | vec4 GetJitter()
124 | {
125 |   // g_Jitter is initialized from control.jitters[push.layer], so every invocation using that layer gets the same vector
126 |   return g_Jitter;
127 | }
128 | 
129 | //----------------------------------------------------------------------------------
130 | float ComputeCoarseAO(vec2 FullResUV, float RadiusPixels, vec4 Rand, vec3 ViewPosition, vec3 ViewNormal)  // returns a value clamped to [0,1]; 1 means no occlusion was accumulated
131 | {
132 |   RadiusPixels /= 4.0;  // presumably converts the radius to quarter-resolution pixels - TODO confirm
133 | 
134 |   // Divide by NUM_STEPS+1 so that the farthest samples are not fully attenuated
135 |   float StepSizePixels = RadiusPixels / (NUM_STEPS + 1);
136 | 
137 |   const float Alpha = 2.0 * M_PI / NUM_DIRECTIONS;  // angle between two consecutive directions
138 |   float AO = 0;
139 | 
140 |   [[unroll]]
141 |   for (float DirectionIndex = 0; DirectionIndex < NUM_DIRECTIONS; ++DirectionIndex)
142 |   {
143 |     float Angle = Alpha * DirectionIndex;
144 | 
145 |     // 2D direction for this index, rotated by the jitter (Rand.xy is used as cos/sin)
146 |     vec2 Direction = RotateDirection(vec2(cos(Angle), sin(Angle)), Rand.xy);
147 | 
148 |     // Jitter starting sample within the first step
149 |     float RayPixels = (Rand.z * StepSizePixels + 1.0);
150 | 
151 |     for (float StepIndex = 0; StepIndex < NUM_STEPS; ++StepIndex)
152 |     {
153 |       vec2 SnappedUV = round(RayPixels * Direction) * control.InvQuarterResolution + FullResUV;  // offset rounded to whole quarter-resolution texels
154 |       vec3 S = FetchQuarterResViewPos(SnappedUV);
155 | 
156 |       RayPixels += StepSizePixels;
157 | 
158 |       AO += ComputeAO(ViewPosition, ViewNormal, S);
159 |     }
160 |   }
161 | 
162 |   AO *= control.AOMultiplier / (NUM_DIRECTIONS * NUM_STEPS);  // average over all samples, scaled by AOMultiplier
163 |   return clamp(1.0 - AO * 2.0,0,1);  // accumulated occlusion is doubled and inverted before clamping
164 | }
165 | 
166 | //----------------------------------------------------------------------------------
167 | void main()
168 | {
169 |   ivec2 quarterCoord;
170 |   vec2  quarterUV;
171 |   
172 |   if (setupCoordQuarter(quarterCoord, quarterUV)) return;
173 |   
174 |   // position of this layer's pixel: quarter coordinate * 4 plus the per-layer offset
175 |   vec2 fullPixel = vec2(quarterCoord.xy) * 4.0 + g_Float2Offset;
176 |   vec2 uv        = fullPixel * (control.InvQuarterResolution / 4.0);
177 | 
178 |   vec3 viewPos    = FetchQuarterResViewPos(uv);
179 |   // the stored normal is expanded with * 2 - 1 (presumably from [0,1] to [-1,1]) and negated
180 |   vec3 viewNormal = -(texelFetch( texViewNormal, ivec2(fullPixel), 0).xyz * 2.0 - 1.0);
181 | 
182 |   // AO radius in pixels: RadiusToScreen divided by the view depth, or by 1 for an orthographic projection
183 |   float radiusPixels = control.RadiusToScreen / (control.projOrtho != 0 ? 1.0 : viewPos.z);
184 | 
185 |   // the jitter vector of the current layer rotates and offsets the sampling pattern
186 |   float ao    = ComputeCoarseAO(uv, radiusPixels, GetJitter(), viewPos, viewNormal);
187 |   float aoOut = pow(ao, control.PowExponent);
188 | 
189 | #if NVHBAO_BLUR
190 |   outputColor(quarterCoord, vec4(aoOut, viewPos.z, 0, 0));
191 | #else
192 |   outputColor(quarterCoord, vec4(aoOut));
193 | #endif
194 |   
195 | }
196 | 


--------------------------------------------------------------------------------
/shaders/hbao_deinterleave.comp.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2018-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  *
16 |  * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
17 |  * SPDX-License-Identifier: Apache-2.0
18 |  */
19 | 
20 | #version 460
21 | #extension GL_GOOGLE_include_directive : enable
22 | #extension GL_EXT_control_flow_attributes : require
23 | 
24 | #include "hbao.h"
25 | 
26 | layout(binding=NVHBAO_MAIN_TEX_LINDEPTH)         uniform sampler2D      texLinearDepth;
27 | layout(binding=NVHBAO_MAIN_IMG_DEPTHARRAY,r32f)  uniform image2DArray   imgDepthArray;
28 | 
29 | //----------------------------------------------------------------------------------
30 | 
31 | void outputColor(ivec2 intCoord, int layer, float value)  // stores value in the first channel of imgDepthArray at (intCoord, layer)
32 | {
33 |   imageStore(imgDepthArray, ivec3(intCoord,layer), vec4(value,0,0,0));
34 | }
35 | 
36 | void main()
37 | {
38 |   ivec2 intCoord;
39 |   vec2  texCoord;
40 |   
41 |   if (setupCoordQuarter(intCoord, texCoord)) return;
42 | 
43 |   vec2 uv = vec2(intCoord) * 4.0 + 0.5;  // center of the first texel of this invocation's 4x4 block, in full-resolution texels
44 |   uv *= control.InvFullResolution;  // normalized texture coordinate for the gathers below
45 |   
46 |   vec4 S0 = textureGather      (texLinearDepth, uv, 0);  // four 2x2 gathers at offsets (0,0), (2,0), (0,2), (2,2) cover the 4x4 block
47 |   vec4 S1 = textureGatherOffset(texLinearDepth, uv, ivec2(2,0), 0);
48 |   vec4 S2 = textureGatherOffset(texLinearDepth, uv, ivec2(0,2), 0);
49 |   vec4 S3 = textureGatherOffset(texLinearDepth, uv, ivec2(2,2), 0);
50 |  
51 |   outputColor(intCoord, 0, S0.w);  // with textureGather's component order (w, z: first row; x, y: second row) the layer is x + 4 * y inside the block
52 |   outputColor(intCoord, 1, S0.z);
53 |   outputColor(intCoord, 2, S1.w);
54 |   outputColor(intCoord, 3, S1.z);
55 |   outputColor(intCoord, 4, S0.x);
56 |   outputColor(intCoord, 5, S0.y);
57 |   outputColor(intCoord, 6, S1.x);
58 |   outputColor(intCoord, 7, S1.y);
59 |   
60 |   outputColor(intCoord, 0 + 8, S2.w);  // layers 8..15: the two lower rows of the block, taken from S2 / S3
61 |   outputColor(intCoord, 1 + 8, S2.z);
62 |   outputColor(intCoord, 2 + 8, S3.w);
63 |   outputColor(intCoord, 3 + 8, S3.z);
64 |   outputColor(intCoord, 4 + 8, S2.x);
65 |   outputColor(intCoord, 5 + 8, S2.y);
66 |   outputColor(intCoord, 6 + 8, S3.x);
67 |   outputColor(intCoord, 7 + 8, S3.y);
68 | }
69 | 


--------------------------------------------------------------------------------
/shaders/hbao_depthlinearize.comp.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2018-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  *
16 |  * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
17 |  * SPDX-License-Identifier: Apache-2.0
18 |  */
19 | 
20 | #version 460
21 | #extension GL_GOOGLE_include_directive : enable
22 | #extension GL_EXT_control_flow_attributes : require
23 | 
24 | #include "hbao.h"
25 | 
// Depth texture sampled by main().
layout(binding = NVHBAO_MAIN_TEX_DEPTH) uniform sampler2D inputTexture;
// Full-resolution linear depth output.
layout(binding = NVHBAO_MAIN_IMG_LINDEPTH, r32f) uniform image2D imgLinearDepth;
#if NVHBAO_SKIP_INTERPASS
// Layered linear depth output, written directly by main() when this option is on.
layout(binding = NVHBAO_MAIN_IMG_DEPTHARRAY, r32f) uniform image2DArray imgLinearDepthArray;
#endif
31 | 
32 | 
// Converts a depth-buffer value `d` into a linear depth value.
// The active path unprojects (0, 0, d, 1) with control.InvProjMatrix and
// returns z / w; `clipInfo` is only consumed by the disabled analytic path.
float reconstructCSZ(float d, vec4 clipInfo)
{
#if 1
  vec4 unproj = control.InvProjMatrix * vec4(0, 0, d, 1);
  return unproj.z / unproj.w;
#else
  // clipInfo = z_n * z_f,  z_n - z_f,  z_f, perspective = 1 : 0
  return (clipInfo[3] != 0) ? (clipInfo[0] / (clipInfo[1] * d + clipInfo[2]))
                            : (clipInfo[1] + clipInfo[2] - d * clipInfo[1]);
#endif
}
50 | /*
51 |     if (in_perspective == 1.0) // perspective
52 |     {
53 |         ze = (zNear * zFar) / (zFar - zb * (zFar - zNear)); 
54 |     }
55 |     else // orthographic proj 
56 |     {
57 |         ze  = zNear + zb  * (zFar - zNear);
58 |     }
59 | */
// Linearizes one depth texel and stores it in imgLinearDepth. With
// NVHBAO_SKIP_INTERPASS the value is additionally stored in the layered
// image, in layer (y & 3) * 4 + (x & 3) at the quarter-resolution position.
void main()
{
  ivec2 intCoord;
  vec2  texCoord;

  // setupCoordFull comes from hbao.h; a true result means "skip this invocation".
  if (setupCoordFull(intCoord, texCoord))
  {
    return;
  }

  float linDepth = reconstructCSZ(textureLod(inputTexture, texCoord.xy, 0).x, control.projReconstruct);
  vec4  texel    = vec4(linDepth, 0, 0, 0);
  imageStore(imgLinearDepth, intCoord, texel);
#if NVHBAO_SKIP_INTERPASS
  ivec2 tileOffset = intCoord & 3;
  int   slice      = tileOffset.y * 4 + tileOffset.x;
  imageStore(imgLinearDepthArray, ivec3(intCoord >> 2, slice), texel);
#endif
}
78 | 


--------------------------------------------------------------------------------
/shaders/hbao_reinterleave.comp.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2018-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  *
16 |  * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
17 |  * SPDX-License-Identifier: Apache-2.0
18 |  */
19 | 
20 | #version 460
21 | #extension GL_GOOGLE_include_directive : enable
22 | #extension GL_EXT_control_flow_attributes : require
23 | 
24 | #include "hbao.h"
25 | 
// Layered results that main() fetches from.
layout(binding = NVHBAO_MAIN_TEX_RESULTARRAY) uniform sampler2DArray texResultsArray;
// Full-resolution output; two channels when the blur variant is compiled,
// otherwise a single 8-bit channel.
#if NVHBAO_BLUR
layout(binding = NVHBAO_MAIN_IMG_RESULT, rg16f) uniform image2D imgResult;
#else
layout(binding = NVHBAO_MAIN_IMG_RESULT, r8) uniform image2D imgResult;
#endif
32 | 
33 | //----------------------------------------------------------------------------------
34 | 
// Reinterleaves the layered results back into one full-resolution image:
// the texel at (x, y) is fetched from layer (y & 3) * 4 + (x & 3) at
// position (x >> 2, y >> 2).
void main()
{
  ivec2 intCoord;
  vec2  texCoord;

  // setupCoordFull comes from hbao.h; a true result means "skip this invocation".
  if (setupCoordFull(intCoord, texCoord))
  {
    return;
  }

  ivec2 tileOffset = intCoord & 3;
  ivec3 source     = ivec3(intCoord >> 2, tileOffset.y * 4 + tileOffset.x);
  vec4  result     = texelFetch(texResultsArray, source, 0);

#if NVHBAO_BLUR
  // keep both stored channels
  imageStore(imgResult, intCoord, vec4(result.xy, 0, 0));
#else
  imageStore(imgResult, intCoord, vec4(result.x));
#endif
}
52 | 


--------------------------------------------------------------------------------
/shaders/hbao_viewnormal.comp.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2018-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  *
16 |  * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
17 |  * SPDX-License-Identifier: Apache-2.0
18 |  */
19 | 
20 | #version 460
21 | #extension GL_GOOGLE_include_directive : enable
22 | #extension GL_EXT_control_flow_attributes : require
23 | 
24 | #include "hbao.h"
25 | 
// Linear depth input sampled by FetchViewPos().
layout(binding = NVHBAO_MAIN_TEX_LINDEPTH) uniform sampler2D texLinearDepth;
// Output image for the reconstructed normal, stored remapped to [0, 1].
layout(binding = NVHBAO_MAIN_IMG_VIEWNORMAL, rgba8) uniform image2D imgViewNormal;
28 | 
29 | //----------------------------------------------------------------------------------
30 | 
// Builds a view-space position from a texture coordinate and a depth value.
// xy is uv * projInfo.xy + projInfo.zw, scaled by eye_z unless
// control.projOrtho is set; z is eye_z itself.
vec3 UVToView(vec2 uv, float eye_z)
{
  vec2 planar = uv * control.projInfo.xy + control.projInfo.zw;
  if (control.projOrtho == 0)
  {
    // the xy extent grows with depth only when projOrtho is not set
    planar = planar * eye_z;
  }
  return vec3(planar, eye_z);
}
35 | 
// Samples the linear depth texture at UV (lod 0) and returns the matching
// view-space position.
vec3 FetchViewPos(vec2 UV)
{
  return UVToView(UV, textureLod(texLinearDepth, UV, 0).x);
}
41 | 
// Returns whichever of the two one-sided differences (Pr - P) and (P - Pl)
// is shorter; on equal length the second one is returned.
vec3 MinDiff(vec3 P, vec3 Pr, vec3 Pl)
{
  vec3 forward  = Pr - P;
  vec3 backward = P - Pl;
  if (dot(forward, forward) < dot(backward, backward))
  {
    return forward;
  }
  return backward;
}
48 | 
// Estimates the normal at view-space position P (sampled at UV) from the
// four neighbours one texel away, as the normalized cross product of the
// shorter horizontal and the shorter vertical difference.
vec3 ReconstructNormal(vec2 UV, vec3 P)
{
  float stepX = control.InvFullResolution.x;
  float stepY = control.InvFullResolution.y;

  vec3 right  = FetchViewPos(UV + vec2(stepX, 0));
  vec3 left   = FetchViewPos(UV + vec2(-stepX, 0));
  vec3 top    = FetchViewPos(UV + vec2(0, stepY));
  vec3 bottom = FetchViewPos(UV + vec2(0, -stepY));

  vec3 horizontal = MinDiff(P, right, left);
  vec3 vertical   = MinDiff(P, top, bottom);
  return normalize(cross(horizontal, vertical));
}
57 | 
58 | //----------------------------------------------------------------------------------
59 | 
// Reconstructs a normal from linear depth for one texel and stores it,
// remapped from [-1, 1] to [0, 1], in imgViewNormal (alpha is written as 0).
void main()
{
  ivec2 intCoord;
  vec2  texCoord;

  // setupCoordFull comes from hbao.h; a true result means "skip this invocation".
  if (setupCoordFull(intCoord, texCoord))
  {
    return;
  }

  vec3 viewPos    = FetchViewPos(texCoord);
  vec3 viewNormal = ReconstructNormal(texCoord, viewPos);

  imageStore(imgViewNormal, intCoord, vec4(viewNormal * 0.5 + 0.5, 0));
}
71 | 


--------------------------------------------------------------------------------
/shaders/nvhiz-update.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  *
 16 |  * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION
 17 |  * SPDX-License-Identifier: Apache-2.0
 18 |  */
 19 | 
 20 | #version 460
 21 | 
// Compile-time configuration. Every option can be predefined by whoever
// compiles this shader; the values below are only fallbacks.

// Array size of the imgLevels image binding.
#ifndef NV_HIZ_MAX_LEVELS
#define NV_HIZ_MAX_LEVELS   16
#endif

// Number of samples iterated per texel in the first pass; 0 selects the
// non-multisampled read path.
#ifndef NV_HIZ_MSAA_SAMPLES
#define NV_HIZ_MSAA_SAMPLES 0
#endif

// Non-zero: read from texDepth (first pass); zero: read from texNear.
#ifndef NV_HIZ_IS_FIRST
#define NV_HIZ_IS_FIRST 1
#endif

// Image format qualifier used for imgNear and imgLevels.
#ifndef NV_HIZ_FORMAT
#define NV_HIZ_FORMAT r32f
#endif

// Non-zero: the first pass also writes the minOp reduction to imgNear.
#ifndef NV_HIZ_OUTPUT_NEAR
#define NV_HIZ_OUTPUT_NEAR 1
#endif

// How many reduction levels one dispatch can produce (main() handles up to 3);
// values above 1 require the subgroup shuffle extensions below.
#ifndef NV_HIZ_LEVELS 
#define NV_HIZ_LEVELS 3
#endif

// Which of the levels produced by the first pass (0, 1 or 2) is stored to imgNear.
#ifndef NV_HIZ_NEAR_LEVEL
#define NV_HIZ_NEAR_LEVEL 0
#endif

// In the first pass, levels with an index below this value are not stored to imgLevels.
#ifndef NV_HIZ_FAR_LEVEL
#define NV_HIZ_FAR_LEVEL 0
#endif

// Non-zero: swaps the meaning of minOp / maxOp (see below).
#ifndef NV_HIZ_REVERSED_Z
#define NV_HIZ_REVERSED_Z 0
#endif

// Non-zero: all resources are array textures/images indexed by the `layer` push constant.
#ifndef NV_HIZ_USE_STEREO 
#define NV_HIZ_USE_STEREO 0
#endif

// Levels beyond the first are reduced across invocations with subgroupShuffle.
#if NV_HIZ_LEVELS > 1
  #extension GL_KHR_shader_subgroup_basic : require
  #extension GL_KHR_shader_subgroup_shuffle : require
#endif

// minOp / maxOp are the reductions used for the "near" and "far" outputs;
// with reversed Z the underlying min / max are exchanged.
#if NV_HIZ_REVERSED_Z
  #define minOp max
  #define maxOp min
#else
  #define minOp min
  #define maxOp max
#endif
 74 | 
// 32 x 2 invocations per workgroup; main() maps them onto an 8x8 block of output texels.
layout(local_size_x=32,local_size_y=2) in;

// Per-dispatch parameters.
layout(push_constant) uniform passUniforms {
  // keep in sync with nvhiz_vk.cpp
  ivec4 srcSize;      // only .zw is read here: the upper clamp for source texel coordinates
  int   writeLod;     // index into imgLevels of the first level written
  int   startLod;     // lod that is read from the source texture (non-MSAA path)
  int   layer;        // array layer, used when NV_HIZ_USE_STEREO is set
  int   _pad0;
  bvec4  levelActive; // .y / .z gate the second / third level produced by this dispatch
};

// Resource types and coordinate helper: array variants for stereo,
// plain 2D otherwise (IACCESS then drops the layer argument).
#if NV_HIZ_USE_STEREO
  #define samplerTypeMS sampler2DMSArray
  #define samplerType   sampler2DArray
  #define imageType     image2DArray
  #define IACCESS(v,l)  ivec3(v,l)
#else
  #define samplerTypeMS sampler2DMS
  #define samplerType   sampler2D
  #define imageType     image2D
  #define IACCESS(v,l)  v
#endif

// binding 0: depth source, multisampled only for the first pass with MSAA enabled
#if NV_HIZ_IS_FIRST && NV_HIZ_MSAA_SAMPLES
  layout(binding=0) uniform samplerTypeMS texDepth;
#else
  layout(binding=0) uniform samplerType   texDepth;
#endif
  // binding 1: source for non-first passes
  layout(binding=1) uniform samplerType   texNear;
  
  // binding 2: target of the minOp reduction; binding 3: targets of the maxOp reduction
  layout(binding=2,NV_HIZ_FORMAT) uniform imageType imgNear;
  layout(binding=3,NV_HIZ_FORMAT) uniform imageType imgLevels[NV_HIZ_MAX_LEVELS];
108 | 
// Produces up to NV_HIZ_LEVELS reduction levels in one dispatch.
// Each invocation reduces a 2x2 footprint of the source to one texel
// (level writeLod + 0); the following levels are reduced across
// invocations with subgroupShuffle.
// NOTE(review): the shuffle pattern presumes that gl_SubgroupInvocationID
// matches gl_LocalInvocationID.x (i.e. 32-wide subgroups laid out along x)
// — confirm against the host-side pipeline setup.
void main()
{
  // Each workgroup covers an 8x8 block of output texels.
  ivec2 base = ivec2(gl_WorkGroupID.xy) * 8;
  // x in [0,15] -> 2 columns x 8 rows; x in [16,31] -> the next 2 columns;
  // local y selects the left or right 4-texel-wide half of the block.
  // Four consecutive x values therefore form one 2x2 quad.
  ivec2 subset = ivec2(int(gl_LocalInvocationID.x) & 1, int(gl_LocalInvocationID.x) / 2);
  subset += gl_LocalInvocationID.x >= 16 ? ivec2(2,-8) : ivec2(0,0);
  subset += ivec2(gl_LocalInvocationID.y * 4,0);
  
#if NV_HIZ_LEVELS > 1
  uint laneID = gl_SubgroupInvocationID;
#endif

  //ivec2 outcoord = base + 7 - subset;
  ivec2 outcoord = base + subset;
  // top-left source texel of this invocation's 2x2 footprint
  ivec2 coord = outcoord * 2;
  
  float flayer = float(layer);
  
#if NV_HIZ_IS_FIRST && NV_HIZ_MSAA_SAMPLES
  // Multisampled first pass: start from the neutral elements of minOp / maxOp
  // and fold in every sample of the four (clamped) source texels.
  #if NV_HIZ_REVERSED_Z
  float zMin = 0;
  float zMax = 1;
  #else
  float zMin = 1;
  float zMax = 0;
  #endif
  for (int i = 0; i < NV_HIZ_MSAA_SAMPLES; i++){
    vec4 zRead = vec4(texelFetch(texDepth, IACCESS(min(coord + ivec2(0,0), srcSize.zw), layer), i).r,
                      texelFetch(texDepth, IACCESS(min(coord + ivec2(1,0), srcSize.zw), layer), i).r,
                      texelFetch(texDepth, IACCESS(min(coord + ivec2(0,1), srcSize.zw), layer), i).r,
                      texelFetch(texDepth, IACCESS(min(coord + ivec2(1,1), srcSize.zw), layer), i).r);
    zMin = minOp(zMin, minOp(minOp(minOp(zRead.x, zRead.y),zRead.z),zRead.w));
    zMax = maxOp(zMax, maxOp(maxOp(maxOp(zRead.x, zRead.y),zRead.z),zRead.w));
  }
#else
  #if NV_HIZ_IS_FIRST
    #define texRead texDepth
  #else
    #define texRead texNear
  #endif

  // Only the base coordinate is clamped; the +1 neighbours are addressed
  // through the texelFetchOffset offsets.
  coord = min(coord, srcSize.zw);
  vec4 zRead = vec4(texelFetchOffset(texRead, IACCESS(coord, layer), startLod, ivec2(0,0)).r,
                    texelFetchOffset(texRead, IACCESS(coord, layer), startLod, ivec2(1,0)).r,
                    texelFetchOffset(texRead, IACCESS(coord, layer), startLod, ivec2(0,1)).r,
                    texelFetchOffset(texRead, IACCESS(coord, layer), startLod, ivec2(1,1)).r);
  
  float zMax = maxOp(maxOp(maxOp(zRead.x, zRead.y),zRead.z),zRead.w);
  float zMin = minOp(minOp(minOp(zRead.x, zRead.y),zRead.z),zRead.w);
#endif

  //zMax = float(gl_ThreadInWarpNV) / 32.0;
  // Level 0 of this dispatch (skipped in the first pass when NV_HIZ_FAR_LEVEL > 0).
#if !(NV_HIZ_IS_FIRST && NV_HIZ_FAR_LEVEL > 0)
  imageStore(imgLevels[writeLod + 0], IACCESS(outcoord,layer), vec4(zMax));
#endif
  
#if NV_HIZ_IS_FIRST && NV_HIZ_OUTPUT_NEAR && NV_HIZ_NEAR_LEVEL == 0
  imageStore(imgNear, IACCESS(outcoord,layer), vec4(zMin));
#endif

#if NV_HIZ_LEVELS > 1
  // Gather the values of the next three lanes; only lanes with
  // (laneID & 3) == 0 consume the result, so they see their whole 2x2 quad.
  vec4 zRead0 = vec4( zMax,
                      subgroupShuffle(zMax, laneID + 1),
                      subgroupShuffle(zMax, laneID + 2),
                      subgroupShuffle(zMax, laneID + 3));
  

#if NV_HIZ_IS_FIRST && NV_HIZ_OUTPUT_NEAR && NV_HIZ_NEAR_LEVEL >= 1
  vec4 zRead1 = vec4( zMin,
                      subgroupShuffle(zMin, laneID + 1),
                      subgroupShuffle(zMin, laneID + 2),
                      subgroupShuffle(zMin, laneID + 3));
#endif

  // Second level: one lane per quad writes the reduced value.
  if ((levelActive.y || levelActive.z) && (laneID & 3) == 0)
  {
    outcoord /= 2;
    zMax = maxOp(maxOp(maxOp(zRead0.x, zRead0.y),zRead0.z),zRead0.w);
  #if !(NV_HIZ_IS_FIRST && NV_HIZ_FAR_LEVEL > 1)
    imageStore(imgLevels[writeLod + 1], IACCESS(outcoord, layer), vec4(zMax));
  #endif
  #if NV_HIZ_IS_FIRST && NV_HIZ_OUTPUT_NEAR && NV_HIZ_NEAR_LEVEL >= 1
    zMin = minOp(minOp(minOp(zRead1.x, zRead1.y),zRead1.z),zRead1.w);
    #if NV_HIZ_NEAR_LEVEL == 1
    imageStore(imgNear, IACCESS(outcoord, layer), vec4(zMin));
    #endif
  #endif
    
  #if NV_HIZ_LEVELS > 2
    // Third level: lanes +4, +16 and +20 hold the quads below, to the right
    // and diagonally below-right (see the `subset` mapping above); lanes 0
    // and 8 each reduce one 4x4 block.
    if (levelActive.z) {
      outcoord /= 2;
      zRead0 = vec4(  zMax,
                      subgroupShuffle(zMax, laneID + 4),
                      subgroupShuffle(zMax, laneID + 16),
                      subgroupShuffle(zMax, laneID + 20));
    #if NV_HIZ_IS_FIRST && NV_HIZ_OUTPUT_NEAR && NV_HIZ_NEAR_LEVEL == 2
      zRead1 = vec4(  zMin,
                      subgroupShuffle(zMin, laneID + 4),
                      subgroupShuffle(zMin, laneID + 16),
                      subgroupShuffle(zMin, laneID + 20));
    #endif
      if ((laneID == 0) || (laneID == 8)) {
        zMax = maxOp(maxOp(maxOp(zRead0.x, zRead0.y),zRead0.z),zRead0.w);
        imageStore(imgLevels[writeLod + 2], IACCESS(outcoord, layer), vec4(zMax));
      #if NV_HIZ_IS_FIRST && NV_HIZ_OUTPUT_NEAR && NV_HIZ_NEAR_LEVEL == 2
        zMin = minOp(minOp(minOp(zRead1.x, zRead1.y),zRead1.z),zRead1.w);
        imageStore(imgNear, IACCESS(outcoord, layer), vec4(zMin));
      #endif
      }
    }
  #endif
  }
#endif
}
222 | 


--------------------------------------------------------------------------------
/shaders/octant_encoding.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2022-2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  *
 16 |  * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION
 17 |  * SPDX-License-Identifier: Apache-2.0
 18 |  */
 19 | 
 20 | #ifndef _OCTANT_ENCODING_H_
 21 | #define _OCTANT_ENCODING_H_
 22 | 
 23 | #ifdef __cplusplus
 24 | namespace shaderio {
 25 | #define OCT_INLINE inline
 26 | #define OCT_FLOOR glm::floor
 27 | #define OCT_CLAMP glm::clamp
 28 | #define OCT_ABS glm::abs
 29 | OCT_INLINE uint32_t pack_oct32(vec2 v)
 30 | {
 31 |   union
 32 |   {
 33 |     int16_t  snorm[2];
 34 |     uint32_t packed;
 35 |   };
 36 |   snorm[0] = static_cast<int16_t>(glm::clamp(int32_t(std::round(v.x * float(0x7FFF))), -0x7FFF, 0x7FFF));
 37 |   snorm[1] = static_cast<int16_t>(glm::clamp(int32_t(std::round(v.y * float(0x7FFF))), -0x7FFF, 0x7FFF));
 38 |   return packed;
 39 | }
 40 | OCT_INLINE vec2 unpack_oct32(uint32_t v)
 41 | {
 42 |   union
 43 |   {
 44 |     int16_t  snorm[2];
 45 |     uint32_t packed;
 46 |   };
 47 |   packed = v;
 48 |   return vec2(float(snorm[0]) / float(0x7FFF), float(snorm[1]) / float(0x7FFF));
 49 | }
 50 | #else
 51 | #define OCT_INLINE
 52 | #define OCT_FLOOR floor
 53 | #define OCT_CLAMP clamp
 54 | #define OCT_ABS abs
 55 | uint pack_oct32(vec2 v)
 56 | {
 57 |   return packSnorm2x16(v);
 58 | }
 59 | vec2 unpack_oct32(uint v)
 60 | {
 61 |   return unpackSnorm2x16(v);
 62 | }
 63 | #endif
 64 | 
 65 | // oct functions from http://jcgt.org/published/0003/02/01/paper.pdf
 66 | OCT_INLINE vec2 oct_signNotZero(vec2 v)
 67 | {
 68 |   return vec2((v.x >= 0.0f) ? +1.0f : -1.0f, (v.y >= 0.0f) ? +1.0 : -1.0f);
 69 | }
 70 | OCT_INLINE vec3 oct_to_vec(vec2 e)
 71 | {
 72 |   vec3 v = vec3(e.x, e.y, 1.0f - OCT_ABS(e.x) - OCT_ABS(e.y));
 73 |   if(v.z < 0.0f)
 74 |   {
 75 |     vec2 os = oct_signNotZero(e);
 76 |     v.x     = (1.0f - OCT_ABS(e.y)) * os.x;
 77 |     v.y     = (1.0f - OCT_ABS(e.x)) * os.y;
 78 |   }
 79 |   return normalize(v);
 80 | }
 81 | 
 82 | OCT_INLINE vec3 oct32_to_vec(uint32_t v)
 83 | {
 84 |   return oct_to_vec(unpack_oct32(v));
 85 | }
 86 | 
 87 | OCT_INLINE vec2 vec_to_oct(vec3 v)
 88 | {
 89 |   // Project the sphere onto the octahedron, and then onto the xy plane
 90 |   vec2 p = vec2(v.x, v.y) * (1.0f / (OCT_ABS(v.x) + OCT_ABS(v.y) + OCT_ABS(v.z)));
 91 |   // Reflect the folds of the lower hemisphere over the diagonals
 92 |   return (v.z <= 0.0f) ? (vec2(1.0f - OCT_ABS(p.y), 1.0f - OCT_ABS(p.x)) * oct_signNotZero(p)) : p;
 93 | }
 94 | 
 95 | OCT_INLINE vec2 vec_to_oct_precise(vec3 v, int bits)
 96 | {
 97 |   vec2 s = vec_to_oct(v);  // Remap to the square
 98 |                            // Each snorm's max value interpreted as an integer,
 99 |                            // e.g., 127.0 for snorm8
100 |   float M = float(1 << ((bits / 2) - 1)) - 1.0f;
101 |   // Remap components to snorm(n/2) precision...with floor instead
102 |   // of round (see equation 1)
103 |   s                        = OCT_FLOOR(OCT_CLAMP(s, -1.0f, +1.0f) * M) * (1.0f / M);
104 |   vec2  bestRepresentation = s;
105 |   float highestCosine      = dot(oct_to_vec(s), v);
106 |   // Test all combinations of floor and ceil and keep the best.
107 |   // Note that at +/- 1, this will exit the square... but that
108 |   // will be a worse encoding and never win.
109 |   for(int i = 0; i <= 1; ++i)
110 |   {
111 |     for(int j = 0; j <= 1; ++j)
112 |     {
113 |       // This branch will be evaluated at compile time
114 |       if((i != 0) || (j != 0))
115 |       {
116 |         // Offset the bit pattern (which is stored in floating
117 |         // point!) to effectively change the rounding mode
118 |         // (when i or j is 0: floor, when it is one: ceiling)
119 |         vec2  candidate = vec2(i, j) * (1 / M) + s;
120 |         float cosine    = dot(oct_to_vec(candidate), v);
121 |         if(cosine > highestCosine)
122 |         {
123 |           bestRepresentation = candidate;
124 |           highestCosine      = cosine;
125 |         }
126 |       }
127 |     }
128 |   }
129 |   return bestRepresentation;
130 | }
131 | 
132 | OCT_INLINE uint vec_to_oct32(vec3 v)
133 | {
134 |   return pack_oct32(vec_to_oct_precise(v, 32));
135 | }
136 | 
137 | #undef OCT_ABS
138 | #undef OCT_FLOOR
139 | #undef OCT_CLAMP
140 | #undef OCT_INLINE
141 | 
142 | #ifdef __cplusplus
143 | }
144 | #endif
145 | 
146 | #endif


--------------------------------------------------------------------------------
/shaders/render_instance_bbox.frag.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License, Version 2.0 (the "License");
 5 | * you may not use this file except in compliance with the License.
 6 | * You may obtain a copy of the License at
 7 | *
 8 | *     http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *
16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
17 | * SPDX-License-Identifier: Apache-2.0
18 | */
19 | #version 460
20 | 
21 | #extension GL_GOOGLE_include_directive : enable
22 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
23 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
24 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
25 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
26 | #extension GL_EXT_buffer_reference : enable
27 | #extension GL_EXT_buffer_reference2 : enable
28 | #extension GL_EXT_scalar_block_layout : enable
29 | 
30 | #include "shaderio.h"
31 | 
32 | ///////////////////////////////////////////////////
33 | 
34 | 
// Per-frame constants, exposed as `view`.
// NOTE(review): render_raster.frag.glsl declares this same block with `scalar`
// layout — confirm FrameConstants has identical member offsets under std140.
layout(std140, set = 0, binding = BINDINGS_FRAME_UBO) uniform frameConstantsBuffer
{
  FrameConstants view;
};
39 | 
40 | ///////////////////////////////////////////////////
41 | 
42 | #include "render_shading.glsl"
43 | 
44 | ///////////////////////////////////////////////////
45 | 
// Input from the mesh stage: the instance this box belongs to.
layout(location = 0) in Interpolants
{
  flat uint instanceID;
}
IN;

///////////////////////////////////////////////////

// Color output.
layout(location = 0, index = 0) out vec4 out_Color;
54 | 
55 | ///////////////////////////////////////////////////
56 | 
// Colors the fragment with a hash of the instance ID: the hash bytes are
// unpacked to [0, 1], remapped to [0.1, 1.0], and alpha is forced to 1.
void main()
{
  vec3 tint = unpackUnorm4x8(murmurHash(IN.instanceID)).xyz * 0.9 + 0.1;
  out_Color = vec4(tint, 1.0);
}


--------------------------------------------------------------------------------
/shaders/render_instance_bbox.mesh.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | #version 460
 21 | 
 22 | #extension GL_GOOGLE_include_directive : enable
 23 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 24 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 25 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 26 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 27 | #extension GL_EXT_buffer_reference : enable
 28 | #extension GL_EXT_buffer_reference2 : enable
 29 | #extension GL_EXT_scalar_block_layout : enable
 30 | 
 31 | #extension GL_NV_mesh_shader : require
 32 | #extension GL_EXT_control_flow_attributes: require
 33 | 
 34 | #include "shaderio.h"
 35 | 
 36 | layout(push_constant) uniform pushData
 37 | {
 38 |   uint numRenderInstances;
 39 | }
 40 | push;
 41 | 
 42 | layout(std140, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
 43 | {
 44 |   FrameConstants view;
 45 | };
 46 | 
 47 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer
 48 | {
 49 |   RenderInstance instances[];
 50 | };
 51 | 
 52 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 53 | {
 54 |   Geometry geometries[];
 55 | };
 56 | 
 57 | ////////////////////////////////////////////
 58 | 
 59 | layout(location=0) out Interpolants {
 60 |   flat uint instanceID;
 61 | } OUT[];
 62 | 
 63 | ////////////////////////////////////////////
 64 | 
 65 | #define MESH_WORKGROUP_SIZE  32
 66 | 
 67 | #define BOX_VERTICES     8
 68 | #define BOX_LINES        12
 69 | #define BOX_LINE_THREADS 4
 70 | 
 71 | layout(local_size_x=MESH_WORKGROUP_SIZE) in;
 72 | layout(max_vertices=BBOXES_PER_MESHLET * BOX_VERTICES, max_primitives=BBOXES_PER_MESHLET * BOX_LINES) out;
 73 | layout(lines) out;
 74 | 
 75 | ////////////////////////////////////////////
 76 | 
 77 | void writePrimitiveLineIndices(uint idx, uvec2 vertexIndices)
 78 | {
 79 |   gl_PrimitiveIndicesNV[idx * 2 + 0] = vertexIndices.x;
 80 |   gl_PrimitiveIndicesNV[idx * 2 + 1] = vertexIndices.y;
 81 | }
 82 | 
 83 | void main()
 84 | {
 85 |   uint baseID   = gl_WorkGroupID.x * BBOXES_PER_MESHLET;  
 86 |   uint numBoxes = min(push.numRenderInstances, baseID + BBOXES_PER_MESHLET) - baseID;
 87 |   
 88 |   if (gl_LocalInvocationID.x == 0)
 89 |   {
 90 |     gl_PrimitiveCountNV = numBoxes * BOX_LINES;
 91 |   }
 92 |   
 93 |   const uint vertexRuns = ((BBOXES_PER_MESHLET * BOX_VERTICES) + MESH_WORKGROUP_SIZE-1) / MESH_WORKGROUP_SIZE;
 94 |   
 95 |   [[unroll]]
 96 |   for (uint32_t run = 0; run < vertexRuns; run++)
 97 |   {
 98 |     uint vert   = gl_LocalInvocationID.x + run * MESH_WORKGROUP_SIZE;
 99 |     uint box    = vert / BOX_VERTICES;
100 |     uint corner = vert % BOX_VERTICES;
101 |     
102 |     uint boxLoad = min(box,numBoxes-1);
103 |     
104 |     RenderInstance instance = instances[boxLoad + baseID];
105 |     BBox bbox = geometries[instance.geometryID].bbox;
106 |     
107 |     bvec3 weight   = bvec3((corner & 1) != 0, (corner & 2) != 0, (corner & 4) != 0);
108 |     vec3 cornerPos = mix(bbox.lo, bbox.hi, weight);
109 |     
110 |     if (box < numBoxes)
111 |     {
112 |       gl_MeshVerticesNV[vert].gl_Position = view.viewProjMatrix * (instance.worldMatrix * vec4(cornerPos,1));
113 |       OUT[vert].instanceID = baseID + box;
114 |     }
115 |   }
116 |   
117 |   {
118 |     uvec2 boxIndices[4] = uvec2[4](
119 |       uvec2(0,1),uvec2(1,3),uvec2(3,2),uvec2(2,0)
120 |     );
121 |   
122 |     uint subID = gl_LocalInvocationID.x & (BOX_LINE_THREADS-1);
123 |     uint box   = gl_LocalInvocationID.x / BOX_LINE_THREADS;
124 |   
125 |     uvec2 circle = boxIndices[subID];
126 |     
127 |     if (box < numBoxes)
128 |     {  
129 |       // lower
130 |       writePrimitiveLineIndices(box * 12 + subID + 0, circle + box * BOX_VERTICES);
131 |       // upper
132 |       writePrimitiveLineIndices(box * 12 + subID + 4, circle + 4 + box * BOX_VERTICES);
133 |       // connectors
134 |       writePrimitiveLineIndices(box * 12 + subID + 8, uvec2(subID, subID + 4) + box * BOX_VERTICES);
135 |     }
136 |   }
137 | }
138 | 


--------------------------------------------------------------------------------
/shaders/render_raster.frag.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | #version 460
 21 | 
 22 | #extension GL_GOOGLE_include_directive : enable
 23 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 24 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 25 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 26 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 27 | #extension GL_EXT_buffer_reference : enable
 28 | #extension GL_EXT_buffer_reference2 : enable
 29 | #extension GL_EXT_scalar_block_layout : enable
 30 | #extension GL_EXT_shader_atomic_int64 : enable
 31 | #extension GL_EXT_fragment_shader_barycentric : enable
 32 | 
 33 | #include "shaderio.h"
 34 | 
 35 | layout(push_constant) uniform pushData  // push constant block; `push` is not referenced by main() in this shader
 36 | {
 37 |   uint instanceID;
 38 | }
 39 | push;
 40 | 
 41 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer  // frame constants, read as `view`
 42 | {
 43 |   FrameConstants view;
 44 | };
 45 | 
 46 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer  // writable: main() records picking results here
 47 | {
 48 |   Readback readback;
 49 | };
 50 | 
 51 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer  // indexed by instance ID
 52 | {
 53 |   RenderInstance instances[];
 54 | };
 55 | 
 56 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer  // indexed by RenderInstance.geometryID
 57 | {
 58 |   Geometry geometries[];
 59 | };
 60 | 
 61 | #if USE_STREAMING
 62 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer  // uniform-buffer view of the streaming state
 63 | {
 64 |   SceneStreaming streaming;
 65 | };
 66 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW  // same struct type bound as a storage buffer; not used by main() here
 67 | {
 68 |   SceneStreaming streamingRW;
 69 | };
 70 | #endif
 71 | 
 72 | ///////////////////////////////////////////////////
 73 | 
 74 | #include "render_shading.glsl"
 75 | 
 76 | ///////////////////////////////////////////////////
 77 | 
 78 | layout(location = 0) in Interpolants  // must stay member-for-member identical to the mesh shader's `Interpolants` output block
 79 | {
 80 | #if ALLOW_SHADING
 81 |   vec3 wPos;  // world-space position
 82 | #if ALLOW_VERTEX_NORMALS
 83 |   vec3 wNormal;  // world-space vertex normal
 84 | #endif
 85 | #endif
 86 |   flat uint clusterID;  // not interpolated
 87 |   flat uint instanceID;  // not interpolated
 88 | }
 89 | IN;
 90 | 
 91 | 
 92 | ///////////////////////////////////////////////////
 93 | 
 94 | layout(location = 0, index = 0) out vec4 out_Color;  // shaded color written by main()
 95 | layout(early_fragment_tests) in;  // forces per-fragment tests to run before this shader, so its buffer writes only happen for fragments that pass
 96 | 
 97 | ///////////////////////////////////////////////////
 98 | 
 99 | 
100 | void main()  // Shades one rasterized cluster fragment and records picking data for the pixel under the mouse.
101 | {
102 |   vec3 wNormal;  // only assigned (and only consumed) when ALLOW_SHADING is enabled
103 | 
104 | #if ALLOW_SHADING
105 | #if ALLOW_VERTEX_NORMALS
106 |   if(view.facetShading != 0)
107 | #endif
108 |   {
109 |     wNormal = -cross(dFdx(IN.wPos), dFdy(IN.wPos));  // facet normal from screen-space derivatives of the world position; not normalized here
110 |   }
111 | #if ALLOW_VERTEX_NORMALS
112 |   else
113 |   {
114 |     wNormal = IN.wNormal;  // interpolated vertex normal
115 |     if(view.flipWinding == 1 || (view.flipWinding == 2 && !gl_FrontFacing))  // 1: always flip, 2: flip only on back faces
116 |     {
117 |       wNormal = -wNormal;
118 |     }
119 |   }
120 | #endif
121 | #endif
122 | 
123 |   uint visData = IN.clusterID;  // default visualization payload: the cluster ID
124 |   if (view.visualize == VISUALIZE_LOD || view.visualize == VISUALIZE_GROUP)
125 |   {
126 |     #if USE_STREAMING
127 |       Cluster cluster = Cluster_in(streaming.resident.clusters.d[IN.clusterID]).d;  // resident table entry is turned into a Cluster reference and dereferenced
128 |     #else
129 |       Geometry geometry = geometries[instances[IN.instanceID].geometryID];
130 |       Cluster cluster = geometry.preloadedClusters.d[IN.clusterID];  // without streaming clusters are indexed per geometry
131 |     #endif
132 |       if (view.visualize == VISUALIZE_LOD)
133 |       {
134 |         visData = floatBitsToUint(float(cluster.lodLevel) * instances[IN.instanceID].maxLodLevelRcp);  // LOD level scaled by the instance's maxLodLevelRcp, passed on as raw float bits
135 |       }
136 |       else {
137 |         visData = cluster.groupID;  // VISUALIZE_GROUP
138 |       }
139 |   }
140 |   else if (view.visualize == VISUALIZE_TRIANGLE)
141 |   {
142 |     visData = IN.clusterID * 256 + uint(gl_PrimitiveID);  // unique per triangle as long as gl_PrimitiveID < 256
143 |   }
144 | 
145 |   out_Color.w = 1.f;  // NOTE(review): looks redundant, both branches below assign the whole out_Color
146 | #if ALLOW_SHADING && 1
147 |   {
148 |     const float overHeadLight = 1.0f;
149 |     const float ambientLight  = 1.f;
150 | 
151 |     out_Color = shading(IN.instanceID, IN.wPos, wNormal, visData, overHeadLight, ambientLight);  // shading() comes from render_shading.glsl
152 |   }
153 | #else
154 |   {
155 |     out_Color = vec4(visualizeColor(visData), 1.0);  // unlit: visualization color only
156 |   }
157 | #endif
158 | 
159 | #if DEBUG_VISUALIZATION
160 |   if(view.doWireframe != 0 || (view.visFilterInstanceID == IN.instanceID && view.visFilterClusterID == IN.clusterID))  // wireframe everywhere, or only on the filtered cluster
161 |   {
162 |     out_Color.xyz = addWireframe(out_Color.xyz, gl_BaryCoordEXT, gl_FrontFacing, fwidthFine(gl_BaryCoordEXT), view.wireColor);
163 |   }
164 | #endif
165 | 
166 |   uvec2 pixelCoord = uvec2(gl_FragCoord.xy);  // integer pixel coordinate of this fragment
167 |   if(pixelCoord == view.mousePosition)  // only the fragment under the mouse writes picking data
168 |   {
169 |     uint32_t packedClusterTriangleId = (IN.clusterID << 8) | (gl_PrimitiveID & 0xFF);  // cluster ID in the upper 24 bits, triangle index in the low 8 bits
170 |     atomicMax(readback.clusterTriangleId, packPickingValue(packedClusterTriangleId, gl_FragCoord.z));  // packPickingValue is defined outside this file; presumably orders by depth so one surface wins - confirm
171 |     atomicMax(readback.instanceId, packPickingValue(IN.instanceID, gl_FragCoord.z));
172 |   }
173 | }


--------------------------------------------------------------------------------
/shaders/render_raster_clusters.mesh.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | /*
 20 |   
 21 |   Shader Description
 22 |   ==================
 23 | 
 24 |   This mesh shader renders a single cluster.
 25 | 
 26 | */
 27 | 
 28 | 
 29 | #version 460
 30 | 
 31 | #extension GL_GOOGLE_include_directive : enable
 32 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 33 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 34 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 36 | #extension GL_EXT_buffer_reference : enable
 37 | #extension GL_EXT_buffer_reference2 : enable
 38 | #extension GL_EXT_scalar_block_layout : enable
 39 | 
 40 | #extension GL_NV_mesh_shader : require
 41 | #extension GL_EXT_control_flow_attributes : require
 42 | 
 43 | #include "shaderio.h"
 44 | #include "octant_encoding.h"
 45 | 
 46 | layout(push_constant) uniform pushData  // push constant block; `push` is not referenced by main() in this shader
 47 | {
 48 |   uint instanceID;
 49 | }
 50 | push;
 51 | 
 52 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer  // frame constants, read as `view`
 53 | {
 54 |   FrameConstants view;
 55 | };
 56 | 
 57 | layout(scalar,binding=BINDINGS_READBACK_SSBO,set=0) buffer readbackBuffer  // writable: main() adds to the rendered triangle statistic
 58 | {
 59 |   Readback readback;
 60 | };
 61 | 
 62 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer  // indexed by instance ID
 63 | {
 64 |   RenderInstance instances[];
 65 | };
 66 | 
 67 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer  // indexed by RenderInstance.geometryID
 68 | {
 69 |   Geometry geometries[];
 70 | };
 71 | 
 72 | layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer  // uniform-buffer view; main() reads the render cluster list through it
 73 | {
 74 |   SceneBuilding build;  
 75 | };
 76 | 
 77 | layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) buffer buildBufferRW  // same struct type bound as a storage buffer; not used by main() here
 78 | {
 79 |   SceneBuilding buildRW;  
 80 | };
 81 | 
 82 | #if USE_STREAMING
 83 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer  // uniform-buffer view of the streaming state
 84 | {
 85 |   SceneStreaming streaming;
 86 | };
 87 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW  // same struct type bound as a storage buffer; not used by main() here
 88 | {
 89 |   SceneStreaming streamingRW;
 90 | };
 91 | #endif
 92 | 
 93 | ////////////////////////////////////////////
 94 | 
 95 | layout(location = 0) out Interpolants  // must stay member-for-member identical to the fragment shader's `Interpolants` input block
 96 | {
 97 | #if ALLOW_SHADING
 98 |   vec3      wPos;  // world-space position
 99 | #if ALLOW_VERTEX_NORMALS
100 |   vec3      wNormal;  // normalized world-space normal
101 | #endif
102 | #endif
103 |   flat uint clusterID;  // same value for every vertex of the cluster
104 |   flat uint instanceID;  // same value for every vertex of the cluster
105 | }
106 | OUT[];
107 | 
108 | ////////////////////////////////////////////
109 | 
110 | #ifndef MESHSHADER_WORKGROUP_SIZE
111 | #define MESHSHADER_WORKGROUP_SIZE 32  // default, used when the build does not predefine a size
112 | #endif
113 | 
114 | layout(local_size_x = MESHSHADER_WORKGROUP_SIZE) in;
115 | layout(max_vertices = CLUSTER_VERTEX_COUNT, max_primitives = CLUSTER_TRIANGLE_COUNT) out;  // output limits; the CLUSTER_* counts are defined outside this file
116 | layout(triangles) out;
117 | 
118 | const uint MESHLET_VERTEX_ITERATIONS = ((CLUSTER_VERTEX_COUNT + MESHSHADER_WORKGROUP_SIZE - 1) / MESHSHADER_WORKGROUP_SIZE);  // ceil(vertex limit / workgroup size): passes needed to cover every vertex slot
119 | const uint MESHLET_TRIANGLE_ITERATIONS = ((CLUSTER_TRIANGLE_COUNT + MESHSHADER_WORKGROUP_SIZE - 1) / MESHSHADER_WORKGROUP_SIZE);  // ceil(triangle limit / workgroup size)
120 | 
121 | ////////////////////////////////////////////
122 | 
123 | void main()  // One workgroup emits one cluster: it transforms the cluster's vertices and writes its triangle indices.
124 | {
125 |   ClusterInfo cinfo = build.renderClusterInfos.d[gl_WorkGroupID.x];  // the workgroup index selects the entry in the render cluster list
126 | 
127 |   uint instanceID = cinfo.instanceID;
128 |   uint clusterID  = cinfo.clusterID;
129 | 
130 |   RenderInstance instance = instances[instanceID];
131 |   Geometry geometry       = geometries[instance.geometryID];
132 | 
133 | #if USE_STREAMING
134 |   Cluster cluster = Cluster_in(streaming.resident.clusters.d[clusterID]).d;  // resident table entry is turned into a Cluster reference and dereferenced
135 | #else
136 |   Cluster cluster = geometry.preloadedClusters.d[clusterID];  // without streaming clusters are indexed per geometry
137 | #endif
138 | 
139 |   uint vertMax = cluster.vertexCountMinusOne;  // counts are stored minus one, so this is the last valid vertex index
140 |   uint triMax  = cluster.triangleCountMinusOne;  // last valid triangle index
141 | 
142 |   if (gl_LocalInvocationID.x == 0) {
143 |     gl_PrimitiveCountNV = triMax + 1;  // number of triangles this workgroup outputs
144 |     // just for stats
145 |     atomicAdd(readback.numRenderedTriangles, uint(triMax + 1));
146 |   }
147 | 
148 |   vec4s_in  oVertices      = vec4s_in(cluster.vertices);  // per vertex: xyz = object-space position, w = bits of the oct32-encoded normal
149 |   uint8s_in localTriangles = uint8s_in(cluster.localTriangles);  // 8-bit cluster-local vertex indices, three per triangle
150 | 
151 |   mat4 worldMatrix   = instance.worldMatrix;
152 |   mat3 worldMatrixIT = transpose(inverse(mat3(worldMatrix)));  // inverse-transpose of the upper 3x3, used to transform normals
153 | 
154 | 
155 |   [[unroll]] for(uint i = 0; i < uint(MESHLET_VERTEX_ITERATIONS); i++)
156 |   {
157 |     uint vert        = gl_LocalInvocationID.x + i * MESHSHADER_WORKGROUP_SIZE;  // vertex slot handled by this invocation in pass i
158 |     uint vertLoad    = min(vert, vertMax);  // clamped so out-of-range invocations still load a valid vertex
159 |     
160 |     vec4 oVertex = oVertices.d[vertLoad];
161 | 
162 |     vec3 oPos = oVertex.xyz;    
163 |     vec4 wPos = worldMatrix * vec4(oPos, 1.0f);
164 |   #if ALLOW_VERTEX_NORMALS
165 |     vec3 oNormal = oct32_to_vec(floatBitsToUint(oVertex.w));  // decode the normal packed into the w component
166 |   #endif
167 | 
168 |     if(vert <= vertMax)  // only in-range invocations write outputs
169 |     {
170 |       gl_MeshVerticesNV[vert].gl_Position = view.viewProjMatrix * wPos;
171 |     #if ALLOW_SHADING
172 |       OUT[vert].wPos                      = wPos.xyz;
173 |     #if ALLOW_VERTEX_NORMALS
174 |       OUT[vert].wNormal                   = normalize(worldMatrixIT * oNormal);
175 |     #endif
176 |     #endif
177 |       OUT[vert].clusterID                 = clusterID;
178 |       OUT[vert].instanceID                = instanceID;
179 |     }
180 |   }
181 | 
182 |   [[unroll]] for(uint i = 0; i < uint(MESHLET_TRIANGLE_ITERATIONS); i++)
183 |   {
184 |     uint tri     = gl_LocalInvocationID.x + i * MESHSHADER_WORKGROUP_SIZE;  // triangle slot handled by this invocation in pass i
185 |     uint triLoad = min(tri, triMax);  // clamped so out-of-range invocations still load valid indices
186 | 
187 |     uvec3 indices = uvec3(localTriangles.d[triLoad * 3 + 0],
188 |                           localTriangles.d[triLoad * 3 + 1],
189 |                           localTriangles.d[triLoad * 3 + 2]);
190 | 
191 |     if(tri <= triMax)  // only in-range invocations write outputs
192 |     {
193 |       gl_PrimitiveIndicesNV[tri * 3 + 0] = indices.x;
194 |       gl_PrimitiveIndicesNV[tri * 3 + 1] = indices.y;
195 |       gl_PrimitiveIndicesNV[tri * 3 + 2] = indices.z;
196 |       gl_MeshPrimitivesNV[tri].gl_PrimitiveID = int(tri);  // primitive ID = triangle index within the cluster
197 |     }
198 |   }
199 | }


--------------------------------------------------------------------------------
/shaders/render_raytrace.rgen.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License, Version 2.0 (the "License");
 5 | * you may not use this file except in compliance with the License.
 6 | * You may obtain a copy of the License at
 7 | *
 8 | *     http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *
16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
17 | * SPDX-License-Identifier: Apache-2.0
18 | */
19 | 
20 | #version 460
21 | 
22 | #extension GL_GOOGLE_include_directive : enable
23 | 
24 | #extension GL_EXT_ray_tracing : require
25 | 
26 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
27 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
28 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
29 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
30 | #extension GL_EXT_buffer_reference : enable
31 | #extension GL_EXT_scalar_block_layout : enable
32 | 
33 | #include "shaderio.h"
34 | 
35 | //////////////////////////////////////////////////////////////
36 | 
37 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer  // frame constants, read as `view`
38 | {
39 |   FrameConstants view;
40 | };
41 | 
42 | layout(set = 0, binding = BINDINGS_TLAS) uniform accelerationStructureEXT asScene;  // acceleration structure traced by main()
43 | layout(set = 0, binding = BINDINGS_RENDER_TARGET, rgba8) uniform image2D imgColor;  // receives the payload color per pixel
44 | 
45 | layout(set = 0, binding = BINDINGS_RAYTRACING_DEPTH, r32f) uniform image2D imgRaytracingDepth;  // receives the payload's w component (1.0 when it is 0)
46 | 
47 | //////////////////////////////////////////////////////////////
48 | 
49 | layout(location = 0) rayPayloadEXT RayPayload rayHit;  // payload location 0, the value passed as last argument to traceRayEXT
50 | 
51 | //////////////////////////////////////////////////////////////
52 | 
53 | void main()  // Traces one ray per launch index and stores the returned color and depth value.
54 | {
55 |   // for writing debugging values to stats.debug etc.
56 |   bool center = gl_LaunchIDEXT.xy == (gl_LaunchSizeEXT.xy / 2);  // true only at the center launch index; not used further in this shader
57 | 
58 |   ivec2 screen = ivec2(gl_LaunchIDEXT.xy);
59 |   vec2  uv     = (vec2(gl_LaunchIDEXT.xy) + vec2(0.5)) / vec2(gl_LaunchSizeEXT.xy);  // pixel center in [0,1]
60 | 
61 | 
62 |   vec2 d = uv * 2.0 - 1.0;  // remapped to [-1,1]
63 | 
64 | 
65 |   vec4 origin    = view.viewMatrixI * vec4(0, 0, 0, 1);  // view-space origin transformed by viewMatrixI (presumably the inverse view matrix)
66 |   vec4 target    = normalize(view.projMatrixI * vec4(d.x, d.y, 1, 1));  // unprojected point through the pixel (presumably via the inverse projection)
67 |   vec4 direction = normalize(view.viewMatrixI * vec4(target.xyz, 0));  // w = 0: transformed as a direction, no translation
68 | 
69 |   float tMin = view.nearPlane;
70 |   float tMax = view.farPlane;
71 | 
72 | #if DEBUG_VISUALIZATION
73 |   vec2 uvOffset            = (vec2(gl_LaunchIDEXT.xy) + vec2(1.5, 1.5)) / vec2(gl_LaunchSizeEXT.xy);  // center of the pixel one step further in x and y
74 |   vec2 dOffset             = uvOffset * 2.0 - 1.0;
75 |   vec4 targetOffsetX       = normalize(view.projMatrixI * vec4(dOffset.x, d.y, 1, 1));
76 |   vec4 targetOffsetY       = normalize(view.projMatrixI * vec4(d.x, dOffset.y, 1, 1));
77 |   vec4 directionOffsetX    = normalize(view.viewMatrixI * vec4(targetOffsetX.xyz, 0));
78 |   vec4 directionOffsetY    = normalize(view.viewMatrixI * vec4(targetOffsetY.xyz, 0));
79 |   rayHit.color.xyz         = directionOffsetX.xyz;  // payload carries the neighbor ray directions into the trace
80 |   rayHit.differentialY.xyz = directionOffsetY.xyz;
81 | #endif
82 | 
83 |   traceRayEXT(asScene, view.flipWinding == 2 ? 0 : gl_RayFlagsCullBackFacingTrianglesEXT, 0xff, 0, 0,  // hit offset, hit stride
84 |               0,                                                           // miss offset
85 |               origin.xyz, tMin, direction.xyz, tMax,
86 |               0  // payload location: `rayHit` is declared with rayPayloadEXT at location 0
87 |   );
88 | 
89 |   {
90 |     imageStore(imgColor, screen, vec4(rayHit.color.xyz, 1));
91 |     imageStore(imgRaytracingDepth, screen, vec4(rayHit.color.w == 0 ? 1.0 : rayHit.color.w, 0.f, 0.f, 0.f));  // w == 0 (what the miss shader writes) is stored as 1.0
92 |   }
93 | }


--------------------------------------------------------------------------------
/shaders/render_raytrace.rmiss.glsl:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License, Version 2.0 (the "License");
 5 | * you may not use this file except in compliance with the License.
 6 | * You may obtain a copy of the License at
 7 | *
 8 | *     http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *
16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
17 | * SPDX-License-Identifier: Apache-2.0
18 | */
19 | 
20 | #version 460
21 | 
22 | #extension GL_GOOGLE_include_directive : enable
23 | 
24 | #extension GL_EXT_ray_tracing : require
25 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
26 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
27 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
28 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
29 | 
30 | #include "shaderio.h"
31 | 
32 | //////////////////////////////////////////////////////////////
33 | 
34 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer  // frame constants, read as `view`
35 | {
36 |   FrameConstants view;
37 | };
38 | 
39 | //////////////////////////////////////////////////////////////
40 | 
41 | layout(location = RAYTRACING_PAYLOAD_INDEX) rayPayloadInEXT RayPayload rayHit;  // incoming payload filled by main(); RAYTRACING_PAYLOAD_INDEX is defined outside the visible headers
42 | 
43 | //////////////////////////////////////////////////////////////
44 | 
45 | void main()
46 | {
47 |   // Nothing was hit: evaluate the sky for the direction this ray travelled in.
48 |   vec3 sky = evalSimpleSky(view.skyParams, gl_WorldRayDirectionEXT);
49 |   // rgb = sky color, w = 0 (the ray generation shader stores depth 1.0 when w is 0)
50 |   rayHit.color = vec4(sky, 0.f);
51 | }
52 | 


--------------------------------------------------------------------------------
/shaders/shaderio.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | #ifndef _SHADERIO_H_
 21 | #define _SHADERIO_H_
 22 | 
 23 | #include "shaderio_core.h"
 24 | #include "shaderio_scene.h"
 25 | #include "shaderio_streaming.h"
 26 | #include "shaderio_building.h"
 27 | #include "dh_sky.h"
 28 | 
 29 | /////////////////////////////////////////
 30 | 
 31 | #define ALLOW_SHADING 1  // 1: interpolants carry wPos and fragments are lit via shading()
 32 | #define ALLOW_VERTEX_NORMALS 1  // 1: vertex normals are decoded and interpolated (only effective with ALLOW_SHADING)
 33 | 
 34 | /////////////////////////////////////////
 35 | 
 36 | #define VISUALIZE_NONE 0  // values of FrameConstants::visualize
 37 | #define VISUALIZE_CLUSTER 1
 38 | #define VISUALIZE_GROUP 2  // color by Cluster::groupID
 39 | #define VISUALIZE_LOD 3  // color by cluster LOD level scaled with the instance's maxLodLevelRcp
 40 | #define VISUALIZE_TRIANGLE 4  // color by cluster ID combined with the primitive ID
 41 | 
 42 | #define BBOXES_PER_MESHLET 8
 43 | 
 44 | /////////////////////////////////////////
 45 | 
 46 | #define BINDINGS_FRAME_UBO 0  // FrameConstants uniform buffer (descriptor set 0 binding indices follow)
 47 | #define BINDINGS_READBACK_SSBO 1  // Readback storage buffer
 48 | #define BINDINGS_GEOMETRIES_SSBO 2  // Geometry[] storage buffer
 49 | #define BINDINGS_RENDERINSTANCES_SSBO 3  // RenderInstance[] storage buffer
 50 | #define BINDINGS_SCENEBUILDING_SSBO 4  // SceneBuilding bound as storage buffer
 51 | #define BINDINGS_SCENEBUILDING_UBO 5  // SceneBuilding bound as uniform buffer
 52 | #define BINDINGS_HIZ_TEX 6
 53 | #define BINDINGS_STREAMING_UBO 7  // SceneStreaming bound as uniform buffer
 54 | #define BINDINGS_STREAMING_SSBO 8  // SceneStreaming bound as storage buffer
 55 | #define BINDINGS_TLAS 9  // top-level acceleration structure used by the ray generation shader
 56 | #define BINDINGS_RENDER_TARGET 10  // rgba8 color image written by the ray generation shader
 57 | #define BINDINGS_RAYTRACING_DEPTH 11  // r32f depth image written by the ray generation shader
 58 | 
 59 | /////////////////////////////////////////
 60 | 
 61 | #define BUILD_SETUP_TRAVERSAL_RUN 1  // presumably selects the step handled by build_setup.comp.glsl - confirm against that shader
 62 | #define BUILD_SETUP_DRAW 2
 63 | #define BUILD_SETUP_BLAS_INSERTION 3
 64 | 
 65 | /////////////////////////////////////////
 66 | 
 67 | #define STREAM_SETUP_COMPACTION_OLD_NO_UNLOADS 0  // presumably selects the step handled by stream_setup.comp.glsl - confirm against that shader
 68 | #define STREAM_SETUP_COMPACTION_STATUS 1
 69 | #define STREAM_SETUP_ALLOCATOR_FREEINSERT 2
 70 | #define STREAM_SETUP_ALLOCATOR_STATUS 3
 71 | 
 72 | /////////////////////////////////////////
 73 | 
 74 | #define TRAVERSAL_PRESORT_WORKGROUP 128  // presumably the workgroup sizes of the correspondingly named compute shaders - confirm
 75 | #define TRAVERSAL_INIT_WORKGROUP 128
 76 | #define TRAVERSAL_RUN_WORKGROUP 64
 77 | #define BLAS_SETUP_INSERTION_WORKGROUP 128
 78 | #define BLAS_INSERT_CLUSTERS_WORKGROUP 128
 79 | 
 80 | // must be power of 2 (NOTE(review): unclear whether this applies to the next define only or to the whole list below)
 81 | #define STREAM_UPDATE_SCENE_WORKGROUP 64
 82 | #define STREAM_AGEFILTER_GROUPS_WORKGROUP 128
 83 | #define STREAM_COMPACTION_NEW_CLAS_WORKGROUP 128
 84 | #define STREAM_COMPACTION_OLD_CLAS_WORKGROUP 64
 85 | #define STREAM_ALLOCATOR_LOAD_GROUPS_WORKGROUP 64
 86 | #define STREAM_ALLOCATOR_UNLOAD_GROUPS_WORKGROUP 64
 87 | #define STREAM_ALLOCATOR_BUILD_FREEGAPS_WORKGROUP 64
 88 | #define STREAM_ALLOCATOR_FREEGAPS_INSERT_WORKGROUP 64
 89 | #define STREAM_ALLOCATOR_SETUP_INSERTION_WORKGROUP 64
 90 | 
 91 | /////////////////////////////////////////
 92 | 
 93 | #ifndef USE_CULLING
 94 | #define USE_CULLING 1  // default when not predefined by the build
 95 | #endif
 96 | 
 97 | #ifndef USE_INSTANCE_SORTING
 98 | #define USE_INSTANCE_SORTING 1  // default when not predefined by the build
 99 | #endif
100 | 
101 | 
102 | #ifndef USE_STREAMING
103 | #define USE_STREAMING 1  // default; 1 makes shaders fetch clusters through the streaming resident table instead of Geometry::preloadedClusters
104 | #endif
105 | 
106 | #ifndef MAX_VISIBLE_CLUSTERS
107 | #define MAX_VISIBLE_CLUSTERS 1024  // default when not predefined by the build
108 | #endif
109 | 
110 | #ifndef TARGETS_RASTERIZATION
111 | #define TARGETS_RASTERIZATION 1  // default when not predefined by the build
112 | #endif
113 | 
114 | #define TARGETS_RAY_TRACING (!(TARGETS_RASTERIZATION))  // always the logical opposite of TARGETS_RASTERIZATION
115 | 
116 | /////////////////////////////////////////
117 | 
118 | #ifdef __cplusplus
119 | namespace shaderio {
120 | using namespace glm;
121 | using namespace nvvkhl_shaders;
122 | #endif
123 | 
124 | struct FrameConstants  // Frame constants shared by C++ and GLSL; member order defines the buffer layout, do not reorder.
125 | {
126 |   mat4 projMatrix;
127 |   mat4 projMatrixI;  // used by the ray generation shader to unproject pixel coordinates
128 | 
129 |   mat4 viewProjMatrix;  // applied to world-space positions in the cluster mesh shader
130 |   mat4 viewProjMatrixI;
131 |   mat4 viewMatrix;
132 |   mat4 viewMatrixI;  // used by the ray generation shader for ray origin and direction
133 |   vec4 viewPos;
134 |   vec4 viewDir;
135 |   vec4 viewPlane;
136 | 
137 |   ivec2 viewport;
138 |   vec2  viewportf;
139 | 
140 |   vec2 viewPixelSize;
141 |   vec2 viewClipSize;
142 | 
143 |   vec3  wLightPos;
144 |   float lightMixer;
145 | 
146 |   vec3  wUpDir;
147 |   float sceneSize;
148 | 
149 |   uint  flipWinding;  // 1: shading normals are flipped; 2: flipped on back faces only, and rays do not cull back faces
150 |   uint  tintTessellated;
151 |   uint  visualize;  // compared against the VISUALIZE_* values
152 |   float fov;
153 | 
154 |   float   nearPlane;  // ray tMin in the ray generation shader
155 |   float   farPlane;  // ray tMax in the ray generation shader
156 |   float   ambientOcclusionRadius;
157 |   int32_t ambientOcclusionSamples;
158 | 
159 |   vec4 hizSizeFactors;
160 |   vec4 nearSizeFactors;
161 | 
162 |   float hizSizeMax;
163 |   int   facetShading;  // non-zero: fragment normals come from screen-space derivatives instead of vertex normals
164 |   int   supersample;
165 |   uint  colorXor;
166 | 
167 |   uint  dbgUint;
168 |   float dbgFloat;
169 |   uint  frame;
170 |   uint  doShadow;
171 | 
172 |   vec4 bgColor;
173 | 
174 |   uvec2 mousePosition;  // pixel coordinate compared with the fragment coordinate for picking
175 |   float wireThickness;
176 |   float wireSmoothing;
177 | 
178 |   vec3 wireColor;  // passed to addWireframe() by the raster fragment shader
179 |   uint wireStipple;
180 | 
181 |   vec3  wireBackfaceColor;
182 |   float wireStippleRepeats;
183 | 
184 |   float wireStippleLength;
185 |   uint  doWireframe;  // non-zero: wireframe overlay on every cluster (DEBUG_VISUALIZATION builds)
186 |   uint  visFilterInstanceID;  // together with visFilterClusterID selects a single cluster for the wireframe overlay
187 |   uint  visFilterClusterID;
188 | 
189 |   SimpleSkyParameters skyParams;  // input to evalSimpleSky() in the miss shader
190 | };
191 | 
192 | struct Readback  // GPU-written values shared with C++; member order defines the buffer layout, do not reorder.
193 | {
194 |   uint numRenderClusters;
195 |   uint numTraversalInfos;
196 |   uint numRenderedClusters;
197 |   uint numRenderedTriangles;  // the cluster mesh shader adds each cluster's triangle count
198 | 
199 |   uint64_t blasActualSizes;
200 | 
201 | #ifdef __cplusplus
202 |   uint32_t clusterTriangleId;  // C++ view of the 64-bit picking slot: first 32-bit word
203 |   uint32_t _packedDepth0;  // second 32-bit word of the same slot (presumably the packed depth - confirm against packPickingValue)
204 | 
205 |   uint32_t instanceId;  // first 32-bit word of the second picking slot
206 |   uint32_t _packedDepth1;  // second 32-bit word of the second picking slot
207 | #else
208 |   uint64_t clusterTriangleId;  // GLSL view: target of a 64-bit atomicMax with packPickingValue((clusterID << 8) | triangle, depth)
209 |   uint64_t instanceId;  // GLSL view: target of a 64-bit atomicMax with packPickingValue(instanceID, depth)
210 | #endif
211 | 
212 |   uint64_t debugU64;
213 | 
214 |   int  debugI;
215 |   uint debugUI;
216 |   uint debugF;
217 | 
218 |   uint debugA[64];
219 |   uint debugB[64];
220 |   uint debugC[64];
221 | };
222 | 
223 | 
224 | struct RayPayload  // Payload exchanged between the ray generation, miss and closest hit shaders.
225 | {
226 |   // With DEBUG_VISUALIZATION, ray gen writes the direction through the pixel at x+1 into xyz for ray differentials.
227 |   // Closest hit returns the shaded color there.
228 |   vec4 color;  // w: the miss shader writes 0; ray generation stores 1.0 as depth when w is 0
229 | #if DEBUG_VISUALIZATION
230 |   // Ray direction through the pixel at y+1 for ray differentials
231 |   vec4 differentialY;
232 | #endif
233 | };
234 | 
235 | #ifdef __cplusplus
236 | }
237 | #endif
238 | #endif  // _SHADERIO_H_
239 | 


--------------------------------------------------------------------------------
/shaders/shaderio_building.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | #include "shaderio_streaming.h"
 21 | 
 22 | #ifndef _SHADERIO_BUILDING_H_
 23 | #define _SHADERIO_BUILDING_H_
 24 | 
 25 | #ifdef __cplusplus
 26 | namespace shaderio {
 27 | using namespace glm;
 28 | #else
 29 | 
 30 | #define INSTANCE_FRUSTUM_BIT 1  // GLSL-only; presumably a bit flag kept in SceneBuilding::instanceStates - confirm against the culling shaders
 31 | #define INSTANCE_VISIBLE_BIT 2  // GLSL-only; presumably a bit flag kept in SceneBuilding::instanceStates - confirm against the culling shaders
 32 | 
 33 | #endif
 34 | 
 35 | // The item descriptor used in the lod hierarchy traversal
 36 | // producer/consumer queue.
 37 | // It can encode a lod hierarchy node, or a cluster group of an instance.
 38 | // Must fit in 64 bits: GLSL converts it to and from a uint64_t.
 39 | struct TraversalInfo
 40 | {
 41 |   uint32_t instanceID;  // first 32-bit component of the packed form
 42 |   uint32_t packedNode;  // second 32-bit component of the packed form
 43 | };
 44 | #ifndef __cplusplus
 45 | // Rebuilds a TraversalInfo from its packed 64-bit form:
 46 | // component x of the unpacked pair is the instanceID, component y the packedNode.
 47 | TraversalInfo unpackTraversalInfo(uint64_t packed64) {
 48 |   u32vec2 halves = unpack32(packed64);
 49 |   return TraversalInfo(halves.x, halves.y);
 50 | }
 51 | // Packs a TraversalInfo into 64 bits: instanceID as component x, packedNode as component y.
 52 | uint64_t packTraversalInfo(TraversalInfo info) {
 53 |   u32vec2 halves = u32vec2(info.instanceID, info.packedNode);
 54 |   return pack64(halves);
 55 | }
 56 | #endif
 57 | 
 58 | // A renderable cluster.
 59 | // Must fit in 64 bits, and can be overlaid with `TraversalInfo`,
 60 | // therefore instanceID must come first.
 61 | struct ClusterInfo
 62 | {
 63 |   uint32_t instanceID;  // index into the render instances array
 64 |   uint32_t clusterID;  // index used for the cluster lookup in the render shaders
 65 | };
 66 | BUFFER_REF_DECLARE_ARRAY(ClusterInfos_inout, ClusterInfo, , 8);  // read/write buffer reference to ClusterInfo[], 8-byte aligned
 67 | 
 68 | // Indirect build information to build a BLAS from an array of CLAS references
 69 | struct BlasBuildInfo
 70 | {
 71 |   // the number of CLAS that this BLAS references
 72 |   uint32_t clusterReferencesCount;
 73 |   // stride of array in bytes (typically 8 for 64-bit)
 74 |   uint32_t clusterReferencesStride;
 75 |   // start address of the array
 76 |   uint64_t clusterReferences;
 77 | };
 78 | BUFFER_REF_DECLARE_ARRAY(BlasBuildInfo_inout, BlasBuildInfo, , 16);  // read/write buffer reference to BlasBuildInfo[], 16-byte aligned
 79 | 
 80 | // Indirect build information for a TLAS instance
 81 | struct TlasInstance
 82 | {
 83 |   mat3x4    worldMatrix;
 84 |   uint32_t  instanceCustomIndex24_mask8;  // per the name: 24-bit custom index and 8-bit mask in one word
 85 |   uint32_t  instanceShaderBindingTableRecordOffset24_flags8;  // per the name: 24-bit SBT record offset and 8-bit flags in one word
 86 |   uint64_t  blasReference;
 87 | };
 88 | BUFFER_REF_DECLARE_ARRAY(TlasInstances_inout, TlasInstance, , 16);  // read/write buffer reference to TlasInstance[], 16-byte aligned
 89 | 
 90 | // The central structure that contains relevant information to
 91 | // perform the runtime lod hierarchy traversal and building of 
 92 | // all relevant clusters to be rendered in the current frame.
 93 | // (not optimally packed for cache efficiency but readability)
 94 | struct SceneBuilding
 95 | {
 96 |   mat4  traversalViewMatrix;
 97 | 
 98 |   uint  numRenderInstances;
 99 |   uint  maxRenderClusters;
100 |   uint  maxTraversalInfos;
101 |   float errorOverDistanceThreshold;
102 |   
103 |   uint renderClusterCounter;
104 |   int  traversalTaskCounter;
105 |   uint traversalInfoReadCounter;
106 |   uint traversalInfoWriteCounter;
107 |   
108 |   // result of traversal init & scratch for traversal run
109 |   BUFFER_REF(uint64s_coh_volatile) traversalNodeInfos;
110 |   // result of traversal run; the cluster mesh shader reads one entry per workgroup
111 |   BUFFER_REF(ClusterInfos_inout) renderClusterInfos;
112 |   
113 |   // rasterization related
114 |   //////////////////////////////////////////////////
115 |   
116 |   DrawMeshTasksIndirectCommandNV indirectDrawClusters;
117 |   
118 |   // ray tracing related
119 |   //////////////////////////////////////////////////
120 |   
121 |   DispatchIndirectCommand indirectDispatchBlasInsertion;
122 |   
123 |   uint blasClasCounter;
124 |   
125 |   // instance states store culling/visibility related information
126 |   BUFFER_REF(uint32s_inout) instanceStates;
127 |   
128 |   BUFFER_REF(uint32s_inout) instanceSortValues;
129 |   BUFFER_REF(uint32s_inout) instanceSortKeys;
130 |   
131 |   BUFFER_REF(TlasInstances_inout) tlasInstances;
132 |   
133 |   // per instance
134 |   BUFFER_REF(BlasBuildInfo_inout) blasBuildInfos;
135 |   BUFFER_REF(uint32s_inout) blasBuildSizes;
136 |   // split into per-instance regions
137 |   BUFFER_REF(uint64s_inout) blasClusterAddresses;
138 |   uint64_t blasBuildData;
139 | };
140 | 
141 | 
142 | 
143 | #ifdef __cplusplus
144 | }
145 | #endif
146 | #endif // _SHADERIO_BUILDING_H_


--------------------------------------------------------------------------------
/shaders/shaderio_core.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | #ifndef _SHADERIO_CORE_H_
 21 | #define _SHADERIO_CORE_H_
 22 | 
 23 | #ifndef SUBGROUP_SIZE
 24 | #define SUBGROUP_SIZE 32  // default, used when the build does not predefine a subgroup size
 25 | #endif
 26 | 
 27 | #ifdef __cplusplus
 28 | namespace shaderio {
 29 | using namespace glm;
 30 | #define BUFFER_REF(refname) uint64_t  // C++ side: a buffer reference is a raw 64-bit value; the reference type name is dropped
 31 | 
 32 | static uint32_t inline adjustClusterProperty(uint32_t in)  // Rounds `in` up to the next multiple of 32.
 33 | {
 34 |   return (in + 31) & ~31;  // add 31, then clear the low five bits; multiples of 32 are returned unchanged
 35 | }
 36 | 
 37 | #define BUFFER_REF_DECLARE(refname, typ, keywords, alignment)                                                          \
 38 |   static_assert(alignof(typ) == alignment || (alignment > alignof(typ) && ((alignment % alignof(typ)) == 0)),          \
 39 |                 "Alignment incompatible: " #refname)  // C++ side: no declaration, only checks `alignment` equals or is a larger multiple of alignof(typ)
 40 | 
 41 | #define BUFFER_REF_DECLARE_ARRAY(refname, typ, keywords, alignment)                                                    \
 42 |   static_assert(alignof(typ) == alignment || (alignment > alignof(typ) && ((alignment % alignof(typ)) == 0)),          \
 43 |                 "Alignment incompatible: " #refname)  // C++ side: same alignment check as BUFFER_REF_DECLARE
 44 | 
 45 | #define BUFFER_REF_DECLARE_SIZE(sizename, typ, size) static_assert(sizeof(typ) == size_t(size), "GLSL vs C++ size mismatch: " #typ)  // C++ side: verifies sizeof(typ) equals the size the GLSL side declares
 46 | 
 47 | #else  // GLSL
 48 | 
 49 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 50 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 51 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 52 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 53 | #extension GL_EXT_buffer_reference : enable
 54 | #extension GL_EXT_buffer_reference2 : enable
 55 | #extension GL_EXT_scalar_block_layout : enable
 56 | #extension GL_EXT_shader_atomic_int64 : enable
 57 | // `cfg` is a bit range written as `offset : width` (e.g. `1 : 26`); `true ? cfg` selects the offset, `false ? cfg` the width.
 58 | #define PACKED_GET(flag, cfg)   (((flag) >> (true ? cfg)) & ((1 << (false ? cfg))-1))
 59 | #define PACKED_FLAG(cfg, val)   ((val) << (true ? cfg))
 60 | #define PACKED_MASK(cfg)        (((1 << (false ? cfg))-1) << (true ? cfg))
 61 | // GLSL side: a buffer reference is used by its block name, whereas the C++ side stores it as a uint64_t.
 62 | #define BUFFER_REF(refname) refname
 63 | // Declares a scalar-layout buffer_reference block `refname` whose single member `d` has type `typ`.
 64 | #define BUFFER_REF_DECLARE(refname, typ, keywords, alignment)                                                          \
 65 |   layout(buffer_reference, buffer_reference_align = alignment, scalar) keywords buffer refname                         \
 66 |   {                                                                                                                    \
 67 |     typ d;                                                                                                             \
 68 |   };
 69 | // Same as above, but `d` is an unsized array of `typ`.
 70 | #define BUFFER_REF_DECLARE_ARRAY(refname, typ, keywords, alignment)                                                    \
 71 |   layout(buffer_reference, buffer_reference_align = alignment, scalar) keywords buffer refname                         \
 72 |   {                                                                                                                    \
 73 |     typ d[];                                                                                                           \
 74 |   };
 75 | // GLSL side: exposes the stated struct size as the constant `sizename`.
 76 | #define BUFFER_REF_DECLARE_SIZE(sizename, typ, size) const uint32_t sizename = size
 77 | 
 78 | #endif
 79 | // Shared buffer-reference array types: arguments are (block name, element type, GLSL qualifiers, alignment in bytes).
 80 | BUFFER_REF_DECLARE_ARRAY(uint8s_in, uint8_t, readonly, 1);
 81 | BUFFER_REF_DECLARE_ARRAY(uint16s_in, uint16_t, readonly, 2);
 82 | BUFFER_REF_DECLARE_ARRAY(uint16s_inout, uint16_t, , 2);
 83 | BUFFER_REF_DECLARE_ARRAY(uint32s_in, uint32_t, readonly, 4);
 84 | BUFFER_REF_DECLARE_ARRAY(uint32s_inout, uint32_t, , 4);
 85 | BUFFER_REF_DECLARE_ARRAY(int32s_inout, int32_t, , 4);
 86 | BUFFER_REF_DECLARE_ARRAY(uvec2s_in, uvec2, , 8);  // NOTE(review): no `readonly` although named `_in` - confirm this is intended
 87 | BUFFER_REF_DECLARE_ARRAY(uvec2s_inout, uvec2, , 8);
 88 | BUFFER_REF_DECLARE_ARRAY(uint64s_in, uint64_t, readonly, 8);
 89 | BUFFER_REF_DECLARE_ARRAY(uint64s_inout, uint64_t, , 8);
 90 | BUFFER_REF_DECLARE_ARRAY(uint64s_coh, uint64_t, coherent, 8);
 91 | BUFFER_REF_DECLARE_ARRAY(uint64s_coh_volatile, uint64_t, coherent volatile, 8);
 92 | BUFFER_REF_DECLARE_ARRAY(vec3s_in, vec3, readonly, 4);
 93 | BUFFER_REF_DECLARE_ARRAY(vec4s_in, vec4, readonly, 16);
 94 | // Three uint grid dimensions; presumably mirrors VkDispatchIndirectCommand (x, y, z) - TODO confirm.
 95 | struct DispatchIndirectCommand
 96 | {
 97 |   uint gridX;
 98 |   uint gridY;
 99 |   uint gridZ;
100 | };
101 | // Two uints; presumably mirrors VkDrawMeshTasksIndirectCommandNV (taskCount, firstTask) - TODO confirm.
102 | struct DrawMeshTasksIndirectCommandNV
103 | {
104 |   uint count;
105 |   uint first;
106 | };
107 | 
108 | #ifdef __cplusplus
109 | }
110 | #endif
111 | #endif // _SHADERIO_CORE_H_


--------------------------------------------------------------------------------
/shaders/shaderio_scene.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | #include "shaderio_core.h"
 21 | 
 22 | #ifndef _SHADERIO_SCENE_H_
 23 | #define _SHADERIO_SCENE_H_
 24 | 
 25 | #ifdef __cplusplus
 26 | namespace shaderio {
 27 | using namespace glm;
 28 | #else
 29 | 
 30 | #ifndef CLUSTER_VERTEX_COUNT
 31 | #define CLUSTER_VERTEX_COUNT 32
 32 | #endif
 33 | 
 34 | #ifndef CLUSTER_TRIANGLE_COUNT
 35 | #define CLUSTER_TRIANGLE_COUNT 32
 36 | #endif
 37 | 
 38 | #endif
 39 | 
 40 | #define SHADERIO_ORIGINAL_MESH_GROUP 0xffffffffu
 41 | // Bounding box given by its `lo` and `hi` corners, plus two edge-length values (eight floats in total).
 42 | struct BBox
 43 | {
 44 |   vec3 lo;
 45 |   vec3 hi;
 46 |   // relevant to cluster's triangles
 47 |   float shortestEdge;
 48 |   float longestEdge;
 49 | };
 50 | BUFFER_REF_DECLARE_ARRAY(BBoxes_in, BBox, readonly, 16);
 51 | 
 52 | // A cluster contains a small number of triangles and vertices.
 53 | // It is always part of a group.
 54 | struct Cluster
 55 | {
 56 |   uint8_t triangleCountMinusOne;  // count stored minus one (per the name), so it fits a uint8_t
 57 |   uint8_t vertexCountMinusOne;    // count stored minus one (per the name)
 58 |   uint8_t lodLevel;
 59 |   uint8_t groupChildIndex;
 60 | 
 61 |   uint32_t groupID;
 62 | 
 63 |   BUFFER_REF(vec4s_in) vertices;          // reference to an array of vec4
 64 |   BUFFER_REF(uint8s_in) localTriangles;   // reference to an array of uint8_t
 65 | 
 66 |   uint64_t _pad;  // pads the struct to the 32 bytes asserted by Cluster_size below
 67 | };
 68 | BUFFER_REF_DECLARE(Cluster_in, Cluster, , 16);
 69 | BUFFER_REF_DECLARE_ARRAY(Clusters_inout, Cluster, , 16);
 70 | BUFFER_REF_DECLARE_SIZE(Cluster_size, Cluster, 32);
 71 | 
 72 | // A group contains multiple clusters that are the result of
 73 | // a common mesh decimation operation. Clusters within a group
 74 | // are watertight to each other. Groups are always streamed in
 75 | // completely, which simplifies the streaming management.
 76 | 
 77 | struct TraversalMetric  // bounding sphere (center + radius) and an error value, stored as five scalar floats
 78 | {
 79 |   // scalar by design, avoid hiccups with packing
 80 |   // order must match `nvclusterlod::Node`
 81 |   float boundingSphereX;
 82 |   float boundingSphereY;
 83 |   float boundingSphereZ;
 84 |   float boundingSphereRadius;
 85 |   float maxQuadricError;
 86 | };
 87 | 
 88 | struct Group
 89 | {
 90 |   uint32_t geometryID;
 91 |   uint32_t groupID;
 92 | 
 93 |   // streaming: global unique id given on load
 94 |   //            clusters array starts directly after group
 95 |   // preloaded: local id within geometry
 96 |   uint32_t residentID;
 97 |   uint32_t clusterResidentID;
 98 | 
 99 |   // when this group is first loaded, this is where the
100 |   // temporary clas builds start.
101 |   uint32_t streamingNewBuildOffset;
102 |   
103 |   uint16_t lodLevel;
104 |   uint16_t clusterCount;
105 | 
106 |   TraversalMetric traversalMetric;
107 |   // two 8-byte buffer references follow; together with the fields above the struct is 64 bytes (see Group_size)
108 |   BUFFER_REF(uint32s_in) clusterGeneratingGroups;
109 |   BUFFER_REF(BBoxes_in)  clusterBboxes;
110 | };
111 | 
112 | BUFFER_REF_DECLARE(Group_in, Group, , 16);
113 | BUFFER_REF_DECLARE_ARRAY(Groups_in, Group, , 16);
114 | BUFFER_REF_DECLARE_SIZE(Group_size, Group, 64);
115 | 
116 | #ifdef __cplusplus
117 | // must match `nvclusterlod::InteriorNode`
118 | struct NodeRange
119 | {
120 |   uint32_t isNode : 1;              // GLSL reads bit 0 as `Node_packed_isGroup`; NOTE(review): names differ - confirm polarity
121 |   uint32_t childOffset : 26;        // GLSL: Node_packed_nodeChildOffset (1 : 26)
122 |   uint32_t childCountMinusOne : 5;  // GLSL: Node_packed_nodeChildCountMinusOne (27 : 5)
123 | };
124 | 
125 | // must match `nvclusterlod::LeafNode`
126 | struct GroupRange
127 | {
128 |   uint32_t isNode : 1;                     // same leading bit as in NodeRange
129 |   uint32_t groupIndex : 23;                // GLSL: Node_packed_groupIndex (1 : 23)
130 |   uint32_t groupClusterCountMinusOne : 8;  // GLSL: Node_packed_groupClusterCountMinusOne (24 : 8)
131 | };
132 | #endif
133 | 
134 | // must match `nvclusterlod::Node`
135 | struct Node
136 | {
137 | #ifdef __cplusplus
138 |   union
139 |   {
140 |     NodeRange  nodeRange;
141 |     GroupRange groupRange;
142 |   };
143 | #else
144 |   uint32_t packed;
145 | // bit ranges of `packed`, written as `offset : width`; the form matches what PACKED_GET / PACKED_MASK expect
146 | #define Node_packed_isGroup 0 : 1
147 | 
148 | #define Node_packed_nodeChildOffset 1 : 26
149 | #define Node_packed_nodeChildCountMinusOne 27 : 5
150 | 
151 | #define Node_packed_groupIndex 1 : 23
152 | #define Node_packed_groupClusterCountMinusOne 24 : 8
153 | 
154 | #endif
155 |   // use scalar to avoid glsl alignment hiccups
156 |   TraversalMetric traversalMetric;
157 | };
158 | BUFFER_REF_DECLARE_ARRAY(Nodes_in, Node, readonly, 8);
159 | 
160 | struct Geometry
161 | {
162 |   uint32_t clustersCount;
163 |   uint32_t groupsCount;
164 |   uint32_t nodesCount;
165 |   uint32_t _pad;
166 |     
167 |   // object space geometry bbox
168 |   BBox bbox;
169 | 
170 |   // lod hierarchy traversal
171 |   BUFFER_REF(Nodes_in) nodes;
172 |   BUFFER_REF(BBoxes_in) nodeBboxes;
173 | 
174 | 
175 |   // streaming (null if preloaded)
176 |   // provides memory address of a resident group.
177 |   //
178 |   // Note this 64-bit value uses a special encoding.
179 |   // only addresses < STREAMING_INVALID_ADDRESS_START can be dereferenced.
180 |   BUFFER_REF(uint64s_inout) streamingGroupAddresses;
181 | 
182 |   // preloaded (null if streaming)
183 |   // clusters
184 |   BUFFER_REF(Groups_in) preloadedGroups;
185 |   BUFFER_REF(Clusters_inout) preloadedClusters;
186 |   // for ray tracing
187 |   BUFFER_REF(uint64s_in) preloadedClusterClasAddresses;
188 |   BUFFER_REF(uint32s_in) preloadedClusterClasSizes;
189 | };
190 | BUFFER_REF_DECLARE(Geometry_in, Geometry, readonly, 8);
191 | 
192 | struct RenderInstance
193 | {
194 |   mat4 worldMatrix;
195 | 
196 |   uint32_t geometryID;      // presumably an index into the `Geometry` array - TODO confirm
197 |   float    maxLodLevelRcp;  // presumably the reciprocal of the geometry's maximum lod level - TODO confirm
198 |   uint32_t _pad[2];         // pads the struct to 80 bytes, a multiple of the 16-byte reference alignment below
199 | };
200 | BUFFER_REF_DECLARE_ARRAY(RenderInstances_in, RenderInstance, readonly, 16);
201 | 
202 | #ifdef __cplusplus
203 | // clusters are stored right next to group
204 | static_assert((sizeof(Group) % sizeof(Cluster)) == 0);
205 | }
206 | #endif
207 | 
208 | #endif
209 | 


--------------------------------------------------------------------------------
/shaders/stream_agefilter_groups.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | /*
 21 |   
 22 |   Shader Description
 23 |   ==================
 24 |   
 25 |   This compute shader writes the streaming request for
 26 |   groups to be unloaded. We determine this based on an
 27 |   age since the group has been used last.
 28 |   
 29 |   A thread represents one resident group.
 30 | */
 31 | 
 32 | #version 460
 33 | 
 34 | #extension GL_GOOGLE_include_directive : enable
 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 38 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 39 | #extension GL_EXT_buffer_reference : enable
 40 | #extension GL_EXT_buffer_reference2 : enable
 41 | #extension GL_EXT_scalar_block_layout : enable
 42 | #extension GL_EXT_shader_atomic_int64 : enable
 43 | 
 44 | #extension GL_EXT_control_flow_attributes : require
 45 | #extension GL_KHR_shader_subgroup_vote : require
 46 | #extension GL_KHR_shader_subgroup_ballot : require
 47 | #extension GL_KHR_shader_subgroup_shuffle : require
 48 | #extension GL_KHR_shader_subgroup_basic : require
 49 | #extension GL_KHR_shader_subgroup_clustered : require
 50 | #extension GL_KHR_shader_subgroup_arithmetic : require
 51 | 
 52 | #include "shaderio.h"
 53 | 
 54 | ////////////////////////////////////////////
 55 | // Descriptor set 0 interface blocks; the BINDINGS_* indices are presumably provided through shaderio.h.
 56 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 57 | {
 58 |   Readback readback;
 59 | };
 60 | 
 61 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 62 | {
 63 |   Geometry geometries[];
 64 | };
 65 | // SceneStreaming is exposed twice: as the uniform block `streaming` and as the storage buffer `streamingRW`.
 66 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer
 67 | {
 68 |   SceneStreaming streaming;
 69 | };
 70 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW
 71 | {
 72 |   SceneStreaming streamingRW;
 73 | };
 74 | 
 75 | ////////////////////////////////////////////
 76 | 
 77 | layout(local_size_x=STREAM_AGEFILTER_GROUPS_WORKGROUP) in;
 78 | 
 79 | ////////////////////////////////////////////
 80 | // One thread per active resident group: ages the group and, once over the threshold, appends an unload request.
 81 | void main()
 82 | {
 83 |   // can load pre-emptively given the array is guaranteed to be sized as multiple of STREAM_AGEFILTER_GROUPS_WORKGROUP
 84 |   uint residentID = streaming.resident.activeGroups.d[gl_GlobalInvocationID.x];
 85 |   if (gl_GlobalInvocationID.x < streaming.resident.activeGroupsCount)
 86 |   {
 87 |   #if STREAMING_DEBUG_ADDRESSES
 88 |     if (uint64_t(streaming.resident.groups.d[residentID].group) >= STREAMING_INVALID_ADDRESS_START)
 89 |     {
 90 |       streamingRW.request.errorAgeFilter = residentID;
 91 |       return;
 92 |     }
 93 |   #endif
 94 |   
 95 |     // increase the age of a resident group  
 96 |     int age = ++streaming.resident.groups.d[residentID].age;      
 97 |     
 98 |     // detect if we are over the age limit and request the group to be unloaded
 99 |     if (age > streaming.ageThreshold)
100 |     {    
101 |       uint unloadOffset = atomicAdd(streamingRW.request.unloadCounter, 1);  // counter keeps counting past the limit
102 |       if (unloadOffset < streaming.request.maxUnloads) {  // strict: assumes the request array holds `maxUnloads` entries
103 |         Group_in groupRef = streaming.resident.groups.d[residentID].group;
104 |         streaming.request.unloadGeometryGroups.d[unloadOffset] = uvec2(groupRef.d.geometryID, groupRef.d.groupID);
105 |       }
106 |     }
107 |   }
108 | }
109 | 
110 | 


--------------------------------------------------------------------------------
/shaders/stream_allocator_freegaps_insert.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | /*
 21 |   
 22 |   Shader Description
 23 |   ==================
 24 |   
 25 |     Note: The sample showcases two ways to manage CLAS memory on the device.
 26 |     One using a persistent allocator system (`stream_allocator...` files),
 27 |     and one using a simple compaction scheme (`stream_compaction...` files).
 28 |     This file is part of the allocator system.
 29 |   
 30 |   This compute shader bins the free gaps based on their size.
 31 |   It enables the allocator to provide empty gaps of certain sizes during
 32 |   the allocation process within `stream_allocator_load_groups.comp.glsl`.
 33 |   
 34 |   We read `streaming.clasAllocator.freeGapsPos` and `streaming.clasAllocator.freeGapsSize`
 35 |   and bin into `streaming.clasAllocator.freeGapsPosBinned` using the appropriate
 36 |   `streaming.clasAllocator.freeSizeRanges.d[freeGapSize-1].offset`
 37 |   
 38 |   One thread operates on one free gap
 39 | */
 40 | 
 41 | #version 460
 42 | 
 43 | #extension GL_GOOGLE_include_directive : enable
 44 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 45 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 46 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 47 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 48 | #extension GL_EXT_buffer_reference : enable
 49 | #extension GL_EXT_buffer_reference2 : enable
 50 | #extension GL_EXT_scalar_block_layout : enable
 51 | #extension GL_EXT_shader_atomic_int64 : enable
 52 | 
 53 | #extension GL_EXT_control_flow_attributes : require
 54 | #extension GL_KHR_shader_subgroup_vote : require
 55 | #extension GL_KHR_shader_subgroup_ballot : require
 56 | #extension GL_KHR_shader_subgroup_shuffle : require
 57 | #extension GL_KHR_shader_subgroup_basic : require
 58 | #extension GL_KHR_shader_subgroup_clustered : require
 59 | #extension GL_KHR_shader_subgroup_arithmetic : require
 60 | #extension GL_EXT_shader_subgroup_extended_types_int64 : require
 61 | 
 62 | #include "shaderio.h"
 63 | 
 64 | ////////////////////////////////////////////
 65 | // Descriptor set 0 interface blocks; the BINDINGS_* indices are presumably provided through shaderio.h.
 66 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 67 | {
 68 |   Readback readback;
 69 | };
 70 | 
 71 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 72 | {
 73 |   Geometry geometries[];
 74 | };
 75 | // SceneStreaming is exposed twice: as the uniform block `streaming` and as the storage buffer `streamingRW`.
 76 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer
 77 | {
 78 |   SceneStreaming streaming;
 79 | };
 80 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW
 81 | {
 82 |   SceneStreaming streamingRW;
 83 | };
 84 | 
 85 | ////////////////////////////////////////////
 86 | 
 87 | layout(local_size_x=STREAM_ALLOCATOR_FREEGAPS_INSERT_WORKGROUP) in;
 88 | 
 89 | ////////////////////////////////////////////
 90 | // One thread per free gap: reserves a slot in the bin of the gap's size and stores the gap position there.
 91 | void main()
 92 | {
 93 |   uint threadID = gl_GlobalInvocationID.x;
 94 |   bool valid    = threadID < streaming.clasAllocator.freeGapsCounter;
 95 |   // NOTE(review): the count is read through the uniform view `streaming` - confirm it holds the device-built value
 96 |   if (valid)
 97 |   {
 98 |     // get the details of the free gap, it was computed in
 99 |     // `stream_allocator_build_freegaps.comp.glsl`.
100 |     // both arrays are indexed by the gap index, which is this thread's ID
101 |     uint freeGapPos  = streaming.clasAllocator.freeGapsPos.d[threadID];
102 |     uint freeGapSize = streaming.clasAllocator.freeGapsSize.d[threadID];
103 |   
104 |     // bin the gap into `streaming.clasAllocator.freeGapsPosBinned` based on size
105 |     int32_t rangeIndex = atomicAdd(streaming.clasAllocator.freeSizeRanges.d[freeGapSize-1].count, 1);
106 |     uint rangeOffset   = streaming.clasAllocator.freeSizeRanges.d[freeGapSize-1].offset;
107 |     // ranges are indexed by `freeGapSize - 1`; the atomic returns the pre-increment count, i.e. this gap's slot in the bin
108 |     uint storeOffset   = rangeIndex + uint(rangeOffset);
109 |     streaming.clasAllocator.freeGapsPosBinned.d[storeOffset] = freeGapPos;
110 |   }
111 | }


--------------------------------------------------------------------------------
/shaders/stream_allocator_setup_insertion.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | /*
 21 |   
 22 |   Shader Description
 23 |   ==================
 24 |   
 25 |     Note: The sample showcases two ways to manage CLAS memory on the device.
 26 |     One using a persistent allocator system (`stream_allocator...` files),
 27 |     and one using a simple compaction scheme (`stream_compaction...` files).
 28 |     This file is part of the allocator system.
 29 |   
 30 |   This compute shader prepares the ranges of free gaps
 31 |   based on their size. It is required to handle the size-based
 32 |   binning within `stream_allocator_freegaps_insert.comp.glsl`.
 33 |   
 34 |   One thread represents one free gap size
 35 |   
 36 | */
 37 | 
 38 | #version 460
 39 | 
 40 | #extension GL_GOOGLE_include_directive : enable
 41 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 42 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 43 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 44 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 45 | #extension GL_EXT_buffer_reference : enable
 46 | #extension GL_EXT_buffer_reference2 : enable
 47 | #extension GL_EXT_scalar_block_layout : enable
 48 | #extension GL_EXT_shader_atomic_int64 : enable
 49 | 
 50 | #extension GL_EXT_control_flow_attributes : require
 51 | #extension GL_KHR_shader_subgroup_vote : require
 52 | #extension GL_KHR_shader_subgroup_ballot : require
 53 | #extension GL_KHR_shader_subgroup_shuffle : require
 54 | #extension GL_KHR_shader_subgroup_basic : require
 55 | #extension GL_KHR_shader_subgroup_clustered : require
 56 | #extension GL_KHR_shader_subgroup_arithmetic : require
 57 | #extension GL_EXT_shader_subgroup_extended_types_int64 : require
 58 | 
 59 | #include "shaderio.h"
 60 | 
 61 | ////////////////////////////////////////////
 62 | // Descriptor set 0 interface blocks; the BINDINGS_* indices are presumably provided through shaderio.h.
 63 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 64 | {
 65 |   Readback readback;
 66 | };
 67 | 
 68 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 69 | {
 70 |   Geometry geometries[];
 71 | };
 72 | // SceneStreaming is exposed twice: as the uniform block `streaming` and as the storage buffer `streamingRW`.
 73 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer
 74 | {
 75 |   SceneStreaming streaming;
 76 | };
 77 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW
 78 | {
 79 |   SceneStreaming streamingRW;
 80 | };
 81 | 
 82 | ////////////////////////////////////////////
 83 | 
 84 | layout(local_size_x=STREAM_ALLOCATOR_SETUP_INSERTION_WORKGROUP) in;
 85 | 
 86 | ////////////////////////////////////////////
 87 | // One thread per allocation size: reserves a contiguous slot range for the gaps of that size and resets its count.
 88 | void main()
 89 | {
 90 |   uint threadID = gl_GlobalInvocationID.x;
 91 |   bool valid    = threadID < streaming.clasAllocator.maxAllocationSize;
 92 |   // entry `threadID` of `freeSizeRanges` is the one the insertion shader addresses with `freeGapSize - 1`
 93 |   if (valid)
 94 |   {
 95 |     // from the previous kernel `stream_allocator_build_freegaps.comp.glsl` we know how
 96 |     // many slots the size-binned array will need
 97 |     uint rangeCount  = uint(streaming.clasAllocator.freeSizeRanges.d[threadID].count);
 98 |     // get an offset into `streaming.clasAllocator.freeGapsPosBinned` for the list of gaps with this size
 99 |     uint rangeOffset = atomicAdd(streamingRW.clasAllocator.freeGapsCounter, rangeCount);
100 |     // setup range offset
101 |     streaming.clasAllocator.freeSizeRanges.d[threadID].offset = rangeOffset;
102 |     // reset to zero for insertion done in `stream_allocator_freegaps_insert.comp.glsl`
103 |     streaming.clasAllocator.freeSizeRanges.d[threadID].count  = 0;
104 |   }
105 | }
106 | 
107 | 


--------------------------------------------------------------------------------
/shaders/stream_allocator_unload_groups.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | /*
 21 |   
 22 |   Shader Description
 23 |   ==================
 24 |   
 25 |     Note: The sample showcases two ways to manage CLAS memory on the device.
 26 |     One using a persistent allocator system (`stream_allocator...` files),
 27 |     and one using a simple compaction scheme (`stream_compaction...` files).
 28 |     This file is part of the allocator system.
 29 |   
 30 |   This compute shader handles de-allocation of clas memory space
 31 |   of unloaded groups.
 32 |   
 33 |   It marks the appropriate bits of the memory regions as empty again.
 34 |   `streaming.clasAllocator.usedBits` is modified accordingly.
 35 |   
 36 |   One thread represents an unloaded group
 37 |   
 38 |   TODO might want to improve divergence in the loops
 39 | */
 40 | 
 41 | #version 460
 42 | 
 43 | #extension GL_GOOGLE_include_directive : enable
 44 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 45 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 46 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 47 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 48 | #extension GL_EXT_buffer_reference : enable
 49 | #extension GL_EXT_buffer_reference2 : enable
 50 | #extension GL_EXT_scalar_block_layout : enable
 51 | #extension GL_EXT_shader_atomic_int64 : enable
 52 | 
 53 | #extension GL_EXT_control_flow_attributes : require
 54 | #extension GL_KHR_shader_subgroup_vote : require
 55 | #extension GL_KHR_shader_subgroup_ballot : require
 56 | #extension GL_KHR_shader_subgroup_shuffle : require
 57 | #extension GL_KHR_shader_subgroup_basic : require
 58 | #extension GL_KHR_shader_subgroup_clustered : require
 59 | #extension GL_KHR_shader_subgroup_arithmetic : require
 60 | #extension GL_EXT_shader_subgroup_extended_types_int64 : require
 61 | 
 62 | #include "shaderio.h"
 63 | 
 64 | ////////////////////////////////////////////
 65 | // Descriptor set 0 interface blocks; the BINDINGS_* indices are presumably provided through shaderio.h.
 66 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 67 | {
 68 |   Readback readback;
 69 | };
 70 | 
 71 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 72 | {
 73 |   Geometry geometries[];
 74 | };
 75 | // SceneStreaming is exposed twice: as the uniform block `streaming` and as the storage buffer `streamingRW`.
 76 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer
 77 | {
 78 |   SceneStreaming streaming;
 79 | };
 80 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW
 81 | {
 82 |   SceneStreaming streamingRW;
 83 | };
 84 | 
 85 | ////////////////////////////////////////////
 86 | 
 87 | layout(local_size_x=STREAM_ALLOCATOR_UNLOAD_GROUPS_WORKGROUP) in;
 88 | 
 89 | ////////////////////////////////////////////
 90 | // One thread per unloaded group: updates the allocator stats and clears the group's bits in `usedBits`.
 91 | void main()
 92 | {
 93 |   uint threadID             = gl_GlobalInvocationID.x;
 94 |   bool valid                = threadID < streaming.update.patchUnloadGroupsCount;
 95 |   
 96 |   // unloads come first in patches
 97 |   StreamingPatch spatch = streaming.update.patches.d[threadID];
 98 |   
 99 |   if (valid)
100 |   {  
101 |     Group group = Group_in(geometries[spatch.geometryID].streamingGroupAddresses.d[spatch.groupIndex]).d;
102 |     
103 |     // get the first clas address of the group, as all clas of a 
104 |     // group are allocated together
105 |     uint64_t firstClasAddress = streaming.resident.clasAddresses.d[group.clusterResidentID];
106 |     // then convert this into a relative address compared to the clas base address
107 |     uint64_t firstClasOffset  = firstClasAddress - streaming.resident.clasBaseAddress;
108 |     
109 |     // recreate the allocation properties of the group
110 |     // get allocation position in units
111 |     uint allocPos   = uint(firstClasOffset >> streaming.clasAllocator.granularityByteShift);
112 |     // retrieve the size of allocation as well as the associated memory waste
113 |     uvec2 groupSize = streaming.resident.groupClasSizes.d[group.residentID];
114 |     // allocation size was stored in units, which is what we need here, but wasted size in bytes
115 |     uint allocSize  = groupSize.x;
116 |     uint wastedByteSize = groupSize.y;
117 |     
118 |     // for stats
119 |     atomicAdd(streamingRW.clasAllocator.stats.d.allocatedSize, -int64_t(allocSize << streaming.clasAllocator.granularityByteShift));
120 |     atomicAdd(streamingRW.clasAllocator.stats.d.wastedSize, -int64_t(wastedByteSize));
121 |     
122 |     // for allocation management, tag bits as unused
123 |     //
124 |     // allocPos and allocSize are in minimum granularity,
125 |     // which is what we use to tag the appropriate bits.
126 |     
127 |     uint startPos = allocPos;
128 |     uint endPos   = allocPos + allocSize - 1;
129 |     
130 |     uint startBit = (startPos) & 31;
131 |     uint endBit   = (endPos) & 31;
132 |     
133 |     uint start32 = startPos / 32;
134 |     uint end32   = endPos / 32;
135 |     
136 |     uint startMask = ~0;
137 |     uint endMask   = ~0;
138 |     
139 |     if (startBit != 0)
140 |     {
141 |       startMask = ~((1u << (startBit))-1);
142 |     }
143 |     if (endBit != 31)
144 |     {
145 |       endMask =  (1u << (endBit + 1))-1;
146 |     }
147 |     // startMask covers bits [startBit, 31], endMask covers bits [0, endBit]
148 |     bool single32 = start32 == end32;      
149 |     if (single32)
150 |     {
151 |       startMask = endMask & startMask;  // one u32 only: intersect, so bits of neighbouring allocations stay set
152 |     }
153 |     
154 |     // start and end of an allocated region may end up in the same u32
155 |     // as other allocations, hence we need atomics for start and end
156 |     
157 |     uint oldMask = atomicAnd(streaming.clasAllocator.usedBits.d[start32], ~startMask);
158 |   #if STREAMING_DEBUG_FREEGAPS_OVERLAP
159 |     // for debugging we test if the region was indeed fully used
160 |     bool hadError = false;
161 |     if ((oldMask & startMask) != startMask){
162 |       hadError = true;
163 |     }
164 |   #endif
165 |     
166 |     if (!single32) 
167 |     {
168 |       // process the region that is exclusively covered by this allocation
169 |       for (uint32_t i = start32 + 1; i < end32; i++)
170 |       {
171 |       #if STREAMING_DEBUG_FREEGAPS_OVERLAP
172 |         if(streaming.clasAllocator.usedBits.d[i] == 0){  // NOTE(review): only flags fully empty words; `!= ~0u` would be stricter
173 |           hadError = true;
174 |         }
175 |       #endif
176 |         streaming.clasAllocator.usedBits.d[i] = 0;
177 |       }
178 |       
179 |       oldMask = atomicAnd(streaming.clasAllocator.usedBits.d[end32], ~endMask);
180 |     #if STREAMING_DEBUG_FREEGAPS_OVERLAP
181 |       if ((oldMask & endMask) != endMask){
182 |         hadError = true;
183 |       }
184 |     #endif
185 |     }
186 |   #if STREAMING_DEBUG_FREEGAPS_OVERLAP
187 |     if (hadError){
188 |       streamingRW.request.errorClasDealloc = 1 + threadID;
189 |     }
190 |   #endif
191 |   }
192 | }
193 | 
194 | 


--------------------------------------------------------------------------------
/shaders/stream_compaction_new_clas.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | /*
 21 |   
 22 |   Shader Description
 23 |   ==================
 24 |   
 25 |     Note: The sample showcases two ways to manage CLAS memory on the device.
 26 |     One using a persistent allocator system (`stream_allocator...` files),
 27 |     and one using a simple compaction scheme (`stream_compaction...` files).
 28 |   
 29 |   This compute shader compacts cluster CLAS storage
 30 |   of all newly built clusters. They are appended after the
 31 |   compaction of old clusters CLAS.
 32 |   
 33 |   The compaction of the old clusters' CLAS is done in `stream_compaction_old_clas.comp.glsl`
 34 |   
 35 |   A thread represents one newly built CLAS.
 36 | */
 37 | 
 38 | #version 460
 39 | 
 40 | #extension GL_GOOGLE_include_directive : enable
 41 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 42 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 43 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 44 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 45 | #extension GL_EXT_buffer_reference : enable
 46 | #extension GL_EXT_buffer_reference2 : enable
 47 | #extension GL_EXT_scalar_block_layout : enable
 48 | #extension GL_EXT_shader_atomic_int64 : enable
 49 | 
 50 | #extension GL_EXT_control_flow_attributes : require
 51 | #extension GL_KHR_shader_subgroup_vote : require
 52 | #extension GL_KHR_shader_subgroup_ballot : require
 53 | #extension GL_KHR_shader_subgroup_shuffle : require
 54 | #extension GL_KHR_shader_subgroup_basic : require
 55 | #extension GL_KHR_shader_subgroup_clustered : require
 56 | #extension GL_KHR_shader_subgroup_arithmetic : require
 57 | #extension GL_EXT_shader_subgroup_extended_types_int64 : require
 58 | 
 59 | #include "shaderio.h"
 60 | 
 61 | ////////////////////////////////////////////
 62 | // Descriptor-set 0 resources; the BINDINGS_... indices are expected to come from `shaderio.h`.
 63 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 64 | {
 65 |   Readback readback;
 66 | };
 67 | // NOTE(review): `readback` above and `geometries` below are not referenced by this file's main().
 68 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 69 | {
 70 |   Geometry geometries[];
 71 | };
 72 | // Two blocks typed SceneStreaming: main() issues its atomicAdd on `streamingRW` and does every other access through `streaming`.
 73 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer
 74 | {
 75 |   SceneStreaming streaming;
 76 | };
 77 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW
 78 | {
 79 |   SceneStreaming streamingRW;
 80 | };
 81 | // NOTE(review): presumably both blocks are backed by the same buffer contents - confirm on the host side.
 82 | ////////////////////////////////////////////
 83 | // 1D workgroup; main() indexes the new-CLAS arrays with gl_GlobalInvocationID.x.
 84 | layout(local_size_x=STREAM_COMPACTION_NEW_CLAS_WORKGROUP) in;
 85 | 
 86 | ////////////////////////////////////////////
 87 | 
 88 | void main()
 89 | {
 90 |   // One invocation handles one newly built CLAS. The resident ID is fetched ahead of the range test,
 91 |   // which is fine as the array is guaranteed to be sized as multiple of STREAM_COMPACTION_NEW_CLAS_WORKGROUP.
 92 |   uint newIndex     = gl_GlobalInvocationID.x;
 93 |   uint residentSlot = streaming.update.newClasResidentIDs.d[newIndex];
 94 |   bool inRange      = newIndex < streaming.update.newClasCount;
 95 | 
 96 |   // Out-of-range invocations keep size zero, so their atomicAdd below leaves the running total untouched.
 97 |   uint     srcSize    = inRange ? streaming.update.newClasSizes.d[newIndex] : 0u;
 98 |   uint64_t srcAddress = inRange ? streaming.update.newClasAddresses.d[newIndex] : uint64_t(0);
 99 | 
100 |   // Reserve `srcSize` bytes behind everything accumulated in `moveClasSize` so far;
101 |   // the value returned by the atomic is this CLAS' byte offset from the base address.
102 |   uint64_t reservedOffset = atomicAdd(streamingRW.update.moveClasSize, uint64_t(srcSize));
103 |   uint64_t dstAddress     = reservedOffset + streaming.resident.clasBaseAddress;
104 | 
105 |   // padding invocations have nothing to record
106 |   if (!inRange)
107 |   {
108 |     return;
109 |   }
110 | 
111 |   // set up the move to the new destination; the move slot equals the new-CLAS index
112 |   streaming.update.moveClasSrcAddresses.d[newIndex] = srcAddress;
113 |   streaming.update.moveClasDstAddresses.d[newIndex] = dstAddress;
114 | 
115 |   // update the resident cluster tables with the final location and size
116 |   streaming.resident.clasAddresses.d[residentSlot] = dstAddress;
117 |   streaming.resident.clasSizes.d[residentSlot]     = srcSize;
118 | }
119 | 
120 | 


--------------------------------------------------------------------------------
/shaders/stream_compaction_old_clas.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | /*
 21 |   
 22 |   Shader Description
 23 |   ==================
 24 |   
 25 |     Note: The sample showcases two ways to manage CLAS memory on the device.
 26 |     One using a persistent allocator system (`stream_allocator...` files),
 27 |     and one using a simple compaction scheme (`stream_compaction...` files).
 28 |     This file is part of the compaction scheme.
 29 |   
 30 |   This compute shader compacts / defrags cluster CLAS storage
 31 |   of all previously active resident groups.
 32 |   
 33 |   A thread represents one resident group.
 34 | */
 35 | 
 36 | #version 460
 37 | 
 38 | #extension GL_GOOGLE_include_directive : enable
 39 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 40 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 41 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 42 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 43 | #extension GL_EXT_buffer_reference : enable
 44 | #extension GL_EXT_buffer_reference2 : enable
 45 | #extension GL_EXT_scalar_block_layout : enable
 46 | #extension GL_EXT_shader_atomic_int64 : enable
 47 | 
 48 | #extension GL_EXT_control_flow_attributes : require
 49 | #extension GL_KHR_shader_subgroup_vote : require
 50 | #extension GL_KHR_shader_subgroup_ballot : require
 51 | #extension GL_KHR_shader_subgroup_shuffle : require
 52 | #extension GL_KHR_shader_subgroup_basic : require
 53 | #extension GL_KHR_shader_subgroup_clustered : require
 54 | #extension GL_KHR_shader_subgroup_arithmetic : require
 55 | #extension GL_EXT_shader_subgroup_extended_types_int64 : require
 56 | 
 57 | #include "shaderio.h"
 58 | 
 59 | ////////////////////////////////////////////
 60 | // Descriptor-set 0 resources; the BINDINGS_... indices are expected to come from `shaderio.h`.
 61 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 62 | {
 63 |   Readback readback;
 64 | };
 65 | // NOTE(review): `readback` above and `geometries` below are not referenced by this file's main().
 66 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 67 | {
 68 |   Geometry geometries[];
 69 | };
 70 | // Two blocks typed SceneStreaming: main() issues its atomicAdd calls on `streamingRW` and does every other access through `streaming`.
 71 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer
 72 | {
 73 |   SceneStreaming streaming;
 74 | };
 75 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW
 76 | {
 77 |   SceneStreaming streamingRW;
 78 | };
 79 | // NOTE(review): presumably both blocks are backed by the same buffer contents - confirm on the host side.
 80 | ////////////////////////////////////////////
 81 | // 1D workgroup; main() indexes the active-group list with gl_GlobalInvocationID.x.
 82 | layout(local_size_x=STREAM_COMPACTION_OLD_CLAS_WORKGROUP) in;
 83 | 
 84 | ////////////////////////////////////////////
 85 | 
 86 | void main()
 87 | {
 88 |   // One invocation per entry of the active-group list. The entry is fetched ahead of the range test,
 89 |   // which is fine as the list is guaranteed to be sized as multiple of STREAM_COMPACTION_OLD_CLAS_WORKGROUP.
 90 |   uint listIndex    = gl_GlobalInvocationID.x;
 91 |   uint residentSlot = streaming.resident.activeGroups.d[listIndex];
 92 | 
 93 |   // Old resident groups come first in the list; entries at or beyond `loadActiveGroupsOffset`
 94 |   // are the newly loaded groups, which are ignored here.
 95 |   if (listIndex >= streaming.update.loadActiveGroupsOffset)
 96 |   {
 97 |     return;
 98 |   }
 99 | 
100 |   // Walk over all clusters of this old resident group and compact their CLAS storage,
101 |   // so that the newly built CLAS can be appended to the end.
102 |   // This results in a lot of CLAS movement and is not recommended, but it avoids
103 |   // a more sophisticated CLAS allocation scheme.
104 |   Group grp = streaming.resident.groups.d[residentSlot].group.d;
105 | 
106 |   // TODO improve divergence
107 |   for (uint i = 0; i < grp.clusterCount; i++)
108 |   {
109 |     uint clusterSlot = grp.clusterResidentID + i;
110 | 
111 |     uint     oldSize    = streaming.resident.clasSizes.d[clusterSlot];
112 |     uint64_t oldAddress = streaming.resident.clasAddresses.d[clusterSlot];
113 | 
114 |     // the value returned by the atomic is this cluster's byte offset from the base address
115 |     uint64_t packedOffset = atomicAdd(streamingRW.update.moveClasSize, uint64_t(oldSize));
116 |     uint64_t newAddress   = packedOffset + streaming.resident.clasBaseAddress;
117 | 
118 |     // identical addresses need no move (in reality this will hardly happen due to the
119 |     // non-deterministic nature of atomicAdd); the counter atomic is still issued, adding zero
120 |     bool needsMove = newAddress != oldAddress;
121 |     uint moveSlot  = atomicAdd(streamingRW.update.moveClasCounter, needsMove ? 1 : 0);
122 | 
123 |     if (!needsMove)
124 |     {
125 |       continue;
126 |     }
127 | 
128 |     // set up the move to the new destination, then update the resident cluster tables
129 |     streaming.update.moveClasSrcAddresses.d[moveSlot] = oldAddress;
130 |     streaming.update.moveClasDstAddresses.d[moveSlot] = newAddress;
131 |     streaming.resident.clasAddresses.d[clusterSlot]   = newAddress;
132 |     streaming.resident.clasSizes.d[clusterSlot]       = oldSize;
133 |   }
134 | }
135 | 
136 | 


--------------------------------------------------------------------------------
/shaders/stream_setup.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | /*
 21 |   
 22 |   Shader Description
 23 |   ==================
 24 |   
 25 |   This compute shader does a few simple operations that require only a single thread.
 26 | 
 27 |   STREAM_SETUP_... are enums for the various operations
 28 |   
 29 | */
 30 | 
 31 | #version 460
 32 | 
 33 | #extension GL_GOOGLE_include_directive : enable
 34 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 38 | #extension GL_EXT_buffer_reference : enable
 39 | #extension GL_EXT_buffer_reference2 : enable
 40 | #extension GL_EXT_scalar_block_layout : enable
 41 | #extension GL_EXT_shader_atomic_int64 : enable
 42 | 
 43 | #extension GL_EXT_control_flow_attributes : require
 44 | #extension GL_KHR_shader_subgroup_ballot : require
 45 | #extension GL_KHR_shader_subgroup_shuffle : require
 46 | #extension GL_KHR_shader_subgroup_basic : require
 47 | #extension GL_KHR_shader_subgroup_clustered : require
 48 | #extension GL_KHR_shader_subgroup_arithmetic : require
 49 | 
 50 | #include "shaderio.h"
 51 | // Push constant: `setup` is compared against the STREAM_SETUP_... values in main() to pick the operation.
 52 | layout(push_constant) uniform pushData
 53 | {
 54 |   uint setup;
 55 | } push;
 56 | // Descriptor-set 0 resources; the BINDINGS_... indices are expected to come from `shaderio.h`.
 57 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
 58 | {
 59 |   FrameConstants view;
 60 | };
 61 | // NOTE(review): `view` above as well as `readback` and `geometries` below are not referenced by this file's main().
 62 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 63 | {
 64 |   Readback readback;
 65 | };
 66 | 
 67 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 68 | {
 69 |   Geometry geometries[];
 70 | };
 71 | // `streaming` is the uniform-block view; main() reads its members and writes only through its `.d` buffer references.
 72 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer
 73 | {
 74 |   SceneStreaming streaming;
 75 | };
 76 | // `streamingRW` is the storage-block view (declared `coherent` in this shader); main() writes its members directly.
 77 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) coherent buffer streamingBufferRW
 78 | {
 79 |   SceneStreaming streamingRW;
 80 | };
 81 | // NOTE(review): presumably both blocks are backed by the same buffer contents - confirm on the host side.
 82 | ////////////////////////////////////////////
 83 | // a workgroup consists of a single invocation
 84 | layout(local_size_x=1) in;
 85 | 
 86 | ////////////////////////////////////////////
 87 | 
 88 | void main() // workgroup size is 1; `push.setup` selects which one of the operations below is executed
 89 | {
 90 |   if (push.setup == STREAM_SETUP_COMPACTION_OLD_NO_UNLOADS)
 91 |   {
 92 |     // we will not do compaction of old when there are no unloads.
 93 |     // However, appending new CLAS still depends on `moveClasSize` being configured
 94 |     // correctly, so that we will append after it.
 95 |     
 96 |     // first streaming frame has special rule
 97 |     // (note we start at frame 1 not 0)
 98 |     if (streaming.frameIndex == 1)
 99 |     {
100 |       // reset the persistent stored value to zero
101 |       streaming.resident.clasCompactionUsedSize.d[0] = 0;
102 |       streamingRW.update.moveClasSize = 0;
103 |     }
104 |     else {    
105 |       streamingRW.update.moveClasSize = streaming.resident.clasCompactionUsedSize.d[0]; // continue after the size persisted by STREAM_SETUP_COMPACTION_STATUS
106 |     }
107 |   }
108 |   else if (push.setup == STREAM_SETUP_COMPACTION_STATUS)
109 |   {
110 |     // move compaction for clas memory management
111 |     if (streaming.update.patchGroupsCount > 0) {
112 |       // persistently store the total compacted clas size
113 |       streaming.resident.clasCompactionUsedSize.d[0] = streamingRW.update.moveClasSize;
114 |       // for readback
115 |       streamingRW.request.clasCompactionUsedSize = streamingRW.update.moveClasSize;
116 |       streamingRW.request.clasCompactionCount    = streamingRW.update.moveClasCounter;
117 |     }
118 |     else {
119 |       // no update, pull value from persistent storage
120 |       streamingRW.request.clasCompactionUsedSize = streaming.resident.clasCompactionUsedSize.d[0];
121 |       streamingRW.request.clasCompactionCount    = 0;
122 |     }
123 |   }
124 |   else if (push.setup == STREAM_SETUP_ALLOCATOR_FREEINSERT)
125 |   {
126 |     uint freeGaps = streaming.clasAllocator.freeGapsCounter; // read through the uniform view; the storage-view counter is reset below
127 |     uint maxFreeGaps = (streaming.clasAllocator.sectorCount << streaming.clasAllocator.sectorSizeShift); // upper bound used to clamp the dispatch size
128 |   
129 |     // reset to zero for `stream_allocator_setup_insertion.comp.glsl`
130 |     streamingRW.clasAllocator.freeGapsCounter = 0;
131 |     
132 |     // and setup actual dispatch that inserts the freegaps into the lists 
133 |     // within `stream_allocator_freegaps_insert.comp.glsl` (gap count clamped to maxFreeGaps and rounded up to whole workgroups)
134 |     streamingRW.clasAllocator.dispatchFreeGapsInsert.gridX = (min(freeGaps,maxFreeGaps) + STREAM_ALLOCATOR_FREEGAPS_INSERT_WORKGROUP -1) / STREAM_ALLOCATOR_FREEGAPS_INSERT_WORKGROUP;
135 |     streamingRW.clasAllocator.dispatchFreeGapsInsert.gridY = 1;
136 |     streamingRW.clasAllocator.dispatchFreeGapsInsert.gridZ = 1;
137 |   #if STREAMING_DEBUG_USEDBITS_COUNT
138 |     // error check allocation state prior adding new groups:
139 |     uint64_t allocatedSize = streaming.clasAllocator.stats.d.allocatedSize;    
140 |     if (streaming.clasAllocator.usedBitsCount > 0 && 
141 |         allocatedSize != uint64_t(streaming.clasAllocator.usedBitsCount) << streaming.clasAllocator.granularityByteShift)
142 |     {
143 |       streamingRW.request.errorClasUsedVsAlloc = int(allocatedSize >> streaming.clasAllocator.granularityByteShift) - int(streaming.clasAllocator.usedBitsCount); // difference in granularity units
144 |     }
145 |   #endif
146 |   }
147 |   else if (push.setup == STREAM_SETUP_ALLOCATOR_STATUS)
148 |   {
149 |     if (streaming.frameIndex == 1)
150 |     {
151 |       // seed all available for first frame
152 |       uint clasAllocatedMaxSizedLeft = streaming.clasAllocator.sectorMaxAllocationSized * streaming.clasAllocator.sectorCount;
153 |       streaming.clasAllocator.stats.d.allocatedSize = 0;
154 |       streaming.clasAllocator.stats.d.wastedSize    = streaming.clasAllocator.baseWastedSize << streaming.clasAllocator.granularityByteShift;
155 |       streaming.resident.clasAllocatedMaxSizedLeft.d[0] = clasAllocatedMaxSizedLeft;
156 |       streamingRW.request.clasAllocatedMaxSizedLeft     = clasAllocatedMaxSizedLeft;
157 |     }
158 |     else {
159 |       // persistent allocator for clas memory management
160 |       if (streaming.update.patchGroupsCount > 0) {
161 |         // count can be negative, hence the clamp to zero before converting to uint
162 |         uint clasAllocatedMaxSizedLeft = uint(max(0,streaming.clasAllocator.freeSizeRanges.d[streaming.clasAllocator.maxAllocationSize-1].count));
163 |         streaming.resident.clasAllocatedMaxSizedLeft.d[0] = clasAllocatedMaxSizedLeft;
164 |         streamingRW.request.clasAllocatedMaxSizedLeft     = clasAllocatedMaxSizedLeft;
165 |       }
166 |       else {
167 |         // no update, pull value from persistent storage
168 |         streamingRW.request.clasAllocatedMaxSizedLeft = streaming.resident.clasAllocatedMaxSizedLeft.d[0];
169 |       }
170 |     }
171 |     // in every case copy the current allocator statistics into the request struct
172 |     streamingRW.request.clasAllocatedUsedSize   = streaming.clasAllocator.stats.d.allocatedSize;
173 |     streamingRW.request.clasAllocatedWastedSize = streaming.clasAllocator.stats.d.wastedSize;
174 |   }
175 | }


--------------------------------------------------------------------------------
/shaders/stream_update_scene.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | /*
 21 |   
 22 |   Shader Description
 23 |   ==================
 24 |   
 25 |   This compute shader handles updating the scene.
 26 |   Previous requests to load/unload have been completed and
 27 |   are provided for patching the scene.
 28 |   
 29 |   Effectively we are manipulating the geometries' 
 30 |   `streamingGroupAddresses` array that points to the resident
 31 |   memory location of a group (or tags it invalid).
 32 |   
 33 |   Furthermore when ray tracing is required we prepare building
 34 |   new CLAS for the loaded groups' clusters.
 35 |   
 36 |   After building is completed we run `stream_compaction_new_clas.comp.glsl` (compaction scheme)
 37 |   to move them from temporary to final location.
 38 | 
 39 |   A thread represents a single patch operation, which takes care of
 40 |   one group.
 41 | */
 42 | 
 43 | #version 460
 44 | 
 45 | #extension GL_GOOGLE_include_directive : enable
 46 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 47 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 48 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 49 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 50 | #extension GL_EXT_buffer_reference : enable
 51 | #extension GL_EXT_buffer_reference2 : enable
 52 | #extension GL_EXT_scalar_block_layout : enable
 53 | #extension GL_EXT_shader_atomic_int64 : enable
 54 | 
 55 | #extension GL_EXT_control_flow_attributes : require
 56 | #extension GL_KHR_shader_subgroup_vote : require
 57 | #extension GL_KHR_shader_subgroup_ballot : require
 58 | #extension GL_KHR_shader_subgroup_shuffle : require
 59 | #extension GL_KHR_shader_subgroup_basic : require
 60 | #extension GL_KHR_shader_subgroup_clustered : require
 61 | #extension GL_KHR_shader_subgroup_arithmetic : require
 62 | 
 63 | #include "shaderio.h"
 64 | 
 65 | ////////////////////////////////////////////
 66 | // Descriptor-set 0 resources; the BINDINGS_... indices are expected to come from `shaderio.h`.
 67 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 68 | {
 69 |   Readback readback;
 70 | };
 71 | // NOTE(review): `readback` above is not referenced by this file's main(); `geometries` below is patched by it.
 72 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 73 | {
 74 |   Geometry geometries[];
 75 | };
 76 | // `streaming` is the uniform-block view; main() reads its members and writes only through its `.d` buffer references.
 77 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer
 78 | {
 79 |   SceneStreaming streaming;
 80 | };
 81 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW
 82 | {
 83 |   SceneStreaming streamingRW;
 84 | };
 85 | // `streamingRW` is only written by main() when STREAMING_DEBUG_ADDRESSES is enabled (error reporting).
 86 | ////////////////////////////////////////////
 87 | // 1D workgroup; main() indexes the patch array with gl_GlobalInvocationID.x.
 88 | layout(local_size_x=STREAM_UPDATE_SCENE_WORKGROUP) in;
 89 | 
 90 | ////////////////////////////////////////////
 91 | 
 92 | void main()
 93 | {
 94 |   // can load pre-emptively given the array is guaranteed to be sized as multiple of STREAM_UPDATE_SCENE_WORKGROUP
 95 |   
 96 |   uint threadID = gl_GlobalInvocationID.x;  
 97 |   // one invocation per patch; patches [0, patchUnloadGroupsCount) are unloads, the rest up to patchGroupsCount are loads
 98 |   // works for both load and unload
 99 |   StreamingPatch spatch = streaming.update.patches.d[threadID];
100 |   // invocations at or beyond patchGroupsCount do nothing
101 |   if (threadID < streaming.update.patchGroupsCount)
102 |   {
103 |   #if STREAMING_DEBUG_ADDRESSES
104 |     uint oldResidentID = 0;
105 |     if (threadID < streaming.update.patchUnloadGroupsCount)
106 |     {
107 |       Group group = Group_in(geometries[spatch.geometryID].streamingGroupAddresses.d[spatch.groupIndex]).d;
108 |       oldResidentID = group.residentID;
109 |     }
110 |   #endif
111 |     // redirect the geometry's group entry to the patch's address (debug builds captured the old resident ID above, before this overwrite)
112 |     geometries[spatch.geometryID].streamingGroupAddresses.d[spatch.groupIndex] = spatch.groupAddress;
113 |     
114 |     if (threadID < streaming.update.patchUnloadGroupsCount)
115 |     {
116 |     #if STREAMING_DEBUG_ADDRESSES
117 |       streaming.resident.groups.d[oldResidentID].group = Group_in(STREAMING_INVALID_ADDRESS_START);
118 |     #endif
119 |     }
120 |     else
121 |     {
122 |       uint loadGroupIndex = threadID - streaming.update.patchUnloadGroupsCount;
123 |       // the loaded group's header is read from the patch address
124 |       Group group = Group_in(spatch.groupAddress).d;
125 |     
126 |       uint groupResidentID = group.residentID;
127 |       StreamingGroup residentGroup;
128 |       residentGroup.clusterCount = group.clusterCount;
129 |       residentGroup.age = 0;
130 |       residentGroup.group = Group_in(spatch.groupAddress);
131 |     #if STREAMING_DEBUG_ADDRESSES
132 |       if (uint64_t(streaming.resident.groups.d[groupResidentID].group) < STREAMING_INVALID_ADDRESS_START)
133 |         streamingRW.request.errorUpdate = groupResidentID;
134 |     #endif
135 |       
136 |       // update description in residency table
137 |       streaming.resident.groups.d[groupResidentID] = residentGroup;
138 | 
139 |       // insert ourselves into the list of all active groups
140 |       streaming.resident.activeGroups.d[streaming.update.loadActiveGroupsOffset + loadGroupIndex] = groupResidentID;
141 |       
142 |       // We might have a bit of divergence here, but shouldn't be a mission critical issue
143 |       
144 |       // All new groups need to build new clusters.
145 |       // These are built into scratch space first, and then moved to final locations.
146 |       
147 |       uint newBuildOffset = group.streamingNewBuildOffset;
148 |       for (uint c = 0; c < group.clusterCount; c++)
149 |       {
150 |         uint clusterResidentID = group.clusterResidentID + c;
151 |         // cluster headers are addressed at groupAddress + Group_size + Cluster_size * c
152 |         Cluster_in clusterRef = Cluster_in(spatch.groupAddress + Group_size + Cluster_size * c);
153 |         streaming.resident.clusters.d[clusterResidentID] = uint64_t(clusterRef);
154 |         
155 |       #if TARGETS_RAY_TRACING
156 |         Cluster cluster = clusterRef.d;
157 |         // fill one CLAS build description per cluster
158 |         ClasBuildInfo buildInfo;
159 |         buildInfo.clusterID    = clusterResidentID;
160 |         buildInfo.clusterFlags = 0;
161 |         
162 |         buildInfo.packed = 0;
163 |         buildInfo.packed |= PACKED_FLAG(ClasBuildInfo_packed_triangleCount, cluster.triangleCountMinusOne+1);
164 |         buildInfo.packed |= PACKED_FLAG(ClasBuildInfo_packed_vertexCount, cluster.vertexCountMinusOne+1);
165 |         buildInfo.packed |= PACKED_FLAG(ClasBuildInfo_packed_indexType, 1);
166 |         // NOTE(review): index type 1 together with indexBufferStride 1 presumably denotes 8-bit indices - confirm against the cluster build API
167 |         buildInfo.baseGeometryIndexAndFlags = ClasGeometryFlag_OPAQUE_BIT_NV;
168 |         
169 |         buildInfo.indexBufferStride                 = uint16_t(1);
170 |         buildInfo.vertexBufferStride                = uint16_t(4 * 4);
171 |         buildInfo.geometryIndexAndFlagsBufferStride = uint16_t(0);
172 |         buildInfo.opacityMicromapIndexBufferStride  = uint16_t(0);
173 |     
174 |         buildInfo.vertexBuffer = uint64_t(cluster.vertices);
175 |         buildInfo.indexBuffer  = uint64_t(cluster.localTriangles);
176 |         
177 |         buildInfo.geometryIndexAndFlagsBuffer = 0;
178 |         buildInfo.opacityMicromapArray        = 0;
179 |         buildInfo.opacityMicromapIndexBuffer  = 0;
180 |         // build info and resident ID are stored at the same slot: the group's new-build offset plus the cluster index
181 |         streaming.update.newClasBuilds.d[newBuildOffset + c]      = buildInfo;
182 |         streaming.update.newClasResidentIDs.d[newBuildOffset + c] = clusterResidentID;
183 |       #endif
184 |       }
185 |     }
186 |   }
187 | }
188 | 
189 | 


--------------------------------------------------------------------------------
/shaders/traversal_init.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | /*
 21 |   
 22 |   Shader Description
 23 |   ==================
 24 |   
 25 |   This compute shader initializes the traversal queue with the 
 26 |   root nodes of the lod hierarchy of rendered instances.
 27 | 
 28 |   A thread represents one instance.
 29 | */
 30 | 
 31 | #version 460
 32 | 
 33 | #extension GL_GOOGLE_include_directive : enable
 34 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 38 | #extension GL_EXT_buffer_reference : enable
 39 | #extension GL_EXT_buffer_reference2 : enable
 40 | #extension GL_EXT_scalar_block_layout : enable
 41 | #extension GL_EXT_shader_atomic_int64 : enable
 42 | 
 43 | #extension GL_EXT_control_flow_attributes : require
 44 | #extension GL_KHR_shader_subgroup_vote : require
 45 | #extension GL_KHR_shader_subgroup_ballot : require
 46 | #extension GL_KHR_shader_subgroup_shuffle : require
 47 | #extension GL_KHR_shader_subgroup_basic : require
 48 | #extension GL_KHR_shader_subgroup_clustered : require
 49 | #extension GL_KHR_shader_subgroup_arithmetic : require
 50 | 
 51 | #extension GL_NV_shader_subgroup_partitioned : require
 52 | 
 53 | #include "shaderio.h"
 54 | 
 55 | ////////////////////////////////////////////
 56 | // Descriptor-set 0 resources; the BINDINGS_... indices are expected to come from `shaderio.h`.
 57 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
 58 | {
 59 |   FrameConstants view;
 60 |   FrameConstants viewLast;
 61 | };
 62 | // NOTE(review): `view`, `viewLast`, `readback` and `texHizFar` are not referenced by this file's main(); presumably the helpers of `culling.glsl` (included further down) use some of them - confirm.
 63 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
 64 | {
 65 |   Readback readback;
 66 | };
 67 | 
 68 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer
 69 | {
 70 |   RenderInstance instances[];
 71 | };
 72 | 
 73 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
 74 | {
 75 |   Geometry geometries[];
 76 | };
 77 | 
 78 | layout(binding = BINDINGS_HIZ_TEX)  uniform sampler2D texHizFar;
 79 | // Two blocks typed SceneBuilding: main() issues its atomicAdd on `buildRW` and does every other access through `build`.
 80 | layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer
 81 | {
 82 |   SceneBuilding build;  
 83 | };
 84 | 
 85 | layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) buffer buildBufferRW
 86 | {
 87 |   SceneBuilding buildRW;  
 88 | };
 89 | // NOTE(review): presumably both blocks are backed by the same buffer contents - confirm on the host side.
 90 | 
 91 | ////////////////////////////////////////////
 92 | // 1D workgroup; main() maps gl_GlobalInvocationID.x to an instance.
 93 | layout(local_size_x=TRAVERSAL_INIT_WORKGROUP) in;
 94 | 
 95 | #include "culling.glsl"
 96 | 
 97 | ////////////////////////////////////////////
 98 | 
 99 | void main()
100 | {
101 |   uint instanceID   = gl_GlobalInvocationID.x;
102 |   uint instanceLoad = min(build.numRenderInstances-1, instanceID);
103 |   bool isValid      = instanceID == instanceLoad;
104 |   // Invocations past the last instance load the clamped (last) instance and are flagged invalid; they still take part in the subgroup operations below.
105 | #if USE_SORTING
106 |   instanceLoad = build.instanceSortValues.d[instanceLoad];
107 |   instanceID   = instanceLoad;
108 | #endif
109 |   // NOTE: with USE_SORTING both IDs now hold the same sorted value, so only `isValid` can identify padding invocations from here on.
110 |   // TODO optimization:
111 |   // For better loading behavior when streaming, the instances should be sorted
112 |   // relative to camera position.
113 |   
114 |   RenderInstance instance = instances[instanceLoad];
115 |   Geometry geometry = geometries[instance.geometryID];
116 |   
117 |   vec4 clipMin;
118 |   vec4 clipMax;
119 |   bool clipValid;
120 |   
121 |   uint status = 0;
122 |   // visible = in frustum and, when the clip-space box is valid, additionally passing the size and hiz tests
123 |   bool inFrustum = intersectFrustum(geometry.bbox.lo, geometry.bbox.hi, instance.worldMatrix, clipMin, clipMax, clipValid);
124 |   bool isVisible = inFrustum && (!clipValid || (intersectSize(clipMin, clipMax) && intersectHiz(clipMin, clipMax)));
125 |   
126 |   status  = (inFrustum ? INSTANCE_FRUSTUM_BIT : 0) |
127 |             (isVisible ? INSTANCE_VISIBLE_BIT : 0);
128 |   
129 |   // the root node is enqueued for every valid instance; visibility only gates it when culling is enabled for rasterization
130 |   bool doNode = isValid
131 |   #if USE_CULLING && TARGETS_RASTERIZATION
132 |     && isVisible
133 |   #endif
134 |     ;
135 |   uvec4 voteNodes = subgroupBallot(doNode);
136 |   
137 |   // TODO optimization: enqueue all root children, so traversal can start with more nodes immediately
138 |   // TODO feature: allow single-lod level render option by picking a single appropriate child of the root node
139 |   // The root hierarchy node of a geometry is up to 32 wide, and each child represents one distinct lod level.
140 |   // one atomicAdd per subgroup reserves a contiguous range of output slots for all voting invocations
141 |   uint offsetNodes = 0;
142 |   if (subgroupElect())
143 |   {
144 |     offsetNodes = atomicAdd(buildRW.traversalTaskCounter, int(subgroupBallotBitCount(voteNodes)));
145 |   }
146 |   // each invocation's slot = subgroup base offset + number of voting invocations with a lower index
147 |   offsetNodes = subgroupBroadcastFirst(offsetNodes);  
148 |   offsetNodes += subgroupBallotExclusiveBitCount(voteNodes);
149 |   // slots at or beyond maxTraversalInfos are dropped (the counter itself is not clamped here)
150 |   if (doNode && offsetNodes < build.maxTraversalInfos) {
151 |     uint packedNode = geometry.nodes.d[0].packed;
152 |     TraversalInfo traversalInfo;
153 |     traversalInfo.instanceID = instanceID;
154 |     traversalInfo.packedNode = packedNode;
155 |     build.traversalNodeInfos.d[offsetNodes] = packTraversalInfo(traversalInfo);
156 |   }
157 |   // per-instance state is written once per valid invocation; comparing instanceID with instanceLoad would always pass under USE_SORTING
158 |   #if TARGETS_RAY_TRACING
159 |   if (isValid) {
160 |     build.instanceStates.d[instanceID] = status;
161 |     build.blasBuildInfos.d[instanceID].clusterReferencesCount = 0;
162 |     build.blasBuildInfos.d[instanceID].clusterReferencesStride = 8;
163 |   }
164 |   #endif
165 | }


--------------------------------------------------------------------------------
/shaders/traversal_presort.comp.glsl:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | 
 20 | /*
 21 |   
 22 |   Shader Description
 23 |   ==================
 24 |   
 25 |   This compute shader computes the distance of the instance to the camera.
 26 | 
 27 |   A thread represents one instance.
 28 | */
 29 | 
 30 | #version 460
 31 | 
 32 | #extension GL_GOOGLE_include_directive : enable
 33 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
 34 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
 37 | #extension GL_EXT_buffer_reference : enable
 38 | #extension GL_EXT_buffer_reference2 : enable
 39 | #extension GL_EXT_scalar_block_layout : enable
 40 | #extension GL_EXT_shader_atomic_int64 : enable
 41 | 
 42 | #extension GL_EXT_control_flow_attributes : require
 43 | #extension GL_KHR_shader_subgroup_vote : require
 44 | #extension GL_KHR_shader_subgroup_ballot : require
 45 | #extension GL_KHR_shader_subgroup_shuffle : require
 46 | #extension GL_KHR_shader_subgroup_basic : require
 47 | #extension GL_KHR_shader_subgroup_clustered : require
 48 | #extension GL_KHR_shader_subgroup_arithmetic : require
 49 | 
 50 | #extension GL_NV_shader_subgroup_partitioned : require
 51 | 
 52 | #include "shaderio.h"
 53 | 
 54 | ////////////////////////////////////////////
 55 | 
// Per-frame constants for the current and the previous frame.
// main() in this shader only reads view.viewPos.
layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
{
  FrameConstants view;
  FrameConstants viewLast;
};

// Declared for binding compatibility; not referenced by main() in this shader.
layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
{
  Readback readback;
};

// One RenderInstance per rendered instance; indexed by the (clamped) invocation id.
layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer
{
  RenderInstance instances[];
};

// Geometry records, looked up through RenderInstance::geometryID.
layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
{
  Geometry geometries[];
};

// Declared for binding compatibility; not sampled by main() in this shader.
layout(binding = BINDINGS_HIZ_TEX)  uniform sampler2D texHizFar;

// Read-only view of the scene building state. main() reads numRenderInstances
// and stores the sort keys/values through the instanceSortKeys / instanceSortValues
// members (presumably buffer references, see GL_EXT_buffer_reference above).
layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer
{
  SceneBuilding build;  
};

// Writable alias of the scene building state; not referenced by main() in this shader.
layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) buffer buildBufferRW
{
  SceneBuilding buildRW;  
};


////////////////////////////////////////////

// 1D workgroup; one invocation handles one instance.
layout(local_size_x=TRAVERSAL_PRESORT_WORKGROUP) in;
 93 | 
 94 | ////////////////////////////////////////////
 95 | 
 96 | void main()
 97 | {
 98 |   uint instanceID = gl_GlobalInvocationID.x;
 99 |   uint instanceLoad = min(build.numRenderInstances-1, instanceID);
100 |   
101 |   RenderInstance instance = instances[instanceLoad];
102 |   Geometry geometry = geometries[instance.geometryID];
103 |   
104 |   mat4 worldToObject = inverse(instance.worldMatrix);
105 |   
106 |   vec3 oPos = (worldToObject * vec4(view.viewPos.xyz,1)).xyz;
107 |   
108 |   bool isInside = all(equal(greaterThanEqual(oPos, geometry.bbox.lo),lessThanEqual(oPos, geometry.bbox.hi)));
109 |   
110 |   vec3 oPosClamp = isInside ? (geometry.bbox.lo + geometry.bbox.hi) * 0.5 :
111 |     clamp(oPos, geometry.bbox.lo, geometry.bbox.hi);
112 |   
113 |   vec4 wPos = instance.worldMatrix * vec4(oPosClamp, 1);
114 |   
115 |   if (instanceID == instanceLoad) {
116 |     build.instanceSortValues.d[instanceID] = instanceID;
117 |     build.instanceSortKeys.d[instanceID]   = floatBitsToUint(distance(wPos.xyz, view.viewPos.xyz));
118 |   }
119 | }


--------------------------------------------------------------------------------
/src/cgltf.cpp:
--------------------------------------------------------------------------------
1 | 
2 | #define  CGLTF_IMPLEMENTATION
3 | #include <cgltf.h>
4 | 


--------------------------------------------------------------------------------
/src/hbao_pass.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2018-2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  *
 16 |  * SPDX-FileCopyrightText: Copyright (c) 2018-2025 NVIDIA CORPORATION
 17 |  * SPDX-License-Identifier: Apache-2.0
 18 |  */
 19 | 
 20 | 
 21 | #ifndef HBAOPASS_H__
 22 | #define HBAOPASS_H__
 23 | 
 24 | #include <assert.h>
 25 | 
 26 | #include <vulkan/vulkan_core.h>
 27 | #include <nvvk/shadermodulemanager_vk.hpp>
 28 | #include <nvvk/descriptorsets_vk.hpp>
 29 | #include <nvvk/resourceallocator_vk.hpp>
 30 | #include <nvh/trangeallocator.hpp>
 31 | #include <glm/glm.hpp>
 32 | 
 33 | //////////////////////////////////////////////////////////////////////////
 34 | 
 35 | /// HbaoSystem implements a screen-space
 36 | /// ambient occlusion effect using
 37 | /// horizon-based ambient occlusion.
 38 | /// See https://github.com/nvpro-samples/gl_ssao
 39 | /// for more details
 40 | 
 41 | class HbaoPass
 42 | {
 43 | public:
 44 |   static const int RANDOM_SIZE     = 4;
 45 |   static const int RANDOM_ELEMENTS = RANDOM_SIZE * RANDOM_SIZE;
 46 | 
 47 |   struct Config
 48 |   {
 49 |     VkFormat targetFormat;
 50 |     uint32_t maxFrames;
 51 |   };
 52 | 
 53 |   void init(VkDevice device, nvvk::ResourceAllocator* allocator, nvvk::ShaderModuleManager* shaderManager, const Config& config);
 54 |   void reloadShaders();
 55 |   void deinit();
 56 | 
 57 |   struct FrameConfig
 58 |   {
 59 |     bool blend;
 60 | 
 61 |     uint32_t sourceWidthScale;
 62 |     uint32_t sourceHeightScale;
 63 | 
 64 |     uint32_t targetWidth;
 65 |     uint32_t targetHeight;
 66 | 
 67 |     VkDescriptorImageInfo sourceDepth;
 68 |     VkDescriptorImageInfo targetColor;
 69 |   };
 70 | 
 71 |   struct FrameIMGs
 72 |   {
 73 |     nvvk::Texture depthlinear, viewnormal, result, blur, resultarray, deptharray;
 74 |   };
 75 | 
 76 |   struct Frame
 77 |   {
 78 |     uint32_t slot = ~0u;
 79 | 
 80 |     FrameIMGs images;
 81 |     int       width;
 82 |     int       height;
 83 | 
 84 |     FrameConfig config;
 85 |   };
 86 | 
 87 |   bool initFrame(Frame& frame, const FrameConfig& config, VkCommandBuffer cmd);
 88 |   void deinitFrame(Frame& frame);
 89 | 
 90 | 
 91 |   struct View
 92 |   {
 93 |     bool      isOrtho;
 94 |     float     nearPlane;
 95 |     float     farPlane;
 96 |     float     halfFovyTan;
 97 |     glm::mat4 projectionMatrix;
 98 |   };
 99 | 
100 |   struct Settings
101 |   {
102 |     View view;
103 | 
104 |     float unit2viewspace = 1.0f;
105 |     float intensity      = 1.0f;
106 |     float radius         = 1.0f;
107 |     float bias           = 0.1f;
108 |     float blurSharpness  = 40.0f;
109 |   };
110 | 
111 |   // before: must do appropriate barriers for color write access and depth read access
112 |   // after:  from compute write to whatever output image needs
113 |   void cmdCompute(VkCommandBuffer cmd, const Frame& frame, const Settings& settings) const;
114 | 
115 | private:
116 |   struct Shaders
117 |   {
118 |     nvvk::ShaderModuleID depth_linearize, viewnormal, blur, blur_apply, deinterleave, calc, reinterleave;
119 |   };
120 | 
121 |   struct Pipelines
122 |   {
123 |     VkPipeline depth_linearize = VK_NULL_HANDLE;
124 |     VkPipeline viewnormal      = VK_NULL_HANDLE;
125 |     VkPipeline blur            = VK_NULL_HANDLE;
126 |     VkPipeline blur_apply      = VK_NULL_HANDLE;
127 |     VkPipeline deinterleave    = VK_NULL_HANDLE;
128 |     VkPipeline calc            = VK_NULL_HANDLE;
129 |     VkPipeline reinterleave    = VK_NULL_HANDLE;
130 |   };
131 | 
132 |   VkDevice                   m_device;
133 |   nvvk::ResourceAllocator*   m_allocator;
134 |   nvvk::ShaderModuleManager* m_shaderManager;
135 |   nvh::TRangeAllocator<1>    m_slots;
136 |   Config                     m_config;
137 | 
138 |   nvvk::DescriptorSetContainer m_setup;
139 | 
140 |   nvvk::Buffer           m_ubo;
141 |   VkDescriptorBufferInfo m_uboInfo;
142 | 
143 |   VkSampler m_linearSampler;
144 | 
145 |   Shaders   m_shaders;
146 |   Pipelines m_pipelines;
147 | 
148 |   glm::vec4 m_hbaoRandom[RANDOM_ELEMENTS];
149 | 
150 |   void updatePipelines();
151 |   void updateUbo(VkCommandBuffer cmd, const Frame& frame, const Settings& settings) const;
152 | };
153 | 
154 | #endif


--------------------------------------------------------------------------------
/src/nvhiz_vk.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2022-2025, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  *
 16 |  * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION
 17 |  * SPDX-License-Identifier: Apache-2.0
 18 |  */
 19 | 
 20 | #ifndef _NVHIZ_H__
 21 | #define _NVHIZ_H__
 22 | 
 23 | #include <stdint.h>
 24 | #include <string>
 25 | #include <vector>
 26 | 
 27 | #include <platform.h>
 28 | #include <vulkan/vulkan_core.h>
 29 | 
 30 | class NVHizVK
 31 | {
 32 | private:
 33 |   enum ProgViewMode : uint32_t
 34 |   {
 35 |     PROG_VIEW_MONO,
 36 |     PROG_VIEW_STEREO,
 37 |     PROG_VIEW_COUNT,
 38 |   };
 39 | 
 40 |   enum ProgHizMode : uint32_t
 41 |   {
 42 |     PROG_HIZ_FAR,
 43 |     PROG_HIZ_FAR_AND_NEAR,
 44 |     PROG_HIZ_FAR_REST,
 45 |     PROG_HIZ_COUNT,
 46 |   };
 47 | 
 48 | public:
 49 |   static const uint32_t MAX_MIP_LEVELS = 16;
 50 |   static const uint32_t SHADER_COUNT   = (uint32_t(PROG_HIZ_COUNT) * uint32_t(PROG_VIEW_COUNT));
 51 | 
 52 |   enum BindingSlots
 53 |   {
 54 |     // keep in sync with glsl
 55 |     BINDING_READ_DEPTH,
 56 |     BINDING_READ_FAR,
 57 |     BINDING_WRITE_NEAR,
 58 |     BINDING_WRITE_FAR,
 59 |     BINDING_COUNT,
 60 |   };
 61 | 
 62 |   struct TextureInfo
 63 |   {
 64 |     // allocation
 65 |     uint32_t           width;
 66 |     uint32_t           height;
 67 |     uint32_t           mipLevels;
 68 |     VkFormat           format;
 69 |     VkImageAspectFlags aspect;
 70 | 
 71 |     // the system may use only a sub-rectangle of the allocated width/height
 72 |     // you should clamp access to this, when sampling the texture
 73 |     uint32_t usedWidth;
 74 |     uint32_t usedHeight;
 75 | 
 76 |     // xy scale and zw clamp
 77 |     // use min(uv*factor.xy,factor.zw) for lookups
 78 |     void  getShaderFactors(float factors[4]) const;
 79 |     float getSizeMax() const;
 80 |   };
 81 | 
 82 |   struct Update
 83 |   {
 84 |     // provide texture/views that are not layered
 85 |     VkImageView sourceImageView;                // 2DMS if createInfo.msaaLevel set, otherwise 2D
 86 |     VkImageView nearImageView;                  // 2D optional
 87 |     VkImageView farImageView;                   // 2D all mips
 88 |     VkImageView farImageViews[MAX_MIP_LEVELS];  // 2D single mip
 89 | 
 90 |     VkDescriptorImageInfo farImageInfo;
 91 |     VkDescriptorImageInfo nearImageInfo;
 92 | 
 93 |     VkImage sourceImage;
 94 |     VkImage nearImage;  // optional
 95 |     VkImage farImage;
 96 | 
 97 |     TextureInfo sourceInfo;
 98 |     TextureInfo farInfo;
 99 |     TextureInfo nearInfo;
100 |     bool        stereo;  // textures are layered, and updates layer 0,1
101 | 
102 |     Update() { memset(this, 0, sizeof(Update)); }
103 |   };
104 | 
105 |   struct DescriptorUpdate
106 |   {
107 |     VkWriteDescriptorSet  writeSets[BINDING_COUNT];
108 |     VkDescriptorImageInfo imageInfos[BINDING_COUNT + MAX_MIP_LEVELS - 1];
109 |   };
110 | 
111 |   struct Config
112 |   {
113 |     int  msaaSamples             = 0;
114 |     bool reversedZ               = false;
115 |     bool supportsSubGroupShuffle = false;
116 |     bool supportsMinmaxFilter    = false;
117 |   };
118 | 
119 | 
120 |   void init(VkDevice device, const Config& config, uint32_t descrSetsCount);
121 | 
122 |   VkSampler                   getReadFarSampler() const;
123 |   const VkDescriptorPoolSize* getDescriptorPoolSizes(uint32_t& count) const;
124 |   VkDescriptorSetLayout       getDescriptorSetLayout() const;
125 |   std::string                 getShaderDefines(uint32_t shader) const;
126 | #if 0
127 |   void                        appendShaderDefines(uint32_t shader, shaderc::CompileOptions& options) const;
128 | #endif
129 |   void initPipelines(const VkShaderModule modules[SHADER_COUNT]);
130 | 
131 |   void deinit();
132 | 
133 |   void setupUpdateInfos(Update& update, uint32_t width, uint32_t height, VkFormat sourceFormat, VkImageAspectFlags sourceAspect) const;
134 |   void setupDescriptorUpdate(DescriptorUpdate& updateWrite, const Update& update, VkDescriptorSet set) const;
135 | 
136 |   void cmdUpdateHiz(VkCommandBuffer cmd, const Update& update, VkDescriptorSet set) const;
137 | 
138 |   // optional utility functions
139 |   void initUpdateViews(Update& update) const;
140 |   void deinitUpdateViews(Update& update) const;
141 | 
142 |   // if descrSetsCount was non zero
143 |   void updateDescriptorSet(const Update& update, uint32_t setIdx) const;
144 |   // if descrSetsCount was non zero
145 |   void cmdUpdateHiz(VkCommandBuffer cmd, const Update& update, uint32_t setIdx) const
146 |   {
147 |     cmdUpdateHiz(cmd, update, m_descrSets[setIdx]);
148 |   }
149 | 
150 | private:
151 |   struct InternalConfig : public Config
152 |   {
153 |     uint32_t hizLevels    = 1;
154 |     uint32_t hizNearLevel = 0;
155 |     uint32_t hizFarLevel  = 0;
156 |   };
157 | 
158 |   static void getShaderIndexConfig(uint32_t index, ProgHizMode& hiz, ProgViewMode& view)
159 |   {
160 |     hiz  = ProgHizMode(index % uint32_t(PROG_HIZ_COUNT));
161 |     view = ProgViewMode(index / uint32_t(PROG_HIZ_COUNT));
162 |   }
163 | 
164 |   static uint32_t getShaderIndex(ProgHizMode hiz, ProgViewMode view) { return view * uint32_t(PROG_HIZ_COUNT) + hiz; }
165 | 
166 |   struct PushConstants
167 |   {
168 |     // keep in sync with glsl
169 |     int srcSize[4];
170 |     int writeLod;
171 |     int startLod;
172 |     int layer;
173 |     int _pad0;
174 |     int levelActive[4];
175 |   };
176 | 
177 |   void deinitPipelines();
178 | 
179 |   InternalConfig        m_config                  = {};
180 |   VkDevice              m_device                  = {};
181 |   VkSampler             m_readDepthSampler        = {};
182 |   VkSampler             m_readFarSampler          = {};
183 |   VkSampler             m_readNearSampler         = {};
184 |   VkPipeline            m_pipelines[SHADER_COUNT] = {0};
185 |   VkPipelineLayout      m_pipelineLayout          = {};
186 |   VkDescriptorSetLayout m_descrLayout             = {};
187 |   VkDescriptorPoolSize  m_poolSizes[2];
188 |   uint32_t              m_descrSetsCount = 0;
189 |   VkDescriptorPool      m_descrPool      = {};
190 |   VkDescriptorSet*      m_descrSets      = {};
191 | };
192 | 
193 | #endif
194 | 


--------------------------------------------------------------------------------
/src/renderer.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Licensed under the Apache License, Version 2.0 (the "License");
  5 | * you may not use this file except in compliance with the License.
  6 | * You may obtain a copy of the License at
  7 | *
  8 | *     http://www.apache.org/licenses/LICENSE-2.0
  9 | *
 10 | * Unless required by applicable law or agreed to in writing, software
 11 | * distributed under the License is distributed on an "AS IS" BASIS,
 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | * See the License for the specific language governing permissions and
 14 | * limitations under the License.
 15 | *
 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 17 | * SPDX-License-Identifier: Apache-2.0
 18 | */
 19 | #pragma once
 20 | 
 21 | #include <memory>
 22 | 
 23 | #include "resources.hpp"
 24 | #include "scene.hpp"
 25 | #include "scene_preloaded.hpp"
 26 | #include "scene_streaming.hpp"
 27 | 
 28 | namespace lodclusters {
 29 | 
// There are two implementations for a renderable scene.
// Everything is preloaded or we stream in data dynamically.
//
// RenderScene holds one object of each implementation plus a flag; the
// accessors below are defined elsewhere (presumably they forward to the
// implementation selected by `useStreaming` - confirm in renderer.cpp).
class RenderScene
{
public:
  const Scene*   scene        = nullptr;  // non-owning
  bool           useStreaming = false;
  ScenePreloaded scenePreloaded;
  SceneStreaming sceneStreaming;

  // pointers must stay valid during lifetime
  bool init(Resources* res, const Scene* scene_, const StreamingConfig& streamingConfig_, bool useStreaming_);
  void deinit();

  void streamingReset();

  bool updateClasRequired(bool state);

  // Buffer of shaderio::Geometry records that renderers bind.
  const RBufferTyped<shaderio::Geometry>& getShaderGeometriesBuffer() const;
  // Memory statistics; `reserved` presumably selects reserved vs. actually used bytes - TODO confirm.
  size_t                                  getClasSize(bool reserved) const;
  size_t                                  getOperationsSize() const;
  size_t                                  getGeometrySize(bool reserved) const;
};
 53 | 
// Options handed to Renderer::init().
struct RendererConfig
{
  bool flipWinding = false;
  bool twoSided    = false;
  // NOTE(review): presumably enables the distance-based instance sorting path - confirm against the renderers
  bool useSorting  = false;

  // the maximum number of renderable clusters per frame in bits i.e. (1 << number)
  uint32_t numRenderClusterBits = 20;
  // the maximum number of traversal intermediate tasks in bits, analogous to the above
  uint32_t numTraversalTaskBits = 20;

  // build flags for the cluster BLAS
  VkBuildAccelerationStructureFlagsKHR clusterBlasFlags = 0;
};
 68 | 
 69 | class Renderer
 70 | {
 71 | public:
 72 |   struct ResourceUsageInfo
 73 |   {
 74 |     size_t rtTlasMemBytes{};
 75 |     size_t rtBlasMemBytes{};
 76 |     size_t rtClasMemBytes{};
 77 |     size_t operationsMemBytes{};
 78 |     size_t geometryMemBytes{};
 79 | 
 80 |     void add(const ResourceUsageInfo& other)
 81 |     {
 82 |       rtTlasMemBytes += other.rtTlasMemBytes;
 83 |       rtBlasMemBytes += other.rtBlasMemBytes;
 84 |       rtClasMemBytes += other.rtClasMemBytes;
 85 |       operationsMemBytes += other.operationsMemBytes;
 86 |       geometryMemBytes += other.geometryMemBytes;
 87 |     }
 88 |     size_t getTotalSum() const
 89 |     {
 90 |       return rtTlasMemBytes + rtBlasMemBytes + rtClasMemBytes + geometryMemBytes + operationsMemBytes;
 91 |     }
 92 |   };
 93 | 
 94 |   virtual bool init(Resources& res, RenderScene& rscene, const RendererConfig& config) = 0;
 95 |   virtual void render(VkCommandBuffer primary, Resources& res, RenderScene& rscene, const FrameConfig& frame, nvvk::ProfilerVK& profiler) = 0;
 96 |   virtual void deinit(Resources& res) = 0;
 97 |   virtual ~Renderer() {};  // Defined only so that inherited classes also have virtual destructors. Use deinit().
 98 |   virtual void updatedFrameBuffer(Resources& res) { updatedFrameBufferBasics(res); };
 99 | 
100 |   virtual bool supportsClusters() const { return true; }
101 | 
102 |   inline ResourceUsageInfo getResourceUsage(bool reserved) const
103 |   {
104 |     return reserved ? m_resourceReservedUsage : m_resourceActualUsage;
105 |   };
106 | 
107 | protected:
108 |   bool initBasicShaders(Resources& res);
109 |   void initBasics(Resources& res, RenderScene& rscene, const RendererConfig& config);
110 |   void deinitBasics(Resources& res);
111 | 
112 |   void updatedFrameBufferBasics(Resources& res);
113 | 
114 |   void initWriteRayTracingDepthBuffer(Resources& res);
115 |   void writeRayTracingDepthBuffer(VkCommandBuffer cmd);
116 | 
117 |   void initRenderInstanceBboxes(Resources& res, RenderScene& rscene);
118 |   void renderInstanceBboxes(VkCommandBuffer cmd);
119 | 
120 |   struct BasicShaders
121 |   {
122 |     nvvk::ShaderModuleID fullScreenVertexShader;
123 |     nvvk::ShaderModuleID fullScreenWriteDepthFragShader;
124 |     nvvk::ShaderModuleID renderInstanceBboxesFragmentShader;
125 |     nvvk::ShaderModuleID renderInstanceBboxesMeshShader;
126 |   };
127 | 
128 |   BasicShaders m_basicShaders;
129 | 
130 |   std::vector<shaderio::RenderInstance> m_renderInstances;
131 |   RBuffer                               m_renderInstanceBuffer;
132 | 
133 |   ResourceUsageInfo m_resourceReservedUsage{};
134 |   ResourceUsageInfo m_resourceActualUsage{};
135 | 
136 |   nvvk::DescriptorSetContainer m_writeDepthBufferDsetContainer;
137 |   VkPipeline                   m_writeDepthBufferPipeline = nullptr;
138 | 
139 |   nvvk::DescriptorSetContainer m_renderInstanceBboxesDsetContainer;
140 |   VkPipeline                   m_renderInstanceBboxesPipeline = nullptr;
141 | 
142 |   RBuffer m_sortingAuxBuffer;
143 | };
144 | 
145 | //////////////////////////////////////////////////////////////////////////
146 | 
// Factory functions for the concrete renderer implementations (defined in
// other translation units). The caller owns the returned object.
// NOTE(review): judging by the names, one rasterizes and one ray traces the lod clusters - confirm.
std::unique_ptr<Renderer> makeRendererRasterClustersLod();
std::unique_ptr<Renderer> makeRendererRayTraceClustersLod();
149 | 
150 | }  // namespace lodclusters
151 | 


--------------------------------------------------------------------------------
/src/scene_preloaded.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License, Version 2.0 (the "License");
 5 | * you may not use this file except in compliance with the License.
 6 | * You may obtain a copy of the License at
 7 | *
 8 | *     http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *
16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
17 | * SPDX-License-Identifier: Apache-2.0
18 | */
19 | 
20 | #pragma once
21 | 
22 | #include "scene.hpp"
23 | #include "resources.hpp"
24 | #include "vk_nv_cluster_acc.h"
25 | 
26 | namespace lodclusters {
27 | 
// With this class we pre-load all lod levels of the rendered scene.
// It is much more memory intensive.
class ScenePreloaded
{
public:
  struct Config
  {
    VkBuildAccelerationStructureFlagsKHR clasBuildFlags = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR;
    uint32_t                             clasPositionTruncateBits = 0;
  };

  // pointers must stay valid during lifetime
  bool init(Resources* res, const Scene* scene, const Config& config);

  // run before the renderer starts referencing resources
  // if `state` is true, CLAS for all clusters will be built
  bool updateClasRequired(bool state);

  // tear down, safe to call without init
  void deinit();

  // renderers need to access this buffer
  const RBufferTyped<shaderio::Geometry>& getShaderGeometriesBuffer() const { return m_shaderGeometriesBuffer; }

  // device memory usage
  size_t getClasSize() const { return m_clasSize; }
  size_t getGeometrySize() const { return m_geometrySize; }
  // sum of the general and the CLAS-related operations sizes
  size_t getOperationsSize() const { return m_operationsSize + m_clasOperationsSize; }

private:
  // Device buffers kept per scene geometry.
  struct Geometry
  {
    RBufferTyped<shaderio::Node> nodes;
    RBufferTyped<shaderio::BBox> nodeBboxes;

    RBufferTyped<shaderio::Group> groups;

    RBufferTyped<uint8_t>   localTriangles;
    RBufferTyped<glm::vec4> vertices;

    RBufferTyped<shaderio::Cluster> clusters;
    RBufferTyped<uint32_t>          clusterGeneratingGroups;
    RBufferTyped<shaderio::BBox>    clusterBboxes;

    // for ray tracing
    RBufferTyped<uint64_t> clusterClasAddresses;
    RBufferTyped<uint32_t> clusterClasSizes;
    RBuffer                clasData;
  };

  Config       m_config;
  bool         m_hasClas   = false;
  Resources*   m_resources = nullptr;  // non-owning, see init()
  const Scene* m_scene     = nullptr;  // non-owning, see init()

  // byte counters reported by the getters above
  size_t m_clasSize           = 0;
  size_t m_clasOperationsSize = 0;
  size_t m_geometrySize       = 0;
  size_t m_operationsSize     = 0;

  std::vector<ScenePreloaded::Geometry> m_geometries;
  std::vector<shaderio::Geometry>       m_shaderGeometries;

  RBufferTyped<shaderio::Geometry> m_shaderGeometriesBuffer;

  bool initClas();
  void deinitClas();
};
96 | }  // namespace lodclusters
97 | 


--------------------------------------------------------------------------------
/src/vk_nv_cluster_acc.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
 3 | *
 4 | * Licensed under the Apache License, Version 2.0 (the "License");
 5 | * you may not use this file except in compliance with the License.
 6 | * You may obtain a copy of the License at
 7 | *
 8 | *     http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *
16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
17 | * SPDX-License-Identifier: Apache-2.0
18 | */
19 | 
20 | #include "vk_nv_cluster_acc.h"
21 | #include <nvvk/extensions_vk.hpp>
22 | 
// Entry points of VK_NV_cluster_acceleration_structure; null until
// load_VK_NV_cluster_accleration_structure() has fetched them from a device.
static PFN_vkGetClusterAccelerationStructureBuildSizesNV s_vkGetClusterAccelerationStructureBuildSizesNV = nullptr;
static PFN_vkCmdBuildClusterAccelerationStructureIndirectNV s_vkCmdBuildClusterAccelerationStructureIndirectNV = nullptr;

// The forwarding definitions below are only compiled when
// NVVK_HAS_VK_NV_cluster_acceleration_structure is not defined. They call
// through the pointers above without a null check, so they must not be used
// before a successful load.
#ifndef NVVK_HAS_VK_NV_cluster_acceleration_structure
VKAPI_ATTR void VKAPI_CALL vkGetClusterAccelerationStructureBuildSizesNV(VkDevice device,
                                                                         const VkClusterAccelerationStructureInputInfoNV* input,
                                                                         VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo)
{
  s_vkGetClusterAccelerationStructureBuildSizesNV(device, input, pSizeInfo);
}

VKAPI_ATTR void VKAPI_CALL vkCmdBuildClusterAccelerationStructureIndirectNV(VkCommandBuffer commandBuffer,
                                                                            const VkClusterAccelerationStructureCommandsInfoNV* cmdInfo)
{
  s_vkCmdBuildClusterAccelerationStructureIndirectNV(commandBuffer, cmdInfo);
}
#endif
40 | 
41 | VkBool32 load_VK_NV_cluster_accleration_structure(VkInstance instance, VkDevice device)
42 | {
43 |   s_vkGetClusterAccelerationStructureBuildSizesNV    = nullptr;
44 |   s_vkCmdBuildClusterAccelerationStructureIndirectNV = nullptr;
45 | 
46 |   s_vkGetClusterAccelerationStructureBuildSizesNV =
47 |       (PFN_vkGetClusterAccelerationStructureBuildSizesNV)vkGetDeviceProcAddr(device, "vkGetClusterAccelerationStructureBuildSizesNV");
48 |   s_vkCmdBuildClusterAccelerationStructureIndirectNV =
49 |       (PFN_vkCmdBuildClusterAccelerationStructureIndirectNV)vkGetDeviceProcAddr(device, "vkCmdBuildClusterAccelerationStructureIndirectNV");
50 | 
51 |   return s_vkGetClusterAccelerationStructureBuildSizesNV && s_vkCmdBuildClusterAccelerationStructureIndirectNV;
52 | }
53 | 


--------------------------------------------------------------------------------
/thirdparty/vulkan_radix_sort/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.15)
 2 | 
 3 | project(vk_radix_sort LANGUAGES C CXX CUDA)
 4 | 
 5 | set(CMAKE_CXX_STANDARD 17)
 6 | set(CMAKE_CXX_STANDARD_REQUIRED True)
 7 | 
 8 | find_package(Vulkan REQUIRED)
 9 | 
10 | # adds -fPIC, works for linux, when building shared library
11 | set(CMAKE_POSITION_INDEPENDENT_CODE ON)
12 | 
13 | # shaders
14 | file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src/generated)
15 | 
# add_shader(TARGET SHADER OUTPUT DEFINE...)
#
# Compiles SHADER with glslangValidator into src/generated/<OUTPUT>.h, using
# <OUTPUT> as the name of the generated variable (--vn). Every remaining
# argument is passed to the compiler as a -D<DEFINE> preprocessor definition.
# A custom target named <OUTPUT> is created and added as a dependency of TARGET
# so the header is generated before TARGET is built.
function(add_shader)
  # first three positional arguments; what remains in ARGV are the defines
  list(POP_FRONT ARGV TARGET SHADER OUTPUT)
  list(TRANSFORM ARGV PREPEND "-D" OUTPUT_VARIABLE DEFINES)

  get_filename_component(SHADER ${SHADER} ABSOLUTE)

  add_custom_command(
    OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/src/generated/${OUTPUT}.h
    COMMAND
      ${Vulkan_GLSLANG_VALIDATOR_EXECUTABLE}
      --target-env spirv1.5
      -V
      --vn ${OUTPUT}
      -o ${CMAKE_CURRENT_SOURCE_DIR}/src/generated/${OUTPUT}.h
      ${DEFINES}
      ${SHADER}
    DEPENDS ${SHADER}
    COMMENT "Compiling ${CMAKE_CURRENT_SOURCE_DIR}/src/generated/${OUTPUT}.h"
  )

  add_custom_target(${OUTPUT} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/generated/${OUTPUT}.h)
  add_dependencies(${TARGET} ${OUTPUT})
endfunction()
40 | 
41 | # library: static target; the add_shader calls below make it depend on the generated shader headers
42 | add_library(vk_radix_sort STATIC
43 |   src/vk_radix_sort.cc
44 | )
45 | 
46 | target_include_directories(vk_radix_sort
47 |   PUBLIC include  # directory of the public header, propagated to consumers
48 |   PRIVATE src  # presumably so the sources can include the headers under src/generated
49 | )
50 | 
51 | target_link_libraries(vk_radix_sort
52 |   PUBLIC Vulkan::Vulkan
53 | )
54 | 
55 | add_shader(vk_radix_sort src/shader/upsweep.comp upsweep_comp)
56 | add_shader(vk_radix_sort src/shader/spine.comp spine_comp)
57 | add_shader(vk_radix_sort src/shader/downsweep.comp downsweep_comp)
58 | add_shader(vk_radix_sort src/shader/downsweep.comp downsweep_key_value_comp KEY_VALUE)  # same source, compiled with -DKEY_VALUE
59 | 
60 | # bench: built only for a top-level build; PROJECT_IS_TOP_LEVEL needs CMake >= 3.21, so compare source dirs instead
61 | if (CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR)
62 |   set(BENCH_SOURCES
63 |     bench/bench.cc
64 |     bench/benchmark_factory.cc
65 |     bench/cpu_benchmark.cc
66 |     bench/data_generator.cc
67 |     bench/vma_impl.cc
68 |     bench/vulkan_benchmark.cc
69 |   )
70 | 
71 |   # if CUDA is available, add CUB benchmark
72 |   include(CheckLanguage)
73 |   check_language(CUDA)
74 |   if (CMAKE_CUDA_COMPILER)
75 |     enable_language(CUDA)
76 |     set(CMAKE_CUDA_STANDARD 17)
77 |     set(CMAKE_CUDA_STANDARD_REQUIRED True)
78 |     list(APPEND BENCH_SOURCES
79 |       bench/cuda_benchmark.cu
80 |     )
81 |   endif()
82 | 
83 |   message(STATUS "bench sources: ${BENCH_SOURCES}")  # quoted so the list entries are not glued together
84 |   add_executable(bench ${BENCH_SOURCES})
85 | 
86 |   if (CMAKE_CUDA_COMPILER)
87 |     target_compile_definitions(bench PUBLIC BENCH_CUDA)
88 |   endif()
89 | 
90 |   # if VMA is already added from parent project, skip
91 |   if (NOT TARGET VulkanMemoryAllocator)
92 |     add_subdirectory(third_party/VulkanMemoryAllocator EXCLUDE_FROM_ALL)
93 |   endif()
94 | 
95 |   target_link_libraries(bench PRIVATE vk_radix_sort VulkanMemoryAllocator)
96 | endif()
97 | 


--------------------------------------------------------------------------------
/thirdparty/vulkan_radix_sort/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 jaesung-cs
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/thirdparty/vulkan_radix_sort/README.md:
--------------------------------------------------------------------------------
  1 | # vulkan_radix_sort
  2 | 
  3 | Vulkan implementation of radix sort.
  4 | 
  5 | This library implements the reduce-then-scan GPU radix sort algorithm (the Onesweep variant was abandoned).
  6 | 
  7 | 
  8 | ## Requirements
  9 | - `VulkanSDK>=1.2`
 10 |   - Download from https://vulkan.lunarg.com/ and follow the installation instructions.
 11 |   - Requires several features available in `1.2`.
 12 |   - Must support `VK_KHR_buffer_device_address`:
 13 |     - Run `vulkaninfo` and check if `VK_KHR_buffer_device_address` device extension is available.
 14 | - `cmake>=3.15`
 15 | 
 16 | 
 17 | ## Build
 18 | ```bash
 19 | $ cmake . -B build
 20 | $ cmake --build build --config Release -j
 21 | ```
 22 | 
 23 | ## Test
 24 | ```bash
 25 | $ ./build/Release/bench.exe <N> <type>  # Windows
 26 | $ ./build/bench <N> <type>              # Linux
 27 | $ ./build/bench 10000000 vulkan
 28 | ```
 29 | - N = number of elements to sort
 30 | - type = one of cpu,vulkan,cuda
 31 | 
 32 | 
 33 | ### Test Environment
 34 | - Windows, NVIDIA GeForce RTX 4090.
 35 | 
 36 | 
 37 | ### Benchmark Result
 38 | - Not precisely benchmarked, but the speed is competitive compared to CUB radix sort.
 39 | - 32-bit key-only: my implementation is 10% slower when sorting 33M (2^25) elements.
 40 | - 32-bit Key-value: my implementation is 15-25% faster when sorting 33M (2^25) key-value pairs.
 41 | - Note that CUB radix sort is not an in-place operation. It may require an additional copy operation, or double the storage.
 42 | - vulkan
 43 |   ```bash
 44 |   > .\build\Release\bench.exe 33554432 vulkan
 45 |   vk_radix_sort benchmark
 46 |   ================ sort ================
 47 |   total time: 2.67571ms (12.5404 GItems/s)
 48 |   ================ sort key value ================
 49 |   total time: 3.42221ms (9.80491 GItems/s)
 50 |   ================ sort key value speed ================
 51 |   [0] total time: 3.41706ms (9.81969 GItems/s)
 52 |   [1] total time: 3.43142ms (9.77857 GItems/s)
 53 |   [2] total time: 3.42298ms (9.80271 GItems/s)
 54 |   [3] total time: 3.46208ms (9.69199 GItems/s)
 55 |   [4] total time: 3.42426ms (9.79904 GItems/s)
 56 |   [5] total time: 3.43725ms (9.762 GItems/s)
 57 |   [6] total time: 3.42016ms (9.81078 GItems/s)
 58 |   [7] total time: 3.42016ms (9.81078 GItems/s)
 59 |   [8] total time: 3.42099ms (9.80839 GItems/s)
 60 |   [9] total time: 3.41606ms (9.82254 GItems/s)
 61 |   ...
 62 |   ```
 63 | - CUDA Version 12.6 CUB
 64 |   ```bash
 65 |   > .\build\Release\bench.exe 33554432 cuda
 66 |   vk_radix_sort benchmark
 67 |   ================ sort ================
 68 |   total time: 2.5047ms (13.3966 GItems/s)
 69 |   ================ sort key value ================
 70 |   total time: 4.19226ms (8.00391 GItems/s)
 71 |   ================ sort key value speed ================
 72 |   [0] total time: 4.20352ms (7.98246 GItems/s)
 73 |   [1] total time: 4.50355ms (7.45066 GItems/s)
 74 |   [2] total time: 4.21376ms (7.96306 GItems/s)
 75 |   [3] total time: 4.22298ms (7.94568 GItems/s)
 76 |   [4] total time: 4.22208ms (7.94737 GItems/s)
 77 |   [5] total time: 4.2199ms (7.95147 GItems/s)
 78 |   [6] total time: 4.21274ms (7.965 GItems/s)
 79 |   [7] total time: 4.20352ms (7.98246 GItems/s)
 80 |   [8] total time: 4.21376ms (7.96306 GItems/s)
 81 |   [9] total time: 4.21478ms (7.96113 GItems/s)
 82 |   ...
 83 |   ```
 84 | 
 85 | ## Use as a Library with CMake
 86 | - Add subdirectory `vulkan_radix_sort`
 87 |     ```cmake
 88 |     add_subdirectory(path/to/vulkan_radix_sort)
 89 |     ```
 90 | 
 91 | - Link to `vk_radix_sort` in your project (library, binary)
 92 |     ```cmake
 93 |     target_link_libraries(my_project PRIVATE Vulkan::Vulkan VulkanMemoryAllocator vk_radix_sort)
 94 |     ```
 95 | 
 96 | ## Usage
 97 | 1. When creating `VkDevice`, enable `VkPhysicalDeviceBufferDeviceAddressFeatures`.
 98 | 
 99 | 1. When creating `VmaAllocator`, enable `VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT` flag.
100 | 
101 | 1. Create `VkBuffer` for keys and values, with `VK_BUFFER_USAGE_STORAGE_BUFFER_BIT` and `VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT`.
102 | 
103 | 1. Create `VrdxSorter`
104 | 
105 |     It creates shared resources: pipeline layouts, pipelines, etc.
106 | 
107 |     ```c++
108 |     VrdxSorter sorter = VK_NULL_HANDLE;
109 |     VrdxSorterCreateInfo sorterInfo = {};
110 |     sorterInfo.physicalDevice = physicalDevice;
111 |     sorterInfo.device = device;
112 |     sorterInfo.pipelineCache = pipelineCache;
113 |     vrdxCreateSorter(&sorterInfo, &sorter);
114 |     ```
115 | 
116 | 1. Create a temporary storage buffer for sort.
117 | 
118 |     ```c++
119 |     // query the storage buffer requirements
120 |     VrdxSorterStorageRequirements requirements;
121 |     // for key-only
122 |     vrdxGetSorterStorageRequirements(sorter, elementCount, &requirements);
123 |     // for key-value
124 |     vrdxGetSorterKeyValueStorageRequirements(sorter, elementCount, &requirements);
125 | 
126 |     // create or reuse buffer
127 |     VkBufferCreateInfo bufferInfo = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
128 |     bufferInfo.size = requirements.size;
129 |     bufferInfo.usage = requirements.usage;
130 |     // ...
131 |     ```
132 | 
133 | 1. Record sort commands.
134 | 
135 |     This command binds pipeline, pipeline layout, and push constants internally.
136 | 
137 |     So, users must not expect previously bound pipelines, layouts, or push constants to be retained after the sort command.
138 | 
139 |     Users must add proper **execution barriers**.
140 | 
141 |     One can use buffer memory barrier, but in general, global barriers are more efficient than per-resource, according to [official synchronization examples](https://github.com/KhronosGroup/Vulkan-Docs/wiki/Synchronization-Examples#three-dispatches-first-dispatch-writes-to-one-storage-buffer-second-dispatch-writes-to-a-different-storage-buffer-third-dispatch-reads-both):
142 | 
143 |     > ... global memory barrier covers all resources. Generally considered more efficient to do a global memory barrier than per-resource barriers, per-resource barriers should usually be used for queue ownership transfers and image layout transitions - otherwise use global barriers.
144 | 
145 |     The sort command will read from key/value buffers (and elementCount buffer for indirect sort) in compute shader stage, and write to output key/value buffers in later compute shader stage.
146 | 
147 |     The second synchronization scope **before** sort command must include `COMPUTE_SHADER` stage (and `TRANSFER` for indirect sort) and `SHADER_READ` access (and `TRANSFER_READ` for indirect sort).
148 | 
149 |     The first synchronization scope **after** sort command must include `COMPUTE_SHADER` stage and `SHADER_WRITE` access.
150 | 
151 |     ```c++
152 |     VkQueryPool queryPool;  // VK_NULL_HANDLE, or a valid timestamp query pool with size at least 15 (see vk_radix_sort.h).
153 | 
154 |     // sort keys
155 |     vrdxCmdSort(commandBuffer, sorter, elementCount,
156 |                 keysBuffer, 0,
157 |                 storageBuffer, 0,
158 |                 queryPool, 0);
159 | 
160 |     // sort keys with values
161 |     vrdxCmdSortKeyValue(commandBuffer, sorter, elementCount,
162 |                         keysBuffer, 0,
163 |                         valuesBuffer, 0,
164 |                         storageBuffer, 0,
165 |                         queryPool, 0);
166 | 
167 |     // indirectBuffer contains elementCount, a single uint entry in GPU buffer.
168 |     // maxElementCount is required for storage buffer offsets.
169 |     // element count in the indirect buffer must not be greater than maxElementCount. Otherwise, undefined behavior.
170 |     vrdxCmdSortKeyValueIndirect(commandBuffer, sorter, maxElementCount,
171 |                                 indirectBuffer, 0,
172 |                                 keysBuffer, 0,
173 |                                 valuesBuffer, 0,
174 |                                 storageBuffer, 0,
175 |                                 queryPool, 0);
176 |     ```
177 | 
178 | 
179 | ## TODO
180 | - [x] Use `VkPhysicalDeviceLimits` to get compute shader-related limits, such as `maxComputeWorkGroupSize` or `maxComputeSharedMemorySize`.
181 | - [x] Increase allowed `maxElementCount` by allocating buffers properly.
182 | - [x] Compare with CUB radix sort
183 | - [ ] Compare with VkRadixSort
184 | - [ ] Compare with Fuchsia radix sort
185 | - [ ] Find best `WORKGROUP_SIZE` and `PARTITION_DIVISION` for different devices.
186 | - [x] Support for SubgroupSize=64.
187 | 
188 | 
189 | ## References
190 | - https://github.com/b0nes164/GPUSorting : their CUDA kernel codes were very helpful when trying to catch the idea of how the algorithm works.
191 | 
192 | 
193 | ## Troubleshooting
194 | - (NVIDIA GPU, Windows) Slow runtime after a few seconds.
195 |   - Reason: the NVIDIA driver adjusts the GPU/memory clocks.
196 |     Open the Performance Overlay (Alt+R), and you will see the GPU/memory clocks drop.
197 |   - Solution: change performance mode in control panel.
198 |     ![](media/performance_mode.jpg)
199 | 


--------------------------------------------------------------------------------
/thirdparty/vulkan_radix_sort/include/vk_radix_sort.h:
--------------------------------------------------------------------------------
 1 | #ifndef VK_RADIX_SORT_H
 2 | #define VK_RADIX_SORT_H
 3 | 
 4 | #include <vulkan/vulkan.h>
 5 | 
 6 | struct VrdxSorter_T;  // opaque type behind the handle; only forward-declared in this header
 7 | 
 8 | /**
 9 |  * Opaque sorter handle. Per the README, creating it builds shared resources (pipeline layouts, pipelines, etc.).
10 |  */
11 | VK_DEFINE_HANDLE(VrdxSorter)
12 | 
13 | struct VrdxSorterCreateInfo {
14 |   VkPhysicalDevice physicalDevice;  // presumably the physical device `device` was created from; confirm in vk_radix_sort.cc
15 |   VkDevice device;  // per the README, must be created with buffer device address features enabled
16 |   VkPipelineCache pipelineCache;  // presumably used when the sorter creates its pipelines; confirm in vk_radix_sort.cc
17 | };
18 | 
19 | void vrdxCreateSorter(const VrdxSorterCreateInfo* pCreateInfo,
20 |                       VrdxSorter* pSorter);  // writes the new handle to *pSorter; no status is returned
21 | 
22 | void vrdxDestroySorter(VrdxSorter sorter);  // counterpart of vrdxCreateSorter
23 | 
24 | struct VrdxSorterStorageRequirements {
25 |   VkDeviceSize size;  // README usage: VkBufferCreateInfo::size of the temporary storage buffer
26 |   VkBufferUsageFlags usage;  // README usage: VkBufferCreateInfo::usage of the temporary storage buffer
27 | };
28 | 
29 | void vrdxGetSorterStorageRequirements(
30 |     VrdxSorter sorter, uint32_t maxElementCount,
31 |     VrdxSorterStorageRequirements* requirements);  // storage needed for a key-only sort
32 | 
33 | void vrdxGetSorterKeyValueStorageRequirements(
34 |     VrdxSorter sorter, uint32_t maxElementCount,
35 |     VrdxSorterStorageRequirements* requirements);  // storage needed for a key-value sort
36 | 
37 | /**
38 |  * if queryPool is not VK_NULL_HANDLE, it writes timestamps to N entries
39 |  * [query..query+N-1].
40 |  *
41 |  * N=15, where i is the pass index 0..3 (three timestamps per pass)
42 |  * query + 0: start timestamp (VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)
43 |  * query + 1: transfer timestamp (VK_PIPELINE_STAGE_TRANSFER_BIT)
44 |  * query + 2 + (3 * i) + 0: upsweep (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
45 |  * query + 2 + (3 * i) + 1: spine (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
46 |  * query + 2 + (3 * i) + 2: downsweep (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
47 |  * query + 14: sort end timestamp (VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)
48 |  */
49 | void vrdxCmdSort(VkCommandBuffer commandBuffer, VrdxSorter sorter,
50 |                  uint32_t elementCount, VkBuffer keysBuffer,
51 |                  VkDeviceSize keysOffset, VkBuffer storageBuffer,
52 |                  VkDeviceSize storageOffset, VkQueryPool queryPool,
53 |                  uint32_t query);  // key-only sort of elementCount keys
54 | 
55 | void vrdxCmdSortIndirect(VkCommandBuffer commandBuffer, VrdxSorter sorter,
56 |                          uint32_t maxElementCount, VkBuffer indirectBuffer,
57 |                          VkDeviceSize indirectOffset, VkBuffer keysBuffer,
58 |                          VkDeviceSize keysOffset, VkBuffer storageBuffer,
59 |                          VkDeviceSize storageOffset, VkQueryPool queryPool,
60 |                          uint32_t query);  // presumably the key-only form of vrdxCmdSortKeyValueIndirect; confirm in vk_radix_sort.cc
61 | 
62 | void vrdxCmdSortKeyValue(VkCommandBuffer commandBuffer, VrdxSorter sorter,
63 |                          uint32_t elementCount, VkBuffer keysBuffer,
64 |                          VkDeviceSize keysOffset, VkBuffer valuesBuffer,
65 |                          VkDeviceSize valuesOffset, VkBuffer storageBuffer,
66 |                          VkDeviceSize storageOffset, VkQueryPool queryPool,
67 |                          uint32_t query);  // sorts keys together with their values
68 | 
69 | /**
70 |  * indirectBuffer contains elementCount: the sort command reads a uint32_t
71 |  * value from indirectBuffer at indirectOffset. Per the README, that value
72 |  * must not be greater than maxElementCount; otherwise behavior is undefined.
73 |  *
74 |  * User must add a barrier before the sort whose second synchronization scope
75 |  * includes the COMPUTE_SHADER and TRANSFER stages with SHADER_READ and
76 |  * TRANSFER_READ access (the README lists TRANSFER for the indirect sort).
77 |  *
78 |  * indirectBuffer requires TRANSFER_SRC buffer usage flag.
79 |  */
80 | void vrdxCmdSortKeyValueIndirect(
81 |     VkCommandBuffer commandBuffer, VrdxSorter sorter, uint32_t maxElementCount,
82 |     VkBuffer indirectBuffer, VkDeviceSize indirectOffset, VkBuffer keysBuffer,
83 |     VkDeviceSize keysOffset, VkBuffer valuesBuffer, VkDeviceSize valuesOffset,
84 |     VkBuffer storageBuffer, VkDeviceSize storageOffset, VkQueryPool queryPool,
85 |     uint32_t query);
86 | 
87 | #endif  // VK_RADIX_SORT_H
88 | 


--------------------------------------------------------------------------------
/thirdparty/vulkan_radix_sort/src/shader/spine.comp:
--------------------------------------------------------------------------------
  1 | #version 460 core
  2 | 
  3 | #extension GL_EXT_buffer_reference : require
  4 | #extension GL_KHR_shader_subgroup_basic: enable
  5 | #extension GL_KHR_shader_subgroup_arithmetic: enable
  6 | #extension GL_KHR_shader_subgroup_ballot: enable
  7 | 
  8 | const int RADIX = 256;  // number of histogram bins; one workgroup is dispatched per bin
  9 | #define MAX_SUBGROUP_SIZE 128
 10 | #define WORKGROUP_SIZE 512
 11 | #define PARTITION_DIVISION 8
 12 | const int PARTITION_SIZE = PARTITION_DIVISION * WORKGROUP_SIZE;  // elements per partition, used to derive partitionCount
 13 | 
 14 | // dispatch this shader (RADIX, 1, 1), so that gl_WorkGroupID.x is radix
 15 | layout (local_size_x = WORKGROUP_SIZE) in;
 16 | 
 17 | layout (buffer_reference, std430) readonly buffer ElementCount {
 18 |   uint elementCount;
 19 | };
 20 | 
 21 | layout (buffer_reference, std430) buffer GlobalHistogram {
 22 |   uint globalHistogram[];  // (4, R)
 23 | };
 24 | 
 25 | layout (buffer_reference, std430) buffer PartitionHistogram {
 26 |   uint partitionHistogram[];  // (P, R)
 27 | };
 28 | 
 29 | layout (push_constant) uniform PushConstant {
 30 |   int pass;  // selects the RADIX-sized row of globalHistogram that main() scans
 31 |   restrict ElementCount elementCountReference;  // element count is read from this buffer, not pushed as a value
 32 |   restrict GlobalHistogram globalHistogramReference;  // scanned in place by workgroup 0
 33 |   restrict PartitionHistogram partitionHistogramReference;  // column `radix` is scanned in place by each workgroup
 34 | };
 35 | 
 36 | shared uint reduction;  // running total of the partitions already scanned for this radix
 37 | // we only need array length equal to subgroup size = 32 or 64,
 38 | // but 128 shouldn't affect performance.
 39 | shared uint intermediate[MAX_SUBGROUP_SIZE];  // per-subgroup totals, replaced by their exclusive scan
 40 | 
 41 | void main() {
 42 |   // Each workgroup scans column `radix` of partitionHistogram into exclusive prefix sums; workgroup 0 also scans globalHistogram for this pass.
 43 |   uint subgroupIndex = gl_SubgroupID;  // 0..15 or 0..7
 44 |   uint index = subgroupIndex * gl_SubgroupSize + gl_SubgroupInvocationID;  // invocation 0..31 or 0..63 within the subgroup
 45 |   uint radix = gl_WorkGroupID.x;
 46 | 
 47 |   uint elementCount = elementCountReference.elementCount;
 48 | 
 49 |   uint partitionCount = (elementCount + PARTITION_SIZE - 1) / PARTITION_SIZE;
 50 | 
 51 |   if (index == 0) {
 52 |     reduction = 0;
 53 |   }
 54 |   barrier();
 55 |   // process WORKGROUP_SIZE partitions per iteration; `reduction` carries the total of earlier iterations
 56 |   for (uint i = 0; WORKGROUP_SIZE * i < partitionCount; ++i) {
 57 |     uint partitionIndex = WORKGROUP_SIZE * i + index;
 58 |     uint value = partitionIndex < partitionCount ? partitionHistogramReference.partitionHistogram[RADIX * partitionIndex + radix] : 0;
 59 |     uint excl = subgroupExclusiveAdd(value) + reduction;
 60 |     uint sum = subgroupAdd(value);
 61 |     // one invocation per subgroup publishes the subgroup total
 62 |     if (subgroupElect()) {
 63 |       intermediate[subgroupIndex] = sum;
 64 |     }
 65 |     barrier();
 66 |     // the first gl_NumSubgroups invocations turn the subgroup totals into exclusive offsets
 67 |     if (index < gl_NumSubgroups) {
 68 |       uint subgroupBase = subgroupExclusiveAdd(intermediate[index]);
 69 |       uint chunkTotal = subgroupAdd(intermediate[index]);
 70 |       intermediate[index] = subgroupBase;
 71 | 
 72 |       if (index == 0) {
 73 |         reduction += chunkTotal;
 74 |       }
 75 |     }
 76 |     barrier();
 77 | 
 78 |     if (partitionIndex < partitionCount) {
 79 |       excl += intermediate[subgroupIndex];
 80 |       partitionHistogramReference.partitionHistogram[RADIX * partitionIndex + radix] = excl;
 81 |     }
 82 |     barrier();  // keeps the next iteration's writes to `intermediate` behind this iteration's reads
 83 |   }
 84 | 
 85 |   if (gl_WorkGroupID.x == 0) {
 86 |     // one workgroup is responsible for global histogram prefix sum
 87 |     // barrier() requires uniform control flow, so all invocations reach it and only index < RADIX touches memory
 88 |     bool inRange = index < RADIX;
 89 |     uint value = inRange ? globalHistogramReference.globalHistogram[RADIX * pass + index] : 0;
 90 |     uint excl = subgroupExclusiveAdd(value);
 91 |     uint sum = subgroupAdd(value);
 92 |     if (inRange && subgroupElect()) {
 93 |       intermediate[subgroupIndex] = sum;
 94 |     }
 95 |     barrier();
 96 | 
 97 |     if (index < RADIX / gl_SubgroupSize) {
 98 |       intermediate[index] = subgroupExclusiveAdd(intermediate[index]);
 99 |     }
100 |     barrier();
101 | 
102 |     if (inRange) {
103 |       excl += intermediate[subgroupIndex];
104 |       globalHistogramReference.globalHistogram[RADIX * pass + index] = excl;
105 |     }
106 |   }
107 | }
108 | 


--------------------------------------------------------------------------------
/thirdparty/vulkan_radix_sort/src/shader/upsweep.comp:
--------------------------------------------------------------------------------
 1 | #version 460 core
 2 | 
 3 | #extension GL_EXT_buffer_reference : require
 4 | #extension GL_KHR_shader_subgroup_basic: enable
 5 | 
 6 | const int RADIX = 256;  // histogram bins: main() extracts 8 bits of the key per pass
 7 | #define WORKGROUP_SIZE 512
 8 | #define PARTITION_DIVISION 8
 9 | const int PARTITION_SIZE = PARTITION_DIVISION * WORKGROUP_SIZE;  // keys handled by one workgroup
10 | 
11 | layout (local_size_x = WORKGROUP_SIZE) in;
12 | 
13 | layout (buffer_reference, std430) readonly buffer ElementCount {
14 |   uint elementCount;
15 | };
16 | 
17 | layout (buffer_reference, std430) buffer GlobalHistogram {
18 |   uint globalHistogram[];  // (4, R)
19 | };
20 | 
21 | layout (buffer_reference, std430) writeonly buffer PartitionHistogram {
22 |   uint partitionHistogram[];  // (P, R)
23 | };
24 | 
25 | layout (buffer_reference, std430) readonly buffer Keys {
26 |   uint keys[];  // (N)
27 | };
28 | 
29 | layout (push_constant) uniform PushConstant {
30 |   int pass;  // selects which 8-bit digit of the key is counted and which globalHistogram row is updated
31 |   restrict ElementCount elementCountReference;  // element count is read from this buffer, not pushed as a value
32 |   restrict GlobalHistogram globalHistogramReference;  // accumulated with atomicAdd by every workgroup
33 |   restrict PartitionHistogram partitionHistogramReference;  // one RADIX-sized row written per workgroup
34 |   restrict Keys keysInReference;
35 | };
36 | 
37 | shared uint localHistogram[RADIX];  // per-workgroup digit counts for the current partition
38 | 
39 | void main() {
40 |   // flat invocation index inside the workgroup, built from the subgroup ids
41 |   uint localIndex = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
42 |   uint totalCount = elementCountReference.elementCount;
43 | 
44 |   // every workgroup owns one partition of PARTITION_SIZE consecutive keys
45 |   uint partition = gl_WorkGroupID.x;
46 |   uint firstKey = partition * PARTITION_SIZE;
47 | 
48 |   // the whole workgroup leaves together when its partition starts past the end
49 |   if (firstKey >= totalCount) {
50 |     return;
51 |   }
52 | 
53 |   bool ownsBin = localIndex < RADIX;
54 |   if (ownsBin) {
55 |     localHistogram[localIndex] = 0;
56 |   }
57 |   barrier();
58 | 
59 |   // count the digit of each key; slots past the end are counted as key 0xffffffff
60 |   for (uint chunk = 0; chunk < PARTITION_DIVISION; ++chunk) {
61 |     uint keyIndex = firstKey + WORKGROUP_SIZE * chunk + localIndex;
62 |     uint key = 0xffffffff;
63 |     if (keyIndex < totalCount) {
64 |       key = keysInReference.keys[keyIndex];
65 |     }
66 |     atomicAdd(localHistogram[bitfieldExtract(key, 8 * pass, 8)], 1);
67 |   }
68 |   barrier();
69 | 
70 |   if (ownsBin) {
71 |     uint binCount = localHistogram[localIndex];
72 |     partitionHistogramReference.partitionHistogram[RADIX * partition + localIndex] = binCount;  // this partition's row
73 |     atomicAdd(globalHistogramReference.globalHistogram[RADIX * pass + localIndex], binCount);  // shared by all partitions
74 |   }
75 | }
76 | 


--------------------------------------------------------------------------------