├── .clang-format ├── .editorconfig ├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── CMakeLists.txt ├── CONTRIBUTING ├── LICENSE ├── README.md ├── docs ├── continuous_lod_clusters.png ├── lod_allocation.png ├── lod_rendering.png ├── lod_streaming.png └── otherscenes.jpg ├── shaders ├── blas_clusters_insert.comp.glsl ├── blas_setup_insertion.comp.glsl ├── build_setup.comp.glsl ├── culling.glsl ├── fullscreen.vert.glsl ├── fullscreen_write_depth.frag.glsl ├── hbao.h ├── hbao_blur.comp.glsl ├── hbao_blur.glsl ├── hbao_blur_apply.comp.glsl ├── hbao_calc.comp.glsl ├── hbao_deinterleave.comp.glsl ├── hbao_depthlinearize.comp.glsl ├── hbao_reinterleave.comp.glsl ├── hbao_viewnormal.comp.glsl ├── nvhiz-update.comp.glsl ├── octant_encoding.h ├── render_instance_bbox.frag.glsl ├── render_instance_bbox.mesh.glsl ├── render_raster.frag.glsl ├── render_raster_clusters.mesh.glsl ├── render_raytrace.rgen.glsl ├── render_raytrace.rmiss.glsl ├── render_raytrace_clusters.rchit.glsl ├── render_shading.glsl ├── shaderio.h ├── shaderio_building.h ├── shaderio_core.h ├── shaderio_scene.h ├── shaderio_streaming.h ├── stream_agefilter_groups.comp.glsl ├── stream_allocator_build_freegaps.comp.glsl ├── stream_allocator_freegaps_insert.comp.glsl ├── stream_allocator_load_groups.comp.glsl ├── stream_allocator_setup_insertion.comp.glsl ├── stream_allocator_unload_groups.comp.glsl ├── stream_compaction_new_clas.comp.glsl ├── stream_compaction_old_clas.comp.glsl ├── stream_setup.comp.glsl ├── stream_update_scene.comp.glsl ├── traversal_init.comp.glsl ├── traversal_presort.comp.glsl └── traversal_run.comp.glsl ├── src ├── cgltf.cpp ├── hbao_pass.cpp ├── hbao_pass.hpp ├── lodclusters.cpp ├── lodclusters.hpp ├── lodclusters_ui.cpp ├── main.cpp ├── nvhiz_vk.cpp ├── nvhiz_vk.hpp ├── renderer.cpp ├── renderer.hpp ├── renderer_raster_clusters_lod.cpp ├── renderer_raytrace_clusters_lod.cpp ├── resources.cpp ├── resources.hpp ├── scene.cpp ├── scene.hpp ├── scene_cache.cpp ├── scene_gltf.cpp ├── 
scene_preloaded.cpp ├── scene_preloaded.hpp ├── scene_streaming.cpp ├── scene_streaming.hpp ├── scene_streaming_utils.cpp ├── scene_streaming_utils.hpp ├── vk_nv_cluster_acc.cpp └── vk_nv_cluster_acc.h └── thirdparty └── vulkan_radix_sort ├── CMakeLists.txt ├── LICENSE ├── README.md ├── include └── vk_radix_sort.h └── src ├── generated ├── downsweep_comp.h ├── downsweep_key_value_comp.h ├── spine_comp.h └── upsweep_comp.h ├── shader ├── downsweep.comp ├── spine.comp └── upsweep.comp └── vk_radix_sort.cc /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | AccessModifierOffset: '-2' 3 | AlignAfterOpenBracket: Align 4 | AlignConsecutiveAssignments: 'true' 5 | AlignConsecutiveDeclarations: 'true' 6 | AlignOperands: 'true' 7 | AlignTrailingComments: 'true' 8 | AllowAllParametersOfDeclarationOnNextLine: 'false' 9 | AllowShortBlocksOnASingleLine: 'false' 10 | AllowShortCaseLabelsOnASingleLine: 'false' 11 | AllowShortFunctionsOnASingleLine: Inline 12 | AllowShortIfStatementsOnASingleLine: 'false' 13 | AllowShortLoopsOnASingleLine: 'false' 14 | AlwaysBreakAfterReturnType: None 15 | AlwaysBreakBeforeMultilineStrings: 'true' 16 | AlwaysBreakTemplateDeclarations: 'true' 17 | BinPackArguments: 'true' 18 | BinPackParameters: 'false' 19 | ExperimentalAutoDetectBinPacking: 'false' 20 | BreakBeforeBinaryOperators: NonAssignment 21 | BreakBeforeBraces: Custom 22 | BreakBeforeTernaryOperators: 'false' 23 | BreakConstructorInitializersBeforeComma: 'true' 24 | ColumnLimit: '120' 25 | ConstructorInitializerAllOnOneLineOrOnePerLine: 'false' 26 | Cpp11BracedListStyle: 'true' 27 | IndentCaseLabels: 'true' 28 | IndentWidth: '2' 29 | KeepEmptyLinesAtTheStartOfBlocks: 'true' 30 | Language: Cpp 31 | MaxEmptyLinesToKeep: '2' 32 | NamespaceIndentation: None 33 | ObjCSpaceBeforeProtocolList: 'true' 34 | PointerAlignment: Left 35 | SpaceAfterCStyleCast: 'false' 36 | SpaceBeforeAssignmentOperators: 'true' 37 | 
SpaceBeforeParens: Never 38 | SpaceInEmptyParentheses: 'false' 39 | SpacesBeforeTrailingComments: '2' 40 | SpacesInAngles: 'false' 41 | SpacesInCStyleCastParentheses: 'false' 42 | SpacesInParentheses: 'false' 43 | SpacesInSquareBrackets: 'false' 44 | Standard: Cpp11 45 | TabWidth: '2' 46 | UseTab: Never 47 | SortIncludes: 'false' 48 | ReflowComments: 'false' 49 | BraceWrapping: { 50 | AfterClass: 'true' 51 | AfterControlStatement: 'true' 52 | AfterEnum: 'true' 53 | AfterFunction: 'true' 54 | AfterNamespace: 'false' 55 | AfterStruct: 'true' 56 | AfterUnion: 'true' 57 | BeforeCatch: 'true' 58 | BeforeElse: 'true' 59 | IndentBraces: 'false' 60 | } 61 | PenaltyExcessCharacter: 1 62 | PenaltyBreakBeforeFirstCallParameter: 40 63 | PenaltyBreakFirstLessLess: 1 64 | PenaltyBreakComment: 30 65 | PenaltyBreakString: 30 66 | PenaltyReturnTypeOnItsOwnLine: 9999 67 | BreakStringLiterals: false -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # This is the top-most editor config file 2 | root = true 3 | 4 | # Default to 2 space indentation for C/C++ files 5 | [*.{c,cpp,h,hpp,inl}] 6 | indent_size = 2 7 | indent_style = space 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ############################# 2 | # generic 3 | ############################# 4 | 5 | *.bak 6 | 7 | ############################# 8 | # spirv/sass 9 | ############################# 10 | 11 | *.spv 12 | *.spva 13 | *.sass 14 | *.sassbin 15 | 16 | ############################# 17 | #specific to the project 18 | ############################# 19 | 20 | zbsgfxpack.lua 21 | cmake_built 22 | cmake_build 23 | build 24 | _install 25 | bin_x64 26 | external/downloaded_resources/ 27 | 
-------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/nv_cluster_lod_builder"] 2 | path = external/nv_cluster_lod_builder 3 | url = https://github.com/nvpro-samples/nv_cluster_lod_builder.git 4 | branch = main 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog for vk_lod_clusters 2 | * 2025-4-26: 3 | * Added "Disable back-face culling" to "Scene Complexity" UI. 4 | * 2025-4-25: 5 | * Added "Instance Sorting" option, which sorts instances by distance to the camera. `-instancesorting 0/1`. 6 | * Bugfix for glTF meshes with multiple primitives. 7 | * 2025-4-23: 8 | * Add `-processingthreadpct <float>` to control the percentage of threads doing the geometry processing (number of geometries in parallel), relative to the concurrency the system supports. Default is `0.5`. 9 | * Add `-processingonly 0/1` to reduce peak memory consumption during processing and saving the cache file. This always saves a cache file (unless the old one was valid) and terminates the application afterwards. 10 | * 2025-4-11: 11 | * Interleave geometry processing with loading to reduce peak memory consumption. 12 | * Add visualization of instance bounding boxes. 13 | * 2025-4-7: 14 | * Bugfix to file cache header detection. 15 | * The file cache can be used via memory mapping, avoiding a copy into system memory. `-mappedcache 0/1` defaults to true. 16 | * Use "octant" encoding for vertex normals according to [A Survey of Efficient Representations for Independent Unit Vectors](http://jcgt.org/published/0003/02/01/paper.pdf) 17 | * 2025-4-4: 18 | * The file cache format now stores everything geometry related for rendering. 
Instance and material information, as well as original vertex/triangle counts, still come from the glTF. The new file ending is `.nvsngeo`, the old `.nvcllod` files no longer work. 19 | * Added `-autoloadcache 0/1` option to disable loading from a cache file. 20 | * Some basic preparation to allow working from memory mapped cache files without loading into system memory. 21 | * 2025-2-7: 22 | * Added _"File > Save Cache"_ menu entry, as well as `-autosavecache 1` option. This allows storing the results of the LoD cluster mesh processing in a file next to the original model. 23 | This speeds up future load times of the model a lot. See the new notes in the **Model processing** section of the README. 24 | * Improved warnings and some memory statistics. 25 | * Streaming geometry memory is now guaranteed to stay within its limit. 26 | * 2025-1-30: Initial release -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.6...3.31) 2 | 3 | get_filename_component(PROJNAME ${CMAKE_CURRENT_SOURCE_DIR} NAME) 4 | Project(${PROJNAME}) 5 | Message(STATUS "-------------------------------") 6 | Message(STATUS "Processing Project ${PROJNAME}:") 7 | 8 | ##################################################################################### 9 | # look for nvpro_core 1) as a sub-folder 2) at some other locations 10 | # this cannot be put anywhere else since we still didn't find setup.cmake yet 11 | 12 | # which nvprocore tag or branch to download if repo not found 13 | set(NVPRO_GIT_TAG main) 14 | # Where to decompress nvprocore source code if repo not found 15 | set(NVPRO_TGT_SRC_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps) 16 | 17 | if(NOT BASE_DIRECTORY) 18 | find_path(BASE_DIRECTORY 19 | NAMES nvpro_core/cmake/setup.cmake 20 | PATHS ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/../.. 
${CMAKE_CURRENT_SOURCE_DIR}/external 21 | DOC "Directory containing nvpro_core" 22 | ) 23 | endif() 24 | if(EXISTS ${BASE_DIRECTORY}/nvpro_core/cmake/setup.cmake) 25 | set(OUTPUT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/bin_x64) 26 | include(${BASE_DIRECTORY}/nvpro_core/cmake/setup.cmake) 27 | else() 28 | # nvpro_core not found, will try to download. 29 | # first find where the current sample comes from 30 | execute_process( 31 | COMMAND git config --get remote.origin.url 32 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 33 | OUTPUT_VARIABLE GIT_REPO_URL OUTPUT_STRIP_TRAILING_WHITESPACE 34 | ) 35 | # Check if "github.com" is in URL 36 | string(FIND "${GIT_REPO_URL}" "github.com" FOUND_INDEX) 37 | if (FOUND_INDEX GREATER -1) 38 | # Use regex to extract everything up to and including "github.com" 39 | string(REGEX MATCH ".*github\\.com" GIT_BASE_URL "${GIT_REPO_URL}") 40 | # construct URL 41 | string(FIND "${GIT_REPO_URL}" "git@" SSH_FOUND_INDEX) 42 | if (SSH_FOUND_INDEX GREATER -1) # ssh 43 | set(NVPRO_GIT_URL ${GIT_BASE_URL}:nvpro-samples/nvpro_core.git) 44 | else() # https 45 | set(NVPRO_GIT_URL ${GIT_BASE_URL}/nvpro-samples/nvpro_core.git) 46 | endif() 47 | if("${NVPRO_GIT_TAG}" STREQUAL "main" ) 48 | set(NVPRO_GIT_TAG master) 49 | endif() 50 | message("Sample comes from github , nvprocore is at " ${NVPRO_GIT_URL} ) 51 | else () 52 | # reconstruct the path to nvpro_core, preserving the protocol 53 | string(REGEX MATCH "^[^/]+//[^/]+/" GIT_BASE_URL "${GIT_REPO_URL}") 54 | # construct URL 55 | set(NVPRO_GIT_URL ${GIT_BASE_URL}devtechproviz/nvpro-samples/nvpro_core.git) 56 | # message("Sample comes from prod server, nvprocore is at " ${NVPRO_GIT_URL}) 57 | endif() 58 | # clone the tag/branch we need into NVPRO_TGT_SRC_DIRECTORY, depth 1 so that we do not download the full history 59 | execute_process( 60 | COMMAND git clone --depth 1 --branch ${NVPRO_GIT_TAG} ${NVPRO_GIT_URL} ${NVPRO_TGT_SRC_DIRECTORY}/nvpro_core 61 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 62 | ) 63 | # do the 
search again with the downloaded version, use find to be sure everything runs ok 64 | find_path(BASE_DIRECTORY 65 | NAMES nvpro_core 66 | PATHS ${NVPRO_TGT_SRC_DIRECTORY} 67 | REQUIRED 68 | DOC "Directory containing nvpro_core" 69 | ) 70 | # invoke the setup 71 | if(EXISTS ${BASE_DIRECTORY}/nvpro_core/cmake/setup.cmake) 72 | set(OUTPUT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/bin_x64) 73 | include(${BASE_DIRECTORY}/nvpro_core/cmake/setup.cmake) 74 | else() 75 | message(FATAL_ERROR "could not find base directory or download nvpro_core, please set BASE_DIRECTORY to folder containing nvpro_core") 76 | endif() 77 | endif() 78 | set(NVPRO_CORE_DIR ${BASE_DIRECTORY}/nvpro_core) 79 | 80 | _add_project_definitions(${PROJNAME}) 81 | 82 | # Download the default scene 83 | download_files(FILENAMES bunny_v2.zip EXTRACT) 84 | 85 | ##################################################################################### 86 | # additions from packages needed for this sample 87 | # add refs in LIBRARIES_OPTIMIZED 88 | # add refs in LIBRARIES_DEBUG 89 | # add files in PACKAGE_SOURCE_FILES 90 | 91 | _add_package_VulkanSDK() 92 | _add_package_ShaderC() 93 | _add_package_IMGUI() 94 | 95 | #_add_package_NVML() 96 | 97 | ##################################################################################### 98 | # process the rest of some cmake code that needs to be done *after* the packages add 99 | _add_nvpro_core_lib() 100 | 101 | if(NOT TARGET nv_cluster_lod_builder) 102 | add_subdirectory(external/nv_cluster_lod_builder) 103 | endif() 104 | 105 | ##################################################################################### 106 | # Source files for this project 107 | # 108 | file(GLOB SOURCE_FILES src/*.*) 109 | file(GLOB SHADER_FILES shaders/*.glsl shaders/*.h) 110 | list(APPEND SHADER_FILES ${NVPRO_CORE_DIR}/nvvkhl/shaders/dh_sky.h) 111 | file(GLOB VK_RADIX_SORT_FILES thirdparty/vulkan_radix_sort/src/vk_radix_sort.cc) 112 | 113 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}) 114 | 
include_directories(${NVPRO_CORE_DIR}/nvvkhl/shaders) 115 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/vulkan_radix_sort/include) 116 | 117 | ##################################################################################### 118 | # Executable 119 | # 120 | 121 | if(WIN32 AND NOT GLUT_FOUND) 122 | add_definitions(/wd4996) #remove printf warning 123 | add_definitions(/wd4244) #remove double to float conversion warning 124 | add_definitions(/wd4305) #remove double to float truncation warning 125 | else() 126 | add_definitions(-fpermissive) 127 | endif() 128 | add_executable(${PROJNAME} ${SOURCE_FILES} ${COMMON_SOURCE_FILES} ${PACKAGE_SOURCE_FILES} ${SHADER_FILES} ${MESHOPT_FILES} ${VK_RADIX_SORT_FILES}) 129 | 130 | set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${PROJNAME}) 131 | 132 | target_compile_definitions(${PROJNAME} PRIVATE NVPRO_CORE_DIR="${NVPRO_CORE_DIR}") 133 | ##################################################################################### 134 | # common source code needed for this sample 135 | # 136 | source_group(common FILES 137 | ${COMMON_SOURCE_FILES} 138 | ${PACKAGE_SOURCE_FILES} 139 | ) 140 | source_group("Shader Files" FILES ${SHADER_FILES}) 141 | source_group("Source Files" FILES ${SOURCE_FILES}) 142 | source_group("meshoptimizer" FILES ${MESHOPT_FILES}) 143 | 144 | if(UNIX) 145 | set(UNIXLINKLIBS dl pthread) 146 | else() 147 | set(UNIXLINKLIBS) 148 | endif() 149 | 150 | ##################################################################################### 151 | # Linkage 152 | # 153 | 154 | target_link_libraries(${PROJNAME} ${PLATFORM_LIBRARIES} nvpro_core nv_cluster_lod_builder meshoptimizer) 155 | 156 | foreach(DEBUGLIB ${LIBRARIES_DEBUG}) 157 | target_link_libraries(${PROJNAME} debug ${DEBUGLIB}) 158 | endforeach(DEBUGLIB) 159 | 160 | foreach(RELEASELIB ${LIBRARIES_OPTIMIZED}) 161 | target_link_libraries(${PROJNAME} optimized ${RELEASELIB}) 162 | endforeach(RELEASELIB) 163 | 164 | 
##################################################################################### 165 | # copies binaries that need to be put next to the exe files (ZLib, etc.) 166 | # 167 | 168 | _finalize_target( ${PROJNAME} ) 169 | 170 | install(FILES ${SHADER_FILES} CONFIGURATIONS Release DESTINATION "bin_${ARCH}/GLSL_${PROJNAME}") 171 | install(FILES ${SHADER_FILES} CONFIGURATIONS Debug DESTINATION "bin_${ARCH}_debug/GLSL_${PROJNAME}") 172 | -------------------------------------------------------------------------------- /CONTRIBUTING: -------------------------------------------------------------------------------- 1 | https://developercertificate.org/ 2 | 3 | Developer Certificate of Origin 4 | Version 1.1 5 | 6 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 7 | 8 | Everyone is permitted to copy and distribute verbatim copies of this 9 | license document, but changing it is not allowed. 10 | 11 | 12 | Developer's Certificate of Origin 1.1 13 | 14 | By making a contribution to this project, I certify that: 15 | 16 | (a) The contribution was created in whole or in part by me and I 17 | have the right to submit it under the open source license 18 | indicated in the file; or 19 | 20 | (b) The contribution is based upon previous work that, to the best 21 | of my knowledge, is covered under an appropriate open source 22 | license and I have the right under that license to submit that 23 | work with modifications, whether created in whole or in part 24 | by me, under the same open source license (unless I am 25 | permitted to submit under a different license), as indicated 26 | in the file; or 27 | 28 | (c) The contribution was provided directly to me by some other 29 | person who certified (a), (b) or (c) and I have not modified 30 | it. 
31 | 32 | (d) I understand and agree that this project and the contribution 33 | are public and that a record of the contribution (including all 34 | personal information I submit with it, including my sign-off) is 35 | maintained indefinitely and may be redistributed consistent with 36 | this project or the open source license(s) involved. -------------------------------------------------------------------------------- /docs/continuous_lod_clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nvpro-samples/vk_lod_clusters/39f62a8d10ddd52c372dbd904316dda5d76e6d2b/docs/continuous_lod_clusters.png -------------------------------------------------------------------------------- /docs/lod_allocation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nvpro-samples/vk_lod_clusters/39f62a8d10ddd52c372dbd904316dda5d76e6d2b/docs/lod_allocation.png -------------------------------------------------------------------------------- /docs/lod_rendering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nvpro-samples/vk_lod_clusters/39f62a8d10ddd52c372dbd904316dda5d76e6d2b/docs/lod_rendering.png -------------------------------------------------------------------------------- /docs/lod_streaming.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nvpro-samples/vk_lod_clusters/39f62a8d10ddd52c372dbd904316dda5d76e6d2b/docs/lod_streaming.png -------------------------------------------------------------------------------- /docs/otherscenes.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nvpro-samples/vk_lod_clusters/39f62a8d10ddd52c372dbd904316dda5d76e6d2b/docs/otherscenes.jpg 
-------------------------------------------------------------------------------- /shaders/blas_clusters_insert.comp.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | 22 | Shader Description 23 | ================== 24 | 25 | This compute shader inserts the CLAS clusters that should be rendered 26 | into the cluster references list for each instance's BLAS. 
27 | 28 | A single thread represents one CLAS 29 | */ 30 | 31 | #version 460 32 | 33 | 34 | #extension GL_GOOGLE_include_directive : enable 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 38 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 39 | #extension GL_EXT_buffer_reference : enable 40 | #extension GL_EXT_buffer_reference2 : enable 41 | #extension GL_EXT_scalar_block_layout : enable 42 | #extension GL_EXT_shader_atomic_int64 : enable 43 | 44 | #extension GL_EXT_control_flow_attributes : require 45 | #extension GL_KHR_shader_subgroup_vote : require 46 | #extension GL_KHR_shader_subgroup_ballot : require 47 | #extension GL_KHR_shader_subgroup_shuffle : require 48 | #extension GL_KHR_shader_subgroup_basic : require 49 | #extension GL_KHR_shader_subgroup_clustered : require 50 | #extension GL_KHR_shader_subgroup_arithmetic : require 51 | 52 | #include "shaderio.h" 53 | 54 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer 55 | { 56 | FrameConstants view; 57 | }; 58 | 59 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer 60 | { 61 | Readback readback; 62 | }; 63 | 64 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer 65 | { 66 | RenderInstance instances[]; 67 | }; 68 | 69 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer 70 | { 71 | Geometry geometries[]; 72 | }; 73 | 74 | layout(binding = BINDINGS_HIZ_TEX) uniform sampler2D texHizFar; 75 | 76 | layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer 77 | { 78 | SceneBuilding build; 79 | }; 80 | 81 | layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) buffer buildBufferRW 82 | { 83 | SceneBuilding buildRW; 84 | }; 85 | 86 | #if USE_STREAMING 87 | 
layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer 88 | { 89 | SceneStreaming streaming; 90 | }; 91 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW 92 | { 93 | SceneStreaming streamingRW; 94 | }; 95 | #endif 96 | 97 | //////////////////////////////////////////// 98 | 99 | layout(local_size_x=BLAS_INSERT_CLUSTERS_WORKGROUP) in; 100 | 101 | //////////////////////////////////////////// 102 | // Each invocation handles one entry of build.renderClusterInfos and appends that cluster's CLAS address to the reference list of the instance it belongs to. 103 | void main() 104 | { 105 | uint renderClusterIndex = gl_GlobalInvocationID.x; 106 | // invocations at or beyond build.renderClusterCounter have no entry to process 107 | if (renderClusterIndex < build.renderClusterCounter) 108 | { 109 | ClusterInfo cluster = build.renderClusterInfos.d[renderClusterIndex]; 110 | uint instanceID = cluster.instanceID; 111 | uint clusterID = cluster.clusterID; 112 | #if USE_STREAMING 113 | uint64_t clusterAddress = streaming.resident.clasAddresses.d[clusterID]; // address comes from the streaming resident table, indexed by clusterID 114 | #else 115 | Geometry geometry = geometries[instances[instanceID].geometryID]; 116 | uint64_t clusterAddress = geometry.preloadedClusterClasAddresses.d[clusterID]; // address comes from the instance's geometry, indexed by clusterID 117 | #endif 118 | // atomicAdd returns the count before the increment, which is used as this cluster's slot in the instance's reference list 119 | uint idx = atomicAdd(build.blasBuildInfos.d[instanceID].clusterReferencesCount,1); 120 | uint64s_inout clusterReferences = uint64s_inout(build.blasBuildInfos.d[instanceID].clusterReferences); 121 | clusterReferences.d[idx] = clusterAddress; 122 | 123 | #if 1 124 | // for statistics 125 | #if USE_STREAMING 126 | uint numTriangles = Cluster_in(streaming.resident.clusters.d[clusterID]).d.triangleCountMinusOne + 1; 127 | #else 128 | uint numTriangles = geometry.preloadedClusters.d[clusterID].triangleCountMinusOne + 1; 129 | #endif 130 | atomicAdd(readback.numRenderedTriangles, numTriangles); // accumulate this cluster's triangle count into the readback buffer 131 | #endif 132 | } 133 | } -------------------------------------------------------------------------------- /shaders/blas_setup_insertion.comp.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | /* 20 | 21 | Shader Description 22 | ================== 23 | 24 | This compute shader sets up the per BLAS cluster references list start pointer. 25 | It does so by simply adding up the per-blas references counts that were filled during 26 | `traversal_run.comp.glsl`. 27 | These count values are also reset, so that the `blas_clusters_insert.comp.glsl` kernel 28 | can increment them again when filling the lists. 
29 | 30 | A single thread represents one BLAS 31 | */ 32 | 33 | #version 460 34 | 35 | #extension GL_GOOGLE_include_directive : enable 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 38 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 39 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 40 | #extension GL_EXT_buffer_reference : enable 41 | #extension GL_EXT_buffer_reference2 : enable 42 | #extension GL_EXT_scalar_block_layout : enable 43 | #extension GL_EXT_shader_atomic_int64 : enable 44 | 45 | #extension GL_EXT_control_flow_attributes : require 46 | #extension GL_KHR_shader_subgroup_vote : require 47 | #extension GL_KHR_shader_subgroup_ballot : require 48 | #extension GL_KHR_shader_subgroup_shuffle : require 49 | #extension GL_KHR_shader_subgroup_basic : require 50 | #extension GL_KHR_shader_subgroup_clustered : require 51 | #extension GL_KHR_shader_subgroup_arithmetic : require 52 | 53 | #include "shaderio.h" 54 | 55 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer 56 | { 57 | FrameConstants view; 58 | }; 59 | 60 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer 61 | { 62 | Readback readback; 63 | }; 64 | 65 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer 66 | { 67 | RenderInstance instances[]; 68 | }; 69 | 70 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer 71 | { 72 | Geometry geometries[]; 73 | }; 74 | 75 | layout(binding = BINDINGS_HIZ_TEX) uniform sampler2D texHizFar; 76 | 77 | layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer 78 | { 79 | SceneBuilding build; 80 | }; 81 | 82 | layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) buffer buildBufferRW 83 | { 84 | SceneBuilding buildRW; 85 | }; 86 | 87 | 
//////////////////////////////////////////// 88 | 89 | layout(local_size_x=BLAS_SETUP_INSERTION_WORKGROUP) in; 90 | 91 | //////////////////////////////////////////// 92 | 93 | /* One invocation per render instance: reserves that instance's range in the shared cluster address list, resets its reference count and accumulates the BLAS size statistic. */ 94 | void main() 95 | { 96 | uint instanceID = gl_GlobalInvocationID.x; 97 | 98 | if (instanceID < build.numRenderInstances) 99 | { 100 | uint referencesCount = build.blasBuildInfos.d[instanceID].clusterReferencesCount; 101 | uint referencesOffset = atomicAdd(buildRW.blasClasCounter, referencesCount); /* value before the add = first slot of this instance's range */ 102 | // reset count for insertion pass 103 | build.blasBuildInfos.d[instanceID].clusterReferencesCount = 0; 104 | build.blasBuildInfos.d[instanceID].clusterReferencesStride = 8; 105 | build.blasBuildInfos.d[instanceID].clusterReferences = uint64_t(buildRW.blasClusterAddresses) + uint64_t(referencesOffset) * uint64_t(8); /* widen before scaling: the 32-bit product wraps once the offset reaches 2^29 */ 106 | 107 | // sum up last frame's result for statistics 108 | atomicAdd(readback.blasActualSizes, uint64_t(build.blasBuildSizes.d[instanceID])); 109 | } 110 | } -------------------------------------------------------------------------------- /shaders/build_setup.comp.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | /* 20 | 21 | Shader Description 22 | ================== 23 | 24 | This compute shader does basic operations on a single thread. 25 | For example clamping atomic counters back to their limits or 26 | setting up indirect dispatches or draws etc. 27 | 28 | BUILD_SETUP_... are enums for the various operations 29 | 30 | */ 31 | 32 | #version 460 33 | 34 | #extension GL_GOOGLE_include_directive : enable 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 38 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 39 | #extension GL_EXT_buffer_reference : enable 40 | #extension GL_EXT_buffer_reference2 : enable 41 | #extension GL_EXT_scalar_block_layout : enable 42 | #extension GL_EXT_shader_atomic_int64 : enable 43 | 44 | #extension GL_EXT_control_flow_attributes : require 45 | #extension GL_KHR_shader_subgroup_ballot : require 46 | #extension GL_KHR_shader_subgroup_shuffle : require 47 | #extension GL_KHR_shader_subgroup_basic : require 48 | #extension GL_KHR_shader_subgroup_clustered : require 49 | #extension GL_KHR_shader_subgroup_arithmetic : require 50 | 51 | #include "shaderio.h" 52 | 53 | layout(push_constant) uniform pushData 54 | { 55 | uint setup; 56 | } push; 57 | 58 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer 59 | { 60 | FrameConstants view; 61 | FrameConstants viewLast; 62 | }; 63 | 64 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer 65 | { 66 | Readback readback; 67 | }; 68 | 69 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer 70 | { 71 | RenderInstance instances[]; 72 | }; 73 | 74 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer 75 | { 76 | Geometry geometries[]; 77 | }; 78 | 79 | 
layout(binding = BINDINGS_HIZ_TEX) uniform sampler2D texHizFar; 80 | 81 | layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer 82 | { 83 | SceneBuilding build; 84 | }; 85 | 86 | layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) coherent buffer buildBufferRW 87 | { 88 | SceneBuilding buildRW; 89 | }; 90 | 91 | //////////////////////////////////////////// 92 | 93 | layout(local_size_x=1) in; 94 | 95 | //////////////////////////////////////////// 96 | 97 | void main() 98 | { 99 | // special operations for setting up indirect dispatches 100 | // or clamping other operations to actual limits 101 | 102 | if (push.setup == BUILD_SETUP_TRAVERSAL_RUN) 103 | { 104 | // during traversal_init we might overshoot the traversalTaskCounter 105 | int traversalTaskCounter = min(buildRW.traversalTaskCounter, int(build.maxTraversalInfos)); 106 | buildRW.traversalTaskCounter = traversalTaskCounter; 107 | // also set up the initial writeCounter to be equal, so that new jobs are enqueued after it 108 | buildRW.traversalInfoWriteCounter = uint(traversalTaskCounter); 109 | } 110 | #if TARGETS_RASTERIZATION 111 | else if (push.setup == BUILD_SETUP_DRAW) 112 | { 113 | // during traversal_run we might overshoot renderClusterCounter 114 | uint renderClusterCounter = buildRW.renderClusterCounter; 115 | 116 | // clamp to build.maxRenderClusters and set up the indirect draw for the clusters actually rendered 117 | uint numRenderedClusters = min(renderClusterCounter, build.maxRenderClusters); 118 | 119 | buildRW.indirectDrawClusters.count = numRenderedClusters; 120 | buildRW.indirectDrawClusters.first = 0; 121 | 122 | // keep originals for statistics 123 | readback.numRenderedClusters = numRenderedClusters; 124 | readback.numRenderClusters = renderClusterCounter; 125 | readback.numTraversalInfos = buildRW.traversalInfoWriteCounter; 126 | } 127 | #endif 128 | #if TARGETS_RAY_TRACING 129 | else if (push.setup == BUILD_SETUP_BLAS_INSERTION) 130 | { 131 | // during traversal_run we might overshoot
renderClusterCounter 132 | uint renderClusterCounter = buildRW.renderClusterCounter; 133 | 134 | // clamp to build.maxRenderClusters, store the clamped count back and size the indirect dispatch (one workgroup per BLAS_INSERT_CLUSTERS_WORKGROUP clusters, rounded up) 135 | uint numRenderedClusters = min(renderClusterCounter, build.maxRenderClusters); 136 | 137 | buildRW.renderClusterCounter = numRenderedClusters; 138 | buildRW.indirectDispatchBlasInsertion.gridX = (numRenderedClusters + BLAS_INSERT_CLUSTERS_WORKGROUP-1) / BLAS_INSERT_CLUSTERS_WORKGROUP; 139 | buildRW.indirectDispatchBlasInsertion.gridY = 1; 140 | buildRW.indirectDispatchBlasInsertion.gridZ = 1; 141 | 142 | // keep originals for statistics 143 | readback.numRenderedClusters = numRenderedClusters; 144 | readback.numRenderClusters = renderClusterCounter; 145 | readback.numTraversalInfos = buildRW.traversalInfoWriteCounter; 146 | } 147 | #endif 148 | } -------------------------------------------------------------------------------- /shaders/culling.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | /* 20 | 21 | Utility code for frustum and occlusion culling of 22 | bounding boxes 23 | 24 | */ 25 | 26 | const float c_epsilon = 1.2e-07f; 27 | const float c_depthNudge = 2.0/float(1<<24); 28 | 29 | bool intersectSize(vec4 clipMin, vec4 clipMax) 30 | { 31 | vec2 rect = clipMax.xy - clipMin.xy; 32 | vec2 clipThreshold = vec2(2.0) / viewLast.viewportf.xy; 33 | return any(greaterThan(rect,clipThreshold)); 34 | } 35 | 36 | vec4 getClip(vec4 hPos, out bool valid) { 37 | valid = !(-c_epsilon < hPos.w && hPos.w < c_epsilon); 38 | return vec4(hPos.xyz / abs(hPos.w), hPos.w); 39 | } 40 | 41 | uint getCullBits(vec4 hPos) 42 | { 43 | uint cullBits = 0; 44 | cullBits |= hPos.x < -hPos.w ? 1 : 0; 45 | cullBits |= hPos.x > hPos.w ? 2 : 0; 46 | cullBits |= hPos.y < -hPos.w ? 4 : 0; 47 | cullBits |= hPos.y > hPos.w ? 8 : 0; 48 | cullBits |= hPos.z < 0 ? 16 : 0; 49 | cullBits |= hPos.z > hPos.w ? 32 : 0; 50 | cullBits |= hPos.w <= 0 ? 64 : 0; 51 | return cullBits; 52 | } 53 | 54 | vec4 getBoxCorner(vec3 bboxMin, vec3 bboxMax, int n) 55 | { 56 | bvec3 useMax = bvec3((n & 1) != 0, (n & 2) != 0, (n & 4) != 0); 57 | return vec4(mix(bboxMin, bboxMax, useMax),1); 58 | } 59 | 60 | bool intersectFrustum(vec3 bboxMin, vec3 bboxMax, mat4 worldTM, out vec4 oClipmin, out vec4 oClipmax, out bool oClipvalid) 61 | { 62 | mat4 worldViewProjTM = viewLast.viewProjMatrix * worldTM; 63 | bool valid; 64 | // clipspace bbox 65 | vec4 hPos = worldViewProjTM * getBoxCorner(bboxMin, bboxMax, 0); 66 | vec4 clip = getClip(hPos, valid); 67 | uint bits = getCullBits(hPos); 68 | vec4 clipMin = clip; 69 | vec4 clipMax = clip; 70 | bool clipValid = valid; 71 | 72 | [[unroll]] 73 | for (int n = 1; n < 8; n++){ 74 | hPos = worldViewProjTM * getBoxCorner(bboxMin, bboxMax, n); 75 | clip = getClip(hPos, valid); 76 | bits &= getCullBits(hPos); 77 | // TODO instead of loop unroll manually to do independent paired min/max to allow 78 | // instruction parallelism 79 
| clipMin = min(clipMin,clip); 80 | clipMax = max(clipMax,clip); 81 | 82 | clipValid = clipValid && valid; 83 | } 84 | 85 | oClipvalid = clipValid; 86 | oClipmin = vec4(clamp(clipMin.xy, vec2(-1), vec2(1)), clipMin.zw); 87 | oClipmax = vec4(clamp(clipMax.xy, vec2(-1), vec2(1)), clipMax.zw); 88 | 89 | //return true; 90 | return bits == 0; 91 | } 92 | /* Occlusion test of a clip-space rectangle (xy in [-1,1], z = depth) against texHizFar: picks the mip whose texel covers the rectangle's larger side, samples it at the rectangle center and returns true when clipMin.z is not behind the sampled depth (plus c_depthNudge). Uses the viewLast constants. */ 93 | bool intersectHiz(vec4 clipMin, vec4 clipMax) 94 | { 95 | clipMin.xy = clipMin.xy * 0.5 + 0.5; 96 | clipMax.xy = clipMax.xy * 0.5 + 0.5; 97 | 98 | clipMin.xy *= viewLast.hizSizeFactors.xy; 99 | clipMax.xy *= viewLast.hizSizeFactors.xy; 100 | 101 | clipMin.xy = min(clipMin.xy, viewLast.hizSizeFactors.zw); 102 | clipMax.xy = min(clipMax.xy, viewLast.hizSizeFactors.zw); 103 | 104 | vec2 size = (clipMax.xy - clipMin.xy); 105 | float maxsize = max(size.x, size.y) * viewLast.hizSizeMax; 106 | float miplevel = ceil(log2(max(maxsize, 1.0))); /* log2 is undefined for arguments <= 0 (zero-extent rectangle); sizes below one texel select mip 0 either way */ 107 | 108 | float depth = textureLod(texHizFar, ((clipMin.xy + clipMax.xy)*0.5),miplevel).r; 109 | bool result = clipMin.z <= depth + c_depthNudge; 110 | 111 | return result; 112 | } -------------------------------------------------------------------------------- /shaders/fullscreen.vert.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #version 460 21 | 22 | // Simplistic screen-covering triangle 23 | 24 | layout(location = 0) out vec2 uv; 25 | 26 | void main() 27 | { 28 | uv.x = (gl_VertexIndex == 2) ? 2.0 : 0.0; 29 | uv.y = (gl_VertexIndex == 1) ? 2.0 : 0.0; 30 | gl_Position = vec4(uv * vec2(2.0, -2.0) + vec2(-1.0, 1.0), 0.0, 1.0); 31 | uv.y = 1.0 - uv.y; 32 | } 33 | -------------------------------------------------------------------------------- /shaders/fullscreen_write_depth.frag.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | 22 | Shader Description 23 | ================== 24 | 25 | A fragment shader that writes the ray tracing depth into the 26 | framebuffers depth buffer. 
27 | 28 | */ 29 | 30 | #version 460 31 | #extension GL_GOOGLE_include_directive : enable 32 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 33 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 34 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 36 | #extension GL_EXT_buffer_reference : enable 37 | #extension GL_EXT_buffer_reference2 : enable 38 | #extension GL_EXT_scalar_block_layout : enable 39 | 40 | #include "shaderio.h" 41 | 42 | layout(set = 0, binding = BINDINGS_RAYTRACING_DEPTH, r32f) uniform image2D imgRaytracingDepth; 43 | 44 | void main() 45 | { 46 | ivec2 coord = ivec2(gl_FragCoord.xy); 47 | gl_FragDepth = imageLoad(imgRaytracingDepth, coord).x; 48 | } -------------------------------------------------------------------------------- /shaders/hbao.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #ifndef NVHBAO_H_ 21 | #define NVHBAO_H_ 22 | 23 | #define NVHBAO_RANDOMTEX_SIZE 4 24 | #define NVHBAO_NUM_DIRECTIONS 8 25 | 26 | #define NVHBAO_MAIN_UBO 0 27 | #define NVHBAO_MAIN_TEX_DEPTH 1 28 | #define NVHBAO_MAIN_TEX_LINDEPTH 2 29 | #define NVHBAO_MAIN_TEX_VIEWNORMAL 3 30 | #define NVHBAO_MAIN_TEX_DEPTHARRAY 4 31 | #define NVHBAO_MAIN_TEX_RESULTARRAY 5 32 | #define NVHBAO_MAIN_TEX_RESULT 6 33 | #define NVHBAO_MAIN_TEX_BLUR 7 34 | #define NVHBAO_MAIN_IMG_LINDEPTH 8 35 | #define NVHBAO_MAIN_IMG_VIEWNORMAL 9 36 | #define NVHBAO_MAIN_IMG_DEPTHARRAY 10 37 | #define NVHBAO_MAIN_IMG_RESULTARRAY 11 38 | #define NVHBAO_MAIN_IMG_RESULT 12 39 | #define NVHBAO_MAIN_IMG_BLUR 13 40 | #define NVHBAO_MAIN_IMG_OUT 14 41 | 42 | #ifndef NVHBAO_BLUR 43 | #define NVHBAO_BLUR 1 44 | #endif 45 | 46 | // 1 is slower 47 | #ifndef NVHBAO_SKIP_INTERPASS 48 | #define NVHBAO_SKIP_INTERPASS 0 49 | #endif 50 | 51 | #ifdef __cplusplus 52 | namespace glsl { 53 | using namespace glm; 54 | #endif 55 | 56 | struct NVHBAOData 57 | { 58 | float RadiusToScreen; // AO radius in pixels at unit view depth; hbao_calc divides it by view z (value is set on the host - confirm there) 59 | float R2; // presumably radius * radius (confirm against the host code that fills this struct) 60 | float NegInvR2; // presumably -1 / (radius * radius); hbao_calc uses it as the slope of its distance falloff 61 | float NDotVBias; 62 | 63 | vec2 InvFullResolution; 64 | vec2 InvQuarterResolution; 65 | 66 | ivec2 SourceResolutionScale; 67 | float AOMultiplier; 68 | float PowExponent; 69 | 70 | vec4 projReconstruct; 71 | vec4 projInfo; 72 | int projOrtho; 73 | int _pad0; 74 | ivec2 _pad1; 75 | 76 | ivec2 FullResolution; 77 | ivec2 QuarterResolution; 78 | 79 | mat4 InvProjMatrix; 80 | 81 | vec4 float2Offsets[NVHBAO_RANDOMTEX_SIZE * NVHBAO_RANDOMTEX_SIZE]; 82 | vec4 jitters[NVHBAO_RANDOMTEX_SIZE * NVHBAO_RANDOMTEX_SIZE]; 83 | }; 84 | 85 | // keep all these equal size 86 | struct NVHBAOMainPush 87 | { 88 | int layer; 89 | int _pad0; 90 | ivec2 _pad1; 91 | }; 92 | 93 | struct NVHBAOBlurPush 94 | { 95 | vec2 invResolutionDirection; 96 |
float sharpness; 97 | float _pad; 98 | }; 99 | 100 | #ifdef __cplusplus 101 | } 102 | #else 103 | 104 | layout(std140, binding = NVHBAO_MAIN_UBO) uniform controlBuffer 105 | { 106 | NVHBAOData control; 107 | }; 108 | 109 | #ifndef NVHABO_GFX 110 | /* NOTE(review): NVHABO_GFX looks like a misspelling of NVHBAO_GFX - confirm which spelling the build actually defines before relying on this guard. */ 111 | layout(local_size_x = 32, local_size_y = 2) in; 112 | /* Remaps the 32x2 workgroup onto an 8x8 pixel tile (tile origin = gl_WorkGroupID.xy * 8) and writes the pixel to coord. Returns true when that pixel lies outside res - the caller is expected to exit and texCoord is left unwritten; otherwise writes the texel-center UV to texCoord and returns false. */ 113 | bool setupCoord(inout ivec2 coord, inout vec2 texCoord, ivec2 res, vec2 invRes) 114 | { 115 | ivec2 base = ivec2(gl_WorkGroupID.xy) * 8; 116 | ivec2 subset = ivec2(int(gl_LocalInvocationID.x) & 1, int(gl_LocalInvocationID.x) / 2); 117 | subset += gl_LocalInvocationID.x >= 16 ? ivec2(2, -8) : ivec2(0, 0); 118 | subset += ivec2(gl_LocalInvocationID.y * 4, 0); 119 | 120 | coord = base + subset; 121 | 122 | if(coord.x >= res.x || coord.y >= res.y) 123 | return true; 124 | 125 | texCoord = (vec2(coord) + vec2(0.5)) * invRes; 126 | 127 | return false; 128 | } 129 | /* setupCoord against the full-resolution size stored in the control block. */ 130 | bool setupCoordFull(inout ivec2 coord, inout vec2 texCoord) 131 | { 132 | return setupCoord(coord, texCoord, control.FullResolution, control.InvFullResolution); 133 | } 134 | /* setupCoord against the quarter-resolution size stored in the control block. */ 135 | bool setupCoordQuarter(inout ivec2 coord, inout vec2 texCoord) 136 | { 137 | return setupCoord(coord, texCoord, control.QuarterResolution, control.InvQuarterResolution); 138 | } 139 | 140 | #endif 141 | 142 | #endif 143 | #endif 144 | -------------------------------------------------------------------------------- /shaders/hbao_blur.comp.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #version 460 21 | #extension GL_GOOGLE_include_directive : enable 22 | #extension GL_EXT_control_flow_attributes : require 23 | 24 | #include "hbao.h" 25 | 26 | layout(binding=NVHBAO_MAIN_IMG_BLUR, rg16f) uniform image2D imgBlur; 27 | layout(binding=NVHBAO_MAIN_TEX_RESULT) uniform sampler2D texSource; 28 | 29 | #include "hbao_blur.glsl" 30 | 31 | //------------------------------------------------------------------------- 32 | 33 | void main() 34 | { 35 | ivec2 intCoord; 36 | vec2 texCoord; 37 | 38 | if (setupCoordFull(intCoord, texCoord)) return; 39 | 40 | vec2 res = BlurRun(texCoord); 41 | imageStore(imgBlur, intCoord, vec4(res,0,0)); 42 | } 43 | -------------------------------------------------------------------------------- /shaders/hbao_blur.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | layout(push_constant) uniform pushData { 21 | NVHBAOBlurPush blur; 22 | }; 23 | 24 | 25 | const float KERNEL_RADIUS = 3; 26 | 27 | //------------------------------------------------------------------------- 28 | 29 | float BlurFunction(vec2 uv, float r, float center_c, float center_d, inout float w_total) 30 | { 31 | vec2 aoz = texture(texSource, uv).xy; 32 | float c = aoz.x; 33 | float d = aoz.y; 34 | 35 | const float BlurSigma = float(KERNEL_RADIUS) * 0.5; 36 | const float BlurFalloff = 1.0 / (2.0*BlurSigma*BlurSigma); 37 | 38 | float ddiff = (d - center_d) * blur.sharpness; 39 | float w = exp2(-r*r*BlurFalloff - ddiff*ddiff); 40 | w_total += w; 41 | 42 | return c*w; 43 | } 44 | 45 | vec2 BlurRun(vec2 texCoord) 46 | { 47 | vec2 aoz = texture(texSource, texCoord).xy; 48 | float center_c = aoz.x; 49 | float center_d = aoz.y; 50 | 51 | float c_total = center_c; 52 | float w_total = 1.0; 53 | 54 | [[unroll]] 55 | for (float r = 1; r <= KERNEL_RADIUS; ++r) 56 | { 57 | vec2 uv = texCoord + blur.invResolutionDirection * r; 58 | c_total += BlurFunction(uv, r, center_c, center_d, w_total); 59 | } 60 | 61 | [[unroll]] 62 | for (float r = 1; r <= KERNEL_RADIUS; ++r) 63 | { 64 | vec2 uv = texCoord - blur.invResolutionDirection * r; 65 | c_total += BlurFunction(uv, r, center_c, center_d, w_total); 66 | } 67 | 68 | return vec2(c_total/w_total, center_d); 69 | //return vec2(aoz); 70 | } 71 | -------------------------------------------------------------------------------- /shaders/hbao_blur_apply.comp.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #version 460 21 | #extension GL_GOOGLE_include_directive : enable 22 | #extension GL_EXT_control_flow_attributes : require 23 | #extension GL_EXT_shader_image_load_formatted : require 24 | 25 | #include "hbao.h" 26 | 27 | layout(binding=NVHBAO_MAIN_IMG_OUT) uniform image2D imgOut; 28 | layout(binding=NVHBAO_MAIN_TEX_BLUR) uniform sampler2D texSource; 29 | 30 | #include "hbao_blur.glsl" 31 | 32 | //------------------------------------------------------------------------- 33 | 34 | 35 | void main() 36 | { 37 | ivec2 intCoord; 38 | vec2 texCoord; 39 | 40 | if (setupCoordFull(intCoord, texCoord)) return; 41 | 42 | vec2 res = BlurRun(texCoord); 43 | vec4 color = imageLoad(imgOut, intCoord); 44 | imageStore(imgOut, intCoord, vec4( vec3(color.xyz * res.x), 1)); 45 | } 46 | -------------------------------------------------------------------------------- /shaders/hbao_calc.comp.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | Based on DeinterleavedTexturing sample by Louis Bavoil 22 | https://github.com/NVIDIAGameWorks/D3DSamples/tree/master/samples/DeinterleavedTexturing 23 | 24 | */ 25 | 26 | #version 460 27 | #extension GL_GOOGLE_include_directive : enable 28 | #extension GL_EXT_control_flow_attributes : require 29 | 30 | #include "hbao.h" 31 | 32 | layout(push_constant) uniform pushData { 33 | NVHBAOMainPush push; 34 | }; 35 | 36 | #define M_PI 3.14159265f 37 | 38 | // tweakables 39 | const float NUM_STEPS = 12; 40 | const float NUM_DIRECTIONS = NVHBAO_NUM_DIRECTIONS; // texRandom/g_Jitter initialization depends on this 41 | 42 | layout(binding=NVHBAO_MAIN_TEX_DEPTHARRAY) uniform sampler2DArray texLinearDepth; 43 | layout(binding=NVHBAO_MAIN_TEX_VIEWNORMAL) uniform sampler2D texViewNormal; 44 | 45 | 46 | #if NVHBAO_SKIP_INTERPASS 47 | #if NVHBAO_BLUR 48 | layout(binding=NVHBAO_MAIN_IMG_RESULT,rg16f) uniform image2D imgOutput; 49 | #else 50 | layout(binding=NVHBAO_MAIN_IMG_RESULT,r8) uniform image2D imgOutput; 51 | #endif 52 | void outputColor(ivec2 icoord, vec4 color) 53 | { 54 | icoord = icoord * 4 + ivec2(push.layer & 3, push.layer / 4); 55 | if (icoord.x < control.FullResolution.x && icoord.y < control.FullResolution.y){ 56 | imageStore(imgOutput, icoord, color); 57 | } 58 | } 59 | #else 60 | #if NVHBAO_BLUR 61 | layout(binding=NVHBAO_MAIN_IMG_RESULTARRAY,rg16f) uniform image2DArray imgOutput; 
62 | #else 63 | layout(binding=NVHBAO_MAIN_IMG_RESULTARRAY,r8) uniform image2DArray imgOutput; 64 | #endif 65 | void outputColor(ivec2 icoord, vec4 color) 66 | { 67 | imageStore(imgOutput, ivec3(icoord, push.layer), color); 68 | } 69 | #endif 70 | 71 | 72 | vec2 g_Float2Offset = control.float2Offsets[push.layer].xy; 73 | vec4 g_Jitter = control.jitters[push.layer]; 74 | 75 | vec3 getQuarterCoord(vec2 UV){ 76 | return vec3(UV,float(push.layer)); 77 | } 78 | 79 | 80 | //---------------------------------------------------------------------------------- 81 | 82 | vec3 UVToView(vec2 uv, float eye_z) 83 | { 84 | return vec3((uv * control.projInfo.xy + control.projInfo.zw) * (control.projOrtho != 0 ? 1. : eye_z), eye_z); 85 | } 86 | 87 | vec3 FetchQuarterResViewPos(vec2 UV) 88 | { 89 | float ViewDepth = textureLod(texLinearDepth,getQuarterCoord(UV),0).x; 90 | return UVToView(UV, ViewDepth); 91 | } 92 | 93 | //---------------------------------------------------------------------------------- 94 | float Falloff(float DistanceSquare) 95 | { 96 | // 1 scalar mad instruction 97 | return DistanceSquare * control.NegInvR2 + 1.0; 98 | } 99 | 100 | //---------------------------------------------------------------------------------- 101 | // P = view-space position at the kernel center 102 | // N = view-space normal at the kernel center 103 | // S = view-space position of the current sample 104 | //---------------------------------------------------------------------------------- 105 | float ComputeAO(vec3 P, vec3 N, vec3 S) 106 | { 107 | vec3 V = S - P; 108 | float VdotV = dot(V, V); 109 | float NdotV = dot(N, V) * 1.0/sqrt(VdotV); 110 | 111 | // Use saturate(x) instead of max(x,0.f) because that is faster on Kepler 112 | return clamp(NdotV - control.NDotVBias,0,1) * clamp(Falloff(VdotV),0,1); 113 | } 114 | 115 | //---------------------------------------------------------------------------------- 116 | vec2 RotateDirection(vec2 Dir, vec2 CosSin) 117 | { 118 | return 
vec2(Dir.x*CosSin.x - Dir.y*CosSin.y, 119 | Dir.x*CosSin.y + Dir.y*CosSin.x); 120 | } 121 | 122 | //---------------------------------------------------------------------------------- 123 | vec4 GetJitter() 124 | { 125 | // Get the current jitter vector from the per-pass constant buffer 126 | return g_Jitter; 127 | } 128 | 129 | //---------------------------------------------------------------------------------- 130 | float ComputeCoarseAO(vec2 FullResUV, float RadiusPixels, vec4 Rand, vec3 ViewPosition, vec3 ViewNormal) 131 | { 132 | RadiusPixels /= 4.0; 133 | 134 | // Divide by NUM_STEPS+1 so that the farthest samples are not fully attenuated 135 | float StepSizePixels = RadiusPixels / (NUM_STEPS + 1); 136 | 137 | const float Alpha = 2.0 * M_PI / NUM_DIRECTIONS; 138 | float AO = 0; 139 | 140 | [[unroll]] 141 | for (float DirectionIndex = 0; DirectionIndex < NUM_DIRECTIONS; ++DirectionIndex) 142 | { 143 | float Angle = Alpha * DirectionIndex; 144 | 145 | // Compute normalized 2D direction 146 | vec2 Direction = RotateDirection(vec2(cos(Angle), sin(Angle)), Rand.xy); 147 | 148 | // Jitter starting sample within the first step 149 | float RayPixels = (Rand.z * StepSizePixels + 1.0); 150 | 151 | for (float StepIndex = 0; StepIndex < NUM_STEPS; ++StepIndex) 152 | { 153 | vec2 SnappedUV = round(RayPixels * Direction) * control.InvQuarterResolution + FullResUV; 154 | vec3 S = FetchQuarterResViewPos(SnappedUV); 155 | 156 | RayPixels += StepSizePixels; 157 | 158 | AO += ComputeAO(ViewPosition, ViewNormal, S); 159 | } 160 | } 161 | 162 | AO *= control.AOMultiplier / (NUM_DIRECTIONS * NUM_STEPS); 163 | return clamp(1.0 - AO * 2.0,0,1); 164 | } 165 | 166 | //---------------------------------------------------------------------------------- 167 | void main() 168 | { 169 | ivec2 intCoord; 170 | vec2 texCoord; 171 | 172 | if (setupCoordQuarter(intCoord, texCoord)) return; 173 | 174 | vec2 base = vec2(intCoord.xy) * 4.0 + g_Float2Offset; 175 | vec2 uv = base * 
(control.InvQuarterResolution / 4.0); 176 | 177 | vec3 ViewPosition = FetchQuarterResViewPos(uv); 178 | vec4 NormalAndAO = texelFetch( texViewNormal, ivec2(base), 0); 179 | vec3 ViewNormal = -(NormalAndAO.xyz * 2.0 - 1.0); 180 | 181 | // Compute projection of disk of radius control.R into screen space 182 | float RadiusPixels = control.RadiusToScreen / (control.projOrtho != 0 ? 1.0 : ViewPosition.z); 183 | 184 | // Get jitter vector for the current full-res pixel 185 | vec4 Rand = GetJitter(); 186 | 187 | float AO = ComputeCoarseAO(uv, RadiusPixels, Rand, ViewPosition, ViewNormal); 188 | 189 | #if NVHBAO_BLUR 190 | outputColor(intCoord, vec4(pow(AO, control.PowExponent), ViewPosition.z, 0, 0)); 191 | #else 192 | outputColor(intCoord, vec4(pow(AO, control.PowExponent))); 193 | #endif 194 | 195 | } 196 | -------------------------------------------------------------------------------- /shaders/hbao_deinterleave.comp.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
*
 * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */

#version 460
#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_control_flow_attributes : require

#include "hbao.h"

layout(binding=NVHBAO_MAIN_TEX_LINDEPTH)  uniform sampler2D texLinearDepth;
layout(binding=NVHBAO_MAIN_IMG_DEPTHARRAY,r32f) uniform image2DArray imgDepthArray;

//----------------------------------------------------------------------------------

// Writes `value` into the red channel of layer `layer` at `intCoord`.
void outputColor(ivec2 intCoord, int layer, float value)
{
  imageStore(imgDepthArray, ivec3(intCoord,layer), vec4(value,0,0,0));
}

// Stores one gathered 2x2 footprint: .w and .z go to `firstLayer` and
// `firstLayer + 1`, .x and .y go to the two layers four slots further on.
void storeGathered(ivec2 intCoord, int firstLayer, vec4 gathered)
{
  outputColor(intCoord, firstLayer + 0, gathered.w);
  outputColor(intCoord, firstLayer + 1, gathered.z);
  outputColor(intCoord, firstLayer + 4, gathered.x);
  outputColor(intCoord, firstLayer + 5, gathered.y);
}

// Entry point: splits a 4x4 block of linear depth values into the 16 layers
// of imgDepthArray, one texel of every layer per invocation.
void main()
{
  ivec2 intCoord;
  vec2  texCoord;

  if (setupCoordQuarter(intCoord, texCoord)) return;

  // Gather position for the first 2x2 footprint of the 4x4 block
  vec2 gatherUV = (vec2(intCoord) * 4.0 + 0.5) * control.InvFullResolution;

  // Four gathers cover the block: texel offsets (0,0), (2,0), (0,2), (2,2)
  vec4 quad00 = textureGather      (texLinearDepth, gatherUV, 0);
  vec4 quad20 = textureGatherOffset(texLinearDepth, gatherUV, ivec2(2,0), 0);
  vec4 quad02 = textureGatherOffset(texLinearDepth, gatherUV, ivec2(0,2), 0);
  vec4 quad22 = textureGatherOffset(texLinearDepth, gatherUV, ivec2(2,2), 0);

  // layers 0..7
  storeGathered(intCoord, 0, quad00);
  storeGathered(intCoord, 2, quad20);

  // layers 8..15
  storeGathered(intCoord, 0 + 8, quad02);
  storeGathered(intCoord, 2 + 8, quad22);
}
--------------------------------------------------------------------------------
/shaders/hbao_depthlinearize.comp.glsl:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */

#version 460
#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_control_flow_attributes : require

#include "hbao.h"

layout(binding=NVHBAO_MAIN_TEX_DEPTH)  uniform sampler2D inputTexture;
layout(binding=NVHBAO_MAIN_IMG_LINDEPTH, r32f) uniform image2D imgLinearDepth;
#if NVHBAO_SKIP_INTERPASS
layout(binding=NVHBAO_MAIN_IMG_DEPTHARRAY, r32f) uniform image2DArray imgLinearDepthArray;
#endif


// Converts the depth-buffer value `d` to a view-space z value.
// The active path multiplies (0, 0, d, 1) by control.InvProjMatrix and
// divides z by w; `clipInfo` is only read by the disabled fallback.
float reconstructCSZ(float d, vec4 clipInfo) {
#if 1
    vec4 viewPos = control.InvProjMatrix * vec4(0,0,d,1);
    return viewPos.z / viewPos.w;
#else
    // clipInfo = z_n * z_f,  z_n - z_f,  z_f, perspective = 1 : 0
    bool isPerspective = clipInfo[3] != 0;
    return isPerspective ? (clipInfo[0] / (clipInfo[1] * d + clipInfo[2]))
                         : (clipInfo[1]+clipInfo[2] - d * clipInfo[1]);
#endif

}
/*
    if (in_perspective == 1.0) // perspective
    {
        ze = (zNear * zFar) / (zFar - zb * (zFar - zNear));
    }
    else // orthographic proj
    {
        ze  = zNear + zb  * (zFar - zNear);
    }
*/
// Entry point: linearizes one depth texel and stores it in imgLinearDepth.
// With NVHBAO_SKIP_INTERPASS the value is also written straight into its
// layer of the 4x4 deinterleaved depth array.
void main()
{
  ivec2 intCoord;
  vec2  texCoord;

  if (setupCoordFull(intCoord, texCoord)) return;

  float rawDepth  = textureLod(inputTexture, texCoord.xy, 0).x;
  vec4  linearOut = vec4(reconstructCSZ(rawDepth, control.projReconstruct),0,0,0);
  imageStore(imgLinearDepth, intCoord, linearOut);
#if NVHBAO_SKIP_INTERPASS
  // Position inside the 4x4 block selects the layer, the block index selects the texel
  ivec2 cell    = intCoord & 3;
  int   sliceId = cell.y * 4 + cell.x;
  imageStore(imgLinearDepthArray, ivec3(intCoord >> 2, sliceId), linearOut);
#endif
}
--------------------------------------------------------------------------------
/shaders/hbao_reinterleave.comp.glsl:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*
 * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */

#version 460
#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_control_flow_attributes : require

#include "hbao.h"

layout(binding=NVHBAO_MAIN_TEX_RESULTARRAY)  uniform sampler2DArray texResultsArray;
#if NVHBAO_BLUR
layout(binding=NVHBAO_MAIN_IMG_RESULT, rg16f) uniform image2D imgResult;
#else
layout(binding=NVHBAO_MAIN_IMG_RESULT, r8) uniform image2D imgResult;
#endif

//----------------------------------------------------------------------------------

// Entry point: copies one texel from the layered result array back to its
// full-resolution position in imgResult. With NVHBAO_BLUR two channels are
// copied, otherwise the first channel is replicated.
void main() {
  ivec2 intCoord;
  vec2  texCoord;

  if (setupCoordFull(intCoord, texCoord)) return;

  // Position inside the 4x4 block selects the layer, the block index selects the texel
  ivec2 cell     = intCoord & 3;
  ivec3 fetchPos = ivec3(intCoord >> 2, cell.y * 4 + cell.x);
  vec4  result   = texelFetch( texResultsArray, fetchPos, 0);

#if NVHBAO_BLUR
  imageStore(imgResult, intCoord, vec4(result.xy,0,0));
#else
  imageStore(imgResult, intCoord, vec4(result.x));
#endif
}
--------------------------------------------------------------------------------
/shaders/hbao_viewnormal.comp.glsl:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */

#version 460
#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_control_flow_attributes : require

#include "hbao.h"

layout(binding=NVHBAO_MAIN_TEX_LINDEPTH)  uniform sampler2D texLinearDepth;
layout(binding=NVHBAO_MAIN_IMG_VIEWNORMAL,rgba8) uniform image2D imgViewNormal;

//----------------------------------------------------------------------------------

// Builds a view-space position from a UV and a view-space depth. The xy part
// is scaled by eye_z unless control.projOrtho is set, in which case the scale is 1.
vec3 UVToView(vec2 uv, float eye_z)
{
  float xyScale = (control.projOrtho != 0) ? 1. : eye_z;
  vec2  viewXY  = (uv * control.projInfo.xy + control.projInfo.zw) * xyScale;
  return vec3(viewXY, eye_z);
}

// Samples linear depth at `UV` (mip 0) and returns the view-space position.
vec3 FetchViewPos(vec2 UV)
{
  return UVToView(UV, textureLod(texLinearDepth,UV,0).x);
}

// Returns the shorter of the two one-sided differences (Pr - P) and (P - Pl);
// on a tie the second one is returned.
vec3 MinDiff(vec3 P, vec3 Pr, vec3 Pl)
{
  vec3 forward  = Pr - P;
  vec3 backward = P - Pl;
  if (dot(forward,forward) < dot(backward,backward))
  {
    return forward;
  }
  return backward;
}

// Reconstructs a unit normal at position `P` (sampled at `UV`) from the
// positions of the four neighbours one full-resolution texel away.
vec3 ReconstructNormal(vec2 UV, vec3 P)
{
  vec3 right  = FetchViewPos(UV + vec2(control.InvFullResolution.x, 0));
  vec3 left   = FetchViewPos(UV + vec2(-control.InvFullResolution.x, 0));
  vec3 top    = FetchViewPos(UV + vec2(0, control.InvFullResolution.y));
  vec3 bottom = FetchViewPos(UV + vec2(0, -control.InvFullResolution.y));

  vec3 tangentX = MinDiff(P, right, left);
  vec3 tangentY = MinDiff(P, top, bottom);
  return normalize(cross(tangentX, tangentY));
}

//----------------------------------------------------------------------------------

// Entry point: stores the reconstructed normal, remapped from [-1,1] to [0,1],
// in imgViewNormal; the fourth channel is written as 0.
void main() {
  ivec2 intCoord;
  vec2  texCoord;

  if (setupCoordFull(intCoord, texCoord)) return;

  vec3 viewPos = FetchViewPos(texCoord);
  vec3 normal  = ReconstructNormal(texCoord, viewPos);

  imageStore(imgViewNormal, intCoord, vec4(normal*0.5 + 0.5,0));
}
--------------------------------------------------------------------------------
/shaders/nvhiz-update.comp.glsl:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*
 * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */

#version 460

// Every configuration macro below can be overridden by the including code;
// the values given here are only the fallbacks.

#ifndef NV_HIZ_MAX_LEVELS
#define NV_HIZ_MAX_LEVELS 16
#endif

#ifndef NV_HIZ_MSAA_SAMPLES
#define NV_HIZ_MSAA_SAMPLES 0
#endif

#ifndef NV_HIZ_IS_FIRST
#define NV_HIZ_IS_FIRST 1
#endif

#ifndef NV_HIZ_FORMAT
#define NV_HIZ_FORMAT r32f
#endif

#ifndef NV_HIZ_OUTPUT_NEAR
#define NV_HIZ_OUTPUT_NEAR 1
#endif

#ifndef NV_HIZ_LEVELS
#define NV_HIZ_LEVELS 3
#endif

#ifndef NV_HIZ_NEAR_LEVEL
#define NV_HIZ_NEAR_LEVEL 0
#endif

#ifndef NV_HIZ_FAR_LEVEL
#define NV_HIZ_FAR_LEVEL 0
#endif

#ifndef NV_HIZ_REVERSED_Z
#define NV_HIZ_REVERSED_Z 0
#endif

#ifndef NV_HIZ_USE_STEREO
#define NV_HIZ_USE_STEREO 0
#endif

// Subgroup shuffles are only needed when more than one level is written per dispatch
#if NV_HIZ_LEVELS > 1
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_shuffle : require
#endif

// With reversed Z the two reduction operators swap roles
#if NV_HIZ_REVERSED_Z
  #define minOp max
  #define maxOp min
#else
  #define minOp min
  #define maxOp max
#endif

layout(local_size_x=32,local_size_y=2) in;

layout(push_constant) uniform passUniforms {
  // keep in sync with nvhiz_vk.cpp
  ivec4 srcSize;      // .zw is used below as the upper clamp for source coordinates
  int   writeLod;     // index into imgLevels of the first level written
  int   startLod;     // mip level read from the source texture (non-MSAA path)
  int   layer;        // array layer, only used through IACCESS when NV_HIZ_USE_STEREO is set
  int   _pad0;
  bvec4 levelActive;  // .y / .z gate the second and third level written below
};

// Stereo uses array textures/images; IACCESS appends the layer only in that case
#if NV_HIZ_USE_STEREO
  #define samplerTypeMS sampler2DMSArray
  #define samplerType   sampler2DArray
  #define imageType     image2DArray
  #define IACCESS(v,l)  ivec3(v,l)
#else
  #define samplerTypeMS sampler2DMS
  #define samplerType   sampler2D
  #define imageType     image2D
  #define IACCESS(v,l)  v
#endif

#if NV_HIZ_IS_FIRST && NV_HIZ_MSAA_SAMPLES
  layout(binding=0) uniform samplerTypeMS texDepth;
#else
  layout(binding=0) uniform samplerType   texDepth;
#endif
layout(binding=1) uniform samplerType texNear;

layout(binding=2,NV_HIZ_FORMAT) uniform imageType imgNear;
layout(binding=3,NV_HIZ_FORMAT) uniform imageType imgLevels[NV_HIZ_MAX_LEVELS];

// Entry point: every invocation reduces a 2x2 footprint of the source to one
// zMax (and zMin) value and stores it at level `writeLod`. Up to two further
// levels are produced by exchanging the per-invocation results with
// subgroupShuffle.
void main()
{
  // Each workgroup covers an 8x8 tile of output texels
  ivec2 base = ivec2(gl_WorkGroupID.xy) * 8;
  // local x 0..15 -> columns 0..1, rows 0..7; local x 16..31 -> columns 2..3, rows 0..7
  ivec2 subset = ivec2(int(gl_LocalInvocationID.x) & 1, int(gl_LocalInvocationID.x) / 2);
  subset += gl_LocalInvocationID.x >= 16 ? ivec2(2,-8) : ivec2(0,0);
  // local y selects the left or the right 4-texel-wide half of the tile
  subset += ivec2(gl_LocalInvocationID.y * 4,0);

#if NV_HIZ_LEVELS > 1
  // NOTE(review): the shuffle offsets below only line up with the tile layout
  // above if subgroup lanes follow gl_LocalInvocationID.x (32 lanes) - confirm.
  uint laneID = gl_SubgroupInvocationID;
#endif

  //ivec2 outcoord = base + 7 - subset;
  ivec2 outcoord = base + subset;
  ivec2 coord = outcoord * 2;

  // Not referenced again in this shader
  float flayer = float(layer);

#if NV_HIZ_IS_FIRST && NV_HIZ_MSAA_SAMPLES
  // Start each reduction from the value that any sample will replace
  #if NV_HIZ_REVERSED_Z
  float zMin = 0;
  float zMax = 1;
  #else
  float zMin = 1;
  float zMax = 0;
  #endif
  // Reduce the 2x2 footprint over every MSAA sample
  for (int i = 0; i < NV_HIZ_MSAA_SAMPLES; i++){
    vec4 zRead = vec4(texelFetch(texDepth, IACCESS(min(coord + ivec2(0,0), srcSize.zw), layer), i).r,
                      texelFetch(texDepth, IACCESS(min(coord + ivec2(1,0), srcSize.zw), layer), i).r,
                      texelFetch(texDepth, IACCESS(min(coord + ivec2(0,1), srcSize.zw), layer), i).r,
                      texelFetch(texDepth, IACCESS(min(coord + ivec2(1,1), srcSize.zw), layer), i).r);
    zMin = minOp(zMin, minOp(minOp(minOp(zRead.x, zRead.y),zRead.z),zRead.w));
    zMax = maxOp(zMax, maxOp(maxOp(maxOp(zRead.x, zRead.y),zRead.z),zRead.w));
  }
#else
  // The first pass reads the depth texture, later passes read texNear
  #if NV_HIZ_IS_FIRST
    #define texRead texDepth
  #else
    #define texRead texNear
  #endif

  // NOTE(review): only the base coordinate is clamped here; the (1,0)/(0,1)/(1,1)
  // offsets are applied afterwards, unlike the MSAA path which clamps each
  // texel - confirm that srcSize.zw leaves room for the +1 offset.
  coord = min(coord, srcSize.zw);
  vec4 zRead = vec4(texelFetchOffset(texRead, IACCESS(coord, layer), startLod, ivec2(0,0)).r,
                    texelFetchOffset(texRead, IACCESS(coord, layer), startLod, ivec2(1,0)).r,
                    texelFetchOffset(texRead, IACCESS(coord, layer), startLod, ivec2(0,1)).r,
                    texelFetchOffset(texRead, IACCESS(coord, layer), startLod, ivec2(1,1)).r);

  float zMax = maxOp(maxOp(maxOp(zRead.x, zRead.y),zRead.z),zRead.w);
  float zMin = minOp(minOp(minOp(zRead.x, zRead.y),zRead.z),zRead.w);
#endif

  //zMax = float(gl_ThreadInWarpNV) / 32.0;
  // First level: one texel per invocation
#if !(NV_HIZ_IS_FIRST && NV_HIZ_FAR_LEVEL > 0)
  imageStore(imgLevels[writeLod + 0], IACCESS(outcoord,layer), vec4(zMax));
#endif

#if NV_HIZ_IS_FIRST && NV_HIZ_OUTPUT_NEAR && NV_HIZ_NEAR_LEVEL == 0
  imageStore(imgNear, IACCESS(outcoord,layer), vec4(zMin));
#endif

#if NV_HIZ_LEVELS > 1
  // Collect the values of this lane and the next three lanes; only lanes with
  // (laneID & 3) == 0 consume the result below.
  vec4 zRead0 = vec4( zMax,
                      subgroupShuffle(zMax, laneID + 1),
                      subgroupShuffle(zMax, laneID + 2),
                      subgroupShuffle(zMax, laneID + 3));


  #if NV_HIZ_IS_FIRST && NV_HIZ_OUTPUT_NEAR && NV_HIZ_NEAR_LEVEL >= 1
  vec4 zRead1 = vec4( zMin,
                      subgroupShuffle(zMin, laneID + 1),
                      subgroupShuffle(zMin, laneID + 2),
                      subgroupShuffle(zMin, laneID + 3));
  #endif

  // Second level: one lane out of every four writes
  if ((levelActive.y || levelActive.z) && (laneID & 3) == 0)
  {
    outcoord /= 2;
    zMax = maxOp(maxOp(maxOp(zRead0.x, zRead0.y),zRead0.z),zRead0.w);
    #if !(NV_HIZ_IS_FIRST && NV_HIZ_FAR_LEVEL > 1)
    imageStore(imgLevels[writeLod + 1], IACCESS(outcoord, layer), vec4(zMax));
    #endif
    #if NV_HIZ_IS_FIRST && NV_HIZ_OUTPUT_NEAR && NV_HIZ_NEAR_LEVEL >= 1
    zMin = minOp(minOp(minOp(zRead1.x, zRead1.y),zRead1.z),zRead1.w);
      #if NV_HIZ_NEAR_LEVEL == 1
    imageStore(imgNear, IACCESS(outcoord, layer), vec4(zMin));
      #endif
    #endif

  #if NV_HIZ_LEVELS > 2
    // Third level: combine four second-level results (lanes +0, +4, +16, +20);
    // only lanes 0 and 8 write.
    if (levelActive.z) {
      outcoord /= 2;
      zRead0 = vec4( zMax,
                     subgroupShuffle(zMax, laneID + 4),
                     subgroupShuffle(zMax, laneID + 16),
                     subgroupShuffle(zMax, laneID + 20));
      #if NV_HIZ_IS_FIRST && NV_HIZ_OUTPUT_NEAR && NV_HIZ_NEAR_LEVEL == 2
      zRead1 = vec4( zMin,
                     subgroupShuffle(zMin, laneID + 4),
                     subgroupShuffle(zMin, laneID + 16),
                     subgroupShuffle(zMin, laneID + 20));
      #endif
      if ((laneID == 0) || (laneID == 8)) {
        zMax = maxOp(maxOp(maxOp(zRead0.x, zRead0.y),zRead0.z),zRead0.w);
        imageStore(imgLevels[writeLod + 2], IACCESS(outcoord, layer), vec4(zMax));
        #if NV_HIZ_IS_FIRST && NV_HIZ_OUTPUT_NEAR && NV_HIZ_NEAR_LEVEL == 2
        zMin = minOp(minOp(minOp(zRead1.x, zRead1.y),zRead1.z),zRead1.w);
        imageStore(imgNear, IACCESS(outcoord, layer), vec4(zMin));
        #endif
      }
    }
  #endif
  }
#endif
}
--------------------------------------------------------------------------------
/shaders/octant_encoding.h:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #ifndef _OCTANT_ENCODING_H_ 21 | #define _OCTANT_ENCODING_H_ 22 | 23 | #ifdef __cplusplus 24 | namespace shaderio { 25 | #define OCT_INLINE inline 26 | #define OCT_FLOOR glm::floor 27 | #define OCT_CLAMP glm::clamp 28 | #define OCT_ABS glm::abs 29 | OCT_INLINE uint32_t pack_oct32(vec2 v) 30 | { 31 | union 32 | { 33 | int16_t snorm[2]; 34 | uint32_t packed; 35 | }; 36 | snorm[0] = static_cast(glm::clamp(int32_t(std::round(v.x * float(0x7FFF))), -0x7FFF, 0x7FFF)); 37 | snorm[1] = static_cast(glm::clamp(int32_t(std::round(v.y * float(0x7FFF))), -0x7FFF, 0x7FFF)); 38 | return packed; 39 | } 40 | OCT_INLINE vec2 unpack_oct32(uint32_t v) 41 | { 42 | union 43 | { 44 | int16_t snorm[2]; 45 | uint32_t packed; 46 | }; 47 | packed = v; 48 | return vec2(float(snorm[0]) / float(0x7FFF), float(snorm[1]) / float(0x7FFF)); 49 | } 50 | #else 51 | #define OCT_INLINE 52 | #define OCT_FLOOR floor 53 | #define OCT_CLAMP clamp 54 | #define OCT_ABS abs 55 | uint pack_oct32(vec2 v) 56 | { 57 | return packSnorm2x16(v); 58 | } 59 | vec2 unpack_oct32(uint v) 60 | { 61 | return unpackSnorm2x16(v); 62 | } 63 | #endif 64 | 65 | // oct functions from http://jcgt.org/published/0003/02/01/paper.pdf 66 | OCT_INLINE vec2 oct_signNotZero(vec2 v) 67 | { 68 | return vec2((v.x >= 0.0f) ? +1.0f : -1.0f, (v.y >= 0.0f) ? 
+1.0 : -1.0f); 69 | } 70 | OCT_INLINE vec3 oct_to_vec(vec2 e) 71 | { 72 | vec3 v = vec3(e.x, e.y, 1.0f - OCT_ABS(e.x) - OCT_ABS(e.y)); 73 | if(v.z < 0.0f) 74 | { 75 | vec2 os = oct_signNotZero(e); 76 | v.x = (1.0f - OCT_ABS(e.y)) * os.x; 77 | v.y = (1.0f - OCT_ABS(e.x)) * os.y; 78 | } 79 | return normalize(v); 80 | } 81 | 82 | OCT_INLINE vec3 oct32_to_vec(uint32_t v) 83 | { 84 | return oct_to_vec(unpack_oct32(v)); 85 | } 86 | 87 | OCT_INLINE vec2 vec_to_oct(vec3 v) 88 | { 89 | // Project the sphere onto the octahedron, and then onto the xy plane 90 | vec2 p = vec2(v.x, v.y) * (1.0f / (OCT_ABS(v.x) + OCT_ABS(v.y) + OCT_ABS(v.z))); 91 | // Reflect the folds of the lower hemisphere over the diagonals 92 | return (v.z <= 0.0f) ? (vec2(1.0f - OCT_ABS(p.y), 1.0f - OCT_ABS(p.x)) * oct_signNotZero(p)) : p; 93 | } 94 | 95 | OCT_INLINE vec2 vec_to_oct_precise(vec3 v, int bits) 96 | { 97 | vec2 s = vec_to_oct(v); // Remap to the square 98 | // Each snorm's max value interpreted as an integer, 99 | // e.g., 127.0 for snorm8 100 | float M = float(1 << ((bits / 2) - 1)) - 1.0f; 101 | // Remap components to snorm(n/2) precision...with floor instead 102 | // of round (see equation 1) 103 | s = OCT_FLOOR(OCT_CLAMP(s, -1.0f, +1.0f) * M) * (1.0f / M); 104 | vec2 bestRepresentation = s; 105 | float highestCosine = dot(oct_to_vec(s), v); 106 | // Test all combinations of floor and ceil and keep the best. 107 | // Note that at +/- 1, this will exit the square... but that 108 | // will be a worse encoding and never win. 109 | for(int i = 0; i <= 1; ++i) 110 | { 111 | for(int j = 0; j <= 1; ++j) 112 | { 113 | // This branch will be evaluated at compile time 114 | if((i != 0) || (j != 0)) 115 | { 116 | // Offset the bit pattern (which is stored in floating 117 | // point!) 
to effectively change the rounding mode 118 | // (when i or j is 0: floor, when it is one: ceiling) 119 | vec2 candidate = vec2(i, j) * (1 / M) + s; 120 | float cosine = dot(oct_to_vec(candidate), v); 121 | if(cosine > highestCosine) 122 | { 123 | bestRepresentation = candidate; 124 | highestCosine = cosine; 125 | } 126 | } 127 | } 128 | } 129 | return bestRepresentation; 130 | } 131 | 132 | OCT_INLINE uint vec_to_oct32(vec3 v) 133 | { 134 | return pack_oct32(vec_to_oct_precise(v, 32)); 135 | } 136 | 137 | #undef OCT_ABS 138 | #undef OCT_FLOOR 139 | #undef OCT_CLAMP 140 | #undef OCT_INLINE 141 | 142 | #ifdef __cplusplus 143 | } 144 | #endif 145 | 146 | #endif -------------------------------------------------------------------------------- /shaders/render_instance_bbox.frag.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0
 */
#version 460

#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_scalar_block_layout : enable

#include "shaderio.h"

///////////////////////////////////////////////////


layout(std140, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
{
  FrameConstants view;
};

///////////////////////////////////////////////////

#include "render_shading.glsl"

///////////////////////////////////////////////////

layout(location=0) in Interpolants
{
  flat uint instanceID;
} IN;

///////////////////////////////////////////////////

layout(location=0,index=0) out vec4 out_Color;

///////////////////////////////////////////////////

// Entry point: colors the fragment from a hash of its instance ID.
// The hashed color is remapped to [0.1, 1.0] and alpha is forced to 1.
void main()
{
  vec3 hashedRgb = unpackUnorm4x8(murmurHash(IN.instanceID)).xyz;
  out_Color      = vec4(hashedRgb * 0.9 + 0.1, 1.0);
}
--------------------------------------------------------------------------------
/shaders/render_instance_bbox.mesh.glsl:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 * SPDX-License-Identifier: Apache-2.0
 */

#version 460

#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_scalar_block_layout : enable

#extension GL_NV_mesh_shader : require
#extension GL_EXT_control_flow_attributes: require

#include "shaderio.h"

layout(push_constant) uniform pushData
{
  uint numRenderInstances;
}
push;

layout(std140, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
{
  FrameConstants view;
};

layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer
{
  RenderInstance instances[];
};

layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
{
  Geometry geometries[];
};

////////////////////////////////////////////

layout(location=0) out Interpolants {
  flat uint instanceID;
} OUT[];

////////////////////////////////////////////

#define MESH_WORKGROUP_SIZE  32

#define BOX_VERTICES      8
#define BOX_LINES         12
#define BOX_LINE_THREADS  4

layout(local_size_x=MESH_WORKGROUP_SIZE) in;
layout(max_vertices=BBOXES_PER_MESHLET * BOX_VERTICES, max_primitives=BBOXES_PER_MESHLET * BOX_LINES) out;
layout(lines) out;

////////////////////////////////////////////

// Writes the two vertex indices of line primitive `idx`.
void writePrimitiveLineIndices(uint idx, uvec2 vertexIndices)
{
  uint slot = idx * 2;
  gl_PrimitiveIndicesNV[slot + 0] = vertexIndices.x;
  gl_PrimitiveIndicesNV[slot + 1] = vertexIndices.y;
}

// Entry point: each workgroup emits the wireframe boxes (8 vertices, 12 lines
// each) of up to BBOXES_PER_MESHLET consecutive render instances.
void main()
{
  uint lane          = gl_LocalInvocationID.x;
  uint firstInstance = gl_WorkGroupID.x * BBOXES_PER_MESHLET;
  // The last workgroup may hold fewer boxes than BBOXES_PER_MESHLET
  uint numBoxes      = min(push.numRenderInstances, firstInstance + BBOXES_PER_MESHLET) - firstInstance;

  if (lane == 0)
  {
    gl_PrimitiveCountNV = numBoxes * BOX_LINES;
  }

  // Vertices: the workgroup walks all box corners in chunks of MESH_WORKGROUP_SIZE
  const uint vertexRuns = ((BBOXES_PER_MESHLET * BOX_VERTICES) + MESH_WORKGROUP_SIZE-1) / MESH_WORKGROUP_SIZE;

  [[unroll]]
  for (uint32_t run = 0; run < vertexRuns; run++)
  {
    uint vert   = lane + run * MESH_WORKGROUP_SIZE;
    uint box    = vert / BOX_VERTICES;
    uint corner = vert % BOX_VERTICES;

    // Clamp the load index so that lanes past the last box still read valid data
    uint boxLoad = min(box,numBoxes-1);

    RenderInstance instance = instances[boxLoad + firstInstance];
    BBox           bbox     = geometries[instance.geometryID].bbox;

    // Corner bits 0/1/2 select lo or hi along x/y/z
    bvec3 useHi     = bvec3((corner & 1) != 0, (corner & 2) != 0, (corner & 4) != 0);
    vec3  cornerPos = mix(bbox.lo, bbox.hi, useHi);

    if (box < numBoxes)
    {
      gl_MeshVerticesNV[vert].gl_Position = view.viewProjMatrix * (instance.worldMatrix * vec4(cornerPos,1));
      OUT[vert].instanceID                = firstInstance + box;
    }
  }

  // Lines: BOX_LINE_THREADS lanes per box, each lane writes three of its lines
  {
    uvec2 ringEdges[4] = uvec2[4](
      uvec2(0,1),uvec2(1,3),uvec2(3,2),uvec2(2,0)
    );

    uint edge = lane & (BOX_LINE_THREADS-1);
    uint box  = lane / BOX_LINE_THREADS;

    uvec2 ring = ringEdges[edge];

    if (box < numBoxes)
    {
      uint firstLine   = box * BOX_LINES + edge;
      uint firstVertex = box * BOX_VERTICES;

      // lower
      writePrimitiveLineIndices(firstLine + 0, ring + firstVertex);
      // upper
      writePrimitiveLineIndices(firstLine + 4, ring + 4 + firstVertex);
      // connectors
      writePrimitiveLineIndices(firstLine + 8, uvec2(edge, edge + 4) + firstVertex);
    }
  }
}
--------------------------------------------------------------------------------
/shaders/render_raster.frag.glsl:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
 */

#version 460

#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_scalar_block_layout : enable
#extension GL_EXT_shader_atomic_int64 : enable
#extension GL_EXT_fragment_shader_barycentric : enable

#include "shaderio.h"

// Not read in this shader; the instance ID used below comes from IN.instanceID.
layout(push_constant) uniform pushData
{
  uint instanceID;
}
push;

// Per-frame constants (visualization mode, mouse position, shading flags, ...)
layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
{
  FrameConstants view;
};

// Receives the picking results written at the end of main()
layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
{
  Readback readback;
};

layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer
{
  RenderInstance instances[];
};

layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
{
  Geometry geometries[];
};

#if USE_STREAMING
// The same SceneStreaming struct is bound twice: as a uniform block and as a storage block
layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer
{
  SceneStreaming streaming;
};
layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW
{
  SceneStreaming streamingRW;
};
#endif

///////////////////////////////////////////////////

#include "render_shading.glsl"

///////////////////////////////////////////////////

// Inputs from the previous stage; position/normal only exist when shading is compiled in
layout(location = 0) in Interpolants
{
#if ALLOW_SHADING
  vec3      wPos;
#if ALLOW_VERTEX_NORMALS
  vec3      wNormal;
#endif
#endif
  flat uint clusterID;
  flat uint instanceID;
}
IN;


///////////////////////////////////////////////////

layout(location = 0, index = 0) out vec4 out_Color;
layout(early_fragment_tests) in;

///////////////////////////////////////////////////


// Entry point: picks the shading normal, builds the visualization value for
// the active view.visualize mode, writes the fragment color, and records
// picking data for the fragment under the mouse cursor.
void main()
{
  // Only assigned (and only used) when ALLOW_SHADING is enabled
  vec3 wNormal;

#if ALLOW_SHADING
#if ALLOW_VERTEX_NORMALS
  if(view.facetShading != 0)
#endif
  {
    // Facet normal from the screen-space derivatives of the world position
    wNormal = -cross(dFdx(IN.wPos), dFdy(IN.wPos));
  }
#if ALLOW_VERTEX_NORMALS
  else
  {
    wNormal = IN.wNormal;
    // flipWinding == 1 always flips, == 2 flips back-facing fragments only
    if(view.flipWinding == 1 || (view.flipWinding == 2 && !gl_FrontFacing))
    {
      wNormal = -wNormal;
    }
  }
#endif
#endif

  // Default visualization value is the cluster ID
  uint visData = IN.clusterID;
  if (view.visualize == VISUALIZE_LOD || view.visualize == VISUALIZE_GROUP)
  {
    // Both modes need the cluster record, which is looked up differently
    // with and without streaming
  #if USE_STREAMING
    Cluster cluster = Cluster_in(streaming.resident.clusters.d[IN.clusterID]).d;
  #else
    Geometry geometry = geometries[instances[IN.instanceID].geometryID];
    Cluster cluster = geometry.preloadedClusters.d[IN.clusterID];
  #endif
    if (view.visualize == VISUALIZE_LOD)
    {
      // The scaled LOD level is passed on as the raw bits of a float
      visData = floatBitsToUint(float(cluster.lodLevel) * instances[IN.instanceID].maxLodLevelRcp);
    }
    else {
      visData = cluster.groupID;
    }
  }
  else if (view.visualize == VISUALIZE_TRIANGLE)
  {
    visData = IN.clusterID * 256 + uint(gl_PrimitiveID);
  }

  out_Color.w = 1.f;
#if ALLOW_SHADING && 1
  {
    const float overHeadLight = 1.0f;
    const float ambientLight  = 1.f;

    out_Color = shading(IN.instanceID, IN.wPos, wNormal, visData, overHeadLight, ambientLight);
  }
#else
  {
    out_Color = vec4(visualizeColor(visData), 1.0);
  }
#endif

#if DEBUG_VISUALIZATION
  // Wireframe overlay: either globally enabled or only for the filtered instance/cluster
  if(view.doWireframe != 0 || (view.visFilterInstanceID == IN.instanceID && view.visFilterClusterID == IN.clusterID))
  {
    out_Color.xyz = addWireframe(out_Color.xyz, gl_BaryCoordEXT, gl_FrontFacing, fwidthFine(gl_BaryCoordEXT), view.wireColor);
  }
#endif

  // Picking: only the fragment under the mouse cursor reports back
  uvec2 pixelCoord = uvec2(gl_FragCoord.xy);
  if(pixelCoord == view.mousePosition)
  {
    // Cluster ID in the upper bits, low 8 bits of the primitive ID below it
    uint32_t packedClusterTriangleId = (IN.clusterID << 8) | (gl_PrimitiveID & 0xFF);
    // NOTE(review): packPickingValue presumably combines the ID with depth so that
    // atomicMax keeps the closest fragment - confirm in its definition.
    atomicMax(readback.clusterTriangleId, packPickingValue(packedClusterTriangleId, gl_FragCoord.z));
    atomicMax(readback.instanceId, packPickingValue(IN.instanceID, gl_FragCoord.z));
  }
}
--------------------------------------------------------------------------------
/shaders/render_raster_clusters.mesh.glsl:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
 * SPDX-License-Identifier: Apache-2.0
 */
/*

  Shader Description
  ==================

  This mesh shader renders a single cluster.

*/


#version 460

#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_scalar_block_layout : enable

#extension GL_NV_mesh_shader : require
#extension GL_EXT_control_flow_attributes : require

#include "shaderio.h"
#include "octant_encoding.h"

layout(push_constant) uniform pushData
{
  uint instanceID;
}
push;

layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
{
  FrameConstants view;
};

// receives the rendered triangle statistics
layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
{
  Readback readback;
};

layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer
{
  RenderInstance instances[];
};

layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer
{
  Geometry geometries[];
};

// provides the list of clusters to render (`renderClusterInfos`)
layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer
{
  SceneBuilding build;
};

layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) buffer buildBufferRW
{
  SceneBuilding buildRW;
};

#if USE_STREAMING
layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer
{
  SceneStreaming streaming;
};
layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW
{
  SceneStreaming streamingRW;
};
#endif

////////////////////////////////////////////

// per-vertex outputs, consumed by the `Interpolants` input block of the fragment shader
layout(location = 0) out Interpolants
{
#if ALLOW_SHADING
  vec3 wPos;
#if ALLOW_VERTEX_NORMALS
  vec3 wNormal;
#endif
#endif
  flat uint clusterID;
  flat uint instanceID;
}
OUT[];

////////////////////////////////////////////

#ifndef MESHSHADER_WORKGROUP_SIZE
#define MESHSHADER_WORKGROUP_SIZE 32
#endif

layout(local_size_x = MESHSHADER_WORKGROUP_SIZE) in;
layout(max_vertices = CLUSTER_VERTEX_COUNT, max_primitives = CLUSTER_TRIANGLE_COUNT) out;
layout(triangles) out;

// number of loop iterations the workgroup needs to cover the maximum vertex / triangle count (rounded up)
const uint MESHLET_VERTEX_ITERATIONS   = ((CLUSTER_VERTEX_COUNT + MESHSHADER_WORKGROUP_SIZE - 1) / MESHSHADER_WORKGROUP_SIZE);
const uint MESHLET_TRIANGLE_ITERATIONS = ((CLUSTER_TRIANGLE_COUNT + MESHSHADER_WORKGROUP_SIZE - 1) / MESHSHADER_WORKGROUP_SIZE);

////////////////////////////////////////////

// Mesh shader entry point: each workgroup emits the vertices and triangles
// of the cluster stored at `build.renderClusterInfos.d[gl_WorkGroupID.x]`.
void main()
{
  ClusterInfo renderCluster = build.renderClusterInfos.d[gl_WorkGroupID.x];
  uint        instanceID    = renderCluster.instanceID;
  uint        clusterID     = renderCluster.clusterID;
  uint        lane          = gl_LocalInvocationID.x;

  RenderInstance instance = instances[instanceID];

#if USE_STREAMING
  Cluster cluster = Cluster_in(streaming.resident.clusters.d[clusterID]).d;
#else
  Cluster cluster = geometries[instance.geometryID].preloadedClusters.d[clusterID];
#endif

  // counts are stored minus one, so these are the last valid indices
  uint lastVertex    = cluster.vertexCountMinusOne;
  uint lastTriangle  = cluster.triangleCountMinusOne;
  uint triangleCount = lastTriangle + 1;

  if(lane == 0)
  {
    gl_PrimitiveCountNV = triangleCount;
    // just for stats
    atomicAdd(readback.numRenderedTriangles, triangleCount);
  }

  vec4s_in  vertices  = cluster.vertices;
  uint8s_in triangles = cluster.localTriangles;

  mat4 worldMatrix   = instance.worldMatrix;
  mat3 worldMatrixIT = transpose(inverse(mat3(worldMatrix)));

  [[unroll]] for(uint iter = 0; iter < uint(MESHLET_VERTEX_ITERATIONS); iter++)
  {
    uint vert = iter * MESHSHADER_WORKGROUP_SIZE + lane;

    // the load index is clamped, the store below is guarded
    vec4 oVertex = vertices.d[min(vert, lastVertex)];
    vec4 wPos    = worldMatrix * vec4(oVertex.xyz, 1.0f);

    if(vert <= lastVertex)
    {
      gl_MeshVerticesNV[vert].gl_Position = view.viewProjMatrix * wPos;
#if ALLOW_SHADING
      OUT[vert].wPos = wPos.xyz;
#if ALLOW_VERTEX_NORMALS
      // the normal is octant encoded in the bit pattern of the vertex' w component
      OUT[vert].wNormal = normalize(worldMatrixIT * oct32_to_vec(floatBitsToUint(oVertex.w)));
#endif
#endif
      OUT[vert].clusterID  = clusterID;
      OUT[vert].instanceID = instanceID;
    }
  }

  [[unroll]] for(uint iter = 0; iter < uint(MESHLET_TRIANGLE_ITERATIONS); iter++)
  {
    uint tri = iter * MESHSHADER_WORKGROUP_SIZE + lane;

    // three 8-bit cluster-local vertex indices per triangle; the load index is clamped
    uint  loadBase = min(tri, lastTriangle) * 3;
    uvec3 indices  = uvec3(triangles.d[loadBase + 0], triangles.d[loadBase + 1], triangles.d[loadBase + 2]);

    if(tri <= lastTriangle)
    {
      uint storeBase = tri * 3;

      gl_PrimitiveIndicesNV[storeBase + 0]    = indices.x;
      gl_PrimitiveIndicesNV[storeBase + 1]    = indices.y;
      gl_PrimitiveIndicesNV[storeBase + 2]    = indices.z;
      gl_MeshPrimitivesNV[tri].gl_PrimitiveID = int(tri);
    }
  }
}
--------------------------------------------------------------------------------
/shaders/render_raytrace.rgen.glsl:
--------------------------------------------------------------------------------
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#version 460

#extension GL_GOOGLE_include_directive : enable

#extension GL_EXT_ray_tracing : require

#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_EXT_scalar_block_layout : enable

#include "shaderio.h"

//////////////////////////////////////////////////////////////

layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
{
  FrameConstants view;
};

layout(set = 0, binding = BINDINGS_TLAS) uniform accelerationStructureEXT asScene;
layout(set = 0, binding = BINDINGS_RENDER_TARGET, rgba8) uniform image2D imgColor;

layout(set = 0, binding = BINDINGS_RAYTRACING_DEPTH, r32f) uniform image2D imgRaytracingDepth;

//////////////////////////////////////////////////////////////

layout(location = 0) rayPayloadEXT RayPayload rayHit;

//////////////////////////////////////////////////////////////

// Ray generation entry point: traces one primary ray through the center of
// each launch pixel and stores the resulting color and depth.
void main()
{
  // for writing debugging values to stats.debug etc.
  bool center = gl_LaunchIDEXT.xy == (gl_LaunchSizeEXT.xy / 2);

  ivec2 pixel      = ivec2(gl_LaunchIDEXT.xy);
  vec2  launchID   = vec2(gl_LaunchIDEXT.xy);
  vec2  launchSize = vec2(gl_LaunchSizeEXT.xy);

  // pixel center, remapped from [0,1] to [-1,1]
  vec2 ndc = ((launchID + vec2(0.5)) / launchSize) * 2.0 - 1.0;

  // the ray starts at the origin of the inverse view transform and runs through the pixel
  vec4 origin    = view.viewMatrixI * vec4(0, 0, 0, 1);
  vec4 target    = normalize(view.projMatrixI * vec4(ndc.x, ndc.y, 1, 1));
  vec4 direction = normalize(view.viewMatrixI * vec4(target.xyz, 0));

#if DEBUG_VISUALIZATION
  // directions through the neighboring pixels (x+1 and y+1) are handed over
  // inside the payload for ray differentials
  vec2 ndcNext     = ((launchID + vec2(1.5, 1.5)) / launchSize) * 2.0 - 1.0;
  vec4 targetNextX = normalize(view.projMatrixI * vec4(ndcNext.x, ndc.y, 1, 1));
  vec4 targetNextY = normalize(view.projMatrixI * vec4(ndc.x, ndcNext.y, 1, 1));

  rayHit.color.xyz         = normalize(view.viewMatrixI * vec4(targetNextX.xyz, 0)).xyz;
  rayHit.differentialY.xyz = normalize(view.viewMatrixI * vec4(targetNextY.xyz, 0)).xyz;
#endif

  // back-face culling is disabled only when flipWinding is 2
  uint rayFlags = view.flipWinding == 2 ? 0 : gl_RayFlagsCullBackFacingTrianglesEXT;

  traceRayEXT(asScene, rayFlags, 0xff,
              0, 0,  // hit offset, hit stride
              0,     // miss offset
              origin.xyz, view.nearPlane, direction.xyz, view.farPlane,
              0      // payload location, see the rayPayloadEXT declaration
  );

  // a w of zero (as written by the miss shader) is stored as depth 1.0
  float depth = rayHit.color.w == 0 ? 1.0 : rayHit.color.w;

  imageStore(imgColor, pixel, vec4(rayHit.color.xyz, 1));
  imageStore(imgRaytracingDepth, pixel, vec4(depth, 0.f, 0.f, 0.f));
}
--------------------------------------------------------------------------------
/shaders/render_raytrace.rmiss.glsl:
--------------------------------------------------------------------------------
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#version 460

#extension GL_GOOGLE_include_directive : enable

#extension GL_EXT_ray_tracing : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable

#include "shaderio.h"

//////////////////////////////////////////////////////////////

layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
{
  FrameConstants view;
};

//////////////////////////////////////////////////////////////

layout(location = RAYTRACING_PAYLOAD_INDEX) rayPayloadInEXT RayPayload rayHit;

//////////////////////////////////////////////////////////////

// Miss entry point: returns the sky color along the ray direction;
// the payload's w component is set to zero.
void main()
{
  rayHit.color = vec4(evalSimpleSky(view.skyParams, gl_WorldRayDirectionEXT), 0.f);
}

--------------------------------------------------------------------------------
/shaders/shaderio.h:
--------------------------------------------------------------------------------
/*
*
Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

// Central header shared between GLSL shaders and C++ (see the `__cplusplus` sections):
// pulls in the other shaderio headers and defines the frame-level constants and structures.

#ifndef _SHADERIO_H_
#define _SHADERIO_H_

#include "shaderio_core.h"
#include "shaderio_scene.h"
#include "shaderio_streaming.h"
#include "shaderio_building.h"
#include "dh_sky.h"

/////////////////////////////////////////

// feature switches, tested with `#if` in the shaders
#define ALLOW_SHADING 1
#define ALLOW_VERTEX_NORMALS 1

/////////////////////////////////////////

// values of `FrameConstants::visualize`
#define VISUALIZE_NONE 0
#define VISUALIZE_CLUSTER 1
#define VISUALIZE_GROUP 2
#define VISUALIZE_LOD 3
#define VISUALIZE_TRIANGLE 4

#define BBOXES_PER_MESHLET 8

/////////////////////////////////////////

// binding indices used in the shaders' `layout(binding = ...)` qualifiers
#define BINDINGS_FRAME_UBO 0
#define BINDINGS_READBACK_SSBO 1
#define BINDINGS_GEOMETRIES_SSBO 2
#define BINDINGS_RENDERINSTANCES_SSBO 3
#define BINDINGS_SCENEBUILDING_SSBO 4
#define BINDINGS_SCENEBUILDING_UBO 5
#define BINDINGS_HIZ_TEX 6
#define BINDINGS_STREAMING_UBO 7
#define BINDINGS_STREAMING_SSBO 8
#define BINDINGS_TLAS 9
#define BINDINGS_RENDER_TARGET 10
#define BINDINGS_RAYTRACING_DEPTH 11

/////////////////////////////////////////

// NOTE(review): presumably mode selectors for `build_setup.comp.glsl` - confirm against that shader
#define BUILD_SETUP_TRAVERSAL_RUN 1
#define BUILD_SETUP_DRAW 2
#define BUILD_SETUP_BLAS_INSERTION 3

/////////////////////////////////////////

// NOTE(review): presumably mode selectors for `stream_setup.comp.glsl` - confirm against that shader
#define STREAM_SETUP_COMPACTION_OLD_NO_UNLOADS 0
#define STREAM_SETUP_COMPACTION_STATUS 1
#define STREAM_SETUP_ALLOCATOR_FREEINSERT 2
#define STREAM_SETUP_ALLOCATOR_STATUS 3

/////////////////////////////////////////

// workgroup sizes; the names suggest one per compute shader of the matching file name
#define TRAVERSAL_PRESORT_WORKGROUP 128
#define TRAVERSAL_INIT_WORKGROUP 128
#define TRAVERSAL_RUN_WORKGROUP 64
#define BLAS_SETUP_INSERTION_WORKGROUP 128
#define BLAS_INSERT_CLUSTERS_WORKGROUP 128

// must be power of 2
#define STREAM_UPDATE_SCENE_WORKGROUP 64
#define STREAM_AGEFILTER_GROUPS_WORKGROUP 128
#define STREAM_COMPACTION_NEW_CLAS_WORKGROUP 128
#define STREAM_COMPACTION_OLD_CLAS_WORKGROUP 64
#define STREAM_ALLOCATOR_LOAD_GROUPS_WORKGROUP 64
#define STREAM_ALLOCATOR_UNLOAD_GROUPS_WORKGROUP 64
#define STREAM_ALLOCATOR_BUILD_FREEGAPS_WORKGROUP 64
#define STREAM_ALLOCATOR_FREEGAPS_INSERT_WORKGROUP 64
#define STREAM_ALLOCATOR_SETUP_INSERTION_WORKGROUP 64

/////////////////////////////////////////

// configuration defaults, each can be overridden by defining the macro before this header is included

#ifndef USE_CULLING
#define USE_CULLING 1
#endif

#ifndef USE_INSTANCE_SORTING
#define USE_INSTANCE_SORTING 1
#endif


#ifndef USE_STREAMING
#define USE_STREAMING 1
#endif

#ifndef MAX_VISIBLE_CLUSTERS
#define MAX_VISIBLE_CLUSTERS 1024
#endif

#ifndef TARGETS_RASTERIZATION
#define TARGETS_RASTERIZATION 1
#endif

// always the logical negation of TARGETS_RASTERIZATION
#define TARGETS_RAY_TRACING (!(TARGETS_RASTERIZATION))

/////////////////////////////////////////

#ifdef __cplusplus
namespace shaderio {
using namespace glm;
using namespace nvvkhl_shaders;
#endif

// Per-frame constants, bound as `view` (BINDINGS_FRAME_UBO) in the shaders.
// The member order defines the memory layout seen by both languages; do not reorder.
struct FrameConstants
{
  mat4 projMatrix;
  mat4 projMatrixI;

  mat4 viewProjMatrix;
  mat4 viewProjMatrixI;
  mat4 viewMatrix;
  mat4 viewMatrixI;
  vec4 viewPos;
  vec4 viewDir;
  vec4 viewPlane;

  ivec2 viewport;
  vec2  viewportf;

  vec2 viewPixelSize;
  vec2 viewClipSize;

  vec3  wLightPos;
  float lightMixer;

  vec3  wUpDir;
  float sceneSize;

  // the shaders test for the values 1 and 2
  uint  flipWinding;
  uint  tintTessellated;
  // one of VISUALIZE_*
  uint  visualize;
  float fov;

  // used as tMin / tMax of the primary rays
  float   nearPlane;
  float   farPlane;
  float   ambientOcclusionRadius;
  int32_t ambientOcclusionSamples;

  vec4 hizSizeFactors;
  vec4 nearSizeFactors;

  float hizSizeMax;
  // non-zero: fragment shader derives the normal from position derivatives
  int   facetShading;
  int   supersample;
  uint  colorXor;

  uint  dbgUint;
  float dbgFloat;
  uint  frame;
  uint  doShadow;

  vec4 bgColor;

  // pixel coordinate for which the fragment shader writes the picking results
  uvec2 mousePosition;
  float wireThickness;
  float wireSmoothing;

  vec3 wireColor;
  uint wireStipple;

  vec3  wireBackfaceColor;
  float wireStippleRepeats;

  float wireStippleLength;
  uint  doWireframe;
  // wireframe is also drawn for the cluster matching both filter ids
  uint  visFilterInstanceID;
  uint  visFilterClusterID;

  SimpleSkyParameters skyParams;
};

// Statistics, picking results and debug values written by the shaders.
struct Readback
{
  uint numRenderClusters;
  uint numTraversalInfos;
  uint numRenderedClusters;
  uint numRenderedTriangles;

  uint64_t blasActualSizes;

  // The shaders update the two picking values with 64-bit atomicMax,
  // C++ sees each of them as a pair of 32-bit words.
  // NOTE(review): the member names suggest the second word holds the packed depth - confirm
  // against `packPickingValue`.
#ifdef __cplusplus
  uint32_t clusterTriangleId;
  uint32_t _packedDepth0;

  uint32_t instanceId;
  uint32_t _packedDepth1;
#else
  uint64_t clusterTriangleId;
  uint64_t instanceId;
#endif

  uint64_t debugU64;

  int  debugI;
  uint debugUI;
  uint debugF;

  uint debugA[64];
  uint debugB[64];
  uint debugC[64];
};


struct RayPayload
{
  // Ray gen writes the direction through the pixel at x+1 for ray differentials.
  // Closest hit returns the shaded color there.
  vec4 color;
#if DEBUG_VISUALIZATION
  // Ray direction through the pixel at y+1 for ray differentials
  vec4 differentialY;
#endif
};

#ifdef __cplusplus
}
#endif
#endif  // _SHADERIO_H_

--------------------------------------------------------------------------------
/shaders/shaderio_building.h:
--------------------------------------------------------------------------------
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#include "shaderio_streaming.h"

#ifndef _SHADERIO_BUILDING_H_
#define _SHADERIO_BUILDING_H_

#ifdef __cplusplus
namespace shaderio {
using namespace glm;
#else

// GLSL only: bits used within the per-instance state values
#define INSTANCE_FRUSTUM_BIT 1
#define INSTANCE_VISIBLE_BIT 2

#endif

// The item descriptor used in the lod hierarchy traversal
// producer/consumer queue.
// It can encode a lod hierarchy node, or a cluster group of an instance.
// must fit in 64-bit
struct TraversalInfo
{
  uint32_t instanceID;
  uint32_t packedNode;
};
#ifndef __cplusplus
// Splits a packed 64-bit value into a TraversalInfo
// (x component of `unpack32` -> instanceID, y component -> packedNode).
TraversalInfo unpackTraversalInfo(uint64_t packed64) {
  u32vec2 data = unpack32(packed64);
  TraversalInfo info;
  info.instanceID = data.x;
  info.packedNode = data.y;
  return info;
}
// Inverse of `unpackTraversalInfo`.
uint64_t packTraversalInfo(TraversalInfo info)
{
  return pack64(u32vec2(info.instanceID,info.packedNode));
}
#endif

// A renderable cluster
// must fit in 64-bit, and can be overlaid with `TraversalInfo`
// therefore instanceID must come first.
struct ClusterInfo
{
  uint32_t instanceID;
  uint32_t clusterID;
};
BUFFER_REF_DECLARE_ARRAY(ClusterInfos_inout, ClusterInfo, , 8);

// Indirect build information to build a BLAS from an array of CLAS references
struct BlasBuildInfo
{
  // the number of CLAS that this BLAS references
  uint32_t clusterReferencesCount;
  // stride of array (typically 8 for 64-bit)
  uint32_t clusterReferencesStride;
  // start address of the array
  uint64_t clusterReferences;
};
BUFFER_REF_DECLARE_ARRAY(BlasBuildInfo_inout, BlasBuildInfo, , 16);

// Indirect build information for a TLAS instance
struct TlasInstance
{
  mat3x4   worldMatrix;
  // field names indicate two values sharing one 32-bit word (24 + 8 bits)
  uint32_t instanceCustomIndex24_mask8;
  uint32_t instanceShaderBindingTableRecordOffset24_flags8;
  uint64_t blasReference;
};
BUFFER_REF_DECLARE_ARRAY(TlasInstances_inout, TlasInstance, , 16);

// The central structure that contains relevant information to
// perform the runtime lod hierarchy traversal and building of
// all relevant clusters to be rendered in the current frame.
// (not optimally packed for cache efficiency but readability)
struct SceneBuilding
{
  mat4 traversalViewMatrix;

  uint  numRenderInstances;
  uint  maxRenderClusters;
  uint  maxTraversalInfos;
  float errorOverDistanceThreshold;

  uint renderClusterCounter;
  int  traversalTaskCounter;
  uint traversalInfoReadCounter;
  uint traversalInfoWriteCounter;

  // result of traversal init & scratch for traversal run
  BUFFER_REF(uint64s_coh_volatile) traversalNodeInfos;
  // result of traversal run
  BUFFER_REF(ClusterInfos_inout) renderClusterInfos;

  // rasterization related
  //////////////////////////////////////////////////

  DrawMeshTasksIndirectCommandNV indirectDrawClusters;

  // ray tracing related
  //////////////////////////////////////////////////

  DispatchIndirectCommand indirectDispatchBlasInsertion;

  uint blasClasCounter;

  // instance states store culling/visibility related information
  BUFFER_REF(uint32s_inout) instanceStates;

  BUFFER_REF(uint32s_inout) instanceSortValues;
  BUFFER_REF(uint32s_inout) instanceSortKeys;

  BUFFER_REF(TlasInstances_inout) tlasInstances;

  // per instance
  BUFFER_REF(BlasBuildInfo_inout) blasBuildInfos;
  BUFFER_REF(uint32s_inout) blasBuildSizes;
  // split into per-instance regions
  BUFFER_REF(uint64s_inout) blasClusterAddresses;
  uint64_t blasBuildData;
};



#ifdef __cplusplus
}
#endif
#endif  // _SHADERIO_BUILDING_H_
--------------------------------------------------------------------------------
/shaders/shaderio_core.h:
--------------------------------------------------------------------------------
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

// Core definitions that let the same struct declarations compile as C++ and as GLSL.

#ifndef _SHADERIO_CORE_H_
#define _SHADERIO_CORE_H_

#ifndef SUBGROUP_SIZE
#define SUBGROUP_SIZE 32
#endif

#ifdef __cplusplus
namespace shaderio {
using namespace glm;
// C++: a buffer reference is a plain 64-bit value
#define BUFFER_REF(refname) uint64_t

// Rounds `in` up to the next multiple of 32.
static uint32_t inline adjustClusterProperty(uint32_t in)
{
  return (in + 31) & ~31;
}

// C++: the declaration macros only verify that `alignment` equals the C++ alignment of `typ`
// or is a larger multiple of it; `keywords` is unused.
#define BUFFER_REF_DECLARE(refname, typ, keywords, alignment)                                                          \
  static_assert(alignof(typ) == alignment || (alignment > alignof(typ) && ((alignment % alignof(typ)) == 0)),          \
                "Alignment incompatible: " #refname)

#define BUFFER_REF_DECLARE_ARRAY(refname, typ, keywords, alignment)                                                    \
  static_assert(alignof(typ) == alignment || (alignment > alignof(typ) && ((alignment % alignof(typ)) == 0)),          \
                "Alignment incompatible: " #refname)

// C++: verifies that sizeof(typ) matches the size the GLSL side uses
#define BUFFER_REF_DECLARE_SIZE(sizename, typ, size) static_assert(sizeof(typ) == size_t(size), "GLSL vs C++ size mismatch: " #typ)

#else  // GLSL

#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_scalar_block_layout : enable
#extension GL_EXT_shader_atomic_int64 : enable

// Bit-field helpers. `cfg` is a macro expanding to `offset : width`; pasted after
// `true ?` the expression yields the offset, pasted after `false ?` it yields the width.
#define PACKED_GET(flag, cfg) (((flag) >> (true ? cfg)) & ((1 << (false ? cfg))-1))
#define PACKED_FLAG(cfg, val) ((val) << (true ? cfg))
#define PACKED_MASK(cfg) (((1 << (false ? cfg))-1) << (true ? cfg))

// GLSL: a buffer reference uses the declared reference type
#define BUFFER_REF(refname) refname

// GLSL: declares buffer reference type `refname` with a single member `d` of type `typ`
#define BUFFER_REF_DECLARE(refname, typ, keywords, alignment) \
  layout(buffer_reference, buffer_reference_align = alignment, scalar) keywords buffer refname \
  { \
    typ d; \
  };

// GLSL: same as above, but `d` is an unsized array of `typ`
#define BUFFER_REF_DECLARE_ARRAY(refname, typ, keywords, alignment) \
  layout(buffer_reference, buffer_reference_align = alignment, scalar) keywords buffer refname \
  { \
    typ d[]; \
  };

// GLSL: provides the size as constant `sizename`
#define BUFFER_REF_DECLARE_SIZE(sizename, typ, size) const uint32_t sizename = size

#endif

// common array references
// NOTE(review): `uvec2s_in` is the only `_in` array here declared without `readonly` - confirm this is intended
BUFFER_REF_DECLARE_ARRAY(uint8s_in, uint8_t, readonly, 1);
BUFFER_REF_DECLARE_ARRAY(uint16s_in, uint16_t, readonly, 2);
BUFFER_REF_DECLARE_ARRAY(uint16s_inout, uint16_t, , 2);
BUFFER_REF_DECLARE_ARRAY(uint32s_in, uint32_t, readonly, 4);
BUFFER_REF_DECLARE_ARRAY(uint32s_inout, uint32_t, , 4);
BUFFER_REF_DECLARE_ARRAY(int32s_inout, int32_t, , 4);
BUFFER_REF_DECLARE_ARRAY(uvec2s_in, uvec2, , 8);
BUFFER_REF_DECLARE_ARRAY(uvec2s_inout, uvec2, , 8);
BUFFER_REF_DECLARE_ARRAY(uint64s_in, uint64_t, readonly, 8);
BUFFER_REF_DECLARE_ARRAY(uint64s_inout, uint64_t, , 8);
BUFFER_REF_DECLARE_ARRAY(uint64s_coh, uint64_t, coherent, 8);
BUFFER_REF_DECLARE_ARRAY(uint64s_coh_volatile, uint64_t, coherent volatile, 8);
BUFFER_REF_DECLARE_ARRAY(vec3s_in, vec3, readonly, 4);
BUFFER_REF_DECLARE_ARRAY(vec4s_in, vec4, readonly, 16);

// three group counts of an indirect dispatch
struct DispatchIndirectCommand
{
  uint gridX;
  uint gridY;
  uint gridZ;
};

// task count and first task of an indirect mesh-task draw
struct DrawMeshTasksIndirectCommandNV
{
  uint count;
  uint first;
};

#ifdef __cplusplus
}
#endif
#endif  // _SHADERIO_CORE_H_
--------------------------------------------------------------------------------
/shaders/shaderio_scene.h:
--------------------------------------------------------------------------------
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#include "shaderio_core.h"

#ifndef _SHADERIO_SCENE_H_
#define _SHADERIO_SCENE_H_

#ifdef __cplusplus
namespace shaderio {
using namespace glm;
#else

// GLSL only: defaults for the cluster limits, used e.g. as the mesh shader's output limits
#ifndef CLUSTER_VERTEX_COUNT
#define CLUSTER_VERTEX_COUNT 32
#endif

#ifndef CLUSTER_TRIANGLE_COUNT
#define CLUSTER_TRIANGLE_COUNT 32
#endif

#endif

#define SHADERIO_ORIGINAL_MESH_GROUP 0xffffffffu

// bounding box given by its low and high corners
struct BBox
{
  vec3 lo;
  vec3 hi;
  // relevant to cluster's triangles
  float shortestEdge;
  float longestEdge;
};
BUFFER_REF_DECLARE_ARRAY(BBoxes_in, BBox, readonly, 16);

// A cluster contains a small number of triangles and vertices.
// It is always part of a group.
struct Cluster
{
  // counts are stored minus one so that they fit into 8 bits
  uint8_t triangleCountMinusOne;
  uint8_t vertexCountMinusOne;
  uint8_t lodLevel;
  uint8_t groupChildIndex;

  uint32_t groupID;

  // vec4 per vertex: the shaders use xyz as object-space position and decode
  // the normal from the bit pattern of w
  BUFFER_REF(vec4s_in) vertices;
  // three 8-bit cluster-local vertex indices per triangle
  BUFFER_REF(uint8s_in) localTriangles;

  uint64_t _pad;
};
BUFFER_REF_DECLARE(Cluster_in, Cluster, , 16);
BUFFER_REF_DECLARE_ARRAY(Clusters_inout, Cluster, , 16);
BUFFER_REF_DECLARE_SIZE(Cluster_size, Cluster, 32);

// A group contains multiple clusters that are the result of
// a common mesh decimation operation. Clusters within a group
// are watertight to each other. Groups are always streamed in
// completely, which simplifies the streaming management.

struct TraversalMetric
{
  // scalar by design, avoid hiccups with packing
  // order must match `nvclusterlod::Node`
  float boundingSphereX;
  float boundingSphereY;
  float boundingSphereZ;
  float boundingSphereRadius;
  float maxQuadricError;
};

struct Group
{
  uint32_t geometryID;
  uint32_t groupID;

  // streaming: global unique id given on load
  //            clusters array starts directly after group
  // preloaded: local id within geometry
  uint32_t residentID;
  uint32_t clusterResidentID;

  // when this group is first loaded, this is where the
  // temporary clas builds start.
  uint32_t streamingNewBuildOffset;

  uint16_t lodLevel;
  uint16_t clusterCount;

  TraversalMetric traversalMetric;

  BUFFER_REF(uint32s_in) clusterGeneratingGroups;
  BUFFER_REF(BBoxes_in) clusterBboxes;
};

BUFFER_REF_DECLARE(Group_in, Group, , 16);
BUFFER_REF_DECLARE_ARRAY(Groups_in, Group, , 16);
BUFFER_REF_DECLARE_SIZE(Group_size, Group, 64);

#ifdef __cplusplus
// must match `nvclusterlod::InteriorNode`
struct NodeRange
{
  uint32_t isNode : 1;
  uint32_t childOffset : 26;
  uint32_t childCountMinusOne : 5;
};

// must match `nvclusterlod::LeafNode`
struct GroupRange
{
  uint32_t isNode : 1;
  uint32_t groupIndex : 23;
  uint32_t groupClusterCountMinusOne : 8;
};
#endif

// must match `nvclusterlod::Node`
struct Node
{
#ifdef __cplusplus
  union
  {
    NodeRange  nodeRange;
    GroupRange groupRange;
  };
#else
  uint32_t packed;

// `offset : width` descriptions of the bit-fields within `packed`, for use with the PACKED_* macros.
// NOTE(review): the first bit is called `isNode` in the C++ bit-fields but `isGroup` here - confirm the polarity.
#define Node_packed_isGroup 0 : 1

#define Node_packed_nodeChildOffset 1 : 26
#define Node_packed_nodeChildCountMinusOne 27 : 5

#define Node_packed_groupIndex 1 : 23
#define Node_packed_groupClusterCountMinusOne 24 : 8

#endif
  // use scalar to avoid glsl alignment hiccups
  TraversalMetric traversalMetric;
};
BUFFER_REF_DECLARE_ARRAY(Nodes_in, Node, readonly, 8);

struct Geometry
{
  uint32_t clustersCount;
  uint32_t groupsCount;
  uint32_t nodesCount;
  uint32_t _pad;

  // object space geometry bbox
  BBox bbox;

  // lod hierarchy traversal
  BUFFER_REF(Nodes_in) nodes;
  BUFFER_REF(BBoxes_in) nodeBboxes;


  // streaming (null if preloaded)
  // provides memory address of a resident group.
  //
  // Note this 64-bit value uses a special encoding.
179 | // only addresses < STREAMING_INVALID_ADDRESS_BEGIN can be dereferenced. 180 | BUFFER_REF(uint64s_inout) streamingGroupAddresses; 181 | 182 | // preloaded (null if streaming) 183 | // clusters 184 | BUFFER_REF(Groups_in) preloadedGroups; 185 | BUFFER_REF(Clusters_inout) preloadedClusters; 186 | // for ray tracing 187 | BUFFER_REF(uint64s_in) preloadedClusterClasAddresses; 188 | BUFFER_REF(uint32s_in) preloadedClusterClasSizes; 189 | }; 190 | BUFFER_REF_DECLARE(Geometry_in, Geometry, readonly, 8); 191 | 192 | struct RenderInstance 193 | { 194 | mat4 worldMatrix; 195 | 196 | uint32_t geometryID; 197 | float maxLodLevelRcp; 198 | uint32_t _pad[2]; 199 | }; 200 | BUFFER_REF_DECLARE_ARRAY(RenderInstances_in, RenderInstance, readonly, 16); 201 | 202 | #ifdef __cplusplus 203 | // clusters are stored right next to group 204 | static_assert((sizeof(Group) % sizeof(Cluster)) == 0); 205 | } 206 | #endif 207 | 208 | #endif 209 | -------------------------------------------------------------------------------- /shaders/stream_agefilter_groups.comp.glsl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | 22 | Shader Description 23 | ================== 24 | 25 | This compute shader writes the streaming request for 26 | groups to be unloaded. We determine this based on an 27 | age since the group has been used last. 28 | 29 | A thread represents one resident group. 30 | */ 31 | 32 | #version 460 33 | 34 | #extension GL_GOOGLE_include_directive : enable 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 38 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 39 | #extension GL_EXT_buffer_reference : enable 40 | #extension GL_EXT_buffer_reference2 : enable 41 | #extension GL_EXT_scalar_block_layout : enable 42 | #extension GL_EXT_shader_atomic_int64 : enable 43 | 44 | #extension GL_EXT_control_flow_attributes : require 45 | #extension GL_KHR_shader_subgroup_vote : require 46 | #extension GL_KHR_shader_subgroup_ballot : require 47 | #extension GL_KHR_shader_subgroup_shuffle : require 48 | #extension GL_KHR_shader_subgroup_basic : require 49 | #extension GL_KHR_shader_subgroup_clustered : require 50 | #extension GL_KHR_shader_subgroup_arithmetic : require 51 | 52 | #include "shaderio.h" 53 | 54 | //////////////////////////////////////////// 55 | 56 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer 57 | { 58 | Readback readback; 59 | }; 60 | 61 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer 62 | { 63 | Geometry geometries[]; 64 | }; 65 | 66 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer 67 | { 68 | SceneStreaming streaming; 69 | }; 70 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW 71 | { 72 | SceneStreaming streamingRW; 73 | }; 74 | 75 | //////////////////////////////////////////// 76 | 77 
// Ages every active resident group by one step. Once a group's age exceeds
// `streaming.ageThreshold`, an unload request for it is appended to
// `streaming.request.unloadGeometryGroups`.
//
// One invocation handles one entry of `streaming.resident.activeGroups`.
void main()
{
  // The load happens before the range check. Per the original note the array
  // is guaranteed to be sized as a multiple of the workgroup size
  // (STREAM_AGEFILTER_GROUPS_WORKGROUP), so the read stays within the array.
  uint residentID = streaming.resident.activeGroups.d[gl_GlobalInvocationID.x];
  if (gl_GlobalInvocationID.x < streaming.resident.activeGroupsCount)
  {
#if STREAMING_DEBUG_ADDRESSES
    // the group address must be dereferenceable, otherwise report the
    // offending resident id and bail out
    if (uint64_t(streaming.resident.groups.d[residentID].group) >= STREAMING_INVALID_ADDRESS_START)
    {
      streamingRW.request.errorAgeFilter = residentID;
      return;
    }
#endif

    // increase the age of a resident group
    int age = ++streaming.resident.groups.d[residentID].age;

    // detect if we are over the age limit and request the group to be unloaded
    if (age > streaming.ageThreshold)
    {
      // the counter keeps growing past the limit, only the store is guarded
      uint unloadOffset = atomicAdd(streamingRW.request.unloadCounter, 1);

      // Strictly less than: with `maxUnloads` request slots the valid indices are
      // 0 .. maxUnloads-1. The previous `<=` allowed one store past the last slot.
      // NOTE(review): assumes `unloadGeometryGroups` holds `maxUnloads` entries -
      // confirm against the host-side allocation.
      if (unloadOffset < streaming.request.maxUnloads)
      {
        Group_in groupRef = streaming.resident.groups.d[residentID].group;
        streaming.request.unloadGeometryGroups.d[unloadOffset] = uvec2(groupRef.d.geometryID, groupRef.d.groupID);
      }
    }
  }
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | 22 | Shader Description 23 | ================== 24 | 25 | Note: The sample showcases two ways to manage CLAS memory on the device. 26 | One using a persistent allocator system (`stream_allocator...` files), 27 | and one using a simple compaction scheme (`stream_compaction...` files). 28 | This file is part of the allocator system. 29 | 30 | This compute shader bins the free gaps based on their size. 31 | It enables the allocator to provide empty gaps of certain sizes during 32 | the allocation process within `stream_allocator_load_groups.comp.glsl`. 
33 | 34 | We read `streaming.clasAllocator.freeGapsPos` and `streaming.clasAllocator.freeGapsSize` 35 | and bin into `streaming.clasAllocator.freeGapsPosBinned` using the appropriate 36 | `streaming.clasAllocator.freeSizeRanges.d[freeGapSize-1].offset` 37 | 38 | One thread operates on one free gap 39 | */ 40 | 41 | #version 460 42 | 43 | #extension GL_GOOGLE_include_directive : enable 44 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 45 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 46 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 47 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 48 | #extension GL_EXT_buffer_reference : enable 49 | #extension GL_EXT_buffer_reference2 : enable 50 | #extension GL_EXT_scalar_block_layout : enable 51 | #extension GL_EXT_shader_atomic_int64 : enable 52 | 53 | #extension GL_EXT_control_flow_attributes : require 54 | #extension GL_KHR_shader_subgroup_vote : require 55 | #extension GL_KHR_shader_subgroup_ballot : require 56 | #extension GL_KHR_shader_subgroup_shuffle : require 57 | #extension GL_KHR_shader_subgroup_basic : require 58 | #extension GL_KHR_shader_subgroup_clustered : require 59 | #extension GL_KHR_shader_subgroup_arithmetic : require 60 | #extension GL_EXT_shader_subgroup_extended_types_int64 : require 61 | 62 | #include "shaderio.h" 63 | 64 | //////////////////////////////////////////// 65 | 66 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer 67 | { 68 | Readback readback; 69 | }; 70 | 71 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer 72 | { 73 | Geometry geometries[]; 74 | }; 75 | 76 | layout(scalar, binding = BINDINGS_STREAMING_UBO, set = 0) uniform streamingBuffer 77 | { 78 | SceneStreaming streaming; 79 | }; 80 | layout(scalar, binding = BINDINGS_STREAMING_SSBO, set = 0) buffer streamingBufferRW 81 | { 82 | SceneStreaming streamingRW; 83 | }; 84 | 85 | 
// Sorts one free gap into the size-binned position list.
//
// Each invocation takes one gap from `freeGapsPos` / `freeGapsSize`, reserves
// a slot within the range that belongs to the gap's size and stores the gap
// position into `freeGapsPosBinned` at that slot.
void main()
{
  uint gapIndex = gl_GlobalInvocationID.x;

  // one thread per free gap, surplus threads have nothing to do
  if (gapIndex >= streaming.clasAllocator.freeGapsCounter)
  {
    return;
  }

  // the gap itself was produced by `stream_allocator_build_freegaps.comp.glsl`
  uint gapPos  = streaming.clasAllocator.freeGapsPos.d[gapIndex];
  uint gapSize = streaming.clasAllocator.freeGapsSize.d[gapIndex];

  // ranges are indexed by size minus one
  uint sizeBin = gapSize - 1;

  // reserve the next free slot inside this size's range ...
  int32_t slotInBin = atomicAdd(streaming.clasAllocator.freeSizeRanges.d[sizeBin].count, 1);
  // ... and look up where the range begins within the binned array
  uint binStart = streaming.clasAllocator.freeSizeRanges.d[sizeBin].offset;

  uint binnedIndex = slotInBin + uint(binStart);
  streaming.clasAllocator.freeGapsPosBinned.d[binnedIndex] = gapPos;
}
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | 22 | Shader Description 23 | ================== 24 | 25 | Note: The sample showcases two ways to manage CLAS memory on the device. 26 | One using a persistent allocator system (`stream_allocator...` files), 27 | and one using a simple compaction scheme (`stream_compaction...` files). 28 | This file is part of the allocator system. 29 | 30 | This compute shader prepares the ranges of free gaps 31 | based on their size. It is required to handle the size-based 32 | binning within `stream_allocator_freegaps_insert.comp.glsl`. 33 | 34 | One thread represents one free gap size 35 | 36 | */ 37 | 38 | #version 460 39 | 40 | #extension GL_GOOGLE_include_directive : enable 41 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 42 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 43 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 44 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 45 | #extension GL_EXT_buffer_reference : enable 46 | #extension GL_EXT_buffer_reference2 : enable 47 | #extension GL_EXT_scalar_block_layout : enable 48 | #extension GL_EXT_shader_atomic_int64 : enable 49 | 50 | #extension GL_EXT_control_flow_attributes : require 51 | #extension GL_KHR_shader_subgroup_vote : require 52 | #extension GL_KHR_shader_subgroup_ballot : require 53 | #extension GL_KHR_shader_subgroup_shuffle : require 54 | #extension GL_KHR_shader_subgroup_basic : require 55 | #extension GL_KHR_shader_subgroup_clustered : require 56 | #extension GL_KHR_shader_subgroup_arithmetic : require 57 | #extension GL_EXT_shader_subgroup_extended_types_int64 : require 58 | 59 | #include "shaderio.h" 60 | 61 | //////////////////////////////////////////// 62 | 63 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer 64 | { 65 | Readback readback; 66 | }; 
// Turns the per-size gap counts into ranges within the binned gap array.
//
// Each invocation handles one allocation size: it reads how many gaps of that
// size exist, reserves that many consecutive slots through the shared counter,
// stores the start of the reservation as the range offset and finally clears
// the count again.
void main()
{
  uint sizeBin = gl_GlobalInvocationID.x;

  // one thread per possible allocation size
  if (sizeBin >= streaming.clasAllocator.maxAllocationSize)
  {
    return;
  }

  // the previous kernel `stream_allocator_build_freegaps.comp.glsl` left the
  // number of gaps of this size in `count`
  uint gapsOfThisSize = uint(streaming.clasAllocator.freeSizeRanges.d[sizeBin].count);

  // reserve a consecutive region of `streaming.clasAllocator.freeGapsPosBinned`
  // for the list of gaps that have this size
  uint regionStart = atomicAdd(streamingRW.clasAllocator.freeGapsCounter, gapsOfThisSize);

  // publish where the region of this size begins
  streaming.clasAllocator.freeSizeRanges.d[sizeBin].offset = regionStart;

  // the count is re-used as insertion cursor by
  // `stream_allocator_freegaps_insert.comp.glsl`, so it restarts at zero
  streaming.clasAllocator.freeSizeRanges.d[sizeBin].count = 0;
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | 22 | Shader Description 23 | ================== 24 | 25 | Note: The sample showcases two ways to manage CLAS memory on the device. 26 | One using a persistent allocator system (`stream_allocator...` files), 27 | and one using a simple compaction scheme (`stream_compaction...` files). 28 | This file is part of the allocator system. 29 | 30 | This compute shader handles de-allocation of clas memory space 31 | of unloaded groups. 32 | 33 | It marks the appropriate bits of the memory regions as empty again. 34 | `streaming.clasAllocator.usedBits` is modified accordingly. 
// Releases the CLAS memory region of one unloaded group.
//
// Each invocation handles one unload patch: it reconstructs the position and
// size of the group's allocation (in allocator granularity units), updates the
// allocator statistics and clears the matching bits in
// `streaming.clasAllocator.usedBits`.
void main()
{
  uint threadID = gl_GlobalInvocationID.x;
  bool valid    = threadID < streaming.update.patchUnloadGroupsCount;

  if (valid)
  {
    // unloads come first in patches
    // (fetched only for in-range threads, so surplus threads never read
    // beyond the patches that were provided)
    StreamingPatch spatch = streaming.update.patches.d[threadID];

    Group group = Group_in(geometries[spatch.geometryID].streamingGroupAddresses.d[spatch.groupIndex]).d;

    // get the first clas address of the group, as all clas of a
    // group are allocated together
    uint64_t firstClasAddress = streaming.resident.clasAddresses.d[group.clusterResidentID];
    // then convert this into a relative address compared to the clas base address
    uint64_t firstClasOffset = firstClasAddress - streaming.resident.clasBaseAddress;

    // recreate the allocation properties of the group
    // get allocation position in units
    uint allocPos = uint(firstClasOffset >> streaming.clasAllocator.granularityByteShift);
    // retrieve the size of allocation as well as the associated memory waste
    uvec2 groupSize = streaming.resident.groupClasSizes.d[group.residentID];
    // allocation size was stored in units, which is what we need here, but wasted size in bytes
    uint allocSize      = groupSize.x;
    uint wastedByteSize = groupSize.y;

    // for stats
    atomicAdd(streamingRW.clasAllocator.stats.d.allocatedSize, -int64_t(allocSize << streaming.clasAllocator.granularityByteShift));
    atomicAdd(streamingRW.clasAllocator.stats.d.wastedSize, -int64_t(wastedByteSize));

    // for allocation management, tag bits as unused
    //
    // allocPos and allocSize are in minimum granularity,
    // which is what we use to tag the appropriate bits.

    uint startPos = allocPos;
    uint endPos   = allocPos + allocSize - 1;

    uint startBit = (startPos) & 31;
    uint endBit   = (endPos) & 31;

    uint start32 = startPos / 32;
    uint end32   = endPos / 32;

    // startMask: all bits at or above startBit
    // endMask:   all bits at or below endBit
    uint startMask = ~0;
    uint endMask   = ~0;

    if (startBit != 0)
    {
      startMask = ~((1u << (startBit))-1);
    }
    if (endBit != 31)
    {
      endMask = (1u << (endBit + 1))-1;
    }

    bool single32 = start32 == end32;
    if (single32)
    {
      // The allocation lives within one word, so only the bits that are both
      // at/above startBit AND at/below endBit belong to it. Combining the
      // masks with OR would select the entire word and thereby also release
      // bits owned by neighbouring allocations.
      startMask = endMask & startMask;
    }

    // start and end of an allocated region may end up in the same u32,
    // hence we need atomics for start and end

    uint oldMask = atomicAnd(streaming.clasAllocator.usedBits.d[start32], ~startMask);
#if STREAMING_DEBUG_FREEGAPS_OVERLAP
    // for debugging we test if the region was indeed fully used
    bool hadError = false;
    if ((oldMask & startMask) != startMask){
      hadError = true;
    }
#endif

    if (!single32)
    {
      // process the region that is exclusively covered by this allocation
      for (uint32_t i = start32 + 1; i < end32; i++)
      {
#if STREAMING_DEBUG_FREEGAPS_OVERLAP
        // NOTE(review): this only flags words that were completely free; a
        // partially used word passes unnoticed - confirm this is intended.
        if(streaming.clasAllocator.usedBits.d[i] == 0){
          hadError = true;
        }
#endif
        streaming.clasAllocator.usedBits.d[i] = 0;
      }

      oldMask = atomicAnd(streaming.clasAllocator.usedBits.d[end32], ~endMask);
#if STREAMING_DEBUG_FREEGAPS_OVERLAP
      if ((oldMask & endMask) != endMask){
        hadError = true;
      }
#endif
    }
#if STREAMING_DEBUG_FREEGAPS_OVERLAP
    // report the failing thread, offset by one so that zero means "no error"
    if (hadError){
      streamingRW.request.errorClasDealloc = 1 + threadID;
    }
#endif
  }
}
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | 22 | Shader Description 23 | ================== 24 | 25 | Note: The sample showcases two ways to manage CLAS memory on the device. 26 | One using a persistent allocator system (`stream_allocator...` files), 27 | and one using a simple compaction scheme (`stream_compaction...` files). 28 | 29 | This compute shader compacts cluster CLAS storage 30 | of all newly built clusters. They are appended after the 31 | compaction of old clusters CLAS. 32 | 33 | The compaction is done in `stream_compaction_old_clas.comp.glsl` 34 | 35 | A thread represents one newly built CLAS. 
// Assigns every newly built CLAS its compacted destination and records the
// move from its current address to that destination.
//
// One invocation handles one newly built CLAS. The destination is obtained by
// advancing the shared byte cursor `update.moveClasSize`, the same cursor the
// old-CLAS compaction shader advances.
void main()
{
  uint newIndex = gl_GlobalInvocationID.x;

  // can load pre-emptively given the array is guaranteed to be sized as
  // multiple of the workgroup size (STREAM_COMPACTION_NEW_CLAS_WORKGROUP)
  uint residentSlot = streaming.update.newClasResidentIDs.d[newIndex];

  bool inRange = newIndex < streaming.update.newClasCount;

  // surplus threads contribute a size of zero to the cursor below
  uint     byteSize   = 0;
  uint64_t srcAddress = 0;

  if (inRange)
  {
    byteSize   = streaming.update.newClasSizes.d[newIndex];
    srcAddress = streaming.update.newClasAddresses.d[newIndex];
  }

  // every thread takes part in the atomic, out-of-range ones add nothing
  uint64_t cursorBefore = atomicAdd(streamingRW.update.moveClasSize, uint64_t(byteSize));
  uint64_t dstAddress   = cursorBefore + streaming.resident.clasBaseAddress;

  if (!inRange)
  {
    return;
  }

  // NOTE(review): the move entry is indexed by the new clas index, whereas
  // `stream_compaction_old_clas.comp.glsl` appends into the same arrays via
  // `moveClasCounter`. Presumably both lists are consumed by separate move
  // operations - confirm on the host side.

  // set up move to new destination
  streaming.update.moveClasSrcAddresses.d[newIndex] = srcAddress;
  streaming.update.moveClasDstAddresses.d[newIndex] = dstAddress;

  // update internal state of destination
  streaming.resident.clasAddresses.d[residentSlot] = dstAddress;
  streaming.resident.clasSizes.d[residentSlot]     = byteSize;
}
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | 22 | Shader Description 23 | ================== 24 | 25 | Note: The sample showcases two ways to manage CLAS memory on the device. 26 | One using a persistent allocator system (`stream_allocator...` files), 27 | and one using a simple compaction scheme (`stream_compaction...` files). 28 | This file is part of the compaction scheme. 29 | 30 | This compute shader compacts / defrags cluster CLAS storage 31 | of all previously active resident groups. 32 | 33 | A thread represents one resident group. 34 | */ 35 | 36 | #version 460 37 | 38 | #extension GL_GOOGLE_include_directive : enable 39 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 40 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 41 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 42 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 43 | #extension GL_EXT_buffer_reference : enable 44 | #extension GL_EXT_buffer_reference2 : enable 45 | #extension GL_EXT_scalar_block_layout : enable 46 | #extension GL_EXT_shader_atomic_int64 : enable 47 | 48 | #extension GL_EXT_control_flow_attributes : require 49 | #extension GL_KHR_shader_subgroup_vote : require 50 | #extension GL_KHR_shader_subgroup_ballot : require 51 | #extension GL_KHR_shader_subgroup_shuffle : require 52 | #extension GL_KHR_shader_subgroup_basic : require 53 | #extension GL_KHR_shader_subgroup_clustered : require 54 | #extension GL_KHR_shader_subgroup_arithmetic : require 55 | #extension GL_EXT_shader_subgroup_extended_types_int64 : require 56 | 57 | #include "shaderio.h" 58 | 59 | //////////////////////////////////////////// 60 | 61 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer 62 | { 63 | Readback readback; 64 | }; 65 | 66 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer 
// Compacts the CLAS storage of all previously resident groups.
//
// One invocation handles one resident group and walks over its clusters. Each
// cluster gets a new destination from the shared byte cursor
// `update.moveClasSize`; if that destination differs from the current address
// a move is appended and the resident address table is updated.
void main()
{
  uint threadID = gl_GlobalInvocationID.x;

  // can load pre-emptively given the array is guaranteed to be sized as
  // multiple of the workgroup size (STREAM_COMPACTION_OLD_CLAS_WORKGROUP)
  uint residentGroupSlot = streaming.resident.activeGroups.d[threadID];

  // old resident groups come first; everything at or beyond this offset was
  // newly loaded and is ignored here
  if (threadID >= streaming.update.loadActiveGroupsOffset)
  {
    return;
  }

  // Moving this many clas around is not recommended in general, but it
  // avoids a more sophisticated clas allocation scheme. Packing the old clas
  // tightly lets the newly built ones be appended at the end.

  Group residentGroup = streaming.resident.groups.d[residentGroupSlot].group.d;

  // TODO improve divergence
  for (uint clusterIndex = 0; clusterIndex < residentGroup.clusterCount; clusterIndex++)
  {
    uint slot = residentGroup.clusterResidentID + clusterIndex;

    uint     byteSize   = streaming.resident.clasSizes.d[slot];
    uint64_t srcAddress = streaming.resident.clasAddresses.d[slot];

    uint64_t cursorBefore = atomicAdd(streamingRW.update.moveClasSize, uint64_t(byteSize));
    uint64_t dstAddress   = cursorBefore + streaming.resident.clasBaseAddress;

    // identical addresses need no move (in reality this will hardly happen
    // due to the non-deterministic nature of atomicAdd)
    bool relocated = dstAddress != srcAddress;

    // the atomic is issued in either case, it just adds zero when nothing moves
    uint moveSlot = atomicAdd(streamingRW.update.moveClasCounter, relocated ? 1 : 0);

    if (!relocated)
    {
      continue;
    }

    // set up move to new destination
    streaming.update.moveClasSrcAddresses.d[moveSlot] = srcAddress;
    streaming.update.moveClasDstAddresses.d[moveSlot] = dstAddress;

    // update internal state of destination
    streaming.resident.clasAddresses.d[slot] = dstAddress;
    streaming.resident.clasSizes.d[slot]     = byteSize;
  }
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | 22 | Shader Description 23 | ================== 24 | 25 | This compute shader does a few simple operations that require only a single thread. 26 | 27 | STREAM_SETUP_... are enums for the various operations 28 | 29 | */ 30 | 31 | #version 460 32 | 33 | #extension GL_GOOGLE_include_directive : enable 34 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 38 | #extension GL_EXT_buffer_reference : enable 39 | #extension GL_EXT_buffer_reference2 : enable 40 | #extension GL_EXT_scalar_block_layout : enable 41 | #extension GL_EXT_shader_atomic_int64 : enable 42 | 43 | #extension GL_EXT_control_flow_attributes : require 44 | #extension GL_KHR_shader_subgroup_ballot : require 45 | #extension GL_KHR_shader_subgroup_shuffle : require 46 | #extension GL_KHR_shader_subgroup_basic : require 47 | #extension GL_KHR_shader_subgroup_clustered : require 48 | #extension GL_KHR_shader_subgroup_arithmetic : require 49 | 50 | #include "shaderio.h" 51 | 52 | layout(push_constant) uniform pushData 53 | { 54 | uint setup; 55 | } push; 56 | 57 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer 58 | { 59 | FrameConstants view; 60 | }; 61 | 62 | 
// Single-threaded setup shader. `push.setup` selects which of the
// STREAM_SETUP_... operations is executed; each operation is handled by its own
// guarded section that returns once it is done.
void main()
{
  uint setupMode  = push.setup;
  bool firstFrame = streaming.frameIndex == 1;  // note: streaming starts at frame 1, not 0
  bool hasPatches = streaming.update.patchGroupsCount > 0;

  if (setupMode == STREAM_SETUP_COMPACTION_OLD_NO_UNLOADS)
  {
    // No compaction of old clas is done when there are no unloads.
    // Appending new clas still depends on `moveClasSize` being configured
    // correctly, so that the new data is appended after it.
    if (!firstFrame)
    {
      streamingRW.update.moveClasSize = streaming.resident.clasCompactionUsedSize.d[0];
    }
    else
    {
      // the first streaming frame resets the persistently stored value to zero
      streaming.resident.clasCompactionUsedSize.d[0] = 0;
      streamingRW.update.moveClasSize                = 0;
    }
    return;
  }

  if (setupMode == STREAM_SETUP_COMPACTION_STATUS)
  {
    // move compaction for clas memory management
    if (!hasPatches)
    {
      // no update, pull the value from persistent storage
      streamingRW.request.clasCompactionUsedSize = streaming.resident.clasCompactionUsedSize.d[0];
      streamingRW.request.clasCompactionCount    = 0;
    }
    else
    {
      // persistently store the total compacted clas size
      streaming.resident.clasCompactionUsedSize.d[0] = streamingRW.update.moveClasSize;
      // values for readback
      streamingRW.request.clasCompactionUsedSize = streamingRW.update.moveClasSize;
      streamingRW.request.clasCompactionCount    = streamingRW.update.moveClasCounter;
    }
    return;
  }

  if (setupMode == STREAM_SETUP_ALLOCATOR_FREEINSERT)
  {
    // read the counter before it is reset below
    uint pendingGaps = streaming.clasAllocator.freeGapsCounter;
    uint gapCapacity = (streaming.clasAllocator.sectorCount << streaming.clasAllocator.sectorSizeShift);

    // reset to zero for `stream_allocator_setup_insertion.comp.glsl`
    streamingRW.clasAllocator.freeGapsCounter = 0;

    // Set up the indirect dispatch that inserts the free gaps into the lists
    // (presumably consumed by `stream_allocator_freegaps_insert.comp.glsl`).
    // The thread count is clamped to the capacity and rounded up to whole workgroups.
    uint insertCount = min(pendingGaps, gapCapacity);
    streamingRW.clasAllocator.dispatchFreeGapsInsert.gridX =
        (insertCount + STREAM_ALLOCATOR_FREEGAPS_INSERT_WORKGROUP - 1) / STREAM_ALLOCATOR_FREEGAPS_INSERT_WORKGROUP;
    streamingRW.clasAllocator.dispatchFreeGapsInsert.gridY = 1;
    streamingRW.clasAllocator.dispatchFreeGapsInsert.gridZ = 1;

#if STREAMING_DEBUG_USEDBITS_COUNT
    // error check of the allocation state prior to adding new groups:
    // the allocated byte size must match the used-bits count scaled by the granularity
    uint64_t allocatedSize = streaming.clasAllocator.stats.d.allocatedSize;
    if (streaming.clasAllocator.usedBitsCount > 0
        && allocatedSize != uint64_t(streaming.clasAllocator.usedBitsCount) << streaming.clasAllocator.granularityByteShift)
    {
      streamingRW.request.errorClasUsedVsAlloc =
          int(allocatedSize >> streaming.clasAllocator.granularityByteShift) - int(streaming.clasAllocator.usedBitsCount);
    }
#endif
    return;
  }

  if (setupMode == STREAM_SETUP_ALLOCATOR_STATUS)
  {
    if (firstFrame)
    {
      // seed everything as available for the first frame
      uint maxSizedLeft = streaming.clasAllocator.sectorMaxAllocationSized * streaming.clasAllocator.sectorCount;

      streaming.clasAllocator.stats.d.allocatedSize = 0;
      streaming.clasAllocator.stats.d.wastedSize =
          streaming.clasAllocator.baseWastedSize << streaming.clasAllocator.granularityByteShift;

      streaming.resident.clasAllocatedMaxSizedLeft.d[0] = maxSizedLeft;
      streamingRW.request.clasAllocatedMaxSizedLeft     = maxSizedLeft;
    }
    else if (hasPatches)
    {
      // persistent allocator for clas memory management;
      // the range count can be negative, hence the clamp to zero
      uint maxSizedLeft =
          uint(max(0, streaming.clasAllocator.freeSizeRanges.d[streaming.clasAllocator.maxAllocationSize - 1].count));

      streaming.resident.clasAllocatedMaxSizedLeft.d[0] = maxSizedLeft;
      streamingRW.request.clasAllocatedMaxSizedLeft     = maxSizedLeft;
    }
    else
    {
      // no update, pull the value from persistent storage
      streamingRW.request.clasAllocatedMaxSizedLeft = streaming.resident.clasAllocatedMaxSizedLeft.d[0];
    }

    // statistics are always forwarded for readback
    streamingRW.request.clasAllocatedUsedSize   = streaming.clasAllocator.stats.d.allocatedSize;
    streamingRW.request.clasAllocatedWastedSize = streaming.clasAllocator.stats.d.wastedSize;
  }
}
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | 22 | Shader Description 23 | ================== 24 | 25 | This compute shader handles updating the scene. 26 | Previous requests to load/unload have been completed and 27 | are provided for patching the scene. 28 | 29 | Effectively we are manipulating the geometries' 30 | `streamingGroupAddresses` array that points to the resident 31 | memory location of a group (or tags it invalid). 32 | 33 | Furthermore when ray tracing is required we prepare building 34 | new CLAS for the loaded groups' clusters. 35 | 36 | After building is completed we run the `stream_move_new_clas.comp.glsl` 37 | to move them from temporary to final location. 38 | 39 | A thread represents a single patch operation, which takes care of 40 | one group. 41 | */ 42 | 43 | #version 460 44 | 45 | #extension GL_GOOGLE_include_directive : enable 46 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 47 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 48 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 49 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 50 | #extension GL_EXT_buffer_reference : enable 51 | #extension GL_EXT_buffer_reference2 : enable 52 | #extension GL_EXT_scalar_block_layout : enable 53 | #extension GL_EXT_shader_atomic_int64 : enable 54 | 55 | #extension GL_EXT_control_flow_attributes : require 56 | #extension GL_KHR_shader_subgroup_vote : require 57 | #extension GL_KHR_shader_subgroup_ballot : require 58 | #extension GL_KHR_shader_subgroup_shuffle : require 59 | #extension GL_KHR_shader_subgroup_basic : require 60 | #extension GL_KHR_shader_subgroup_clustered : require 61 | #extension GL_KHR_shader_subgroup_arithmetic : require 62 | 63 | #include "shaderio.h" 64 | 65 | //////////////////////////////////////////// 66 | 67 | layout(scalar, binding = 
// Applies one streaming patch (load or unload of a single group) to the scene.
//
// Every patch overwrites the geometry's `streamingGroupAddresses` entry with the
// patch's group address. Patches with an index below `patchUnloadGroupsCount` are
// unloads; the remaining ones are loads, which additionally register the group in
// the residency tables and, for ray tracing, emit the CLAS build inputs of all
// clusters of the group.
void main()
{
  uint threadID = gl_GlobalInvocationID.x;

  // Loaded before the range check; per the original note the array is guaranteed
  // to be sized as a multiple of STREAM_UPDATE_SCENE_WORKGROUP.
  // The same patch layout is used for both load and unload.
  StreamingPatch patchOp = streaming.update.patches.d[threadID];

  if (!(threadID < streaming.update.patchGroupsCount))
  {
    return;
  }

  bool isUnload = threadID < streaming.update.patchUnloadGroupsCount;

#if STREAMING_DEBUG_ADDRESSES
  // remember which resident slot the group occupied before the address is overwritten
  uint oldResidentID = 0;
  if (isUnload)
  {
    Group oldGroup = Group_in(geometries[patchOp.geometryID].streamingGroupAddresses.d[patchOp.groupIndex]).d;
    oldResidentID  = oldGroup.residentID;
  }
#endif

  geometries[patchOp.geometryID].streamingGroupAddresses.d[patchOp.groupIndex] = patchOp.groupAddress;

  if (isUnload)
  {
#if STREAMING_DEBUG_ADDRESSES
    // tag the vacated resident slot as invalid
    streaming.resident.groups.d[oldResidentID].group = Group_in(STREAMING_INVALID_ADDRESS_START);
#endif
    return;
  }

  // ---- load path ----

  uint  loadGroupIndex  = threadID - streaming.update.patchUnloadGroupsCount;
  Group loadedGroup     = Group_in(patchOp.groupAddress).d;
  uint  groupResidentID = loadedGroup.residentID;

  StreamingGroup residentEntry;
  residentEntry.clusterCount = loadedGroup.clusterCount;
  residentEntry.age          = 0;
  residentEntry.group        = Group_in(patchOp.groupAddress);

#if STREAMING_DEBUG_ADDRESSES
  // the slot we are about to fill is expected to hold an invalid address
  if (uint64_t(streaming.resident.groups.d[groupResidentID].group) < STREAMING_INVALID_ADDRESS_START)
    streamingRW.request.errorUpdate = groupResidentID;
#endif

  // update the description in the residency table
  streaming.resident.groups.d[groupResidentID] = residentEntry;

  // insert ourselves into the list of all active groups, after the old ones
  streaming.resident.activeGroups.d[streaming.update.loadActiveGroupsOffset + loadGroupIndex] = groupResidentID;

  // There may be a bit of divergence in the loop below, which is not considered critical.
  //
  // All new groups need to build new clusters. These are built into scratch
  // space first, and then moved to their final locations.

  uint buildBase = loadedGroup.streamingNewBuildOffset;

  for (uint clusterIndex = 0; clusterIndex < loadedGroup.clusterCount; clusterIndex++)
  {
    uint clusterResidentID = loadedGroup.clusterResidentID + clusterIndex;

    // clusters are stored directly behind the group header
    Cluster_in clusterRef = Cluster_in(patchOp.groupAddress + Group_size + Cluster_size * clusterIndex);
    streaming.resident.clusters.d[clusterResidentID] = uint64_t(clusterRef);

#if TARGETS_RAY_TRACING
    Cluster cluster = clusterRef.d;

    ClasBuildInfo buildInfo;
    buildInfo.clusterID    = clusterResidentID;
    buildInfo.clusterFlags = 0;

    buildInfo.packed = 0;
    buildInfo.packed |= PACKED_FLAG(ClasBuildInfo_packed_triangleCount, cluster.triangleCountMinusOne + 1);
    buildInfo.packed |= PACKED_FLAG(ClasBuildInfo_packed_vertexCount, cluster.vertexCountMinusOne + 1);
    buildInfo.packed |= PACKED_FLAG(ClasBuildInfo_packed_indexType, 1);

    buildInfo.baseGeometryIndexAndFlags = ClasGeometryFlag_OPAQUE_BIT_NV;

    // strides in bytes: 1 per index, 4 * 4 per vertex; unused inputs are zero
    buildInfo.indexBufferStride                 = uint16_t(1);
    buildInfo.vertexBufferStride                = uint16_t(4 * 4);
    buildInfo.geometryIndexAndFlagsBufferStride = uint16_t(0);
    buildInfo.opacityMicromapIndexBufferStride  = uint16_t(0);

    buildInfo.vertexBuffer = uint64_t(cluster.vertices);
    buildInfo.indexBuffer  = uint64_t(cluster.localTriangles);

    buildInfo.geometryIndexAndFlagsBuffer = 0;
    buildInfo.opacityMicromapArray        = 0;
    buildInfo.opacityMicromapIndexBuffer  = 0;

    streaming.update.newClasBuilds.d[buildBase + clusterIndex]      = buildInfo;
    streaming.update.newClasResidentIDs.d[buildBase + clusterIndex] = clusterResidentID;
#endif
  }
}
29 | */ 30 | 31 | #version 460 32 | 33 | #extension GL_GOOGLE_include_directive : enable 34 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 37 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 38 | #extension GL_EXT_buffer_reference : enable 39 | #extension GL_EXT_buffer_reference2 : enable 40 | #extension GL_EXT_scalar_block_layout : enable 41 | #extension GL_EXT_shader_atomic_int64 : enable 42 | 43 | #extension GL_EXT_control_flow_attributes : require 44 | #extension GL_KHR_shader_subgroup_vote : require 45 | #extension GL_KHR_shader_subgroup_ballot : require 46 | #extension GL_KHR_shader_subgroup_shuffle : require 47 | #extension GL_KHR_shader_subgroup_basic : require 48 | #extension GL_KHR_shader_subgroup_clustered : require 49 | #extension GL_KHR_shader_subgroup_arithmetic : require 50 | 51 | #extension GL_NV_shader_subgroup_partitioned : require 52 | 53 | #include "shaderio.h" 54 | 55 | //////////////////////////////////////////// 56 | 57 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer 58 | { 59 | FrameConstants view; 60 | FrameConstants viewLast; 61 | }; 62 | 63 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer 64 | { 65 | Readback readback; 66 | }; 67 | 68 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer 69 | { 70 | RenderInstance instances[]; 71 | }; 72 | 73 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer 74 | { 75 | Geometry geometries[]; 76 | }; 77 | 78 | layout(binding = BINDINGS_HIZ_TEX) uniform sampler2D texHizFar; 79 | 80 | layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer 81 | { 82 | SceneBuilding build; 83 | }; 84 | 85 | layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) buffer 
// Seeds the traversal queue with the root node of each rendered instance.
//
// One thread handles one instance. Threads past `build.numRenderInstances` are
// clamped to the last instance for their loads (so all reads stay in range) but
// are flagged invalid and must not produce any output.
void main()
{
  uint instanceID   = gl_GlobalInvocationID.x;
  uint instanceLoad = min(build.numRenderInstances-1, instanceID);
  // true only for threads that map to a real instance (no clamping happened)
  bool isValid      = instanceID == instanceLoad;

#if USE_SORTING
  // Remap to the sorted instance order. After this point `instanceID` and
  // `instanceLoad` are identical, so only `isValid` tells padding threads apart.
  instanceLoad = build.instanceSortValues.d[instanceLoad];
  instanceID   = instanceLoad;
#endif

  // TODO optimization:
  // For better loading behavior when streaming, the instances should be sorted
  // relative to camera position.

  RenderInstance instance = instances[instanceLoad];
  Geometry       geometry = geometries[instance.geometryID];

  vec4 clipMin;
  vec4 clipMax;
  bool clipValid;

  bool inFrustum = intersectFrustum(geometry.bbox.lo, geometry.bbox.hi, instance.worldMatrix, clipMin, clipMax, clipValid);
  bool isVisible = inFrustum && (!clipValid || (intersectSize(clipMin, clipMax) && intersectHiz(clipMin, clipMax)));

  uint status = (inFrustum ? INSTANCE_FRUSTUM_BIT : 0) |
                (isVisible ? INSTANCE_VISIBLE_BIT : 0);

  // culling only rejects root nodes when rasterizing with culling enabled
  bool doNode = isValid
#if USE_CULLING && TARGETS_RASTERIZATION
                && isVisible
#endif
      ;
  uvec4 voteNodes = subgroupBallot(doNode);

  // TODO optimization: enqueue all root children, so traversal can start with more nodes immediately
  // TODO feature: allow single-lod level render option by picking a single appropriate child of the root node
  //               The root hierarchy node of a geometry is up to 32 wide, and each child represents one distinct lod level.

  // One atomic per subgroup: the elected invocation reserves slots for all
  // participating invocations, the others derive their slot from the ballot.
  uint offsetNodes = 0;
  if (subgroupElect())
  {
    offsetNodes = atomicAdd(buildRW.traversalTaskCounter, int(subgroupBallotBitCount(voteNodes)));
  }

  offsetNodes = subgroupBroadcastFirst(offsetNodes);
  offsetNodes += subgroupBallotExclusiveBitCount(voteNodes);

  // the counter may exceed the queue capacity; such entries are dropped
  if (doNode && offsetNodes < build.maxTraversalInfos) {
    uint packedNode = geometry.nodes.d[0].packed;
    TraversalInfo traversalInfo;
    traversalInfo.instanceID = instanceID;
    traversalInfo.packedNode = packedNode;
    build.traversalNodeInfos.d[offsetNodes] = packTraversalInfo(traversalInfo);
  }

#if TARGETS_RAY_TRACING
  // Guard with `isValid`: comparing `instanceID == instanceLoad` is always true
  // once USE_SORTING has remapped both, which let padding threads rewrite the
  // state of the last sorted instance.
  if (isValid) {
    build.instanceStates.d[instanceID]                         = status;
    build.blasBuildInfos.d[instanceID].clusterReferencesCount  = 0;
    build.blasBuildInfos.d[instanceID].clusterReferencesStride = 8;
  }
#endif
}
17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | /* 21 | 22 | Shader Description 23 | ================== 24 | 25 | This compute shader computes the distance of the instance to the camera. 26 | 27 | A thread represents one instance. 28 | */ 29 | 30 | #version 460 31 | 32 | #extension GL_GOOGLE_include_directive : enable 33 | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable 34 | #extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable 35 | #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable 36 | #extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable 37 | #extension GL_EXT_buffer_reference : enable 38 | #extension GL_EXT_buffer_reference2 : enable 39 | #extension GL_EXT_scalar_block_layout : enable 40 | #extension GL_EXT_shader_atomic_int64 : enable 41 | 42 | #extension GL_EXT_control_flow_attributes : require 43 | #extension GL_KHR_shader_subgroup_vote : require 44 | #extension GL_KHR_shader_subgroup_ballot : require 45 | #extension GL_KHR_shader_subgroup_shuffle : require 46 | #extension GL_KHR_shader_subgroup_basic : require 47 | #extension GL_KHR_shader_subgroup_clustered : require 48 | #extension GL_KHR_shader_subgroup_arithmetic : require 49 | 50 | #extension GL_NV_shader_subgroup_partitioned : require 51 | 52 | #include "shaderio.h" 53 | 54 | //////////////////////////////////////////// 55 | 56 | layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer 57 | { 58 | FrameConstants view; 59 | FrameConstants viewLast; 60 | }; 61 | 62 | layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer 63 | { 64 | Readback readback; 65 | }; 66 | 67 | layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer 68 | { 69 | RenderInstance instances[]; 70 | }; 71 | 72 | layout(scalar, binding = BINDINGS_GEOMETRIES_SSBO, set = 0) buffer geometryBuffer 73 | { 74 | Geometry geometries[]; 75 | }; 76 | 77 | layout(binding = BINDINGS_HIZ_TEX) 
// Computes the per-instance sort key: the world-space distance between the
// camera and the closest point of the instance's bounding box.
//
// One thread handles one instance; threads beyond `build.numRenderInstances`
// load the last instance (clamped index) and write nothing.
void main()
{
  uint threadIndex = gl_GlobalInvocationID.x;
  uint loadIndex   = min(build.numRenderInstances-1, threadIndex);

  RenderInstance instance = instances[loadIndex];
  Geometry       geometry = geometries[instance.geometryID];

  // camera position expressed in the instance's object space
  mat4 objectFromWorld = inverse(instance.worldMatrix);
  vec3 cameraObject    = (objectFromWorld * vec4(view.viewPos.xyz,1)).xyz;

  // Per component, both bound tests must agree; for a box with lo <= hi that is
  // only possible when both pass, i.e. the camera lies inside the box.
  bvec3 aboveLo        = greaterThanEqual(cameraObject, geometry.bbox.lo);
  bvec3 belowHi        = lessThanEqual(cameraObject, geometry.bbox.hi);
  bool  cameraInBounds = all(equal(aboveLo, belowHi));

  // inside: measure towards the box center, outside: towards the closest point on the box
  vec3 anchorObject;
  if (cameraInBounds)
  {
    anchorObject = (geometry.bbox.lo + geometry.bbox.hi) * 0.5;
  }
  else
  {
    anchorObject = clamp(cameraObject, geometry.bbox.lo, geometry.bbox.hi);
  }

  vec4 anchorWorld = instance.worldMatrix * vec4(anchorObject, 1);

  // padding threads were clamped to another index and must not write
  if (threadIndex != loadIndex)
  {
    return;
  }

  // value is the instance index, key is the raw bit pattern of the distance
  build.instanceSortValues.d[threadIndex] = threadIndex;
  build.instanceSortKeys.d[threadIndex]   = floatBitsToUint(distance(anchorWorld.xyz, view.viewPos.xyz));
}
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2018-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | 21 | #ifndef HBAOPASS_H__ 22 | #define HBAOPASS_H__ 23 | 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | ////////////////////////////////////////////////////////////////////////// 34 | 35 | /// HbaoSystem implements a screen-space 36 | /// ambient occlusion effect using 37 | /// horizon-based ambient occlusion. 
38 | /// See https://github.com/nvpro-samples/gl_ssao 39 | /// for more details 40 | 41 | class HbaoPass 42 | { 43 | public: 44 | static const int RANDOM_SIZE = 4; 45 | static const int RANDOM_ELEMENTS = RANDOM_SIZE * RANDOM_SIZE; 46 | 47 | struct Config 48 | { 49 | VkFormat targetFormat; 50 | uint32_t maxFrames; 51 | }; 52 | 53 | void init(VkDevice device, nvvk::ResourceAllocator* allocator, nvvk::ShaderModuleManager* shaderManager, const Config& config); 54 | void reloadShaders(); 55 | void deinit(); 56 | 57 | struct FrameConfig 58 | { 59 | bool blend; 60 | 61 | uint32_t sourceWidthScale; 62 | uint32_t sourceHeightScale; 63 | 64 | uint32_t targetWidth; 65 | uint32_t targetHeight; 66 | 67 | VkDescriptorImageInfo sourceDepth; 68 | VkDescriptorImageInfo targetColor; 69 | }; 70 | 71 | struct FrameIMGs 72 | { 73 | nvvk::Texture depthlinear, viewnormal, result, blur, resultarray, deptharray; 74 | }; 75 | 76 | struct Frame 77 | { 78 | uint32_t slot = ~0u; 79 | 80 | FrameIMGs images; 81 | int width; 82 | int height; 83 | 84 | FrameConfig config; 85 | }; 86 | 87 | bool initFrame(Frame& frame, const FrameConfig& config, VkCommandBuffer cmd); 88 | void deinitFrame(Frame& frame); 89 | 90 | 91 | struct View 92 | { 93 | bool isOrtho; 94 | float nearPlane; 95 | float farPlane; 96 | float halfFovyTan; 97 | glm::mat4 projectionMatrix; 98 | }; 99 | 100 | struct Settings 101 | { 102 | View view; 103 | 104 | float unit2viewspace = 1.0f; 105 | float intensity = 1.0f; 106 | float radius = 1.0f; 107 | float bias = 0.1f; 108 | float blurSharpness = 40.0f; 109 | }; 110 | 111 | // before: must do appropriate barriers for color write access and depth read access 112 | // after: from compute write to whatever output image needs 113 | void cmdCompute(VkCommandBuffer cmd, const Frame& frame, const Settings& settings) const; 114 | 115 | private: 116 | struct Shaders 117 | { 118 | nvvk::ShaderModuleID depth_linearize, viewnormal, blur, blur_apply, deinterleave, calc, reinterleave; 119 | }; 120 
| 121 | struct Pipelines 122 | { 123 | VkPipeline depth_linearize = VK_NULL_HANDLE; 124 | VkPipeline viewnormal = VK_NULL_HANDLE; 125 | VkPipeline blur = VK_NULL_HANDLE; 126 | VkPipeline blur_apply = VK_NULL_HANDLE; 127 | VkPipeline deinterleave = VK_NULL_HANDLE; 128 | VkPipeline calc = VK_NULL_HANDLE; 129 | VkPipeline reinterleave = VK_NULL_HANDLE; 130 | }; 131 | 132 | VkDevice m_device; 133 | nvvk::ResourceAllocator* m_allocator; 134 | nvvk::ShaderModuleManager* m_shaderManager; 135 | nvh::TRangeAllocator<1> m_slots; 136 | Config m_config; 137 | 138 | nvvk::DescriptorSetContainer m_setup; 139 | 140 | nvvk::Buffer m_ubo; 141 | VkDescriptorBufferInfo m_uboInfo; 142 | 143 | VkSampler m_linearSampler; 144 | 145 | Shaders m_shaders; 146 | Pipelines m_pipelines; 147 | 148 | glm::vec4 m_hbaoRandom[RANDOM_ELEMENTS]; 149 | 150 | void updatePipelines(); 151 | void updateUbo(VkCommandBuffer cmd, const Frame& frame, const Settings& settings) const; 152 | }; 153 | 154 | #endif -------------------------------------------------------------------------------- /src/nvhiz_vk.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #ifndef _NVHIZ_H__ 21 | #define _NVHIZ_H__ 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | class NVHizVK 31 | { 32 | private: 33 | enum ProgViewMode : uint32_t 34 | { 35 | PROG_VIEW_MONO, 36 | PROG_VIEW_STEREO, 37 | PROG_VIEW_COUNT, 38 | }; 39 | 40 | enum ProgHizMode : uint32_t 41 | { 42 | PROG_HIZ_FAR, 43 | PROG_HIZ_FAR_AND_NEAR, 44 | PROG_HIZ_FAR_REST, 45 | PROG_HIZ_COUNT, 46 | }; 47 | 48 | public: 49 | static const uint32_t MAX_MIP_LEVELS = 16; 50 | static const uint32_t SHADER_COUNT = (uint32_t(PROG_HIZ_COUNT) * uint32_t(PROG_VIEW_COUNT)); 51 | 52 | enum BindingSlots 53 | { 54 | // keep in sync with glsl 55 | BINDING_READ_DEPTH, 56 | BINDING_READ_FAR, 57 | BINDING_WRITE_NEAR, 58 | BINDING_WRITE_FAR, 59 | BINDING_COUNT, 60 | }; 61 | 62 | struct TextureInfo 63 | { 64 | // allocation 65 | uint32_t width; 66 | uint32_t height; 67 | uint32_t mipLevels; 68 | VkFormat format; 69 | VkImageAspectFlags aspect; 70 | 71 | // the system may use only a sub-rectangle of the allocated width/height 72 | // you should clamp access to this, when sampling the texture 73 | uint32_t usedWidth; 74 | uint32_t usedHeight; 75 | 76 | // xy scale and zw clamp 77 | // use min(uv*factor.xy,factor.zw) for lookups 78 | void getShaderFactors(float factors[4]) const; 79 | float getSizeMax() const; 80 | }; 81 | 82 | struct Update 83 | { 84 | // provide texture/views that are not layered 85 | VkImageView sourceImageView; // 2DMS if createInfo.msaaLevel set, otherwise 2D 86 | VkImageView nearImageView; // 2D optional 87 | VkImageView farImageView; // 2D all mips 88 | VkImageView farImageViews[MAX_MIP_LEVELS]; // 2D single mip 89 | 90 | VkDescriptorImageInfo farImageInfo; 91 | VkDescriptorImageInfo nearImageInfo; 92 | 93 | VkImage sourceImage; 94 | VkImage nearImage; // optional 95 | VkImage farImage; 96 | 97 | 
TextureInfo sourceInfo; 98 | TextureInfo farInfo; 99 | TextureInfo nearInfo; 100 | bool stereo; // textures are layered, and updates layer 0,1 101 | 102 | Update() { memset(this, 0, sizeof(Update)); } 103 | }; 104 | 105 | struct DescriptorUpdate 106 | { 107 | VkWriteDescriptorSet writeSets[BINDING_COUNT]; 108 | VkDescriptorImageInfo imageInfos[BINDING_COUNT + MAX_MIP_LEVELS - 1]; 109 | }; 110 | 111 | struct Config 112 | { 113 | int msaaSamples = 0; 114 | bool reversedZ = false; 115 | bool supportsSubGroupShuffle = false; 116 | bool supportsMinmaxFilter = false; 117 | }; 118 | 119 | 120 | void init(VkDevice device, const Config& config, uint32_t descrSetsCount); 121 | 122 | VkSampler getReadFarSampler() const; 123 | const VkDescriptorPoolSize* getDescriptorPoolSizes(uint32_t& count) const; 124 | VkDescriptorSetLayout getDescriptorSetLayout() const; 125 | std::string getShaderDefines(uint32_t shader) const; 126 | #if 0 127 | void appendShaderDefines(uint32_t shader, shaderc::CompileOptions& options) const; 128 | #endif 129 | void initPipelines(const VkShaderModule modules[SHADER_COUNT]); 130 | 131 | void deinit(); 132 | 133 | void setupUpdateInfos(Update& update, uint32_t width, uint32_t height, VkFormat sourceFormat, VkImageAspectFlags sourceAspect) const; 134 | void setupDescriptorUpdate(DescriptorUpdate& updateWrite, const Update& update, VkDescriptorSet set) const; 135 | 136 | void cmdUpdateHiz(VkCommandBuffer cmd, const Update& update, VkDescriptorSet set) const; 137 | 138 | // optional utility functions 139 | void initUpdateViews(Update& update) const; 140 | void deinitUpdateViews(Update& update) const; 141 | 142 | // if descrSetsCount was non zero 143 | void updateDescriptorSet(const Update& update, uint32_t setIdx) const; 144 | // if descrSetsCount was non zero 145 | void cmdUpdateHiz(VkCommandBuffer cmd, const Update& update, uint32_t setIdx) const 146 | { 147 | cmdUpdateHiz(cmd, update, m_descrSets[setIdx]); 148 | } 149 | 150 | private: 151 | struct 
InternalConfig : public Config 152 | { 153 | uint32_t hizLevels = 1; 154 | uint32_t hizNearLevel = 0; 155 | uint32_t hizFarLevel = 0; 156 | }; 157 | 158 | static void getShaderIndexConfig(uint32_t index, ProgHizMode& hiz, ProgViewMode& view) 159 | { 160 | hiz = ProgHizMode(index % uint32_t(PROG_HIZ_COUNT)); 161 | view = ProgViewMode(index / uint32_t(PROG_HIZ_COUNT)); 162 | } 163 | 164 | static uint32_t getShaderIndex(ProgHizMode hiz, ProgViewMode view) { return view * uint32_t(PROG_HIZ_COUNT) + hiz; } 165 | 166 | struct PushConstants 167 | { 168 | // keep in sync with glsl 169 | int srcSize[4]; 170 | int writeLod; 171 | int startLod; 172 | int layer; 173 | int _pad0; 174 | int levelActive[4]; 175 | }; 176 | 177 | void deinitPipelines(); 178 | 179 | InternalConfig m_config = {}; 180 | VkDevice m_device = {}; 181 | VkSampler m_readDepthSampler = {}; 182 | VkSampler m_readFarSampler = {}; 183 | VkSampler m_readNearSampler = {}; 184 | VkPipeline m_pipelines[SHADER_COUNT] = {0}; 185 | VkPipelineLayout m_pipelineLayout = {}; 186 | VkDescriptorSetLayout m_descrLayout = {}; 187 | VkDescriptorPoolSize m_poolSizes[2]; 188 | uint32_t m_descrSetsCount = 0; 189 | VkDescriptorPool m_descrPool = {}; 190 | VkDescriptorSet* m_descrSets = {}; 191 | }; 192 | 193 | #endif 194 | -------------------------------------------------------------------------------- /src/renderer.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | #pragma once 20 | 21 | #include 22 | 23 | #include "resources.hpp" 24 | #include "scene.hpp" 25 | #include "scene_preloaded.hpp" 26 | #include "scene_streaming.hpp" 27 | 28 | namespace lodclusters { 29 | 30 | // There are two implementations for a renderable scene. 31 | // Everything is preloaded or we stream in data dynamically. 32 | class RenderScene 33 | { 34 | public: 35 | const Scene* scene = nullptr; 36 | bool useStreaming = false; 37 | ScenePreloaded scenePreloaded; 38 | SceneStreaming sceneStreaming; 39 | 40 | // pointers must stay valid during lifetime 41 | bool init(Resources* res, const Scene* scene_, const StreamingConfig& streamingConfig_, bool useStreaming_); 42 | void deinit(); 43 | 44 | void streamingReset(); 45 | 46 | bool updateClasRequired(bool state); 47 | 48 | const RBufferTyped& getShaderGeometriesBuffer() const; 49 | size_t getClasSize(bool reserved) const; 50 | size_t getOperationsSize() const; 51 | size_t getGeometrySize(bool reserved) const; 52 | }; 53 | 54 | struct RendererConfig 55 | { 56 | bool flipWinding = false; 57 | bool twoSided = false; 58 | bool useSorting = false; 59 | 60 | // the maximum number of renderable clusters per frame in bits i.e. 
(1 << number) 61 | uint32_t numRenderClusterBits = 20; 62 | // the maximum number of traversal intermediate tasks 63 | uint32_t numTraversalTaskBits = 20; 64 | 65 | // build flags for the cluster BLAS 66 | VkBuildAccelerationStructureFlagsKHR clusterBlasFlags = 0; 67 | }; 68 | 69 | class Renderer 70 | { 71 | public: 72 | struct ResourceUsageInfo 73 | { 74 | size_t rtTlasMemBytes{}; 75 | size_t rtBlasMemBytes{}; 76 | size_t rtClasMemBytes{}; 77 | size_t operationsMemBytes{}; 78 | size_t geometryMemBytes{}; 79 | 80 | void add(const ResourceUsageInfo& other) 81 | { 82 | rtTlasMemBytes += other.rtTlasMemBytes; 83 | rtBlasMemBytes += other.rtBlasMemBytes; 84 | rtClasMemBytes += other.rtClasMemBytes; 85 | operationsMemBytes += other.operationsMemBytes; 86 | geometryMemBytes += other.geometryMemBytes; 87 | } 88 | size_t getTotalSum() const 89 | { 90 | return rtTlasMemBytes + rtBlasMemBytes + rtClasMemBytes + geometryMemBytes + operationsMemBytes; 91 | } 92 | }; 93 | 94 | virtual bool init(Resources& res, RenderScene& rscene, const RendererConfig& config) = 0; 95 | virtual void render(VkCommandBuffer primary, Resources& res, RenderScene& rscene, const FrameConfig& frame, nvvk::ProfilerVK& profiler) = 0; 96 | virtual void deinit(Resources& res) = 0; 97 | virtual ~Renderer() {}; // Defined only so that inherited classes also have virtual destructors. Use deinit(). 98 | virtual void updatedFrameBuffer(Resources& res) { updatedFrameBufferBasics(res); }; 99 | 100 | virtual bool supportsClusters() const { return true; } 101 | 102 | inline ResourceUsageInfo getResourceUsage(bool reserved) const 103 | { 104 | return reserved ? 
m_resourceReservedUsage : m_resourceActualUsage; 105 | }; 106 | 107 | protected: 108 | bool initBasicShaders(Resources& res); 109 | void initBasics(Resources& res, RenderScene& rscene, const RendererConfig& config); 110 | void deinitBasics(Resources& res); 111 | 112 | void updatedFrameBufferBasics(Resources& res); 113 | 114 | void initWriteRayTracingDepthBuffer(Resources& res); 115 | void writeRayTracingDepthBuffer(VkCommandBuffer cmd); 116 | 117 | void initRenderInstanceBboxes(Resources& res, RenderScene& rscene); 118 | void renderInstanceBboxes(VkCommandBuffer cmd); 119 | 120 | struct BasicShaders 121 | { 122 | nvvk::ShaderModuleID fullScreenVertexShader; 123 | nvvk::ShaderModuleID fullScreenWriteDepthFragShader; 124 | nvvk::ShaderModuleID renderInstanceBboxesFragmentShader; 125 | nvvk::ShaderModuleID renderInstanceBboxesMeshShader; 126 | }; 127 | 128 | BasicShaders m_basicShaders; 129 | 130 | std::vector m_renderInstances; 131 | RBuffer m_renderInstanceBuffer; 132 | 133 | ResourceUsageInfo m_resourceReservedUsage{}; 134 | ResourceUsageInfo m_resourceActualUsage{}; 135 | 136 | nvvk::DescriptorSetContainer m_writeDepthBufferDsetContainer; 137 | VkPipeline m_writeDepthBufferPipeline = nullptr; 138 | 139 | nvvk::DescriptorSetContainer m_renderInstanceBboxesDsetContainer; 140 | VkPipeline m_renderInstanceBboxesPipeline = nullptr; 141 | 142 | RBuffer m_sortingAuxBuffer; 143 | }; 144 | 145 | ////////////////////////////////////////////////////////////////////////// 146 | 147 | std::unique_ptr makeRendererRasterClustersLod(); 148 | std::unique_ptr makeRendererRayTraceClustersLod(); 149 | 150 | } // namespace lodclusters 151 | -------------------------------------------------------------------------------- /src/scene_preloaded.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #pragma once 21 | 22 | #include "scene.hpp" 23 | #include "resources.hpp" 24 | #include "vk_nv_cluster_acc.h" 25 | 26 | namespace lodclusters { 27 | 28 | // With this class we pre-load all lod levels of the rendered scene. 29 | // It is much more memory intensive. 
30 | class ScenePreloaded 31 | { 32 | public: 33 | struct Config 34 | { 35 | VkBuildAccelerationStructureFlagsKHR clasBuildFlags = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR; 36 | uint32_t clasPositionTruncateBits = 0; 37 | }; 38 | 39 | // pointers must stay valid during lifetime 40 | bool init(Resources* res, const Scene* scene, const Config& config); 41 | 42 | // run prior the renderer starts referencing resources 43 | // if true CLAS for all clusters will be built 44 | bool updateClasRequired(bool state); 45 | 46 | // tear down, safe to call without init 47 | void deinit(); 48 | 49 | // renderers need to access this buffer 50 | const RBufferTyped& getShaderGeometriesBuffer() const { return m_shaderGeometriesBuffer; } 51 | 52 | // device memory usage 53 | size_t getClasSize() const { return m_clasSize; } 54 | size_t getGeometrySize() const { return m_geometrySize; } 55 | size_t getOperationsSize() const { return m_operationsSize + m_clasOperationsSize; } 56 | 57 | private: 58 | struct Geometry 59 | { 60 | RBufferTyped nodes; 61 | RBufferTyped nodeBboxes; 62 | 63 | RBufferTyped groups; 64 | 65 | RBufferTyped localTriangles; 66 | RBufferTyped vertices; 67 | 68 | RBufferTyped clusters; 69 | RBufferTyped clusterGeneratingGroups; 70 | RBufferTyped clusterBboxes; 71 | 72 | // for ray tracing 73 | RBufferTyped clusterClasAddresses; 74 | RBufferTyped clusterClasSizes; 75 | RBuffer clasData; 76 | }; 77 | 78 | Config m_config; 79 | bool m_hasClas = false; 80 | Resources* m_resources = nullptr; 81 | const Scene* m_scene = nullptr; 82 | 83 | size_t m_clasSize = 0; 84 | size_t m_clasOperationsSize = 0; 85 | size_t m_geometrySize = 0; 86 | size_t m_operationsSize = 0; 87 | 88 | std::vector m_geometries; 89 | std::vector m_shaderGeometries; 90 | 91 | RBufferTyped m_shaderGeometriesBuffer; 92 | 93 | bool initClas(); 94 | void deinitClas(); 95 | }; 96 | } // namespace lodclusters 97 | -------------------------------------------------------------------------------- 
/src/vk_nv_cluster_acc.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #include "vk_nv_cluster_acc.h" 21 | #include 22 | 23 | static PFN_vkGetClusterAccelerationStructureBuildSizesNV s_vkGetClusterAccelerationStructureBuildSizesNV = nullptr; 24 | static PFN_vkCmdBuildClusterAccelerationStructureIndirectNV s_vkCmdBuildClusterAccelerationStructureIndirectNV = nullptr; 25 | 26 | #ifndef NVVK_HAS_VK_NV_cluster_acceleration_structure 27 | VKAPI_ATTR void VKAPI_CALL vkGetClusterAccelerationStructureBuildSizesNV(VkDevice device, 28 | const VkClusterAccelerationStructureInputInfoNV* input, 29 | VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo) 30 | { 31 | s_vkGetClusterAccelerationStructureBuildSizesNV(device, input, pSizeInfo); 32 | } 33 | 34 | VKAPI_ATTR void VKAPI_CALL vkCmdBuildClusterAccelerationStructureIndirectNV(VkCommandBuffer commandBuffer, 35 | const VkClusterAccelerationStructureCommandsInfoNV* cmdInfo) 36 | { 37 | s_vkCmdBuildClusterAccelerationStructureIndirectNV(commandBuffer, cmdInfo); 38 | } 39 | #endif 40 | 41 | VkBool32 load_VK_NV_cluster_accleration_structure(VkInstance instance, VkDevice device) 42 | { 43 | 
s_vkGetClusterAccelerationStructureBuildSizesNV = nullptr; 44 | s_vkCmdBuildClusterAccelerationStructureIndirectNV = nullptr; 45 | 46 | s_vkGetClusterAccelerationStructureBuildSizesNV = 47 | (PFN_vkGetClusterAccelerationStructureBuildSizesNV)vkGetDeviceProcAddr(device, "vkGetClusterAccelerationStructureBuildSizesNV"); 48 | s_vkCmdBuildClusterAccelerationStructureIndirectNV = 49 | (PFN_vkCmdBuildClusterAccelerationStructureIndirectNV)vkGetDeviceProcAddr(device, "vkCmdBuildClusterAccelerationStructureIndirectNV"); 50 | 51 | return s_vkGetClusterAccelerationStructureBuildSizesNV && s_vkCmdBuildClusterAccelerationStructureIndirectNV; 52 | } 53 | -------------------------------------------------------------------------------- /thirdparty/vulkan_radix_sort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15) 2 | # CUDA is optional and only used by the benchmark; it is enabled further below via check_language(CUDA), so it must not be a required project language. 3 | project(vk_radix_sort LANGUAGES C CXX) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | set(CMAKE_CXX_STANDARD_REQUIRED True) 7 | 8 | find_package(Vulkan REQUIRED) 9 | 10 | # adds -fPIC, works for linux, when building shared library 11 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 12 | 13 | # shaders 14 | file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src/generated) 15 | 16 | # add_shader(TARGET SHADER OUTPUT DEFINE...) 
17 | function(add_shader) 18 | list(POP_FRONT ARGV TARGET SHADER OUTPUT) 19 | list(TRANSFORM ARGV PREPEND "-D" OUTPUT_VARIABLE DEFINES) 20 | 21 | get_filename_component(SHADER ${SHADER} ABSOLUTE) 22 | 23 | add_custom_command( 24 | OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/src/generated/${OUTPUT}.h 25 | COMMAND 26 | ${Vulkan_GLSLANG_VALIDATOR_EXECUTABLE} 27 | --target-env spirv1.5 28 | -V 29 | --vn ${OUTPUT} 30 | -o ${CMAKE_CURRENT_SOURCE_DIR}/src/generated/${OUTPUT}.h 31 | ${DEFINES} 32 | ${SHADER} 33 | DEPENDS ${SHADER} 34 | COMMENT "Compiling ${CMAKE_CURRENT_SOURCE_DIR}/src/generated/${OUTPUT}.h" 35 | ) 36 | 37 | add_custom_target(${OUTPUT} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/generated/${OUTPUT}.h) 38 | add_dependencies(${TARGET} ${OUTPUT}) 39 | endfunction() 40 | 41 | # library 42 | add_library(vk_radix_sort STATIC 43 | src/vk_radix_sort.cc 44 | ) 45 | 46 | target_include_directories(vk_radix_sort 47 | PUBLIC include 48 | PRIVATE src 49 | ) 50 | 51 | target_link_libraries(vk_radix_sort 52 | PUBLIC Vulkan::Vulkan 53 | ) 54 | 55 | add_shader(vk_radix_sort src/shader/upsweep.comp upsweep_comp) 56 | add_shader(vk_radix_sort src/shader/spine.comp spine_comp) 57 | add_shader(vk_radix_sort src/shader/downsweep.comp downsweep_comp) 58 | add_shader(vk_radix_sort src/shader/downsweep.comp downsweep_key_value_comp KEY_VALUE) 59 | 60 | # bench 61 | if (PROJECT_IS_TOP_LEVEL) 62 | set(BENCH_SOURCES 63 | bench/bench.cc 64 | bench/benchmark_factory.cc 65 | bench/cpu_benchmark.cc 66 | bench/data_generator.cc 67 | bench/vma_impl.cc 68 | bench/vulkan_benchmark.cc 69 | ) 70 | 71 | # if CUDA is available, add CUB benchmark 72 | include(CheckLanguage) 73 | check_language(CUDA) 74 | if (CMAKE_CUDA_COMPILER) 75 | enable_language(CUDA) 76 | set(CMAKE_CUDA_STANDARD 17) 77 | set(CMAKE_CUDA_STANDARD_REQUIRED True) 78 | list(APPEND BENCH_SOURCES 79 | bench/cuda_benchmark.cu 80 | ) 81 | endif() 82 | 83 | message(${BENCH_SOURCES}) 84 | add_executable(bench ${BENCH_SOURCES}) 85 | 86 | if 
(CMAKE_CUDA_COMPILER) 87 | target_compile_definitions(bench PUBLIC BENCH_CUDA) 88 | endif() 89 | 90 | # if VMA is already added from parent project, skip 91 | if (NOT TARGET VulkanMemoryAllocator) 92 | add_subdirectory(third_party/VulkanMemoryAllocator EXCLUDE_FROM_ALL) 93 | endif() 94 | 95 | target_link_libraries(bench PRIVATE vk_radix_sort VulkanMemoryAllocator) 96 | endif() 97 | -------------------------------------------------------------------------------- /thirdparty/vulkan_radix_sort/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 jaesung-cs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /thirdparty/vulkan_radix_sort/README.md: -------------------------------------------------------------------------------- 1 | # vulkan_radix_sort 2 | 3 | Vulkan implementation of radix sort. 4 | 5 | The reduce-then-scan GPU radix sort algorithm is implemented (the Onesweep approach was abandoned). 6 | 7 | 8 | ## Requirements 9 | - `VulkanSDK>=1.2` 10 | - Download from https://vulkan.lunarg.com/ and follow the installation instructions. 11 | - Requires several features available in `1.2`. 12 | - Must support `VK_KHR_buffer_device_address`: 13 | - Run `vulkaninfo` and check if the `VK_KHR_buffer_device_address` device extension is available. 14 | - `cmake>=3.15` 15 | 16 | 17 | ## Build 18 | ```bash 19 | $ cmake . -B build 20 | $ cmake --build build --config Release -j 21 | ``` 22 | 23 | ## Test 24 | ```bash 25 | $ ./build/Release/bench.exe # Windows 26 | $ ./build/bench # Linux 27 | $ ./build/bench 10000000 vulkan 28 | ``` 29 | - N = number of elements to sort 30 | - type = one of cpu,vulkan,cuda 31 | 32 | 33 | ### Test Environment 34 | - Windows, NVIDIA GeForce RTX 4090. 35 | 36 | 37 | ### Benchmark Result 38 | - Not precisely benchmarked, but the speed is competitive compared to CUB radix sort. 39 | - 32-bit key-only: my implementation is 10% slower when sorting 33M (2^25) elements. 40 | - 32-bit Key-value: my implementation is 15-25% faster when sorting 33M (2^25) key-value pairs. 41 | - Note that CUB radix sort is not an in-place operation. It may require an additional copy operation, or double storage. 
42 | - vulkan 43 | ```bash 44 | > .\build\Release\bench.exe 33554432 vulkan 45 | vk_radix_sort benchmark 46 | ================ sort ================ 47 | total time: 2.67571ms (12.5404 GItems/s) 48 | ================ sort key value ================ 49 | total time: 3.42221ms (9.80491 GItems/s) 50 | ================ sort key value speed ================ 51 | [0] total time: 3.41706ms (9.81969 GItems/s) 52 | [1] total time: 3.43142ms (9.77857 GItems/s) 53 | [2] total time: 3.42298ms (9.80271 GItems/s) 54 | [3] total time: 3.46208ms (9.69199 GItems/s) 55 | [4] total time: 3.42426ms (9.79904 GItems/s) 56 | [5] total time: 3.43725ms (9.762 GItems/s) 57 | [6] total time: 3.42016ms (9.81078 GItems/s) 58 | [7] total time: 3.42016ms (9.81078 GItems/s) 59 | [8] total time: 3.42099ms (9.80839 GItems/s) 60 | [9] total time: 3.41606ms (9.82254 GItems/s) 61 | ... 62 | ``` 63 | - CUDA Version 12.6 CUB 64 | ```bash 65 | > .\build\Release\bench.exe 33554432 cuda 66 | vk_radix_sort benchmark 67 | ================ sort ================ 68 | total time: 2.5047ms (13.3966 GItems/s) 69 | ================ sort key value ================ 70 | total time: 4.19226ms (8.00391 GItems/s) 71 | ================ sort key value speed ================ 72 | [0] total time: 4.20352ms (7.98246 GItems/s) 73 | [1] total time: 4.50355ms (7.45066 GItems/s) 74 | [2] total time: 4.21376ms (7.96306 GItems/s) 75 | [3] total time: 4.22298ms (7.94568 GItems/s) 76 | [4] total time: 4.22208ms (7.94737 GItems/s) 77 | [5] total time: 4.2199ms (7.95147 GItems/s) 78 | [6] total time: 4.21274ms (7.965 GItems/s) 79 | [7] total time: 4.20352ms (7.98246 GItems/s) 80 | [8] total time: 4.21376ms (7.96306 GItems/s) 81 | [9] total time: 4.21478ms (7.96113 GItems/s) 82 | ... 
83 | ``` 84 | 85 | ## Use as a Library with CMake 86 | - Add subdirectory `vulkan_radix_sort` 87 | ```cmake 88 | add_subdirectory(path/to/vulkan_radix_sort) 89 | ``` 90 | 91 | - Link to `vk_radix_sort` in your project (library, binary) 92 | ```cmake 93 | target_link_libraries(my_project PRIVATE Vulkan::Vulkan VulkanMemoryAllocator vk_radix_sort) 94 | ``` 95 | 96 | ## Usage 97 | 1. When creating `VkDevice`, enable the `bufferDeviceAddress` feature of `VkPhysicalDeviceBufferDeviceAddressFeatures`. 98 | 99 | 1. When creating `VmaAllocator`, enable the `VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT` flag. 100 | 101 | 1. Create `VkBuffer` for keys and values, with `VK_BUFFER_USAGE_STORAGE_BUFFER_BIT` and `VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT`. 102 | 103 | 1. Create `VrdxSorter` 104 | 105 | It creates shared resources: pipeline layouts, pipelines, etc. 106 | 107 | ```c++ 108 | VrdxSorter sorter = VK_NULL_HANDLE; 109 | VrdxSorterCreateInfo sorterInfo = {}; 110 | sorterInfo.physicalDevice = physicalDevice; 111 | sorterInfo.device = device; 112 | sorterInfo.pipelineCache = pipelineCache; 113 | vrdxCreateSorter(&sorterInfo, &sorter); 114 | ``` 115 | 116 | 1. Create a temporary storage buffer for sort. 117 | 118 | ```c++ 119 | // query the storage buffer requirements 120 | VrdxSorterStorageRequirements requirements; 121 | // for key-only 122 | vrdxGetSorterStorageRequirements(sorter, elementCount, &requirements); 123 | // for key-value 124 | vrdxGetSorterKeyValueStorageRequirements(sorter, elementCount, &requirements); 125 | 126 | // create or reuse buffer 127 | VkBufferCreateInfo bufferInfo = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; 128 | bufferInfo.size = requirements.size; 129 | bufferInfo.usage = requirements.usage; 130 | // ... 131 | ``` 132 | 133 | 1. Record sort commands. 134 | 135 | This command binds pipeline, pipeline layout, and push constants internally. 136 | 137 | So, users must not expect previously bound state to be retained after the sort command. 138 | 139 | Users must add proper **execution barriers**. 
140 | 141 | One can use buffer memory barriers, but in general, global barriers are more efficient than per-resource barriers, according to the [official synchronization examples](https://github.com/KhronosGroup/Vulkan-Docs/wiki/Synchronization-Examples#three-dispatches-first-dispatch-writes-to-one-storage-buffer-second-dispatch-writes-to-a-different-storage-buffer-third-dispatch-reads-both): 142 | 143 | > ... global memory barrier covers all resources. Generally considered more efficient to do a global memory barrier than per-resource barriers, per-resource barriers should usually be used for queue ownership transfers and image layout transitions - otherwise use global barriers. 144 | 145 | The sort command will read from key/value buffers (and elementCount buffer for indirect sort) in the compute shader stage, and write to output key/value buffers in a later compute shader stage. 146 | 147 | The second synchronization scope **before** the sort command must include `COMPUTE_SHADER` stage (and `TRANSFER` for indirect sort) and `SHADER_READ` access (and `TRANSFER_READ` for indirect sort). 148 | 149 | The first synchronization scope **after** the sort command must include `COMPUTE_SHADER` stage and `SHADER_WRITE` access. 150 | 151 | ```c++ 152 | VkQueryPool queryPool; // VK_NULL_HANDLE, or a valid timestamp query pool with size at least 15 (see the timestamp layout documented in vk_radix_sort.h). 153 | 154 | // sort keys 155 | vrdxCmdSort(commandBuffer, sorter, elementCount, 156 | keysBuffer, 0, 157 | storageBuffer, 0, 158 | queryPool, 0); 159 | 160 | // sort keys with values 161 | vrdxCmdSortKeyValue(commandBuffer, sorter, elementCount, 162 | keysBuffer, 0, 163 | valuesBuffer, 0, 164 | storageBuffer, 0, 165 | queryPool, 0); 166 | 167 | // indirectBuffer contains elementCount, a single uint entry in GPU buffer. 168 | // maxElementCount is required for storage buffer offsets. 169 | // element count in the indirect buffer must not be greater than maxElementCount. Otherwise, the behavior is undefined. 
170 | vrdxCmdSortKeyValueIndirect(commandBuffer, sorter, maxElementCount, 171 | indirectBuffer, 0, 172 | keysBuffer, 0, 173 | valuesBuffer, 0, 174 | storageBuffer, 0, 175 | queryPool, 0); 176 | ``` 177 | 178 | 179 | ## TODO 180 | - [x] Use `VkPhysicalDeviceLimits` to get compute shader-related limits, such as `maxComputeWorkGroupSize` or `maxComputeSharedMemorySize`. 181 | - [x] Increase allowed `maxElementCount` by allocating buffers properly. 182 | - [x] Compare with CUB radix sort 183 | - [ ] Compare with VkRadixSort 184 | - [ ] Compare with Fuchsia radix sort 185 | - [ ] Find best `WORKGROUP_SIZE` and `PARTITION_DIVISION` for different devices. 186 | - [x] Support for SubgroupSize=64. 187 | 188 | 189 | ## References 190 | - https://github.com/b0nes164/GPUSorting : their CUDA kernel code was very helpful for understanding how the algorithm works. 191 | 192 | 193 | ## Troubleshooting 194 | - (NVIDIA GPU, Windows) Slow runtime after a few seconds. 195 | - Reason: the NVIDIA driver adjusts the GPU/memory clocks. 196 | Open the Performance Overlay (Alt+R), and you will see the GPU/memory clocks drop. 197 | - Solution: change the performance mode in the control panel. 198 | ![](media/performance_mode.jpg) 199 | -------------------------------------------------------------------------------- /thirdparty/vulkan_radix_sort/include/vk_radix_sort.h: -------------------------------------------------------------------------------- 1 | #ifndef VK_RADIX_SORT_H 2 | #define VK_RADIX_SORT_H 3 | 4 | #include 5 | 6 | struct VrdxSorter_T; 7 | 8 | /** 9 | * VrdxSorter creates pipelines. 
10 | */ 11 | VK_DEFINE_HANDLE(VrdxSorter) 12 | 13 | struct VrdxSorterCreateInfo { 14 | VkPhysicalDevice physicalDevice; 15 | VkDevice device; 16 | VkPipelineCache pipelineCache; 17 | }; 18 | 19 | void vrdxCreateSorter(const VrdxSorterCreateInfo* pCreateInfo, 20 | VrdxSorter* pSorter); 21 | 22 | void vrdxDestroySorter(VrdxSorter sorter); 23 | 24 | struct VrdxSorterStorageRequirements { 25 | VkDeviceSize size; 26 | VkBufferUsageFlags usage; 27 | }; 28 | 29 | void vrdxGetSorterStorageRequirements( 30 | VrdxSorter sorter, uint32_t maxElementCount, 31 | VrdxSorterStorageRequirements* requirements); 32 | 33 | void vrdxGetSorterKeyValueStorageRequirements( 34 | VrdxSorter sorter, uint32_t maxElementCount, 35 | VrdxSorterStorageRequirements* requirements); 36 | 37 | /** 38 | * if queryPool is not VK_NULL_HANDLE, it writes timestamps to N entries 39 | * [query..query+N-1]. 40 | * 41 | * N=15 42 | * query + 0: start timestamp (VK_PIPELINE_STAGE_ALL_COMMANDS_BIT) 43 | * query + 1: transfer timestamp (VK_PIPELINE_STAGE_TRANSFER_BIT) 44 | * query + 2 + (3 * i) + 0: upsweep (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) 45 | * query + 2 + (3 * i) + 1: spine (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) 46 | * query + 2 + (3 * i) + 2: downsweep (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) 47 | * query + 14: sort end timestamp (VK_PIPELINE_STAGE_ALL_COMMANDS_BIT) 48 | */ 49 | void vrdxCmdSort(VkCommandBuffer commandBuffer, VrdxSorter sorter, 50 | uint32_t elementCount, VkBuffer keysBuffer, 51 | VkDeviceSize keysOffset, VkBuffer storageBuffer, 52 | VkDeviceSize storageOffset, VkQueryPool queryPool, 53 | uint32_t query); 54 | 55 | void vrdxCmdSortIndirect(VkCommandBuffer commandBuffer, VrdxSorter sorter, 56 | uint32_t maxElementCount, VkBuffer indirectBuffer, 57 | VkDeviceSize indirectOffset, VkBuffer keysBuffer, 58 | VkDeviceSize keysOffset, VkBuffer storageBuffer, 59 | VkDeviceSize storageOffset, VkQueryPool queryPool, 60 | uint32_t query); 61 | 62 | void vrdxCmdSortKeyValue(VkCommandBuffer 
commandBuffer, VrdxSorter sorter, 63 | uint32_t elementCount, VkBuffer keysBuffer, 64 | VkDeviceSize keysOffset, VkBuffer valuesBuffer, 65 | VkDeviceSize valuesOffset, VkBuffer storageBuffer, 66 | VkDeviceSize storageOffset, VkQueryPool queryPool, 67 | uint32_t query); 68 | 69 | /** 70 | * indirectBuffer contains elementCount. 71 | * 72 | * The sort command reads a uint32_t value from indirectBuffer at 73 | * indirectOffset. 74 | * 75 | * User must add barrier with second synchronization scope 76 | * COMPUTE_SHADER stage and SHADER_READ access. 77 | * 78 | * indirectBuffer requires TRANSFER_SRC buffer usage flag. 79 | */ 80 | void vrdxCmdSortKeyValueIndirect( 81 | VkCommandBuffer commandBuffer, VrdxSorter sorter, uint32_t maxElementCount, 82 | VkBuffer indirectBuffer, VkDeviceSize indirectOffset, VkBuffer keysBuffer, 83 | VkDeviceSize keysOffset, VkBuffer valuesBuffer, VkDeviceSize valuesOffset, 84 | VkBuffer storageBuffer, VkDeviceSize storageOffset, VkQueryPool queryPool, 85 | uint32_t query); 86 | 87 | #endif // VK_RADIX_SORT_H 88 | -------------------------------------------------------------------------------- /thirdparty/vulkan_radix_sort/src/shader/spine.comp: -------------------------------------------------------------------------------- 1 | #version 460 core 2 | 3 | #extension GL_EXT_buffer_reference : require 4 | #extension GL_KHR_shader_subgroup_basic: enable 5 | #extension GL_KHR_shader_subgroup_arithmetic: enable 6 | #extension GL_KHR_shader_subgroup_ballot: enable 7 | 8 | const int RADIX = 256; 9 | #define MAX_SUBGROUP_SIZE 128 10 | #define WORKGROUP_SIZE 512 11 | #define PARTITION_DIVISION 8 12 | const int PARTITION_SIZE = PARTITION_DIVISION * WORKGROUP_SIZE; 13 | 14 | // dispatch this shader (RADIX, 1, 1), so that gl_WorkGroupID.x is radix 15 | layout (local_size_x = WORKGROUP_SIZE) in; 16 | 17 | layout (buffer_reference, std430) readonly buffer ElementCount { 18 | uint elementCount; 19 | }; 20 | 21 | layout (buffer_reference, std430) buffer 
GlobalHistogram {
  uint globalHistogram[];  // (4, R)
};

layout (buffer_reference, std430) buffer PartitionHistogram {
  uint partitionHistogram[];  // (P, R)
};

layout (push_constant) uniform PushConstant {
  int pass;  // selects the row of globalHistogram that is scanned
  restrict ElementCount elementCountReference;
  restrict GlobalHistogram globalHistogramReference;
  restrict PartitionHistogram partitionHistogramReference;
};

// running total of all partition counts handled in earlier loop iterations
shared uint reduction;
// One slot per subgroup is what the scans below use (WORKGROUP_SIZE /
// gl_SubgroupSize, i.e. 16 or 8 slots for subgroup sizes 32 or 64);
// 128 leaves headroom and shouldn't affect performance.
shared uint intermediate[MAX_SUBGROUP_SIZE];

// Spine pass of the radix sort.
//
// Each workgroup owns one radix (gl_WorkGroupID.x) and replaces column `radix`
// of partitionHistogram (P, R) by its exclusive prefix sum over the partitions.
// Workgroup 0 additionally replaces row `pass` of globalHistogram by its
// exclusive prefix sum over the RADIX bins.
//
// Both scans are two-level: a subgroup scan, then a scan of the per-subgroup
// totals done by the first invocations of the workgroup. The second level runs
// as one subgroup operation, so it relies on the number of totals
// (gl_NumSubgroups, resp. RADIX / gl_SubgroupSize) not exceeding
// gl_SubgroupSize.
void main() {
  uint threadIndex = gl_SubgroupInvocationID;  // 0..31 or 0..63
  uint subgroupIndex = gl_SubgroupID;          // 0..15 or 0..7
  uint index = subgroupIndex * gl_SubgroupSize + threadIndex;
  uint radix = gl_WorkGroupID.x;

  uint elementCount = elementCountReference.elementCount;

  uint partitionCount = (elementCount + PARTITION_SIZE - 1) / PARTITION_SIZE;

  if (index == 0) {
    reduction = 0;
  }
  barrier();

  // Scan the partition column in chunks of WORKGROUP_SIZE partitions.
  // partitionCount is identical for every invocation, so the barriers inside
  // the loop are reached uniformly.
  for (uint i = 0; WORKGROUP_SIZE * i < partitionCount; ++i) {
    uint partitionIndex = WORKGROUP_SIZE * i + index;
    bool inRange = partitionIndex < partitionCount;
    uint value = inRange ? partitionHistogramReference.partitionHistogram[RADIX * partitionIndex + radix] : 0u;

    // `reduction` is read before the first barrier and only updated between
    // the first and the second one, so every invocation sees the old total.
    uint excl = subgroupExclusiveAdd(value) + reduction;
    uint sum = subgroupAdd(value);

    if (subgroupElect()) {
      intermediate[subgroupIndex] = sum;
    }
    barrier();

    // second level: turn the per-subgroup totals into per-subgroup offsets
    if (index < gl_NumSubgroups) {
      uint subgroupTotal = intermediate[index];
      uint subgroupOffset = subgroupExclusiveAdd(subgroupTotal);
      uint chunkTotal = subgroupAdd(subgroupTotal);
      intermediate[index] = subgroupOffset;

      if (index == 0) {
        reduction += chunkTotal;
      }
    }
    barrier();

    if (inRange) {
      excl += intermediate[subgroupIndex];
      partitionHistogramReference.partitionHistogram[RADIX * partitionIndex + radix] = excl;
    }
    // keep `intermediate` stable until everybody has read its offset
    barrier();
  }

  if (gl_WorkGroupID.x == 0) {
    // one workgroup is responsible for global histogram prefix sum
    //
    // Only the first RADIX invocations own a bin, but barrier() has to be
    // executed by every invocation of the workgroup (WORKGROUP_SIZE > RADIX).
    // So the barriers stay outside of the per-bin condition and only the
    // memory accesses are guarded; invocations without a bin contribute 0.
    bool ownsBin = index < RADIX;

    uint value = ownsBin ? globalHistogramReference.globalHistogram[RADIX * pass + index] : 0u;
    uint excl = subgroupExclusiveAdd(value);
    uint sum = subgroupAdd(value);

    if (subgroupElect()) {
      intermediate[subgroupIndex] = sum;
    }
    barrier();

    // second level over the subgroups that cover the RADIX bins
    if (index < RADIX / gl_SubgroupSize) {
      uint subgroupOffset = subgroupExclusiveAdd(intermediate[index]);
      intermediate[index] = subgroupOffset;
    }
    barrier();

    if (ownsBin) {
      excl += intermediate[subgroupIndex];
      globalHistogramReference.globalHistogram[RADIX * pass + index] = excl;
    }
  }
}

--------------------------------------------------------------------------------
/thirdparty/vulkan_radix_sort/src/shader/upsweep.comp:
--------------------------------------------------------------------------------
#version 460 core

#extension GL_EXT_buffer_reference : require
#extension GL_KHR_shader_subgroup_basic: enable

const int RADIX = 256;
#define WORKGROUP_SIZE 512
#define PARTITION_DIVISION 8
const int PARTITION_SIZE = PARTITION_DIVISION * WORKGROUP_SIZE;

layout
(local_size_x = WORKGROUP_SIZE) in;

// number of elements to sort, read through a buffer reference
layout (buffer_reference, std430) readonly buffer ElementCount {
  uint elementCount;
};

layout (buffer_reference, std430) buffer GlobalHistogram {
  uint globalHistogram[];  // (4, R)
};

layout (buffer_reference, std430) writeonly buffer PartitionHistogram {
  uint partitionHistogram[];  // (P, R)
};

layout (buffer_reference, std430) readonly buffer Keys {
  uint keys[];  // (N)
};

layout (push_constant) uniform PushConstant {
  int pass;  // selects the key byte (bits 8 * pass .. 8 * pass + 7) that is counted
  restrict ElementCount elementCountReference;
  restrict GlobalHistogram globalHistogramReference;
  restrict PartitionHistogram partitionHistogramReference;
  restrict Keys keysInReference;
};

// per-workgroup digit counts of the partition that is being processed
shared uint localHistogram[RADIX];

// Upsweep pass of the radix sort.
//
// Each workgroup counts the current 8-bit digit of the PARTITION_SIZE keys of
// its partition (gl_WorkGroupID.x), stores the counts as one row of
// partitionHistogram and adds them to row `pass` of globalHistogram.
void main() {
  uint lane = gl_SubgroupInvocationID;  // 0..31 or 0..63
  uint subgroupId = gl_SubgroupID;      // 0..15 or 0..7
  uint localIndex = subgroupId * gl_SubgroupSize + lane;

  uint totalKeys = elementCountReference.elementCount;

  uint partitionIndex = gl_WorkGroupID.x;
  uint firstKey = partitionIndex * PARTITION_SIZE;

  // The whole workgroup lies behind the last key: every invocation leaves, so
  // the barriers below are never reached by only a part of the workgroup.
  if (firstKey >= totalKeys) {
    return;
  }

  // the first RADIX invocations each look after one histogram bin
  bool ownsBin = localIndex < RADIX;

  if (ownsBin) {
    localHistogram[localIndex] = 0;
  }
  barrier();

  // local histogram: PARTITION_DIVISION keys per invocation
  for (int chunk = 0; chunk < PARTITION_DIVISION; ++chunk) {
    uint keyIndex = firstKey + WORKGROUP_SIZE * chunk + localIndex;

    // slots behind the last key are counted as the all-ones key
    uint key = 0xffffffffu;
    if (keyIndex < totalKeys) {
      key = keysInReference.keys[keyIndex];
    }

    uint digit = bitfieldExtract(key, 8 * pass, 8);
    atomicAdd(localHistogram[digit], 1);
  }
  barrier();

  if (ownsBin) {
    uint binCount = localHistogram[localIndex];

    // set to partition histogram
    partitionHistogramReference.partitionHistogram[RADIX * partitionIndex + localIndex] = binCount;

    // add to global histogram
    atomicAdd(globalHistogramReference.globalHistogram[RADIX * pass + localIndex], binCount);
  }
}

--------------------------------------------------------------------------------