├── .gitignore ├── doc ├── output.svg ├── input.svg ├── clusters.svg ├── underfill_cost.svg ├── valid_split_positions.svg └── adjacency_sweep.svg ├── CHANGELOG.md ├── CONTRIBUTING.txt ├── test ├── src │ ├── test_clusterizer.c │ ├── test_perf.cpp │ ├── tree_gen.hpp │ └── test_util.hpp └── CMakeLists.txt ├── .clang-format ├── src ├── connections.hpp ├── clusterizer.hpp ├── underfill_cost.hpp ├── connections.cpp └── nvcluster.cpp ├── CMakeLists.txt ├── include └── nvcluster │ ├── nvcluster_storage.hpp │ ├── util │ ├── parallel_execution_libcxx.hpp │ ├── parallel.hpp │ └── objects.hpp │ └── nvcluster.h ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .vscode 3 | .cache 4 | _install -------------------------------------------------------------------------------- /doc/output.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2 | # Version 2 3 | 4 | ## Features 5 | 6 | - Vertex limit, [`maxClusterVertices`](include/nvcluster/nvcluster.h) 7 | - Vertex underfill cost, [`costUnderfillVertices`](include/nvcluster/nvcluster.h) 8 | - Implicit connection computation with [`itemVertices`](include/nvcluster/nvcluster.h) 9 | - Shared library support in cmake, [`NVCLUSTER_BUILDER_SHARED`](CMakeLists.txt) 10 | - Dynamic `parallelize` switch in [`nvcluster_ContextCreateInfo`](include/nvcluster/nvcluster.h) 11 | 12 | ## Code Quality 13 | 14 | - Real C API, removing namespace, adding prefixes, symbol export 15 | - Flattened API structs, avoiding pointer chains 16 | - Removed macro based parallel for loops 17 | - Internal use of std::span instead of raw pointers 18 | - vec3f and AABB objects instead of inlined operations 19 | - Fallback for missing libc++ parallel execution 20 | 
-------------------------------------------------------------------------------- /CONTRIBUTING.txt: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 5 | 6 | Everyone is permitted to copy and distribute verbatim copies of this 7 | license document, but changing it is not allowed. 8 | 9 | 10 | Developer's Certificate of Origin 1.1 11 | 12 | By making a contribution to this project, I certify that: 13 | 14 | (a) The contribution was created in whole or in part by me and I 15 | have the right to submit it under the open source license 16 | indicated in the file; or 17 | 18 | (b) The contribution is based upon previous work that, to the best 19 | of my knowledge, is covered under an appropriate open source 20 | license and I have the right under that license to submit that 21 | work with modifications, whether created in whole or in part 22 | by me, under the same open source license (unless I am 23 | permitted to submit under a different license), as indicated 24 | in the file; or 25 | 26 | (c) The contribution was provided directly to me by some other 27 | person who certified (a), (b) or (c) and I have not modified 28 | it. 29 | 30 | (d) I understand and agree that this project and the contribution 31 | are public and that a record of the contribution (including all 32 | personal information I submit with it, including my sign-off) is 33 | maintained indefinitely and may be redistributed consistent with 34 | this project or the open source license(s) involved. 35 | -------------------------------------------------------------------------------- /test/src/test_clusterizer.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #ifdef __cplusplus 21 | #error This file verifies the API is C compatible 22 | #endif 23 | 24 | #include 25 | #include 26 | 27 | int runCTest(void) 28 | { 29 | nvcluster_ContextCreateInfo createInfo = nvcluster_defaultContextCreateInfo(); 30 | nvcluster_Context context = 0; 31 | nvcluster_Result createResult = nvclusterCreateContext(&createInfo, &context); 32 | if(createResult != NVCLUSTER_SUCCESS) 33 | { 34 | printf("Create Context Result: %s\n", nvclusterResultString(createResult)); 35 | return 0; 36 | } 37 | 38 | nvcluster_Result destroyResult = nvclusterDestroyContext(context); 39 | if(destroyResult != NVCLUSTER_SUCCESS) 40 | { 41 | printf("Destroy Context Result: %s\n", nvclusterResultString(destroyResult)); 42 | return 0; 43 | } 44 | return 1; 45 | } 46 | -------------------------------------------------------------------------------- /doc/input.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | AccessModifierOffset: '-2' 3 | AlignAfterOpenBracket: Align 4 | AlignConsecutiveAssignments: 
'true' 5 | AlignConsecutiveDeclarations: 'true' 6 | AlignOperands: 'true' 7 | AlignTrailingComments: 'true' 8 | AllowAllParametersOfDeclarationOnNextLine: 'false' 9 | AllowShortBlocksOnASingleLine: 'false' 10 | AllowShortCaseLabelsOnASingleLine: 'false' 11 | AllowShortFunctionsOnASingleLine: Inline 12 | AllowShortIfStatementsOnASingleLine: 'false' 13 | AllowShortLoopsOnASingleLine: 'false' 14 | AlwaysBreakAfterReturnType: None 15 | AlwaysBreakBeforeMultilineStrings: 'true' 16 | AlwaysBreakTemplateDeclarations: 'true' 17 | BinPackArguments: 'true' 18 | BinPackParameters: 'false' 19 | ExperimentalAutoDetectBinPacking: 'false' 20 | BreakBeforeBinaryOperators: NonAssignment 21 | BreakBeforeBraces: Custom 22 | BreakBeforeTernaryOperators: 'false' 23 | BreakConstructorInitializersBeforeComma: 'true' 24 | ColumnLimit: '120' 25 | ConstructorInitializerAllOnOneLineOrOnePerLine: 'false' 26 | Cpp11BracedListStyle: 'true' 27 | IndentCaseLabels: 'true' 28 | IndentWidth: '2' 29 | KeepEmptyLinesAtTheStartOfBlocks: 'true' 30 | Language: Cpp 31 | MaxEmptyLinesToKeep: '2' 32 | NamespaceIndentation: None 33 | ObjCSpaceBeforeProtocolList: 'true' 34 | PointerAlignment: Left 35 | SpaceAfterCStyleCast: 'false' 36 | SpaceBeforeAssignmentOperators: 'true' 37 | SpaceBeforeParens: Never 38 | SpaceInEmptyParentheses: 'false' 39 | SpacesBeforeTrailingComments: '2' 40 | SpacesInAngles: 'false' 41 | SpacesInCStyleCastParentheses: 'false' 42 | SpacesInParentheses: 'false' 43 | SpacesInSquareBrackets: 'false' 44 | Standard: Cpp11 45 | TabWidth: '2' 46 | UseTab: Never 47 | SortIncludes: 'true' 48 | ReflowComments: 'false' 49 | BraceWrapping: { 50 | AfterClass: 'true', 51 | AfterControlStatement: 'true', 52 | AfterEnum: 'true', 53 | AfterFunction: 'true', 54 | AfterNamespace: 'false', 55 | AfterStruct: 'true', 56 | AfterUnion: 'true', 57 | BeforeCatch: 'true', 58 | BeforeElse: 'true', 59 | IndentBraces: 'false' 60 | } 61 | PenaltyExcessCharacter: 1 62 | PenaltyBreakBeforeFirstCallParameter: 40 63 | 
PenaltyBreakFirstLessLess: 1 64 | PenaltyBreakComment: 30 65 | PenaltyBreakString: 30 66 | PenaltyReturnTypeOnItsOwnLine: 9999 67 | -------------------------------------------------------------------------------- /src/connections.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | #pragma once 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | namespace nvcluster { 29 | 30 | // A 2D uint32_t array pointer, used to interpret nvcluster_Input::itemVertices. 31 | // Rename to UintSpan2D if used multiple times. Could replace with 32 | // mdspan. 
33 | struct ItemVertices 34 | { 35 | public: 36 | ItemVertices(const uint32_t* itemVertices, uint32_t itemCount, uint32_t itemVertexCount) 37 | : m_itemVertices(itemVertices) 38 | , m_itemCount(itemCount) 39 | , m_itemVertexCount(itemVertexCount) 40 | { 41 | } 42 | uint32_t itemCount() const { return m_itemCount; } // mdspan::extent(0) 43 | uint32_t itemVertexCount() const { return m_itemVertexCount; } // mdspan::extent(1) 44 | std::span vertices(size_t itemIndex) const // ~submdspan 45 | { 46 | return std::span(m_itemVertices, m_itemCount * m_itemVertexCount).subspan(itemIndex * m_itemVertexCount, m_itemVertexCount); 47 | } 48 | 49 | private: 50 | const uint32_t* m_itemVertices; 51 | uint32_t m_itemCount; 52 | uint32_t m_itemVertexCount; 53 | }; 54 | 55 | // Utility to generate item connections and vertex bits to use the vertex limit 56 | // feature. 57 | struct MeshConnections 58 | { 59 | std::vector connectionRanges; 60 | std::vector connectionItems; 61 | std::vector connectionVertexBits; 62 | }; 63 | 64 | NVCLUSTER_API MeshConnections makeMeshConnections(bool parallelize, ItemVertices itemVertices, uint32_t vertexCount); 65 | 66 | } // namespace nvcluster 67 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | find_package(GTest QUIET) 17 | if(NOT GTest_FOUND) 18 | include(FetchContent) 19 | FetchContent_Declare( 20 | googletest 21 | GIT_REPOSITORY https://github.com/google/googletest.git 22 | GIT_TAG v1.14.0 23 | GIT_SHALLOW TRUE 24 | ) 25 | FetchContent_MakeAvailable(googletest) 26 | endif() 27 | 28 | find_package(nanobench QUIET) 29 | if(NOT nanobench_FOUND) 30 | include(FetchContent) 31 | FetchContent_Declare( 32 | nanobench 33 | GIT_REPOSITORY https://github.com/martinus/nanobench.git 34 | GIT_TAG v4.3.11 35 | GIT_SHALLOW TRUE) 36 | FetchContent_MakeAvailable(nanobench) 37 | endif() 38 | 39 | add_executable(nv_cluster_builder_tests 40 | src/test_clusterizer.c 41 | src/test_clusterizer.cpp 42 | src/test_meshes.cpp 43 | src/test_perf.cpp 44 | ) 45 | 46 | option(NVCLUSTER_TEST_MESHES "FetchContent cgltf to test meshes in current directory" OFF) 47 | if(NVCLUSTER_TEST_MESHES) 48 | if(NOT TARGET cgltf_static) 49 | set(CGLTF_INCLUDE "${CMAKE_BINARY_DIR}/cgltf") 50 | file(MAKE_DIRECTORY "${CGLTF_INCLUDE}") 51 | file(DOWNLOAD https://raw.githubusercontent.com/jkuhlmann/cgltf/refs/tags/v1.15/cgltf.h "${CGLTF_INCLUDE}/cgltf.h") 52 | file(WRITE "${CMAKE_BINARY_DIR}/cgltf.cpp" "#define CGLTF_IMPLEMENTATION\n#include \n") 53 | add_library(cgltf_static "${CMAKE_BINARY_DIR}/cgltf.cpp") 54 | target_include_directories(cgltf_static PUBLIC "${CGLTF_INCLUDE}") 55 | endif() 56 | target_compile_definitions(nv_cluster_builder_tests PRIVATE TEST_MESHES) 57 | target_link_libraries(nv_cluster_builder_tests PRIVATE cgltf_static) 58 | endif() 59 | 60 | target_include_directories(nv_cluster_builder_tests PRIVATE src ../src) # adds internal src directory to allow unit testing 61 | target_link_libraries(nv_cluster_builder_tests PRIVATE nv_cluster_builder gtest_main gmock_main nanobench) 62 | 63 | if(MSVC) 64 | target_compile_options(nv_cluster_builder_tests PRIVATE 65 | /W4 
66 | /WX 67 | ) 68 | target_compile_definitions(nv_cluster_builder_tests PRIVATE WIN32_LEAN_AND_MEAN=1 NOMINMAX) 69 | else() 70 | target_compile_options(nv_cluster_builder_tests PRIVATE 71 | -Wall 72 | -Wextra 73 | -Wpedantic 74 | -Wshadow 75 | -Wconversion 76 | -Werror 77 | ) 78 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 79 | target_compile_definitions(nv_cluster_builder_tests PRIVATE 80 | $<$:_GLIBCXX_ASSERTIONS> 81 | ) 82 | endif() 83 | endif() 84 | 85 | include(GoogleTest) 86 | gtest_discover_tests(nv_cluster_builder_tests) 87 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # 15 | # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION 16 | # SPDX-License-Identifier: Apache-2.0 17 | 18 | cmake_minimum_required(VERSION 3.20) 19 | project(nv_cluster_builder VERSION 2.0) 20 | 21 | set(SOURCES 22 | src/clusterizer.cpp 23 | src/connections.cpp 24 | src/nvcluster.cpp 25 | ) 26 | file(GLOB HEADERS_INTERNAL 27 | src/*.hpp 28 | ) 29 | file(GLOB HEADERS_PUBLIC 30 | include/nvcluster/*.h 31 | include/nvcluster/*.hpp 32 | include/nvcluster/util/*.hpp 33 | ) 34 | 35 | source_group("public_include" FILES ${HEADERS_PUBLIC}) 36 | source_group("source" FILES ${SOURCES} ${HEADERS_INTERNAL}) 37 | 38 | # Optionally build as a shared library 39 | include(CMakeDependentOption) 40 | cmake_dependent_option( 41 | NVCLUSTER_BUILDER_SHARED # option variable 42 | "Build shared library" # description 43 | OFF # default value if exposed; user can override 44 | "NOT BUILD_SHARED_LIBS" # condition to expose option 45 | ON # value if not exposed; user can't override 46 | ) 47 | 48 | if (NVCLUSTER_BUILDER_SHARED) 49 | set(CMAKE_C_VISIBILITY_PRESET hidden) 50 | set(CMAKE_CXX_VISIBILITY_PRESET hidden) 51 | set(CMAKE_VISIBILITY_INLINES_HIDDEN 1) 52 | add_library(nv_cluster_builder SHARED ${SOURCES} ${HEADERS_INTERNAL} ${HEADERS_PUBLIC}) 53 | target_compile_definitions(nv_cluster_builder PUBLIC NVCLUSTER_BUILDER_SHARED) 54 | else() 55 | add_library(nv_cluster_builder STATIC ${SOURCES} ${HEADERS_INTERNAL} ${HEADERS_PUBLIC}) 56 | endif () 57 | target_compile_features(nv_cluster_builder PUBLIC cxx_std_20) 58 | target_include_directories(nv_cluster_builder PUBLIC include) 59 | target_include_directories(nv_cluster_builder PRIVATE src) 60 | target_compile_definitions(nv_cluster_builder PRIVATE NVCLUSTER_BUILDER_COMPILING) 61 | 62 | # All the warnings. 
Branch on COMPILE_LANGUAGE to avoid passing unknowns to nvcc 63 | if(MSVC) 64 | target_compile_options(nv_cluster_builder PRIVATE 65 | $<$:/W4> 66 | $<$:/WX> 67 | $<$:/wd4127> # 'conditional expression is constant' unhelpful when mixing c.t. and dynamic 68 | ) 69 | target_compile_definitions(nv_cluster_builder PRIVATE WIN32_LEAN_AND_MEAN=1 NOMINMAX) 70 | else() 71 | target_compile_options(nv_cluster_builder PRIVATE 72 | -fno-math-errno 73 | -fno-trapping-math 74 | #-funsafe-math-optimizations 75 | $<$:-Wall> 76 | $<$:-Wextra> 77 | $<$:-Wpedantic> 78 | $<$:-Wconversion> 79 | $<$:-Werror> 80 | ) 81 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 82 | target_compile_definitions(nv_cluster_builder PRIVATE 83 | $<$:_GLIBCXX_ASSERTIONS> 84 | # Do not use ABI breaking _GLIBCXX_DEBUG or _GLIBCXX_DEBUG_BACKTRACE 85 | ) 86 | endif() 87 | endif() 88 | 89 | option(NVCLUSTER_MULTITHREADED "Build with multithreaded cluster generation support" ON) 90 | if(NVCLUSTER_MULTITHREADED) 91 | target_compile_definitions(nv_cluster_builder PRIVATE NVCLUSTER_MULTITHREADED=1) 92 | 93 | # Optional TBB for std::execution on linux 94 | if(NOT MSVC) 95 | find_library(TBB_LIBRARIES NAMES tbb HINTS ${TBB_DIR}) 96 | if(TBB_LIBRARIES) 97 | message(STATUS "TBB: ${TBB_LIBRARIES}") 98 | target_link_libraries(nv_cluster_builder PRIVATE ${TBB_LIBRARIES}) 99 | else() 100 | message(STATUS "TBB not found for std::execution") 101 | endif() 102 | endif() 103 | else() 104 | target_compile_definitions(nv_cluster_builder PRIVATE NVCLUSTER_MULTITHREADED=0) 105 | endif() 106 | 107 | if(BUILD_TESTING) 108 | option(BUILD_NV_CLUSTER_BUILDER_TESTING "Build nv_cluster_builder tests" ON) 109 | if(BUILD_NV_CLUSTER_BUILDER_TESTING) 110 | enable_testing() 111 | add_subdirectory(test) 112 | endif() 113 | endif() 114 | 115 | install(TARGETS nv_cluster_builder) 116 | -------------------------------------------------------------------------------- /include/nvcluster/nvcluster_storage.hpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #pragma once 21 | #include 22 | 23 | #include 24 | 25 | namespace nvcluster { 26 | 27 | // Utility storage for clustering output 28 | // Construct with generateClusters() 29 | struct ClusterStorage 30 | { 31 | std::vector clusterItemRanges; 32 | std::vector items; 33 | 34 | void shrink_to_fit() 35 | { 36 | // clusterItemRanges is conservatively sized for clustering output. If this 37 | // object is kept around, memory can be saved by reallocating. 38 | clusterItemRanges.shrink_to_fit(); 39 | } 40 | }; 41 | 42 | // Utility storage for segmented clustering output 43 | // Construct with generateSegmentedClusters() 44 | struct SegmentedClusterStorage 45 | { 46 | std::vector segmentClusterRanges; 47 | std::vector clusterItemRanges; 48 | std::vector items; 49 | 50 | void shrink_to_fit() 51 | { 52 | // clusterItemRanges is conservatively sized for clustering output. If this 53 | // object is kept around, memory can be saved by reallocating. 
54 | clusterItemRanges.shrink_to_fit(); 55 | } 56 | }; 57 | 58 | // ClusterStorage delayed init constructor 59 | inline nvcluster_Result generateClusters(nvcluster_Context context, 60 | const nvcluster_Config& config, 61 | const nvcluster_Input& input, 62 | ClusterStorage& clusterStorage) 63 | { 64 | // Query output upper limit 65 | nvcluster_Counts requiredCounts; 66 | nvcluster_Result result = nvclusterGetRequirements(context, &config, input.itemCount, &requiredCounts); 67 | if(result != nvcluster_Result::NVCLUSTER_SUCCESS) 68 | { 69 | return result; 70 | } 71 | 72 | // Resize to the upper limit 73 | clusterStorage.clusterItemRanges.resize(requiredCounts.clusterCount); 74 | clusterStorage.items.resize(input.itemCount); 75 | 76 | // Build clusters 77 | nvcluster_OutputClusters outputClusters{ 78 | .clusterItemRanges = clusterStorage.clusterItemRanges.data(), 79 | .items = clusterStorage.items.data(), 80 | .clusterCount = uint32_t(clusterStorage.clusterItemRanges.size()), 81 | .itemCount = uint32_t(clusterStorage.items.size()), 82 | }; 83 | result = nvclusterBuild(context, &config, &input, &outputClusters); 84 | if(result != nvcluster_Result::NVCLUSTER_SUCCESS) 85 | { 86 | return result; 87 | } 88 | 89 | // Resize down to what was written. Let the user call shrink_to_fit() if the 90 | // object is not temporary. 
91 | clusterStorage.clusterItemRanges.resize(outputClusters.clusterCount); 92 | return result; 93 | } 94 | 95 | inline nvcluster_Result generateSegmentedClusters(nvcluster_Context context, 96 | const nvcluster_Config& config, 97 | const nvcluster_Input& input, 98 | const nvcluster_Segments& segments, 99 | SegmentedClusterStorage& segmentedClusterStorage) 100 | { 101 | // Query output upper limit 102 | nvcluster_Counts requiredCounts; 103 | nvcluster_Result result = nvclusterGetRequirementsSegmented(context, &config, input.itemCount, &segments, &requiredCounts); 104 | if(result != nvcluster_Result::NVCLUSTER_SUCCESS) 105 | { 106 | return result; 107 | } 108 | 109 | // Resize to the upper limit 110 | segmentedClusterStorage.segmentClusterRanges.resize(segments.segmentCount); 111 | segmentedClusterStorage.clusterItemRanges.resize(requiredCounts.clusterCount); 112 | segmentedClusterStorage.items.resize(input.itemCount); 113 | 114 | // Build clusters 115 | nvcluster_OutputClusters outputClusters{ 116 | .clusterItemRanges = segmentedClusterStorage.clusterItemRanges.data(), 117 | .items = segmentedClusterStorage.items.data(), 118 | .clusterCount = uint32_t(segmentedClusterStorage.clusterItemRanges.size()), 119 | .itemCount = uint32_t(segmentedClusterStorage.items.size()), 120 | }; 121 | result = nvclusterBuildSegmented(context, &config, &input, &segments, &outputClusters, 122 | segmentedClusterStorage.segmentClusterRanges.data()); 123 | if(result != nvcluster_Result::NVCLUSTER_SUCCESS) 124 | { 125 | return result; 126 | } 127 | 128 | // Resize down to what was written. Let the user call shrink_to_fit() if the 129 | // object is not temporary. 
130 | segmentedClusterStorage.clusterItemRanges.resize(outputClusters.clusterCount); 131 | return result; 132 | } 133 | 134 | } // namespace nvcluster 135 | -------------------------------------------------------------------------------- /src/clusterizer.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | #pragma once 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | namespace nvcluster { 27 | 28 | inline MeshConnections makeMeshConnections(bool parallelize, const nvcluster_Config& inputConfig, const nvcluster_Input& input) 29 | { 30 | return makeMeshConnections(parallelize, ItemVertices(input.itemVertices, input.itemCount, inputConfig.itemVertexCount), 31 | input.vertexCount); 32 | } 33 | 34 | struct Input 35 | { 36 | Input(const nvcluster_Config& inputConfig, const nvcluster_Input& input, const nvcluster_Segments& inputSegments) 37 | : Input(inputConfig, 38 | std::span(reinterpret_cast(input.itemBoundingBoxes), input.itemCount), 39 | std::span(reinterpret_cast(input.itemCentroids), input.itemCount), 40 | std::span(reinterpret_cast(inputSegments.segmentItemRanges), inputSegments.segmentCount), 41 | maybeNull(reinterpret_cast(input.itemConnectionRanges), input.itemCount), 42 | maybeNull(input.connectionTargetItems, input.connectionCount), 43 | maybeNull(input.connectionWeights, input.connectionCount), 44 | maybeNull(input.connectionVertexBits, input.connectionCount)) 45 | { 46 | } 47 | 48 | Input(const nvcluster_Config& inputConfig, const nvcluster_Input& input, const nvcluster_Segments& inputSegments, const MeshConnections& meshConnections) 49 | : Input(inputConfig, 50 | std::span(reinterpret_cast(input.itemBoundingBoxes), input.itemCount), 51 | std::span(reinterpret_cast(input.itemCentroids), input.itemCount), 52 | std::span(reinterpret_cast(inputSegments.segmentItemRanges), inputSegments.segmentCount), 53 | meshConnections.connectionRanges, 54 | meshConnections.connectionItems, 55 | {}, // incompatible with auto-computed connections 56 | meshConnections.connectionVertexBits) 57 | { 58 | } 59 | 60 | Input(const nvcluster_Config& config_, 61 | std::span boundingBoxes_, 62 | std::span centroids_, 63 | std::span 
segments_, 64 | std::span itemConnectionRanges_ = {}, 65 | std::span connectionTargetItems_ = {}, 66 | std::span connectionWeights_ = {}, 67 | std::span connectionVertexBits_ = {}) 68 | : config(config_) 69 | , boundingBoxes(boundingBoxes_) 70 | , centroids(centroids_) 71 | , segments(segments_) 72 | , itemConnectionRanges(itemConnectionRanges_) 73 | , connectionTargetItems(connectionTargetItems_) 74 | , connectionWeights(connectionWeights_) 75 | , connectionVertexBits(connectionVertexBits_) 76 | { 77 | // NOTE: validation is done by the C API and none here to avoid throwing 78 | // more exceptions than the standard library already does, e.g. bad_alloc 79 | } 80 | 81 | // Minimal spatial-only input 82 | const nvcluster_Config& config; 83 | std::span boundingBoxes; 84 | std::span centroids; 85 | 86 | // Clusterize within each range of items 87 | std::span segments; 88 | 89 | // Optional connections (may be empty) 90 | std::span itemConnectionRanges; 91 | std::span connectionTargetItems; 92 | std::span connectionWeights; 93 | std::span connectionVertexBits; 94 | 95 | private: 96 | template 97 | std::span maybeNull(const T* ptr, uint32_t size) 98 | { 99 | return ptr ? 
std::span{ptr, size} : std::span{}; 100 | } 101 | }; 102 | 103 | struct OutputClusters 104 | { 105 | OutputClusters(nvcluster_OutputClusters& output, nvcluster_Range* outputSegments, uint32_t outputSegmentCount) 106 | : clusterItemRanges(reinterpret_cast(output.clusterItemRanges), output.clusterCount) 107 | , items(reinterpret_cast(output.items), output.itemCount) 108 | , segments(reinterpret_cast(outputSegments), outputSegmentCount) 109 | , clusterCount(output.clusterCount) 110 | , itemCount(output.itemCount) 111 | { 112 | } 113 | std::span clusterItemRanges; 114 | std::span items; 115 | std::span segments; 116 | uint32_t& clusterCount; // output count reference 117 | uint32_t& itemCount; // output count reference 118 | }; 119 | 120 | NVCLUSTER_API [[nodiscard]] nvcluster_Result clusterize(bool parallelize, const Input& input, const OutputClusters& clusters); 121 | 122 | } // namespace nvcluster 123 | -------------------------------------------------------------------------------- /src/underfill_cost.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | /// @file Heuristics to compute item/triangle and vertex "underfill costs", 20 | /// which encourage the clusterizer to form bigger clusters, still within the 21 | /// maximums. Separated from the source file for unit testing. 22 | #pragma once 23 | 24 | #include 25 | #include 26 | 27 | namespace nvcluster { 28 | 29 | // Switch to compute a connectedness metric that indicates how many vertices 30 | // will be duplicated after cutting the node, rather than assume the square root 31 | // of vertices will be cut. Takes a bit longer but can help with long skinny 32 | // geometry like triangle strips. 33 | static constexpr bool COMPUTE_AVERAGE_CUT_VERTICES = true; 34 | 35 | struct Underfill 36 | { 37 | uint32_t underfillCount = 0; 38 | 39 | // underfillCount is unique vertices if true, otherwise count is items 40 | bool vertexLimited = false; 41 | }; 42 | 43 | inline float guessRequiredClustersForVertexLimit(float currentVertices, float targetVertices) 44 | { 45 | // s=\frac{nv-2\sqrt{n}v+2\sqrt{\left(\sqrt{n}-1\right)^{4}v}+n-2\sqrt{n}+v+1}{\left(v-1\right)^{2}} 46 | float sqrtN = sqrtf(currentVertices); 47 | float sqrtNMinus1_4 = powf(sqrtN - 1.0f, 4.0f); 48 | float numerator = currentVertices * targetVertices // 49 | - 2.0f * sqrtN * targetVertices // 50 | + 2.0f * sqrtf(sqrtNMinus1_4 * targetVertices) // 51 | + currentVertices // 52 | - 2.0f * sqrtN // 53 | + targetVertices // 54 | + 1.0f; 55 | float denominator = (targetVertices - 1.0f) * (targetVertices - 1.0f); 56 | return numerator / denominator; 57 | } 58 | 59 | inline float guessRequiredClustersForVertexLimit(float currentVertices, float averageCutVertices, float targetVertices) 60 | { 61 | // (2 sqrt((a - 1)^2 (a^2 - 2 a v + n (v - 1) + v)) + 2 a^2 - 2 a (v + 1) + n (v - 1) + v + 1)/(v - 1)^2 62 | float a = averageCutVertices; 63 | float v = targetVertices; 64 | float n = 
currentVertices; 65 | float t1 = a * a - 2.0f * a * v + n * (v - 1.0f) + v; 66 | if(t1 < 0.0f) // candidate split with less than the average cut vertices (e.g. first or last few) 67 | return 1.0f; 68 | float t2 = 2.0f * (a - 1.0f) * sqrtf(t1) + 2.0f * a * a - 2.0f * a * (v + 1.0f) + n * (v - 1.0f) + v + 1.0f; 69 | return t2 / ((v - 1.0f) * (v - 1.0f)); 70 | } 71 | 72 | // Inverse of guessVertexLimitRequiredClusters() 73 | inline float guessVerticesAfterClustering(float currentVertices, float clusters) 74 | { 75 | // v\left(n,s\right)=\frac{\left(\sqrt{n}+\sqrt{s}-1\right)^{2}}{s} 76 | float t = sqrtf(currentVertices) + sqrtf(clusters) - 1.0f; 77 | return (t * t) / clusters; 78 | } 79 | 80 | // Inverse of guessVertexLimitRequiredClusters() 81 | inline float guessVerticesAfterClustering(float currentVertices, float averageCutVertices, float clusters) 82 | { 83 | // v(n, s) = (2 (a - 1) sqrt(s) - 2 a + n + s + 1)/s 84 | float t = 2.0f * (averageCutVertices - 1.0f) * sqrtf(clusters) - 2.0f * averageCutVertices + currentVertices + clusters + 1.0f; 85 | return t / clusters; 86 | } 87 | 88 | // Returns the number of items remaining to fill the last bucket 89 | inline uint32_t underfillCount(uint32_t bucketSize, uint32_t itemCount) 90 | { 91 | return div_ceil(itemCount, bucketSize) * bucketSize - itemCount; 92 | } 93 | 94 | // Computes the expected number of vertices less than the maximum in the 95 | // remaining cluster. This is entirely modelled off connections from shared 96 | // vertices between a rectangular grid of triangles. 97 | // TODO: remove AABB 98 | inline Underfill generalUnderfillCount(const Input& input, uint32_t itemCount, uint32_t vertexCount, float averageCutVertices) 99 | { 100 | float requiredClustersItems = float(itemCount) / float(input.config.maxClusterSize); 101 | float requiredClustersVertices = 102 | COMPUTE_AVERAGE_CUT_VERTICES ? 
103 | guessRequiredClustersForVertexLimit(float(vertexCount), averageCutVertices, float(input.config.maxClusterVertices)) : 104 | guessRequiredClustersForVertexLimit(float(vertexCount), float(input.config.maxClusterVertices)); 105 | 106 | if(requiredClustersItems > requiredClustersVertices) 107 | { 108 | // Item limited 109 | return {underfillCount(input.config.maxClusterSize, itemCount), false}; 110 | } 111 | else 112 | { 113 | // Vertex limited 114 | float clusterCount = ceilf(requiredClustersVertices - 1e-6f); 115 | float verticesPerCluster = COMPUTE_AVERAGE_CUT_VERTICES ? 116 | guessVerticesAfterClustering(float(vertexCount), averageCutVertices, clusterCount) : 117 | guessVerticesAfterClustering(float(vertexCount), clusterCount); 118 | float availableVertices = clusterCount * float(input.config.maxClusterVertices); 119 | float underfill = availableVertices - verticesPerCluster * clusterCount + 0.5f; 120 | assert(verticesPerCluster > 1.0f); 121 | assert(underfill >= 0.0f); 122 | return {uint32_t(underfill), true}; 123 | } 124 | } 125 | 126 | } // namespace nvcluster 127 | -------------------------------------------------------------------------------- /include/nvcluster/util/parallel_execution_libcxx.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | /// @file Shim for missing libc++ features. libc++ is the LLVM implementation of 20 | /// the standard library. This project was developed with libstdc++ (the GNU 21 | /// implementation) and MSVC STL. The contents of this file provides workarounds 22 | /// for missing features, and disables parallel execution in the process. 23 | /// See: https://github.com/nvpro-samples/nv_cluster_lod_builder/issues/1 24 | /// TODO: parallel execution with e.g. https://github.com/mikekazakov/pstld 25 | #pragma once 26 | 27 | #include 28 | #include 29 | 30 | // TODO: add a numerical comparison for _LIBCPP_VERSION if std::execution 31 | // support is added 32 | #if defined(_LIBCPP_VERSION) 33 | 34 | // Disable parallel execution as it is not supported by libc++ or this shim 35 | #if !defined(NVCLUSTER_MULTITHREADED) 36 | #define NVCLUSTER_MULTITHREADED 0 37 | #else 38 | #undef NVCLUSTER_MULTITHREADED 39 | #define NVCLUSTER_MULTITHREADED 0 40 | #endif 41 | 42 | namespace std { 43 | 44 | // If you see duplicate definitions here, filter out the current _LIBCPP_VERSION 45 | namespace execution { 46 | class sequenced_policy 47 | { 48 | }; 49 | class parallel_policy 50 | { 51 | }; 52 | class parallel_unsequenced_policy 53 | { 54 | }; 55 | class unsequenced_policy 56 | { 57 | }; 58 | inline constexpr sequenced_policy seq{}; 59 | inline constexpr parallel_policy par{}; 60 | inline constexpr parallel_unsequenced_policy par_unseq{}; 61 | inline constexpr unsequenced_policy unseq{}; 62 | } // namespace execution 63 | 64 | template 65 | void for_each(ExecutionPolicy&&, 66 | 67 | ForwardIt first, 68 | ForwardIt last, 69 | UnaryFunc f) 70 | { 71 | for_each(first, last, f); 72 | } 73 | 74 | template 75 | requires std::same_as, execution::sequenced_policy> 76 | ForwardIt2 inclusive_scan(ExecutionPolicy&&, ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first) 77 | { 
78 | return inclusive_scan(first, last, d_first); 79 | } 80 | 81 | template 82 | requires std::same_as, execution::sequenced_policy> 83 | || std::same_as, execution::parallel_policy> 84 | || std::same_as, execution::parallel_unsequenced_policy> 85 | || std::same_as, execution::unsequenced_policy> 86 | ForwardIt2 exclusive_scan(ExecutionPolicy&&, ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, T init) 87 | { 88 | static_assert(std::same_as, execution::sequenced_policy>); // SFINAE delayed error 89 | return exclusive_scan(first, last, d_first, init); 90 | } 91 | 92 | template 93 | requires std::same_as, execution::sequenced_policy> 94 | ForwardIt2 exclusive_scan(ExecutionPolicy&&, ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, T init, BinaryOp op) 95 | { 96 | return exclusive_scan(first, last, d_first, init, op); 97 | } 98 | 99 | template 100 | requires std::same_as, execution::sequenced_policy> 101 | ForwardIt2 transform_exclusive_scan(ExecutionPolicy&&, ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, T init, BinaryOp binary_op, UnaryOp unary_op) 102 | { 103 | return transform_exclusive_scan(first, last, d_first, init, binary_op, unary_op); 104 | } 105 | 106 | template 107 | requires std::same_as, execution::sequenced_policy> 108 | ForwardIt2 transform_inclusive_scan(ExecutionPolicy&&, 109 | ForwardIt1 first, 110 | ForwardIt1 last, 111 | ForwardIt2 d_first, 112 | 113 | BinaryOp binary_op, 114 | UnaryOp unary_op) 115 | { 116 | #if 1 117 | auto transformed_view = std::ranges::subrange(first, last) | std::views::transform(unary_op); 118 | return std::inclusive_scan(transformed_view.begin(), transformed_view.end(), d_first, binary_op); 119 | #else 120 | // possible bug in libc++: typename iterator_traits<_InputIterator>::value_type __init = __u(*__first); 121 | return transform_inclusive_scan(first, last, d_first, binary_op, unary_op); 122 | #endif 123 | } 124 | 125 | template 126 | requires std::same_as, execution::sequenced_policy> 127 | BidirIt 
stable_partition(ExecutionPolicy&&, 128 | 129 | BidirIt first, 130 | BidirIt last, 131 | UnaryPred p) 132 | { 133 | return stable_partition(first, last, p); 134 | } 135 | 136 | template 137 | requires std::same_as, execution::sequenced_policy> 138 | void sort(ExecutionPolicy&&, RandomIt first, RandomIt last) 139 | { 140 | return sort(first, last); 141 | } 142 | 143 | template 144 | requires std::same_as, execution::sequenced_policy> 145 | void sort(ExecutionPolicy&&, RandomIt first, RandomIt last, Compare comp) 146 | { 147 | return sort(first, last, comp); 148 | } 149 | 150 | template 151 | requires std::same_as, execution::sequenced_policy> 152 | T transform_reduce(ExecutionPolicy&&, ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, T init) 153 | { 154 | return transform_reduce(first1, last1, first2, init); 155 | } 156 | 157 | template 158 | requires std::same_as, execution::sequenced_policy> 159 | T transform_reduce(ExecutionPolicy&&, ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, T init, BinaryOp1 reduce, BinaryOp2 transform) 160 | { 161 | return transform_reduce(first1, last1, first2, init, reduce, transform); 162 | } 163 | 164 | // Workaround for missing atomic_ref in libc++ 165 | #if _LIBCPP_VERSION < 190000 166 | struct atomic_ref 167 | { 168 | atomic_ref(uint32_t& v) 169 | : value(&v) 170 | { 171 | } 172 | uint32_t operator++() { return reinterpret_cast&>(*value).operator++(); } 173 | uint32_t operator++(int) { return reinterpret_cast&>(*value).operator++(0); } 174 | uint32_t* value; 175 | }; 176 | #endif 177 | 178 | } // namespace std 179 | #endif 180 | -------------------------------------------------------------------------------- /include/nvcluster/util/parallel.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | #pragma once 20 | 21 | #include 22 | 23 | // Workaround for libc++ std::execution 24 | #include 25 | 26 | // Shortcut to select the parallel execution model depending on a bool, using 27 | // variable template specialization 28 | template 29 | inline constexpr auto exec = std::execution::seq; 30 | template <> 31 | inline constexpr auto exec = std::execution::par_unseq; 32 | 33 | // This is an iterator that counts upwards from an initial value. 34 | // std::views::iota would almost work for this, but iota on MSVC 2019 doesn't 35 | // support random access, which is necessary for parallelism. 36 | template 37 | struct iota_iterator 38 | { 39 | using value_type = T; 40 | // [iterator.traits] in the C++ standard requires this to be a signed type. 41 | // We choose int64_t here, because it's conceivable someone could use 42 | // T == uint32_t and then iterate over more than 2^31 - 1 elements. 
43 | using difference_type = int64_t; 44 | using pointer = T*; 45 | using reference = T&; 46 | using iterator_category = std::random_access_iterator_tag; 47 | iota_iterator() = default; 48 | iota_iterator(const iota_iterator& other) noexcept = default; 49 | iota_iterator(iota_iterator&& other) noexcept = default; 50 | iota_iterator& operator=(const iota_iterator& other) noexcept = default; 51 | iota_iterator& operator=(iota_iterator&& other) noexcept = default; 52 | iota_iterator(T i_) 53 | : i(i_) 54 | { 55 | } 56 | value_type operator*() const { return i; } 57 | iota_iterator& operator++() 58 | { 59 | ++i; 60 | return *this; 61 | } 62 | iota_iterator operator++(int) 63 | { 64 | iota_iterator t(*this); 65 | ++*this; 66 | return t; 67 | } 68 | iota_iterator& operator--() 69 | { 70 | --i; 71 | return *this; 72 | } 73 | iota_iterator operator--(int) 74 | { 75 | iota_iterator t(*this); 76 | --*this; 77 | return t; 78 | } 79 | iota_iterator operator+(difference_type d) const { return {static_cast(static_cast(i) + d)}; } 80 | iota_iterator operator-(difference_type d) const { return {static_cast(static_cast(i) - d)}; } 81 | iota_iterator& operator+=(difference_type d) 82 | { 83 | i = static_cast(static_cast(i) + d); 84 | return *this; 85 | } 86 | iota_iterator& operator-=(difference_type d) 87 | { 88 | i = static_cast(static_cast(i) - d); 89 | return *this; 90 | } 91 | bool operator==(const iota_iterator& other) const { return i == other.i; } 92 | bool operator!=(const iota_iterator& other) const { return i != other.i; } 93 | bool operator<(const iota_iterator& other) const { return i < other.i; } 94 | bool operator<=(const iota_iterator& other) const { return i <= other.i; } 95 | bool operator>(const iota_iterator& other) const { return i > other.i; } 96 | bool operator>=(const iota_iterator& other) const { return i >= other.i; } 97 | difference_type operator-(const iota_iterator& other) const 98 | { 99 | return static_cast(i) - static_cast(other.i); 100 | } 101 | 
friend iota_iterator operator+(difference_type n, const iota_iterator& it) { return it + n; } 102 | T operator[](difference_type d) const { return static_cast(static_cast(i) + d); } 103 | 104 | private: 105 | T i = 0; 106 | }; 107 | 108 | // Expresses the range from m_begin to m_end - 1. 109 | template 110 | struct iota_view 111 | { 112 | using iterator = iota_iterator; 113 | iota_view(T begin, T end) 114 | : m_begin(begin) 115 | , m_end(end) 116 | { 117 | } 118 | iterator begin() const { return {m_begin}; }; 119 | iterator end() const { return {m_end}; }; 120 | 121 | private: 122 | T m_begin, m_end; 123 | }; 124 | 125 | // Runs a function in parallel for each index from 0 to numItems - 1. Uses 126 | // batches of size BatchSize for reduced overhead and better autovectorization. 127 | // 128 | // BatchSize will also be used as the threshold for when to switch from 129 | // single-threaded to multi-threaded execution. For this reason, it should be set 130 | // to a power of 2 around where multi - threaded is faster than single - threaded for 131 | // the given function.Some examples are : 132 | // * 8192 for trivial workloads(a * x + y) 133 | // * 2048 for animation workloads(multiplication by a single matrix) 134 | // * 512 for more computationally heavy workloads(run XTEA) 135 | // * 1 for full parallelization(load an image) 136 | // 137 | // This is a simpler version of nvh::parallel_batches, which you can find in 138 | // nvpro_core. 139 | template 140 | inline void parallel_batches(size_t numItems, F&& fn) 141 | { 142 | if constexpr(!Parallelize) 143 | { 144 | // Explicit constexpr case to avoid linking to the parallel implementation 145 | // if it's not used (and can't partially specialize the function). 146 | for(size_t i = 0; i < numItems; i++) 147 | { 148 | fn(i); 149 | } 150 | } 151 | else 152 | { 153 | // For small item counts, it's fastest to use a single thread and avoid the 154 | // overhead from invoking a parallel executor. 
155 | if(numItems <= BatchSize) 156 | { 157 | for(size_t i = 0; i < numItems; i++) 158 | { 159 | fn(i); 160 | } 161 | } 162 | else 163 | { 164 | // Unroll the loop into batches of size BATCHSIZE or less. This worker 165 | // function will be run in parallel using 166 | // std::for_each(std::execution::par_unseq). 167 | const size_t numBatches = (numItems + BatchSize - 1) / BatchSize; 168 | auto worker = [&numItems, &fn](const size_t batchIndex) { 169 | const size_t start = BatchSize * batchIndex; 170 | const size_t itemsRemaining = numItems - start; 171 | // This split is necessary to make MSVC try to auto-vectorize the first 172 | // loop, which will be the most common case when numItems is large. 173 | if(itemsRemaining >= BatchSize) 174 | { 175 | // Exactly BATCHSIZE items to process 176 | for(size_t i = start; i < start + BatchSize; i++) 177 | { 178 | fn(i); 179 | } 180 | } 181 | else 182 | { 183 | // Variable-length loop 184 | for(size_t i = start; i < numItems; i++) 185 | { 186 | fn(i); 187 | } 188 | } 189 | }; 190 | 191 | // This runs the worker above for each batch from 0 to numBatches-1. 192 | iota_view batches(0, numBatches); 193 | std::for_each(std::execution::par_unseq, batches.begin(), batches.end(), worker); 194 | } 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /test/src/test_perf.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | namespace nb = ankerl::nanobench; 26 | 27 | struct SpatialDesc 28 | { 29 | SpatialDesc(const GeometryMesh& mesh) 30 | { 31 | boundingBoxes.resize(mesh.triangles.size()); 32 | std::ranges::transform(mesh.triangles, boundingBoxes.begin(), [&](vec3u tri) { return aabb(tri, mesh.positions); }); 33 | centroids.resize(boundingBoxes.size()); 34 | std::ranges::transform(boundingBoxes, centroids.begin(), [](AABB b) { return b.center(); }); 35 | } 36 | std::vector boundingBoxes; 37 | std::vector centroids; 38 | nvcluster_Input clusterInput(const GeometryMesh* mesh = nullptr) const 39 | { 40 | return nvcluster_Input{ 41 | .itemBoundingBoxes = reinterpret_cast(boundingBoxes.data()), 42 | .itemCentroids = reinterpret_cast(centroids.data()), 43 | .itemCount = uint32_t(boundingBoxes.size()), 44 | .itemConnectionRanges = nullptr, 45 | .connectionTargetItems = nullptr, 46 | .connectionWeights = nullptr, 47 | .connectionVertexBits = nullptr, 48 | .connectionCount = 0, 49 | .itemVertices = mesh ? reinterpret_cast(mesh->triangles.data()) : nullptr, 50 | .vertexCount = mesh ? 
uint32_t(mesh->triangles.size()) : 0, 51 | }; 52 | } 53 | }; 54 | 55 | TEST(Perf, All) 56 | { 57 | #if !defined(NDEBUG) 58 | GTEST_SKIP() << "Skipping performance tests in debug mode"; 59 | #else 60 | GeometryMesh sphere = makeIcosphere(4); 61 | SpatialDesc sphereDesc(sphere); 62 | GeometryMesh tree = generateTree(3); 63 | SpatialDesc treeDesc(tree); 64 | auto sphereSingleTri = sphere; 65 | sphereSingleTri.triangles.resize(1); 66 | nb::Bench() 67 | .minEpochTime(std::chrono::milliseconds(500)) 68 | .minEpochIterations(10) 69 | .warmup(1) 70 | .run("makeMeshConnections", [&] { nb::doNotOptimizeAway(makeMeshConnections(false, sphere)); }) 71 | .run("makeMeshConnections parallel", [&] { nb::doNotOptimizeAway(makeMeshConnections(true, sphere)); }) 72 | .run("makeMeshConnections parallel single tri", 73 | [&] { nb::doNotOptimizeAway(makeMeshConnections(true, sphereSingleTri)); }) 74 | .run("cluster sphere limit t=[28,32]", 75 | [&] { 76 | nb::doNotOptimizeAway(ClusterStorage( 77 | nvcluster_Config{ 78 | .minClusterSize = 28, 79 | .maxClusterSize = 32, 80 | .maxClusterVertices = ~0u, 81 | .costUnderfill = 0.0f, 82 | .costOverlap = 0.0f, 83 | .costUnderfillVertices = 0.0f, 84 | .itemVertexCount = 3, 85 | .preSplitThreshold = 0, 86 | }, 87 | sphereDesc.clusterInput())); 88 | }) 89 | .run("cluster sphere limit t=[28,32], v=32*3", 90 | [&] { 91 | nb::doNotOptimizeAway(ClusterStorage( 92 | nvcluster_Config{ 93 | .minClusterSize = 28, 94 | .maxClusterSize = 32, 95 | .maxClusterVertices = 32 * 3, 96 | .costUnderfill = 0.0f, 97 | .costOverlap = 0.0f, 98 | .costUnderfillVertices = 0.0f, 99 | .itemVertexCount = 3, 100 | .preSplitThreshold = 0, 101 | }, 102 | sphereDesc.clusterInput(&sphere))); 103 | }) 104 | .run("cluster sphere limit t=[28,32], v=16", 105 | [&] { 106 | nb::doNotOptimizeAway(ClusterStorage( 107 | nvcluster_Config{ 108 | .minClusterSize = 28, 109 | .maxClusterSize = 32, 110 | .maxClusterVertices = 16, 111 | .costUnderfill = 0.0f, 112 | .costOverlap = 0.0f, 113 | 
.costUnderfillVertices = 0.0f, 114 | .itemVertexCount = 3, 115 | .preSplitThreshold = 0, 116 | }, 117 | sphereDesc.clusterInput(&sphere))); 118 | }) 119 | .run("cluster sphere limit t=[28,32], v=16, costs", 120 | [&] { 121 | nb::doNotOptimizeAway(ClusterStorage( 122 | nvcluster_Config{ 123 | .minClusterSize = 28, 124 | .maxClusterSize = 32, 125 | .maxClusterVertices = 16, 126 | .costUnderfill = 0.1f, 127 | .costOverlap = 0.1f, 128 | .costUnderfillVertices = 0.1f, 129 | .itemVertexCount = 3, 130 | .preSplitThreshold = 0, 131 | }, 132 | sphereDesc.clusterInput(&sphere))); 133 | }) 134 | .run("cluster tree limit t=[28,32]", 135 | [&] { 136 | nb::doNotOptimizeAway(ClusterStorage( 137 | nvcluster_Config{ 138 | .minClusterSize = 28, 139 | .maxClusterSize = 32, 140 | .maxClusterVertices = ~0u, 141 | .costUnderfill = 0.0f, 142 | .costOverlap = 0.0f, 143 | .costUnderfillVertices = 0.0f, 144 | .itemVertexCount = 3, 145 | .preSplitThreshold = 0, 146 | }, 147 | treeDesc.clusterInput())); 148 | }) 149 | .run("cluster tree limit t=[28,32], v=16", 150 | [&] { 151 | nb::doNotOptimizeAway(ClusterStorage( 152 | nvcluster_Config{ 153 | .minClusterSize = 28, 154 | .maxClusterSize = 32, 155 | .maxClusterVertices = 16, 156 | .costUnderfill = 0.0f, 157 | .costOverlap = 0.0f, 158 | .costUnderfillVertices = 0.0f, 159 | .itemVertexCount = 3, 160 | .preSplitThreshold = 0, 161 | }, 162 | treeDesc.clusterInput(&tree))); 163 | }) 164 | .run("cluster tree limit t=[28,32], v=16, costs", [&] { 165 | nb::doNotOptimizeAway(ClusterStorage( 166 | nvcluster_Config{ 167 | .minClusterSize = 28, 168 | .maxClusterSize = 32, 169 | .maxClusterVertices = 16, 170 | .costUnderfill = 0.1f, 171 | .costOverlap = 0.1f, 172 | .costUnderfillVertices = 0.1f, 173 | .itemVertexCount = 3, 174 | .preSplitThreshold = 0, 175 | }, 176 | treeDesc.clusterInput(&tree))); 177 | }); 178 | #endif 179 | } 180 | -------------------------------------------------------------------------------- /test/src/tree_gen.hpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | #pragma once 20 | 21 | // AI generated... 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | static constexpr float g_twoPi = 6.28318530718f; 30 | 31 | inline vec3f evaluateBezier(const vec3f& p0, const vec3f& p1, const vec3f& p2, float t) 32 | { 33 | float u = 1 - t; 34 | float tt = t * t; 35 | float uu = u * u; 36 | 37 | vec3f point = uu * p0; // Quadratic term 38 | point += 2 * u * t * p1; // Linear term 39 | point += tt * p2; // Constant term 40 | 41 | return point; 42 | } 43 | 44 | inline GeometryMesh makeTriangleStrip(std::function path, uint32_t segments, float width) 45 | { 46 | GeometryMesh mesh; 47 | 48 | float e = 0.001f; 49 | for(uint32_t i = 0; i <= segments; ++i) 50 | { 51 | float t = float(i) / float(segments); 52 | 53 | vec3f position = path(t); 54 | vec3f dp1 = path(t + e) - path(t); // First derivative (tangent vector) 55 | vec3f dp2 = path(t + 2 * e) - 2 * path(t + e) + path(t); // Second derivative 56 | 57 | vec3f normal = cross(dp1, dp2); 58 | normal = normalize(normal) * width; // Scale normal to desired strip width 59 | 60 | vec3f leftPoint = 
position - normal * 0.5f; 61 | vec3f rightPoint = position + normal * 0.5f; 62 | 63 | mesh.positions.push_back(leftPoint); 64 | mesh.positions.push_back(rightPoint); 65 | 66 | if(i > 0) // Add triangle indices after the first segment 67 | { 68 | size_t idx = mesh.positions.size(); 69 | mesh.triangles.push_back({idx - 2, idx - 3, idx - 1}); 70 | mesh.triangles.push_back({idx - 2, idx - 4, idx - 3}); 71 | } 72 | } 73 | 74 | return mesh; 75 | } 76 | 77 | inline GeometryMesh makeBranch(std::function path, uint32_t segments, uint32_t segmentsCircular, float radius) 78 | { 79 | GeometryMesh mesh; 80 | 81 | float e = 0.001f; 82 | for(uint32_t j = 0; j <= segments; ++j) 83 | { 84 | float t = float(j) / float(segments); 85 | vec3f position = path(t); 86 | vec3f tangent = normalize(path(t + e) - path(t)); // Compute tangent vector 87 | vec3f normal = normalize(vec3f(-tangent[1], tangent[0], tangent[2])); // Arbitrary normal perpendicular to tangent 88 | vec3f binormal = normalize(cross(tangent, normal)); // Compute binormal for perpendicularity 89 | 90 | size_t baseIndex = mesh.positions.size(); 91 | 92 | float segmentRadius = powf(1.0f - t, 0.1f) * radius; 93 | // Generate vertices for a ring around the path position 94 | for(uint32_t i = 0; i < segmentsCircular; ++i) 95 | { 96 | float angle = (g_twoPi * float(i)) / float(segmentsCircular); // Corrected calculation of angle using segmentsCircular 97 | vec3f offset = segmentRadius * (cosf(angle) * normal + sinf(angle) * binormal); 98 | vec3f vertex = position + offset; 99 | 100 | mesh.positions.push_back(vertex); 101 | } 102 | 103 | // Add triangle indices for the cylinder body 104 | if(j > 0) 105 | { 106 | for(uint32_t i = 0; i < segmentsCircular; ++i) 107 | { 108 | uint32_t next = (i + 1) % segmentsCircular; // Corrected modulo operation to properly handle adjacency 109 | 110 | mesh.triangles.push_back({baseIndex + i, baseIndex + next, baseIndex + i - segmentsCircular}); 111 | mesh.triangles.push_back({baseIndex + next, 
baseIndex + next - segmentsCircular, baseIndex + i - segmentsCircular}); 112 | } 113 | } 114 | } 115 | 116 | return mesh; 117 | } 118 | 119 | inline GeometryMesh makeCone(std::function path, float t, float radius, uint32_t segments) 120 | { 121 | GeometryMesh mesh; 122 | 123 | vec3f position = path(t); 124 | vec3f tangent = normalize(path(t + 0.01f) - path(t)); // Compute tangent vector 125 | vec3f normal = normalize(vec3f(-tangent[1], tangent[0], tangent[2])); // Arbitrary normal perpendicular to tangent 126 | vec3f binormal = cross(tangent, normal); // Compute binormal for perpendicularity 127 | 128 | // Generate vertices for the base ring 129 | size_t baseIndex = mesh.positions.size(); 130 | for(uint32_t j = 0; j <= segments; ++j) 131 | { 132 | float angle = (g_twoPi * float(j)) / float(segments); 133 | vec3f offset = radius * (cosf(angle) * normal + sinf(angle) * binormal); 134 | vec3f vertex = position + offset + tangent * radius; 135 | mesh.positions.push_back(vertex); 136 | } 137 | 138 | // Add the tip of the cone 139 | vec3f tip = position; // Position for cone tip 140 | uint32_t tipIndex = uint32_t(mesh.positions.size()); 141 | mesh.positions.push_back(tip); 142 | 143 | // Add triangle indices for the cone 144 | for(uint32_t i = 0; i < segments; ++i) 145 | { 146 | uint32_t next = (i + 1) % segments; 147 | mesh.triangles.push_back({baseIndex + i, baseIndex + next, tipIndex}); 148 | } 149 | 150 | return mesh; 151 | } 152 | 153 | inline GeometryMesh mergeMeshes(const GeometryMesh& mesh1, const GeometryMesh& mesh2) 154 | { 155 | GeometryMesh mergedMesh; 156 | 157 | // Combine positions 158 | mergedMesh.positions = mesh1.positions; 159 | mergedMesh.positions.insert(mergedMesh.positions.end(), mesh2.positions.begin(), mesh2.positions.end()); 160 | 161 | // Combine triangles, adjusting the indices of the second mesh 162 | mergedMesh.triangles = mesh1.triangles; 163 | size_t offset = mesh1.positions.size(); // Offset for indices of mesh2 164 | 
mergedMesh.triangles.reserve(mesh1.triangles.size() + mesh2.triangles.size()); 165 | for(const auto& triangle : mesh2.triangles) 166 | { 167 | mergedMesh.triangles.push_back({triangle[0] + offset, triangle[1] + offset, triangle[2] + offset}); 168 | } 169 | 170 | return mergedMesh; 171 | } 172 | 173 | inline float unitRand() 174 | { 175 | static std::mt19937 gen(0); 176 | static std::uniform_real_distribution dis(0.0f, 1.0f); 177 | return dis(gen); 178 | } 179 | 180 | inline uint32_t intRand(uint32_t min, uint32_t max) 181 | { 182 | static std::mt19937 gen(0); 183 | std::uniform_int_distribution dis(min, max); 184 | return dis(gen); 185 | } 186 | 187 | inline std::function branchPath(vec3f base, float sideScale, float height) 188 | { 189 | float angle = unitRand() * g_twoPi; 190 | vec2f side = vec2f{cosf(angle), sinf(angle)} * sideScale; 191 | return [base, side, height](float t) { 192 | return evaluateBezier(base, base + vec3f{side[0], height * 0.3f, side[1]}, base + vec3f{side[0], height, side[1]}, t); 193 | }; 194 | } 195 | 196 | inline void generateTree(GeometryMesh& treeMesh, vec3f base, float side, float height, uint32_t depth) 197 | { 198 | // Generate a branch path 199 | auto path = branchPath(base, side * (depth > 3 ? 0.3f : 1.0f), height); 200 | 201 | // Create geometry for the branch 202 | if(depth == 0) 203 | { 204 | treeMesh = mergeMeshes(treeMesh, makeTriangleStrip(path, 3u + uint32_t(height * 3.0f), height * 0.3f)); 205 | 206 | // Add cones along the triangle strip 207 | uint32_t cones = intRand(2, 5); 208 | for(uint32_t i = 1; i < cones; ++i) 209 | { 210 | treeMesh = mergeMeshes(treeMesh, makeCone(path, (float(i) - unitRand() * 0.5f) / float(cones + 1), height * 0.2f, 5)); 211 | } 212 | } 213 | else 214 | { 215 | treeMesh = mergeMeshes(treeMesh, makeBranch(path, depth * 2 + 4, depth + 4, height * 0.05f)); 216 | 217 | uint32_t branches = depth == 2 ? 
intRand(4, 16) : intRand(2, 5); 218 | for(uint32_t i = 1; i <= branches; ++i) 219 | { 220 | generateTree(treeMesh, path((float(i) + unitRand() * 0.5f) / float(branches + 1)), side * (0.5f + 0.4f * unitRand()), 221 | height * (0.4f + 0.5f * unitRand()), depth - intRand(1, std::max(1u, depth / 2u))); 222 | } 223 | } 224 | } 225 | 226 | inline GeometryMesh generateTree(uint32_t levels = 4) 227 | { 228 | GeometryMesh treeMesh; 229 | generateTree(treeMesh, {0.0f, 0.0f, 0.0f}, 4.0f, 6.0f, levels); 230 | treeMesh.name = "tree_l" + std::to_string(levels); 231 | return treeMesh; 232 | } 233 | -------------------------------------------------------------------------------- /doc/clusters.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | TrianglesQuadsClusters 13 | -------------------------------------------------------------------------------- /doc/underfill_cost.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | C = [1, 4] 75 | Underfill cost 76 | 77 | 78 | -------------------------------------------------------------------------------- /include/nvcluster/util/objects.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | #pragma once 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #ifdef min 34 | #error "Preprocessor min defined. Add NOMINMAX to the build system" 35 | #endif 36 | 37 | #ifdef max 38 | #undef "Preprocessor max defined. Add NOMINMAX to the build system" 39 | #endif 40 | 41 | namespace nvcluster { 42 | 43 | // Returns the ceiling of an integer division. Assumes positive values. 44 | template 45 | T div_ceil(const T& a, const T& b) 46 | { 47 | return (a + b - 1) / b; 48 | } 49 | 50 | // A tiny and general vector implementation, like glm 51 | // clang-format off 52 | template 53 | requires std::is_arithmetic_v 54 | struct vec : std::array { 55 | using std::array::array; 56 | using std::array::operator[]; 57 | using std::array::begin; 58 | using std::array::end; 59 | 60 | [[nodiscard]] constexpr vec() noexcept : std::array{} {} // zero initialize 61 | [[nodiscard]] constexpr vec(T all) noexcept { std::ranges::fill(*this, all); } 62 | 63 | // Workaround for aggregate std::array initialization, 64 | // https://stackoverflow.com/questions/8192185 65 | // TODO: remove unsafe static_cast! Not sure what to do to mimic brace initialization 66 | template 67 | requires (sizeof...(U) == N) && (std::is_convertible_v && ...) 68 | [[nodiscard]] constexpr vec(const U&... 
init) noexcept : std::array{ {static_cast(init)...} } {} 69 | 70 | // Creating an apply(..., std::plus()) could work too 71 | constexpr vec& operator+=(const vec& v) { for (std::size_t i = 0; i < N; ++i) (*this)[i] += v[i]; return *this; } 72 | constexpr vec& operator-=(const vec& v) { for (std::size_t i = 0; i < N; ++i) (*this)[i] -= v[i]; return *this; } 73 | constexpr vec& operator*=(T s) { for (std::size_t i = 0; i < N; ++i) (*this)[i] *= s; return *this; } 74 | constexpr vec& operator/=(T s) { for (std::size_t i = 0; i < N; ++i) (*this)[i] /= s; return *this; } 75 | 76 | [[nodiscard]] constexpr vec operator-() const { vec r; for (std::size_t i = 0; i < N; ++i) r[i] = -(*this)[i]; return r; } 77 | 78 | // "Hidden friends" for faster compilation 79 | [[nodiscard]] friend constexpr vec operator+(const vec& a, const vec& b) { return vec(a) += b; } 80 | [[nodiscard]] friend constexpr vec operator-(const vec& a, const vec& b) { return vec(a) -= b; } 81 | [[nodiscard]] friend constexpr vec operator*(const vec& v, T s) { return vec(v) *= s; } 82 | [[nodiscard]] friend constexpr vec operator*(T s, const vec& v) { return v * s; } 83 | [[nodiscard]] friend constexpr vec operator/(const vec& v, T s) { return vec(v) /= s; } 84 | [[nodiscard]] friend constexpr bool operator==(const vec& a, const vec& b) { return std::ranges::equal(a, b); } 85 | [[nodiscard]] friend constexpr bool operator!=(const vec& a, const vec& b) { return !(a == b); } 86 | 87 | operator nvcluster_Vec3f() const requires (N == 3 && std::same_as) { return {(*this)[0], (*this)[1], (*this)[2]}; } 88 | }; 89 | template [[nodiscard]] constexpr vec min(const vec& a, const vec& b) { vec r; for (std::size_t i = 0; i < N; ++i) r[i] = std::min(a[i], b[i]); return r; } 90 | template [[nodiscard]] constexpr vec max(const vec& a, const vec& b) { vec r; for (std::size_t i = 0; i < N; ++i) r[i] = std::max(a[i], b[i]); return r; } 91 | template [[nodiscard]] constexpr vec clamp(const vec& v, const vec& min_v, const vec& 
max_v) { vec r; for (std::size_t i = 0; i < N; ++i) r[i] = std::clamp(v[i], min_v[i], max_v[i]); return r; } 92 | template [[nodiscard]] constexpr T dot(const vec& a, const vec& b) { T r{}; for (std::size_t i = 0; i < N; ++i) r += a[i] * b[i]; return r; } 93 | template [[nodiscard]] constexpr T length_squared(const vec& v) { return dot(v, v); } 94 | template [[nodiscard]] T length(const vec& v) requires std::floating_point { return std::sqrt(length_squared(v)); } 95 | template [[nodiscard]] vec normalize(const vec& v) requires std::floating_point { return v * (T{1} / length(v)); } 96 | template [[nodiscard]] constexpr vec cross(const vec& a, const vec& b) requires (N == 3) && std::is_signed_v { 97 | return {a[1]*b[2] - a[2]*b[1], a[2]*b[0] - a[0]*b[2], a[0]*b[1] - a[1]*b[0]}; 98 | } 99 | // clang-format on 100 | 101 | using vec2f = vec; 102 | using vec3f = vec; 103 | using vec4f = vec; 104 | using vec2u = vec; 105 | using vec3u = vec; 106 | using vec4u = vec; 107 | using vec2i = vec; 108 | using vec3i = vec; 109 | using vec4i = vec; 110 | static_assert(sizeof(nvcluster_Vec3f) == sizeof(vec3f)); 111 | 112 | // Axis aligned bounding box 113 | struct AABB 114 | { 115 | vec3f min, max; 116 | 117 | // Plus returns the union of bounding boxes. 118 | // [[nodiscard]] allows the compiler to warn if the return value is ignored, 119 | // which would be a bug. E.g. 
a + b; but should be a += b; 120 | [[nodiscard]] constexpr AABB operator+(const AABB& other) const 121 | { 122 | return {nvcluster::min(min, other.min), nvcluster::max(max, other.max)}; 123 | } 124 | constexpr AABB& operator+=(const AABB& other) { return *this = *this + other; }; 125 | 126 | [[nodiscard]] constexpr vec3f size() const { return max - min; } 127 | [[nodiscard]] constexpr vec3f center() const { return (min + max) * 0.5f; } 128 | [[nodiscard]] constexpr vec3f positive_size() const { return nvcluster::max(vec3f(0.0f), size()); } 129 | [[nodiscard]] constexpr AABB positive() const { return {min, min + positive_size()}; } 130 | [[nodiscard]] constexpr float half_area() const 131 | { 132 | auto s = size(); 133 | return s[0] * (s[1] + s[2]) + s[1] * s[2]; 134 | } 135 | [[nodiscard]] constexpr AABB intersect(const AABB& other) const 136 | { 137 | return AABB{nvcluster::max(min, other.min), nvcluster::min(max, other.max)}.positive(); 138 | } 139 | [[nodiscard]] constexpr static AABB empty() 140 | { 141 | return {vec3f{std::numeric_limits::max()}, vec3f{std::numeric_limits::lowest()}}; 142 | } 143 | operator nvcluster_AABB() const { return {{min[0], min[1], min[2]}, {max[0], max[1], max[2]}}; } 144 | }; 145 | static_assert(sizeof(nvcluster_AABB) == sizeof(AABB)); 146 | 147 | // An index/cursor based subrange 148 | struct Range 149 | { 150 | uint32_t offset = {}; 151 | uint32_t count = {}; 152 | 153 | // Use iota() to make the range iterable 154 | // E.g.: for(uint32_t i : range.indices()) ... 
155 | // std::views::iota() is similar to python's range() 156 | [[nodiscard]] auto indices() const { return std::views::iota(offset, offset + count); } 157 | [[nodiscard]] constexpr uint32_t end() const { return offset + count; } 158 | operator nvcluster_Range() { return {offset, count}; } 159 | }; 160 | static_assert(sizeof(nvcluster_Range) == sizeof(Range)); 161 | 162 | } // namespace nvcluster 163 | 164 | // hashing functions from https://stackoverflow.com/questions/35985960/c-why-is-boosthash-combine-the-best-way-to-combine-hash-values 165 | namespace { 166 | 167 | template 168 | constexpr T xorshift(const T& n, int i) 169 | { 170 | return n ^ (n >> i); 171 | } 172 | 173 | inline constexpr uint32_t hash(const uint32_t& n) 174 | { 175 | uint32_t p = 0x55555555ul; // pattern of alternating 0 and 1 176 | uint32_t c = 3423571495ul; // random uneven integer constant; 177 | return c * xorshift(p * xorshift(n, 16), 16); 178 | } 179 | 180 | inline constexpr uint64_t hash(const uint64_t& n) 181 | { 182 | uint64_t p = 0x5555555555555555ull; // pattern of alternating 0 and 1 183 | uint64_t c = 17316035218449499591ull; // random uneven integer constant; 184 | return c * xorshift(p * xorshift(n, 32), 32); 185 | } 186 | 187 | // call this function with the old seed and the new key to be hashed and 188 | // combined into the new seed value, respectively the final hash 189 | template 190 | constexpr size_t hash_combine(std::size_t& seed, const T& v) 191 | { 192 | return seed = std::rotl(seed, std::numeric_limits::digits / 3) ^ hash(std::hash{}(v)); 193 | } 194 | 195 | // From: https://blog.infotraining.pl/how-to-hash-objects-without-repetition 196 | template 197 | requires(sizeof...(TValues) > 1) 198 | constexpr size_t combined_hash(const TValues&... 
values) 199 | { 200 | size_t seed{}; 201 | (..., hash_combine(seed, values)); 202 | return seed; 203 | } 204 | 205 | // Adapter for std::array 206 | template 207 | constexpr size_t array_hash(const std::array& arr) 208 | { 209 | return [&arr](std::index_sequence) { return combined_hash(arr[I]...); }(std::make_index_sequence{}); 210 | } 211 | 212 | } // anonymous namespace 213 | 214 | // Define a hash so vec3 can be used in e.g. std::unordered_map 215 | template 216 | struct std::hash> 217 | { 218 | std::size_t operator()(const nvcluster::vec& v) const noexcept { return array_hash(v); } 219 | }; 220 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. 
You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /test/src/test_util.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | #pragma once 20 | 21 | #include 22 | #include // internal, for unit testing 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include // for vec3 etc. 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | using nvcluster::AABB; 35 | using nvcluster::Range; 36 | using nvcluster::vec2f; 37 | using nvcluster::vec3f; 38 | using nvcluster::vec3u; 39 | 40 | // Computes the axis-aligned bounding box of a triangle with the given indices. 
41 | inline AABB aabb(vec3u triangle, std::span positions) 42 | { 43 | using namespace nvcluster; 44 | return {min(min(positions[triangle[0]], positions[triangle[1]]), positions[triangle[2]]), 45 | max(max(positions[triangle[0]], positions[triangle[1]]), positions[triangle[2]])}; 46 | } 47 | 48 | template 49 | bool allUnique(const Range& range) 50 | { 51 | std::unordered_set unique(std::begin(range), std::end(range)); 52 | return unique.size() == std::ranges::size(range); 53 | } 54 | 55 | template 56 | bool contains(const Range& range, std::ranges::range_value_t value) 57 | { 58 | return std::ranges::find(range, value) != std::end(range); 59 | } 60 | 61 | // Shortcut for passing Range offset and count to std::span::subspan(), which 62 | // returns a span pointing to a possibly smaller range of values. 63 | template 64 | constexpr auto subspan(Items& items, nvcluster_Range range) 65 | { 66 | assert(range.count == 0 || range.offset < std::ranges::size(items)); 67 | assert(range.count == 0 || range.offset + range.count <= std::ranges::size(items)); 68 | return std::span(items).subspan(range.offset, range.count); 69 | } 70 | 71 | // Simple mesh struct. Triangle indices and vertex positions. Plus a name for 72 | // better context when tests fail. 
73 | struct GeometryMesh 74 | { 75 | std::string name; 76 | std::vector triangles; 77 | std::vector positions; 78 | 79 | // Dump the mesh to a .obj file for testing 80 | void write(std::ostream& os) const 81 | { 82 | os << "g mesh\n"; 83 | for(auto& p : positions) 84 | os << "v " << p[0] << " " << p[1] << " " << p[2] << "\n"; 85 | for(auto& t : triangles) 86 | os << "f " << t[0] + 1 << " " << t[1] + 1 << " " << t[2] + 1 << "\n"; 87 | }; 88 | void write(const std::filesystem::path& path) const 89 | { 90 | std::ofstream ofile(path); 91 | write(ofile); 92 | } 93 | }; 94 | 95 | inline nvcluster::MeshConnections makeMeshConnections(bool parallelize, const GeometryMesh& mesh) 96 | { 97 | return nvcluster::makeMeshConnections(parallelize, 98 | nvcluster::ItemVertices(reinterpret_cast(mesh.triangles.data()), 99 | uint32_t(mesh.triangles.size()), 3u), 100 | uint32_t(mesh.positions.size())); 101 | } 102 | 103 | inline void check(nvcluster_Result result) 104 | { 105 | if(result != nvcluster_Result::NVCLUSTER_SUCCESS) 106 | throw std::runtime_error(nvclusterResultString(result)); 107 | } 108 | 109 | // nvcluster_Context wrapper handles ownership, lifetime, doesn't leak when 110 | // tests return etc. 
111 | struct ScopedContext 112 | { 113 | ScopedContext(const nvcluster_ContextCreateInfo& createInfo = {}) 114 | { 115 | check(nvclusterCreateContext(&createInfo, &context)); 116 | } 117 | ~ScopedContext() { std::ignore = nvclusterDestroyContext(context); } 118 | ScopedContext(const ScopedContext& other) = delete; 119 | ScopedContext& operator=(const ScopedContext& other) = delete; 120 | operator nvcluster_Context() const { return context; } 121 | nvcluster_Context context = nullptr; 122 | }; 123 | 124 | // Shortcut to build clusters from various forms of inputs 125 | struct ClusterStorage : nvcluster::ClusterStorage 126 | { 127 | // External API 128 | ClusterStorage(const nvcluster_Config& config, const nvcluster_Input& input) 129 | { 130 | check(generateClusters(ScopedContext(), config, input, *this)); 131 | } 132 | 133 | // Internal interface, for unit testing 134 | ClusterStorage(const nvcluster::Input& input) 135 | { 136 | if(input.segments.size() != 1) 137 | throw std::runtime_error("segmented clustering not implemented in this test"); 138 | nvcluster_Counts requiredCounts; 139 | check(nvclusterGetRequirements(ScopedContext(), &input.config, uint32_t(input.boundingBoxes.size()), &requiredCounts)); 140 | clusterItemRanges.resize(requiredCounts.clusterCount); 141 | items.resize(input.boundingBoxes.size()); 142 | nvcluster_OutputClusters output{.clusterItemRanges = clusterItemRanges.data(), 143 | .items = items.data(), 144 | .clusterCount = uint32_t(clusterItemRanges.size()), 145 | .itemCount = uint32_t(items.size())}; 146 | nvcluster_Range outputSegment{}; 147 | check(clusterize(true, input, nvcluster::OutputClusters(output, &outputSegment, 1))); 148 | if(outputSegment.offset != 0 || size_t(outputSegment.count) != output.clusterCount) 149 | throw std::runtime_error("expected one segment with everything"); 150 | clusterItemRanges.resize(output.clusterCount); 151 | } 152 | }; 153 | 154 | // Returns the number of unique vertices per cluster to verify the vertex 
limit 155 | // feature 156 | inline std::vector countClusterVertices(const nvcluster::ClusterStorage& clustering, const GeometryMesh& mesh) 157 | { 158 | std::vector result; 159 | result.reserve(clustering.clusterItemRanges.size()); 160 | for(nvcluster_Range r : clustering.clusterItemRanges) 161 | { 162 | std::span cluster = subspan(clustering.items, r); 163 | std::unordered_set uniqueVertices; 164 | for(auto i : cluster) 165 | { 166 | uniqueVertices.insert(mesh.triangles[i][0]); 167 | uniqueVertices.insert(mesh.triangles[i][1]); 168 | uniqueVertices.insert(mesh.triangles[i][2]); 169 | } 170 | result.push_back(uint32_t(uniqueVertices.size())); 171 | } 172 | return result; 173 | } 174 | 175 | // Icosahedron data. 176 | namespace icosahedron { 177 | constexpr float X = .525731112119133606f; 178 | constexpr float Z = .850650808352039932f; 179 | static std::array positions = {{{-X, 0.0, Z}, 180 | {X, 0.0, Z}, 181 | {-X, 0.0, -Z}, 182 | {X, 0.0, -Z}, 183 | {0.0, Z, X}, 184 | {0.0, Z, -X}, 185 | {0.0, -Z, X}, 186 | {0.0, -Z, -X}, 187 | {Z, X, 0.0}, 188 | {-Z, X, 0.0}, 189 | {Z, -X, 0.0}, 190 | {-Z, -X, 0.0}}}; 191 | static std::array triangles = {{{0, 4, 1}, {0, 9, 4}, {9, 5, 4}, {4, 5, 8}, {4, 8, 1}, 192 | {8, 10, 1}, {8, 3, 10}, {5, 3, 8}, {5, 2, 3}, {2, 7, 3}, 193 | {7, 10, 3}, {7, 6, 10}, {7, 11, 6}, {11, 0, 6}, {0, 1, 6}, 194 | {6, 1, 10}, {9, 0, 11}, {9, 11, 2}, {9, 2, 5}, {7, 2, 11}}}; 195 | } // namespace icosahedron 196 | 197 | // Type of a function to call when creating a triangle. Takes 3 positions as 198 | // inputs. 199 | using triangle_callback = std::function; 200 | 201 | // Recursively subdivides a triangle on a sphere by a factor of 2^depth. 202 | // Calls the callback function on each new triangle. 
203 | inline void subdivide(vec3f v0, vec3f v1, vec3f v2, int depth, triangle_callback& callback) 204 | { 205 | if(depth == 0) 206 | { 207 | callback(v0, v1, v2); 208 | } 209 | else 210 | { 211 | vec3f v01 = normalize(v0 + v1); 212 | vec3f v12 = normalize(v1 + v2); 213 | vec3f v20 = normalize(v2 + v0); 214 | subdivide(v0, v01, v20, depth - 1, callback); 215 | subdivide(v1, v12, v01, depth - 1, callback); 216 | subdivide(v2, v20, v12, depth - 1, callback); 217 | subdivide(v01, v12, v20, depth - 1, callback); 218 | } 219 | } 220 | 221 | // Makes an icosphere with 20 * (4^depth) triangles. 222 | inline void makeIcosphere(int depth, triangle_callback& callback) 223 | { 224 | for(size_t i = 0; i < icosahedron::triangles.size(); i++) 225 | { 226 | const vec3f v0 = icosahedron::positions[icosahedron::triangles[i][0]]; 227 | const vec3f v1 = icosahedron::positions[icosahedron::triangles[i][1]]; 228 | const vec3f v2 = icosahedron::positions[icosahedron::triangles[i][2]]; 229 | subdivide(v0, v1, v2, depth, callback); 230 | } 231 | } 232 | 233 | inline GeometryMesh makeIcosphere(int subdivision) 234 | { 235 | std::unordered_map vertexCache; 236 | std::vector triangles; 237 | // Our triangle callback function tries to place each of the vertices in the 238 | // vertex cache; each of the `it` iterators point to the existing value if 239 | // the vertex was already in the cache, or to a new value at the end of the 240 | // cache if it's a new vertex. 
241 | triangle_callback callback = [&vertexCache, &triangles](vec3f v0, vec3f v1, vec3f v2) { 242 | auto [it0, new0] = vertexCache.try_emplace(v0, static_cast(vertexCache.size())); 243 | auto [it1, new1] = vertexCache.try_emplace(v1, static_cast(vertexCache.size())); 244 | auto [it2, new2] = vertexCache.try_emplace(v2, static_cast(vertexCache.size())); 245 | triangles.push_back({it0->second, it1->second, it2->second}); 246 | }; 247 | makeIcosphere(subdivision, callback); 248 | std::vector positions(vertexCache.size()); 249 | for(const auto& [position, index] : vertexCache) 250 | { 251 | positions[index] = position; 252 | } 253 | [[maybe_unused]] size_t edgeCount = (triangles.size() * 3) / 2; // 3 edges per triangle and each is shared between two trianlges exactly once 254 | [[maybe_unused]] size_t vertexCount = 2 + edgeCount - triangles.size(); // Euler's polyhedron formula 255 | assert(positions.size() == vertexCount); // Double check vertex cache deduplicates vertices as expected 256 | return {"icosphere" + std::to_string(subdivision) + "(" + std::to_string(triangles.size()) + " tris)", triangles, positions}; 257 | } 258 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nv_cluster_builder 2 | 3 | > [!IMPORTANT] 4 | > This repository has been archived and is no longer maintained by NVIDIA. It 5 | > was the result of R&D for ray tracing optimized clusters. Some core algorithm 6 | > concepts have been adopted and further optimized in [meshoptimizer's 7 | > `meshopt_buildMeshletsSpatial()`](https://github.com/zeux/meshoptimizer), to 8 | > which we refer interested developers. 9 | 10 | **nv_cluster_builder** is a small generic spatial clustering C++ library, 11 | created to cluster triangle meshes for ray tracing. It is very similar to a 12 | recursive node splitting algorithm to create a bounding volume hierarchy (BVH). 
13 | It is limited to axis aligned splits but also produces clusters with desirable 14 | attributes for raytracing. 15 | 16 | ![clusters](doc/clusters.svg) 17 | 18 | **Input** 19 | 20 | - Spatial locality 21 | - ${\color{red}\text{Bounding\ boxes}}$ 22 | - ${\color{blue}\text{Centroids}}$ 23 | - ${\color{green}\text{Connectivity}}$ (Optional) 24 | - Adjacency lists 25 | - Weights 26 | 27 | ![input](doc/input.svg) 28 | 29 | **Output** 30 | 31 | Cluster items (membership) 32 | - Ranges: \{ \{ ${\color{blue}0,4}$ \} , \{ ${\color{red}4,4}$ \} \} 33 | - Items: \{ 34 | ${\color{blue}3}$, ${\color{blue}4}$, ${\color{blue}6}$, ${\color{blue}1}$, 35 | ${\color{red}2}$, ${\color{red}7}$, ${\color{red}0}$, ${\color{red}1}$ 36 | \} 37 | 38 | ![output](doc/output.svg) 39 | 40 | **Notable features:** 41 | 42 | - Primarily spatial, making clusters from bounding boxes 43 | - Optional user-defined weighted adjacency 44 | - Generic, not just triangles 45 | - Customizable [min–max] cluster sizes 46 | - Parallel, using std::execution 47 | - Segmented API for clustering multiple subsets at once 48 | - Knobs to balance optimizations 49 | 50 | For a complete usage example, see https://github.com/nvpro-samples/vk_animated_clusters. 51 | 52 | ## Usage Example 53 | 54 | For more details, refer to [`nvcluster.h`](include/nvcluster/nvcluster.h) (and 55 | optionally [`nvcluster_storage.hpp`](include/nvcluster/nvcluster_storage.hpp)). 56 | The [tests](test/src) may also be useful to look through. 57 | 58 | ``` 59 | #include 60 | #include 61 | 62 | ... 63 | 64 | // Create bounding boxes for each item to be clustered 65 | std::vector boundingBoxes{ 66 | nvcluster_AABB{{0, 0, 0}, {1, 1, 1}}, // for example 67 | ... 
68 | }; 69 | 70 | // Generate centroids 71 | std::vector centroids(boundingBoxes.size()); 72 | for(size_t i = 0; i < boundingBoxes.size(); i++) 73 | { 74 | centroids[i] = 0.5f * (glm::vec3(boundingBoxes[i].bboxMin) + glm::vec3(boundingBoxes[i].bboxMax)); 75 | } 76 | 77 | // Input 78 | nvcluster_Input input{.itemBoundingBoxes = reinterpret_cast(boundingBoxes.data()), 79 | .itemCentroids = reinterpret_cast(centroids.data()), 80 | .itemCount = static_cast(boundingBoxes.size())}; 81 | nvcluster_Config config{ 82 | .minClusterSize = 128, 83 | .maxClusterSize = 128, 84 | .costUnderfill = 0.0f, // zero to one (exclusive) 85 | .costOverlap = 0.0f, // zero to one (exclusive) 86 | .preSplitThreshold = 0, // median-split bigger nodes (0=disable) 87 | }; 88 | 89 | // Create context (there's also ScopedContext in test_util.hpp) 90 | nvcluster_ContextCreateInfo info{}; 91 | nvcluster_Context context; 92 | nvclusterCreateContext(&info, &context); // Add error checking 93 | 94 | // Create clusters 95 | // This is a thin wrapper with std::vector storage for nvclusterBuild(...) 96 | nvcluster::ClusterStorage clustering; 97 | nvcluster::generateClusters(context, config, input, clustering); // Add error checking, don't leak context etc. 98 | 99 | // Do something with the result 100 | for(size_t clusterIndex = 0; clusterIndex < clustering.clusterItemRanges.size(); ++clusterIndex) 101 | { 102 | const nvcluster_Range& range = clustering.clusterItemRanges[clusterIndex]; 103 | for(uint32_t clusterItemIndex = 0; clusterItemIndex < range.count; ++clusterItemIndex) 104 | { 105 | uint32_t clusterItem = clustering.items[range.offset + clusterItemIndex]; 106 | ... 107 | } 108 | } 109 | 110 | // If not wrapping the C API, 111 | nvclusterDestroyContext(context); 112 | ``` 113 | 114 | ## Build Integration 115 | 116 | This library uses CMake and requires C++20. It compiles as a static library by 117 | default. Use `-DNVCLUSTER_BUILDER_SHARED=ON` to compile a shared library. 
Data 118 | is passed as structures of arrays and the output must be allocated by the user. 119 | Integration has been verified by directly including it with `add_subdirectory`: 120 | 121 | ``` 122 | add_subdirectory(nv_cluster_builder) 123 | ... 124 | target_link_libraries(my_target PUBLIC nv_cluster_builder) 125 | ``` 126 | 127 | If there is interest, please reach out for CMake config files (for 128 | `find_package()`) or any other features. GitHub issues are welcome. 129 | 130 | ### Dependencies 131 | 132 | Just a C++20 compiler. 133 | 134 | Parallel execution on linux uses `tbb` if available. For ubuntu, `sudo apt install libtbb-dev`. 135 | 136 | If tests are enabled (set the CMake `BUILD_TESTING` variable to `ON`), 137 | nv_cluster_builder will use [`FetchContent`](https://cmake.org/cmake/help/latest/module/FetchContent.html) 138 | to download GoogleTest. 139 | 140 | ## How it works 141 | 142 | Authors and contact: 143 | 144 | - Pyarelal Knowles (pknowles 'at' nvidia.com), NVIDIA 145 | - Karthik Vaidyanathan, NVIDIA 146 | 147 | Cluster goals: 148 | 149 | - Consistent size for batch processing 150 | - Spatially adjacent 151 | - Well connected 152 | - Small bounding box (low SAH cost) 153 | - Low overlap 154 | - Useful for ray tracing 155 | 156 | The algorithm is basic recursive bisection: 157 | 158 | 1. Sorts inputs by centroids on each axis 159 | 2. Initialize with a root node containing everything 160 | 3. Recursively split until the desired leaf size is reached 161 | - Compute candidate split costs for all positions in all axes 162 | - Split at the lowest cost, maintaining sorted centroids by partitioning 163 | 4. 
Leaves become clusters 164 | 165 | Novel additions: 166 | 167 | - Limit split candidates to guarantee fixed cluster sizes 168 | - Optimize for full clusters 169 | - Optimize for less bounding box overlap 170 | - Optimize for minimum *ratio cut* cost if adjacency exists 171 | 172 | The optimizations are implemented by converting and summing additional costs 173 | with the surface area heuristic (SAH) cost and choosing a split position on any 174 | axis with minimum cost. 175 | 176 | ### Fixed Size Clusters 177 | 178 | Only split at $i \bmod C = 0$, with $i$ items to the left of a candidate split, 179 | to make clusters of size $C$. There will be at most one undersized cluster. This 180 | rule alone will largely break SAH, as shown for the clustering along just one 181 | axis. In reality, split candidates would be chosen for any axis 182 | 183 | ![fixed_breaks_sah](doc/fixed_breaks_sah.svg) 184 | 185 | Relax the fixed $C$ constraint to allow a range, $[C_A, C_B]$ where $(1 \le C_A 186 | \le C_B)$. Only split if the target range cluster sizes could be formed on both 187 | sides. For example, the figure below shows forming clusters of size 127 or 128 188 | items. Choosing splits in grey regions will produce clusters in the left node 189 | (top) and right node (bottom) of the desired size range. Limit split candidates 190 | to the intersection of the grey regions. The equivalent conditions are described the equations below, where 191 | $n$ is the number of items in the node being split. 192 | 193 | ![valid_split_positions](doc/valid_split_positions.svg) 194 | 195 | $$𝑖 \bmod 𝐶_𝐴 \le (𝐶_𝐵 − 𝐶_𝐴) \lfloor \frac{𝑖}{𝐶_𝐴} \rfloor$$ 196 | $$(n - 𝑖) \bmod 𝐶_𝐴 \le (𝐶_𝐵 − 𝐶_𝐴) \lfloor \frac{n - 𝑖}{𝐶_𝐴} \rfloor$$ 197 | 198 | For small inputs it is possible that there is no overlap in valid ranges, in 199 | which case the algorithm falls back to choosing just one. Similarly to the fixed 200 | $C_A = C_B$ case, there will be at most one undersized cluster. 
201 | 
202 | ### Maximize Cluster Sizes
203 | 
204 | A cluster "underfill" cost is introduced to encourage bigger clusters. For
205 | example, in the figure below a split position is being considered for clusters
206 | in the range [1, 4]. The split candidate would produce a node of 2.75 clusters
207 | on the left and 1.25 on the right. This results in $p$ missing cluster items.
208 | This value is converted to SAH units and summed. This library currently uses a
209 | linear cost with a tunable `costUnderfill` constant, but a transfer function to
210 | model the true cost, of e.g. perf or memory, would be ideal.
211 | 
212 | ![underfill_cost](doc/underfill_cost.svg)
213 | 
214 | $$p_{\text{left}} = C_B \lceil \frac{i}{C_B} \rceil - i$$
215 | $$p_{\text{right}} = C_B \lceil \frac{n - i}{C_B} \rceil - (n - i)$$
216 | $$p = C_B ( \lceil \frac{i}{C_B} \rceil + \lceil \frac{n - i}{C_B} \rceil ) - n$$
217 | 
218 | ### Minimize Bounding Box Overlap
219 | 
220 | Bounding box overlap is bad for ray tracing because rays must enter both while
221 | in the overlap volume. A cost is added for overlapping bounding boxes, very much
222 | like SAH: it is just $n$ multiplied by the surface area of the bounding box
223 | intersection's box and balanced with a tunable `costOverlap` constant.
224 | 
225 | ### Minimize Adjacency Cut Cost
226 | 
227 | If provided, adjacency is integrated by adding the *cut cost* - the sum of
228 | weights of all item connections broken by the split - to each candidate split
229 | position. *Ratio cut* [Wei and Cheng, 1989] is used to avoid degenerate
230 | solutions. The cut cost is arbitrarily scaled by the number of items in the node
231 | to be SAH relative and added to the other costs above.
232 | 
233 | To compute the cut cost, the adjacency data is rebuilt to reference node items
234 | before each iteration of recursive node splitting. This allows cut costs to be
235 | computed with a prefix sum scan of summed starting and ending connection
236 | weights.
237 | 238 | To explain, there are initially three arrays, sorted by item centroids in *X*, 239 | *Y* and *Z* respectively. After splitting, these arrays are partitioned, 240 | maintaining sorted order within nodes. These hold original item indices and in 241 | fact trivially hold the clustering result after splitting. The input adjacency 242 | arrays index original items, but we instead need the index in those initially 243 | sorted arrays. This is done by duplicating the adjacency arrays and scatter 244 | writing their node-sorted indices. The image below shows this for one axis. 245 | 246 | ![adjacency_sweep](doc/adjacency_sorted.svg) 247 | 248 | When computing cut costs for a node, an array of summed weights is created. The 249 | image below shows an example with unit weights. The array is initialized with 250 | the sum of connecting item weights - positive for connections to the right and 251 | negative for connections to the left. Connections to other nodes are ignored. 252 | The reindexed adjacency arrays trivially give this information, comparing the 253 | connection index with the current item's index and the node boundaries. The 254 | weights array is then prefix summed to obtain the cut cost for each position in 255 | the node. 256 | 257 | ![adjacency_sweep](doc/adjacency_sweep.svg) 258 | 259 | ### Citation 260 | 261 | The BibTex entry to cite `nv_cluster_builder` is 262 | 263 | ```bibtex 264 | @online{nv_cluster_builder, 265 | title = {{{NVIDIA}}\textregistered{} {nv_cluster_builder}}, 266 | author = {{NVIDIA}}, 267 | year = 2025, 268 | url = {https://github.com/nvpro-samples/nv_cluster_builder}, 269 | urldate = {2025-01-30}, 270 | } 271 | ``` 272 | 273 | ## Limitations 274 | 275 | Clusters are created by making recursive axis aligned splits. This is useful as 276 | it greatly reduces the search space and improves performance when clusters are 277 | used in ray tracing. 
However, more general clustering solutions than 278 | axis aligned splits are not considered. 279 | 280 | Recursively splitting is done greedily, picking the lowest cost split which may 281 | not be a global optimum. 282 | 283 | The algorithm is primarily spatial due to splitting in order of centroids, but 284 | solutions can be skewed by adjusting the costs in `nvcluster::Config` and 285 | adjacency weights in `nvcluster::Graph::connectionWeights`. For example, choosing 286 | adjacency weights to represent connected triangles or number of shared vertices 287 | can result in more vertex reuse within clusters. Weights may also represent face 288 | normal similarity or a balance of multiple attributes. 289 | 290 | Badly chosen weights can result in degenerate solutions where recursive 291 | bisection splits off single leaves. This is both slow and rarely desirable. 292 | 293 | Parallel execution is only supported with libstdc++ and MSVC STL, not libc++. 294 | -------------------------------------------------------------------------------- /doc/valid_split_positions.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | x126 47 | 48 | x126 49 | 50 | x126 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | x126 64 | 65 | 66 | 67 | 68 | 69 | x126 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | x126 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 1 132 | 2 133 | 3 134 | 4 135 | 136 | 137 | 138 | Left valid ranges 139 | Candidates 140 | Right valid ranges 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 
149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /src/connections.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | namespace nvcluster { 29 | 30 | // Initial reservation for item connections 31 | static constexpr uint32_t AVERAGE_ADJACENCY_GUESS = 12; 32 | 33 | // Sorted map small vector size 34 | static constexpr uint32_t SMALL_VECTOR_SIZE = 32; 35 | 36 | // Switch to indirect indexing if vertex count is this much more than the item 37 | // count 38 | static constexpr uint32_t INDIRECT_INDEXING_VERTEX_RATIO_THRESHOLD = 5; 39 | 40 | // Incomplete small vector implementation, just for use in SortedMap. Only 41 | // beneficial on Windows. 42 | template 43 | class SmallVector 44 | { 45 | public: 46 | T* begin() { return isStack() ? stackData() : m_heapData.data(); } 47 | T* end() { return begin() + m_size; } 48 | const T* begin() const { return isStack() ? 
stackData() : m_heapData.data(); } 49 | const T* end() const { return begin() + m_size; } 50 | size_t size() const { return m_size; } 51 | T* insert(T* pos, const T& value) 52 | { 53 | if(isStack()) 54 | { 55 | assert(pos >= stackData() && pos <= stackData() + m_size); 56 | if(m_size == N) 57 | { 58 | m_heapData.reserve(N * 2); 59 | m_heapData.insert(m_heapData.end(), stackData(), stackData() + N); 60 | pos = m_heapData.data() + (pos - stackData()); 61 | } 62 | else 63 | { 64 | std::copy_backward(pos, stackData() + m_size, stackData() + m_size + 1); 65 | *pos = value; 66 | ++m_size; 67 | return pos; 68 | } 69 | } 70 | assert(pos >= m_heapData.data() && pos <= m_heapData.data() + m_size); 71 | pos = &*m_heapData.insert(m_heapData.begin() + (pos - m_heapData.data()), value); 72 | ++m_size; 73 | return pos; 74 | } 75 | void erase(T* pos) 76 | { 77 | if(isStack()) 78 | { 79 | assert(pos >= stackData() && pos < stackData() + m_size); 80 | std::copy(pos + 1, stackData() + m_size, pos); 81 | --m_size; 82 | } 83 | else 84 | { 85 | assert(pos >= m_heapData.data() && pos < m_heapData.data() + m_size); 86 | m_heapData.erase(m_heapData.begin() + (pos - m_heapData.data())); 87 | --m_size; 88 | assert(m_size == m_heapData.size()); 89 | } 90 | } 91 | 92 | private: 93 | bool isStack() const { return m_heapData.empty(); } 94 | 95 | T* stackData() { return reinterpret_cast(&m_stackData); } 96 | const T* stackData() const { return reinterpret_cast(&m_stackData); } 97 | 98 | alignas(std::array) std::byte m_stackData[sizeof(std::array)] = {}; 99 | 100 | std::vector m_heapData; 101 | size_t m_size = 0; 102 | }; 103 | 104 | 105 | // A sorted vector used to map a small number of items. E.g. computing a list of 106 | // all adjacent triangles. 
107 | template 108 | class SortedMap 109 | { 110 | public: 111 | Value& operator[](const Key& key) 112 | { 113 | auto it = std::ranges::lower_bound(m_data, key, {}, &std::pair::first); 114 | if(it == m_data.end() || it->first != key) 115 | { 116 | it = m_data.insert(it, {key, Value()}); 117 | } 118 | return it->second; 119 | } 120 | auto begin() const { return m_data.begin(); } 121 | auto end() const { return m_data.end(); } 122 | auto size() const { return m_data.size(); } 123 | void erase(const Key& key) 124 | { 125 | auto it = std::ranges::lower_bound(m_data, key, {}, &std::pair::first); 126 | if(it != m_data.end() && it->first == key) 127 | { 128 | m_data.erase(it); 129 | } 130 | } 131 | 132 | private: 133 | SmallVector, SMALL_VECTOR_SIZE> m_data; 134 | }; 135 | 136 | // Utility to compute vertex to item back references. VertexIndirection is 137 | // useful if items only reference a few vertices. 138 | template 139 | struct VertexConnections 140 | { 141 | template 142 | VertexConnections(ParallelizeType&&, ItemVertices itemVertices, uint32_t vertexCount) 143 | { 144 | constexpr bool Parallelize = ParallelizeType::value; 145 | 146 | // Compute vertex indirection - a map of unique vertex indices 147 | // TODO: Parallelize? concurrent map or per-thread dedupe and reduce 148 | if constexpr(VertexIndirection) 149 | { 150 | vertexIndirection.reserve(itemVertices.itemCount() * 2); 151 | for(size_t i = 0; i < itemVertices.itemCount(); ++i) 152 | { 153 | for(uint32_t vertexIndex : itemVertices.vertices(i)) 154 | vertexIndirection.try_emplace(vertexIndex, uint32_t(vertexIndirection.size())); 155 | } 156 | } 157 | 158 | size_t indirectSize = VertexIndirection ? 
vertexIndirection.size() : size_t(vertexCount); 159 | 160 | // Compute range sizes 161 | vertexItemCounts = std::vector(indirectSize, 0U); 162 | parallel_batches(itemVertices.itemCount(), [&](size_t itemIndex) { 163 | for(uint32_t vertexIndex : itemVertices.vertices(itemIndex)) 164 | { 165 | if constexpr(VertexIndirection) 166 | vertexIndex = vertexIndirection.at(vertexIndex); 167 | std::atomic_ref(vertexItemCounts[vertexIndex])++; 168 | } 169 | }); 170 | 171 | // Compute range offsets 172 | vertexItemOffsets = std::vector(indirectSize); 173 | std::exclusive_scan(exec, vertexItemCounts.begin(), vertexItemCounts.end(), vertexItemOffsets.begin(), 0U); 174 | uint32_t totalVertexItems = vertexItemOffsets.back() + vertexItemCounts.back(); 175 | 176 | // Compute vertexItems by scatter writing to vertex ranges of each item 177 | std::ranges::fill(vertexItemCounts, 0u); 178 | vertexItems = std::vector(totalVertexItems); 179 | parallel_batches(itemVertices.itemCount(), [&](size_t itemIndex) { 180 | for(uint32_t vertexIndex : itemVertices.vertices(itemIndex)) 181 | { 182 | if constexpr(VertexIndirection) 183 | vertexIndex = vertexIndirection.at(vertexIndex); 184 | vertexItems[vertexItemOffsets[vertexIndex] + std::atomic_ref(vertexItemCounts[vertexIndex])++] = uint32_t(itemIndex); 185 | } 186 | }); 187 | } 188 | 189 | // Return a list of items connected to a vertex 190 | std::span items(uint32_t vertexIndex) const 191 | { 192 | if constexpr(VertexIndirection) 193 | vertexIndex = vertexIndirection.at(vertexIndex); 194 | return std::span(vertexItems).subspan(vertexItemOffsets[vertexIndex], vertexItemCounts[vertexIndex]); 195 | }; 196 | 197 | // Compute items that each item connects to. 
It returns a 198 | // 'map[otherItem] = vertexBits' 199 | SortedMap itemConnectionVertexBits(uint32_t itemIndex, std::span vertices) const 200 | { 201 | // Create output 202 | SortedMap connections; 203 | 204 | // Scatter write connections and unique vertex bits (unique to just this item) 205 | for(size_t i = 0; i < vertices.size(); ++i) 206 | { 207 | for(uint32_t other : items(vertices[i])) 208 | connections[other] |= uint8_t(1) << i; 209 | } 210 | 211 | // Remove self 212 | connections.erase(itemIndex); 213 | return connections; 214 | } 215 | 216 | // Map of unique vertex indices to indices in counts and offsets arrays 217 | std::unordered_map vertexIndirection; 218 | 219 | std::vector vertexItemCounts; // aka. vertex valence 220 | std::vector vertexItemOffsets; // prefix sum of counts 221 | std::vector vertexItems; // linearized ranges of items (e.g. triangles) 222 | }; 223 | 224 | template 225 | MeshConnections makeMeshConnectionsParallel(ItemVertices itemVertices, uint32_t vertexCount) 226 | { 227 | // Compute lists of items sharing each vertex 228 | VertexConnections vertexItems(std::true_type{}, itemVertices, vertexCount); 229 | 230 | // Build ranges of the results of itemConnectionVertexBits() 231 | // There's a few ways to linearize in parallel: 232 | // 1. Compute and count each range, allocate total, recompute and fill 233 | // (computes twice) 234 | // 2. Compute and store each range in the heap, allocate total, fill (uses 235 | // more memory) 236 | // 3. Compute each range, hold a lock to allocate, fill directly (holds 237 | // locks, reallocates, computes once, reduced memory) 238 | // We'll go with option 1. 
*shrug* 239 | // TODO: combine counts/offsets into result ranges or split result ranges 240 | std::vector connectionItemCounts(itemVertices.itemCount(), 0); 241 | std::vector connectionItemOffsets(itemVertices.itemCount()); 242 | parallel_batches(itemVertices.itemCount(), [&](size_t itemIndex) { 243 | std::span vertices = itemVertices.vertices(itemIndex); 244 | SortedMap adjacency = vertexItems.itemConnectionVertexBits(uint32_t(itemIndex), vertices); 245 | connectionItemCounts[itemIndex] = uint32_t(adjacency.size()); 246 | }); 247 | std::exclusive_scan(exec, connectionItemCounts.begin(), connectionItemCounts.end(), connectionItemOffsets.begin(), 0U); 248 | 249 | MeshConnections result; 250 | result.connectionRanges.resize(itemVertices.itemCount()); 251 | result.connectionItems.resize(connectionItemOffsets.back() + connectionItemCounts.back()); 252 | result.connectionVertexBits.resize(connectionItemOffsets.back() + connectionItemCounts.back()); 253 | parallel_batches(itemVertices.itemCount(), [&](size_t itemIndex) { 254 | std::span vertices = itemVertices.vertices(itemIndex); 255 | SortedMap adjacency = vertexItems.itemConnectionVertexBits(uint32_t(itemIndex), vertices); 256 | Range range = {connectionItemOffsets[itemIndex], 0}; 257 | for(auto [item, bits] : adjacency) 258 | { 259 | result.connectionItems[range.offset + range.count] = item; 260 | result.connectionVertexBits[range.offset + range.count] = bits; 261 | range.count++; 262 | } 263 | result.connectionRanges[itemIndex] = range; 264 | }); 265 | return result; 266 | } 267 | 268 | // Faster specialization for single-threaded execution that does not compute 269 | // adjacency twice. 
270 | template 271 | MeshConnections makeMeshConnectionsSequential(ItemVertices itemVertices, uint32_t vertexCount) 272 | { 273 | // Compute lists of items sharing each vertex 274 | VertexConnections vertexItems(std::false_type{}, itemVertices, vertexCount); 275 | 276 | // Build ranges of the results of itemConnectionVertexBits() 277 | MeshConnections result; 278 | result.connectionRanges.resize(itemVertices.itemCount()); 279 | result.connectionItems.reserve(itemVertices.itemCount() * AVERAGE_ADJACENCY_GUESS); 280 | result.connectionVertexBits.reserve(itemVertices.itemCount() * AVERAGE_ADJACENCY_GUESS); 281 | for(size_t itemIndex = 0; itemIndex < itemVertices.itemCount(); ++itemIndex) 282 | { 283 | std::span vertices = itemVertices.vertices(itemIndex); 284 | SortedMap adjacency = vertexItems.itemConnectionVertexBits(uint32_t(itemIndex), vertices); 285 | result.connectionRanges[itemIndex] = {uint32_t(result.connectionItems.size()), uint32_t(adjacency.size())}; 286 | for(auto [item, bits] : adjacency) 287 | { 288 | result.connectionItems.push_back(item); 289 | result.connectionVertexBits.push_back(bits); 290 | } 291 | } 292 | return result; 293 | } 294 | 295 | // Switch to vertex indirection if vertex count is much more than the item count 296 | template 297 | MeshConnections makeMeshConnections(ItemVertices itemVertices, uint32_t vertexCount) 298 | { 299 | if(vertexCount > itemVertices.itemCount() * itemVertices.itemVertexCount() * INDIRECT_INDEXING_VERTEX_RATIO_THRESHOLD) 300 | { 301 | if constexpr(Parallelize) 302 | return makeMeshConnectionsParallel(itemVertices, vertexCount); 303 | else 304 | return makeMeshConnectionsSequential(itemVertices, vertexCount); 305 | } 306 | else 307 | { 308 | if constexpr(Parallelize) 309 | return makeMeshConnectionsParallel(itemVertices, vertexCount); 310 | else 311 | return makeMeshConnectionsSequential(itemVertices, vertexCount); 312 | } 313 | } 314 | 315 | // Expand dynamic parallel flag to compile time permutations 316 | 
MeshConnections makeMeshConnections(bool parallelize, ItemVertices itemVertices, uint32_t vertexCount) 317 | { 318 | #if !defined(NVCLUSTER_MULTITHREADED) || NVCLUSTER_MULTITHREADED 319 | return parallelize ? makeMeshConnections(itemVertices, vertexCount) : makeMeshConnections(itemVertices, vertexCount); 320 | #else 321 | (void)parallelize; 322 | return makeMeshConnections(itemVertices, vertexCount); 323 | #endif 324 | } 325 | 326 | } // namespace nvcluster 327 | -------------------------------------------------------------------------------- /src/nvcluster.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #if !defined(NVCLUSTER_BUILDER_COMPILING) 25 | #error NVCLUSTER_BUILDER_COMPILING must be defined when building the library 26 | #endif 27 | 28 | struct nvcluster_Context_t 29 | { 30 | uint32_t version = 0u; 31 | nvcluster_Bool parallelize = NVCLUSTER_FALSE; 32 | }; 33 | 34 | const char* nvclusterResultString(nvcluster_Result result) 35 | { 36 | // clang-format off 37 | switch(result){ 38 | case NVCLUSTER_SUCCESS: return "SUCCESS"; 39 | case NVCLUSTER_ERROR_CONTEXT_VERSION_MISMATCH: return "NVCLUSTER_ERROR_CONTEXT_VERSION_MISMATCH"; 40 | case NVCLUSTER_ERROR_INVALID_CONFIG_CLUSTER_SIZES: return "NVCLUSTER_ERROR_INVALID_CONFIG_CLUSTER_SIZES"; 41 | case NVCLUSTER_ERROR_MISSING_SPATIAL_BOUNDING_BOXES: return "NVCLUSTER_ERROR_MISSING_SPATIAL_BOUNDING_BOXES"; 42 | case NVCLUSTER_ERROR_MISSING_SPATIAL_CENTROIDS: return "NVCLUSTER_ERROR_MISSING_SPATIAL_CENTROIDS"; 43 | case NVCLUSTER_ERROR_INVALID_OUTPUT_ITEM_INDICES_SIZE: return "NVCLUSTER_ERROR_INVALID_OUTPUT_ITEM_INDICES_SIZE"; 44 | case NVCLUSTER_ERROR_SPATIAL_AND_CONNECTIONS_ITEM_COUNT_MISMATCH: return "NVCLUSTER_ERROR_SPATIAL_AND_CONNECTIONS_ITEM_COUNT_MISMATCH"; 45 | case NVCLUSTER_ERROR_SEGMENT_AND_ITEM_COUNT_CONTRADICTION: return "NVCLUSTER_ERROR_SEGMENT_AND_ITEM_COUNT_CONTRADICTION"; 46 | case NVCLUSTER_ERROR_SEGMENT_COUNT_MISMATCH: return "NVCLUSTER_ERROR_SEGMENT_COUNT_MISMATCH"; 47 | case NVCLUSTER_ERROR_MAX_CLUSTER_VERTICES_WITHOUT_CONNECTION_BITS: return "NVCLUSTER_ERROR_MAX_CLUSTER_VERTICES_WITHOUT_CONNECTION_BITS"; 48 | case NVCLUSTER_ERROR_MAX_VERTICES_LESS_THAN_ITEM_VERTICES: return "NVCLUSTER_ERROR_MAX_VERTICES_LESS_THAN_ITEM_VERTICES"; 49 | case NVCLUSTER_ERROR_NO_CONNECTION_ATTRIBUTES: return "NVCLUSTER_ERROR_NO_CONNECTION_ATTRIBUTES"; 50 | case NVCLUSTER_ERROR_ITEM_VERTEX_COUNT_OVERFLOW: return 
"NVCLUSTER_ERROR_ITEM_VERTEX_COUNT_OVERFLOW"; 51 | case NVCLUSTER_ERROR_BOTH_CONNECTIONS_AND_VERTICES_PROVIDED: return "NVCLUSTER_ERROR_BOTH_CONNECTIONS_AND_VERTICES_PROVIDED"; 52 | case NVCLUSTER_ERROR_BOTH_CONNECTIONS_AND_VERTEX_COUNT_PROVIDED: return "NVCLUSTER_ERROR_BOTH_CONNECTIONS_AND_VERTEX_COUNT_PROVIDED"; 53 | case NVCLUSTER_ERROR_ITEM_VERTICES_WITHOUT_PER_ITEM_VERTEX_COUNT: return "NVCLUSTER_ERROR_ITEM_VERTICES_WITHOUT_PER_ITEM_VERTEX_COUNT"; 54 | case NVCLUSTER_ERROR_ITEM_VERTICES_WITHOUT_VERTEX_COUNT: return "NVCLUSTER_ERROR_ITEM_VERTICES_WITHOUT_VERTEX_COUNT"; 55 | case NVCLUSTER_ERROR_NULL_INPUT: return "NVCLUSTER_ERROR_NULL_INPUT"; 56 | case NVCLUSTER_ERROR_NULL_CONTEXT: return "NVCLUSTER_ERROR_NULL_CONTEXT"; 57 | case NVCLUSTER_ERROR_NULL_OUTPUT: return "NVCLUSTER_ERROR_NULL_OUTPUT"; 58 | case NVCLUSTER_ERROR_INTERNAL_MULTIPLE_UNDERFLOW: return "NVCLUSTER_ERROR_INTERNAL_MULTIPLE_UNDERFLOW"; 59 | default: return ""; 60 | } 61 | // clang-format on 62 | } 63 | 64 | uint32_t nvclusterVersion(void) 65 | { 66 | return NVCLUSTER_VERSION; 67 | } 68 | 69 | nvcluster_Result nvclusterCreateContext(const nvcluster_ContextCreateInfo* createInfo, nvcluster_Context* context) 70 | { 71 | if(createInfo == nullptr) 72 | { 73 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_INPUT; 74 | } 75 | if(context == nullptr) 76 | { 77 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_CONTEXT; 78 | } 79 | if(createInfo->version != NVCLUSTER_VERSION) 80 | { 81 | return nvcluster_Result::NVCLUSTER_ERROR_CONTEXT_VERSION_MISMATCH; 82 | } 83 | 84 | *context = new nvcluster_Context_t{ 85 | .version = createInfo->version, 86 | .parallelize = createInfo->parallelize, 87 | }; 88 | 89 | return nvcluster_Result::NVCLUSTER_SUCCESS; 90 | } 91 | 92 | nvcluster_Result nvclusterDestroyContext(nvcluster_Context context) 93 | { 94 | if(context == nullptr) 95 | { 96 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_CONTEXT; 97 | } 98 | 99 | delete context; 100 | 101 | return 
nvcluster_Result::NVCLUSTER_SUCCESS; 102 | } 103 | 104 | nvcluster_Result nvclusterGetRequirements(nvcluster_Context context, const nvcluster_Config* config, uint32_t itemCount, nvcluster_Counts* outputRequiredCounts) 105 | { 106 | if(context == nullptr) 107 | { 108 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_CONTEXT; 109 | } 110 | if(config == nullptr) 111 | { 112 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_INPUT; 113 | } 114 | if(outputRequiredCounts == nullptr) 115 | { 116 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_OUTPUT; 117 | } 118 | if(config->minClusterSize == 0 || config->maxClusterSize == 0 || config->minClusterSize > config->maxClusterSize) 119 | { 120 | return nvcluster_Result::NVCLUSTER_ERROR_INVALID_CONFIG_CLUSTER_SIZES; 121 | } 122 | 123 | const size_t n = itemCount; 124 | const size_t Ca = config->minClusterSize; 125 | 126 | // Pre-splitting ignores alignment and each can introduce an extra 127 | // under-filled cluster 128 | const size_t P = config->preSplitThreshold; 129 | const size_t preSplitClusters = P == 0 ? 0 : (n + P - 1) / P; 130 | 131 | size_t maxClusters = std::min(n, (n + Ca - 1u) / Ca + preSplitClusters); 132 | 133 | // Check if maxClusterVertices has been set/is not the default 134 | if(config->maxClusterVertices != 0u && config->maxClusterVertices != ~0u) 135 | { 136 | if(config->maxClusterVertices < config->itemVertexCount) 137 | { 138 | return nvcluster_Result::NVCLUSTER_ERROR_MAX_VERTICES_LESS_THAN_ITEM_VERTICES; 139 | } 140 | 141 | // Worst case, every item is disconnected, forming clusters of size 142 | // (maxClusterVertices / itemVertexCount). That number of clusters would be 143 | // doubled if we only split overflowing clusters exactly in half, but SAH 144 | // does not guarantee this and in fact may form single item clusters. While 145 | // uncommon, it is safer to just return 'n'. 
146 | maxClusters = n; 147 | } 148 | 149 | *outputRequiredCounts = nvcluster_Counts{ 150 | .clusterCount = uint32_t(maxClusters), 151 | }; 152 | 153 | return nvcluster_Result::NVCLUSTER_SUCCESS; 154 | } 155 | 156 | inline nvcluster_Result buildMaybeWithConnections(nvcluster_Context context, 157 | const nvcluster_Config* config, 158 | const nvcluster_Input* input, 159 | nvcluster_OutputClusters* outputClusters, 160 | const nvcluster_Segments* segments = nullptr, 161 | nvcluster_Range* segmentClusterRanges = nullptr) 162 | { 163 | if(input->itemCount && !input->itemBoundingBoxes) 164 | { 165 | return nvcluster_Result::NVCLUSTER_ERROR_MISSING_SPATIAL_BOUNDING_BOXES; 166 | } 167 | if(input->itemCount && !input->itemCentroids) 168 | { 169 | return nvcluster_Result::NVCLUSTER_ERROR_MISSING_SPATIAL_CENTROIDS; 170 | } 171 | 172 | // API permutation consistency checks 173 | if(input->itemVertices) 174 | { 175 | // The user is passing in vertices, which implies connections are to be 176 | // generated. If there are any manually provided connections, it's probably 177 | // a mistake. 178 | if(input->itemConnectionRanges || input->connectionTargetItems || input->connectionWeights || input->connectionVertexBits) 179 | { 180 | return nvcluster_Result::NVCLUSTER_ERROR_BOTH_CONNECTIONS_AND_VERTICES_PROVIDED; 181 | } 182 | 183 | if(config->itemVertexCount == 0) 184 | { 185 | return nvcluster_Result::NVCLUSTER_ERROR_ITEM_VERTICES_WITHOUT_PER_ITEM_VERTEX_COUNT; 186 | } 187 | 188 | if(input->vertexCount == 0) 189 | { 190 | return nvcluster_Result::NVCLUSTER_ERROR_ITEM_VERTICES_WITHOUT_VERTEX_COUNT; 191 | } 192 | } 193 | else 194 | { 195 | // If the user provided connections, itemVertices and vertexCount should not 196 | // be set. 
197 | if(input->itemConnectionRanges || input->connectionTargetItems || input->connectionWeights || input->connectionVertexBits) 198 | { 199 | if(input->vertexCount) 200 | { 201 | return nvcluster_Result::NVCLUSTER_ERROR_BOTH_CONNECTIONS_AND_VERTEX_COUNT_PROVIDED; 202 | } 203 | } 204 | 205 | // If the user wants a vertex limit, they must provide either 206 | // connectionVertexBits or itemVertices (to generate connectionVertexBits). 207 | if(config->maxClusterVertices != 0u && config->maxClusterVertices != ~0u && input->connectionVertexBits != nullptr) 208 | { 209 | return nvcluster_Result::NVCLUSTER_ERROR_MAX_CLUSTER_VERTICES_WITHOUT_CONNECTION_BITS; 210 | } 211 | 212 | // Should have weights and/or vertex bits if connections are provided 213 | if(input->connectionCount > 0u && input->connectionWeights == nullptr && input->connectionVertexBits == nullptr) 214 | { 215 | return nvcluster_Result::NVCLUSTER_ERROR_NO_CONNECTION_ATTRIBUTES; 216 | } 217 | } 218 | 219 | nvcluster_Range singleSegmentRange{0, input->itemCount}; 220 | nvcluster_Segments singleSegment{ 221 | .segmentItemRanges = &singleSegmentRange, 222 | .segmentCount = 1, 223 | }; 224 | nvcluster_Range outputSegmentIgnored; 225 | if(segments == nullptr) 226 | { 227 | assert(segmentClusterRanges == nullptr); 228 | segments = &singleSegment; 229 | segmentClusterRanges = &outputSegmentIgnored; 230 | } 231 | 232 | // Skip computing connections if the item limit makes the vertex limit 233 | // redundant. 
234 | bool skipVertexLimit = input->itemVertices && config->maxClusterSize * config->itemVertexCount <= config->maxClusterVertices; 235 | if(input->itemVertices && !skipVertexLimit) 236 | { 237 | nvcluster::MeshConnections meshConnections = nvcluster::makeMeshConnections(context->parallelize, *config, *input); 238 | return nvcluster::clusterize(context->parallelize, nvcluster::Input(*config, *input, *segments, meshConnections), 239 | nvcluster::OutputClusters(*outputClusters, segmentClusterRanges, segments->segmentCount)); 240 | } 241 | else 242 | { 243 | // Translate 0u to imply no limit and disable the vertex limit if 244 | // skipVertexLimit is set. 245 | nvcluster_Config configCopy = *config; 246 | if(config->maxClusterVertices == 0u || skipVertexLimit) 247 | { 248 | configCopy.maxClusterVertices = ~0u; 249 | config = &configCopy; 250 | } 251 | return nvcluster::clusterize(context->parallelize, nvcluster::Input(*config, *input, *segments), 252 | nvcluster::OutputClusters(*outputClusters, segmentClusterRanges, segments->segmentCount)); 253 | } 254 | } 255 | 256 | nvcluster_Result nvclusterBuild(nvcluster_Context context, const nvcluster_Config* config, const nvcluster_Input* input, nvcluster_OutputClusters* outputClusters) 257 | { 258 | if(context == nullptr) 259 | { 260 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_CONTEXT; 261 | } 262 | if(input == nullptr) 263 | { 264 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_INPUT; 265 | } 266 | if(outputClusters == nullptr) 267 | { 268 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_OUTPUT; 269 | } 270 | 271 | return buildMaybeWithConnections(context, config, input, outputClusters); 272 | } 273 | 274 | nvcluster_Result nvclusterGetRequirementsSegmented(nvcluster_Context context, 275 | const nvcluster_Config* config, 276 | uint32_t itemCount, 277 | const nvcluster_Segments* segments, 278 | nvcluster_Counts* outputRequiredCounts) 279 | { 280 | if(context == nullptr) 281 | { 282 | return 
nvcluster_Result::NVCLUSTER_ERROR_NULL_CONTEXT; 283 | } 284 | if(config == nullptr) 285 | { 286 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_INPUT; 287 | } 288 | if(outputRequiredCounts == nullptr) 289 | { 290 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_OUTPUT; 291 | } 292 | 293 | uint32_t outputClusterCount = 0u; 294 | for(uint32_t itemSegmentIndex = 0; itemSegmentIndex < segments->segmentCount; itemSegmentIndex++) 295 | { 296 | const nvcluster_Range& segmentItemRange = segments->segmentItemRanges[itemSegmentIndex]; 297 | if(itemCount < segmentItemRange.offset + segmentItemRange.count) 298 | { 299 | return nvcluster_Result::NVCLUSTER_ERROR_SEGMENT_AND_ITEM_COUNT_CONTRADICTION; 300 | } 301 | nvcluster_Counts segmentResult{}; 302 | nvcluster_Result res = nvclusterGetRequirements(context, config, segmentItemRange.count, &segmentResult); 303 | if(res != nvcluster_Result::NVCLUSTER_SUCCESS) 304 | { 305 | return res; 306 | } 307 | outputClusterCount += segmentResult.clusterCount; 308 | } 309 | *outputRequiredCounts = nvcluster_Counts{ 310 | .clusterCount = outputClusterCount, 311 | }; 312 | 313 | return nvcluster_Result::NVCLUSTER_SUCCESS; 314 | } 315 | 316 | // TODO: this is a naive implementation with no parallelism across segments. The 317 | // internal implementation is already parallel and should eventually be able to 318 | // handle clustering multiple ranges at the same time. 
319 | nvcluster_Result nvclusterBuildSegmented(nvcluster_Context context, 320 | const nvcluster_Config* config, 321 | const nvcluster_Input* input, 322 | const nvcluster_Segments* segments, 323 | nvcluster_OutputClusters* outputClusters, 324 | nvcluster_Range* segmentClusterRanges) 325 | { 326 | if(context == nullptr) 327 | { 328 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_CONTEXT; 329 | } 330 | if(config == nullptr || input == nullptr || segments == nullptr) 331 | { 332 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_INPUT; 333 | } 334 | if(outputClusters == nullptr) 335 | { 336 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_OUTPUT; 337 | } 338 | if(segments->segmentCount && segmentClusterRanges == nullptr) 339 | { 340 | return nvcluster_Result::NVCLUSTER_ERROR_NULL_OUTPUT; 341 | } 342 | return buildMaybeWithConnections(context, config, input, outputClusters, segments, segmentClusterRanges); 343 | } 344 | -------------------------------------------------------------------------------- /include/nvcluster/nvcluster.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION 17 | * SPDX-License-Identifier: Apache-2.0 18 | */ 19 | 20 | #ifndef NVCLUSTER_CLUSTERS_H 21 | #define NVCLUSTER_CLUSTERS_H 22 | 23 | #define NVCLUSTER_VERSION 2 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #if defined(NVCLUSTER_BUILDER_SHARED) 34 | #if defined(_MSC_VER) 35 | // msvc 36 | #if defined(NVCLUSTER_BUILDER_COMPILING) 37 | #define NVCLUSTER_API __declspec(dllexport) 38 | #else 39 | #define NVCLUSTER_API __declspec(dllimport) 40 | #endif 41 | #elif defined(__GNUC__) 42 | // gcc/clang 43 | #define NVCLUSTER_API __attribute__((visibility("default"))) 44 | #else 45 | // Unsupported. If hit, use cmake GenerateExportHeader 46 | #pragma warning Unsupported compiler 47 | #define NVCLUSTER_API 48 | #endif 49 | #else // defined(NVCLUSTER_BUILDER_SHARED) 50 | // static lib, no export needed 51 | #define NVCLUSTER_API 52 | #endif 53 | 54 | #ifdef __cplusplus 55 | #define NVCLUSTER_DEFAULT(x) = x 56 | #else 57 | #define NVCLUSTER_DEFAULT(x) 58 | #endif 59 | 60 | // Binary-stable bool for C 61 | typedef uint8_t nvcluster_Bool; 62 | #define NVCLUSTER_TRUE (nvcluster_Bool)1u 63 | #define NVCLUSTER_FALSE (nvcluster_Bool)0u 64 | 65 | typedef enum nvcluster_Result 66 | { 67 | NVCLUSTER_SUCCESS, 68 | NVCLUSTER_ERROR_CONTEXT_VERSION_MISMATCH, 69 | NVCLUSTER_ERROR_INVALID_CONFIG_CLUSTER_SIZES, 70 | NVCLUSTER_ERROR_MISSING_SPATIAL_BOUNDING_BOXES, 71 | NVCLUSTER_ERROR_MISSING_SPATIAL_CENTROIDS, 72 | NVCLUSTER_ERROR_INVALID_OUTPUT_ITEM_INDICES_SIZE, 73 | NVCLUSTER_ERROR_SPATIAL_AND_CONNECTIONS_ITEM_COUNT_MISMATCH, 74 | NVCLUSTER_ERROR_SEGMENT_AND_ITEM_COUNT_CONTRADICTION, 75 | NVCLUSTER_ERROR_SEGMENT_COUNT_MISMATCH, 76 | NVCLUSTER_ERROR_MAX_CLUSTER_VERTICES_WITHOUT_CONNECTION_BITS, 77 | NVCLUSTER_ERROR_MAX_VERTICES_LESS_THAN_ITEM_VERTICES, 78 | NVCLUSTER_ERROR_NO_CONNECTION_ATTRIBUTES, 79 | 
NVCLUSTER_ERROR_ITEM_VERTEX_COUNT_OVERFLOW, 80 | NVCLUSTER_ERROR_BOTH_CONNECTIONS_AND_VERTICES_PROVIDED, 81 | NVCLUSTER_ERROR_BOTH_CONNECTIONS_AND_VERTEX_COUNT_PROVIDED, 82 | NVCLUSTER_ERROR_ITEM_VERTICES_WITHOUT_PER_ITEM_VERTEX_COUNT, 83 | NVCLUSTER_ERROR_ITEM_VERTICES_WITHOUT_VERTEX_COUNT, 84 | NVCLUSTER_ERROR_NULL_INPUT, 85 | NVCLUSTER_ERROR_NULL_CONTEXT, 86 | NVCLUSTER_ERROR_NULL_OUTPUT, 87 | 88 | // These likely indicate a bug with the library 89 | NVCLUSTER_ERROR_INTERNAL_MULTIPLE_UNDERFLOW, 90 | } nvcluster_Result; 91 | 92 | typedef struct nvcluster_Vec3f 93 | { 94 | float x NVCLUSTER_DEFAULT(0.0f); 95 | float y NVCLUSTER_DEFAULT(0.0f); 96 | float z NVCLUSTER_DEFAULT(0.0f); 97 | } nvcluster_Vec3f; 98 | 99 | #define nvcluster_defaultVec3f() {0.0f, 0.0f, 0.0f} 100 | 101 | // Axis aligned bounding box 102 | typedef struct nvcluster_AABB 103 | { 104 | #ifdef __cplusplus 105 | float bboxMin[3] = {FLT_MAX, FLT_MAX, FLT_MAX}; 106 | float bboxMax[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX}; 107 | #else 108 | float bboxMin[3]; 109 | float bboxMax[3]; 110 | #endif 111 | } nvcluster_AABB; 112 | 113 | // clang-format off 114 | #define nvcluster_defaultAABB() {{FLT_MAX, FLT_MAX, FLT_MAX}, {-FLT_MAX, -FLT_MAX, -FLT_MAX}} 115 | // clang-format on 116 | 117 | // An index/cursor based subrange 118 | typedef struct nvcluster_Range 119 | { 120 | uint32_t offset NVCLUSTER_DEFAULT(0u); 121 | uint32_t count NVCLUSTER_DEFAULT(0u); 122 | } nvcluster_Range; 123 | 124 | #define nvcluster_defaultRange() {0u, 0u} 125 | 126 | // Clustering knobs, including cost balancing 127 | typedef struct nvcluster_Config 128 | { 129 | // Minimum number of items per output cluster. 130 | // Ignored when maxClusterVertices is set. 
131 | uint32_t minClusterSize NVCLUSTER_DEFAULT(1u); 132 | 133 | // Maximum number of items per output cluster 134 | uint32_t maxClusterSize NVCLUSTER_DEFAULT(~0u); 135 | 136 | // Maximum number of unique vertices per output cluster, if 137 | // connectionVertexBits or itemVertices is not null. Setting a 138 | // maxClusterVertices value causes minClusterSize to be ignored. 139 | // A value of 0u may also be used to disable the vertex limit. 140 | uint32_t maxClusterVertices NVCLUSTER_DEFAULT(~0u); 141 | 142 | // Cost penalty for under-filling clusters, [0, 1] 143 | float costUnderfill NVCLUSTER_DEFAULT(0.1f); 144 | 145 | // Cost penalty for overlapping bounding boxes, [0, 1] 146 | float costOverlap NVCLUSTER_DEFAULT(0.1f); 147 | 148 | // Cost penalty for under-filling maxClusterVertices, [0, 1] 149 | // Requires maxClusterVertices to be set. 150 | float costUnderfillVertices NVCLUSTER_DEFAULT(0.0f); 151 | 152 | // I.e. 3 when clustering triangles, 4 for quads. The number of bits that may 153 | // be set in each connectionVertexBits, and limited by its bit width (at most 154 | // 8). 155 | uint32_t itemVertexCount NVCLUSTER_DEFAULT(3); 156 | 157 | // If nonzero items will be recursively partitioned at the median of the 158 | // longest bounding box side until subsets contain at most preSplitThreshold 159 | // items prior to actual clustering. This is an optimization intended to speed 160 | // up clustering of large sets of items, e.g. more than 100k. 161 | uint32_t preSplitThreshold NVCLUSTER_DEFAULT(0u); 162 | } nvcluster_Config; 163 | 164 | #define nvcluster_defaultConfig() {1u, ~0u, 0.1f, 0.1f, 0u} 165 | 166 | // Type to hold unique bits for connections between items. E.g. triangle A might 167 | // connect to triangle B, sharing two vertices. Two of the first 3 bits would be 168 | // set for that connection. The third may be used in another of triangle A's 169 | // connection. Triangle B would use its own bits to identify the same shared 170 | // vertices. 
171 | typedef uint8_t nvcluster_VertexBits; 172 | 173 | typedef struct nvcluster_Input 174 | { 175 | // Required section of spatial definition of items to cluster 176 | 177 | // Bounding boxes of items to cluster 178 | const nvcluster_AABB* itemBoundingBoxes NVCLUSTER_DEFAULT(nullptr); 179 | 180 | // Center positions (xyz) of items to cluster 181 | const nvcluster_Vec3f* itemCentroids NVCLUSTER_DEFAULT(nullptr); 182 | 183 | // Number of elements in itemBoundingBoxes and itemCentroids 184 | uint32_t itemCount NVCLUSTER_DEFAULT(0u); 185 | 186 | // Optional section to add weighted item connectivity to optimize the spatial 187 | // partitioning towards "minimum cuts". I.e. try to form clusters with high 188 | // interconnected weights. 189 | // 190 | // While the connections are unidirectional, the algorithm requires this data 191 | // structure to specify two directions explicitly for each connection. I.e. if 192 | // item A has a connection to item B, then item B must have a connection to 193 | // item A. The weights must match in both connections. Vertex bits, if used, 194 | // will not. The terms item and connection are equivalent to node and edge in 195 | // graph theory. 196 | 197 | // Each item has Range::count connections to other items, stored at 198 | // Range::offset in connectionTargetItems. Connections are unidirectional but must 199 | // be duplicated to store bidirectional connections. 200 | const nvcluster_Range* itemConnectionRanges NVCLUSTER_DEFAULT(nullptr); 201 | 202 | // Connected item indices that itemConnectionRanges selects from (i.e. edges 203 | // in a graph) 204 | const uint32_t* connectionTargetItems NVCLUSTER_DEFAULT(nullptr); 205 | 206 | // Optional. Weight of each connection. The same value must be used in return 207 | // connections. 208 | const float* connectionWeights NVCLUSTER_DEFAULT(nullptr); 209 | 210 | // Optional. Used when maxClusterVertices is needed. 
Rather than consume 211 | // indices to triangle vertices and try to match them, this library takes 212 | // per-item-connection bits to identify unique vertices, e.g. triangle A 213 | // connects to triangle B through triangle A's vertex 0 and 1. Note that 214 | // values will not be symmetric in return connections as other items will 215 | // identify those same vertices with different bits. 216 | // 217 | // ____2_____ Example for triangle A (local vertex 0, 1, 2) 218 | // \ |\ | 219 | // \D | \ C| connectionTargetItems = { index of B, C, D}; 220 | // \ |A \ | connectionVertexBits = { 0b011, 0b110, 0b101}; 221 | // 0___1| 222 | // | / 223 | // | B/ 224 | // | / 225 | // |/ 226 | const nvcluster_VertexBits* connectionVertexBits NVCLUSTER_DEFAULT(nullptr); 227 | 228 | // Size of connectionTargetItems, connectionWeights and connectionVertexBits 229 | uint32_t connectionCount NVCLUSTER_DEFAULT(0u); 230 | 231 | // Quick alternative to use maxClusterVertices, replacing connection inputs. 232 | // If not null, computes connectionVertexBits internally from the 2D array of 233 | // vertex indices itemVertices[itemCount][nvcluster_Config::itemVertexCount]. 234 | // For triangle vertex indices, this could be a straight cast of 235 | // std::span triangleVertices, setting itemVertexCount to 3. 236 | const uint32_t* itemVertices NVCLUSTER_DEFAULT(nullptr); 237 | 238 | // Number of unique vertices referenced by itemVertices, if used. I.e. the 239 | // maximum value plus one. Used for internal intermediate allocation size. 240 | uint32_t vertexCount NVCLUSTER_DEFAULT(0u); 241 | } nvcluster_Input; 242 | 243 | // Optionally divide items to cluster into segments and cluster within each 244 | // segment in a single API call. Segments must not overlap. Unreferenced items 245 | // will still appear in nvcluster_OutputClusters::items. 
246 | typedef struct nvcluster_Segments 247 | { 248 | // Each segment defines range of items to cluster within 249 | const nvcluster_Range* segmentItemRanges NVCLUSTER_DEFAULT(nullptr); 250 | 251 | // Number of segments 252 | uint32_t segmentCount NVCLUSTER_DEFAULT(0u); 253 | } nvcluster_Segments; 254 | 255 | // Clustering output counts. For example, nvclusterGetRequirements() will first 256 | // write the upper limit of generated clusters. This must be used to size the 257 | // allocation given to e.g. nvclusterBuild(), which will write the 258 | // exact cluster count written. 259 | typedef struct nvcluster_Counts 260 | { 261 | uint32_t clusterCount NVCLUSTER_DEFAULT(0u); 262 | } nvcluster_Counts; 263 | 264 | // Clustering output, defining selections of input items that form clusters 265 | // created by partitioning input items spatially 266 | typedef struct nvcluster_OutputClusters 267 | { 268 | // Clusters defined by ranges of item indices, where each cluster starts at 269 | // range.offset in items and contains range.count items 270 | nvcluster_Range* clusterItemRanges NVCLUSTER_DEFAULT(nullptr); 271 | 272 | // Indices of the input items, referenced by clusterItemRanges 273 | uint32_t* items NVCLUSTER_DEFAULT(nullptr); 274 | 275 | // Initially the number of elements in clusterItemRanges 276 | // The nvclusterBuild*() replaces it with the element count written 277 | uint32_t clusterCount NVCLUSTER_DEFAULT(0u); 278 | 279 | // Initially the number of elements in items 280 | // The nvclusterBuild*() replaces it with the element count written 281 | uint32_t itemCount NVCLUSTER_DEFAULT(0u); 282 | } nvcluster_OutputClusters; 283 | 284 | struct nvcluster_Context_t; 285 | typedef struct nvcluster_Context_t* nvcluster_Context; 286 | 287 | typedef struct nvcluster_ContextCreateInfo 288 | { 289 | // Version expected. nvclusterCreateContext() returns 290 | // nvcluster_Result::NVCLUSTER_ERROR_CONTEXT_VERSION_MISMATCH if another is found at 291 | // runtime. 
292 | uint32_t version NVCLUSTER_DEFAULT(NVCLUSTER_VERSION); 293 | 294 | // Set to NVCLUSTER_TRUE or NVCLUSTER_FALSE to enable or disable internal 295 | // parallelisation using std execution policies at runtime 296 | nvcluster_Bool parallelize NVCLUSTER_DEFAULT(NVCLUSTER_TRUE); 297 | } nvcluster_ContextCreateInfo; 298 | 299 | #define nvcluster_defaultContextCreateInfo() {NVCLUSTER_VERSION, NVCLUSTER_TRUE} 300 | 301 | // Usage: 302 | // 1. Call nvclusterGetRequirements(...) or 303 | // nvclusterGetRequirementsSegmented(...) to get conservative sizes 304 | // 2. Allocate data for nvcluster_OutputClusters 305 | // 3. Call nvclusterBuild(...) or nvclusterBuildSegmented(...) 306 | // 4. Resize down to what was written 307 | // 308 | // Alternatively use ClusterStorage or SegmentedClusterStorage, which 309 | // encapsulates the above. 310 | // 311 | // The segmented output, clusterSegments must have space to store 312 | // nvcluster_Segments::segmentCount Range objects 313 | NVCLUSTER_API nvcluster_Result nvclusterGetRequirements(nvcluster_Context context, 314 | const nvcluster_Config* config, 315 | uint32_t itemCount, 316 | nvcluster_Counts* outputRequiredCounts); 317 | NVCLUSTER_API nvcluster_Result nvclusterBuild(nvcluster_Context context, 318 | const nvcluster_Config* config, 319 | const nvcluster_Input* input, 320 | nvcluster_OutputClusters* outputClusters); 321 | NVCLUSTER_API nvcluster_Result nvclusterGetRequirementsSegmented(nvcluster_Context context, 322 | const nvcluster_Config* config, 323 | uint32_t itemCount, 324 | const nvcluster_Segments* segments, 325 | nvcluster_Counts* outputRequiredCounts); 326 | NVCLUSTER_API nvcluster_Result nvclusterBuildSegmented(nvcluster_Context context, 327 | const nvcluster_Config* config, 328 | const nvcluster_Input* input, 329 | const nvcluster_Segments* segments, 330 | nvcluster_OutputClusters* outputClusters, 331 | nvcluster_Range* clusterSegments); 332 | NVCLUSTER_API uint32_t nvclusterVersion(void); 333 | NVCLUSTER_API 
nvcluster_Result nvclusterCreateContext(const nvcluster_ContextCreateInfo* info, nvcluster_Context* context); 334 | NVCLUSTER_API nvcluster_Result nvclusterDestroyContext(nvcluster_Context context); 335 | NVCLUSTER_API const char* nvclusterResultString(nvcluster_Result result); 336 | 337 | #ifdef __cplusplus 338 | } // extern "C" 339 | #endif 340 | 341 | #endif // NVCLUSTER_CLUSTERS_H 342 | -------------------------------------------------------------------------------- /doc/adjacency_sweep.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | +1 75 | +1 76 | +1 77 | 1 78 | 1 79 | 1 80 | 1 81 | 82 | 83 | 84 | -1 85 | -1 86 | -1 87 | -1 88 | -1 89 | 90 | 2 91 | 0 92 | 93 | Summed Connections 94 | Prefix Summed 95 | 96 | 97 | --------------------------------------------------------------------------------