├── .github └── workflows │ ├── Ubuntu.yml │ └── Windows.yml ├── .gitignore ├── CMakeLists.txt ├── README.md ├── cukd ├── box.h ├── builder.h ├── builder_bitonic.h ├── builder_common.h ├── builder_host.h ├── builder_inplace.h ├── builder_thrust.h ├── common.h ├── cubit │ ├── common.h │ ├── cubit.h │ ├── cubit_maxval.h │ └── cubit_zip.h ├── cukd-math.h ├── data.h ├── fcp.h ├── helpers.h ├── kdtree.h ├── knn.h ├── spatial-kdtree.h ├── traverse-cct.h ├── traverse-default-stack-based.h ├── traverse-sf-imp.h └── traverse-stack-free.h ├── measure.sh ├── sample.cu ├── sampleHost.cu ├── samples ├── CMakeLists.txt ├── knn-float3-spatialkdtree.cu ├── mpiHugeQuery.cu └── mpiHugeQueryHost.cu ├── scripts ├── README.md └── actions │ ├── install_cuda_ubuntu.sh │ └── install_cuda_windows.ps1 └── testing ├── CMakeLists.txt ├── compileKNN.cu ├── compileSpatialKNN.cu ├── floatN-knn-and-fcp.cu ├── issue5.cu ├── test-include-as-subdirectory └── CMakeLists.txt ├── testBuilderEmptyInput.cu ├── testBuilderSimpleInput.cu ├── testBuildersSameResult.cu ├── testHostBuilderEmptyInput.cu ├── testHostBuilderSimpleInput.cu ├── testMultipleDefinitions_a.cu ├── testMultipleDefinitions_b.cu └── testPayloadSampleFromReadme.cu /.github/workflows/Ubuntu.yml: -------------------------------------------------------------------------------- 1 | # Compile project on Ubuntu 2 | name: Ubuntu 3 | on: 4 | push: 5 | paths: 6 | - "**" 7 | - "!.github/**" 8 | - ".github/workflows/Ubuntu.yml" 9 | - "!scripts/" 10 | - "scripts/actions/install_cuda_ubuntu.sh" 11 | - "!*.md" 12 | jobs: 13 | build: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | # explicit include-based build matrix, of known valid options 18 | matrix: 19 | include: 20 | # 24.04 supports CUDA 12.4+ 21 | - os: ubuntu-24.04 22 | cuda: "12.6" 23 | gcc: 13 24 | # 22.04 supports CUDA 11.7+ 25 | - os: ubuntu-22.04 26 | cuda: "12.0" 27 | gcc: 11 28 | # - os: ubuntu-22.04 29 | # cuda: "11.8" 30 | # gcc: 10 31 | # - os: ubuntu-22.04 32 
| # cuda: "11.7" 33 | # gcc: 10 34 | # # 20.04 supports CUDA 11.0+ 35 | # - os: ubuntu-20.04 36 | # cuda: "11.6" 37 | # gcc: 10 38 | # - os: ubuntu-20.04 39 | # cuda: "11.5" 40 | # gcc: 10 41 | # - os: ubuntu-20.04 42 | # cuda: "11.4" 43 | # gcc: 10 44 | # - os: ubuntu-20.04 45 | # cuda: "11.3" 46 | # gcc: 10 47 | # - os: ubuntu-20.04 48 | # cuda: "11.2" 49 | # gcc: 10 50 | # - os: ubuntu-20.04 51 | # cuda: "11.0" 52 | # gcc: 9 53 | # 18.04 supports CUDA 10.1+ (gxx <= 8), but were deprecated on 2022-08-08 and unsupported from 2023-04-01 54 | # - os: ubuntu-18.04 55 | # cuda: "10.2" 56 | # gcc: 8 57 | # - os: ubuntu-18.04 58 | # cuda: "10.1" 59 | # gcc: 8 60 | # 16.04 runners are deprecated / removed in september 2021. 61 | # It should still be possible to install CUDA 8 - CUDA 10 in 18.04 images by using the 16.04 repository, but install_cuda_ubuntu.sh would require changes to do so / a way to override the repository to use. 62 | env: 63 | build_dir: "build" 64 | config: "Release" 65 | 66 | steps: 67 | - uses: actions/checkout@v3 68 | 69 | - name: Install CUDA 70 | env: 71 | cuda: ${{ matrix.cuda }} 72 | run: ./scripts/actions/install_cuda_ubuntu.sh 73 | shell: bash 74 | 75 | # Specify the correct host compilers 76 | - name: Install/Select gcc and g++ 77 | run: | 78 | sudo apt-get install -y gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }} 79 | echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> $GITHUB_ENV 80 | echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV 81 | echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV 82 | 83 | - name: Configure cmake 84 | id: configure 85 | run: cmake . 
-B ${{ env.build_dir }} -DCMAKE_BUILD_TYPE=${{ env.config }} -DCMAKE_CUDA_ARCHITECTURES=all-major 86 | 87 | - name: Configure Error Processing 88 | if: ${{ failure() && steps.configure.outcome == 'failure' }} 89 | run: | 90 | if [[ -d "${{ env.build_dir }}" ]]; then 91 | pushd "${{ env.build_dir }}" 92 | if [[ -f "CMakeFiles/CMakeOutput.log" ]]; then 93 | echo "---- CMakeFiles/CMakeOutput.log" 94 | cat CMakeFiles/CMakeOutput.log 95 | echo "----" 96 | fi 97 | if [[ -f "CMakeFiles/CMakeError.log" ]]; then 98 | echo "---- CMakeFiles/CMakeError.log" 99 | cat CMakeFiles/CMakeError.log 100 | echo "----" 101 | fi 102 | fi 103 | 104 | 105 | 106 | - name: Build everything else 107 | working-directory: ${{ env.build_dir }} 108 | run: cmake --build . --target all --verbose -j `nproc` 109 | 110 | -------------------------------------------------------------------------------- /.github/workflows/Windows.yml: -------------------------------------------------------------------------------- 1 | # Windows builds. 
2 | name: Windows 3 | on: 4 | push: 5 | paths: 6 | - "**" 7 | - "!.github/**" 8 | - ".github/workflows/Windows.yml" 9 | - "!scripts" 10 | - "scripts/actions/install_cuda_windows.ps1" 11 | - "!*.md" 12 | jobs: 13 | build: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | # explicit include-based build matrix, of known valid options 18 | matrix: 19 | include: 20 | # Windows-2022 & VS 2022 supports 12.4+ 21 | - os: windows-2022 22 | cuda: "12.4.0" 23 | visual_studio: "Visual Studio 17 2022" 24 | # Windows-2019 & VS 2019 supports 10.1+ 25 | #- os: windows-2019 26 | # cuda: "11.5.0" 27 | # visual_studio: "Visual Studio 16 2019" 28 | #- os: windows-2019 29 | # cuda: "11.4.0" 30 | # visual_studio: "Visual Studio 16 2019" 31 | #- os: windows-2019 32 | # cuda: "11.3.0" 33 | # visual_studio: "Visual Studio 16 2019" 34 | #- os: windows-2019 35 | # cuda: "11.2.0" 36 | # visual_studio: "Visual Studio 16 2019" 37 | #- os: windows-2019 38 | # cuda: "11.1.0" 39 | # visual_studio: "Visual Studio 16 2019" 40 | #- os: windows-2019 41 | # cuda: "11.0.1" 42 | # visual_studio: "Visual Studio 16 2019" 43 | #- os: windows-2019 44 | # cuda: "10.2.89" 45 | # visual_studio: "Visual Studio 16 2019" 46 | #- os: windows-2019 47 | # cuda: "10.1.243" 48 | # visual_studio: "Visual Studio 16 2019" 49 | 50 | env: 51 | build_dir: "build" 52 | config: "Release" 53 | 54 | steps: 55 | - uses: actions/checkout@v3 56 | 57 | - name: Install CUDA 58 | env: 59 | cuda: ${{ matrix.cuda }} 60 | visual_studio: ${{ matrix.visual_studio }} 61 | shell: powershell 62 | run: .\scripts\actions\install_cuda_windows.ps1 63 | 64 | - name: nvcc check 65 | shell: powershell 66 | run: | 67 | nvcc -V 68 | ls $env:CUDA_PATH 69 | ls $env:CUDA_PATH\bin 70 | ls $env:CUDA_PATH\include 71 | 72 | - name: cmake version 73 | shell: bash 74 | run: cmake --version 75 | 76 | - name: Configure CMake 77 | id: configure 78 | shell: bash 79 | run: cmake .
-B ${{ env.build_dir }} -G "${{ matrix.visual_studio }}" -A x64 -DCMAKE_CUDA_ARCHITECTURES=all-major 80 | 81 | - name: Configure Error Processing 82 | if: ${{ (failure() && steps.configure.outcome == 'failure') || success() }} 83 | shell: bash 84 | run: | 85 | if [[ -d "${{ env.build_dir }}" ]]; then 86 | pushd "${{ env.build_dir }}" 87 | if [[ -f "CMakeFiles/CMakeOutput.log" ]]; then 88 | echo "---- CMakeFiles/CMakeOutput.log" 89 | cat CMakeFiles/CMakeOutput.log 90 | echo "----" 91 | fi 92 | if [[ -f "CMakeFiles/CMakeError.log" ]]; then 93 | echo "---- CMakeFiles/CMakeError.log" 94 | cat CMakeFiles/CMakeError.log 95 | echo "----" 96 | fi 97 | fi 98 | 99 | - name: Build 100 | working-directory: ${{ env.build_dir }} 101 | run: cmake --build . --config ${{ env.config }} --target ALL_BUILD --verbose 102 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | textures 3 | *# 4 | .#* 5 | bin 6 | dbg 7 | tags 8 | .ycm_extra_conf.pyc 9 | *.autosave 10 | *DS_Store* 11 | *.gz 12 | *.rpm 13 | *.zip 14 | *.bak 15 | *.patch 16 | .vscode 17 | deps 18 | tbb 19 | ispc 20 | *.aux 21 | *.bbl 22 | *.blg 23 | *.brf 24 | *.dvi 25 | *.lbl 26 | *.log 27 | *.swp 28 | *.out 29 | Session.vim 30 | .idea 31 | !*png/*.pdf 32 | .vs/ 33 | 34 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ======================================================================== # 2 | # Copyright 2021-2024 Ingo Wald # 3 | # # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # 5 | # you may not use this file except in compliance with the License. 
# 6 | # You may obtain a copy of the License at # 7 | # # 8 | # http://www.apache.org/licenses/LICENSE-2.0 # 9 | # # 10 | # Unless required by applicable law or agreed to in writing, software # 11 | # distributed under the License is distributed on an "AS IS" BASIS, # 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 13 | # See the License for the specific language governing permissions and # 14 | # limitations under the License. # 15 | # ======================================================================== # 16 | 17 | cmake_minimum_required(VERSION 3.18) 18 | cmake_policy(SET CMP0048 NEW) 19 | cmake_policy(SET CMP0104 NEW) 20 | set(CMAKE_BUILD_TYPE_INIT "Release") 21 | project(cudaKDTree VERSION 1.0.1 LANGUAGES C CXX) 22 | 23 | if (NOT (${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})) 24 | set(CUKD_IS_SUBPROJECT ON) 25 | else() 26 | set(CUKD_IS_SUBPROJECT OFF) 27 | endif() 28 | 29 | option(BUILD_ALL_TESTS "Build entire type/dimension/kernel test matrix?" 
OFF) 30 | 31 | # ------------------------------------------------------------------ 32 | # OpenMP support 33 | # ------------------------------------------------------------------ 34 | find_package(OpenMP) 35 | if(OpenMP_CXX_FOUND) 36 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") 37 | 38 | # Fix for OpenMP library path issues 39 | string(REPLACE ";" " " OpenMP_CXX_LIBRARIES_FIXED "${OpenMP_CXX_LIBRARIES}") 40 | set(OpenMP_CXX_LIBRARIES "${OpenMP_CXX_LIBRARIES_FIXED}") 41 | 42 | if(WIN32) 43 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp:llvm") 44 | else() 45 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 46 | endif() 47 | 48 | endif() 49 | 50 | # ------------------------------------------------------------------ 51 | # CUDA and OpenMP integration 52 | # ------------------------------------------------------------------ 53 | # Ensure CUDA compiler can use OpenMP 54 | if(OpenMP_CXX_FOUND) 55 | if(WIN32) 56 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/openmp:llvm") 57 | else() 58 | # For NVCC compiler, need to pass OpenMP flags differently 59 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS}") 60 | 61 | set(CMAKE_CUDA_HOST_LINK_LAUNCHER "${CMAKE_CXX_COMPILER}") 62 | set(CMAKE_CUDA_STANDARD_LIBRARIES "${CMAKE_CUDA_STANDARD_LIBRARIES} ${OpenMP_CXX_LIBRARIES}") 63 | endif() 64 | endif() 65 | 66 | #add_subdirectory(../bitonic ext_bitonic EXCLUDE_FROM_ALL) 67 | 68 | # ------------------------------------------------------------------ 69 | # general cmake project configs 70 | # ------------------------------------------------------------------ 71 | if (CUKD_IS_SUBPROJECT) 72 | # we're used as a subproject (as we should be!) - parent HAS to have 73 | # set CMAKE_CUDA_ARCHITECTURES for our code to compile 74 | # properly. 
check if it did, and error out if not 75 | if ((NOT CMAKE_CUDA_ARCHITECTURES) 76 | OR 77 | ((${CMAKE_VERSION} VERSION_LESS 3.24) 78 | AND 79 | ("${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "52"))) 80 | message(FATAL_ERROR "#cudaKDTree: no CMAKE_CUDA_ARCHITECTURES defined, or left for cmake to default to arch 5.2. This is almost certainly a configuration problem that will cause you some grief. Please define CMAKE_CUDA_ARCHITECTURES to the (list of) arch(s) you want to be building for, and do that before the `add_subdirectory()` call that includes cudaKDTree. If in doubt as to what arch to use, for cmake version >= 3.24 you can also set it to 'all-major' or 'native'") 81 | endif() 82 | else() 83 | if (CMAKE_CUDA_ARCHITECTURES) 84 | # CI test set this to 'all-major', but older cmake versions do not have this; the variable is quoted below because it may hold a ';'-separated list of archs. 85 | if ((${CMAKE_VERSION} VERSION_LESS 3.24) 86 | AND 87 | ("${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "all-major")) 88 | set (CUKD_INIT_ARCHS "70;80") 89 | else() 90 | set (CUKD_INIT_ARCHS "${CMAKE_CUDA_ARCHITECTURES}") 91 | endif() 92 | # set on the cmdline 93 | elseif (${CMAKE_VERSION} VERSION_LESS 3.24) 94 | set (CUKD_INIT_ARCHS "70;80") 95 | else() 96 | set (CUKD_INIT_ARCHS "all-major") 97 | endif() 98 | set(CUKD_CUDA_ARCHITECTURES "${CUKD_INIT_ARCHS}" 99 | CACHE STRING "CUDA Arch(s) to build against") 100 | 101 | set(CMAKE_CUDA_ARCHITECTURES ${CUKD_CUDA_ARCHITECTURES}) 102 | SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 103 | SET(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 104 | SET(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 105 | endif() 106 | enable_language(CUDA) 107 | 108 | # ================================================================== 109 | # this builds four variants of this library, that differ in how the 110 | # k-d tree is being TRAVERSED: 111 | # 112 | # `cudaKDTree-default` uses a stack-based traversal, doesn't require 113 | # the world-space bounding box 114 | # 115 | # `cudaKDTree-sf` uses a stack-free traversal.
Can generate more 116 | # efficient code in some cases, but will suffer from the same issues 117 | # as the default variant for certain combinations of input point 118 | # distributions and query point distributions 119 | # 120 | # `cudaKDTree-cct` uses 'closest-corner-tracking', which can in some 121 | # cases be faster than the default traversal method (in particular if 122 | # there is no good cut-off-radius, and queries can originate far from 123 | # the data points, and/or for highly clustered data). It does however 124 | # require to allocate and provide (a tiny amount of) memory for the 125 | # builder to store the world-space bounding box of the input points, 126 | # as well as to pass that pointer to the query method. 127 | # 128 | # ================================================================== 129 | add_library(cudaKDTree INTERFACE) 130 | target_sources(cudaKDTree INTERFACE 131 | cukd/common.h 132 | # iw, sep 22, 2024 - intentionally renamed from cukd/math.h to cukd/cukd-math.h to 133 | # avoid name conflicts with system math.h if anybody adds cukd/ to include path 134 | cukd/cukd-math.h 135 | cukd/box.h 136 | cukd/builder.h 137 | cukd/builder_bitonic.h 138 | cukd/builder_thrust.h 139 | cukd/builder_inplace.h 140 | # SPATIAL k-d tree, with planes at arbitrary locations 141 | cukd/spatial-kdtree.h 142 | cukd/fcp.h 143 | cukd/knn.h 144 | ) 145 | target_include_directories(cudaKDTree INTERFACE 146 | ${PROJECT_SOURCE_DIR}/ 147 | ) 148 | set_property(TARGET cudaKDTree PROPERTY PUBLIC 149 | CXX_STANDARD 14) 150 | # 'attach' current cmake_cuda_architectures to this library 151 | set_property(TARGET cudaKDTree PROPERTY PUBLIC 152 | CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) 153 | 154 | 155 | 156 | # ================================================================== 157 | # a simple sample example of how to build a k-d tree 158 | # ================================================================== 159 | if (NOT CUKD_IS_SUBPROJECT) 160 |
add_executable(cukd_sample sample.cu) 161 | target_link_libraries(cukd_sample cudaKDTree) 162 | 163 | add_executable(cukd_sampleHost sampleHost.cu) 164 | target_link_libraries(cukd_sampleHost PUBLIC cudaKDTree) 165 | 166 | if(OpenMP_CXX_FOUND) 167 | target_link_libraries(cukd_sampleHost PUBLIC OpenMP::OpenMP_CXX) 168 | target_compile_definitions(cukd_sampleHost PUBLIC OPENMP_FOUND) 169 | endif() 170 | 171 | find_package(MPI) 172 | if (MPI_FOUND) 173 | add_executable(cukd_mpiHugeQuery samples/mpiHugeQuery.cu) 174 | target_link_libraries(cukd_mpiHugeQuery PUBLIC cudaKDTree MPI::MPI_CXX) 175 | 176 | add_executable(cukd_mpiHugeQueryHost samples/mpiHugeQueryHost.cu) 177 | target_link_libraries(cukd_mpiHugeQueryHost PUBLIC cudaKDTree MPI::MPI_CXX) 178 | 179 | if(OpenMP_CXX_FOUND) 180 | target_link_libraries(cukd_mpiHugeQueryHost PUBLIC OpenMP::OpenMP_CXX) 181 | target_compile_definitions(cukd_mpiHugeQueryHost PUBLIC OPENMP_FOUND) 182 | endif() 183 | 184 | endif() 185 | endif() 186 | 187 | 188 | 189 | 190 | # ================================================================== 191 | # create _a lot_ of test cases: this generates the whole matrix of 192 | # traversal_method x num_dims x {fcp,knn} 193 | # ================================================================== 194 | if (BUILD_ALL_TESTS) 195 | # test 2, 3, 4, and 8-dimensoinal data; the latter should - if it 196 | # works for N=8, work for any other N>4 197 | # set(DIMS_TO_BUILD 3) 198 | option(CUKD_ENABLE_STATS "Enable Stats tracking?" 
OFF) 199 | if (CUKD_ENABLE_STATS) 200 | set(CUKD_ENABLE_STATS_VALUE 1) 201 | else() 202 | set(CUKD_ENABLE_STATS_VALUE 0) 203 | endif() 204 | set(DIMS_TO_BUILD 2 3 4 8) 205 | foreach (D IN ITEMS ${DIMS_TO_BUILD}) 206 | # test all three traversal methods 207 | foreach(method stackBased stackFree cct) 208 | # test knn queries, on regular trees (no explicit dimension per node) 209 | add_executable(cukd_float${D}-knn-${method} testing/floatN-knn-and-fcp.cu) 210 | target_link_libraries(cukd_float${D}-knn-${method} cudaKDTree) 211 | target_compile_definitions(cukd_float${D}-knn-${method} 212 | PUBLIC 213 | -DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE} 214 | -DD_FROM_CMAKE=${D} 215 | -DUSE_KNN=1 216 | -DTRAVERSAL_METHOD=${method}) 217 | 218 | # test knn queries, with 'explicit-dim' trees 219 | add_executable(cukd_float${D}-knn-${method}-xd testing/floatN-knn-and-fcp.cu) 220 | target_link_libraries(cukd_float${D}-knn-${method}-xd cudaKDTree) 221 | target_compile_definitions(cukd_float${D}-knn-${method}-xd 222 | PUBLIC 223 | -DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE} 224 | -DD_FROM_CMAKE=${D} 225 | -DEXPLICIT_DIM=1 226 | -DUSE_KNN=1 227 | -DTRAVERSAL_METHOD=${method}) 228 | 229 | # test fcp queries, on regular trees 230 | add_executable(cukd_float${D}-fcp-${method} testing/floatN-knn-and-fcp.cu) 231 | target_link_libraries(cukd_float${D}-fcp-${method} cudaKDTree) 232 | target_compile_definitions(cukd_float${D}-fcp-${method} 233 | PUBLIC 234 | -DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE} 235 | -DD_FROM_CMAKE=${D} 236 | -DTRAVERSAL_METHOD=${method}) 237 | 238 | # test fcp queries, with 'explicit-dim' trees 239 | add_executable(cukd_float${D}-fcp-${method}-xd testing/floatN-knn-and-fcp.cu) 240 | target_link_libraries(cukd_float${D}-fcp-${method}-xd cudaKDTree) 241 | target_compile_definitions(cukd_float${D}-fcp-${method}-xd 242 | PUBLIC 243 | -DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE} 244 | -DD_FROM_CMAKE=${D} 245 | -DEXPLICIT_DIM=1 246 |
-DTRAVERSAL_METHOD=${method}) 247 | 248 | endforeach() 249 | 250 | 251 | foreach(method stackBased cct) 252 | # test knn queries, on spatial k-d trees (SPATIAL=1) 253 | add_executable(cukd_float${D}-knn-spatial-${method} testing/floatN-knn-and-fcp.cu) 254 | target_link_libraries(cukd_float${D}-knn-spatial-${method} cudaKDTree) 255 | target_compile_definitions(cukd_float${D}-knn-spatial-${method} 256 | PUBLIC 257 | -DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE} 258 | -DD_FROM_CMAKE=${D} 259 | -DSPATIAL=1 260 | -DUSE_KNN=1 261 | -DTRAVERSAL_METHOD=${method}) 262 | 263 | # test fcp queries, on spatial k-d trees (SPATIAL=1) 264 | add_executable(cukd_float${D}-fcp-spatial-${method} testing/floatN-knn-and-fcp.cu) 265 | target_link_libraries(cukd_float${D}-fcp-spatial-${method} cudaKDTree) 266 | target_compile_definitions(cukd_float${D}-fcp-spatial-${method} 267 | PUBLIC 268 | -DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE} 269 | -DSPATIAL=1 270 | -DD_FROM_CMAKE=${D} 271 | -DTRAVERSAL_METHOD=${method}) 272 | 273 | endforeach() 274 | endforeach() 275 | endif() 276 | 277 | 278 | if (NOT CUKD_IS_SUBPROJECT) 279 | add_subdirectory(samples) 280 | endif() 281 | 282 | if (BUILD_ALL_TESTS) 283 | # add some unit tests 284 | include(CTest) 285 | add_subdirectory(testing) 286 | endif() 287 | 288 | 289 | -------------------------------------------------------------------------------- /cukd/box.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2024 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License.
// 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | /* copied from OWL project, and put into new namespace to avoid naming conflicts.*/ 18 | 19 | #pragma once 20 | 21 | #include "cukd/cukd-math.h" 22 | 23 | namespace cukd { 24 | 25 | template inline __both__ T empty_box_lower(); 26 | template inline __both__ T empty_box_upper(); 27 | 28 | template<> inline __both__ float empty_box_lower() { return +INFINITY; } 29 | template<> inline __both__ float empty_box_upper() { return -INFINITY; } 30 | template<> inline __both__ int empty_box_lower() { return INT_MAX; } 31 | template<> inline __both__ int empty_box_upper() { return INT_MIN; } 32 | 33 | 34 | template 35 | struct box_t { 36 | using point_traits = ::cukd::point_traits; 37 | using scalar_t = typename point_traits::scalar_t; 38 | 39 | // inline __both__ point_t size() const { return upper - lower; } 40 | 41 | /*! 
returns the dimension in which the box has the widest extent */ 42 | inline __both__ int widestDimension() const; 43 | 44 | inline __both__ bool contains(const point_t &p) const 45 | { 46 | enum { num_dims = num_dims_of::value }; 47 | for (int d=0;d point_traits::get_coord(upper,d)) return false; 50 | } 51 | return true; 52 | } 53 | 54 | inline __both__ void grow(const point_t &p) 55 | { 56 | lower = min(lower,p); 57 | upper = max(upper,p); 58 | } 59 | 60 | inline __both__ void setEmpty() 61 | { 62 | for (int d=0;d::type>(); 64 | // get_coord(upper,d) = empty_box_upper::type>(); 65 | point_traits::set_coord(lower,d,empty_box_lower()); 66 | point_traits::set_coord(upper,d,empty_box_upper()); 67 | } 68 | } 69 | 70 | /*! set to an infinitely _open_ box */ 71 | inline __both__ void setInfinite() 72 | { 73 | for (int d=0;d::type>(); 75 | // get_coord(upper,d) = empty_box_lower::type>(); 76 | point_traits::set_coord(lower,d,empty_box_upper()); 77 | point_traits::set_coord(upper,d,empty_box_lower()); 78 | } 79 | } 80 | 81 | point_t lower, upper; 82 | }; 83 | 84 | /*! helper function for printf debugging */ 85 | template 86 | inline std::ostream &operator<<(std::ostream &o, const box_t &b) 87 | { 88 | o << "{" << b.lower << "," << b.upper << "}"; 89 | return o; 90 | } 91 | 92 | /*! computes the closest point to 'point' that's within the given 93 | box; if point itself is inside that box it'll be the point 94 | itself, otherwise it'll be a point on the outside surface of the 95 | box */ 96 | template 97 | inline __host__ __device__ 98 | point_t project(const cukd::box_t &box, 99 | const point_t &point) 100 | { 101 | return min(max(point,box.lower),box.upper); 102 | } 103 | 104 | // ------------------------------------------------------------------ 105 | template 106 | inline __host__ __device__ 107 | auto sqrDistance(const box_t &box, const point_t &point) 108 | { return cukd::sqrDistance(project(box,point),point); } 109 | 110 | template 111 | /*! 
returns the dimension in which the box has the widest extent */ 112 | inline __both__ int box_t::widestDimension() const 113 | { 114 | enum { num_dims = point_traits::num_dims }; 115 | 116 | int d_best = 0; 117 | scalar_t w_best = scalar_t(0); 118 | for (int d=0;d 22 | 23 | /* This is a single include file from which all builder variants get included. 24 | 25 | Builder variants "cheat sheet" 26 | 27 | builder_thrust: 28 | - temporary memory overhead for N points: N ints + order 2N points 29 | (ie, total mem order 3x that of input data!) 30 | - perf 100K float3s (4090) : ~4ms 31 | - perf 1M float3s (4090) : ~20ms 32 | - perf 10M float3s (4090) : ~200ms 33 | 34 | builder_bitonic: 35 | - temporary memory overhead for N points: N ints 36 | (ie, ca 30% mem overhead for float3) 37 | - perf 100K float3s (4090) : ~10ms 38 | - perf 1M float3s (4090) : ~27ms 39 | - perf 10M float3s (4090) : ~390ms 40 | 41 | builder_inplace: 42 | - temporary memory overhead for N points: nada, nil, zilch. 43 | - perf 100K float3s (4090) : ~10ms 44 | - perf 1M float3s (4090) : ~220ms 45 | - perf 10M float3s (4090) : ~4.3s 46 | 47 | */ 48 | 49 | #include "cukd/builder_thrust.h" 50 | #include "cukd/builder_bitonic.h" 51 | #include "cukd/builder_inplace.h" 52 | 53 | namespace cukd { 54 | /*! Builds a left-balanced k-d tree over the given data points, 55 | using data_traits to describe the type of data points that this 56 | tree is being built over (i.e., how to separate a data item's 57 | positional coordinates from any potential payload (if such exists, 58 | e.g., in a 'photon' in photon mapping), what vector/point type to 59 | use for this coordinate data (e.g., float3), whether the data have 60 | a field to store an explicit split dimension (for Bentley and 61 | Samet's 'optimized' trees), etc.).
62 | 63 | Since a (point-)k-d tree's tree topology is implicit in the 64 | ordering of its data points this will re-arrange the data points 65 | to fulfill the balanced k-d tree criterion - ie, this WILL modify 66 | the data array: no individual entry will get changed, but their 67 | order might. If data_traits::has_explicit_dims is defined this 68 | builder will choose each node's split dimension based on the 69 | widest dimension of that node's subtree's domain; if not, it will 70 | choose the dimension in a round-robin style, where the root level 71 | is split along the 'x' coordinate, the next level in y, etc. 72 | 73 | 'worldBounds' is a pointer to device-writeable memory to store the 74 | world-space bounding box of the data points that the builder will 75 | compute. If data_traits::has_explicit_dims is true this memory 76 | _has_ to be provided to the builder, and the builder will fill it 77 | in; if data_traits::has_explicit_dims is false, this memory region 78 | is optional: the builder _will_ fill it in if provided, but will 79 | ignore it if it isn't. 80 | 81 | *** Example 1: To build a 2D k-d tree over a CUDA int2 type (no other 82 | payload than the two coordinates): 83 | 84 | buildTree(....); 85 | 86 | In this case no data_traits need to be supplied because these will 87 | be auto-computed for simple cuda vector types.
88 | 89 | *** Example 2: to build a 2D kd-tree over a data type of float4, 90 | where the first 2 coordinates of each point is the dimension we 91 | want to build the kd-tree over, and the other 2 coordinates 92 | are arbitrary other payload data: 93 | 94 | struct float2_plus_payload_traits { 95 | using point_t = float2; 96 | static inline __both__ point_t get_point(const float4 &n) 97 | { return make_float2(n.x, n.y); } 98 | }; 99 | buildTree(...); 100 | 101 | *** Example 3: assuming you have a data type 'Photon' and a 102 | Photon_traits has Photon_traits::has_explicit_dim defined: 103 | 104 | cukd::box_t *d_worldBounds = ; 105 | buildTree(..., worldBounds, ...); 106 | 107 | */ 108 | template> 109 | void buildTree(/*! device-read/writeable array of data points */ 110 | data_t *d_points, 111 | /*! number of data points */ 112 | int numPoints, 113 | /*! device-writeable pointer to store the world-space 114 | bounding box of all data points. if 115 | data_traits::has_explicit_dim is false, this is 116 | optionally allowed to be null */ 117 | box_t *worldBounds=0, 118 | /*! cuda stream to use for all kernels and mallocs 119 | (the builder_thrust may _also_ do some global 120 | device syncs) */ 121 | cudaStream_t stream=0, 122 | GpuMemoryResource &memResource=defaultGpuMemResource()) 123 | { 124 | #if defined(CUKD_BUILDER_INPLACE) 125 | /* this is a _completely_ in-place builder; it will not allocate a 126 | single byte of additional memory during building (or at any other 127 | time); the downside is that for large arrays it can be 10x-20x 128 | slower. For reference: for 10M float3 points, builder_inplace 129 | takes about 4.3 seconds; builder_thrust will take about 200ms, 130 | builder_bitonic will take about 390ms */ 131 | buildTree_inPlace 132 | (d_points,numPoints,worldBounds,stream,memResource); 133 | 134 | #elif defined(CUKD_BUILDER_BITONIC) 135 | /* this builder uses our tag-update algorithm, but uses bitonic sort 136 | instead of thrust for sorting.
it doesn't require thrust, and 137 | doesn't require additional memory other than 1 int for the tag, but 138 | for large arrays (10M-ish points) is about 2x slower than the 139 | thrust variant */ 140 | buildTree_bitonic 141 | (d_points,numPoints,worldBounds,stream,memResource); 142 | #else 143 | /* this builder uses our tag-update algorithm, and uses thrust for 144 | sorting the tag:node pairs. This is our fastest builder, but has 145 | the downside that thrust's sort will not properly work in a 146 | stream, and will, in particular, have to allocate (quite a bit 147 | of!) temporary memory during sorting */ 148 | buildTree_thrust 149 | (d_points,numPoints,worldBounds,stream,memResource); 150 | #endif 151 | } 152 | } 153 | 154 | -------------------------------------------------------------------------------- /cukd/builder_common.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2019-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #pragma once 18 | 19 | #include "cukd/helpers.h" 20 | #include "cukd/box.h" 21 | #include "cukd/data.h" 22 | 23 | #include 24 | 25 | namespace cukd { 26 | 27 | /*!
helper function for swapping two elements - need to explicitly 28 | prefix this to avoid name clashes with/in thrust */ 29 | template 30 | inline __both__ void cukd_swap(T &a, T &b) 31 | { T c = a; a = b; b = c; } 32 | 33 | 34 | /*! helper class to allow for conditionally "dropping" calls to 35 | set_dim/get_dim for data that doesn't have those functions */ 36 | template 37 | struct if_has_dims; 38 | 39 | template 40 | struct if_has_dims { 41 | static inline __both__ void set_dim(data_t &t, int dim) {} 42 | static inline __both__ int get_dim(const data_t &t, int value_if_false) 43 | { return value_if_false; } 44 | }; 45 | 46 | template 47 | struct if_has_dims { 48 | static inline __both__ void set_dim(data_t &t, int dim) { 49 | data_traits::set_dim(t,dim); 50 | } 51 | static inline __both__ int get_dim(const data_t &t, int /* ignore: value_if_false */) { 52 | return data_traits::get_dim(t); 53 | } 54 | }; 55 | /*! @} */ 56 | 57 | /*! helper function that computes the bounding box of a given set of 58 | points */ 59 | template> 61 | void computeBounds(cukd::box_t *d_bounds, 62 | const data_t *d_points, 63 | int numPoints, 64 | cudaStream_t stream=0); 65 | 66 | template> 68 | void host_computeBounds(cukd::box_t *d_bounds, 69 | const data_t *d_points, 70 | int numPoints); 71 | 72 | // ================================================================== 73 | // IMPLEMENTATION SECTION 74 | // ================================================================== 75 | 76 | template 77 | __global__ 78 | void computeBounds_copyFirst(cukd::box_t *d_bounds, 79 | const data_t *d_points) 80 | { 81 | if (threadIdx.x != 0) return; 82 | 83 | using point_t = typename data_traits::point_t; 84 | const point_t point = data_traits::get_point(d_points[0]); 85 | d_bounds->lower = d_bounds->upper = point; 86 | } 87 | 88 | #ifdef __CUDA_ARCH__ 89 | inline __device__ 90 | int atomicMin(int *addr, int value) 91 | { return ::atomicMin(addr,value); } 92 | 93 | inline __device__ 94 | int
atomicMax(int *addr, int value) 95 | { return ::atomicMax(addr,value); } 96 | 97 | inline __device__ 98 | float atomicMin(float *addr, float value) 99 | { 100 | float old = *addr, assumed; 101 | if(old <= value) return old; 102 | do { 103 | assumed = old; 104 | old = __int_as_float(atomicCAS((unsigned int*)addr, __float_as_int(assumed), __float_as_int(value))); 105 | value = min(value,old); 106 | } while(old!=assumed); 107 | return old; 108 | } 109 | 110 | inline __device__ 111 | float atomicMax(float *addr, float value) 112 | { 113 | float old = *addr, assumed; 114 | if(old >= value) return old; 115 | do { 116 | assumed = old; 117 | old = __int_as_float(atomicCAS((unsigned int*)addr, __float_as_int(assumed), __float_as_int(value))); 118 | value = max(value,old); 119 | } while(old!=assumed); 120 | return old; 121 | } 122 | #endif 123 | 124 | template 126 | __global__ 127 | void computeBounds_atomicGrow(cukd::box_t *d_bounds, 128 | const data_t *d_points, 129 | int numPoints) 130 | { 131 | using point_t = typename data_traits::point_t; 132 | using point_traits = ::cukd::point_traits;//typename data_traits::point_traits; 133 | using scalar_t = typename point_traits::scalar_t; 134 | enum { num_dims = point_traits::num_dims }; 135 | 136 | const int tid = threadIdx.x+blockIdx.x*blockDim.x; 137 | if (tid >= numPoints) return; 138 | 139 | point_t point = data_traits::get_point(d_points[tid]); 140 | #pragma unroll(num_dims) 141 | for (int d=0;dlower,d); 143 | scalar_t &hi = point_traits::get_coord(d_bounds->upper,d); 144 | scalar_t f = point_traits::get_coord(point,d); 145 | atomicMin(&lo,f); 146 | atomicMax(&hi,f); 147 | } 148 | } 149 | 150 | /*! 
host-side helper function to compute bounding box of the data set */ 151 | template 152 | void computeBounds(cukd::box_t *d_bounds, 153 | const data_t *d_points, 154 | int numPoints, 155 | cudaStream_t s) 156 | { 157 | computeBounds_copyFirst 158 | <<<1,1,0,s>>> 159 | (d_bounds,d_points); 160 | computeBounds_atomicGrow 161 | <<>> 162 | (d_bounds,d_points,numPoints); 163 | } 164 | 165 | /*! host-side helper function to compute bounding box of the data set */ 166 | template 167 | void host_computeBounds(cukd::box_t *d_bounds, 168 | const data_t *d_points, 169 | int numPoints) 170 | { 171 | d_bounds->setEmpty(); 172 | for (int i=0;igrow(data_traits::get_point(d_points[i])); 174 | } 175 | 176 | 177 | /*! helper function that finds, for a given node in the tree, the 178 | bounding box of that subtree's domain; by walking _up_ the tree 179 | and applying all clipping planes to the world-space bounding 180 | box */ 181 | template 182 | inline __both__ 183 | cukd::box_t 184 | findBounds(int subtree, 185 | const cukd::box_t *d_bounds, 186 | data_t *d_nodes) 187 | { 188 | using point_t = typename data_traits::point_t; 189 | using point_traits = ::cukd::point_traits; 190 | using scalar_t = typename point_traits::scalar_t; 191 | enum { num_dims = point_traits::num_dims }; 192 | 193 | cukd::box_t bounds = *d_bounds; 194 | int curr = subtree; 195 | while (curr > 0) { 196 | const int parent = (curr+1)/2-1; 197 | const data_t &parent_node = d_nodes[parent]; 198 | const int parent_dim 199 | = if_has_dims 200 | ::get_dim(parent_node,/* if not: */BinaryTree::levelOf(parent) % num_dims); 201 | const scalar_t parent_split_pos 202 | = data_traits::get_coord(parent_node,parent_dim); 203 | 204 | if (curr & 1) { 205 | // curr is left child, set upper 206 | point_traits::set_coord(bounds.upper,parent_dim, 207 | min(parent_split_pos, 208 | get_coord(bounds.upper,parent_dim))); 209 | } else { 210 | // curr is right child, set lower 211 | point_traits::set_coord(bounds.lower,parent_dim, 212 | 
max(parent_split_pos, 213 | get_coord(bounds.lower,parent_dim))); 214 | } 215 | curr = parent; 216 | } 217 | 218 | return bounds; 219 | } 220 | 221 | 222 | } 223 | -------------------------------------------------------------------------------- /cukd/builder_host.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2019-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #pragma once 18 | 19 | #include "cukd/builder_thrust.h" 20 | 21 | // buildTree_host is currently based on the thrust builder, and 22 | // implemented as part of builder_thrust.h 23 | -------------------------------------------------------------------------------- /cukd/common.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. 
// 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | /* copied from OWL project, and put into new namespace to avoid naming conflicts.*/ 18 | 19 | #pragma once 20 | 21 | #ifndef _USE_MATH_DEFINES 22 | # define _USE_MATH_DEFINES 23 | #endif 24 | #include // using cmath causes issues under Windows 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #ifdef __GNUC__ 39 | #include 40 | #include 41 | #endif 42 | #include 43 | #include 44 | 45 | #ifdef _WIN32 46 | #ifndef WIN32_LEAN_AND_MEAN 47 | #define WIN32_LEAN_AND_MEAN 48 | #endif 49 | #include 50 | #ifdef min 51 | #undef min 52 | #endif 53 | #ifdef max 54 | #undef max 55 | #endif 56 | #endif 57 | 58 | #if !defined(WIN32) 59 | #include 60 | #endif 61 | 62 | #if defined(_MSC_VER) 63 | # define CUKD_DLL_EXPORT __declspec(dllexport) 64 | # define CUKD_DLL_IMPORT __declspec(dllimport) 65 | #elif defined(__clang__) || defined(__GNUC__) 66 | # define CUKD_DLL_EXPORT __attribute__((visibility("default"))) 67 | # define CUKD_DLL_IMPORT __attribute__((visibility("default"))) 68 | #else 69 | # define CUKD_DLL_EXPORT 70 | # define CUKD_DLL_IMPORT 71 | #endif 72 | 73 | # define CUKD_INTERFACE /* nothing - currently not building any special 'owl.dll' */ 74 | #if defined(_MSC_VER) 75 | # define __PRETTY_FUNCTION__ __FUNCTION__ 76 | #endif 77 | 78 | 79 | #ifndef PRINT 80 | # define 
PRINT(var) std::cout << #var << "=" << var << std::endl; 81 | #ifdef __WIN32__ 82 | # define PING std::cout << __FILE__ << "::" << __LINE__ << ": " << __FUNCTION__ << std::endl; 83 | #else 84 | # define PING std::cout << __FILE__ << "::" << __LINE__ << ": " << __PRETTY_FUNCTION__ << std::endl; 85 | #endif 86 | #endif 87 | 88 | #if defined(__CUDA_ARCH__) 89 | # define __owl_device __device__ 90 | # define __owl_host __host__ 91 | #else 92 | # define __owl_device /* ignore */ 93 | # define __owl_host /* ignore */ 94 | #endif 95 | 96 | # define __both__ __owl_host __owl_device 97 | 98 | 99 | #ifdef __GNUC__ 100 | #define MAYBE_UNUSED __attribute__((unused)) 101 | #else 102 | #define MAYBE_UNUSED 103 | #endif 104 | 105 | #define CUKD_NOTIMPLEMENTED throw std::runtime_error(std::string(__PRETTY_FUNCTION__)+" not implemented") 106 | 107 | #ifdef WIN32 108 | # define CUKD_TERMINAL_RED "" 109 | # define CUKD_TERMINAL_GREEN "" 110 | # define CUKD_TERMINAL_LIGHT_GREEN "" 111 | # define CUKD_TERMINAL_YELLOW "" 112 | # define CUKD_TERMINAL_BLUE "" 113 | # define CUKD_TERMINAL_LIGHT_BLUE "" 114 | # define CUKD_TERMINAL_RESET "" 115 | # define CUKD_TERMINAL_DEFAULT CUKD_TERMINAL_RESET 116 | # define CUKD_TERMINAL_BOLD "" 117 | 118 | # define CUKD_TERMINAL_MAGENTA "" 119 | # define CUKD_TERMINAL_LIGHT_MAGENTA "" 120 | # define CUKD_TERMINAL_CYAN "" 121 | # define CUKD_TERMINAL_LIGHT_RED "" 122 | #else 123 | # define CUKD_TERMINAL_RED "\033[0;31m" 124 | # define CUKD_TERMINAL_GREEN "\033[0;32m" 125 | # define CUKD_TERMINAL_LIGHT_GREEN "\033[1;32m" 126 | # define CUKD_TERMINAL_YELLOW "\033[1;33m" 127 | # define CUKD_TERMINAL_BLUE "\033[0;34m" 128 | # define CUKD_TERMINAL_LIGHT_BLUE "\033[1;34m" 129 | # define CUKD_TERMINAL_RESET "\033[0m" 130 | # define CUKD_TERMINAL_DEFAULT CUKD_TERMINAL_RESET 131 | # define CUKD_TERMINAL_BOLD "\033[1;1m" 132 | 133 | # define CUKD_TERMINAL_MAGENTA "\e[35m" 134 | # define CUKD_TERMINAL_LIGHT_MAGENTA "\e[95m" 135 | # define CUKD_TERMINAL_CYAN 
"\e[36m" 136 | # define CUKD_TERMINAL_LIGHT_RED "\033[1;31m" 137 | #endif 138 | 139 | #ifdef _MSC_VER 140 | # define CUKD_ALIGN(alignment) __declspec(align(alignment)) 141 | #else 142 | # define CUKD_ALIGN(alignment) __attribute__((aligned(alignment))) 143 | #endif 144 | 145 | 146 | 147 | namespace cukd { 148 | namespace common { 149 | 150 | #ifdef __WIN32__ 151 | # define osp_snprintf sprintf_s 152 | #else 153 | # define osp_snprintf snprintf 154 | #endif 155 | 156 | /*! added pretty-print function for large numbers, printing 10000000 as "10M" instead */ 157 | inline std::string prettyDouble(const double val) { 158 | const double absVal = abs(val); 159 | char result[1000]; 160 | 161 | if (absVal >= 1e+18f) osp_snprintf(result,1000,"%.1f%c",float(val/1e18f),'E'); 162 | else if (absVal >= 1e+15f) osp_snprintf(result,1000,"%.1f%c",float(val/1e15f),'P'); 163 | else if (absVal >= 1e+12f) osp_snprintf(result,1000,"%.1f%c",float(val/1e12f),'T'); 164 | else if (absVal >= 1e+09f) osp_snprintf(result,1000,"%.1f%c",float(val/1e09f),'G'); 165 | else if (absVal >= 1e+06f) osp_snprintf(result,1000,"%.1f%c",float(val/1e06f),'M'); 166 | else if (absVal >= 1e+03f) osp_snprintf(result,1000,"%.1f%c",float(val/1e03f),'k'); 167 | else if (absVal <= 1e-12f) osp_snprintf(result,1000,"%.1f%c",float(val*1e15f),'f'); 168 | else if (absVal <= 1e-09f) osp_snprintf(result,1000,"%.1f%c",float(val*1e12f),'p'); 169 | else if (absVal <= 1e-06f) osp_snprintf(result,1000,"%.1f%c",float(val*1e09f),'n'); 170 | else if (absVal <= 1e-03f) osp_snprintf(result,1000,"%.1f%c",float(val*1e06f),'u'); 171 | else if (absVal <= 1e-00f) osp_snprintf(result,1000,"%.1f%c",float(val*1e03f),'m'); 172 | else osp_snprintf(result,1000,"%f",(float)val); 173 | 174 | return result; 175 | } 176 | 177 | 178 | /*! return a nicely formatted number as in "3.4M" instead of 179 | "3400000", etc, using mulitples of thousands (K), millions 180 | (M), etc. 
Ie, the value 64000 would be returned as 64K, and 181 | 65536 would be 65.5K */ 182 | inline std::string prettyNumber(const size_t s) 183 | { 184 | char buf[1000]; 185 | if (s >= (1000LL*1000LL*1000LL*1000LL)) { 186 | osp_snprintf(buf, 1000,"%.2fT",s/(1000.f*1000.f*1000.f*1000.f)); 187 | } else if (s >= (1000LL*1000LL*1000LL)) { 188 | osp_snprintf(buf, 1000, "%.2fG",s/(1000.f*1000.f*1000.f)); 189 | } else if (s >= (1000LL*1000LL)) { 190 | osp_snprintf(buf, 1000, "%.2fM",s/(1000.f*1000.f)); 191 | } else if (s >= (1000LL)) { 192 | osp_snprintf(buf, 1000, "%.2fK",s/(1000.f)); 193 | } else { 194 | osp_snprintf(buf,1000,"%zi",s); 195 | } 196 | return buf; 197 | } 198 | 199 | /*! return a nicely formatted number as in "3.4M" instead of 200 | "3400000", etc, using mulitples of 1024 as in kilobytes, 201 | etc. Ie, the value 65534 would be 64K, 64000 would be 63.8K */ 202 | inline std::string prettyBytes(const size_t s) 203 | { 204 | char buf[1000]; 205 | if (s >= (1024LL*1024LL*1024LL*1024LL)) { 206 | osp_snprintf(buf, 1000,"%.2fT",s/(1024.f*1024.f*1024.f*1024.f)); 207 | } else if (s >= (1024LL*1024LL*1024LL)) { 208 | osp_snprintf(buf, 1000, "%.2fG",s/(1024.f*1024.f*1024.f)); 209 | } else if (s >= (1024LL*1024LL)) { 210 | osp_snprintf(buf, 1000, "%.2fM",s/(1024.f*1024.f)); 211 | } else if (s >= (1024LL)) { 212 | osp_snprintf(buf, 1000, "%.2fK",s/(1024.f)); 213 | } else { 214 | osp_snprintf(buf,1000,"%zi",s); 215 | } 216 | return buf; 217 | } 218 | 219 | inline double getCurrentTime() 220 | { 221 | #ifdef _WIN32 222 | SYSTEMTIME tp; GetSystemTime(&tp); 223 | /* 224 | Please note: we are not handling the "leap year" issue. 
225 | */ 226 | size_t numSecsSince2020 227 | = tp.wSecond 228 | + (60ull) * tp.wMinute 229 | + (60ull * 60ull) * tp.wHour 230 | + (60ull * 60ul * 24ull) * tp.wDay 231 | + (60ull * 60ul * 24ull * 365ull) * (tp.wYear - 2020); 232 | return double(numSecsSince2020 + tp.wMilliseconds * 1e-3); 233 | #else 234 | struct timeval tp; gettimeofday(&tp,nullptr); 235 | return double(tp.tv_sec) + double(tp.tv_usec)/1E6; 236 | #endif 237 | } 238 | 239 | inline bool hasSuffix(const std::string &s, const std::string &suffix) 240 | { 241 | return s.substr(s.size()-suffix.size()) == suffix; 242 | } 243 | } // ::common 244 | 245 | template 246 | inline T *loadPoints(std::string fileName, size_t &count) 247 | { 248 | // size_t count; 249 | std::cout << "loading points from " << fileName << std::endl; 250 | std::ifstream in(fileName,std::ios::binary); 251 | in.read((char*)&count,sizeof(count)); 252 | // numPoints = count; 253 | std::cout << "loading " << count << " points" << std::endl; 254 | T *d_points = 0; 255 | cudaMallocManaged((void**)&d_points,count*sizeof(T)); 256 | in.read((char*)d_points,count*sizeof(T)); 257 | return d_points; 258 | } 259 | 260 | template 261 | inline T *loadPoints(std::string fileName, int &count) 262 | { 263 | size_t count64; 264 | T *t = loadPoints(fileName, count64); 265 | count = (int)count64; 266 | return t; 267 | } 268 | 269 | // template 270 | // inline __device__ scalar_t clamp(scalar_t v, scalar_t lo, scalar_t hi) 271 | // { return min(max(v,lo),hi); } 272 | 273 | } // ::cukd 274 | 275 | 276 | #define CUKD_CUDA_CHECK( call ) \ 277 | { \ 278 | cudaError_t rc = call; \ 279 | if (rc != cudaSuccess) { \ 280 | fprintf(stderr, \ 281 | "CUDA call (%s) failed with code %d (line %d): %s\n", \ 282 | #call, rc, __LINE__, cudaGetErrorString(rc)); \ 283 | throw std::runtime_error("fatal cuda error"); \ 284 | } \ 285 | } 286 | 287 | #define CUKD_CUDA_CALL(call) CUKD_CUDA_CHECK(cuda##call) 288 | 289 | #define CUKD_CUDA_CHECK2( where, call ) \ 290 | { \ 291 | 
cudaError_t rc = call; \ 292 | if(rc != cudaSuccess) { \ 293 | if (where) \ 294 | fprintf(stderr, "at %s: CUDA call (%s) " \ 295 | "failed with code %d (line %d): %s\n", \ 296 | where,#call, rc, __LINE__, cudaGetErrorString(rc)); \ 297 | fprintf(stderr, \ 298 | "CUDA call (%s) failed with code %d (line %d): %s\n", \ 299 | #call, rc, __LINE__, cudaGetErrorString(rc)); \ 300 | throw std::runtime_error("fatal cuda error"); \ 301 | } \ 302 | } 303 | 304 | #define CUKD_CUDA_SYNC_CHECK() \ 305 | { \ 306 | cudaError_t rc = cudaDeviceSynchronize(); \ 307 | if (rc != cudaSuccess) { \ 308 | fprintf(stderr, "error (%s: line %d): %s\n", \ 309 | __FILE__, __LINE__, cudaGetErrorString(rc)); \ 310 | throw std::runtime_error("fatal cuda error"); \ 311 | } \ 312 | } 313 | 314 | 315 | 316 | #define CUKD_CUDA_CHECK_NOTHROW( call ) \ 317 | { \ 318 | cudaError_t rc = call; \ 319 | if (rc != cudaSuccess) { \ 320 | fprintf(stderr, \ 321 | "CUDA call (%s) failed with code %d (line %d): %s\n", \ 322 | #call, rc, __LINE__, cudaGetErrorString(rc)); \ 323 | exit(2); \ 324 | } \ 325 | } 326 | 327 | #define CUKD_CUDA_CALL_NOTHROW(call) CUKD_CUDA_CHECK_NOTHROW(cuda##call) 328 | 329 | #define CUKD_CUDA_CHECK2_NOTHROW( where, call ) \ 330 | { \ 331 | cudaError_t rc = call; \ 332 | if(rc != cudaSuccess) { \ 333 | if (where) \ 334 | fprintf(stderr, "at %s: CUDA call (%s) " \ 335 | "failed with code %d (line %d): %s\n", \ 336 | where,#call, rc, __LINE__, cudaGetErrorString(rc)); \ 337 | fprintf(stderr, \ 338 | "CUDA call (%s) failed with code %d (line %d): %s\n", \ 339 | #call, rc, __LINE__, cudaGetErrorString(rc)); \ 340 | exit(2); \ 341 | } \ 342 | } 343 | 344 | 345 | /* is supplied externally (from cmake) this adds a "int *stats" 346 | paramater to all query functions, and makes the traversal routines 347 | do atomic counting of traversal steps */ 348 | #if defined(CUKD_ENABLE_STATS) && defined(__CUDA_ARCH__) 349 | # define CUKD_STATS(a) a 350 | # define CUKD_STATS_ARG(a,b) a, 351 | #else 352 | # 
define CUKD_STATS(a) /* nothing */ 353 | # define CUKD_STATS_ARG(a,b) /* nothing */ 354 | #endif 355 | 356 | #if CUKD_ENABLE_STATS 357 | namespace cukd { 358 | __constant__ __device__ unsigned long long *g_traversalStats; 359 | } 360 | #endif 361 | 362 | 363 | -------------------------------------------------------------------------------- /cukd/cubit/common.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2022 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. 
// 15 | // ======================================================================== // 16 | 17 | /* copied from OWL project, and put into new namespace to avoid naming conflicts.*/ 18 | 19 | #pragma once 20 | 21 | #ifndef _USE_MATH_DEFINES 22 | # define _USE_MATH_DEFINES 23 | #endif 24 | #include // using cmath causes issues under Windows 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #ifdef __GNUC__ 37 | #include 38 | #include 39 | #endif 40 | 41 | #ifdef _WIN32 42 | #ifndef WIN32_LEAN_AND_MEAN 43 | #define WIN32_LEAN_AND_MEAN 44 | #endif 45 | #include 46 | #ifdef min 47 | #undef min 48 | #endif 49 | #ifdef max 50 | #undef max 51 | #endif 52 | #endif 53 | 54 | #if !defined(WIN32) 55 | #include 56 | #endif 57 | 58 | #if defined(_MSC_VER) 59 | # define CUBIT_DLL_EXPORT __declspec(dllexport) 60 | # define CUBIT_DLL_IMPORT __declspec(dllimport) 61 | #elif defined(__clang__) || defined(__GNUC__) 62 | # define CUBIT_DLL_EXPORT __attribute__((visibility("default"))) 63 | # define CUBIT_DLL_IMPORT __attribute__((visibility("default"))) 64 | #else 65 | # define CUBIT_DLL_EXPORT 66 | # define CUBIT_DLL_IMPORT 67 | #endif 68 | 69 | # define CUBIT_INTERFACE /* nothing - currently not building any special 'owl.dll' */ 70 | #if defined(_MSC_VER) 71 | # define __PRETTY_FUNCTION__ __FUNCTION__ 72 | #endif 73 | 74 | 75 | #ifndef PRINT 76 | # define PRINT(var) std::cout << #var << "=" << var << std::endl; 77 | #ifdef __WIN32__ 78 | # define PING std::cout << __FILE__ << "::" << __LINE__ << ": " << __FUNCTION__ << std::endl; 79 | #else 80 | # define PING std::cout << __FILE__ << "::" << __LINE__ << ": " << __PRETTY_FUNCTION__ << std::endl; 81 | #endif 82 | #endif 83 | 84 | #if defined(__CUDA_ARCH__) 85 | # define __owl_device __device__ 86 | # define __owl_host __host__ 87 | #else 88 | # define __owl_device /* ignore */ 89 | # define __owl_host /* ignore */ 90 | #endif 91 | 92 | # 
define __both__ __owl_host __owl_device 93 | 94 | 95 | #ifdef __GNUC__ 96 | #define MAYBE_UNUSED __attribute__((unused)) 97 | #else 98 | #define MAYBE_UNUSED 99 | #endif 100 | 101 | #define CUBIT_NOTIMPLEMENTED throw std::runtime_error(std::string(__PRETTY_FUNCTION__)+" not implemented") 102 | 103 | #ifdef WIN32 104 | # define CUBIT_TERMINAL_RED "" 105 | # define CUBIT_TERMINAL_GREEN "" 106 | # define CUBIT_TERMINAL_LIGHT_GREEN "" 107 | # define CUBIT_TERMINAL_YELLOW "" 108 | # define CUBIT_TERMINAL_BLUE "" 109 | # define CUBIT_TERMINAL_LIGHT_BLUE "" 110 | # define CUBIT_TERMINAL_RESET "" 111 | # define CUBIT_TERMINAL_DEFAULT CUBIT_TERMINAL_RESET 112 | # define CUBIT_TERMINAL_BOLD "" 113 | 114 | # define CUBIT_TERMINAL_MAGENTA "" 115 | # define CUBIT_TERMINAL_LIGHT_MAGENTA "" 116 | # define CUBIT_TERMINAL_CYAN "" 117 | # define CUBIT_TERMINAL_LIGHT_RED "" 118 | #else 119 | # define CUBIT_TERMINAL_RED "\033[0;31m" 120 | # define CUBIT_TERMINAL_GREEN "\033[0;32m" 121 | # define CUBIT_TERMINAL_LIGHT_GREEN "\033[1;32m" 122 | # define CUBIT_TERMINAL_YELLOW "\033[1;33m" 123 | # define CUBIT_TERMINAL_BLUE "\033[0;34m" 124 | # define CUBIT_TERMINAL_LIGHT_BLUE "\033[1;34m" 125 | # define CUBIT_TERMINAL_RESET "\033[0m" 126 | # define CUBIT_TERMINAL_DEFAULT CUBIT_TERMINAL_RESET 127 | # define CUBIT_TERMINAL_BOLD "\033[1;1m" 128 | 129 | # define CUBIT_TERMINAL_MAGENTA "\e[35m" 130 | # define CUBIT_TERMINAL_LIGHT_MAGENTA "\e[95m" 131 | # define CUBIT_TERMINAL_CYAN "\e[36m" 132 | # define CUBIT_TERMINAL_LIGHT_RED "\033[1;31m" 133 | #endif 134 | 135 | #ifdef _MSC_VER 136 | # define __cubit_align(alignment) __declspec(align(alignment)) 137 | #else 138 | # define __cubit_align(alignment) __attribute__((aligned(alignment))) 139 | #endif 140 | 141 | 142 | 143 | namespace cubit { 144 | namespace common { 145 | 146 | #ifdef __CUDA_ARCH__ 147 | using ::min; 148 | using ::max; 149 | using std::abs; 150 | #else 151 | using std::min; 152 | using std::max; 153 | using std::abs; 154 | #endif 
155 | 156 | inline __both__ int32_t divRoundUp(int32_t a, int32_t b) { return (a+b-1)/b; } 157 | inline __both__ uint32_t divRoundUp(uint32_t a, uint32_t b) { return (a+b-1)/b; } 158 | inline __both__ int64_t divRoundUp(int64_t a, int64_t b) { return (a+b-1)/b; } 159 | inline __both__ uint64_t divRoundUp(uint64_t a, uint64_t b) { return (a+b-1)/b; } 160 | 161 | using ::sin; // this is the double version 162 | using ::cos; // this is the double version 163 | 164 | #ifdef __WIN32__ 165 | # define osp_snprintf sprintf_s 166 | #else 167 | # define osp_snprintf snprintf 168 | #endif 169 | 170 | /*! added pretty-print function for large numbers, printing 10000000 as "10M" instead */ 171 | inline std::string prettyDouble(const double val) { 172 | const double absVal = abs(val); 173 | char result[1000]; 174 | 175 | if (absVal >= 1e+18f) osp_snprintf(result,1000,"%.1f%c",float(val/1e18f),'E'); 176 | else if (absVal >= 1e+15f) osp_snprintf(result,1000,"%.1f%c",float(val/1e15f),'P'); 177 | else if (absVal >= 1e+12f) osp_snprintf(result,1000,"%.1f%c",float(val/1e12f),'T'); 178 | else if (absVal >= 1e+09f) osp_snprintf(result,1000,"%.1f%c",float(val/1e09f),'G'); 179 | else if (absVal >= 1e+06f) osp_snprintf(result,1000,"%.1f%c",float(val/1e06f),'M'); 180 | else if (absVal >= 1e+03f) osp_snprintf(result,1000,"%.1f%c",float(val/1e03f),'k'); 181 | else if (absVal <= 1e-12f) osp_snprintf(result,1000,"%.1f%c",float(val*1e15f),'f'); 182 | else if (absVal <= 1e-09f) osp_snprintf(result,1000,"%.1f%c",float(val*1e12f),'p'); 183 | else if (absVal <= 1e-06f) osp_snprintf(result,1000,"%.1f%c",float(val*1e09f),'n'); 184 | else if (absVal <= 1e-03f) osp_snprintf(result,1000,"%.1f%c",float(val*1e06f),'u'); 185 | else if (absVal <= 1e-00f) osp_snprintf(result,1000,"%.1f%c",float(val*1e03f),'m'); 186 | else osp_snprintf(result,1000,"%f",(float)val); 187 | 188 | return result; 189 | } 190 | 191 | 192 | /*! 
return a nicely formatted number as in "3.4M" instead of 193 | "3400000", etc, using mulitples of thousands (K), millions 194 | (M), etc. Ie, the value 64000 would be returned as 64K, and 195 | 65536 would be 65.5K */ 196 | inline std::string prettyNumber(const size_t s) 197 | { 198 | char buf[1000]; 199 | if (s >= (1000LL*1000LL*1000LL*1000LL)) { 200 | osp_snprintf(buf, 1000,"%.2fT",s/(1000.f*1000.f*1000.f*1000.f)); 201 | } else if (s >= (1000LL*1000LL*1000LL)) { 202 | osp_snprintf(buf, 1000, "%.2fG",s/(1000.f*1000.f*1000.f)); 203 | } else if (s >= (1000LL*1000LL)) { 204 | osp_snprintf(buf, 1000, "%.2fM",s/(1000.f*1000.f)); 205 | } else if (s >= (1000LL)) { 206 | osp_snprintf(buf, 1000, "%.2fK",s/(1000.f)); 207 | } else { 208 | osp_snprintf(buf,1000,"%zi",s); 209 | } 210 | return buf; 211 | } 212 | 213 | /*! return a nicely formatted number as in "3.4M" instead of 214 | "3400000", etc, using mulitples of 1024 as in kilobytes, 215 | etc. Ie, the value 65534 would be 64K, 64000 would be 63.8K */ 216 | inline std::string prettyBytes(const size_t s) 217 | { 218 | char buf[1000]; 219 | if (s >= (1024LL*1024LL*1024LL*1024LL)) { 220 | osp_snprintf(buf, 1000,"%.2fT",s/(1024.f*1024.f*1024.f*1024.f)); 221 | } else if (s >= (1024LL*1024LL*1024LL)) { 222 | osp_snprintf(buf, 1000, "%.2fG",s/(1024.f*1024.f*1024.f)); 223 | } else if (s >= (1024LL*1024LL)) { 224 | osp_snprintf(buf, 1000, "%.2fM",s/(1024.f*1024.f)); 225 | } else if (s >= (1024LL)) { 226 | osp_snprintf(buf, 1000, "%.2fK",s/(1024.f)); 227 | } else { 228 | osp_snprintf(buf,1000,"%zi",s); 229 | } 230 | return buf; 231 | } 232 | 233 | inline double getCurrentTime() 234 | { 235 | #ifdef _WIN32 236 | SYSTEMTIME tp; GetSystemTime(&tp); 237 | /* 238 | Please note: we are not handling the "leap year" issue. 
239 | */ 240 | size_t numSecsSince2020 241 | = tp.wSecond 242 | + (60ull) * tp.wMinute 243 | + (60ull * 60ull) * tp.wHour 244 | + (60ull * 60ul * 24ull) * tp.wDay 245 | + (60ull * 60ul * 24ull * 365ull) * (tp.wYear - 2020); 246 | return double(numSecsSince2020 + tp.wMilliseconds * 1e-3); 247 | #else 248 | struct timeval tp; gettimeofday(&tp,nullptr); 249 | return double(tp.tv_sec) + double(tp.tv_usec)/1E6; 250 | #endif 251 | } 252 | 253 | inline bool hasSuffix(const std::string &s, const std::string &suffix) 254 | { 255 | return s.substr(s.size()-suffix.size()) == suffix; 256 | } 257 | 258 | } // ::cubit::common 259 | 260 | 261 | #ifndef CUBIT_CUDA_CHECK 262 | #define CUBIT_CUDA_CHECK( call ) \ 263 | { \ 264 | cudaError_t rc = call; \ 265 | if (rc != cudaSuccess) { \ 266 | fprintf(stderr, \ 267 | "CUDA call (%s) failed with code %d (line %d): %s\n", \ 268 | #call, rc, __LINE__, cudaGetErrorString(rc)); \ 269 | throw("fatal cuda error"); \ 270 | } \ 271 | } 272 | 273 | #define CUBIT_CUDA_CALL(call) CUBIT_CUDA_CHECK(cuda##call) 274 | 275 | #define CUBIT_CUDA_CHECK2( where, call ) \ 276 | { \ 277 | cudaError_t rc = call; \ 278 | if(rc != cudaSuccess) { \ 279 | if (where) \ 280 | fprintf(stderr, "at %s: CUDA call (%s) " \ 281 | "failed with code %d (line %d): %s\n", \ 282 | where,#call, rc, __LINE__, cudaGetErrorString(rc)); \ 283 | fprintf(stderr, \ 284 | "CUDA call (%s) failed with code %d (line %d): %s\n", \ 285 | #call, rc, __LINE__, cudaGetErrorString(rc)); \ 286 | throw("fatal cuda error"); \ 287 | } \ 288 | } 289 | 290 | #define CUBIT_CUDA_SYNC_CHECK() \ 291 | { \ 292 | cudaError_t rc = cudaDeviceSynchronize(); \ 293 | if (rc != cudaSuccess) { \ 294 | fprintf(stderr, "error (%s: line %d): %s\n", \ 295 | __FILE__, __LINE__, cudaGetErrorString(rc)); \ 296 | throw("fatal cuda error"); \ 297 | } \ 298 | } 299 | 300 | #define CUBIT_CUDA_SYNC_CHECK_STREAM(s) \ 301 | { \ 302 | cudaStreamSynchronize(s); \ 303 | cudaError_t rc = cudaGetLastError(); \ 304 | if (rc != 
cudaSuccess) { \ 305 | fprintf(stderr, "error (%s: line %d): %s\n", \ 306 | __FILE__, __LINE__, cudaGetErrorString(rc)); \ 307 | throw("fatal cuda error"); \ 308 | } \ 309 | } 310 | 311 | 312 | 313 | #define CUBIT_CUDA_CHECK_NOTHROW( call ) \ 314 | { \ 315 | cudaError_t rc = call; \ 316 | if (rc != cudaSuccess) { \ 317 | fprintf(stderr, \ 318 | "CUDA call (%s) failed with code %d (line %d): %s\n", \ 319 | #call, rc, __LINE__, cudaGetErrorString(rc)); \ 320 | exit(2); \ 321 | } \ 322 | } 323 | 324 | #define CUBIT_CUDA_CALL_NOTHROW(call) CUBIT_CUDA_CHECK_NOTHROW(cuda##call) 325 | 326 | #define CUBIT_CUDA_CHECK2_NOTHROW( where, call ) \ 327 | { \ 328 | cudaError_t rc = call; \ 329 | if(rc != cudaSuccess) { \ 330 | if (where) \ 331 | fprintf(stderr, "at %s: CUDA call (%s) " \ 332 | "failed with code %d (line %d): %s\n", \ 333 | where,#call, rc, __LINE__, cudaGetErrorString(rc)); \ 334 | fprintf(stderr, \ 335 | "CUDA call (%s) failed with code %d (line %d): %s\n", \ 336 | #call, rc, __LINE__, cudaGetErrorString(rc)); \ 337 | exit(2); \ 338 | } \ 339 | } 340 | #endif 341 | 342 | } // ::cubit 343 | -------------------------------------------------------------------------------- /cukd/cukd-math.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2024 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | /* copied from OWL project, and put into new namespace to avoid naming conflicts.*/ 18 | 19 | #pragma once 20 | 21 | #include "cukd/common.h" 22 | 23 | namespace cukd { 24 | 25 | #ifdef __CUDA_ARCH__ 26 | using ::min; 27 | using ::max; 28 | using std::abs; 29 | #else 30 | using std::min; 31 | using std::max; 32 | using std::abs; 33 | #endif 34 | 35 | // ================================================================== 36 | // default operators on cuda vector types: 37 | // ================================================================== 38 | 39 | /*! template interface for cuda vector types (such as float3, int4, 40 | etc), that allows for querying which scalar type this vec is 41 | defined over */ 42 | template struct scalar_type_of; 43 | template<> struct scalar_type_of { using type = float; }; 44 | template<> struct scalar_type_of { using type = float; }; 45 | template<> struct scalar_type_of { using type = float; }; 46 | template<> struct scalar_type_of { using type = int; }; 47 | template<> struct scalar_type_of { using type = int; }; 48 | template<> struct scalar_type_of { using type = int; }; 49 | 50 | /*! 
template interface for cuda vector types (such as float3, int4, 51 | etc), that allows for querying which scalar type this vec is 52 | defined over */ 53 | template struct num_dims_of; 54 | template<> struct num_dims_of { enum { value = 2 }; }; 55 | template<> struct num_dims_of { enum { value = 3 }; }; 56 | template<> struct num_dims_of { enum { value = 4 }; }; 57 | template<> struct num_dims_of { enum { value = 2 }; }; 58 | template<> struct num_dims_of { enum { value = 3 }; }; 59 | template<> struct num_dims_of { enum { value = 4 }; }; 60 | 61 | inline __both__ float get_coord(const float2 &v, int d) { return d?v.y:v.x; } 62 | inline __both__ float get_coord(const float3 &v, int d) { return (d==2)?v.z:(d?v.y:v.x); } 63 | inline __both__ float get_coord(const float4 &v, int d) { return (d>=2)?(d>2?v.w:v.z):(d?v.y:v.x); } 64 | 65 | inline __both__ float &get_coord(float2 &v, int d) { return d?v.y:v.x; } 66 | inline __both__ float &get_coord(float3 &v, int d) { return (d==2)?v.z:(d?v.y:v.x); } 67 | inline __both__ float &get_coord(float4 &v, int d) { return (d>=2)?(d>2?v.w:v.z):(d?v.y:v.x); } 68 | 69 | 70 | inline __both__ int get_coord(const int2 &v, int d) { return d?v.y:v.x; } 71 | inline __both__ int get_coord(const int3 &v, int d) { return (d==2)?v.z:(d?v.y:v.x); } 72 | inline __both__ int get_coord(const int4 &v, int d) { return (d>=2)?(d>2?v.w:v.z):(d?v.y:v.x); } 73 | 74 | inline __both__ int &get_coord(int2 &v, int d) { return d?v.y:v.x; } 75 | inline __both__ int &get_coord(int3 &v, int d) { return (d==2)?v.z:(d?v.y:v.x); } 76 | inline __both__ int &get_coord(int4 &v, int d) { return (d>=2)?(d>2?v.w:v.z):(d?v.y:v.x); } 77 | 78 | 79 | inline __both__ void set_coord(int2 &v, int d, int vv) { (d?v.y:v.x) = vv; } 80 | inline __both__ void set_coord(int3 &v, int d, int vv) { ((d==2)?v.z:(d?v.y:v.x)) = vv; } 81 | inline __both__ void set_coord(int4 &v, int d, int vv) { ((d>=2)?(d>2?v.w:v.z):(d?v.y:v.x)) = vv; } 82 | 83 | inline __both__ void set_coord(float2 &v, 
int d, float vv) { (d?v.y:v.x) = vv; } 84 | inline __both__ void set_coord(float3 &v, int d, float vv) { ((d==2)?v.z:(d?v.y:v.x)) = vv; } 85 | inline __both__ void set_coord(float4 &v, int d, float vv) { ((d>=2)?(d>2?v.w:v.z):(d?v.y:v.x)) = vv; } 86 | 87 | inline __both__ int32_t divRoundUp(int32_t a, int32_t b) { return (a+b-1)/b; } 88 | inline __both__ uint32_t divRoundUp(uint32_t a, uint32_t b) { return (a+b-1)/b; } 89 | inline __both__ int64_t divRoundUp(int64_t a, int64_t b) { return (a+b-1)/b; } 90 | inline __both__ uint64_t divRoundUp(uint64_t a, uint64_t b) { return (a+b-1)/b; } 91 | 92 | using ::sin; // this is the double version 93 | using ::cos; // this is the double version 94 | 95 | // ================================================================== 96 | // default operators on cuda vector types: 97 | // ================================================================== 98 | 99 | 100 | inline __both__ float2 operator-(float2 a, float2 b) 101 | { return make_float2(a.x-b.x,a.y-b.y); } 102 | inline __both__ float3 operator-(float3 a, float3 b) 103 | { return make_float3(a.x-b.x,a.y-b.y,a.z-b.z); } 104 | inline __both__ float4 operator-(float4 a, float4 b) 105 | { return make_float4(a.x-b.x,a.y-b.y,a.z-b.z,a.w-b.w); } 106 | 107 | inline __both__ float dot(float2 a, float2 b) 108 | { return a.x*b.x+a.y*b.y; } 109 | inline __both__ float dot(float3 a, float3 b) 110 | { return a.x*b.x+a.y*b.y+a.z*b.z; } 111 | inline __both__ float dot(float4 a, float4 b) 112 | { return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w; } 113 | 114 | inline __both__ float2 min(float2 a, float2 b) 115 | { return make_float2(min(a.x,b.x),min(a.y,b.y)); } 116 | inline __both__ float3 min(float3 a, float3 b) 117 | { return make_float3(min(a.x,b.x),min(a.y,b.y),min(a.z,b.z)); } 118 | inline __both__ float4 min(float4 a, float4 b) 119 | { return make_float4(min(a.x,b.x),min(a.y,b.y),min(a.z,b.z),min(a.w,b.w)); } 120 | 121 | inline __both__ float2 max(float2 a, float2 b) 122 | { return 
make_float2(max(a.x,b.x),max(a.y,b.y)); } 123 | inline __both__ float3 max(float3 a, float3 b) 124 | { return make_float3(max(a.x,b.x),max(a.y,b.y),max(a.z,b.z)); } 125 | inline __both__ float4 max(float4 a, float4 b) 126 | { return make_float4(max(a.x,b.x),max(a.y,b.y),max(a.z,b.z),max(a.w,b.w)); } 127 | 128 | inline std::ostream &operator<<(std::ostream &o, float3 v) 129 | { o << "(" << v.x << "," << v.y << "," << v.z << ")"; return o; } 130 | 131 | 132 | // ================================================================== 133 | // for some tests: our own, arbitrary-dimensional vector type 134 | // ================================================================== 135 | template 136 | struct vec_float { 137 | float v[N]; 138 | }; 139 | template struct scalar_type_of> { using type = float; }; 140 | template struct num_dims_of> { enum { value = N }; }; 141 | 142 | template 143 | inline __both__ float get_coord(const vec_float &v, int d) { return v.v[d]; } 144 | template 145 | inline __both__ float &get_coord(vec_float &v, int d) { return v.v[d]; } 146 | template 147 | inline __both__ void set_coord(vec_float &v, int d, float vv) { v.v[d] = vv; } 148 | 149 | 150 | 151 | template 152 | inline __both__ vec_float min(vec_float a, vec_float b) 153 | { 154 | vec_float r; 155 | for (int i=0;i 160 | inline __both__ vec_float max(vec_float a, vec_float b) 161 | { 162 | vec_float r; 163 | for (int i=0;i 168 | inline __both__ float dot(vec_float a, vec_float b) 169 | { 170 | float sum = 0.f; 171 | for (int i=0;i 176 | inline __both__ vec_float operator-(const vec_float &a, const vec_float &b) 177 | { 178 | vec_float r; 179 | for (int i=0;i inline __both__ float as_float_rz(T t); 192 | template<> inline __both__ float as_float_rz(float f) { return f; } 193 | #ifdef __CUDA_ARCH__ 194 | template<> inline __device__ float as_float_rz(int i) { return __int2float_rz(i); } 195 | #endif 196 | 197 | /*! 
@] */ 198 | 199 | 200 | // ------------------------------------------------------------------ 201 | /*! float-accuracy (with round-to-zero mode) of distance between two point_t's */ 202 | template 203 | inline __both__ 204 | float fSqrDistance(const point_t &a, const point_t &b) 205 | { 206 | const point_t diff = b-a; 207 | return as_float_rz(dot(diff,diff)); 208 | } 209 | 210 | template 211 | inline __both__ 212 | auto sqrDistance(const point_t &a, const point_t &b) 213 | { const point_t d = a-b; return dot(d,d); } 214 | 215 | // ------------------------------------------------------------------ 216 | // scalar distance(point,point) 217 | // ------------------------------------------------------------------ 218 | 219 | inline __both__ float square_root(float f) { return sqrtf(f); } 220 | 221 | template 222 | inline __both__ auto distance(const point_t &a, const point_t &b) 223 | { return square_root(sqrDistance(a,b)); } 224 | 225 | // ------------------------------------------------------------------ 226 | template 227 | inline __both__ int arg_max(point_t p) 228 | { 229 | enum { num_dims = num_dims_of::value }; 230 | using scalar_t = typename scalar_type_of::type; 231 | int best_dim = 0; 232 | scalar_t best_val = get_coord(p,0); 233 | for (int i=1;i best_val) { 236 | best_val = f; 237 | best_dim = i; 238 | } 239 | } 240 | return best_dim; 241 | } 242 | 243 | // ------------------------------------------------------------------ 244 | inline std::ostream &operator<<(std::ostream &out, 245 | float2 v) 246 | { 247 | out << "(" << v.x << "," << v.y << ")"; 248 | return out; 249 | } 250 | 251 | template 252 | inline __host__ __device__ 253 | auto sqr(scalar_t f) { return f * f; } 254 | 255 | template 256 | inline __host__ __device__ 257 | scalar_t sqrt(scalar_t f); 258 | 259 | template<> inline __host__ __device__ 260 | float sqrt(float f) { return ::sqrtf(f); } 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | template 270 | inline __host__ __device__ 271 | auto 
sqrDistance(const typename point_traits_a::point_t& a, 272 | const typename point_traits_b::point_t& b) 273 | { 274 | typename point_traits_a::scalar_t res = 0; 275 | for(int i=0; i 283 | inline __host__ __device__ 284 | auto distance(const typename point_traits_a::point_t& a, 285 | const typename point_traits_b::point_t& b) 286 | { 287 | typename point_traits_a::scalar_t res = 0; 288 | for(int i=0; i struct point_traits; 299 | 300 | /*! point traits that describe our default point type of cuda float3, int3, float4, etc. 301 | 302 | The four basic things a point_traits has to do for a given type are: 303 | 304 | - define the scalar_t that this point is built over 305 | 306 | - define the enum num_dims of dimensions that this point has 307 | 308 | - define a static function `get_coord(const point_t, int d)` that 309 | returns the given point's d'th coordinate 310 | 311 | - define a static function `set_coord(point_t &, int d, scalar_t 312 | v)` that sets the given point's d'th coordinate to the given 313 | value 314 | */ 315 | template 316 | struct point_traits { 317 | enum { num_dims = num_dims_of::value }; 318 | using scalar_t = typename scalar_type_of::type; 319 | 320 | /*! 
get the d'th coordinate - for our default cuda types we use 321 | the ::cukd::get_coord helpers we have for those types */ 322 | static inline __both__ 323 | scalar_t get_coord(const cuda_t &v, int d) { return ::cukd::get_coord(v,d); } 324 | 325 | static inline __both__ 326 | scalar_t &get_coord(cuda_t &v, int d) { return ::cukd::get_coord(v,d); } 327 | 328 | static inline __both__ 329 | void set_coord(cuda_t &v, int d, scalar_t vv) 330 | { ::cukd::set_coord(v,d,vv); } 331 | }; 332 | 333 | 334 | 335 | 336 | 337 | } // ::cukd 338 | -------------------------------------------------------------------------------- /cukd/data.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2024 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | /*! \file cukd/data.h Describes (abstract) data types (that k-d trees 18 | can be built over), and data type traits that describe this data. 19 | */ 20 | 21 | #pragma once 22 | 23 | #include "cukd/cukd-math.h" 24 | #include "cukd/box.h" 25 | 26 | namespace cukd { 27 | 28 | 29 | /*! 
defines an abstract interface to what a 'data point' in a k-d 30 | tree is -- which is some sort of actual D-dimensional point of 31 | scalar coordinates, plus potentially some payload, and potentially 32 | a means of storing the split dimension). This needs to define the 33 | following: 34 | 35 | - data_traits::point_t: the actual point type that stores the 36 | coordinates of this data point 37 | 38 | - enum data_traits::has_explicit_dim : whether that node type has 39 | a field to store an explicit split dimension in each node. If not, 40 | the k-d tree builder and traverse _have_ to use round-robin for 41 | split distance; otherwise, it will always split the widest 42 | dimension. 43 | 44 | - enum data_traits::set_dim(data_t &, int) and 45 | data_traits::get_dim(const data_t &) to read and write dimensions. For 46 | data_t's that don't actually have any explicit split dimension 47 | these function may be dummies that don't do anything (they'll 48 | never get called in that case), but they have to be defined to 49 | make the compiler happy. 50 | 51 | The _default_ data point for this library is just the point_t 52 | itself: no payload, no means of storing any split dimension (ie, 53 | always doing round-robin dimensions), and the coordinates just 54 | stored as the point itself. 55 | */ 56 | template> 58 | struct default_data_traits { 59 | // ------------------------------------------------------------------ 60 | /* part I : describes the _types_ of d-dimensional point data that 61 | the tree will be built over */ 62 | // ------------------------------------------------------------------ 63 | using point_t = _point_t; 64 | using point_traits = _point_traits; 65 | 66 | // ------------------------------------------------------------------ 67 | /* part II : describes the type of _data_ (which can be more than 68 | just a point). 
*/ 69 | // ------------------------------------------------------------------ 70 | 71 | using data_t = _point_t; 72 | 73 | // ------------------------------------------------------------------ 74 | /* part III : how to extract a point or coordinate from an actual 75 | data struct */ 76 | // ------------------------------------------------------------------ 77 | private: 78 | // this doesn't _need_ to be defined in a data_traits, but makes some of 79 | // the below code cleaner to read 80 | using scalar_t = typename point_traits::scalar_t; 81 | public: 82 | /*! return a reference to the 'd'th positional coordinate of the 83 | given node - for the default simple 'data==point' case we can 84 | simply return a reference to the point itself */ 85 | static inline __both__ const point_t &get_point(const data_t &n) { return n; } 86 | 87 | /*! return the 'd'th positional coordinate of the given node */ 88 | static inline __both__ 89 | scalar_t get_coord(const data_t &n, int d) 90 | { return point_traits::get_coord(get_point(n),d); } 91 | 92 | // ------------------------------------------------------------------ 93 | /* part IV : whether the data has a way of storing a split 94 | dimension for non-round robin partitioning, and if so, how to 95 | store (for building) and read (for traversing) that split 96 | dimension in/from a node */ 97 | // ------------------------------------------------------------------ 98 | 99 | /* whether that node type has a field to store an explicit split 100 | dimension in each node. If not, the k-d tree builder and 101 | traverse _have_ to use round-robin for split dimension; 102 | otherwise, it will always split the widest dimension */ 103 | enum { has_explicit_dim = false }; 104 | 105 | /*! !{ just defining this for completeness, get/set_dim should never 106 | get called for this type because we have set has_explicit_dim 107 | to false. 
note traversal should ONLY ever call this 108 | function for data_t's that define has_explicit_dim to true */ 109 | static inline __host__ __device__ int get_dim(const data_t &) { return -1; } 110 | static inline __host__ __device__ void set_dim(data_t &, int) {} 111 | /*! @} */ 112 | }; 113 | 114 | } 115 | 116 | -------------------------------------------------------------------------------- /cukd/helpers.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2019-2024 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #pragma once 18 | 19 | #include "cukd/common.h" 20 | #include "cukd/cukd-math.h" 21 | 22 | namespace cukd { 23 | 24 | // ------------------------------------------------------------------ 25 | /*! defines a 'memory resource' that can be used for allocating gpu 26 | memory; this allows the user to switch between using 27 | cudaMallocAsync (where available) vs regular cudaMalloc (where 28 | not), or to use their own memory pool, to use managed memory, 29 | etc. All memory allocations done during construction will use 30 | the memory resource passed to the respective build function. 
*/ 31 | struct GpuMemoryResource { 32 | virtual cudaError_t malloc(void** ptr, size_t size, cudaStream_t s) = 0; 33 | virtual cudaError_t free(void* ptr, cudaStream_t s) = 0; 34 | }; 35 | 36 | struct ManagedMemMemoryResource : public GpuMemoryResource { 37 | cudaError_t malloc(void** ptr, size_t size, cudaStream_t s) override 38 | { 39 | cudaStreamSynchronize(s); 40 | return cudaMallocManaged(ptr,size); 41 | } 42 | cudaError_t free(void* ptr, cudaStream_t s) override 43 | { 44 | cudaStreamSynchronize(s); 45 | return cudaFree(ptr); 46 | } 47 | }; 48 | 49 | /* by default let's use cuda malloc async, which is much better and 50 | faster than regular malloc; but that's available on cuda 11, so 51 | let's add a fall back for older cuda's, too */ 52 | #if CUDART_VERSION >= 11020 53 | struct AsyncGpuMemoryResource final : GpuMemoryResource { 54 | cudaError_t malloc(void** ptr, size_t size, cudaStream_t s) override { 55 | return cudaMallocAsync(ptr, size, s); 56 | } 57 | cudaError_t free(void* ptr, cudaStream_t s) override { 58 | return cudaFreeAsync(ptr, s); 59 | } 60 | }; 61 | 62 | inline GpuMemoryResource &defaultGpuMemResource() { 63 | static AsyncGpuMemoryResource memResource; 64 | return memResource; 65 | } 66 | #else 67 | inline GpuMemoryResource &defaultGpuMemResource() { 68 | static ManagedMemMemoryResource memResource; 69 | return memResource; 70 | } 71 | #endif 72 | 73 | /*! 
helper functions for a generic, arbitrary-size binary tree - 74 | mostly to compute level of a given node in that tree, and child 75 | IDs, parent IDs, etc */ 76 | struct BinaryTree { 77 | inline static __host__ __device__ int rootNode() { return 0; } 78 | inline static __host__ __device__ int parentOf(int nodeID) { return (nodeID-1)/2; } 79 | inline static __host__ __device__ int isLeftSibling(int nodeID) { return (nodeID & 1); } 80 | inline static __host__ __device__ int leftChildOf (int nodeID) { return 2*nodeID+1; } 81 | inline static __host__ __device__ int rightChildOf(int nodeID) { return 2*nodeID+2; } 82 | inline static __host__ __device__ int firstNodeInLevel(int L) { return (1<= 0 117 | inline __host__ __device__ int numNodes() const { return (1<= numPoints) return 0; 248 | return ArbitraryBinaryTree(numPoints).numNodesInSubtree(leftChildRoot); 249 | } 250 | 251 | inline __host__ __device__ 252 | int sizeOfSegment(int n) const 253 | { return ArbitraryBinaryTree(numPoints).numNodesInSubtree(n); } 254 | 255 | 256 | const int numLevelsDone; 257 | const int numPoints; 258 | }; 259 | 260 | } 261 | 262 | -------------------------------------------------------------------------------- /cukd/kdtree.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2019-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #pragma once 18 | 19 | #include "builder.h" 20 | 21 | /*! if you are looking for a "struct KDTree" or the like: the 22 | _default_ kd-tree in cukd is one where the the tree is entirely 23 | _implicit_ in the order of the data points; i.e., there _is_ no 24 | separate dedicated data type for a k-d tree - it's simply an array 25 | of points (e.g., float3's, float2s, some type of Photons for 26 | photon-mapping, etc), and the builder will simply re-arrange those 27 | data points in the array */ 28 | 29 | 30 | -------------------------------------------------------------------------------- /cukd/traverse-cct.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2022-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. 
// 15 | // ======================================================================== // 16 | 17 | /* traversal with 'closest-corner-tracking' - somewhat better for some 18 | input distributions, by tracking the (N-dimensional) closest point 19 | in the given subtree's domain, rather than just always comparing 20 | only to the 1-dimensoinal plane */ 21 | #pragma once 22 | 23 | namespace cukd { 24 | 25 | template> 28 | inline __host__ __device__ 29 | void traverse_cct(result_t &result, 30 | typename data_traits::point_t queryPoint, 31 | const box_t d_bounds, 32 | const data_t *d_nodes, 33 | int numPoints) 34 | { 35 | using point_t = typename data_traits::point_t; 36 | using point_traits = ::cukd::point_traits; 37 | using scalar_t = typename point_traits::scalar_t; 38 | enum { num_dims = point_traits::num_dims }; 39 | 40 | scalar_t cullDist = result.initialCullDist2(); 41 | 42 | struct 43 | StackEntry { 44 | int nodeID; 45 | point_t closestCorner; 46 | }; 47 | /* can do at most 2**30 points... */ 48 | StackEntry stackBase[30]; 49 | StackEntry *stackPtr = stackBase; 50 | 51 | int nodeID = 0; 52 | point_t closestPointOnSubtreeBounds = project(d_bounds,queryPoint); 53 | if (sqrDistance(queryPoint,closestPointOnSubtreeBounds) > cullDist) 54 | return; 55 | 56 | while (true) { 57 | 58 | if (nodeID >= numPoints) { 59 | while (true) { 60 | if (stackPtr == stackBase) 61 | return; 62 | --stackPtr; 63 | closestPointOnSubtreeBounds = stackPtr->closestCorner; 64 | if (sqrDistance(closestPointOnSubtreeBounds,queryPoint) >= cullDist) 65 | continue; 66 | nodeID = stackPtr->nodeID; 67 | break; 68 | } 69 | } 70 | const auto &node = d_nodes[nodeID]; 71 | CUKD_STATS(if (cukd::g_traversalStats) ::atomicAdd(cukd::g_traversalStats,1)); 72 | const point_t nodePoint = data_traits::get_point(node); 73 | { 74 | const auto sqrDist = sqrDistance(nodePoint,queryPoint); 75 | cullDist = result.processCandidate(nodeID,sqrDist); 76 | } 77 | 78 | const int dim 79 | = data_traits::has_explicit_dim 80 | ? 
data_traits::get_dim(d_nodes[nodeID]) 81 | : (BinaryTree::levelOf(nodeID) % num_dims); 82 | const auto node_dim = get_coord(nodePoint,dim); 83 | const auto query_dim = get_coord(queryPoint,dim); 84 | const bool leftIsClose = query_dim < node_dim; 85 | const int lChild = 2*nodeID+1; 86 | const int rChild = lChild+1; 87 | 88 | auto farSideCorner = closestPointOnSubtreeBounds; 89 | const int farChild = leftIsClose?rChild:lChild; 90 | point_traits::set_coord(farSideCorner,dim,node_dim); 91 | if (farChild < numPoints && sqrDistance(farSideCorner,queryPoint) < cullDist) { 92 | stackPtr->closestCorner = farSideCorner; 93 | stackPtr->nodeID = farChild; 94 | stackPtr++; 95 | } 96 | 97 | nodeID = leftIsClose?lChild:rChild; 98 | } 99 | } 100 | 101 | 102 | } 103 | -------------------------------------------------------------------------------- /cukd/traverse-default-stack-based.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2022-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #pragma once 18 | 19 | #include "cukd/helpers.h" 20 | 21 | namespace cukd { 22 | 23 | /*! 
traverse k-d tree with default, stack-based (sb) traversal */ 24 | template> 27 | inline __host__ __device__ 28 | void traverse_default(result_t &result, 29 | typename data_traits::point_t queryPoint, 30 | const data_t *d_nodes, 31 | int numPoints) 32 | { 33 | using point_t = typename data_traits::point_t; 34 | using scalar_t = typename scalar_type_of::type; 35 | enum { num_dims = num_dims_of::value }; 36 | 37 | scalar_t cullDist = result.initialCullDist2(); 38 | 39 | bool dbg = 0; //threadIdx.x==0 && blockIdx.x == 0; 40 | 41 | if (dbg) printf("stackbased %f %f\n", 42 | get_coord(queryPoint,0), 43 | get_coord(queryPoint,1) 44 | ); 45 | 46 | 47 | /* can do at most 2**30 points... */ 48 | struct StackEntry { 49 | int nodeID; 50 | float sqrDist; 51 | }; 52 | StackEntry stackBase[30]; 53 | StackEntry *stackPtr = stackBase; 54 | 55 | /*! current node in the tree we're traversing */ 56 | int curr = 0; 57 | 58 | while (true) { 59 | while (curr < numPoints) { 60 | const int curr_dim 61 | = data_traits::has_explicit_dim 62 | ? 
data_traits::get_dim(d_nodes[curr]) 63 | : (BinaryTree::levelOf(curr) % num_dims); 64 | CUKD_STATS(if (cukd::g_traversalStats) ::atomicAdd(cukd::g_traversalStats,1)); 65 | const data_t &curr_node = d_nodes[curr]; 66 | const auto sqrDist = sqrDistance(data_traits::get_point(curr_node), 67 | queryPoint); 68 | if (dbg) printf("=== %i dim %i sqrDist %f\n",curr,curr_dim,sqrDist); 69 | 70 | cullDist = result.processCandidate(curr,sqrDist); 71 | if (dbg) 72 | printf("node %i pt %f %f sqrDist %f cullDist %f\n", 73 | curr, 74 | get_coord(data_traits::get_point(curr_node),0), 75 | get_coord(data_traits::get_point(curr_node),1), 76 | sqrDist,cullDist); 77 | 78 | const auto node_coord = data_traits::get_coord(curr_node,curr_dim); 79 | const auto query_coord = get_coord(queryPoint,curr_dim); 80 | const bool leftIsClose = query_coord < node_coord; 81 | const int lChild = 2*curr+1; 82 | const int rChild = lChild+1; 83 | 84 | const int closeChild = leftIsClose?lChild:rChild; 85 | const int farChild = leftIsClose?rChild:lChild; 86 | 87 | const float sqrDistToPlane = sqr(query_coord - node_coord); 88 | if (dbg) printf("sqrDist %f cullDist %f\n", 89 | sqrDistToPlane,cullDist); 90 | if (sqrDistToPlane < cullDist && farChild < numPoints) { 91 | stackPtr->nodeID = farChild; 92 | stackPtr->sqrDist = sqrDistToPlane; 93 | ++stackPtr; 94 | } 95 | curr = closeChild; 96 | } 97 | 98 | while (true) { 99 | if (stackPtr == stackBase) 100 | return; 101 | --stackPtr; 102 | if (stackPtr->sqrDist >= cullDist) 103 | continue; 104 | curr = stackPtr->nodeID; 105 | break; 106 | } 107 | } 108 | } 109 | 110 | } 111 | -------------------------------------------------------------------------------- /cukd/traverse-sf-imp.h: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2022 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 
| // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #pragma once 18 | 19 | namespace cukd { 20 | 21 | template> 23 | inline __host__ __device__ 24 | box_t 25 | recomputeBounds(int curr, 26 | box_t bounds, 27 | const data_t *d_nodes 28 | ) 29 | { 30 | using point_t = typename data_traits::point_t; 31 | using scalar_t = typename scalar_type_of::type; 32 | enum { num_dims = num_dims_of::value }; 33 | 34 | while (true) { 35 | if (curr == 0) break; 36 | const int parent = (curr+1)/2-1; 37 | 38 | const auto &parent_node = d_nodes[parent]; 39 | CUKD_STATS(if (cukd::g_traversalStats) ::atomicAdd(cukd::g_traversalStats,1)); 40 | const int parent_dim 41 | = data_traits::has_explicit_dim 42 | ? 
data_traits::get_dim(parent_node) 43 | : (BinaryTree::levelOf(parent) % num_dims); 44 | const float parent_split_pos = data_traits::get_coord(parent_node,parent_dim); 45 | 46 | if (curr & 1) { 47 | // curr is left child, set upper 48 | get_coord(bounds.upper,parent_dim) 49 | = min(parent_split_pos, 50 | get_coord(bounds.upper,parent_dim)); 51 | } else { 52 | // curr is right child, set lower 53 | get_coord(bounds.lower,parent_dim) 54 | = max(parent_split_pos, 55 | get_coord(bounds.lower,parent_dim)); 56 | } 57 | 58 | curr = parent; 59 | }; 60 | return bounds; 61 | } 62 | 63 | template> 66 | inline __host__ __device__ 67 | void traverse_sf_imp(result_t &result, 68 | typename data_traits::point_t queryPoint, 69 | const box_t worldBounds, 70 | const data_t *d_nodes, 71 | int numPoints) 72 | { 73 | using point_t = typename data_traits::point_t; 74 | using scalar_t = typename scalar_type_of::type; 75 | enum { num_dims = num_dims_of::value }; 76 | 77 | float cullDist = result.initialCullDist2(); 78 | 79 | 80 | int prev = -1; 81 | int curr = 0; 82 | 83 | box_t bounds = worldBounds; 84 | 85 | while (true) { 86 | if (curr == -1) 87 | // this can only (and will) happen if and only if we come from a 88 | // child, arrive at the root, and decide to go to the parent of 89 | // the root ... while means we're done. 90 | return;// closest_found_so_far; 91 | 92 | bounds = recomputeBounds 93 | (curr,worldBounds,d_nodes); 94 | const int parent = (curr+1)/2-1; 95 | 96 | point_t closestPointOnSubtreeBounds = project(bounds,queryPoint); 97 | if (sqrDistance(closestPointOnSubtreeBounds,queryPoint) >= cullDist) { 98 | prev = curr; 99 | curr = parent; 100 | continue; 101 | } 102 | 103 | 104 | if (curr >= numPoints) { 105 | // in some (rare) cases it's possible that below traversal 106 | // logic will go to a "close child", but may actually only 107 | // have a far child. 
In that case it's easiest to fix this 108 | // right here, pretend we've done that (non-existent) close 109 | // child, and let parent pick up traversal as if it had been 110 | // done. 111 | prev = curr; 112 | curr = parent; 113 | 114 | continue; 115 | } 116 | CUKD_STATS(if (cukd::g_traversalStats) ::atomicAdd(cukd::g_traversalStats,1)); 117 | const auto &curr_node = d_nodes[curr]; 118 | const int child = 2*curr+1; 119 | const bool from_child = (prev >= child); 120 | if (!from_child) { 121 | const auto dist_sqr = 122 | sqrDistance(queryPoint,data_traits::get_point(curr_node)); 123 | cullDist = result.processCandidate(curr,dist_sqr); 124 | } 125 | 126 | const int curr_dim 127 | = data_traits::has_explicit_dim 128 | ? data_traits::get_dim(d_nodes[curr]) 129 | : (BinaryTree::levelOf(curr) % num_dims); 130 | const float curr_split_pos = data_traits::get_coord(curr_node,curr_dim); 131 | const float curr_dim_dist = get_coord(queryPoint,curr_dim) - curr_split_pos; 132 | const int curr_side = curr_dim_dist > 0.f; 133 | const int curr_close_child = 2*curr + 1 + curr_side; 134 | const int curr_far_child = 2*curr + 2 - curr_side; 135 | 136 | int next = -1; 137 | if (prev == curr_close_child) { 138 | // if we came from the close child, we may still have to check 139 | // the far side - but only if this exists, and if far half of 140 | // current space if even within search radius. 
141 | if ((curr_far_child> 27 | inline __host__ __device__ 28 | void traverse_stack_free(result_t &result, 29 | typename data_traits::point_t queryPoint, 30 | const data_t *d_nodes, 31 | int N, 32 | float eps=0.0f) 33 | { 34 | using point_t = typename data_traits::point_t; 35 | using scalar_t = typename scalar_type_of::type; 36 | enum { num_dims = num_dims_of::value }; 37 | const auto epsErr = 1 + eps; 38 | 39 | scalar_t cullDist = result.initialCullDist2(); 40 | 41 | int prev = -1; 42 | int curr = 0; 43 | 44 | while (true) { 45 | const int parent = (curr+1)/2-1; 46 | if (curr >= N) { 47 | // in some (rare) cases it's possible that below traversal 48 | // logic will go to a "close child", but may actually only 49 | // have a far child. In that case it's easiest to fix this 50 | // right here, pretend we've done that (non-existent) close 51 | // child, and let parent pick up traversal as if it had been 52 | // done. 53 | prev = curr; 54 | curr = parent; 55 | 56 | continue; 57 | } 58 | CUKD_STATS(if (cukd::g_traversalStats) ::atomicAdd(cukd::g_traversalStats,1)); 59 | const auto &curr_node = d_nodes[curr]; 60 | const int child = 2*curr+1; 61 | const bool from_child = (prev >= child); 62 | if (!from_child) { 63 | const auto sqrDist = 64 | sqrDistance(queryPoint,data_traits::get_point(curr_node)); 65 | cullDist = result.processCandidate(curr,sqrDist); 66 | } 67 | 68 | const int curr_dim 69 | = data_traits::has_explicit_dim 70 | ? 
data_traits::get_dim(d_nodes[curr]) 71 | : (BinaryTree::levelOf(curr) % num_dims); 72 | const float curr_dim_dist 73 | = get_coord(queryPoint,curr_dim) 74 | - data_traits::get_coord(curr_node,curr_dim); 75 | const int curr_side = curr_dim_dist > 0.f; 76 | const int curr_close_child = 2*curr + 1 + curr_side; 77 | const int curr_far_child = 2*curr + 2 - curr_side; 78 | 79 | int next = -1; 80 | if (prev == curr_close_child) 81 | // if we came from the close child, we may still have to check 82 | // the far side - but only if this exists, and if far half of 83 | // current space is even within search radius. 84 | next 85 | = ((curr_far_child results-$f.txt 11 | done 12 | 13 | for N in 1000 10000 100000 1000000 10000000 100000000 1000000000; do 14 | echo "### running for N = $N" 15 | for f in fcp knn-clamped knn-unlimited; do 16 | echo "############## N = $N, uniform" | tee -a results-$f.txt 17 | done 18 | for method in cct cct-xd spatial-cct spatial-stackBased stackBased stackBased-xd stackFree stackFree-xd ; do 19 | # ================================================================== 20 | # for clamping use radius 10000 - that's 1% of [0,1M] domain 21 | # we generate samples in 22 | ./cukd_float2-fcp-${method} -nr 10 $N > tmp.fcp.txt 23 | ./cukd_float2-knn-${method} -nr 10 $N > tmp.knn-unlimited.txt 24 | ./cukd_float2-knn-${method} -nr 10 $N -r 10000 > tmp.knn-clamped.txt 25 | 26 | for f in fcp knn-clamped knn-unlimited; do 27 | stats=`cat tmp.$f.txt | grep NICE_STATS | awk '{print \$2}'` 28 | perf=`cat tmp.$f.txt | grep "that is" | awk '{print \$3}'` 29 | echo "stats $stats perf $perf" 30 | echo "method $method stats $stats perf $perf" | tee -a results-$f.txt 31 | done 32 | done 33 | for f in fcp knn-clamped knn-unlimited; do 34 | echo "############## N = $N, clustered" | tee -a results-$f.txt 35 | done 36 | for method in cct cct-xd spatial-cct spatial-stackBased stackBased stackBased-xd stackFree stackFree-xd ; do 37 | # 
================================================================== 38 | # for clamping use radius 10000 - that's 1% of [0,1M] domain 39 | # we generate samples in 40 | ./cukd_float2-fcp-${method} --clustered -nr 10 $N > tmp.fcp.txt 41 | ./cukd_float2-knn-${method} --clustered -nr 10 $N > tmp.knn-unlimited.txt 42 | ./cukd_float2-knn-${method} --clustered -nr 10 $N -r 10000 > tmp.knn-clamped.txt 43 | 44 | for f in fcp knn-clamped knn-unlimited; do 45 | stats=`cat tmp.$f.txt | grep NICE_STATS | awk '{print \$2}'` 46 | perf=`cat tmp.$f.txt | grep "that is" | awk '{print \$3}'` 47 | echo "stats $stats perf $perf" 48 | echo "method $method stats $stats perf $perf" | tee -a results-$f.txt 49 | done 50 | 51 | done 52 | done 53 | 54 | -------------------------------------------------------------------------------- /sample.cu: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. 
// 15 | // ======================================================================== // 16 | 17 | #include "cukd/builder.h" 18 | // fcp = "find closest point" query 19 | #include "cukd/fcp.h" 20 | #include 21 | #include 22 | #include 23 | 24 | using namespace cukd; 25 | 26 | float3 *generatePoints(int N) 27 | { 28 | static int g_seed = 100000; 29 | std::seed_seq seq{g_seed++}; 30 | std::default_random_engine rd(seq); 31 | std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() 32 | std::uniform_int_distribution<> dist(0,N); 33 | 34 | std::cout << "generating " << N << " uniform random points" << std::endl; 35 | float3 *d_points = 0; 36 | cudaMallocManaged((char **)&d_points,N*sizeof(*d_points)); 37 | if (!d_points) 38 | throw std::runtime_error("could not allocate points mem..."); 39 | 40 | for (int i=0;i *d_bounds, 55 | float3 *d_nodes, 56 | int numNodes, 57 | float cutOffRadius) 58 | { 59 | int tid = threadIdx.x+blockIdx.x*blockDim.x; 60 | if (tid >= numQueries) return; 61 | 62 | using point_t = float3; 63 | point_t queryPos = d_queries[tid]; 64 | FcpSearchParams params; 65 | params.cutOffRadius = cutOffRadius; 66 | int closestID 67 | = cukd::cct::fcp 68 | (queryPos,*d_bounds,d_nodes,numNodes,params); 69 | 70 | d_results[tid] 71 | = (closestID < 0) 72 | ? INFINITY 73 | : distance(queryPos,d_nodes[closestID]); 74 | } 75 | 76 | 77 | 78 | 79 | int main(int ac, const char **av) 80 | { 81 | using namespace cukd::common; 82 | 83 | int numPoints = 1000000; 84 | int nRepeats = 1; 85 | size_t numQueries = 1000000; 86 | float cutOffRadius = std::numeric_limits::infinity(); 87 | for (int i=1;i *d_bounds; 112 | cudaMallocManaged((void**)&d_bounds,sizeof(cukd::box_t)); 113 | std::cout << "allocated memory for the world space bounding box ..." << std::endl; 114 | 115 | // ================================================================== 116 | // build the tree. 
this will also compute the world-space bounding box 117 | // of all points 118 | // ================================================================== 119 | std::cout << "calling builder..." << std::endl; 120 | double t0 = getCurrentTime(); 121 | cukd::buildTree(d_points,numPoints,d_bounds); 122 | CUKD_CUDA_SYNC_CHECK(); 123 | double t1 = getCurrentTime(); 124 | std::cout << "done building tree, took " 125 | << prettyDouble(t1-t0) << "s" << std::endl; 126 | 127 | // ================================================================== 128 | // create set of sample query points 129 | // ================================================================== 130 | float3 *d_queries 131 | = generatePoints(numQueries); 132 | // allocate memory for the results 133 | float *d_results; 134 | CUKD_CUDA_CALL(MallocManaged((void**)&d_results,numQueries*sizeof(*d_results))); 135 | 136 | 137 | // ================================================================== 138 | // and do some queries - let's do the same ones in a loop so we can 139 | // measure perf. 
140 | // ================================================================== 141 | { 142 | double t0 = getCurrentTime(); 143 | for (int i=0;i>> 147 | (d_results,d_queries,numQueries, 148 | d_bounds,d_points,numPoints,cutOffRadius); 149 | cudaDeviceSynchronize(); 150 | } 151 | CUKD_CUDA_SYNC_CHECK(); 152 | double t1 = getCurrentTime(); 153 | std::cout << "done " << nRepeats 154 | << " iterations of " << numQueries 155 | << " fcp queries, took " << prettyDouble(t1-t0) 156 | << "s" << std::endl; 157 | std::cout << "that is " << prettyDouble(numQueries*nRepeats/(t1-t0)) 158 | << " queries/s" << std::endl; 159 | } 160 | 161 | } 162 | 163 | -------------------------------------------------------------------------------- /sampleHost.cu: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. 
// 15 | // ======================================================================== // 16 | 17 | #include "cukd/builder.h" 18 | // fcp = "find closest point" query 19 | #include "cukd/fcp.h" 20 | #include 21 | #include 22 | #include 23 | 24 | using namespace cukd; 25 | 26 | void generatePoints(size_t N, std::vector &points) 27 | { 28 | static int g_seed = 100000; 29 | std::seed_seq seq{g_seed++}; 30 | std::default_random_engine rd(seq); 31 | std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() 32 | std::uniform_int_distribution<> dist(0,N); 33 | 34 | std::cout << "generating " << N << " uniform random points" << std::endl; 35 | points.resize(N); 36 | 37 | #ifdef OPENMP_FOUND 38 | #pragma omp parallel for 39 | #endif 40 | for (size_t i=0;i *bounds, 52 | float3 *nodes, 53 | int numNodes, 54 | float cutOffRadius) 55 | { 56 | using point_t = float3; 57 | 58 | #ifdef OPENMP_FOUND 59 | #pragma omp parallel for 60 | #endif 61 | for (size_t tid = 0; tid < numQueries; tid++) { 62 | point_t queryPos = queries[tid]; 63 | FcpSearchParams params; 64 | params.cutOffRadius = cutOffRadius; 65 | int closestID 66 | = cukd::cct::fcp 67 | (queryPos,*bounds,nodes,numNodes,params); 68 | 69 | results[tid] 70 | = (closestID < 0) 71 | ? 
INFINITY 72 | : distance(queryPos,nodes[closestID]); 73 | } 74 | } 75 | 76 | int main(int ac, const char **av) 77 | { 78 | using namespace cukd::common; 79 | 80 | size_t numPoints = 10000; 81 | int nRepeats = 1; 82 | size_t numQueries = 10000; 83 | float cutOffRadius = std::numeric_limits::infinity(); 84 | for (int i=1;i points; 102 | generatePoints(numPoints, points); 103 | 104 | // ================================================================== 105 | // allocate some memory for the world-space bounding box, so the 106 | // builder can compute and return that for our chosen traversal 107 | // method to use 108 | // ================================================================== 109 | cukd::box_t bounds; 110 | std::cout << "allocated memory for the world space bounding box ..." << std::endl; 111 | 112 | // ================================================================== 113 | // build the tree. this will also compute the world-space bounding box 114 | // of all points 115 | // ================================================================== 116 | std::cout << "calling builder..." << std::endl; 117 | double t0 = getCurrentTime(); 118 | cukd::buildTree_host(points.data(),numPoints,&bounds); 119 | double t1 = getCurrentTime(); 120 | std::cout << "done building tree, took " 121 | << prettyDouble(t1-t0) << "s" << std::endl; 122 | 123 | // ================================================================== 124 | // create set of sample query points 125 | // ================================================================== 126 | std::vector queries; 127 | generatePoints(numQueries, queries); 128 | // allocate memory for the results 129 | std::vector results(numQueries); 130 | 131 | // ================================================================== 132 | // and do some queries - let's do the same ones in a loop so we can 133 | // measure perf. 
134 | // ================================================================== 135 | { 136 | double t0 = getCurrentTime(); 137 | for (int i=0;i 8 | #include 9 | #include 10 | #include 11 | 12 | #define FIXED_K 16 13 | 14 | using data_t = float3; 15 | using data_traits = cukd::default_data_traits; 16 | 17 | // CUDA KNN Kernel 18 | __global__ void KnnKernel( 19 | const float3* d_queries, int numQueries, 20 | const cukd::SpatialKDTree tree, 21 | float3* d_results, int k, float radius) 22 | { 23 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 24 | if (tid >= numQueries) return; 25 | 26 | cukd::HeapCandidateList result(radius); // Fixed at 16, for generalization make template 27 | 28 | cukd::stackBased::knn 29 | (result, tree, d_queries[tid]); 30 | 31 | for (int i = 0; i < k; i++) { 32 | int ID = result.get_pointID(i); 33 | d_results[tid * k + i] 34 | = ID < 0 35 | ? make_float3(0.f,0.f,0.f) 36 | : tree.data[ID]; 37 | } 38 | } 39 | 40 | float3* knnSearchCuda(const float3* points, const int numPoints, 41 | const float3* queries, const int numQueries, 42 | const int k, const float radius) { 43 | 44 | // Allocate managed memory for points, queries, and results 45 | float3* d_points; 46 | cudaMallocManaged(&d_points, numPoints * sizeof(float3)); 47 | std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl; 48 | std::cout << "Allocated " << numPoints << " points at " << d_points << std::endl; 49 | cudaMemcpy(d_points, points, numPoints * sizeof(float3), cudaMemcpyHostToDevice); 50 | 51 | float3* d_queries; 52 | cudaMallocManaged(&d_queries, numQueries * sizeof(float3)); 53 | cudaMemcpy(d_queries, queries, numQueries * sizeof(float3), cudaMemcpyHostToDevice); 54 | 55 | // Build Spatial KD-Tree (managed memory) 56 | cukd::SpatialKDTree tree; 57 | cukd::BuildConfig buildConfig{}; 58 | buildTree(tree,d_points,numPoints,buildConfig); 59 | 60 | CUKD_CUDA_SYNC_CHECK(); 61 | 62 | // Results 63 | float3* d_results; 64 | cudaMallocManaged(&d_results, numQueries * k * 
sizeof(float3)); 65 | 66 | int threadsPerBlock = 256; 67 | int numBlocks = (numQueries + threadsPerBlock - 1) / threadsPerBlock; 68 | 69 | KnnKernel<<>>(d_queries, numQueries, tree, d_results, k, radius); 70 | cudaDeviceSynchronize(); 71 | 72 | // Copy back results 73 | float3* neighbors = new float3[numQueries * k]; 74 | cudaMemcpy(neighbors, d_results, numQueries * k * sizeof(float3), cudaMemcpyDeviceToHost); 75 | 76 | // Cleanup 77 | cudaFree(d_points); 78 | cudaFree(d_queries); 79 | cudaFree(d_results); 80 | cukd::free(tree); 81 | 82 | return neighbors; 83 | } 84 | 85 | int main(int, char **) { 86 | std::random_device rd; // a seed source for the random number engine 87 | std::mt19937 gen(rd()); // mersenne_twister_engine seeded with rd() 88 | std::uniform_real_distribution rng(0.f,+1.f); 89 | 90 | for (int r=0;r<10;r++) { 91 | std::vector points; 92 | { 93 | int N = 240000+int(20000*rng(gen)); 94 | for (int i=0;i queries; 99 | { 100 | int N = 240000+int(20000*rng(gen)); 101 | for (int i=0;i 21 | #include 22 | 23 | #define CUKD_MPI_CALL(fctCall) \ 24 | { int rc = MPI_##fctCall; \ 25 | if (rc != MPI_SUCCESS) \ 26 | throw std::runtime_error(std::string(__PRETTY_FUNCTION__)+#fctCall); } 27 | 28 | using cukd::divRoundUp; 29 | 30 | struct MPIComm { 31 | MPIComm(MPI_Comm comm) 32 | : comm(comm) 33 | { 34 | MPI_Comm_rank(comm,&rank); 35 | MPI_Comm_size(comm,&size); 36 | } 37 | MPI_Comm comm; 38 | int rank, size; 39 | }; 40 | 41 | template 42 | std::vector readFilePortion(std::string inFileName, 43 | int rank, int size, 44 | size_t *pBegin = 0, 45 | size_t *pNumTotal = 0 46 | ) 47 | { 48 | std::ifstream in(inFileName.c_str(),std::ios::binary); 49 | in.seekg(0,std::ios::end); 50 | size_t numBytes = in.tellg(); 51 | in.seekg(0,std::ios::beg); 52 | 53 | size_t numData = numBytes / sizeof(T); 54 | if (pNumTotal) *pNumTotal = numData; 55 | size_t begin = numData * (rank+0)/size; 56 | if (pBegin) *pBegin = begin; 57 | size_t end = numData * (rank+1)/size; 58 | 
in.seekg(begin*sizeof(T),std::ios::beg); 59 | 60 | std::vector result(end-begin); 61 | in.read((char *)result.data(),(end-begin)*sizeof(T)); 62 | return result; 63 | } 64 | 65 | 66 | void usage(const std::string &error) 67 | { 68 | std::cerr << "Error: " << error << std::endl << std::endl; 69 | std::cerr << "./mpiHugeQuery -k [-r ] in.float3s -o out.dat" << std::endl; 70 | exit(error.empty()?0:1); 71 | } 72 | 73 | 74 | 75 | __global__ void runQuery(float3 *tree, int N, 76 | uint64_t *candidateLists, int k, float maxRadius, 77 | float3 *queries, int numQueries, 78 | int round) 79 | { 80 | int tid = threadIdx.x+blockIdx.x*blockDim.x; 81 | if (tid >= numQueries) return; 82 | 83 | float3 qp = queries[tid]; 84 | cukd::FlexHeapCandidateList cl(candidateLists+k*tid,k, 85 | round == 0 ? maxRadius : -1.f); 86 | cukd::stackFree::knn(cl,qp,tree,N); 87 | } 88 | 89 | __global__ void extractFinalResult(float *d_finalResults, 90 | int numPoints, 91 | int k, 92 | uint64_t *candidateLists) 93 | { 94 | int tid = threadIdx.x+blockIdx.x*blockDim.x; 95 | if (tid >= numPoints) return; 96 | 97 | cukd::FlexHeapCandidateList cl(candidateLists+k*tid,k,-1.f); 98 | float result = cl.returnValue(); 99 | if (!isinf(result)) 100 | result = sqrtf(result); 101 | 102 | d_finalResults[tid] = result; 103 | } 104 | 105 | int main(int ac, char **av) 106 | { 107 | MPI_Init(&ac,&av); 108 | float maxRadius = std::numeric_limits::infinity(); 109 | int k = 0; 110 | int gpuAffinityCount = 0; 111 | std::string inFileName; 112 | std::string outFileName; 113 | 114 | for (int i=1;i myPoints 148 | = readFilePortion(inFileName,mpi.rank,mpi.size,&begin,&numPointsTotal); 149 | std::cout << "#" << mpi.rank << "/" << mpi.size 150 | << ": got " << myPoints.size() << " points to work on" 151 | << std::endl; 152 | 153 | float3 *d_tree = 0; 154 | float3 *d_tree_recv = 0; 155 | int N = myPoints.size(); 156 | // alloc N+1 so we can store one more if another rank gets one more point 157 | CUKD_CUDA_CALL(Malloc((void 
**)&d_tree,(N+1)*sizeof(myPoints[0]))); 158 | CUKD_CUDA_CALL(Malloc((void **)&d_tree_recv,(N+1)*sizeof(myPoints[0]))); 159 | CUKD_CUDA_CALL(Memcpy(d_tree,myPoints.data(),N*sizeof(myPoints[0]), 160 | cudaMemcpyDefault)); 161 | cukd::buildTree(d_tree,N); 162 | 163 | float3 *d_queries; 164 | int numQueries = myPoints.size(); 165 | uint64_t *d_cand; 166 | CUKD_CUDA_CALL(Malloc((void **)&d_queries,N*sizeof(float3))); 167 | CUKD_CUDA_CALL(Memcpy(d_queries,myPoints.data(),N*sizeof(float3),cudaMemcpyDefault)); 168 | CUKD_CUDA_CALL(Malloc((void **)&d_cand,N*k*sizeof(uint64_t))); 169 | 170 | // ----------------------------------------------------------------------------- 171 | // now, do the queries and cycling: 172 | // ----------------------------------------------------------------------------- 173 | for (int round=0;round>> 200 | (/* tree */d_tree,N, 201 | /* query params */d_cand,k,maxRadius, 202 | /* query points */d_queries,numQueries, 203 | round); 204 | CUKD_CUDA_CALL(DeviceSynchronize()); 205 | } 206 | std::cout << "done all queries..." 
<< std::endl; 207 | float *d_finalResults = 0; 208 | CUKD_CUDA_CALL(MallocManaged((void **)&d_finalResults,myPoints.size()*sizeof(float))); 209 | extractFinalResult<<>> 210 | (d_finalResults,numQueries,k,d_cand); 211 | CUKD_CUDA_CALL(DeviceSynchronize()); 212 | 213 | MPI_Barrier(mpi.comm); 214 | 215 | for (int i=0;i 21 | #include 22 | 23 | #define CUKD_MPI_CALL(fctCall) \ 24 | { int rc = MPI_##fctCall; \ 25 | if (rc != MPI_SUCCESS) \ 26 | throw std::runtime_error(std::string(__PRETTY_FUNCTION__)+#fctCall); } 27 | 28 | using cukd::divRoundUp; 29 | 30 | struct MPIComm { 31 | MPIComm(MPI_Comm comm) 32 | : comm(comm) 33 | { 34 | MPI_Comm_rank(comm,&rank); 35 | MPI_Comm_size(comm,&size); 36 | } 37 | MPI_Comm comm; 38 | int rank, size; 39 | }; 40 | 41 | template 42 | std::vector readFilePortion(std::string inFileName, 43 | int rank, int size, 44 | size_t *pBegin = 0, 45 | size_t *pNumTotal = 0 46 | ) 47 | { 48 | std::ifstream in(inFileName.c_str(),std::ios::binary); 49 | in.seekg(0,std::ios::end); 50 | size_t numBytes = in.tellg(); 51 | in.seekg(0,std::ios::beg); 52 | 53 | size_t numData = numBytes / sizeof(T); 54 | if (pNumTotal) *pNumTotal = numData; 55 | size_t begin = numData * (rank+0)/size; 56 | if (pBegin) *pBegin = begin; 57 | size_t end = numData * (rank+1)/size; 58 | in.seekg(begin*sizeof(T),std::ios::beg); 59 | 60 | std::vector result(end-begin); 61 | in.read((char *)result.data(),(end-begin)*sizeof(T)); 62 | return result; 63 | } 64 | 65 | 66 | void usage(const std::string &error) 67 | { 68 | std::cerr << "Error: " << error << std::endl << std::endl; 69 | std::cerr << "./mpiHugeQuery -k [-r ] in.float3s -o out.dat" << std::endl; 70 | exit(error.empty()?0:1); 71 | } 72 | 73 | 74 | 75 | void runQuery_host(float3 *tree, size_t N, 76 | uint64_t *candidateLists, int k, float maxRadius, 77 | float3 *queries, size_t numQueries, 78 | int round) 79 | { 80 | #ifdef OPENMP_FOUND 81 | #pragma omp parallel for 82 | #endif 83 | for (size_t tid = 0; tid < numQueries; tid++) 
{ 84 | float3 qp = queries[tid]; 85 | cukd::FlexHeapCandidateList cl(candidateLists+k*tid,k, 86 | round == 0 ? maxRadius : -1.f); 87 | cukd::stackFree::knn(cl,qp,tree,N); 88 | } 89 | } 90 | 91 | void extractFinalResult_host(float *finalResults, 92 | size_t numPoints, 93 | int k, 94 | uint64_t *candidateLists) 95 | { 96 | #ifdef OPENMP_FOUND 97 | #pragma omp parallel for 98 | #endif 99 | for (size_t tid = 0; tid < numPoints; tid++) { 100 | cukd::FlexHeapCandidateList cl(candidateLists+k*tid,k,-1.f); 101 | float result = cl.returnValue(); 102 | if (!isinf(result)) 103 | result = sqrtf(result); 104 | 105 | finalResults[tid] = result; 106 | } 107 | } 108 | 109 | int main(int ac, char **av) 110 | { 111 | MPI_Init(&ac,&av); 112 | float maxRadius = std::numeric_limits::infinity(); 113 | int k = 0; 114 | std::string inFileName; 115 | std::string outFileName; 116 | 117 | for (int i=1;i myPoints 143 | = readFilePortion(inFileName,mpi.rank,mpi.size,&begin,&numPointsTotal); 144 | std::cout << "#" << mpi.rank << "/" << mpi.size 145 | << ": got " << myPoints.size() << " points to work on" 146 | << std::endl; 147 | 148 | size_t N = myPoints.size(); 149 | std::vector tree((N+1)); 150 | std::vector tree_recv((N+1)); 151 | memcpy(tree.data(),myPoints.data(),N*sizeof(float3)); 152 | 153 | // Add timing to your mpiHugeQuery.cu 154 | double start_time, end_time; 155 | // Start timing before your main computation 156 | MPI_Barrier(mpi.comm); 157 | start_time = MPI_Wtime(); 158 | 159 | cukd::buildTree_host(tree.data(),N); 160 | 161 | // End timing after computation 162 | MPI_Barrier(mpi.comm); 163 | end_time = MPI_Wtime(); 164 | 165 | // Print results on rank 0 166 | if (mpi.rank == 0) { 167 | printf("Total execution time (buildTree_host): %.6f seconds\n", end_time - start_time); 168 | } 169 | 170 | size_t numQueries = myPoints.size(); 171 | std::vector queries(N); 172 | memcpy(queries.data(),myPoints.data(),N*sizeof(float3)); 173 | 174 | std::vector cand(N*k); 175 | 176 | // 
----------------------------------------------------------------------------- 177 | // now, do the queries and cycling: 178 | // ----------------------------------------------------------------------------- 179 | MPI_Barrier(mpi.comm); 180 | start_time = MPI_Wtime(); 181 | 182 | for (int round=0;round finalResults(myPoints.size()); 224 | extractFinalResult_host(finalResults.data(),numQueries,k,cand.data()); 225 | 226 | MPI_Barrier(mpi.comm); 227 | 228 | for (int i=0;i= b 24 | function version_ge() { 25 | [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 26 | [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$2" ] 27 | } 28 | # returns 0 (true) if a > b 29 | function version_gt() { 30 | [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 31 | [ "$1" = "$2" ] && return 1 || version_ge $1 $2 32 | } 33 | # returns 0 (true) if a <= b 34 | function version_le() { 35 | [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 36 | [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$1" ] 37 | } 38 | # returns 0 (true) if a < b 39 | function version_lt() { 40 | [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 41 | [ "$1" = "$2" ] && return 1 || version_le $1 $2 42 | } 43 | 44 | ## ------------------- 45 | ## Select CUDA version 46 | ## ------------------- 47 | 48 | # Get the cuda version from the environment as $cuda. 49 | CUDA_VERSION_MAJOR_MINOR=${cuda} 50 | 51 | # Split the version. 52 | # We (might/probably) don't know PATCH at this point - it depends which version gets installed. 53 | CUDA_MAJOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f1) 54 | CUDA_MINOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f2) 55 | CUDA_PATCH=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f3) 56 | # use lsb_release to find the OS. 
57 | UBUNTU_VERSION=$(lsb_release -sr) 58 | UBUNTU_VERSION="${UBUNTU_VERSION//.}" 59 | 60 | echo "CUDA_MAJOR: ${CUDA_MAJOR}" 61 | echo "CUDA_MINOR: ${CUDA_MINOR}" 62 | echo "CUDA_PATCH: ${CUDA_PATCH}" 63 | # echo "UBUNTU_NAME: ${UBUNTU_NAME}" 64 | echo "UBUNTU_VERSION: ${UBUNTU_VERSION}" 65 | 66 | # If we don't know the CUDA_MAJOR or MINOR, error. 67 | if [ -z "${CUDA_MAJOR}" ] ; then 68 | echo "Error: Unknown CUDA Major version. Aborting." 69 | exit 1 70 | fi 71 | if [ -z "${CUDA_MINOR}" ] ; then 72 | echo "Error: Unknown CUDA Minor version. Aborting." 73 | exit 1 74 | fi 75 | # If we don't know the Ubuntu version, error. 76 | if [ -z ${UBUNTU_VERSION} ]; then 77 | echo "Error: Unknown Ubuntu version. Aborting." 78 | exit 1 79 | fi 80 | 81 | 82 | ## ------------------------------- 83 | ## Select CUDA packages to install 84 | ## ------------------------------- 85 | CUDA_PACKAGES="" 86 | for package in "${CUDA_PACKAGES_IN[@]}" 87 | do : 88 | # @todo This is not perfect. Should probably provide a separate list for diff versions 89 | # cuda-compiler-X-Y if CUDA >= 9.1 else cuda-nvcc-X-Y 90 | if [[ "${package}" == "cuda-nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then 91 | package="cuda-compiler" 92 | elif [[ "${package}" == "cuda-compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then 93 | package="cuda-nvcc" 94 | # CUB/Thrust are packages in cuda-thrust in 11.3, but cuda-cccl in 11.4+ 95 | elif [[ "${package}" == "cuda-thrust" || "${package}" == "cuda-cccl" ]]; then 96 | # CUDA cuda-thrust >= 11.4 97 | if version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.4" ; then 98 | package="cuda-cccl" 99 | # Use cuda-thrust > 11.2 100 | elif version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.3" ; then 101 | package="cuda-thrust" 102 | # Do not include this package < 11.3 103 | else 104 | continue 105 | fi 106 | fi 107 | # CUDA 11+ includes lib* / lib*-dev packages, which if they existed previously were cuda-cu*- / cuda-cu*-dev- 108 | if [[ ${package} == libcu* ]] && 
version_lt "$CUDA_VERSION_MAJOR_MINOR" "11.0" ; then 109 | package="${package/libcu/cuda-cu}" 110 | fi 111 | # Build the full package name and append to the string. 112 | CUDA_PACKAGES+=" ${package}-${CUDA_MAJOR}-${CUDA_MINOR}" 113 | done 114 | echo "CUDA_PACKAGES ${CUDA_PACKAGES}" 115 | 116 | ## ----------------- 117 | ## Prepare to install 118 | ## ----------------- 119 | CPU_ARCH="x86_64" 120 | PIN_FILENAME="cuda-ubuntu${UBUNTU_VERSION}.pin" 121 | PIN_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/${CPU_ARCH}/${PIN_FILENAME}" 122 | # apt keyring package now available https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/ 123 | KERYRING_PACKAGE_FILENAME="cuda-keyring_1.1-1_all.deb" 124 | KEYRING_PACKAGE_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/${CPU_ARCH}/${KERYRING_PACKAGE_FILENAME}" 125 | REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/${CPU_ARCH}/" 126 | 127 | echo "PIN_FILENAME ${PIN_FILENAME}" 128 | echo "PIN_URL ${PIN_URL}" 129 | echo "KEYRING_PACKAGE_URL ${KEYRING_PACKAGE_URL}" 130 | 131 | ## ----------------- 132 | ## Check for root/sudo 133 | ## ----------------- 134 | 135 | # Detect if the script is being run as root, storing true/false in is_root. 136 | is_root=false 137 | if (( $EUID == 0)); then 138 | is_root=true 139 | fi 140 | # Find if sudo is available 141 | has_sudo=false 142 | if command -v sudo &> /dev/null ; then 143 | has_sudo=true 144 | fi 145 | # Decide if we can proceed or not (root or sudo is required) and if so store whether sudo should be used or not. 146 | if [ "$is_root" = false ] && [ "$has_sudo" = false ]; then 147 | echo "Root or sudo is required. Aborting." 
148 | exit 1 149 | elif [ "$is_root" = false ] ; then 150 | USE_SUDO=sudo 151 | else 152 | USE_SUDO= 153 | fi 154 | 155 | ## ----------------- 156 | ## Install 157 | ## ----------------- 158 | echo "Adding CUDA Repository" 159 | wget ${PIN_URL} 160 | $USE_SUDO mv ${PIN_FILENAME} /etc/apt/preferences.d/cuda-repository-pin-600 161 | wget ${KEYRING_PACKAGE_URL} && ${USE_SUDO} dpkg -i ${KERYRING_PACKAGE_FILENAME} && rm ${KERYRING_PACKAGE_FILENAME} 162 | $USE_SUDO add-apt-repository "deb ${REPO_URL} /" 163 | $USE_SUDO apt-get update 164 | 165 | echo "Installing CUDA packages ${CUDA_PACKAGES}" 166 | $USE_SUDO apt-get -y install ${CUDA_PACKAGES} 167 | 168 | if [[ $? -ne 0 ]]; then 169 | echo "CUDA Installation Error." 170 | exit 1 171 | fi 172 | 173 | ## ----------------- 174 | ## Set environment vars / vars to be propagated 175 | ## ----------------- 176 | 177 | CUDA_PATH=/usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} 178 | echo "CUDA_PATH=${CUDA_PATH}" 179 | export CUDA_PATH=${CUDA_PATH} 180 | export PATH="$CUDA_PATH/bin:$PATH" 181 | export LD_LIBRARY_PATH="$CUDA_PATH/lib:$LD_LIBRARY_PATH" 182 | export LD_LIBRARY_PATH="$CUDA_PATH/lib64:$LD_LIBRARY_PATH" 183 | # Check nvcc is now available. 
184 | nvcc -V 185 | 186 | # If executed on github actions, make the appropriate echo statements to update the environment 187 | if [[ $GITHUB_ACTIONS ]]; then 188 | # Set paths for subsequent steps, using ${CUDA_PATH} 189 | echo "Adding CUDA to CUDA_PATH, PATH and LD_LIBRARY_PATH" 190 | echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV 191 | echo "${CUDA_PATH}/bin" >> $GITHUB_PATH 192 | echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV 193 | echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV 194 | fi 195 | -------------------------------------------------------------------------------- /scripts/actions/install_cuda_windows.ps1: -------------------------------------------------------------------------------- 1 | ## ------------------- 2 | ## Constants 3 | ## ------------------- 4 | 5 | # Dictionary of known cuda versions and their download URLs, which do not follow a consistent pattern 6 | # From 11.0, the download url/toolkit version is separate from the cudart version. 
7 | # Releases since 11.5.1 (including 11.4.4) use `windows` rather than `win10` in the uri, due to windows 11 inclusion 8 | $CUDA_KNOWN_URLS = @{ 9 | "8.0.44" = "https://developer.nvidia.com/compute/cuda/8.0/Prod/network_installers/cuda_8.0.44_win10_network-exe"; 10 | "8.0.61" = "https://developer.nvidia.com/compute/cuda/8.0/Prod2/network_installers/cuda_8.0.61_win10_network-exe"; 11 | "9.0.176" = "https://developer.nvidia.com/compute/cuda/9.0/Prod/network_installers/cuda_9.0.176_win10_network-exe"; 12 | "9.1.85" = "https://developer.nvidia.com/compute/cuda/9.1/Prod/network_installers/cuda_9.1.85_win10_network"; 13 | "9.2.148" = "https://developer.nvidia.com/compute/cuda/9.2/Prod2/network_installers2/cuda_9.2.148_win10_network"; 14 | "10.0.130" = "https://developer.nvidia.com/compute/cuda/10.0/Prod/network_installers/cuda_10.0.130_win10_network"; 15 | "10.1.105" = "https://developer.nvidia.com/compute/cuda/10.1/Prod/network_installers/cuda_10.1.105_win10_network.exe"; 16 | "10.1.168" = "https://developer.nvidia.com/compute/cuda/10.1/Prod/network_installers/cuda_10.1.168_win10_network.exe"; 17 | "10.1.243" = "https://developer.download.nvidia.com/compute/cuda/10.1/Prod/network_installers/cuda_10.1.243_win10_network.exe"; 18 | "10.2.89" = "https://developer.download.nvidia.com/compute/cuda/10.2/Prod/network_installers/cuda_10.2.89_win10_network.exe"; 19 | "11.0.1" = "https://developer.download.nvidia.com/compute/cuda/11.0.1/network_installers/cuda_11.0.1_win10_network.exe"; 20 | "11.0.2" = "https://developer.download.nvidia.com/compute/cuda/11.0.2/network_installers/cuda_11.0.2_win10_network.exe"; 21 | "11.0.3" = "https://developer.download.nvidia.com/compute/cuda/11.0.3/network_installers/cuda_11.0.3_win10_network.exe"; 22 | "11.1.0" = "https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe"; 23 | "11.1.1" = 
"https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe"; 24 | "11.2.0" = "https://developer.download.nvidia.com/compute/cuda/11.2.0/network_installers/cuda_11.2.0_win10_network.exe"; 25 | "11.2.1" = "https://developer.download.nvidia.com/compute/cuda/11.2.1/network_installers/cuda_11.2.1_win10_network.exe"; 26 | "11.2.2" = "https://developer.download.nvidia.com/compute/cuda/11.2.2/network_installers/cuda_11.2.2_win10_network.exe"; 27 | "11.3.0" = "https://developer.download.nvidia.com/compute/cuda/11.3.0/network_installers/cuda_11.3.0_win10_network.exe"; 28 | "11.3.1" = "https://developer.download.nvidia.com/compute/cuda/11.3.1/network_installers/cuda_11.3.1_win10_network.exe"; 29 | "11.4.0" = "https://developer.download.nvidia.com/compute/cuda/11.4.0/network_installers/cuda_11.4.0_win10_network.exe"; 30 | "11.4.1" = "https://developer.download.nvidia.com/compute/cuda/11.4.1/network_installers/cuda_11.4.1_win10_network.exe"; 31 | "11.4.2" = "https://developer.download.nvidia.com/compute/cuda/11.4.2/network_installers/cuda_11.4.2_win10_network.exe"; 32 | "11.4.3" = "https://developer.download.nvidia.com/compute/cuda/11.4.3/network_installers/cuda_11.4.3_win10_network.exe"; 33 | "11.4.4" = "https://developer.download.nvidia.com/compute/cuda/11.4.4/network_installers/cuda_11.4.4_windows_network.exe"; 34 | "11.5.0" = "https://developer.download.nvidia.com/compute/cuda/11.5.0/network_installers/cuda_11.5.0_win10_network.exe"; 35 | "11.5.1" = "https://developer.download.nvidia.com/compute/cuda/11.5.1/network_installers/cuda_11.5.1_windows_network.exe"; 36 | "11.5.2" = "https://developer.download.nvidia.com/compute/cuda/11.5.2/network_installers/cuda_11.5.2_windows_network.exe"; 37 | "11.6.0" = "https://developer.download.nvidia.com/compute/cuda/11.6.0/network_installers/cuda_11.6.0_windows_network.exe"; 38 | "11.6.1" = 
"https://developer.download.nvidia.com/compute/cuda/11.6.1/network_installers/cuda_11.6.1_windows_network.exe"; 39 | "11.6.2" = "https://developer.download.nvidia.com/compute/cuda/11.6.2/network_installers/cuda_11.6.2_windows_network.exe"; 40 | "11.7.0" = "https://developer.download.nvidia.com/compute/cuda/11.7.0/network_installers/cuda_11.7.0_windows_network.exe"; 41 | "11.7.1" = "https://developer.download.nvidia.com/compute/cuda/11.7.1/network_installers/cuda_11.7.1_windows_network.exe"; 42 | "11.8.0" = "https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe"; 43 | "12.0.0" = "https://developer.download.nvidia.com/compute/cuda/12.0.0/network_installers/cuda_12.0.0_windows_network.exe" 44 | "12.4.0" = "https://developer.download.nvidia.com/compute/cuda/12.4.0/network_installers/cuda_12.4.0_windows_network.exe" 45 | } 46 | 47 | # @todo - change this to be based on _MSC_VER intead, or invert it to be CUDA keyed instead 48 | $VISUAL_STUDIO_MIN_CUDA = @{ 49 | "2022" = "11.6.0"; 50 | "2019" = "10.1"; 51 | "2017" = "10.0"; # Depends on which version of 2017! 9.0 to 10.0 depending on version 52 | "2015" = "8.0"; # Might support older, unsure. Depracated as of 11.1, unsupported in 11.2 53 | } 54 | 55 | # cuda_runtime.h is in nvcc <= 10.2, but cudart >= 11.0 56 | # @todo - make this easier to vary per CUDA version. 57 | $CUDA_PACKAGES_IN = @( 58 | "nvcc"; 59 | "visual_studio_integration"; 60 | "curand_dev"; 61 | "nvrtc_dev"; 62 | "cudart"; 63 | "thrust"; 64 | ) 65 | 66 | ## ------------------- 67 | ## Select CUDA version 68 | ## ------------------- 69 | 70 | # Get the cuda version from the environment as env:cuda. 71 | $CUDA_VERSION_FULL = $env:cuda 72 | # Make sure CUDA_VERSION_FULL is set and valid, otherwise error. 
73 | 74 | # Validate CUDA version, extracting components via regex 75 | $cuda_ver_matched = $CUDA_VERSION_FULL -match "^(?<major>[1-9][0-9]*)\.(?<minor>[0-9]+)\.(?<patch>[0-9]+)$" 76 | if(-not $cuda_ver_matched){ 77 | Write-Output "Invalid CUDA version specified, <major>.<minor>.<patch> required. '$CUDA_VERSION_FULL'." 78 | exit 1 79 | } 80 | $CUDA_MAJOR=$Matches.major 81 | $CUDA_MINOR=$Matches.minor 82 | $CUDA_PATCH=$Matches.patch 83 | 84 | ## --------------------------- 85 | ## Visual studio support check 86 | ## --------------------------- 87 | # Exit if visual studio is too new for the cuda version. (String interpolation keeps an unset env:visual_studio from raising a null-method error.) 88 | $VISUAL_STUDIO = "$env:visual_studio".Trim() 89 | if ($VISUAL_STUDIO.length -ge 4) { 90 | $VISUAL_STUDIO_YEAR = $VISUAL_STUDIO.Substring($VISUAL_STUDIO.Length-4) 91 | if ($VISUAL_STUDIO_YEAR.length -eq 4 -and $VISUAL_STUDIO_MIN_CUDA.containsKey($VISUAL_STUDIO_YEAR)){ 92 | $MINIMUM_CUDA_VERSION = $VISUAL_STUDIO_MIN_CUDA[$VISUAL_STUDIO_YEAR] 93 | if ([version]$CUDA_VERSION_FULL -lt [version]$MINIMUM_CUDA_VERSION) { 94 | Write-Output "Error: Visual Studio $($VISUAL_STUDIO_YEAR) requires CUDA >= $($MINIMUM_CUDA_VERSION)" 95 | exit 1 96 | } 97 | } 98 | } else { 99 | Write-Output "Warning: Unknown Visual Studio Version. CUDA version may be insufficient." 100 | } 101 | 102 | ## ------------------------------------------------ 103 | ## Select CUDA packages to install from environment 104 | ## ------------------------------------------------ 105 | 106 | $CUDA_PACKAGES = "" 107 | Foreach ($package in $CUDA_PACKAGES_IN) { 108 | # Make sure the correct package name is used for nvcc. 109 | if($package -eq "nvcc" -and [version]$CUDA_VERSION_FULL -lt [version]"9.1"){ 110 | $package="compiler" 111 | } elseif($package -eq "compiler" -and [version]$CUDA_VERSION_FULL -ge [version]"9.1") { 112 | $package="nvcc" 113 | } elseif($package -eq "thrust" -and [version]$CUDA_VERSION_FULL -lt [version]"11.3") { 114 | # Thrust is a package from CUDA 11.3, otherwise it should be skipped.
115 | continue 116 | } 117 | $CUDA_PACKAGES += " $($package)_$($CUDA_MAJOR).$($CUDA_MINOR)" 118 | } 119 | echo "$($CUDA_PACKAGES)" 120 | ## ----------------- 121 | ## Prepare download 122 | ## ----------------- 123 | 124 | # Select the download link if known, otherwise have a guess. 125 | $CUDA_REPO_PKG_REMOTE="" 126 | $CUDA_REPO_PKG_LOCAL="" 127 | if($CUDA_KNOWN_URLS.containsKey($CUDA_VERSION_FULL)){ 128 | $CUDA_REPO_PKG_REMOTE=$CUDA_KNOWN_URLS[$CUDA_VERSION_FULL] 129 | } else{ 130 | # Guess the url from the patterns in $CUDA_KNOWN_URLS: 11.5.1+ entries use cuda/<major>.<minor>.<patch>/network_installers; the older branch keeps the 10.x 'Prod' layout. 131 | Write-Output "note: URL for CUDA $($CUDA_VERSION_FULL) not known, estimating." 132 | if([version]$CUDA_VERSION_FULL -ge [version]"11.5.1"){ 133 | $CUDA_REPO_PKG_REMOTE="https://developer.download.nvidia.com/compute/cuda/$($CUDA_VERSION_FULL)/network_installers/cuda_$($CUDA_VERSION_FULL)_windows_network.exe" 134 | } else { 135 | $CUDA_REPO_PKG_REMOTE="https://developer.download.nvidia.com/compute/cuda/$($CUDA_MAJOR).$($CUDA_MINOR)/Prod/network_installers/cuda_$($CUDA_VERSION_FULL)_win10_network.exe" 136 | } 137 | } 138 | if([version]$CUDA_VERSION_FULL -ge [version]"11.5.1"){ 139 | $CUDA_REPO_PKG_LOCAL="cuda_$($CUDA_VERSION_FULL)_windows_network.exe" 140 | } else { 141 | $CUDA_REPO_PKG_LOCAL="cuda_$($CUDA_VERSION_FULL)_win10_network.exe" 142 | } 143 | 144 | ## ------------ 145 | ## Install CUDA 146 | ## ------------ 147 | 148 | # Get CUDA network installer, retrying up to $download_attempts_max times. 149 | Write-Output "Downloading CUDA Network Installer for $($CUDA_VERSION_FULL) from: $($CUDA_REPO_PKG_REMOTE)" 150 | 151 | $downloaded = $false 152 | $download_attempt = 0 153 | $download_attempt_delay = 30 154 | $download_attempts_max = 5 155 | 156 | while (-not $downloaded) { 157 | Invoke-WebRequest $CUDA_REPO_PKG_REMOTE -OutFile $CUDA_REPO_PKG_LOCAL | Out-Null 158 | $download_attempt++ 159 | # If download succeeded, break out the loop.
160 | if(Test-Path -Path $CUDA_REPO_PKG_LOCAL){ 161 | Write-Output "Downloading Complete" 162 | $downloaded=$true 163 | } else { 164 | # If download failed, either wait and try again, or give up and error. 165 | if ($download_attempt -lt $download_attempts_max) { 166 | Write-Output "Error: Failed to download $($CUDA_REPO_PKG_LOCAL) (attempt $($download_attempt)/$($download_attempts_max)). Retrying." 167 | # Sleep for a number of seconds. 168 | Start-Sleep $download_attempt_delay 169 | } else { 170 | Write-Output "Error: Failed to download $($CUDA_REPO_PKG_LOCAL) after $($download_attempts_max) attempts. Aborting." 171 | # Abort the script. 172 | exit 1 173 | } 174 | } 175 | } 176 | 177 | # Invoke silent install of CUDA (via network installer); -PassThru returns the process object so its exit code can be inspected 178 | Write-Output "Installing CUDA $($CUDA_VERSION_FULL). Subpackages $($CUDA_PACKAGES)" 179 | $installer = Start-Process -Wait -PassThru -FilePath .\"$($CUDA_REPO_PKG_LOCAL)" -ArgumentList "-s $($CUDA_PACKAGES)" 180 | 181 | # Check the return status of the CUDA installer ($? would only say whether the process could be started). 182 | if ($null -eq $installer -or $installer.ExitCode -ne 0) { 183 | Write-Output "Error: CUDA installer reported error. $($installer.ExitCode)" 184 | exit 1 185 | } 186 | 187 | # Store the CUDA_PATH in the environment for the current session, to be forwarded in the action. 188 | $CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$($CUDA_MAJOR).$($CUDA_MINOR)" 189 | $CUDA_PATH_VX_Y = "CUDA_PATH_V$($CUDA_MAJOR)_$($CUDA_MINOR)" 190 | # Set environmental variables in this session 191 | $env:CUDA_PATH = "$($CUDA_PATH)" 192 | $env:CUDA_PATH_VX_Y = "$($CUDA_PATH_VX_Y)" 193 | Write-Output "CUDA_PATH $($CUDA_PATH)" 194 | Write-Output "CUDA_PATH_VX_Y $($CUDA_PATH_VX_Y)" 195 | 196 | # PATH needs updating elsewhere, anything in here won't persist. 197 | # Append $CUDA_PATH/bin to path.
198 | # Set CUDA_PATH as an environmental variable 199 | 200 | # If executing on github actions, emit the appropriate echo statements to update environment variables 201 | if (Test-Path "env:GITHUB_ACTIONS") { 202 | # Export for subsequent workflow steps: CUDA_PATH and the versioned CUDA_PATH_V<major>_<minor> go to GITHUB_ENV, the bin directory to GITHUB_PATH 203 | echo "Adding CUDA to CUDA_PATH, CUDA_PATH_X_Y and PATH" 204 | echo "CUDA_PATH=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append 205 | echo "$env:CUDA_PATH_VX_Y=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append # $env:CUDA_PATH_VX_Y holds the variable NAME (CUDA_PATH_V<major>_<minor>), so this writes <name>=<install dir> 206 | echo "$env:CUDA_PATH/bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append 207 | } 208 | -------------------------------------------------------------------------------- /testing/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ======================================================================== # 2 | # Copyright 2023-2024 Ingo Wald # 3 | # # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # 5 | # you may not use this file except in compliance with the License. # 6 | # You may obtain a copy of the License at # 7 | # # 8 | # http://www.apache.org/licenses/LICENSE-2.0 # 9 | # # 10 | # Unless required by applicable law or agreed to in writing, software # 11 | # distributed under the License is distributed on an "AS IS" BASIS, # 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 13 | # See the License for the specific language governing permissions and # 14 | # limitations under the License.
# 15 | # ======================================================================== # 16 | 17 | # directory for both "real" test cases and unit testing 18 | 19 | project(cukdTests LANGUAGES CUDA CXX) 20 | 21 | # test all builders on both simple float3 and Photon types, empty inputs 22 | add_executable(cukdTestBitonicEmptyInput testBuilderEmptyInput.cu) 23 | target_compile_definitions(cukdTestBitonicEmptyInput PUBLIC -DBUILDER_TO_TEST=buildTree_bitonic) 24 | target_link_libraries(cukdTestBitonicEmptyInput PRIVATE cudaKDTree) 25 | add_test(NAME cukdTestBitonicEmptyInput COMMAND cukdTestBitonicEmptyInput) 26 | 27 | add_executable(cukdTestThrustEmptyInput testBuilderEmptyInput.cu) 28 | target_compile_definitions(cukdTestThrustEmptyInput PUBLIC -DBUILDER_TO_TEST=buildTree_thrust) 29 | target_link_libraries(cukdTestThrustEmptyInput PRIVATE cudaKDTree) 30 | add_test(NAME cukdTestThrustEmptyInput COMMAND cukdTestThrustEmptyInput) 31 | 32 | add_executable(cukdTestInPlaceEmptyInput testBuilderEmptyInput.cu) 33 | target_compile_definitions(cukdTestInPlaceEmptyInput PUBLIC -DBUILDER_TO_TEST=buildTree_inPlace) 34 | target_link_libraries(cukdTestInPlaceEmptyInput PRIVATE cudaKDTree) 35 | add_test(NAME cukdTestInPlaceEmptyInput COMMAND cukdTestInPlaceEmptyInput) 36 | 37 | # test all builders on both simple float3 and Photon types, simple 1000 random points 38 | add_executable(cukdTestBitonicSimpleInput testBuilderSimpleInput.cu) 39 | target_compile_definitions(cukdTestBitonicSimpleInput PUBLIC -DBUILDER_TO_TEST=buildTree_bitonic) 40 | target_link_libraries(cukdTestBitonicSimpleInput PRIVATE cudaKDTree) 41 | add_test(NAME cukdTestBitonicSimpleInput COMMAND cukdTestBitonicSimpleInput) 42 | 43 | add_executable(cukdTestThrustSimpleInput testBuilderSimpleInput.cu) 44 | target_compile_definitions(cukdTestThrustSimpleInput PUBLIC -DBUILDER_TO_TEST=buildTree_thrust) 45 | target_link_libraries(cukdTestThrustSimpleInput PRIVATE cudaKDTree) 46 | add_test(NAME cukdTestThrustSimpleInput COMMAND 
cukdTestThrustSimpleInput) 47 | 48 | add_executable(cukdTestInPlaceSimpleInput testBuilderSimpleInput.cu) 49 | target_compile_definitions(cukdTestInPlaceSimpleInput PUBLIC -DBUILDER_TO_TEST=buildTree_inPlace) 50 | target_link_libraries(cukdTestInPlaceSimpleInput PRIVATE cudaKDTree) 51 | add_test(NAME cukdTestInPlaceSimpleInput COMMAND cukdTestInPlaceSimpleInput) 52 | 53 | 54 | add_executable(cukdTestPayloadSampleFromReadme testPayloadSampleFromReadme.cu) 55 | target_compile_definitions(cukdTestPayloadSampleFromReadme PUBLIC -DBUILDER_TO_TEST=buildTree_thrust) 56 | target_link_libraries(cukdTestPayloadSampleFromReadme PRIVATE cudaKDTree) 57 | add_test(NAME cukdTestPayloadSampleFromReadme COMMAND cukdTestPayloadSampleFromReadme) 58 | 59 | 60 | 61 | add_executable(cukdTestHostBuilderEmptyInput testHostBuilderEmptyInput.cu) 62 | target_link_libraries(cukdTestHostBuilderEmptyInput PRIVATE cudaKDTree) 63 | add_test(NAME cukdTestHostBuilderEmptyInput COMMAND cukdTestHostBuilderEmptyInput) 64 | 65 | add_executable(cukdTestHostBuilderSimpleInput testHostBuilderSimpleInput.cu) 66 | target_link_libraries(cukdTestHostBuilderSimpleInput PRIVATE cudaKDTree) 67 | add_test(NAME cukdTestHostBuilderSimpleInput COMMAND cukdTestHostBuilderSimpleInput) 68 | 69 | 70 | # tests, for a wide range of input data, whether host, thrust, 71 | # bitonic, and inplace builders all produce the same tree. 
72 | add_executable(cukdTestBuildersSameResult testBuildersSameResult.cu) 73 | target_link_libraries(cukdTestBuildersSameResult PRIVATE cudaKDTree) 74 | add_test(NAME cukdTestBuildersSameResult COMMAND cukdTestBuildersSameResult) 75 | 76 | 77 | 78 | # make sure all knn variants for a _spatial_ k-d tree will at least compile 79 | add_executable(cukdTestCompileSpatialKNN compileSpatialKNN.cu) 80 | target_link_libraries(cukdTestCompileSpatialKNN PRIVATE cudaKDTree) 81 | add_test(NAME cukdTestCompileSpatialKNN COMMAND cukdTestCompileSpatialKNN) 82 | # make sure all knn variants for a regular (non-spatial) k-d tree will at least compile 83 | add_executable(cukdTestCompileKNN compileKNN.cu) 84 | target_link_libraries(cukdTestCompileKNN PRIVATE cudaKDTree) 85 | add_test(NAME cukdTestCompileKNN COMMAND cukdTestCompileKNN) 86 | 87 | 88 | 89 | # add a (compile-only) test to see if we can link two different object 90 | # files (that both include the same builders) without getting any 91 | # multiple definition errors.
92 | add_executable(cukdTestMultipleDefinitions 93 | testMultipleDefinitions_a.cu testMultipleDefinitions_b.cu) 94 | target_link_libraries(cukdTestMultipleDefinitions PRIVATE cudaKDTree) 95 | # (no add_test() for this target: successfully building and linking it is the test) 96 | 97 | # ================================================================== 98 | # issue 5: reported wrong/inconsistent results for different builders, 99 | # in a given set generated by given random seed 100 | # ================================================================== 101 | # 102 | add_executable(cukdTestIssue5_thrust issue5.cu) 103 | target_link_libraries(cukdTestIssue5_thrust PRIVATE cudaKDTree) 104 | target_compile_definitions(cukdTestIssue5_thrust PUBLIC BUILDER_TO_TEST=buildTree_thrust) 105 | add_test(NAME cukdTestIssue5_thrust COMMAND cukdTestIssue5_thrust) 106 | # 107 | add_executable(cukdTestIssue5_bitonic issue5.cu) 108 | target_link_libraries(cukdTestIssue5_bitonic PRIVATE cudaKDTree) 109 | target_compile_definitions(cukdTestIssue5_bitonic PUBLIC BUILDER_TO_TEST=buildTree_bitonic) 110 | add_test(NAME cukdTestIssue5_bitonic COMMAND cukdTestIssue5_bitonic) 111 | # 112 | add_executable(cukdTestIssue5_inPlace issue5.cu) 113 | target_link_libraries(cukdTestIssue5_inPlace PRIVATE cudaKDTree) 114 | target_compile_definitions(cukdTestIssue5_inPlace PUBLIC BUILDER_TO_TEST=buildTree_inPlace) 115 | add_test(NAME cukdTestIssue5_inPlace COMMAND cukdTestIssue5_inPlace) 116 | 117 | 118 | 119 | enable_testing() 120 | -------------------------------------------------------------------------------- /testing/compileKNN.cu: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License.
// 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #include "cukd/knn.h" 18 | 19 | using namespace cukd; 20 | // compile-only kernel: instantiates every knn variant (stackBased, stackFree, cct) with both candidate-list types; main() never launches it 21 | __global__ 22 | void invokeQueries(float3 *d_tree, int N, 23 | box_t<float3> *d_worldBounds, 24 | float *d_results, float3 *d_queries) 25 | { 26 | int tid = threadIdx.x+blockIdx.x*blockDim.x; 27 | 28 | HeapCandidateList<100> stackHeapResults(10.f); 29 | stackBased::knn(stackHeapResults, 30 | d_queries[tid],d_tree,N); 31 | 32 | FixedCandidateList<4> stackListResults(10.f); 33 | stackBased::knn(stackListResults, 34 | d_queries[tid],d_tree,N); 35 | 36 | HeapCandidateList<100> stackFreeHeapResults(10.f); 37 | stackFree::knn(stackFreeHeapResults, 38 | d_queries[tid],d_tree,N); 39 | 40 | FixedCandidateList<4> stackFreeListResults(10.f); 41 | stackFree::knn(stackFreeListResults, 42 | d_queries[tid],d_tree,N); 43 | 44 | // cct kernel has an additional 'worldBounds' argument 45 | HeapCandidateList<100> cctHeapResults(10.f); 46 | cct::knn(cctHeapResults, 47 | d_queries[tid],*d_worldBounds,d_tree,N); 48 | 49 | FixedCandidateList<4> cctListResults(10.f); 50 | cct::knn(cctListResults, 51 | d_queries[tid],*d_worldBounds,d_tree,N); 52 | // sum all results into the output so that none of the queries above is unused 53 | d_results[tid] 54 | = stackHeapResults.maxRadius2() 55 | + stackListResults.maxRadius2() 56 | + stackFreeHeapResults.maxRadius2() 57 | + stackFreeListResults.maxRadius2() 58 | + cctHeapResults.maxRadius2() 59 | + cctListResults.maxRadius2(); 60 | } 61 | 62 | int main(int, const char **) 63 | { 64 | /* this only tests
_compile_ capability */ 65 | return 0; 66 | } 67 | 68 | -------------------------------------------------------------------------------- /testing/compileSpatialKNN.cu: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License.
// 15 | // ======================================================================== // 16 | 17 | #include "cukd/knn.h" 18 | 19 | using namespace cukd; 20 | // compile-only kernel: instantiates the stackBased and cct knn variants on a spatial k-d tree with both candidate-list types; main() never launches it 21 | __global__ 22 | void invokeQueries(SpatialKDTree<float3> *d_tree, float *d_results, float3 *d_queries) 23 | { 24 | int tid = threadIdx.x+blockIdx.x*blockDim.x; 25 | 26 | HeapCandidateList<100> stackHeapResults(10.f); 27 | stackBased::knn(stackHeapResults,*d_tree,d_queries[tid]); 28 | 29 | FixedCandidateList<4> stackListResults(10.f); 30 | stackBased::knn(stackListResults,*d_tree,d_queries[tid]); 31 | 32 | HeapCandidateList<100> cctHeapResults(10.f); 33 | cct::knn(cctHeapResults,*d_tree,d_queries[tid]); 34 | 35 | FixedCandidateList<4> cctListResults(10.f); 36 | cct::knn(cctListResults,*d_tree,d_queries[tid]); 37 | // sum all results into the output so that none of the queries above is unused 38 | d_results[tid] 39 | = stackHeapResults.maxRadius2() 40 | + stackListResults.maxRadius2() 41 | + cctHeapResults.maxRadius2() 42 | + cctListResults.maxRadius2(); 43 | } 44 | 45 | int main(int, const char **) 46 | { 47 | /* this only tests _compile_ capability */ 48 | return 0; 49 | } 50 | 51 | -------------------------------------------------------------------------------- /testing/issue5.cu: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2024 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #include "cukd/builder.h" 18 | #include <vector> 19 | #include "cukd/fcp.h" 20 | 21 | 22 | #define AS_STRING(x) #x 23 | #define TO_STRING(x) AS_STRING(x) 24 | // fills destPts_ with nb_ points whose coordinates are std::rand() values scaled into [-100,100], after seeding with seed_ 25 | void generateRandomPoints(size_t nb_, 26 | int seed_, 27 | std::vector<float3> &destPts_) 28 | { 29 | const double maxVal = 100.0; 30 | const double scale = 2.0f * maxVal / RAND_MAX; 31 | 32 | destPts_.resize( nb_ ); 33 | std::srand( seed_ ); 34 | 35 | for ( size_t i = 0; i < nb_; i++ ) { 36 | destPts_[ i ].x = static_cast< float >( std::rand() * scale - maxVal ); 37 | destPts_[ i ].y = static_cast< float >( std::rand() * scale - maxVal ); 38 | destPts_[ i ].z = static_cast< float >( std::rand() * scale - maxVal ); 39 | } 40 | } 41 | 42 | // runs one fcp query on thread 0 and prints what it found next to the expected squared distance 43 | __global__ void checkResult(float3 *data, 44 | int numData, 45 | float3 queryPoint, 46 | cukd::FcpSearchParams params, 47 | float expectedSqrDist) 48 | { 49 | if (threadIdx.x != 0) return; 50 | 51 | int res = cukd::stackBased::fcp(queryPoint,data,numData,params); 52 | if (res < 0) { 53 | printf("no result!?\n"); 54 | return; 55 | } 56 | float3 pt = data[res]; 57 | float sqrDist = cukd::fSqrDistance(pt,queryPoint); 58 | // NOTE(review): the result is only printed, never compared against expectedSqrDist, so a wrong answer does not fail the test 59 | printf("found res %i, pos %f %f %f sqrdist %f expected %f\n", 60 | res,pt.x,pt.y,pt.z,sqrDist,expectedSqrDist); 61 | } 62 | // host-side reference metric; despite the name this returns the SQUARED euclidean distance (no sqrt is taken) 63 | float distance(float3 a, float3 b) 64 | { 65 | auto sqr = [&](float f) { return f*f; }; 66 | float f = 0.f; 67 | f += sqr(a.x-b.x); 68 | f += sqr(a.y-b.y); 69 | f += sqr(a.z-b.z); 70 | return f; 71 | } 72 | 73 | int main(int, char **) 74 | { 75 | std::vector<float3> points; 76 | // Points are generated like this (nb_= 90167, seed_= 33): 77 | int nb_= 90167, seed_= 33; 78 | generateRandomPoints(nb_,seed_,points); 79 | // It should start like this: 80 | // [0] {x=-99.1088562 y=-87.9879150 z=27.7626877 } float3 81 | // [1]
{x=-38.3892326 y=31.5713978 z=-37.0891457 } float3 82 | // [2] {x=-22.0435200 y=-92.5473785 z=89.4833221 } float3 83 | // [3] {x=48.5274811 y=-94.0671997 z=80.3888092 } float3 84 | // [4] {x=-33.9030113 y=34.4157219 z=95.2085953 } float3 85 | for (int i=0;i<5;i++) 86 | printf("[%i] (%f %f %f)\n",i,points[i].x,points[i].y,points[i].z); 87 | 88 | cukd::box_t<float3> *worldBounds = 0; 89 | CUKD_CUDA_CALL(MallocManaged((void **)&worldBounds,sizeof(*worldBounds))); 90 | float3 *d_points = 0; 91 | CUKD_CUDA_CALL(MallocManaged((void **)&d_points,points.size()*sizeof(float3))); 92 | CUKD_CUDA_CALL(Memcpy(d_points,points.data(),points.size()*sizeof(float3), 93 | cudaMemcpyDefault)); 94 | 95 | cukd::BUILDER_TO_TEST 96 | (d_points,points.size(),worldBounds); 97 | 98 | std::cout << "world bounds is " << *worldBounds << std::endl; 99 | 100 | // The query point is {x=-98.4496613 y=76.9219055 z=25.8888512 } 101 | float3 queryPoint = make_float3(-98.4496613, 76.9219055, 25.8888512); 102 | // The "cutOffRadius" is 5.0 103 | cukd::FcpSearchParams params; 104 | params.cutOffRadius = 5.f; 105 | 106 | // The closest point should be at squared distance of 2.8466301 107 | float expectedSqrDist = 2.8466301f; 108 | 109 | float closestDist = INFINITY; 110 | int closest = -1; 111 | for (int i=0;i<static_cast<int>(points.size());i++) { 112 | // NOTE(review): lines 112-114 were lost from this listing; the body is reconstructed from the surviving uses of dist/closestDist - confirm against the repository 113 | float dist = distance(points[i],queryPoint); 114 | 115 | if (dist >= closestDist) continue; 116 | closestDist = dist; 117 | closest = i; 118 | } 119 | float3 pt = points[closest]; 120 | std::cout << "reference closest dist is " << pt.x << ", " << pt.y << ", " << pt.z 121 | << " at dist " << closestDist << std::endl; 122 | checkResult<<<1,32>>>(d_points,points.size(), 123 | queryPoint,params,expectedSqrDist); 124 | CUKD_CUDA_SYNC_CHECK(); 125 | } 126 | 127 | 128 | -------------------------------------------------------------------------------- /testing/test-include-as-subdirectory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.28) 2 | 3 | project(test-cukd) 4 | 5 |
set(CMAKE_CUDA_ARCHITECTURES native) 6 | add_subdirectory(../../ BUILD_CUKD EXCLUDE_FROM_ALL) 7 | 8 | add_executable(testBuild-sample-from-subdirectory 9 | ../../sample.cu 10 | ) 11 | target_link_libraries(testBuild-sample-from-subdirectory 12 | cudaKDTree 13 | ) 14 | -------------------------------------------------------------------------------- /testing/testBuilderEmptyInput.cu: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #include "cukd/builder.h" 18 | #include <iostream> 19 | 20 | namespace test_float3 { 21 | void test_empty() 22 | { 23 | std::cout << "testing float3 array, empty input." << std::endl; 24 | 25 | // dummy arrays, just to get the types to force the right builder 26 | // instantiation: 27 | float3 *points = 0; 28 | int numPoints = 0; 29 | // BUILDER_TO_TEST supplied by cmakefile: 30 | cukd::BUILDER_TO_TEST(points,numPoints); 31 | } 32 | } 33 | 34 | namespace test_photon { 35 | /*!
for those wondering what this test is for: have a look at Henrik 36 | Wann Jensen, "Realistic Image Synthesis using Photon Mapping" 37 | https://www.amazon.com/Realistic-Image-Synthesis-Photon-Mapping/dp/1568811470 */ 38 | struct Photon { 39 | float3 position; 40 | float3 power; 41 | uint16_t normal_phi; 42 | uint8_t normal_theta; 43 | uint8_t splitDim; 44 | }; 45 | 46 | struct Photon_traits { 47 | using point_t = float3; 48 | 49 | enum { has_explicit_dim = true }; 50 | 51 | static inline __both__ 52 | const point_t &get_point(const Photon &p) 53 | { return p.position; } 54 | 55 | static inline __both__ float get_coord(const Photon &p, int d) 56 | { return cukd::get_coord(p.position,d); } 57 | 58 | static inline __device__ int get_dim(const Photon &p) 59 | { return p.splitDim; } 60 | 61 | static inline __device__ void set_dim(Photon &p, int d) 62 | { p.splitDim = d; } 63 | }; 64 | 65 | void test_empty() 66 | { 67 | std::cout << "testing 'Photons' array (float3 plus payload), empty input." << std::endl; 68 | 69 | // dummy arrays, just to get the types to force the right builder 70 | // instantiation: 71 | Photon *points = 0; 72 | int numPoints = 0; 73 | // BUILDER_TO_TEST supplied by cmakefile; Photon needs its traits spelled out explicitly: 74 | cukd::BUILDER_TO_TEST<Photon,Photon_traits> 75 | (points,numPoints); 76 | } 77 | } 78 | 79 | int main(int, const char **) 80 | { 81 | test_float3::test_empty(); 82 | CUKD_CUDA_SYNC_CHECK(); 83 | 84 | test_photon::test_empty(); 85 | CUKD_CUDA_SYNC_CHECK(); 86 | 87 | return 0; 88 | } 89 | 90 | -------------------------------------------------------------------------------- /testing/testBuilderSimpleInput.cu: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License.
// 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #include "cukd/builder.h" 18 | #include 19 | 20 | #define AS_STRING(x) #x 21 | #define TO_STRING(x) AS_STRING(x) 22 | 23 | namespace test_float3 { 24 | void test_simple() 25 | { 26 | std::cout << "testing `" << TO_STRING(BUILDER_TO_TEST) 27 | << "` on float3 array, 1000 uniform random points." << std::endl; 28 | 29 | int numPoints = 1000; 30 | 31 | float3 *points = 0; 32 | CUKD_CUDA_CALL(MallocManaged((void **)&points,numPoints*sizeof(float3))); 33 | 34 | std::default_random_engine rd; 35 | std::mt19937 gen(rd()); 36 | std::uniform_real_distribution dist(0.f,100.f); 37 | for (int i=0;i dist(0.f,100.f); 92 | for (int i=0;i *worldBounds = 0; 101 | CUKD_CUDA_CALL(MallocManaged((void **)&worldBounds,sizeof(*worldBounds))); 102 | 103 | cukd::BUILDER_TO_TEST 104 | (photons,numPhotons,worldBounds); 105 | 106 | std::cout << "world bounds is " << *worldBounds << std::endl; 107 | CUKD_CUDA_CALL(Free(photons)); 108 | CUKD_CUDA_CALL(Free(worldBounds)); 109 | } 110 | } 111 | 112 | int main(int, const char **) 113 | { 114 | test_float3::test_simple(); 115 | CUKD_CUDA_SYNC_CHECK(); 116 | 117 | test_photon::test_simple(); 118 | CUKD_CUDA_SYNC_CHECK(); 119 | 120 | return 0; 121 | } 122 | 123 | -------------------------------------------------------------------------------- /testing/testHostBuilderEmptyInput.cu: -------------------------------------------------------------------------------- 1 | // 
======================================================================== // 2 | // Copyright 2018-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #include "cukd/builder_host.h" 18 | #include <iostream> 19 | 20 | namespace test_float3 { 21 | void test_empty() 22 | { 23 | std::cout << "testing float3 array, empty input." << std::endl; 24 | 25 | // dummy arrays, just to get the types to force the right builder 26 | // instantiation: 27 | float3 *points = 0; 28 | int numPoints = 0; 29 | // the host builder is called directly (no BUILDER_TO_TEST macro in this file): 30 | cukd::buildTree_host(points,numPoints); 31 | } 32 | } 33 | 34 | namespace test_photon { 35 | /*!
for those wondering what this test is for: have a look at Henrik 36 | Wann Jensen, "Realistic Image Synthesis using Photon Mapping" 37 | https://www.amazon.com/Realistic-Image-Synthesis-Photon-Mapping/dp/1568811470 */ 38 | struct Photon { 39 | float3 position; 40 | float3 power; 41 | uint16_t normal_phi; 42 | uint8_t normal_theta; 43 | uint8_t splitDim; 44 | }; 45 | 46 | struct Photon_traits { 47 | using point_t = float3; 48 | #if 1 49 | enum { has_explicit_dim = false }; 50 | #else 51 | enum { has_explicit_dim = true }; 52 | 53 | static inline __both__ int get_dim(const Photon &p) 54 | { return p.splitDim; } 55 | 56 | static inline __both__ void set_dim(Photon &p, int d) 57 | { p.splitDim = d; } 58 | #endif 59 | 60 | static inline __both__ 61 | const point_t &get_point(const Photon &p) 62 | { return p.position; } 63 | 64 | static inline __both__ float get_coord(const Photon &p, int d) 65 | { return cukd::get_coord(p.position,d); } 66 | }; 67 | 68 | void test_empty() 69 | { 70 | std::cout << "testing 'Photons' array (float3 plus payload), empty input." << std::endl; 71 | 72 | // dummy arrays, just to get the types to force the right builder 73 | // instantiation: 74 | Photon *points = 0; 75 | int numPoints = 0; 76 | // the host builder is called directly; Photon needs its traits spelled out explicitly: 77 | cukd::buildTree_host<Photon,Photon_traits> 78 | (points,numPoints); 79 | } 80 | } 81 | 82 | int main(int, const char **) 83 | { 84 | test_float3::test_empty(); 85 | test_photon::test_empty(); 86 | return 0; 87 | } 88 | 89 | -------------------------------------------------------------------------------- /testing/testHostBuilderSimpleInput.cu: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License.
//
// You may obtain a copy of the License at //
// //
// http://www.apache.org/licenses/LICENSE-2.0 //
// //
// Unless required by applicable law or agreed to in writing, software //
// distributed under the License is distributed on an "AS IS" BASIS, //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and //
// limitations under the License. //
// ======================================================================== //

#include "cukd/builder_host.h"
// NOTE(review): the header name of the next #include is missing in this
// copy of the file. The code below uses std::mt19937, std::vector and
// std::cout, so presumably <random> (at least) belongs here - confirm
// against the repository.
#include

#define AS_STRING(x) #x
#define TO_STRING(x) AS_STRING(x)

namespace test_float3 {
  /*! allocates managed memory for 1000 float3 points and sets up a
      random number generator for values in [0,100). The remainder of
      this function is missing in this copy, see the note below. */
  void test_simple()
  {
    std::cout << "testing `buildTree_host` on float3 array, 1000 uniform random points." << std::endl;

    int numPoints = 1000;

    float3 *points = 0;
    CUKD_CUDA_CALL(MallocManaged((void **)&points,numPoints*sizeof(float3)));

    std::default_random_engine rd;
    std::mt19937 gen(rd());
    // NOTE(review): template arguments appear to have been stripped from
    // this copy (presumably uniform_real_distribution<float>) - confirm.
    std::uniform_real_distribution dist(0.f,100.f);
    // NOTE(review): a large span of text is missing on the next line:
    // everything between the loop condition and the declaration of
    // `h_photons` is gone. That presumably covered the rest of this
    // function, the closing of namespace test_float3, and the opening
    // of namespace test_photon with its Photon type, traits and the
    // start of its test_simple(). Do not edit this region without the
    // original file.
    for (int i=0;i h_photons(numPhotons);
    // from here on the photons live in host memory owned by h_photons
    Photon *photons = h_photons.data();

    // Photon *photons = 0;
    // CUKD_CUDA_CALL(MallocManaged((void **)&photons,numPhotons*sizeof(Photon)));

    std::default_random_engine rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution dist(0.f,100.f);
    // NOTE(review): the loop body and the type of `worldBounds` are
    // missing on the next line as well (presumably a cukd::box_t of
    // float3, judging by the managed allocation and the print below).
    for (int i=0;i *worldBounds = 0;
    CUKD_CUDA_CALL(MallocManaged((void **)&worldBounds,sizeof(*worldBounds)));

    // NOTE(review): the line break before the argument list suggests
    // explicit template arguments were stripped here - confirm.
    cukd::buildTree_host
      (photons,numPhotons,worldBounds);

    // worldBounds is passed to the builder above and printed here
    std::cout << "world bounds is " << *worldBounds << std::endl;
    // no Free() for photons: the memory belongs to h_photons
    // CUKD_CUDA_CALL(Free(photons));
    CUKD_CUDA_CALL(Free(worldBounds));
  }
}

/*! runs the float3 test and then the photon test, each followed by
    CUKD_CUDA_SYNC_CHECK() */
int main(int, const char **)
{
  test_float3::test_simple();
  CUKD_CUDA_SYNC_CHECK();

  test_photon::test_simple();
  CUKD_CUDA_SYNC_CHECK();

  return 0;
}

--------------------------------------------------------------------------------
/testing/testMultipleDefinitions_a.cu:
--------------------------------------------------------------------------------
// ======================================================================== //
// Copyright 2023-2023 Ingo Wald //
// //
// Licensed under the Apache License, Version 2.0 (the "License"); //
// you may not use this file except in compliance with the License. //
// You may obtain a copy of the License at //
// //
// http://www.apache.org/licenses/LICENSE-2.0 //
// //
// Unless required by applicable law or agreed to in writing, software //
// distributed under the License is distributed on an "AS IS" BASIS, //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and //
// limitations under the License. //
// ======================================================================== //

#include "cukd/builder.h"

/*! calls cukd::buildTree on a float3 array, passing nullptr as the
    third argument. Never called from main() below; presumably it only
    exists so this translation unit instantiates the builder (judging
    by the file name, to check for duplicate symbols when linked with
    testMultipleDefinitions_b.cu) - confirm against the CMake setup. */
void foo(float3 *data, int numData)
{
  cukd::buildTree(data,numData,nullptr);
}

/*! does nothing at run time */
int main(int, char **)
{
  return 0;
};

--------------------------------------------------------------------------------
/testing/testMultipleDefinitions_b.cu:
--------------------------------------------------------------------------------
// ======================================================================== //
// Copyright 2023-2023 Ingo Wald //
// //
// Licensed under the Apache License, Version 2.0 (the "License"); //
// you may not use this file except in compliance with the License.
// 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. // 15 | // ======================================================================== // 16 | 17 | #include "cukd/builder.h" 18 | 19 | void foo2(float3 *data, int numData) 20 | { 21 | cukd::buildTree(data,numData,nullptr); 22 | } 23 | 24 | 25 | -------------------------------------------------------------------------------- /testing/testPayloadSampleFromReadme.cu: -------------------------------------------------------------------------------- 1 | // ======================================================================== // 2 | // Copyright 2018-2023 Ingo Wald // 3 | // // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); // 5 | // you may not use this file except in compliance with the License. // 6 | // You may obtain a copy of the License at // 7 | // // 8 | // http://www.apache.org/licenses/LICENSE-2.0 // 9 | // // 10 | // Unless required by applicable law or agreed to in writing, software // 11 | // distributed under the License is distributed on an "AS IS" BASIS, // 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // 13 | // See the License for the specific language governing permissions and // 14 | // limitations under the License. 
// 15 | // ======================================================================== // 16 | 17 | #include "cukd/builder.h" 18 | #include 19 | #include "cukd/fcp.h" 20 | 21 | #define AS_STRING(x) #x 22 | #define TO_STRING(x) AS_STRING(x) 23 | 24 | namespace example1 { 25 | 26 | struct PointPlusPayload { 27 | float3 position; 28 | int payload; 29 | }; 30 | 31 | struct PointPlusPayload_traits 32 | : public cukd::default_data_traits 33 | { 34 | using point_t = float3; 35 | 36 | static inline __device__ __host__ 37 | float3 get_point(const PointPlusPayload &data) 38 | { return data.position; } 39 | 40 | static inline __device__ __host__ 41 | float get_coord(const PointPlusPayload &data, int dim) 42 | { return cukd::get_coord(get_point(data),dim); } 43 | 44 | enum { has_explicit_dim = false }; 45 | 46 | /*! !{ just defining this for completeness, get/set_dim should never 47 | get called for this type because we have set has_explicit_dim 48 | set to false. note traversal should ONLY ever call this 49 | function for data_t's that define has_explicit_dim to true */ 50 | static inline __device__ int get_dim(const PointPlusPayload &) { return -1; } 51 | }; 52 | 53 | int divRoundUp(int a, int b) { return (a+b-1)/b; } 54 | 55 | __global__ 56 | void callFCP(PointPlusPayload *data, int numData, 57 | cukd::box_t *d_worldBounds) 58 | { 59 | int tid = threadIdx.x+blockIdx.x*blockIdx.x; 60 | if (tid >= numData) return; 61 | 62 | int result = cukd::stackBased::fcp 63 | (data[tid].position,*d_worldBounds,data,numData); 64 | } 65 | 66 | void foo(PointPlusPayload *data, int numData, cukd::box_t *d_worldBounds) 67 | { 68 | cukd::buildTree 69 | 71 | (data,numData,d_worldBounds); 72 | 73 | callFCP<<>>(data,numData,d_worldBounds); 74 | } 75 | 76 | void test() 77 | { 78 | std::cout << "testing `" << AS_STRING(BUILDER_TO_TEST) 79 | << "` on 'PointPlusPayloads' array (float3 plus payload), 1000 random data." 
<< std::endl; 80 | 81 | int numPointPlusPayloads = 1000; 82 | 83 | PointPlusPayload *data = 0; 84 | CUKD_CUDA_CALL(MallocManaged((void **)&data,numPointPlusPayloads*sizeof(PointPlusPayload))); 85 | 86 | std::default_random_engine rd; 87 | std::mt19937 gen(rd()); 88 | std::uniform_real_distribution dist(0.f,100.f); 89 | for (int i=0;i *worldBounds = 0; 96 | CUKD_CUDA_CALL(MallocManaged((void **)&worldBounds,sizeof(*worldBounds))); 97 | 98 | // cukd::BUILDER_TO_TEST 99 | // (data,numPointPlusPayloads,worldBounds); 100 | foo(data,numPointPlusPayloads,worldBounds); 101 | 102 | std::cout << "world bounds is " << *worldBounds << std::endl; 103 | CUKD_CUDA_CALL(Free(data)); 104 | CUKD_CUDA_CALL(Free(worldBounds)); 105 | } 106 | 107 | } 108 | 109 | int main(int, const char **) 110 | { 111 | example1::test(); 112 | CUKD_CUDA_SYNC_CHECK(); 113 | 114 | return 0; 115 | } 116 | --------------------------------------------------------------------------------