├── .github
    └── workflows
    │   ├── Ubuntu.yml
    │   └── Windows.yml
├── .gitignore
├── CMakeLists.txt
├── README.md
├── cukd
    ├── box.h
    ├── builder.h
    ├── builder_bitonic.h
    ├── builder_common.h
    ├── builder_host.h
    ├── builder_inplace.h
    ├── builder_thrust.h
    ├── common.h
    ├── cubit
    │   ├── common.h
    │   ├── cubit.h
    │   ├── cubit_maxval.h
    │   └── cubit_zip.h
    ├── cukd-math.h
    ├── data.h
    ├── fcp.h
    ├── helpers.h
    ├── kdtree.h
    ├── knn.h
    ├── spatial-kdtree.h
    ├── traverse-cct.h
    ├── traverse-default-stack-based.h
    ├── traverse-sf-imp.h
    └── traverse-stack-free.h
├── measure.sh
├── sample.cu
├── sampleHost.cu
├── samples
    ├── CMakeLists.txt
    ├── knn-float3-spatialkdtree.cu
    ├── mpiHugeQuery.cu
    └── mpiHugeQueryHost.cu
├── scripts
    ├── README.md
    └── actions
    │   ├── install_cuda_ubuntu.sh
    │   └── install_cuda_windows.ps1
└── testing
    ├── CMakeLists.txt
    ├── compileKNN.cu
    ├── compileSpatialKNN.cu
    ├── floatN-knn-and-fcp.cu
    ├── issue5.cu
    ├── test-include-as-subdirectory
        └── CMakeLists.txt
    ├── testBuilderEmptyInput.cu
    ├── testBuilderSimpleInput.cu
    ├── testBuildersSameResult.cu
    ├── testHostBuilderEmptyInput.cu
    ├── testHostBuilderSimpleInput.cu
    ├── testMultipleDefinitions_a.cu
    ├── testMultipleDefinitions_b.cu
    └── testPayloadSampleFromReadme.cu


/.github/workflows/Ubuntu.yml:
--------------------------------------------------------------------------------
  1 | # Compile project on Ubuntu
  2 | name: Ubuntu
  3 | on:
  4 |   push:
  5 |     paths:
  6 |       - "**"
  7 |       - "!.github/**"
  8 |       - ".github/workflows/Ubuntu.yml"
  9 |       - "!scripts/"
 10 |       - "scripts/actions/install_cuda_ubuntu.sh"
 11 |       - "!*.md"
 12 | jobs:
 13 |   build:
 14 |     runs-on: ${{ matrix.os }}
 15 |     strategy:
 16 |       fail-fast: false
 17 |       # explicit include-based build matrix, of known valid options
 18 |       matrix:
 19 |         include:
 20 |           # 24.04 supports CUDA 12.4+
 21 |           - os: ubuntu-24.04
 22 |             cuda: "12.6"
 23 |             gcc: 13
 24 |           # 22.04 supports CUDA 11.7+
 25 |           - os: ubuntu-22.04
 26 |             cuda: "12.0"
 27 |             gcc: 11
 28 |           # - os: ubuntu-22.04
 29 |           #   cuda: "11.8"
 30 |           #   gcc: 10
 31 |           # - os: ubuntu-22.04
 32 |           #   cuda: "11.7"
 33 |           #   gcc: 10
 34 |           # # 20.04 supports CUDA 11.0+
 35 |           # - os: ubuntu-20.04
 36 |           #   cuda: "11.6"
 37 |           #   gcc: 10
 38 |           # - os: ubuntu-20.04
 39 |           #   cuda: "11.5"
 40 |           #   gcc: 10
 41 |           # - os: ubuntu-20.04
 42 |           #   cuda: "11.4"
 43 |           #   gcc: 10
 44 |           # - os: ubuntu-20.04
 45 |           #   cuda: "11.3"
 46 |           #   gcc: 10
 47 |           # - os: ubuntu-20.04
 48 |           #   cuda: "11.2"
 49 |           #   gcc: 10
 50 |           # - os: ubuntu-20.04
 51 |           #   cuda: "11.0"
 52 |           #   gcc: 9
 53 |           # 18.04 supports CUDA 10.1+ (gxx <= 8), but were deprecated on 2022-08-08 and unsupported from 2023-04-01
 54 |           # - os: ubuntu-18.04
 55 |           #   cuda: "10.2"
 56 |           #   gcc: 8
 57 |           # - os: ubuntu-18.04
 58 |           #   cuda: "10.1"
 59 |           #   gcc: 8
 60 |           # 16.04 runners are deprecated / removed in september 2021.
 61 |           # It should still be possible to install CUDA 8 - CUDA 10 in 18.04 images by using the 16.04 repository, but install_cuda_ubuntu.sh would require changes to do so / a way to override the repository to use.
 62 |     env:
 63 |       build_dir: "build"
 64 |       config: "Release"
 65 | 
 66 |     steps:
 67 |     - uses: actions/checkout@v3
 68 | 
 69 |     - name: Install CUDA
 70 |       env:
 71 |         cuda: ${{ matrix.cuda }}
 72 |       run: ./scripts/actions/install_cuda_ubuntu.sh
 73 |       shell: bash
 74 | 
 75 |     # Specify the correct host compilers
 76 |     - name: Install/Select gcc and g++ 
 77 |       run: |
 78 |         sudo apt-get install -y gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }}
 79 |         echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> $GITHUB_ENV
 80 |         echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV
 81 |         echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV
 82 | 
 83 |     - name: Configure cmake
 84 |       id: configure
 85 |       run: cmake . -B ${{ env.build_dir }} -DCMAKE_BUILD_TYPE=${{ env.config }} -DCMAKE_CUDA_ARCHITECTURES=all-major
 86 | 
 87 |     - name: Configure Error Processing
 88 |       if: ${{ failure() && steps.configure.outcome == 'failure' }}
 89 |       run: |
 90 |           if [[ -d "${{ env.build_dir }}" ]]; then
 91 |             pushd "${{ env.build_dir }}"
 92 |             if [[ -f "CMakeFiles/CMakeOutput.log" ]]; then
 93 |               echo "---- CMakeFiles/CMakeOutput.log"
 94 |               cat CMakeFiles/CMakeOutput.log
 95 |               echo "----"
 96 |             fi
 97 |             if [[ -f "CMakeFiles/CMakeError.log" ]]; then
 98 |               echo "---- CMakeFiles/CMakeError.log"
 99 |               cat CMakeFiles/CMakeError.log
100 |               echo "----"
101 |             fi
102 |           fi
103 | 
104 | 
105 | 
106 |     - name: Build everything else
107 |       working-directory: ${{ env.build_dir }}
108 |       run: cmake --build . --target all --verbose -j `nproc`
109 | 
110 | 


--------------------------------------------------------------------------------
/.github/workflows/Windows.yml:
--------------------------------------------------------------------------------
  1 | # Windows builds.
  2 | name: Windows
  3 | on:
  4 |   push:
  5 |     paths:
  6 |       - "**"
  7 |       - "!.github/**"
  8 |       - ".github/workflows/Windows.yml"
  9 |       - "!scripts/**"  # ignore changes below scripts/ ...
 10 |       - "scripts/actions/install_cuda_windows.ps1"  # ... except the installer this workflow runs
 11 |       - "!*.md"
 12 | jobs:
 13 |   build:
 14 |     runs-on: ${{ matrix.os }}
 15 |     strategy:
 16 |       fail-fast: false
 17 |       # explicit include-based build matrix, of known valid options
 18 |       matrix:
 19 |         include:
 20 |           # Windows-2022 & VS 2022 supports 12.4+
 21 |           - os: windows-2022
 22 |             cuda: "12.4.0"
 23 |             visual_studio: "Visual Studio 17 2022"
 24 |           # Windows-2019 & VS 2019 supports 10.1+
 25 |           #- os: windows-2019
 26 |           #  cuda: "11.5.0"
 27 |           #  visual_studio: "Visual Studio 16 2019"
 28 |           #- os: windows-2019
 29 |           #  cuda: "11.4.0"
 30 |           #  visual_studio: "Visual Studio 16 2019"
 31 |           #- os: windows-2019
 32 |           #  cuda: "11.3.0"
 33 |           #  visual_studio: "Visual Studio 16 2019"
 34 |           #- os: windows-2019
 35 |           #  cuda: "11.2.0"
 36 |           #  visual_studio: "Visual Studio 16 2019"
 37 |           #- os: windows-2019
 38 |           #  cuda: "11.1.0"
 39 |           #  visual_studio: "Visual Studio 16 2019"
 40 |           #- os: windows-2019
 41 |           #  cuda: "11.0.1"
 42 |           #  visual_studio: "Visual Studio 16 2019"
 43 |           #- os: windows-2019
 44 |           #  cuda: "10.2.89"
 45 |           #  visual_studio: "Visual Studio 16 2019"
 46 |           #- os: windows-2019
 47 |           #  cuda: "10.1.243"
 48 |           #  visual_studio: "Visual Studio 16 2019"
 49 | 
 50 |     env:
 51 |       build_dir: "build"
 52 |       config: "Release"
 53 | 
 54 |     steps:
 55 |     - uses: actions/checkout@v3
 56 | 
 57 |     - name: Install CUDA
 58 |       env: 
 59 |         cuda: ${{ matrix.cuda }}
 60 |         visual_studio: ${{ matrix.visual_studio }}
 61 |       shell: powershell
 62 |       run: .\scripts\actions\install_cuda_windows.ps1
 63 | 
 64 |     - name: nvcc check
 65 |       shell: powershell
 66 |       run: |
 67 |         nvcc -V
 68 |         ls $env:CUDA_PATH
 69 |         ls $env:CUDA_PATH\bin
 70 |         ls $env:CUDA_PATH\include
 71 | 
 72 |     - name: cmake version
 73 |       shell: bash
 74 |       run: cmake --version
 75 | 
 76 |     - name: Configure CMake
 77 |       id: configure
 78 |       shell: bash
 79 |       run: cmake . -B ${{ env.build_dir }} -G "${{ matrix.visual_studio }}" -A x64  -DCMAKE_CUDA_ARCHITECTURES=all-major
 80 | 
 81 |     - name: Configure Error Processing
 82 |       if: ${{ (failure() && steps.configure.outcome == 'failure') || success() }}
 83 |       shell: bash
 84 |       run: |
 85 |           if [[ -d "${{ env.build_dir }}" ]]; then
 86 |             pushd "${{ env.build_dir }}"
 87 |             if [[ -f "CMakeFiles/CMakeOutput.log" ]]; then
 88 |               echo "---- CMakeFiles/CMakeOutput.log"
 89 |               cat CMakeFiles/CMakeOutput.log
 90 |               echo "----"
 91 |             fi
 92 |             if [[ -f "CMakeFiles/CMakeError.log" ]]; then
 93 |               echo "---- CMakeFiles/CMakeError.log"
 94 |               cat CMakeFiles/CMakeError.log
 95 |               echo "----"
 96 |             fi
 97 |           fi
 98 | 
 99 |     - name: Build
100 |       working-directory: ${{ env.build_dir }}
101 |       run: cmake --build . --config ${{ env.config }} --target ALL_BUILD --verbose
102 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *~
 2 | textures
 3 | *#
 4 | .#*
 5 | bin
 6 | dbg
 7 | tags
 8 | .ycm_extra_conf.pyc
 9 | *.autosave
10 | *DS_Store*
11 | *.gz
12 | *.rpm
13 | *.zip
14 | *.bak
15 | *.patch
16 | .vscode
17 | deps
18 | tbb
19 | ispc
20 | *.aux
21 | *.bbl
22 | *.blg
23 | *.brf
24 | *.dvi
25 | *.lbl
26 | *.log
27 | *.swp
28 | *.out
29 | Session.vim
30 | .idea
31 | !*png/*.pdf
32 | .vs/
33 | 
34 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # ======================================================================== #
  2 | # Copyright 2021-2024 Ingo Wald                                            #
  3 | #                                                                          #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");          #
  5 | # you may not use this file except in compliance with the License.         #
  6 | # You may obtain a copy of the License at                                  #
  7 | #                                                                          #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0                           #
  9 | #                                                                          #
 10 | # Unless required by applicable law or agreed to in writing, software      #
 11 | # distributed under the License is distributed on an "AS IS" BASIS,        #
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
 13 | # See the License for the specific language governing permissions and      #
 14 | # limitations under the License.                                           #
 15 | # ======================================================================== #
 16 | 
 17 | cmake_minimum_required(VERSION 3.18)
 18 | cmake_policy(SET CMP0048 NEW)
 19 | cmake_policy(SET CMP0104 NEW)
 20 | set(CMAKE_BUILD_TYPE_INIT "Release")
 21 | project(cudaKDTree VERSION 1.0.1 LANGUAGES C CXX)
 22 | 
 23 | if (NOT ("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}"))
 24 |   set(CUKD_IS_SUBPROJECT ON)   # we are not the top-level source dir, i.e., a parent project included us
 25 | else()
 26 |   set(CUKD_IS_SUBPROJECT OFF)  # we are the top-level project
 27 | endif()
 28 | 
 29 | option(BUILD_ALL_TESTS "Build entire type/dimension/kernel test matrix?" OFF)
 30 | 
 31 | # ------------------------------------------------------------------
 32 | # OpenMP support
 33 | # ------------------------------------------------------------------
 34 | find_package(OpenMP)
 35 | if(OpenMP_CXX_FOUND)
 36 |   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
 37 |   
 38 |   # Fix for OpenMP library path issues
 39 |   string(REPLACE ";" " " OpenMP_CXX_LIBRARIES_FIXED "${OpenMP_CXX_LIBRARIES}")
 40 |   set(OpenMP_CXX_LIBRARIES "${OpenMP_CXX_LIBRARIES_FIXED}")
 41 | 
 42 |   if(WIN32)
 43 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp:llvm")
 44 |   else()
 45 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 46 |   endif()
 47 | 
 48 | endif()
 49 | 
 50 | # ------------------------------------------------------------------
 51 | # CUDA and OpenMP integration
 52 | # ------------------------------------------------------------------
 53 | # Ensure CUDA compiler can use OpenMP
 54 | if(OpenMP_CXX_FOUND)  
 55 |   if(WIN32)
 56 |     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/openmp:llvm")
 57 |   else()
 58 |     # For NVCC compiler, need to pass OpenMP flags differently
 59 |     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS}")
 60 | 
 61 |     set(CMAKE_CUDA_HOST_LINK_LAUNCHER "${CMAKE_CXX_COMPILER}")
 62 |     set(CMAKE_CUDA_STANDARD_LIBRARIES "${CMAKE_CUDA_STANDARD_LIBRARIES} ${OpenMP_CXX_LIBRARIES}")
 63 |   endif()
 64 | endif()
 65 | 
 66 | #add_subdirectory(../bitonic ext_bitonic EXCLUDE_FROM_ALL)
 67 | 
 68 | # ------------------------------------------------------------------
 69 | # general cmake project configs
 70 | # ------------------------------------------------------------------
 71 | if (CUKD_IS_SUBPROJECT)
 72 |   # we're used as a subproject (as we should be!) - parent HAS to have
 73 |   # set CMAKE_CUDA_ARCHITECTURES for our code to compile
 74 |   # properly. check if it did, and error out if not
 75 |   if ((NOT CMAKE_CUDA_ARCHITECTURES)
 76 |       OR
 77 |       ((${CMAKE_VERSION} VERSION_LESS 3.24)
 78 | 	AND
 79 | 	("${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "52")))
 80 |     message(FATAL_ERROR "#cudaKDTree: no CMAKE_CUDA_ARCHITECTURES defined, or left for cmake to default to arch 5.2. This is almost certainly a configuration problem that will cause you some grief. Please define CMAKE_CUDA_ARCHITECTURES to the (list of) arch(s) you want to be building for, and do that before the `add_subdirectory()` call that includes cudaKDTree. If in doubt as to what arch to use, for cmake version >= 3.24 you can also set it to 'all-major' or 'native'")
 81 |   endif()
 82 | else()
 83 |   if (CMAKE_CUDA_ARCHITECTURES)
 84 |     # archs were set by the user (e.g., on the cmdline). CI sets this to
 85 |     # 'all-major', which cmake older than 3.24 does not know; the value is
 86 |     # quoted because a list such as "70;80" would otherwise expand into
 87 |     # several if() arguments and break the comparison.
 88 |     if ((${CMAKE_VERSION} VERSION_LESS 3.24) AND ("${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "all-major"))
 89 |       set (CUKD_INIT_ARCHS "70;80")
 90 |     else()
 91 |       set (CUKD_INIT_ARCHS "${CMAKE_CUDA_ARCHITECTURES}")
 92 |     endif()
 93 |   elseif (${CMAKE_VERSION} VERSION_LESS 3.24)
 94 |     set (CUKD_INIT_ARCHS "70;80")   # nothing set, and no 'all-major' before cmake 3.24
 95 |   else()
 96 |     set (CUKD_INIT_ARCHS "all-major")
 97 |   endif()
 98 |   set(CUKD_CUDA_ARCHITECTURES "${CUKD_INIT_ARCHS}"
 99 |     CACHE STRING "CUDA Arch(s) to build against")
100 | 
101 |   set(CMAKE_CUDA_ARCHITECTURES ${CUKD_CUDA_ARCHITECTURES})
102 |   SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
103 |   SET(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
104 |   SET(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
105 | endif()
106 | enable_language(CUDA)
107 | 
108 | # ==================================================================
109 | # this builds three variants of this library, that differ in how the
110 | # k-d tree is being TRAVERSED:
111 | #
112 | # `cudaKDTree-default` uses a stack-based traversal, doesn't require
113 | # the world-space bounding box
114 | #
115 | # `cudaKDTree-sf` uses a stack-free traversal. Can generate more
116 | # efficient code in some cases, but will suffer from the same issues
117 | # as the default variant for certain combinations of input point
118 | # distributions and query point distributions
119 | #
120 | # `cudaKDTree-cct` uses 'closest-corner-tracking', which can in some
121 | # cases be faster than the default traversal method (in particular if
122 | # there is no good cut-off-radius, and queries can originate far from
123 | # the data points, and/or for highly clustered data). It does however
124 | # require to allocate and provide (a tiny amount of) memory for the
125 | # builder to store the world-space bounding box of the input points,
126 | # as well as to pass that pointer to the query method.
127 | #
128 | # ==================================================================
129 | add_library(cudaKDTree INTERFACE)
130 | target_sources(cudaKDTree INTERFACE
131 |   cukd/common.h
132 |   # iw, sep 22, 2024 - intentionally renamed from cukd/math.h to cukd/cukd-math.h to 
133 |   # avoid name conflicts with system math.h if anybody adds cukd/ to include path
134 |   cukd/cukd-math.h
135 |   cukd/box.h
136 |   cukd/builder.h
137 |   cukd/builder_bitonic.h
138 |   cukd/builder_thrust.h
139 |   cukd/builder_inplace.h
140 |   # SPATIAL k-d tree, with planes at arbitrary locations 
141 |   cukd/spatial-kdtree.h
142 |   cukd/fcp.h
143 |   cukd/knn.h
144 |   )
145 | target_include_directories(cudaKDTree INTERFACE
146 |   ${PROJECT_SOURCE_DIR}/
147 |   )
148 | # require (at least) C++14 for host and CUDA code of everything that links to us
149 | target_compile_features(cudaKDTree INTERFACE cxx_std_14 cuda_std_14)
150 | # 'attach' current cmake_cuda_architectures to this library (NOTE(review): plain target property; presumably not forwarded to consumers - confirm)
151 | set_property(TARGET cudaKDTree PROPERTY
152 |   CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})
153 | 
154 | 
155 | 
156 | # ==================================================================
157 | # simple sample example of how to build a k-d tree
158 | # ==================================================================
159 | if (NOT CUKD_IS_SUBPROJECT)
160 |   add_executable(cukd_sample sample.cu)
161 |   target_link_libraries(cukd_sample cudaKDTree)
162 | 
163 |   add_executable(cukd_sampleHost sampleHost.cu)
164 |   target_link_libraries(cukd_sampleHost PUBLIC cudaKDTree)
165 | 
166 |   if(OpenMP_CXX_FOUND)
167 |     target_link_libraries(cukd_sampleHost PUBLIC OpenMP::OpenMP_CXX)
168 |     target_compile_definitions(cukd_sampleHost PUBLIC OPENMP_FOUND)
169 |   endif()
170 | 
171 |   find_package(MPI)
172 |   if (MPI_FOUND) 
173 |     add_executable(cukd_mpiHugeQuery samples/mpiHugeQuery.cu)
174 |     target_link_libraries(cukd_mpiHugeQuery PUBLIC cudaKDTree MPI::MPI_CXX)
175 | 
176 |     add_executable(cukd_mpiHugeQueryHost samples/mpiHugeQueryHost.cu)
177 |     target_link_libraries(cukd_mpiHugeQueryHost PUBLIC cudaKDTree MPI::MPI_CXX)
178 | 
179 |     if(OpenMP_CXX_FOUND)
180 |       target_link_libraries(cukd_mpiHugeQueryHost PUBLIC OpenMP::OpenMP_CXX)
181 |       target_compile_definitions(cukd_mpiHugeQueryHost PUBLIC OPENMP_FOUND)
182 |     endif()
183 | 
184 |   endif()
185 | endif()
186 | 
187 | 
188 | 
189 | 
190 | # ==================================================================
191 | # create _a lot_ of test cases: this generates the whole matrix of
192 | # traversal_method x num_dims x {fcp,knn}
193 | # ==================================================================
194 | if (BUILD_ALL_TESTS)
195 |   # test 2, 3, 4, and 8-dimensional data; the latter should - if it
196 |   # works for N=8, work for any other N>4
197 |   #  set(DIMS_TO_BUILD 3)
198 |   option(CUKD_ENABLE_STATS "Enable Stats tracking?" OFF)
199 |   if (CUKD_ENABLE_STATS)
200 |     set(CUKD_ENABLE_STATS_VALUE 1)
201 |   else()
202 |     set(CUKD_ENABLE_STATS_VALUE 0)
203 |   endif()
204 |   set(DIMS_TO_BUILD 2 3 4 8)
205 |   foreach (D IN ITEMS ${DIMS_TO_BUILD})
206 |     # test all three traversal methods listed below
207 |     foreach(method stackBased stackFree cct)
208 |       # test knn queries, on regular trees (no explicit dimension per node)
209 |       add_executable(cukd_float${D}-knn-${method} testing/floatN-knn-and-fcp.cu)
210 |       target_link_libraries(cukd_float${D}-knn-${method} cudaKDTree)
211 |       target_compile_definitions(cukd_float${D}-knn-${method}
212 | 	PUBLIC
213 | 	-DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE}
214 | 	-DD_FROM_CMAKE=${D}
215 | 	-DUSE_KNN=1
216 | 	-DTRAVERSAL_METHOD=${method})
217 | 	
218 |       # test knn queries, with 'explicit-dim' trees
219 |       add_executable(cukd_float${D}-knn-${method}-xd testing/floatN-knn-and-fcp.cu)
220 |       target_link_libraries(cukd_float${D}-knn-${method}-xd cudaKDTree)
221 |       target_compile_definitions(cukd_float${D}-knn-${method}-xd
222 | 	PUBLIC
223 | 	-DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE}
224 | 	-DD_FROM_CMAKE=${D}
225 | 	-DEXPLICIT_DIM=1
226 | 	-DUSE_KNN=1
227 | 	-DTRAVERSAL_METHOD=${method})
228 | 
229 |       # test fcp queries, on regular trees
230 |       add_executable(cukd_float${D}-fcp-${method} testing/floatN-knn-and-fcp.cu)
231 |       target_link_libraries(cukd_float${D}-fcp-${method} cudaKDTree)
232 |       target_compile_definitions(cukd_float${D}-fcp-${method}
233 | 	PUBLIC
234 | 	-DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE}
235 | 	-DD_FROM_CMAKE=${D}
236 | 	-DTRAVERSAL_METHOD=${method})
237 | 
238 |       # test fcp queries, with 'explicit-dim' trees
239 |       add_executable(cukd_float${D}-fcp-${method}-xd testing/floatN-knn-and-fcp.cu)
240 |       target_link_libraries(cukd_float${D}-fcp-${method}-xd cudaKDTree)
241 |       target_compile_definitions(cukd_float${D}-fcp-${method}-xd
242 | 	PUBLIC
243 | 	-DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE}
244 | 	-DD_FROM_CMAKE=${D}
245 | 	-DEXPLICIT_DIM=1
246 | 	-DTRAVERSAL_METHOD=${method})
247 | 
248 |     endforeach()
249 | 
250 | 
251 |     foreach(method stackBased cct)
252 |       # test knn queries, on regular trees (no explicit dimension per node)
253 |       add_executable(cukd_float${D}-knn-spatial-${method} testing/floatN-knn-and-fcp.cu)
254 |       target_link_libraries(cukd_float${D}-knn-spatial-${method} cudaKDTree)
255 |       target_compile_definitions(cukd_float${D}-knn-spatial-${method}
256 | 	PUBLIC
257 | 	-DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE}
258 | 	-DD_FROM_CMAKE=${D}
259 | 	-DSPATIAL=1
260 | 	-DUSE_KNN=1
261 | 	-DTRAVERSAL_METHOD=${method})
262 | 
263 |       # test fcp queries, on regular trees
264 |       add_executable(cukd_float${D}-fcp-spatial-${method} testing/floatN-knn-and-fcp.cu)
265 |       target_link_libraries(cukd_float${D}-fcp-spatial-${method} cudaKDTree)
266 |       target_compile_definitions(cukd_float${D}-fcp-spatial-${method}
267 | 	PUBLIC
268 | 	-DCUKD_ENABLE_STATS=${CUKD_ENABLE_STATS_VALUE}
269 | 	-DSPATIAL=1
270 | 	-DD_FROM_CMAKE=${D}
271 | 	-DTRAVERSAL_METHOD=${method})
272 | 
273 |     endforeach()
274 |   endforeach()
275 | endif()
276 | 
277 | 
278 | if (NOT CUKD_IS_SUBPROJECT)
279 |   add_subdirectory(samples)
280 | endif()
281 | 
282 | if (BUILD_ALL_TESTS)
283 |   # add some unit tests
284 |   include(CTest) 
285 |   add_subdirectory(testing)
286 | endif()
287 | 
288 | 
289 | 


--------------------------------------------------------------------------------
/cukd/box.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2024 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | /* copied from OWL project, and put into new namespace to avoid naming conflicts.*/
 18 | 
 19 | #pragma once
 20 | 
 21 | #include "cukd/cukd-math.h"
 22 | 
 23 | namespace cukd {
 24 |   
 25 |   template<typename T> inline __both__ T empty_box_lower();
 26 |   template<typename T> inline __both__ T empty_box_upper();
 27 | 
 28 |   template<> inline __both__ float empty_box_lower<float>() { return +INFINITY; }
 29 |   template<> inline __both__ float empty_box_upper<float>() { return -INFINITY; }
 30 |   template<> inline __both__ int empty_box_lower<int>() { return INT_MAX; }
 31 |   template<> inline __both__ int empty_box_upper<int>() { return INT_MIN; }
 32 |   
 33 | 
 34 |   /*! axis-aligned box, stored as its 'lower' and 'upper' corner points */
 35 |   template<typename point_t>
 36 |   struct box_t {
 37 |     using point_traits = ::cukd::point_traits<point_t>;
 38 |     using scalar_t = typename point_traits::scalar_t;
 39 | 
 40 |     // inline __both__ point_t size() const { return upper - lower; }
 41 | 
 42 |     /*! returns the dimension in which the box has the widest extent */
 43 |     inline __both__ int widestDimension() const;
 44 | 
 45 |     /*! true if 'p' is inside the box; points on the boundary count as inside */
 46 |     inline __both__ bool contains(const point_t &p) const
 47 |     {
 48 |       for (int d=0;d<point_traits::num_dims;d++) {
 49 |         if (point_traits::get_coord(p,d) < point_traits::get_coord(lower,d)) return false;
 50 |         if (point_traits::get_coord(p,d) > point_traits::get_coord(upper,d)) return false;
 51 |       }
 52 |       return true;
 53 |     }
 54 | 
 55 |     /*! grows the box to include 'p'; relies on min()/max() overloads for point_t that are declared elsewhere */
 56 |     inline __both__ void grow(const point_t &p)
 57 |     {
 58 |       lower = min(lower,p);
 59 |       upper = max(upper,p);
 60 |     }
 61 | 
 62 |     /*! set to an empty box: each 'lower' coordinate becomes empty_box_lower(), each 'upper' one empty_box_upper() */
 63 |     inline __both__ void setEmpty()
 64 |     {
 65 |       for (int d=0;d<point_traits::num_dims;d++) {
 66 |         point_traits::set_coord(lower,d,empty_box_lower<scalar_t>());
 67 |         point_traits::set_coord(upper,d,empty_box_upper<scalar_t>());
 68 |       }
 69 |     }
 70 | 
 71 |     /*! set to an infinitely _open_ box */
 72 |     inline __both__ void setInfinite()
 73 |     {
 74 |       for (int d=0;d<point_traits::num_dims;d++) {
 75 |         // same sentinels as setEmpty(), but intentionally swapped
 76 |         point_traits::set_coord(lower,d,empty_box_upper<scalar_t>());
 77 |         point_traits::set_coord(upper,d,empty_box_lower<scalar_t>());
 78 |       }
 79 |     }
 80 | 
 81 |     point_t lower, upper;
 82 |   };
 83 | 
 84 |   /*! helper function for printf debugging */
 85 |   template<typename T>
 86 |   inline std::ostream &operator<<(std::ostream &o, const box_t<T> &b)
 87 |   {
 88 |     o << "{" << b.lower << "," << b.upper << "}";
 89 |     return o;
 90 |   }
 91 |   
 92 |   /*! computes the closest point to 'point' that's within the given
 93 |     box; if point itself is inside that box it'll be the point
 94 |     itself, otherwise it'll be a point on the outside surface of the
 95 |     box */
 96 |   template<typename point_t>
 97 |   inline __host__ __device__
 98 |   point_t project(const cukd::box_t<point_t>  &box,
 99 |                   const point_t               &point)
100 |   {
101 |     return min(max(point,box.lower),box.upper);
102 |   }
103 | 
104 |   // ------------------------------------------------------------------
105 |   template<typename point_t>
106 |   inline __host__ __device__
107 |   auto sqrDistance(const box_t<point_t> &box, const point_t &point)
108 |   { return cukd::sqrDistance(project(box,point),point); }
109 | 
110 |   /*! returns the dimension in which the box has the widest extent; ties
111 |     go to the later dimension, and 0 is returned if all extents are negative */
112 |   template<typename point_t>
113 |   inline __both__ int box_t<point_t>::widestDimension() const
114 |   {
115 |     int d_best = 0;
116 |     scalar_t w_best = scalar_t(0);
117 |     for (int d=0;d<point_traits::num_dims;d++) {
118 |       // read coordinates through point_traits, like the other box_t members do
119 |       const scalar_t w_d = point_traits::get_coord(upper,d) - point_traits::get_coord(lower,d);
120 |       if (w_d < w_best) continue;
121 |       w_best = w_d;
122 |       d_best = d;
123 |     }
124 |     return d_best;
125 |   }
126 |   
127 | } // ::cukd
128 | 


--------------------------------------------------------------------------------
/cukd/builder.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2019-2023 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #pragma once
 18 | 
 19 | #include "cukd/helpers.h"
 20 | #include "cukd/box.h"
 21 | #include <cuda.h>
 22 | 
 23 | /* This is a single include file that pulls in all builder variants.
 24 | 
 25 |   Builder variants "cheat sheet"
 26 | 
 27 |   builder_thrust:
 28 |   - temporary memory overhead for N points: N ints + order 2N points 
 29 |     (ie, total mem order 3x that of input data!)
 30 |   - perf 100K float3s (4090) :   ~4ms
 31 |   - perf   1M float3s (4090) :  ~20ms
 32 |   - perf  10M float3s (4090) : ~200ms
 33 |   
 34 |   builder_bitonic:
 35 |   - temporary memory overhead for N points: N ints 
 36 |     (ie, ca 30% mem overhead for float3)
 37 |   - perf 100K float3s (4090) :  ~10ms
 38 |   - perf   1M float3s (4090) :  ~27ms
 39 |   - perf  10M float3s (4090) : ~390ms
 40 | 
 41 |   builder_inplace:
 42 |   - temporary memory overhead for N points: nada, nil, zilch.
 43 |   - perf 100K float3s (4090) :  ~10ms
 44 |   - perf   1M float3s (4090) : ~220ms
 45 |   - perf  10M float3s (4090) : ~4.3s
 46 | 
 47 |  */
 48 | 
 49 | #include "cukd/builder_thrust.h"
 50 | #include "cukd/builder_bitonic.h"
 51 | #include "cukd/builder_inplace.h"
 52 | 
 53 | namespace cukd {
 54 |   /*! Builds a left-balanced k-d tree over the given data points,
 55 |     using data_traits to describe the type of data points that this
 56 |     tree is being built over (i.e., how to separate a data item's
 57 |     positional coordinates from any potential payload (if such exists,
 58 |     e.g., in a 'photon' in photon mapping), what vector/point type to
 59 |     use for this coordinate data (e.g., float3), whether the data have
 60 |     a field to store an explicit split dimension (for Bentley and
 61 |     Samet's 'optimized' trees), etc.).
 62 | 
 63 |     Since a (point-)k-d tree's tree topology is implicit in the
 64 |     ordering of its data points this will re-arrange the data points
 65 |     to fulfill the balanced k-d tree criterion - ie, this WILL modify
 66 |     the data array: no individual entry will get changed, but their
 67 |     order might. If data_traits::has_explicit_dim is true this
 68 |     builder will choose each node's split dimension based on the
 69 |     widest dimension of that node's subtree's domain; if not, it will
 70 |     choose the dimension in a round-robin style, where the root level
 71 |     is split along the 'x' coordinate, the next level in y, etc.
 72 | 
 73 |     'worldBounds' is a pointer to device-writeable memory to store the
 74 |     world-space bounding box of the data points that the builder will
 75 |     compute. If data_traits::has_explicit_dim is true this memory
 76 |     _has_ to be provided to the builder, and the builder will fill it
 77 |     in; if data_traits::has_explicit_dim is false, this memory region
 78 |     is optional: the builder _will_ fill it in if provided, but will
 79 |     ignore it if it isn't.
 80 | 
 81 |     *** Example 1: To build a 2D k-dtree over a CUDA int2 type (no other
 82 |     payload than the two coordinates):
 83 |       
 84 |     buildTree<int2>(....);
 85 | 
 86 |     In this case no data_traits need to be supplied because these will
 87 |     be auto-computed for simple cuda vector types.
 88 |       
 89 |     *** Example 2: to build a 2D kd-tree over a data type of float4,
 90 |     where the first 2 coordinates of each point is the dimension we
 91 |     want to build the kd-tree over, and the other 2 coordinates
 92 |     are arbitrary other payload data:
 93 |       
 94 |     struct float2_plus_payload_traits {
 95 |        using point_t = float2;
 96 |        static inline __both__ point_t get_point(const float4 &n)
 97 |        { return make_float2(n.x, n.y); }
 98 |     };
 99 |     buildTree<float4,float2_plus_payload_traits>(...);
100 |       
101 |     *** Example 3: assuming you have a data type 'Photon' and a
102 |     Photon_traits that has Photon_traits::has_explicit_dim set to true:
103 |       
104 |     cukd::box_t<float3> *d_worldBounds = <cudaMalloc>;
105 |     buildTree<Photon,Photon_traits>(..., d_worldBounds, ...);
106 |       
107 |   */
108 |   template<typename data_t, typename data_traits=default_data_traits<data_t>>
109 |   void buildTree(/*! the data points, in device-readable and -writeable
110 |                      memory; the builder re-orders them in place */
111 |                  data_t *d_points,
112 |                  /*! how many entries `d_points` holds */
113 |                  int numPoints,
114 |                  /*! where to store the world-space bounding box of all
115 |                      points (device-writeable); may be null only if
116 |                      data_traits::has_explicit_dim is false */
117 |                  box_t<typename data_traits::point_t> *worldBounds=0,
118 |                  /*! stream that the builder is asked to run on; the
119 |                      thrust-based builder may additionally do global
120 |                      device syncs */
121 |                  cudaStream_t stream=0,
122 |                  /*! handed through, unchanged, to the selected builder */
123 |                  GpuMemoryResource &memResource=defaultGpuMemResource())
124 |   {
125 |     // The builder variant gets picked at compile time, by whichever
126 |     // of the CUKD_BUILDER_* macros the including code has defined.
127 | #ifdef CUKD_BUILDER_INPLACE
128 |     // Fully in-place: allocates no temporary memory at all, but for
129 |     // large arrays it can be 10x-20x slower than the others (10M
130 |     // float3 points: about 4.3s, vs about 200ms for builder_thrust
131 |     // and about 390ms for builder_bitonic).
132 |     buildTree_inPlace<data_t,data_traits>(d_points,numPoints,worldBounds,
133 |                                           stream,memResource);
134 | #else
135 | # ifdef CUKD_BUILDER_BITONIC
136 |     // Tag-update algorithm, sorted with bitonic sort rather than with
137 |     // thrust: does not need thrust, and needs only one int per point
138 |     // (for the tag) of extra memory, but at around 10M points it is
139 |     // about 2x slower than the thrust variant.
140 |     buildTree_bitonic<data_t,data_traits>(d_points,numPoints,worldBounds,
141 |                                           stream,memResource);
142 | # else
143 |     // Default: tag-update algorithm with thrust sorting the tag:node
144 |     // pairs. This is the fastest builder, but thrust's sort does not
145 |     // properly work in a stream and allocates quite a bit of
146 |     // temporary memory while sorting.
147 |     buildTree_thrust<data_t,data_traits>(d_points,numPoints,worldBounds,
148 |                                          stream,memResource);
149 | # endif
150 | #endif
151 |   }
152 | }
153 | 
154 | 


--------------------------------------------------------------------------------
/cukd/builder_common.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2019-2023 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #pragma once
 18 | 
 19 | #include "cukd/helpers.h"
 20 | #include "cukd/box.h"
 21 | #include "cukd/data.h"
 22 | 
 23 | #include <cuda.h>
 24 | 
 25 | namespace cukd {
 26 | 
 27 |   /*! exchanges the values of `a` and `b` through a temporary copy; the
 28 |       name carries a cukd_ prefix to avoid name clashes with/in thrust */
 29 |   template<typename T> inline __both__
 30 |   void cukd_swap(T &a, T &b)
 31 |   { T saved = a; a = b; b = saved; }
 32 | 
 33 |   
 34 |   /*! compile-time switch around data_traits::set_dim/get_dim, so that
 35 |     generic code can "call" them even for data that has no such functions */
 36 |   template<typename data_t, typename data_traits, bool has_dim>
 37 |   struct if_has_dims;
 38 |   
 39 |   /*! has_dim==false: nothing gets stored; queries yield the fallback */
 40 |   template<typename data_t, typename data_traits>
 41 |   struct if_has_dims<data_t,data_traits,false> {
 42 |     static inline __both__ void set_dim(data_t &/*node*/, int /*dim*/) { }
 43 |     static inline __both__ int get_dim(const data_t &/*node*/, int fallback)
 44 |     { return fallback; }
 45 |   };
 46 |   
 47 |   /*! has_dim==true: both calls get forwarded to data_traits */
 48 |   template<typename data_t, typename data_traits>
 49 |   struct if_has_dims<data_t,data_traits,true> {
 50 |     static inline __both__ void set_dim(data_t &node, int dim)
 51 |     { data_traits::set_dim(node,dim); }
 52 |     static inline __both__ int get_dim(const data_t &node, int /*fallback*/)
 53 |     { return data_traits::get_dim(node); }
 54 |   };
 55 |   /*! @} */
 56 |   
 57 |   /*! computes the bounding box of the `numPoints` points in `d_points`
 58 |       and stores it in `*d_bounds`, using kernels launched on `stream` */
 59 |   template<typename data_t, 
 60 |            typename data_traits=default_data_traits<data_t>>
 61 |   void computeBounds(cukd::box_t<typename data_traits::point_t> *d_bounds,
 62 |                      const data_t *d_points,
 63 |                      int numPoints,
 64 |                      cudaStream_t stream=0);
 65 |   /*! CPU counterpart of computeBounds(): a plain loop, no kernel launches */
 66 |   template<typename data_t, 
 67 |            typename data_traits=default_data_traits<data_t>>
 68 |   void host_computeBounds(cukd::box_t<typename data_traits::point_t> *d_bounds,
 69 |                           const data_t *d_points,
 70 |                           int numPoints);
 71 | 
 72 |   // ==================================================================
 73 |   // IMPLEMENTATION SECTION
 74 |   // ==================================================================
 75 | 
 76 |   /*! kernel that seeds both corners of *d_bounds with the first point;
 77 |       only thread 0 of a block does any work */
 78 |   template<typename data_t, typename data_traits> __global__
 79 |   void computeBounds_copyFirst(cukd::box_t<typename data_traits::point_t> *d_bounds,
 80 |                                const data_t *d_points)
 81 |   {
 82 |     if (threadIdx.x == 0) {
 83 |       const typename data_traits::point_t first = data_traits::get_point(d_points[0]);
 84 |       d_bounds->lower = d_bounds->upper = first;
 85 |     }
 86 |   }
 87 | 
 88 | #ifdef __CUDA_ARCH__
 89 |   /*! int variants: these just forward to CUDA's built-in atomics */
 90 |   inline __device__ int atomicMin(int *addr, int value)
 91 |   { return ::atomicMin(addr,value); }
 92 |   inline __device__ int atomicMax(int *addr, int value)
 93 |   { return ::atomicMax(addr,value); }
 94 |   /*! float variants: compare-and-swap loops that return the value last seen
 95 |       at *addr. The retry test compares raw bits: as floats, NaN != NaN would
 96 |       spin forever, and -0 == +0 would take a failed swap for a successful one */
 97 |   inline __device__ float atomicMin(float *addr, float value)
 98 |   {
 99 |     float old = *addr;
100 |     if (old <= value) return old;
101 |     int expected;
102 |     do {
103 |       expected = __float_as_int(old);
104 |       old = __int_as_float(atomicCAS((int*)addr,expected,__float_as_int(value)));
105 |       value = min(value,old);
106 |     } while (__float_as_int(old) != expected);
107 |     return old;
108 |   }
109 |   /*! same as the float atomicMin above, but raising *addr to the maximum */
110 |   inline __device__ float atomicMax(float *addr, float value)
111 |   {
112 |     float old = *addr;
113 |     if (old >= value) return old;
114 |     int expected;
115 |     do {
116 |       expected = __float_as_int(old);
117 |       old = __int_as_float(atomicCAS((int*)addr,expected,__float_as_int(value)));
118 |       value = max(value,old);
119 |     } while (__float_as_int(old) != expected);
120 |     return old;
121 |   }
122 | #endif
123 | 
124 |   /*! kernel, one thread per point: widens *d_bounds, one coordinate at a
125 |       time, with atomic min/max so that it contains the thread's point */
126 |   template<typename data_t, typename data_traits> __global__
127 |   void computeBounds_atomicGrow(cukd::box_t<typename data_traits::point_t> *d_bounds,
128 |                                 const data_t *d_points,
129 |                                 int numPoints)
130 |   {
131 |     using point_t  = typename data_traits::point_t;
132 |     using traits_t = ::cukd::point_traits<point_t>;
133 |     using scalar_t = typename traits_t::scalar_t;
134 |     enum { num_dims = traits_t::num_dims };
135 |     
136 |     const int pointID = blockIdx.x*blockDim.x + threadIdx.x;
137 |     // surplus threads (the grid may be larger than numPoints) do nothing
138 |     if (pointID < numPoints) {
139 |       point_t point = data_traits::get_point(d_points[pointID]);
140 | #pragma unroll(num_dims)
141 |       for (int dim=0; dim<num_dims; ++dim) {
142 |         // get_coord on the box yields a reference, so '&' is the box's own field
143 |         const scalar_t coord = traits_t::get_coord(point,dim);
144 |         atomicMin(&traits_t::get_coord(d_bounds->lower,dim),coord);
145 |         atomicMax(&traits_t::get_coord(d_bounds->upper,dim),coord);
146 |       }
147 |     }
148 |   }
149 | 
150 |   /*! launches (on stream `s`) the two kernels that compute the bounding
151 |       box of `d_points` into `*d_bounds`; no-op if there are no points */
152 |   template<typename data_t, typename data_traits>
153 |   void computeBounds(cukd::box_t<typename data_traits::point_t> *d_bounds,
154 |                      const data_t *d_points, int numPoints, cudaStream_t s)
155 |   {
156 |     // empty input: copyFirst would read d_points[0] out of bounds, and a
157 |     // grid of zero blocks is not a valid launch configuration
158 |     if (numPoints <= 0) return;
159 |     computeBounds_copyFirst<data_t,data_traits><<<1,1,0,s>>>(d_bounds,d_points);
160 |     // one thread per point, 128 threads per block
161 |     computeBounds_atomicGrow<data_t,data_traits>
162 |       <<<divRoundUp(numPoints,128),128,0,s>>>(d_bounds,d_points,numPoints);
163 |   }
164 | 
165 |   /*! plain CPU loop: resets *d_bounds to empty, then grows it by each point */
166 |   template<typename data_t, typename data_traits>
167 |   void host_computeBounds(cukd::box_t<typename data_traits::point_t> *d_bounds,
168 |                           const data_t *d_points, int numPoints)
169 |   {
170 |     auto &box = *d_bounds;
171 |     box.setEmpty();
172 |     for (int pointID=0; pointID<numPoints; ++pointID)
173 |       box.grow(data_traits::get_point(d_points[pointID]));
174 |   }
175 |   
176 | 
177 |   /*! computes the domain of the subtree rooted at node `subtree`: starts
178 |       with the world-space box `*d_bounds`, then walks _up_ the tree and
179 |       clips the box at the split plane of every ancestor. The box's
180 |       coordinates are read through point_traits, like in the rest of this
181 |       file, so no free get_coord() is needed for point_t */
182 |   template<typename data_t,typename data_traits>
183 |   inline __both__
184 |   cukd::box_t<typename data_traits::point_t>
185 |   findBounds(int subtree,
186 |              const cukd::box_t<typename data_traits::point_t> *d_bounds,
187 |              data_t *d_nodes)
188 |   {
189 |     using point_t  = typename data_traits::point_t;
190 |     using point_traits = ::cukd::point_traits<point_t>;
191 |     using scalar_t = typename point_traits::scalar_t;
192 |     enum { num_dims = point_traits::num_dims };
193 |     
194 |     cukd::box_t<point_t> bounds = *d_bounds;
195 |     int curr = subtree;
196 |     while (curr > 0) {
197 |       // the children of node p are 2p+1 (left) and 2p+2 (right)
198 |       const int     parent = (curr+1)/2-1;
199 |       const data_t &parent_node = d_nodes[parent];
200 |       const int     parent_dim
201 |         = if_has_dims<data_t,data_traits,data_traits::has_explicit_dim>
202 |         ::get_dim(parent_node,/* if not: */BinaryTree::levelOf(parent) % num_dims);
203 |       const scalar_t parent_split_pos
204 |         = data_traits::get_coord(parent_node,parent_dim);
205 |       if (curr & 1) {
206 |         // odd index, i.e., a left child: the plane caps the upper bound
207 |         point_traits::set_coord(bounds.upper,parent_dim,
208 |                                 min(parent_split_pos,
209 |                                     point_traits::get_coord(bounds.upper,parent_dim)));
210 |       } else {
211 |         // even index, i.e., a right child: the plane raises the lower bound
212 |         point_traits::set_coord(bounds.lower,parent_dim,
213 |                                 max(parent_split_pos,
214 |                                     point_traits::get_coord(bounds.lower,parent_dim)));
215 |       }
216 |       curr = parent;
217 |     }
218 |     return bounds;
219 |   }
220 |   
221 | 
222 | }
223 | 


--------------------------------------------------------------------------------
/cukd/builder_host.h:
--------------------------------------------------------------------------------
 1 | // ======================================================================== //
 2 | // Copyright 2019-2023 Ingo Wald                                            //
 3 | //                                                                          //
 4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
 5 | // you may not use this file except in compliance with the License.         //
 6 | // You may obtain a copy of the License at                                  //
 7 | //                                                                          //
 8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
 9 | //                                                                          //
10 | // Unless required by applicable law or agreed to in writing, software      //
11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
13 | // See the License for the specific language governing permissions and      //
14 | // limitations under the License.                                           //
15 | // ======================================================================== //
16 | 
17 | #pragma once
18 | 
19 | #include "cukd/builder_thrust.h"
20 | 
21 | // buildTree_host is currently based on the thrust builder, and
22 | // implemented as part of builder_thrust.h
23 | 


--------------------------------------------------------------------------------
/cukd/common.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2023 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | /* copied from OWL project, and put into new namespace to avoid naming conflicts.*/
 18 | 
 19 | #pragma once
 20 | 
 21 | #ifndef _USE_MATH_DEFINES
 22 | #  define _USE_MATH_DEFINES
 23 | #endif
 24 | #include <math.h> // using cmath causes issues under Windows
 25 | #include <cuda_runtime.h>
 26 | #include <math_constants.h>
 27 | #include <cuda.h>
 28 | #include <stdio.h>
 29 | #include <iostream>
 30 | #include <stdexcept>
 31 | #include <memory>
 32 | #include <assert.h>
 33 | #include <string>
 34 | #include <math.h>
 35 | #include <cmath>
 36 | #include <algorithm>
 37 | #include <sstream>
 38 | #ifdef __GNUC__
 39 | #include <execinfo.h>
 40 | #include <sys/time.h>
 41 | #endif
 42 | #include <fstream>
 43 | #include <iostream>
 44 | 
 45 | #ifdef _WIN32
 46 | #ifndef WIN32_LEAN_AND_MEAN
 47 | #define WIN32_LEAN_AND_MEAN
 48 | #endif
 49 | #include <Windows.h>
 50 | #ifdef min
 51 | #undef min
 52 | #endif
 53 | #ifdef max
 54 | #undef max
 55 | #endif
 56 | #endif
 57 | 
 58 | #if !defined(WIN32)
 59 | #include <signal.h>
 60 | #endif
 61 | 
 62 | #if defined(_MSC_VER)
 63 | #  define CUKD_DLL_EXPORT __declspec(dllexport)
 64 | #  define CUKD_DLL_IMPORT __declspec(dllimport)
 65 | #elif defined(__clang__) || defined(__GNUC__)
 66 | #  define CUKD_DLL_EXPORT __attribute__((visibility("default")))
 67 | #  define CUKD_DLL_IMPORT __attribute__((visibility("default")))
 68 | #else
 69 | #  define CUKD_DLL_EXPORT
 70 | #  define CUKD_DLL_IMPORT
 71 | #endif
 72 | 
 73 | # define CUKD_INTERFACE /* nothing - currently not building any special 'owl.dll' */
 74 | #if defined(_MSC_VER)
 75 | #  define __PRETTY_FUNCTION__ __FUNCTION__
 76 | #endif
 77 | 
 78 | 
 79 | #ifndef PRINT
 80 | # define PRINT(var) std::cout << #var << "=" << var << std::endl;
 81 | #ifdef __WIN32__
 82 | # define PING std::cout << __FILE__ << "::" << __LINE__ << ": " << __FUNCTION__ << std::endl;
 83 | #else
 84 | # define PING std::cout << __FILE__ << "::" << __LINE__ << ": " << __PRETTY_FUNCTION__ << std::endl;
 85 | #endif
 86 | #endif
 87 | /* __both__ expands to "__host__ __device__" if __CUDA_ARCH__ is defined, else to nothing */
 88 | #if defined(__CUDA_ARCH__)
 89 | # define __owl_device   __device__
 90 | # define __owl_host     __host__
 91 | #else
 92 | # define __owl_device   /* ignore */
 93 | # define __owl_host     /* ignore */
 94 | #endif
 95 | 
 96 | # define __both__   __owl_host __owl_device
 97 | 
 98 | 
 99 | #ifdef __GNUC__
100 | #define MAYBE_UNUSED __attribute__((unused))
101 | #else
102 | #define MAYBE_UNUSED
103 | #endif
104 | 
105 | #define CUKD_NOTIMPLEMENTED throw std::runtime_error(std::string(__PRETTY_FUNCTION__)+" not implemented")
106 | /* ANSI escape sequences for colored terminal output; empty strings if WIN32 is defined */
107 | #ifdef WIN32
108 | # define CUKD_TERMINAL_RED ""
109 | # define CUKD_TERMINAL_GREEN ""
110 | # define CUKD_TERMINAL_LIGHT_GREEN ""
111 | # define CUKD_TERMINAL_YELLOW ""
112 | # define CUKD_TERMINAL_BLUE ""
113 | # define CUKD_TERMINAL_LIGHT_BLUE ""
114 | # define CUKD_TERMINAL_RESET ""
115 | # define CUKD_TERMINAL_DEFAULT CUKD_TERMINAL_RESET
116 | # define CUKD_TERMINAL_BOLD ""
117 | 
118 | # define CUKD_TERMINAL_MAGENTA ""
119 | # define CUKD_TERMINAL_LIGHT_MAGENTA ""
120 | # define CUKD_TERMINAL_CYAN ""
121 | # define CUKD_TERMINAL_LIGHT_RED ""
122 | #else
123 | # define CUKD_TERMINAL_RED "\033[0;31m"
124 | # define CUKD_TERMINAL_GREEN "\033[0;32m"
125 | # define CUKD_TERMINAL_LIGHT_GREEN "\033[1;32m"
126 | # define CUKD_TERMINAL_YELLOW "\033[1;33m"
127 | # define CUKD_TERMINAL_BLUE "\033[0;34m"
128 | # define CUKD_TERMINAL_LIGHT_BLUE "\033[1;34m"
129 | # define CUKD_TERMINAL_RESET "\033[0m"
130 | # define CUKD_TERMINAL_DEFAULT CUKD_TERMINAL_RESET
131 | # define CUKD_TERMINAL_BOLD "\033[1;1m"
132 | /* ESC is written as octal 033 throughout: the 'e' escape is a compiler extension */
133 | # define CUKD_TERMINAL_MAGENTA "\033[35m"
134 | # define CUKD_TERMINAL_LIGHT_MAGENTA "\033[95m"
135 | # define CUKD_TERMINAL_CYAN "\033[36m"
136 | # define CUKD_TERMINAL_LIGHT_RED "\033[1;31m"
137 | #endif
138 | 
139 | #ifdef _MSC_VER
140 | # define CUKD_ALIGN(alignment) __declspec(align(alignment))
141 | #else
142 | # define CUKD_ALIGN(alignment) __attribute__((aligned(alignment)))
143 | #endif
144 | 
145 | 
146 | 
147 | namespace cukd {
148 |   namespace common {
149 | 
150 | #ifdef __WIN32__
151 | #  define osp_snprintf sprintf_s
152 | #else
153 | #  define osp_snprintf snprintf
154 | #endif
155 | 
156 |     /*! formats `val` with one decimal and a metric suffix, e.g. 10000000 as "10.0M" */
157 |     inline std::string prettyDouble(const double val) {
158 |       // fabs(), not abs(): an unqualified abs() can resolve to C's int-only version
159 |       const double absVal = fabs(val);
160 |       char result[1000];
161 |       if      (absVal >= 1e+18f) osp_snprintf(result,1000,"%.1f%c",float(val/1e18f),'E');
162 |       else if (absVal >= 1e+15f) osp_snprintf(result,1000,"%.1f%c",float(val/1e15f),'P');
163 |       else if (absVal >= 1e+12f) osp_snprintf(result,1000,"%.1f%c",float(val/1e12f),'T');
164 |       else if (absVal >= 1e+09f) osp_snprintf(result,1000,"%.1f%c",float(val/1e09f),'G');
165 |       else if (absVal >= 1e+06f) osp_snprintf(result,1000,"%.1f%c",float(val/1e06f),'M');
166 |       else if (absVal >= 1e+03f) osp_snprintf(result,1000,"%.1f%c",float(val/1e03f),'k');
167 |       else if (absVal <= 1e-12f) osp_snprintf(result,1000,"%.1f%c",float(val*1e15f),'f');
168 |       else if (absVal <= 1e-09f) osp_snprintf(result,1000,"%.1f%c",float(val*1e12f),'p');
169 |       else if (absVal <= 1e-06f) osp_snprintf(result,1000,"%.1f%c",float(val*1e09f),'n');
170 |       else if (absVal <= 1e-03f) osp_snprintf(result,1000,"%.1f%c",float(val*1e06f),'u');
171 |       else if (absVal <= 1e-00f) osp_snprintf(result,1000,"%.1f%c",float(val*1e03f),'m');
172 |       else osp_snprintf(result,1000,"%f",(float)val);
173 | 
174 |       return result;
175 |     }
176 | 
177 | 
178 |     /*! formats `s` using decimal multiples - thousands (K), millions (M),
179 |       billions (G), trillions (T) - with two decimals: 64000 is returned
180 |       as "64.00K" and 65536 as "65.54K"; values below 1000 are printed
181 |       as plain integers */
182 |     inline std::string prettyNumber(const size_t s)
183 |     {
184 |       // branches run from the smallest magnitude up to the largest
185 |       char text[1000];
186 |       if (s < 1000LL)
187 |         osp_snprintf(text,1000,"%zi",s);
188 |       else if (s < 1000LL*1000LL)
189 |         osp_snprintf(text,1000,"%.2fK",s/(1000.f));
190 |       else if (s < 1000LL*1000LL*1000LL)
191 |         osp_snprintf(text,1000,"%.2fM",s/(1000.f*1000.f));
192 |       else if (s < 1000LL*1000LL*1000LL*1000LL)
193 |         osp_snprintf(text,1000,"%.2fG",s/(1000.f*1000.f*1000.f));
194 |       else
195 |         osp_snprintf(text,1000,"%.2fT",s/(1000.f*1000.f*1000.f*1000.f));
196 |       return text;
197 |     }
198 | 
199 |     /*! formats `s` using binary multiples - K, M, G, T as powers of 1024 -
200 |       with two decimals: 65536 is returned as "64.00K" and 64000 as
201 |       "62.50K"; values below 1024 are printed as plain integers */
202 |     inline std::string prettyBytes(const size_t s)
203 |     {
204 |       // branches run from the smallest magnitude up to the largest
205 |       char text[1000];
206 |       if (s < 1024LL)
207 |         osp_snprintf(text,1000,"%zi",s);
208 |       else if (s < 1024LL*1024LL)
209 |         osp_snprintf(text,1000,"%.2fK",s/(1024.f));
210 |       else if (s < 1024LL*1024LL*1024LL)
211 |         osp_snprintf(text,1000,"%.2fM",s/(1024.f*1024.f));
212 |       else if (s < 1024LL*1024LL*1024LL*1024LL)
213 |         osp_snprintf(text,1000,"%.2fG",s/(1024.f*1024.f*1024.f));
214 |       else
215 |         osp_snprintf(text,1000,"%.2fT",s/(1024.f*1024.f*1024.f*1024.f));
216 |       return text;
217 |     }
218 | 
219 |     /*! wall-clock time, in seconds since the Unix epoch (1970-01-01),
220 |       in both preprocessor branches */
221 |     inline double getCurrentTime()
222 |     {
223 | #ifdef _WIN32
224 |       // The previous SYSTEMTIME-based sum had no wMonth term (and no leap
225 |       // days), so its result jumped backwards at every start of a month.
226 |       // FILETIME counts 100ns ticks since 1601-01-01 (UTC); subtracting
227 |       // the ticks up to 1970-01-01 gives the same epoch as gettimeofday().
228 |       FILETIME ft; GetSystemTimeAsFileTime(&ft);
229 |       ULARGE_INTEGER ticks;
230 |       ticks.LowPart  = ft.dwLowDateTime;
231 |       ticks.HighPart = ft.dwHighDateTime;
232 |       return double(ticks.QuadPart - 116444736000000000ULL) * 1e-7;
233 | #else
234 |       struct timeval tp; gettimeofday(&tp,nullptr);
235 |       return double(tp.tv_sec) + double(tp.tv_usec)/1E6;
236 | #endif
237 |     }
238 |     /*! true iff `s` ends with `suffix`; a suffix longer than `s` yields false, not an exception */
239 |     inline bool hasSuffix(const std::string &s, const std::string &suffix)
240 |     {
241 |       return s.size() >= suffix.size() && s.compare(s.size()-suffix.size(),suffix.size(),suffix) == 0;
242 |     }
243 |   } // ::common
244 |   /*! loads a binary point file - a size_t count, then that many T's - into
245 |       managed memory; throws std::runtime_error if reading or allocating fails */
246 |   template<typename T> inline T *loadPoints(std::string fileName, size_t &count)
247 |   {
248 |     std::cout << "loading points from " << fileName << std::endl;
249 |     std::ifstream in(fileName,std::ios::binary);
250 |     if (!in.read((char*)&count,sizeof(count))) // also fails if the open failed
251 |       throw std::runtime_error("could not read point count from "+fileName);
252 |     std::cout << "loading " << count <<  " points" << std::endl;
253 |     T *d_points = 0;
254 |     if (count > 0 && cudaMallocManaged((void**)&d_points,count*sizeof(T)) != cudaSuccess)
255 |       throw std::runtime_error("could not allocate memory for "+fileName);
256 |     if (in.read((char*)d_points,count*sizeof(T))) return d_points;
257 |     cudaFree(d_points); // truncated file: do not leak the allocation
258 |     throw std::runtime_error("could not read all points from "+fileName);
259 |   }
260 |   /*! same as the size_t overload above, with the count cast down to an int */
261 |   template<typename T> inline T *loadPoints(std::string fileName, int &count)
262 |   {
263 |     size_t numLoaded;
264 |     T *const points = loadPoints<T>(fileName, numLoaded);
265 |     count = (int)numLoaded;
266 |     return points;
267 |   }
268 | 
269 |   // template<typename scalar_t>
270 |   // inline __device__ scalar_t clamp(scalar_t v, scalar_t lo, scalar_t hi)
271 |   // { return min(max(v,lo),hi); }
272 | 
273 | } // ::cukd
274 | 
275 | /* runs 'call'; for any result other than cudaSuccess, reports on stderr and throws */
276 | #define CUKD_CUDA_CHECK( call )                                         \
277 |   {                                                                     \
278 |     cudaError_t rc = call;                                              \
279 |     if (rc != cudaSuccess) {                                            \
280 |       fprintf(stderr,                                                   \
281 |               "CUDA call (%s) failed with code %d (line %d): %s\n",     \
282 |               #call, rc, __LINE__, cudaGetErrorString(rc));             \
283 |       throw std::runtime_error("fatal cuda error");                     \
284 |     }                                                                   \
285 |   }
286 | /* prepends "cuda" to its argument: CUKD_CUDA_CALL(Free(p)) checks cudaFree(p) */
287 | #define CUKD_CUDA_CALL(call) CUKD_CUDA_CHECK(cuda##call)
288 | /* like CUKD_CUDA_CHECK; a non-null 'where' adds an "at <where>: ..." line before the usual one */
289 | #define CUKD_CUDA_CHECK2( where, call )                                 \
290 |   {                                                                     \
291 |     cudaError_t rc = call;                                              \
292 |     if(rc != cudaSuccess) {                                             \
293 |       if (where)                                                        \
294 |         fprintf(stderr, "at %s: CUDA call (%s) "                        \
295 |                 "failed with code %d (line %d): %s\n",                  \
296 |                 where,#call, rc, __LINE__, cudaGetErrorString(rc));     \
297 |       fprintf(stderr,                                                   \
298 |               "CUDA call (%s) failed with code %d (line %d): %s\n",     \
299 |               #call, rc, __LINE__, cudaGetErrorString(rc));             \
300 |       throw std::runtime_error("fatal cuda error");                     \
301 |     }                                                                   \
302 |   }
303 | /* calls cudaDeviceSynchronize(); if that returns an error, reports file/line on stderr and throws */
304 | #define CUKD_CUDA_SYNC_CHECK()                                  \
305 |   {                                                             \
306 |     cudaError_t rc = cudaDeviceSynchronize();                                    \
307 |     if (rc != cudaSuccess) {                                    \
308 |       fprintf(stderr, "error (%s: line %d): %s\n",              \
309 |               __FILE__, __LINE__, cudaGetErrorString(rc));      \
310 |       throw std::runtime_error("fatal cuda error");             \
311 |     }                                                           \
312 |   }
313 | 
314 | 
315 | /* the *_NOTHROW macros print the same messages, but call exit(2) instead of throwing */
316 | #define CUKD_CUDA_CHECK_NOTHROW( call )                                 \
317 |   {                                                                     \
318 |     cudaError_t rc = call;                                              \
319 |     if (rc != cudaSuccess) {                                            \
320 |       fprintf(stderr,                                                   \
321 |               "CUDA call (%s) failed with code %d (line %d): %s\n",     \
322 |               #call, rc, __LINE__, cudaGetErrorString(rc));             \
323 |       exit(2);                                                          \
324 |     }                                                                   \
325 |   }
326 | 
327 | #define CUKD_CUDA_CALL_NOTHROW(call) CUKD_CUDA_CHECK_NOTHROW(cuda##call)
328 | 
329 | #define CUKD_CUDA_CHECK2_NOTHROW( where, call )                         \
330 |   {                                                                     \
331 |     cudaError_t rc = call;                                              \
332 |     if(rc != cudaSuccess) {                                             \
333 |       if (where)                                                        \
334 |         fprintf(stderr, "at %s: CUDA call (%s) "                        \
335 |                 "failed with code %d (line %d): %s\n",                  \
336 |                 where,#call, rc, __LINE__, cudaGetErrorString(rc));     \
337 |       fprintf(stderr,                                                   \
338 |               "CUDA call (%s) failed with code %d (line %d): %s\n",     \
339 |               #call, rc, __LINE__, cudaGetErrorString(rc));             \
340 |       exit(2);                                                          \
341 |     }                                                                   \
342 |   }
343 | 
344 | 
345 | /* if CUKD_ENABLE_STATS is supplied externally (from cmake), this adds an
346 |    "int *stats" parameter to all query functions, and makes the traversal
347 |    routines do atomic counting of traversal steps */
348 | #if defined(CUKD_ENABLE_STATS) && defined(__CUDA_ARCH__)
349 | # define CUKD_STATS(a) a
350 | # define CUKD_STATS_ARG(a,b) a,
351 | #else // CUKD_ENABLE_STATS or __CUDA_ARCH__ not defined: both macros expand to nothing
352 | # define CUKD_STATS(a) /* nothing */
353 | # define CUKD_STATS_ARG(a,b) /* nothing */
354 | #endif // CUKD_ENABLE_STATS && __CUDA_ARCH__
355 | 
356 | #if defined(CUKD_ENABLE_STATS) // 'defined' test: matches the CUKD_STATS guard, works for an empty define
357 | namespace cukd {
358 |   __constant__ __device__ unsigned long long *g_traversalStats; // device-side pointer for the traversal statistics
359 | }
360 | #endif // CUKD_ENABLE_STATS
361 |   
362 | 
363 | 


--------------------------------------------------------------------------------
/cukd/cubit/common.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2022 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | /* copied from OWL project, and put into new namespace to avoid naming conflicts.*/
 18 | 
 19 | #pragma once
 20 | 
 21 | #ifndef _USE_MATH_DEFINES
 22 | #  define _USE_MATH_DEFINES
 23 | #endif
 24 | #include <math.h> // using cmath causes issues under Windows
 25 | 
 26 | #include <stdio.h>
 27 | #include <iostream>
 28 | #include <stdexcept>
 29 | #include <memory>
 30 | #include <assert.h>
 31 | #include <string>
 32 | #include <math.h>
 33 | #include <cmath>
 34 | #include <algorithm>
 35 | #include <sstream>
 36 | #ifdef __GNUC__
 37 | #include <execinfo.h>
 38 | #include <sys/time.h>
 39 | #endif
 40 | 
 41 | #ifdef _WIN32
 42 | #ifndef WIN32_LEAN_AND_MEAN
 43 | #define WIN32_LEAN_AND_MEAN
 44 | #endif
 45 | #include <Windows.h>
 46 | #ifdef min
 47 | #undef min
 48 | #endif
 49 | #ifdef max
 50 | #undef max
 51 | #endif
 52 | #endif
 53 | 
 54 | #if !defined(WIN32)
 55 | #include <signal.h>
 56 | #endif
 57 | 
 58 | #if defined(_MSC_VER) // MSVC: __declspec based export/import
 59 | #  define CUBIT_DLL_EXPORT __declspec(dllexport)
 60 | #  define CUBIT_DLL_IMPORT __declspec(dllimport)
 61 | #elif defined(__clang__) || defined(__GNUC__) // clang/gcc: default symbol visibility for both
 62 | #  define CUBIT_DLL_EXPORT __attribute__((visibility("default")))
 63 | #  define CUBIT_DLL_IMPORT __attribute__((visibility("default")))
 64 | #else // any other compiler: no decoration
 65 | #  define CUBIT_DLL_EXPORT
 66 | #  define CUBIT_DLL_IMPORT
 67 | #endif
 68 | /* CUBIT_INTERFACE expands to nothing; under MSVC __PRETTY_FUNCTION__ is mapped to __FUNCTION__ */
 69 | # define CUBIT_INTERFACE /* nothing - currently not building any special 'owl.dll' */
 70 | #if defined(_MSC_VER)
 71 | #  define __PRETTY_FUNCTION__ __FUNCTION__
 72 | #endif // _MSC_VER
 73 | 
 74 | 
 75 | #ifndef PRINT // PRINT and PING are only defined if no PRINT macro exists yet
 76 | # define PRINT(var) std::cout << #var << "=" << var << std::endl;
 77 | #ifdef __WIN32__
 78 | # define PING std::cout << __FILE__ << "::" << __LINE__ << ": " << __FUNCTION__ << std::endl;
 79 | #else // __WIN32__ not defined: print the __PRETTY_FUNCTION__ name instead
 80 | # define PING std::cout << __FILE__ << "::" << __LINE__ << ": " << __PRETTY_FUNCTION__ << std::endl;
 81 | #endif // __WIN32__
 82 | #endif // PRINT
 83 | 
 84 | #if defined(__CUDA_ARCH__)
 85 | # define __owl_device   __device__
 86 | # define __owl_host     __host__
 87 | #else // __CUDA_ARCH__ not defined: both annotations expand to nothing
 88 | # define __owl_device   /* ignore */
 89 | # define __owl_host     /* ignore */
 90 | #endif
 91 | /* __both__ is '__host__ __device__' when __CUDA_ARCH__ is defined, and empty otherwise */
 92 | # define __both__   __owl_host __owl_device
 93 | 
 94 | 
 95 | #ifdef __GNUC__
 96 | #define MAYBE_UNUSED __attribute__((unused))
 97 | #else // no __GNUC__: MAYBE_UNUSED expands to nothing
 98 | #define MAYBE_UNUSED
 99 | #endif // __GNUC__
100 | /* throws std::runtime_error whose message is the enclosing function's name plus " not implemented" */
101 | #define CUBIT_NOTIMPLEMENTED throw std::runtime_error(std::string(__PRETTY_FUNCTION__)+" not implemented")
102 | 
103 | #ifdef WIN32 // WIN32 defined: all colour macros are empty strings
104 | # define CUBIT_TERMINAL_RED ""
105 | # define CUBIT_TERMINAL_GREEN ""
106 | # define CUBIT_TERMINAL_LIGHT_GREEN ""
107 | # define CUBIT_TERMINAL_YELLOW ""
108 | # define CUBIT_TERMINAL_BLUE ""
109 | # define CUBIT_TERMINAL_LIGHT_BLUE ""
110 | # define CUBIT_TERMINAL_RESET ""
111 | # define CUBIT_TERMINAL_DEFAULT CUBIT_TERMINAL_RESET
112 | # define CUBIT_TERMINAL_BOLD ""
113 | 
114 | # define CUBIT_TERMINAL_MAGENTA ""
115 | # define CUBIT_TERMINAL_LIGHT_MAGENTA ""
116 | # define CUBIT_TERMINAL_CYAN ""
117 | # define CUBIT_TERMINAL_LIGHT_RED ""
118 | #else // otherwise: ANSI escape sequences, all spelled with the standard octal escape \033
119 | # define CUBIT_TERMINAL_RED "\033[0;31m"
120 | # define CUBIT_TERMINAL_GREEN "\033[0;32m"
121 | # define CUBIT_TERMINAL_LIGHT_GREEN "\033[1;32m"
122 | # define CUBIT_TERMINAL_YELLOW "\033[1;33m"
123 | # define CUBIT_TERMINAL_BLUE "\033[0;34m"
124 | # define CUBIT_TERMINAL_LIGHT_BLUE "\033[1;34m"
125 | # define CUBIT_TERMINAL_RESET "\033[0m"
126 | # define CUBIT_TERMINAL_DEFAULT CUBIT_TERMINAL_RESET
127 | # define CUBIT_TERMINAL_BOLD "\033[1;1m"
128 | 
129 | # define CUBIT_TERMINAL_MAGENTA "\033[35m"
130 | # define CUBIT_TERMINAL_LIGHT_MAGENTA "\033[95m"
131 | # define CUBIT_TERMINAL_CYAN "\033[36m"
132 | # define CUBIT_TERMINAL_LIGHT_RED "\033[1;31m"
133 | #endif // WIN32
134 | 
135 | #ifdef _MSC_VER // MSVC spelling of an alignment request
136 | # define __cubit_align(alignment) __declspec(align(alignment)) 
137 | #else // otherwise: the __attribute__((aligned)) spelling
138 | # define __cubit_align(alignment) __attribute__((aligned(alignment)))
139 | #endif // _MSC_VER
140 | 
141 | 
142 | 
143 | namespace cubit {
144 |   namespace common {
145 | 
146 | #ifdef __CUDA_ARCH__ // with __CUDA_ARCH__: pull in the global-namespace min/max
147 |     using ::min;
148 |     using ::max;
149 |     using std::abs;
150 | #else // without it: the std:: versions
151 |     using std::min;
152 |     using std::max;
153 |     using std::abs;
154 | #endif // __CUDA_ARCH__
155 |     /*! computes (a+b-1)/b, i.e. a/b rounded up for positive a and b; one overload per integer width */
156 |     inline __both__ int32_t  divRoundUp(int32_t a, int32_t b) { return (a+b-1)/b; }
157 |     inline __both__ uint32_t divRoundUp(uint32_t a, uint32_t b) { return (a+b-1)/b; }
158 |     inline __both__ int64_t  divRoundUp(int64_t a, int64_t b) { return (a+b-1)/b; }
159 |     inline __both__ uint64_t divRoundUp(uint64_t a, uint64_t b) { return (a+b-1)/b; }
160 |   
161 |     using ::sin; // this is the double version
162 |     using ::cos; // this is the double version
163 | 
164 | #ifdef __WIN32__ // osp_snprintf: the bounded printf used by the pretty* helpers below
165 | #  define osp_snprintf sprintf_s
166 | #else // __WIN32__ not defined: plain snprintf
167 | #  define osp_snprintf snprintf
168 | #endif // __WIN32__
169 |   
170 |     /*! pretty-print a double with a one-letter magnitude suffix, e.g. 10000000 is printed as "10.0M" */
171 |     inline std::string prettyDouble(const double val) {
172 |       char text[1000];
173 |       const double mag = abs(val); // branches test the magnitude, the signed value is printed
174 | 
175 |       if      (mag >= 1e+18f) osp_snprintf(text,1000,"%.1f%c",float(val/1e18f),'E');
176 |       else if (mag >= 1e+15f) osp_snprintf(text,1000,"%.1f%c",float(val/1e15f),'P');
177 |       else if (mag >= 1e+12f) osp_snprintf(text,1000,"%.1f%c",float(val/1e12f),'T');
178 |       else if (mag >= 1e+09f) osp_snprintf(text,1000,"%.1f%c",float(val/1e09f),'G');
179 |       else if (mag >= 1e+06f) osp_snprintf(text,1000,"%.1f%c",float(val/1e06f),'M');
180 |       else if (mag >= 1e+03f) osp_snprintf(text,1000,"%.1f%c",float(val/1e03f),'k');
181 |       else if (mag <= 1e-12f) osp_snprintf(text,1000,"%.1f%c",float(val*1e15f),'f');
182 |       else if (mag <= 1e-09f) osp_snprintf(text,1000,"%.1f%c",float(val*1e12f),'p');
183 |       else if (mag <= 1e-06f) osp_snprintf(text,1000,"%.1f%c",float(val*1e09f),'n');
184 |       else if (mag <= 1e-03f) osp_snprintf(text,1000,"%.1f%c",float(val*1e06f),'u');
185 |       else if (mag <= 1e-00f) osp_snprintf(text,1000,"%.1f%c",float(val*1e03f),'m');
186 |       else osp_snprintf(text,1000,"%f",(float)val); // remaining range: plain "%f", no suffix
187 | 
188 |       return text;
189 |     }
190 |   
191 | 
192 |     /*! return a nicely formatted number as in "3.40M" instead of
193 |         "3400000", etc, using multiples of thousands (K), millions
194 |         (M), etc. Ie, the value 64000 would be returned as 64.00K, and
195 |         65536 would be 65.54K; values below 1000 are printed unchanged */
196 |     inline std::string prettyNumber(const size_t s)
197 |     {
198 |       char buf[1000];
199 |       if (s >= (1000LL*1000LL*1000LL*1000LL)) {
200 |         osp_snprintf(buf, 1000,"%.2fT",s/(1000.f*1000.f*1000.f*1000.f));
201 |       } else if (s >= (1000LL*1000LL*1000LL)) {
202 |         osp_snprintf(buf, 1000, "%.2fG",s/(1000.f*1000.f*1000.f));
203 |       } else if (s >= (1000LL*1000LL)) {
204 |         osp_snprintf(buf, 1000, "%.2fM",s/(1000.f*1000.f));
205 |       } else if (s >= (1000LL)) {
206 |         osp_snprintf(buf, 1000, "%.2fK",s/(1000.f));
207 |       } else {
208 |         osp_snprintf(buf,1000,"%zu",s); // %zu is the size_t conversion; %zi expects a signed type
209 |       }
210 |       return buf;
211 |     }
212 | 
213 |     /*! return a nicely formatted number as in "3.24M" instead of
214 |         "3400000", etc, using multiples of 1024 as in kilobytes,
215 |         etc. Ie, the value 65536 would be 64.00K, 64000 would be 62.50K */
216 |     inline std::string prettyBytes(const size_t s)
217 |     {
218 |       char buf[1000];
219 |       if (s >= (1024LL*1024LL*1024LL*1024LL)) {
220 |         osp_snprintf(buf, 1000,"%.2fT",s/(1024.f*1024.f*1024.f*1024.f));
221 |       } else if (s >= (1024LL*1024LL*1024LL)) {
222 |         osp_snprintf(buf, 1000, "%.2fG",s/(1024.f*1024.f*1024.f));
223 |       } else if (s >= (1024LL*1024LL)) {
224 |         osp_snprintf(buf, 1000, "%.2fM",s/(1024.f*1024.f));
225 |       } else if (s >= (1024LL)) {
226 |         osp_snprintf(buf, 1000, "%.2fK",s/(1024.f));
227 |       } else {
228 |         osp_snprintf(buf,1000,"%zu",s); // %zu is the size_t conversion; %zi expects a signed type
229 |       }
230 |       return buf;
231 |     }
232 |     /*! current wall-clock time in seconds since the unix epoch, including the sub-second fraction */
233 |     inline double getCurrentTime()
234 |     {
235 | #ifdef _WIN32
236 |       /* FILETIME counts 100ns ticks since 1601-01-01 (UTC). The previous
237 |          code summed SYSTEMTIME fields but left out wMonth, so the value
238 |          jumped backwards at every month boundary. */
239 |       FILETIME ft; GetSystemTimeAsFileTime(&ft);
240 |       ULARGE_INTEGER ticks;
241 |       ticks.LowPart  = ft.dwLowDateTime;
242 |       ticks.HighPart = ft.dwHighDateTime;
243 |       // shift to the unix epoch (1970-01-01) so both branches use the same origin
244 |       const unsigned long long unixEpochInTicks = 116444736000000000ull;
245 |       const unsigned long long ticksSince1970 = ticks.QuadPart - unixEpochInTicks;
246 |       return double(ticksSince1970) * 1e-7;
247 | #else
248 |       struct timeval tp; gettimeofday(&tp,nullptr);
249 |       return double(tp.tv_sec) + double(tp.tv_usec)/1E6;
250 | #endif
251 |     }
252 |     /*! true iff 's' ends with 'suffix'; a suffix longer than 's' yields false (it used to make substr() throw) */
253 |     inline bool hasSuffix(const std::string &s, const std::string &suffix)
254 |     {
255 |       return s.size() >= suffix.size() && s.compare(s.size()-suffix.size(), suffix.size(), suffix) == 0;
256 |     }
257 |     
258 |   } // ::cubit::common
259 | 
260 | 
261 | #ifndef CUBIT_CUDA_CHECK // everything up to the matching #endif is skipped if CUBIT_CUDA_CHECK already exists
262 | #define CUBIT_CUDA_CHECK( call )                                              \
263 |   { /* evaluate 'call'; on failure print to stderr, then throw */       \
264 |     cudaError_t rc = call;                                              \
265 |     if (rc != cudaSuccess) {                                            \
266 |       fprintf(stderr,                                                   \
267 |               "CUDA call (%s) failed with code %d (line %d): %s\n",     \
268 |               #call, rc, __LINE__, cudaGetErrorString(rc));             \
269 |       throw("fatal cuda error"); /* NOTE: throws a string literal */ \
270 |     }                                                                   \
271 |   }
272 | /* shorthand: CUBIT_CUDA_CALL(Foo(args)) checks cudaFoo(args) */
273 | #define CUBIT_CUDA_CALL(call) CUBIT_CUDA_CHECK(cuda##call)
274 | 
275 | #define CUBIT_CUDA_CHECK2( where, call )                                      \
276 |   { /* evaluate 'call'; on error print ONE message, then throw */       \
277 |     cudaError_t rc = call;                                              \
278 |     if(rc != cudaSuccess) {                                             \
279 |       if (where) /* caller-supplied location string; may be null */     \
280 |         fprintf(stderr, "at %s: CUDA call (%s) "                        \
281 |                 "failed with code %d (line %d): %s\n",                  \
282 |                 where,#call, rc, __LINE__, cudaGetErrorString(rc));     \
283 |       else fprintf(stderr, /* 'else' was missing: msg printed twice */  \
284 |               "CUDA call (%s) failed with code %d (line %d): %s\n",     \
285 |               #call, rc, __LINE__, cudaGetErrorString(rc));             \
286 |       throw("fatal cuda error");                                    \
287 |     }                                                                   \
288 |   }
289 | 
290 | #define CUBIT_CUDA_SYNC_CHECK()                                       \
291 |   { /* cudaDeviceSynchronize(); print and throw if it failed */ \
292 |     cudaError_t rc = cudaDeviceSynchronize();                                    \
293 |     if (rc != cudaSuccess) {                                    \
294 |       fprintf(stderr, "error (%s: line %d): %s\n",              \
295 |               __FILE__, __LINE__, cudaGetErrorString(rc));      \
296 |       throw("fatal cuda error");                            \
297 |     }                                                           \
298 |   }
299 | 
300 | #define CUBIT_CUDA_SYNC_CHECK_STREAM(s)                               \
301 |   { /* sync stream 's', then test cudaGetLastError() */         \
302 |     cudaStreamSynchronize(s); /* return value not inspected */  \
303 |     cudaError_t rc = cudaGetLastError();                        \
304 |     if (rc != cudaSuccess) {                                    \
305 |       fprintf(stderr, "error (%s: line %d): %s\n",              \
306 |               __FILE__, __LINE__, cudaGetErrorString(rc));      \
307 |       throw("fatal cuda error");                            \
308 |     }                                                           \
309 |   }
310 | 
311 | 
312 | /* NOTHROW variants: same checks, but they call exit(2) instead of throwing */
313 | #define CUBIT_CUDA_CHECK_NOTHROW( call )                                      \
314 |   { /* evaluate 'call'; on failure print to stderr, then exit(2) */     \
315 |     cudaError_t rc = call;                                              \
316 |     if (rc != cudaSuccess) {                                            \
317 |       fprintf(stderr,                                                   \
318 |               "CUDA call (%s) failed with code %d (line %d): %s\n",     \
319 |               #call, rc, __LINE__, cudaGetErrorString(rc));             \
320 |       exit(2);                                                          \
321 |     }                                                                   \
322 |   }
323 | /* shorthand: CUBIT_CUDA_CALL_NOTHROW(Foo(args)) checks cudaFoo(args) */
324 | #define CUBIT_CUDA_CALL_NOTHROW(call) CUBIT_CUDA_CHECK_NOTHROW(cuda##call)
325 | 
326 | #define CUBIT_CUDA_CHECK2_NOTHROW( where, call )                              \
327 |   { /* evaluate 'call'; on error print ONE message, then exit(2) */     \
328 |     cudaError_t rc = call;                                              \
329 |     if(rc != cudaSuccess) {                                             \
330 |       if (where) /* caller-supplied location string; may be null */     \
331 |         fprintf(stderr, "at %s: CUDA call (%s) "                        \
332 |                 "failed with code %d (line %d): %s\n",                  \
333 |                 where,#call, rc, __LINE__, cudaGetErrorString(rc));     \
334 |       else fprintf(stderr, /* 'else' was missing: msg printed twice */  \
335 |               "CUDA call (%s) failed with code %d (line %d): %s\n",     \
336 |               #call, rc, __LINE__, cudaGetErrorString(rc));             \
337 |       exit(2);                                                          \
338 |     }                                                                   \
339 |   }
340 | #endif // CUBIT_CUDA_CHECK
341 |   
342 | } // ::cubit
343 | 


--------------------------------------------------------------------------------
/cukd/cukd-math.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2024 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | /* copied from OWL project, and put into new namespace to avoid naming conflicts.*/
 18 | 
 19 | #pragma once
 20 | 
 21 | #include "cukd/common.h"
 22 | 
 23 | namespace cukd {
 24 | 
 25 | #ifdef __CUDA_ARCH__ // with __CUDA_ARCH__: pull in the global-namespace min/max
 26 |   using ::min;
 27 |   using ::max;
 28 |   using std::abs;
 29 | #else // without it: the std:: versions
 30 |   using std::min;
 31 |   using std::max;
 32 |   using std::abs;
 33 | #endif // __CUDA_ARCH__
 34 | 
 35 |   // ==================================================================
 36 |   // default operators on cuda vector types:
 37 |   // ==================================================================
 38 | 
 39 |   /*! template interface for cuda vector types (such as float3, int4,
 40 |       etc), that allows for querying which scalar type this vec is
 41 |       defined over */
 42 |   template<typename cuda_vec_t> struct scalar_type_of; // declared only; ::type comes from the specializations
 43 |   template<> struct scalar_type_of<float2> { using type = float; };
 44 |   template<> struct scalar_type_of<float3> { using type = float; };
 45 |   template<> struct scalar_type_of<float4> { using type = float; };
 46 |   template<> struct scalar_type_of<int2>   { using type = int; };
 47 |   template<> struct scalar_type_of<int3>   { using type = int; };
 48 |   template<> struct scalar_type_of<int4>   { using type = int; };
 49 |   
 50 |   /*! template interface for cuda vector types (such as float3, int4,
 51 |       etc), that allows for querying how many dimensions (coordinates)
 52 |       this vec has */
 53 |   template<typename cuda_vec_t> struct num_dims_of; // declared only; ::value comes from the specializations
 54 |   template<> struct num_dims_of<float2> { enum { value = 2 }; };
 55 |   template<> struct num_dims_of<float3> { enum { value = 3 }; };
 56 |   template<> struct num_dims_of<float4> { enum { value = 4 }; };
 57 |   template<> struct num_dims_of<int2>   { enum { value = 2 }; };
 58 |   template<> struct num_dims_of<int3>   { enum { value = 3 }; };
 59 |   template<> struct num_dims_of<int4>   { enum { value = 4 }; };
 60 |   /*! read coordinate d: d=0,1,2,3 select x,y,z,w. d is not range-checked; other values fall onto an existing member */
 61 |   inline __both__ float get_coord(const float2 &v, int d) { return d?v.y:v.x; }
 62 |   inline __both__ float get_coord(const float3 &v, int d) { return (d==2)?v.z:(d?v.y:v.x); }
 63 |   inline __both__ float get_coord(const float4 &v, int d) { return (d>=2)?(d>2?v.w:v.z):(d?v.y:v.x); }
 64 |   /*! same selection, but returning a writable reference to the member */
 65 |   inline __both__ float &get_coord(float2 &v, int d) { return d?v.y:v.x; }
 66 |   inline __both__ float &get_coord(float3 &v, int d) { return (d==2)?v.z:(d?v.y:v.x); }
 67 |   inline __both__ float &get_coord(float4 &v, int d) { return (d>=2)?(d>2?v.w:v.z):(d?v.y:v.x); }
 68 | 
 69 |   /*! int2/int3/int4 versions: by-value read first, writable reference second */
 70 |   inline __both__ int get_coord(const int2 &v, int d) { return d?v.y:v.x; }
 71 |   inline __both__ int get_coord(const int3 &v, int d) { return (d==2)?v.z:(d?v.y:v.x); }
 72 |   inline __both__ int get_coord(const int4 &v, int d) { return (d>=2)?(d>2?v.w:v.z):(d?v.y:v.x); }
 73 |   
 74 |   inline __both__ int &get_coord(int2 &v, int d) { return d?v.y:v.x; }
 75 |   inline __both__ int &get_coord(int3 &v, int d) { return (d==2)?v.z:(d?v.y:v.x); }
 76 |   inline __both__ int &get_coord(int4 &v, int d) { return (d>=2)?(d>2?v.w:v.z):(d?v.y:v.x); }
 77 | 
 78 |   /*! write vv into coordinate d, using the same d -> member selection as get_coord */
 79 |   inline __both__ void set_coord(int2 &v, int d, int vv) { (d?v.y:v.x) = vv; }
 80 |   inline __both__ void set_coord(int3 &v, int d, int vv) { ((d==2)?v.z:(d?v.y:v.x)) = vv; }
 81 |   inline __both__ void set_coord(int4 &v, int d, int vv) { ((d>=2)?(d>2?v.w:v.z):(d?v.y:v.x)) = vv; }
 82 |   
 83 |   inline __both__ void set_coord(float2 &v, int d, float vv) { (d?v.y:v.x) = vv; }
 84 |   inline __both__ void set_coord(float3 &v, int d, float vv) { ((d==2)?v.z:(d?v.y:v.x)) = vv; }
 85 |   inline __both__ void set_coord(float4 &v, int d, float vv) { ((d>=2)?(d>2?v.w:v.z):(d?v.y:v.x)) = vv; }
 86 |   /*! computes (a+b-1)/b, i.e. a/b rounded up for positive a and b; one overload per integer width */
 87 |   inline __both__ int32_t divRoundUp(int32_t a, int32_t b) { return (a+b-1)/b; }
 88 |   inline __both__ uint32_t divRoundUp(uint32_t a, uint32_t b) { return (a+b-1)/b; }
 89 |   inline __both__ int64_t divRoundUp(int64_t a, int64_t b) { return (a+b-1)/b; }
 90 |   inline __both__ uint64_t divRoundUp(uint64_t a, uint64_t b) { return (a+b-1)/b; }
 91 | 
 92 |   using ::sin; // this is the double version
 93 |   using ::cos; // this is the double version
 94 | 
 95 |   // ==================================================================
 96 |   // default operators on cuda vector types:
 97 |   // ==================================================================
 98 | 
 99 |   /*! component-wise difference a-b for float2/float3/float4 */
100 |   inline __both__ float2 operator-(float2 a, float2 b)
101 |   { return make_float2(a.x-b.x,a.y-b.y); }
102 |   inline __both__ float3 operator-(float3 a, float3 b)
103 |   { return make_float3(a.x-b.x,a.y-b.y,a.z-b.z); }
104 |   inline __both__ float4 operator-(float4 a, float4 b)
105 |   { return make_float4(a.x-b.x,a.y-b.y,a.z-b.z,a.w-b.w); }
106 |   /*! dot product: sum of the products of matching components */
107 |   inline __both__ float dot(float2 a, float2 b)
108 |   { return a.x*b.x+a.y*b.y; }
109 |   inline __both__ float dot(float3 a, float3 b)
110 |   { return a.x*b.x+a.y*b.y+a.z*b.z; }
111 |   inline __both__ float dot(float4 a, float4 b)
112 |   { return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w; }
113 |   /*! component-wise minimum of two float2/float3/float4 */
114 |   inline __both__ float2 min(float2 a, float2 b)
115 |   { return make_float2(min(a.x,b.x),min(a.y,b.y)); }
116 |   inline __both__ float3 min(float3 a, float3 b)
117 |   { return make_float3(min(a.x,b.x),min(a.y,b.y),min(a.z,b.z)); }
118 |   inline __both__ float4 min(float4 a, float4 b)
119 |   { return make_float4(min(a.x,b.x),min(a.y,b.y),min(a.z,b.z),min(a.w,b.w)); }
120 |   /*! component-wise maximum of two float2/float3/float4 */
121 |   inline __both__ float2 max(float2 a, float2 b)
122 |   { return make_float2(max(a.x,b.x),max(a.y,b.y)); }
123 |   inline __both__ float3 max(float3 a, float3 b)
124 |   { return make_float3(max(a.x,b.x),max(a.y,b.y),max(a.z,b.z)); }
125 |   inline __both__ float4 max(float4 a, float4 b)
126 |   { return make_float4(max(a.x,b.x),max(a.y,b.y),max(a.z,b.z),max(a.w,b.w)); }
127 |   /*! stream a float3 as "(x,y,z)"; returns the stream for chaining */
128 |   inline std::ostream &operator<<(std::ostream &o, float3 v)
129 |   { return o << "(" << v.x << "," << v.y << "," << v.z << ")"; }
130 | 
131 |       
132 |   // ==================================================================
133 |   // for some tests: our own, arbitrary-dimensional vector type
134 |   // ==================================================================
135 |   template<int N>
136 |   struct vec_float {
137 |     float v[N]; // the N coordinates
138 |   };
139 |   template<int N> struct scalar_type_of<vec_float<N>> { using type = float; }; // scalar type: float
140 |   template<int N> struct num_dims_of<vec_float<N>> { enum { value = N }; }; // dimension count: N
141 |   /*! coordinate access for vec_float<N>: plain indexing into v[], d is not range-checked */
142 |   template<int N>
143 |   inline __both__ float get_coord(const vec_float<N> &v, int d) { return v.v[d]; }
144 |   template<int N>
145 |   inline __both__ float &get_coord(vec_float<N> &v, int d) { return v.v[d]; }
146 |   template<int N>
147 |   inline __both__ void set_coord(vec_float<N> &v, int d, float vv) { v.v[d] = vv; }
148 |   
149 |   
150 |   /*! component-wise minimum of two vec_float<N> */
151 |   template<int N>
152 |   inline __both__ vec_float<N> min(vec_float<N> a, vec_float<N> b)
153 |   {
154 |     // 'a' is passed by value, so it doubles as the result
155 |     for (int d=0;d<N;++d) a.v[d] = min(a.v[d],b.v[d]);
156 |     return a;
157 |   }
158 |   /*! component-wise maximum of two vec_float<N> */
159 |   template<int N>
160 |   inline __both__ vec_float<N> max(vec_float<N> a, vec_float<N> b)
161 |   {
162 |     // 'a' is passed by value, so it doubles as the result
163 |     for (int d=0;d<N;++d) a.v[d] = max(a.v[d],b.v[d]);
164 |     return a;
165 |   }
166 |   /*! dot product of two vec_float<N>, accumulated in float from index 0 upwards */
167 |   template<int N>
168 |   inline __both__ float dot(vec_float<N> a, vec_float<N> b)
169 |   {
170 |     float acc = 0.f;
171 |     for (int d=0;d<N;++d) acc += a.v[d] * b.v[d];
172 |     return acc;
173 |   }
174 |   /*! component-wise difference a-b of two vec_float<N> */
175 |   template<int N>
176 |   inline __both__ vec_float<N> operator-(const vec_float<N> &a, const vec_float<N> &b)
177 |   {
178 |     vec_float<N> delta;
179 |     for (int d=0;d<N;++d) delta.v[d] = a.v[d] - b.v[d];
180 |     return delta;
181 |   }
182 | 
183 | 
184 | 
185 |   // ------------------------------------------------------------------
186 |   /*! @{ helper function(s) to convert scalar of any type to float,
187 |       with guaranteed round-to-zero mode, so functions like fSqrDistance
188 |       can reliably compute distance in float with conservative
189 |       distance metric */
190 | 
191 |   template<typename T> inline __both__ float as_float_rz(T t); // declared only; see the specializations
192 |   template<> inline __both__ float as_float_rz(float f) { return f; }
193 | #ifdef __CUDA_ARCH__ // NOTE(review): no int specialization is visible here when __CUDA_ARCH__ is undefined
194 |   template<> inline __device__ float as_float_rz(int i) { return __int2float_rz(i); }
195 | #endif // __CUDA_ARCH__
196 | 
197 |   /*! @} */
198 | 
199 |   
200 |   // ------------------------------------------------------------------
201 |   /*! squared distance of a and b as a float: dot(b-a,b-a) passed through as_float_rz() */
202 |   template<typename point_t>
203 |   inline __both__
204 |   float fSqrDistance(const point_t &a, const point_t &b)
205 |   {
206 |     const point_t delta = b-a;
207 |     return as_float_rz(dot(delta,delta));
208 |   }
209 |   /*! squared distance of a and b, in whatever type dot() returns for point_t */
210 |   template<typename point_t>
211 |   inline __both__
212 |   auto sqrDistance(const point_t &a, const point_t &b)
213 |   { const point_t delta = a-b; return dot(delta,delta); }
214 | 
215 |   // ------------------------------------------------------------------
216 |   // scalar distance(point,point)
217 |   // ------------------------------------------------------------------
218 |   /*! float square root (sqrtf), used by distance() right below */
219 |   inline __both__ float square_root(float f) { return sqrtf(f); }
220 |   
221 |   template<typename point_t>
222 |   inline __both__ auto distance(const point_t &a, const point_t &b)
223 |   { const auto sq = sqrDistance(a,b); return square_root(sq); }
224 |   
225 |   // ------------------------------------------------------------------
226 |   template<typename point_t>
227 |   inline __both__ int arg_max(point_t p)
228 |   { // returns the index of p's largest coordinate; ties go to the lowest index
229 |     using scalar_t = typename scalar_type_of<point_t>::type;
230 |     enum { num_dims = num_dims_of<point_t>::value };
231 |     int winner = 0;
232 |     scalar_t largest = get_coord(p,0);
233 |     for (int d=1;d<num_dims;++d) {
234 |       const scalar_t candidate = get_coord(p,d);
235 |       // negated '>' (rather than '<=') so that NaN coordinates are skipped as before
236 |       if (!(candidate > largest)) continue;
237 |       largest = candidate;
238 |       winner = d;
239 |     }
240 |     return winner;
241 |   }
242 |   
243 |   // ------------------------------------------------------------------
244 |   inline std::ostream &operator<<(std::ostream &out,
245 |                                   float2 v)
246 |   { // stream a float2 as "(x,y)"; returns the stream for chaining
247 |     return out << "(" << v.x << "," << v.y
248 |                << ")";
249 |   }
250 |   /*! f*f; the return type is whatever operator* yields for scalar_t */
251 |   template <typename scalar_t>
252 |   inline __host__ __device__
253 |   auto sqr(scalar_t f) { return f * f; }
254 |   /*! square root; only declared here, the float specialization follows */
255 |   template <typename scalar_t>
256 |   inline __host__ __device__
257 |   scalar_t sqrt(scalar_t f);
258 |   /*! float specialization, forwards to ::sqrtf */
259 |   template<> inline __host__ __device__
260 |   float sqrt(float f) { return ::sqrtf(f); }
261 | 
262 | 
263 | 
264 | 
265 |   
266 |   /*! sum of squared coordinate differences over the first min(numDims of a, numDims of b) coordinates.
267 |       NOTE(review): uses point_t / scalar_t / numDims / getCoord members, while the point_traits declared
268 |       further down in this file spells them num_dims / get_coord - confirm which traits these expect */
269 |   template <typename point_traits_a, typename point_traits_b=point_traits_a>
270 |   inline __host__ __device__
271 |   auto sqrDistance(const typename point_traits_a::point_t& a,
272 |                    const typename point_traits_b::point_t& b)
273 |   {
274 |     typename point_traits_a::scalar_t res = 0;
275 |     for(int i=0; i<min(point_traits_a::numDims, point_traits_b::numDims); ++i) {
276 |       const auto diff = point_traits_a::getCoord(a, i) - point_traits_b::getCoord(b, i);
277 |       res += sqr(diff);
278 |     }
279 |     return res;
280 |   }
281 |   /*! accumulates the same sum of squared coordinate differences, then returns sqrt() of it */
282 |   template <typename point_traits_a, typename point_traits_b=point_traits_a>
283 |   inline __host__ __device__
284 |   auto distance(const typename point_traits_a::point_t& a,
285 |                 const typename point_traits_b::point_t& b)
286 |   {
287 |     typename point_traits_a::scalar_t res = 0;
288 |     for(int i=0; i<min(point_traits_a::numDims, point_traits_b::numDims); ++i) {
289 |       const auto diff = point_traits_a::getCoord(a, i) - point_traits_b::getCoord(b, i);
290 |       res += sqr(diff);
291 |     }
292 |     return sqrt(res);
293 |   }
294 | 
295 | 
296 | 
297 | 
298 |   template<typename T> struct point_traits;
299 | 
300 |   /*! point traits that describe our default point type of cuda float3, int3, float4, etc.
301 |     
302 |     The four basic things a point_traits has to do for a given type are:
303 |     
304 |     - define the scalar_t that this point is built over
305 |     
306 |     - define the enum num_dims of dimensions that this point has
307 |     
308 |     - define a static function `get_coord(const point_t, int d)` that
309 |     returns the given point's d'th coordinate
310 |     
311 |     - define a static function `set_coord(point_t &, int d, scalar_t
312 |       v)` that sets the given point's d'th coordinate to the given
313 |       value
314 |    */
315 |   template<typename cuda_t>
316 |   struct point_traits {
317 |     enum { num_dims = num_dims_of<cuda_t>::value };
318 |     using scalar_t  = typename scalar_type_of<cuda_t>::type;
319 | 
320 |     /*! get the d'th coordinate - for our default cuda types we use
321 |         the ::cukd::get_coord helpers we have for those types */
322 |     static inline __both__
323 |     scalar_t get_coord(const cuda_t &v, int d) { return ::cukd::get_coord(v,d); }
324 |     /*! same, but returns a writable reference to the d'th coordinate */
325 |     static inline __both__
326 |     scalar_t &get_coord(cuda_t &v, int d) { return ::cukd::get_coord(v,d); }
327 |     /*! set the d'th coordinate of v to vv, via ::cukd::set_coord */
328 |     static inline __both__
329 |     void set_coord(cuda_t &v, int d, scalar_t vv)
330 |     { ::cukd::set_coord(v,d,vv); }
331 |   };
332 | 
333 | 
334 | 
335 | 
336 |   
337 | } // ::cukd
338 | 


--------------------------------------------------------------------------------
/cukd/data.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2024 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | /*! \file cukd/data.h Describes (abstract) data types that k-d trees
 18 |     can be built over, and data type traits that describe this data.
 19 | */
 20 | 
 21 | #pragma once
 22 | 
 23 | #include "cukd/cukd-math.h"
 24 | #include "cukd/box.h"
 25 | 
 26 | namespace cukd {
 27 | 
 28 | 
 29 |   /*! describes what a 'data point' in a k-d tree is: some actual
 30 |     D-dimensional point of scalar coordinates, plus potentially some
 31 |     payload, and potentially a means of storing the split dimension.
 32 |     Any data_traits type needs to provide the following:
 33 | 
 34 |     - data_traits::point_t: the actual point type that stores the
 35 |     coordinates of this data point
 36 | 
 37 |     - data_traits::get_point(const data_t &) and
 38 |     data_traits::get_coord(const data_t &, int) to extract the point,
 39 |     or a single coordinate of that point, from a data item
 40 | 
 41 |     - enum data_traits::has_explicit_dim : whether that data type has
 42 |     a field to store an explicit split dimension in each node. If not,
 43 |     the k-d tree builder and traversal _have_ to use round-robin for
 44 |     the split dimension; otherwise, they will always split the widest
 45 |     dimension.
 46 | 
 47 |     - data_traits::set_dim(data_t &, int) and
 48 |     data_traits::get_dim(const data_t &) to write and read the split
 49 |     dimension. For data_t's without any explicit split dimension
 50 |     these may be dummies (they'll never get called in that case), but
 51 |     they have to be defined to make the compiler happy.
 52 | 
 53 |     The _default_ data point for this library is just the point_t
 54 |     itself: no payload, no stored split dimension (ie, always
 55 |     round-robin), and the coordinates stored as the point itself. */
 56 |   template<typename _point_t,
 57 |            typename _point_traits=cukd::point_traits<_point_t>>
 58 |   struct default_data_traits {
 59 |     // ------------------------------------------------------------------
 60 |     /* part I : the _types_ of the d-dimensional points that the tree
 61 |        gets built over, and the traits describing those points */
 62 |     // ------------------------------------------------------------------
 63 |     using point_traits = _point_traits;
 64 |     using point_t      = _point_t;
 65 | 
 66 |     // ------------------------------------------------------------------
 67 |     /* part II : the type of the _data_ items stored in the tree; in
 68 |        general this can be more than a point, here it is the point */
 69 |     // ------------------------------------------------------------------
 70 | 
 71 |     using data_t = point_t;
 72 | 
 73 |     // ------------------------------------------------------------------
 74 |     /* part III : how to extract a point or coordinate from an actual
 75 |        data item */
 76 |     // ------------------------------------------------------------------
 77 | 
 78 |     /*! return (a reference to) the point stored in the given data
 79 |       item; in the default 'data==point' case that is simply the
 80 |       data item itself */
 81 |     static inline __both__
 82 |     const point_t &get_point(const data_t &n)
 83 |     { return n; }
 84 | 
 85 |     /*! return the 'd'th positional coordinate of the given data item;
 86 |       since data item and point are the same thing here, this can
 87 |       ask the point traits directly */
 88 |     static inline __both__
 89 |     typename point_traits::scalar_t get_coord(const data_t &n, int d)
 90 |     { return point_traits::get_coord(n,d); }
 91 | 
 92 |     // ------------------------------------------------------------------
 93 |     /* part IV : whether the data has a way of storing a split
 94 |        dimension for non-round-robin partitioning, and if so, how to
 95 |        store (for building) and read (for traversing) that split
 96 |        dimension in/from a node */
 97 |     // ------------------------------------------------------------------
 98 | 
 99 |     /* this type has no field for an explicit split dimension, so
100 |        builder and traversal have to use round-robin split
101 |        dimensions (with such a field they would instead always split
102 |        the widest dimension) */
103 |     enum { has_explicit_dim = false };
104 | 
105 |     /*! @{ only here for completeness: since has_explicit_dim is
106 |       false, get_dim/set_dim should never get called for this type
107 |       (traversal should ONLY ever call them for data_t's that define
108 |       has_explicit_dim to true) */
109 |     static inline __host__ __device__ void set_dim(data_t &, int) {}
110 |     static inline __host__ __device__ int  get_dim(const data_t &) { return -1; }
111 |     /*! @} */
112 |   };
113 | 
114 | }
115 | 
116 | 


--------------------------------------------------------------------------------
/cukd/helpers.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2019-2024 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #pragma once
 18 | 
 19 | #include "cukd/common.h"
 20 | #include "cukd/cukd-math.h"
 21 | 
 22 | namespace cukd {
 23 | 
 24 |   // ------------------------------------------------------------------
 25 |   /*! abstract 'memory resource' for allocating gpu memory; lets the
 26 |       user pick cudaMallocAsync (where available), managed memory,
 27 |       their own memory pool, etc. All memory allocations done during
 28 |       construction use the resource passed to the build function.
 29 |       Both calls report errors via the returned cudaError_t. */
 30 |   struct GpuMemoryResource {
 31 |     virtual ~GpuMemoryResource() = default; // polymorphic base class
 32 |     virtual cudaError_t malloc(void** ptr, size_t size, cudaStream_t s) = 0;
 33 |     virtual cudaError_t free(void* ptr, cudaStream_t s) = 0;
 34 |   };
 35 |   /*! resource using cudaMallocManaged/cudaFree; syncs stream s first */
 36 |   struct ManagedMemMemoryResource : public GpuMemoryResource {
 37 |     cudaError_t malloc(void** ptr, size_t size, cudaStream_t s) override {
 38 |       const cudaError_t syncResult = cudaStreamSynchronize(s);
 39 |       if (syncResult != cudaSuccess) return syncResult; // don't hide it
 40 |       return cudaMallocManaged(ptr,size);
 41 |     }
 42 |     cudaError_t free(void* ptr, cudaStream_t s) override {
 43 |       const cudaError_t syncResult = cudaStreamSynchronize(s);
 44 |       const cudaError_t freeResult = cudaFree(ptr); // always release
 45 |       return (syncResult != cudaSuccess) ? syncResult : freeResult;
 46 |     }
 47 |   };
 48 | 
 49 |   /* the default memory resource: cudaMallocAsync / cudaFreeAsync
 50 |      when the cuda runtime is version 11.2 or newer; older runtimes
 51 |      fall back to the managed-memory resource */
 52 | #if CUDART_VERSION >= 11020
 53 |   struct AsyncGpuMemoryResource final : public GpuMemoryResource {
 54 |     cudaError_t malloc(void** ptr, size_t size, cudaStream_t s) override
 55 |     { return cudaMallocAsync(ptr, size, s); }
 56 |     cudaError_t free(void* ptr, cudaStream_t s) override
 57 |     { return cudaFreeAsync(ptr, s); }
 58 |   };
 59 |   /*! returns the (function-local static) default memory resource */
 60 |   inline GpuMemoryResource &defaultGpuMemResource()
 61 |   {
 62 |     static AsyncGpuMemoryResource asyncResource;
 63 |     return asyncResource;
 64 |   }
 65 | #else
 66 |   inline GpuMemoryResource &defaultGpuMemResource()
 67 |   {
 68 |     static ManagedMemMemoryResource managedResource;
 69 |     return managedResource;
 70 |   }
 71 | #endif
 72 | 
 73 |   /*! helper functions for a generic, arbitrary-size binary tree whose
 74 |     nodes are numbered in level order, starting with the root at 0 -
 75 |     mostly to compute the level of a node, child and parent IDs, etc */
 76 |   struct BinaryTree {
 77 |     inline static __host__ __device__ int rootNode() { return 0; }
 78 |     /*! ID of the parent node; the root is reported as its own parent */
 79 |     inline static __host__ __device__ int parentOf(int nodeID) { return (nodeID-1)/2; }
 80 |     /*! non-zero for odd node IDs, ie, those produced by leftChildOf() */
 81 |     inline static __host__ __device__ int isLeftSibling(int nodeID) { return (nodeID & 1); }
 82 |     inline static __host__ __device__ int leftChildOf (int nodeID) { return 2*nodeID+1; }
 83 |     inline static __host__ __device__ int rightChildOf(int nodeID) { return 2*nodeID+2; }
 84 |     inline static __host__ __device__ int firstNodeInLevel(int L) { return (1<<L)-1; }
 85 |     /*! level of the given node (root is level 0); requires nodeID >= 0 */
 86 |     inline static __host__ __device__ int levelOf(int nodeID)
 87 |     {
 88 | #ifdef __CUDA_ARCH__
 89 |       int k = 63 - __clzll(nodeID+1);
 90 | #elif defined(_MSC_VER)
 91 |       unsigned long bs;
 92 |       _BitScanReverse(&bs, nodeID + 1);
 93 |       int k = bs;
 94 | #else
 95 |       int k = 63 - __builtin_clzll(nodeID+1);
 96 | #endif
 97 |       return k;
 98 |     }
 99 |     /*! number of levels of a tree with numPoints nodes; 0 if it is empty */
100 |     inline static __host__ __device__ int numLevelsFor(int numPoints)
101 |     {
102 |       // guard: levelOf(-1) would bit-scan a 0, undefined for host builtins
103 |       return (numPoints <= 0) ? 0 : levelOf(numPoints-1)+1;
104 |     }
105 |     /*! position of node n within its level, counted from the left */
106 |     inline static __host__ __device__ int numSiblingsToLeftOf(int n)
107 |     { return n - firstNodeInLevel(levelOf(n)); }
108 |   };
109 | 
110 |   /*! a full (ie, completely filled) binary tree with the given number
111 |       of levels, and the node counts that follow from that */
112 |   struct FullBinaryTreeOf {
113 |     const int numLevels;
114 | 
115 |     inline __host__ __device__
116 |     FullBinaryTreeOf(int numLevels) : numLevels(numLevels) {}
117 |     /*! total number of nodes, 2^numLevels-1; ok for any numLevels >= 0 */
118 |     inline __host__ __device__ int numNodes() const { return (1 << numLevels) - 1; }
119 |     /*! number of nodes on the bottom-most level, 2^(numLevels-1) */
120 |     inline __host__ __device__ int numOnLastLevel() const { return 1 << (numLevels-1); }
121 |   };
122 | 
123 |   /*! describes one subtree - given by its root node - within a full
124 |       binary tree of a given number of levels, and lets us compute
125 |       the number of levels of that subtree, how many nodes it has
126 |       (in total, or on its last level), where its last node is, etc */
127 |   struct SubTreeInFullTreeOf {
128 |     // note: initialized in this order, numLevelsSubtree needs levelOfSubtree
129 |     const int numLevelsTree;
130 |     const int subtreeRoot;
131 |     const int levelOfSubtree;
132 |     const int numLevelsSubtree;
133 | 
134 |     inline __host__ __device__
135 |     SubTreeInFullTreeOf(int numLevelsTree, int subtreeRoot)
136 |       : numLevelsTree(numLevelsTree),
137 |         subtreeRoot(subtreeRoot),
138 |         levelOfSubtree(BinaryTree::levelOf(subtreeRoot)),
139 |         numLevelsSubtree(numLevelsTree - levelOfSubtree)
140 |     {}
141 |     /*! position of the last node on the subtree's last level; note
142 |         this is that node's ID plus one (eg, 7 for root 0 and three
143 |         levels), ie, the node count up to and including that node */
144 |     inline __host__ __device__ int lastNodeOnLastLevel() const
145 |     {
146 |       const int shift = numLevelsSubtree-1;
147 |       return ((subtreeRoot+1) << shift) + ((1 << shift) - 1);
148 |     }
149 |     inline __host__ __device__ int numOnLastLevel() const
150 |     { return FullBinaryTreeOf(numLevelsSubtree).numOnLastLevel(); }
151 |     inline __host__ __device__ int numNodes() const
152 |     { return FullBinaryTreeOf(numLevelsSubtree).numNodes(); }
153 |   };
154 |   /*! clamps val to the range [lo,hi]; yields lo if that range is empty */
155 |   inline __host__ __device__ int clamp(int val, int lo, int hi)
156 |   { const int capped = min(val,hi); return max(capped,lo); }
157 | 
158 |                                        
159 |   /*! a binary tree of exactly N nodes (node IDs 0..N-1, using the
160 |       BinaryTree numbering). All we need to be able to compute for
161 |       it is the size of any given subtree in this tree */
162 |   struct ArbitraryBinaryTree {
163 |     const int numNodes;
164 | 
165 |     inline __host__ __device__
166 |     ArbitraryBinaryTree(int numNodes) : numNodes(numNodes) {}
167 | 
168 |     /*! number of nodes in the subtree rooted at node n: the size that
169 |         subtree would have in a full tree of the same number of
170 |         levels, minus those nodes missing on its last level */
171 |     inline __host__ __device__ int numNodesInSubtree(int n)
172 |     {
173 |       const SubTreeInFullTreeOf full(BinaryTree::numLevelsFor(numNodes),n);
174 |       const int overshoot = full.lastNodeOnLastLevel() - numNodes;
175 |       const int missing   = clamp(overshoot, 0, full.numOnLastLevel());
176 |       return full.numNodes() - missing;
177 |     }
178 |   };
179 | 
180 |   // ==================================================================
181 |   // helper functions for our N-step data ordering
182 |   // ==================================================================
183 | 
184 |   /*! helper class for the array layout that this method is based upon
185 |       (please see accompanying paper): in the L'th construction step,
186 |       this array layout first stores all the first L levels' nodes in
187 |       proper KD-tree order, then has, for each level-L subtree on this
188 |       L'th level, first all nodes from the first subtree on this
189 |       level, then those for the second, etc. */
190 |   struct ArrayLayoutInStep {
191 |     inline __host__ __device__ 
192 |     ArrayLayoutInStep(int step, /* num nodes in tree: */int numPoints)
193 |       : numLevelsDone(step), numPoints(numPoints)
194 |     {}
195 | 
196 |     /*! number of nodes already settled to their final position in all
197 |       previous steps; if we start counting steps at L=0 for the
198 |       first step, then 'L' is also the number of binary tree levels
199 |       that have already been built. */
200 |     inline __host__ __device__ int numSettledNodes() const
201 |     { return FullBinaryTreeOf(numLevelsDone).numNodes(); }
202 | 
203 |     /*! given a node ID 'subtreeOnLevel' *on* (!) the current level 'L'
204 |       (ie, the root of a subtree), computes the array position at which
205 |       the segment holding the nodes of that subtree begins */
206 |     inline __host__ __device__ int segmentBegin(int subtreeOnLevel)
207 |     {
208 |       int numSettled = FullBinaryTreeOf(numLevelsDone).numNodes();
209 |       int numLevelsTotal = BinaryTree::numLevelsFor(numPoints);
210 |       int numLevelsRemaining = numLevelsTotal-numLevelsDone;
211 |       // node IDs on level L start right after the L settled levels
212 |       int firstNodeInThisLevel = FullBinaryTreeOf(numLevelsDone).numNodes();
213 |       int numEarlierSubtreesOnSameLevel = subtreeOnLevel-firstNodeInThisLevel;
214 |       // nodes in all subtrees to our left, if each of them were full
215 |       int numToLeftIfFull
216 |         = numEarlierSubtreesOnSameLevel
217 |         * FullBinaryTreeOf(numLevelsRemaining).numNodes();
218 |       // ... and how many of those would be on the tree's last level
219 |       int numToLeftOnLastIfFull
220 |         = numEarlierSubtreesOnSameLevel
221 |         * FullBinaryTreeOf(numLevelsRemaining).numOnLastLevel();
222 |       // number of nodes that the tree really has on its last level
223 |       int numTotalOnLastLevel
224 |         = numPoints - FullBinaryTreeOf(numLevelsTotal-1).numNodes();
225 |       // last-level nodes that are missing in the subtrees to our left
226 |       int numReallyToLeftOnLast
227 |         = min(numTotalOnLastLevel,numToLeftOnLastIfFull);
228 |       int numMissingOnLast
229 |         = numToLeftOnLastIfFull - numReallyToLeftOnLast;
230 |       // settled nodes come first, then the (real) nodes to our left
231 |       int result = numSettled + numToLeftIfFull - numMissingOnLast;
232 |       return result;
233 |     }
234 |     /*! array position segmentBegin(subtree) + sizeOfLeftSubtreeOf(subtree) */
235 |     inline __host__ __device__
236 |     int pivotPosOf(int subtree)
237 |     {
238 |       int segBegin = segmentBegin(subtree);
239 |       int pivotPos = segBegin + sizeOfLeftSubtreeOf(subtree);
240 |       return pivotPos;
241 |     }
242 |     /*! number of nodes below the left child of 'subtree'; 0 if none */
243 |     inline __host__ __device__
244 |     int sizeOfLeftSubtreeOf(int subtree)
245 |     {
246 |       int leftChildRoot = BinaryTree::leftChildOf(subtree);
247 |       if (leftChildRoot >= numPoints) return 0; // no such node in the tree
248 |       return ArbitraryBinaryTree(numPoints).numNodesInSubtree(leftChildRoot);
249 |     }
250 |     /*! number of nodes in the subtree under (and including) node n */
251 |     inline __host__ __device__
252 |     int sizeOfSegment(int n) const
253 |     { return ArbitraryBinaryTree(numPoints).numNodesInSubtree(n); }
254 | 
255 |     // set from the constructor's 'step' and 'numPoints' arguments
256 |     const int numLevelsDone;
257 |     const int numPoints;
258 |   };
259 | 
260 | }
261 | 
262 | 


--------------------------------------------------------------------------------
/cukd/kdtree.h:
--------------------------------------------------------------------------------
 1 | // ======================================================================== //
 2 | // Copyright 2019-2023 Ingo Wald                                            //
 3 | //                                                                          //
 4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
 5 | // you may not use this file except in compliance with the License.         //
 6 | // You may obtain a copy of the License at                                  //
 7 | //                                                                          //
 8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
 9 | //                                                                          //
10 | // Unless required by applicable law or agreed to in writing, software      //
11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
13 | // See the License for the specific language governing permissions and      //
14 | // limitations under the License.                                           //
15 | // ======================================================================== //
16 | 
17 | #pragma once
18 | 
19 | #include "builder.h"
20 | 
21 | /*! if you are looking for a "struct KDTree" or the like: the
22 |     _default_ kd-tree in cukd is one where the tree is entirely
23 |     _implicit_ in the order of the data points; i.e., there _is_ no
24 |     separate dedicated data type for a k-d tree - it's simply an array
25 |     of points (e.g., float3's, float2s, some type of Photons for
26 |     photon-mapping, etc), and the builder will simply re-arrange those
27 |     data points in the array */
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/cukd/traverse-cct.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2022-2023 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | /* traversal with 'closest-corner-tracking' - somewhat better for some
 18 |    input distributions, by tracking the (N-dimensional) closest point
 19 |    in the given subtree's domain, rather than just always comparing
 20 |    only to the 1-dimensional plane */
 21 | #pragma once
 22 | 
 23 | namespace cukd {
 24 |   /*! traverse the k-d tree in d_nodes, culling by closest-corner distance */
 25 |   template<typename result_t,
 26 |            typename data_t,
 27 |            typename data_traits=default_data_traits<data_t>>
 28 |   inline __host__ __device__
 29 |   void traverse_cct(result_t &result, // gets passed every visited node
 30 |                     typename data_traits::point_t queryPoint,
 31 |                     const box_t<typename data_traits::point_t> d_bounds,
 32 |                     const data_t *d_nodes, // the tree, in level order
 33 |                     int numPoints) // number of entries in d_nodes
 34 |   {
 35 |     using point_t    = typename data_traits::point_t;
 36 |     using point_traits = ::cukd::point_traits<point_t>;
 37 |     using scalar_t   = typename point_traits::scalar_t;
 38 |     enum { num_dims  = point_traits::num_dims };
 39 |     // cull distance; gets compared against squared distances below
 40 |     scalar_t cullDist = result.initialCullDist2();
 41 |     // a subtree that still has to be visited
 42 |     struct
 43 |       StackEntry {
 44 |       int     nodeID; // root node of that subtree
 45 |       point_t closestCorner; // tracked closest point for that subtree
 46 |     };
 47 |     /* can do at most 2**30 points... */
 48 |     StackEntry  stackBase[30];
 49 |     StackEntry *stackPtr = stackBase;
 50 |     // start at the root; done right away if even d_bounds is too far
 51 |     int nodeID = 0;
 52 |     point_t closestPointOnSubtreeBounds = project(d_bounds,queryPoint);
 53 |     if (sqrDistance(queryPoint,closestPointOnSubtreeBounds) > cullDist)
 54 |       return;
 55 | 
 56 |     while (true) {
 57 |       // fell off the tree: pop the next subtree that can still matter
 58 |       if (nodeID >= numPoints) {
 59 |         while (true) {
 60 |           if (stackPtr == stackBase)
 61 |             return;
 62 |           --stackPtr;
 63 |           closestPointOnSubtreeBounds = stackPtr->closestCorner;
 64 |           if (sqrDistance(closestPointOnSubtreeBounds,queryPoint) >= cullDist)
 65 |             continue; // cull distance has shrunk since this was pushed
 66 |           nodeID = stackPtr->nodeID;
 67 |           break;
 68 |         }
 69 |       }
 70 |       const auto &node  = d_nodes[nodeID];
 71 |       CUKD_STATS(if (cukd::g_traversalStats) ::atomicAdd(cukd::g_traversalStats,1));
 72 |       const point_t nodePoint = data_traits::get_point(node);
 73 |       {
 74 |         const auto sqrDist = sqrDistance(nodePoint,queryPoint);
 75 |         cullDist = result.processCandidate(nodeID,sqrDist);
 76 |       }
 77 |       // split dimension: stored in the node, or round-robin by tree level
 78 |       const int  dim
 79 |         = data_traits::has_explicit_dim
 80 |         ? data_traits::get_dim(d_nodes[nodeID])
 81 |         : (BinaryTree::levelOf(nodeID) % num_dims);
 82 |       const auto node_dim   = get_coord(nodePoint,dim);
 83 |       const auto query_dim  = get_coord(queryPoint,dim);
 84 |       const bool  leftIsClose = query_dim < node_dim;
 85 |       const int   lChild = 2*nodeID+1;
 86 |       const int   rChild = lChild+1;
 87 |       // the far child's closest corner: ours, moved onto the split plane
 88 |       auto farSideCorner = closestPointOnSubtreeBounds;
 89 |       const int farChild = leftIsClose?rChild:lChild;
 90 |       point_traits::set_coord(farSideCorner,dim,node_dim);
 91 |       if (farChild < numPoints && sqrDistance(farSideCorner,queryPoint) < cullDist) {
 92 |         stackPtr->closestCorner = farSideCorner;
 93 |         stackPtr->nodeID = farChild;
 94 |         stackPtr++;
 95 |       }
 96 |       // descend into the child on the query point's side of the plane
 97 |       nodeID = leftIsClose?lChild:rChild;
 98 |     }
 99 |   }
100 | 
101 | 
102 | }
103 | 


--------------------------------------------------------------------------------
/cukd/traverse-default-stack-based.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2022-2023 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #pragma once
 18 | 
 19 | #include "cukd/helpers.h"
 20 | 
 21 | namespace cukd {
 22 | 
 23 |   /*! traverse k-d tree with default, stack-based (sb) traversal: walks
 24 |       down towards the query point, passing every visited node to
 25 |       result.processCandidate(), and stacks up each far-side subtree
 26 |       whose split plane is still closer than the current cull distance.
 27 |       d_nodes holds the numPoints nodes of the tree in level order. */
 28 |   template<typename result_t,
 29 |            typename data_t,
 30 |            typename data_traits=default_data_traits<data_t>>
 31 |   inline __host__ __device__
 32 |   void traverse_default(result_t &result,
 33 |                         typename data_traits::point_t queryPoint,
 34 |                         const data_t *d_nodes,
 35 |                         int numPoints)
 36 |   {
 37 |     using point_t  = typename data_traits::point_t;
 38 |     using scalar_t = typename scalar_type_of<point_t>::type;
 39 |     enum { num_dims = num_dims_of<point_t>::value };
 40 | 
 41 |     scalar_t cullDist = result.initialCullDist2();
 42 |     bool dbg = 0; //threadIdx.x==0 && blockIdx.x == 0;
 43 |     if (dbg) printf("stackbased %f %f\n",
 44 |                     get_coord(queryPoint,0),
 45 |                     get_coord(queryPoint,1)
 46 |                     );
 47 | 
 48 |     /* can do at most 2**30 points... */
 49 |     struct StackEntry {
 50 |       int      nodeID;
 51 |       scalar_t sqrDist; // in the points' scalar type: no rounding to float
 52 |     };
 53 |     StackEntry stackBase[30];
 54 |     StackEntry *stackPtr = stackBase;
 55 | 
 56 |     /*! current node in the tree we're traversing */
 57 |     int curr = 0;
 58 | 
 59 |     while (true) {
 60 |       // go down the tree, always towards the side the query point is on
 61 |       while (curr < numPoints) {
 62 |         const int  curr_dim
 63 |           = data_traits::has_explicit_dim
 64 |           ? data_traits::get_dim(d_nodes[curr])
 65 |           : (BinaryTree::levelOf(curr) % num_dims);
 66 |         CUKD_STATS(if (cukd::g_traversalStats) ::atomicAdd(cukd::g_traversalStats,1));
 67 |         const data_t &curr_node  = d_nodes[curr];
 68 |         const auto sqrDist = sqrDistance(data_traits::get_point(curr_node),
 69 |                                          queryPoint);
 70 |         if (dbg) printf("=== %i dim %i sqrDist %f\n",curr,curr_dim,sqrDist);
 71 |         cullDist = result.processCandidate(curr,sqrDist);
 72 |         if (dbg)
 73 |           printf("node %i pt %f %f sqrDist %f cullDist %f\n",
 74 |                  curr,
 75 |                  get_coord(data_traits::get_point(curr_node),0),
 76 |                  get_coord(data_traits::get_point(curr_node),1),
 77 |                  sqrDist,cullDist);
 78 | 
 79 |         const auto node_coord   = data_traits::get_coord(curr_node,curr_dim);
 80 |         const auto query_coord  = get_coord(queryPoint,curr_dim);
 81 |         const bool  leftIsClose = query_coord < node_coord;
 82 |         const int   lChild = 2*curr+1;
 83 |         const int   rChild = lChild+1;
 84 |         const int closeChild = leftIsClose?lChild:rChild;
 85 |         const int farChild   = leftIsClose?rChild:lChild;
 86 |         // remember the far side if its split plane is within reach
 87 |         const scalar_t sqrDistToPlane = sqr(query_coord - node_coord);
 88 |         if (dbg) printf("sqrDist %f cullDist %f\n",
 89 |                         sqrDistToPlane,cullDist);
 90 |         if (sqrDistToPlane < cullDist && farChild < numPoints) {
 91 |           stackPtr->nodeID  = farChild;
 92 |           stackPtr->sqrDist = sqrDistToPlane;
 93 |           ++stackPtr;
 94 |         }
 95 |         curr = closeChild;
 96 |       }
 97 |       // pop the next subtree that is still within reach; done if none
 98 |       while (true) {
 99 |         if (stackPtr == stackBase)
100 |           return;
101 |         --stackPtr;
102 |         if (stackPtr->sqrDist >= cullDist)
103 |           continue;
104 |         curr = stackPtr->nodeID;
105 |         break;
106 |       }
107 |     }
108 |   }
109 |   
110 | }
111 | 


--------------------------------------------------------------------------------
/cukd/traverse-sf-imp.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2022 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #pragma once
 18 | 
 19 | namespace cukd {
 20 | 
 21 |   /*! computes the bounds of the subtree rooted at node 'curr': starts
 22 |       with the given 'bounds' and walks from curr up to the root,
 23 |       clipping the box at every ancestor's split plane, on the side
 24 |       of that plane that the path down to curr lies on */
 25 |   template<typename data_t,
 26 |            typename data_traits=default_data_traits<data_t>>
 27 |   inline __host__ __device__
 28 |   box_t<typename data_traits::point_t>
 29 |   recomputeBounds(int curr,
 30 |                   box_t<typename data_traits::point_t> bounds,
 31 |                   const data_t *d_nodes)
 32 |   {
 33 |     using point_t  = typename data_traits::point_t;
 34 |     using scalar_t = typename scalar_type_of<point_t>::type;
 35 |     enum { num_dims = num_dims_of<point_t>::value };
 36 | 
 37 |     while (curr != 0) {
 38 |       const int parent = (curr+1)/2-1;
 39 |       const auto &parent_node = d_nodes[parent];
 40 |       CUKD_STATS(if (cukd::g_traversalStats) ::atomicAdd(cukd::g_traversalStats,1));
 41 |       const int   parent_dim
 42 |         = data_traits::has_explicit_dim
 43 |         ? data_traits::get_dim(parent_node)
 44 |         : (BinaryTree::levelOf(parent) % num_dims);
 45 |       // in the points' own scalar type, so nothing gets rounded to float
 46 |       const scalar_t parent_split_pos = data_traits::get_coord(parent_node,parent_dim);
 47 |       if (curr & 1) {
 48 |         // curr is left child, set upper
 49 |         get_coord(bounds.upper,parent_dim)
 50 |           = min(parent_split_pos,
 51 |                 get_coord(bounds.upper,parent_dim));
 52 |       } else {
 53 |         // curr is right child, set lower
 54 |         get_coord(bounds.lower,parent_dim)
 55 |           = max(parent_split_pos,
 56 |                 get_coord(bounds.lower,parent_dim));
 57 |       }
 58 |       curr = parent;
 59 |     }
 60 |     return bounds;
 61 |   }
 62 |   
 63 |   template<typename result_t,
 64 |            typename data_t,
 65 |            typename data_traits=default_data_traits<data_t>>
 66 |   inline __host__ __device__
 67 |   void traverse_sf_imp(result_t &result,
 68 |                        typename data_traits::point_t queryPoint,
 69 |                        const box_t<typename data_traits::point_t> worldBounds,
 70 |                        const data_t *d_nodes,
 71 |                        int numPoints)
 72 |   {
 73 |     using point_t  = typename data_traits::point_t;
 74 |     using scalar_t = typename scalar_type_of<point_t>::type;
 75 |     enum { num_dims = num_dims_of<point_t>::value };
 76 |     // stack-free traversal that re-derives each subtree's bounds from worldBounds
 77 |     float cullDist = result.initialCullDist2();
 78 |     // 'prev' is the node visited in the previous step, 'curr' the one visited now;
 79 |     // comparing the two tells whether we arrived from the parent or from a child
 80 |     int prev = -1;
 81 |     int curr = 0;
 82 | 
 83 |     box_t<point_t> bounds = worldBounds;
 84 |     
 85 |     while (true) {
 86 |       if (curr == -1)
 87 |         // this can only (and will) happen if and only if we come from a
 88 |         // child, arrive at the root, and decide to go to the parent of
 89 |         // the root ... which means we're done.
 90 |         return;// closest_found_so_far;
 91 |       // bounds of curr's subtree, derived from scratch (replaces any earlier update)
 92 |       bounds = recomputeBounds<data_t,data_traits>
 93 |         (curr,worldBounds,d_nodes);
 94 |       const int parent = (curr+1)/2-1;
 95 |       // cull: the closest point of these bounds is already too far, so go back up
 96 |       point_t closestPointOnSubtreeBounds = project(bounds,queryPoint);
 97 |       if (sqrDistance(closestPointOnSubtreeBounds,queryPoint) >= cullDist) {
 98 |         prev = curr;
 99 |         curr = parent;
100 |         continue;
101 |       }
102 | 
103 | 
104 |       if (curr >= numPoints) {
105 |         // in some (rare) cases it's possible that below traversal
106 |         // logic will go to a "close child", but may actually only
107 |         // have a far child. In that case it's easiest to fix this
108 |         // right here, pretend we've done that (non-existent) close
109 |         // child, and let parent pick up traversal as if it had been
110 |         // done.
111 |         prev = curr;
112 |         curr = parent;
113 | 
114 |         continue;
115 |       }
116 |       CUKD_STATS(if (cukd::g_traversalStats) ::atomicAdd(cukd::g_traversalStats,1));
117 |       const auto &curr_node = d_nodes[curr];
118 |       const int  child = 2*curr+1; // index of the left child
119 |       const bool from_child = (prev >= child);
120 |       if (!from_child) {
121 |         const auto dist_sqr =
122 |           sqrDistance(queryPoint,data_traits::get_point(curr_node));
123 |         cullDist = result.processCandidate(curr,dist_sqr);
124 |       }
125 |       // split dimension: read from the node if the traits store one, else level-based
126 |       const int  curr_dim
127 |         = data_traits::has_explicit_dim
128 |         ? data_traits::get_dim(d_nodes[curr])
129 |         : (BinaryTree::levelOf(curr) % num_dims);
130 |       const float curr_split_pos = data_traits::get_coord(curr_node,curr_dim);
131 |       const float curr_dim_dist = get_coord(queryPoint,curr_dim) - curr_split_pos;
132 |       const int   curr_side = curr_dim_dist > 0.f; // 1: query lies above the split
133 |       const int   curr_close_child = 2*curr + 1 + curr_side;
134 |       const int   curr_far_child   = 2*curr + 2 - curr_side;
135 |       // pick the next node to visit, based on where we came from ('prev')
136 |       int next = -1;
137 |       if (prev == curr_close_child) {
138 |         // if we came from the close child, we may still have to check
139 |         // the far side - but only if this exists, and if far half of
140 |         // current space is even within search radius.
141 |         if ((curr_far_child<numPoints)
142 |             &&
143 |             (curr_dim_dist * curr_dim_dist < cullDist)
144 |             )
145 |           {
146 |             next = curr_far_child;
147 |             if (curr_side == 1) {
148 |               get_coord(bounds.lower,curr_dim) = curr_split_pos;
149 |             } else {
150 |               get_coord(bounds.upper,curr_dim) = curr_split_pos;
151 |             }
152 |           }
153 |         else
154 |           {
155 |             next = parent;
156 |           }
157 |       } else if (prev == curr_far_child) {
158 |         // if we did come from the far child, then both children are
159 |         // done, and we can only go up.
160 |         next = parent;
161 |       } else {
162 |         // we didn't come from any child, so must be coming from a
163 |         // parent... we've already been processed ourselves just now,
164 |         // so next stop is to look at the children (unless there
165 |         // aren't any). this still leaves the case that we might have
166 |         // a child, but only a far child, and this far child may or
167 |         // may not be in range ... we'll fix that by just going to
168 |         // near child _even if_ only the far child exists, and have
169 |         // that child do a dummy traversal of that missing child, then
170 |         // pick up on the far-child logic when we return.
171 |         // next
172 |         
173 |         if (child < numPoints) {
174 |           next = curr_close_child;
175 |           if (curr_side == 1) {
176 |             get_coord(bounds.upper,curr_dim) = curr_split_pos;
177 |           } else {
178 |             get_coord(bounds.lower,curr_dim) = curr_split_pos;
179 |           }
180 |         } else {
181 |           next = parent;
182 |         }
183 |       }
184 | 
185 |       if (next == -1)
186 |         // this can only (and will) happen if and only if we come from a
187 |         // child, arrive at the root, and decide to go to the parent of
188 |         // the root ... which means we're done.
189 |         return;
190 | 
191 |       prev = curr;
192 |       curr = next;
193 |     }
194 |   }
195 | }
196 | 


--------------------------------------------------------------------------------
/cukd/traverse-stack-free.h:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2022-2023 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #pragma once
 18 | 
 19 | #include "cukd/data.h"
 20 | #include "cukd/helpers.h"
 21 | 
 22 | namespace cukd {
 23 | 
 24 |   template<typename result_t,
 25 |            typename data_t,
 26 |            typename data_traits=default_data_traits<data_t>>
 27 |   inline __host__ __device__
 28 |   void traverse_stack_free(result_t &result,
 29 |                            typename data_traits::point_t queryPoint,
 30 |                            const data_t *d_nodes,
 31 |                            int N,
 32 |                            float eps=0.0f)
 33 |   {
 34 |     using point_t  = typename data_traits::point_t;
 35 |     using scalar_t = typename scalar_type_of<point_t>::type;
 36 |     enum { num_dims = num_dims_of<point_t>::value };
 37 |     const auto epsErr = 1 + eps; // scales the far-child test below; 1 when eps is 0
 38 |     // stack-free traversal of the implicit tree; tracks only 'prev' and 'curr'
 39 |     scalar_t cullDist = result.initialCullDist2();
 40 |     // prev: node visited in the previous step (-1 at start); curr: current node
 41 |     int prev = -1;
 42 |     int curr = 0;
 43 | 
 44 |     while (true) {
 45 |       const int parent = (curr+1)/2-1;
 46 |       if (curr >= N) {
 47 |         // in some (rare) cases it's possible that below traversal
 48 |         // logic will go to a "close child", but may actually only
 49 |         // have a far child. In that case it's easiest to fix this
 50 |         // right here, pretend we've done that (non-existent) close
 51 |         // child, and let parent pick up traversal as if it had been
 52 |         // done.
 53 |         prev = curr;
 54 |         curr = parent;
 55 | 
 56 |         continue;
 57 |       }
 58 |       CUKD_STATS(if (cukd::g_traversalStats) ::atomicAdd(cukd::g_traversalStats,1));
 59 |       const auto &curr_node = d_nodes[curr];
 60 |       const int  child = 2*curr+1; // index of the left child
 61 |       const bool from_child = (prev >= child);
 62 |       if (!from_child) {
 63 |         const auto sqrDist =
 64 |           sqrDistance(queryPoint,data_traits::get_point(curr_node));
 65 |         cullDist = result.processCandidate(curr,sqrDist);
 66 |       }
 67 |       // split dimension: read from the node if the traits store one, else level-based
 68 |       const int  curr_dim
 69 |         = data_traits::has_explicit_dim
 70 |         ? data_traits::get_dim(d_nodes[curr])
 71 |         : (BinaryTree::levelOf(curr) % num_dims);
 72 |       const float curr_dim_dist
 73 |         = get_coord(queryPoint,curr_dim)
 74 |         - data_traits::get_coord(curr_node,curr_dim);
 75 |       const int   curr_side = curr_dim_dist > 0.f; // 1: query lies above the split
 76 |       const int   curr_close_child = 2*curr + 1 + curr_side;
 77 |       const int   curr_far_child   = 2*curr + 2 - curr_side;
 78 |       // pick the next node to visit, based on where we came from ('prev')
 79 |       int next = -1;
 80 |       if (prev == curr_close_child)
 81 |         // if we came from the close child, we may still have to check
 82 |         // the far side - but only if this exists, and if far half of
 83 |         // current space is even within search radius.
 84 |         next
 85 |           = ((curr_far_child<N) && (curr_dim_dist * curr_dim_dist * epsErr < cullDist))
 86 |           ? curr_far_child
 87 |           : parent;
 88 |       else if (prev == curr_far_child)
 89 |         // if we did come from the far child, then both children are
 90 |         // done, and we can only go up.
 91 |         next = parent;
 92 |       else
 93 |         // we didn't come from any child, so must be coming from a
 94 |         // parent... we've already been processed ourselves just now,
 95 |         // so next stop is to look at the children (unless there
 96 |         // aren't any). this still leaves the case that we might have
 97 |         // a child, but only a far child, and this far child may or
 98 |         // may not be in range ... we'll fix that by just going to
 99 |         // near child _even if_ only the far child exists, and have
100 |         // that child do a dummy traversal of that missing child, then
101 |         // pick up on the far-child logic when we return.
102 |         next
103 |           = (child<N)
104 |           ? curr_close_child
105 |           : parent;
106 | 
107 |       if (next == -1)
108 |         // if (curr == 0 && from_child)
109 |         // this can only (and will) happen if and only if we come from a
110 |         // child, arrive at the root, and decide to go to the parent of
111 |         // the root ... which means we're done.
112 |         return;
113 | 
114 |       prev = curr;
115 |       curr = next;
116 |     }
117 |   }
118 | 
119 | 
120 | }
121 | 


--------------------------------------------------------------------------------
/measure.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Measures fcp and knn performance for the paper: every traversal method
 3 | # is run on uniform and on clustered inputs of growing size N, and one
 4 | # line per run is appended to results-{fcp,knn-clamped,knn-unlimited}.txt.
 5 | 
 6 | #for method in cct cct-xd spatial-cct spatial-stackBased stackBased stackBased-xd stackFree stackFree-xd ; do
 7 | #    echo "results for method ${method}" | tee fcp-and-knn-results-${method}.txt
 8 | #done
 9 | 
10 | queries="fcp knn-clamped knn-unlimited"
11 | methods="cct cct-xd spatial-cct spatial-stackBased stackBased stackBased-xd stackFree stackFree-xd"
12 | 
13 | # measure_all N LABEL [FLAG]
14 | #   Runs all methods for N points; LABEL names the point distribution in
15 | #   the results files, FLAG (if given) is passed on to every binary.
16 | measure_all() {
17 |     local N=$1 label=$2 flag=$3
18 |     local f method stats perf
19 | 
20 |     for f in $queries; do
21 |         echo "############## N = $N, $label" | tee -a results-$f.txt
22 |     done
23 |     for method in $methods; do
24 |         # for clamping use radius 10000 - that's 1% of the [0,1M] domain
25 |         # we generate samples in
26 |         ./cukd_float2-fcp-${method} $flag -nr 10 $N > tmp.fcp.txt
27 |         ./cukd_float2-knn-${method} $flag -nr 10 $N > tmp.knn-unlimited.txt
28 |         ./cukd_float2-knn-${method} $flag -nr 10 $N -r 10000 > tmp.knn-clamped.txt
29 | 
30 |         # pull the numbers of interest out of each captured output
31 |         for f in $queries; do
32 |             stats=$(grep NICE_STATS tmp.$f.txt | awk '{print $2}')
33 |             perf=$(grep "that is" tmp.$f.txt | awk '{print $3}')
34 |             echo "stats $stats perf $perf"
35 |             echo "method $method stats $stats perf $perf" | tee -a results-$f.txt
36 |         done
37 |     done
38 | }
39 | 
40 | # start every results file over (with a single empty line)
41 | for f in $queries; do
42 |     echo "" > results-$f.txt
43 | done
44 | 
45 | for N in 1000 10000 100000 1000000 10000000 100000000 1000000000; do
46 |     echo "### running for N = $N"
47 |     measure_all $N uniform
48 |     measure_all $N clustered --clustered
49 | done
50 | 


--------------------------------------------------------------------------------
/sample.cu:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2023 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #include "cukd/builder.h"
 18 | // fcp = "find closest point" query
 19 | #include "cukd/fcp.h"
 20 | #include <queue>
 21 | #include <iomanip>
 22 | #include <random>
 23 | 
 24 | using namespace cukd;
 25 | 
 26 | float3 *generatePoints(int N)
 27 | {
 28 |   static int g_seed = 100000; // bumped on every call, so each call differs
 29 |   std::seed_seq seedSequence{g_seed++};
 30 |   std::default_random_engine seeder(seedSequence);
 31 |   std::mt19937 rng(seeder()); // mersenne twister, seeded from 'seeder'
 32 |   std::uniform_int_distribution<> coordinate(0,N); // integers in [0,N]
 33 | 
 34 |   std::cout << "generating " << N << " uniform random points" << std::endl;
 35 |   float3 *d_points = nullptr;
 36 |   cudaMallocManaged((char **)&d_points,N*sizeof(float3));
 37 |   if (d_points == nullptr)
 38 |     throw std::runtime_error("could not allocate points mem...");
 39 |   // fill the managed array from the host, drawing x, y, z in that order
 40 |   for (float3 *p = d_points; p < d_points+N; ++p) {
 41 |     p->x = (float)coordinate(rng);
 42 |     p->y = (float)coordinate(rng);
 43 |     p->z = (float)coordinate(rng);
 44 |   }
 45 |   return d_points;
 46 | }
 47 | 
 48 | 
 49 | __global__
 50 | void d_fcp(float   *d_results,
 51 |            float3  *d_queries,
 52 |            int      numQueries,
 53 |            /*! the world bounding box computed by the builder */
 54 |            const cukd::box_t<float3> *d_bounds,
 55 |            float3  *d_nodes,
 56 |            int      numNodes,
 57 |            float    cutOffRadius)
 58 | {
 59 |   // one thread per query; threads beyond the last query have nothing to do
 60 |   const int queryIdx = blockIdx.x*blockDim.x+threadIdx.x;
 61 |   if (queryIdx >= numQueries) return;
 62 |   float3 query = d_queries[queryIdx];
 63 | 
 64 |   FcpSearchParams searchParams;
 65 |   searchParams.cutOffRadius = cutOffRadius;
 66 |   const int nearest
 67 |     = cukd::cct::fcp(query,*d_bounds,d_nodes,numNodes,searchParams);
 68 | 
 69 |   // a negative ID is treated as "nothing found" and reported as INFINITY
 70 |   if (nearest < 0)
 71 |     d_results[queryIdx] = INFINITY;
 72 |   else
 73 |     d_results[queryIdx] = distance(query,d_nodes[nearest]);
 74 | }
 75 | 
 76 | 
 77 | 
 78 | 
 79 | /*! Sample driver: builds a k-d tree over random points and times
 80 |     'find closest point' (fcp) queries run by the d_fcp kernel.
 81 | 
 82 |     Command line: [numPoints] [-nq numQueries] [-nr nRepeats] [-r radius];
 83 |     any argument that does not start with '-' is the number of points. */
 84 | int main(int ac, const char **av)
 85 | {
 86 |   using namespace cukd::common;
 87 | 
 88 |   int    numPoints = 1000000;
 89 |   int    nRepeats = 1;
 90 |   size_t numQueries = 1000000;
 91 |   float  cutOffRadius = std::numeric_limits<float>::infinity();
 92 |   for (int i=1;i<ac;i++) {
 93 |     const std::string arg = av[i];
 94 |     // value that follows a flag; throws rather than reading past the
 95 |     // end of av when the flag is the last argument
 96 |     auto valueOf = [&]() -> const char * {
 97 |       if (i+1 >= ac)
 98 |         throw std::runtime_error("missing value for cmdline arg "+arg);
 99 |       return av[++i];
100 |     };
101 |     if (arg[0] != '-')
102 |       numPoints = std::stoi(arg);
103 |     else if (arg == "-nq")
104 |       numQueries = atoi(valueOf());
105 |     else if (arg == "-nr")
106 |       nRepeats = atoi(valueOf());
107 |     else if (arg == "-r")
108 |       cutOffRadius = std::stof(valueOf());
109 |     else
110 |       throw std::runtime_error("unknown cmdline arg "+arg);
111 |   }
112 |   
113 |   // ==================================================================
114 |   // create sample input points that we'll build the tree over
115 |   // ==================================================================
116 |   float3 *d_points = generatePoints(numPoints);
117 | 
118 |   // ==================================================================
119 |   // allocate some memory for the world-space bounding box, so the
120 |   // builder can compute and return that for our chosen traversal
121 |   // method to use
122 |   // ==================================================================
123 |   cukd::box_t<float3> *d_bounds;
124 |   CUKD_CUDA_CALL(MallocManaged((void**)&d_bounds,sizeof(cukd::box_t<float3>)));
125 |   std::cout << "allocated memory for the world space bounding box ..." << std::endl;
126 | 
127 |   // ==================================================================
128 |   // build the tree. this will also compute the world-space bounding
129 |   // box of all points
130 |   // ==================================================================
131 |   std::cout << "calling builder..." << std::endl;
132 |   double t0 = getCurrentTime();
133 |   cukd::buildTree(d_points,numPoints,d_bounds);
134 |   CUKD_CUDA_SYNC_CHECK();
135 |   double t1 = getCurrentTime();
136 |   std::cout << "done building tree, took "
137 |             << prettyDouble(t1-t0) << "s" << std::endl;
138 | 
139 |   // ==================================================================
140 |   // create set of sample query points
141 |   // ==================================================================
142 |   float3 *d_queries
143 |     = generatePoints(numQueries);
144 |   // allocate memory for the results
145 |   float  *d_results;
146 |   CUKD_CUDA_CALL(MallocManaged((void**)&d_results,numQueries*sizeof(*d_results)));
147 | 
148 | 
149 |   // ==================================================================
150 |   // and do some queries - let's do the same ones in a loop so we can
151 |   // measure perf.
152 |   // ==================================================================
153 |   {
154 |     double t0 = getCurrentTime();
155 |     for (int i=0;i<nRepeats;i++) {
156 |       int bs = 128;
157 |       int nb = divRoundUp((int)numQueries,bs);
158 |       d_fcp<<<nb,bs>>>
159 |         (d_results,d_queries,numQueries,
160 |          d_bounds,d_points,numPoints,cutOffRadius);
161 |       cudaDeviceSynchronize();
162 |     }
163 |     CUKD_CUDA_SYNC_CHECK();
164 |     double t1 = getCurrentTime();
165 |     std::cout << "done " << nRepeats
166 |               << " iterations of " << numQueries
167 |               << " fcp queries, took " << prettyDouble(t1-t0)
168 |               << "s" << std::endl;
169 |     std::cout << "that is " << prettyDouble(numQueries*nRepeats/(t1-t0))
170 |               << " queries/s" << std::endl;
171 |   }
172 | 
173 |   // release the managed allocations made above
174 |   CUKD_CUDA_CALL(Free(d_results));
175 |   CUKD_CUDA_CALL(Free(d_queries));
176 |   CUKD_CUDA_CALL(Free(d_bounds));
177 |   CUKD_CUDA_CALL(Free(d_points));
178 |   return 0;
179 | }
180 | 


--------------------------------------------------------------------------------
/sampleHost.cu:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2023 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #include "cukd/builder.h"
 18 | // fcp = "find closest point" query
 19 | #include "cukd/fcp.h"
 20 | #include <queue>
 21 | #include <iomanip>
 22 | #include <random>
 23 | 
 24 | using namespace cukd;
 25 | 
 26 | void generatePoints(size_t N, std::vector<float3> &points)
 27 | {
 28 |   static int g_seed = 100000; // bumped on every call, so each call differs
 29 |   std::seed_seq seq{g_seed++};
 30 |   std::default_random_engine rd(seq);
 31 |   std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd()
 32 |   std::uniform_int_distribution<> dist(0,N);
 33 |   // resizes 'points' to N entries and fills them with integer-valued coordinates
 34 |   std::cout << "generating " << N << " uniform random points" << std::endl;
 35 |   points.resize(N);
 36 | 
 37 |   // Deliberately sequential: 'gen' and 'dist' are shared, stateful objects,
 38 |   // so drawing from them inside an OpenMP parallel loop would be a data
 39 |   // race (and would make the generated points depend on thread timing).
 40 |   for (size_t i=0;i<N;i++) {
 41 |     points[i].x = (float)dist(gen);
 42 |     points[i].y = (float)dist(gen);
 43 |     points[i].z = (float)dist(gen);
 44 |   }
 45 | }
 46 | 
 47 | void fcp_host(float *results,
 48 |               float3  *queries,
 49 |               size_t   numQueries,
 50 |               /*! the world bounding box computed by the builder */
 51 |               const cukd::box_t<float3> *bounds,
 52 |               float3  *nodes,
 53 |               int      numNodes,
 54 |               float    cutOffRadius)
 55 | {
 56 |   // every query writes only its own results[] slot; the loop below is
 57 |   // run through OpenMP when OPENMP_FOUND is defined
 58 | #ifdef OPENMP_FOUND
 59 |   #pragma omp parallel for
 60 | #endif  
 61 |   for (size_t queryIdx = 0; queryIdx < numQueries; queryIdx++) {
 62 |     float3 query = queries[queryIdx];
 63 |     FcpSearchParams searchParams;
 64 |     searchParams.cutOffRadius = cutOffRadius;
 65 |     const int nearest
 66 |       = cukd::cct::fcp(query,*bounds,nodes,numNodes,searchParams);
 67 | 
 68 |     // a negative ID is treated as "nothing found" and reported as INFINITY
 69 |     if (nearest < 0)
 70 |       results[queryIdx] = INFINITY;
 71 |     else
 72 |       results[queryIdx] = distance(query,nodes[nearest]);
 73 |   }
 74 | }
 75 | 
 76 | /*! Host-only sample driver: builds a k-d tree over random points with
 77 |     buildTree_host and times 'find closest point' (fcp) queries.
 78 | 
 79 |     Command line: [numPoints] [-nq numQueries] [-nr nRepeats] [-r radius];
 80 |     any argument that does not start with '-' is the number of points. */
 81 | int main(int ac, const char **av)
 82 | {
 83 |   using namespace cukd::common;
 84 | 
 85 |   size_t numPoints = 10000;
 86 |   int    nRepeats = 1;
 87 |   size_t numQueries = 10000;
 88 |   float  cutOffRadius = std::numeric_limits<float>::infinity();
 89 |   for (int i=1;i<ac;i++) {
 90 |     const std::string arg = av[i];
 91 |     // value that follows a flag; throws rather than reading past the
 92 |     // end of av when the flag is the last argument
 93 |     auto valueOf = [&]() -> const char * {
 94 |       if (i+1 >= ac)
 95 |         throw std::runtime_error("missing value for cmdline arg "+arg);
 96 |       return av[++i];
 97 |     };
 98 |     if (arg[0] != '-')
 99 |       numPoints = atoll(arg.c_str());
100 |     else if (arg == "-nq")
101 |       numQueries = atoll(valueOf());
102 |     else if (arg == "-nr")
103 |       nRepeats = atoi(valueOf());
104 |     else if (arg == "-r")
105 |       cutOffRadius = std::stof(valueOf());
106 |     else
107 |       throw std::runtime_error("unknown cmdline arg "+arg);
108 |   }
109 |   
110 |   // ==================================================================
111 |   // create sample input points that we'll build the tree over
112 |   // ==================================================================
113 |   std::vector<float3> points;
114 |   generatePoints(numPoints, points);
115 | 
116 |   // ==================================================================
117 |   // storage for the world-space bounding box, so the builder can
118 |   // compute and return that for our chosen traversal method to use
119 |   // ==================================================================
120 |   cukd::box_t<float3> bounds;
121 |   std::cout << "allocated memory for the world space bounding box ..." << std::endl;
122 | 
123 |   // ==================================================================
124 |   // build the tree. this will also compute the world-space bounding
125 |   // box of all points
126 |   // ==================================================================
127 |   std::cout << "calling builder..." << std::endl;
128 |   double t0 = getCurrentTime();
129 |   cukd::buildTree_host(points.data(),numPoints,&bounds);
130 |   double t1 = getCurrentTime();
131 |   std::cout << "done building tree, took "
132 |             << prettyDouble(t1-t0) << "s" << std::endl;
133 | 
134 |   // ==================================================================
135 |   // create set of sample query points
136 |   // ==================================================================
137 |   std::vector<float3> queries;
138 |   generatePoints(numQueries, queries);
139 |   // allocate memory for the results
140 |   std::vector<float> results(numQueries);
141 | 
142 |   // ==================================================================
143 |   // and do some queries - let's do the same ones in a loop so we can
144 |   // measure perf.
145 |   // ==================================================================
146 |   {
147 |     double t0 = getCurrentTime();
148 |     for (int i=0;i<nRepeats;i++) {
149 |       fcp_host
150 |         (results.data(),queries.data(),numQueries,
151 |          &bounds,points.data(),numPoints,cutOffRadius);
152 |     }
153 |     double t1 = getCurrentTime();
154 |     std::cout << "done " << nRepeats
155 |               << " iterations of " << numQueries
156 |               << " fcp queries, took " << prettyDouble(t1-t0)
157 |               << "s" << std::endl;
158 |     std::cout << "that is " << prettyDouble(numQueries*nRepeats/(t1-t0))
159 |               << " queries/s" << std::endl;
160 |   }
161 | 
162 |   return 0;
163 | }
164 | 


--------------------------------------------------------------------------------
/samples/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | 
2 | # Sample created from reported issue #34. This sample uses a float3
3 | # spatialKDTree with ~250k points and ~250k query points, and performs
4 | # a set of queries on that, using a different set of points and query
5 | # points in each run.
6 | add_executable(knn-float3-spatialkdtree knn-float3-spatialkdtree.cu)
7 | target_link_libraries(knn-float3-spatialkdtree PRIVATE cudaKDTree)
8 | 
9 | 


--------------------------------------------------------------------------------
/samples/knn-float3-spatialkdtree.cu:
--------------------------------------------------------------------------------
  1 | /* Sample created from reported issue #34. This sample uses a float3
  2 |  spatialKDTree with ~250k points and ~250k query points, and performs
  3 |  a set of queries on that, using a different set of points and query
  4 |  points in each run.
  5 | */
  6 | 
  7 | #include <cuda_runtime.h>
  8 | #include <cukd/builder.h>
  9 | #include <cukd/knn.h>
 10 | #include <random>
 11 | 
 12 | #define FIXED_K 16
 13 | 
 14 | using data_t = float3;
 15 | using data_traits = cukd::default_data_traits<float3>;
 16 | 
 17 | // CUDA KNN Kernel
 18 | __global__ void KnnKernel(
 19 |     const float3* d_queries, int numQueries,
 20 |     const cukd::SpatialKDTree<float3, data_traits> tree,
 21 |     float3* d_results, int k, float radius)
 22 | {
 23 |     int tid = threadIdx.x + blockIdx.x * blockDim.x;
 24 |     if (tid >= numQueries) return;
 25 |     // one thread per query; each thread writes k consecutive entries of d_results
 26 |     cukd::HeapCandidateList<FIXED_K> result(radius); // Fixed at 16, for generalization make template
 27 | 
 28 |     cukd::stackBased::knn<decltype(result), float3, data_traits>
 29 |       (result, tree, d_queries[tid]);
 30 |     // the list holds FIXED_K slots only: any slot i >= FIXED_K is reported as empty
 31 |     for (int i = 0; i < k; i++) {
 32 |       const int ID = (i < FIXED_K) ? result.get_pointID(i) : -1;
 33 |       d_results[size_t(tid) * k + i] // size_t: tid*k may not fit into an int
 34 |         = ID < 0
 35 |         ? make_float3(0.f,0.f,0.f)
 36 |         : tree.data[ID];
 37 |     }
 38 | }
 39 | 
 40 | float3* knnSearchCuda(const float3* points, const int numPoints,
 41 |                       const float3* queries, const int numQueries,
 42 |                       const int k, const float radius) {
 43 |     // Returns a new[]-allocated array of numQueries*k neighbors; the caller must delete[] it.
 44 |     // Allocate managed memory for points, queries, and results
 45 |     float3* d_points;
 46 |     cudaMallocManaged(&d_points, numPoints * sizeof(float3));
 47 |     std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
 48 |     std::cout << "Allocated " << numPoints << " points at " << d_points << std::endl;
 49 |     cudaMemcpy(d_points, points, numPoints * sizeof(float3), cudaMemcpyHostToDevice);
 50 | 
 51 |     float3* d_queries;
 52 |     cudaMallocManaged(&d_queries, numQueries * sizeof(float3));
 53 |     cudaMemcpy(d_queries, queries, numQueries * sizeof(float3), cudaMemcpyHostToDevice);
 54 | 
 55 |     // Build Spatial KD-Tree (managed memory)
 56 |     cukd::SpatialKDTree<float3, data_traits> tree;
 57 |     cukd::BuildConfig buildConfig{};
 58 |     buildTree(tree,d_points,numPoints,buildConfig);
 59 |     
 60 |     CUKD_CUDA_SYNC_CHECK();
 61 | 
 62 |     const size_t numResults = size_t(numQueries) * size_t(k); // in size_t: the int product can overflow
 63 |     float3* d_results;
 64 |     cudaMallocManaged(&d_results, numResults * sizeof(float3));
 65 | 
 66 |     int threadsPerBlock = 256;
 67 |     int numBlocks = (numQueries + threadsPerBlock - 1) / threadsPerBlock;
 68 |     // one thread per query; wait for the kernel and stop on any CUDA error
 69 |     KnnKernel<<<numBlocks, threadsPerBlock>>>(d_queries, numQueries, tree, d_results, k, radius);
 70 |     CUKD_CUDA_SYNC_CHECK();
 71 | 
 72 |     // Copy back results
 73 |     float3* neighbors = new float3[numResults];
 74 |     cudaMemcpy(neighbors, d_results, numResults * sizeof(float3), cudaMemcpyDeviceToHost);
 75 | 
 76 |     // Cleanup
 77 |     cudaFree(d_points);
 78 |     cudaFree(d_queries);
 79 |     cudaFree(d_results);
 80 |     cukd::free(tree);
 81 | 
 82 |     return neighbors;
 83 | }
 84 | 
 85 | // Stress test: ten rounds of knnSearchCuda over freshly generated random point
 86 | // and query clouds; each round's result array is released right away.
 87 | int main(int, char **) {
 88 |   std::random_device seedSource;  // a seed source for the random number engine
 89 |   std::mt19937 engine(seedSource());
 90 |   std::uniform_real_distribution<float> unit(0.f,+1.f);
 91 | 
 92 |   // a cloud of 240000 to 260000 points with uniformly drawn coordinates
 93 |   auto randomCloud = [&]() {
 94 |     std::vector<float3> cloud;
 95 |     const int count = 240000+int(20000*unit(engine));
 96 |     for (int n=0;n<count;n++)
 97 |       cloud.push_back(make_float3(unit(engine),unit(engine),unit(engine)));
 98 |     return cloud;
 99 |   };
100 | 
101 |   for (int round=0;round<10;round++) {
102 |     std::vector<float3> points  = randomCloud();
103 |     std::vector<float3> queries = randomCloud();
104 |     std::cout << "running knn query on " << points.size()
105 |               << " points" << std::endl;
106 |     float3 *result
107 |       = knnSearchCuda(points.data(),points.size(),
108 |                       queries.data(),queries.size(),
109 |                       FIXED_K,2.f);
110 |     // only the call itself is exercised; the neighbors are discarded
111 |     delete[] result;
112 |   }
113 | }
114 | 
115 | 
116 | 


--------------------------------------------------------------------------------
/samples/mpiHugeQuery.cu:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2025-2025 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.        //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #include "cukd/cukd-math.h"
 18 | #include "cukd/traverse-stack-free.h"
 19 | #include "cukd/knn.h"
 20 | #include <mpi.h>
 21 | #include <stdexcept>
 22 | 
 23 | #define CUKD_MPI_CALL(fctCall)                                          \
 24 |   { int rc = MPI_##fctCall;                                             \
 25 |     if (rc != MPI_SUCCESS)                                              \
 26 |       throw std::runtime_error(std::string(__PRETTY_FUNCTION__)+#fctCall); }
 27 | 
 28 | using cukd::divRoundUp;
 29 | 
 30 | // Bundles an MPI communicator with the calling process' rank and the group size.
 31 | struct MPIComm {
 32 |   MPIComm(MPI_Comm comm) : comm(comm)
 33 |   { // a failed query throws via CUKD_MPI_CALL instead of leaving rank/size unset
 34 |     CUKD_MPI_CALL(Comm_rank(comm,&rank));
 35 |     CUKD_MPI_CALL(Comm_size(comm,&size));
 36 |   }
 37 |   MPI_Comm comm;
 38 |   int rank = -1, size = 0;
 39 | };
 40 | 
 41 | // Read this rank's share of a binary file of T records, i.e. the records
 42 | // [n*rank/size, n*(rank+1)/size) of n; throws std::runtime_error on I/O failure.
 43 | template<typename T>
 44 | std::vector<T> readFilePortion(std::string inFileName, int rank, int size,
 45 |                                size_t *pBegin = 0,    // out: index of first record read
 46 |                                size_t *pNumTotal = 0) // out: record count of whole file
 47 | {
 48 |   std::ifstream in(inFileName.c_str(),std::ios::binary|std::ios::ate);
 49 |   if (!in) throw std::runtime_error("could not open '"+inFileName+"'");
 50 |   const std::streamoff numBytes = in.tellg(); // opened at the end: position == size
 51 |   // tellg() signals failure with -1, which as size_t would request a huge buffer
 52 |   if (numBytes < 0) throw std::runtime_error("could not size '"+inFileName+"'");
 53 | 
 54 |   const size_t numData = size_t(numBytes) / sizeof(T);
 55 |   const size_t begin   = numData * (rank+0)/size;
 56 |   const size_t end     = numData * (rank+1)/size;
 57 |   if (pNumTotal) *pNumTotal = numData;
 58 |   if (pBegin)    *pBegin    = begin;
 59 |   std::vector<T> result(end-begin);
 60 |   in.seekg(begin*sizeof(T),std::ios::beg);
 61 |   if (!in.read((char *)result.data(),(end-begin)*sizeof(T)))
 62 |     throw std::runtime_error("could not read '"+inFileName+"'");
 63 |   return result; }
 64 | 
 65 | 
 66 | void usage(const std::string &error)
 67 | {
 68 |   std::cerr << "Error: " << error << std::endl << std::endl;
 69 |   std::cerr << "./mpiHugeQuery -k <k> [-r <maxRadius>] in.float3s -o out.dat" << std::endl;
 70 |   exit(error.empty()?0:1);
 71 | }
 72 | 
 73 | 
 74 | 
 75 | // One thread per query: runs cukd::stackFree::knn for queries[tid] over tree
 76 | // (N points), using the k-entry candidate list that belongs to that query.
 77 | __global__ void runQuery(float3 *tree, int N,
 78 |                          uint64_t *candidateLists, int k, float maxRadius,
 79 |                          float3 *queries, int numQueries, int round)
 80 | {
 81 |   int tid = threadIdx.x+blockIdx.x*blockDim.x;
 82 |   if (tid >= numQueries) return;
 83 |   // size_t offset: k*tid overflows int once numQueries*k exceeds INT_MAX.
 84 |   // Round 0 passes maxRadius, later rounds -1.f (presumably "keep list" -- verify).
 85 |   cukd::FlexHeapCandidateList cl(candidateLists+size_t(k)*tid,k,round == 0 ? maxRadius : -1.f);
 86 |   cukd::stackFree::knn(cl,queries[tid],tree,N);
 87 | }
 88 | 
 89 | // One thread per query: takes returnValue() of the query's candidate list,
 90 | // square-roots it unless it is infinite (presumably a squared distance --
 91 | // confirm against cukd/knn.h), and stores it in d_finalResults[tid].
 92 | __global__ void extractFinalResult(float *d_finalResults,
 93 |                                    int numPoints, int k,
 94 |                                    uint64_t *candidateLists)
 95 | {
 96 |   int tid = threadIdx.x+blockIdx.x*blockDim.x;
 97 |   if (tid >= numPoints) return;
 98 | 
 99 |   // size_t offset: k*tid overflows int once numPoints*k exceeds INT_MAX
100 |   cukd::FlexHeapCandidateList cl(candidateLists+size_t(k)*tid,k,-1.f);
101 |   float result = cl.returnValue();
102 |   d_finalResults[tid] = isinf(result) ? result : sqrtf(result);
103 | }
104 |   
105 | // Ring-style all-to-all k-NN. Each rank builds a tree over its share of the input and
106 | // queries its own points against it; the trees then travel around the rank ring until
107 | // every rank has queried every tree. Ranks append their results in rank order.
108 | int main(int ac, char **av)
109 | {
110 |   MPI_Init(&ac,&av);
111 |   float maxRadius = std::numeric_limits<float>::infinity();
112 |   int   k = 0;
113 |   int   gpuAffinityCount = 0;
114 |   std::string inFileName;
115 |   std::string outFileName;
116 | 
117 |   // value following option av[i]; a missing value is a usage error, not a read past av
118 |   auto valueOf = [&](int &i) -> const char * {
119 |     if (i+1 >= ac) usage("missing value for cmdline arg '"+std::string(av[i])+"'");
120 |     return av[++i];
121 |   };
122 |   for (int i=1;i<ac;i++) {
123 |     const std::string arg = av[i];
124 |     if (arg == "-o")
125 |       outFileName = valueOf(i);
126 |     else if (arg[0] != '-')
127 |       inFileName = arg;
128 |     else if (arg == "-r")
129 |       maxRadius = std::atof(valueOf(i));
130 |     else if (arg == "-g")
131 |       gpuAffinityCount = std::atoi(valueOf(i));
132 |     else if (arg == "-k")
133 |       k = std::atoi(valueOf(i));
134 |     else
135 |       usage("unknown cmdline arg '"+arg+"'");
136 |   }
137 |   if (inFileName.empty())  usage("no input file name specified");
138 |   if (outFileName.empty()) usage("no output file name specified");
139 |   if (k < 1)               usage("no k specified, or invalid k value");
140 | 
141 |   MPIComm mpi(MPI_COMM_WORLD);
142 |   if (gpuAffinityCount) {
143 |     int deviceID = mpi.rank % gpuAffinityCount;
144 |     std::cout << "#" << mpi.rank << "/" << mpi.size
145 |               << "setting active GPU #" << deviceID << std::endl;
146 |     CUKD_CUDA_CALL(SetDevice(deviceID));
147 |   }
148 | 
149 |   size_t begin = 0;
150 |   size_t numPointsTotal = 0;
151 |   std::vector<float3> myPoints
152 |     = readFilePortion<float3>(inFileName,mpi.rank,mpi.size,&begin,&numPointsTotal);
153 |   std::cout << "#" << mpi.rank << "/" << mpi.size
154 |             << ": got " << myPoints.size() << " points to work on" << std::endl;
155 | 
156 |   float3 *d_tree = 0;
157 |   float3 *d_tree_recv = 0;
158 |   int N = myPoints.size();
159 |   // alloc N+1 so we can store one more if any other rank gets one more point
160 |   CUKD_CUDA_CALL(Malloc((void **)&d_tree,(N+1)*sizeof(myPoints[0])));
161 |   CUKD_CUDA_CALL(Malloc((void **)&d_tree_recv,(N+1)*sizeof(myPoints[0])));
162 |   CUKD_CUDA_CALL(Memcpy(d_tree,myPoints.data(),N*sizeof(myPoints[0]),cudaMemcpyDefault));
163 |   cukd::buildTree(d_tree,N);
164 | 
165 |   float3   *d_queries;
166 |   int numQueries = myPoints.size();
167 |   uint64_t *d_cand;
168 |   CUKD_CUDA_CALL(Malloc((void **)&d_queries,N*sizeof(float3)));
169 |   CUKD_CUDA_CALL(Memcpy(d_queries,myPoints.data(),N*sizeof(float3),cudaMemcpyDefault));
170 |   // size_t product: N*k on its own would overflow int for large per-rank inputs
171 |   CUKD_CUDA_CALL(Malloc((void **)&d_cand,size_t(N)*k*sizeof(uint64_t)));
172 | 
173 |   // now, do the queries and cycling:
174 |   for (int round=0;round<mpi.size;round++) {
175 |     // round 0 queries our own tree; every later round first passes the tree along
176 |     if (round > 0) {
177 |       MPI_Request requests[2];
178 |       int sendCount = N;
179 |       int recvCount = 0;
180 |       int sendPeer = (mpi.rank+1)%mpi.size;
181 |       int recvPeer = (mpi.rank+mpi.size-1)%mpi.size;
182 |       // exchange the point counts first so the receiver knows the payload size
183 |       CUKD_MPI_CALL(Irecv(&recvCount,1*sizeof(int),MPI_BYTE,recvPeer,0,
184 |                         mpi.comm,&requests[0]));
185 |       CUKD_MPI_CALL(Isend(&sendCount,1*sizeof(int),MPI_BYTE,sendPeer,0,
186 |                         mpi.comm,&requests[1]));
187 |       CUKD_MPI_CALL(Waitall(2,requests,MPI_STATUSES_IGNORE));
188 |       // NOTE(review): device pointers handed to MPI -- presumably needs CUDA-aware MPI
189 |       CUKD_MPI_CALL(Irecv(d_tree_recv,recvCount*sizeof(*d_tree),MPI_BYTE,recvPeer,0,
190 |                           mpi.comm,&requests[0]));
191 |       CUKD_MPI_CALL(Isend(d_tree,sendCount*sizeof(*d_tree),MPI_BYTE,sendPeer,0,
192 |                           mpi.comm,&requests[1]));
193 |       CUKD_MPI_CALL(Waitall(2,requests,MPI_STATUSES_IGNORE));
194 |       N = recvCount;
195 |       std::swap(d_tree,d_tree_recv);
196 |     }
197 |     runQuery<<<divRoundUp(numQueries,1024),1024>>>
198 |       (/* tree */d_tree,N,
199 |        /* query params */d_cand,k,maxRadius,
200 |        /* query points */d_queries,numQueries,round);
201 |     CUKD_CUDA_CALL(DeviceSynchronize());
202 |   }
203 |   std::cout << "done all queries..." << std::endl;
204 |   float *d_finalResults = 0;
205 |   CUKD_CUDA_CALL(MallocManaged((void **)&d_finalResults,myPoints.size()*sizeof(float)));
206 |   extractFinalResult<<<divRoundUp(numQueries,1024),1024>>>(d_finalResults,numQueries,k,d_cand);
207 |   CUKD_CUDA_CALL(DeviceSynchronize());
208 |   MPI_Barrier(mpi.comm);
209 | 
210 |   // ranks take turns appending, so the file ends up ordered by rank
211 |   for (int i=0;i<mpi.size;i++) {
212 |     MPI_Barrier(mpi.comm);
213 |     if (i == mpi.rank) {
214 |       FILE *file = fopen(outFileName.c_str(),i==0?"wb":"ab");
215 |       if (!file) { // fwrite on a null FILE* would crash; stop all ranks instead
216 |         std::cerr << "could not open output file '" << outFileName << "'" << std::endl;
217 |         MPI_Abort(mpi.comm,1);
218 |       }
219 |       fwrite(d_finalResults,sizeof(float),numQueries,file);
220 |       fclose(file);
221 |     }
222 |     MPI_Barrier(mpi.comm);
223 |   }
224 |   MPI_Finalize();
225 | }
226 | 


--------------------------------------------------------------------------------
/samples/mpiHugeQueryHost.cu:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2025-2025 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.        //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #include "cukd/cukd-math.h"
 18 | #include "cukd/traverse-stack-free.h"
 19 | #include "cukd/knn.h"
 20 | #include <mpi.h>
 21 | #include <stdexcept>
 22 | 
 23 | #define CUKD_MPI_CALL(fctCall)                                          \
 24 |   { int rc = MPI_##fctCall;                                             \
 25 |     if (rc != MPI_SUCCESS)                                              \
 26 |       throw std::runtime_error(std::string(__PRETTY_FUNCTION__)+#fctCall); }
 27 | 
 28 | using cukd::divRoundUp;
 29 | 
 30 | // Bundles an MPI communicator with the calling process' rank and the group size.
 31 | struct MPIComm {
 32 |   MPIComm(MPI_Comm comm) : comm(comm)
 33 |   { // a failed query throws via CUKD_MPI_CALL instead of leaving rank/size unset
 34 |     CUKD_MPI_CALL(Comm_rank(comm,&rank));
 35 |     CUKD_MPI_CALL(Comm_size(comm,&size));
 36 |   }
 37 |   MPI_Comm comm;
 38 |   int rank = -1, size = 0;
 39 | };
 40 | 
 41 | // Read this rank's share of a binary file of T records, i.e. the records
 42 | // [n*rank/size, n*(rank+1)/size) of n; throws std::runtime_error on I/O failure.
 43 | template<typename T>
 44 | std::vector<T> readFilePortion(std::string inFileName, int rank, int size,
 45 |                                size_t *pBegin = 0,    // out: index of first record read
 46 |                                size_t *pNumTotal = 0) // out: record count of whole file
 47 | {
 48 |   std::ifstream in(inFileName.c_str(),std::ios::binary|std::ios::ate);
 49 |   if (!in) throw std::runtime_error("could not open '"+inFileName+"'");
 50 |   const std::streamoff numBytes = in.tellg(); // opened at the end: position == size
 51 |   // tellg() signals failure with -1, which as size_t would request a huge buffer
 52 |   if (numBytes < 0) throw std::runtime_error("could not size '"+inFileName+"'");
 53 | 
 54 |   const size_t numData = size_t(numBytes) / sizeof(T);
 55 |   const size_t begin   = numData * (rank+0)/size;
 56 |   const size_t end     = numData * (rank+1)/size;
 57 |   if (pNumTotal) *pNumTotal = numData;
 58 |   if (pBegin)    *pBegin    = begin;
 59 |   std::vector<T> result(end-begin);
 60 |   in.seekg(begin*sizeof(T),std::ios::beg);
 61 |   if (!in.read((char *)result.data(),(end-begin)*sizeof(T)))
 62 |     throw std::runtime_error("could not read '"+inFileName+"'");
 63 |   return result; }
 64 | 
 65 | 
 66 | void usage(const std::string &error)
 67 | {
 68 |   std::cerr << "Error: " << error << std::endl << std::endl;
 69 |   std::cerr << "./mpiHugeQuery -k <k> [-r <maxRadius>] in.float3s -o out.dat" << std::endl;
 70 |   exit(error.empty()?0:1);
 71 | }
 72 | 
 73 | 
 74 | 
 75 | // For every query point, runs cukd::stackFree::knn over tree (N points) with that
 76 | // query's k-entry candidate list; OpenMP-parallel when OPENMP_FOUND is defined.
 77 | void runQuery_host(float3 *tree, size_t N,
 78 |                     uint64_t *candidateLists, int k, float maxRadius,
 79 |                     float3 *queries, size_t numQueries, int round)
 80 | {
 81 |   const float listRadius = (round == 0) ? maxRadius : -1.f; // maxRadius in round 0 only
 82 | #ifdef OPENMP_FOUND
 83 |   #pragma omp parallel for
 84 | #endif
 85 |   for (size_t q = 0; q < numQueries; q++) {
 86 |     cukd::FlexHeapCandidateList cl(&candidateLists[k*q],k,listRadius);
 87 |     cukd::stackFree::knn(cl,queries[q],tree,N);
 88 |   }
 89 | }
 90 | 
 91 | // For every query, takes returnValue() of its candidate list and stores it in
 92 | // finalResults: square-rooted when finite, unchanged when infinite. The loop is
 93 | // OpenMP-parallel when OPENMP_FOUND is defined.
 94 | void extractFinalResult_host(float *finalResults,
 95 |                               size_t numPoints, int k,
 96 |                               uint64_t *candidateLists)
 97 | {
 98 | #ifdef OPENMP_FOUND
 99 |   #pragma omp parallel for
100 | #endif
101 |   for (size_t q = 0; q < numPoints; q++) {
102 |     cukd::FlexHeapCandidateList cl(&candidateLists[k*q],k,-1.f);
103 |     const float value = cl.returnValue();
104 |     // -1.f is the same radius argument the later query rounds pass
105 |     finalResults[q] = isinf(value) ? value : sqrtf(value);
106 |   }
107 | }
108 |   
109 | // Host-only ring-style all-to-all k-NN. Each rank builds a tree over its share of
110 | // the input and queries its own points against it; the trees then travel around the
111 | // rank ring until every rank has queried every tree. Rank 0 prints the build and
112 | // query timings, and all ranks append their results to the output in rank order.
113 | int main(int ac, char **av)
114 | {
115 |   MPI_Init(&ac,&av);
116 |   float maxRadius = std::numeric_limits<float>::infinity();
117 |   int   k = 0;
118 |   std::string inFileName;
119 |   std::string outFileName;
120 | 
121 |   // value following option av[i]; a missing value is a usage error, not a read past av
122 |   auto valueOf = [&](int &i) -> const char * {
123 |     if (i+1 >= ac) usage("missing value for cmdline arg '"+std::string(av[i])+"'");
124 |     return av[++i];
125 |   };
126 |   for (int i=1;i<ac;i++) {
127 |     const std::string arg = av[i];
128 |     if (arg == "-o")
129 |       outFileName = valueOf(i);
130 |     else if (arg[0] != '-')
131 |       inFileName = arg;
132 |     else if (arg == "-r")
133 |       maxRadius = std::atof(valueOf(i));
134 |     else if (arg == "-k")
135 |       k = std::atoi(valueOf(i));
136 |     else
137 |       usage("unknown cmdline arg '"+arg+"'");
138 |   }
139 |   if (inFileName.empty())  usage("no input file name specified");
140 |   if (outFileName.empty()) usage("no output file name specified");
141 |   if (k < 1)               usage("no k specified, or invalid k value");
142 | 
143 |   MPIComm mpi(MPI_COMM_WORLD);
144 | 
145 |   size_t begin = 0;
146 |   size_t numPointsTotal = 0;
147 |   std::vector<float3> myPoints
148 |     = readFilePortion<float3>(inFileName,mpi.rank,mpi.size,&begin,&numPointsTotal);
149 |   std::cout << "#" << mpi.rank << "/" << mpi.size
150 |             << ": got " << myPoints.size() << " points to work on"
151 |             << std::endl;
152 | 
153 |   size_t N = myPoints.size();
154 |   // one spare slot, in case another rank holds one more point than we do
155 |   std::vector<float3> tree((N+1));
156 |   std::vector<float3> tree_recv((N+1));
157 |   memcpy(tree.data(),myPoints.data(),N*sizeof(float3));
158 | 
159 |   // time the tree build between two barriers
160 |   double start_time, end_time;
161 |   MPI_Barrier(mpi.comm);
162 |   start_time = MPI_Wtime();
163 | 
164 |   cukd::buildTree_host(tree.data(),N);
165 | 
166 |   MPI_Barrier(mpi.comm);
167 |   end_time = MPI_Wtime();
168 |   if (mpi.rank == 0) {
169 |       printf("Total execution time (buildTree_host): %.6f seconds\n", end_time - start_time);
170 |   }
171 | 
172 |   // the rank's own points are also its queries; cand holds k entries per query
173 |   size_t numQueries = myPoints.size();
174 |   std::vector<float3>  queries(N);
175 |   memcpy(queries.data(),myPoints.data(),N*sizeof(float3));
176 |   std::vector<uint64_t>  cand(N*k);
177 | 
178 |   // now, do the queries and cycling (timed the same way as the build):
179 |   MPI_Barrier(mpi.comm);
180 |   start_time = MPI_Wtime();
181 | 
182 |   for (int round=0;round<mpi.size;round++) {
183 |     // round 0 queries our own tree; every later round first passes the tree along
184 |     if (round > 0) {
185 |       MPI_Request requests[2];
186 |       int sendCount = N;
187 |       int recvCount = 0;
188 |       int sendPeer = (mpi.rank+1)%mpi.size;
189 |       int recvPeer = (mpi.rank+mpi.size-1)%mpi.size;
190 |       // exchange the point counts first so the receiver knows the payload size
191 |       CUKD_MPI_CALL(Irecv(&recvCount,1*sizeof(int),MPI_BYTE,recvPeer,0,
192 |                         mpi.comm,&requests[0]));
193 |       CUKD_MPI_CALL(Isend(&sendCount,1*sizeof(int),MPI_BYTE,sendPeer,0,
194 |                         mpi.comm,&requests[1]));
195 |       CUKD_MPI_CALL(Waitall(2,requests,MPI_STATUSES_IGNORE));
196 | 
197 |       CUKD_MPI_CALL(Irecv(tree_recv.data(),recvCount*sizeof(float3),MPI_BYTE,recvPeer,0,
198 |                           mpi.comm,&requests[0]));
199 |       CUKD_MPI_CALL(Isend(tree.data(),sendCount*sizeof(float3),MPI_BYTE,sendPeer,0,
200 |                           mpi.comm,&requests[1]));
201 |       CUKD_MPI_CALL(Waitall(2,requests,MPI_STATUSES_IGNORE));
202 | 
203 |       N = recvCount;
204 |       std::swap(tree,tree_recv);
205 |     }
206 |     runQuery_host(tree.data(),N,
207 |                   cand.data(),k,maxRadius,
208 |                   queries.data(),numQueries,
209 |                   round);
210 |   }
211 | 
212 |   MPI_Barrier(mpi.comm);
213 |   end_time = MPI_Wtime();
214 |   if (mpi.rank == 0) {
215 |       printf("Total execution time (queries and cycling are done): %.6f seconds\n", end_time - start_time);
216 |   }
217 | 
218 |   std::vector<float> finalResults(myPoints.size());
219 |   extractFinalResult_host(finalResults.data(),numQueries,k,cand.data());
220 | 
221 |   MPI_Barrier(mpi.comm);
222 | 
223 |   // ranks take turns appending, so the file ends up ordered by rank
224 |   for (int i=0;i<mpi.size;i++) {
225 |     MPI_Barrier(mpi.comm);
226 |     if (i == mpi.rank) {
227 |       FILE *file = fopen(outFileName.c_str(),i==0?"wb":"ab");
228 |       if (!file) { // fwrite on a null FILE* would crash; stop all ranks instead
229 |         std::cerr << "could not open output file '" << outFileName << "'" << std::endl;
230 |         MPI_Abort(mpi.comm,1);
231 |       }
232 |       fwrite(finalResults.data(),sizeof(float),numQueries,file);
233 |       fclose(file);
234 |     }
235 |     MPI_Barrier(mpi.comm);
236 |   }
237 |   MPI_Finalize();
238 | }
239 | 


--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
1 | GitHub Actions scripts, imported from https://github.com/ptheywood/cuda-cmake-github-actions
2 | 


--------------------------------------------------------------------------------
/scripts/actions/install_cuda_ubuntu.sh:
--------------------------------------------------------------------------------
  1 | # @todo - better / more robust parsing of inputs from env vars.
  2 | ## -------------------
  3 | ## Constants
  4 | ## -------------------
  5 | 
  6 | # List of sub-packages to install.
  7 | # @todo - pass this in from outside the script? 
  8 | # @todo - check the specified subpackages exist via apt pre-install?  apt-rdepends cuda-9-0 | grep "^cuda-"?
  9 | 
 10 | # Ideally choose from the list of meta-packages to minimise variance between cuda versions (although it does change too). Some of these packages may not be available in older CUDA releases
 11 | CUDA_PACKAGES_IN=(
 12 |     "cuda-compiler"
 13 |     "cuda-cudart-dev"
 14 |     "cuda-nvtx"
 15 |     "cuda-nvrtc-dev"
 16 |     "libcurand-dev" # 11-0+
 17 |     "cuda-cccl" # 11.4+, provides cub and thrust. On 11.3 known as cuda-thrust-11-3
 18 | )
 19 | 
 20 | ## -------------------
 21 | ## Bash functions
 22 | ## -------------------
 23 | # version_ge A B: exit status 0 (true) when A >= B in `sort -V` version order
 24 | function version_ge() {
 25 |     if [ "$#" != "2" ]; then echo "${FUNCNAME[0]} requires exactly 2 arguments."; exit 1; fi
 26 |     [[ "$2" == "$(printf '%s\n' "$@" | sort -V | head -n 1)" ]]
 27 | }
 28 | # version_gt A B: exit status 0 (true) when A > B, i.e. A differs from B and A >= B
 29 | function version_gt() {
 30 |     [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1
 31 |     [ "$1" = "$2" ] && return 1 || version_ge "$1" "$2"  # quoted: each version stays one argument
 32 | }
 33 | # version_le A B: exit status 0 (true) when A <= B in `sort -V` version order
 34 | function version_le() {
 35 |     if [ "$#" != "2" ]; then echo "${FUNCNAME[0]} requires exactly 2 arguments."; exit 1; fi
 36 |     [[ "$1" == "$(printf '%s\n' "$@" | sort -V | head -n 1)" ]]
 37 | }
 38 | # version_lt A B: exit status 0 (true) when A < B, i.e. A differs from B and A <= B
 39 | function version_lt() {
 40 |     [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1
 41 |     [ "$1" = "$2" ] && return 1 || version_le "$1" "$2"  # quoted: each version stays one argument
 42 | }
 43 | 
 44 | ## -------------------
 45 | ## Select CUDA version
 46 | ## -------------------
 47 | 
 48 | # Get the cuda version from the environment as $cuda.
 49 | CUDA_VERSION_MAJOR_MINOR=${cuda}
 50 | 
 51 | # Split the version. `cut -s` prints nothing when the input contains no '.', so a
 52 | # bare major such as "11" leaves MINOR empty (plain cut would echo the whole input).
 53 | CUDA_MAJOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f1)
 54 | CUDA_MINOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -s -d. -f2)
 55 | CUDA_PATCH=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -s -d. -f3) # usually empty: PATCH depends on which version gets installed
 56 | # use lsb_release to find the OS.
 57 | UBUNTU_VERSION=$(lsb_release -sr)
 58 | UBUNTU_VERSION="${UBUNTU_VERSION//.}"
 59 | 
 60 | echo "CUDA_MAJOR: ${CUDA_MAJOR}"
 61 | echo "CUDA_MINOR: ${CUDA_MINOR}"
 62 | echo "CUDA_PATCH: ${CUDA_PATCH}"
 63 | # echo "UBUNTU_NAME: ${UBUNTU_NAME}"
 64 | echo "UBUNTU_VERSION: ${UBUNTU_VERSION}"
 65 | 
 66 | # If we don't know the CUDA_MAJOR or MINOR, error.
 67 | if [ -z "${CUDA_MAJOR}" ] ; then
 68 |     echo "Error: Unknown CUDA Major version. Aborting."
 69 |     exit 1
 70 | fi
 71 | if [ -z "${CUDA_MINOR}" ] ; then
 72 |     echo "Error: Unknown CUDA Minor version. Aborting."
 73 |     exit 1
 74 | fi
 75 | # If we don't know the Ubuntu version, error (quoted so the test always sees one word).
 76 | if [ -z "${UBUNTU_VERSION}" ]; then
 77 |     echo "Error: Unknown Ubuntu version. Aborting."
 78 |     exit 1
 79 | fi
 80 | 
 81 | 
 82 | ## -------------------------------
 83 | ## Select CUDA packages to install
 84 | ## -------------------------------
 85 | CUDA_PACKAGES=""
 86 | for package in "${CUDA_PACKAGES_IN[@]}"
 87 | do : 
 88 |     # @todo This is not perfect. Should probably provide a separate list for diff versions
 89 |     # cuda-compiler-X-Y if CUDA >= 9.1 else cuda-nvcc-X-Y
 90 |     if [[ "${package}" == "cuda-nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then
 91 |         package="cuda-compiler"
 92 |     elif [[ "${package}" == "cuda-compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then
 93 |         package="cuda-nvcc"
 94 |     # CUB/Thrust  are packages in cuda-thrust in 11.3, but cuda-cccl in 11.4+
 95 |     elif [[ "${package}" == "cuda-thrust" || "${package}" == "cuda-cccl" ]]; then
 96 |         # CUDA cuda-thrust >= 11.4
 97 |         if version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.4" ; then
 98 |             package="cuda-cccl"
 99 |         # Use cuda-thrust > 11.2
100 |         elif version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.3" ; then
101 |             package="cuda-thrust"
102 |         # Do not include this pacakge < 11.3
103 |         else
104 |             continue
105 |         fi
106 |     fi
107 |     # CUDA 11+ includes lib* / lib*-dev packages, which if they existed previously where cuda-cu*- / cuda-cu*-dev-
108 |     if [[ ${package} == libcu* ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "11.0" ; then
109 |         package="${package/libcu/cuda-cu}"
110 |     fi
111 |     # Build the full package name and append to the string.
112 |     CUDA_PACKAGES+=" ${package}-${CUDA_MAJOR}-${CUDA_MINOR}"
113 | done
114 | echo "CUDA_PACKAGES ${CUDA_PACKAGES}"
115 | 
116 | ## -----------------
117 | ## Prepare to install
118 | ## -----------------
119 | # Both downloads come from the CUDA repository directory for this Ubuntu release and CPU arch.
120 | CPU_ARCH="x86_64"
121 | REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/${CPU_ARCH}/"
122 | PIN_FILENAME="cuda-ubuntu${UBUNTU_VERSION}.pin"
123 | PIN_URL="${REPO_URL}${PIN_FILENAME}"
124 | # apt keyring package now available https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/
125 | KERYRING_PACKAGE_FILENAME="cuda-keyring_1.1-1_all.deb"
126 | KEYRING_PACKAGE_URL="${REPO_URL}${KERYRING_PACKAGE_FILENAME}"
127 | 
128 | # Print the resolved names, one per line.
129 | printf '%s\n' "PIN_FILENAME ${PIN_FILENAME}" "PIN_URL ${PIN_URL}" "KEYRING_PACKAGE_URL ${KEYRING_PACKAGE_URL}"
130 | 
131 | ## -----------------
132 | ## Check for root/sudo
133 | ## -----------------
134 | 
135 | # Detect if the script is being run as root, storing true/false in is_root.
136 | is_root=false
137 | if (( $EUID == 0 )); then is_root=true; fi
138 | 
139 | # Find if sudo is available, storing true/false in has_sudo.
140 | has_sudo=false
141 | if command -v sudo &> /dev/null; then has_sudo=true; fi
142 | 
143 | # Root or sudo is required to proceed. USE_SUDO ends up as the prefix for
144 | # privileged commands: empty when already root, "sudo" otherwise.
145 | USE_SUDO=
146 | if [ "$is_root" = false ]; then
147 |     if [ "$has_sudo" = false ]; then
148 |         echo "Root or sudo is required. Aborting."
149 |         exit 1
150 |     fi
151 |     # not root, but sudo exists
152 |     USE_SUDO=sudo
153 | fi
154 | 
155 | ## -----------------
156 | ## Install
157 | ## -----------------
158 | echo "Adding CUDA Repository"
159 | wget ${PIN_URL}
160 | $USE_SUDO mv ${PIN_FILENAME} /etc/apt/preferences.d/cuda-repository-pin-600
161 | wget ${KEYRING_PACKAGE_URL} && ${USE_SUDO} dpkg -i ${KERYRING_PACKAGE_FILENAME} && rm ${KERYRING_PACKAGE_FILENAME}
162 | $USE_SUDO add-apt-repository "deb ${REPO_URL} /"
163 | $USE_SUDO apt-get update
164 | 
165 | echo "Installing CUDA packages ${CUDA_PACKAGES}"
166 | $USE_SUDO apt-get -y install ${CUDA_PACKAGES}
167 | 
168 | if [[ $? -ne 0 ]]; then
169 |     echo "CUDA Installation Error."
170 |     exit 1
171 | fi
172 | 
173 | ## -----------------
174 | ## Set environment vars / vars to be propagated
175 | ## -----------------
176 | 
177 | CUDA_PATH=/usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR}
178 | echo "CUDA_PATH=${CUDA_PATH}"
179 | export CUDA_PATH=${CUDA_PATH}
180 | export PATH="$CUDA_PATH/bin:$PATH"
181 | # Prepend lib64 and lib. The ${VAR:+...} form adds the ':' separator only when a value
182 | # already exists: a trailing ':' is an empty entry, which the loader reads as the cwd.
183 | export LD_LIBRARY_PATH="$CUDA_PATH/lib64:$CUDA_PATH/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
184 | # Check nvcc is now available.
185 | nvcc -V
186 | 
187 | # If executed on github actions, make the appropriate echo statements to update the environment
188 | if [[ $GITHUB_ACTIONS ]]; then
189 |     # Set paths for subsequent steps; LD_LIBRARY_PATH is written once with both directories
190 |     echo "Adding CUDA to CUDA_PATH, PATH and LD_LIBRARY_PATH"
191 |     echo "CUDA_PATH=${CUDA_PATH}" >> "$GITHUB_ENV"
192 |     echo "${CUDA_PATH}/bin" >> "$GITHUB_PATH"
193 |     echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
194 | fi
195 | 


--------------------------------------------------------------------------------
/scripts/actions/install_cuda_windows.ps1:
--------------------------------------------------------------------------------
  1 | ## -------------------
  2 | ## Constants
  3 | ## -------------------
  4 | 
  5 | # Dictionary of known cuda versions and thier download URLS, which do not follow a consistent pattern
  6 | # From 11.0, the download url/toolkit version is separate from the cudart version.
  7 | # Releases since 11.5.1 (including 11.4.4) use `windows` rather than `win10` in the uri, due to windows 11 inclusion
  8 | $CUDA_KNOWN_URLS = @{
  9 |     "8.0.44"   = "https://developer.nvidia.com/compute/cuda/8.0/Prod/network_installers/cuda_8.0.44_win10_network-exe";
 10 |     "8.0.61"   = "https://developer.nvidia.com/compute/cuda/8.0/Prod2/network_installers/cuda_8.0.61_win10_network-exe";
 11 |     "9.0.176"  = "https://developer.nvidia.com/compute/cuda/9.0/Prod/network_installers/cuda_9.0.176_win10_network-exe";
 12 |     "9.1.85"   = "https://developer.nvidia.com/compute/cuda/9.1/Prod/network_installers/cuda_9.1.85_win10_network";
 13 |     "9.2.148"  = "https://developer.nvidia.com/compute/cuda/9.2/Prod2/network_installers2/cuda_9.2.148_win10_network";
 14 |     "10.0.130" = "https://developer.nvidia.com/compute/cuda/10.0/Prod/network_installers/cuda_10.0.130_win10_network";
 15 |     "10.1.105" = "https://developer.nvidia.com/compute/cuda/10.1/Prod/network_installers/cuda_10.1.105_win10_network.exe";
 16 |     "10.1.168" = "https://developer.nvidia.com/compute/cuda/10.1/Prod/network_installers/cuda_10.1.168_win10_network.exe";
 17 |     "10.1.243" = "https://developer.download.nvidia.com/compute/cuda/10.1/Prod/network_installers/cuda_10.1.243_win10_network.exe";
 18 |     "10.2.89"  = "https://developer.download.nvidia.com/compute/cuda/10.2/Prod/network_installers/cuda_10.2.89_win10_network.exe";
 19 |     "11.0.1" = "https://developer.download.nvidia.com/compute/cuda/11.0.1/network_installers/cuda_11.0.1_win10_network.exe";
 20 |     "11.0.2" = "https://developer.download.nvidia.com/compute/cuda/11.0.2/network_installers/cuda_11.0.2_win10_network.exe";
 21 |     "11.0.3" = "https://developer.download.nvidia.com/compute/cuda/11.0.3/network_installers/cuda_11.0.3_win10_network.exe";
 22 |     "11.1.0" = "https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe";
 23 |     "11.1.1" = "https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe";
 24 |     "11.2.0" = "https://developer.download.nvidia.com/compute/cuda/11.2.0/network_installers/cuda_11.2.0_win10_network.exe";
 25 |     "11.2.1" = "https://developer.download.nvidia.com/compute/cuda/11.2.1/network_installers/cuda_11.2.1_win10_network.exe";
 26 |     "11.2.2" = "https://developer.download.nvidia.com/compute/cuda/11.2.2/network_installers/cuda_11.2.2_win10_network.exe";
 27 |     "11.3.0" = "https://developer.download.nvidia.com/compute/cuda/11.3.0/network_installers/cuda_11.3.0_win10_network.exe";
 28 |     "11.3.1" = "https://developer.download.nvidia.com/compute/cuda/11.3.1/network_installers/cuda_11.3.1_win10_network.exe";
 29 |     "11.4.0" = "https://developer.download.nvidia.com/compute/cuda/11.4.0/network_installers/cuda_11.4.0_win10_network.exe";
 30 |     "11.4.1" = "https://developer.download.nvidia.com/compute/cuda/11.4.1/network_installers/cuda_11.4.1_win10_network.exe";
 31 |     "11.4.2" = "https://developer.download.nvidia.com/compute/cuda/11.4.2/network_installers/cuda_11.4.2_win10_network.exe";
 32 |     "11.4.3" = "https://developer.download.nvidia.com/compute/cuda/11.4.3/network_installers/cuda_11.4.3_win10_network.exe";
 33 |     "11.4.4" = "https://developer.download.nvidia.com/compute/cuda/11.4.4/network_installers/cuda_11.4.4_windows_network.exe";
 34 |     "11.5.0" = "https://developer.download.nvidia.com/compute/cuda/11.5.0/network_installers/cuda_11.5.0_win10_network.exe";
 35 |     "11.5.1" = "https://developer.download.nvidia.com/compute/cuda/11.5.1/network_installers/cuda_11.5.1_windows_network.exe";
 36 |     "11.5.2" = "https://developer.download.nvidia.com/compute/cuda/11.5.2/network_installers/cuda_11.5.2_windows_network.exe";
 37 |     "11.6.0" = "https://developer.download.nvidia.com/compute/cuda/11.6.0/network_installers/cuda_11.6.0_windows_network.exe";
 38 |     "11.6.1" = "https://developer.download.nvidia.com/compute/cuda/11.6.1/network_installers/cuda_11.6.1_windows_network.exe";
 39 |     "11.6.2" = "https://developer.download.nvidia.com/compute/cuda/11.6.2/network_installers/cuda_11.6.2_windows_network.exe";
 40 |     "11.7.0" = "https://developer.download.nvidia.com/compute/cuda/11.7.0/network_installers/cuda_11.7.0_windows_network.exe";
 41 |     "11.7.1" = "https://developer.download.nvidia.com/compute/cuda/11.7.1/network_installers/cuda_11.7.1_windows_network.exe";
 42 |     "11.8.0" = "https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe";
 43 |     "12.0.0" = "https://developer.download.nvidia.com/compute/cuda/12.0.0/network_installers/cuda_12.0.0_windows_network.exe"
 44 |     "12.4.0" = "https://developer.download.nvidia.com/compute/cuda/12.4.0/network_installers/cuda_12.4.0_windows_network.exe" 
 45 | }
 46 | 
 47 | # Minimum CUDA version accepted per Visual Studio year. @todo - change this to be based on _MSC_VER instead, or invert it to be CUDA keyed instead
 48 | $VISUAL_STUDIO_MIN_CUDA = @{
 49 |     "2022" = "11.6.0";
 50 |     "2019" = "10.1";
 51 |     "2017" = "10.0"; # Depends on which version of 2017! 9.0 to 10.0 depending on version
 52 |     "2015" = "8.0";  # Might support older, unsure. Deprecated as of 11.1, unsupported in 11.2
 53 | }
 54 | 
 55 | # Base names of the CUDA sub-packages to install. cuda_runtime.h is in nvcc <= 10.2, but cudart >= 11.0
 56 | # @todo - make this easier to vary per CUDA version.
 57 | $CUDA_PACKAGES_IN = @(
 58 |     "nvcc";  # renamed to "compiler" further down when CUDA < 9.1
 59 |     "visual_studio_integration";
 60 |     "curand_dev";
 61 |     "nvrtc_dev";
 62 |     "cudart";
 63 |     "thrust";  # skipped further down when CUDA < 11.3
 64 | )
 65 | 
 66 | ## -------------------
 67 | ## Select CUDA version
 68 | ## -------------------
 69 | 
 70 | # The requested CUDA version is read from the 'cuda' environment variable.
 71 | $CUDA_VERSION_FULL = $env:cuda
 72 | # It has to be present and of the form <major>.<minor>.<patch>; anything else aborts the script.
 73 | 
 74 | # The named capture groups of the regex provide the individual components.
 75 | $cuda_ver_matched = $CUDA_VERSION_FULL -match "^(?<major>[1-9][0-9]*)\.(?<minor>[0-9]+)\.(?<patch>[0-9]+)$"
 76 | if(!$cuda_ver_matched){
 77 |     Write-Output "Invalid CUDA version specified, <major>.<minor>.<patch> required. '$CUDA_VERSION_FULL'."
 78 |     exit 1
 79 | }
 80 | $CUDA_MAJOR = $Matches['major']
 81 | $CUDA_MINOR = $Matches['minor']
 82 | $CUDA_PATCH = $Matches['patch']
 83 | 
 84 | ## ---------------------------
 85 | ## Visual studio support check
 86 | ## ---------------------------
 87 | # Exit if visual studio is too new for the cuda version.
 88 | $VISUAL_STUDIO = "$($env:visual_studio)".Trim()  # expanding inside a string turns an unset variable into "" rather than a null method call
 89 | if ($VISUAL_STUDIO.length -ge 4) {
 90 |     $VISUAL_STUDIO_YEAR = $VISUAL_STUDIO.Substring($VISUAL_STUDIO.Length-4)  # year = last four characters of the value
 91 |     if ($VISUAL_STUDIO_YEAR.length -eq 4 -and $VISUAL_STUDIO_MIN_CUDA.containsKey($VISUAL_STUDIO_YEAR)){
 92 |         $MINIMUM_CUDA_VERSION = $VISUAL_STUDIO_MIN_CUDA[$VISUAL_STUDIO_YEAR]
 93 |         if ([version]$CUDA_VERSION_FULL -lt [version]$MINIMUM_CUDA_VERSION) {
 94 |             Write-Output "Error: Visual Studio $($VISUAL_STUDIO_YEAR) requires CUDA >= $($MINIMUM_CUDA_VERSION)"
 95 |             exit 1
 96 |         }
 97 |     }
 98 | } else {
 99 |     Write-Output "Warning: Unknown Visual Studio Version. CUDA version may be insufficient."
100 | }
101 | 
102 | ## ------------------------------------------------
103 | ## Select CUDA packages to install from environment
104 | ## ------------------------------------------------
105 | 
106 | $CUDA_PACKAGES = ""
107 | $cuda_ver = [version]$CUDA_VERSION_FULL
108 | Foreach ($pkg in $CUDA_PACKAGES_IN) {
109 |     # The compiler package is "compiler" before CUDA 9.1 and "nvcc" from 9.1 on; map either name onto the right one.
110 |     if($pkg -eq "nvcc" -and $cuda_ver -lt [version]"9.1"){
111 |         $pkg = "compiler"
112 |     } elseif($pkg -eq "compiler" -and $cuda_ver -ge [version]"9.1") {
113 |         $pkg = "nvcc"
114 |     } elseif($pkg -eq "thrust" -and $cuda_ver -lt [version]"11.3") {
115 |         continue  # Thrust is a package from CUDA 11.3, otherwise it should be skipped.
116 |     }
117 |     $CUDA_PACKAGES += " $($pkg)_$($CUDA_MAJOR).$($CUDA_MINOR)"
118 | }
119 | Write-Output "$($CUDA_PACKAGES)"
120 | ## -----------------
121 | ## Prepare download
122 | ## -----------------
123 | 
124 | # Select the download link if known, otherwise have a guess.
125 | $CUDA_REPO_PKG_REMOTE=""; $CUDA_REPO_PKG_LOCAL=""
126 | if($CUDA_KNOWN_URLS.containsKey($CUDA_VERSION_FULL)){
127 |     $CUDA_REPO_PKG_REMOTE=$CUDA_KNOWN_URLS[$CUDA_VERSION_FULL]
128 | } else{
129 |     # Guess the url from the known entries: 11.0+ uses <full version>/network_installers, older releases <major>.<minor>/Prod/network_installers.
130 |     Write-Output "note: URL for CUDA $($CUDA_VERSION_FULL) not known, estimating."
131 |     $cuda_pkg_suffix = if([version]$CUDA_VERSION_FULL -ge [version]"11.5.1"){ "windows" } else { "win10" }
132 |     if([version]$CUDA_VERSION_FULL -ge [version]"11.0"){
133 |         $CUDA_REPO_PKG_REMOTE="https://developer.download.nvidia.com/compute/cuda/$($CUDA_VERSION_FULL)/network_installers/cuda_$($CUDA_VERSION_FULL)_$($cuda_pkg_suffix)_network.exe"
134 |     } else {
135 |         $CUDA_REPO_PKG_REMOTE="https://developer.download.nvidia.com/compute/cuda/$($CUDA_MAJOR).$($CUDA_MINOR)/Prod/network_installers/cuda_$($CUDA_VERSION_FULL)_win10_network.exe"
136 |     }
137 | }
138 | if([version]$CUDA_VERSION_FULL -ge [version]"11.5.1"){
139 |     $CUDA_REPO_PKG_LOCAL="cuda_$($CUDA_VERSION_FULL)_windows_network.exe"
140 | } else {
141 |     $CUDA_REPO_PKG_LOCAL="cuda_$($CUDA_VERSION_FULL)_win10_network.exe"
142 | }
143 | 
144 | ## ------------
145 | ## Install CUDA
146 | ## ------------
147 | 
148 | # Get CUDA network installer, retrying up to $download_attempts_max times.
149 | Write-Output "Downloading CUDA Network Installer for $($CUDA_VERSION_FULL) from: $($CUDA_REPO_PKG_REMOTE)"
150 | 
151 | $downloaded = $false
152 | $download_attempt = 0
153 | $download_attempt_delay = 30  # seconds to wait between attempts
154 | $download_attempts_max = 5
155 | 
156 | while (-not $downloaded) {
157 |     try { Invoke-WebRequest $CUDA_REPO_PKG_REMOTE -OutFile $CUDA_REPO_PKG_LOCAL | Out-Null } catch { Write-Output "Warning: download request failed: $($_.Exception.Message)" }
158 |     $download_attempt++
159 |     # If download succeeded, break out the loop.
160 |     if(Test-Path -Path $CUDA_REPO_PKG_LOCAL){
161 |         Write-Output "Downloading Complete"
162 |         $downloaded=$true
163 |     } else {
164 |         # If download failed, either wait and try again, or give up and error.
165 |         if ($download_attempt -lt $download_attempts_max) {
166 |             Write-Output "Error: Failed to download $($CUDA_REPO_PKG_LOCAL) (attempt $($download_attempt)/$($download_attempts_max)). Retrying."
167 |             # Sleep for a number of seconds.
168 |             Start-Sleep $download_attempt_delay
169 |         } else {
170 |             Write-Output "Error: Failed to download $($CUDA_REPO_PKG_LOCAL) after $($download_attempts_max) attempts. Aborting."
171 |             # Abort the script.
172 |             exit 1
173 |         }
174 |     }
175 | }
176 | 
177 | # Invoke silent install of CUDA (via network installer)
178 | Write-Output "Installing CUDA $($CUDA_VERSION_FULL). Subpackages $($CUDA_PACKAGES)"
179 | $cuda_installer = Start-Process -Wait -PassThru -FilePath .\"$($CUDA_REPO_PKG_LOCAL)" -ArgumentList "-s $($CUDA_PACKAGES)"
180 | 
181 | # Check the installer's own exit code; Start-Process does not report it through $? or $LASTEXITCODE.
182 | if (-not $cuda_installer -or $cuda_installer.ExitCode -ne 0) {
183 |     Write-Output "Error: CUDA installer reported error. $($cuda_installer.ExitCode)"
184 |     exit 1
185 | }
186 | 
187 | # Work out the toolkit install directory and the name of the versioned CUDA_PATH_Vx_y variable.
188 | $CUDA_PATH_VX_Y = "CUDA_PATH_V$($CUDA_MAJOR)_$($CUDA_MINOR)"
189 | $CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$($CUDA_MAJOR).$($CUDA_MINOR)"
190 | # Export both to this session's environment; note that CUDA_PATH_VX_Y holds the variable *name*, not a path.
191 | $env:CUDA_PATH = $CUDA_PATH
192 | $env:CUDA_PATH_VX_Y = $CUDA_PATH_VX_Y
193 | Write-Output "CUDA_PATH $($CUDA_PATH)"
194 | Write-Output "CUDA_PATH_VX_Y $($CUDA_PATH_VX_Y)"
195 | 
196 | # PATH needs updating elsewhere, anything in here won't persist.
197 | # Append $CUDA_PATH/bin to path.
198 | # Set CUDA_PATH as an environmental variable
199 | 
200 | # When running under GitHub Actions, append to the GITHUB_ENV / GITHUB_PATH files so that later steps see CUDA.
201 | if (Test-Path "env:GITHUB_ACTIONS") {
202 |     # $env:CUDA_PATH_VX_Y expands to the versioned variable name, so the second entry reads CUDA_PATH_Vx_y=<path>.
203 |     Write-Output "Adding CUDA to CUDA_PATH, CUDA_PATH_X_Y and PATH"
204 |     "CUDA_PATH=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
205 |     "$env:CUDA_PATH_VX_Y=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
206 |     "$env:CUDA_PATH/bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
207 | }
208 | 


--------------------------------------------------------------------------------
/testing/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # ======================================================================== #
  2 | # Copyright 2023-2024 Ingo Wald                                            #
  3 | #                                                                          #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");          #
  5 | # you may not use this file except in compliance with the License.         #
  6 | # You may obtain a copy of the License at                                  #
  7 | #                                                                          #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0                           #
  9 | #                                                                          #
 10 | # Unless required by applicable law or agreed to in writing, software      #
 11 | # distributed under the License is distributed on an "AS IS" BASIS,        #
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
 13 | # See the License for the specific language governing permissions and      #
 14 | # limitations under the License.                                           #
 15 | # ======================================================================== #
 16 | 
 17 | # directory for both "real" test cases and unit testing
 18 | 
 19 | project(cukdTests LANGUAGES CUDA CXX)
 20 | 
 21 | # empty-input tests: every device builder, on both simple float3 and Photon types
 22 | add_executable(cukdTestBitonicEmptyInput testBuilderEmptyInput.cu)
 23 | target_link_libraries(cukdTestBitonicEmptyInput PRIVATE cudaKDTree)
 24 | target_compile_definitions(cukdTestBitonicEmptyInput PUBLIC BUILDER_TO_TEST=buildTree_bitonic)
 25 | add_test(NAME cukdTestBitonicEmptyInput COMMAND cukdTestBitonicEmptyInput)
 26 | 
 27 | add_executable(cukdTestThrustEmptyInput testBuilderEmptyInput.cu)
 28 | target_link_libraries(cukdTestThrustEmptyInput PRIVATE cudaKDTree)
 29 | target_compile_definitions(cukdTestThrustEmptyInput PUBLIC BUILDER_TO_TEST=buildTree_thrust)
 30 | add_test(NAME cukdTestThrustEmptyInput COMMAND cukdTestThrustEmptyInput)
 31 | 
 32 | add_executable(cukdTestInPlaceEmptyInput testBuilderEmptyInput.cu)
 33 | target_link_libraries(cukdTestInPlaceEmptyInput PRIVATE cudaKDTree)
 34 | target_compile_definitions(cukdTestInPlaceEmptyInput PUBLIC BUILDER_TO_TEST=buildTree_inPlace)
 35 | add_test(NAME cukdTestInPlaceEmptyInput COMMAND cukdTestInPlaceEmptyInput)
 36 | 
 37 | # simple-input tests (1000 random points): every device builder, on both simple float3 and Photon types
 38 | add_executable(cukdTestBitonicSimpleInput testBuilderSimpleInput.cu)
 39 | target_link_libraries(cukdTestBitonicSimpleInput PRIVATE cudaKDTree)
 40 | target_compile_definitions(cukdTestBitonicSimpleInput PUBLIC BUILDER_TO_TEST=buildTree_bitonic)
 41 | add_test(NAME cukdTestBitonicSimpleInput COMMAND cukdTestBitonicSimpleInput)
 42 | 
 43 | add_executable(cukdTestThrustSimpleInput testBuilderSimpleInput.cu)
 44 | target_link_libraries(cukdTestThrustSimpleInput PRIVATE cudaKDTree)
 45 | target_compile_definitions(cukdTestThrustSimpleInput PUBLIC BUILDER_TO_TEST=buildTree_thrust)
 46 | add_test(NAME cukdTestThrustSimpleInput COMMAND cukdTestThrustSimpleInput)
 47 | 
 48 | add_executable(cukdTestInPlaceSimpleInput testBuilderSimpleInput.cu)
 49 | target_link_libraries(cukdTestInPlaceSimpleInput PRIVATE cudaKDTree)
 50 | target_compile_definitions(cukdTestInPlaceSimpleInput PUBLIC BUILDER_TO_TEST=buildTree_inPlace)
 51 | add_test(NAME cukdTestInPlaceSimpleInput COMMAND cukdTestInPlaceSimpleInput)
 52 | 
 53 | 
 54 | add_executable(cukdTestPayloadSampleFromReadme testPayloadSampleFromReadme.cu)
 55 | target_link_libraries(cukdTestPayloadSampleFromReadme PRIVATE cudaKDTree)
 56 | target_compile_definitions(cukdTestPayloadSampleFromReadme PUBLIC BUILDER_TO_TEST=buildTree_thrust)
 57 | add_test(NAME cukdTestPayloadSampleFromReadme COMMAND cukdTestPayloadSampleFromReadme)
 58 | 
 59 | 
 60 | # host builder tests (testHostBuilder*.cu); no BUILDER_TO_TEST definition is passed to these
 61 | add_executable(cukdTestHostBuilderEmptyInput testHostBuilderEmptyInput.cu)
 62 | target_link_libraries(cukdTestHostBuilderEmptyInput PRIVATE cudaKDTree)
 63 | add_test(NAME cukdTestHostBuilderEmptyInput COMMAND cukdTestHostBuilderEmptyInput)
 64 | 
 65 | add_executable(cukdTestHostBuilderSimpleInput testHostBuilderSimpleInput.cu)
 66 | target_link_libraries(cukdTestHostBuilderSimpleInput PRIVATE cudaKDTree)
 67 | add_test(NAME cukdTestHostBuilderSimpleInput COMMAND cukdTestHostBuilderSimpleInput)
 68 | 
 69 | 
 70 | # checks, over a wide range of input data, that the host, thrust,
 71 | # bitonic, and inplace builders all produce the same tree.
 72 | add_executable(cukdTestBuildersSameResult testBuildersSameResult.cu)
 73 | target_link_libraries(cukdTestBuildersSameResult PRIVATE cudaKDTree)
 74 | add_test(NAME cukdTestBuildersSameResult COMMAND cukdTestBuildersSameResult)
 75 | 
 76 | 
 77 | 
 78 | # make sure all knn variants for a _spatial_ k-d tree will at least compile
 79 | add_executable(cukdTestCompileSpatialKNN compileSpatialKNN.cu)
 80 | target_link_libraries(cukdTestCompileSpatialKNN PRIVATE cudaKDTree)
 81 | add_test(NAME cukdTestCompileSpatialKNN COMMAND cukdTestCompileSpatialKNN)
 82 | # make sure all knn variants for a regular (non-spatial) k-d tree will at least compile
 83 | add_executable(cukdTestCompileKNN compileKNN.cu)
 84 | target_link_libraries(cukdTestCompileKNN PRIVATE cudaKDTree)
 85 | add_test(NAME cukdTestCompileKNN COMMAND cukdTestCompileKNN)
 86 | 
 87 | 
 88 | 
 89 | # add a (compile-only) test to see if we can link two different object
 90 | # files (that both include the same builders) without getting any
 91 | # multiple definition errors. No add_test() here: building the target is the test.
 92 | add_executable(cukdTestMultipleDefinitions
 93 |   testMultipleDefinitions_a.cu  testMultipleDefinitions_b.cu)
 94 | target_link_libraries(cukdTestMultipleDefinitions PRIVATE cudaKDTree)
 95 | 
 96 | 
 97 | # ==================================================================
 98 | # issue 5: reported wrong/inconsistent results for different builders,
 99 | # in a given set generated by given random seed
100 | # ==================================================================
101 | #
102 | add_executable(cukdTestIssue5_thrust issue5.cu)
103 | target_link_libraries(cukdTestIssue5_thrust PRIVATE cudaKDTree)
104 | target_compile_definitions(cukdTestIssue5_thrust PUBLIC BUILDER_TO_TEST=buildTree_thrust)
105 | add_test(NAME cukdTestIssue5_thrust COMMAND cukdTestIssue5_thrust)
106 | #
107 | add_executable(cukdTestIssue5_bitonic issue5.cu)
108 | target_link_libraries(cukdTestIssue5_bitonic PRIVATE cudaKDTree)
109 | target_compile_definitions(cukdTestIssue5_bitonic PUBLIC BUILDER_TO_TEST=buildTree_bitonic)
110 | add_test(NAME cukdTestIssue5_bitonic COMMAND cukdTestIssue5_bitonic)
111 | #
112 | add_executable(cukdTestIssue5_inPlace issue5.cu)
113 | target_link_libraries(cukdTestIssue5_inPlace PRIVATE cudaKDTree)
114 | target_compile_definitions(cukdTestIssue5_inPlace PUBLIC BUILDER_TO_TEST=buildTree_inPlace)
115 | add_test(NAME cukdTestIssue5_inPlace COMMAND cukdTestIssue5_inPlace)
116 | 
117 | 
118 | 
119 | enable_testing()  # turns on CTest generation for the add_test() entries of this directory
120 | 


--------------------------------------------------------------------------------
/testing/compileKNN.cu:
--------------------------------------------------------------------------------
 1 | // ======================================================================== //
 2 | // Copyright 2018-2023 Ingo Wald                                            //
 3 | //                                                                          //
 4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
 5 | // you may not use this file except in compliance with the License.         //
 6 | // You may obtain a copy of the License at                                  //
 7 | //                                                                          //
 8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
 9 | //                                                                          //
10 | // Unless required by applicable law or agreed to in writing, software      //
11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
13 | // See the License for the specific language governing permissions and      //
14 | // limitations under the License.                                           //
15 | // ======================================================================== //
16 | 
17 | #include "cukd/knn.h"
18 | 
19 | using namespace cukd;
20 | 
21 | __global__
22 | void invokeQueries(float3 *d_tree, int N,
23 |                    box_t<float3> *d_worldBounds,
24 |                    float *d_results, float3 *d_queries)
25 | {
26 |   const int tid = blockIdx.x*blockDim.x + threadIdx.x;
27 |   
28 |   // stack-based traversal, heap candidate list
29 |   HeapCandidateList<100> sbHeap(10.f);
30 |   stackBased::knn(sbHeap,d_queries[tid],d_tree,N);
31 |   // stack-based traversal, fixed-size candidate list
32 |   FixedCandidateList<4> sbFixed(10.f);
33 |   stackBased::knn(sbFixed,d_queries[tid],d_tree,N);
34 | 
35 |   // stack-free traversal, heap candidate list
36 |   HeapCandidateList<100> sfHeap(10.f);
37 |   stackFree::knn(sfHeap,d_queries[tid],d_tree,N);
38 |   // stack-free traversal, fixed-size candidate list
39 |   FixedCandidateList<4> sfFixed(10.f);
40 |   stackFree::knn(sfFixed,d_queries[tid],d_tree,N);
41 | 
42 |   // the cct variants take an additional 'world bounds' argument
43 |   HeapCandidateList<100> cctHeap(10.f);
44 |   cct::knn(cctHeap,d_queries[tid],*d_worldBounds,d_tree,N);
45 | 
46 |   FixedCandidateList<4> cctFixed(10.f);
47 |   cct::knn(cctFixed,d_queries[tid],*d_worldBounds,d_tree,N);
48 | 
49 |   // write out a value that depends on all six queries
50 |   const float total
51 |     = sbHeap.maxRadius2()
52 |     + sbFixed.maxRadius2()
53 |     + sfHeap.maxRadius2()
54 |     + sfFixed.maxRadius2()
55 |     + cctHeap.maxRadius2()
56 |     + cctFixed.maxRadius2();
57 | 
58 |   d_results[tid] = total;
59 | 
60 | }
61 | 
62 | int main(int, const char **)
63 | {
64 |   /* this only tests _compile_ capability: the kernel in this file is never launched */
65 |   return 0;
66 | }
67 | 
68 | 


--------------------------------------------------------------------------------
/testing/compileSpatialKNN.cu:
--------------------------------------------------------------------------------
 1 | // ======================================================================== //
 2 | // Copyright 2018-2023 Ingo Wald                                            //
 3 | //                                                                          //
 4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
 5 | // you may not use this file except in compliance with the License.         //
 6 | // You may obtain a copy of the License at                                  //
 7 | //                                                                          //
 8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
 9 | //                                                                          //
10 | // Unless required by applicable law or agreed to in writing, software      //
11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
13 | // See the License for the specific language governing permissions and      //
14 | // limitations under the License.                                           //
15 | // ======================================================================== //
16 | 
17 | #include "cukd/knn.h"
18 | 
19 | using namespace cukd;
20 | 
21 | __global__
22 | void invokeQueries(SpatialKDTree<float3> *d_tree, float *d_results, float3 *d_queries)
23 | {
24 |   const int tid = blockIdx.x*blockDim.x + threadIdx.x;
25 |   
26 |   // stack-based traversal, heap and fixed-size candidate lists
27 |   HeapCandidateList<100> sbHeap(10.f);
28 |   stackBased::knn(sbHeap,*d_tree,d_queries[tid]);
29 |   FixedCandidateList<4> sbFixed(10.f);
30 |   stackBased::knn(sbFixed,*d_tree,d_queries[tid]);
31 |   
32 |   // cct traversal, heap and fixed-size candidate lists
33 |   HeapCandidateList<100> cctHeap(10.f);
34 |   cct::knn(cctHeap,*d_tree,d_queries[tid]);
35 |   FixedCandidateList<4> cctFixed(10.f);
36 |   cct::knn(cctFixed,*d_tree,d_queries[tid]);
37 | 
38 |   // write out a value that depends on all four queries
39 |   const float total = sbHeap.maxRadius2()
40 |     + sbFixed.maxRadius2()
41 |     + cctHeap.maxRadius2() + cctFixed.maxRadius2();
42 |   d_results[tid] = total;
43 | }
44 | 
45 | int main(int, const char **)
46 | {
47 |   /* this only tests _compile_ capability: the kernel in this file is never launched */
48 |   return 0;
49 | }
50 | 
51 | 


--------------------------------------------------------------------------------
/testing/issue5.cu:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2024 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #include "cukd/builder.h"
 18 | #include <random>
 19 | #include "cukd/fcp.h"
 20 | 
 21 | 
 22 | #define AS_STRING(x) #x
 23 | #define TO_STRING(x) AS_STRING(x)
 24 | 
 25 | void generateRandomPoints(size_t nb_,
 26 |                           int seed_,
 27 |                           std::vector<float3> &destPts_)
 28 | {
 29 |   // every coordinate comes from std::rand(), mapped linearly onto [-maxVal,+maxVal]
 30 |   const double maxVal = 100.0;
 31 |   const double scale = 2.0f * maxVal / RAND_MAX;
 32 |   auto nextCoord = [&]() { return static_cast< float >( std::rand() * scale - maxVal ); };
 33 |   std::srand( seed_ );
 34 |   destPts_.resize( nb_ );
 35 |   for ( float3 &pt : destPts_ ) {
 36 |     pt.x = nextCoord();  // x, y, z are drawn in this order for each point
 37 |     pt.y = nextCoord();
 38 |     pt.z = nextCoord();
 39 |   }
 40 | }
 41 | 
 42 | 
 43 | __global__ void checkResult(float3 *data,
 44 |                             int numData,
 45 |                             float3 queryPoint,
 46 |                             cukd::FcpSearchParams params,
 47 |                             float expectedSqrDist)
 48 | {
 49 |   // only the first thread of a block runs the query and prints what it found
 50 |   if (threadIdx.x == 0) {
 51 |     int hit = cukd::stackBased::fcp(queryPoint,data,numData,params);
 52 |     if (hit >= 0) {
 53 |       float3 found = data[hit];
 54 |       float foundSqrDist = cukd::fSqrDistance(found,queryPoint);
 55 |       printf("found res %i, pos %f %f %f sqrdist %f expected %f\n",
 56 |              hit,found.x,found.y,found.z,foundSqrDist,expectedSqrDist);
 57 |     } else {
 58 |       printf("no result!?\n");
 59 |     }
 60 |   }
 61 | }
 62 | 
 63 | float distance(float3 a, float3 b)
 64 | {
 65 |   // per-axis differences between the two points
 66 |   const float dx = a.x-b.x;
 67 |   const float dy = a.y-b.y;
 68 |   const float dz = a.z-b.z;
 69 |   // note: no sqrt is taken, so this is the _squared_ distance
 70 |   return dx*dx + dy*dy + dz*dz;
 71 | }
 72 | 
 73 | int main(int, char **)
 74 | {
 75 |   std::vector<float3> points;
 76 |   // Point are generated like this (nb_= 90167, seed_= 33):
 77 |   int nb_= 90167, seed_= 33;
 78 |   generateRandomPoints(nb_,seed_,points);
 79 |   // It should start like this:
 80 |   // [0] {x=-99.1088562 y=-87.9879150 z=27.7626877 } float3
 81 |   // [1] {x=-38.3892326 y=31.5713978 z=-37.0891457 } float3
 82 |   // [2] {x=-22.0435200 y=-92.5473785 z=89.4833221 } float3
 83 |   // [3] {x=48.5274811 y=-94.0671997 z=80.3888092 } float3
 84 |   // [4] {x=-33.9030113 y=34.4157219 z=95.2085953 } float3
 85 |   for (int i=0;i<5;i++)
 86 |     printf("[%i] (%f %f %f)\n",i,points[i].x,points[i].y,points[i].z);
 87 |   
 88 |   cukd::box_t<float3> *worldBounds = 0;
 89 |   CUKD_CUDA_CALL(MallocManaged((void **)&worldBounds,sizeof(*worldBounds)));
 90 |   float3 *d_points = 0;
 91 |   CUKD_CUDA_CALL(MallocManaged((void **)&d_points,points.size()*sizeof(float3)));
 92 |   CUKD_CUDA_CALL(Memcpy(d_points,points.data(),points.size()*sizeof(float3),
 93 |                         cudaMemcpyDefault));
 94 |   
 95 |   cukd::BUILDER_TO_TEST
 96 |     (d_points,points.size(),worldBounds);
 97 |   
 98 |   std::cout << "world bounds is " << *worldBounds << std::endl;
 99 | 
100 |   // The query point is {x=-98.4496613 y=76.9219055 z=25.8888512 }
101 |   float3 queryPoint = make_float3(-98.4496613, 76.9219055, 25.8888512);
102 |   // The "cutOffRadius" is 5.0
103 |   cukd::FcpSearchParams params;
104 |   params.cutOffRadius = 5.f;
105 |   
106 |   // The closest point should be at squared distance of 2.8466301
107 |   float expectedSqrDist = 2.8466301f;
108 | 
109 |   float closestDist = INFINITY;
110 |   int   closest = -1;
111 |   for (int i=0;i<(int)points.size();i++) {  // host-side brute-force reference search
112 |     float dist = distance(points[i],queryPoint);
113 |     if (dist >= closestDist) continue;
114 |     closestDist = dist;
115 |     closest = i;
116 |   }
117 |   float3 pt = points[closest];
118 |   std::cout << "reference closest dist is " << pt.x << ", " << pt.y << ", " << pt.z
119 |          << " at dist " << closestDist << std::endl;
120 |   checkResult<<<1,32>>>(d_points,points.size(),
121 |                         queryPoint,params,expectedSqrDist);
122 |   CUKD_CUDA_SYNC_CHECK();
123 |   CUKD_CUDA_CALL(Free(d_points));     // release the managed allocations made above
124 |   CUKD_CUDA_CALL(Free(worldBounds));
125 |   return 0;
126 | }
127 | 
128 | 


--------------------------------------------------------------------------------
/testing/test-include-as-subdirectory/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.28)
 2 | 
 3 | project(test-cukd)  # no LANGUAGES listed here; NOTE(review): CUDA is presumably enabled by the included project - confirm
 4 | 
 5 | set(CMAKE_CUDA_ARCHITECTURES native)  # set before the subdirectory below is processed
 6 | add_subdirectory(../../ BUILD_CUKD EXCLUDE_FROM_ALL)  # repository root, built into the BUILD_CUKD binary directory
 7 | 
 8 | add_executable(testBuild-sample-from-subdirectory
 9 |   ../../sample.cu
10 |   )
11 | target_link_libraries(testBuild-sample-from-subdirectory
12 |   cudaKDTree
13 |   )
14 | 


--------------------------------------------------------------------------------
/testing/testBuilderEmptyInput.cu:
--------------------------------------------------------------------------------
 1 | // ======================================================================== //
 2 | // Copyright 2018-2023 Ingo Wald                                            //
 3 | //                                                                          //
 4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
 5 | // you may not use this file except in compliance with the License.         //
 6 | // You may obtain a copy of the License at                                  //
 7 | //                                                                          //
 8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
 9 | //                                                                          //
10 | // Unless required by applicable law or agreed to in writing, software      //
11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
13 | // See the License for the specific language governing permissions and      //
14 | // limitations under the License.                                           //
15 | // ======================================================================== //
16 | 
17 | #include "cukd/builder.h"
18 | #include <random>
19 | 
20 | namespace test_float3 {
21 |   void test_empty()
22 |   {
23 |     std::cout << "testing float3 array, empty input." << std::endl;
24 |     
25 |     // a null array with a count of zero: there is nothing to build, the
26 |     // pointer type only serves to select the float3 builder instantiation.
27 |     float3 *emptyArray = nullptr;
28 |     int emptyCount = 0;
29 |     // BUILDER_TO_TEST is a compile definition supplied by the cmakefile:
30 |     cukd::BUILDER_TO_TEST(emptyArray,emptyCount);
31 |   }
32 | }
33 | 
34 | namespace test_photon {
35 |   /*! for those wondering what this test is for: have a look at Henrik
36 |     Wann Jensen, "Realistic Image Synthesis using Photon Mapping"
37 |     https://www.amazon.com/Realistic-Image-Synthesis-Photon-Mapping/dp/1568811470 */
38 |   struct Photon {
39 |     float3 position;
40 |     float3 power;
41 |     uint16_t normal_phi;
42 |     uint8_t  normal_theta;
43 |     uint8_t  splitDim;  // read and written through Photon_traits::get_dim / set_dim
44 |   };
45 | 
46 |   struct Photon_traits {
47 |     using point_t = float3;
48 |     
49 |     enum { has_explicit_dim = true };
50 |     
51 |     static inline __both__
52 |     const point_t &get_point(const Photon &p)
53 |     { return p.position; }
54 |     
55 |     static inline __both__ float get_coord(const Photon &p, int d)
56 |     { return cukd::get_coord(p.position,d); }
57 |     
58 |     static inline __device__ int  get_dim(const Photon &p)
59 |     { return p.splitDim; }
60 |     
61 |     static inline __device__ void set_dim(Photon &p, int d)
62 |     { p.splitDim = d; }
63 |   };
64 |   
65 |   void test_empty()
66 |   {
67 |     std::cout << "testing 'Photons' array (float3 plus payload), empty input." << std::endl;
68 | 
69 |     // dummy arrays, just to get the types to force the right builder
70 |     // instantiation:
71 |     Photon *points = 0;
72 |     int numPoints = 0;
73 |     // BUILDER_TO_TEST supplied by cmakefile:
74 |     cukd::BUILDER_TO_TEST<Photon,Photon_traits>
75 |       (points,numPoints);
76 |   }
77 | }
78 | 
79 | int main(int, const char **)
80 | {
81 |   // run each test, then let CUKD_CUDA_SYNC_CHECK look for CUDA errors
82 |   void (*const tests[])() = { test_float3::test_empty, test_photon::test_empty };
83 |   for (auto runTest : tests) {
84 |     runTest();
85 |     CUKD_CUDA_SYNC_CHECK();
86 |   }
87 |   return 0;
88 | }
89 | 
90 | 


--------------------------------------------------------------------------------
/testing/testBuilderSimpleInput.cu:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2023 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #include "cukd/builder.h"
 18 | #include <random>
 19 | 
 20 | #define AS_STRING(x) #x
 21 | #define TO_STRING(x) AS_STRING(x)
 22 | 
 23 | namespace test_float3 {
 24 |   /*! runs the builder named by BUILDER_TO_TEST over 1000 random float3
 25 |     points, each coordinate drawn uniformly from [0,100) */
 26 |   void test_simple()
 27 |   {
 28 |     std::cout << "testing `" << TO_STRING(BUILDER_TO_TEST)
 29 |               << "` on float3 array, 1000 uniform random points." << std::endl;
 30 |     int numPoints = 1000;
 31 |     float3 *points = nullptr;
 32 |     CUKD_CUDA_CALL(MallocManaged((void **)&points,numPoints*sizeof(float3)));
 33 | 
 34 |     // default-constructed engine: same seed, hence same points, every run
 35 |     std::default_random_engine seedSource;
 36 |     std::mt19937 rng(seedSource());
 37 |     std::uniform_real_distribution<float> coord(0.f,100.f);
 38 |     for (float3 *p = points; p != points+numPoints; ++p) {
 39 |       p->x = coord(rng);
 40 |       p->y = coord(rng);
 41 |       p->z = coord(rng);
 42 |     }
 43 |     cukd::BUILDER_TO_TEST(points,numPoints);
 44 |     CUKD_CUDA_CALL(Free(points));
 45 |   }
 46 | }
 47 | 
 48 | namespace test_photon {
 49 |   /*! for those wondering what this test is for: have a look at Henrik
 50 |     Wann Jensen, "Realistic Image Synthesis using Photon Mapping"
 51 |     https://www.amazon.com/Realistic-Image-Synthesis-Photon-Mapping/dp/1568811470 */
 52 |   struct Photon {
 53 |     float3 position;
 54 |     float3 power;
 55 |     uint16_t normal_phi;
 56 |     uint8_t  normal_theta;
 57 |     uint8_t  splitDim;
 58 |   };
 59 | 
 60 |   /*! builder traits for Photon; the split dim is kept in the photon itself */
 61 |   struct Photon_traits {
 62 |     using point_t = float3;
 63 |     enum { has_explicit_dim = true };
 64 |     
 65 |     static inline __both__
 66 |     const point_t &get_point(const Photon &p)
 67 |     { return p.position; }
 68 |     
 69 |     static inline __both__ float get_coord(const Photon &p, int d)
 70 |     { return cukd::get_coord(p.position,d); }
 71 |     
 72 |     static inline __device__ int  get_dim(const Photon &p)
 73 |     { return p.splitDim; }
 74 |     
 75 |     static inline __device__ void set_dim(Photon &p, int d)
 76 |     { p.splitDim = d; }
 77 |   };
 78 | 
 79 |   /*! runs BUILDER_TO_TEST (supplied by the cmakefile) on 1000 random photons */
 80 |   void test_simple()
 81 |   {
 82 |     // TO_STRING, not AS_STRING: the macro must be expanded before it is
 83 |     // stringified, else this prints the literal text "BUILDER_TO_TEST"
 84 |     std::cout << "testing `" << TO_STRING(BUILDER_TO_TEST)
 85 |               << "` on 'Photons' array (float3 plus payload), 1000 random photons." << std::endl;
 86 | 
 87 |     int numPhotons = 1000;
 88 |     Photon *photons = 0;
 89 |     CUKD_CUDA_CALL(MallocManaged((void **)&photons,numPhotons*sizeof(Photon)));
 90 | 
 91 |     std::default_random_engine rd;
 92 |     std::mt19937 gen(rd());
 93 |     std::uniform_real_distribution<float> dist(0.f,100.f);
 94 |     for (int i=0;i<numPhotons;i++) {
 95 |       photons[i].position.x = dist(gen);
 96 |       photons[i].position.y = dist(gen);
 97 |       photons[i].position.z = dist(gen);
 98 |       photons[i].power = make_float3(0.f,0.f,0.f);
 99 |       photons[i].normal_theta = 0;
100 |       photons[i].normal_phi = 0;
101 |     }
102 |     cukd::box_t<float3> *worldBounds = 0;
103 |     CUKD_CUDA_CALL(MallocManaged((void **)&worldBounds,sizeof(*worldBounds)));
104 |     cukd::BUILDER_TO_TEST<Photon,Photon_traits>
105 |       (photons,numPhotons,worldBounds);
106 |     std::cout << "world bounds is " << *worldBounds << std::endl;
107 |     CUKD_CUDA_CALL(Free(photons));
108 |     CUKD_CUDA_CALL(Free(worldBounds));
109 |   }
110 | }
111 | 
112 | int main(int, const char **)
113 | {
114 |   // run each test, then let CUKD_CUDA_SYNC_CHECK look for CUDA errors
115 |   void (*const tests[])() = { test_float3::test_simple, test_photon::test_simple };
116 |   for (auto runTest : tests) {
117 |     runTest();
118 |     CUKD_CUDA_SYNC_CHECK();
119 |   }
120 |   return 0;
121 | }
122 | 
123 | 


--------------------------------------------------------------------------------
/testing/testHostBuilderEmptyInput.cu:
--------------------------------------------------------------------------------
 1 | // ======================================================================== //
 2 | // Copyright 2018-2023 Ingo Wald                                            //
 3 | //                                                                          //
 4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
 5 | // you may not use this file except in compliance with the License.         //
 6 | // You may obtain a copy of the License at                                  //
 7 | //                                                                          //
 8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
 9 | //                                                                          //
10 | // Unless required by applicable law or agreed to in writing, software      //
11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
13 | // See the License for the specific language governing permissions and      //
14 | // limitations under the License.                                           //
15 | // ======================================================================== //
16 | 
17 | #include "cukd/builder_host.h"
18 | #include <random>
19 | 
20 | namespace test_float3 {
21 |   /*! calls cukd::buildTree_host on a null float3 array with zero points */
22 |   void test_empty()
23 |   {
24 |     std::cout << "testing float3 array, empty input." << std::endl;
25 | 
26 |     // typed null pointer plus zero count: only the types matter here,
27 |     // they force the right builder instantiation
28 |     int emptyCount = 0;
29 |     float3 *emptyArray = nullptr;
30 |     cukd::buildTree_host(emptyArray,emptyCount);
31 |   }
32 | }
33 | 
34 | namespace test_photon {
35 |   /*! for background on the data type used in this test, see Henrik
36 |     Wann Jensen, "Realistic Image Synthesis using Photon Mapping"
37 |     https://www.amazon.com/Realistic-Image-Synthesis-Photon-Mapping/dp/1568811470 */
38 |   struct Photon {
39 |     float3 position;
40 |     float3 power;
41 |     uint16_t normal_phi;
42 |     uint8_t  normal_theta;
43 |     uint8_t  splitDim;
44 |   };
45 | 
46 |   /*! builder traits for Photon; the active '#if 1' branch declares
47 |     that photons carry no explicit split dimension */
48 |   struct Photon_traits {
49 |     using point_t = float3;
50 | 
51 |     static inline __both__ float get_coord(const Photon &photon, int dim)
52 |     { return cukd::get_coord(photon.position,dim); }
53 | 
54 |     static inline __both__ const point_t &get_point(const Photon &photon)
55 |     { return photon.position; }
56 | 
57 | #if 1
58 |     enum { has_explicit_dim = false };
59 | #else
60 |     enum { has_explicit_dim = true };
61 |     static inline __both__ void set_dim(Photon &photon, int dim)
62 |     { photon.splitDim = dim; }
63 |     static inline __both__ int  get_dim(const Photon &photon)
64 |     { return photon.splitDim; }
65 | #endif
66 |   };
67 | 
68 |   /*! calls cukd::buildTree_host<Photon,Photon_traits> on a null array of size zero */
69 |   void test_empty()
70 |   {
71 |     std::cout << "testing 'Photons' array (float3 plus payload), empty input." << std::endl;
72 | 
73 |     // typed null pointer and zero count; the explicit template
74 |     // arguments below pick the builder instantiation
75 |     int numPhotons = 0;
76 |     Photon *noPhotons = nullptr;
77 |     cukd::buildTree_host<Photon,Photon_traits>
78 |       (noPhotons,numPhotons);
79 |   }
80 | }
81 | 
82 | int main(int, const char **)
83 | {
84 |   void (*const tests[])() = { test_float3::test_empty, test_photon::test_empty };
85 |   for (auto runTest : tests) runTest();
86 |   return 0;
87 | }
88 | 
89 | 


--------------------------------------------------------------------------------
/testing/testHostBuilderSimpleInput.cu:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2023 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #include "cukd/builder_host.h"
 18 | #include <random>
 19 | 
 20 | #define AS_STRING(x) #x
 21 | #define TO_STRING(x) AS_STRING(x)
 22 | 
 23 | namespace test_float3 {
 24 |   /*! runs cukd::buildTree_host over 1000 random float3 points, each
 25 |     coordinate drawn uniformly from [0,100) */
 26 |   void test_simple()
 27 |   {
 28 |     std::cout << "testing `buildTree_host` on float3 array, 1000 uniform random points." << std::endl;
 29 | 
 30 |     int numPoints = 1000;
 31 |     float3 *points = nullptr;
 32 |     CUKD_CUDA_CALL(MallocManaged((void **)&points,numPoints*sizeof(float3)));
 33 |     // default-constructed engine: same seed, hence same points, every run
 34 |     std::default_random_engine seedSource;
 35 |     std::mt19937 rng(seedSource());
 36 |     std::uniform_real_distribution<float> coord(0.f,100.f);
 37 |     for (float3 *p = points; p != points+numPoints; ++p) {
 38 |       p->x = coord(rng);
 39 |       p->y = coord(rng);
 40 |       p->z = coord(rng);
 41 |     }
 42 |     cukd::buildTree_host(points,numPoints);
 43 |     CUKD_CUDA_CALL(Free(points));
 44 |   }
 45 | }
 46 | 
 47 | namespace test_photon {
 48 |   /*! for background on the data type used in this test, see Henrik
 49 |     Wann Jensen, "Realistic Image Synthesis using Photon Mapping"
 50 |     https://www.amazon.com/Realistic-Image-Synthesis-Photon-Mapping/dp/1568811470 */
 51 |   struct Photon {
 52 |     float3 position;
 53 |     float3 power;
 54 |     uint16_t normal_phi;
 55 |     uint8_t  normal_theta;
 56 |     uint8_t  splitDim;
 57 |   };
 58 | 
 59 |   /*! builder traits for Photon; the split dimension is kept in the
 60 |     photon itself and accessed through get_dim/set_dim */
 61 |   struct Photon_traits {
 62 |     enum { has_explicit_dim = true };
 63 |     using point_t = float3;
 64 | 
 65 |     static inline __both__ void set_dim(Photon &photon, int dim)
 66 |     { photon.splitDim = dim; }
 67 | 
 68 |     static inline __both__ int  get_dim(const Photon &photon)
 69 |     { return photon.splitDim; }
 70 | 
 71 |     static inline __both__ float get_coord(const Photon &photon, int dim)
 72 |     { return cukd::get_coord(photon.position,dim); }
 73 | 
 74 |     static inline __both__ const point_t &get_point(const Photon &photon)
 75 |     { return photon.position; }
 76 |   };
 77 | 
 78 |   /*! runs cukd::buildTree_host over 1000 random photons and prints the
 79 |     world bounds box that was handed to the builder */
 80 |   void test_simple()
 81 |   {
 82 |     std::cout << "testing `buildTree_host` on 'Photons' array"
 83 |       " (float3 plus payload), 1000 random photons." << std::endl;
 84 | 
 85 |     // the photons live in a std::vector, so nothing to free at the end
 86 |     int numPhotons = 1000;
 87 |     std::vector<Photon> h_photons(numPhotons);
 88 | 
 89 |     std::default_random_engine seedSource;
 90 |     std::mt19937 rng(seedSource());
 91 |     std::uniform_real_distribution<float> coord(0.f,100.f);
 92 |     for (Photon &photon : h_photons) {
 93 |       photon.position.x = coord(rng);
 94 |       photon.position.y = coord(rng);
 95 |       photon.position.z = coord(rng);
 96 |       photon.power = make_float3(0.f,0.f,0.f);
 97 |       photon.normal_theta = 0;
 98 |       photon.normal_phi = 0;
 99 |     }
100 | 
101 |     cukd::box_t<float3> *worldBounds = nullptr;
102 |     CUKD_CUDA_CALL(MallocManaged((void **)&worldBounds,sizeof(*worldBounds)));
103 | 
104 |     Photon *photons = h_photons.data();
105 |     cukd::buildTree_host<Photon,Photon_traits>
106 |       (photons,numPhotons,worldBounds);
107 | 
108 |     std::cout << "world bounds is " << *worldBounds << std::endl;
109 |     CUKD_CUDA_CALL(Free(worldBounds));
110 |   }
111 | }
112 | 
113 | int main(int, const char **)
114 | {
115 |   // run each test, then let CUKD_CUDA_SYNC_CHECK look for CUDA errors
116 |   void (*const tests[])() = { test_float3::test_simple, test_photon::test_simple };
117 |   for (auto runTest : tests) {
118 |     runTest();
119 |     CUKD_CUDA_SYNC_CHECK();
120 |   }
121 |   return 0;
122 | }
123 | 
124 | 


--------------------------------------------------------------------------------
/testing/testMultipleDefinitions_a.cu:
--------------------------------------------------------------------------------
 1 | // ======================================================================== //
 2 | // Copyright 2023-2023 Ingo Wald                                            //
 3 | //                                                                          //
 4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
 5 | // you may not use this file except in compliance with the License.         //
 6 | // You may obtain a copy of the License at                                  //
 7 | //                                                                          //
 8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
 9 | //                                                                          //
10 | // Unless required by applicable law or agreed to in writing, software      //
11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
13 | // See the License for the specific language governing permissions and      //
14 | // limitations under the License.                                           //
15 | // ======================================================================== //
16 | 
17 | #include "cukd/builder.h"
18 | 
19 | void foo(float3 *data, int numData) // not called from main() below
20 | {
21 |   cukd::buildTree(data,numData,nullptr); // presumably here only to instantiate the float3 builder in this file
22 | }
23 | 
24 | int main(int, char **)
25 | { // nothing to run; presumably the test is that this file compiles and links
26 |   return 0;
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/testing/testMultipleDefinitions_b.cu:
--------------------------------------------------------------------------------
 1 | // ======================================================================== //
 2 | // Copyright 2023-2023 Ingo Wald                                            //
 3 | //                                                                          //
 4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
 5 | // you may not use this file except in compliance with the License.         //
 6 | // You may obtain a copy of the License at                                  //
 7 | //                                                                          //
 8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
 9 | //                                                                          //
10 | // Unless required by applicable law or agreed to in writing, software      //
11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
13 | // See the License for the specific language governing permissions and      //
14 | // limitations under the License.                                           //
15 | // ======================================================================== //
16 | 
17 | #include "cukd/builder.h"
18 | 
19 | void foo2(float3 *data, int numData) // no caller in this file
20 | {
21 |   cukd::buildTree(data,numData,nullptr); // same call as foo() in testMultipleDefinitions_a.cu
22 | }
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/testing/testPayloadSampleFromReadme.cu:
--------------------------------------------------------------------------------
  1 | // ======================================================================== //
  2 | // Copyright 2018-2023 Ingo Wald                                            //
  3 | //                                                                          //
  4 | // Licensed under the Apache License, Version 2.0 (the "License");          //
  5 | // you may not use this file except in compliance with the License.         //
  6 | // You may obtain a copy of the License at                                  //
  7 | //                                                                          //
  8 | //     http://www.apache.org/licenses/LICENSE-2.0                           //
  9 | //                                                                          //
 10 | // Unless required by applicable law or agreed to in writing, software      //
 11 | // distributed under the License is distributed on an "AS IS" BASIS,        //
 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 13 | // See the License for the specific language governing permissions and      //
 14 | // limitations under the License.                                           //
 15 | // ======================================================================== //
 16 | 
 17 | #include "cukd/builder.h"
 18 | #include <random>
 19 | #include "cukd/fcp.h"
 20 | 
 21 | #define AS_STRING(x) #x
 22 | #define TO_STRING(x) AS_STRING(x)
 23 | 
 24 | namespace example1 {
 25 |   /*! user data type: a float3 position plus an int payload */
 26 |   struct PointPlusPayload {
 27 |     float3 position;
 28 |     int    payload;
 29 |   };
 30 | 
 31 |   /*! traits telling cukd how to get the position out of a PointPlusPayload */
 32 |   struct PointPlusPayload_traits : public cukd::default_data_traits<float3>
 33 |   {
 34 |     using point_t = float3;
 35 | 
 36 |     static inline __device__ __host__
 37 |     float3 get_point(const PointPlusPayload &data)
 38 |     { return data.position; }
 39 | 
 40 |     static inline __device__ __host__
 41 |     float  get_coord(const PointPlusPayload &data, int dim)
 42 |     { return cukd::get_coord(get_point(data),dim); }
 43 | 
 44 |     enum { has_explicit_dim = false };
 45 | 
 46 |     /*! just defining this for completeness: get_dim should never get
 47 |       called for this type because has_explicit_dim is set to
 48 |       false. note traversal should ONLY ever call this function for
 49 |       data_t's that define has_explicit_dim to true */
 50 |     static inline __device__ int  get_dim(const PointPlusPayload &) { return -1; }
 51 |   };
 52 | 
 53 |   int divRoundUp(int a, int b) { return (a+b-1)/b; }
 54 | 
 55 |   /*! kernel: each thread with tid < numData runs one fcp query at data[tid] */
 56 |   __global__ void callFCP(PointPlusPayload *data, int numData,
 57 |                           cukd::box_t<float3> *d_worldBounds)
 58 |   {
 59 |     int tid = threadIdx.x+blockIdx.x*blockDim.x; // global thread index
 60 |     if (tid >= numData) return;
 61 |     // the query result is not used any further in this sample
 62 |     int result = cukd::stackBased::fcp<PointPlusPayload,PointPlusPayload_traits>
 63 |       (data[tid].position,*d_worldBounds,data,numData);
 64 |   }
 65 | 
 66 |   /*! builds the tree over 'data', then launches one fcp query per data point */
 67 |   void foo(PointPlusPayload *data, int numData, cukd::box_t<float3> *d_worldBounds)
 68 |   {
 69 |     cukd::buildTree</* type of the data: */PointPlusPayload,
 70 |                     /* traits for this data: */PointPlusPayload_traits>
 71 |       (data,numData,d_worldBounds);
 72 | 
 73 |     callFCP<<<divRoundUp(numData,128),128>>>(data,numData,d_worldBounds);
 74 |   }
 75 | 
 76 |   /*! runs foo() on 1000 random points and prints the world bounds box */
 77 |   void test()
 78 |   {
 79 |     std::cout << "testing `buildTree` on 'PointPlusPayloads' array"
 80 |       " (float3 plus payload), 1000 random data." << std::endl;
 81 | 
 82 |     int numPointPlusPayloads = 1000;
 83 |     PointPlusPayload *data = 0;
 84 |     CUKD_CUDA_CALL(MallocManaged((void **)&data,numPointPlusPayloads*sizeof(PointPlusPayload)));
 85 | 
 86 |     std::default_random_engine rd;
 87 |     std::mt19937 gen(rd());
 88 |     std::uniform_real_distribution<float> dist(0.f,100.f);
 89 |     for (int i=0;i<numPointPlusPayloads;i++) {
 90 |       data[i].position.x = dist(gen);
 91 |       data[i].position.y = dist(gen);
 92 |       data[i].position.z = dist(gen);
 93 |       data[i].payload    = i;
 94 |     }
 95 |     cukd::box_t<float3> *worldBounds = 0;
 96 |     CUKD_CUDA_CALL(MallocManaged((void **)&worldBounds,sizeof(*worldBounds)));
 97 | 
 98 |     foo(data,numPointPlusPayloads,worldBounds);
 99 |     // foo() leaves the callFCP kernel in flight: wait for it (and check
100 |     // for errors) before the host reads managed memory again
101 |     CUKD_CUDA_SYNC_CHECK();
102 |     std::cout << "world bounds is " << *worldBounds << std::endl;
103 |     CUKD_CUDA_CALL(Free(data));
104 |     CUKD_CUDA_CALL(Free(worldBounds));
105 |   }
106 | 
107 | }
108 | 
109 | int main(int, const char **)
110 | {
111 |   // run the readme sample, then sync-check for CUDA errors
112 |   example1::test();
113 |   CUKD_CUDA_SYNC_CHECK();
114 |   return 0;
115 | }
116 | 


--------------------------------------------------------------------------------