├── CMakeLists.txt ├── LICENSE ├── README.md ├── Utils ├── IDFactory.cpp ├── IDFactory.h ├── Timing.cpp ├── Timing.h ├── cuda_helper.cu └── cuda_helper.h ├── demo ├── CMakeLists.txt └── main.cu ├── images └── screenshot.jpg ├── include ├── ActivationTable.h ├── Common.h ├── PointSet.h └── cuNSearch.h └── src ├── GridInfo.h ├── NotImplementedException.h ├── PointSet.cu ├── PointSetImplementation.cu ├── PointSetImplementation.h ├── Types.h ├── cuNSearch.cu ├── cuNSearchDeviceData.cu ├── cuNSearchDeviceData.h ├── cuNSearchKernels.cu ├── cuNSearchKernels.cuh ├── helper_linearIndex.h └── helper_mortonCode.h /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) 2 | # Requires CMake 3.12 for first-class CUDA support with Visual Studio. 3 | 4 | project(cuNSearch LANGUAGES CXX CUDA) 5 | # Visual Studio solution directories. 6 | set_property(GLOBAL PROPERTY USE_FOLDERS ON) 7 | 8 | # Selects cuNSearch::Real: double when ON, float when OFF. 9 | option(CUNSEARCH_USE_DOUBLE_PRECISION "Use double precision." 
ON) 10 | 11 | if(CUNSEARCH_USE_DOUBLE_PRECISION) 12 | message(STATUS "cuNSearch::Real = double") 13 | else() 14 | message(STATUS "cuNSearch::Real = float") 15 | endif() 16 | 17 | # CUNSEARCH_USE_DOUBLE_PRECISION is attached to the cuNSearch target as a PUBLIC 18 | # compile definition further below, so every consumer of the target sees the 19 | # same cuNSearch::Real as the library itself. 20 | 21 | option(BUILD_AS_SHARED_LIBS "Build all the libraries as shared" OFF) 22 | if(BUILD_AS_SHARED_LIBS) 23 | if(WIN32) 24 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON CACHE BOOL "Export all symbols") 25 | endif() 26 | endif() 27 | 28 | set(CMAKE_CXX_STANDARD 14) 29 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 30 | set(CMAKE_DEBUG_POSTFIX "_d") 31 | 32 | set (INCLUDE_HEADERS 33 | include/PointSet.h 34 | include/ActivationTable.h 35 | include/Common.h 36 | include/cuNSearch.h 37 | ) 38 | 39 | set (HEADER_FILES 40 | src/Types.h 41 | src/cuNSearchDeviceData.h 42 | src/GridInfo.h 43 | src/NotImplementedException.h 44 | src/PointSetImplementation.h 45 | src/cuNSearchKernels.cuh 46 | src/helper_linearIndex.h 47 | src/helper_mortonCode.h 48 | Utils/cuda_helper.h 49 | Utils/Timing.h 50 | Utils/IDFactory.h 51 | ) 52 | 53 | set (SOURCE_FILES 54 | src/PointSet.cu 55 | src/PointSetImplementation.cu 56 | src/cuNSearch.cu 57 | src/cuNSearchDeviceData.cu 58 | src/cuNSearchKernels.cu 59 | Utils/cuda_helper.cu 60 | Utils/Timing.cpp 61 | Utils/IDFactory.cpp 62 | ) 63 | 64 | if(BUILD_AS_SHARED_LIBS) 65 | add_library(cuNSearch SHARED ${INCLUDE_HEADERS} ${HEADER_FILES} ${SOURCE_FILES}) 66 | else() 67 | add_library(cuNSearch STATIC ${INCLUDE_HEADERS} ${HEADER_FILES} ${SOURCE_FILES}) 68 | endif() 69 | 70 | include(GenerateExportHeader) 71 | generate_export_header(cuNSearch 72 | BASE_NAME cuNSearch 73 | EXPORT_MACRO_NAME cuNSearch_EXPORT 74 | EXPORT_FILE_NAME ${CMAKE_CURRENT_BINARY_DIR}/cuNSearch/cuNSearch_export.h 75 | ) 76 | 77 | target_include_directories(cuNSearch PUBLIC 78 | "include" 79 | "Utils" 80 | ${CUDA_INCLUDE_DIRS} 
81 | ${CMAKE_CURRENT_BINARY_DIR}/cuNSearch 82 | ) 83 | target_link_libraries(cuNSearch PUBLIC ${CUDA_LIBRARIES}) 84 | target_compile_definitions(cuNSearch PUBLIC $<$<CONFIG:DEBUG>:DEBUG> $<$<BOOL:${CUNSEARCH_USE_DOUBLE_PRECISION}>:CUNSEARCH_USE_DOUBLE_PRECISION>) 85 | 86 | list(APPEND INCLUDE_HEADERS 87 | ${CMAKE_CURRENT_BINARY_DIR}/cuNSearch/cuNSearch_export.h) 88 | 89 | install(FILES ${INCLUDE_HEADERS} 90 | DESTINATION include/) 91 | 92 | install(TARGETS cuNSearch 93 | RUNTIME DESTINATION bin 94 | LIBRARY DESTINATION lib 95 | ARCHIVE DESTINATION lib 96 | ) 97 | 98 | option(BUILD_DEMO "Build example of how to use this library." 99 | ON) 100 | if(BUILD_DEMO) 101 | add_subdirectory(demo) 102 | endif() 103 | 104 | 105 | unset(USE_DOUBLE_PRECISION CACHE) 106 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019-present, cuNSearch contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cuNSearch 2 | A C++/CUDA library to efficiently compute neighborhood information on the GPU for 3D point clouds within a fixed radius. Suitable for many applications, e.g. neighborhood search for SPH fluid simulations. 3 | 4 | The library interface is similar to the CPU neighborhood search [CompactNSearch](https://github.com/InteractiveComputerGraphics/CompactNSearch). 5 | ## Libraries using cuNSearch 6 | 7 | * [SPlisHSPlasH](https://github.com/InteractiveComputerGraphics/SPlisHSPlasH) - A C++ library for the physically-based simulation of fluids using Smoothed Particle Hydrodynamics (see screenshot) 8 | 9 | ![](images/screenshot.jpg) 10 | 11 | ## Build Instructions 12 | 13 | This project is based on [CMake](https://cmake.org/). Simply generate project, Makefiles, etc. using [CMake](https://cmake.org/) and compile the project with the compiler of your choice. 14 | 15 | Requirements: 16 | - CMake 3.12 17 | - CUDA SDK 9.0 or newer 18 | - C++ 11 19 | 20 | The code was tested with the following configurations: 21 | - Windows 10 64-bit, CMake 3.12.3, Visual Studio 2017, CUDA SDK 10.1 22 | - Debian 9 64-bit, CMake 3.12.3, GCC 6.3.0, CUDA SDK 9.2 23 | 24 | 25 | ## Usage 26 | A data structure to perform a neighborhood search can be created by calling the constructor given a fixed search radius ```r```. 27 | ```c++ 28 | cuNSearch::NeighborhoodSearch nsearch(r); 29 | ``` 30 | An arbitrary number of point clouds can then be added to the data structure using the method ```add_point_set```. 
The library expects the point positions to be contiguously stored in an array-like structure. The method will return a unique id associated with the initialized point set. 31 | ```c++ 32 | std::vector<std::array<Real, 3>> positions; 33 | // ... Fill array with 3 * n real numbers representing three-dimensional point positions. 34 | unsigned int point_set_id = nsearch.add_point_set(positions.front().data(), positions.size()); 35 | nsearch.find_neighbors(); 36 | ``` 37 | In order to generate the neighborhood information simply execute the following command 38 | ```c++ 39 | nsearch.find_neighbors(); 40 | ``` 41 | Finally, the neighborhood information can be accessed as follows 42 | ```c++ 43 | PointSet const& ps = nsearch.point_set(point_set_id); 44 | for (int i = 0; i < ps.n_points(); ++i) 45 | { 46 | for (int j = 0; j < ps.n_neighbors(0, i); ++j) 47 | { 48 | // Return the index of the jth neighbor of the ith particle in the 0th point set. 49 | unsigned int const pid = ps.neighbor(0, i, j); 50 | // ... 51 | // Do whatever you want with the point id. The id is the index of the 52 | // neighboring particle within the point set that was searched, i.e. the 53 | // point set whose id was passed as the first argument of neighbor 54 | // (here the 0th point set, as returned by add_point_set). 55 | // ... 56 | } 57 | } 58 | ``` 59 | 60 | Besides the basic functionality the library offers to compute a rule for reordering the points according to a space-filling Z curve. The reordering will improve the performance of future neighborhood queries and accesses. The rule can be computed via 61 | ```c++ 62 | nsearch.z_sort(); 63 | ``` 64 | Please note that the actual reordering must be invoked by the user by 65 | ```c++ 66 | ps.sort_field(positions.data()); 67 | ``` 68 | Assuming that there is additional information stored per-point (e.g. velocity, color, mass etc.) the information **must** also be reordered using the same method to maintain consistency. 
Subsequently, the ```find_neighbors``` function has to be invoked again to update the neighborhood information. 69 | 70 | Another self-explanatory (benchmark) [demo](demo/main.cu) is contained in the project. 71 | 72 | ## Activation Table 73 | 74 | When maintaining multiple point sets, it is sometimes desired that only certain point sets can find points from other point sets. Therefore an activation table is implemented where the user can specify whether a point set i searches points in another point set j. When nothing else is specified all point sets will search points in all other point sets. The activation table can be modified with e.g. 75 | ```c++ 76 | nsearch.set_active(i, j, false); 77 | ``` 78 | 79 | ## Common mistakes and issues 80 | 81 | Visual Studio may not detect changes in ".cu" files. 82 | 83 | Use of thrust library in cpp files: Some thrust classes can only be used when the file is compiled by the nvidia compiler nvcc. 84 | This is usually solved by changing the file ending to .cu to mark the file for the nvcc compiler. 85 | 86 | ## References 87 | 88 | * R. Hoetzlein, 2014. "Fast Fixed-Radius Nearest Neighbors: Interactive Million-Particle Fluids", GPU Technology Conference (GTC), Santa Clara, CA. 89 | -------------------------------------------------------------------------------- /Utils/IDFactory.cpp: -------------------------------------------------------------------------------- 1 | #include "IDFactory.h" 2 | 3 | using namespace cuNSearch; 4 | 5 | int IDFactory::id = 0; 6 | 7 | -------------------------------------------------------------------------------- /Utils/IDFactory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace cuNSearch 4 | { 5 | /** Factory for unique ids. 
6 | */ 7 | class IDFactory 8 | { 9 | private: 10 | /** Current id */ 11 | static int id; 12 | 13 | public: 14 | static int getId() { return id++; } 15 | }; 16 | } 17 | 18 | -------------------------------------------------------------------------------- /Utils/Timing.cpp: -------------------------------------------------------------------------------- 1 | #include "Timing.h" 2 | 3 | using namespace cuNSearch; 4 | 5 | std::unordered_map<int, AverageTime> Timing::m_averageTimes; 6 | std::stack<TimingHelper> Timing::m_timingStack; 7 | bool Timing::m_dontPrintTimes = false; 8 | unsigned int Timing::m_startCounter = 0; 9 | unsigned int Timing::m_stopCounter = 0; 10 | -------------------------------------------------------------------------------- /Utils/Timing.h: -------------------------------------------------------------------------------- 1 | #ifndef __TIMING_H__ 2 | #define __TIMING_H__ 3 | 4 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) 5 | #define FORCE_INLINE __forceinline 6 | #else 7 | #define FORCE_INLINE __attribute__((always_inline)) 8 | #endif 9 | 10 | #include <iostream> 11 | #include <stack> 12 | #include <unordered_map> 13 | //#include "Common/Common.h" 14 | 15 | #include <chrono> 16 | #include "IDFactory.h" 17 | #include "cuNSearch_export.h" 18 | 19 | namespace cuNSearch 20 | { 21 | struct TimingHelper 22 | { 23 | std::chrono::time_point<std::chrono::high_resolution_clock> start; 24 | std::string name; 25 | }; 26 | 27 | struct AverageTime 28 | { 29 | double totalTime; 30 | unsigned int counter; 31 | std::string name; 32 | }; 33 | 34 | class Timing 35 | { 36 | public: 37 | static cuNSearch_EXPORT bool m_dontPrintTimes; 38 | static cuNSearch_EXPORT unsigned int m_startCounter; 39 | static cuNSearch_EXPORT unsigned int m_stopCounter; 40 | static cuNSearch_EXPORT std::stack<TimingHelper> m_timingStack; 41 | static cuNSearch_EXPORT std::unordered_map<int, AverageTime> m_averageTimes; 42 | 43 | static void reset() 44 | { 45 | while (!m_timingStack.empty()) 46 | m_timingStack.pop(); 47 | m_averageTimes.clear(); 48 | m_startCounter = 0; 49 | m_stopCounter = 0; 50 | } 51 | 52 | FORCE_INLINE 
static void startTiming(const std::string& name = std::string("")) 53 | { 54 | TimingHelper h; 55 | h.start = std::chrono::high_resolution_clock::now(); 56 | h.name = name; 57 | Timing::m_timingStack.push(h); 58 | Timing::m_startCounter++; 59 | } 60 | 61 | FORCE_INLINE static double stopTiming(bool print = true) 62 | { 63 | if (!Timing::m_timingStack.empty()) 64 | { 65 | Timing::m_stopCounter++; 66 | std::chrono::time_point<std::chrono::high_resolution_clock> stop = std::chrono::high_resolution_clock::now(); 67 | TimingHelper h = Timing::m_timingStack.top(); 68 | Timing::m_timingStack.pop(); 69 | std::chrono::duration<double> elapsed_seconds = stop - h.start; 70 | double t = elapsed_seconds.count() * 1000.0; 71 | 72 | if (print && !Timing::m_dontPrintTimes) // honor the global mute flag, like the id-based overload below 73 | std::cout << "time " << h.name.c_str() << ": " << t << " ms\n" << std::flush; 74 | return t; 75 | } 76 | return 0; 77 | } 78 | 79 | FORCE_INLINE static double stopTiming(bool print, int &id) 80 | { 81 | if (id == -1) 82 | id = IDFactory::getId(); 83 | if (!Timing::m_timingStack.empty()) 84 | { 85 | Timing::m_stopCounter++; 86 | std::chrono::time_point<std::chrono::high_resolution_clock> stop = std::chrono::high_resolution_clock::now(); 87 | TimingHelper h = Timing::m_timingStack.top(); 88 | Timing::m_timingStack.pop(); 89 | 90 | std::chrono::duration<double> elapsed_seconds = stop - h.start; 91 | double t = elapsed_seconds.count() * 1000.0; 92 | 93 | if (print && !Timing::m_dontPrintTimes) 94 | std::cout << "time " << h.name.c_str() << ": " << t << " ms\n" << std::flush; 95 | 96 | if (id >= 0) 97 | { 98 | std::unordered_map<int, AverageTime>::iterator iter; 99 | iter = Timing::m_averageTimes.find(id); 100 | if (iter != Timing::m_averageTimes.end()) 101 | { 102 | Timing::m_averageTimes[id].totalTime += t; 103 | Timing::m_averageTimes[id].counter++; 104 | } 105 | else 106 | { 107 | AverageTime at; 108 | at.counter = 1; 109 | at.totalTime = t; 110 | at.name = h.name; 111 | Timing::m_averageTimes[id] = at; 112 | } 113 | } 114 | return t; 115 | } 116 | return 0; 117 | } 118 | 119 | FORCE_INLINE static void printAverageTimes() 120 | { 121 | 
std::unordered_map<int, AverageTime>::iterator iter; 122 | for (iter = Timing::m_averageTimes.begin(); iter != Timing::m_averageTimes.end(); iter++) 123 | { 124 | AverageTime &at = iter->second; 125 | const double avgTime = at.totalTime / at.counter; 126 | std::cout << "Average time " << at.name.c_str() << ": " << avgTime << " ms\n" << std::flush; 127 | } 128 | if (Timing::m_startCounter != Timing::m_stopCounter) 129 | std::cout << "Problem: " << Timing::m_startCounter << " calls of startTiming and " << Timing::m_stopCounter << " calls of stopTiming.\n " << std::flush; 130 | std::cout << "---------------------------------------------------------------------------\n\n"; 131 | } 132 | 133 | FORCE_INLINE static void printTimeSums() 134 | { 135 | std::unordered_map<int, AverageTime>::iterator iter; 136 | for (iter = Timing::m_averageTimes.begin(); iter != Timing::m_averageTimes.end(); iter++) 137 | { 138 | AverageTime &at = iter->second; 139 | const double timeSum = at.totalTime; 140 | std::cout << "Time sum " << at.name.c_str() << ": " << timeSum << " ms\n" << std::flush; 141 | } 142 | if (Timing::m_startCounter != Timing::m_stopCounter) 143 | std::cout << "Problem: " << Timing::m_startCounter << " calls of startTiming and " << Timing::m_stopCounter << " calls of stopTiming.\n " << std::flush; 144 | std::cout << "---------------------------------------------------------------------------\n\n"; 145 | } 146 | }; 147 | 148 | } 149 | 150 | #endif -------------------------------------------------------------------------------- /Utils/cuda_helper.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_helper.h" 2 | #include <cuda_runtime.h> 3 | 4 | CUDAException::CUDAException(const char *_const_Message) : std::runtime_error(_const_Message) 5 | { 6 | 7 | } 8 | 9 | CUDAMallocException::CUDAMallocException(const char *_const_Message) : std::runtime_error(_const_Message) 10 | { 11 | 12 | } 13 | 14 | CUDAMemCopyException::CUDAMemCopyException(const char *_const_Message) : 
std::runtime_error(_const_Message) 15 | { 16 | 17 | } 18 | 19 | void CudaHelper::DeviceSynchronize() 20 | { 21 | cudaError_t cudaStatus = cudaDeviceSynchronize(); 22 | if (cudaStatus != cudaSuccess) 23 | { 24 | auto temp = cudaGetErrorString(cudaStatus); 25 | throw CUDAException(temp); 26 | } 27 | } 28 | 29 | void CudaHelper::GetThreadBlocks(unsigned int numberOfElements, unsigned int alignment, /*out*/ unsigned int &numberOfThreadBlocks, /*out*/ unsigned int &numberOfThreads) 30 | { 31 | numberOfThreads = (numberOfElements / alignment) * alignment; 32 | numberOfThreadBlocks = (numberOfElements / alignment); 33 | if (numberOfElements % alignment != 0) 34 | { 35 | numberOfThreads += alignment; 36 | numberOfThreadBlocks++; 37 | } 38 | } 39 | 40 | void CudaHelper::MemcpyHostToDevice(void* host, void* device, size_t size) 41 | { 42 | cudaError_t cudaStatus = cudaMemcpy(device, host, size, cudaMemcpyHostToDevice); 43 | if (cudaStatus != cudaSuccess) 44 | { 45 | throw CUDAMemCopyException("cudaMemcpy() failed!"); 46 | } 47 | } 48 | 49 | void CudaHelper::MemcpyDeviceToHost(void* device, void* host, size_t size) 50 | { 51 | cudaError_t cudaStatus = cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost); 52 | if (cudaStatus != cudaSuccess) 53 | { 54 | throw CUDAMemCopyException("cudaMemcpy() failed!"); 55 | } 56 | } 57 | 58 | void CudaHelper::CheckLastError() 59 | { 60 | cudaError_t cudaStatus = cudaGetLastError(); 61 | if (cudaStatus != cudaSuccess) 62 | { 63 | auto temp = cudaGetErrorString(cudaStatus); 64 | throw CUDAException(temp); 65 | } 66 | } -------------------------------------------------------------------------------- /Utils/cuda_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <thrust/device_vector.h> 3 | 4 | class CUDAException : public std::runtime_error 5 | { 6 | public: 7 | CUDAException(const char *_const_Message); 8 | }; 9 | 10 | class CUDAMallocException : public std::runtime_error 11 | { 12 | public: 13 | 
CUDAMallocException(const char *_const_Message); 14 | }; 15 | 16 | class CUDAMemCopyException : public std::runtime_error 17 | { 18 | public: 19 | CUDAMemCopyException(const char *const_Message); 20 | }; 21 | 22 | /*static*/ class CudaHelper 23 | { 24 | public: 25 | /** Synchronizes the device work with the current thread and throws any errors as exception. 26 | */ 27 | static void DeviceSynchronize(); 28 | 29 | 30 | /** Throws the last error as exception. 31 | */ 32 | static void CheckLastError(); 33 | 34 | static void GetThreadBlocks(unsigned int numberOfElements, unsigned int alignment, /*out*/ unsigned int &numberOfThreadBlocks, /*out*/ unsigned int &numberOfThreads); 35 | 36 | 37 | /** Gets a raw pointer from a thrust vector 38 | */ 39 | template <typename T> 40 | static T* GetPointer(thrust::device_vector<T> &vector) 41 | { 42 | return thrust::raw_pointer_cast(&vector[0]); 43 | } 44 | 45 | /** Gets the size of the device_vector data in bytes. 46 | */ 47 | template <typename T> 48 | static size_t GetSizeInBytes(const thrust::device_vector<T> &vector) 49 | { 50 | return sizeof(T) * vector.size(); 51 | } 52 | 53 | /** Copies data from host to device. 54 | */ 55 | static void MemcpyHostToDevice(void* host, void* device, size_t size); 56 | 57 | /** Copies data from host to device. 58 | */ 59 | template <typename T> 60 | static void MemcpyHostToDevice(T* host, T* device, size_t elements) 61 | { 62 | MemcpyHostToDevice((void*)host, (void*)device, elements * sizeof(T)); 63 | } 64 | 65 | /** Copies data from device to host. 66 | */ 67 | static void MemcpyDeviceToHost(void* device, void* host, size_t size); 68 | 69 | /** Copies data from device to host. 
70 | */ 71 | template <typename T> 72 | static void MemcpyDeviceToHost(T* device, T* host, size_t elements) 73 | { 74 | MemcpyDeviceToHost((void*)device, (void*)host, elements * sizeof(T)); 75 | } 76 | 77 | }; -------------------------------------------------------------------------------- /demo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(Demo 2 | main.cu 3 | ) 4 | add_dependencies(Demo cuNSearch) 5 | target_link_libraries(Demo PRIVATE cuNSearch) 6 | -------------------------------------------------------------------------------- /demo/main.cu: -------------------------------------------------------------------------------- 1 | #include "cuNSearch.h" 2 | #include "Timing.h" 3 | 4 | #include <algorithm> 5 | #include <array> 6 | #include <cmath> 7 | #include <iostream> 8 | #include <limits> 9 | #include <vector> 10 | 11 | using namespace cuNSearch; 12 | 13 | using Real3 = std::array<Real, 3>; 14 | std::vector<Real3> positions; 15 | 16 | inline Real3 operator-(const Real3 & left, const Real3 & right) 17 | { 18 | return Real3{ left[0] - right[0], left[1] - right[1], left[2] - right[2] }; 19 | } 20 | 21 | std::size_t const N = 120; 22 | Real const r_omega = static_cast<Real>(0.15); 23 | Real const r_omega2 = r_omega * r_omega; 24 | Real const radius = static_cast<Real>(2.0) * (static_cast<Real>(2.0) * r_omega / static_cast<Real>(N - 1)); 25 | 26 | void testCuNSearch() 27 | { 28 | //Generate test data 29 | Real min_x = std::numeric_limits<Real>::max(); 30 | Real max_x = std::numeric_limits<Real>::lowest(); // min() is the smallest positive value for floating-point types 31 | positions.reserve(N * N * N); 32 | for (unsigned int i = 0; i < N; ++i) 33 | { 34 | for (unsigned int j = 0; j < N; ++j) 35 | { 36 | for (unsigned int k = 0; k < N; ++k) 37 | { 38 | std::array<Real, 3> x = { { 39 | r_omega * static_cast<Real>(2.0 * static_cast<double>(i) / static_cast<double>(N - 1) - 1.0), 40 | r_omega * static_cast<Real>(2.0 * static_cast<double>(j) / static_cast<double>(N - 1) - 1.0), 41 | r_omega * static_cast<Real>(2.0 * static_cast<double>(k) / static_cast<double>(N - 1) - 1.0) } }; 42 | 43 | Real l2 = x[0] * x[0] + x[1] * x[1] + x[2] * x[2]; 44 | if (l2 < r_omega2) 
45 | { 46 | x[0] += static_cast<Real>(0.35); 47 | x[1] += static_cast<Real>(0.35); 48 | x[2] += static_cast<Real>(0.35); 49 | positions.push_back(x); 50 | if (min_x > x[0]) 51 | { 52 | min_x = x[0]; 53 | } 54 | if (max_x < x[0]) 55 | { 56 | max_x = x[0]; 57 | } 58 | } 59 | } 60 | } 61 | } 62 | std::random_shuffle(positions.begin(), positions.end()); 63 | printf("Number of particles: %d \n", static_cast<int>(positions.size())); 64 | 65 | //Create neighborhood search instance 66 | NeighborhoodSearch nsearch(radius); 67 | 68 | //Add point set from the test data 69 | auto pointSetIndex = nsearch.add_point_set(positions.front().data(), positions.size(), true, true); 70 | 71 | for (size_t i = 0; i < 5; i++) 72 | { 73 | if (i != 0) 74 | { 75 | nsearch.z_sort(); 76 | nsearch.point_set(pointSetIndex).sort_field((Real3*)nsearch.point_set(pointSetIndex).GetPoints()); 77 | } 78 | 79 | Timing::reset(); 80 | nsearch.find_neighbors(); 81 | Timing::printAverageTimes(); 82 | } 83 | 84 | //Neighborhood search result test 85 | auto &pointSet = nsearch.point_set(pointSetIndex); 86 | auto points = pointSet.GetPoints(); 87 | 88 | std::cout << "Validate results" << std::endl; 89 | for (unsigned int i = 0; i < pointSet.n_points(); i++) 90 | { 91 | Real3 point = ((Real3*)points)[i]; 92 | auto count = pointSet.n_neighbors(0, i); 93 | for (unsigned int j = 0; j < count; j++) 94 | { 95 | auto neighbor = pointSet.neighbor(0, i, j); 96 | auto diff = point - ((Real3*)points)[neighbor]; 97 | Real squaredLength = diff[0] * diff[0] + diff[1] * diff[1] + diff[2] * diff[2]; // keep the precision of Real; float would round when Real is double 98 | Real distance = sqrt(squaredLength); 99 | 100 | if (distance > radius) 101 | { 102 | throw std::runtime_error("Not a neighbor"); 103 | } 104 | } 105 | } 106 | } 107 | 108 | int main(int argc, char* argv[]) 109 | { 110 | #ifdef DEBUG 111 | std::cout << "Debug Build:" << std::endl; 112 | 113 | if(sizeof(Real) == 4) 114 | std::cout << "Real = float" << std::endl; 115 | else if (sizeof(Real) == 8) 116 | std::cout << "Real = double" << std::endl; 117 | #endif 118 
| 119 | testCuNSearch(); 120 | std::cout << "Finished Testing" << std::endl; 121 | getchar(); 122 | } 123 | -------------------------------------------------------------------------------- /images/screenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InteractiveComputerGraphics/cuNSearch/4dd5598824bb7dfceae86d89758a6be74e7a9ac4/images/screenshot.jpg -------------------------------------------------------------------------------- /include/ActivationTable.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // This is a public header. Avoid references to cuda or other external references. 3 | 4 | #include <vector> 5 | 6 | namespace cuNSearch 7 | { 8 | class ActivationTable 9 | { 10 | private: 11 | std::vector<std::vector<unsigned char>> m_table; 12 | 13 | public: 14 | 15 | bool operator==(ActivationTable const& other) const 16 | { 17 | return m_table == other.m_table; 18 | } 19 | 20 | bool operator!=(ActivationTable const& other) const 21 | { 22 | return !(m_table == other.m_table); 23 | } 24 | 25 | /** Add point set. If search_neighbors is true, neighbors in all other point sets are searched. 26 | * If find_neighbors is true, the new point set is activated in the neighborhood search of all other point sets. 27 | */ 28 | void add_point_set(bool search_neighbors = true, bool find_neighbors = true) 29 | { 30 | // add column to each row 31 | auto size = m_table.size(); 32 | for (auto i = 0u; i < size; i++) 33 | { 34 | m_table[i].resize(size + 1); 35 | m_table[i][size] = static_cast<unsigned char>(find_neighbors); 36 | } 37 | 38 | // add new row 39 | m_table.resize(size + 1); 40 | m_table[size].resize(size + 1); 41 | for (auto i = 0u; i < size + 1; i++) 42 | m_table[size][i] = static_cast<unsigned char>(search_neighbors); 43 | } 44 | 45 | /** Activate/Deactivate that neighbors in point set index2 are found when searching for neighbors of point set index1. 
46 | */ 47 | void set_active(unsigned int index1, unsigned int index2, bool active) 48 | { 49 | m_table[index1][index2] = static_cast<unsigned char>(active); 50 | } 51 | 52 | /** Activate/Deactivate all point set pairs containing the given index. If search_neighbors is true, neighbors in all other point sets are searched. 53 | * If find_neighbors is true, the new point set is activated in the neighborhood search of all other point sets. 54 | */ 55 | void set_active(unsigned int index, bool search_neighbors = true, bool find_neighbors = true) 56 | { 57 | auto size = m_table.size(); 58 | for (auto i = 0u; i < size; i++) 59 | { 60 | m_table[i][index] = static_cast<unsigned char>(find_neighbors); 61 | m_table[index][i] = static_cast<unsigned char>(search_neighbors); 62 | } 63 | m_table[index][index] = static_cast<unsigned char>(search_neighbors && find_neighbors); 64 | } 65 | 66 | /** Activate/Deactivate all point set pairs. 67 | */ 68 | void set_active(bool active) 69 | { 70 | auto size = m_table.size(); 71 | for (auto i = 0u; i < size; i++) 72 | for (auto j = 0u; j < size; j++) 73 | m_table[i][j] = static_cast<unsigned char>(active); 74 | } 75 | 76 | bool is_active(unsigned int index1, unsigned int index2) const 77 | { 78 | return m_table[index1][index2] != 0; 79 | } 80 | 81 | bool is_searching_neighbors(unsigned int const index) const 82 | { 83 | for (auto i = 0u; i < m_table[index].size(); i++) 84 | { 85 | if (m_table[index][i]) 86 | { 87 | return true; 88 | } 89 | } 90 | return false; 91 | } 92 | }; 93 | } 94 | -------------------------------------------------------------------------------- /include/Common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // This is a public header. Avoid references to cuda or other external references. 
3 | 4 | namespace cuNSearch 5 | { 6 | typedef unsigned long long ulong; 7 | typedef unsigned short ushort; 8 | typedef unsigned int uint; 9 | typedef unsigned char byte; 10 | 11 | #ifdef CUNSEARCH_USE_DOUBLE_PRECISION 12 | using Real = double; 13 | #else 14 | using Real = float; 15 | #endif 16 | } -------------------------------------------------------------------------------- /include/PointSet.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // This is a public header. Avoid references to cuda or other external references. 3 | 4 | #include <algorithm> 5 | #include <memory> 6 | #include <utility> 7 | #include <vector> 8 | 9 | #include "Common.h" 10 | 11 | namespace cuNSearch 12 | { 13 | class NeighborhoodSearch; 14 | class PointSetImplementation; 15 | class cuNSearchDeviceData; 16 | 17 | template <typename T, typename... Args> 18 | std::unique_ptr<T> make_unique(Args&&... args) 19 | { 20 | return std::unique_ptr<T>(new T(std::forward<Args>(args)...)); 21 | } 22 | 23 | /** 24 | * @class PointSet. 25 | * Represents a set of points in three-dimensional space. 26 | */ 27 | class PointSet 28 | { 29 | struct NeighborSet 30 | { 31 | //Pinned memory 32 | uint NeighborCountAllocationSize; 33 | uint ParticleCountAllocationSize; 34 | uint *Counts; 35 | uint *Offsets; 36 | uint *Neighbors; 37 | 38 | NeighborSet() 39 | { 40 | NeighborCountAllocationSize = 0u; 41 | ParticleCountAllocationSize = 0u; 42 | Counts = nullptr; 43 | Offsets = nullptr; 44 | Neighbors = nullptr; 45 | } 46 | }; 47 | 48 | public: 49 | ///** 50 | //* Copy constructor. 51 | //*/ 52 | PointSet(PointSet const& other); 53 | 54 | 55 | ~PointSet(); 56 | //Define destructor in cpp file to allow unique_ptr to incomplete type. 57 | //https://stackoverflow.com/questions/9954518/stdunique-ptr-with-an-incomplete-type-wont-compile 58 | 59 | /** 60 | * Returns the number of neighbors of point i in the given point set. 61 | * @param i Point index. 62 | * @returns Number of points neighboring point i in point set point_set. 
63 | */ 64 | inline std::size_t n_neighbors(unsigned int point_set, unsigned int i) const 65 | { 66 | return neighbors[point_set].Counts[i]; 67 | } 68 | 69 | /** 70 | * Fetches id pair of kth neighbor of point i in the given point set. 71 | * @param point_set Point set index of other point set where neighbors have been searched. 72 | * @param i Point index for which the neighbor id should be returned. 73 | * @param k Represents kth neighbor of point i. 74 | * @returns Index of neighboring point i in point set point_set. 75 | */ 76 | inline unsigned int neighbor(unsigned int point_set, unsigned int i, unsigned int k) const 77 | { 78 | //Return index of the k-th neighbor to point i (of the given point set) 79 | const auto &neighborSet = neighbors[point_set]; 80 | return neighborSet.Neighbors[neighborSet.Offsets[i] + k]; 81 | } 82 | 83 | /** 84 | * Fetches pointer to neighbors of point i in the given point set. 85 | * @param point_set Point set index of other point set where neighbors have been searched. 86 | * @param i Point index for which the neighbor id should be returned. 87 | * @returns Pointer to ids of neighboring points of i in point set point_set. 88 | */ 89 | inline unsigned int * neighbor_list(unsigned int point_set, unsigned int i) const 90 | { 91 | //Return index of the k-th neighbor to point i (of the given point set) 92 | const auto &neighborSet = neighbors[point_set]; 93 | return &neighborSet.Neighbors[neighborSet.Offsets[i]]; 94 | } 95 | 96 | /** 97 | * @returns the number of points contained in the point set. 98 | */ 99 | std::size_t n_points() const { return m_n; } 100 | 101 | /* 102 | * Returns true, if the point locations may be updated by the user. 103 | **/ 104 | bool is_dynamic() const { return m_dynamic; } 105 | 106 | /** 107 | * If true is passed, the point positions may be altered by the user. 
108 | */ 109 | void set_dynamic(bool v) { m_dynamic = v; } 110 | 111 | Real const* GetPoints() { return m_x; } 112 | 113 | /** 114 | * Return the user data which can be attached to a point set. 115 | */ 116 | void *get_user_data() { return m_user_data; } 117 | 118 | /** 119 | * Reorders an array according to a previously generated sort table by invocation of the method 120 | * "z_sort" of class "NeighborhoodSearch". Please note that the method "z_sort" of class 121 | * "NeighborhoodSearch" has to be called beforehand. 122 | */ 123 | template <typename T> 124 | void sort_field(T* lst) const; 125 | 126 | private: 127 | friend NeighborhoodSearch; 128 | friend cuNSearchDeviceData; 129 | 130 | // Implementation and cuda data are hidden in the PointSetImplementation class to avoid unnecessary dependencies in public headers. 131 | std::unique_ptr<PointSetImplementation> impl; 132 | 133 | PointSet(Real const* x, std::size_t n, bool dynamic, void *user_data = nullptr); 134 | 135 | void resize(Real const* x, std::size_t n); 136 | 137 | Real const* point(unsigned int i) const { return &m_x[3*i]; } 138 | 139 | Real const* m_x; //positions of the points 140 | std::size_t m_n; //# of points in the set 141 | bool m_dynamic; //if false the points do not move and the hash values do not change 142 | void *m_user_data; 143 | 144 | std::vector<uint> sortIndices; 145 | std::vector<NeighborSet> neighbors; 146 | }; 147 | 148 | 149 | template <typename T> 150 | void PointSet::sort_field(T* lst) const 151 | { 152 | std::vector<T> tmp(lst, lst + sortIndices.size()); 153 | std::transform(sortIndices.begin(), sortIndices.end(), 154 | //#ifdef _MSC_VER 155 | // stdext::unchecked_array_iterator<T*>(lst), 156 | //#else 157 | lst, 158 | //#endif 159 | [&](int i) { return tmp[i]; }); 160 | } 161 | 162 | 163 | } 164 | 165 | -------------------------------------------------------------------------------- /include/cuNSearch.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // This is a public header. 
Avoid references to cuda or other external references. 3 | 4 | #include "Common.h" 5 | #include "ActivationTable.h" 6 | #include "PointSet.h" 7 | 8 | namespace cuNSearch 9 | { 10 | class cuNSearchDeviceData; 11 | 12 | /** 13 | * @class NeighborhoodSearch 14 | * Stores point data multiple set of points in which neighborhood information for a fixed 15 | * radius r should be generated. 16 | */ 17 | class NeighborhoodSearch 18 | { 19 | private: 20 | std::vector pointSets; 21 | 22 | public: 23 | 24 | /** 25 | * Constructor. 26 | * Creates a new instance of the neighborhood search class. 27 | * @param r Search radius. If two points are closer to each other than a distance r they are considered neighbors. 28 | */ 29 | NeighborhoodSearch(Real searchRadius); 30 | 31 | /** 32 | * Destructor. 33 | */ 34 | ~NeighborhoodSearch(); 35 | //Define descructor in cpp file to allow unique_ptr to incomplete type. 36 | //https://stackoverflow.com/questions/9954518/stdunique-ptr-with-an-incomplete-type-wont-compile 37 | 38 | 39 | /** 40 | * Get method to access a point set. 41 | * @param i Index of the point set to retrieve. 42 | */ 43 | PointSet const& point_set(unsigned int i) const { return pointSets[i]; } 44 | 45 | /** 46 | * Get method to access a point set. 47 | * @param i Index of the point set to retrieve. 48 | */ 49 | PointSet & point_set(unsigned int i) { return pointSets[i]; } 50 | 51 | /** 52 | * Returns the number of point sets contained in the search. 53 | */ 54 | std::size_t n_point_sets() const { return pointSets.size(); } 55 | 56 | /** 57 | * Get method to access the list of point sets. 58 | */ 59 | std::vector const& point_sets() const { return pointSets; } 60 | 61 | /** 62 | * Get method to access the list of point sets. 63 | */ 64 | std::vector & point_sets() { return pointSets; } 65 | 66 | /** 67 | * Increases the size of a point set under the assumption that the existing points remain at 68 | * the same position. 
69 | * @param i Index of point set that will be resized. 70 | * @param x Pointer to the point position data. Must point to continguous data of 3 * n 71 | * real values. 72 | * @param n Number of points. 73 | */ 74 | void resize_point_set(unsigned int i, Real const* x, std::size_t n); 75 | 76 | 77 | /** 78 | * Creates and adds a new set of points. 79 | * @param x Pointer to the point position data. Must point to continguous data of 3 * n 80 | * real values. 81 | * @param n Number of points. 82 | * @param is_dynamic Specifies whether the point positions will change for future queries. 83 | * @param search_neighbors If true, neighbors in all other point sets are searched. 84 | * @param find_neighbors If true, the new point set is activated in the neighborhood search of all other point sets. 85 | * @returns Returns unique identifier in form of an index assigned to the newly created point 86 | * set. 87 | */ 88 | unsigned int add_point_set(Real const* x, std::size_t n, bool is_dynamic = true, 89 | bool search_neighbors = true, bool find_neighbors = true, void *user_data = nullptr); 90 | 91 | /** 92 | * Performs the actual query. This method will assign a list of neighboring points to each point 93 | * every added point set. 94 | */ 95 | void find_neighbors(bool points_changed = true); 96 | 97 | /** 98 | * Performs the actual query for a single point. This method return a list of neighboring points. Note: That points_changed() must be called each time 99 | * when the positions of a point set changed. 100 | */ 101 | void find_neighbors(unsigned int point_set_id, unsigned int point_index, std::vector> &neighbors); 102 | 103 | /** 104 | * Update neighborhood search data structures after a position change. 105 | * If general find_neighbors() function is called there is no requirement to manually update the point sets. 106 | * Otherwise, in case of using point-wise search (find_neighbors(i, j, neighbors)) the method must be called explicitly. 
*/
        void update_point_sets();

        /**
         * Update neighborhood search data structures after a position change.
         * Has to be called when the positions of a non-dynamic point set change.
         * If general find_neighbors() function is called there is no requirement to manually update the point sets.
         * Otherwise, in case of using point-wise search (find_neighbors(i, j, neighbors)) the method must be called explicitly.
         * @param i Index of the point set to update.
         */
        void update_point_set(int i);

        /**
         * Update neighborhood search data structures after changing the activation table.
         * If general find_neighbors() function is called there is no requirement to manually update the point sets.
         * Otherwise, in case of using point-wise search (find_neighbors(i, j, neighbors)) the method must be called explicitly.
         * Note: the definition in cuNSearch.cu has an empty body.
         */
        void update_activation_table();

        /**
         * Generates a sort table according to a space-filling Z curve. Any array-based per point
         * information can then be reordered using the function sort_field of the PointSet class.
         * Please note that the position data will not be modified by this class, such that the user has
         * to invoke the sort_field function on the position array. Moreover, be aware that the grid has
         * to be reinitialized after each sort. Therefore, the points should not be reordered too
         * frequently.
         * Note: the definition in cuNSearch.cu is a no-op; sorting happens as part of the grid construction.
         */
        void z_sort();

        /**
         * @returns Returns the radius in which point neighbors are searched.
         */
        Real radius() const { return searchRadius; }

        /**
         * Sets the radius in which point neighbors are searched.
         * All point sets are rebuilt on the next call to find_neighbors().
         * @param r Search radius.
         */
        void set_radius(Real r);

        /** Activate/deactivate that neighbors in point set j are found when searching for neighbors of point set i.
         * @param i Index of searching point set.
* @param j Index of point set of which points should/shouldn't be found by point set i.
         * @param active Flag in order to (de)activate that points in i find points in j.
         */
        void set_active(unsigned int i, unsigned int j, bool active)
        {
            m_activation_table.set_active(i, j, active);
        }

        /** Activate/Deactivate all point set pairs containing the given index. If search_neighbors is true, neighbors in all other point sets are searched.
         * If find_neighbors is true, the new point set is activated in the neighborhood search of all other point sets.
         * @param i Index of searching point set.
         * @param search_neighbors If true/false enables/disables that point set i searches points in all other point sets.
         * @param find_neighbors If true/false enables/disables that point set i is found by all other point sets.
         */
        void set_active(unsigned int i, bool search_neighbors = true, bool find_neighbors = true)
        {
            m_activation_table.set_active(i, search_neighbors, find_neighbors);
        }

        /** Activate/Deactivate all point set pairs.
         * @param active New activation state applied to every pair.
         */
        void set_active(bool active)
        {
            m_activation_table.set_active(active);
        }

        /** Returns true if point set i searches points in point set j.
         * @param i Searching point set.
         * @param j Set of points to be found by i.
         */
        bool is_active(unsigned int i, unsigned int j) const
        {
            return m_activation_table.is_active(i, j);
        }

    private:
        // False until find_neighbors() has built the data structures; reset by set_radius().
        bool isInitialized = false;
        Real searchRadius;
        ActivationTable m_activation_table;

        // Implementation and cuda data are hidden in the cuNSearchDeviceData class to avoid unnecessary dependencies in public headers.
190 | std::unique_ptr deviceData; 191 | 192 | void updatePointSet(PointSet &pointSet); 193 | }; 194 | } 195 | -------------------------------------------------------------------------------- /src/GridInfo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Types.h" 3 | 4 | namespace cuNSearch 5 | { 6 | struct GridInfo 7 | { 8 | cuNSearch::Real3 GridMin; 9 | uint ParticleCount; 10 | cuNSearch::Real3 GridDelta; 11 | UInt3 GridDimension; 12 | UInt3 MetaGridDimension; 13 | float SquaredSearchRadius; 14 | }; 15 | } -------------------------------------------------------------------------------- /src/NotImplementedException.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | class NotImplementedException : std::runtime_error 6 | { 7 | public: 8 | NotImplementedException() 9 | : std::runtime_error("Not implemented.") 10 | { 11 | 12 | } 13 | 14 | NotImplementedException(std::string str) 15 | : std::runtime_error("Not implemented: " + str) 16 | { 17 | std::cout << "Not implemented: " << str << std::endl; 18 | } 19 | }; -------------------------------------------------------------------------------- /src/PointSet.cu: -------------------------------------------------------------------------------- 1 | #include "PointSet.h" 2 | #include "PointSetImplementation.h" 3 | #include "cuda_helper.h" 4 | #include "NotImplementedException.h" 5 | 6 | namespace cuNSearch 7 | { 8 | PointSet::PointSet(PointSet const& other) 9 | { 10 | this->m_dynamic = other.m_dynamic; 11 | this->m_x = other.m_x; 12 | this->m_n = other.m_n; 13 | this->m_user_data = other.m_user_data; 14 | this->sortIndices = other.sortIndices; 15 | this->neighbors = other.neighbors; 16 | 17 | PointSetImplementation *ptr = other.impl.get(); 18 | impl = make_unique(PointSetImplementation(*ptr)); 19 | } 20 | 21 | PointSet::~PointSet() 22 | { 23 | 24 | } 25 | 26 | 
PointSet::PointSet(Real const* x, std::size_t n, bool dynamic, void *user_data) 27 | : m_x(x), m_n(n), m_dynamic(dynamic), m_user_data(user_data) 28 | { 29 | impl = make_unique(n, (Real3*)x); 30 | } 31 | 32 | void PointSet::resize(Real const* x, std::size_t n) 33 | { 34 | m_x = x; 35 | m_n = n; 36 | 37 | impl->resize(n, (Real3*)x); 38 | } 39 | }; 40 | -------------------------------------------------------------------------------- /src/PointSetImplementation.cu: -------------------------------------------------------------------------------- 1 | #include "PointSetImplementation.h" 2 | #include "NotImplementedException.h" 3 | 4 | namespace cuNSearch 5 | { 6 | PointSetImplementation::PointSetImplementation(size_t particleCount, Real3 *particles) 7 | { 8 | m_ParticleCount = particleCount; 9 | m_Particles = particles; 10 | 11 | uint threadStarts = 0; 12 | ThreadsPerBlock = 64; 13 | CudaHelper::GetThreadBlocks(static_cast(particleCount), ThreadsPerBlock, BlockStartsForParticles, threadStarts); 14 | 15 | copyToDevice(); 16 | } 17 | 18 | PointSetImplementation& PointSetImplementation::operator=(PointSetImplementation const& other) 19 | { 20 | if (this != &other) 21 | { 22 | PointSetImplementation tmp(other); 23 | std::swap(tmp, *this); 24 | } 25 | return *this; 26 | } 27 | 28 | void PointSetImplementation::prepareInternalDataStructures(GridInfo &gridInfo, size_t numberOfCells) 29 | { 30 | this->gridInfo = gridInfo; 31 | 32 | d_ParticleCellIndices.resize(m_ParticleCount); 33 | d_SortIndices.resize(m_ParticleCount); 34 | d_ReversedSortIndices.resize(m_ParticleCount); 35 | 36 | d_CellOffsets.resize(numberOfCells); 37 | d_CellParticleCounts.resize(numberOfCells); 38 | } 39 | 40 | void PointSetImplementation::copyToDevice() 41 | { 42 | d_Particles.resize(m_ParticleCount); 43 | CudaHelper::MemcpyHostToDevice(m_Particles, CudaHelper::GetPointer(d_Particles), m_ParticleCount); 44 | } 45 | } -------------------------------------------------------------------------------- 
/src/PointSetImplementation.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Types.h" 3 | #include "GridInfo.h" 4 | #include 5 | #include "cuda_helper.h" 6 | 7 | namespace cuNSearch 8 | { 9 | class NeighborhoodSearch; 10 | class cuNSearchDeviceData; 11 | 12 | class PointSetImplementation 13 | { 14 | public: 15 | PointSetImplementation(size_t particleCount, Real3 *particles); 16 | 17 | PointSetImplementation(PointSetImplementation const& other) = default; 18 | PointSetImplementation& operator=(PointSetImplementation const& other); 19 | ~PointSetImplementation() { } 20 | 21 | void resize(size_t particleCount, Real3 *particles) 22 | { 23 | m_ParticleCount = particleCount; 24 | m_Particles = particles; 25 | 26 | uint threadStarts = 0; 27 | CudaHelper::GetThreadBlocks(static_cast(particleCount), ThreadsPerBlock, BlockStartsForParticles, threadStarts); 28 | 29 | copyToDevice(); 30 | } 31 | 32 | void copyToDevice(); 33 | 34 | private: 35 | friend NeighborhoodSearch; 36 | friend cuNSearchDeviceData; 37 | 38 | // Min Max of all particles 39 | Real3 Min, Max; 40 | 41 | size_t m_ParticleCount; 42 | 43 | // Pointer to the host particle data 44 | Real3 *m_Particles; 45 | 46 | // Number of thread blocks that must be started to start a thread per particle 47 | int ThreadsPerBlock; 48 | uint BlockStartsForParticles; 49 | 50 | // All device data for the a point set to perform query operations. 
51 | GridInfo gridInfo; 52 | thrust::device_vector d_Particles; 53 | thrust::device_vector d_ParticleCellIndices; 54 | 55 | thrust::device_vector d_CellOffsets; 56 | thrust::device_vector d_CellParticleCounts; 57 | thrust::device_vector d_SortIndices; 58 | thrust::device_vector d_ReversedSortIndices; 59 | 60 | void prepareInternalDataStructures(GridInfo &gridInfo, size_t numberOfCells); 61 | }; 62 | }; -------------------------------------------------------------------------------- /src/Types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Common.h" 3 | 4 | #define VALUE_TO_STRING(x) #x 5 | #define VALUE(x) VALUE_TO_STRING(x) 6 | #define VAR_NAME_VALUE(var) #var "=" VALUE(var) 7 | #pragma message(VAR_NAME_VALUE(CUDA_VERSION)) 8 | 9 | #include 10 | #include 11 | 12 | #define CUDA_METHOD __host__ __device__ 13 | 14 | namespace cuNSearch 15 | { 16 | #ifdef CUNSEARCH_USE_DOUBLE_PRECISION 17 | using Real3 = double3; 18 | #define Real3(x, y, z) make_double3(x, y, z) 19 | #else 20 | using Real3 = float3; 21 | #define Real3(x, y, z) make_float3(x, y, z) 22 | #endif 23 | 24 | using UInt3 = uint3; 25 | #define UInt3(x, y, z) make_uint3(x, y, z) 26 | using Int3 = int3; 27 | #define Int3(x, y, z) make_int3(x, y, z) 28 | 29 | inline CUDA_METHOD Real3 operator*(Real3 left, Real3 right) 30 | { 31 | return Real3(left.x * right.x, left.y * right.y, left.z * right.z); 32 | } 33 | 34 | inline CUDA_METHOD Real3 operator*(Int3 left, Real3 right) 35 | { 36 | return Real3(left.x * right.x, left.y * right.y, left.z * right.z); 37 | } 38 | 39 | inline CUDA_METHOD Real3 operator*(Real3 left, Int3 right) 40 | { 41 | return Real3(left.x * right.x, left.y * right.y, left.z * right.z); 42 | } 43 | 44 | inline CUDA_METHOD Real3 operator*(Real3 left, Real right) 45 | { 46 | return Real3(left.x * right, left.y * right, left.z * right); 47 | } 48 | 49 | inline CUDA_METHOD Real3 operator*(Real left, Real3 right) 50 | { 51 | return 
Real3(left * right.x, left * right.y, left * right.z); 52 | } 53 | 54 | 55 | 56 | 57 | inline CUDA_METHOD Real3 operator-(Real3 left, Real3 right) 58 | { 59 | return Real3(left.x - right.x, left.y - right.y, left.z - right.z); 60 | } 61 | inline CUDA_METHOD Real3 operator+(Real3 left, Real3 right) 62 | { 63 | return Real3(left.x + right.x, left.y + right.y, left.z + right.z); 64 | } 65 | 66 | inline CUDA_METHOD Int3 operator+(Int3 left, Int3 right) 67 | { 68 | return Int3(left.x + right.x, left.y + right.y, left.z + right.z); 69 | } 70 | 71 | inline CUDA_METHOD UInt3 operator+(UInt3 left, UInt3 right) 72 | { 73 | return UInt3(left.x + right.x, left.y + right.y, left.z + right.z); 74 | } 75 | 76 | 77 | inline CUDA_METHOD Int3 operator*(Int3 left, int right) 78 | { 79 | return Int3(left.x * right, left.y * right, left.z * right); 80 | } 81 | 82 | inline CUDA_METHOD Int3 operator*(int left, Int3 right) 83 | { 84 | return Int3(left * right.x, left * right.y, left * right.z); 85 | } 86 | 87 | 88 | inline CUDA_METHOD void operator-=(Real3 &a, Real3 b) 89 | { 90 | a.x -= b.x; 91 | a.y -= b.y; 92 | a.z -= b.z; 93 | } 94 | } 95 | 96 | namespace cuNSearch 97 | { 98 | using Int3 = Int3; 99 | using UInt3 = UInt3; 100 | using Real3 = Real3; 101 | } -------------------------------------------------------------------------------- /src/cuNSearch.cu: -------------------------------------------------------------------------------- 1 | 2 | #include "cuNSearch.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #ifdef DEBUG 24 | #define PRINT_STATS true 25 | #define USE_TIMING(x) x; 26 | #else 27 | #define PRINT_STATS false 28 | #define USE_TIMING(x) 29 | #endif 30 | #include "Timing.h" 31 | 32 | #include "NotImplementedException.h" 33 | #include "cuNSearchDeviceData.h" 34 | #include 
"PointSetImplementation.h" 35 | #include "cuNSearchKernels.cuh" 36 | #include "cuda_helper.h" 37 | 38 | 39 | namespace cuNSearch 40 | { 41 | NeighborhoodSearch::NeighborhoodSearch(Real searchRadius) 42 | { 43 | deviceData = make_unique(searchRadius); 44 | set_radius(searchRadius); 45 | } 46 | 47 | NeighborhoodSearch::~NeighborhoodSearch() 48 | { 49 | 50 | } 51 | 52 | unsigned int NeighborhoodSearch::add_point_set(Real const* x, std::size_t n, bool is_dynamic, 53 | bool search_neighbors, bool find_neighbors, void *user_data) 54 | { 55 | auto index = pointSets.size(); 56 | pointSets.push_back(PointSet(x, n, is_dynamic, user_data)); 57 | m_activation_table.add_point_set(search_neighbors, find_neighbors); 58 | 59 | for (auto &pointSet : pointSets) 60 | { 61 | pointSet.neighbors.resize(pointSets.size()); 62 | } 63 | 64 | return static_cast(index); 65 | } 66 | 67 | 68 | void NeighborhoodSearch::set_radius(Real r) 69 | { 70 | this->searchRadius = r; 71 | deviceData->setSearchRadius(r); 72 | isInitialized = false; 73 | } 74 | 75 | void NeighborhoodSearch::z_sort() 76 | { 77 | //Do nothing as the sort step is part of the main implementation 78 | } 79 | 80 | void 81 | NeighborhoodSearch::resize_point_set(unsigned int index, Real const* x, std::size_t size) 82 | { 83 | pointSets[index].resize(x, size); 84 | } 85 | 86 | void 87 | NeighborhoodSearch::update_activation_table() 88 | { 89 | //Update neighborhood search data structures after changing the activation table. 90 | //If general find_neighbors() function is called there is no requirement to manually update the point sets. 
91 | } 92 | 93 | void 94 | NeighborhoodSearch::updatePointSet(PointSet &pointSet) 95 | { 96 | USE_TIMING(Timing::startTiming("Update point sets - copyParticleData")); 97 | pointSet.impl->copyToDevice(); 98 | USE_TIMING(Timing::stopTiming(PRINT_STATS)); 99 | 100 | USE_TIMING(Timing::startTiming("Update point sets - computeMinMax")); 101 | deviceData->computeMinMax(pointSet); 102 | USE_TIMING(Timing::stopTiming(PRINT_STATS)); 103 | 104 | USE_TIMING(Timing::startTiming("Update point sets - computeCellInformation")); 105 | deviceData->computeCellInformation(pointSet); 106 | 107 | USE_TIMING(Timing::stopTiming(PRINT_STATS)); 108 | } 109 | 110 | void 111 | NeighborhoodSearch::find_neighbors(bool points_changed_) 112 | { 113 | if (points_changed_ || !isInitialized) 114 | { 115 | for (auto &pointSet : pointSets) 116 | { 117 | if (!isInitialized || pointSet.is_dynamic()) 118 | { 119 | updatePointSet(pointSet); 120 | } 121 | } 122 | } 123 | isInitialized = true; 124 | 125 | for (unsigned int i = 0; i < pointSets.size(); i++) 126 | { 127 | for (unsigned int j = 0; j < pointSets.size(); j++) 128 | { 129 | if (m_activation_table.is_active(i, j)) 130 | { 131 | auto &queryPointSet = pointSets[i]; 132 | auto &pointSet = pointSets[j]; 133 | deviceData->computeNeighborhood(queryPointSet, pointSet, j); 134 | } 135 | } 136 | } 137 | } 138 | 139 | void 140 | NeighborhoodSearch::find_neighbors(unsigned int point_set_id, unsigned int point_index, std::vector> &neighbors) 141 | { 142 | throw new NotImplementedException("NeighborhoodSearch::find_neighbors()"); 143 | } 144 | 145 | void 146 | NeighborhoodSearch::update_point_sets() 147 | { 148 | for (unsigned int i = 0; i < pointSets.size(); i++) 149 | { 150 | update_point_set(i); 151 | } 152 | } 153 | 154 | void 155 | NeighborhoodSearch::update_point_set(int i) 156 | { 157 | updatePointSet(pointSets[i]); 158 | } 159 | } 160 | 161 | -------------------------------------------------------------------------------- /src/cuNSearchDeviceData.cu: 
-------------------------------------------------------------------------------- 1 | #include "cuNSearchDeviceData.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #ifdef DEBUG 12 | #define PRINT_STATS true 13 | #define USE_TIMING(x) x; 14 | #else 15 | #define PRINT_STATS false 16 | #define USE_TIMING(x) 17 | #endif 18 | #include "Timing.h" 19 | 20 | #include "PointSetImplementation.h" 21 | #include "GridInfo.h" 22 | #include "cuda_helper.h" 23 | #include "cuNSearchKernels.cuh" 24 | 25 | namespace cuNSearch 26 | { 27 | void cuNSearchDeviceData::computeMinMax(PointSet &pointSet) 28 | { 29 | if (pointSet.n_points() == 0) 30 | return; 31 | 32 | auto pointSetImpl = pointSet.impl.get(); 33 | 34 | Int3 data[2]; 35 | data[0] = Int3(std::numeric_limits().max(), std::numeric_limits().max(), std::numeric_limits().max()); 36 | data[1] = Int3(std::numeric_limits().min(), std::numeric_limits().min(), std::numeric_limits().min()); 37 | d_MinMax.resize(2); 38 | CudaHelper::MemcpyHostToDevice(data, CudaHelper::GetPointer(d_MinMax), 2); 39 | 40 | kComputeMinMax << BlockStartsForParticles, pointSetImpl->ThreadsPerBlock >> > ( 41 | (Real3*)CudaHelper::GetPointer(pointSetImpl->d_Particles), 42 | static_cast(pointSet.n_points()), 43 | m_SearchRadius, 44 | CudaHelper::GetPointer(d_MinMax), 45 | CudaHelper::GetPointer(d_MinMax) + 1 46 | ); 47 | CudaHelper::CheckLastError(); 48 | CudaHelper::DeviceSynchronize(); 49 | 50 | CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_MinMax), data, 2); 51 | Int3 minCell = data[0]; 52 | Int3 maxCell = data[1]; 53 | 54 | pointSetImpl->Min.x = minCell.x * m_SearchRadius; 55 | pointSetImpl->Min.y = minCell.y * m_SearchRadius; 56 | pointSetImpl->Min.z = minCell.z * m_SearchRadius; 57 | 58 | pointSetImpl->Max.x = maxCell.x * m_SearchRadius; 59 | pointSetImpl->Max.y = maxCell.y * m_SearchRadius; 60 | pointSetImpl->Max.z = maxCell.z * m_SearchRadius; 61 | 62 | //CPU implementation of min max 
computation 63 | //Real3 cpuMin, cpuMax; 64 | //cpuMin = make_Real3(std::numeric_limits().max()); 65 | //cpuMax = make_Real3(std::numeric_limits().min()); 66 | 67 | //Real3 *points = (Real3 *)pointSet.m_x; 68 | //for (size_t i = 0; i < pointSet.n_points(); i++) 69 | //{ 70 | // cpuMin.x = std::min(cpuMin.x, points[i].x); 71 | // cpuMin.y = std::min(cpuMin.y, points[i].y); 72 | // cpuMin.z = std::min(cpuMin.z, points[i].z); 73 | 74 | // cpuMax.x = std::max(cpuMax.x, points[i].x); 75 | // cpuMax.y = std::max(cpuMax.y, points[i].y); 76 | // cpuMax.z = std::max(cpuMax.z, points[i].z); 77 | //} 78 | } 79 | 80 | void cuNSearchDeviceData::computeCellInformation(PointSet &pointSet) 81 | { 82 | if (pointSet.n_points() == 0) 83 | return; 84 | 85 | auto pointSetImpl = pointSet.impl.get(); 86 | Real3 sceneMin = pointSetImpl->Min; 87 | Real3 sceneMax = pointSetImpl->Max; 88 | 89 | GridInfo gridInfo; 90 | gridInfo.ParticleCount = static_cast(pointSet.n_points()); 91 | gridInfo.SquaredSearchRadius = m_SearchRadius * m_SearchRadius; 92 | gridInfo.GridMin = sceneMin; 93 | 94 | Real cellSize = m_SearchRadius; 95 | Real3 gridSize = sceneMax - sceneMin; 96 | gridInfo.GridDimension.x = static_cast(ceil(gridSize.x / cellSize)); 97 | gridInfo.GridDimension.y = static_cast(ceil(gridSize.y / cellSize)); 98 | gridInfo.GridDimension.z = static_cast(ceil(gridSize.z / cellSize)); 99 | 100 | //Increase grid by 2 cells in each direciton (+4 in each dimension) to skip bounds checks in the kernel 101 | gridInfo.GridDimension.x += 4; 102 | gridInfo.GridDimension.y += 4; 103 | gridInfo.GridDimension.z += 4; 104 | gridInfo.GridMin -= Real3(cellSize, cellSize, cellSize) * (Real)2; 105 | 106 | //One meta grid cell contains 8x8x8 grild cells. 
(512) 107 | gridInfo.MetaGridDimension.x = static_cast(ceil(gridInfo.GridDimension.x / (float)CUDA_META_GRID_GROUP_SIZE)); 108 | gridInfo.MetaGridDimension.y = static_cast(ceil(gridInfo.GridDimension.y / (float)CUDA_META_GRID_GROUP_SIZE)); 109 | gridInfo.MetaGridDimension.z = static_cast(ceil(gridInfo.GridDimension.z / (float)CUDA_META_GRID_GROUP_SIZE)); 110 | 111 | // Adjust grid size to multiple of cell size 112 | gridSize.x = gridInfo.GridDimension.x * cellSize; 113 | gridSize.y = gridInfo.GridDimension.y * cellSize; 114 | gridSize.z = gridInfo.GridDimension.z * cellSize; 115 | 116 | gridInfo.GridDelta.x = gridInfo.GridDimension.x / gridSize.x; 117 | gridInfo.GridDelta.y = gridInfo.GridDimension.y / gridSize.y; 118 | gridInfo.GridDelta.z = gridInfo.GridDimension.z / gridSize.z; 119 | 120 | d_TempSortIndices.resize(gridInfo.ParticleCount); 121 | 122 | uint numberOfCells = (gridInfo.MetaGridDimension.x * gridInfo.MetaGridDimension.y * gridInfo.MetaGridDimension.z) * CUDA_META_GRID_BLOCK_SIZE; 123 | pointSet.impl->prepareInternalDataStructures(gridInfo, numberOfCells); 124 | 125 | CudaHelper::CheckLastError(); 126 | CudaHelper::DeviceSynchronize(); 127 | 128 | cudaMemset(CudaHelper::GetPointer(pointSetImpl->d_CellParticleCounts), 0, CudaHelper::GetSizeInBytes(pointSetImpl->d_CellParticleCounts)); 129 | 130 | CudaHelper::CheckLastError(); 131 | CudaHelper::DeviceSynchronize(); 132 | 133 | kInsertParticles_Morton << BlockStartsForParticles, pointSetImpl->ThreadsPerBlock >> > ( 134 | gridInfo, 135 | (Real3*)CudaHelper::GetPointer(pointSetImpl->d_Particles), 136 | CudaHelper::GetPointer(pointSetImpl->d_ParticleCellIndices), 137 | CudaHelper::GetPointer(pointSetImpl->d_CellParticleCounts), 138 | CudaHelper::GetPointer(d_TempSortIndices) 139 | ); 140 | 141 | CudaHelper::CheckLastError(); 142 | CudaHelper::DeviceSynchronize(); 143 | 144 | thrust::exclusive_scan( 145 | pointSetImpl->d_CellParticleCounts.begin(), 146 | pointSetImpl->d_CellParticleCounts.end(), 147 | 
pointSetImpl->d_CellOffsets.begin()); 148 | CudaHelper::DeviceSynchronize(); 149 | 150 | kCountingSortIndices << BlockStartsForParticles, pointSetImpl->ThreadsPerBlock >> > ( 151 | gridInfo, 152 | CudaHelper::GetPointer(pointSetImpl->d_ParticleCellIndices), 153 | CudaHelper::GetPointer(pointSetImpl->d_CellOffsets), 154 | CudaHelper::GetPointer(d_TempSortIndices), 155 | CudaHelper::GetPointer(pointSetImpl->d_SortIndices) 156 | ); 157 | 158 | CudaHelper::DeviceSynchronize(); 159 | 160 | auto &tempSequence = d_TempSortIndices; 161 | thrust::sequence(tempSequence.begin(), tempSequence.end()); 162 | 163 | thrust::gather( 164 | pointSetImpl->d_SortIndices.begin(), 165 | pointSetImpl->d_SortIndices.end(), 166 | tempSequence.begin(), 167 | pointSetImpl->d_ReversedSortIndices.begin()); 168 | 169 | CudaHelper::CheckLastError(); 170 | CudaHelper::DeviceSynchronize(); 171 | 172 | pointSet.sortIndices.resize(pointSetImpl->d_SortIndices.size()); 173 | CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(pointSetImpl->d_SortIndices), pointSet.sortIndices.data(), pointSetImpl->d_SortIndices.size()); 174 | } 175 | 176 | void cuNSearchDeviceData::computeNeighborhood(PointSet &queryPointSet, PointSet &pointSet, uint neighborListEntry) 177 | { 178 | if (queryPointSet.n_points() == 0) 179 | return; 180 | 181 | auto queryPointSetImpl = queryPointSet.impl.get(); 182 | auto pointSetImpl = pointSet.impl.get(); 183 | 184 | uint particleCount = static_cast(queryPointSet.n_points()); 185 | 186 | USE_TIMING(Timing::startTiming("Execute kNeighborCount")); 187 | d_NeighborCounts.resize(particleCount); 188 | 189 | kComputeCounts << BlockStartsForParticles, queryPointSetImpl->ThreadsPerBlock >> > ( 190 | (Real3*)CudaHelper::GetPointer(queryPointSetImpl->d_Particles), 191 | static_cast(queryPointSet.n_points()), 192 | 193 | pointSetImpl->gridInfo, 194 | (Real3*)CudaHelper::GetPointer(pointSetImpl->d_Particles), 195 | CudaHelper::GetPointer(pointSetImpl->d_CellOffsets), 196 | 
CudaHelper::GetPointer(pointSetImpl->d_CellParticleCounts), 197 | 198 | CudaHelper::GetPointer(d_NeighborCounts), 199 | CudaHelper::GetPointer(pointSetImpl->d_ReversedSortIndices) 200 | ); 201 | 202 | CudaHelper::CheckLastError(); 203 | CudaHelper::DeviceSynchronize(); 204 | 205 | USE_TIMING(Timing::stopTiming(PRINT_STATS)); 206 | USE_TIMING(Timing::startTiming("Execute exclusive_scan over counts")); 207 | 208 | d_NeighborWriteOffsets.resize(particleCount); 209 | 210 | //Prefix sum over neighbor counts 211 | thrust::exclusive_scan( 212 | d_NeighborCounts.begin(), 213 | d_NeighborCounts.end(), 214 | d_NeighborWriteOffsets.begin()); 215 | 216 | CudaHelper::DeviceSynchronize(); 217 | 218 | //Compute total amount of neighbors 219 | uint lastOffset = 0; 220 | CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborWriteOffsets) + particleCount - 1, &lastOffset, 1); 221 | uint lastParticleNeighborCount = 0; 222 | CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborCounts) + particleCount - 1, &lastParticleNeighborCount, 1); 223 | uint totalNeighborCount = lastOffset + lastParticleNeighborCount; 224 | d_Neighbors.resize(totalNeighborCount); 225 | 226 | CudaHelper::DeviceSynchronize(); 227 | 228 | USE_TIMING(Timing::stopTiming(PRINT_STATS)); 229 | USE_TIMING(Timing::startTiming("Execute kNeighborhoodQueryWithCounts")); 230 | 231 | kNeighborhoodQueryWithCounts << BlockStartsForParticles, queryPointSetImpl->ThreadsPerBlock >> > ( 232 | (Real3*)CudaHelper::GetPointer(queryPointSetImpl->d_Particles), 233 | static_cast(queryPointSet.n_points()), 234 | 235 | pointSetImpl->gridInfo, 236 | (Real3*)CudaHelper::GetPointer(pointSetImpl->d_Particles), 237 | CudaHelper::GetPointer(pointSetImpl->d_CellOffsets), 238 | CudaHelper::GetPointer(pointSetImpl->d_CellParticleCounts), 239 | 240 | CudaHelper::GetPointer(d_NeighborWriteOffsets), 241 | CudaHelper::GetPointer(d_Neighbors), 242 | CudaHelper::GetPointer(pointSetImpl->d_ReversedSortIndices) 243 | ); 244 | 245 | 
CudaHelper::CheckLastError(); 246 | CudaHelper::DeviceSynchronize(); 247 | USE_TIMING(Timing::stopTiming(PRINT_STATS)); 248 | 249 | //Copy data to host 250 | USE_TIMING(Timing::startTiming("Neighbor copy from device to host - resize")); 251 | 252 | auto &neighborSet = queryPointSet.neighbors[neighborListEntry]; 253 | 254 | if (neighborSet.NeighborCountAllocationSize < totalNeighborCount) 255 | { 256 | if (neighborSet.NeighborCountAllocationSize != 0) 257 | { 258 | cudaFreeHost(neighborSet.Neighbors); 259 | } 260 | 261 | neighborSet.NeighborCountAllocationSize = static_cast(totalNeighborCount * 1.5); 262 | cudaMallocHost(&neighborSet.Neighbors, sizeof(uint) * neighborSet.NeighborCountAllocationSize); 263 | } 264 | if (neighborSet.ParticleCountAllocationSize < particleCount) 265 | { 266 | if (neighborSet.ParticleCountAllocationSize != 0) 267 | { 268 | cudaFreeHost(neighborSet.Offsets); 269 | cudaFreeHost(neighborSet.Counts); 270 | } 271 | 272 | neighborSet.ParticleCountAllocationSize = static_cast(particleCount * 1.5); 273 | cudaMallocHost(&neighborSet.Offsets, sizeof(uint) * neighborSet.ParticleCountAllocationSize); 274 | cudaMallocHost(&neighborSet.Counts, sizeof(uint) * neighborSet.ParticleCountAllocationSize); 275 | } 276 | 277 | USE_TIMING(Timing::stopTiming(PRINT_STATS)); 278 | USE_TIMING(Timing::startTiming("Neighbor copy from device to host - MemcpyDeviceToHost")); 279 | 280 | if (PRINT_STATS) 281 | { 282 | int bytesToCopy = totalNeighborCount * 4 + particleCount * 2 * 4; 283 | printf("Total neighbors: %d \n", totalNeighborCount); 284 | printf("Average neighbors: %d \n", totalNeighborCount / particleCount); 285 | printf("Expected amount: %f MB \n", bytesToCopy / (1024.0f * 1024.0f)); 286 | } 287 | 288 | CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_Neighbors), neighborSet.Neighbors, totalNeighborCount); 289 | CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborCounts), neighborSet.Counts, particleCount); 290 | 
CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborWriteOffsets), neighborSet.Offsets, particleCount);

USE_TIMING(Timing::stopTiming(PRINT_STATS));
}
}
--------------------------------------------------------------------------------
/src/cuNSearchDeviceData.h:
--------------------------------------------------------------------------------
#pragma once
#include "Types.h"
#include "PointSet.h"
// NOTE(review): the include target was missing in this copy of the file;
// thrust/device_vector.h is assumed because the class stores thrust::device_vector members.
#include <thrust/device_vector.h>

namespace cuNSearch
{
    /**
     * Device-side working state of the neighborhood search: the current search
     * radius plus the device buffers that are reused between queries.
     */
    class cuNSearchDeviceData
    {
    public:
        /** Creates the device data and stores the search radius. */
        cuNSearchDeviceData(Real searchRadius)
            : m_SearchRadius(searchRadius)
        {
        }

        /** Replaces the stored search radius. Only the value is stored here;
         *  nothing is recomputed by this call.
         */
        void setSearchRadius(Real searchRadius)
        {
            m_SearchRadius = searchRadius;
        }

        /** Computes the min/max for the given point set.
         */
        void computeMinMax(PointSet &pointSet);


        /** Constructs the uniform grid for the given point set and updates all cell
         *  information to allow queries on this point set.
         */
        void computeCellInformation(PointSet &pointSet);


        /** Queries the neighbors in the given point set for all particles in the query point set.
         *  The result is copied to the host into queryPointSet.neighbors[neighborListEntry].
         */
        void computeNeighborhood(PointSet &queryPointSet, PointSet &pointSet, uint neighborListEntry);

    private:
        Real m_SearchRadius;

        // NOTE(review): the element types below were missing in this copy of the file.
        // They are reconstructed from the kernel parameter types (Int3* for min/max
        // cells, uint* for index/count buffers) - confirm against the .cu file.
        thrust::device_vector<Int3> d_MinMax;
        thrust::device_vector<uint> d_TempSortIndices;

        //Device neighbor buffers (only used temporarily: after the computation the data is copied to the host)
        thrust::device_vector<uint> d_Neighbors;
        thrust::device_vector<uint> d_NeighborCounts;
        thrust::device_vector<uint> d_NeighborWriteOffsets;
    };
}
--------------------------------------------------------------------------------
/src/cuNSearchKernels.cu:
--------------------------------------------------------------------------------
#include "cuNSearchKernels.cuh"
#include "Types.h"
#include "helper_mortonCode.h"
#include "helper_linearIndex.h"

#define INT16_RANGE 32767
#define UPDATE_REF_OFFSET -32768

#pragma region HelperMethods
// Maps a 3D grid cell to its cell index in the "meta grid" layout:
// the grid is split into groups of CUDA_META_GRID_GROUP_SIZE^3 cells, the groups
// are indexed linearly and the cells inside a group by their Morton code.
inline __device__ uint ToCellIndex_MortonMetaGrid(const GridInfo &GridInfo, Int3 gridCell)
{
    // Group (meta cell) that contains the grid cell.
    Int3 metaGridCell = Int3(
        gridCell.x / CUDA_META_GRID_GROUP_SIZE,
        gridCell.y / CUDA_META_GRID_GROUP_SIZE,
        gridCell.z / CUDA_META_GRID_GROUP_SIZE);
    // Position of the cell inside its group.
    gridCell.x %= CUDA_META_GRID_GROUP_SIZE;
    gridCell.y %= CUDA_META_GRID_GROUP_SIZE;
    gridCell.z %= CUDA_META_GRID_GROUP_SIZE;
    uint metaGridIndex = CellIndicesToLinearIndex(GridInfo.MetaGridDimension, metaGridCell);
    return metaGridIndex * CUDA_META_GRID_BLOCK_SIZE + MortonCode3(gridCell.x, gridCell.y, gridCell.z);
}

// Inverse of ToCellIndex_MortonMetaGrid: recovers the 3D grid cell from a meta grid cell index.
inline __device__ Int3 ToGridCell_MortonMetaGrid(const GridInfo &GridInfo, uint &cellIndex)
{
    uint metaGridIndex = cellIndex / CUDA_META_GRID_BLOCK_SIZE;
    Int3 gridCell = MortonCodeToIndexInt3(cellIndex % CUDA_META_GRID_BLOCK_SIZE) + LinearCellIndexTo3DIndicesInt3(GridInfo.MetaGridDimension, metaGridIndex) * CUDA_META_GRID_GROUP_SIZE;
    return gridCell;
}

// Maps a 3D grid cell to its plain row-major linear cell index.
inline __device__ uint ToCellIndex(const GridInfo
&GridInfo, Int3 gridCell)
{
    return CellIndicesToLinearIndex(GridInfo.GridDimension, gridCell);
}

// Inverse of ToCellIndex: recovers the 3D grid cell from a row-major linear cell index.
inline __device__ Int3 ToGridCell(const GridInfo &GridInfo, const uint &cellIndex)
{
    return LinearCellIndexTo3DIndicesInt3(GridInfo.GridDimension, cellIndex);
}
#pragma endregion HelperMethods

// One thread per particle: computes the integer cell floor(p / searchRadius) of the
// particle and folds it into *minCell / *maxCell with atomic min/max.
// The kernel only reduces into minCell/maxCell; presumably the caller initializes
// both before the launch - verify against the host code.
// NOTE(review): searchRadius is a float even when Real is double, so the radius is
// rounded to single precision here - confirm this is intended.
__global__ void kComputeMinMax(
    const Real3 *particles,
    uint particleCount,
    float searchRadius,
    Int3 *minCell,
    Int3 *maxCell
)
{
    uint particleIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (particleIndex >= particleCount) return;
    const Real3 particle = particles[particleIndex];

    Int3 cell;
    cell.x = (int)floor(particle.x / searchRadius);
    cell.y = (int)floor(particle.y / searchRadius);
    cell.z = (int)floor(particle.z / searchRadius);

    atomicMin(&(minCell->x), cell.x);
    atomicMin(&(minCell->y), cell.y);
    atomicMin(&(minCell->z), cell.z);

    atomicMax(&(maxCell->x), cell.x);
    atomicMax(&(maxCell->y), cell.y);
    atomicMax(&(maxCell->z), cell.z);
}


#pragma region kInsertParticles
// One thread per particle: determines the grid cell of the particle (row-major
// linear cell index), stores it in particleCellIndices and increments the particle
// counter of that cell. The counter value before the increment is stored in
// sortIndices, i.e. the slot of the particle inside its cell.
__global__ void kInsertParticles(
    const GridInfo GridInfo,
    const Real3 *particles,
    uint *particleCellIndices,
    uint *cellParticleCounts,
    uint *sortIndices
)
{
    uint particleIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (particleIndex >= GridInfo.ParticleCount) return;

    Real3 gridCellF = (particles[particleIndex] - GridInfo.GridMin) * GridInfo.GridDelta;
    Int3 gridCell = Int3(int(gridCellF.x), int(gridCellF.y), int(gridCellF.z));
    uint cellIndex = ToCellIndex(GridInfo, gridCell);
    particleCellIndices[particleIndex] = cellIndex;
    sortIndices[particleIndex] = atomicAdd(&cellParticleCounts[cellIndex], 1);
}

// Same as kInsertParticles, but cells are addressed with the Morton meta grid
// index (ToCellIndex_MortonMetaGrid). This is the cell layout the query kernels
// in this file read.
__global__ void kInsertParticles_Morton(
    const GridInfo GridInfo,
    const Real3 *particles,
    uint *particleCellIndices,
    uint *cellParticleCounts,
    uint *sortIndices
)
{
    uint particleIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (particleIndex >= GridInfo.ParticleCount) return;

    Real3 gridCellF = (particles[particleIndex] - GridInfo.GridMin) * GridInfo.GridDelta;
    Int3 gridCell = Int3(int(gridCellF.x), int(gridCellF.y), int(gridCellF.z));
    uint cellIndex = ToCellIndex_MortonMetaGrid(GridInfo, gridCell);
    particleCellIndices[particleIndex] = cellIndex;
    sortIndices[particleIndex] = atomicAdd(&cellParticleCounts[cellIndex], 1);
}
#pragma endregion kInsertParticles

// One thread per particle: scatters the particle index to its final sorted
// position, which is the start offset of its cell plus its slot inside the cell.
__global__ void kCountingSortIndices(
    const GridInfo GridInfo,
    const uint *particleCellIndices,
    const uint *cellOffsets,
    const uint *sortIndicesSrc,
    uint *sortIndicesDest
)
{
    uint particleIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (particleIndex >= GridInfo.ParticleCount) return;

    uint gridCellIndex = particleCellIndices[particleIndex];

    uint sortIndex = sortIndicesSrc[particleIndex] + cellOffsets[gridCellIndex];
    sortIndicesDest[sortIndex] = particleIndex;
}

// One thread per query point: visits the 3x3x3 block of cells around the cell of
// the query point and counts the particles whose squared distance d2 satisfies
// 0 < d2 < SquaredSearchRadius. Counting stops at CUDA_MAX_NEIGHBORS.
// The count is written to neighborCounts[query index].
//
// IMPORTANT: kNeighborhoodQueryWithCounts must accept exactly the same particles
// as this kernel, because the counts written here determine how much space each
// query point gets in the neighbor buffer.
__global__ void kComputeCounts(
    const Real3 *queryPoints,
    const uint queryPointCount,

    const GridInfo GridInfo,
    const Real3 *particles,
    const uint *cellOffsets,
    const uint *cellParticleCounts,
    uint *neighborCounts,
    const uint *reversedSortIndices
)
{
    uint particleIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (particleIndex >= queryPointCount) return;
    const Real3 particle = queryPoints[particleIndex];
    Real3 gridCellF = (particle - GridInfo.GridMin) * GridInfo.GridDelta;

    Int3 coord = Int3(int(floor(gridCellF.x)), int(floor(gridCellF.y)), int(floor(gridCellF.z)));

    uint neighborCount = 0;
    for (int z = -1; z < 2; z++)
        for (int y = -1; y < 2; y++)
            for (int x = -1; x < 2; x++)
            {
                Int3 finalCoord = coord + Int3(x, y, z);

                // Skip cells outside of the grid.
                if (finalCoord.x < 0 || finalCoord.y < 0 || finalCoord.z < 0
                    || finalCoord.x >= GridInfo.GridDimension.x || finalCoord.y >= GridInfo.GridDimension.y || finalCoord.z >= GridInfo.GridDimension.z)
                    continue;

                uint neighborCellIndex = ToCellIndex_MortonMetaGrid(GridInfo, finalCoord);
                uint neighborCellCount = cellParticleCounts[neighborCellIndex];
                uint neighborCellStart = cellOffsets[neighborCellIndex];

                for (uint i = neighborCellStart; i < neighborCellStart + neighborCellCount; i++)
                {
                    // i is a position in the sorted order; map it to the particle index.
                    const uint neighborParticle = reversedSortIndices[i];
                    Real3 diff = particles[neighborParticle] - particle;
                    // Keep the distance in Real: storing it in a float would drop
                    // precision in double precision builds before the radius test.
                    Real squaredDistance = diff.x * diff.x + diff.y * diff.y + diff.z * diff.z;
                    if (squaredDistance < GridInfo.SquaredSearchRadius && squaredDistance > 0.0)
                    {
                        neighborCount++;
                    }

                    if (neighborCount == CUDA_MAX_NEIGHBORS)
                    {
                        neighborCounts[particleIndex] = neighborCount;
                        return;
                    }
                }
            }

    neighborCounts[particleIndex] = neighborCount;
}


// One thread per query point: performs the same traversal and the same distance
// test as kComputeCounts, but writes the index of every accepted particle to
// neighbors[neighborWriteOffsets[query index] + k]. Writing stops at
// CUDA_MAX_NEIGHBORS, matching the cap used while counting.
__global__ void kNeighborhoodQueryWithCounts(
    const Real3 *queryPoints,
    const uint queryPointCount,

    const GridInfo GridInfo,
    const Real3 *particles,
    const uint *cellOffsets,
    const uint *cellParticleCounts,
    const uint *neighborWriteOffsets,
    uint *neighbors,
    const uint *reversedSortIndices)
{
    uint particleIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (particleIndex >= queryPointCount) return;
    const Real3 particle = queryPoints[particleIndex];
    Real3 gridCellF = (particle - GridInfo.GridMin) * GridInfo.GridDelta;

    Int3 coord = Int3(int(floor(gridCellF.x)), int(floor(gridCellF.y)), int(floor(gridCellF.z)));

    uint neighborCount = 0;
    const uint writeOffset = neighborWriteOffsets[particleIndex];

    for (int z = -1; z < 2; z++)
        for (int y = -1; y < 2; y++)
            for (int x = -1; x < 2; x++)
            {
                Int3 finalCoord = coord + Int3(x, y, z);

                // Skip cells outside of the grid.
                if (finalCoord.x < 0 || finalCoord.y < 0 || finalCoord.z < 0
                    || finalCoord.x >= GridInfo.GridDimension.x || finalCoord.y >= GridInfo.GridDimension.y || finalCoord.z >= GridInfo.GridDimension.z)
                    continue;

                uint neighborCellIndex = ToCellIndex_MortonMetaGrid(GridInfo, finalCoord);
                uint neighborCellCount = cellParticleCounts[neighborCellIndex];
                uint neighborCellStart = cellOffsets[neighborCellIndex];

                for (uint i = neighborCellStart; i < neighborCellStart + neighborCellCount; i++)
                {
                    // i is a position in the sorted order; map it to the particle index.
                    const uint neighborParticle = reversedSortIndices[i];
                    Real3 diff = particles[neighborParticle] - particle;
                    // Must stay identical to the test in kComputeCounts (see there).
                    Real squaredDistance = diff.x * diff.x + diff.y * diff.y + diff.z * diff.z;
                    if (squaredDistance < GridInfo.SquaredSearchRadius && squaredDistance > 0.0)
                    {
                        neighbors[writeOffset + neighborCount] = neighborParticle;
                        neighborCount++;
                    }

                    if (neighborCount == CUDA_MAX_NEIGHBORS)
                    {
                        return;
                    }
                }
            }
}
--------------------------------------------------------------------------------
/src/cuNSearchKernels.cuh:
--------------------------------------------------------------------------------
#pragma once
// NOTE(review): the include target was missing in this copy of the file;
// cuda_runtime.h is assumed - confirm.
#include <cuda_runtime.h>
#include "Types.h"
#include "GridInfo.h"

// Upper bound of neighbors that are counted/stored per query point.
#define CUDA_MAX_NEIGHBORS 70
// Edge length (in cells) of one meta grid group and the number of cells per group.
#define CUDA_META_GRID_GROUP_SIZE 8
#define CUDA_META_GRID_BLOCK_SIZE (CUDA_META_GRID_GROUP_SIZE*CUDA_META_GRID_GROUP_SIZE*CUDA_META_GRID_GROUP_SIZE)

typedef unsigned int uint;
typedef unsigned char byte;
using namespace cuNSearch;

__global__ void
kComputeMinMax(
    const Real3 *particles,
    uint particleCount,
    float searchRadius,
    Int3 *minCell,
    Int3 *maxCell
);

__global__ void kInsertParticles(
    const GridInfo GridInfo,
    const Real3 *particles,
    uint *particleCellIndices,
    uint *cellParticleCounts,
    uint *sortIndices
);

__global__ void kInsertParticles_Morton(
    const GridInfo GridInfo,
    const Real3 *particles,
    uint *particleCellIndices,
    uint *cellParticleCounts,
    uint *sortIndices
);

__global__ void kCountingSortIndices(
    const GridInfo GridInfo,
    const uint *particleCellIndices,
    const uint *cellOffsets,
    const uint *sortIndicesSrc,
    uint *sortIndicesDest
);

__global__ void kComputeCounts(
    const Real3 *queryPoints,
    const uint queryPointCount,

    const GridInfo GridInfo,
    const Real3 *particles,
    const uint *cellOffsets,
    const uint *cellParticleCounts,
    uint *neighborCounts,
    const uint *reversedSortIndices
);

__global__ void kNeighborhoodQueryWithCounts(
    const Real3 *queryPoints,
    const uint queryPointCount,

    const GridInfo GridInfo,
    const Real3 *particles,
    const uint *cellOffsets,
    const uint *cellParticleCounts,
    const uint *neighborWriteOffsets,
    uint *neighbors,
    const uint *reversedSortIndices);
--------------------------------------------------------------------------------
/src/helper_linearIndex.h:
--------------------------------------------------------------------------------
#pragma once
// NOTE(review): the include target was missing in this copy of the file;
// cuda_runtime.h is assumed - confirm.
#include <cuda_runtime.h>
#include "Types.h"

// Row-major linear index of the cell xyz in a grid with the given dimensions:
// index = z * (dimY * dimX) + y * dimX + x.
// The arguments are only read, so they are taken by const reference.
__host__ __device__ inline uint CellIndicesToLinearIndex(
    const UInt3 &cellDimensions,
    const UInt3 &xyz
)
{
    return xyz.z * cellDimensions.y * cellDimensions.x + xyz.y * cellDimensions.x + xyz.x;
}

// Overload for signed cell coordinates. The coordinates are expected to be
// non-negative; the arithmetic is carried out in unsigned integers.
__host__ __device__ inline uint CellIndicesToLinearIndex(
    const UInt3 &cellDimensions,
    const Int3 &xyz
)
{
    return xyz.z * cellDimensions.y * cellDimensions.x + xyz.y * cellDimensions.x + xyz.x;
}

// Inverse of CellIndicesToLinearIndex; writes the cell coordinates to xyz.
__host__ __device__ inline void LinearCellIndexTo3DIndices(
    const UInt3 &cellDimensions,
    const uint linearIndex,
    UInt3 &xyz
)
{
    // Number of cells in one z-slice and the position inside that slice.
    const uint sliceSize = cellDimensions.y * cellDimensions.x;
    const uint indexInSlice = linearIndex % sliceSize;

    xyz.z = linearIndex / sliceSize;
    xyz.y = indexInSlice / cellDimensions.x;
    xyz.x = indexInSlice % cellDimensions.x;
}

// Inverse of CellIndicesToLinearIndex; returns the cell coordinates.
__host__ __device__ inline UInt3 LinearCellIndexTo3DIndices(
    const UInt3 &cellDimensions,
    const uint linearIndex)
{
    const uint sliceSize = cellDimensions.y * cellDimensions.x;
    const uint indexInSlice = linearIndex % sliceSize;

    UInt3 xyz;
    xyz.z = linearIndex / sliceSize;
    xyz.y = indexInSlice / cellDimensions.x;
    xyz.x = indexInSlice % cellDimensions.x;
    return xyz;
}

// Inverse of CellIndicesToLinearIndex; returns the cell coordinates as signed integers.
__host__ __device__ inline Int3 LinearCellIndexTo3DIndicesInt3(
    const UInt3 &cellDimensions,
    const uint &linearIndex)
{
    const uint sliceSize = cellDimensions.y * cellDimensions.x;
    const uint indexInSlice = linearIndex % sliceSize;

    Int3 xyz;
    xyz.z = linearIndex / sliceSize;
    xyz.y = indexInSlice / cellDimensions.x;
    xyz.x = indexInSlice % cellDimensions.x;
    return xyz;
}
--------------------------------------------------------------------------------
/src/helper_mortonCode.h:
--------------------------------------------------------------------------------
#pragma once
// NOTE(review): the include target was missing in this copy of the file;
// cuda_runtime.h is assumed - confirm.
#include <cuda_runtime.h>
#include "Types.h"

//https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
// "Insert" a 0 bit after each of the 16 low bits of x
__host__ __device__ inline uint Part1By1(uint x)
{
    x &= 0x0000ffff;                  // x = ---- ---- ---- ---- fedc ba98 7654 3210
    x = (x ^ (x << 8)) & 0x00ff00ff;  // x = ---- ---- fedc ba98 ---- ---- 7654 3210
    x = (x ^ (x << 4)) & 0x0f0f0f0f;  // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
    x = (x ^ (x << 2)) & 0x33333333;  // x = --fe --dc --ba --98 --76 --54 --32 --10
    x = (x ^ (x << 1)) & 0x55555555;  // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
    return x;
}

// "Insert" two 0 bits after each of the 10 low bits of x
__host__ __device__ inline uint Part1By2(uint x)
{
    x &= 0x000003ff;                   // x = ---- ---- ---- ---- ---- --98 7654 3210
    x = (x ^ (x << 16)) & 0xff0000ff;  // x = ---- --98 ---- ---- ---- ---- 7654 3210
    x = (x ^ (x << 8)) & 0x0300f00f;   // x = ---- --98 ---- ---- 7654 ---- ---- 3210
    x = (x ^ (x << 4)) & 0x030c30c3;   // x = ---- --98 ---- 76-- --54 ---- 32-- --10
    x = (x ^ (x << 2)) & 0x09249249;   // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
    return x;
}

// Interleaves the 10 low bits of x, y and z into one 3D Morton code
// (x occupies bit 0, y bit 1, z bit 2 of every bit triple).
__host__ __device__ inline uint MortonCode3(uint x, uint y, uint z)
{
    return (Part1By2(z) << 2) + (Part1By2(y) << 1) + Part1By2(x);
}

// Inverse of Part1By1 - "delete" all odd-indexed bits
__host__ __device__ inline uint Compact1By1(uint x)
{
    x &= 0x55555555;                  // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
    x = (x ^ (x >> 1)) & 0x33333333;  // x = --fe --dc --ba --98 --76 --54 --32 --10
    x = (x ^ (x >> 2)) & 0x0f0f0f0f;  // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
    x = (x ^ (x >> 4)) & 0x00ff00ff;  // x = ---- ---- fedc ba98 ---- ---- 7654 3210
    x = (x ^ (x >> 8)) & 0x0000ffff;  // x = ---- ---- ---- ---- fedc ba98 7654 3210
    return x;
}

// Inverse of Part1By2 - "delete" all bits not at positions divisible by 3
__host__ __device__ inline uint Compact1By2(uint x)
{
    x &= 0x09249249;                   // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
    x = (x ^ (x >> 2)) & 0x030c30c3;   // x = ---- --98 ---- 76-- --54 ---- 32-- --10
    x = (x ^ (x >> 4)) & 0x0300f00f;   // x = ---- --98 ---- ---- 7654 ---- ---- 3210
    x = (x ^ (x >> 8)) & 0xff0000ff;   // x = ---- --98 ---- ---- ---- ---- 7654 3210
    x = (x ^ (x >> 16)) & 0x000003ff;  // x = ---- ---- ---- ---- ---- --98 7654 3210
    return x;
}

// Extracts the x component of a 2D Morton code.
__host__ __device__ inline uint DecodeMorton2X(uint code)
{
    return Compact1By1(code >> 0);
}

// Extracts the y component of a 2D Morton code.
__host__ __device__ inline uint DecodeMorton2Y(uint code)
{
    return Compact1By1(code >> 1);
}

// Extracts the x component of a 3D Morton code.
__host__ __device__ inline uint DecodeMorton3X(uint code)
{
    return Compact1By2(code >> 0);
}

// Extracts the y component of a 3D Morton code.
__host__ __device__ inline uint DecodeMorton3Y(uint code)
{
    return Compact1By2(code >> 1);
}

// Extracts the z component of a 3D Morton code.
__host__ __device__ inline uint DecodeMorton3Z(uint code)
{
    return Compact1By2(code >> 2);
}

// Decodes a 3D Morton code into unsigned cell coordinates.
__host__ __device__ inline UInt3 MortonCodeToIndex3(uint mortonCode)
{
    UInt3 xyz;
    xyz.x = DecodeMorton3X(mortonCode);
    xyz.y = DecodeMorton3Y(mortonCode);
    xyz.z = DecodeMorton3Z(mortonCode);
    return xyz;
}

// Decodes a 3D Morton code into signed cell coordinates.
__host__ __device__ inline Int3 MortonCodeToIndexInt3(uint mortonCode)
{
    Int3 xyz;
    xyz.x = (int)DecodeMorton3X(mortonCode);
    xyz.y = (int)DecodeMorton3Y(mortonCode);
    xyz.z = (int)DecodeMorton3Z(mortonCode);
    return xyz;
}
--------------------------------------------------------------------------------