├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE.txt ├── README.md ├── cmake └── modules │ └── FindSDL2.cmake ├── screenshot.png └── src ├── CMakeLists.txt ├── bbox.h ├── build.cu ├── build.h ├── common.h ├── compress.cu ├── expand.cu ├── flatten.cu ├── grid.h ├── load_obj.cpp ├── load_obj.h ├── main.cpp ├── mem_manager.cu ├── mem_manager.h ├── merge.cu ├── parallel.cuh ├── prims.h ├── profile.cu ├── ray.h ├── traverse.cu ├── traverse.h └── vec.h /.gitignore: -------------------------------------------------------------------------------- 1 | lib/cub/docs 2 | lib/cub/examples 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/cub"] 2 | path = lib/cub 3 | url = https://github.com/NVlabs/cub 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1) 2 | project(hagrid) 3 | 4 | set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/modules) 5 | 6 | find_package(CUDA) 7 | 8 | find_package(SDL2 REQUIRED) 9 | include_directories(${SDL2_INCLUDE_DIR}) 10 | 11 | set(CMAKE_CXX_STANDARD 11) 12 | include_directories(lib/cub) 13 | add_subdirectory(src) 14 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2017 Arsène Pérard-Gayot 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hagrid 2 | 3 | ![Screenshot](screenshot.png) 4 | 5 | This project is an implementation of the paper: _GPU Ray Tracing using Irregular Grids_. 6 | This is not the version that has been used in the paper. 7 | 8 | ## Changes 9 | 10 | Some improvements have been made to the construction algorithm, which change the performance characteristics of the structure: 11 | 12 | - The voxel map can have more than two levels. 
13 | - Construction is faster (~ +33%)
14 | - Memory consumption is lower (~ -20%)
15 | - Traversal is slower (~ -5%)
16 |
17 | The slower traversal can easily be remedied by increasing the resolution (the default parameters are those used in the paper, which will result in lower performance than what was reported, for the reasons above). The improvements in build times and memory consumption more than compensate for the loss in traversal performance. As a recommendation, use `--top-density 0.15 --snd-density 3.0` if you want to get approximately the same performance as the original paper. _Increasing the resolution further will result in higher performance_, and the new voxel map structure should prevent the memory usage from exploding.
18 |
19 | The following experimental features (not mentioned in the paper) are also available, and should increase performance:
20 |
21 | - A more precise expansion algorithm (set the `subset_only` variable to false in [src/expand.cu](src/expand.cu#L159)),
22 | - A simple compression scheme (use the `--compress` option).
23 |
24 | ## Building
25 |
26 | This project requires CUDA, SDL2, and CMake. Use the following commands to build the project:
27 |
28 |     mkdir build
29 |     cd build
30 |     cmake-gui ..
31 |     make -j
32 |
33 | If you encounter any problems when building, make sure you cloned the repository with the `--recursive` option, so that all submodules are cloned as well.
34 | If the submodules are properly downloaded, the `lib/` directory should no longer be empty.
35 |
36 | ## Testing and Benchmarking
37 |
38 | Once built, the project provides a library for construction, a library for traversal, and a test/benchmark executable (a minimal sketch of the library API is given at the end of this README). The executable uses command-line arguments to specify the scene to use. The scene must be in the OBJ format. Here is a list of typical uses of the command-line program:
39 |
40 | - Lists the available options:
41 |
42 |         src/hagrid
43 |
44 | - Loads the file `scene.obj`, builds an irregular grid using default parameters, and displays an interactive view:
45 |
46 |         src/hagrid scene.obj
47 |
48 | - Loads the file `scene.obj`, builds an irregular grid with top-level density 0.12, second-level density 2.4, alpha 0.995 (the threshold that controls cell merging---see the paper), and 3 expansion passes, then displays an interactive view:
49 |
50 |         src/hagrid scene.obj --top-density 0.12 --snd-density 2.4 --alpha 0.995 --expansion 3
51 |
52 | - Loads the file `scene.obj`, benchmarks the construction with default parameters by running 10 construction iterations and 5 warmup iterations, keeps intermediate buffers alive (which should be preferred when benchmarking construction times), and finally displays an interactive view:
53 |
54 |         src/hagrid scene.obj --build-iter 10 --build-warmup 5 --keep-alive
55 |
56 | - Loads the file `scene.obj`, builds an irregular grid with default parameters, benchmarks the traversal by running it 100 times (with 20 warmup iterations) on the given ray distribution (the file `distribution.rays`, containing each ray stored as 6 floats in binary format---3 for the origin and 3 for the direction), and limits the distance along each ray to the range [0, 100]:
57 |
58 |         src/hagrid scene.obj --ray-file distribution.rays --bench-iter 100 --bench-warmup 20 -tmin 0 -tmax 100
59 |
60 | ## License
61 |
62 | The code is distributed under the MIT license (see [LICENSE.txt](LICENSE.txt)).
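
## Library Usage

For completeness, here is a rough sketch of how the construction library declared in [src/build.h](src/build.h) can be chained from client code. It is only an illustration: the `MemManager` setup, the way triangles are uploaded to the GPU, and the exact call order are assumptions (not shown here), and `src/main.cpp` remains the authoritative example. The densities, alpha value, and number of expansion passes below are the values suggested earlier in this README.

    #include "build.h" // build_grid, merge_grid, flatten_grid, expand_grid, compress_grid

    // Hedged sketch: 'mem' is assumed to manage device memory and 'tris' is assumed
    // to already reside on the GPU (see src/main.cpp for how the test program does this).
    void build_example(hagrid::MemManager& mem, const hagrid::Tri* tris, int num_tris) {
        hagrid::Grid grid;
        hagrid::build_grid(mem, tris, num_tris, grid, 0.15f, 3.0f); // top/second-level densities
        hagrid::merge_grid(mem, grid, 0.995f);                      // SAH-based neighbor merging
        hagrid::flatten_grid(mem, grid);                            // flatten the voxel map
        hagrid::expand_grid(mem, grid, tris, 3);                    // 3 expansion passes
        hagrid::compress_grid(mem, grid);                           // optional; returns false if cell
                                                                    // coordinates do not fit in 16 bits
    }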
63 | -------------------------------------------------------------------------------- /cmake/modules/FindSDL2.cmake: -------------------------------------------------------------------------------- 1 | # Locate SDL2 library 2 | # This module defines 3 | # SDL2_LIBRARY, the name of the library to link against 4 | # SDL2_FOUND, if false, do not try to link to SDL2 5 | # SDL2_INCLUDE_DIR, where to find SDL.h 6 | # 7 | # This module responds to the the flag: 8 | # SDL2_BUILDING_LIBRARY 9 | # If this is defined, then no SDL2_main will be linked in because 10 | # only applications need main(). 11 | # Otherwise, it is assumed you are building an application and this 12 | # module will attempt to locate and set the the proper link flags 13 | # as part of the returned SDL2_LIBRARY variable. 14 | # 15 | # Don't forget to include SDL2main.h and SDL2main.m your project for the 16 | # OS X framework based version. (Other versions link to -lSDL2main which 17 | # this module will try to find on your behalf.) Also for OS X, this 18 | # module will automatically add the -framework Cocoa on your behalf. 19 | # 20 | # 21 | # Additional Note: If you see an empty SDL2_LIBRARY_TEMP in your configuration 22 | # and no SDL2_LIBRARY, it means CMake did not find your SDL2 library 23 | # (SDL2.dll, libsdl2.so, SDL2.framework, etc). 24 | # Set SDL2_LIBRARY_TEMP to point to your SDL2 library, and configure again. 25 | # Similarly, if you see an empty SDL2MAIN_LIBRARY, you should set this value 26 | # as appropriate. These values are used to generate the final SDL2_LIBRARY 27 | # variable, but when these values are unset, SDL2_LIBRARY does not get created. 28 | # 29 | # 30 | # $SDL2DIR is an environment variable that would 31 | # correspond to the ./configure --prefix=$SDL2DIR 32 | # used in building SDL2. 33 | # l.e.galup 9-20-02 34 | # 35 | # Modified by Eric Wing. 36 | # Added code to assist with automated building by using environmental variables 37 | # and providing a more controlled/consistent search behavior. 38 | # Added new modifications to recognize OS X frameworks and 39 | # additional Unix paths (FreeBSD, etc). 40 | # Also corrected the header search path to follow "proper" SDL2 guidelines. 41 | # Added a search for SDL2main which is needed by some platforms. 42 | # Added a search for threads which is needed by some platforms. 43 | # Added needed compile switches for MinGW. 44 | # 45 | # On OSX, this will prefer the Framework version (if found) over others. 46 | # People will have to manually change the cache values of 47 | # SDL2_LIBRARY to override this selection or set the CMake environment 48 | # CMAKE_INCLUDE_PATH to modify the search paths. 49 | # 50 | # Note that the header path has changed from SDL2/SDL.h to just SDL.h 51 | # This needed to change because "proper" SDL2 convention 52 | # is #include "SDL.h", not . This is done for portability 53 | # reasons because not all systems place things in SDL2/ (see FreeBSD). 54 | # 55 | # Ported by Johnny Patterson. This is a literal port for SDL2 of the FindSDL.cmake 56 | # module with the minor edit of changing "SDL" to "SDL2" where necessary. This 57 | # was not created for redistribution, and exists temporarily pending official 58 | # SDL2 CMake modules. 59 | 60 | #============================================================================= 61 | # Copyright 2003-2009 Kitware, Inc. 62 | # 63 | # Distributed under the OSI-approved BSD License (the "License"); 64 | # see accompanying file Copyright.txt for details. 
65 | # 66 | # This software is distributed WITHOUT ANY WARRANTY; without even the 67 | # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 68 | # See the License for more information. 69 | #============================================================================= 70 | # (To distribute this file outside of CMake, substitute the full 71 | # License text for the above reference.) 72 | 73 | FIND_PATH(SDL2_INCLUDE_DIR SDL.h 74 | HINTS 75 | $ENV{SDL2DIR} 76 | PATH_SUFFIXES include/SDL2 include 77 | PATHS 78 | ~/Library/Frameworks 79 | /Library/Frameworks 80 | /usr/local/include/SDL2 81 | /usr/include/SDL2 82 | /sw # Fink 83 | /opt/local # DarwinPorts 84 | /opt/csw # Blastwave 85 | /opt 86 | ) 87 | #MESSAGE("SDL2_INCLUDE_DIR is ${SDL2_INCLUDE_DIR}") 88 | 89 | FIND_LIBRARY(SDL2_LIBRARY_TEMP 90 | NAMES SDL2 91 | HINTS 92 | $ENV{SDL2DIR} 93 | PATH_SUFFIXES lib64 lib 94 | PATHS 95 | /sw 96 | /opt/local 97 | /opt/csw 98 | /opt 99 | ) 100 | 101 | #MESSAGE("SDL2_LIBRARY_TEMP is ${SDL2_LIBRARY_TEMP}") 102 | 103 | IF(NOT SDL2_BUILDING_LIBRARY) 104 | IF(NOT ${SDL2_INCLUDE_DIR} MATCHES ".framework") 105 | # Non-OS X framework versions expect you to also dynamically link to 106 | # SDL2main. This is mainly for Windows and OS X. Other (Unix) platforms 107 | # seem to provide SDL2main for compatibility even though they don't 108 | # necessarily need it. 109 | FIND_LIBRARY(SDL2MAIN_LIBRARY 110 | NAMES SDL2main 111 | HINTS 112 | $ENV{SDL2DIR} 113 | PATH_SUFFIXES lib64 lib 114 | PATHS 115 | /sw 116 | /opt/local 117 | /opt/csw 118 | /opt 119 | ) 120 | ENDIF(NOT ${SDL2_INCLUDE_DIR} MATCHES ".framework") 121 | ENDIF(NOT SDL2_BUILDING_LIBRARY) 122 | 123 | # SDL2 may require threads on your system. 124 | # The Apple build may not need an explicit flag because one of the 125 | # frameworks may already provide it. 126 | # But for non-OSX systems, I will use the CMake Threads package. 127 | IF(NOT APPLE) 128 | FIND_PACKAGE(Threads) 129 | ENDIF(NOT APPLE) 130 | 131 | # MinGW needs an additional library, mwindows 132 | # It's total link flags should look like -lmingw32 -lSDL2main -lSDL2 -lmwindows 133 | # (Actually on second look, I think it only needs one of the m* libraries.) 134 | IF(MINGW) 135 | SET(MINGW32_LIBRARY mingw32 CACHE STRING "mwindows for MinGW") 136 | ENDIF(MINGW) 137 | 138 | SET(SDL2_FOUND "NO") 139 | IF(SDL2_LIBRARY_TEMP) 140 | # For SDL2main 141 | IF(NOT SDL2_BUILDING_LIBRARY) 142 | IF(SDL2MAIN_LIBRARY) 143 | SET(SDL2_LIBRARY_TEMP ${SDL2MAIN_LIBRARY} ${SDL2_LIBRARY_TEMP}) 144 | ENDIF(SDL2MAIN_LIBRARY) 145 | ENDIF(NOT SDL2_BUILDING_LIBRARY) 146 | 147 | # For OS X, SDL2 uses Cocoa as a backend so it must link to Cocoa. 148 | # CMake doesn't display the -framework Cocoa string in the UI even 149 | # though it actually is there if I modify a pre-used variable. 150 | # I think it has something to do with the CACHE STRING. 151 | # So I use a temporary variable until the end so I can set the 152 | # "real" variable in one-shot. 153 | IF(APPLE) 154 | SET(SDL2_LIBRARY_TEMP ${SDL2_LIBRARY_TEMP} "-framework Cocoa") 155 | ENDIF(APPLE) 156 | 157 | # For threads, as mentioned Apple doesn't need this. 158 | # In fact, there seems to be a problem if I used the Threads package 159 | # and try using this line, so I'm just skipping it entirely for OS X. 
160 | IF(NOT APPLE) 161 | SET(SDL2_LIBRARY_TEMP ${SDL2_LIBRARY_TEMP} ${CMAKE_THREAD_LIBS_INIT}) 162 | ENDIF(NOT APPLE) 163 | 164 | # For MinGW library 165 | IF(MINGW) 166 | SET(SDL2_LIBRARY_TEMP ${MINGW32_LIBRARY} ${SDL2_LIBRARY_TEMP}) 167 | ENDIF(MINGW) 168 | 169 | # Set the final string here so the GUI reflects the final state. 170 | SET(SDL2_LIBRARY ${SDL2_LIBRARY_TEMP} CACHE STRING "Where the SDL2 Library can be found") 171 | # Set the temp variable to INTERNAL so it is not seen in the CMake GUI 172 | SET(SDL2_LIBRARY_TEMP "${SDL2_LIBRARY_TEMP}" CACHE INTERNAL "") 173 | 174 | SET(SDL2_FOUND "YES") 175 | ENDIF(SDL2_LIBRARY_TEMP) 176 | 177 | INCLUDE(FindPackageHandleStandardArgs) 178 | 179 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(SDL2 180 | REQUIRED_VARS SDL2_LIBRARY SDL2_INCLUDE_DIR) 181 | -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cg-saarland/hagrid/fa3eb62eba14d073dfeddd3b9ca8fbfb2d6848af/screenshot.png -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_BUILD_TYPE STREQUAL "Debug") 2 | set(OPT_FLAG "-g") 3 | else() 4 | set(OPT_FLAG "--use_fast_math;-O3") 5 | endif() 6 | 7 | cuda_compile(HAGRID_BUILD 8 | build.cu 9 | merge.cu 10 | flatten.cu 11 | expand.cu 12 | compress.cu 13 | mem_manager.cu 14 | profile.cu 15 | mem_manager.h 16 | parallel.cuh 17 | build.h 18 | grid.h 19 | vec.h 20 | bbox.h 21 | prims.h 22 | ray.h 23 | common.h 24 | OPTIONS ${OPT_FLAG} "-std=c++11;--expt-extended-lambda;-lineinfo;-DHOST=__host__;-DDEVICE=__device__") 25 | 26 | add_library(hagrid_build ${HAGRID_BUILD}) 27 | set_target_properties(hagrid_build PROPERTIES LINKER_LANGUAGE CXX) 28 | 29 | cuda_compile(HAGRID_TRAVERSE 30 | traverse.cu 31 | traverse.h 32 | ray.h 33 | prims.h 34 | vec.h 35 | grid.h 36 | OPTIONS ${OPT_FLAG} "-std=c++11;--expt-extended-lambda;-lineinfo;--maxrregcount=40;-DHOST=__host__;-DDEVICE=__device__") 37 | 38 | add_library(hagrid_traverse ${HAGRID_TRAVERSE}) 39 | set_target_properties(hagrid_traverse PROPERTIES LINKER_LANGUAGE CXX) 40 | 41 | add_executable(hagrid main.cpp load_obj.cpp load_obj.h grid.h traverse.h build.h vec.h) 42 | target_compile_definitions(hagrid PRIVATE HOST= DEVICE=) 43 | target_link_libraries(hagrid hagrid_build hagrid_traverse ${CUDA_LIBRARIES} ${SDL2_LIBRARY}) 44 | -------------------------------------------------------------------------------- /src/bbox.h: -------------------------------------------------------------------------------- 1 | #ifndef BBOX_H 2 | #define BBOX_H 3 | 4 | #include 5 | #include 6 | #include "vec.h" 7 | 8 | namespace hagrid { 9 | 10 | struct BBox { 11 | vec3 min; 12 | int pad0; 13 | vec3 max; 14 | int pad1; 15 | 16 | HOST DEVICE BBox() {} 17 | HOST DEVICE BBox(const vec3& v) : min(v), max(v) {} 18 | HOST DEVICE BBox(const vec3& min, const vec3& max) : min(min), max(max) {} 19 | 20 | HOST DEVICE BBox& extend(const vec3& f) { 21 | min = hagrid::min(min, f); 22 | max = hagrid::max(max, f); 23 | return *this; 24 | } 25 | 26 | HOST DEVICE BBox& extend(const BBox& bb) { 27 | min = hagrid::min(min, bb.min); 28 | max = hagrid::max(max, bb.max); 29 | return *this; 30 | } 31 | 32 | HOST DEVICE BBox& overlap(const BBox& bb) { 33 | min = hagrid::max(min, bb.min); 34 | max = hagrid::min(max, bb.max); 35 | return *this; 36 | } 37 | 38 | HOST DEVICE 
vec3 extents() const { 39 | return max - min; 40 | } 41 | 42 | HOST DEVICE vec3 center() const { 43 | return 0.5f * (max + min); 44 | } 45 | 46 | HOST DEVICE float half_area() const { 47 | const vec3 len = max - min; 48 | const float kx = hagrid::max(len.x, 0.0f); 49 | const float ky = hagrid::max(len.y, 0.0f); 50 | const float kz = hagrid::max(len.z, 0.0f); 51 | return kx * (ky + kz) + ky * kz; 52 | } 53 | 54 | HOST DEVICE bool is_empty() const { 55 | return min.x > max.x || min.y > max.y || min.z > max.z; 56 | } 57 | 58 | HOST DEVICE bool is_inside(const vec3& f) const { 59 | return f.x >= min.x && f.y >= min.y && f.z >= min.z && 60 | f.x <= max.x && f.y <= max.y && f.z <= max.z; 61 | } 62 | 63 | HOST DEVICE bool is_overlapping(const BBox& bb) const { 64 | return min.x <= bb.max.x && max.x >= bb.min.x && 65 | min.y <= bb.max.y && max.y >= bb.min.y && 66 | min.z <= bb.max.z && max.z >= bb.min.z; 67 | } 68 | 69 | HOST DEVICE bool is_included(const BBox& bb) const { 70 | return min.x >= bb.min.x && max.x <= bb.max.x && 71 | min.y >= bb.min.y && max.y <= bb.max.y && 72 | min.z >= bb.min.z && max.z <= bb.max.z; 73 | } 74 | 75 | HOST DEVICE bool is_strictly_included(const BBox& bb) const { 76 | return is_included(bb) && 77 | (min.x > bb.min.x || max.x < bb.max.x || 78 | min.y > bb.min.y || max.y < bb.max.y || 79 | min.z > bb.min.z || max.z < bb.max.z); 80 | } 81 | 82 | HOST DEVICE static BBox empty() { return BBox(vec3( FLT_MAX), vec3(-FLT_MAX)); } 83 | HOST DEVICE static BBox full() { return BBox(vec3(-FLT_MAX), vec3( FLT_MAX)); } 84 | }; 85 | 86 | #ifdef __NVCC__ 87 | __device__ __forceinline__ BBox load_bbox(const BBox* bb_ptr) { 88 | const float4* ptr = (const float4*)bb_ptr; 89 | auto bb0 = ptr[0]; 90 | auto bb1 = ptr[1]; 91 | return BBox(vec3(bb0.x, bb0.y, bb0.z), 92 | vec3(bb1.x, bb1.y, bb1.z)); 93 | } 94 | 95 | __device__ __forceinline__ void store_bbox(BBox* bb_ptr, const BBox& bb) { 96 | float4* ptr = (float4*)bb_ptr; 97 | ptr[0] = make_float4(bb.min.x, bb.min.y, bb.min.z, 0); 98 | ptr[1] = make_float4(bb.max.x, bb.max.y, bb.max.z, 0); 99 | } 100 | #endif // __NVCC__ 101 | 102 | } // namespace hagrid 103 | 104 | #endif // BBOX_H 105 | -------------------------------------------------------------------------------- /src/build.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "build.h" 4 | #include "vec.h" 5 | #include "bbox.h" 6 | #include "grid.h" 7 | #include "prims.h" 8 | #include "mem_manager.h" 9 | #include "parallel.cuh" 10 | 11 | namespace hagrid { 12 | 13 | /// Level of the grid during construction 14 | struct Level { 15 | int* ref_ids; ///< Array of primitive indices 16 | int* cell_ids; ///< Array of cell indices 17 | int num_refs; ///< Number of references in the level 18 | int num_kept; ///< Number of references kept (remaining is split) 19 | Cell* cells; ///< Array of cells 20 | Entry* entries; ///< Array of voxel map entries 21 | int num_cells; ///< Number of cells 22 | 23 | Level() {} 24 | Level(int* ref_ids, int* cell_ids, int num_refs, int num_kept, Cell* cells, Entry* entries, int num_cells) 25 | : ref_ids(ref_ids) 26 | , cell_ids(cell_ids) 27 | , num_refs(num_refs) 28 | , num_kept(num_kept) 29 | , cells(cells) 30 | , entries(entries) 31 | , num_cells(num_cells) 32 | { 33 | assert(num_refs >= num_kept); 34 | } 35 | }; 36 | 37 | static __constant__ ivec3 grid_dims; 38 | static __constant__ BBox grid_bbox; 39 | static __constant__ vec3 cell_size; 40 | static __constant__ int grid_shift; 41 | 42 | /// 
Compute the bounding box of every primitive 43 | template 44 | __global__ void compute_bboxes(const Primitive* __restrict__ prims, 45 | BBox* __restrict__ bboxes, 46 | int num_prims) { 47 | const int id = threadIdx.x + blockDim.x * blockIdx.x; 48 | if (id >= num_prims) 49 | return; 50 | 51 | auto prim = load_prim(prims + id); 52 | store_bbox(bboxes + id, prim.bbox()); 53 | } 54 | 55 | /// Compute an over-approximation of the number of references 56 | /// that are going to be generated during reference emission 57 | __global__ void count_new_refs(const BBox* __restrict__ bboxes, 58 | int* __restrict__ counts, 59 | int num_refs) { 60 | int id = threadIdx.x + blockDim.x * blockIdx.x; 61 | if (id >= num_refs) return; 62 | 63 | auto ref_bb = load_bbox(bboxes + id); 64 | auto range = compute_range(grid_dims, grid_bbox, ref_bb); 65 | counts[id] = max(0, range.size()); 66 | } 67 | 68 | /// Emit the new references by inserting existing ones into the sub-levels 69 | __global__ void __launch_bounds__(64) 70 | emit_new_refs(const BBox* __restrict__ bboxes, 71 | const int* __restrict__ start_emit, 72 | int* __restrict__ new_ref_ids, 73 | int* __restrict__ new_cell_ids, 74 | int num_prims) { 75 | int id = threadIdx.x + blockDim.x * blockIdx.x; 76 | 77 | Range range; 78 | int start = 0, end = 0; 79 | 80 | if (id < num_prims) { 81 | start = start_emit[id + 0]; 82 | end = start_emit[id + 1]; 83 | 84 | if (start < end) { 85 | auto ref_bb = load_bbox(bboxes + id); 86 | range = compute_range(grid_dims, grid_bbox, ref_bb); 87 | } 88 | } 89 | 90 | bool blocked = (end - start) >= 16; 91 | if (!blocked && start < end) { 92 | int x = range.lx; 93 | int y = range.ly; 94 | int z = range.lz; 95 | int cur = start; 96 | while (cur < end) { 97 | new_ref_ids [cur] = id; 98 | new_cell_ids[cur] = x + grid_dims.x * (y + grid_dims.y * z); 99 | cur++; 100 | x++; 101 | if (x > range.hx) { x = range.lx; y++; } 102 | if (y > range.hy) { y = range.ly; z++; } 103 | } 104 | } 105 | 106 | static constexpr unsigned all_mask = unsigned(-1); 107 | int mask = __ballot_sync(all_mask, blocked); 108 | while (mask) { 109 | int bit = __ffs(mask) - 1; 110 | mask &= ~(1 << bit); 111 | 112 | int warp_start = __shfl_sync(all_mask, start, bit); 113 | int warp_end = __shfl_sync(all_mask, end, bit); 114 | int warp_id = threadIdx.x - __shfl_sync(all_mask, threadIdx.x, 0); 115 | 116 | int lx = __shfl_sync(all_mask, range.lx, bit); 117 | int ly = __shfl_sync(all_mask, range.ly, bit); 118 | int lz = __shfl_sync(all_mask, range.lz, bit); 119 | int hx = __shfl_sync(all_mask, range.hx, bit); 120 | int hy = __shfl_sync(all_mask, range.hy, bit); 121 | int r = __shfl_sync(all_mask, id, bit); 122 | 123 | int sx = hx - lx + 1; 124 | int sy = hy - ly + 1; 125 | 126 | // Split the work on all the threads of the warp 127 | for (int i = warp_start + warp_id; i < warp_end; i += 32) { 128 | int k = i - warp_start; 129 | int x = lx + (k % sx); 130 | int y = ly + ((k / sx) % sy); 131 | int z = lz + (k / (sx * sy)); 132 | new_ref_ids[i] = r; 133 | new_cell_ids[i] = x + grid_dims.x * (y + grid_dims.y * z); 134 | } 135 | } 136 | } 137 | 138 | /// Filter out references that do not intersect the cell they are in 139 | template 140 | __global__ void filter_refs(int* __restrict__ cell_ids, 141 | int* __restrict__ ref_ids, 142 | const Primitive* __restrict__ prims, 143 | const Cell* __restrict__ cells, 144 | int num_refs) { 145 | int id = threadIdx.x + blockDim.x * blockIdx.x; 146 | if (id >= num_refs) return; 147 | 148 | auto cell = load_cell(cells + cell_ids[id]); 149 | 
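// Rebuild the cell's world-space bounding box from its integer grid coordinates
// and drop the reference (by writing -1) when the primitive does not overlap it.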
auto prim = load_prim(prims + ref_ids[id]); 150 | auto bbox = BBox(grid_bbox.min + vec3(cell.min) * cell_size, 151 | grid_bbox.min + vec3(cell.max) * cell_size); 152 | bool intersect = intersect_prim_cell(prim, bbox); 153 | if (!intersect) { 154 | cell_ids[id] = -1; 155 | ref_ids[id] = -1; 156 | } 157 | } 158 | 159 | /// Compute a mask for each reference which determines which sub-cell is intersected 160 | template 161 | __global__ void compute_split_masks(const int* __restrict__ cell_ids, 162 | const int* __restrict__ ref_ids, 163 | const Primitive* __restrict__ prims, 164 | const Cell* __restrict__ cells, 165 | int* split_masks, 166 | int num_split) { 167 | int id = threadIdx.x + blockDim.x * blockIdx.x; 168 | if (id >= num_split) return; 169 | 170 | auto cell_id = cell_ids[id]; 171 | if (cell_id < 0) { 172 | split_masks[id] = 0; 173 | return; 174 | } 175 | auto ref = ref_ids[id]; 176 | auto cell = load_cell(cells + cell_id); 177 | auto prim = load_prim(prims + ref); 178 | 179 | auto cell_min = grid_bbox.min + cell_size * vec3(cell.min); 180 | auto cell_max = grid_bbox.min + cell_size * vec3(cell.max); 181 | auto middle = (cell_min + cell_max) * 0.5f; 182 | 183 | int mask = 0xFF; 184 | 185 | // Optimization: Test against half spaces first 186 | auto ref_bb = prim.bbox(); 187 | if (ref_bb.min.x > cell_max.x || 188 | ref_bb.max.x < cell_min.x) mask = 0; 189 | if (ref_bb.min.x > middle.x) mask &= 0xAA; 190 | if (ref_bb.max.x < middle.x) mask &= 0x55; 191 | if (ref_bb.min.y > cell_max.y || 192 | ref_bb.max.y < cell_min.y) mask = 0; 193 | if (ref_bb.min.y > middle.y) mask &= 0xCC; 194 | if (ref_bb.max.y < middle.y) mask &= 0x33; 195 | if (ref_bb.min.z > cell_max.z || 196 | ref_bb.max.z < cell_min.z) mask = 0; 197 | if (ref_bb.min.z > middle.z) mask &= 0xF0; 198 | if (ref_bb.max.z < middle.z) mask &= 0x0F; 199 | 200 | for (int i = __ffs(mask) - 1;;) { 201 | auto bbox = BBox(vec3(i & 1 ? middle.x : cell_min.x, 202 | i & 2 ? middle.y : cell_min.y, 203 | i & 4 ? middle.z : cell_min.z), 204 | vec3(i & 1 ? cell_max.x : middle.x, 205 | i & 2 ? cell_max.y : middle.y, 206 | i & 4 ? 
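// Child index bits: bit 0 selects the upper half along x, bit 1 along y, bit 2 along z.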
cell_max.z : middle.z)); 207 | if (!intersect_prim_cell(prim, bbox)) mask &= ~(1 << i); 208 | 209 | // Skip non-intersected children 210 | int skip = __ffs(mask >> (i + 1)); 211 | if (skip == 0) break; 212 | i += 1 + (skip - 1); 213 | } 214 | 215 | split_masks[id] = mask; 216 | } 217 | 218 | /// Split references according to the given array of split masks 219 | __global__ void split_refs(const int* __restrict__ cell_ids, 220 | const int* __restrict__ ref_ids, 221 | const Entry* __restrict__ entries, 222 | const int* __restrict__ split_masks, 223 | const int* __restrict__ start_split, 224 | int* __restrict__ new_cell_ids, 225 | int* __restrict__ new_ref_ids, 226 | int num_split) { 227 | int id = threadIdx.x + blockDim.x * blockIdx.x; 228 | if (id >= num_split) return; 229 | 230 | auto cell_id = cell_ids[id]; 231 | auto ref = ref_ids[id]; 232 | auto begin = entries[cell_id].begin; 233 | 234 | auto mask = split_masks[id]; 235 | auto start = start_split[id]; 236 | while (mask) { 237 | int child_id = __ffs(mask) - 1; 238 | mask &= ~(1 << child_id); 239 | new_ref_ids [start] = ref; 240 | new_cell_ids[start] = begin + child_id; 241 | start++; 242 | } 243 | } 244 | 245 | /// Compute the number of references per cell using atomics 246 | __global__ void count_refs_per_cell(const int* __restrict__ cell_ids, 247 | int* __restrict__ refs_per_cell, 248 | int num_refs) { 249 | int id = threadIdx.x + blockDim.x * blockIdx.x; 250 | if (id >= num_refs) return; 251 | int cell_id = cell_ids[id]; 252 | if (cell_id >= 0) atomicAdd(refs_per_cell + cell_id, 1); 253 | } 254 | 255 | /// Compute the logarithm of the sub-level resolution for top-level cells 256 | __global__ void compute_log_dims(const int* __restrict__ refs_per_cell, 257 | int* __restrict__ log_dims, 258 | float snd_density, 259 | int num_cells) { 260 | int id = threadIdx.x + blockDim.x * blockIdx.x; 261 | if (id >= num_cells) return; 262 | 263 | auto extents = grid_bbox.extents() / vec3(grid_dims); 264 | auto bbox = BBox(vec3(0, 0, 0), extents); 265 | auto dims = compute_grid_dims(bbox, refs_per_cell[id], snd_density); 266 | auto max_dim = max(dims.x, max(dims.y, dims.z)); 267 | auto log_dim = 31 - __clz(max_dim); 268 | log_dim = (1 << log_dim) < max_dim ? 
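// 31 - __clz(max_dim) is floor(log2(max_dim)); round up when max_dim is not a
// power of two so that (1 << log_dim) >= max_dim.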
log_dim + 1 : log_dim; 269 | log_dims[id] = log_dim; 270 | } 271 | 272 | /// Update the logarithm of the sub-level resolution for top-level cells (after a new subdivision level) 273 | __global__ void update_log_dims(int* __restrict__ log_dims, int num_top_cells) { 274 | int id = threadIdx.x + blockDim.x * blockIdx.x; 275 | if (id >= num_top_cells) return; 276 | 277 | log_dims[id] = max(0, log_dims[id] - 1); 278 | } 279 | 280 | /// Given a position on the virtual grid, return the corresponding top-level cell index 281 | __device__ __forceinline__ int top_level_cell(ivec3 pos) { 282 | return (pos.x >> grid_shift) + grid_dims.x * ((pos.y >> grid_shift) + grid_dims.y * (pos.z >> grid_shift)); 283 | } 284 | 285 | /// Count the (sub-)dimensions of each cell, based on the array of references 286 | __global__ void compute_dims(const int* __restrict__ cell_ids, 287 | const Cell* __restrict__ cells, 288 | const int* __restrict__ log_dims, 289 | Entry* __restrict__ entries, 290 | int num_refs) { 291 | int id = threadIdx.x + blockDim.x * blockIdx.x; 292 | if (id >= num_refs) return; 293 | 294 | auto cell_id = cell_ids[id]; 295 | if (cell_id < 0) return; 296 | 297 | auto cell_min = load_cell_min(cells + cell_id); 298 | auto top_cell_id = top_level_cell(cell_min); 299 | auto log_dim = log_dims[top_cell_id]; 300 | 301 | entries[cell_id] = make_entry(min(log_dim, 1), 0); 302 | } 303 | 304 | /// Mark references that are kept so that they can be moved to the beginning of the array 305 | __global__ void mark_kept_refs(const int* __restrict__ cell_ids, 306 | const Entry* __restrict__ entries, 307 | int* kept_flags, 308 | int num_refs) { 309 | int id = threadIdx.x + blockDim.x * blockIdx.x; 310 | if (id >= num_refs) return; 311 | 312 | auto cell_id = cell_ids[id]; 313 | kept_flags[id] = (cell_id >= 0) && (entries[cell_id].log_dim == 0); 314 | } 315 | 316 | /// Update the entries for the one level before the current one 317 | __global__ void update_entries(const int* __restrict__ start_cell, 318 | Entry* __restrict__ entries, 319 | int num_cells) { 320 | int id = threadIdx.x + blockDim.x * blockIdx.x; 321 | if (id >= num_cells) return; 322 | 323 | auto start = start_cell[id]; 324 | auto entry = entries[id]; 325 | 326 | // If the cell is subdivided, write the first sub-cell index into the current entry 327 | entry.begin = entry.log_dim != 0 ? 
start : id; 328 | entries[id] = entry; 329 | } 330 | 331 | /// Generate cells for the top level 332 | __global__ void emit_top_cells(Cell* __restrict__ new_cells, int num_cells) { 333 | int id = threadIdx.x + blockDim.x * blockIdx.x; 334 | if (id >= num_cells) return; 335 | 336 | int x = id % grid_dims.x; 337 | int y = (id / grid_dims.x) % grid_dims.y; 338 | int z = id / (grid_dims.x * grid_dims.y); 339 | int inc = 1 << grid_shift; 340 | 341 | x <<= grid_shift; 342 | y <<= grid_shift; 343 | z <<= grid_shift; 344 | 345 | Cell cell; 346 | cell.min = ivec3(x, y, z); 347 | cell.max = ivec3(x + inc, y + inc, z + inc); 348 | cell.begin = 0; 349 | cell.end = 0; 350 | store_cell(new_cells + id, cell); 351 | } 352 | 353 | /// Generate new cells based on the previous level 354 | __global__ void emit_new_cells(const Entry* __restrict__ entries, 355 | const Cell* __restrict__ cells, 356 | Cell* __restrict__ new_cells, 357 | int num_cells) { 358 | int id = threadIdx.x + blockDim.x * blockIdx.x; 359 | if (id >= num_cells) return; 360 | 361 | auto entry = entries[id]; 362 | auto log_dim = entry.log_dim; 363 | if (log_dim == 0) return; 364 | 365 | auto start = entry.begin; 366 | auto cell = load_cell(cells + id); 367 | int min_x = cell.min.x; 368 | int min_y = cell.min.y; 369 | int min_z = cell.min.z; 370 | int inc = (cell.max.x - cell.min.x) >> 1; 371 | 372 | for (int i = 0; i < 8; i++) { 373 | int x = min_x + (i & 1) * inc; 374 | int y = min_y + ((i >> 1) & 1) * inc; 375 | int z = min_z + (i >> 2) * inc; 376 | 377 | cell.min = ivec3(x, y, z); 378 | cell.max = ivec3(x + inc, y + inc, z + inc); 379 | cell.begin = 0; 380 | cell.end = 0; 381 | store_cell(new_cells + start + i, cell); 382 | } 383 | } 384 | 385 | /// Copy the references with an offset, different for each level 386 | __global__ void copy_refs(const int* __restrict__ cell_ids, 387 | int* __restrict__ new_cell_ids, 388 | int cell_off, 389 | int num_kept) { 390 | int id = threadIdx.x + blockDim.x * blockIdx.x; 391 | if (id >= num_kept) return; 392 | 393 | new_cell_ids[id] = cell_ids[id] + cell_off; 394 | } 395 | 396 | /// Mark the cells that are used as 'kept' 397 | __global__ void mark_kept_cells(const Entry* __restrict__ entries, 398 | int* kept_cells, 399 | int num_cells) { 400 | int id = threadIdx.x + blockDim.x * blockIdx.x; 401 | if (id >= num_cells) return; 402 | 403 | kept_cells[id] = entries[id].log_dim == 0; 404 | } 405 | 406 | /// Copy only the cells that are kept to another array of cells 407 | __global__ void copy_cells(const Cell* __restrict__ cells, 408 | const int* __restrict__ start_cell, 409 | Cell* new_cells, 410 | int cell_off, 411 | int num_cells) { 412 | int id = threadIdx.x + blockDim.x * blockIdx.x; 413 | if (id >= num_cells) return; 414 | 415 | auto cell = load_cell(cells + id); 416 | auto start = start_cell[cell_off + id + 0]; 417 | auto end = start_cell[cell_off + id + 1]; 418 | if (start < end) store_cell(new_cells + start, cell); 419 | } 420 | 421 | /// Copy the voxel map entries and remap kept cells to their correct indices 422 | __global__ void copy_entries(const Entry* __restrict__ entries, 423 | const int* __restrict__ start_cell, 424 | Entry* __restrict__ new_entries, 425 | int cell_off, 426 | int next_level_off, 427 | int num_cells) { 428 | int id = threadIdx.x + blockDim.x * blockIdx.x; 429 | if (id >= num_cells) return; 430 | 431 | auto entry = entries[id]; 432 | if (entry.log_dim == 0) { 433 | // Points to a cell 434 | entry.begin = start_cell[cell_off + entry.begin]; 435 | } else { 436 | // Points to another 
entry in the next level 437 | entry.begin += next_level_off; 438 | } 439 | new_entries[id] = entry; 440 | } 441 | 442 | /// Remap references so that they map to the correct cells 443 | __global__ void remap_refs(int* __restrict__ cell_ids, 444 | const int* __restrict__ start_cell, 445 | int num_refs) { 446 | int id = threadIdx.x + blockDim.x * blockIdx.x; 447 | if (id >= num_refs) return; 448 | 449 | cell_ids[id] = start_cell[cell_ids[id]]; 450 | } 451 | 452 | /// Sets the cell ranges once the references are sorted by cell 453 | __global__ void compute_cell_ranges(const int* cell_ids, Cell* cells, int num_refs) { 454 | int id = threadIdx.x + blockDim.x * blockIdx.x; 455 | if (id >= num_refs) return; 456 | 457 | int cell_id = cell_ids[id + 0]; 458 | if (id >= num_refs - 1) { 459 | cells[cell_id].end = id + 1; 460 | return; 461 | } 462 | int next_id = cell_ids[id + 1]; 463 | 464 | if (cell_id != next_id) { 465 | cells[cell_id].end = id + 1; 466 | cells[next_id].begin = id + 1; 467 | } 468 | } 469 | 470 | template 471 | void first_build_iter(MemManager& mem, float snd_density, 472 | const Primitive* prims, int num_prims, 473 | const BBox* bboxes, const BBox& grid_bb, const ivec3& dims, 474 | int*& log_dims, int& grid_shift, std::vector& levels) { 475 | Parallel par(mem); 476 | 477 | int num_top_cells = dims.x * dims.y * dims.z; 478 | 479 | // Emission of the references in 4 passes: count new refs + scan + emission + filtering 480 | auto start_emit = mem.alloc(num_prims + 1); 481 | auto new_ref_counts = mem.alloc(num_prims + 1); 482 | auto refs_per_cell = mem.alloc(num_top_cells); 483 | log_dims = mem.alloc(num_top_cells + 1); 484 | count_new_refs<<>>(bboxes, new_ref_counts, num_prims); 485 | DEBUG_SYNC(); 486 | 487 | int num_new_refs = par.scan(new_ref_counts, num_prims + 1, start_emit); 488 | mem.free(new_ref_counts); 489 | 490 | auto new_ref_ids = mem.alloc(2 * num_new_refs); 491 | auto new_cell_ids = new_ref_ids + num_new_refs; 492 | emit_new_refs<<>>(bboxes, start_emit, new_ref_ids, new_cell_ids, num_prims); 493 | DEBUG_SYNC(); 494 | 495 | mem.free(start_emit); 496 | 497 | // Compute the number of references per cell 498 | mem.zero(refs_per_cell, num_top_cells); 499 | count_refs_per_cell<<>>(new_cell_ids, refs_per_cell, num_new_refs); 500 | DEBUG_SYNC(); 501 | 502 | // Compute an independent resolution in each of the top-level cells 503 | compute_log_dims<<>>(refs_per_cell, log_dims, snd_density, num_top_cells); 504 | DEBUG_SYNC(); 505 | mem.free(refs_per_cell); 506 | 507 | // Find the maximum sub-level resolution 508 | grid_shift = par.reduce(log_dims, num_top_cells, log_dims + num_top_cells, [] __device__ (int a, int b) { return max(a, b); }); 509 | auto cell_size = grid_bb.extents() / vec3(dims << grid_shift); 510 | 511 | set_global(hagrid::grid_shift, grid_shift); 512 | set_global(hagrid::cell_size, cell_size); 513 | 514 | // Emission of the new cells 515 | auto new_cells = mem.alloc(num_top_cells + 0); 516 | auto new_entries = mem.alloc(num_top_cells + 1); 517 | emit_top_cells<<>>(new_cells, num_top_cells); 518 | DEBUG_SYNC(); 519 | mem.zero(new_entries, num_top_cells + 1); 520 | 521 | // Filter out the references that do not intersect the cell they are in 522 | filter_refs<<>>(new_cell_ids, new_ref_ids, prims, new_cells, num_new_refs); 523 | 524 | levels.emplace_back(new_ref_ids, new_cell_ids, num_new_refs, num_new_refs, new_cells, new_entries, num_top_cells); 525 | } 526 | 527 | template 528 | bool build_iter(MemManager& mem, 529 | const Primitive* prims, int num_prims, 530 | const 
ivec3& dims, int* log_dims, 531 | std::vector& levels) { 532 | Parallel par(mem); 533 | 534 | int* cell_ids = levels.back().cell_ids; 535 | int* ref_ids = levels.back().ref_ids; 536 | Cell* cells = levels.back().cells; 537 | Entry* entries = levels.back().entries; 538 | 539 | int num_top_cells = dims.x * dims.y * dims.z; 540 | int num_refs = levels.back().num_refs; 541 | int num_cells = levels.back().num_cells; 542 | 543 | int cur_level = levels.size(); 544 | 545 | auto kept_flags = mem.alloc(num_refs + 1); 546 | 547 | // Find out which cell will be split based on whether it is empty or not and the maximum depth 548 | compute_dims<<>>(cell_ids, cells, log_dims, entries, num_refs); 549 | DEBUG_SYNC(); 550 | update_log_dims<<>>(log_dims, num_top_cells); 551 | DEBUG_SYNC(); 552 | mark_kept_refs<<>>(cell_ids, entries, kept_flags, num_refs); 553 | DEBUG_SYNC(); 554 | 555 | // Store the sub-cells starting index in the entries 556 | auto start_cell = mem.alloc(num_cells + 1); 557 | int num_new_cells = par.scan(par.transform(entries, [] __device__ (Entry e) { 558 | return e.log_dim == 0 ? 0 : 8; 559 | }), num_cells + 1, start_cell); 560 | update_entries<<>>(start_cell, entries, num_cells); 561 | DEBUG_SYNC(); 562 | 563 | mem.free(start_cell); 564 | 565 | // Partition the set of cells into the sets of those which will be split and those which won't 566 | auto tmp_ref_ids = mem.alloc(num_refs * 2); 567 | auto tmp_cell_ids = tmp_ref_ids + num_refs; 568 | int num_sel_refs = par.partition(ref_ids, tmp_ref_ids, num_refs, kept_flags); 569 | int num_sel_cells = par.partition(cell_ids, tmp_cell_ids, num_refs, kept_flags); 570 | assert(num_sel_refs == num_sel_cells); 571 | 572 | mem.free(kept_flags); 573 | 574 | std::swap(tmp_ref_ids, ref_ids); 575 | std::swap(tmp_cell_ids, cell_ids); 576 | mem.free(tmp_ref_ids); 577 | 578 | int num_kept = num_sel_refs; 579 | levels.back().ref_ids = ref_ids; 580 | levels.back().cell_ids = cell_ids; 581 | levels.back().num_kept = num_kept; 582 | 583 | if (num_new_cells == 0) { 584 | // Exit here because no new reference will be emitted 585 | mem.free(log_dims); 586 | return false; 587 | } 588 | 589 | int num_split = num_refs - num_kept; 590 | 591 | // Split the references 592 | auto split_masks = mem.alloc(num_split + 1); 593 | auto start_split = mem.alloc(num_split + 1); 594 | compute_split_masks<<>>(cell_ids + num_kept, ref_ids + num_kept, prims, cells, split_masks, num_split); 595 | DEBUG_SYNC(); 596 | 597 | int num_new_refs = par.scan(par.transform(split_masks, [] __device__ (int mask) { 598 | return __popc(mask); 599 | }), num_split + 1, start_split); 600 | assert(num_new_refs <= 8 * num_split); 601 | 602 | auto new_ref_ids = mem.alloc(num_new_refs * 2); 603 | auto new_cell_ids = new_ref_ids + num_new_refs; 604 | split_refs<<>>(cell_ids + num_kept, ref_ids + num_kept, entries, split_masks, start_split, new_cell_ids, new_ref_ids, num_split); 605 | DEBUG_SYNC(); 606 | 607 | mem.free(split_masks); 608 | mem.free(start_split); 609 | 610 | // Emission of the new cells 611 | auto new_cells = mem.alloc(num_new_cells + 0); 612 | auto new_entries = mem.alloc(num_new_cells + 1); 613 | emit_new_cells<<>>(entries, cells, new_cells, num_cells); 614 | DEBUG_SYNC(); 615 | mem.zero(new_entries, num_new_cells + 1); 616 | 617 | levels.emplace_back(new_ref_ids, new_cell_ids, num_new_refs, num_new_refs, new_cells, new_entries, num_new_cells); 618 | return true; 619 | } 620 | 621 | void concat_levels(MemManager& mem, std::vector& levels, Grid& grid) { 622 | Parallel par(mem); 623 | int 
num_levels = levels.size(); 624 | 625 | // Start with references 626 | int total_refs = 0; 627 | int total_cells = 0; 628 | for (auto& level : levels) { 629 | total_refs += level.num_kept; 630 | total_cells += level.num_cells; 631 | } 632 | 633 | // Copy primitive references as-is 634 | auto ref_ids = mem.alloc(total_refs); 635 | auto cell_ids = mem.alloc(total_refs); 636 | for (int i = 0, off = 0; i < num_levels; off += levels[i].num_kept, i++) { 637 | mem.copy(ref_ids + off, levels[i].ref_ids, levels[i].num_kept); 638 | } 639 | // Copy the cell indices with an offset 640 | for (int i = 0, off = 0, cell_off = 0; i < num_levels; off += levels[i].num_kept, cell_off += levels[i].num_cells, i++) { 641 | int num_kept = levels[i].num_kept; 642 | if (num_kept) { 643 | copy_refs<<>>(levels[i].cell_ids, cell_ids + off, cell_off, num_kept); 644 | DEBUG_SYNC(); 645 | } 646 | mem.free(levels[i].ref_ids); 647 | } 648 | 649 | // Mark the cells at the leaves of the structure as kept 650 | auto kept_cells = mem.alloc(total_cells + 1); 651 | for (int i = 0, cell_off = 0; i < num_levels; cell_off += levels[i].num_cells, i++) { 652 | int num_cells = levels[i].num_cells; 653 | mark_kept_cells<<>>(levels[i].entries, kept_cells + cell_off, num_cells); 654 | DEBUG_SYNC(); 655 | } 656 | 657 | // Compute the insertion position of each cell 658 | auto start_cell = mem.alloc(total_cells + 1); 659 | int new_total_cells = par.scan(kept_cells, total_cells + 1, start_cell); 660 | mem.free(kept_cells); 661 | 662 | // Allocate new cells, and copy only the cells that are kept 663 | auto cells = mem.alloc(new_total_cells); 664 | for (int i = 0, cell_off = 0; i < num_levels; cell_off += levels[i].num_cells, i++) { 665 | int num_cells = levels[i].num_cells; 666 | copy_cells<<>>(levels[i].cells, start_cell, cells, cell_off, num_cells); 667 | DEBUG_SYNC(); 668 | mem.free(levels[i].cells); 669 | } 670 | 671 | auto entries = mem.alloc(total_cells); 672 | for (int i = 0, off = 0; i < num_levels; off += levels[i].num_cells, i++) { 673 | int num_cells = levels[i].num_cells; 674 | int next_level_off = off + num_cells; 675 | copy_entries<<>>(levels[i].entries, start_cell, entries + off, off, next_level_off, num_cells); 676 | DEBUG_SYNC(); 677 | mem.free(levels[i].entries); 678 | } 679 | 680 | // Remap the cell indices in the references (which currently map to incorrect cells) 681 | remap_refs<<>>(cell_ids, start_cell, total_refs); 682 | DEBUG_SYNC(); 683 | 684 | mem.free(start_cell); 685 | 686 | // Sort the references by cell (re-use old slots whenever possible) 687 | auto tmp_ref_ids = mem.alloc(total_refs); 688 | auto tmp_cell_ids = mem.alloc(total_refs); 689 | auto new_ref_ids = tmp_ref_ids; 690 | auto new_cell_ids = tmp_cell_ids; 691 | par.sort_pairs(cell_ids, ref_ids, new_cell_ids, new_ref_ids, total_refs, ilog2(new_total_cells)); 692 | if (ref_ids != new_ref_ids) std::swap(ref_ids, tmp_ref_ids); 693 | if (cell_ids != new_cell_ids) std::swap(cell_ids, tmp_cell_ids); 694 | mem.free(tmp_ref_ids); 695 | mem.free(tmp_cell_ids); 696 | 697 | // Compute the ranges of references for each cell 698 | compute_cell_ranges<<>>(cell_ids, cells, total_refs); 699 | DEBUG_SYNC(); 700 | 701 | mem.free(cell_ids); 702 | 703 | grid.entries = entries; 704 | grid.ref_ids = ref_ids; 705 | grid.cells = cells; 706 | grid.shift = levels.size() - 1; 707 | grid.num_cells = new_total_cells; 708 | grid.num_entries = total_cells; 709 | grid.num_refs = total_refs; 710 | 711 | grid.offsets.resize(levels.size()); 712 | for (int i = 0, off = 0; i < levels.size(); 
i++) { 713 | off += levels[i].num_cells; 714 | grid.offsets[i] = off; 715 | } 716 | } 717 | 718 | template 719 | void build(MemManager& mem, const Primitive* prims, int num_prims, Grid& grid, float top_density, float snd_density) { 720 | Parallel par(mem); 721 | 722 | // Allocate a bounding box for each primitive + one for the global bounding box 723 | auto bboxes = mem.alloc(num_prims + 1); 724 | 725 | compute_bboxes<<>>(prims, bboxes, num_prims); 726 | auto grid_bb = par.reduce(bboxes, num_prims, bboxes + num_prims, 727 | [] __device__ (BBox a, const BBox& b) { return a.extend(b); }, BBox::empty()); 728 | auto dims = compute_grid_dims(grid_bb, num_prims, top_density); 729 | // Round to the next multiple of 2 on each dimension (in order to align the memory) 730 | dims.x = dims.x % 2 ? dims.x + 1 : dims.x; 731 | dims.y = dims.y % 2 ? dims.y + 1 : dims.y; 732 | dims.z = dims.z % 2 ? dims.z + 1 : dims.z; 733 | 734 | // Slightly enlarge the bounding box of the grid 735 | auto extents = grid_bb.extents(); 736 | grid_bb.min -= extents * 0.001f; 737 | grid_bb.max += extents * 0.001f; 738 | 739 | set_global(hagrid::grid_dims, dims); 740 | set_global(hagrid::grid_bbox, grid_bb); 741 | 742 | int* log_dims = nullptr; 743 | int grid_shift = 0; 744 | std::vector levels; 745 | 746 | // Build top level 747 | first_build_iter(mem, snd_density, prims, num_prims, bboxes, grid_bb, dims, log_dims, grid_shift, levels); 748 | 749 | mem.free(bboxes); 750 | 751 | int iter = 1; 752 | while (build_iter(mem, prims, num_prims, dims, log_dims, levels)) iter++; 753 | 754 | concat_levels(mem, levels, grid); 755 | grid.small_cells = nullptr; 756 | grid.dims = dims; 757 | grid.bbox = grid_bb; 758 | } 759 | 760 | void build_grid(MemManager& mem, const Tri* tris, int num_tris, Grid& grid, float top_density, float snd_density) { build(mem, tris, num_tris, grid, top_density, snd_density); } 761 | 762 | } // namespace hagrid 763 | -------------------------------------------------------------------------------- /src/build.h: -------------------------------------------------------------------------------- 1 | #ifndef BUILD_H 2 | #define BUILD_H 3 | 4 | #include "mem_manager.h" 5 | #include "prims.h" 6 | #include "grid.h" 7 | 8 | namespace hagrid { 9 | 10 | /// Builds an initial irregular grid. 11 | /// The building process starts by creating a uniform grid of density 'top_density', 12 | /// and then proceeds to compute an independent resolution in each of its cells 13 | /// (using the second-level density 'snd_density'). 14 | /// In each cell, an octree depth is computed from these independent resolutions 15 | /// and the primitive references are split until every cell has reached its maximum depth. 16 | /// The voxel map follows the octree structure. 17 | void build_grid(MemManager& mem, const Tri* tris, int num_tris, Grid& grid, float top_density, float snd_density); 18 | 19 | /// Performs the neighbor merging optimization (merging cells according to the SAH). 20 | void merge_grid(MemManager& mem, Grid& grid, float alpha); 21 | 22 | /// Flattens the voxel map to speed up queries. 23 | /// Once this optimization is performed, the voxel map no longer follows an octree structure. 24 | /// Each inner node of the voxel map now may have up to 1 << (3 * (1 << Entry::LOG_DIM_BITS - 1)) children. 25 | void flatten_grid(MemManager& mem, Grid& grid); 26 | 27 | /// Performs the cell expansion optimization (expands cells over neighbors that share the same set of primitives). 
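/// Each iteration runs one expansion step along the X, Y and Z axes in turn (see expansion_iter in expand.cu).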
28 | void expand_grid(MemManager& mem, Grid& grid, const Tri* tris, int iters); 29 | 30 | /// Tries to compress the grid by using sentinels in the reference array and using 16-bit cell dimensions. Returns true on success, otherwise false. 31 | bool compress_grid(MemManager& mem, Grid& grid); 32 | 33 | } // namespace hagrid 34 | 35 | #endif // BUILD_H 36 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef __NVCC__ 9 | #include 10 | #endif 11 | 12 | namespace hagrid { 13 | 14 | /// Returns the number of milliseconds elapsed on the device for the given function 15 | HOST float profile(std::function); 16 | 17 | /// Rounds the division by an integer so that round_div(i, j) * j > i 18 | HOST DEVICE inline int round_div(int i, int j) { 19 | return i / j + (i % j ? 1 : 0); 20 | } 21 | 22 | /// Computes the minimum between two values 23 | template HOST DEVICE T min(T a, T b) { return a < b ? a : b; } 24 | /// Computes the maximum between two values 25 | template HOST DEVICE T max(T a, T b) { return a > b ? a : b; } 26 | /// Clamps the first value in the range defined by the last two arguments 27 | template HOST DEVICE T clamp(T a, T b, T c) { return min(c, max(b, a)); } 28 | /// Swaps the contents of two references 29 | template HOST DEVICE void swap(T& a, T& b) { auto tmp = a; a = b; b = tmp; } 30 | 31 | /// Reinterprets a values as unsigned int 32 | template 33 | HOST DEVICE U as(T t) { 34 | union { T t; U u; } v; 35 | v.t = t; 36 | return v.u; 37 | } 38 | 39 | /// Returns x with the sign of x * y 40 | HOST DEVICE inline float safe_rcp(float x) { 41 | return x != 0 ? 
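// Exact reciprocal for non-zero x; otherwise an infinity carrying the sign of x
// (0x7f800000 is the bit pattern of +infinity).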
1.0f / x : copysign(as(0x7f800000u), x); 42 | } 43 | 44 | /// Returns x with the sign of x * y 45 | HOST DEVICE inline float prodsign(float x, float y) { 46 | return as(as(x) ^ (as(y) & 0x80000000)); 47 | } 48 | 49 | /// Converts a float to an ordered float 50 | HOST DEVICE inline uint32_t float_to_ordered(float f) { 51 | auto u = as(f); 52 | auto mask = -(int)(u >> 31u) | 0x80000000u; 53 | return u ^ mask; 54 | } 55 | 56 | /// Converts back an ordered integer to float 57 | HOST DEVICE inline float ordered_to_float(uint32_t u) { 58 | auto mask = ((u >> 31u) - 1u) | 0x80000000u; 59 | return as(u ^ mask); 60 | } 61 | 62 | /// Computes the cubic root of an integer 63 | HOST DEVICE inline int icbrt(int x) { 64 | unsigned y = 0; 65 | for (int s = 30; s >= 0; s = s - 3) { 66 | y = 2 * y; 67 | const unsigned b = (3 * y * (y + 1) + 1) << s; 68 | if (x >= b) { 69 | x = x - b; 70 | y = y + 1; 71 | } 72 | } 73 | return y; 74 | } 75 | 76 | template struct Log2 { enum { Value = Log2::Value }; }; 77 | template struct Log2<1, I> { enum { Value = I }; }; 78 | 79 | /// Computes the logarithm in base 2 of an integer such that (1 << log2(x)) >= x 80 | template 81 | HOST DEVICE int ilog2(T t) { 82 | auto a = 0; 83 | auto b = sizeof(T) * 8; 84 | auto all = T(-1); 85 | #pragma unroll 86 | for (int i = 0; i < Log2::Value; i++) { 87 | auto m = (a + b) / 2; 88 | T mask = all << T(m); 89 | if (t & mask) a = m + 1; 90 | else b = m; 91 | } 92 | return a; 93 | } 94 | 95 | #ifdef __NVCC__ 96 | #ifndef NDEBUG 97 | #define DEBUG_SYNC() CHECK_CUDA_CALL(cudaDeviceSynchronize()) 98 | #else 99 | #define DEBUG_SYNC() do{} while(0) 100 | #endif 101 | #define CHECK_CUDA_CALL(x) check_cuda_call(x, __FILE__, __LINE__) 102 | 103 | __host__ static void check_cuda_call(cudaError_t err, const char* file, int line) { 104 | if (err != cudaSuccess) { 105 | std::cerr << file << "(" << line << "): " << cudaGetErrorString(err) << std::endl; 106 | abort(); 107 | } 108 | } 109 | 110 | template 111 | __host__ void set_global(T& symbol, const T& val) { 112 | size_t size; 113 | CHECK_CUDA_CALL(cudaGetSymbolSize(&size, symbol)); 114 | CHECK_CUDA_CALL(cudaMemcpyToSymbol(symbol, &val, size)); 115 | } 116 | 117 | template 118 | __host__ T get_global(const T& symbol) { 119 | size_t size; 120 | T val; 121 | CHECK_CUDA_CALL(cudaGetSymbolSize(&size, symbol)); 122 | CHECK_CUDA_CALL(cudaMemcpyFromSymbol(&val, symbol, size)); 123 | return val; 124 | } 125 | #endif // __NVCC__ 126 | 127 | } // namespace hagrid 128 | 129 | #endif 130 | -------------------------------------------------------------------------------- /src/compress.cu: -------------------------------------------------------------------------------- 1 | #include "parallel.cuh" 2 | #include "build.h" 3 | 4 | namespace hagrid { 5 | 6 | __global__ void count_sentinel_refs(const Cell* cells, int* ref_counts, int num_cells) { 7 | int id = threadIdx.x + blockDim.x * blockIdx.x; 8 | if (id >= num_cells) return; 9 | 10 | auto cell = load_cell(cells + id); 11 | auto count = cell.end - cell.begin; 12 | ref_counts[id] = count > 0 ? 
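// Non-empty cells get one extra slot for the -1 sentinel that terminates
// their reference list (written by emit_small_cells below).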
count + 1 : 0; 13 | } 14 | 15 | __global__ void emit_small_cells(const Cell* cells, 16 | SmallCell* small_cells, 17 | int* __restrict__ refs, 18 | int* __restrict__ ref_scan, 19 | int* __restrict__ sentinel_refs, 20 | int num_cells) { 21 | int id = threadIdx.x + blockDim.x * blockIdx.x; 22 | if (id >= num_cells) return; 23 | 24 | auto cell = load_cell(cells + id); 25 | int first = ref_scan[id]; 26 | int count = cell.end - cell.begin; 27 | 28 | SmallCell small_cell(usvec3(cell.min), usvec3(cell.max), count > 0 ? first : -1); 29 | store_cell(small_cells + id, small_cell); 30 | 31 | if (count > 0) { 32 | for (int i = 0; i < count; i++) 33 | sentinel_refs[first + i] = refs[cell.begin + i]; 34 | sentinel_refs[first + count] = -1; 35 | } 36 | } 37 | 38 | bool compress_grid(MemManager& mem, Grid& grid) { 39 | auto dims = grid.dims << grid.shift; 40 | // Compression cannot work if the dimensions cannot fit into 16-bit indices 41 | if (dims.x >= (1 << 16) || 42 | dims.y >= (1 << 16) || 43 | dims.z >= (1 << 16)) 44 | return false; 45 | 46 | Parallel par(mem); 47 | auto ref_counts = mem.alloc(grid.num_cells + 1); 48 | auto ref_scan = mem.alloc(grid.num_cells + 1); 49 | auto small_cells = mem.alloc(grid.num_cells); 50 | count_sentinel_refs<<>>(grid.cells, ref_counts, grid.num_cells); 51 | auto num_sentinel_refs = par.scan(ref_counts, grid.num_cells + 1, ref_scan); 52 | auto sentinel_refs = mem.alloc(num_sentinel_refs); 53 | emit_small_cells<<>>(grid.cells, small_cells, grid.ref_ids, ref_scan, sentinel_refs, grid.num_cells); 54 | grid.small_cells = small_cells; 55 | mem.free(grid.cells); 56 | mem.free(grid.ref_ids); 57 | mem.free(ref_counts); 58 | mem.free(ref_scan); 59 | grid.cells = nullptr; 60 | grid.ref_ids = sentinel_refs; 61 | grid.num_refs = num_sentinel_refs; 62 | return true; 63 | } 64 | 65 | } // namespace hagrid 66 | -------------------------------------------------------------------------------- /src/expand.cu: -------------------------------------------------------------------------------- 1 | #include "build.h" 2 | 3 | namespace hagrid { 4 | 5 | static __constant__ ivec3 grid_dims; 6 | static __constant__ vec3 grid_min; 7 | static __constant__ vec3 cell_size; 8 | static __constant__ vec3 grid_inv; 9 | static __constant__ int grid_shift; 10 | 11 | /// Returns true if an overlap with a neighboring cell is possible 12 | template 13 | __device__ bool overlap_possible(const Cell& cell) { 14 | if (dir) 15 | return get(cell.max) < get(grid_dims); 16 | else 17 | return get(cell.min) > 0; 18 | } 19 | 20 | /// Determines if the given range of references is a subset of the other 21 | __device__ __forceinline__ bool is_subset(const int* __restrict__ p0, int c0, const int* __restrict__ p1, int c1) { 22 | if (c1 > c0) return false; 23 | if (c1 == 0) return true; 24 | 25 | int i = 0, j = 0; 26 | 27 | do { 28 | const int a = p0[i]; 29 | const int b = p1[j]; 30 | if (b < a) return false; 31 | j += (a == b); 32 | i++; 33 | } while (i < c0 & j < c1); 34 | 35 | return j == c1; 36 | } 37 | 38 | /// Computes the amount of overlap possible for a cell and a given primitive 39 | template 40 | __device__ int compute_overlap(const Primitive& prim, const Cell& cell, const BBox& cell_bbox, int d) { 41 | static constexpr int axis1 = (axis + 1) % 3; 42 | static constexpr int axis2 = (axis + 2) % 3; 43 | auto prim_bbox = prim.bbox(); 44 | 45 | if (get(prim_bbox.min) <= get(cell_bbox.max) && 46 | get(prim_bbox.max) >= get(cell_bbox.min) && 47 | get(prim_bbox.min) <= get(cell_bbox.max) && 48 | get(prim_bbox.max) >= 
get(cell_bbox.min)) { 49 | // Approximation: use the original bounding box, not the clipped one 50 | int prim_d = ((dir ? get(prim_bbox.min) : get(prim_bbox.max)) - get(grid_min)) * get(grid_inv); 51 | d = dir 52 | ? min(d, prim_d - get(cell.max)) 53 | : max(d, prim_d - get(cell.min) + 1); 54 | d = dir ? max(d, 0) : min(d, 0); 55 | } 56 | return d; 57 | } 58 | 59 | /// Finds the maximum overlap possible for one cell 60 | template 61 | __device__ int find_overlap(const Entry* __restrict__ entries, 62 | const int* __restrict__ refs, 63 | const Primitive* __restrict__ prims, 64 | const Cell* cells, 65 | const Cell& cell, 66 | bool& continue_overlap) { 67 | constexpr int axis1 = (axis + 1) % 3; 68 | constexpr int axis2 = (axis + 2) % 3; 69 | 70 | if (!overlap_possible(cell)) return 0; 71 | 72 | int d = dir ? get(grid_dims) : -get(grid_dims); 73 | int k1, k2 = get(grid_dims); 74 | int i = get(cell.min); 75 | int j = get(cell.min); 76 | int max_d = d; 77 | while (true) { 78 | ivec3 next_cell; 79 | if (axis == 0) next_cell = ivec3(dir ? cell.max.x : cell.min.x - 1, i, j); 80 | if (axis == 1) next_cell = ivec3(j, dir ? cell.max.y : cell.min.y - 1, i); 81 | if (axis == 2) next_cell = ivec3(i, j, dir ? cell.max.z : cell.min.z - 1); 82 | auto entry = lookup_entry(entries, grid_shift, grid_dims >> grid_shift, next_cell); 83 | auto next = load_cell(cells + entry); 84 | 85 | max_d = dir 86 | ? min(max_d, get(next.max) - get(cell.max)) 87 | : max(max_d, get(next.min) - get(cell.min)); 88 | d = dir ? min(d, max_d) : max(d, max_d); 89 | 90 | if (subset_only) { 91 | if (!is_subset(refs + cell.begin, cell.end - cell.begin, 92 | refs + next.begin, next.end - next.begin)) { 93 | d = 0; 94 | break; 95 | } 96 | } else { 97 | if (next.begin < next.end) { 98 | auto cell_bbox = BBox(grid_min + cell_size * vec3(cell.min), 99 | grid_min + cell_size * vec3(cell.max)); 100 | 101 | int p1 = cell.begin, p2 = next.begin; 102 | int ref2 = refs[p2]; 103 | while (true) { 104 | // Skip references that are present in the current cell 105 | while (p1 < cell.end) { 106 | int ref1 = refs[p1]; 107 | 108 | if (ref1 > ref2) break; 109 | if (ref1 == ref2) { 110 | if (++p2 >= next.end) break; 111 | ref2 = refs[p2]; 112 | } 113 | 114 | p1++; 115 | } 116 | 117 | if (p2 >= next.end) break; 118 | 119 | // Process references that are only present in the next cell 120 | d = compute_overlap(load_prim(prims + ref2), cell, cell_bbox, d); 121 | if (d == 0 || ++p2 >= next.end) break; 122 | ref2 = refs[p2]; 123 | } 124 | } 125 | 126 | if (d == 0) break; 127 | } 128 | 129 | k1 = get(next.max) - i; 130 | k2 = min(k2, get(next.max) - j); 131 | 132 | i += k1; 133 | if (i >= get(cell.max)) { 134 | i = get(cell.min); 135 | j += k2; 136 | k2 = get(grid_dims); 137 | if (j >= get(cell.max)) break; 138 | } 139 | } 140 | 141 | continue_overlap |= d == max_d; 142 | return d; 143 | } 144 | 145 | template 146 | __global__ void overlap_step(const Entry* __restrict__ entries, 147 | const int* __restrict__ refs, 148 | const Primitive* __restrict__ prims, 149 | const Cell* __restrict__ cells, 150 | Cell* __restrict__ new_cells, 151 | int* __restrict__ cell_flags, 152 | int num_cells) { 153 | int id = threadIdx.x + blockDim.x * blockIdx.x; 154 | if (id >= num_cells || (cell_flags[id] & (1 << axis)) == 0) 155 | return; 156 | 157 | auto cell = load_cell(cells + id); 158 | bool flag = false; 159 | constexpr bool subset_only = true; 160 | auto ov1 = find_overlap(entries, refs, prims, cells, cell, flag); 161 | auto ov2 = find_overlap(entries, refs, prims, cells, cell, 
flag); 162 | 163 | if (axis == 0) { 164 | cell.min.x += ov1; 165 | cell.max.x += ov2; 166 | } 167 | 168 | if (axis == 1) { 169 | cell.min.y += ov1; 170 | cell.max.y += ov2; 171 | } 172 | 173 | if (axis == 2) { 174 | cell.min.z += ov1; 175 | cell.max.z += ov2; 176 | } 177 | 178 | // If the cell has not been expanded, we will not process it next time 179 | cell_flags[id] = (flag ? 1 << axis : 0) | (cell_flags[id] & ~(1 << axis)); 180 | 181 | store_cell(new_cells + id, cell); 182 | } 183 | 184 | template 185 | void expansion_iter(Grid& grid, const Primitive* prims, Cell*& new_cells, int* cell_flags) { 186 | overlap_step<0><<>>(grid.entries, grid.ref_ids, prims, grid.cells, new_cells, cell_flags, grid.num_cells); 187 | std::swap(new_cells, grid.cells); 188 | DEBUG_SYNC(); 189 | 190 | overlap_step<1><<>>(grid.entries, grid.ref_ids, prims, grid.cells, new_cells, cell_flags, grid.num_cells); 191 | std::swap(new_cells, grid.cells); 192 | DEBUG_SYNC(); 193 | 194 | overlap_step<2><<>>(grid.entries, grid.ref_ids, prims, grid.cells, new_cells, cell_flags, grid.num_cells); 195 | std::swap(new_cells, grid.cells); 196 | DEBUG_SYNC(); 197 | } 198 | 199 | template 200 | void expand(MemManager& mem, Grid& grid, const Primitive* prims, int iters) { 201 | if (iters == 0) return; 202 | 203 | auto new_cells = mem.alloc(grid.num_cells); 204 | auto cell_flags = mem.alloc(grid.num_cells); 205 | 206 | mem.one(cell_flags, grid.num_cells); 207 | auto extents = grid.bbox.extents(); 208 | auto dims = grid.dims << grid.shift; 209 | auto cell_size = extents / vec3(dims); 210 | auto grid_inv = vec3(dims) / extents; 211 | 212 | set_global(hagrid::grid_dims, dims); 213 | set_global(hagrid::grid_min, grid.bbox.min); 214 | set_global(hagrid::cell_size, cell_size); 215 | set_global(hagrid::grid_inv, grid_inv); 216 | set_global(hagrid::grid_shift, grid.shift); 217 | 218 | for (int i = 0; i < iters; i++) 219 | expansion_iter(grid, prims, new_cells, cell_flags); 220 | 221 | mem.free(cell_flags); 222 | mem.free(new_cells); 223 | } 224 | 225 | void expand_grid(MemManager& mem, Grid& grid, const Tri* tris, int iters) { expand(mem, grid, tris, iters); } 226 | 227 | } // namespace hagrid 228 | -------------------------------------------------------------------------------- /src/flatten.cu: -------------------------------------------------------------------------------- 1 | #include "build.h" 2 | #include "parallel.cuh" 3 | 4 | namespace hagrid { 5 | 6 | static constexpr int flat_levels = (1 << Entry::LOG_DIM_BITS) - 1; 7 | 8 | /// Collapses sub-entries that map to the same cell/sub-sub-entry 9 | __global__ void collapse_entries(Entry* entries, int first, int num_entries) { 10 | int id = threadIdx.x + blockDim.x * blockIdx.x; 11 | if (id >= num_entries) return; 12 | 13 | auto entry = entries[first + id]; 14 | if (entry.log_dim) { 15 | auto ptr = (int4*)(entries + entry.begin); 16 | auto ptr0 = ptr[0]; 17 | if (ptr0.x == ptr0.y && 18 | ptr0.x == ptr0.z && 19 | ptr0.x == ptr0.w) { 20 | auto ptr1 = ptr[1]; 21 | if (ptr0.x == ptr1.x && 22 | ptr1.x == ptr1.y && 23 | ptr1.x == ptr1.z && 24 | ptr1.x == ptr1.w) { 25 | entries[first + id] = as(ptr0); 26 | } 27 | } 28 | } 29 | } 30 | 31 | /// Computes the depth of each entry 32 | __global__ void compute_depths(Entry* entries, int* depths, int first, int num_entries) { 33 | int id = threadIdx.x + blockDim.x * blockIdx.x; 34 | if (id >= num_entries) return; 35 | 36 | auto entry = entries[first + id]; 37 | int d = 0; 38 | if (entry.log_dim) { 39 | auto ptr = (const int4*)(depths + entry.begin); 40 
| auto d0 = ptr[0]; 41 | auto d1 = ptr[1]; 42 | d = 1 + max(max(max(d0.x, d1.x), max(d0.y, d1.y)), 43 | max(max(d0.z, d1.z), max(d0.w, d1.w))); 44 | } 45 | depths[first + id] = d; 46 | } 47 | 48 | /// Copies the top-level entries and change their depth & start index 49 | __global__ void copy_top_level(const Entry* __restrict__ entries, 50 | const int* __restrict__ start_entries, 51 | const int* __restrict__ depths, 52 | Entry* __restrict__ new_entries, 53 | int num_entries) { 54 | int id = threadIdx.x + blockDim.x * blockIdx.x; 55 | if (id >= num_entries) return; 56 | 57 | auto entry = entries[id]; 58 | if (entry.log_dim) { 59 | entry = make_entry(min(depths[id], flat_levels), num_entries + start_entries[id]); 60 | } 61 | new_entries[id] = entry; 62 | } 63 | 64 | /// Flattens several voxel map levels into one larger level 65 | __global__ void flatten_level(const Entry* __restrict__ entries, 66 | const int* __restrict__ start_entries, 67 | const int* __restrict__ depths, 68 | Entry* __restrict__ new_entries, 69 | int first_entry, 70 | int offset, int next_offset, 71 | int num_entries) { 72 | int id = blockIdx.x; 73 | 74 | int d = min(depths[id + first_entry], flat_levels); 75 | int num_sub_entries = d == 0 ? 0 : 1 << (3 * d); 76 | if (num_sub_entries <= 0) return; 77 | 78 | int start = offset + start_entries[id + first_entry]; 79 | auto root = entries[id + first_entry]; 80 | 81 | for (int i = threadIdx.x; i < num_sub_entries; i += blockDim.x) { 82 | // Treat i as a morton code 83 | int cur_d = d; 84 | int x = 0, y = 0, z = 0; 85 | int next_id = id; 86 | auto entry = root; 87 | while (cur_d > 0) { 88 | cur_d--; 89 | 90 | int pos = i >> (cur_d * 3); 91 | x += (pos & 1) ? (1 << cur_d) : 0; 92 | y += (pos & 2) ? (1 << cur_d) : 0; 93 | z += (pos & 4) ? (1 << cur_d) : 0; 94 | 95 | if (entry.log_dim) { 96 | next_id = entry.begin + (pos & 7); 97 | entry = entries[next_id]; 98 | } 99 | } 100 | 101 | if (entry.log_dim) { 102 | entry = make_entry(min(depths[next_id], flat_levels), next_offset + start_entries[next_id]); 103 | } 104 | 105 | new_entries[start + x + ((y + (z << d)) << d)] = entry; 106 | } 107 | } 108 | 109 | void flatten_grid(MemManager& mem, Grid& grid) { 110 | Parallel par(mem); 111 | 112 | auto depths = mem.alloc(grid.num_entries + 1); 113 | 114 | // Flatten the voxel map 115 | for (int i = grid.shift; i >= 0; i--) { 116 | int first = i > 0 ? grid.offsets[i - 1] : 0; 117 | int last = grid.offsets[i]; 118 | int num_entries = last - first; 119 | // Collapse voxel map entries when possible 120 | collapse_entries<<>>(grid.entries, first, num_entries); 121 | DEBUG_SYNC(); 122 | compute_depths<<>>(grid.entries, depths, first, num_entries); 123 | DEBUG_SYNC(); 124 | } 125 | 126 | // Compute the insertion position of each flattened level, and the total new number of entries 127 | auto start_entries = mem.alloc(grid.num_entries + 1); 128 | std::vector level_offsets(grid.shift); 129 | int total_entries = grid.offsets[0]; 130 | for (int i = 0; i < grid.shift; i += flat_levels) { 131 | int first = i > 0 ? grid.offsets[i - 1] : 0; 132 | int last = grid.offsets[i]; 133 | int num_entries = last - first; 134 | 135 | // CUDA 8 bug: decltype(f(...)) is considered as a call to f (which forces to use __host__ here) 136 | int num_new_entries = par.scan(par.transform(depths + first, [] __host__ __device__ (int d) { 137 | return d > 0 ? 
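// A voxel-map subtree of depth d flattens into 8^d = 1 << (3 * d) sub-entries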
1 << (min(d, flat_levels) * 3) : 0; 138 | }), num_entries + 1, start_entries + first); 139 | level_offsets[i] = total_entries; 140 | total_entries += num_new_entries; 141 | } 142 | 143 | // Flatten the voxel map, by concatenating consecutive several levels together 144 | auto new_entries = mem.alloc(total_entries); 145 | std::vector new_offsets; 146 | 147 | copy_top_level<<>>(grid.entries, start_entries, depths, new_entries, grid.offsets[0]); 148 | for (int i = 0; i < grid.shift; i += flat_levels) { 149 | int first = i > 0 ? grid.offsets[i - 1] : 0; 150 | int last = grid.offsets[i]; 151 | int num_entries = last - first; 152 | 153 | int next_offset = i + flat_levels < grid.shift ? level_offsets[i + flat_levels] : 0; 154 | flatten_level<<>>(grid.entries, 155 | start_entries, 156 | depths, 157 | new_entries, 158 | first, 159 | level_offsets[i], 160 | next_offset, 161 | num_entries); 162 | DEBUG_SYNC(); 163 | 164 | new_offsets.emplace_back(level_offsets[i]); 165 | } 166 | new_offsets.emplace_back(total_entries); 167 | 168 | std::swap(new_entries, grid.entries); 169 | std::swap(new_offsets, grid.offsets); 170 | mem.free(new_entries); 171 | grid.num_entries = total_entries; 172 | 173 | mem.free(depths); 174 | mem.free(start_entries); 175 | } 176 | 177 | } // namespace hagrid 178 | -------------------------------------------------------------------------------- /src/grid.h: -------------------------------------------------------------------------------- 1 | #ifndef GRID_H 2 | #define GRID_H 3 | 4 | #include 5 | 6 | #include "vec.h" 7 | #include "bbox.h" 8 | 9 | namespace hagrid { 10 | 11 | /// Voxel map entry 12 | struct Entry { 13 | enum { 14 | LOG_DIM_BITS = 2, 15 | BEGIN_BITS = 32 - LOG_DIM_BITS 16 | }; 17 | 18 | uint32_t log_dim : LOG_DIM_BITS; ///< Logarithm of the dimensions of the entry (0 for leaves) 19 | uint32_t begin : BEGIN_BITS; ///< Next entry index (cell index for leaves) 20 | }; 21 | 22 | /// Cell of the irregular grid 23 | struct Cell { 24 | ivec3 min; ///< Minimum bounding box coordinate 25 | int begin; ///< Index of the first reference 26 | ivec3 max; ///< Maximum bounding box coordinate 27 | int end; ///< Past-the-end reference index 28 | 29 | HOST DEVICE Cell() {} 30 | HOST DEVICE Cell(const ivec3& min, int begin, const ivec3& max, int end) 31 | : min(min), begin(begin), max(max), end(end) 32 | {} 33 | }; 34 | 35 | /// Compressed irregular grid cell 36 | struct SmallCell { 37 | usvec3 min; ///< Minimum bounding box coordinate 38 | usvec3 max; ///< Maximum bounding box coordinate 39 | int begin; ///< Index of the first reference 40 | 41 | HOST DEVICE SmallCell() {} 42 | HOST DEVICE SmallCell(const usvec3& min, const usvec3& max, int begin) 43 | : min(min), max(max), begin(begin) 44 | {} 45 | }; 46 | 47 | /// Structure holding an irregular grid 48 | struct Grid { 49 | Entry* entries; ///< Voxel map, stored as a contiguous array 50 | int* ref_ids; ///< Array of primitive references 51 | Cell* cells; ///< Cells of the structure (nullptr if compressed) 52 | 53 | SmallCell* small_cells; ///< Compressed cells (nullptr if not compressed) 54 | 55 | BBox bbox; ///< Bounding box of the scene 56 | ivec3 dims; ///< Top-level dimensions 57 | int num_cells; ///< Number of cells 58 | int num_entries; ///< Number of elements in the voxel map 59 | int num_refs; ///< Number of primitive references 60 | int shift; ///< Amount of bits to shift to get from the deepest level to the top-level 61 | std::vector offsets; ///< Offset to each level of the voxel map octree 62 | }; 63 | 64 | /// A 3D 
integer range 65 | struct Range { 66 | int lx, ly, lz; 67 | int hx, hy, hz; 68 | HOST DEVICE Range() {} 69 | HOST DEVICE Range(int lx, int ly, int lz, 70 | int hx, int hy, int hz) 71 | : lx(lx), ly(ly), lz(lz) 72 | , hx(hx), hy(hy), hz(hz) 73 | {} 74 | HOST DEVICE int size() const { return (hx - lx + 1) * (hy - ly + 1) * (hz - lz + 1) ; } 75 | }; 76 | 77 | /// Returns a voxel map entry with the given dimension and starting index 78 | HOST DEVICE inline Entry make_entry(uint32_t log_dim, uint32_t begin) { 79 | Entry e { .log_dim = log_dim, .begin = begin }; 80 | return e; 81 | } 82 | 83 | /// Computes the range of cells that intersect the given box 84 | HOST DEVICE inline Range compute_range(const ivec3& dims, const BBox& grid_bb, const BBox& obj_bb) { 85 | auto inv = vec3(dims) / grid_bb.extents(); 86 | int lx = max(int((obj_bb.min.x - grid_bb.min.x) * inv.x), 0); 87 | int ly = max(int((obj_bb.min.y - grid_bb.min.y) * inv.y), 0); 88 | int lz = max(int((obj_bb.min.z - grid_bb.min.z) * inv.z), 0); 89 | int hx = min(int((obj_bb.max.x - grid_bb.min.x) * inv.x), dims.x - 1); 90 | int hy = min(int((obj_bb.max.y - grid_bb.min.y) * inv.y), dims.y - 1); 91 | int hz = min(int((obj_bb.max.z - grid_bb.min.z) * inv.z), dims.z - 1); 92 | return Range(lx, ly, lz, hx, hy, hz); 93 | } 94 | 95 | /// Computes grid dimensions based on the formula by Cleary et al. 96 | HOST DEVICE inline ivec3 compute_grid_dims(const BBox& bb, int num_prims, float density) { 97 | const vec3 extents = bb.extents(); 98 | const float volume = extents.x * extents.y * extents.z; 99 | const float ratio = cbrtf(density * num_prims / volume); 100 | return max(ivec3(1), ivec3(extents.x * ratio, extents.y * ratio, extents.z * ratio)); 101 | } 102 | 103 | HOST DEVICE inline uint32_t lookup_entry(const Entry* entries, int shift, const ivec3& dims, const ivec3& voxel) { 104 | auto entry = entries[(voxel.x >> shift) + dims.x * ((voxel.y >> shift) + dims.y * (voxel.z >> shift))]; 105 | auto log_dim = entry.log_dim, d = log_dim; 106 | while (log_dim) { 107 | auto begin = entry.begin; 108 | auto mask = (1 << log_dim) - 1; 109 | 110 | auto k = (voxel >> int(shift - d)) & mask; 111 | entry = entries[begin + k.x + ((k.y + (k.z << log_dim)) << log_dim)]; 112 | log_dim = entry.log_dim; 113 | d += log_dim; 114 | } 115 | return entry.begin; 116 | } 117 | 118 | template 119 | HOST DEVICE int foreach_ref(Cell cell, const int* ref_ids, F f) { 120 | int cur = cell.begin, ref = cur < cell.end ? ref_ids[cur++] : -1; 121 | while (ref >= 0) { 122 | // Preload the next reference 123 | auto next = cur < cell.end ? ref_ids[cur++] : -1; 124 | f(ref); 125 | ref = next; 126 | } 127 | return cell.end - cell.begin; 128 | } 129 | 130 | template 131 | HOST DEVICE int foreach_ref(SmallCell small_cell, const int* ref_ids, F f) { 132 | auto cur = small_cell.begin; 133 | auto ref = cur >= 0 ? 
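// begin < 0 marks an empty compressed cell; otherwise the reference list ends with a -1 sentinel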
ref_ids[cur++] : -1; 134 | while (ref >= 0) { 135 | auto next = ref_ids[cur++]; 136 | f(ref); 137 | ref = next; 138 | } 139 | return cur - small_cell.begin; 140 | } 141 | 142 | #ifdef __NVCC__ 143 | __device__ __forceinline__ Cell load_cell(const Cell* cell_ptr) { 144 | const int4* ptr = (const int4*)cell_ptr; 145 | auto cell0 = ptr[0]; 146 | auto cell1 = ptr[1]; 147 | return Cell(ivec3(cell0.x, cell0.y, cell0.z), cell0.w, 148 | ivec3(cell1.x, cell1.y, cell1.z), cell1.w); 149 | } 150 | 151 | __device__ __forceinline__ ivec3 load_cell_min(const Cell* cell_ptr) { 152 | auto cell0 = ((const int4*)cell_ptr)[0]; 153 | return ivec3(cell0.x, cell0.y, cell0.z); 154 | } 155 | 156 | __device__ __forceinline__ void store_cell(Cell* cell_ptr, const Cell& cell) { 157 | int4* ptr = (int4*)cell_ptr; 158 | ptr[0] = make_int4(cell.min.x, cell.min.y, cell.min.z, cell.begin); 159 | ptr[1] = make_int4(cell.max.x, cell.max.y, cell.max.z, cell.end); 160 | } 161 | 162 | __device__ __forceinline__ SmallCell load_cell(const SmallCell* cell_ptr) { 163 | const uint4* ptr = (const uint4*)cell_ptr; 164 | auto cell = *ptr; 165 | return SmallCell(usvec3(cell.x, cell.x >> 16, cell.y), 166 | usvec3(cell.y >> 16, cell.z, cell.z >> 16), 167 | cell.w); 168 | } 169 | 170 | __device__ __forceinline__ void store_cell(const SmallCell* cell_ptr, const SmallCell& cell) { 171 | uint4* ptr = (uint4*)cell_ptr; 172 | *ptr = make_uint4(cell.min.x | ((uint)(cell.min.y) << 16), 173 | cell.min.z | ((uint)(cell.max.x) << 16), 174 | cell.max.y | ((uint)(cell.max.z) << 16), 175 | (uint)cell.begin); 176 | } 177 | #endif // __NVCC__ 178 | 179 | } // namespace hagrid 180 | 181 | #endif // GRID_H 182 | -------------------------------------------------------------------------------- /src/load_obj.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "load_obj.h" 7 | 8 | namespace hagrid { 9 | 10 | inline void error() { 11 | std::cerr << std::endl; 12 | } 13 | 14 | template 15 | inline void error(T t, Args... 
args) { 16 | #ifndef NDEBUG 17 | std::cerr << t; 18 | error(args...); 19 | #endif 20 | } 21 | 22 | inline void remove_eol(char* ptr) { 23 | int i = 0; 24 | while (ptr[i]) i++; 25 | i--; 26 | while (i > 0 && std::isspace(ptr[i])) { 27 | ptr[i] = '\0'; 28 | i--; 29 | } 30 | } 31 | 32 | inline char* strip_text(char* ptr) { 33 | while (*ptr && !std::isspace(*ptr)) { ptr++; } 34 | return ptr; 35 | } 36 | 37 | inline char* strip_spaces(char* ptr) { 38 | while (std::isspace(*ptr)) { ptr++; } 39 | return ptr; 40 | } 41 | 42 | inline bool read_index(char** ptr, ObjLoader::Index& idx) { 43 | char* base = *ptr; 44 | 45 | // Detect end of line (negative indices are supported) 46 | base = strip_spaces(base); 47 | if (!std::isdigit(*base) && *base != '-') return false; 48 | 49 | idx.v = 0; 50 | idx.t = 0; 51 | idx.n = 0; 52 | 53 | idx.v = std::strtol(base, &base, 10); 54 | 55 | base = strip_spaces(base); 56 | 57 | if (*base == '/') { 58 | base++; 59 | 60 | // Handle the case when there is no texture coordinate 61 | if (*base != '/') { 62 | idx.t = std::strtol(base, &base, 10); 63 | } 64 | 65 | base = strip_spaces(base); 66 | 67 | if (*base == '/') { 68 | base++; 69 | idx.n = std::strtol(base, &base, 10); 70 | } 71 | } 72 | 73 | *ptr = base; 74 | 75 | return true; 76 | } 77 | 78 | bool ObjLoader::load_obj(const std::string& path, File& file) { 79 | std::ifstream stream(path); 80 | if (!stream) return false; 81 | 82 | // Add an empty object to the scene 83 | int cur_object = 0; 84 | file.objects.emplace_back(); 85 | 86 | // Add an empty group to this object 87 | int cur_group = 0; 88 | file.objects[0].groups.emplace_back(); 89 | 90 | // Add an empty material to the scene 91 | int cur_mtl = 0; 92 | file.materials.emplace_back(""); 93 | 94 | // Add dummy vertex, normal, and texcoord 95 | file.vertices.emplace_back(); 96 | file.normals.emplace_back(); 97 | file.texcoords.emplace_back(); 98 | 99 | int err_count = 0; 100 | const int max_line = 1024; 101 | char line[max_line]; 102 | while (stream.getline(line, max_line)) { 103 | // Strip spaces 104 | char* ptr = strip_spaces(line); 105 | const char* err_line = ptr; 106 | 107 | // Skip comments and empty lines 108 | if (*ptr == '\0' || *ptr == '#') 109 | continue; 110 | 111 | remove_eol(ptr); 112 | 113 | // Test each command in turn, the most frequent first 114 | if (*ptr == 'v') { 115 | switch (ptr[1]) { 116 | case ' ': 117 | case '\t': 118 | { 119 | vec3 v; 120 | v.x = std::strtof(ptr + 1, &ptr); 121 | v.y = std::strtof(ptr, &ptr); 122 | v.z = std::strtof(ptr, &ptr); 123 | file.vertices.push_back(v); 124 | } 125 | break; 126 | case 'n': 127 | #ifndef SKIP_NORMALS 128 | { 129 | vec3 n; 130 | n.x = std::strtof(ptr + 2, &ptr); 131 | n.y = std::strtof(ptr, &ptr); 132 | n.z = std::strtof(ptr, &ptr); 133 | file.normals.push_back(n); 134 | } 135 | #endif 136 | break; 137 | case 't': 138 | #ifndef SKIP_TEXCOORDS 139 | { 140 | vec2 t; 141 | t.x = std::strtof(ptr + 2, &ptr); 142 | t.y = std::strtof(ptr, &ptr); 143 | file.texcoords.push_back(t); 144 | } 145 | #endif 146 | break; 147 | default: 148 | error("invalid vertex"); 149 | err_count++; 150 | break; 151 | } 152 | } else if (*ptr == 'f' && std::isspace(ptr[1])) { 153 | Face f; 154 | 155 | f.index_count = 0; 156 | f.material = cur_mtl; 157 | 158 | bool valid = true; 159 | ptr += 2; 160 | while(f.index_count < Face::max_indices) { 161 | Index index; 162 | valid = read_index(&ptr, index); 163 | 164 | if (valid) { 165 | f.indices[f.index_count++] = index; 166 | } else { 167 | break; 168 | } 169 | } 170 | 171 | if 
(f.index_count < 3) { 172 | error("invalid face"); 173 | err_count++; 174 | } else { 175 | // Convert relative indices to absolute 176 | for (int i = 0; i < f.index_count; i++) { 177 | f.indices[i].v = (f.indices[i].v < 0) ? file.vertices.size() + f.indices[i].v : f.indices[i].v; 178 | f.indices[i].t = (f.indices[i].t < 0) ? file.texcoords.size() + f.indices[i].t : f.indices[i].t; 179 | f.indices[i].n = (f.indices[i].n < 0) ? file.normals.size() + f.indices[i].n : f.indices[i].n; 180 | } 181 | 182 | // Check if the indices are valid or not 183 | valid = true; 184 | for (int i = 0; i < f.index_count; i++) { 185 | if (f.indices[i].v <= 0 || f.indices[i].t < 0 || f.indices[i].n < 0) { 186 | valid = false; 187 | break; 188 | } 189 | } 190 | 191 | if (valid) { 192 | file.objects[cur_object].groups[cur_group].faces.push_back(f); 193 | } else { 194 | error("invalid indices"); 195 | err_count++; 196 | } 197 | } 198 | } else if (*ptr == 'g' && std::isspace(ptr[1])) { 199 | file.objects[cur_object].groups.emplace_back(); 200 | cur_group++; 201 | } else if (*ptr == 'o' && std::isspace(ptr[1])) { 202 | file.objects.emplace_back(); 203 | cur_object++; 204 | 205 | file.objects[cur_object].groups.emplace_back(); 206 | cur_group = 0; 207 | } else if (!std::strncmp(ptr, "usemtl", 6) && std::isspace(ptr[6])) { 208 | ptr += 6; 209 | 210 | ptr = strip_spaces(ptr); 211 | char* base = ptr; 212 | ptr = strip_text(ptr); 213 | 214 | const std::string mtl_name(base, ptr); 215 | 216 | cur_mtl = std::find(file.materials.begin(), file.materials.end(), mtl_name) - file.materials.begin(); 217 | if (cur_mtl == (int)file.materials.size()) { 218 | file.materials.push_back(mtl_name); 219 | } 220 | } else if (!std::strncmp(ptr, "mtllib", 6) && std::isspace(ptr[6])) { 221 | ptr += 6; 222 | 223 | ptr = strip_spaces(ptr); 224 | char* base = ptr; 225 | ptr = strip_text(ptr); 226 | 227 | const std::string lib_name(base, ptr); 228 | 229 | file.mtl_libs.push_back(lib_name); 230 | } else if (*ptr == 's' && std::isspace(ptr[1])) { 231 | // Ignore smooth commands 232 | } else { 233 | error("unknown command ", ptr); 234 | err_count++; 235 | } 236 | } 237 | 238 | return (err_count == 0); 239 | } 240 | 241 | bool ObjLoader::load_mtl(const std::string& path, MaterialLib& mtl_lib) { 242 | std::ifstream stream(path); 243 | if (!stream) return false; 244 | 245 | const int max_line = 1024; 246 | char line[max_line]; 247 | char* err_line = line; 248 | int err_count = 0; 249 | 250 | std::string mtl_name; 251 | auto current_material = [&] () -> Material& { 252 | return mtl_lib[mtl_name]; 253 | }; 254 | 255 | while (stream.getline(line, max_line)) { 256 | // Strip spaces 257 | char* ptr = strip_spaces(line); 258 | err_line = ptr; 259 | 260 | // Skip comments and empty lines 261 | if (*ptr == '\0' || *ptr == '#') 262 | continue; 263 | 264 | remove_eol(ptr); 265 | 266 | if (!std::strncmp(ptr, "newmtl", 6) && std::isspace(ptr[6])) { 267 | ptr = strip_spaces(ptr + 7); 268 | char* base = ptr; 269 | ptr = strip_text(ptr); 270 | 271 | mtl_name = std::string(base, ptr); 272 | if (mtl_lib.find(mtl_name) != mtl_lib.end()) { 273 | error("material redefinition"); 274 | err_count++; 275 | } 276 | } else if (ptr[0] == 'K') { 277 | if (ptr[1] == 'a' && std::isspace(ptr[2])) { 278 | auto& mat = current_material(); 279 | mat.ka.r = std::strtof(ptr + 3, &ptr); 280 | mat.ka.g = std::strtof(ptr, &ptr); 281 | mat.ka.b = std::strtof(ptr, &ptr); 282 | } else if (ptr[1] == 'd' && std::isspace(ptr[2])) { 283 | auto& mat = current_material(); 284 | mat.kd.r = 
std::strtof(ptr + 3, &ptr); 285 | mat.kd.g = std::strtof(ptr, &ptr); 286 | mat.kd.b = std::strtof(ptr, &ptr); 287 | } else if (ptr[1] == 's' && std::isspace(ptr[2])) { 288 | auto& mat = current_material(); 289 | mat.ks.r = std::strtof(ptr + 3, &ptr); 290 | mat.ks.g = std::strtof(ptr, &ptr); 291 | mat.ks.b = std::strtof(ptr, &ptr); 292 | } else if (ptr[1] == 'e' && std::isspace(ptr[2])) { 293 | auto& mat = current_material(); 294 | mat.ke.r = std::strtof(ptr + 3, &ptr); 295 | mat.ke.g = std::strtof(ptr, &ptr); 296 | mat.ke.b = std::strtof(ptr, &ptr); 297 | } else { 298 | error("invalid command"); 299 | err_count++; 300 | } 301 | } else if (ptr[0] == 'N') { 302 | if (ptr[1] == 's' && std::isspace(ptr[2])) { 303 | auto& mat = current_material(); 304 | mat.ns = std::strtof(ptr + 3, &ptr); 305 | } else if (ptr[1] == 'i' && std::isspace(ptr[2])) { 306 | auto& mat = current_material(); 307 | mat.ni = std::strtof(ptr + 3, &ptr); 308 | } else { 309 | error("invalid command"); 310 | err_count++; 311 | } 312 | } else if (ptr[0] == 'T') { 313 | if (ptr[1] == 'f' && std::isspace(ptr[2])) { 314 | auto& mat = current_material(); 315 | mat.tf.r = std::strtof(ptr + 3, &ptr); 316 | mat.tf.g = std::strtof(ptr, &ptr); 317 | mat.tf.b = std::strtof(ptr, &ptr); 318 | } else if (ptr[1] == 'r' && std::isspace(ptr[2])) { 319 | auto& mat = current_material(); 320 | mat.tr = std::strtof(ptr + 3, &ptr); 321 | } else { 322 | error("invalid command"); 323 | err_count++; 324 | } 325 | } else if (ptr[0] == 'd' && std::isspace(ptr[1])) { 326 | auto& mat = current_material(); 327 | mat.d = std::strtof(ptr + 2, &ptr); 328 | } else if (!std::strncmp(ptr, "illum", 5) && std::isspace(ptr[5])) { 329 | auto& mat = current_material(); 330 | mat.illum = std::strtof(ptr + 6, &ptr); 331 | } else if (!std::strncmp(ptr, "map_Ka", 6) && std::isspace(ptr[6])) { 332 | auto& mat = current_material(); 333 | mat.map_ka = std::string(strip_spaces(ptr + 7)); 334 | } else if (!std::strncmp(ptr, "map_Kd", 6) && std::isspace(ptr[6])) { 335 | auto& mat = current_material(); 336 | mat.map_kd = std::string(strip_spaces(ptr + 7)); 337 | } else if (!std::strncmp(ptr, "map_Ks", 6) && std::isspace(ptr[6])) { 338 | auto& mat = current_material(); 339 | mat.map_ks = std::string(strip_spaces(ptr + 7)); 340 | } else if (!std::strncmp(ptr, "map_Ke", 6) && std::isspace(ptr[6])) { 341 | auto& mat = current_material(); 342 | mat.map_ke = std::string(strip_spaces(ptr + 7)); 343 | } else if (!std::strncmp(ptr, "map_bump", 8) && std::isspace(ptr[8])) { 344 | auto& mat = current_material(); 345 | mat.map_bump = std::string(strip_spaces(ptr + 9)); 346 | } else if (!std::strncmp(ptr, "bump", 4) && std::isspace(ptr[4])) { 347 | auto& mat = current_material(); 348 | mat.map_bump = std::string(strip_spaces(ptr + 5)); 349 | } else if (!std::strncmp(ptr, "map_d", 5) && std::isspace(ptr[5])) { 350 | auto& mat = current_material(); 351 | mat.map_d = std::string(strip_spaces(ptr + 6)); 352 | } else { 353 | error("unknown command ", ptr); 354 | err_count++; 355 | } 356 | } 357 | 358 | return (err_count == 0); 359 | } 360 | 361 | } // namespace hagrid 362 | -------------------------------------------------------------------------------- /src/load_obj.h: -------------------------------------------------------------------------------- 1 | #ifndef LOAD_OBJ_H 2 | #define LOAD_OBJ_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "vec.h" 10 | 11 | namespace hagrid { 12 | 13 | class ObjLoader { 14 | public: 15 | struct Index { 16 | int v, n, t; 17 | }; 18 | 
19 | struct Face { 20 | static constexpr int max_indices = 8; 21 | Index indices[max_indices]; 22 | int index_count; 23 | int material; 24 | }; 25 | 26 | struct Group { 27 | std::vector faces; 28 | }; 29 | 30 | struct Object { 31 | std::vector groups; 32 | }; 33 | 34 | struct Material { 35 | vec3 ka; 36 | vec3 kd; 37 | vec3 ks; 38 | vec3 ke; 39 | float ns; 40 | float ni; 41 | vec3 tf; 42 | float tr; 43 | float d; 44 | int illum; 45 | std::string map_ka; 46 | std::string map_kd; 47 | std::string map_ks; 48 | std::string map_ke; 49 | std::string map_bump; 50 | std::string map_d; 51 | }; 52 | 53 | struct File { 54 | std::vector objects; 55 | std::vector vertices; 56 | std::vector normals; 57 | std::vector texcoords; 58 | std::vector materials; 59 | std::vector mtl_libs; 60 | }; 61 | 62 | struct Path { 63 | Path() {} 64 | Path(const char* p) : Path(std::string(p)) {} 65 | Path(const std::string& p) 66 | : path(p) 67 | { 68 | std::replace(path.begin(), path.end(), '\\', '/'); 69 | auto pos = path.rfind('/'); 70 | base = (pos != std::string::npos) ? path.substr(0, pos) : "."; 71 | file = (pos != std::string::npos) ? path.substr(pos + 1) : path; 72 | } 73 | 74 | operator const std::string& () const { 75 | return path; 76 | } 77 | 78 | std::string path; 79 | std::string base; 80 | std::string file; 81 | }; 82 | 83 | typedef std::unordered_map MaterialLib; 84 | 85 | static bool load_obj(const std::string&, File&); 86 | static bool load_mtl(const std::string&, MaterialLib&); 87 | static bool load_scene(const Path& path, File& file, MaterialLib& mtl_lib) { 88 | if (!load_obj(path, file)) return false; 89 | for (auto& lib : file.mtl_libs) { 90 | // We tolerate errors in the MTL file 91 | load_mtl(path.base + "/" + lib, mtl_lib); 92 | } 93 | return true; 94 | } 95 | }; 96 | 97 | } // namespace hagrid 98 | 99 | #endif // LOAD_OBJ_H 100 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #include "build.h" 13 | #include "load_obj.h" 14 | #include "mem_manager.h" 15 | #include "traverse.h" 16 | 17 | using namespace hagrid; 18 | 19 | struct Camera { 20 | vec3 eye; 21 | vec3 right; 22 | vec3 up; 23 | vec3 dir; 24 | }; 25 | 26 | struct View { 27 | vec3 eye; 28 | vec3 forward; 29 | vec3 right; 30 | vec3 up; 31 | float dist; 32 | float rspeed; 33 | float tspeed; 34 | }; 35 | 36 | enum class DisplayMode { 37 | DEPTH, 38 | GRAY_SCALE, 39 | HEAT_MAP 40 | }; 41 | 42 | inline Camera gen_camera(const vec3& eye, const vec3& center, const vec3& up, float fov, float ratio) { 43 | Camera cam; 44 | const float f = tanf(M_PI * fov / 360); 45 | cam.dir = normalize(center - eye); 46 | cam.right = normalize(cross(cam.dir, up)) * (f * ratio); 47 | cam.up = normalize(cross(cam.right, cam.dir)) * f; 48 | cam.eye = eye; 49 | return cam; 50 | } 51 | 52 | inline void gen_rays(const Camera& cam, std::vector& rays, float clip, int w, int h) { 53 | for (int y = 0; y < h; y++) { 54 | for (int x = 0; x < w; x++) { 55 | auto kx = 2 * x / float(w) - 1; 56 | auto ky = 1 - 2 * y / float(h); 57 | auto dir = cam.dir + cam.right * kx + cam.up * ky; 58 | 59 | auto& ray = rays[y * w + x]; 60 | ray.org = cam.eye; 61 | ray.dir = dir; 62 | ray.tmin = 0.0f; 63 | ray.tmax = clip; 64 | } 65 | } 66 | } 67 | 68 | void gradient(uint8_t* color, float k) { 69 | static const vec3 g[] = { 70 | vec3(0, 
0, 255), 71 | vec3(0, 255, 255), 72 | vec3(0, 128, 0), 73 | vec3(255, 255, 0), 74 | vec3(255, 0, 0) 75 | }; 76 | constexpr int n = sizeof(g) / sizeof(g[0]); 77 | static const float s = 1.0f / n; 78 | 79 | int i = min(n - 1, int(k * n)); 80 | int j = min(n - 1, i + 1); 81 | 82 | float t = (k - i * s) / s; 83 | auto c = (1.0f - t) * g[i] + t * g[j]; 84 | 85 | color[0] = c.z; 86 | color[1] = c.y; 87 | color[2] = c.x; 88 | } 89 | 90 | template 91 | void update_surface(SDL_Surface* surf, std::vector& hits, float clip, int w, int h) { 92 | for (int y = 0, my = std::min(surf->h, h); y < my; y++) { 93 | unsigned char* row = (unsigned char*)surf->pixels + surf->pitch * y; 94 | for (int x = 0, mx = std::min(surf->w, w); x < mx; x++) { 95 | if (mode == DisplayMode::DEPTH) { 96 | uint8_t color = 255.0f * hits[y * w + x].t / clip; 97 | row[x * 4 + 0] = color; 98 | row[x * 4 + 1] = color; 99 | row[x * 4 + 2] = color; 100 | } else if (mode == DisplayMode::GRAY_SCALE) { 101 | uint8_t color = std::min(255, hits[y * w + x].id); 102 | row[x * 4 + 0] = color; 103 | row[x * 4 + 1] = color; 104 | row[x * 4 + 2] = color; 105 | } else if (mode == DisplayMode::HEAT_MAP) { 106 | gradient(row + x * 4, std::min(100, hits[y * w + x].id) / 100.0f); 107 | } 108 | row[x * 4 + 3] = 255; 109 | } 110 | } 111 | } 112 | 113 | struct ProgramOptions { 114 | std::string scene_file; 115 | std::string ray_file; 116 | float top_density, snd_density; 117 | float alpha; 118 | int exp_iters; 119 | int width, height; 120 | float clip, fov; 121 | int build_iter; 122 | int build_warmup; 123 | int bench_iter; 124 | int bench_warmup; 125 | float tmin, tmax; 126 | bool keep_alive; 127 | bool compress; 128 | bool help; 129 | 130 | ProgramOptions() 131 | : top_density(0.12f) 132 | , snd_density(2.4f) 133 | , alpha(0.995f) 134 | , exp_iters(3) 135 | , width(1024) 136 | , height(1024) 137 | , clip(0) 138 | , fov(60) 139 | , build_iter(1) 140 | , build_warmup(0) 141 | , bench_iter(1) 142 | , bench_warmup(0) 143 | , tmin(0) 144 | , tmax(std::numeric_limits::max()) 145 | , keep_alive(false) 146 | , compress(false) 147 | , help(false) 148 | {} 149 | 150 | bool parse(int argc, char** argv); 151 | 152 | private: 153 | static bool matches(const char* arg, const char* opt1, const char* opt2) { 154 | return !strcmp(arg, opt1) || !strcmp(arg, opt2); 155 | } 156 | 157 | static bool arg_exists(char** argv, int i, int argc) { 158 | if (i >= argc - 1 || argv[i + 1][0] == '-') { 159 | std::cerr << "Argument missing for: " << argv[i] << std::endl; 160 | return false; 161 | } 162 | return true; 163 | } 164 | }; 165 | 166 | bool ProgramOptions::parse(int argc, char** argv) { 167 | bool scene_parsed = false; 168 | for (int i = 1; i < argc; i++) { 169 | auto arg = argv[i]; 170 | 171 | if (arg[0] != '-') { 172 | if (scene_parsed) { 173 | std::cerr << "Cannot accept more than one model on the command line" << std::endl; 174 | return false; 175 | } 176 | scene_file = arg; 177 | scene_parsed = true; 178 | continue; 179 | } 180 | 181 | if (matches(arg, "-h", "--help")) { 182 | help = true; 183 | } else if (matches(arg, "-sx", "--width")) { 184 | if (!arg_exists(argv, i, argc)) return false; 185 | width = strtol(argv[++i], nullptr, 10); 186 | } else if (matches(arg, "-sy", "--height")) { 187 | if (!arg_exists(argv, i, argc)) return false; 188 | height = strtol(argv[++i], nullptr, 10); 189 | } else if (matches(arg, "-c", "--clip")) { 190 | if (!arg_exists(argv, i, argc)) return false; 191 | clip = strtof(argv[++i], nullptr); 192 | } else if (matches(arg, "-f", "--fov")) 
{ 193 | if (!arg_exists(argv, i, argc)) return false; 194 | fov = strtof(argv[++i], nullptr); 195 | } else if (matches(arg, "-td", "--top-density")) { 196 | if (!arg_exists(argv, i, argc)) return false; 197 | top_density = strtof(argv[++i], nullptr); 198 | } else if (matches(arg, "-sd", "--snd-density")) { 199 | if (!arg_exists(argv, i, argc)) return false; 200 | snd_density = strtof(argv[++i], nullptr); 201 | } else if (matches(arg, "-a", "--alpha")) { 202 | if (!arg_exists(argv, i, argc)) return false; 203 | alpha = strtof(argv[++i], nullptr); 204 | } else if (matches(arg, "-e", "--expansion")) { 205 | if (!arg_exists(argv, i, argc)) return false; 206 | exp_iters = strtol(argv[++i], nullptr, 10); 207 | } else if (matches(arg, "-nb", "--build-iter")) { 208 | if (!arg_exists(argv, i, argc)) return false; 209 | build_iter = strtol(argv[++i], nullptr, 10); 210 | } else if (matches(arg, "-wb", "--build-warmup")) { 211 | if (!arg_exists(argv, i, argc)) return false; 212 | build_warmup = strtol(argv[++i], nullptr, 10); 213 | } else if (matches(arg, "-k", "--keep-alive")) { 214 | keep_alive = true; 215 | } else if (matches(arg, "-z", "--compress")) { 216 | compress = true; 217 | } else if (matches(arg, "-r", "--ray-file")) { 218 | if (!arg_exists(argv, i, argc)) return false; 219 | ray_file = argv[++i]; 220 | } else if (matches(arg, "-tmin", "--tmin")) { 221 | if (!arg_exists(argv, i, argc)) return false; 222 | tmin = strtof(argv[++i], nullptr); 223 | } else if (matches(arg, "-tmax", "--tmax")) { 224 | if (!arg_exists(argv, i, argc)) return false; 225 | tmax = strtof(argv[++i], nullptr); 226 | } else if (matches(arg, "-n", "--bench-iter")) { 227 | if (!arg_exists(argv, i, argc)) return false; 228 | bench_iter = strtol(argv[++i], nullptr, 10); 229 | } else if (matches(arg, "-w", "--bench-warmup")) { 230 | if (!arg_exists(argv, i, argc)) return false; 231 | bench_warmup = strtol(argv[++i], nullptr, 10); 232 | } else { 233 | std::cerr << "Unknown argument: " << arg << std::endl; 234 | return false; 235 | } 236 | } 237 | 238 | if (!scene_parsed) { 239 | std::cerr << "No model specified" << std::endl; 240 | return false; 241 | } 242 | 243 | return true; 244 | } 245 | 246 | static bool load_model(const std::string& file_name, std::vector& tris) { 247 | ObjLoader::File obj_file; 248 | ObjLoader::MaterialLib mtl_lib; 249 | if (!ObjLoader::load_scene(file_name, obj_file, mtl_lib)) 250 | return false; 251 | 252 | for (auto& object : obj_file.objects) { 253 | for (auto& group : object.groups) { 254 | for (auto& face : group.faces) { 255 | auto v0 = obj_file.vertices[face.indices[0].v]; 256 | for (int i = 0; i < face.index_count - 2; i++) { 257 | auto v1 = obj_file.vertices[face.indices[i + 1].v]; 258 | auto v2 = obj_file.vertices[face.indices[i + 2].v]; 259 | auto e1 = v0 - v1; 260 | auto e2 = v2 - v0; 261 | auto n = cross(e1, e2); 262 | 263 | const Tri tri = { 264 | v0, n.x, 265 | e1, n.y, 266 | e2, n.z 267 | }; 268 | tris.push_back(tri); 269 | } 270 | } 271 | } 272 | } 273 | 274 | return true; 275 | } 276 | 277 | static bool load_rays(const std::string& file_name, std::vector& rays, float tmin, float tmax) { 278 | std::ifstream in(file_name, std::ifstream::binary); 279 | if (!in) return false; 280 | 281 | in.seekg(0, std::ifstream::end); 282 | int count = in.tellg() / (sizeof(float) * 6); 283 | 284 | rays.resize(count); 285 | in.seekg(0); 286 | 287 | for (int i = 0; i < count; i++) { 288 | float org_dir[6]; 289 | in.read((char*)org_dir, sizeof(float) * 6); 290 | Ray& ray = rays.data()[i]; 291 | 292 | 
ray.org = vec3(org_dir[0], org_dir[1], org_dir[2]); 293 | ray.dir = vec3(org_dir[3], org_dir[4], org_dir[5]); 294 | 295 | ray.tmin = tmin; 296 | ray.tmax = tmax; 297 | } 298 | 299 | return true; 300 | } 301 | 302 | bool handle_events(View& view, DisplayMode& display_mode) { 303 | static bool arrows[4], camera_on; 304 | SDL_Event event; 305 | while (SDL_PollEvent(&event)) { 306 | switch (event.type) { 307 | case SDL_QUIT: 308 | return true; 309 | case SDL_MOUSEBUTTONDOWN: 310 | SDL_SetRelativeMouseMode(SDL_TRUE); 311 | camera_on = true; 312 | break; 313 | case SDL_MOUSEBUTTONUP: 314 | camera_on = false; 315 | SDL_SetRelativeMouseMode(SDL_FALSE); 316 | break; 317 | case SDL_MOUSEMOTION: 318 | if (camera_on) { 319 | view.right = cross(view.forward, view.up); 320 | view.forward = rotate(view.forward, view.right, -event.motion.yrel * view.rspeed); 321 | view.forward = rotate(view.forward, view.up, -event.motion.xrel * view.rspeed); 322 | view.forward = normalize(view.forward); 323 | view.up = normalize(cross(view.right, view.forward)); 324 | } 325 | break; 326 | case SDL_KEYUP: 327 | switch (event.key.keysym.sym) { 328 | case SDLK_UP: arrows[0] = false; break; 329 | case SDLK_DOWN: arrows[1] = false; break; 330 | case SDLK_LEFT: arrows[2] = false; break; 331 | case SDLK_RIGHT: arrows[3] = false; break; 332 | } 333 | break; 334 | case SDL_KEYDOWN: 335 | switch (event.key.keysym.sym) { 336 | case SDLK_UP: arrows[0] = true; break; 337 | case SDLK_DOWN: arrows[1] = true; break; 338 | case SDLK_LEFT: arrows[2] = true; break; 339 | case SDLK_RIGHT: arrows[3] = true; break; 340 | case SDLK_KP_PLUS: view.tspeed *= 1.1f; break; 341 | case SDLK_KP_MINUS: view.tspeed /= 1.1f; break; 342 | case SDLK_c: 343 | { 344 | auto center = view.eye + view.forward * view.dist; 345 | std::cout << "Eye: " << view.eye.x << " " << view.eye.y << " " << view.eye.z << std::endl; 346 | std::cout << "Center: " << center.x << " " << center.y << " " << center.z << std::endl; 347 | std::cout << "Up: " << view.up.x << " " << view.up.y << " " << view.up.z << std::endl; 348 | } 349 | break; 350 | case SDLK_m: 351 | if (display_mode == DisplayMode::DEPTH) 352 | display_mode = DisplayMode::GRAY_SCALE; 353 | else if (display_mode == DisplayMode::GRAY_SCALE) 354 | display_mode = DisplayMode::HEAT_MAP; 355 | else if (display_mode == DisplayMode::HEAT_MAP) 356 | display_mode = DisplayMode::DEPTH; 357 | break; 358 | case SDLK_ESCAPE: 359 | return true; 360 | } 361 | break; 362 | } 363 | } 364 | 365 | if (arrows[0]) view.eye = view.eye + view.tspeed * view.forward; 366 | if (arrows[1]) view.eye = view.eye - view.tspeed * view.forward; 367 | if (arrows[2]) view.eye = view.eye - view.tspeed * view.right; 368 | if (arrows[3]) view.eye = view.eye + view.tspeed * view.right; 369 | 370 | return false; 371 | } 372 | 373 | static void usage() { 374 | std::cout << "Usage: hagrid [options] file\n" 375 | "Options:\n" 376 | " -h --help Shows this message\n" 377 | " -sx --width Sets the viewport width\n" 378 | " -sy --height Sets the viewport height\n" 379 | " -c --clip Sets the clipping distance\n" 380 | " -f --fov Sets the field of view\n" 381 | " Construction parameters:\n" 382 | " -td --top-density Sets the top-level density\n" 383 | " -sd --snd-density Sets the second-level density\n" 384 | " -a --alpha Sets the cell merging threshold\n" 385 | " -e --expansion Sets the number of expansion iterations\n" 386 | " -nb --build-iter Sets the number of build iterations\n" 387 | " -wb --build-warmup Sets the number of warmup build iterations\n" 388 | " -k 
--keep-alive Keep the buffers alive during construction\n" 389 | " -z --compress Compress the cells after construction\n" 390 | " Benchmarking:\n" 391 | " -r --ray-file Loads rays from a file and enters benchmark mode\n" 392 | " -tmin --tmin Sets the minimum distance along every ray\n" 393 | " -tmax --tmax Sets the maximum distance along every ray\n" 394 | " -n --bench-iter Sets the number of benchmarking iterations\n" 395 | " -w --bench-warmup Sets the number of benchmarking warmup iterations\n" << std::endl; 396 | } 397 | 398 | static bool benchmark(MemManager& mem, 399 | const Grid& grid, 400 | const Tri* tris, 401 | const std::string& ray_file, 402 | float tmin, float tmax, 403 | int iter, int warmup) { 404 | std::vector host_rays; 405 | if (!load_rays(ray_file, host_rays, tmin, tmax)) { 406 | std::cerr << "Cannot load ray file" << std::endl; 407 | return false; 408 | } 409 | 410 | Ray* rays = mem.alloc(host_rays.size()); 411 | Hit* hits = mem.alloc(host_rays.size()); 412 | mem.copy(rays, host_rays.data(), host_rays.size()); 413 | 414 | for (int i = 0; i < warmup; i++) { 415 | traverse_grid(grid, tris, rays, hits, host_rays.size()); 416 | } 417 | 418 | // Benchmark traversal speed 419 | std::vector timings; 420 | for (int i = 0; i < iter; i++) { 421 | auto kernel_time = profile([&] { 422 | traverse_grid(grid, tris, rays, hits, host_rays.size()); 423 | }); 424 | timings.emplace_back(kernel_time); 425 | } 426 | 427 | std::vector host_hits(host_rays.size()); 428 | mem.copy(host_hits.data(), hits, host_hits.size()); 429 | 430 | int intr = 0; 431 | for (int i = 0; i < host_rays.size(); i++) 432 | intr += (host_hits[i].id >= 0); 433 | 434 | std::sort(timings.begin(), timings.end()); 435 | const double sum = std::accumulate(timings.begin(), timings.end(), 0.0f); 436 | const double avg = sum / timings.size(); 437 | const double med = timings[timings.size() / 2]; 438 | const double min = *std::min_element(timings.begin(), timings.end()); 439 | std::cout << intr << " intersection(s)." << std::endl; 440 | std::cout << sum << "ms for " << iter << " iteration(s)." << std::endl; 441 | std::cout << host_rays.size() * iter / (1000.0 * sum) << " Mrays/sec." 
<< std::endl; 442 | std::cout << "# Average: " << avg << " ms" << std::endl; 443 | std::cout << "# Median: " << med << " ms" << std::endl; 444 | std::cout << "# Min: " << min << " ms" << std::endl; 445 | 446 | return true; 447 | } 448 | 449 | int main(int argc, char** argv) { 450 | if (argc < 2) { 451 | usage(); 452 | return 1; 453 | } 454 | 455 | ProgramOptions opts; 456 | if (!opts.parse(argc, argv)) return 1; 457 | 458 | if (opts.help) { 459 | usage(); 460 | return 0; 461 | } 462 | 463 | std::vector host_tris; 464 | if (!load_model(opts.scene_file, host_tris)) { 465 | std::cerr << "Scene cannot be loaded (file not present or contains errors)" << std::endl; 466 | return 1; 467 | } 468 | 469 | std::cout << host_tris.size() << " triangle(s)" << std::endl; 470 | 471 | MemManager mem(opts.keep_alive); 472 | auto tris = mem.alloc(host_tris.size()); 473 | mem.copy(tris, host_tris.data(), host_tris.size()); 474 | 475 | Grid grid; 476 | grid.entries = nullptr; 477 | grid.cells = nullptr; 478 | grid.ref_ids = nullptr; 479 | 480 | // Warmup iterations 481 | for (int i = 0; i < opts.build_warmup; i++) { 482 | mem.free(grid.entries); 483 | mem.free(grid.cells); 484 | mem.free(grid.ref_ids); 485 | 486 | build_grid(mem, tris, host_tris.size(), grid, opts.top_density, opts.snd_density); 487 | merge_grid(mem, grid, opts.alpha); 488 | flatten_grid(mem, grid); 489 | expand_grid(mem, grid, tris, opts.exp_iters); 490 | if (opts.compress) compress_grid(mem, grid); 491 | } 492 | 493 | // Benchmark construction speed 494 | double total_time = 0; 495 | for (int i = 0; i < opts.build_iter; i++) { 496 | mem.free(grid.entries); 497 | mem.free(grid.cells); 498 | mem.free(grid.ref_ids); 499 | 500 | auto kernel_time = profile([&] { 501 | build_grid(mem, tris, host_tris.size(), grid, opts.top_density, opts.snd_density); 502 | merge_grid(mem, grid, opts.alpha); 503 | flatten_grid(mem, grid); 504 | expand_grid(mem, grid, tris, opts.exp_iters); 505 | if (opts.compress) compress_grid(mem, grid); 506 | }); 507 | total_time += kernel_time; 508 | } 509 | if (opts.compress && !grid.small_cells) 510 | std::cerr << "Could not compress grid. Continuing with uncompressed structure." << std::endl; 511 | 512 | auto dims = grid.dims << grid.shift; 513 | std::cout << "Grid built in " << total_time / opts.build_iter << " ms (" 514 | << dims.x << "x" << dims.y << "x" << dims.z << ", " 515 | << grid.num_cells << " cells, " << grid.num_refs << " references)" << std::endl; 516 | 517 | #ifndef NDEBUG 518 | std::cout << std::endl; 519 | mem.debug_slots(); 520 | std::cout << std::endl; 521 | #endif 522 | 523 | const size_t cells_mem = grid.num_cells * (grid.small_cells ? 
sizeof(SmallCell) : sizeof(Cell)); 524 | const size_t entries_mem = grid.num_entries * sizeof(int); 525 | const size_t refs_mem = grid.num_refs * sizeof(int); 526 | const size_t tris_mem = host_tris.size() * sizeof(Tri); 527 | const size_t total_mem = cells_mem + entries_mem + refs_mem + tris_mem; 528 | std::cout << "Total memory: " << total_mem / double(1024 * 1024) << " MB" << std::endl; 529 | std::cout << "Cells: " << cells_mem / double(1024 * 1024) << " MB" << std::endl; 530 | std::cout << "Entries: " << entries_mem / double(1024 * 1024) << " MB" << std::endl; 531 | std::cout << "References: " << refs_mem / double(1024 * 1024) << " MB" << std::endl; 532 | std::cout << "Triangles: " << tris_mem / double(1024 * 1024) << " MB" << std::endl; 533 | std::cout << "Peak usage: " << mem.max_usage() / double(1024.0 * 1024.0) << " MB" << std::endl; 534 | 535 | setup_traversal(grid); 536 | 537 | // Compute a clipping distance from the bounding box of the scene 538 | auto scene_size = length(grid.bbox.extents()); 539 | auto scene_center = grid.bbox.center(); 540 | if (opts.clip <= 0) { 541 | opts.clip = scene_size; 542 | } 543 | 544 | if (opts.ray_file != "") { 545 | std::cout << "Entering benchmark mode" << std::endl; 546 | if (!benchmark(mem, grid, tris, opts.ray_file, opts.tmin, opts.tmax, opts.bench_iter, opts.bench_warmup)) 547 | return 1; 548 | return 0; 549 | } 550 | 551 | std::cout << "Entering interactive mode\n" 552 | "Commands:\n" 553 | " Mouse, arrow keys Move the camera\n" 554 | " Numpad '+'/'-' Control camera movement speed\n" 555 | " 'm' Cycle through display modes\n" 556 | " 'c' Prints the camera position" << std::endl; 557 | 558 | if (SDL_Init(SDL_INIT_VIDEO) < 0) { 559 | std::cerr << "Cannot initialize SDL" << std::endl; 560 | return 1; 561 | } 562 | 563 | SDL_Window* win = SDL_CreateWindow("HaGrid", 564 | SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, 565 | opts.width, opts.height, 566 | 0); 567 | 568 | SDL_Surface* screen = SDL_GetWindowSurface(win); 569 | 570 | SDL_FlushEvents(SDL_FIRSTEVENT, SDL_LASTEVENT); 571 | 572 | View view = { 573 | scene_center, // Eye 574 | vec3(0.0f, 0.0f, 1.0f), // Forward 575 | vec3(-1.0f, 0.0f, 0.0f), // Right 576 | vec3(0.0f, 1.0f, 0.0f), // Up 577 | 100.0f, 0.005f, // View distance, rotation speed 578 | scene_size * 0.005f // Translation speed 579 | }; 580 | 581 | size_t num_rays = opts.width * opts.height; 582 | std::vector host_hits(num_rays); 583 | std::vector host_rays(num_rays); 584 | Ray* rays = mem.alloc(num_rays); 585 | Hit* hits = mem.alloc(num_rays); 586 | double kernel_time = 0; 587 | auto ticks = SDL_GetTicks(); 588 | int frames = 0; 589 | DisplayMode display_mode = DisplayMode::DEPTH; 590 | bool done = false; 591 | while (!done) { 592 | Camera cam = gen_camera(view.eye, 593 | view.eye + view.forward * view.dist, 594 | view.up, 595 | opts.fov, 596 | (float)opts.width / (float)opts.height); 597 | 598 | gen_rays(cam, host_rays, opts.clip, opts.width, opts.height); 599 | mem.copy(rays, host_rays.data(), num_rays); 600 | 601 | kernel_time += profile([&] { traverse_grid(grid, tris, rays, hits, num_rays); }); 602 | frames++; 603 | 604 | if (SDL_GetTicks() - ticks >= 2000) { 605 | std::ostringstream caption; 606 | caption << "HaGrid [" << double(frames) * double(opts.width * opts.height) / (1000 * kernel_time) << " MRays/s]"; 607 | SDL_SetWindowTitle(win, caption.str().c_str()); 608 | ticks = SDL_GetTicks(); 609 | kernel_time = 0; 610 | frames = 0; 611 | } 612 | 613 | mem.copy(host_hits.data(), hits, num_rays); 614 | 
SDL_LockSurface(screen); 615 | if (display_mode == DisplayMode::DEPTH) 616 | update_surface(screen, host_hits, opts.clip, opts.width, opts.height); 617 | else if (display_mode == DisplayMode::GRAY_SCALE) 618 | update_surface(screen, host_hits, opts.clip, opts.width, opts.height); 619 | else 620 | update_surface(screen, host_hits, opts.clip, opts.width, opts.height); 621 | SDL_UnlockSurface(screen); 622 | 623 | SDL_UpdateWindowSurface(win); 624 | done = handle_events(view, display_mode); 625 | } 626 | 627 | SDL_DestroyWindow(win); 628 | SDL_Quit(); 629 | 630 | mem.free(rays); 631 | mem.free(hits); 632 | mem.free(tris); 633 | return 0; 634 | } 635 | -------------------------------------------------------------------------------- /src/mem_manager.cu: -------------------------------------------------------------------------------- 1 | #include "mem_manager.h" 2 | #include "common.h" 3 | 4 | namespace hagrid { 5 | 6 | HOST void MemManager::debug_slots() const { 7 | size_t total = 0; 8 | std::cout << "SLOTS: " << std::endl; 9 | for (auto& slot : slots_) { 10 | std::cout << "[" 11 | << (slot.in_use ? 'X' : ' ') 12 | << "] " 13 | << (double)slot.size / (1024.0 * 1024.0) << "MB" << std::endl; 14 | total += slot.size; 15 | } 16 | std::cout << (double)total / (1024.0 * 1024.0) << "MB total" << std::endl; 17 | } 18 | 19 | inline void dealloc_slot(Slot& slot) { 20 | if (slot.ptr) CHECK_CUDA_CALL(cudaFree(slot.ptr)); 21 | slot.size = 0; 22 | slot.ptr = nullptr; 23 | } 24 | 25 | HOST void MemManager::alloc_slot(Slot& slot, size_t size) { 26 | assert(!slot.in_use && "Buffer not deallocated properly"); 27 | if (slot.size < size) { 28 | if (slot.ptr) CHECK_CUDA_CALL(cudaFree(slot.ptr)); 29 | CHECK_CUDA_CALL(cudaMalloc(&slot.ptr, size)); 30 | usage_ = usage_ + size - slot.size; 31 | max_usage_ = std::max(usage_, max_usage_); 32 | slot.size = size; 33 | } 34 | slot.in_use = true; 35 | 36 | if (keep_ && usage_ >= max_usage_) { 37 | // Deallocate the first unused slot 38 | for (auto& slot : slots_) { 39 | if (slot.in_use) continue; 40 | usage_ = usage_ - slot.size; 41 | dealloc_slot(slot); 42 | break; 43 | } 44 | } 45 | } 46 | 47 | HOST void MemManager::free_slot(Slot& slot) { 48 | assert(slot.in_use); 49 | slot.in_use = false; 50 | if (!keep_) { 51 | usage_ = usage_ - slot.size; 52 | dealloc_slot(slot); 53 | } 54 | } 55 | 56 | HOST void MemManager::copy_dev_to_dev(void* dst, const void* src, size_t bytes) { 57 | CHECK_CUDA_CALL(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice)); 58 | } 59 | 60 | HOST void MemManager::copy_hst_to_dev(void* dst, const void* src, size_t bytes) { 61 | CHECK_CUDA_CALL(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice)); 62 | } 63 | 64 | HOST void MemManager::copy_dev_to_hst(void* dst, const void* src, size_t bytes) { 65 | CHECK_CUDA_CALL(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost)); 66 | } 67 | 68 | HOST void MemManager::zero_dev(void* ptr, size_t bytes) { 69 | CHECK_CUDA_CALL(cudaMemset(ptr, 0, bytes)); 70 | } 71 | 72 | HOST void MemManager::one_dev(void* ptr, size_t bytes) { 73 | CHECK_CUDA_CALL(cudaMemset(ptr, 0xFFFFFFFF, bytes)); 74 | } 75 | 76 | 77 | } // namespace hagrid 78 | -------------------------------------------------------------------------------- /src/mem_manager.h: -------------------------------------------------------------------------------- 1 | #ifndef MEM_MANAGER_H 2 | #define MEM_MANAGER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "common.h" 10 | 11 | namespace hagrid { 12 | 13 | /// Directions to copy 
memory from and to 14 | enum class Copy { 15 | HST_TO_DEV, 16 | DEV_TO_HST, 17 | DEV_TO_DEV 18 | }; 19 | 20 | /// A slot for a buffer in GPU memory 21 | struct Slot { 22 | Slot() 23 | : ptr(nullptr) 24 | , size(0) 25 | , in_use(false) 26 | {} 27 | 28 | void* ptr; 29 | size_t size; 30 | bool in_use; 31 | }; 32 | 33 | /// Utility class to manage memory buffers during construction 34 | class MemManager { 35 | public: 36 | /// Creates a manager object. The boolean flag controls whether 37 | /// buffers are kept or deallocated upon a call to free(). Keeping 38 | /// the buffers increases the memory usage, but speeds-up subsequent 39 | /// builds (often useful for dynamic scenes). 40 | MemManager(bool keep = false) 41 | : keep_(keep), usage_(0), max_usage_(0) 42 | {} 43 | 44 | /// Allocates a buffer, re-using allocated memory when possible 45 | template 46 | HOST T* alloc(size_t n) { 47 | auto size = n * sizeof(T); 48 | auto min_diff = std::numeric_limits::max(); 49 | int found = -1; 50 | 51 | for (int i = 0, n = slots_.size(); i < n; i++) { 52 | auto& slot = slots_[i]; 53 | if (!slot.in_use) { 54 | auto diff = std::max(size, slot.size) - std::min(size, slot.size); 55 | if (diff < min_diff) { 56 | min_diff = diff; 57 | found = i; 58 | } 59 | } 60 | } 61 | 62 | if (found < 0) { 63 | found = slots_.size(); 64 | slots_.resize(found + 1); 65 | } 66 | 67 | Slot& slot = slots_[found]; 68 | alloc_slot(slot, size); 69 | tracker_[slot.ptr] = found; 70 | return reinterpret_cast(slot.ptr); 71 | } 72 | 73 | /// Frees the contents of the given slot 74 | template 75 | HOST void free(T* ptr) { 76 | if (!ptr) return; 77 | assert(tracker_.count(ptr)); 78 | free_slot(slots_[tracker_[ptr]]); 79 | tracker_.erase(ptr); 80 | } 81 | 82 | /// Copies memory between buffers 83 | template 84 | HOST void copy(T* dst, const T* src, size_t n) { 85 | if (type == Copy::DEV_TO_DEV) copy_dev_to_dev(dst, src, sizeof(T) * n); 86 | else if (type == Copy::DEV_TO_HST) copy_dev_to_hst(dst, src, sizeof(T) * n); 87 | else if (type == Copy::HST_TO_DEV) copy_hst_to_dev(dst, src, sizeof(T) * n); 88 | } 89 | 90 | /// Fills memory with zeros 91 | template 92 | HOST void zero(T* ptr, size_t n) { zero_dev(ptr, n * sizeof(T)); } 93 | 94 | /// Fills memory with ones 95 | template 96 | HOST void one(T* ptr, size_t n) { one_dev(ptr, n * sizeof(T)); } 97 | 98 | /// Displays slots and memory usage 99 | void debug_slots() const; 100 | 101 | /// Returns the current memory usage 102 | size_t usage() const { return usage_; } 103 | /// Returns the maximum memory usage 104 | size_t max_usage() const { return max_usage_; } 105 | 106 | private: 107 | HOST void alloc_slot(Slot&, size_t); 108 | HOST void free_slot(Slot&); 109 | HOST void copy_dev_to_dev(void*, const void*, size_t); 110 | HOST void copy_dev_to_hst(void*, const void*, size_t); 111 | HOST void copy_hst_to_dev(void*, const void*, size_t); 112 | HOST void zero_dev(void*, size_t); 113 | HOST void one_dev(void*, size_t); 114 | 115 | std::unordered_map tracker_; 116 | std::vector slots_; 117 | size_t usage_, max_usage_; 118 | bool keep_; 119 | }; 120 | 121 | } // namespace hagrid 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /src/merge.cu: -------------------------------------------------------------------------------- 1 | #include "build.h" 2 | #include "parallel.cuh" 3 | 4 | namespace hagrid { 5 | 6 | /// Structure that contains buffers used during merging 7 | struct MergeBuffers { 8 | int* merge_counts; ///< Contains the number of 
references in each cell (positive if merged, otherwise negative) 9 | int* prevs, *nexts; ///< Contains the index of the previous/next neighboring cell on the merging axis (positive if merged, otherwise negative) 10 | int* ref_counts; ///< Contains the number of references per cell after merge 11 | int* cell_flags; ///< Contains 1 if the cell is kept (it is not a residue), otherwise 0 12 | int* cell_scan; ///< Scan over cell_flags (insertion position of the cells into the new cell array) 13 | int* ref_scan; ///< Scan over ref_counts (insertion position of the references into the new reference array) 14 | int* new_cell_ids; ///< Mapping between the old cell indices and the new cell indices 15 | }; 16 | 17 | static __constant__ ivec3 grid_dims; 18 | static __constant__ vec3 cell_size; 19 | static __constant__ int grid_shift; 20 | 21 | template 22 | __device__ bool aligned(const Cell& cell1, const Cell& cell2) { 23 | constexpr int axis1 = (axis + 1) % 3; 24 | constexpr int axis2 = (axis + 2) % 3; 25 | 26 | return get(cell1.max) == get(cell2.min) && 27 | get(cell1.min) == get(cell2.min) && 28 | get(cell1.min) == get(cell2.min) && 29 | get(cell1.max) == get(cell2.max) && 30 | get(cell1.max) == get(cell2.max); 31 | } 32 | 33 | /// Restricts the merges so that cells are better aligned for the next iteration 34 | __device__ __forceinline__ bool merge_allowed(int empty_mask, int pos) { 35 | auto top_level_mask = (1 << grid_shift) - 1; 36 | auto is_shifted = (pos >> grid_shift) & empty_mask; 37 | auto is_top_level = !(pos & top_level_mask); 38 | return !is_shifted || !is_top_level; 39 | } 40 | 41 | /// Computes the position of the next cell of the grid on the axis 42 | template 43 | __device__ ivec3 next_cell(const ivec3& min, const ivec3& max) { 44 | return ivec3(axis == 0 ? max.x : min.x, 45 | axis == 1 ? max.y : min.y, 46 | axis == 2 ? max.z : min.z); 47 | } 48 | 49 | /// Computes the position of the previous cell of the grid on the axis 50 | template 51 | __device__ ivec3 prev_cell(const ivec3& min) { 52 | return ivec3(axis == 0 ? min.x - 1 : min.x, 53 | axis == 1 ? min.y - 1 : min.y, 54 | axis == 2 ? min.z - 1 : min.z); 55 | } 56 | 57 | /// Counts the number of elements in the union of two sorted arrays 58 | __device__ __forceinline__ int count_union(const int* __restrict__ p0, int c0, 59 | const int* __restrict__ p1, int c1) { 60 | int i = 0, j = 0, c = 0; 61 | while (i < c0 & j < c1) { 62 | auto a = p0[i]; 63 | auto b = p1[j]; 64 | i += (a <= b); 65 | j += (a >= b); 66 | c++; 67 | } 68 | return c + (c1 - j) + (c0 - i); 69 | } 70 | 71 | /// Merges the two sorted reference arrays 72 | __device__ __forceinline__ void merge_refs(const int* __restrict__ p0, int c0, 73 | const int* __restrict__ p1, int c1, 74 | int* __restrict__ q) { 75 | int i = 0; 76 | int j = 0; 77 | while (i < c0 && j < c1) { 78 | auto a = p0[i]; 79 | auto b = p1[j]; 80 | *(q++) = (a < b) ? a : b; 81 | i += (a <= b); 82 | j += (a >= b); 83 | } 84 | auto k = i < c0 ? i : j; 85 | auto c = i < c0 ? c0 : c1; 86 | auto p = i < c0 ? 
p0 : p1; 87 | while (k < c) *(q++) = p[k++]; 88 | } 89 | 90 | /// Computes the number of references per cell after the merge 91 | template 92 | __global__ void compute_merge_counts(const Entry* __restrict__ entries, 93 | const Cell* __restrict__ cells, 94 | const int* __restrict__ refs, 95 | int* __restrict__ merge_counts, 96 | int* __restrict__ nexts, 97 | int* __restrict__ prevs, 98 | int empty_mask, 99 | int num_cells) { 100 | int id = threadIdx.x + blockDim.x * blockIdx.x; 101 | if (id >= num_cells) return; 102 | 103 | static constexpr auto unit_cost = 1.0f; 104 | 105 | auto cell1 = load_cell(cells + id); 106 | auto next_pos = next_cell(cell1.min, cell1.max); 107 | int count = -(cell1.end - cell1.begin + 1); 108 | int next_id = -1; 109 | 110 | if (merge_allowed(empty_mask, get(cell1.min)) && 111 | get(next_pos) < get(grid_dims)) { 112 | next_id = lookup_entry(entries, grid_shift, grid_dims >> grid_shift, next_pos); 113 | auto cell2 = load_cell(cells + next_id); 114 | 115 | if (aligned(cell1, cell2)) { 116 | auto e1 = vec3(cell1.max - cell1.min) * cell_size; 117 | auto e2 = vec3(cell2.max - cell2.min) * cell_size; 118 | auto a1 = e1.x * (e1.y + e1.z) + e1.y * e1.z; 119 | auto a2 = e2.x * (e2.y + e2.z) + e2.y * e2.z; 120 | auto a = a1 + a2 - get<(axis + 1) % 3>(e1) * get<(axis + 2) % 3>(e1); 121 | 122 | int n1 = cell1.end - cell1.begin; 123 | int n2 = cell2.end - cell2.begin; 124 | auto c1 = a1 * (n1 + unit_cost); 125 | auto c2 = a2 * (n2 + unit_cost); 126 | // Early exit test: there is a minimum of max(n1, n2) 127 | // primitives in the union of the two cells 128 | if (a * (max(n1, n2) + unit_cost) <= c1 + c2) { 129 | auto n = count_union(refs + cell1.begin, n1, 130 | refs + cell2.begin, n2); 131 | auto c = a * (n + unit_cost); 132 | if (c <= c1 + c2) count = n; 133 | } 134 | } 135 | } 136 | 137 | merge_counts[id] = count; 138 | 139 | next_id = count >= 0 ? next_id : -1; 140 | nexts[id] = next_id; 141 | if (next_id >= 0) prevs[next_id] = id; 142 | } 143 | 144 | /// Traverses the merge chains and mark the cells at odd positions as residue 145 | template 146 | __global__ void compute_cell_flags(const int* __restrict__ nexts, 147 | const int* __restrict__ prevs, 148 | int* __restrict__ cell_flags, 149 | int num_cells) { 150 | int id = threadIdx.x + blockDim.x * blockIdx.x; 151 | if (id >= num_cells) return; 152 | 153 | // If the previous cell does not exist or does not want to merge with this cell 154 | if (prevs[id] < 0) { 155 | int next_id = nexts[id]; 156 | cell_flags[id] = 1; 157 | 158 | // If this cell wants to merge with the next 159 | if (next_id >= 0) { 160 | int count = 1; 161 | 162 | // Traverse the merge chain 163 | do { 164 | cell_flags[next_id] = count % 2 ? 0 : 1; 165 | next_id = nexts[next_id]; 166 | count++; 167 | } while (next_id >= 0); 168 | } 169 | } 170 | } 171 | 172 | /// Computes the number of new references per cell 173 | __global__ void compute_ref_counts(const int* __restrict__ merge_counts, 174 | const int* __restrict__ cell_flags, 175 | int* __restrict__ ref_counts, 176 | int num_cells) { 177 | int id = threadIdx.x + blockDim.x * blockIdx.x; 178 | if (id >= num_cells) return; 179 | 180 | int count = 0; 181 | if (cell_flags[id]) { 182 | const int merged = merge_counts[id]; 183 | count = merged >= 0 ? 
merged : -(merged + 1); 184 | } 185 | ref_counts[id] = count; 186 | } 187 | 188 | /// Performs the merge 189 | template 190 | __global__ void merge(const Entry* __restrict__ entries, 191 | const Cell* __restrict__ cells, 192 | const int* __restrict__ refs, 193 | const int* __restrict__ cell_scan, 194 | const int* __restrict__ ref_scan, 195 | const int* __restrict__ merge_counts, 196 | int* __restrict__ new_cell_ids, 197 | Cell* __restrict__ new_cells, 198 | int* __restrict__ new_refs, 199 | int num_cells) { 200 | int id = threadIdx.x + blockDim.x * blockIdx.x; 201 | 202 | bool valid = id < num_cells; 203 | int new_id = valid ? cell_scan[id] : 0; 204 | valid &= cell_scan[id + 1] > new_id; 205 | 206 | int cell_begin = 0, cell_end = 0; 207 | int next_begin = 0, next_end = 0; 208 | int new_refs_begin; 209 | 210 | if (valid) { 211 | auto cell = load_cell(cells + id); 212 | int merge_count = merge_counts[id]; 213 | 214 | new_refs_begin = ref_scan[id]; 215 | new_cell_ids[id] = new_id; 216 | cell_begin = cell.begin; 217 | cell_end = cell.end; 218 | 219 | ivec3 new_min; 220 | ivec3 new_max; 221 | int new_refs_end; 222 | if (merge_count >= 0) { 223 | // Do the merge and store the references into the new array 224 | auto next_id = lookup_entry(entries, grid_shift, grid_dims >> grid_shift, next_cell(cell.min, cell.max)); 225 | auto next_cell = load_cell(cells + next_id); 226 | next_begin = next_cell.begin; 227 | next_end = next_cell.end; 228 | 229 | // Make the next cell point to the merged one 230 | new_cell_ids[next_id] = new_id; 231 | 232 | new_min = min(next_cell.min, cell.min); 233 | new_max = max(next_cell.max, cell.max); 234 | new_refs_end = new_refs_begin + merge_count; 235 | } else { 236 | new_min = cell.min; 237 | new_max = cell.max; 238 | new_refs_end = new_refs_begin + (cell_end - cell_begin); 239 | } 240 | 241 | store_cell(new_cells + new_id, Cell(new_min, new_refs_begin, 242 | new_max, new_refs_end)); 243 | } 244 | 245 | int warp_id = threadIdx.x % 32; 246 | bool merge = next_begin < next_end; 247 | 248 | // Process consecutive ranges of cells that do not want to be merged 249 | static constexpr unsigned all_mask = unsigned(-1); 250 | uint32_t merge_mask = __ballot_sync(all_mask, valid & !merge); 251 | uint32_t full_mask = __ballot_sync(all_mask, cell_begin < cell_end); 252 | while (merge_mask) { 253 | // Find the range of cells [first_bit, last_bit] that are not merged 254 | auto first_bit = __ffs(merge_mask) - 1; 255 | auto shift_mask = ~(merge_mask >> first_bit); 256 | auto last_bit = shift_mask ? 
__ffs(shift_mask) + first_bit - 2 : first_bit; 257 | merge_mask &= ~((1 << (last_bit + 1)) - 1); 258 | 259 | // Skip cells that do not contain references 260 | shift_mask = full_mask >> first_bit; 261 | if (!shift_mask) continue; 262 | first_bit += __ffs(shift_mask) - 1; 263 | last_bit -= __clz(full_mask << (31 - last_bit)); 264 | 265 | auto begin = __shfl_sync(all_mask, cell_begin, first_bit); 266 | auto end = __shfl_sync(all_mask, cell_end, last_bit); 267 | auto new_begin = __shfl_sync(all_mask, new_refs_begin, first_bit); 268 | for (int i = begin + warp_id, j = new_begin + warp_id; i < end; i += 32, j += 32) 269 | new_refs[j] = refs[i]; 270 | } 271 | 272 | // Merge references if required 273 | if (merge) { 274 | merge_refs(refs + cell_begin, cell_end - cell_begin, 275 | refs + next_begin, next_end - next_begin, 276 | new_refs + new_refs_begin); 277 | } 278 | } 279 | 280 | /// Maps the old cell indices in the voxel map to the new ones 281 | __global__ void remap_entries(Entry* __restrict__ entries, 282 | const int* __restrict__ new_cell_ids, 283 | int num_entries) { 284 | int id = threadIdx.x + blockDim.x * blockIdx.x; 285 | 286 | if (id < num_entries) { 287 | auto entry = entries[id]; 288 | if (entry.log_dim == 0) entries[id] = make_entry(0, new_cell_ids[entry.begin]); 289 | } 290 | } 291 | 292 | template 293 | void merge_iteration(MemManager& mem, Grid& grid, Cell*& new_cells, int*& new_refs, int empty_mask, MergeBuffers& bufs) { 294 | Parallel par(mem); 295 | 296 | int num_cells = grid.num_cells; 297 | int num_entries = grid.num_entries; 298 | auto cells = grid.cells; 299 | auto refs = grid.ref_ids; 300 | auto entries = grid.entries; 301 | 302 | mem.one(bufs.prevs, num_cells); 303 | compute_merge_counts<<>>(entries, cells, refs, bufs.merge_counts, bufs.nexts, bufs.prevs, empty_mask, num_cells); 304 | DEBUG_SYNC(); 305 | compute_cell_flags<<>>(bufs.nexts, bufs.prevs, bufs.cell_flags, num_cells); 306 | DEBUG_SYNC(); 307 | compute_ref_counts<<>>(bufs.merge_counts, bufs.cell_flags, bufs.ref_counts, num_cells); 308 | DEBUG_SYNC(); 309 | 310 | int num_new_refs = par.scan(bufs.ref_counts, num_cells + 1, bufs.ref_scan); 311 | int num_new_cells = par.scan(bufs.cell_flags, num_cells + 1, bufs.cell_scan); 312 | 313 | merge<<>>(entries, cells, refs, 314 | bufs.cell_scan, bufs.ref_scan, 315 | bufs.merge_counts, bufs.new_cell_ids, 316 | new_cells, new_refs, 317 | num_cells); 318 | DEBUG_SYNC(); 319 | remap_entries<<>>(entries, bufs.new_cell_ids, num_entries); 320 | DEBUG_SYNC(); 321 | 322 | std::swap(new_cells, cells); 323 | std::swap(new_refs, refs); 324 | 325 | grid.cells = cells; 326 | grid.ref_ids = refs; 327 | grid.num_cells = num_new_cells; 328 | grid.num_refs = num_new_refs; 329 | } 330 | 331 | void merge_grid(MemManager& mem, Grid& grid, float alpha) { 332 | MergeBuffers bufs; 333 | 334 | auto new_cells = mem.alloc(grid.num_cells); 335 | auto new_refs = mem.alloc (grid.num_refs); 336 | 337 | size_t buf_size = grid.num_cells + 1; 338 | buf_size = buf_size % 4 ? 
buf_size + 4 - buf_size % 4 : buf_size; 339 | 340 | bufs.merge_counts = mem.alloc(buf_size); 341 | bufs.ref_counts = mem.alloc(buf_size); 342 | bufs.cell_flags = mem.alloc(buf_size); 343 | bufs.cell_scan = mem.alloc(buf_size); 344 | bufs.ref_scan = mem.alloc(buf_size); 345 | bufs.new_cell_ids = bufs.cell_flags; 346 | bufs.prevs = bufs.cell_scan; 347 | bufs.nexts = bufs.ref_scan; 348 | 349 | auto extents = grid.bbox.extents(); 350 | auto dims = grid.dims << grid.shift; 351 | auto cell_size = extents / vec3(dims); 352 | 353 | set_global(hagrid::grid_dims, dims); 354 | set_global(hagrid::cell_size, cell_size); 355 | set_global(hagrid::grid_shift, grid.shift); 356 | 357 | if (alpha > 0) { 358 | int prev_num_cells = 0, iter = 0; 359 | do { 360 | prev_num_cells = grid.num_cells; 361 | auto mask = iter > 3 ? 0 : (1 << (iter + 1)) - 1; 362 | merge_iteration<0>(mem, grid, new_cells, new_refs, mask, bufs); 363 | merge_iteration<1>(mem, grid, new_cells, new_refs, mask, bufs); 364 | merge_iteration<2>(mem, grid, new_cells, new_refs, mask, bufs); 365 | iter++; 366 | } while (grid.num_cells < alpha * prev_num_cells); 367 | } 368 | 369 | mem.free(bufs.merge_counts); 370 | mem.free(bufs.ref_counts); 371 | mem.free(bufs.cell_flags); 372 | mem.free(bufs.cell_scan); 373 | mem.free(bufs.ref_scan); 374 | 375 | mem.free(new_cells); 376 | mem.free(new_refs); 377 | } 378 | 379 | } // namespace hagrid 380 | -------------------------------------------------------------------------------- /src/parallel.cuh: -------------------------------------------------------------------------------- 1 | #ifndef PARALLEL_CUH 2 | #define PARALLEL_CUH 3 | 4 | #include 5 | #include 6 | #include "mem_manager.h" 7 | #include "common.h" 8 | 9 | namespace hagrid { 10 | 11 | /// Parallel primitives (mostly a wrapper around CUB) 12 | class Parallel { 13 | private: 14 | template 15 | struct ResultType { 16 | typedef typename std::remove_reference::type Type; 17 | }; 18 | 19 | public: 20 | Parallel(MemManager& mem) 21 | : mem_(mem) 22 | {} 23 | 24 | /// Creates a transformation iterator 25 | template 26 | auto transform(InputIt values, F f) -> cub::TransformInputIterator { 27 | return cub::TransformInputIterator(values, f); 28 | } 29 | 30 | /// Computes the exclusive sum of the given array, and returns the total 31 | template 32 | auto scan(InputIt values, int n, OutputIt result) -> typename ResultType::Type { 33 | typedef typename ResultType::Type T; 34 | size_t required_bytes; 35 | CHECK_CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, required_bytes, values, result, n)); 36 | char* tmp_storage = mem_.alloc(required_bytes); 37 | CHECK_CUDA_CALL(cub::DeviceScan::ExclusiveSum(tmp_storage, required_bytes, values, result, n)); 38 | mem_.free(tmp_storage); 39 | T total; 40 | CHECK_CUDA_CALL(cudaMemcpy(&total, result + n - 1, sizeof(T), cudaMemcpyDeviceToHost)); 41 | return total; 42 | } 43 | 44 | /// Computes the reduction of the given operator over the given array array 45 | template 46 | auto reduce(InputIt values, int n, OutputIt result, F f, typename ResultType::Type init = typename ResultType::Type()) -> typename ResultType::Type { 47 | typedef typename ResultType::Type T; 48 | size_t required_bytes; 49 | CHECK_CUDA_CALL(cub::DeviceReduce::Reduce(nullptr, required_bytes, values, result, n, f, init)); 50 | char* tmp_storage = mem_.alloc(required_bytes); 51 | CHECK_CUDA_CALL(cub::DeviceReduce::Reduce(tmp_storage, required_bytes, values, result, n, f, init)); 52 | mem_.free(tmp_storage); 53 | T host_result; 54 | 
CHECK_CUDA_CALL(cudaMemcpy(&host_result, result, sizeof(T), cudaMemcpyDeviceToHost)); 55 | return host_result; 56 | } 57 | 58 | /// Computes a partition of the given set according to an array of flags, returns the number of elements in first half 59 | template 60 | int partition(InputIt values, OutputIt result, int n, FlagIt flags) { 61 | size_t required_bytes; 62 | CHECK_CUDA_CALL(cub::DevicePartition::Flagged(nullptr, required_bytes, values, flags, result, (int*)nullptr, n)); 63 | required_bytes += 4 - required_bytes % 4; // Align storage 64 | char* tmp_storage = mem_.alloc(required_bytes + sizeof(int)); 65 | int* count_ptr = reinterpret_cast(tmp_storage + required_bytes); 66 | CHECK_CUDA_CALL(cub::DevicePartition::Flagged(tmp_storage, required_bytes, values, flags, result, count_ptr, n)); 67 | int count; 68 | CHECK_CUDA_CALL(cudaMemcpy(&count, count_ptr, sizeof(int), cudaMemcpyDeviceToHost)); 69 | mem_.free(tmp_storage); 70 | return count; 71 | } 72 | 73 | /// Computes a partition of the given set according to an array of flags, returns the number of elements in first half 74 | template 75 | void sort_pairs(Key* keys_in, Value* values_in, Key*& keys_out, Value*& values_out, int n, int bits = sizeof(Key) * 8) { 76 | size_t required_bytes; 77 | cub::DoubleBuffer keys_buf(keys_in, keys_out); 78 | cub::DoubleBuffer values_buf(values_in, values_out); 79 | CHECK_CUDA_CALL(cub::DeviceRadixSort::SortPairs(nullptr, required_bytes, keys_buf, values_buf, n, 0, bits)); 80 | char* tmp_storage = mem_.alloc(required_bytes + sizeof(int)); 81 | CHECK_CUDA_CALL(cub::DeviceRadixSort::SortPairs(tmp_storage, required_bytes, keys_buf, values_buf, n, 0, bits)); 82 | mem_.free(tmp_storage); 83 | keys_out = keys_buf.Current(); 84 | values_out = values_buf.Current(); 85 | } 86 | 87 | private: 88 | MemManager& mem_; 89 | }; 90 | 91 | } // namespace hagrid 92 | 93 | #endif // PARALLEL_CUH 94 | -------------------------------------------------------------------------------- /src/prims.h: -------------------------------------------------------------------------------- 1 | #ifndef PRIMITIVES_H 2 | #define PRIMITIVES_H 3 | 4 | #include 5 | #include 6 | #include "vec.h" 7 | #include "bbox.h" 8 | #include "ray.h" 9 | 10 | namespace hagrid { 11 | 12 | /// Triangle (point + edges + normal) 13 | struct Tri { 14 | vec3 v0; float nx; 15 | vec3 e1; float ny; 16 | vec3 e2; float nz; 17 | 18 | HOST DEVICE Tri() {} 19 | HOST DEVICE Tri(const vec3& v0, float nx, 20 | const vec3& e1, float ny, 21 | const vec3& e2, float nz) 22 | : v0(v0), nx(nx) 23 | , e1(e1), ny(ny) 24 | , e2(e2), nz(nz) 25 | {} 26 | 27 | HOST DEVICE BBox bbox() const { 28 | auto v1 = v0 - e1; 29 | auto v2 = v0 + e2; 30 | return BBox(min(v0, min(v1, v2)), max(v0, max(v1, v2))); 31 | } 32 | 33 | template 34 | HOST DEVICE vec2 clipped_bounds(float min1, float max1, float min2, float max2) const { 35 | auto e3 = e1 + e2; 36 | auto v1 = v0 - e1; 37 | auto v2 = v0 + e2; 38 | 39 | vec2 bounds = vec2(FLT_MAX, FLT_MIN); 40 | 41 | if (get(v0) >= min1 && get(v0) <= max1 && 42 | get(v0) >= min2 && get(v0) <= max2) { 43 | bounds.x = min(get(v0), bounds.x); 44 | bounds.y = max(get(v0), bounds.y); 45 | } 46 | if (get(v1) >= min1 && get(v1) <= max1 && 47 | get(v1) >= min2 && get(v1) <= max2) { 48 | bounds.x = min(get(v1), bounds.x); 49 | bounds.y = max(get(v1), bounds.y); 50 | } 51 | if (get(v2) >= min1 && get(v2) <= max1 && 52 | get(v2) >= min2 && get(v2) <= max2) { 53 | bounds.x = min(get(v2), bounds.x); 54 | bounds.y = max(get(v2), bounds.y); 55 | } 56 | 57 | auto inv1_e1 = 
1.0f / get(e1); 58 | auto inv1_e2 = 1.0f / get(e2); 59 | auto inv1_e3 = 1.0f / get(e3); 60 | 61 | // Clip on min1 62 | { 63 | auto tmin1_e1 = (get(v0) - min1) * inv1_e1; 64 | auto tmin1_e2 = (min1 - get(v0)) * inv1_e2; 65 | auto tmin1_e3 = (min1 - get(v1)) * inv1_e3; 66 | if (tmin1_e1 <= 1 && tmin1_e1 >= 0) { 67 | auto p = get(v0) - get(e1) * tmin1_e1; 68 | bounds.x = min(p, bounds.x); 69 | bounds.y = max(p, bounds.y); 70 | } 71 | if (tmin1_e2 <= 1 && tmin1_e2 >= 0) { 72 | auto p = get(v0) + get(e2) * tmin1_e2; 73 | bounds.x = min(p, bounds.x); 74 | bounds.y = max(p, bounds.y); 75 | } 76 | if (tmin1_e3 <= 1 && tmin1_e3 >= 0) { 77 | auto p = get(v1) + get(e3) * tmin1_e3; 78 | bounds.x = min(p, bounds.x); 79 | bounds.y = max(p, bounds.y); 80 | } 81 | } 82 | 83 | // Clip on max1 84 | { 85 | auto tmax1_e1 = (get(v0) - max1) * inv1_e1; 86 | auto tmax1_e2 = (max1 - get(v0)) * inv1_e2; 87 | auto tmax1_e3 = (max1 - get(v1)) * inv1_e3; 88 | if (tmax1_e1 <= 1 && tmax1_e1 >= 0) { 89 | auto p = get(v0) - get(e1) * tmax1_e1; 90 | bounds.x = min(p, bounds.x); 91 | bounds.y = max(p, bounds.y); 92 | } 93 | if (tmax1_e2 <= 1 && tmax1_e2 >= 0) { 94 | auto p = get(v0) + get(e2) * tmax1_e2; 95 | bounds.x = min(p, bounds.x); 96 | bounds.y = max(p, bounds.y); 97 | } 98 | if (tmax1_e3 <= 1 && tmax1_e3 >= 0) { 99 | auto p = get(v1) + get(e3) * tmax1_e3; 100 | bounds.x = min(p, bounds.x); 101 | bounds.y = max(p, bounds.y); 102 | } 103 | } 104 | 105 | auto inv2_e1 = 1.0f / get(e1); 106 | auto inv2_e2 = 1.0f / get(e2); 107 | auto inv2_e3 = 1.0f / get(e3); 108 | 109 | // Clip on min2 110 | { 111 | auto tmin2_e1 = (get(v0) - min2) * inv2_e1; 112 | auto tmin2_e2 = (min2 - get(v0)) * inv2_e2; 113 | auto tmin2_e3 = (min2 - get(v1)) * inv2_e3; 114 | if (tmin2_e1 <= 1 && tmin2_e1 >= 0) { 115 | auto p = get(v0) - get(e1) * tmin2_e1; 116 | bounds.x = min(p, bounds.x); 117 | bounds.y = max(p, bounds.y); 118 | } 119 | if (tmin2_e2 <= 1 && tmin2_e2 >= 0) { 120 | auto p = get(v0) + get(e2) * tmin2_e2; 121 | bounds.x = min(p, bounds.x); 122 | bounds.y = max(p, bounds.y); 123 | } 124 | if (tmin2_e3 <= 1 && tmin2_e3 >= 0) { 125 | auto p = get(v1) + get(e3) * tmin2_e3; 126 | bounds.x = min(p, bounds.x); 127 | bounds.y = max(p, bounds.y); 128 | } 129 | } 130 | 131 | // Clip on max2 132 | { 133 | auto tmax2_e1 = (get(v0) - max2) * inv2_e1; 134 | auto tmax2_e2 = (max2 - get(v0)) * inv2_e2; 135 | auto tmax2_e3 = (max2 - get(v1)) * inv2_e3; 136 | if (tmax2_e1 <= 1 && tmax2_e1 >= 0) { 137 | auto p = get(v0) - get(e1) * tmax2_e1; 138 | bounds.x = min(p, bounds.x); 139 | bounds.y = max(p, bounds.y); 140 | } 141 | if (tmax2_e2 <= 1 && tmax2_e2 >= 0) { 142 | auto p = get(v0) + get(e2) * tmax2_e2; 143 | bounds.x = min(p, bounds.x); 144 | bounds.y = max(p, bounds.y); 145 | } 146 | if (tmax2_e3 <= 1 && tmax2_e3 >= 0) { 147 | auto p = get(v1) + get(e3) * tmax2_e3; 148 | bounds.x = min(p, bounds.x); 149 | bounds.y = max(p, bounds.y); 150 | } 151 | } 152 | 153 | return bounds; 154 | } 155 | 156 | HOST DEVICE vec3 normal() const { 157 | return vec3(nx, ny, nz); 158 | } 159 | }; 160 | 161 | HOST DEVICE inline bool plane_overlap_box(const vec3& n, float d, const vec3& min, const vec3& max) { 162 | auto first = vec3(n.x > 0 ? min.x : max.x, 163 | n.y > 0 ? min.y : max.y, 164 | n.z > 0 ? min.z : max.z); 165 | 166 | auto last = vec3(n.x <= 0 ? min.x : max.x, 167 | n.y <= 0 ? min.y : max.y, 168 | n.z <= 0 ? 
min.z : max.z); 169 | 170 | auto d0 = dot(n, first) - d; 171 | auto d1 = dot(n, last) - d; 172 | #if __CUDACC_VER_MAJOR__ == 7 173 | union { int i; float f; } u0 = { .f = d0 }; 174 | union { int i; float f; } u1 = { .f = d1 }; 175 | // Equivalent to d1 * d0 <= 0.0f (CUDA 7.0 bug) 176 | return (((u0.i ^ u1.i) & 0x80000000) | (d0 == 0.0f) | (d1 == 0.0f)) != 0; 177 | #else 178 | return d1 * d0 <= 0.0f; 179 | #endif 180 | } 181 | 182 | HOST DEVICE inline bool axis_test_x(const vec3& half_size, 183 | const vec3& e, const vec3& f, 184 | const vec3& v0, const vec3& v1) { 185 | auto p0 = e.y * v0.z - e.z * v0.y; 186 | auto p1 = e.y * v1.z - e.z * v1.y; 187 | auto rad = f.z * half_size.y + f.y * half_size.z; 188 | return fmin(p0, p1) > rad | fmax(p0, p1) < -rad; 189 | } 190 | 191 | HOST DEVICE inline bool axis_test_y(const vec3& half_size, 192 | const vec3& e, const vec3& f, 193 | const vec3& v0, const vec3& v1) { 194 | auto p0 = e.z * v0.x - e.x * v0.z; 195 | auto p1 = e.z * v1.x - e.x * v1.z; 196 | auto rad = f.z * half_size.x + f.x * half_size.z; 197 | return fmin(p0, p1) > rad | fmax(p0, p1) < -rad; 198 | } 199 | 200 | HOST DEVICE inline bool axis_test_z(const vec3& half_size, 201 | const vec3& e, const vec3& f, 202 | const vec3& v0, const vec3& v1) { 203 | auto p0 = e.x * v0.y - e.y * v0.x; 204 | auto p1 = e.x * v1.y - e.y * v1.x; 205 | auto rad = f.y * half_size.x + f.x * half_size.y; 206 | return fmin(p0, p1) > rad | fmax(p0, p1) < -rad; 207 | } 208 | 209 | template 210 | HOST DEVICE inline bool intersect_tri_box(const vec3& v0, const vec3& e1, const vec3& e2, const vec3& n, const vec3& min, const vec3& max) { 211 | if (!plane_overlap_box(n, dot(v0, n), min, max)) 212 | return false; 213 | 214 | auto v1 = v0 - e1; 215 | auto v2 = v0 + e2; 216 | if (bounds_check) { 217 | auto min_x = fmin(v0.x, fmin(v1.x, v2.x)); 218 | auto max_x = fmax(v0.x, fmax(v1.x, v2.x)); 219 | if (min_x > max.x | max_x < min.x) return false; 220 | 221 | auto min_y = fmin(v0.y, fmin(v1.y, v2.y)); 222 | auto max_y = fmax(v0.y, fmax(v1.y, v2.y)); 223 | if (min_y > max.y | max_y < min.y) return false; 224 | 225 | auto min_z = fmin(v0.z, fmin(v1.z, v2.z)); 226 | auto max_z = fmax(v0.z, fmax(v1.z, v2.z)); 227 | if (min_z > max.z | max_z < min.z) return false; 228 | } 229 | 230 | if (cross_axes) { 231 | auto center = (max + min) * 0.5f; 232 | auto half_size = (max - min) * 0.5f; 233 | 234 | auto w0 = v0 - center; 235 | auto w1 = v1 - center; 236 | auto w2 = v2 - center; 237 | 238 | auto f1 = vec3(fabs(e1.x), fabs(e1.y), fabs(e1.z)); 239 | if (axis_test_x(half_size, e1, f1, w0, w2) || 240 | axis_test_y(half_size, e1, f1, w0, w2) || 241 | axis_test_z(half_size, e1, f1, w1, w2)) 242 | return false; 243 | 244 | auto f2 = vec3(fabs(e2.x), fabs(e2.y), fabs(e2.z)); 245 | if (axis_test_x(half_size, e2, f2, w0, w1) || 246 | axis_test_y(half_size, e2, f2, w0, w1) || 247 | axis_test_z(half_size, e2, f2, w1, w2)) 248 | return false; 249 | 250 | auto e3 = e1 + e2; 251 | 252 | auto f3 = vec3(fabs(e3.x), fabs(e3.y), fabs(e3.z)); 253 | if (axis_test_x(half_size, e3, f3, w0, w2) || 254 | axis_test_y(half_size, e3, f3, w0, w2) || 255 | axis_test_z(half_size, e3, f3, w0, w1)) 256 | return false; 257 | } 258 | 259 | return true; 260 | } 261 | 262 | HOST DEVICE inline bool intersect_prim_cell(const Tri& tri, const BBox& bbox) { 263 | return intersect_tri_box(tri.v0, tri.e1, tri.e2, tri.normal(), bbox.min, bbox.max); 264 | } 265 | 266 | HOST DEVICE inline bool intersect_prim_ray(const Tri& tri, const Ray& ray, int id, Hit& hit) { 267 | // Moeller 
Trumbore 268 | auto n = tri.normal(); 269 | 270 | auto c = tri.v0 - ray.org; 271 | auto r = cross(ray.dir, c); 272 | auto det = dot(n, ray.dir); 273 | auto abs_det = fabs(det); 274 | 275 | auto u = prodsign(dot(r, tri.e2), det); 276 | auto v = prodsign(dot(r, tri.e1), det); 277 | auto w = abs_det - u - v; 278 | 279 | auto eps = 1e-9f; 280 | if (u >= -eps && v >= -eps && w >= -eps) { 281 | auto t = prodsign(dot(n, c), det); 282 | if (t >= abs_det * ray.tmin && abs_det * ray.tmax > t) { 283 | auto inv_det = 1.0f / abs_det; 284 | hit.t = t * inv_det; 285 | #ifdef COMPUTE_UVS 286 | hit.u = u * inv_det; 287 | hit.v = v * inv_det; 288 | #endif 289 | hit.id = id; 290 | return true; 291 | } 292 | } 293 | 294 | return false; 295 | } 296 | 297 | #ifdef __NVCC__ 298 | __device__ __forceinline__ Tri load_prim(const Tri* tri_ptr) { 299 | const float4* ptr = (const float4*)tri_ptr; 300 | auto tri0 = ptr[0]; 301 | auto tri1 = ptr[1]; 302 | auto tri2 = ptr[2]; 303 | return Tri(vec3(tri0.x, tri0.y, tri0.z), tri0.w, 304 | vec3(tri1.x, tri1.y, tri1.z), tri1.w, 305 | vec3(tri2.x, tri2.y, tri2.z), tri2.w); 306 | } 307 | #endif 308 | 309 | } // namespace hagrid 310 | 311 | #endif // PRIMITIVES_H 312 | -------------------------------------------------------------------------------- /src/profile.cu: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | namespace hagrid { 4 | 5 | __host__ float profile(std::function f) { 6 | cudaEvent_t start_kernel, end_kernel; 7 | CHECK_CUDA_CALL(cudaEventCreate(&start_kernel)); 8 | CHECK_CUDA_CALL(cudaEventCreate(&end_kernel)); 9 | CHECK_CUDA_CALL(cudaEventRecord(start_kernel)); 10 | f(); 11 | CHECK_CUDA_CALL(cudaEventRecord(end_kernel)); 12 | CHECK_CUDA_CALL(cudaEventSynchronize(end_kernel)); 13 | float kernel_time = 0; 14 | CHECK_CUDA_CALL(cudaEventElapsedTime(&kernel_time, start_kernel, end_kernel)); 15 | CHECK_CUDA_CALL(cudaEventDestroy(start_kernel)); 16 | CHECK_CUDA_CALL(cudaEventDestroy(end_kernel)); 17 | return kernel_time; 18 | } 19 | 20 | } // namespace hagrid 21 | -------------------------------------------------------------------------------- /src/ray.h: -------------------------------------------------------------------------------- 1 | #ifndef RAY_H 2 | #define RAY_H 3 | 4 | #include "vec.h" 5 | 6 | namespace hagrid { 7 | 8 | /// Ray, defined as org + t * dir with t in [tmin, tmax] 9 | struct Ray { 10 | vec3 org; 11 | float tmin; 12 | vec3 dir; 13 | float tmax; 14 | 15 | HOST DEVICE Ray() {} 16 | HOST DEVICE Ray(const vec3& org, float tmin, 17 | const vec3& dir, float tmax) 18 | : org(org), tmin(tmin), dir(dir), tmax(tmax) 19 | {} 20 | }; 21 | 22 | /// Result of a hit (id is -1 if there is no hit) 23 | struct Hit { 24 | int id; 25 | float t; 26 | float u; 27 | float v; 28 | 29 | HOST DEVICE Hit() {} 30 | HOST DEVICE Hit(int id, float t, float u, float v) 31 | : id(id), t(t), u(u), v(v) 32 | {} 33 | }; 34 | 35 | #ifdef __NVCC__ 36 | __device__ __forceinline__ Ray load_ray(const Ray* ray_ptr) { 37 | const float4* ptr = (const float4*)ray_ptr; 38 | auto ray0 = ptr[0]; 39 | auto ray1 = ptr[1]; 40 | return Ray(vec3(ray0.x, ray0.y, ray0.z), ray0.w, 41 | vec3(ray1.x, ray1.y, ray1.z), ray1.w); 42 | } 43 | 44 | __device__ __forceinline__ void store_hit(Hit* hit_ptr, const Hit& hit) { 45 | float4* ptr = (float4*)hit_ptr; 46 | ptr[0] = make_float4(__int_as_float(hit.id), hit.t, hit.u, hit.v); 47 | } 48 | #endif 49 | 50 | } // namespace hagrid 51 | 52 | #endif // RAY_H 53 | 
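A minimal host-side sketch of how the Ray/Hit structures above are typically staged for a traversal call, roughly mirroring what main.cpp does (hypothetical glue code, not part of the repository): it assumes a Grid and a device-side triangle buffer have already been built, that the headers are compiled with nvcc, and it reconstructs the template arguments of MemManager::alloc/copy, with the copy direction as the first parameter of copy.

    // Hypothetical usage sketch: upload rays, trace, read back hits.
    #include <vector>
    #include "mem_manager.h"
    #include "traverse.h"

    void trace_all(hagrid::MemManager& mem, const hagrid::Grid& grid,
                   const hagrid::Tri* device_tris,
                   const std::vector<hagrid::Ray>& host_rays,
                   std::vector<hagrid::Hit>& host_hits) {
        const int n = static_cast<int>(host_rays.size());
        auto rays = mem.alloc<hagrid::Ray>(n);   // slots are re-used when the manager was created with keep = true
        auto hits = mem.alloc<hagrid::Hit>(n);
        mem.copy<hagrid::Copy::HST_TO_DEV>(rays, host_rays.data(), n);

        hagrid::setup_traversal(grid);           // uploads the grid constants used by the traversal kernel
        hagrid::traverse_grid(grid, device_tris, rays, hits, n);

        host_hits.resize(n);
        mem.copy<hagrid::Copy::DEV_TO_HST>(host_hits.data(), hits, n);
        mem.free(rays);
        mem.free(hits);
    }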
-------------------------------------------------------------------------------- /src/traverse.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "traverse.h" 4 | 5 | namespace hagrid { 6 | 7 | static __constant__ ivec3 grid_dims; 8 | static __constant__ vec3 grid_min; 9 | static __constant__ vec3 grid_max; 10 | static __constant__ vec3 cell_size; 11 | static __constant__ vec3 grid_inv; 12 | static __constant__ int grid_shift; 13 | 14 | __device__ __forceinline__ vec2 intersect_ray_box(vec3 org, vec3 inv_dir, vec3 box_min, vec3 box_max) { 15 | auto tmin = (box_min - org) * inv_dir; 16 | auto tmax = (box_max - org) * inv_dir; 17 | auto t0 = min(tmin, tmax); 18 | auto t1 = max(tmin, tmax); 19 | return vec2(fmax(t0.x, fmax(t0.y, t0.z)), 20 | fmin(t1.x, fmin(t1.y, t1.z))); 21 | } 22 | 23 | __device__ __forceinline__ vec3 compute_voxel(vec3 org, vec3 dir, float t) { 24 | return (t * dir + org - grid_min) * grid_inv; 25 | } 26 | 27 | template 28 | __global__ void traverse(const Entry* __restrict__ entries, 29 | const CellT* __restrict__ cells, 30 | const int* __restrict__ ref_ids, 31 | const Primitive* __restrict__ prims, 32 | const Ray* __restrict__ rays, 33 | Hit* __restrict__ hits, 34 | int num_rays) { 35 | const int id = threadIdx.x + blockDim.x * blockIdx.x; 36 | if (id >= num_rays) return; 37 | 38 | auto ray = load_ray(rays + id); 39 | auto inv_dir = vec3(safe_rcp(ray.dir.x), safe_rcp(ray.dir.y), safe_rcp(ray.dir.z)); 40 | 41 | // Intersect the grid bounding box 42 | auto tbox = intersect_ray_box(ray.org, inv_dir, grid_min, grid_max); 43 | auto tstart = fmax(tbox.x, ray.tmin); 44 | auto tend = fmin(tbox.y, ray.tmax); 45 | 46 | auto hit = Hit(-1, ray.tmax, 0, 0); 47 | int steps = 0; 48 | ivec3 voxel; 49 | 50 | // Early exit if the ray does not hit the grid 51 | if (tstart > tend) goto exit; 52 | 53 | // Find initial voxel 54 | voxel = clamp(ivec3(compute_voxel(ray.org, ray.dir, tstart)), ivec3(0, 0, 0), grid_dims - 1); 55 | 56 | while (true) { 57 | // Lookup entry 58 | const int entry = lookup_entry(entries, grid_shift, grid_dims >> grid_shift, voxel); 59 | 60 | // Lookup the cell associated with this voxel 61 | auto cell = load_cell(cells + entry); 62 | 63 | // Intersect the farmost planes of the cell bounding box 64 | auto cell_point = ivec3(ray.dir.x >= 0.0f ? cell.max.x : cell.min.x, 65 | ray.dir.y >= 0.0f ? cell.max.y : cell.min.y, 66 | ray.dir.z >= 0.0f ? cell.max.z : cell.min.z); 67 | auto tcell = (vec3(cell_point) * cell_size + grid_min - ray.org) * inv_dir; 68 | auto texit = fmin(tcell.x, fmin(tcell.y, tcell.z)); 69 | 70 | // Move to the next voxel 71 | auto exit_point = ivec3(compute_voxel(ray.org, ray.dir, texit)); 72 | auto next_voxel = ivec3(texit == tcell.x ? cell_point.x + (ray.dir.x >= 0.0f ? 0 : -1) : exit_point.x, 73 | texit == tcell.y ? cell_point.y + (ray.dir.y >= 0.0f ? 0 : -1) : exit_point.y, 74 | texit == tcell.z ? cell_point.z + (ray.dir.z >= 0.0f ? 0 : -1) : exit_point.z); 75 | voxel.x = ray.dir.x >= 0.0f ? max(next_voxel.x, voxel.x) : min(next_voxel.x, voxel.x); 76 | voxel.y = ray.dir.y >= 0.0f ? max(next_voxel.y, voxel.y) : min(next_voxel.y, voxel.y); 77 | voxel.z = ray.dir.z >= 0.0f ? 
max(next_voxel.z, voxel.z) : min(next_voxel.z, voxel.z); 78 | 79 | // Intersect the cell contents and exit if an intersection was found 80 | steps += 1 + foreach_ref(cell, ref_ids, [&] (int ref) { 81 | auto prim = load_prim(prims + ref); 82 | intersect_prim_ray(prim, Ray(ray.org, ray.tmin, ray.dir, hit.t), ref, hit); 83 | }); 84 | 85 | if (hit.t <= texit || 86 | (voxel.x < 0 | voxel.x >= grid_dims.x | 87 | voxel.y < 0 | voxel.y >= grid_dims.y | 88 | voxel.z < 0 | voxel.z >= grid_dims.z)) 89 | break; 90 | } 91 | 92 | exit: 93 | hit.id = steps; 94 | store_hit(hits + id, hit); 95 | } 96 | 97 | void setup_traversal(const Grid& grid) { 98 | auto extents = grid.bbox.extents(); 99 | auto dims = grid.dims << grid.shift; 100 | auto grid_inv = vec3(dims) / extents; 101 | auto cell_size = extents / vec3(dims); 102 | 103 | set_global(hagrid::grid_min, grid.bbox.min); 104 | set_global(hagrid::grid_max, grid.bbox.max); 105 | set_global(hagrid::grid_dims, dims); 106 | set_global(hagrid::cell_size, cell_size); 107 | set_global(hagrid::grid_inv, grid_inv); 108 | set_global(hagrid::grid_shift, grid.shift); 109 | } 110 | 111 | void traverse_grid(const Grid& grid, const Tri* tris, const Ray* rays, Hit* hits, int num_rays) { 112 | if (grid.small_cells) { 113 | traverse<<>>(grid.entries, grid.small_cells, grid.ref_ids, tris, rays, hits, num_rays); 114 | } else { 115 | traverse<<>>(grid.entries, grid.cells, grid.ref_ids, tris, rays, hits, num_rays); 116 | } 117 | } 118 | 119 | } // namespace hagrid 120 | -------------------------------------------------------------------------------- /src/traverse.h: -------------------------------------------------------------------------------- 1 | #ifndef TRAVERSE_H 2 | #define TRAVERSE_H 3 | 4 | #include "grid.h" 5 | #include "vec.h" 6 | #include "prims.h" 7 | 8 | namespace hagrid { 9 | 10 | /// Setups the traversal constants 11 | void setup_traversal(const Grid& grid); 12 | 13 | /// Traverses the structure with the given set of rays 14 | void traverse_grid(const Grid& grid, const Tri* tris, const Ray* rays, Hit* hits, int num_rays); 15 | 16 | } // namespace hagrid 17 | 18 | #endif // TRAVERSE_H 19 | -------------------------------------------------------------------------------- /src/vec.h: -------------------------------------------------------------------------------- 1 | #ifndef VEC_H 2 | #define VEC_H 3 | 4 | #include 5 | #include "common.h" 6 | 7 | namespace hagrid { 8 | 9 | template 10 | struct tvec2 { 11 | T x, y; 12 | HOST DEVICE tvec2() {} 13 | HOST DEVICE tvec2(T xy) : x(xy), y(xy) {} 14 | HOST DEVICE tvec2(T x, T y) : x(x), y(y) {} 15 | template 16 | HOST DEVICE explicit tvec2(const tvec2& xy) : x(xy.x), y(xy.y) {} 17 | 18 | HOST DEVICE tvec2& operator += (const tvec2& other) { *this = *this + other; return *this; } 19 | HOST DEVICE tvec2& operator -= (const tvec2& other) { *this = *this - other; return *this; } 20 | HOST DEVICE tvec2& operator *= (const tvec2& other) { *this = *this * other; return *this; } 21 | HOST DEVICE tvec2& operator /= (const tvec2& other) { *this = *this / other; return *this; } 22 | 23 | HOST DEVICE tvec2& operator *= (T t) { *this = *this * t; return *this; } 24 | HOST DEVICE tvec2& operator /= (T t) { *this = *this / t; return *this; } 25 | }; 26 | 27 | #define BINARY_OP2(op) \ 28 | template HOST DEVICE tvec2 operator op (const tvec2& a, const tvec2& b) { return tvec2(a.x op b.x, a.y op b.y); } \ 29 | template HOST DEVICE tvec2 operator op (const tvec2& a, T b) { return tvec2(a.x op b, a.y op b); } \ 30 | template HOST DEVICE tvec2 
operator op (T a, const tvec2& b) { return tvec2(a op b.x, a op b.y); } 31 | 32 | BINARY_OP2(+) 33 | BINARY_OP2(-) 34 | BINARY_OP2(*) 35 | BINARY_OP2(/) 36 | BINARY_OP2(<<) 37 | BINARY_OP2(>>) 38 | BINARY_OP2(&) 39 | BINARY_OP2(|) 40 | 41 | #undef BINARY_OP2 42 | 43 | template HOST DEVICE tvec2 min(const tvec2& a, const tvec2& b) { return tvec2(min(a.x, b.x), min(a.y, b.y)); } 44 | template HOST DEVICE tvec2 max(const tvec2& a, const tvec2& b) { return tvec2(max(a.x, b.x), max(a.y, b.y)); } 45 | template HOST DEVICE tvec2 clamp(const tvec2& a, T b, T c) { return tvec2(min(max(a.x, b), c), min(max(a.y, b), c)); } 46 | template HOST DEVICE T dot(const tvec2& a, const tvec2& b) { return a.x * b.x + a.y * b.y; } 47 | template HOST DEVICE T length(const tvec2& a) { return std::sqrt(dot(a, a)); } 48 | template HOST DEVICE tvec2 normalize(const tvec2& a) { return a * (1.0f / length(a)); } 49 | 50 | template 51 | HOST DEVICE T get(const tvec2& v) { 52 | if (axis == 0) return v.x; 53 | return v.y; 54 | } 55 | 56 | template 57 | struct tvec3 { 58 | union { T x; T r; }; 59 | union { T y; T g; }; 60 | union { T z; T b; }; 61 | HOST DEVICE tvec3() {} 62 | HOST DEVICE tvec3(T xyz) : x(xyz), y(xyz), z(xyz) {} 63 | HOST DEVICE tvec3(T x, T y, T z) : x(x), y(y), z(z) {} 64 | template 65 | HOST DEVICE explicit tvec3(const tvec3& xyz) : x(xyz.x), y(xyz.y), z(xyz.z) {} 66 | 67 | HOST DEVICE tvec3& operator += (const tvec3& other) { *this = *this + other; return *this; } 68 | HOST DEVICE tvec3& operator -= (const tvec3& other) { *this = *this - other; return *this; } 69 | HOST DEVICE tvec3& operator *= (const tvec3& other) { *this = *this * other; return *this; } 70 | HOST DEVICE tvec3& operator /= (const tvec3& other) { *this = *this / other; return *this; } 71 | 72 | HOST DEVICE tvec3& operator *= (T t) { *this = *this * t; return *this; } 73 | HOST DEVICE tvec3& operator /= (T t) { *this = *this / t; return *this; } 74 | }; 75 | 76 | #define BINARY_OP3(op) \ 77 | template HOST DEVICE tvec3 operator op (const tvec3& a, const tvec3& b) { return tvec3(a.x op b.x, a.y op b.y, a.z op b.z); } \ 78 | template HOST DEVICE tvec3 operator op (const tvec3& a, T b) { return tvec3(a.x op b, a.y op b, a.z op b); } \ 79 | template HOST DEVICE tvec3 operator op (T a, const tvec3& b) { return tvec3(a op b.x, a op b.y, a op b.z); } 80 | 81 | BINARY_OP3(+) 82 | BINARY_OP3(-) 83 | BINARY_OP3(*) 84 | BINARY_OP3(/) 85 | BINARY_OP3(<<) 86 | BINARY_OP3(>>) 87 | BINARY_OP3(&) 88 | BINARY_OP3(|) 89 | 90 | #undef BINARY_OP3 91 | 92 | template HOST DEVICE tvec3 min(const tvec3& a, const tvec3& b) { return tvec3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); } 93 | template HOST DEVICE tvec3 max(const tvec3& a, const tvec3& b) { return tvec3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); } 94 | template HOST DEVICE tvec3 clamp(const tvec3& a, T b, T c) { return tvec3(min(max(a.x, b), c), min(max(a.y, b), c), min(max(a.z, b), c)); } 95 | template HOST DEVICE T dot(const tvec3& a, const tvec3& b) { return a.x * b.x + a.y * b.y + a.z * b.z; } 96 | template HOST DEVICE T length(const tvec3& a) { return std::sqrt(dot(a, a)); } 97 | template HOST DEVICE tvec3 normalize(const tvec3& a) { return a * (1.0f / length(a)); } 98 | 99 | template 100 | HOST DEVICE tvec3 cross(const tvec3& a, const tvec3& b) { 101 | return tvec3(a.y * b.z - a.z * b.y, 102 | a.z * b.x - a.x * b.z, 103 | a.x * b.y - a.y * b.x); 104 | } 105 | 106 | template 107 | HOST DEVICE tvec3 rotate(const tvec3& v, const tvec3& axis, T angle) { 108 | T half = angle / 2; 109 | 110 | T 
q[4] = { 111 | axis.x * std::sin(half), 112 | axis.y * std::sin(half), 113 | axis.z * std::sin(half), 114 | std::cos(half) 115 | }; 116 | 117 | T p[4] = { 118 | q[3] * v.x + q[1] * v.z - q[2] * v.y, 119 | q[3] * v.y - q[0] * v.z + q[2] * v.x, 120 | q[3] * v.z + q[0] * v.y - q[1] * v.x, 121 | -(q[0] * v.x + q[1] * v.y + q[2] * v.z) 122 | }; 123 | 124 | return tvec3(p[3] * -q[0] + p[0] * q[3] + p[1] * -q[2] - p[2] * -q[1], 125 | p[3] * -q[1] - p[0] * -q[2] + p[1] * q[3] + p[2] * -q[0], 126 | p[3] * -q[2] + p[0] * -q[1] - p[1] * -q[0] + p[2] * q[3]); 127 | } 128 | 129 | template 130 | HOST DEVICE T get(const tvec3& v) { 131 | if (axis == 0) return v.x; 132 | else if (axis == 1) return v.y; 133 | else return v.z; 134 | } 135 | 136 | typedef tvec2 vec2; 137 | typedef tvec2 ivec2; 138 | typedef tvec2 usvec2; 139 | typedef tvec3 vec3; 140 | typedef tvec3 ivec3; 141 | typedef tvec3 usvec3; 142 | 143 | } // namespace hagrid 144 | 145 | #endif // VEC_H 146 | --------------------------------------------------------------------------------
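As a closing illustration of the vector utilities in vec.h, here is a small hedged example (not part of the repository, and assuming common.h defines HOST/DEVICE so the header compiles on the host): rotate() rotates a vector about a normalized axis by an angle in radians using the quaternion product written out above, so rotating the x axis by a quarter turn around z should yield approximately (0, 1, 0).

    // Hypothetical host-side check of hagrid::rotate().
    #include <cstdio>
    #include "vec.h"

    int main() {
        const float pi = 3.14159265358979f;
        hagrid::vec3 x(1.0f, 0.0f, 0.0f);
        hagrid::vec3 z(0.0f, 0.0f, 1.0f);
        // Quarter turn around the z axis: expect (0, 1, 0) up to floating-point rounding.
        hagrid::vec3 r = hagrid::rotate(x, z, pi * 0.5f);
        std::printf("%f %f %f\n", r.x, r.y, r.z);
        return 0;
    }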