├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE.txt ├── README.md ├── cmake └── modules │ └── FindSDL2.cmake ├── screenshot.png └── src ├── CMakeLists.txt ├── bbox.h ├── build.cu ├── build.h ├── common.h ├── compress.cu ├── expand.cu ├── flatten.cu ├── grid.h ├── load_obj.cpp ├── load_obj.h ├── main.cpp ├── mem_manager.cu ├── mem_manager.h ├── merge.cu ├── parallel.cuh ├── prims.h ├── profile.cu ├── ray.h ├── traverse.cu ├── traverse.h └── vec.h /.gitignore: -------------------------------------------------------------------------------- 1 | lib/cub/docs 2 | lib/cub/examples 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/cub"] 2 | path = lib/cub 3 | url = https://github.com/NVlabs/cub 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1) 2 | project(hagrid) 3 | 4 | set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/modules) 5 | 6 | find_package(CUDA) 7 | 8 | find_package(SDL2 REQUIRED) 9 | include_directories(${SDL2_INCLUDE_DIR}) 10 | 11 | set(CMAKE_CXX_STANDARD 11) 12 | include_directories(lib/cub) 13 | add_subdirectory(src) 14 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2017 Arsène Pérard-Gayot 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hagrid 2 | 3 | ![Screenshot](screenshot.png) 4 | 5 | This project is an implementation of the paper: _GPU Ray Tracing using Irregular Grids_. 6 | This is not the version that has been used in the paper. 7 | 8 | ## Changes 9 | 10 | Some improvements have been made to the construction algorithm, which change the performance characteristics of the structure: 11 | 12 | - The voxel map can have more than two levels. 
13 | - Construction is faster (~ +33%)
14 | - Memory consumption is lower (~ -20%)
15 | - Traversal is slower (~ -5%)
16 |
17 | The slower traversal can easily be remedied by increasing the resolution (the default parameters are those used in the paper, which will result in lower performance than what was reported, for the reasons above). The improvements in build times and memory consumption more than compensate for the loss in traversal performance. As a recommendation, use `--top-density 0.15 --snd-density 3.0` if you want to get approximately the same performance as the original paper. _Increasing the resolution further will result in higher performance_, and the new voxel map structure should prevent the memory usage from exploding.
18 |
19 | The following experimental features (not mentioned in the paper) are also available, and should increase performance:
20 |
21 | - A more precise expansion algorithm (set the `subset_only` variable to false in [src/expand.cu](src/expand.cu#L159)),
22 | - A simple compression scheme (use the `--compress` option).
23 |
24 | ## Building
25 |
26 | This project requires CUDA, SDL2, and CMake. Use the following commands to build the project:
27 |
28 |     mkdir build
29 |     cd build
30 |     cmake-gui ..
31 |     make -j
32 |
33 | If you encounter any problems when building, make sure you cloned the repository with the `--recursive` option, so that all submodules are cloned as well.
34 | If the submodules are properly downloaded, the `lib/` directory should no longer be empty.
35 |
36 | ## Testing and Benchmarking
37 |
38 | Once built, the project provides a library for construction, a library for traversal, and a test/benchmark executable (a minimal sketch of the library API is given at the end of this README). The executable uses command-line arguments to specify the scene to use. The scene must be in the OBJ format. Here is a list of typical uses of the command-line program:
39 |
40 | - Lists the available options:
41 |
42 |         src/hagrid
43 |
44 | - Loads the file `scene.obj`, builds an irregular grid using default parameters, and displays an interactive view:
45 |
46 |         src/hagrid scene.obj
47 |
48 | - Loads the file `scene.obj`, builds an irregular grid with top-level density 0.12, second-level density 2.4, alpha 0.995 (the threshold that controls cell merging---see the paper), and 3 expansion passes, then displays an interactive view:
49 |
50 |         src/hagrid scene.obj --top-density 0.12 --snd-density 2.4 --alpha 0.995 --expansion 3
51 |
52 | - Loads the file `scene.obj`, benchmarks the construction with default parameters by running 10 construction iterations and 5 warmup iterations, keeps intermediate buffers alive (which should be preferred when benchmarking construction times), and finally displays an interactive view:
53 |
54 |         src/hagrid scene.obj --build-iter 10 --build-warmup 5 --keep-alive
55 |
56 | - Loads the file `scene.obj`, builds an irregular grid with default parameters, benchmarks the traversal by running it 100 times (with 20 warmup iterations) on the given ray distribution (the file `distribution.rays`, containing each ray stored as 6 floats in binary format---3 for the origin and 3 for the direction), and limits the distance along each ray to the range [0, 100]:
57 |
58 |         src/hagrid scene.obj --ray-file distribution.rays --bench-iter 100 --bench-warmup 20 -tmin 0 -tmax 100
59 |
60 | ## License
61 |
62 | The code is distributed under the MIT license (see [LICENSE.txt](LICENSE.txt)).
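
## Library Usage

For completeness, here is a rough sketch of how the construction library declared in [src/build.h](src/build.h) can be chained from client code. It is only an illustration: the `MemManager` setup, the way triangles are uploaded to the GPU, and the exact call order are assumptions (not shown here), and `src/main.cpp` remains the authoritative example. The densities, alpha value, and number of expansion passes below are the values suggested earlier in this README.

    #include "build.h" // build_grid, merge_grid, flatten_grid, expand_grid, compress_grid

    // Hedged sketch: 'mem' is assumed to manage device memory and 'tris' is assumed
    // to already reside on the GPU (see src/main.cpp for how the test program does this).
    void build_example(hagrid::MemManager& mem, const hagrid::Tri* tris, int num_tris) {
        hagrid::Grid grid;
        hagrid::build_grid(mem, tris, num_tris, grid, 0.15f, 3.0f); // top/second-level densities
        hagrid::merge_grid(mem, grid, 0.995f);                      // SAH-based neighbor merging
        hagrid::flatten_grid(mem, grid);                            // flatten the voxel map
        hagrid::expand_grid(mem, grid, tris, 3);                    // 3 expansion passes
        hagrid::compress_grid(mem, grid);                           // optional; returns false if cell
                                                                    // coordinates do not fit in 16 bits
    }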
63 | -------------------------------------------------------------------------------- /cmake/modules/FindSDL2.cmake: -------------------------------------------------------------------------------- 1 | # Locate SDL2 library 2 | # This module defines 3 | # SDL2_LIBRARY, the name of the library to link against 4 | # SDL2_FOUND, if false, do not try to link to SDL2 5 | # SDL2_INCLUDE_DIR, where to find SDL.h 6 | # 7 | # This module responds to the the flag: 8 | # SDL2_BUILDING_LIBRARY 9 | # If this is defined, then no SDL2_main will be linked in because 10 | # only applications need main(). 11 | # Otherwise, it is assumed you are building an application and this 12 | # module will attempt to locate and set the the proper link flags 13 | # as part of the returned SDL2_LIBRARY variable. 14 | # 15 | # Don't forget to include SDL2main.h and SDL2main.m your project for the 16 | # OS X framework based version. (Other versions link to -lSDL2main which 17 | # this module will try to find on your behalf.) Also for OS X, this 18 | # module will automatically add the -framework Cocoa on your behalf. 19 | # 20 | # 21 | # Additional Note: If you see an empty SDL2_LIBRARY_TEMP in your configuration 22 | # and no SDL2_LIBRARY, it means CMake did not find your SDL2 library 23 | # (SDL2.dll, libsdl2.so, SDL2.framework, etc). 24 | # Set SDL2_LIBRARY_TEMP to point to your SDL2 library, and configure again. 25 | # Similarly, if you see an empty SDL2MAIN_LIBRARY, you should set this value 26 | # as appropriate. These values are used to generate the final SDL2_LIBRARY 27 | # variable, but when these values are unset, SDL2_LIBRARY does not get created. 28 | # 29 | # 30 | # $SDL2DIR is an environment variable that would 31 | # correspond to the ./configure --prefix=$SDL2DIR 32 | # used in building SDL2. 33 | # l.e.galup 9-20-02 34 | # 35 | # Modified by Eric Wing. 36 | # Added code to assist with automated building by using environmental variables 37 | # and providing a more controlled/consistent search behavior. 38 | # Added new modifications to recognize OS X frameworks and 39 | # additional Unix paths (FreeBSD, etc). 40 | # Also corrected the header search path to follow "proper" SDL2 guidelines. 41 | # Added a search for SDL2main which is needed by some platforms. 42 | # Added a search for threads which is needed by some platforms. 43 | # Added needed compile switches for MinGW. 44 | # 45 | # On OSX, this will prefer the Framework version (if found) over others. 46 | # People will have to manually change the cache values of 47 | # SDL2_LIBRARY to override this selection or set the CMake environment 48 | # CMAKE_INCLUDE_PATH to modify the search paths. 49 | # 50 | # Note that the header path has changed from SDL2/SDL.h to just SDL.h 51 | # This needed to change because "proper" SDL2 convention 52 | # is #include "SDL.h", not . This is done for portability 53 | # reasons because not all systems place things in SDL2/ (see FreeBSD). 54 | # 55 | # Ported by Johnny Patterson. This is a literal port for SDL2 of the FindSDL.cmake 56 | # module with the minor edit of changing "SDL" to "SDL2" where necessary. This 57 | # was not created for redistribution, and exists temporarily pending official 58 | # SDL2 CMake modules. 59 | 60 | #============================================================================= 61 | # Copyright 2003-2009 Kitware, Inc. 62 | # 63 | # Distributed under the OSI-approved BSD License (the "License"); 64 | # see accompanying file Copyright.txt for details. 
65 | # 66 | # This software is distributed WITHOUT ANY WARRANTY; without even the 67 | # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 68 | # See the License for more information. 69 | #============================================================================= 70 | # (To distribute this file outside of CMake, substitute the full 71 | # License text for the above reference.) 72 | 73 | FIND_PATH(SDL2_INCLUDE_DIR SDL.h 74 | HINTS 75 | $ENV{SDL2DIR} 76 | PATH_SUFFIXES include/SDL2 include 77 | PATHS 78 | ~/Library/Frameworks 79 | /Library/Frameworks 80 | /usr/local/include/SDL2 81 | /usr/include/SDL2 82 | /sw # Fink 83 | /opt/local # DarwinPorts 84 | /opt/csw # Blastwave 85 | /opt 86 | ) 87 | #MESSAGE("SDL2_INCLUDE_DIR is ${SDL2_INCLUDE_DIR}") 88 | 89 | FIND_LIBRARY(SDL2_LIBRARY_TEMP 90 | NAMES SDL2 91 | HINTS 92 | $ENV{SDL2DIR} 93 | PATH_SUFFIXES lib64 lib 94 | PATHS 95 | /sw 96 | /opt/local 97 | /opt/csw 98 | /opt 99 | ) 100 | 101 | #MESSAGE("SDL2_LIBRARY_TEMP is ${SDL2_LIBRARY_TEMP}") 102 | 103 | IF(NOT SDL2_BUILDING_LIBRARY) 104 | IF(NOT ${SDL2_INCLUDE_DIR} MATCHES ".framework") 105 | # Non-OS X framework versions expect you to also dynamically link to 106 | # SDL2main. This is mainly for Windows and OS X. Other (Unix) platforms 107 | # seem to provide SDL2main for compatibility even though they don't 108 | # necessarily need it. 109 | FIND_LIBRARY(SDL2MAIN_LIBRARY 110 | NAMES SDL2main 111 | HINTS 112 | $ENV{SDL2DIR} 113 | PATH_SUFFIXES lib64 lib 114 | PATHS 115 | /sw 116 | /opt/local 117 | /opt/csw 118 | /opt 119 | ) 120 | ENDIF(NOT ${SDL2_INCLUDE_DIR} MATCHES ".framework") 121 | ENDIF(NOT SDL2_BUILDING_LIBRARY) 122 | 123 | # SDL2 may require threads on your system. 124 | # The Apple build may not need an explicit flag because one of the 125 | # frameworks may already provide it. 126 | # But for non-OSX systems, I will use the CMake Threads package. 127 | IF(NOT APPLE) 128 | FIND_PACKAGE(Threads) 129 | ENDIF(NOT APPLE) 130 | 131 | # MinGW needs an additional library, mwindows 132 | # It's total link flags should look like -lmingw32 -lSDL2main -lSDL2 -lmwindows 133 | # (Actually on second look, I think it only needs one of the m* libraries.) 134 | IF(MINGW) 135 | SET(MINGW32_LIBRARY mingw32 CACHE STRING "mwindows for MinGW") 136 | ENDIF(MINGW) 137 | 138 | SET(SDL2_FOUND "NO") 139 | IF(SDL2_LIBRARY_TEMP) 140 | # For SDL2main 141 | IF(NOT SDL2_BUILDING_LIBRARY) 142 | IF(SDL2MAIN_LIBRARY) 143 | SET(SDL2_LIBRARY_TEMP ${SDL2MAIN_LIBRARY} ${SDL2_LIBRARY_TEMP}) 144 | ENDIF(SDL2MAIN_LIBRARY) 145 | ENDIF(NOT SDL2_BUILDING_LIBRARY) 146 | 147 | # For OS X, SDL2 uses Cocoa as a backend so it must link to Cocoa. 148 | # CMake doesn't display the -framework Cocoa string in the UI even 149 | # though it actually is there if I modify a pre-used variable. 150 | # I think it has something to do with the CACHE STRING. 151 | # So I use a temporary variable until the end so I can set the 152 | # "real" variable in one-shot. 153 | IF(APPLE) 154 | SET(SDL2_LIBRARY_TEMP ${SDL2_LIBRARY_TEMP} "-framework Cocoa") 155 | ENDIF(APPLE) 156 | 157 | # For threads, as mentioned Apple doesn't need this. 158 | # In fact, there seems to be a problem if I used the Threads package 159 | # and try using this line, so I'm just skipping it entirely for OS X. 
160 | IF(NOT APPLE) 161 | SET(SDL2_LIBRARY_TEMP ${SDL2_LIBRARY_TEMP} ${CMAKE_THREAD_LIBS_INIT}) 162 | ENDIF(NOT APPLE) 163 | 164 | # For MinGW library 165 | IF(MINGW) 166 | SET(SDL2_LIBRARY_TEMP ${MINGW32_LIBRARY} ${SDL2_LIBRARY_TEMP}) 167 | ENDIF(MINGW) 168 | 169 | # Set the final string here so the GUI reflects the final state. 170 | SET(SDL2_LIBRARY ${SDL2_LIBRARY_TEMP} CACHE STRING "Where the SDL2 Library can be found") 171 | # Set the temp variable to INTERNAL so it is not seen in the CMake GUI 172 | SET(SDL2_LIBRARY_TEMP "${SDL2_LIBRARY_TEMP}" CACHE INTERNAL "") 173 | 174 | SET(SDL2_FOUND "YES") 175 | ENDIF(SDL2_LIBRARY_TEMP) 176 | 177 | INCLUDE(FindPackageHandleStandardArgs) 178 | 179 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(SDL2 180 | REQUIRED_VARS SDL2_LIBRARY SDL2_INCLUDE_DIR) 181 | -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cg-saarland/hagrid/fa3eb62eba14d073dfeddd3b9ca8fbfb2d6848af/screenshot.png -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_BUILD_TYPE STREQUAL "Debug") 2 | set(OPT_FLAG "-g") 3 | else() 4 | set(OPT_FLAG "--use_fast_math;-O3") 5 | endif() 6 | 7 | cuda_compile(HAGRID_BUILD 8 | build.cu 9 | merge.cu 10 | flatten.cu 11 | expand.cu 12 | compress.cu 13 | mem_manager.cu 14 | profile.cu 15 | mem_manager.h 16 | parallel.cuh 17 | build.h 18 | grid.h 19 | vec.h 20 | bbox.h 21 | prims.h 22 | ray.h 23 | common.h 24 | OPTIONS ${OPT_FLAG} "-std=c++11;--expt-extended-lambda;-lineinfo;-DHOST=__host__;-DDEVICE=__device__") 25 | 26 | add_library(hagrid_build ${HAGRID_BUILD}) 27 | set_target_properties(hagrid_build PROPERTIES LINKER_LANGUAGE CXX) 28 | 29 | cuda_compile(HAGRID_TRAVERSE 30 | traverse.cu 31 | traverse.h 32 | ray.h 33 | prims.h 34 | vec.h 35 | grid.h 36 | OPTIONS ${OPT_FLAG} "-std=c++11;--expt-extended-lambda;-lineinfo;--maxrregcount=40;-DHOST=__host__;-DDEVICE=__device__") 37 | 38 | add_library(hagrid_traverse ${HAGRID_TRAVERSE}) 39 | set_target_properties(hagrid_traverse PROPERTIES LINKER_LANGUAGE CXX) 40 | 41 | add_executable(hagrid main.cpp load_obj.cpp load_obj.h grid.h traverse.h build.h vec.h) 42 | target_compile_definitions(hagrid PRIVATE HOST= DEVICE=) 43 | target_link_libraries(hagrid hagrid_build hagrid_traverse ${CUDA_LIBRARIES} ${SDL2_LIBRARY}) 44 | -------------------------------------------------------------------------------- /src/bbox.h: -------------------------------------------------------------------------------- 1 | #ifndef BBOX_H 2 | #define BBOX_H 3 | 4 | #include 5 | #include 6 | #include "vec.h" 7 | 8 | namespace hagrid { 9 | 10 | struct BBox { 11 | vec3 min; 12 | int pad0; 13 | vec3 max; 14 | int pad1; 15 | 16 | HOST DEVICE BBox() {} 17 | HOST DEVICE BBox(const vec3& v) : min(v), max(v) {} 18 | HOST DEVICE BBox(const vec3& min, const vec3& max) : min(min), max(max) {} 19 | 20 | HOST DEVICE BBox& extend(const vec3& f) { 21 | min = hagrid::min(min, f); 22 | max = hagrid::max(max, f); 23 | return *this; 24 | } 25 | 26 | HOST DEVICE BBox& extend(const BBox& bb) { 27 | min = hagrid::min(min, bb.min); 28 | max = hagrid::max(max, bb.max); 29 | return *this; 30 | } 31 | 32 | HOST DEVICE BBox& overlap(const BBox& bb) { 33 | min = hagrid::max(min, bb.min); 34 | max = hagrid::min(max, bb.max); 35 | return *this; 36 | } 37 | 38 | HOST DEVICE 
vec3 extents() const { 39 | return max - min; 40 | } 41 | 42 | HOST DEVICE vec3 center() const { 43 | return 0.5f * (max + min); 44 | } 45 | 46 | HOST DEVICE float half_area() const { 47 | const vec3 len = max - min; 48 | const float kx = hagrid::max(len.x, 0.0f); 49 | const float ky = hagrid::max(len.y, 0.0f); 50 | const float kz = hagrid::max(len.z, 0.0f); 51 | return kx * (ky + kz) + ky * kz; 52 | } 53 | 54 | HOST DEVICE bool is_empty() const { 55 | return min.x > max.x || min.y > max.y || min.z > max.z; 56 | } 57 | 58 | HOST DEVICE bool is_inside(const vec3& f) const { 59 | return f.x >= min.x && f.y >= min.y && f.z >= min.z && 60 | f.x <= max.x && f.y <= max.y && f.z <= max.z; 61 | } 62 | 63 | HOST DEVICE bool is_overlapping(const BBox& bb) const { 64 | return min.x <= bb.max.x && max.x >= bb.min.x && 65 | min.y <= bb.max.y && max.y >= bb.min.y && 66 | min.z <= bb.max.z && max.z >= bb.min.z; 67 | } 68 | 69 | HOST DEVICE bool is_included(const BBox& bb) const { 70 | return min.x >= bb.min.x && max.x <= bb.max.x && 71 | min.y >= bb.min.y && max.y <= bb.max.y && 72 | min.z >= bb.min.z && max.z <= bb.max.z; 73 | } 74 | 75 | HOST DEVICE bool is_strictly_included(const BBox& bb) const { 76 | return is_included(bb) && 77 | (min.x > bb.min.x || max.x < bb.max.x || 78 | min.y > bb.min.y || max.y < bb.max.y || 79 | min.z > bb.min.z || max.z < bb.max.z); 80 | } 81 | 82 | HOST DEVICE static BBox empty() { return BBox(vec3( FLT_MAX), vec3(-FLT_MAX)); } 83 | HOST DEVICE static BBox full() { return BBox(vec3(-FLT_MAX), vec3( FLT_MAX)); } 84 | }; 85 | 86 | #ifdef __NVCC__ 87 | __device__ __forceinline__ BBox load_bbox(const BBox* bb_ptr) { 88 | const float4* ptr = (const float4*)bb_ptr; 89 | auto bb0 = ptr[0]; 90 | auto bb1 = ptr[1]; 91 | return BBox(vec3(bb0.x, bb0.y, bb0.z), 92 | vec3(bb1.x, bb1.y, bb1.z)); 93 | } 94 | 95 | __device__ __forceinline__ void store_bbox(BBox* bb_ptr, const BBox& bb) { 96 | float4* ptr = (float4*)bb_ptr; 97 | ptr[0] = make_float4(bb.min.x, bb.min.y, bb.min.z, 0); 98 | ptr[1] = make_float4(bb.max.x, bb.max.y, bb.max.z, 0); 99 | } 100 | #endif // __NVCC__ 101 | 102 | } // namespace hagrid 103 | 104 | #endif // BBOX_H 105 | -------------------------------------------------------------------------------- /src/build.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "build.h" 4 | #include "vec.h" 5 | #include "bbox.h" 6 | #include "grid.h" 7 | #include "prims.h" 8 | #include "mem_manager.h" 9 | #include "parallel.cuh" 10 | 11 | namespace hagrid { 12 | 13 | /// Level of the grid during construction 14 | struct Level { 15 | int* ref_ids; ///< Array of primitive indices 16 | int* cell_ids; ///< Array of cell indices 17 | int num_refs; ///< Number of references in the level 18 | int num_kept; ///< Number of references kept (remaining is split) 19 | Cell* cells; ///< Array of cells 20 | Entry* entries; ///< Array of voxel map entries 21 | int num_cells; ///< Number of cells 22 | 23 | Level() {} 24 | Level(int* ref_ids, int* cell_ids, int num_refs, int num_kept, Cell* cells, Entry* entries, int num_cells) 25 | : ref_ids(ref_ids) 26 | , cell_ids(cell_ids) 27 | , num_refs(num_refs) 28 | , num_kept(num_kept) 29 | , cells(cells) 30 | , entries(entries) 31 | , num_cells(num_cells) 32 | { 33 | assert(num_refs >= num_kept); 34 | } 35 | }; 36 | 37 | static __constant__ ivec3 grid_dims; 38 | static __constant__ BBox grid_bbox; 39 | static __constant__ vec3 cell_size; 40 | static __constant__ int grid_shift; 41 | 42 | /// 
Compute the bounding box of every primitive 43 | template 44 | __global__ void compute_bboxes(const Primitive* __restrict__ prims, 45 | BBox* __restrict__ bboxes, 46 | int num_prims) { 47 | const int id = threadIdx.x + blockDim.x * blockIdx.x; 48 | if (id >= num_prims) 49 | return; 50 | 51 | auto prim = load_prim(prims + id); 52 | store_bbox(bboxes + id, prim.bbox()); 53 | } 54 | 55 | /// Compute an over-approximation of the number of references 56 | /// that are going to be generated during reference emission 57 | __global__ void count_new_refs(const BBox* __restrict__ bboxes, 58 | int* __restrict__ counts, 59 | int num_refs) { 60 | int id = threadIdx.x + blockDim.x * blockIdx.x; 61 | if (id >= num_refs) return; 62 | 63 | auto ref_bb = load_bbox(bboxes + id); 64 | auto range = compute_range(grid_dims, grid_bbox, ref_bb); 65 | counts[id] = max(0, range.size()); 66 | } 67 | 68 | /// Emit the new references by inserting existing ones into the sub-levels 69 | __global__ void __launch_bounds__(64) 70 | emit_new_refs(const BBox* __restrict__ bboxes, 71 | const int* __restrict__ start_emit, 72 | int* __restrict__ new_ref_ids, 73 | int* __restrict__ new_cell_ids, 74 | int num_prims) { 75 | int id = threadIdx.x + blockDim.x * blockIdx.x; 76 | 77 | Range range; 78 | int start = 0, end = 0; 79 | 80 | if (id < num_prims) { 81 | start = start_emit[id + 0]; 82 | end = start_emit[id + 1]; 83 | 84 | if (start < end) { 85 | auto ref_bb = load_bbox(bboxes + id); 86 | range = compute_range(grid_dims, grid_bbox, ref_bb); 87 | } 88 | } 89 | 90 | bool blocked = (end - start) >= 16; 91 | if (!blocked && start < end) { 92 | int x = range.lx; 93 | int y = range.ly; 94 | int z = range.lz; 95 | int cur = start; 96 | while (cur < end) { 97 | new_ref_ids [cur] = id; 98 | new_cell_ids[cur] = x + grid_dims.x * (y + grid_dims.y * z); 99 | cur++; 100 | x++; 101 | if (x > range.hx) { x = range.lx; y++; } 102 | if (y > range.hy) { y = range.ly; z++; } 103 | } 104 | } 105 | 106 | static constexpr unsigned all_mask = unsigned(-1); 107 | int mask = __ballot_sync(all_mask, blocked); 108 | while (mask) { 109 | int bit = __ffs(mask) - 1; 110 | mask &= ~(1 << bit); 111 | 112 | int warp_start = __shfl_sync(all_mask, start, bit); 113 | int warp_end = __shfl_sync(all_mask, end, bit); 114 | int warp_id = threadIdx.x - __shfl_sync(all_mask, threadIdx.x, 0); 115 | 116 | int lx = __shfl_sync(all_mask, range.lx, bit); 117 | int ly = __shfl_sync(all_mask, range.ly, bit); 118 | int lz = __shfl_sync(all_mask, range.lz, bit); 119 | int hx = __shfl_sync(all_mask, range.hx, bit); 120 | int hy = __shfl_sync(all_mask, range.hy, bit); 121 | int r = __shfl_sync(all_mask, id, bit); 122 | 123 | int sx = hx - lx + 1; 124 | int sy = hy - ly + 1; 125 | 126 | // Split the work on all the threads of the warp 127 | for (int i = warp_start + warp_id; i < warp_end; i += 32) { 128 | int k = i - warp_start; 129 | int x = lx + (k % sx); 130 | int y = ly + ((k / sx) % sy); 131 | int z = lz + (k / (sx * sy)); 132 | new_ref_ids[i] = r; 133 | new_cell_ids[i] = x + grid_dims.x * (y + grid_dims.y * z); 134 | } 135 | } 136 | } 137 | 138 | /// Filter out references that do not intersect the cell they are in 139 | template 140 | __global__ void filter_refs(int* __restrict__ cell_ids, 141 | int* __restrict__ ref_ids, 142 | const Primitive* __restrict__ prims, 143 | const Cell* __restrict__ cells, 144 | int num_refs) { 145 | int id = threadIdx.x + blockDim.x * blockIdx.x; 146 | if (id >= num_refs) return; 147 | 148 | auto cell = load_cell(cells + cell_ids[id]); 149 | 
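// Rebuild the cell's world-space bounding box from its integer grid coordinates
// and drop the reference (by writing -1) when the primitive does not overlap it.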
auto prim = load_prim(prims + ref_ids[id]); 150 | auto bbox = BBox(grid_bbox.min + vec3(cell.min) * cell_size, 151 | grid_bbox.min + vec3(cell.max) * cell_size); 152 | bool intersect = intersect_prim_cell(prim, bbox); 153 | if (!intersect) { 154 | cell_ids[id] = -1; 155 | ref_ids[id] = -1; 156 | } 157 | } 158 | 159 | /// Compute a mask for each reference which determines which sub-cell is intersected 160 | template 161 | __global__ void compute_split_masks(const int* __restrict__ cell_ids, 162 | const int* __restrict__ ref_ids, 163 | const Primitive* __restrict__ prims, 164 | const Cell* __restrict__ cells, 165 | int* split_masks, 166 | int num_split) { 167 | int id = threadIdx.x + blockDim.x * blockIdx.x; 168 | if (id >= num_split) return; 169 | 170 | auto cell_id = cell_ids[id]; 171 | if (cell_id < 0) { 172 | split_masks[id] = 0; 173 | return; 174 | } 175 | auto ref = ref_ids[id]; 176 | auto cell = load_cell(cells + cell_id); 177 | auto prim = load_prim(prims + ref); 178 | 179 | auto cell_min = grid_bbox.min + cell_size * vec3(cell.min); 180 | auto cell_max = grid_bbox.min + cell_size * vec3(cell.max); 181 | auto middle = (cell_min + cell_max) * 0.5f; 182 | 183 | int mask = 0xFF; 184 | 185 | // Optimization: Test against half spaces first 186 | auto ref_bb = prim.bbox(); 187 | if (ref_bb.min.x > cell_max.x || 188 | ref_bb.max.x < cell_min.x) mask = 0; 189 | if (ref_bb.min.x > middle.x) mask &= 0xAA; 190 | if (ref_bb.max.x < middle.x) mask &= 0x55; 191 | if (ref_bb.min.y > cell_max.y || 192 | ref_bb.max.y < cell_min.y) mask = 0; 193 | if (ref_bb.min.y > middle.y) mask &= 0xCC; 194 | if (ref_bb.max.y < middle.y) mask &= 0x33; 195 | if (ref_bb.min.z > cell_max.z || 196 | ref_bb.max.z < cell_min.z) mask = 0; 197 | if (ref_bb.min.z > middle.z) mask &= 0xF0; 198 | if (ref_bb.max.z < middle.z) mask &= 0x0F; 199 | 200 | for (int i = __ffs(mask) - 1;;) { 201 | auto bbox = BBox(vec3(i & 1 ? middle.x : cell_min.x, 202 | i & 2 ? middle.y : cell_min.y, 203 | i & 4 ? middle.z : cell_min.z), 204 | vec3(i & 1 ? cell_max.x : middle.x, 205 | i & 2 ? cell_max.y : middle.y, 206 | i & 4 ? 
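// Child index bits: bit 0 selects the upper half along x, bit 1 along y, bit 2 along z.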
cell_max.z : middle.z)); 207 | if (!intersect_prim_cell(prim, bbox)) mask &= ~(1 << i); 208 | 209 | // Skip non-intersected children 210 | int skip = __ffs(mask >> (i + 1)); 211 | if (skip == 0) break; 212 | i += 1 + (skip - 1); 213 | } 214 | 215 | split_masks[id] = mask; 216 | } 217 | 218 | /// Split references according to the given array of split masks 219 | __global__ void split_refs(const int* __restrict__ cell_ids, 220 | const int* __restrict__ ref_ids, 221 | const Entry* __restrict__ entries, 222 | const int* __restrict__ split_masks, 223 | const int* __restrict__ start_split, 224 | int* __restrict__ new_cell_ids, 225 | int* __restrict__ new_ref_ids, 226 | int num_split) { 227 | int id = threadIdx.x + blockDim.x * blockIdx.x; 228 | if (id >= num_split) return; 229 | 230 | auto cell_id = cell_ids[id]; 231 | auto ref = ref_ids[id]; 232 | auto begin = entries[cell_id].begin; 233 | 234 | auto mask = split_masks[id]; 235 | auto start = start_split[id]; 236 | while (mask) { 237 | int child_id = __ffs(mask) - 1; 238 | mask &= ~(1 << child_id); 239 | new_ref_ids [start] = ref; 240 | new_cell_ids[start] = begin + child_id; 241 | start++; 242 | } 243 | } 244 | 245 | /// Compute the number of references per cell using atomics 246 | __global__ void count_refs_per_cell(const int* __restrict__ cell_ids, 247 | int* __restrict__ refs_per_cell, 248 | int num_refs) { 249 | int id = threadIdx.x + blockDim.x * blockIdx.x; 250 | if (id >= num_refs) return; 251 | int cell_id = cell_ids[id]; 252 | if (cell_id >= 0) atomicAdd(refs_per_cell + cell_id, 1); 253 | } 254 | 255 | /// Compute the logarithm of the sub-level resolution for top-level cells 256 | __global__ void compute_log_dims(const int* __restrict__ refs_per_cell, 257 | int* __restrict__ log_dims, 258 | float snd_density, 259 | int num_cells) { 260 | int id = threadIdx.x + blockDim.x * blockIdx.x; 261 | if (id >= num_cells) return; 262 | 263 | auto extents = grid_bbox.extents() / vec3(grid_dims); 264 | auto bbox = BBox(vec3(0, 0, 0), extents); 265 | auto dims = compute_grid_dims(bbox, refs_per_cell[id], snd_density); 266 | auto max_dim = max(dims.x, max(dims.y, dims.z)); 267 | auto log_dim = 31 - __clz(max_dim); 268 | log_dim = (1 << log_dim) < max_dim ? 
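// 31 - __clz(max_dim) is floor(log2(max_dim)); round up when max_dim is not a
// power of two so that (1 << log_dim) >= max_dim.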
log_dim + 1 : log_dim; 269 | log_dims[id] = log_dim; 270 | } 271 | 272 | /// Update the logarithm of the sub-level resolution for top-level cells (after a new subdivision level) 273 | __global__ void update_log_dims(int* __restrict__ log_dims, int num_top_cells) { 274 | int id = threadIdx.x + blockDim.x * blockIdx.x; 275 | if (id >= num_top_cells) return; 276 | 277 | log_dims[id] = max(0, log_dims[id] - 1); 278 | } 279 | 280 | /// Given a position on the virtual grid, return the corresponding top-level cell index 281 | __device__ __forceinline__ int top_level_cell(ivec3 pos) { 282 | return (pos.x >> grid_shift) + grid_dims.x * ((pos.y >> grid_shift) + grid_dims.y * (pos.z >> grid_shift)); 283 | } 284 | 285 | /// Count the (sub-)dimensions of each cell, based on the array of references 286 | __global__ void compute_dims(const int* __restrict__ cell_ids, 287 | const Cell* __restrict__ cells, 288 | const int* __restrict__ log_dims, 289 | Entry* __restrict__ entries, 290 | int num_refs) { 291 | int id = threadIdx.x + blockDim.x * blockIdx.x; 292 | if (id >= num_refs) return; 293 | 294 | auto cell_id = cell_ids[id]; 295 | if (cell_id < 0) return; 296 | 297 | auto cell_min = load_cell_min(cells + cell_id); 298 | auto top_cell_id = top_level_cell(cell_min); 299 | auto log_dim = log_dims[top_cell_id]; 300 | 301 | entries[cell_id] = make_entry(min(log_dim, 1), 0); 302 | } 303 | 304 | /// Mark references that are kept so that they can be moved to the beginning of the array 305 | __global__ void mark_kept_refs(const int* __restrict__ cell_ids, 306 | const Entry* __restrict__ entries, 307 | int* kept_flags, 308 | int num_refs) { 309 | int id = threadIdx.x + blockDim.x * blockIdx.x; 310 | if (id >= num_refs) return; 311 | 312 | auto cell_id = cell_ids[id]; 313 | kept_flags[id] = (cell_id >= 0) && (entries[cell_id].log_dim == 0); 314 | } 315 | 316 | /// Update the entries for the one level before the current one 317 | __global__ void update_entries(const int* __restrict__ start_cell, 318 | Entry* __restrict__ entries, 319 | int num_cells) { 320 | int id = threadIdx.x + blockDim.x * blockIdx.x; 321 | if (id >= num_cells) return; 322 | 323 | auto start = start_cell[id]; 324 | auto entry = entries[id]; 325 | 326 | // If the cell is subdivided, write the first sub-cell index into the current entry 327 | entry.begin = entry.log_dim != 0 ? 
start : id; 328 | entries[id] = entry; 329 | } 330 | 331 | /// Generate cells for the top level 332 | __global__ void emit_top_cells(Cell* __restrict__ new_cells, int num_cells) { 333 | int id = threadIdx.x + blockDim.x * blockIdx.x; 334 | if (id >= num_cells) return; 335 | 336 | int x = id % grid_dims.x; 337 | int y = (id / grid_dims.x) % grid_dims.y; 338 | int z = id / (grid_dims.x * grid_dims.y); 339 | int inc = 1 << grid_shift; 340 | 341 | x <<= grid_shift; 342 | y <<= grid_shift; 343 | z <<= grid_shift; 344 | 345 | Cell cell; 346 | cell.min = ivec3(x, y, z); 347 | cell.max = ivec3(x + inc, y + inc, z + inc); 348 | cell.begin = 0; 349 | cell.end = 0; 350 | store_cell(new_cells + id, cell); 351 | } 352 | 353 | /// Generate new cells based on the previous level 354 | __global__ void emit_new_cells(const Entry* __restrict__ entries, 355 | const Cell* __restrict__ cells, 356 | Cell* __restrict__ new_cells, 357 | int num_cells) { 358 | int id = threadIdx.x + blockDim.x * blockIdx.x; 359 | if (id >= num_cells) return; 360 | 361 | auto entry = entries[id]; 362 | auto log_dim = entry.log_dim; 363 | if (log_dim == 0) return; 364 | 365 | auto start = entry.begin; 366 | auto cell = load_cell(cells + id); 367 | int min_x = cell.min.x; 368 | int min_y = cell.min.y; 369 | int min_z = cell.min.z; 370 | int inc = (cell.max.x - cell.min.x) >> 1; 371 | 372 | for (int i = 0; i < 8; i++) { 373 | int x = min_x + (i & 1) * inc; 374 | int y = min_y + ((i >> 1) & 1) * inc; 375 | int z = min_z + (i >> 2) * inc; 376 | 377 | cell.min = ivec3(x, y, z); 378 | cell.max = ivec3(x + inc, y + inc, z + inc); 379 | cell.begin = 0; 380 | cell.end = 0; 381 | store_cell(new_cells + start + i, cell); 382 | } 383 | } 384 | 385 | /// Copy the references with an offset, different for each level 386 | __global__ void copy_refs(const int* __restrict__ cell_ids, 387 | int* __restrict__ new_cell_ids, 388 | int cell_off, 389 | int num_kept) { 390 | int id = threadIdx.x + blockDim.x * blockIdx.x; 391 | if (id >= num_kept) return; 392 | 393 | new_cell_ids[id] = cell_ids[id] + cell_off; 394 | } 395 | 396 | /// Mark the cells that are used as 'kept' 397 | __global__ void mark_kept_cells(const Entry* __restrict__ entries, 398 | int* kept_cells, 399 | int num_cells) { 400 | int id = threadIdx.x + blockDim.x * blockIdx.x; 401 | if (id >= num_cells) return; 402 | 403 | kept_cells[id] = entries[id].log_dim == 0; 404 | } 405 | 406 | /// Copy only the cells that are kept to another array of cells 407 | __global__ void copy_cells(const Cell* __restrict__ cells, 408 | const int* __restrict__ start_cell, 409 | Cell* new_cells, 410 | int cell_off, 411 | int num_cells) { 412 | int id = threadIdx.x + blockDim.x * blockIdx.x; 413 | if (id >= num_cells) return; 414 | 415 | auto cell = load_cell(cells + id); 416 | auto start = start_cell[cell_off + id + 0]; 417 | auto end = start_cell[cell_off + id + 1]; 418 | if (start < end) store_cell(new_cells + start, cell); 419 | } 420 | 421 | /// Copy the voxel map entries and remap kept cells to their correct indices 422 | __global__ void copy_entries(const Entry* __restrict__ entries, 423 | const int* __restrict__ start_cell, 424 | Entry* __restrict__ new_entries, 425 | int cell_off, 426 | int next_level_off, 427 | int num_cells) { 428 | int id = threadIdx.x + blockDim.x * blockIdx.x; 429 | if (id >= num_cells) return; 430 | 431 | auto entry = entries[id]; 432 | if (entry.log_dim == 0) { 433 | // Points to a cell 434 | entry.begin = start_cell[cell_off + entry.begin]; 435 | } else { 436 | // Points to another 
entry in the next level 437 | entry.begin += next_level_off; 438 | } 439 | new_entries[id] = entry; 440 | } 441 | 442 | /// Remap references so that they map to the correct cells 443 | __global__ void remap_refs(int* __restrict__ cell_ids, 444 | const int* __restrict__ start_cell, 445 | int num_refs) { 446 | int id = threadIdx.x + blockDim.x * blockIdx.x; 447 | if (id >= num_refs) return; 448 | 449 | cell_ids[id] = start_cell[cell_ids[id]]; 450 | } 451 | 452 | /// Sets the cell ranges once the references are sorted by cell 453 | __global__ void compute_cell_ranges(const int* cell_ids, Cell* cells, int num_refs) { 454 | int id = threadIdx.x + blockDim.x * blockIdx.x; 455 | if (id >= num_refs) return; 456 | 457 | int cell_id = cell_ids[id + 0]; 458 | if (id >= num_refs - 1) { 459 | cells[cell_id].end = id + 1; 460 | return; 461 | } 462 | int next_id = cell_ids[id + 1]; 463 | 464 | if (cell_id != next_id) { 465 | cells[cell_id].end = id + 1; 466 | cells[next_id].begin = id + 1; 467 | } 468 | } 469 | 470 | template 471 | void first_build_iter(MemManager& mem, float snd_density, 472 | const Primitive* prims, int num_prims, 473 | const BBox* bboxes, const BBox& grid_bb, const ivec3& dims, 474 | int*& log_dims, int& grid_shift, std::vector& levels) { 475 | Parallel par(mem); 476 | 477 | int num_top_cells = dims.x * dims.y * dims.z; 478 | 479 | // Emission of the references in 4 passes: count new refs + scan + emission + filtering 480 | auto start_emit = mem.alloc(num_prims + 1); 481 | auto new_ref_counts = mem.alloc(num_prims + 1); 482 | auto refs_per_cell = mem.alloc(num_top_cells); 483 | log_dims = mem.alloc(num_top_cells + 1); 484 | count_new_refs<<>>(bboxes, new_ref_counts, num_prims); 485 | DEBUG_SYNC(); 486 | 487 | int num_new_refs = par.scan(new_ref_counts, num_prims + 1, start_emit); 488 | mem.free(new_ref_counts); 489 | 490 | auto new_ref_ids = mem.alloc(2 * num_new_refs); 491 | auto new_cell_ids = new_ref_ids + num_new_refs; 492 | emit_new_refs<<>>(bboxes, start_emit, new_ref_ids, new_cell_ids, num_prims); 493 | DEBUG_SYNC(); 494 | 495 | mem.free(start_emit); 496 | 497 | // Compute the number of references per cell 498 | mem.zero(refs_per_cell, num_top_cells); 499 | count_refs_per_cell<<>>(new_cell_ids, refs_per_cell, num_new_refs); 500 | DEBUG_SYNC(); 501 | 502 | // Compute an independent resolution in each of the top-level cells 503 | compute_log_dims<<>>(refs_per_cell, log_dims, snd_density, num_top_cells); 504 | DEBUG_SYNC(); 505 | mem.free(refs_per_cell); 506 | 507 | // Find the maximum sub-level resolution 508 | grid_shift = par.reduce(log_dims, num_top_cells, log_dims + num_top_cells, [] __device__ (int a, int b) { return max(a, b); }); 509 | auto cell_size = grid_bb.extents() / vec3(dims << grid_shift); 510 | 511 | set_global(hagrid::grid_shift, grid_shift); 512 | set_global(hagrid::cell_size, cell_size); 513 | 514 | // Emission of the new cells 515 | auto new_cells = mem.alloc(num_top_cells + 0); 516 | auto new_entries = mem.alloc(num_top_cells + 1); 517 | emit_top_cells<<>>(new_cells, num_top_cells); 518 | DEBUG_SYNC(); 519 | mem.zero(new_entries, num_top_cells + 1); 520 | 521 | // Filter out the references that do not intersect the cell they are in 522 | filter_refs<<>>(new_cell_ids, new_ref_ids, prims, new_cells, num_new_refs); 523 | 524 | levels.emplace_back(new_ref_ids, new_cell_ids, num_new_refs, num_new_refs, new_cells, new_entries, num_top_cells); 525 | } 526 | 527 | template 528 | bool build_iter(MemManager& mem, 529 | const Primitive* prims, int num_prims, 530 | const 
ivec3& dims, int* log_dims, 531 | std::vector& levels) { 532 | Parallel par(mem); 533 | 534 | int* cell_ids = levels.back().cell_ids; 535 | int* ref_ids = levels.back().ref_ids; 536 | Cell* cells = levels.back().cells; 537 | Entry* entries = levels.back().entries; 538 | 539 | int num_top_cells = dims.x * dims.y * dims.z; 540 | int num_refs = levels.back().num_refs; 541 | int num_cells = levels.back().num_cells; 542 | 543 | int cur_level = levels.size(); 544 | 545 | auto kept_flags = mem.alloc(num_refs + 1); 546 | 547 | // Find out which cell will be split based on whether it is empty or not and the maximum depth 548 | compute_dims<<>>(cell_ids, cells, log_dims, entries, num_refs); 549 | DEBUG_SYNC(); 550 | update_log_dims<<>>(log_dims, num_top_cells); 551 | DEBUG_SYNC(); 552 | mark_kept_refs<<>>(cell_ids, entries, kept_flags, num_refs); 553 | DEBUG_SYNC(); 554 | 555 | // Store the sub-cells starting index in the entries 556 | auto start_cell = mem.alloc(num_cells + 1); 557 | int num_new_cells = par.scan(par.transform(entries, [] __device__ (Entry e) { 558 | return e.log_dim == 0 ? 0 : 8; 559 | }), num_cells + 1, start_cell); 560 | update_entries<<>>(start_cell, entries, num_cells); 561 | DEBUG_SYNC(); 562 | 563 | mem.free(start_cell); 564 | 565 | // Partition the set of cells into the sets of those which will be split and those which won't 566 | auto tmp_ref_ids = mem.alloc(num_refs * 2); 567 | auto tmp_cell_ids = tmp_ref_ids + num_refs; 568 | int num_sel_refs = par.partition(ref_ids, tmp_ref_ids, num_refs, kept_flags); 569 | int num_sel_cells = par.partition(cell_ids, tmp_cell_ids, num_refs, kept_flags); 570 | assert(num_sel_refs == num_sel_cells); 571 | 572 | mem.free(kept_flags); 573 | 574 | std::swap(tmp_ref_ids, ref_ids); 575 | std::swap(tmp_cell_ids, cell_ids); 576 | mem.free(tmp_ref_ids); 577 | 578 | int num_kept = num_sel_refs; 579 | levels.back().ref_ids = ref_ids; 580 | levels.back().cell_ids = cell_ids; 581 | levels.back().num_kept = num_kept; 582 | 583 | if (num_new_cells == 0) { 584 | // Exit here because no new reference will be emitted 585 | mem.free(log_dims); 586 | return false; 587 | } 588 | 589 | int num_split = num_refs - num_kept; 590 | 591 | // Split the references 592 | auto split_masks = mem.alloc(num_split + 1); 593 | auto start_split = mem.alloc(num_split + 1); 594 | compute_split_masks<<>>(cell_ids + num_kept, ref_ids + num_kept, prims, cells, split_masks, num_split); 595 | DEBUG_SYNC(); 596 | 597 | int num_new_refs = par.scan(par.transform(split_masks, [] __device__ (int mask) { 598 | return __popc(mask); 599 | }), num_split + 1, start_split); 600 | assert(num_new_refs <= 8 * num_split); 601 | 602 | auto new_ref_ids = mem.alloc(num_new_refs * 2); 603 | auto new_cell_ids = new_ref_ids + num_new_refs; 604 | split_refs<<>>(cell_ids + num_kept, ref_ids + num_kept, entries, split_masks, start_split, new_cell_ids, new_ref_ids, num_split); 605 | DEBUG_SYNC(); 606 | 607 | mem.free(split_masks); 608 | mem.free(start_split); 609 | 610 | // Emission of the new cells 611 | auto new_cells = mem.alloc(num_new_cells + 0); 612 | auto new_entries = mem.alloc(num_new_cells + 1); 613 | emit_new_cells<<>>(entries, cells, new_cells, num_cells); 614 | DEBUG_SYNC(); 615 | mem.zero(new_entries, num_new_cells + 1); 616 | 617 | levels.emplace_back(new_ref_ids, new_cell_ids, num_new_refs, num_new_refs, new_cells, new_entries, num_new_cells); 618 | return true; 619 | } 620 | 621 | void concat_levels(MemManager& mem, std::vector& levels, Grid& grid) { 622 | Parallel par(mem); 623 | int 
num_levels = levels.size(); 624 | 625 | // Start with references 626 | int total_refs = 0; 627 | int total_cells = 0; 628 | for (auto& level : levels) { 629 | total_refs += level.num_kept; 630 | total_cells += level.num_cells; 631 | } 632 | 633 | // Copy primitive references as-is 634 | auto ref_ids = mem.alloc(total_refs); 635 | auto cell_ids = mem.alloc(total_refs); 636 | for (int i = 0, off = 0; i < num_levels; off += levels[i].num_kept, i++) { 637 | mem.copy(ref_ids + off, levels[i].ref_ids, levels[i].num_kept); 638 | } 639 | // Copy the cell indices with an offset 640 | for (int i = 0, off = 0, cell_off = 0; i < num_levels; off += levels[i].num_kept, cell_off += levels[i].num_cells, i++) { 641 | int num_kept = levels[i].num_kept; 642 | if (num_kept) { 643 | copy_refs<<>>(levels[i].cell_ids, cell_ids + off, cell_off, num_kept); 644 | DEBUG_SYNC(); 645 | } 646 | mem.free(levels[i].ref_ids); 647 | } 648 | 649 | // Mark the cells at the leaves of the structure as kept 650 | auto kept_cells = mem.alloc(total_cells + 1); 651 | for (int i = 0, cell_off = 0; i < num_levels; cell_off += levels[i].num_cells, i++) { 652 | int num_cells = levels[i].num_cells; 653 | mark_kept_cells<<>>(levels[i].entries, kept_cells + cell_off, num_cells); 654 | DEBUG_SYNC(); 655 | } 656 | 657 | // Compute the insertion position of each cell 658 | auto start_cell = mem.alloc(total_cells + 1); 659 | int new_total_cells = par.scan(kept_cells, total_cells + 1, start_cell); 660 | mem.free(kept_cells); 661 | 662 | // Allocate new cells, and copy only the cells that are kept 663 | auto cells = mem.alloc(new_total_cells); 664 | for (int i = 0, cell_off = 0; i < num_levels; cell_off += levels[i].num_cells, i++) { 665 | int num_cells = levels[i].num_cells; 666 | copy_cells<<>>(levels[i].cells, start_cell, cells, cell_off, num_cells); 667 | DEBUG_SYNC(); 668 | mem.free(levels[i].cells); 669 | } 670 | 671 | auto entries = mem.alloc(total_cells); 672 | for (int i = 0, off = 0; i < num_levels; off += levels[i].num_cells, i++) { 673 | int num_cells = levels[i].num_cells; 674 | int next_level_off = off + num_cells; 675 | copy_entries<<>>(levels[i].entries, start_cell, entries + off, off, next_level_off, num_cells); 676 | DEBUG_SYNC(); 677 | mem.free(levels[i].entries); 678 | } 679 | 680 | // Remap the cell indices in the references (which currently map to incorrect cells) 681 | remap_refs<<>>(cell_ids, start_cell, total_refs); 682 | DEBUG_SYNC(); 683 | 684 | mem.free(start_cell); 685 | 686 | // Sort the references by cell (re-use old slots whenever possible) 687 | auto tmp_ref_ids = mem.alloc(total_refs); 688 | auto tmp_cell_ids = mem.alloc(total_refs); 689 | auto new_ref_ids = tmp_ref_ids; 690 | auto new_cell_ids = tmp_cell_ids; 691 | par.sort_pairs(cell_ids, ref_ids, new_cell_ids, new_ref_ids, total_refs, ilog2(new_total_cells)); 692 | if (ref_ids != new_ref_ids) std::swap(ref_ids, tmp_ref_ids); 693 | if (cell_ids != new_cell_ids) std::swap(cell_ids, tmp_cell_ids); 694 | mem.free(tmp_ref_ids); 695 | mem.free(tmp_cell_ids); 696 | 697 | // Compute the ranges of references for each cell 698 | compute_cell_ranges<<>>(cell_ids, cells, total_refs); 699 | DEBUG_SYNC(); 700 | 701 | mem.free(cell_ids); 702 | 703 | grid.entries = entries; 704 | grid.ref_ids = ref_ids; 705 | grid.cells = cells; 706 | grid.shift = levels.size() - 1; 707 | grid.num_cells = new_total_cells; 708 | grid.num_entries = total_cells; 709 | grid.num_refs = total_refs; 710 | 711 | grid.offsets.resize(levels.size()); 712 | for (int i = 0, off = 0; i < levels.size(); 
i++) { 713 | off += levels[i].num_cells; 714 | grid.offsets[i] = off; 715 | } 716 | } 717 | 718 | template 719 | void build(MemManager& mem, const Primitive* prims, int num_prims, Grid& grid, float top_density, float snd_density) { 720 | Parallel par(mem); 721 | 722 | // Allocate a bounding box for each primitive + one for the global bounding box 723 | auto bboxes = mem.alloc(num_prims + 1); 724 | 725 | compute_bboxes<<>>(prims, bboxes, num_prims); 726 | auto grid_bb = par.reduce(bboxes, num_prims, bboxes + num_prims, 727 | [] __device__ (BBox a, const BBox& b) { return a.extend(b); }, BBox::empty()); 728 | auto dims = compute_grid_dims(grid_bb, num_prims, top_density); 729 | // Round to the next multiple of 2 on each dimension (in order to align the memory) 730 | dims.x = dims.x % 2 ? dims.x + 1 : dims.x; 731 | dims.y = dims.y % 2 ? dims.y + 1 : dims.y; 732 | dims.z = dims.z % 2 ? dims.z + 1 : dims.z; 733 | 734 | // Slightly enlarge the bounding box of the grid 735 | auto extents = grid_bb.extents(); 736 | grid_bb.min -= extents * 0.001f; 737 | grid_bb.max += extents * 0.001f; 738 | 739 | set_global(hagrid::grid_dims, dims); 740 | set_global(hagrid::grid_bbox, grid_bb); 741 | 742 | int* log_dims = nullptr; 743 | int grid_shift = 0; 744 | std::vector levels; 745 | 746 | // Build top level 747 | first_build_iter(mem, snd_density, prims, num_prims, bboxes, grid_bb, dims, log_dims, grid_shift, levels); 748 | 749 | mem.free(bboxes); 750 | 751 | int iter = 1; 752 | while (build_iter(mem, prims, num_prims, dims, log_dims, levels)) iter++; 753 | 754 | concat_levels(mem, levels, grid); 755 | grid.small_cells = nullptr; 756 | grid.dims = dims; 757 | grid.bbox = grid_bb; 758 | } 759 | 760 | void build_grid(MemManager& mem, const Tri* tris, int num_tris, Grid& grid, float top_density, float snd_density) { build(mem, tris, num_tris, grid, top_density, snd_density); } 761 | 762 | } // namespace hagrid 763 | -------------------------------------------------------------------------------- /src/build.h: -------------------------------------------------------------------------------- 1 | #ifndef BUILD_H 2 | #define BUILD_H 3 | 4 | #include "mem_manager.h" 5 | #include "prims.h" 6 | #include "grid.h" 7 | 8 | namespace hagrid { 9 | 10 | /// Builds an initial irregular grid. 11 | /// The building process starts by creating a uniform grid of density 'top_density', 12 | /// and then proceeds to compute an independent resolution in each of its cells 13 | /// (using the second-level density 'snd_density'). 14 | /// In each cell, an octree depth is computed from these independent resolutions 15 | /// and the primitive references are split until every cell has reached its maximum depth. 16 | /// The voxel map follows the octree structure. 17 | void build_grid(MemManager& mem, const Tri* tris, int num_tris, Grid& grid, float top_density, float snd_density); 18 | 19 | /// Performs the neighbor merging optimization (merging cells according to the SAH). 20 | void merge_grid(MemManager& mem, Grid& grid, float alpha); 21 | 22 | /// Flattens the voxel map to speed up queries. 23 | /// Once this optimization is performed, the voxel map no longer follows an octree structure. 24 | /// Each inner node of the voxel map now may have up to 1 << (3 * (1 << Entry::LOG_DIM_BITS - 1)) children. 25 | void flatten_grid(MemManager& mem, Grid& grid); 26 | 27 | /// Performs the cell expansion optimization (expands cells over neighbors that share the same set of primitives). 
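/// Each iteration runs one expansion step along the X, Y and Z axes in turn (see expansion_iter in expand.cu).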
28 | void expand_grid(MemManager& mem, Grid& grid, const Tri* tris, int iters); 29 | 30 | /// Tries to compress the grid by using sentinels in the reference array and using 16-bit cell dimensions. Returns true on success, otherwise false. 31 | bool compress_grid(MemManager& mem, Grid& grid); 32 | 33 | } // namespace hagrid 34 | 35 | #endif // BUILD_H 36 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef __NVCC__ 9 | #include 10 | #endif 11 | 12 | namespace hagrid { 13 | 14 | /// Returns the number of milliseconds elapsed on the device for the given function 15 | HOST float profile(std::function); 16 | 17 | /// Rounds the division by an integer so that round_div(i, j) * j > i 18 | HOST DEVICE inline int round_div(int i, int j) { 19 | return i / j + (i % j ? 1 : 0); 20 | } 21 | 22 | /// Computes the minimum between two values 23 | template HOST DEVICE T min(T a, T b) { return a < b ? a : b; } 24 | /// Computes the maximum between two values 25 | template HOST DEVICE T max(T a, T b) { return a > b ? a : b; } 26 | /// Clamps the first value in the range defined by the last two arguments 27 | template HOST DEVICE T clamp(T a, T b, T c) { return min(c, max(b, a)); } 28 | /// Swaps the contents of two references 29 | template HOST DEVICE void swap(T& a, T& b) { auto tmp = a; a = b; b = tmp; } 30 | 31 | /// Reinterprets a values as unsigned int 32 | template 33 | HOST DEVICE U as(T t) { 34 | union { T t; U u; } v; 35 | v.t = t; 36 | return v.u; 37 | } 38 | 39 | /// Returns x with the sign of x * y 40 | HOST DEVICE inline float safe_rcp(float x) { 41 | return x != 0 ? 
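// Exact reciprocal for non-zero x; otherwise an infinity carrying the sign of x
// (0x7f800000 is the bit pattern of +infinity).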
1.0f / x : copysign(as(0x7f800000u), x); 42 | } 43 | 44 | /// Returns x with the sign of x * y 45 | HOST DEVICE inline float prodsign(float x, float y) { 46 | return as(as(x) ^ (as(y) & 0x80000000)); 47 | } 48 | 49 | /// Converts a float to an ordered float 50 | HOST DEVICE inline uint32_t float_to_ordered(float f) { 51 | auto u = as(f); 52 | auto mask = -(int)(u >> 31u) | 0x80000000u; 53 | return u ^ mask; 54 | } 55 | 56 | /// Converts back an ordered integer to float 57 | HOST DEVICE inline float ordered_to_float(uint32_t u) { 58 | auto mask = ((u >> 31u) - 1u) | 0x80000000u; 59 | return as(u ^ mask); 60 | } 61 | 62 | /// Computes the cubic root of an integer 63 | HOST DEVICE inline int icbrt(int x) { 64 | unsigned y = 0; 65 | for (int s = 30; s >= 0; s = s - 3) { 66 | y = 2 * y; 67 | const unsigned b = (3 * y * (y + 1) + 1) << s; 68 | if (x >= b) { 69 | x = x - b; 70 | y = y + 1; 71 | } 72 | } 73 | return y; 74 | } 75 | 76 | template struct Log2 { enum { Value = Log2::Value }; }; 77 | template struct Log2<1, I> { enum { Value = I }; }; 78 | 79 | /// Computes the logarithm in base 2 of an integer such that (1 << log2(x)) >= x 80 | template 81 | HOST DEVICE int ilog2(T t) { 82 | auto a = 0; 83 | auto b = sizeof(T) * 8; 84 | auto all = T(-1); 85 | #pragma unroll 86 | for (int i = 0; i < Log2::Value; i++) { 87 | auto m = (a + b) / 2; 88 | T mask = all << T(m); 89 | if (t & mask) a = m + 1; 90 | else b = m; 91 | } 92 | return a; 93 | } 94 | 95 | #ifdef __NVCC__ 96 | #ifndef NDEBUG 97 | #define DEBUG_SYNC() CHECK_CUDA_CALL(cudaDeviceSynchronize()) 98 | #else 99 | #define DEBUG_SYNC() do{} while(0) 100 | #endif 101 | #define CHECK_CUDA_CALL(x) check_cuda_call(x, __FILE__, __LINE__) 102 | 103 | __host__ static void check_cuda_call(cudaError_t err, const char* file, int line) { 104 | if (err != cudaSuccess) { 105 | std::cerr << file << "(" << line << "): " << cudaGetErrorString(err) << std::endl; 106 | abort(); 107 | } 108 | } 109 | 110 | template 111 | __host__ void set_global(T& symbol, const T& val) { 112 | size_t size; 113 | CHECK_CUDA_CALL(cudaGetSymbolSize(&size, symbol)); 114 | CHECK_CUDA_CALL(cudaMemcpyToSymbol(symbol, &val, size)); 115 | } 116 | 117 | template 118 | __host__ T get_global(const T& symbol) { 119 | size_t size; 120 | T val; 121 | CHECK_CUDA_CALL(cudaGetSymbolSize(&size, symbol)); 122 | CHECK_CUDA_CALL(cudaMemcpyFromSymbol(&val, symbol, size)); 123 | return val; 124 | } 125 | #endif // __NVCC__ 126 | 127 | } // namespace hagrid 128 | 129 | #endif 130 | -------------------------------------------------------------------------------- /src/compress.cu: -------------------------------------------------------------------------------- 1 | #include "parallel.cuh" 2 | #include "build.h" 3 | 4 | namespace hagrid { 5 | 6 | __global__ void count_sentinel_refs(const Cell* cells, int* ref_counts, int num_cells) { 7 | int id = threadIdx.x + blockDim.x * blockIdx.x; 8 | if (id >= num_cells) return; 9 | 10 | auto cell = load_cell(cells + id); 11 | auto count = cell.end - cell.begin; 12 | ref_counts[id] = count > 0 ? 
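// Non-empty cells get one extra slot for the -1 sentinel that terminates
// their reference list (written by emit_small_cells below).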
count + 1 : 0; 13 | } 14 | 15 | __global__ void emit_small_cells(const Cell* cells, 16 | SmallCell* small_cells, 17 | int* __restrict__ refs, 18 | int* __restrict__ ref_scan, 19 | int* __restrict__ sentinel_refs, 20 | int num_cells) { 21 | int id = threadIdx.x + blockDim.x * blockIdx.x; 22 | if (id >= num_cells) return; 23 | 24 | auto cell = load_cell(cells + id); 25 | int first = ref_scan[id]; 26 | int count = cell.end - cell.begin; 27 | 28 | SmallCell small_cell(usvec3(cell.min), usvec3(cell.max), count > 0 ? first : -1); 29 | store_cell(small_cells + id, small_cell); 30 | 31 | if (count > 0) { 32 | for (int i = 0; i < count; i++) 33 | sentinel_refs[first + i] = refs[cell.begin + i]; 34 | sentinel_refs[first + count] = -1; 35 | } 36 | } 37 | 38 | bool compress_grid(MemManager& mem, Grid& grid) { 39 | auto dims = grid.dims << grid.shift; 40 | // Compression cannot work if the dimensions cannot fit into 16-bit indices 41 | if (dims.x >= (1 << 16) || 42 | dims.y >= (1 << 16) || 43 | dims.z >= (1 << 16)) 44 | return false; 45 | 46 | Parallel par(mem); 47 | auto ref_counts = mem.alloc(grid.num_cells + 1); 48 | auto ref_scan = mem.alloc(grid.num_cells + 1); 49 | auto small_cells = mem.alloc(grid.num_cells); 50 | count_sentinel_refs<<>>(grid.cells, ref_counts, grid.num_cells); 51 | auto num_sentinel_refs = par.scan(ref_counts, grid.num_cells + 1, ref_scan); 52 | auto sentinel_refs = mem.alloc(num_sentinel_refs); 53 | emit_small_cells<<>>(grid.cells, small_cells, grid.ref_ids, ref_scan, sentinel_refs, grid.num_cells); 54 | grid.small_cells = small_cells; 55 | mem.free(grid.cells); 56 | mem.free(grid.ref_ids); 57 | mem.free(ref_counts); 58 | mem.free(ref_scan); 59 | grid.cells = nullptr; 60 | grid.ref_ids = sentinel_refs; 61 | grid.num_refs = num_sentinel_refs; 62 | return true; 63 | } 64 | 65 | } // namespace hagrid 66 | -------------------------------------------------------------------------------- /src/expand.cu: -------------------------------------------------------------------------------- 1 | #include "build.h" 2 | 3 | namespace hagrid { 4 | 5 | static __constant__ ivec3 grid_dims; 6 | static __constant__ vec3 grid_min; 7 | static __constant__ vec3 cell_size; 8 | static __constant__ vec3 grid_inv; 9 | static __constant__ int grid_shift; 10 | 11 | /// Returns true if an overlap with a neighboring cell is possible 12 | template 13 | __device__ bool overlap_possible(const Cell& cell) { 14 | if (dir) 15 | return get(cell.max) < get(grid_dims); 16 | else 17 | return get(cell.min) > 0; 18 | } 19 | 20 | /// Determines if the given range of references is a subset of the other 21 | __device__ __forceinline__ bool is_subset(const int* __restrict__ p0, int c0, const int* __restrict__ p1, int c1) { 22 | if (c1 > c0) return false; 23 | if (c1 == 0) return true; 24 | 25 | int i = 0, j = 0; 26 | 27 | do { 28 | const int a = p0[i]; 29 | const int b = p1[j]; 30 | if (b < a) return false; 31 | j += (a == b); 32 | i++; 33 | } while (i < c0 & j < c1); 34 | 35 | return j == c1; 36 | } 37 | 38 | /// Computes the amount of overlap possible for a cell and a given primitive 39 | template 40 | __device__ int compute_overlap(const Primitive& prim, const Cell& cell, const BBox& cell_bbox, int d) { 41 | static constexpr int axis1 = (axis + 1) % 3; 42 | static constexpr int axis2 = (axis + 2) % 3; 43 | auto prim_bbox = prim.bbox(); 44 | 45 | if (get(prim_bbox.min) <= get(cell_bbox.max) && 46 | get(prim_bbox.max) >= get(cell_bbox.min) && 47 | get(prim_bbox.min) <= get(cell_bbox.max) && 48 | get(prim_bbox.max) >= 
get(cell_bbox.min)) { 49 | // Approximation: use the original bounding box, not the clipped one 50 | int prim_d = ((dir ? get(prim_bbox.min) : get(prim_bbox.max)) - get(grid_min)) * get(grid_inv); 51 | d = dir 52 | ? min(d, prim_d - get(cell.max)) 53 | : max(d, prim_d - get(cell.min) + 1); 54 | d = dir ? max(d, 0) : min(d, 0); 55 | } 56 | return d; 57 | } 58 | 59 | /// Finds the maximum overlap possible for one cell 60 | template 61 | __device__ int find_overlap(const Entry* __restrict__ entries, 62 | const int* __restrict__ refs, 63 | const Primitive* __restrict__ prims, 64 | const Cell* cells, 65 | const Cell& cell, 66 | bool& continue_overlap) { 67 | constexpr int axis1 = (axis + 1) % 3; 68 | constexpr int axis2 = (axis + 2) % 3; 69 | 70 | if (!overlap_possible(cell)) return 0; 71 | 72 | int d = dir ? get(grid_dims) : -get(grid_dims); 73 | int k1, k2 = get(grid_dims); 74 | int i = get(cell.min); 75 | int j = get(cell.min); 76 | int max_d = d; 77 | while (true) { 78 | ivec3 next_cell; 79 | if (axis == 0) next_cell = ivec3(dir ? cell.max.x : cell.min.x - 1, i, j); 80 | if (axis == 1) next_cell = ivec3(j, dir ? cell.max.y : cell.min.y - 1, i); 81 | if (axis == 2) next_cell = ivec3(i, j, dir ? cell.max.z : cell.min.z - 1); 82 | auto entry = lookup_entry(entries, grid_shift, grid_dims >> grid_shift, next_cell); 83 | auto next = load_cell(cells + entry); 84 | 85 | max_d = dir 86 | ? min(max_d, get(next.max) - get(cell.max)) 87 | : max(max_d, get(next.min) - get(cell.min)); 88 | d = dir ? min(d, max_d) : max(d, max_d); 89 | 90 | if (subset_only) { 91 | if (!is_subset(refs + cell.begin, cell.end - cell.begin, 92 | refs + next.begin, next.end - next.begin)) { 93 | d = 0; 94 | break; 95 | } 96 | } else { 97 | if (next.begin < next.end) { 98 | auto cell_bbox = BBox(grid_min + cell_size * vec3(cell.min), 99 | grid_min + cell_size * vec3(cell.max)); 100 | 101 | int p1 = cell.begin, p2 = next.begin; 102 | int ref2 = refs[p2]; 103 | while (true) { 104 | // Skip references that are present in the current cell 105 | while (p1 < cell.end) { 106 | int ref1 = refs[p1]; 107 | 108 | if (ref1 > ref2) break; 109 | if (ref1 == ref2) { 110 | if (++p2 >= next.end) break; 111 | ref2 = refs[p2]; 112 | } 113 | 114 | p1++; 115 | } 116 | 117 | if (p2 >= next.end) break; 118 | 119 | // Process references that are only present in the next cell 120 | d = compute_overlap(load_prim(prims + ref2), cell, cell_bbox, d); 121 | if (d == 0 || ++p2 >= next.end) break; 122 | ref2 = refs[p2]; 123 | } 124 | } 125 | 126 | if (d == 0) break; 127 | } 128 | 129 | k1 = get(next.max) - i; 130 | k2 = min(k2, get(next.max) - j); 131 | 132 | i += k1; 133 | if (i >= get(cell.max)) { 134 | i = get(cell.min); 135 | j += k2; 136 | k2 = get(grid_dims); 137 | if (j >= get(cell.max)) break; 138 | } 139 | } 140 | 141 | continue_overlap |= d == max_d; 142 | return d; 143 | } 144 | 145 | template 146 | __global__ void overlap_step(const Entry* __restrict__ entries, 147 | const int* __restrict__ refs, 148 | const Primitive* __restrict__ prims, 149 | const Cell* __restrict__ cells, 150 | Cell* __restrict__ new_cells, 151 | int* __restrict__ cell_flags, 152 | int num_cells) { 153 | int id = threadIdx.x + blockDim.x * blockIdx.x; 154 | if (id >= num_cells || (cell_flags[id] & (1 << axis)) == 0) 155 | return; 156 | 157 | auto cell = load_cell(cells + id); 158 | bool flag = false; 159 | constexpr bool subset_only = true; 160 | auto ov1 = find_overlap(entries, refs, prims, cells, cell, flag); 161 | auto ov2 = find_overlap(entries, refs, prims, cells, cell, 
flag); 162 | 163 | if (axis == 0) { 164 | cell.min.x += ov1; 165 | cell.max.x += ov2; 166 | } 167 | 168 | if (axis == 1) { 169 | cell.min.y += ov1; 170 | cell.max.y += ov2; 171 | } 172 | 173 | if (axis == 2) { 174 | cell.min.z += ov1; 175 | cell.max.z += ov2; 176 | } 177 | 178 | // If the cell has not been expanded, we will not process it next time 179 | cell_flags[id] = (flag ? 1 << axis : 0) | (cell_flags[id] & ~(1 << axis)); 180 | 181 | store_cell(new_cells + id, cell); 182 | } 183 | 184 | template 185 | void expansion_iter(Grid& grid, const Primitive* prims, Cell*& new_cells, int* cell_flags) { 186 | overlap_step<0><<>>(grid.entries, grid.ref_ids, prims, grid.cells, new_cells, cell_flags, grid.num_cells); 187 | std::swap(new_cells, grid.cells); 188 | DEBUG_SYNC(); 189 | 190 | overlap_step<1><<>>(grid.entries, grid.ref_ids, prims, grid.cells, new_cells, cell_flags, grid.num_cells); 191 | std::swap(new_cells, grid.cells); 192 | DEBUG_SYNC(); 193 | 194 | overlap_step<2><<>>(grid.entries, grid.ref_ids, prims, grid.cells, new_cells, cell_flags, grid.num_cells); 195 | std::swap(new_cells, grid.cells); 196 | DEBUG_SYNC(); 197 | } 198 | 199 | template 200 | void expand(MemManager& mem, Grid& grid, const Primitive* prims, int iters) { 201 | if (iters == 0) return; 202 | 203 | auto new_cells = mem.alloc(grid.num_cells); 204 | auto cell_flags = mem.alloc(grid.num_cells); 205 | 206 | mem.one(cell_flags, grid.num_cells); 207 | auto extents = grid.bbox.extents(); 208 | auto dims = grid.dims << grid.shift; 209 | auto cell_size = extents / vec3(dims); 210 | auto grid_inv = vec3(dims) / extents; 211 | 212 | set_global(hagrid::grid_dims, dims); 213 | set_global(hagrid::grid_min, grid.bbox.min); 214 | set_global(hagrid::cell_size, cell_size); 215 | set_global(hagrid::grid_inv, grid_inv); 216 | set_global(hagrid::grid_shift, grid.shift); 217 | 218 | for (int i = 0; i < iters; i++) 219 | expansion_iter(grid, prims, new_cells, cell_flags); 220 | 221 | mem.free(cell_flags); 222 | mem.free(new_cells); 223 | } 224 | 225 | void expand_grid(MemManager& mem, Grid& grid, const Tri* tris, int iters) { expand(mem, grid, tris, iters); } 226 | 227 | } // namespace hagrid 228 | -------------------------------------------------------------------------------- /src/flatten.cu: -------------------------------------------------------------------------------- 1 | #include "build.h" 2 | #include "parallel.cuh" 3 | 4 | namespace hagrid { 5 | 6 | static constexpr int flat_levels = (1 << Entry::LOG_DIM_BITS) - 1; 7 | 8 | /// Collapses sub-entries that map to the same cell/sub-sub-entry 9 | __global__ void collapse_entries(Entry* entries, int first, int num_entries) { 10 | int id = threadIdx.x + blockDim.x * blockIdx.x; 11 | if (id >= num_entries) return; 12 | 13 | auto entry = entries[first + id]; 14 | if (entry.log_dim) { 15 | auto ptr = (int4*)(entries + entry.begin); 16 | auto ptr0 = ptr[0]; 17 | if (ptr0.x == ptr0.y && 18 | ptr0.x == ptr0.z && 19 | ptr0.x == ptr0.w) { 20 | auto ptr1 = ptr[1]; 21 | if (ptr0.x == ptr1.x && 22 | ptr1.x == ptr1.y && 23 | ptr1.x == ptr1.z && 24 | ptr1.x == ptr1.w) { 25 | entries[first + id] = as(ptr0); 26 | } 27 | } 28 | } 29 | } 30 | 31 | /// Computes the depth of each entry 32 | __global__ void compute_depths(Entry* entries, int* depths, int first, int num_entries) { 33 | int id = threadIdx.x + blockDim.x * blockIdx.x; 34 | if (id >= num_entries) return; 35 | 36 | auto entry = entries[first + id]; 37 | int d = 0; 38 | if (entry.log_dim) { 39 | auto ptr = (const int4*)(depths + entry.begin); 40 
| auto d0 = ptr[0]; 41 | auto d1 = ptr[1]; 42 | d = 1 + max(max(max(d0.x, d1.x), max(d0.y, d1.y)), 43 | max(max(d0.z, d1.z), max(d0.w, d1.w))); 44 | } 45 | depths[first + id] = d; 46 | } 47 | 48 | /// Copies the top-level entries and change their depth & start index 49 | __global__ void copy_top_level(const Entry* __restrict__ entries, 50 | const int* __restrict__ start_entries, 51 | const int* __restrict__ depths, 52 | Entry* __restrict__ new_entries, 53 | int num_entries) { 54 | int id = threadIdx.x + blockDim.x * blockIdx.x; 55 | if (id >= num_entries) return; 56 | 57 | auto entry = entries[id]; 58 | if (entry.log_dim) { 59 | entry = make_entry(min(depths[id], flat_levels), num_entries + start_entries[id]); 60 | } 61 | new_entries[id] = entry; 62 | } 63 | 64 | /// Flattens several voxel map levels into one larger level 65 | __global__ void flatten_level(const Entry* __restrict__ entries, 66 | const int* __restrict__ start_entries, 67 | const int* __restrict__ depths, 68 | Entry* __restrict__ new_entries, 69 | int first_entry, 70 | int offset, int next_offset, 71 | int num_entries) { 72 | int id = blockIdx.x; 73 | 74 | int d = min(depths[id + first_entry], flat_levels); 75 | int num_sub_entries = d == 0 ? 0 : 1 << (3 * d); 76 | if (num_sub_entries <= 0) return; 77 | 78 | int start = offset + start_entries[id + first_entry]; 79 | auto root = entries[id + first_entry]; 80 | 81 | for (int i = threadIdx.x; i < num_sub_entries; i += blockDim.x) { 82 | // Treat i as a morton code 83 | int cur_d = d; 84 | int x = 0, y = 0, z = 0; 85 | int next_id = id; 86 | auto entry = root; 87 | while (cur_d > 0) { 88 | cur_d--; 89 | 90 | int pos = i >> (cur_d * 3); 91 | x += (pos & 1) ? (1 << cur_d) : 0; 92 | y += (pos & 2) ? (1 << cur_d) : 0; 93 | z += (pos & 4) ? (1 << cur_d) : 0; 94 | 95 | if (entry.log_dim) { 96 | next_id = entry.begin + (pos & 7); 97 | entry = entries[next_id]; 98 | } 99 | } 100 | 101 | if (entry.log_dim) { 102 | entry = make_entry(min(depths[next_id], flat_levels), next_offset + start_entries[next_id]); 103 | } 104 | 105 | new_entries[start + x + ((y + (z << d)) << d)] = entry; 106 | } 107 | } 108 | 109 | void flatten_grid(MemManager& mem, Grid& grid) { 110 | Parallel par(mem); 111 | 112 | auto depths = mem.alloc(grid.num_entries + 1); 113 | 114 | // Flatten the voxel map 115 | for (int i = grid.shift; i >= 0; i--) { 116 | int first = i > 0 ? grid.offsets[i - 1] : 0; 117 | int last = grid.offsets[i]; 118 | int num_entries = last - first; 119 | // Collapse voxel map entries when possible 120 | collapse_entries<<>>(grid.entries, first, num_entries); 121 | DEBUG_SYNC(); 122 | compute_depths<<>>(grid.entries, depths, first, num_entries); 123 | DEBUG_SYNC(); 124 | } 125 | 126 | // Compute the insertion position of each flattened level, and the total new number of entries 127 | auto start_entries = mem.alloc(grid.num_entries + 1); 128 | std::vector level_offsets(grid.shift); 129 | int total_entries = grid.offsets[0]; 130 | for (int i = 0; i < grid.shift; i += flat_levels) { 131 | int first = i > 0 ? grid.offsets[i - 1] : 0; 132 | int last = grid.offsets[i]; 133 | int num_entries = last - first; 134 | 135 | // CUDA 8 bug: decltype(f(...)) is considered as a call to f (which forces to use __host__ here) 136 | int num_new_entries = par.scan(par.transform(depths + first, [] __host__ __device__ (int d) { 137 | return d > 0 ? 
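// A voxel-map subtree of depth d flattens into 8^d = 1 << (3 * d) sub-entries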
1 << (min(d, flat_levels) * 3) : 0; 138 | }), num_entries + 1, start_entries + first); 139 | level_offsets[i] = total_entries; 140 | total_entries += num_new_entries; 141 | } 142 | 143 | // Flatten the voxel map, by concatenating consecutive several levels together 144 | auto new_entries = mem.alloc(total_entries); 145 | std::vector new_offsets; 146 | 147 | copy_top_level<<>>(grid.entries, start_entries, depths, new_entries, grid.offsets[0]); 148 | for (int i = 0; i < grid.shift; i += flat_levels) { 149 | int first = i > 0 ? grid.offsets[i - 1] : 0; 150 | int last = grid.offsets[i]; 151 | int num_entries = last - first; 152 | 153 | int next_offset = i + flat_levels < grid.shift ? level_offsets[i + flat_levels] : 0; 154 | flatten_level<<>>(grid.entries, 155 | start_entries, 156 | depths, 157 | new_entries, 158 | first, 159 | level_offsets[i], 160 | next_offset, 161 | num_entries); 162 | DEBUG_SYNC(); 163 | 164 | new_offsets.emplace_back(level_offsets[i]); 165 | } 166 | new_offsets.emplace_back(total_entries); 167 | 168 | std::swap(new_entries, grid.entries); 169 | std::swap(new_offsets, grid.offsets); 170 | mem.free(new_entries); 171 | grid.num_entries = total_entries; 172 | 173 | mem.free(depths); 174 | mem.free(start_entries); 175 | } 176 | 177 | } // namespace hagrid 178 | -------------------------------------------------------------------------------- /src/grid.h: -------------------------------------------------------------------------------- 1 | #ifndef GRID_H 2 | #define GRID_H 3 | 4 | #include 5 | 6 | #include "vec.h" 7 | #include "bbox.h" 8 | 9 | namespace hagrid { 10 | 11 | /// Voxel map entry 12 | struct Entry { 13 | enum { 14 | LOG_DIM_BITS = 2, 15 | BEGIN_BITS = 32 - LOG_DIM_BITS 16 | }; 17 | 18 | uint32_t log_dim : LOG_DIM_BITS; ///< Logarithm of the dimensions of the entry (0 for leaves) 19 | uint32_t begin : BEGIN_BITS; ///< Next entry index (cell index for leaves) 20 | }; 21 | 22 | /// Cell of the irregular grid 23 | struct Cell { 24 | ivec3 min; ///< Minimum bounding box coordinate 25 | int begin; ///< Index of the first reference 26 | ivec3 max; ///< Maximum bounding box coordinate 27 | int end; ///< Past-the-end reference index 28 | 29 | HOST DEVICE Cell() {} 30 | HOST DEVICE Cell(const ivec3& min, int begin, const ivec3& max, int end) 31 | : min(min), begin(begin), max(max), end(end) 32 | {} 33 | }; 34 | 35 | /// Compressed irregular grid cell 36 | struct SmallCell { 37 | usvec3 min; ///< Minimum bounding box coordinate 38 | usvec3 max; ///< Maximum bounding box coordinate 39 | int begin; ///< Index of the first reference 40 | 41 | HOST DEVICE SmallCell() {} 42 | HOST DEVICE SmallCell(const usvec3& min, const usvec3& max, int begin) 43 | : min(min), max(max), begin(begin) 44 | {} 45 | }; 46 | 47 | /// Structure holding an irregular grid 48 | struct Grid { 49 | Entry* entries; ///< Voxel map, stored as a contiguous array 50 | int* ref_ids; ///< Array of primitive references 51 | Cell* cells; ///< Cells of the structure (nullptr if compressed) 52 | 53 | SmallCell* small_cells; ///< Compressed cells (nullptr if not compressed) 54 | 55 | BBox bbox; ///< Bounding box of the scene 56 | ivec3 dims; ///< Top-level dimensions 57 | int num_cells; ///< Number of cells 58 | int num_entries; ///< Number of elements in the voxel map 59 | int num_refs; ///< Number of primitive references 60 | int shift; ///< Amount of bits to shift to get from the deepest level to the top-level 61 | std::vector offsets; ///< Offset to each level of the voxel map octree 62 | }; 63 | 64 | /// A 3D 
integer range 65 | struct Range { 66 | int lx, ly, lz; 67 | int hx, hy, hz; 68 | HOST DEVICE Range() {} 69 | HOST DEVICE Range(int lx, int ly, int lz, 70 | int hx, int hy, int hz) 71 | : lx(lx), ly(ly), lz(lz) 72 | , hx(hx), hy(hy), hz(hz) 73 | {} 74 | HOST DEVICE int size() const { return (hx - lx + 1) * (hy - ly + 1) * (hz - lz + 1) ; } 75 | }; 76 | 77 | /// Returns a voxel map entry with the given dimension and starting index 78 | HOST DEVICE inline Entry make_entry(uint32_t log_dim, uint32_t begin) { 79 | Entry e { .log_dim = log_dim, .begin = begin }; 80 | return e; 81 | } 82 | 83 | /// Computes the range of cells that intersect the given box 84 | HOST DEVICE inline Range compute_range(const ivec3& dims, const BBox& grid_bb, const BBox& obj_bb) { 85 | auto inv = vec3(dims) / grid_bb.extents(); 86 | int lx = max(int((obj_bb.min.x - grid_bb.min.x) * inv.x), 0); 87 | int ly = max(int((obj_bb.min.y - grid_bb.min.y) * inv.y), 0); 88 | int lz = max(int((obj_bb.min.z - grid_bb.min.z) * inv.z), 0); 89 | int hx = min(int((obj_bb.max.x - grid_bb.min.x) * inv.x), dims.x - 1); 90 | int hy = min(int((obj_bb.max.y - grid_bb.min.y) * inv.y), dims.y - 1); 91 | int hz = min(int((obj_bb.max.z - grid_bb.min.z) * inv.z), dims.z - 1); 92 | return Range(lx, ly, lz, hx, hy, hz); 93 | } 94 | 95 | /// Computes grid dimensions based on the formula by Cleary et al. 96 | HOST DEVICE inline ivec3 compute_grid_dims(const BBox& bb, int num_prims, float density) { 97 | const vec3 extents = bb.extents(); 98 | const float volume = extents.x * extents.y * extents.z; 99 | const float ratio = cbrtf(density * num_prims / volume); 100 | return max(ivec3(1), ivec3(extents.x * ratio, extents.y * ratio, extents.z * ratio)); 101 | } 102 | 103 | HOST DEVICE inline uint32_t lookup_entry(const Entry* entries, int shift, const ivec3& dims, const ivec3& voxel) { 104 | auto entry = entries[(voxel.x >> shift) + dims.x * ((voxel.y >> shift) + dims.y * (voxel.z >> shift))]; 105 | auto log_dim = entry.log_dim, d = log_dim; 106 | while (log_dim) { 107 | auto begin = entry.begin; 108 | auto mask = (1 << log_dim) - 1; 109 | 110 | auto k = (voxel >> int(shift - d)) & mask; 111 | entry = entries[begin + k.x + ((k.y + (k.z << log_dim)) << log_dim)]; 112 | log_dim = entry.log_dim; 113 | d += log_dim; 114 | } 115 | return entry.begin; 116 | } 117 | 118 | template 119 | HOST DEVICE int foreach_ref(Cell cell, const int* ref_ids, F f) { 120 | int cur = cell.begin, ref = cur < cell.end ? ref_ids[cur++] : -1; 121 | while (ref >= 0) { 122 | // Preload the next reference 123 | auto next = cur < cell.end ? ref_ids[cur++] : -1; 124 | f(ref); 125 | ref = next; 126 | } 127 | return cell.end - cell.begin; 128 | } 129 | 130 | template 131 | HOST DEVICE int foreach_ref(SmallCell small_cell, const int* ref_ids, F f) { 132 | auto cur = small_cell.begin; 133 | auto ref = cur >= 0 ? 
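// begin < 0 marks an empty compressed cell; otherwise the reference list ends with a -1 sentinel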
ref_ids[cur++] : -1; 134 | while (ref >= 0) { 135 | auto next = ref_ids[cur++]; 136 | f(ref); 137 | ref = next; 138 | } 139 | return cur - small_cell.begin; 140 | } 141 | 142 | #ifdef __NVCC__ 143 | __device__ __forceinline__ Cell load_cell(const Cell* cell_ptr) { 144 | const int4* ptr = (const int4*)cell_ptr; 145 | auto cell0 = ptr[0]; 146 | auto cell1 = ptr[1]; 147 | return Cell(ivec3(cell0.x, cell0.y, cell0.z), cell0.w, 148 | ivec3(cell1.x, cell1.y, cell1.z), cell1.w); 149 | } 150 | 151 | __device__ __forceinline__ ivec3 load_cell_min(const Cell* cell_ptr) { 152 | auto cell0 = ((const int4*)cell_ptr)[0]; 153 | return ivec3(cell0.x, cell0.y, cell0.z); 154 | } 155 | 156 | __device__ __forceinline__ void store_cell(Cell* cell_ptr, const Cell& cell) { 157 | int4* ptr = (int4*)cell_ptr; 158 | ptr[0] = make_int4(cell.min.x, cell.min.y, cell.min.z, cell.begin); 159 | ptr[1] = make_int4(cell.max.x, cell.max.y, cell.max.z, cell.end); 160 | } 161 | 162 | __device__ __forceinline__ SmallCell load_cell(const SmallCell* cell_ptr) { 163 | const uint4* ptr = (const uint4*)cell_ptr; 164 | auto cell = *ptr; 165 | return SmallCell(usvec3(cell.x, cell.x >> 16, cell.y), 166 | usvec3(cell.y >> 16, cell.z, cell.z >> 16), 167 | cell.w); 168 | } 169 | 170 | __device__ __forceinline__ void store_cell(const SmallCell* cell_ptr, const SmallCell& cell) { 171 | uint4* ptr = (uint4*)cell_ptr; 172 | *ptr = make_uint4(cell.min.x | ((uint)(cell.min.y) << 16), 173 | cell.min.z | ((uint)(cell.max.x) << 16), 174 | cell.max.y | ((uint)(cell.max.z) << 16), 175 | (uint)cell.begin); 176 | } 177 | #endif // __NVCC__ 178 | 179 | } // namespace hagrid 180 | 181 | #endif // GRID_H 182 | -------------------------------------------------------------------------------- /src/load_obj.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "load_obj.h" 7 | 8 | namespace hagrid { 9 | 10 | inline void error() { 11 | std::cerr << std::endl; 12 | } 13 | 14 | template 15 | inline void error(T t, Args... 
args) { 16 | #ifndef NDEBUG 17 | std::cerr << t; 18 | error(args...); 19 | #endif 20 | } 21 | 22 | inline void remove_eol(char* ptr) { 23 | int i = 0; 24 | while (ptr[i]) i++; 25 | i--; 26 | while (i > 0 && std::isspace(ptr[i])) { 27 | ptr[i] = '\0'; 28 | i--; 29 | } 30 | } 31 | 32 | inline char* strip_text(char* ptr) { 33 | while (*ptr && !std::isspace(*ptr)) { ptr++; } 34 | return ptr; 35 | } 36 | 37 | inline char* strip_spaces(char* ptr) { 38 | while (std::isspace(*ptr)) { ptr++; } 39 | return ptr; 40 | } 41 | 42 | inline bool read_index(char** ptr, ObjLoader::Index& idx) { 43 | char* base = *ptr; 44 | 45 | // Detect end of line (negative indices are supported) 46 | base = strip_spaces(base); 47 | if (!std::isdigit(*base) && *base != '-') return false; 48 | 49 | idx.v = 0; 50 | idx.t = 0; 51 | idx.n = 0; 52 | 53 | idx.v = std::strtol(base, &base, 10); 54 | 55 | base = strip_spaces(base); 56 | 57 | if (*base == '/') { 58 | base++; 59 | 60 | // Handle the case when there is no texture coordinate 61 | if (*base != '/') { 62 | idx.t = std::strtol(base, &base, 10); 63 | } 64 | 65 | base = strip_spaces(base); 66 | 67 | if (*base == '/') { 68 | base++; 69 | idx.n = std::strtol(base, &base, 10); 70 | } 71 | } 72 | 73 | *ptr = base; 74 | 75 | return true; 76 | } 77 | 78 | bool ObjLoader::load_obj(const std::string& path, File& file) { 79 | std::ifstream stream(path); 80 | if (!stream) return false; 81 | 82 | // Add an empty object to the scene 83 | int cur_object = 0; 84 | file.objects.emplace_back(); 85 | 86 | // Add an empty group to this object 87 | int cur_group = 0; 88 | file.objects[0].groups.emplace_back(); 89 | 90 | // Add an empty material to the scene 91 | int cur_mtl = 0; 92 | file.materials.emplace_back(""); 93 | 94 | // Add dummy vertex, normal, and texcoord 95 | file.vertices.emplace_back(); 96 | file.normals.emplace_back(); 97 | file.texcoords.emplace_back(); 98 | 99 | int err_count = 0; 100 | const int max_line = 1024; 101 | char line[max_line]; 102 | while (stream.getline(line, max_line)) { 103 | // Strip spaces 104 | char* ptr = strip_spaces(line); 105 | const char* err_line = ptr; 106 | 107 | // Skip comments and empty lines 108 | if (*ptr == '\0' || *ptr == '#') 109 | continue; 110 | 111 | remove_eol(ptr); 112 | 113 | // Test each command in turn, the most frequent first 114 | if (*ptr == 'v') { 115 | switch (ptr[1]) { 116 | case ' ': 117 | case '\t': 118 | { 119 | vec3 v; 120 | v.x = std::strtof(ptr + 1, &ptr); 121 | v.y = std::strtof(ptr, &ptr); 122 | v.z = std::strtof(ptr, &ptr); 123 | file.vertices.push_back(v); 124 | } 125 | break; 126 | case 'n': 127 | #ifndef SKIP_NORMALS 128 | { 129 | vec3 n; 130 | n.x = std::strtof(ptr + 2, &ptr); 131 | n.y = std::strtof(ptr, &ptr); 132 | n.z = std::strtof(ptr, &ptr); 133 | file.normals.push_back(n); 134 | } 135 | #endif 136 | break; 137 | case 't': 138 | #ifndef SKIP_TEXCOORDS 139 | { 140 | vec2 t; 141 | t.x = std::strtof(ptr + 2, &ptr); 142 | t.y = std::strtof(ptr, &ptr); 143 | file.texcoords.push_back(t); 144 | } 145 | #endif 146 | break; 147 | default: 148 | error("invalid vertex"); 149 | err_count++; 150 | break; 151 | } 152 | } else if (*ptr == 'f' && std::isspace(ptr[1])) { 153 | Face f; 154 | 155 | f.index_count = 0; 156 | f.material = cur_mtl; 157 | 158 | bool valid = true; 159 | ptr += 2; 160 | while(f.index_count < Face::max_indices) { 161 | Index index; 162 | valid = read_index(&ptr, index); 163 | 164 | if (valid) { 165 | f.indices[f.index_count++] = index; 166 | } else { 167 | break; 168 | } 169 | } 170 | 171 | if 
(f.index_count < 3) { 172 | error("invalid face"); 173 | err_count++; 174 | } else { 175 | // Convert relative indices to absolute 176 | for (int i = 0; i < f.index_count; i++) { 177 | f.indices[i].v = (f.indices[i].v < 0) ? file.vertices.size() + f.indices[i].v : f.indices[i].v; 178 | f.indices[i].t = (f.indices[i].t < 0) ? file.texcoords.size() + f.indices[i].t : f.indices[i].t; 179 | f.indices[i].n = (f.indices[i].n < 0) ? file.normals.size() + f.indices[i].n : f.indices[i].n; 180 | } 181 | 182 | // Check if the indices are valid or not 183 | valid = true; 184 | for (int i = 0; i < f.index_count; i++) { 185 | if (f.indices[i].v <= 0 || f.indices[i].t < 0 || f.indices[i].n < 0) { 186 | valid = false; 187 | break; 188 | } 189 | } 190 | 191 | if (valid) { 192 | file.objects[cur_object].groups[cur_group].faces.push_back(f); 193 | } else { 194 | error("invalid indices"); 195 | err_count++; 196 | } 197 | } 198 | } else if (*ptr == 'g' && std::isspace(ptr[1])) { 199 | file.objects[cur_object].groups.emplace_back(); 200 | cur_group++; 201 | } else if (*ptr == 'o' && std::isspace(ptr[1])) { 202 | file.objects.emplace_back(); 203 | cur_object++; 204 | 205 | file.objects[cur_object].groups.emplace_back(); 206 | cur_group = 0; 207 | } else if (!std::strncmp(ptr, "usemtl", 6) && std::isspace(ptr[6])) { 208 | ptr += 6; 209 | 210 | ptr = strip_spaces(ptr); 211 | char* base = ptr; 212 | ptr = strip_text(ptr); 213 | 214 | const std::string mtl_name(base, ptr); 215 | 216 | cur_mtl = std::find(file.materials.begin(), file.materials.end(), mtl_name) - file.materials.begin(); 217 | if (cur_mtl == (int)file.materials.size()) { 218 | file.materials.push_back(mtl_name); 219 | } 220 | } else if (!std::strncmp(ptr, "mtllib", 6) && std::isspace(ptr[6])) { 221 | ptr += 6; 222 | 223 | ptr = strip_spaces(ptr); 224 | char* base = ptr; 225 | ptr = strip_text(ptr); 226 | 227 | const std::string lib_name(base, ptr); 228 | 229 | file.mtl_libs.push_back(lib_name); 230 | } else if (*ptr == 's' && std::isspace(ptr[1])) { 231 | // Ignore smooth commands 232 | } else { 233 | error("unknown command ", ptr); 234 | err_count++; 235 | } 236 | } 237 | 238 | return (err_count == 0); 239 | } 240 | 241 | bool ObjLoader::load_mtl(const std::string& path, MaterialLib& mtl_lib) { 242 | std::ifstream stream(path); 243 | if (!stream) return false; 244 | 245 | const int max_line = 1024; 246 | char line[max_line]; 247 | char* err_line = line; 248 | int err_count = 0; 249 | 250 | std::string mtl_name; 251 | auto current_material = [&] () -> Material& { 252 | return mtl_lib[mtl_name]; 253 | }; 254 | 255 | while (stream.getline(line, max_line)) { 256 | // Strip spaces 257 | char* ptr = strip_spaces(line); 258 | err_line = ptr; 259 | 260 | // Skip comments and empty lines 261 | if (*ptr == '\0' || *ptr == '#') 262 | continue; 263 | 264 | remove_eol(ptr); 265 | 266 | if (!std::strncmp(ptr, "newmtl", 6) && std::isspace(ptr[6])) { 267 | ptr = strip_spaces(ptr + 7); 268 | char* base = ptr; 269 | ptr = strip_text(ptr); 270 | 271 | mtl_name = std::string(base, ptr); 272 | if (mtl_lib.find(mtl_name) != mtl_lib.end()) { 273 | error("material redefinition"); 274 | err_count++; 275 | } 276 | } else if (ptr[0] == 'K') { 277 | if (ptr[1] == 'a' && std::isspace(ptr[2])) { 278 | auto& mat = current_material(); 279 | mat.ka.r = std::strtof(ptr + 3, &ptr); 280 | mat.ka.g = std::strtof(ptr, &ptr); 281 | mat.ka.b = std::strtof(ptr, &ptr); 282 | } else if (ptr[1] == 'd' && std::isspace(ptr[2])) { 283 | auto& mat = current_material(); 284 | mat.kd.r = 
std::strtof(ptr + 3, &ptr); 285 | mat.kd.g = std::strtof(ptr, &ptr); 286 | mat.kd.b = std::strtof(ptr, &ptr); 287 | } else if (ptr[1] == 's' && std::isspace(ptr[2])) { 288 | auto& mat = current_material(); 289 | mat.ks.r = std::strtof(ptr + 3, &ptr); 290 | mat.ks.g = std::strtof(ptr, &ptr); 291 | mat.ks.b = std::strtof(ptr, &ptr); 292 | } else if (ptr[1] == 'e' && std::isspace(ptr[2])) { 293 | auto& mat = current_material(); 294 | mat.ke.r = std::strtof(ptr + 3, &ptr); 295 | mat.ke.g = std::strtof(ptr, &ptr); 296 | mat.ke.b = std::strtof(ptr, &ptr); 297 | } else { 298 | error("invalid command"); 299 | err_count++; 300 | } 301 | } else if (ptr[0] == 'N') { 302 | if (ptr[1] == 's' && std::isspace(ptr[2])) { 303 | auto& mat = current_material(); 304 | mat.ns = std::strtof(ptr + 3, &ptr); 305 | } else if (ptr[1] == 'i' && std::isspace(ptr[2])) { 306 | auto& mat = current_material(); 307 | mat.ni = std::strtof(ptr + 3, &ptr); 308 | } else { 309 | error("invalid command"); 310 | err_count++; 311 | } 312 | } else if (ptr[0] == 'T') { 313 | if (ptr[1] == 'f' && std::isspace(ptr[2])) { 314 | auto& mat = current_material(); 315 | mat.tf.r = std::strtof(ptr + 3, &ptr); 316 | mat.tf.g = std::strtof(ptr, &ptr); 317 | mat.tf.b = std::strtof(ptr, &ptr); 318 | } else if (ptr[1] == 'r' && std::isspace(ptr[2])) { 319 | auto& mat = current_material(); 320 | mat.tr = std::strtof(ptr + 3, &ptr); 321 | } else { 322 | error("invalid command"); 323 | err_count++; 324 | } 325 | } else if (ptr[0] == 'd' && std::isspace(ptr[1])) { 326 | auto& mat = current_material(); 327 | mat.d = std::strtof(ptr + 2, &ptr); 328 | } else if (!std::strncmp(ptr, "illum", 5) && std::isspace(ptr[5])) { 329 | auto& mat = current_material(); 330 | mat.illum = std::strtof(ptr + 6, &ptr); 331 | } else if (!std::strncmp(ptr, "map_Ka", 6) && std::isspace(ptr[6])) { 332 | auto& mat = current_material(); 333 | mat.map_ka = std::string(strip_spaces(ptr + 7)); 334 | } else if (!std::strncmp(ptr, "map_Kd", 6) && std::isspace(ptr[6])) { 335 | auto& mat = current_material(); 336 | mat.map_kd = std::string(strip_spaces(ptr + 7)); 337 | } else if (!std::strncmp(ptr, "map_Ks", 6) && std::isspace(ptr[6])) { 338 | auto& mat = current_material(); 339 | mat.map_ks = std::string(strip_spaces(ptr + 7)); 340 | } else if (!std::strncmp(ptr, "map_Ke", 6) && std::isspace(ptr[6])) { 341 | auto& mat = current_material(); 342 | mat.map_ke = std::string(strip_spaces(ptr + 7)); 343 | } else if (!std::strncmp(ptr, "map_bump", 8) && std::isspace(ptr[8])) { 344 | auto& mat = current_material(); 345 | mat.map_bump = std::string(strip_spaces(ptr + 9)); 346 | } else if (!std::strncmp(ptr, "bump", 4) && std::isspace(ptr[4])) { 347 | auto& mat = current_material(); 348 | mat.map_bump = std::string(strip_spaces(ptr + 5)); 349 | } else if (!std::strncmp(ptr, "map_d", 5) && std::isspace(ptr[5])) { 350 | auto& mat = current_material(); 351 | mat.map_d = std::string(strip_spaces(ptr + 6)); 352 | } else { 353 | error("unknown command ", ptr); 354 | err_count++; 355 | } 356 | } 357 | 358 | return (err_count == 0); 359 | } 360 | 361 | } // namespace hagrid 362 | -------------------------------------------------------------------------------- /src/load_obj.h: -------------------------------------------------------------------------------- 1 | #ifndef LOAD_OBJ_H 2 | #define LOAD_OBJ_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "vec.h" 10 | 11 | namespace hagrid { 12 | 13 | class ObjLoader { 14 | public: 15 | struct Index { 16 | int v, n, t; 17 | }; 18 | 
19 | struct Face { 20 | static constexpr int max_indices = 8; 21 | Index indices[max_indices]; 22 | int index_count; 23 | int material; 24 | }; 25 | 26 | struct Group { 27 | std::vector faces; 28 | }; 29 | 30 | struct Object { 31 | std::vector groups; 32 | }; 33 | 34 | struct Material { 35 | vec3 ka; 36 | vec3 kd; 37 | vec3 ks; 38 | vec3 ke; 39 | float ns; 40 | float ni; 41 | vec3 tf; 42 | float tr; 43 | float d; 44 | int illum; 45 | std::string map_ka; 46 | std::string map_kd; 47 | std::string map_ks; 48 | std::string map_ke; 49 | std::string map_bump; 50 | std::string map_d; 51 | }; 52 | 53 | struct File { 54 | std::vector objects; 55 | std::vector vertices; 56 | std::vector normals; 57 | std::vector texcoords; 58 | std::vector materials; 59 | std::vector mtl_libs; 60 | }; 61 | 62 | struct Path { 63 | Path() {} 64 | Path(const char* p) : Path(std::string(p)) {} 65 | Path(const std::string& p) 66 | : path(p) 67 | { 68 | std::replace(path.begin(), path.end(), '\\', '/'); 69 | auto pos = path.rfind('/'); 70 | base = (pos != std::string::npos) ? path.substr(0, pos) : "."; 71 | file = (pos != std::string::npos) ? path.substr(pos + 1) : path; 72 | } 73 | 74 | operator const std::string& () const { 75 | return path; 76 | } 77 | 78 | std::string path; 79 | std::string base; 80 | std::string file; 81 | }; 82 | 83 | typedef std::unordered_map MaterialLib; 84 | 85 | static bool load_obj(const std::string&, File&); 86 | static bool load_mtl(const std::string&, MaterialLib&); 87 | static bool load_scene(const Path& path, File& file, MaterialLib& mtl_lib) { 88 | if (!load_obj(path, file)) return false; 89 | for (auto& lib : file.mtl_libs) { 90 | // We tolerate errors in the MTL file 91 | load_mtl(path.base + "/" + lib, mtl_lib); 92 | } 93 | return true; 94 | } 95 | }; 96 | 97 | } // namespace hagrid 98 | 99 | #endif // LOAD_OBJ_H 100 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #include "build.h" 13 | #include "load_obj.h" 14 | #include "mem_manager.h" 15 | #include "traverse.h" 16 | 17 | using namespace hagrid; 18 | 19 | struct Camera { 20 | vec3 eye; 21 | vec3 right; 22 | vec3 up; 23 | vec3 dir; 24 | }; 25 | 26 | struct View { 27 | vec3 eye; 28 | vec3 forward; 29 | vec3 right; 30 | vec3 up; 31 | float dist; 32 | float rspeed; 33 | float tspeed; 34 | }; 35 | 36 | enum class DisplayMode { 37 | DEPTH, 38 | GRAY_SCALE, 39 | HEAT_MAP 40 | }; 41 | 42 | inline Camera gen_camera(const vec3& eye, const vec3& center, const vec3& up, float fov, float ratio) { 43 | Camera cam; 44 | const float f = tanf(M_PI * fov / 360); 45 | cam.dir = normalize(center - eye); 46 | cam.right = normalize(cross(cam.dir, up)) * (f * ratio); 47 | cam.up = normalize(cross(cam.right, cam.dir)) * f; 48 | cam.eye = eye; 49 | return cam; 50 | } 51 | 52 | inline void gen_rays(const Camera& cam, std::vector& rays, float clip, int w, int h) { 53 | for (int y = 0; y < h; y++) { 54 | for (int x = 0; x < w; x++) { 55 | auto kx = 2 * x / float(w) - 1; 56 | auto ky = 1 - 2 * y / float(h); 57 | auto dir = cam.dir + cam.right * kx + cam.up * ky; 58 | 59 | auto& ray = rays[y * w + x]; 60 | ray.org = cam.eye; 61 | ray.dir = dir; 62 | ray.tmin = 0.0f; 63 | ray.tmax = clip; 64 | } 65 | } 66 | } 67 | 68 | void gradient(uint8_t* color, float k) { 69 | static const vec3 g[] = { 70 | vec3(0, 
0, 255), 71 | vec3(0, 255, 255), 72 | vec3(0, 128, 0), 73 | vec3(255, 255, 0), 74 | vec3(255, 0, 0) 75 | }; 76 | constexpr int n = sizeof(g) / sizeof(g[0]); 77 | static const float s = 1.0f / n; 78 | 79 | int i = min(n - 1, int(k * n)); 80 | int j = min(n - 1, i + 1); 81 | 82 | float t = (k - i * s) / s; 83 | auto c = (1.0f - t) * g[i] + t * g[j]; 84 | 85 | color[0] = c.z; 86 | color[1] = c.y; 87 | color[2] = c.x; 88 | } 89 | 90 | template 91 | void update_surface(SDL_Surface* surf, std::vector& hits, float clip, int w, int h) { 92 | for (int y = 0, my = std::min(surf->h, h); y < my; y++) { 93 | unsigned char* row = (unsigned char*)surf->pixels + surf->pitch * y; 94 | for (int x = 0, mx = std::min(surf->w, w); x < mx; x++) { 95 | if (mode == DisplayMode::DEPTH) { 96 | uint8_t color = 255.0f * hits[y * w + x].t / clip; 97 | row[x * 4 + 0] = color; 98 | row[x * 4 + 1] = color; 99 | row[x * 4 + 2] = color; 100 | } else if (mode == DisplayMode::GRAY_SCALE) { 101 | uint8_t color = std::min(255, hits[y * w + x].id); 102 | row[x * 4 + 0] = color; 103 | row[x * 4 + 1] = color; 104 | row[x * 4 + 2] = color; 105 | } else if (mode == DisplayMode::HEAT_MAP) { 106 | gradient(row + x * 4, std::min(100, hits[y * w + x].id) / 100.0f); 107 | } 108 | row[x * 4 + 3] = 255; 109 | } 110 | } 111 | } 112 | 113 | struct ProgramOptions { 114 | std::string scene_file; 115 | std::string ray_file; 116 | float top_density, snd_density; 117 | float alpha; 118 | int exp_iters; 119 | int width, height; 120 | float clip, fov; 121 | int build_iter; 122 | int build_warmup; 123 | int bench_iter; 124 | int bench_warmup; 125 | float tmin, tmax; 126 | bool keep_alive; 127 | bool compress; 128 | bool help; 129 | 130 | ProgramOptions() 131 | : top_density(0.12f) 132 | , snd_density(2.4f) 133 | , alpha(0.995f) 134 | , exp_iters(3) 135 | , width(1024) 136 | , height(1024) 137 | , clip(0) 138 | , fov(60) 139 | , build_iter(1) 140 | , build_warmup(0) 141 | , bench_iter(1) 142 | , bench_warmup(0) 143 | , tmin(0) 144 | , tmax(std::numeric_limits::max()) 145 | , keep_alive(false) 146 | , compress(false) 147 | , help(false) 148 | {} 149 | 150 | bool parse(int argc, char** argv); 151 | 152 | private: 153 | static bool matches(const char* arg, const char* opt1, const char* opt2) { 154 | return !strcmp(arg, opt1) || !strcmp(arg, opt2); 155 | } 156 | 157 | static bool arg_exists(char** argv, int i, int argc) { 158 | if (i >= argc - 1 || argv[i + 1][0] == '-') { 159 | std::cerr << "Argument missing for: " << argv[i] << std::endl; 160 | return false; 161 | } 162 | return true; 163 | } 164 | }; 165 | 166 | bool ProgramOptions::parse(int argc, char** argv) { 167 | bool scene_parsed = false; 168 | for (int i = 1; i < argc; i++) { 169 | auto arg = argv[i]; 170 | 171 | if (arg[0] != '-') { 172 | if (scene_parsed) { 173 | std::cerr << "Cannot accept more than one model on the command line" << std::endl; 174 | return false; 175 | } 176 | scene_file = arg; 177 | scene_parsed = true; 178 | continue; 179 | } 180 | 181 | if (matches(arg, "-h", "--help")) { 182 | help = true; 183 | } else if (matches(arg, "-sx", "--width")) { 184 | if (!arg_exists(argv, i, argc)) return false; 185 | width = strtol(argv[++i], nullptr, 10); 186 | } else if (matches(arg, "-sy", "--height")) { 187 | if (!arg_exists(argv, i, argc)) return false; 188 | height = strtol(argv[++i], nullptr, 10); 189 | } else if (matches(arg, "-c", "--clip")) { 190 | if (!arg_exists(argv, i, argc)) return false; 191 | clip = strtof(argv[++i], nullptr); 192 | } else if (matches(arg, "-f", "--fov")) 
{ 193 | if (!arg_exists(argv, i, argc)) return false; 194 | fov = strtof(argv[++i], nullptr); 195 | } else if (matches(arg, "-td", "--top-density")) { 196 | if (!arg_exists(argv, i, argc)) return false; 197 | top_density = strtof(argv[++i], nullptr); 198 | } else if (matches(arg, "-sd", "--snd-density")) { 199 | if (!arg_exists(argv, i, argc)) return false; 200 | snd_density = strtof(argv[++i], nullptr); 201 | } else if (matches(arg, "-a", "--alpha")) { 202 | if (!arg_exists(argv, i, argc)) return false; 203 | alpha = strtof(argv[++i], nullptr); 204 | } else if (matches(arg, "-e", "--expansion")) { 205 | if (!arg_exists(argv, i, argc)) return false; 206 | exp_iters = strtol(argv[++i], nullptr, 10); 207 | } else if (matches(arg, "-nb", "--build-iter")) { 208 | if (!arg_exists(argv, i, argc)) return false; 209 | build_iter = strtol(argv[++i], nullptr, 10); 210 | } else if (matches(arg, "-wb", "--build-warmup")) { 211 | if (!arg_exists(argv, i, argc)) return false; 212 | build_warmup = strtol(argv[++i], nullptr, 10); 213 | } else if (matches(arg, "-k", "--keep-alive")) { 214 | keep_alive = true; 215 | } else if (matches(arg, "-z", "--compress")) { 216 | compress = true; 217 | } else if (matches(arg, "-r", "--ray-file")) { 218 | if (!arg_exists(argv, i, argc)) return false; 219 | ray_file = argv[++i]; 220 | } else if (matches(arg, "-tmin", "--tmin")) { 221 | if (!arg_exists(argv, i, argc)) return false; 222 | tmin = strtof(argv[++i], nullptr); 223 | } else if (matches(arg, "-tmax", "--tmax")) { 224 | if (!arg_exists(argv, i, argc)) return false; 225 | tmax = strtof(argv[++i], nullptr); 226 | } else if (matches(arg, "-n", "--bench-iter")) { 227 | if (!arg_exists(argv, i, argc)) return false; 228 | bench_iter = strtol(argv[++i], nullptr, 10); 229 | } else if (matches(arg, "-w", "--bench-warmup")) { 230 | if (!arg_exists(argv, i, argc)) return false; 231 | bench_warmup = strtol(argv[++i], nullptr, 10); 232 | } else { 233 | std::cerr << "Unknown argument: " << arg << std::endl; 234 | return false; 235 | } 236 | } 237 | 238 | if (!scene_parsed) { 239 | std::cerr << "No model specified" << std::endl; 240 | return false; 241 | } 242 | 243 | return true; 244 | } 245 | 246 | static bool load_model(const std::string& file_name, std::vector& tris) { 247 | ObjLoader::File obj_file; 248 | ObjLoader::MaterialLib mtl_lib; 249 | if (!ObjLoader::load_scene(file_name, obj_file, mtl_lib)) 250 | return false; 251 | 252 | for (auto& object : obj_file.objects) { 253 | for (auto& group : object.groups) { 254 | for (auto& face : group.faces) { 255 | auto v0 = obj_file.vertices[face.indices[0].v]; 256 | for (int i = 0; i < face.index_count - 2; i++) { 257 | auto v1 = obj_file.vertices[face.indices[i + 1].v]; 258 | auto v2 = obj_file.vertices[face.indices[i + 2].v]; 259 | auto e1 = v0 - v1; 260 | auto e2 = v2 - v0; 261 | auto n = cross(e1, e2); 262 | 263 | const Tri tri = { 264 | v0, n.x, 265 | e1, n.y, 266 | e2, n.z 267 | }; 268 | tris.push_back(tri); 269 | } 270 | } 271 | } 272 | } 273 | 274 | return true; 275 | } 276 | 277 | static bool load_rays(const std::string& file_name, std::vector& rays, float tmin, float tmax) { 278 | std::ifstream in(file_name, std::ifstream::binary); 279 | if (!in) return false; 280 | 281 | in.seekg(0, std::ifstream::end); 282 | int count = in.tellg() / (sizeof(float) * 6); 283 | 284 | rays.resize(count); 285 | in.seekg(0); 286 | 287 | for (int i = 0; i < count; i++) { 288 | float org_dir[6]; 289 | in.read((char*)org_dir, sizeof(float) * 6); 290 | Ray& ray = rays.data()[i]; 291 | 292 | 
ray.org = vec3(org_dir[0], org_dir[1], org_dir[2]); 293 | ray.dir = vec3(org_dir[3], org_dir[4], org_dir[5]); 294 | 295 | ray.tmin = tmin; 296 | ray.tmax = tmax; 297 | } 298 | 299 | return true; 300 | } 301 | 302 | bool handle_events(View& view, DisplayMode& display_mode) { 303 | static bool arrows[4], camera_on; 304 | SDL_Event event; 305 | while (SDL_PollEvent(&event)) { 306 | switch (event.type) { 307 | case SDL_QUIT: 308 | return true; 309 | case SDL_MOUSEBUTTONDOWN: 310 | SDL_SetRelativeMouseMode(SDL_TRUE); 311 | camera_on = true; 312 | break; 313 | case SDL_MOUSEBUTTONUP: 314 | camera_on = false; 315 | SDL_SetRelativeMouseMode(SDL_FALSE); 316 | break; 317 | case SDL_MOUSEMOTION: 318 | if (camera_on) { 319 | view.right = cross(view.forward, view.up); 320 | view.forward = rotate(view.forward, view.right, -event.motion.yrel * view.rspeed); 321 | view.forward = rotate(view.forward, view.up, -event.motion.xrel * view.rspeed); 322 | view.forward = normalize(view.forward); 323 | view.up = normalize(cross(view.right, view.forward)); 324 | } 325 | break; 326 | case SDL_KEYUP: 327 | switch (event.key.keysym.sym) { 328 | case SDLK_UP: arrows[0] = false; break; 329 | case SDLK_DOWN: arrows[1] = false; break; 330 | case SDLK_LEFT: arrows[2] = false; break; 331 | case SDLK_RIGHT: arrows[3] = false; break; 332 | } 333 | break; 334 | case SDL_KEYDOWN: 335 | switch (event.key.keysym.sym) { 336 | case SDLK_UP: arrows[0] = true; break; 337 | case SDLK_DOWN: arrows[1] = true; break; 338 | case SDLK_LEFT: arrows[2] = true; break; 339 | case SDLK_RIGHT: arrows[3] = true; break; 340 | case SDLK_KP_PLUS: view.tspeed *= 1.1f; break; 341 | case SDLK_KP_MINUS: view.tspeed /= 1.1f; break; 342 | case SDLK_c: 343 | { 344 | auto center = view.eye + view.forward * view.dist; 345 | std::cout << "Eye: " << view.eye.x << " " << view.eye.y << " " << view.eye.z << std::endl; 346 | std::cout << "Center: " << center.x << " " << center.y << " " << center.z << std::endl; 347 | std::cout << "Up: " << view.up.x << " " << view.up.y << " " << view.up.z << std::endl; 348 | } 349 | break; 350 | case SDLK_m: 351 | if (display_mode == DisplayMode::DEPTH) 352 | display_mode = DisplayMode::GRAY_SCALE; 353 | else if (display_mode == DisplayMode::GRAY_SCALE) 354 | display_mode = DisplayMode::HEAT_MAP; 355 | else if (display_mode == DisplayMode::HEAT_MAP) 356 | display_mode = DisplayMode::DEPTH; 357 | break; 358 | case SDLK_ESCAPE: 359 | return true; 360 | } 361 | break; 362 | } 363 | } 364 | 365 | if (arrows[0]) view.eye = view.eye + view.tspeed * view.forward; 366 | if (arrows[1]) view.eye = view.eye - view.tspeed * view.forward; 367 | if (arrows[2]) view.eye = view.eye - view.tspeed * view.right; 368 | if (arrows[3]) view.eye = view.eye + view.tspeed * view.right; 369 | 370 | return false; 371 | } 372 | 373 | static void usage() { 374 | std::cout << "Usage: hagrid [options] file\n" 375 | "Options:\n" 376 | " -h --help Shows this message\n" 377 | " -sx --width Sets the viewport width\n" 378 | " -sy --height Sets the viewport height\n" 379 | " -c --clip Sets the clipping distance\n" 380 | " -f --fov Sets the field of view\n" 381 | " Construction parameters:\n" 382 | " -td --top-density Sets the top-level density\n" 383 | " -sd --snd-density Sets the second-level density\n" 384 | " -a --alpha Sets the cell merging threshold\n" 385 | " -e --expansion Sets the number of expansion iterations\n" 386 | " -nb --build-iter Sets the number of build iterations\n" 387 | " -wb --build-warmup Sets the number of warmup build iterations\n" 388 | " -k 
--keep-alive Keep the buffers alive during construction\n" 389 | " -z --compress Compress the cells after construction\n" 390 | " Benchmarking:\n" 391 | " -r --ray-file Loads rays from a file and enters benchmark mode\n" 392 | " -tmin --tmin Sets the minimum distance along every ray\n" 393 | " -tmax --tmax Sets the maximum distance along every ray\n" 394 | " -n --bench-iter Sets the number of benchmarking iterations\n" 395 | " -w --bench-warmup Sets the number of benchmarking warmup iterations\n" << std::endl; 396 | } 397 | 398 | static bool benchmark(MemManager& mem, 399 | const Grid& grid, 400 | const Tri* tris, 401 | const std::string& ray_file, 402 | float tmin, float tmax, 403 | int iter, int warmup) { 404 | std::vector host_rays; 405 | if (!load_rays(ray_file, host_rays, tmin, tmax)) { 406 | std::cerr << "Cannot load ray file" << std::endl; 407 | return false; 408 | } 409 | 410 | Ray* rays = mem.alloc(host_rays.size()); 411 | Hit* hits = mem.alloc(host_rays.size()); 412 | mem.copy(rays, host_rays.data(), host_rays.size()); 413 | 414 | for (int i = 0; i < warmup; i++) { 415 | traverse_grid(grid, tris, rays, hits, host_rays.size()); 416 | } 417 | 418 | // Benchmark traversal speed 419 | std::vector timings; 420 | for (int i = 0; i < iter; i++) { 421 | auto kernel_time = profile([&] { 422 | traverse_grid(grid, tris, rays, hits, host_rays.size()); 423 | }); 424 | timings.emplace_back(kernel_time); 425 | } 426 | 427 | std::vector host_hits(host_rays.size()); 428 | mem.copy(host_hits.data(), hits, host_hits.size()); 429 | 430 | int intr = 0; 431 | for (int i = 0; i < host_rays.size(); i++) 432 | intr += (host_hits[i].id >= 0); 433 | 434 | std::sort(timings.begin(), timings.end()); 435 | const double sum = std::accumulate(timings.begin(), timings.end(), 0.0f); 436 | const double avg = sum / timings.size(); 437 | const double med = timings[timings.size() / 2]; 438 | const double min = *std::min_element(timings.begin(), timings.end()); 439 | std::cout << intr << " intersection(s)." << std::endl; 440 | std::cout << sum << "ms for " << iter << " iteration(s)." << std::endl; 441 | std::cout << host_rays.size() * iter / (1000.0 * sum) << " Mrays/sec." 
<< std::endl; 442 | std::cout << "# Average: " << avg << " ms" << std::endl; 443 | std::cout << "# Median: " << med << " ms" << std::endl; 444 | std::cout << "# Min: " << min << " ms" << std::endl; 445 | 446 | return true; 447 | } 448 | 449 | int main(int argc, char** argv) { 450 | if (argc < 2) { 451 | usage(); 452 | return 1; 453 | } 454 | 455 | ProgramOptions opts; 456 | if (!opts.parse(argc, argv)) return 1; 457 | 458 | if (opts.help) { 459 | usage(); 460 | return 0; 461 | } 462 | 463 | std::vector host_tris; 464 | if (!load_model(opts.scene_file, host_tris)) { 465 | std::cerr << "Scene cannot be loaded (file not present or contains errors)" << std::endl; 466 | return 1; 467 | } 468 | 469 | std::cout << host_tris.size() << " triangle(s)" << std::endl; 470 | 471 | MemManager mem(opts.keep_alive); 472 | auto tris = mem.alloc(host_tris.size()); 473 | mem.copy(tris, host_tris.data(), host_tris.size()); 474 | 475 | Grid grid; 476 | grid.entries = nullptr; 477 | grid.cells = nullptr; 478 | grid.ref_ids = nullptr; 479 | 480 | // Warmup iterations 481 | for (int i = 0; i < opts.build_warmup; i++) { 482 | mem.free(grid.entries); 483 | mem.free(grid.cells); 484 | mem.free(grid.ref_ids); 485 | 486 | build_grid(mem, tris, host_tris.size(), grid, opts.top_density, opts.snd_density); 487 | merge_grid(mem, grid, opts.alpha); 488 | flatten_grid(mem, grid); 489 | expand_grid(mem, grid, tris, opts.exp_iters); 490 | if (opts.compress) compress_grid(mem, grid); 491 | } 492 | 493 | // Benchmark construction speed 494 | double total_time = 0; 495 | for (int i = 0; i < opts.build_iter; i++) { 496 | mem.free(grid.entries); 497 | mem.free(grid.cells); 498 | mem.free(grid.ref_ids); 499 | 500 | auto kernel_time = profile([&] { 501 | build_grid(mem, tris, host_tris.size(), grid, opts.top_density, opts.snd_density); 502 | merge_grid(mem, grid, opts.alpha); 503 | flatten_grid(mem, grid); 504 | expand_grid(mem, grid, tris, opts.exp_iters); 505 | if (opts.compress) compress_grid(mem, grid); 506 | }); 507 | total_time += kernel_time; 508 | } 509 | if (opts.compress && !grid.small_cells) 510 | std::cerr << "Could not compress grid. Continuing with uncompressed structure." << std::endl; 511 | 512 | auto dims = grid.dims << grid.shift; 513 | std::cout << "Grid built in " << total_time / opts.build_iter << " ms (" 514 | << dims.x << "x" << dims.y << "x" << dims.z << ", " 515 | << grid.num_cells << " cells, " << grid.num_refs << " references)" << std::endl; 516 | 517 | #ifndef NDEBUG 518 | std::cout << std::endl; 519 | mem.debug_slots(); 520 | std::cout << std::endl; 521 | #endif 522 | 523 | const size_t cells_mem = grid.num_cells * (grid.small_cells ? 
sizeof(SmallCell) : sizeof(Cell)); 524 | const size_t entries_mem = grid.num_entries * sizeof(int); 525 | const size_t refs_mem = grid.num_refs * sizeof(int); 526 | const size_t tris_mem = host_tris.size() * sizeof(Tri); 527 | const size_t total_mem = cells_mem + entries_mem + refs_mem + tris_mem; 528 | std::cout << "Total memory: " << total_mem / double(1024 * 1024) << " MB" << std::endl; 529 | std::cout << "Cells: " << cells_mem / double(1024 * 1024) << " MB" << std::endl; 530 | std::cout << "Entries: " << entries_mem / double(1024 * 1024) << " MB" << std::endl; 531 | std::cout << "References: " << refs_mem / double(1024 * 1024) << " MB" << std::endl; 532 | std::cout << "Triangles: " << tris_mem / double(1024 * 1024) << " MB" << std::endl; 533 | std::cout << "Peak usage: " << mem.max_usage() / double(1024.0 * 1024.0) << " MB" << std::endl; 534 | 535 | setup_traversal(grid); 536 | 537 | // Compute a clipping distance from the bounding box of the scene 538 | auto scene_size = length(grid.bbox.extents()); 539 | auto scene_center = grid.bbox.center(); 540 | if (opts.clip <= 0) { 541 | opts.clip = scene_size; 542 | } 543 | 544 | if (opts.ray_file != "") { 545 | std::cout << "Entering benchmark mode" << std::endl; 546 | if (!benchmark(mem, grid, tris, opts.ray_file, opts.tmin, opts.tmax, opts.bench_iter, opts.bench_warmup)) 547 | return 1; 548 | return 0; 549 | } 550 | 551 | std::cout << "Entering interactive mode\n" 552 | "Commands:\n" 553 | " Mouse, arrow keys Move the camera\n" 554 | " Numpad '+'/'-' Control camera movement speed\n" 555 | " 'm' Cycle through display modes\n" 556 | " 'c' Prints the camera position" << std::endl; 557 | 558 | if (SDL_Init(SDL_INIT_VIDEO) < 0) { 559 | std::cerr << "Cannot initialize SDL" << std::endl; 560 | return 1; 561 | } 562 | 563 | SDL_Window* win = SDL_CreateWindow("HaGrid", 564 | SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, 565 | opts.width, opts.height, 566 | 0); 567 | 568 | SDL_Surface* screen = SDL_GetWindowSurface(win); 569 | 570 | SDL_FlushEvents(SDL_FIRSTEVENT, SDL_LASTEVENT); 571 | 572 | View view = { 573 | scene_center, // Eye 574 | vec3(0.0f, 0.0f, 1.0f), // Forward 575 | vec3(-1.0f, 0.0f, 0.0f), // Right 576 | vec3(0.0f, 1.0f, 0.0f), // Up 577 | 100.0f, 0.005f, // View distance, rotation speed 578 | scene_size * 0.005f // Translation speed 579 | }; 580 | 581 | size_t num_rays = opts.width * opts.height; 582 | std::vector host_hits(num_rays); 583 | std::vector host_rays(num_rays); 584 | Ray* rays = mem.alloc(num_rays); 585 | Hit* hits = mem.alloc(num_rays); 586 | double kernel_time = 0; 587 | auto ticks = SDL_GetTicks(); 588 | int frames = 0; 589 | DisplayMode display_mode = DisplayMode::DEPTH; 590 | bool done = false; 591 | while (!done) { 592 | Camera cam = gen_camera(view.eye, 593 | view.eye + view.forward * view.dist, 594 | view.up, 595 | opts.fov, 596 | (float)opts.width / (float)opts.height); 597 | 598 | gen_rays(cam, host_rays, opts.clip, opts.width, opts.height); 599 | mem.copy(rays, host_rays.data(), num_rays); 600 | 601 | kernel_time += profile([&] { traverse_grid(grid, tris, rays, hits, num_rays); }); 602 | frames++; 603 | 604 | if (SDL_GetTicks() - ticks >= 2000) { 605 | std::ostringstream caption; 606 | caption << "HaGrid [" << double(frames) * double(opts.width * opts.height) / (1000 * kernel_time) << " MRays/s]"; 607 | SDL_SetWindowTitle(win, caption.str().c_str()); 608 | ticks = SDL_GetTicks(); 609 | kernel_time = 0; 610 | frames = 0; 611 | } 612 | 613 | mem.copy(host_hits.data(), hits, num_rays); 614 | 
SDL_LockSurface(screen); 615 | if (display_mode == DisplayMode::DEPTH) 616 | update_surface(screen, host_hits, opts.clip, opts.width, opts.height); 617 | else if (display_mode == DisplayMode::GRAY_SCALE) 618 | update_surface(screen, host_hits, opts.clip, opts.width, opts.height); 619 | else 620 | update_surface(screen, host_hits, opts.clip, opts.width, opts.height); 621 | SDL_UnlockSurface(screen); 622 | 623 | SDL_UpdateWindowSurface(win); 624 | done = handle_events(view, display_mode); 625 | } 626 | 627 | SDL_DestroyWindow(win); 628 | SDL_Quit(); 629 | 630 | mem.free(rays); 631 | mem.free(hits); 632 | mem.free(tris); 633 | return 0; 634 | } 635 | -------------------------------------------------------------------------------- /src/mem_manager.cu: -------------------------------------------------------------------------------- 1 | #include "mem_manager.h" 2 | #include "common.h" 3 | 4 | namespace hagrid { 5 | 6 | HOST void MemManager::debug_slots() const { 7 | size_t total = 0; 8 | std::cout << "SLOTS: " << std::endl; 9 | for (auto& slot : slots_) { 10 | std::cout << "[" 11 | << (slot.in_use ? 'X' : ' ') 12 | << "] " 13 | << (double)slot.size / (1024.0 * 1024.0) << "MB" << std::endl; 14 | total += slot.size; 15 | } 16 | std::cout << (double)total / (1024.0 * 1024.0) << "MB total" << std::endl; 17 | } 18 | 19 | inline void dealloc_slot(Slot& slot) { 20 | if (slot.ptr) CHECK_CUDA_CALL(cudaFree(slot.ptr)); 21 | slot.size = 0; 22 | slot.ptr = nullptr; 23 | } 24 | 25 | HOST void MemManager::alloc_slot(Slot& slot, size_t size) { 26 | assert(!slot.in_use && "Buffer not deallocated properly"); 27 | if (slot.size < size) { 28 | if (slot.ptr) CHECK_CUDA_CALL(cudaFree(slot.ptr)); 29 | CHECK_CUDA_CALL(cudaMalloc(&slot.ptr, size)); 30 | usage_ = usage_ + size - slot.size; 31 | max_usage_ = std::max(usage_, max_usage_); 32 | slot.size = size; 33 | } 34 | slot.in_use = true; 35 | 36 | if (keep_ && usage_ >= max_usage_) { 37 | // Deallocate the first unused slot 38 | for (auto& slot : slots_) { 39 | if (slot.in_use) continue; 40 | usage_ = usage_ - slot.size; 41 | dealloc_slot(slot); 42 | break; 43 | } 44 | } 45 | } 46 | 47 | HOST void MemManager::free_slot(Slot& slot) { 48 | assert(slot.in_use); 49 | slot.in_use = false; 50 | if (!keep_) { 51 | usage_ = usage_ - slot.size; 52 | dealloc_slot(slot); 53 | } 54 | } 55 | 56 | HOST void MemManager::copy_dev_to_dev(void* dst, const void* src, size_t bytes) { 57 | CHECK_CUDA_CALL(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice)); 58 | } 59 | 60 | HOST void MemManager::copy_hst_to_dev(void* dst, const void* src, size_t bytes) { 61 | CHECK_CUDA_CALL(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice)); 62 | } 63 | 64 | HOST void MemManager::copy_dev_to_hst(void* dst, const void* src, size_t bytes) { 65 | CHECK_CUDA_CALL(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost)); 66 | } 67 | 68 | HOST void MemManager::zero_dev(void* ptr, size_t bytes) { 69 | CHECK_CUDA_CALL(cudaMemset(ptr, 0, bytes)); 70 | } 71 | 72 | HOST void MemManager::one_dev(void* ptr, size_t bytes) { 73 | CHECK_CUDA_CALL(cudaMemset(ptr, 0xFFFFFFFF, bytes)); 74 | } 75 | 76 | 77 | } // namespace hagrid 78 | -------------------------------------------------------------------------------- /src/mem_manager.h: -------------------------------------------------------------------------------- 1 | #ifndef MEM_MANAGER_H 2 | #define MEM_MANAGER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "common.h" 10 | 11 | namespace hagrid { 12 | 13 | /// Directions to copy 
memory from and to 14 | enum class Copy { 15 | HST_TO_DEV, 16 | DEV_TO_HST, 17 | DEV_TO_DEV 18 | }; 19 | 20 | /// A slot for a buffer in GPU memory 21 | struct Slot { 22 | Slot() 23 | : ptr(nullptr) 24 | , size(0) 25 | , in_use(false) 26 | {} 27 | 28 | void* ptr; 29 | size_t size; 30 | bool in_use; 31 | }; 32 | 33 | /// Utility class to manage memory buffers during construction 34 | class MemManager { 35 | public: 36 | /// Creates a manager object. The boolean flag controls whether 37 | /// buffers are kept or deallocated upon a call to free(). Keeping 38 | /// the buffers increases the memory usage, but speeds-up subsequent 39 | /// builds (often useful for dynamic scenes). 40 | MemManager(bool keep = false) 41 | : keep_(keep), usage_(0), max_usage_(0) 42 | {} 43 | 44 | /// Allocates a buffer, re-using allocated memory when possible 45 | template 46 | HOST T* alloc(size_t n) { 47 | auto size = n * sizeof(T); 48 | auto min_diff = std::numeric_limits::max(); 49 | int found = -1; 50 | 51 | for (int i = 0, n = slots_.size(); i < n; i++) { 52 | auto& slot = slots_[i]; 53 | if (!slot.in_use) { 54 | auto diff = std::max(size, slot.size) - std::min(size, slot.size); 55 | if (diff < min_diff) { 56 | min_diff = diff; 57 | found = i; 58 | } 59 | } 60 | } 61 | 62 | if (found < 0) { 63 | found = slots_.size(); 64 | slots_.resize(found + 1); 65 | } 66 | 67 | Slot& slot = slots_[found]; 68 | alloc_slot(slot, size); 69 | tracker_[slot.ptr] = found; 70 | return reinterpret_cast(slot.ptr); 71 | } 72 | 73 | /// Frees the contents of the given slot 74 | template 75 | HOST void free(T* ptr) { 76 | if (!ptr) return; 77 | assert(tracker_.count(ptr)); 78 | free_slot(slots_[tracker_[ptr]]); 79 | tracker_.erase(ptr); 80 | } 81 | 82 | /// Copies memory between buffers 83 | template 84 | HOST void copy(T* dst, const T* src, size_t n) { 85 | if (type == Copy::DEV_TO_DEV) copy_dev_to_dev(dst, src, sizeof(T) * n); 86 | else if (type == Copy::DEV_TO_HST) copy_dev_to_hst(dst, src, sizeof(T) * n); 87 | else if (type == Copy::HST_TO_DEV) copy_hst_to_dev(dst, src, sizeof(T) * n); 88 | } 89 | 90 | /// Fills memory with zeros 91 | template 92 | HOST void zero(T* ptr, size_t n) { zero_dev(ptr, n * sizeof(T)); } 93 | 94 | /// Fills memory with ones 95 | template 96 | HOST void one(T* ptr, size_t n) { one_dev(ptr, n * sizeof(T)); } 97 | 98 | /// Displays slots and memory usage 99 | void debug_slots() const; 100 | 101 | /// Returns the current memory usage 102 | size_t usage() const { return usage_; } 103 | /// Returns the maximum memory usage 104 | size_t max_usage() const { return max_usage_; } 105 | 106 | private: 107 | HOST void alloc_slot(Slot&, size_t); 108 | HOST void free_slot(Slot&); 109 | HOST void copy_dev_to_dev(void*, const void*, size_t); 110 | HOST void copy_dev_to_hst(void*, const void*, size_t); 111 | HOST void copy_hst_to_dev(void*, const void*, size_t); 112 | HOST void zero_dev(void*, size_t); 113 | HOST void one_dev(void*, size_t); 114 | 115 | std::unordered_map tracker_; 116 | std::vector slots_; 117 | size_t usage_, max_usage_; 118 | bool keep_; 119 | }; 120 | 121 | } // namespace hagrid 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /src/merge.cu: -------------------------------------------------------------------------------- 1 | #include "build.h" 2 | #include "parallel.cuh" 3 | 4 | namespace hagrid { 5 | 6 | /// Structure that contains buffers used during merging 7 | struct MergeBuffers { 8 | int* merge_counts; ///< Contains the number of 
references in each cell (positive if merged, otherwise negative) 9 | int* prevs, *nexts; ///< Contains the index of the previous/next neighboring cell on the merging axis (positive if merged, otherwise negative) 10 | int* ref_counts; ///< Contains the number of references per cell after merge 11 | int* cell_flags; ///< Contains 1 if the cell is kept (it is not a residue), otherwise 0 12 | int* cell_scan; ///< Scan over cell_flags (insertion position of the cells into the new cell array) 13 | int* ref_scan; ///< Scan over ref_counts (insertion position of the references into the new reference array) 14 | int* new_cell_ids; ///< Mapping between the old cell indices and the new cell indices 15 | }; 16 | 17 | static __constant__ ivec3 grid_dims; 18 | static __constant__ vec3 cell_size; 19 | static __constant__ int grid_shift; 20 | 21 | template 22 | __device__ bool aligned(const Cell& cell1, const Cell& cell2) { 23 | constexpr int axis1 = (axis + 1) % 3; 24 | constexpr int axis2 = (axis + 2) % 3; 25 | 26 | return get(cell1.max) == get(cell2.min) && 27 | get(cell1.min) == get(cell2.min) && 28 | get(cell1.min) == get(cell2.min) && 29 | get(cell1.max) == get(cell2.max) && 30 | get(cell1.max) == get(cell2.max); 31 | } 32 | 33 | /// Restricts the merges so that cells are better aligned for the next iteration 34 | __device__ __forceinline__ bool merge_allowed(int empty_mask, int pos) { 35 | auto top_level_mask = (1 << grid_shift) - 1; 36 | auto is_shifted = (pos >> grid_shift) & empty_mask; 37 | auto is_top_level = !(pos & top_level_mask); 38 | return !is_shifted || !is_top_level; 39 | } 40 | 41 | /// Computes the position of the next cell of the grid on the axis 42 | template 43 | __device__ ivec3 next_cell(const ivec3& min, const ivec3& max) { 44 | return ivec3(axis == 0 ? max.x : min.x, 45 | axis == 1 ? max.y : min.y, 46 | axis == 2 ? max.z : min.z); 47 | } 48 | 49 | /// Computes the position of the previous cell of the grid on the axis 50 | template 51 | __device__ ivec3 prev_cell(const ivec3& min) { 52 | return ivec3(axis == 0 ? min.x - 1 : min.x, 53 | axis == 1 ? min.y - 1 : min.y, 54 | axis == 2 ? min.z - 1 : min.z); 55 | } 56 | 57 | /// Counts the number of elements in the union of two sorted arrays 58 | __device__ __forceinline__ int count_union(const int* __restrict__ p0, int c0, 59 | const int* __restrict__ p1, int c1) { 60 | int i = 0, j = 0, c = 0; 61 | while (i < c0 & j < c1) { 62 | auto a = p0[i]; 63 | auto b = p1[j]; 64 | i += (a <= b); 65 | j += (a >= b); 66 | c++; 67 | } 68 | return c + (c1 - j) + (c0 - i); 69 | } 70 | 71 | /// Merges the two sorted reference arrays 72 | __device__ __forceinline__ void merge_refs(const int* __restrict__ p0, int c0, 73 | const int* __restrict__ p1, int c1, 74 | int* __restrict__ q) { 75 | int i = 0; 76 | int j = 0; 77 | while (i < c0 && j < c1) { 78 | auto a = p0[i]; 79 | auto b = p1[j]; 80 | *(q++) = (a < b) ? a : b; 81 | i += (a <= b); 82 | j += (a >= b); 83 | } 84 | auto k = i < c0 ? i : j; 85 | auto c = i < c0 ? c0 : c1; 86 | auto p = i < c0 ? 
p0 : p1; 87 | while (k < c) *(q++) = p[k++]; 88 | } 89 | 90 | /// Computes the number of references per cell after the merge 91 | template 92 | __global__ void compute_merge_counts(const Entry* __restrict__ entries, 93 | const Cell* __restrict__ cells, 94 | const int* __restrict__ refs, 95 | int* __restrict__ merge_counts, 96 | int* __restrict__ nexts, 97 | int* __restrict__ prevs, 98 | int empty_mask, 99 | int num_cells) { 100 | int id = threadIdx.x + blockDim.x * blockIdx.x; 101 | if (id >= num_cells) return; 102 | 103 | static constexpr auto unit_cost = 1.0f; 104 | 105 | auto cell1 = load_cell(cells + id); 106 | auto next_pos = next_cell(cell1.min, cell1.max); 107 | int count = -(cell1.end - cell1.begin + 1); 108 | int next_id = -1; 109 | 110 | if (merge_allowed(empty_mask, get(cell1.min)) && 111 | get(next_pos) < get(grid_dims)) { 112 | next_id = lookup_entry(entries, grid_shift, grid_dims >> grid_shift, next_pos); 113 | auto cell2 = load_cell(cells + next_id); 114 | 115 | if (aligned(cell1, cell2)) { 116 | auto e1 = vec3(cell1.max - cell1.min) * cell_size; 117 | auto e2 = vec3(cell2.max - cell2.min) * cell_size; 118 | auto a1 = e1.x * (e1.y + e1.z) + e1.y * e1.z; 119 | auto a2 = e2.x * (e2.y + e2.z) + e2.y * e2.z; 120 | auto a = a1 + a2 - get<(axis + 1) % 3>(e1) * get<(axis + 2) % 3>(e1); 121 | 122 | int n1 = cell1.end - cell1.begin; 123 | int n2 = cell2.end - cell2.begin; 124 | auto c1 = a1 * (n1 + unit_cost); 125 | auto c2 = a2 * (n2 + unit_cost); 126 | // Early exit test: there is a minimum of max(n1, n2) 127 | // primitives in the union of the two cells 128 | if (a * (max(n1, n2) + unit_cost) <= c1 + c2) { 129 | auto n = count_union(refs + cell1.begin, n1, 130 | refs + cell2.begin, n2); 131 | auto c = a * (n + unit_cost); 132 | if (c <= c1 + c2) count = n; 133 | } 134 | } 135 | } 136 | 137 | merge_counts[id] = count; 138 | 139 | next_id = count >= 0 ? next_id : -1; 140 | nexts[id] = next_id; 141 | if (next_id >= 0) prevs[next_id] = id; 142 | } 143 | 144 | /// Traverses the merge chains and mark the cells at odd positions as residue 145 | template 146 | __global__ void compute_cell_flags(const int* __restrict__ nexts, 147 | const int* __restrict__ prevs, 148 | int* __restrict__ cell_flags, 149 | int num_cells) { 150 | int id = threadIdx.x + blockDim.x * blockIdx.x; 151 | if (id >= num_cells) return; 152 | 153 | // If the previous cell does not exist or does not want to merge with this cell 154 | if (prevs[id] < 0) { 155 | int next_id = nexts[id]; 156 | cell_flags[id] = 1; 157 | 158 | // If this cell wants to merge with the next 159 | if (next_id >= 0) { 160 | int count = 1; 161 | 162 | // Traverse the merge chain 163 | do { 164 | cell_flags[next_id] = count % 2 ? 0 : 1; 165 | next_id = nexts[next_id]; 166 | count++; 167 | } while (next_id >= 0); 168 | } 169 | } 170 | } 171 | 172 | /// Computes the number of new references per cell 173 | __global__ void compute_ref_counts(const int* __restrict__ merge_counts, 174 | const int* __restrict__ cell_flags, 175 | int* __restrict__ ref_counts, 176 | int num_cells) { 177 | int id = threadIdx.x + blockDim.x * blockIdx.x; 178 | if (id >= num_cells) return; 179 | 180 | int count = 0; 181 | if (cell_flags[id]) { 182 | const int merged = merge_counts[id]; 183 | count = merged >= 0 ? 
merged : -(merged + 1); 184 | } 185 | ref_counts[id] = count; 186 | } 187 | 188 | /// Performs the merge 189 | template 190 | __global__ void merge(const Entry* __restrict__ entries, 191 | const Cell* __restrict__ cells, 192 | const int* __restrict__ refs, 193 | const int* __restrict__ cell_scan, 194 | const int* __restrict__ ref_scan, 195 | const int* __restrict__ merge_counts, 196 | int* __restrict__ new_cell_ids, 197 | Cell* __restrict__ new_cells, 198 | int* __restrict__ new_refs, 199 | int num_cells) { 200 | int id = threadIdx.x + blockDim.x * blockIdx.x; 201 | 202 | bool valid = id < num_cells; 203 | int new_id = valid ? cell_scan[id] : 0; 204 | valid &= cell_scan[id + 1] > new_id; 205 | 206 | int cell_begin = 0, cell_end = 0; 207 | int next_begin = 0, next_end = 0; 208 | int new_refs_begin; 209 | 210 | if (valid) { 211 | auto cell = load_cell(cells + id); 212 | int merge_count = merge_counts[id]; 213 | 214 | new_refs_begin = ref_scan[id]; 215 | new_cell_ids[id] = new_id; 216 | cell_begin = cell.begin; 217 | cell_end = cell.end; 218 | 219 | ivec3 new_min; 220 | ivec3 new_max; 221 | int new_refs_end; 222 | if (merge_count >= 0) { 223 | // Do the merge and store the references into the new array 224 | auto next_id = lookup_entry(entries, grid_shift, grid_dims >> grid_shift, next_cell(cell.min, cell.max)); 225 | auto next_cell = load_cell(cells + next_id); 226 | next_begin = next_cell.begin; 227 | next_end = next_cell.end; 228 | 229 | // Make the next cell point to the merged one 230 | new_cell_ids[next_id] = new_id; 231 | 232 | new_min = min(next_cell.min, cell.min); 233 | new_max = max(next_cell.max, cell.max); 234 | new_refs_end = new_refs_begin + merge_count; 235 | } else { 236 | new_min = cell.min; 237 | new_max = cell.max; 238 | new_refs_end = new_refs_begin + (cell_end - cell_begin); 239 | } 240 | 241 | store_cell(new_cells + new_id, Cell(new_min, new_refs_begin, 242 | new_max, new_refs_end)); 243 | } 244 | 245 | int warp_id = threadIdx.x % 32; 246 | bool merge = next_begin < next_end; 247 | 248 | // Process consecutive ranges of cells that do not want to be merged 249 | static constexpr unsigned all_mask = unsigned(-1); 250 | uint32_t merge_mask = __ballot_sync(all_mask, valid & !merge); 251 | uint32_t full_mask = __ballot_sync(all_mask, cell_begin < cell_end); 252 | while (merge_mask) { 253 | // Find the range of cells [first_bit, last_bit] that are not merged 254 | auto first_bit = __ffs(merge_mask) - 1; 255 | auto shift_mask = ~(merge_mask >> first_bit); 256 | auto last_bit = shift_mask ? 
__ffs(shift_mask) + first_bit - 2 : first_bit; 257 | merge_mask &= ~((1 << (last_bit + 1)) - 1); 258 | 259 | // Skip cells that do not contain references 260 | shift_mask = full_mask >> first_bit; 261 | if (!shift_mask) continue; 262 | first_bit += __ffs(shift_mask) - 1; 263 | last_bit -= __clz(full_mask << (31 - last_bit)); 264 | 265 | auto begin = __shfl_sync(all_mask, cell_begin, first_bit); 266 | auto end = __shfl_sync(all_mask, cell_end, last_bit); 267 | auto new_begin = __shfl_sync(all_mask, new_refs_begin, first_bit); 268 | for (int i = begin + warp_id, j = new_begin + warp_id; i < end; i += 32, j += 32) 269 | new_refs[j] = refs[i]; 270 | } 271 | 272 | // Merge references if required 273 | if (merge) { 274 | merge_refs(refs + cell_begin, cell_end - cell_begin, 275 | refs + next_begin, next_end - next_begin, 276 | new_refs + new_refs_begin); 277 | } 278 | } 279 | 280 | /// Maps the old cell indices in the voxel map to the new ones 281 | __global__ void remap_entries(Entry* __restrict__ entries, 282 | const int* __restrict__ new_cell_ids, 283 | int num_entries) { 284 | int id = threadIdx.x + blockDim.x * blockIdx.x; 285 | 286 | if (id < num_entries) { 287 | auto entry = entries[id]; 288 | if (entry.log_dim == 0) entries[id] = make_entry(0, new_cell_ids[entry.begin]); 289 | } 290 | } 291 | 292 | template 293 | void merge_iteration(MemManager& mem, Grid& grid, Cell*& new_cells, int*& new_refs, int empty_mask, MergeBuffers& bufs) { 294 | Parallel par(mem); 295 | 296 | int num_cells = grid.num_cells; 297 | int num_entries = grid.num_entries; 298 | auto cells = grid.cells; 299 | auto refs = grid.ref_ids; 300 | auto entries = grid.entries; 301 | 302 | mem.one(bufs.prevs, num_cells); 303 | compute_merge_counts<<>>(entries, cells, refs, bufs.merge_counts, bufs.nexts, bufs.prevs, empty_mask, num_cells); 304 | DEBUG_SYNC(); 305 | compute_cell_flags<<>>(bufs.nexts, bufs.prevs, bufs.cell_flags, num_cells); 306 | DEBUG_SYNC(); 307 | compute_ref_counts<<>>(bufs.merge_counts, bufs.cell_flags, bufs.ref_counts, num_cells); 308 | DEBUG_SYNC(); 309 | 310 | int num_new_refs = par.scan(bufs.ref_counts, num_cells + 1, bufs.ref_scan); 311 | int num_new_cells = par.scan(bufs.cell_flags, num_cells + 1, bufs.cell_scan); 312 | 313 | merge<<>>(entries, cells, refs, 314 | bufs.cell_scan, bufs.ref_scan, 315 | bufs.merge_counts, bufs.new_cell_ids, 316 | new_cells, new_refs, 317 | num_cells); 318 | DEBUG_SYNC(); 319 | remap_entries<<>>(entries, bufs.new_cell_ids, num_entries); 320 | DEBUG_SYNC(); 321 | 322 | std::swap(new_cells, cells); 323 | std::swap(new_refs, refs); 324 | 325 | grid.cells = cells; 326 | grid.ref_ids = refs; 327 | grid.num_cells = num_new_cells; 328 | grid.num_refs = num_new_refs; 329 | } 330 | 331 | void merge_grid(MemManager& mem, Grid& grid, float alpha) { 332 | MergeBuffers bufs; 333 | 334 | auto new_cells = mem.alloc(grid.num_cells); 335 | auto new_refs = mem.alloc (grid.num_refs); 336 | 337 | size_t buf_size = grid.num_cells + 1; 338 | buf_size = buf_size % 4 ? 
buf_size + 4 - buf_size % 4 : buf_size; 339 | 340 | bufs.merge_counts = mem.alloc(buf_size); 341 | bufs.ref_counts = mem.alloc(buf_size); 342 | bufs.cell_flags = mem.alloc(buf_size); 343 | bufs.cell_scan = mem.alloc(buf_size); 344 | bufs.ref_scan = mem.alloc(buf_size); 345 | bufs.new_cell_ids = bufs.cell_flags; 346 | bufs.prevs = bufs.cell_scan; 347 | bufs.nexts = bufs.ref_scan; 348 | 349 | auto extents = grid.bbox.extents(); 350 | auto dims = grid.dims << grid.shift; 351 | auto cell_size = extents / vec3(dims); 352 | 353 | set_global(hagrid::grid_dims, dims); 354 | set_global(hagrid::cell_size, cell_size); 355 | set_global(hagrid::grid_shift, grid.shift); 356 | 357 | if (alpha > 0) { 358 | int prev_num_cells = 0, iter = 0; 359 | do { 360 | prev_num_cells = grid.num_cells; 361 | auto mask = iter > 3 ? 0 : (1 << (iter + 1)) - 1; 362 | merge_iteration<0>(mem, grid, new_cells, new_refs, mask, bufs); 363 | merge_iteration<1>(mem, grid, new_cells, new_refs, mask, bufs); 364 | merge_iteration<2>(mem, grid, new_cells, new_refs, mask, bufs); 365 | iter++; 366 | } while (grid.num_cells < alpha * prev_num_cells); 367 | } 368 | 369 | mem.free(bufs.merge_counts); 370 | mem.free(bufs.ref_counts); 371 | mem.free(bufs.cell_flags); 372 | mem.free(bufs.cell_scan); 373 | mem.free(bufs.ref_scan); 374 | 375 | mem.free(new_cells); 376 | mem.free(new_refs); 377 | } 378 | 379 | } // namespace hagrid 380 | -------------------------------------------------------------------------------- /src/parallel.cuh: -------------------------------------------------------------------------------- 1 | #ifndef PARALLEL_CUH 2 | #define PARALLEL_CUH 3 | 4 | #include 5 | #include 6 | #include "mem_manager.h" 7 | #include "common.h" 8 | 9 | namespace hagrid { 10 | 11 | /// Parallel primitives (mostly a wrapper around CUB) 12 | class Parallel { 13 | private: 14 | template 15 | struct ResultType { 16 | typedef typename std::remove_reference::type Type; 17 | }; 18 | 19 | public: 20 | Parallel(MemManager& mem) 21 | : mem_(mem) 22 | {} 23 | 24 | /// Creates a transformation iterator 25 | template 26 | auto transform(InputIt values, F f) -> cub::TransformInputIterator { 27 | return cub::TransformInputIterator(values, f); 28 | } 29 | 30 | /// Computes the exclusive sum of the given array, and returns the total 31 | template 32 | auto scan(InputIt values, int n, OutputIt result) -> typename ResultType::Type { 33 | typedef typename ResultType::Type T; 34 | size_t required_bytes; 35 | CHECK_CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, required_bytes, values, result, n)); 36 | char* tmp_storage = mem_.alloc(required_bytes); 37 | CHECK_CUDA_CALL(cub::DeviceScan::ExclusiveSum(tmp_storage, required_bytes, values, result, n)); 38 | mem_.free(tmp_storage); 39 | T total; 40 | CHECK_CUDA_CALL(cudaMemcpy(&total, result + n - 1, sizeof(T), cudaMemcpyDeviceToHost)); 41 | return total; 42 | } 43 | 44 | /// Computes the reduction of the given operator over the given array array 45 | template 46 | auto reduce(InputIt values, int n, OutputIt result, F f, typename ResultType::Type init = typename ResultType::Type()) -> typename ResultType::Type { 47 | typedef typename ResultType::Type T; 48 | size_t required_bytes; 49 | CHECK_CUDA_CALL(cub::DeviceReduce::Reduce(nullptr, required_bytes, values, result, n, f, init)); 50 | char* tmp_storage = mem_.alloc(required_bytes); 51 | CHECK_CUDA_CALL(cub::DeviceReduce::Reduce(tmp_storage, required_bytes, values, result, n, f, init)); 52 | mem_.free(tmp_storage); 53 | T host_result; 54 | 
CHECK_CUDA_CALL(cudaMemcpy(&host_result, result, sizeof(T), cudaMemcpyDeviceToHost)); 55 | return host_result; 56 | } 57 | 58 | /// Computes a partition of the given set according to an array of flags, returns the number of elements in first half 59 | template 60 | int partition(InputIt values, OutputIt result, int n, FlagIt flags) { 61 | size_t required_bytes; 62 | CHECK_CUDA_CALL(cub::DevicePartition::Flagged(nullptr, required_bytes, values, flags, result, (int*)nullptr, n)); 63 | required_bytes += 4 - required_bytes % 4; // Align storage 64 | char* tmp_storage = mem_.alloc(required_bytes + sizeof(int)); 65 | int* count_ptr = reinterpret_cast(tmp_storage + required_bytes); 66 | CHECK_CUDA_CALL(cub::DevicePartition::Flagged(tmp_storage, required_bytes, values, flags, result, count_ptr, n)); 67 | int count; 68 | CHECK_CUDA_CALL(cudaMemcpy(&count, count_ptr, sizeof(int), cudaMemcpyDeviceToHost)); 69 | mem_.free(tmp_storage); 70 | return count; 71 | } 72 | 73 | /// Computes a partition of the given set according to an array of flags, returns the number of elements in first half 74 | template 75 | void sort_pairs(Key* keys_in, Value* values_in, Key*& keys_out, Value*& values_out, int n, int bits = sizeof(Key) * 8) { 76 | size_t required_bytes; 77 | cub::DoubleBuffer keys_buf(keys_in, keys_out); 78 | cub::DoubleBuffer values_buf(values_in, values_out); 79 | CHECK_CUDA_CALL(cub::DeviceRadixSort::SortPairs(nullptr, required_bytes, keys_buf, values_buf, n, 0, bits)); 80 | char* tmp_storage = mem_.alloc(required_bytes + sizeof(int)); 81 | CHECK_CUDA_CALL(cub::DeviceRadixSort::SortPairs(tmp_storage, required_bytes, keys_buf, values_buf, n, 0, bits)); 82 | mem_.free(tmp_storage); 83 | keys_out = keys_buf.Current(); 84 | values_out = values_buf.Current(); 85 | } 86 | 87 | private: 88 | MemManager& mem_; 89 | }; 90 | 91 | } // namespace hagrid 92 | 93 | #endif // PARALLEL_CUH 94 | -------------------------------------------------------------------------------- /src/prims.h: -------------------------------------------------------------------------------- 1 | #ifndef PRIMITIVES_H 2 | #define PRIMITIVES_H 3 | 4 | #include 5 | #include 6 | #include "vec.h" 7 | #include "bbox.h" 8 | #include "ray.h" 9 | 10 | namespace hagrid { 11 | 12 | /// Triangle (point + edges + normal) 13 | struct Tri { 14 | vec3 v0; float nx; 15 | vec3 e1; float ny; 16 | vec3 e2; float nz; 17 | 18 | HOST DEVICE Tri() {} 19 | HOST DEVICE Tri(const vec3& v0, float nx, 20 | const vec3& e1, float ny, 21 | const vec3& e2, float nz) 22 | : v0(v0), nx(nx) 23 | , e1(e1), ny(ny) 24 | , e2(e2), nz(nz) 25 | {} 26 | 27 | HOST DEVICE BBox bbox() const { 28 | auto v1 = v0 - e1; 29 | auto v2 = v0 + e2; 30 | return BBox(min(v0, min(v1, v2)), max(v0, max(v1, v2))); 31 | } 32 | 33 | template 34 | HOST DEVICE vec2 clipped_bounds(float min1, float max1, float min2, float max2) const { 35 | auto e3 = e1 + e2; 36 | auto v1 = v0 - e1; 37 | auto v2 = v0 + e2; 38 | 39 | vec2 bounds = vec2(FLT_MAX, FLT_MIN); 40 | 41 | if (get(v0) >= min1 && get(v0) <= max1 && 42 | get(v0) >= min2 && get(v0) <= max2) { 43 | bounds.x = min(get(v0), bounds.x); 44 | bounds.y = max(get(v0), bounds.y); 45 | } 46 | if (get(v1) >= min1 && get(v1) <= max1 && 47 | get(v1) >= min2 && get(v1) <= max2) { 48 | bounds.x = min(get(v1), bounds.x); 49 | bounds.y = max(get(v1), bounds.y); 50 | } 51 | if (get(v2) >= min1 && get(v2) <= max1 && 52 | get(v2) >= min2 && get(v2) <= max2) { 53 | bounds.x = min(get(v2), bounds.x); 54 | bounds.y = max(get(v2), bounds.y); 55 | } 56 | 57 | auto inv1_e1 = 
1.0f / get(e1); 58 | auto inv1_e2 = 1.0f / get(e2); 59 | auto inv1_e3 = 1.0f / get(e3); 60 | 61 | // Clip on min1 62 | { 63 | auto tmin1_e1 = (get(v0) - min1) * inv1_e1; 64 | auto tmin1_e2 = (min1 - get(v0)) * inv1_e2; 65 | auto tmin1_e3 = (min1 - get(v1)) * inv1_e3; 66 | if (tmin1_e1 <= 1 && tmin1_e1 >= 0) { 67 | auto p = get(v0) - get(e1) * tmin1_e1; 68 | bounds.x = min(p, bounds.x); 69 | bounds.y = max(p, bounds.y); 70 | } 71 | if (tmin1_e2 <= 1 && tmin1_e2 >= 0) { 72 | auto p = get(v0) + get(e2) * tmin1_e2; 73 | bounds.x = min(p, bounds.x); 74 | bounds.y = max(p, bounds.y); 75 | } 76 | if (tmin1_e3 <= 1 && tmin1_e3 >= 0) { 77 | auto p = get(v1) + get(e3) * tmin1_e3; 78 | bounds.x = min(p, bounds.x); 79 | bounds.y = max(p, bounds.y); 80 | } 81 | } 82 | 83 | // Clip on max1 84 | { 85 | auto tmax1_e1 = (get(v0) - max1) * inv1_e1; 86 | auto tmax1_e2 = (max1 - get(v0)) * inv1_e2; 87 | auto tmax1_e3 = (max1 - get(v1)) * inv1_e3; 88 | if (tmax1_e1 <= 1 && tmax1_e1 >= 0) { 89 | auto p = get(v0) - get(e1) * tmax1_e1; 90 | bounds.x = min(p, bounds.x); 91 | bounds.y = max(p, bounds.y); 92 | } 93 | if (tmax1_e2 <= 1 && tmax1_e2 >= 0) { 94 | auto p = get(v0) + get(e2) * tmax1_e2; 95 | bounds.x = min(p, bounds.x); 96 | bounds.y = max(p, bounds.y); 97 | } 98 | if (tmax1_e3 <= 1 && tmax1_e3 >= 0) { 99 | auto p = get(v1) + get(e3) * tmax1_e3; 100 | bounds.x = min(p, bounds.x); 101 | bounds.y = max(p, bounds.y); 102 | } 103 | } 104 | 105 | auto inv2_e1 = 1.0f / get(e1); 106 | auto inv2_e2 = 1.0f / get(e2); 107 | auto inv2_e3 = 1.0f / get(e3); 108 | 109 | // Clip on min2 110 | { 111 | auto tmin2_e1 = (get(v0) - min2) * inv2_e1; 112 | auto tmin2_e2 = (min2 - get(v0)) * inv2_e2; 113 | auto tmin2_e3 = (min2 - get(v1)) * inv2_e3; 114 | if (tmin2_e1 <= 1 && tmin2_e1 >= 0) { 115 | auto p = get(v0) - get(e1) * tmin2_e1; 116 | bounds.x = min(p, bounds.x); 117 | bounds.y = max(p, bounds.y); 118 | } 119 | if (tmin2_e2 <= 1 && tmin2_e2 >= 0) { 120 | auto p = get(v0) + get(e2) * tmin2_e2; 121 | bounds.x = min(p, bounds.x); 122 | bounds.y = max(p, bounds.y); 123 | } 124 | if (tmin2_e3 <= 1 && tmin2_e3 >= 0) { 125 | auto p = get(v1) + get(e3) * tmin2_e3; 126 | bounds.x = min(p, bounds.x); 127 | bounds.y = max(p, bounds.y); 128 | } 129 | } 130 | 131 | // Clip on max2 132 | { 133 | auto tmax2_e1 = (get(v0) - max2) * inv2_e1; 134 | auto tmax2_e2 = (max2 - get(v0)) * inv2_e2; 135 | auto tmax2_e3 = (max2 - get(v1)) * inv2_e3; 136 | if (tmax2_e1 <= 1 && tmax2_e1 >= 0) { 137 | auto p = get(v0) - get(e1) * tmax2_e1; 138 | bounds.x = min(p, bounds.x); 139 | bounds.y = max(p, bounds.y); 140 | } 141 | if (tmax2_e2 <= 1 && tmax2_e2 >= 0) { 142 | auto p = get(v0) + get(e2) * tmax2_e2; 143 | bounds.x = min(p, bounds.x); 144 | bounds.y = max(p, bounds.y); 145 | } 146 | if (tmax2_e3 <= 1 && tmax2_e3 >= 0) { 147 | auto p = get(v1) + get(e3) * tmax2_e3; 148 | bounds.x = min(p, bounds.x); 149 | bounds.y = max(p, bounds.y); 150 | } 151 | } 152 | 153 | return bounds; 154 | } 155 | 156 | HOST DEVICE vec3 normal() const { 157 | return vec3(nx, ny, nz); 158 | } 159 | }; 160 | 161 | HOST DEVICE inline bool plane_overlap_box(const vec3& n, float d, const vec3& min, const vec3& max) { 162 | auto first = vec3(n.x > 0 ? min.x : max.x, 163 | n.y > 0 ? min.y : max.y, 164 | n.z > 0 ? min.z : max.z); 165 | 166 | auto last = vec3(n.x <= 0 ? min.x : max.x, 167 | n.y <= 0 ? min.y : max.y, 168 | n.z <= 0 ? 
min.z : max.z); 169 | 170 | auto d0 = dot(n, first) - d; 171 | auto d1 = dot(n, last) - d; 172 | #if __CUDACC_VER_MAJOR__ == 7 173 | union { int i; float f; } u0 = { .f = d0 }; 174 | union { int i; float f; } u1 = { .f = d1 }; 175 | // Equivalent to d1 * d0 <= 0.0f (CUDA 7.0 bug) 176 | return (((u0.i ^ u1.i) & 0x80000000) | (d0 == 0.0f) | (d1 == 0.0f)) != 0; 177 | #else 178 | return d1 * d0 <= 0.0f; 179 | #endif 180 | } 181 | 182 | HOST DEVICE inline bool axis_test_x(const vec3& half_size, 183 | const vec3& e, const vec3& f, 184 | const vec3& v0, const vec3& v1) { 185 | auto p0 = e.y * v0.z - e.z * v0.y; 186 | auto p1 = e.y * v1.z - e.z * v1.y; 187 | auto rad = f.z * half_size.y + f.y * half_size.z; 188 | return fmin(p0, p1) > rad | fmax(p0, p1) < -rad; 189 | } 190 | 191 | HOST DEVICE inline bool axis_test_y(const vec3& half_size, 192 | const vec3& e, const vec3& f, 193 | const vec3& v0, const vec3& v1) { 194 | auto p0 = e.z * v0.x - e.x * v0.z; 195 | auto p1 = e.z * v1.x - e.x * v1.z; 196 | auto rad = f.z * half_size.x + f.x * half_size.z; 197 | return fmin(p0, p1) > rad | fmax(p0, p1) < -rad; 198 | } 199 | 200 | HOST DEVICE inline bool axis_test_z(const vec3& half_size, 201 | const vec3& e, const vec3& f, 202 | const vec3& v0, const vec3& v1) { 203 | auto p0 = e.x * v0.y - e.y * v0.x; 204 | auto p1 = e.x * v1.y - e.y * v1.x; 205 | auto rad = f.y * half_size.x + f.x * half_size.y; 206 | return fmin(p0, p1) > rad | fmax(p0, p1) < -rad; 207 | } 208 | 209 | template 210 | HOST DEVICE inline bool intersect_tri_box(const vec3& v0, const vec3& e1, const vec3& e2, const vec3& n, const vec3& min, const vec3& max) { 211 | if (!plane_overlap_box(n, dot(v0, n), min, max)) 212 | return false; 213 | 214 | auto v1 = v0 - e1; 215 | auto v2 = v0 + e2; 216 | if (bounds_check) { 217 | auto min_x = fmin(v0.x, fmin(v1.x, v2.x)); 218 | auto max_x = fmax(v0.x, fmax(v1.x, v2.x)); 219 | if (min_x > max.x | max_x < min.x) return false; 220 | 221 | auto min_y = fmin(v0.y, fmin(v1.y, v2.y)); 222 | auto max_y = fmax(v0.y, fmax(v1.y, v2.y)); 223 | if (min_y > max.y | max_y < min.y) return false; 224 | 225 | auto min_z = fmin(v0.z, fmin(v1.z, v2.z)); 226 | auto max_z = fmax(v0.z, fmax(v1.z, v2.z)); 227 | if (min_z > max.z | max_z < min.z) return false; 228 | } 229 | 230 | if (cross_axes) { 231 | auto center = (max + min) * 0.5f; 232 | auto half_size = (max - min) * 0.5f; 233 | 234 | auto w0 = v0 - center; 235 | auto w1 = v1 - center; 236 | auto w2 = v2 - center; 237 | 238 | auto f1 = vec3(fabs(e1.x), fabs(e1.y), fabs(e1.z)); 239 | if (axis_test_x(half_size, e1, f1, w0, w2) || 240 | axis_test_y(half_size, e1, f1, w0, w2) || 241 | axis_test_z(half_size, e1, f1, w1, w2)) 242 | return false; 243 | 244 | auto f2 = vec3(fabs(e2.x), fabs(e2.y), fabs(e2.z)); 245 | if (axis_test_x(half_size, e2, f2, w0, w1) || 246 | axis_test_y(half_size, e2, f2, w0, w1) || 247 | axis_test_z(half_size, e2, f2, w1, w2)) 248 | return false; 249 | 250 | auto e3 = e1 + e2; 251 | 252 | auto f3 = vec3(fabs(e3.x), fabs(e3.y), fabs(e3.z)); 253 | if (axis_test_x(half_size, e3, f3, w0, w2) || 254 | axis_test_y(half_size, e3, f3, w0, w2) || 255 | axis_test_z(half_size, e3, f3, w0, w1)) 256 | return false; 257 | } 258 | 259 | return true; 260 | } 261 | 262 | HOST DEVICE inline bool intersect_prim_cell(const Tri& tri, const BBox& bbox) { 263 | return intersect_tri_box(tri.v0, tri.e1, tri.e2, tri.normal(), bbox.min, bbox.max); 264 | } 265 | 266 | HOST DEVICE inline bool intersect_prim_ray(const Tri& tri, const Ray& ray, int id, Hit& hit) { 267 | // Moeller 
Trumbore 268 | auto n = tri.normal(); 269 | 270 | auto c = tri.v0 - ray.org; 271 | auto r = cross(ray.dir, c); 272 | auto det = dot(n, ray.dir); 273 | auto abs_det = fabs(det); 274 | 275 | auto u = prodsign(dot(r, tri.e2), det); 276 | auto v = prodsign(dot(r, tri.e1), det); 277 | auto w = abs_det - u - v; 278 | 279 | auto eps = 1e-9f; 280 | if (u >= -eps && v >= -eps && w >= -eps) { 281 | auto t = prodsign(dot(n, c), det); 282 | if (t >= abs_det * ray.tmin && abs_det * ray.tmax > t) { 283 | auto inv_det = 1.0f / abs_det; 284 | hit.t = t * inv_det; 285 | #ifdef COMPUTE_UVS 286 | hit.u = u * inv_det; 287 | hit.v = v * inv_det; 288 | #endif 289 | hit.id = id; 290 | return true; 291 | } 292 | } 293 | 294 | return false; 295 | } 296 | 297 | #ifdef __NVCC__ 298 | __device__ __forceinline__ Tri load_prim(const Tri* tri_ptr) { 299 | const float4* ptr = (const float4*)tri_ptr; 300 | auto tri0 = ptr[0]; 301 | auto tri1 = ptr[1]; 302 | auto tri2 = ptr[2]; 303 | return Tri(vec3(tri0.x, tri0.y, tri0.z), tri0.w, 304 | vec3(tri1.x, tri1.y, tri1.z), tri1.w, 305 | vec3(tri2.x, tri2.y, tri2.z), tri2.w); 306 | } 307 | #endif 308 | 309 | } // namespace hagrid 310 | 311 | #endif // PRIMITIVES_H 312 | -------------------------------------------------------------------------------- /src/profile.cu: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | namespace hagrid { 4 | 5 | __host__ float profile(std::function f) { 6 | cudaEvent_t start_kernel, end_kernel; 7 | CHECK_CUDA_CALL(cudaEventCreate(&start_kernel)); 8 | CHECK_CUDA_CALL(cudaEventCreate(&end_kernel)); 9 | CHECK_CUDA_CALL(cudaEventRecord(start_kernel)); 10 | f(); 11 | CHECK_CUDA_CALL(cudaEventRecord(end_kernel)); 12 | CHECK_CUDA_CALL(cudaEventSynchronize(end_kernel)); 13 | float kernel_time = 0; 14 | CHECK_CUDA_CALL(cudaEventElapsedTime(&kernel_time, start_kernel, end_kernel)); 15 | CHECK_CUDA_CALL(cudaEventDestroy(start_kernel)); 16 | CHECK_CUDA_CALL(cudaEventDestroy(end_kernel)); 17 | return kernel_time; 18 | } 19 | 20 | } // namespace hagrid 21 | -------------------------------------------------------------------------------- /src/ray.h: -------------------------------------------------------------------------------- 1 | #ifndef RAY_H 2 | #define RAY_H 3 | 4 | #include "vec.h" 5 | 6 | namespace hagrid { 7 | 8 | /// Ray, defined as org + t * dir with t in [tmin, tmax] 9 | struct Ray { 10 | vec3 org; 11 | float tmin; 12 | vec3 dir; 13 | float tmax; 14 | 15 | HOST DEVICE Ray() {} 16 | HOST DEVICE Ray(const vec3& org, float tmin, 17 | const vec3& dir, float tmax) 18 | : org(org), tmin(tmin), dir(dir), tmax(tmax) 19 | {} 20 | }; 21 | 22 | /// Result of a hit (id is -1 if there is no hit) 23 | struct Hit { 24 | int id; 25 | float t; 26 | float u; 27 | float v; 28 | 29 | HOST DEVICE Hit() {} 30 | HOST DEVICE Hit(int id, float t, float u, float v) 31 | : id(id), t(t), u(u), v(v) 32 | {} 33 | }; 34 | 35 | #ifdef __NVCC__ 36 | __device__ __forceinline__ Ray load_ray(const Ray* ray_ptr) { 37 | const float4* ptr = (const float4*)ray_ptr; 38 | auto ray0 = ptr[0]; 39 | auto ray1 = ptr[1]; 40 | return Ray(vec3(ray0.x, ray0.y, ray0.z), ray0.w, 41 | vec3(ray1.x, ray1.y, ray1.z), ray1.w); 42 | } 43 | 44 | __device__ __forceinline__ void store_hit(Hit* hit_ptr, const Hit& hit) { 45 | float4* ptr = (float4*)hit_ptr; 46 | ptr[0] = make_float4(__int_as_float(hit.id), hit.t, hit.u, hit.v); 47 | } 48 | #endif 49 | 50 | } // namespace hagrid 51 | 52 | #endif // RAY_H 53 | 
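A minimal host-side sketch of how the Ray/Hit structures above are typically staged for a traversal call, roughly mirroring what main.cpp does (hypothetical glue code, not part of the repository): it assumes a Grid and a device-side triangle buffer have already been built, that the headers are compiled with nvcc, and it reconstructs the template arguments of MemManager::alloc/copy, with the copy direction as the first parameter of copy.

    // Hypothetical usage sketch: upload rays, trace, read back hits.
    #include <vector>
    #include "mem_manager.h"
    #include "traverse.h"

    void trace_all(hagrid::MemManager& mem, const hagrid::Grid& grid,
                   const hagrid::Tri* device_tris,
                   const std::vector<hagrid::Ray>& host_rays,
                   std::vector<hagrid::Hit>& host_hits) {
        const int n = static_cast<int>(host_rays.size());
        auto rays = mem.alloc<hagrid::Ray>(n);   // slots are re-used when the manager was created with keep = true
        auto hits = mem.alloc<hagrid::Hit>(n);
        mem.copy<hagrid::Copy::HST_TO_DEV>(rays, host_rays.data(), n);

        hagrid::setup_traversal(grid);           // uploads the grid constants used by the traversal kernel
        hagrid::traverse_grid(grid, device_tris, rays, hits, n);

        host_hits.resize(n);
        mem.copy<hagrid::Copy::DEV_TO_HST>(host_hits.data(), hits, n);
        mem.free(rays);
        mem.free(hits);
    }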
-------------------------------------------------------------------------------- /src/traverse.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "traverse.h" 4 | 5 | namespace hagrid { 6 | 7 | static __constant__ ivec3 grid_dims; 8 | static __constant__ vec3 grid_min; 9 | static __constant__ vec3 grid_max; 10 | static __constant__ vec3 cell_size; 11 | static __constant__ vec3 grid_inv; 12 | static __constant__ int grid_shift; 13 | 14 | __device__ __forceinline__ vec2 intersect_ray_box(vec3 org, vec3 inv_dir, vec3 box_min, vec3 box_max) { 15 | auto tmin = (box_min - org) * inv_dir; 16 | auto tmax = (box_max - org) * inv_dir; 17 | auto t0 = min(tmin, tmax); 18 | auto t1 = max(tmin, tmax); 19 | return vec2(fmax(t0.x, fmax(t0.y, t0.z)), 20 | fmin(t1.x, fmin(t1.y, t1.z))); 21 | } 22 | 23 | __device__ __forceinline__ vec3 compute_voxel(vec3 org, vec3 dir, float t) { 24 | return (t * dir + org - grid_min) * grid_inv; 25 | } 26 | 27 | template 28 | __global__ void traverse(const Entry* __restrict__ entries, 29 | const CellT* __restrict__ cells, 30 | const int* __restrict__ ref_ids, 31 | const Primitive* __restrict__ prims, 32 | const Ray* __restrict__ rays, 33 | Hit* __restrict__ hits, 34 | int num_rays) { 35 | const int id = threadIdx.x + blockDim.x * blockIdx.x; 36 | if (id >= num_rays) return; 37 | 38 | auto ray = load_ray(rays + id); 39 | auto inv_dir = vec3(safe_rcp(ray.dir.x), safe_rcp(ray.dir.y), safe_rcp(ray.dir.z)); 40 | 41 | // Intersect the grid bounding box 42 | auto tbox = intersect_ray_box(ray.org, inv_dir, grid_min, grid_max); 43 | auto tstart = fmax(tbox.x, ray.tmin); 44 | auto tend = fmin(tbox.y, ray.tmax); 45 | 46 | auto hit = Hit(-1, ray.tmax, 0, 0); 47 | int steps = 0; 48 | ivec3 voxel; 49 | 50 | // Early exit if the ray does not hit the grid 51 | if (tstart > tend) goto exit; 52 | 53 | // Find initial voxel 54 | voxel = clamp(ivec3(compute_voxel(ray.org, ray.dir, tstart)), ivec3(0, 0, 0), grid_dims - 1); 55 | 56 | while (true) { 57 | // Lookup entry 58 | const int entry = lookup_entry(entries, grid_shift, grid_dims >> grid_shift, voxel); 59 | 60 | // Lookup the cell associated with this voxel 61 | auto cell = load_cell(cells + entry); 62 | 63 | // Intersect the farmost planes of the cell bounding box 64 | auto cell_point = ivec3(ray.dir.x >= 0.0f ? cell.max.x : cell.min.x, 65 | ray.dir.y >= 0.0f ? cell.max.y : cell.min.y, 66 | ray.dir.z >= 0.0f ? cell.max.z : cell.min.z); 67 | auto tcell = (vec3(cell_point) * cell_size + grid_min - ray.org) * inv_dir; 68 | auto texit = fmin(tcell.x, fmin(tcell.y, tcell.z)); 69 | 70 | // Move to the next voxel 71 | auto exit_point = ivec3(compute_voxel(ray.org, ray.dir, texit)); 72 | auto next_voxel = ivec3(texit == tcell.x ? cell_point.x + (ray.dir.x >= 0.0f ? 0 : -1) : exit_point.x, 73 | texit == tcell.y ? cell_point.y + (ray.dir.y >= 0.0f ? 0 : -1) : exit_point.y, 74 | texit == tcell.z ? cell_point.z + (ray.dir.z >= 0.0f ? 0 : -1) : exit_point.z); 75 | voxel.x = ray.dir.x >= 0.0f ? max(next_voxel.x, voxel.x) : min(next_voxel.x, voxel.x); 76 | voxel.y = ray.dir.y >= 0.0f ? max(next_voxel.y, voxel.y) : min(next_voxel.y, voxel.y); 77 | voxel.z = ray.dir.z >= 0.0f ? 
max(next_voxel.z, voxel.z) : min(next_voxel.z, voxel.z); 78 | 79 | // Intersect the cell contents and exit if an intersection was found 80 | steps += 1 + foreach_ref(cell, ref_ids, [&] (int ref) { 81 | auto prim = load_prim(prims + ref); 82 | intersect_prim_ray(prim, Ray(ray.org, ray.tmin, ray.dir, hit.t), ref, hit); 83 | }); 84 | 85 | if (hit.t <= texit || 86 | (voxel.x < 0 | voxel.x >= grid_dims.x | 87 | voxel.y < 0 | voxel.y >= grid_dims.y | 88 | voxel.z < 0 | voxel.z >= grid_dims.z)) 89 | break; 90 | } 91 | 92 | exit: 93 | hit.id = steps; 94 | store_hit(hits + id, hit); 95 | } 96 | 97 | void setup_traversal(const Grid& grid) { 98 | auto extents = grid.bbox.extents(); 99 | auto dims = grid.dims << grid.shift; 100 | auto grid_inv = vec3(dims) / extents; 101 | auto cell_size = extents / vec3(dims); 102 | 103 | set_global(hagrid::grid_min, grid.bbox.min); 104 | set_global(hagrid::grid_max, grid.bbox.max); 105 | set_global(hagrid::grid_dims, dims); 106 | set_global(hagrid::cell_size, cell_size); 107 | set_global(hagrid::grid_inv, grid_inv); 108 | set_global(hagrid::grid_shift, grid.shift); 109 | } 110 | 111 | void traverse_grid(const Grid& grid, const Tri* tris, const Ray* rays, Hit* hits, int num_rays) { 112 | if (grid.small_cells) { 113 | traverse<<>>(grid.entries, grid.small_cells, grid.ref_ids, tris, rays, hits, num_rays); 114 | } else { 115 | traverse<<>>(grid.entries, grid.cells, grid.ref_ids, tris, rays, hits, num_rays); 116 | } 117 | } 118 | 119 | } // namespace hagrid 120 | -------------------------------------------------------------------------------- /src/traverse.h: -------------------------------------------------------------------------------- 1 | #ifndef TRAVERSE_H 2 | #define TRAVERSE_H 3 | 4 | #include "grid.h" 5 | #include "vec.h" 6 | #include "prims.h" 7 | 8 | namespace hagrid { 9 | 10 | /// Setups the traversal constants 11 | void setup_traversal(const Grid& grid); 12 | 13 | /// Traverses the structure with the given set of rays 14 | void traverse_grid(const Grid& grid, const Tri* tris, const Ray* rays, Hit* hits, int num_rays); 15 | 16 | } // namespace hagrid 17 | 18 | #endif // TRAVERSE_H 19 | -------------------------------------------------------------------------------- /src/vec.h: -------------------------------------------------------------------------------- 1 | #ifndef VEC_H 2 | #define VEC_H 3 | 4 | #include 5 | #include "common.h" 6 | 7 | namespace hagrid { 8 | 9 | template 10 | struct tvec2 { 11 | T x, y; 12 | HOST DEVICE tvec2() {} 13 | HOST DEVICE tvec2(T xy) : x(xy), y(xy) {} 14 | HOST DEVICE tvec2(T x, T y) : x(x), y(y) {} 15 | template 16 | HOST DEVICE explicit tvec2(const tvec2& xy) : x(xy.x), y(xy.y) {} 17 | 18 | HOST DEVICE tvec2& operator += (const tvec2& other) { *this = *this + other; return *this; } 19 | HOST DEVICE tvec2& operator -= (const tvec2& other) { *this = *this - other; return *this; } 20 | HOST DEVICE tvec2& operator *= (const tvec2& other) { *this = *this * other; return *this; } 21 | HOST DEVICE tvec2& operator /= (const tvec2& other) { *this = *this / other; return *this; } 22 | 23 | HOST DEVICE tvec2& operator *= (T t) { *this = *this * t; return *this; } 24 | HOST DEVICE tvec2& operator /= (T t) { *this = *this / t; return *this; } 25 | }; 26 | 27 | #define BINARY_OP2(op) \ 28 | template HOST DEVICE tvec2 operator op (const tvec2& a, const tvec2& b) { return tvec2(a.x op b.x, a.y op b.y); } \ 29 | template HOST DEVICE tvec2 operator op (const tvec2& a, T b) { return tvec2(a.x op b, a.y op b); } \ 30 | template HOST DEVICE tvec2 
operator op (T a, const tvec2& b) { return tvec2(a op b.x, a op b.y); } 31 | 32 | BINARY_OP2(+) 33 | BINARY_OP2(-) 34 | BINARY_OP2(*) 35 | BINARY_OP2(/) 36 | BINARY_OP2(<<) 37 | BINARY_OP2(>>) 38 | BINARY_OP2(&) 39 | BINARY_OP2(|) 40 | 41 | #undef BINARY_OP2 42 | 43 | template HOST DEVICE tvec2 min(const tvec2& a, const tvec2& b) { return tvec2(min(a.x, b.x), min(a.y, b.y)); } 44 | template HOST DEVICE tvec2 max(const tvec2& a, const tvec2& b) { return tvec2(max(a.x, b.x), max(a.y, b.y)); } 45 | template HOST DEVICE tvec2 clamp(const tvec2& a, T b, T c) { return tvec2(min(max(a.x, b), c), min(max(a.y, b), c)); } 46 | template HOST DEVICE T dot(const tvec2& a, const tvec2& b) { return a.x * b.x + a.y * b.y; } 47 | template HOST DEVICE T length(const tvec2& a) { return std::sqrt(dot(a, a)); } 48 | template HOST DEVICE tvec2 normalize(const tvec2& a) { return a * (1.0f / length(a)); } 49 | 50 | template 51 | HOST DEVICE T get(const tvec2& v) { 52 | if (axis == 0) return v.x; 53 | return v.y; 54 | } 55 | 56 | template 57 | struct tvec3 { 58 | union { T x; T r; }; 59 | union { T y; T g; }; 60 | union { T z; T b; }; 61 | HOST DEVICE tvec3() {} 62 | HOST DEVICE tvec3(T xyz) : x(xyz), y(xyz), z(xyz) {} 63 | HOST DEVICE tvec3(T x, T y, T z) : x(x), y(y), z(z) {} 64 | template 65 | HOST DEVICE explicit tvec3(const tvec3& xyz) : x(xyz.x), y(xyz.y), z(xyz.z) {} 66 | 67 | HOST DEVICE tvec3& operator += (const tvec3& other) { *this = *this + other; return *this; } 68 | HOST DEVICE tvec3& operator -= (const tvec3& other) { *this = *this - other; return *this; } 69 | HOST DEVICE tvec3& operator *= (const tvec3& other) { *this = *this * other; return *this; } 70 | HOST DEVICE tvec3& operator /= (const tvec3& other) { *this = *this / other; return *this; } 71 | 72 | HOST DEVICE tvec3& operator *= (T t) { *this = *this * t; return *this; } 73 | HOST DEVICE tvec3& operator /= (T t) { *this = *this / t; return *this; } 74 | }; 75 | 76 | #define BINARY_OP3(op) \ 77 | template HOST DEVICE tvec3 operator op (const tvec3& a, const tvec3& b) { return tvec3(a.x op b.x, a.y op b.y, a.z op b.z); } \ 78 | template HOST DEVICE tvec3 operator op (const tvec3& a, T b) { return tvec3(a.x op b, a.y op b, a.z op b); } \ 79 | template HOST DEVICE tvec3 operator op (T a, const tvec3& b) { return tvec3(a op b.x, a op b.y, a op b.z); } 80 | 81 | BINARY_OP3(+) 82 | BINARY_OP3(-) 83 | BINARY_OP3(*) 84 | BINARY_OP3(/) 85 | BINARY_OP3(<<) 86 | BINARY_OP3(>>) 87 | BINARY_OP3(&) 88 | BINARY_OP3(|) 89 | 90 | #undef BINARY_OP3 91 | 92 | template HOST DEVICE tvec3 min(const tvec3& a, const tvec3& b) { return tvec3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); } 93 | template HOST DEVICE tvec3 max(const tvec3& a, const tvec3& b) { return tvec3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); } 94 | template HOST DEVICE tvec3 clamp(const tvec3& a, T b, T c) { return tvec3(min(max(a.x, b), c), min(max(a.y, b), c), min(max(a.z, b), c)); } 95 | template HOST DEVICE T dot(const tvec3& a, const tvec3& b) { return a.x * b.x + a.y * b.y + a.z * b.z; } 96 | template HOST DEVICE T length(const tvec3& a) { return std::sqrt(dot(a, a)); } 97 | template HOST DEVICE tvec3 normalize(const tvec3& a) { return a * (1.0f / length(a)); } 98 | 99 | template 100 | HOST DEVICE tvec3 cross(const tvec3& a, const tvec3& b) { 101 | return tvec3(a.y * b.z - a.z * b.y, 102 | a.z * b.x - a.x * b.z, 103 | a.x * b.y - a.y * b.x); 104 | } 105 | 106 | template 107 | HOST DEVICE tvec3 rotate(const tvec3& v, const tvec3& axis, T angle) { 108 | T half = angle / 2; 109 | 110 | T 
q[4] = { 111 | axis.x * std::sin(half), 112 | axis.y * std::sin(half), 113 | axis.z * std::sin(half), 114 | std::cos(half) 115 | }; 116 | 117 | T p[4] = { 118 | q[3] * v.x + q[1] * v.z - q[2] * v.y, 119 | q[3] * v.y - q[0] * v.z + q[2] * v.x, 120 | q[3] * v.z + q[0] * v.y - q[1] * v.x, 121 | -(q[0] * v.x + q[1] * v.y + q[2] * v.z) 122 | }; 123 | 124 | return tvec3(p[3] * -q[0] + p[0] * q[3] + p[1] * -q[2] - p[2] * -q[1], 125 | p[3] * -q[1] - p[0] * -q[2] + p[1] * q[3] + p[2] * -q[0], 126 | p[3] * -q[2] + p[0] * -q[1] - p[1] * -q[0] + p[2] * q[3]); 127 | } 128 | 129 | template 130 | HOST DEVICE T get(const tvec3& v) { 131 | if (axis == 0) return v.x; 132 | else if (axis == 1) return v.y; 133 | else return v.z; 134 | } 135 | 136 | typedef tvec2 vec2; 137 | typedef tvec2 ivec2; 138 | typedef tvec2 usvec2; 139 | typedef tvec3 vec3; 140 | typedef tvec3 ivec3; 141 | typedef tvec3 usvec3; 142 | 143 | } // namespace hagrid 144 | 145 | #endif // VEC_H 146 | --------------------------------------------------------------------------------
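As a closing illustration of the vector utilities in vec.h, here is a small hedged example (not part of the repository, and assuming common.h defines HOST/DEVICE so the header compiles on the host): rotate() rotates a vector about a normalized axis by an angle in radians using the quaternion product written out above, so rotating the x axis by a quarter turn around z should yield approximately (0, 1, 0).

    // Hypothetical host-side check of hagrid::rotate().
    #include <cstdio>
    #include "vec.h"

    int main() {
        const float pi = 3.14159265358979f;
        hagrid::vec3 x(1.0f, 0.0f, 0.0f);
        hagrid::vec3 z(0.0f, 0.0f, 1.0f);
        // Quarter turn around the z axis: expect (0, 1, 0) up to floating-point rounding.
        hagrid::vec3 r = hagrid::rotate(x, z, pi * 0.5f);
        std::printf("%f %f %f\n", r.x, r.y, r.z);
        return 0;
    }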