├── .gitignore ├── .gitmodules ├── .vscode └── settings.json ├── CMakeLists.txt ├── cuda_rasterizer ├── auxiliary.h ├── backward.cu ├── backward.h ├── config.h ├── forward.cu ├── forward.h ├── rasterizer.h ├── rasterizer_impl.cu └── rasterizer_impl.h ├── diff_gauss └── __init__.py ├── ext.cpp ├── license.md ├── rasterize_points.cu ├── rasterize_points.h ├── readme.md ├── setup.py └── third_party └── stbi_image_write.h /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | *.egg-info/ 3 | dist/ 4 | __pycache__/ 5 | *.pyd 6 | Makefile 7 | CMakeCache.txt 8 | CMakeFiles 9 | *.a 10 | *.cmake 11 | *.so 12 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/glm"] 2 | path = third_party/glm 3 | url = https://github.com/g-truc/glm.git 4 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "cmath": "cpp", 4 | "stdexcept": "cpp", 5 | "cctype": "cpp", 6 | "clocale": "cpp", 7 | "csignal": "cpp", 8 | "cstdarg": "cpp", 9 | "cstddef": "cpp", 10 | "cstdio": "cpp", 11 | "cstdlib": "cpp", 12 | "cstring": "cpp", 13 | "ctime": "cpp", 14 | "cwchar": "cpp", 15 | "cwctype": "cpp", 16 | "array": "cpp", 17 | "atomic": "cpp", 18 | "strstream": "cpp", 19 | "bit": "cpp", 20 | "*.tcc": "cpp", 21 | "bitset": "cpp", 22 | "chrono": "cpp", 23 | "compare": "cpp", 24 | "complex": "cpp", 25 | "concepts": "cpp", 26 | "condition_variable": "cpp", 27 | "cstdint": "cpp", 28 | "deque": "cpp", 29 | "list": "cpp", 30 | "map": "cpp", 31 | "set": "cpp", 32 | "string": "cpp", 33 | "unordered_map": "cpp", 34 | "unordered_set": "cpp", 35 | "vector": "cpp", 36 | "exception": "cpp", 37 | "algorithm": "cpp", 38 | "functional": "cpp", 39 | "iterator": "cpp", 40 | "memory": "cpp", 41 | "memory_resource": "cpp", 42 | "numeric": "cpp", 43 | "optional": "cpp", 44 | "random": "cpp", 45 | "ratio": "cpp", 46 | "string_view": "cpp", 47 | "system_error": "cpp", 48 | "tuple": "cpp", 49 | "type_traits": "cpp", 50 | "utility": "cpp", 51 | "fstream": "cpp", 52 | "initializer_list": "cpp", 53 | "iomanip": "cpp", 54 | "iosfwd": "cpp", 55 | "iostream": "cpp", 56 | "istream": "cpp", 57 | "limits": "cpp", 58 | "mutex": "cpp", 59 | "new": "cpp", 60 | "numbers": "cpp", 61 | "ostream": "cpp", 62 | "semaphore": "cpp", 63 | "sstream": "cpp", 64 | "stop_token": "cpp", 65 | "streambuf": "cpp", 66 | "thread": "cpp", 67 | "cfenv": "cpp", 68 | "cinttypes": "cpp", 69 | "typeindex": "cpp", 70 | "typeinfo": "cpp", 71 | "valarray": "cpp", 72 | "variant": "cpp", 73 | "*.ipp": "cpp" 74 | } 75 | } -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 
8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | cmake_minimum_required(VERSION 3.20) 13 | 14 | project(DiffRast LANGUAGES CUDA CXX) 15 | 16 | set(CMAKE_CXX_STANDARD 17) 17 | set(CMAKE_CXX_EXTENSIONS OFF) 18 | set(CMAKE_CUDA_STANDARD 17) 19 | 20 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") 21 | 22 | add_library(CudaRasterizer 23 | cuda_rasterizer/backward.h 24 | cuda_rasterizer/backward.cu 25 | cuda_rasterizer/forward.h 26 | cuda_rasterizer/forward.cu 27 | cuda_rasterizer/auxiliary.h 28 | cuda_rasterizer/rasterizer_impl.cu 29 | cuda_rasterizer/rasterizer_impl.h 30 | cuda_rasterizer/rasterizer.h 31 | ) 32 | 33 | set_target_properties(CudaRasterizer PROPERTIES CUDA_ARCHITECTURES "70;75;86") 34 | 35 | target_include_directories(CudaRasterizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/cuda_rasterizer) 36 | target_include_directories(CudaRasterizer PRIVATE third_party/glm ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 37 | -------------------------------------------------------------------------------- /cuda_rasterizer/auxiliary.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_AUXILIARY_H_INCLUDED 13 | #define CUDA_RASTERIZER_AUXILIARY_H_INCLUDED 14 | 15 | #include "config.h" 16 | #include "stdio.h" 17 | #include <cooperative_groups.h> 18 | #include <glm/glm.hpp> 19 | namespace cg = cooperative_groups; 20 | 21 | #define BLOCK_SIZE (BLOCK_X * BLOCK_Y) 22 | #define NUM_WARPS (BLOCK_SIZE/32) 23 | #define MY_PI 3.14159265 24 | 25 | // Spherical harmonics coefficients 26 | __device__ const float SH_C0 = 0.28209479177387814f; 27 | __device__ const float SH_C1 = 0.4886025119029199f; 28 | __device__ const float SH_C2[] = { 29 | 1.0925484305920792f, 30 | -1.0925484305920792f, 31 | 0.31539156525252005f, 32 | -1.0925484305920792f, 33 | 0.5462742152960396f 34 | }; 35 | __device__ const float SH_C3[] = { 36 | -0.5900435899266435f, 37 | 2.890611442640554f, 38 | -0.4570457994644658f, 39 | 0.3731763325901154f, 40 | -0.4570457994644658f, 41 | 1.445305721320277f, 42 | -0.5900435899266435f 43 | }; 44 | 45 | __forceinline__ __device__ float ndc2Pix(float v, int S) 46 | { 47 | return ((v + 1.0) * S - 1.0) * 0.5; 48 | } 49 | 50 | __forceinline__ __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid) 51 | { 52 | rect_min = { 53 | min(grid.x, max((int)0, (int)((p.x - max_radius - 0.5) / BLOCK_X))), 54 | min(grid.y, max((int)0, (int)((p.y - max_radius - 0.5) / BLOCK_Y))) 55 | }; 56 | rect_max = { 57 | min(grid.x, max((int)0, (int)((p.x + max_radius + BLOCK_X - 1 + 0.5) / BLOCK_X))), 58 | min(grid.y, max((int)0, (int)((p.y + max_radius + BLOCK_Y - 1 + 0.5) / BLOCK_Y))) 59 | }; 60 | } 61 | 62 | __forceinline__ __device__ float3 transformPoint4x3(const float3& p, const float* matrix) 63 | { 64 | float3 transformed = { 65 | matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12], 66 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13], 67 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14], 68 | }; 69 | return transformed; 70 | } 71 | 72 | __forceinline__ __device__ float4 transformPoint4x4(const float3& p, const float* matrix) 73 | { 74 | float4 transformed = { 75 |
matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12], 76 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13], 77 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14], 78 | matrix[3] * p.x + matrix[7] * p.y + matrix[11] * p.z + matrix[15] 79 | }; 80 | return transformed; 81 | } 82 | 83 | __forceinline__ __device__ float3 transformVec4x3(const float3& p, const float* matrix) 84 | { 85 | float3 transformed = { 86 | matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z, 87 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z, 88 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z, 89 | }; 90 | return transformed; 91 | } 92 | 93 | __forceinline__ __device__ float3 transformVec4x3Transpose(const float3& p, const float* matrix) 94 | { 95 | float3 transformed = { 96 | matrix[0] * p.x + matrix[1] * p.y + matrix[2] * p.z, 97 | matrix[4] * p.x + matrix[5] * p.y + matrix[6] * p.z, 98 | matrix[8] * p.x + matrix[9] * p.y + matrix[10] * p.z, 99 | }; 100 | return transformed; 101 | } 102 | 103 | __forceinline__ __device__ float dnormvdz(float3 v, float3 dv) 104 | { 105 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; 106 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 107 | float dnormvdz = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32; 108 | return dnormvdz; 109 | } 110 | 111 | __forceinline__ __device__ float3 dnormvdv(float3 v, float3 dv) 112 | { 113 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; 114 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 115 | 116 | float3 dnormvdv; 117 | dnormvdv.x = ((+sum2 - v.x * v.x) * dv.x - v.y * v.x * dv.y - v.z * v.x * dv.z) * invsum32; 118 | dnormvdv.y = (-v.x * v.y * dv.x + (sum2 - v.y * v.y) * dv.y - v.z * v.y * dv.z) * invsum32; 119 | dnormvdv.z = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32; 120 | return dnormvdv; 121 | } 122 | 123 | __forceinline__ __device__ float4 dnormvdv(float4 v, float4 dv) 124 | { 125 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w; 126 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 127 | 128 | float4 vdv = { v.x * dv.x, v.y * dv.y, v.z * dv.z, v.w * dv.w }; 129 | float vdv_sum = vdv.x + vdv.y + vdv.z + vdv.w; 130 | float4 dnormvdv; 131 | dnormvdv.x = ((sum2 - v.x * v.x) * dv.x - v.x * (vdv_sum - vdv.x)) * invsum32; 132 | dnormvdv.y = ((sum2 - v.y * v.y) * dv.y - v.y * (vdv_sum - vdv.y)) * invsum32; 133 | dnormvdv.z = ((sum2 - v.z * v.z) * dv.z - v.z * (vdv_sum - vdv.z)) * invsum32; 134 | dnormvdv.w = ((sum2 - v.w * v.w) * dv.w - v.w * (vdv_sum - vdv.w)) * invsum32; 135 | return dnormvdv; 136 | } 137 | 138 | __forceinline__ __device__ float sigmoid(float x) 139 | { 140 | return 1.0f / (1.0f + expf(-x)); 141 | } 142 | 143 | __forceinline__ __device__ float dist2(float2 d) 144 | { 145 | return d.x * d.x + d.y * d.y; 146 | } 147 | 148 | __forceinline__ __device__ bool in_frustum(int idx, 149 | const float* orig_points, 150 | const float* viewmatrix, 151 | const float* projmatrix, 152 | bool prefiltered, 153 | float3& p_view, // reference 154 | const float padding = 0.01f, // padding in ndc space // TODO: add api for changing this 155 | const float xy_padding = 0.2f // padding in ndc space // TODO: add api for changing this 156 | ) 157 | { 158 | float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; 159 | p_view = transformPoint4x3(p_orig, viewmatrix); // write this outside 160 | if (prefiltered) return true; 161 | 162 | // Bring points to screen space 163 | float4 
p_hom = transformPoint4x4(p_orig, projmatrix); 164 | float p_w = 1.0f / (p_hom.w + 0.0000001f); 165 | float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; 166 | 167 | return (p_proj.z > -1 - padding) && (p_proj.z < 1 + padding) && (p_proj.x > -1 - xy_padding) && (p_proj.x < 1. + xy_padding) && (p_proj.y > -1 - xy_padding) && (p_proj.y < 1. + xy_padding); 168 | } 169 | 170 | __forceinline__ __device__ bool check_frustum( 171 | const float3 p_orig, 172 | const float* viewmatrix, 173 | const float* projmatrix, 174 | const float padding = 0.01f, // padding in ndc space // TODO: add api for changing this 175 | const float xy_padding = 0.2f // padding in ndc space // TODO: add api for changing this 176 | ) 177 | { 178 | float3 p_view = transformPoint4x3(p_orig, viewmatrix); // write this outside 179 | 180 | // Bring points to screen space 181 | float4 p_hom = transformPoint4x4(p_orig, projmatrix); 182 | float p_w = 1.0f / (p_hom.w + 0.0000001f); 183 | float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; 184 | 185 | return (p_proj.z > -1 - padding) && (p_proj.z < 1 + padding) && (p_proj.x > -1 - xy_padding) && (p_proj.x < 1. + xy_padding) && (p_proj.y > -1 - xy_padding) && (p_proj.y < 1. + xy_padding); 186 | } 187 | 188 | 189 | // As mentioned in: StopThePop: Sorted Gaussian Splatting for View-Consistent Real-time Rendering 190 | __device__ inline float evaluate_opacity_factor(const float dx, const float dy, const float4 co) 191 | { 192 | return 0.5f * (co.x * dx * dx + co.z * dy * dy) + co.y * dx * dy; 193 | } 194 | 195 | __device__ inline float evaluate_opacity(const float dx, const float dy, const float4 co) 196 | { 197 | return co.w * expf(-evaluate_opacity_factor(dx, dy, co)); 198 | } 199 | 200 | template <uint32_t PATCH_WIDTH, uint32_t PATCH_HEIGHT> 201 | __device__ inline float max_contrib_power_rect_gaussian_float( 202 | const float4 co, 203 | const float2 mean, 204 | const glm::vec2 rect_min, 205 | const glm::vec2 rect_max, 206 | glm::vec2& max_pos) 207 | { 208 | const float x_min_diff = rect_min.x - mean.x; 209 | const float x_left = x_min_diff > 0.0f; 210 | // const float x_left = mean.x < rect_min.x; 211 | const float not_in_x_range = x_left + (mean.x > rect_max.x); 212 | 213 | const float y_min_diff = rect_min.y - mean.y; 214 | const float y_above = y_min_diff > 0.0f; 215 | // const float y_above = mean.y < rect_min.y; 216 | const float not_in_y_range = y_above + (mean.y > rect_max.y); 217 | 218 | max_pos = {mean.x, mean.y}; 219 | float max_contrib_power = 0.0f; 220 | 221 | if ((not_in_y_range + not_in_x_range) > 0.0f) 222 | { 223 | const float px = x_left * rect_min.x + (1.0f - x_left) * rect_max.x; 224 | const float py = y_above * rect_min.y + (1.0f - y_above) * rect_max.y; 225 | 226 | const float dx = copysign(float(PATCH_WIDTH), x_min_diff); 227 | const float dy = copysign(float(PATCH_HEIGHT), y_min_diff); 228 | 229 | const float diffx = mean.x - px; 230 | const float diffy = mean.y - py; 231 | 232 | const float rcp_dxdxcox = __frcp_rn(PATCH_WIDTH * PATCH_WIDTH * co.x); // = 1.0 / (dx*dx*co.x) 233 | const float rcp_dydycoz = __frcp_rn(PATCH_HEIGHT * PATCH_HEIGHT * co.z); // = 1.0 / (dy*dy*co.z) 234 | 235 | const float tx = not_in_y_range * __saturatef((dx * co.x * diffx + dx * co.y * diffy) * rcp_dxdxcox); 236 | const float ty = not_in_x_range * __saturatef((dy * co.y * diffx + dy * co.z * diffy) * rcp_dydycoz); 237 | max_pos = {px + tx * dx, py + ty * dy}; 238 | 239 | const float2 max_pos_diff = {mean.x - max_pos.x, mean.y - max_pos.y}; 240 | max_contrib_power =
evaluate_opacity_factor(max_pos_diff.x, max_pos_diff.y, co); 241 | } 242 | 243 | return max_contrib_power; 244 | } 245 | 246 | 247 | __device__ inline int computeTilebasedCullingTileCount( 248 | const float4 co_init, 249 | const float2 xy_init, 250 | const float opacity_power_threshold_init, 251 | const uint2 rect_min_init, 252 | const uint2 rect_max_init) 253 | { 254 | const int32_t tile_count_init = (rect_max_init.y - rect_min_init.y) * (rect_max_init.x - rect_min_init.x); 255 | 256 | int tile_count = 0; 257 | const uint32_t rect_width = (rect_max_init.x - rect_min_init.x); 258 | for (int tile_idx = 0; tile_idx < tile_count_init; tile_idx++) 259 | { 260 | const int y = (tile_idx / rect_width) + rect_min_init.y; 261 | const int x = (tile_idx % rect_width) + rect_min_init.x; 262 | 263 | const glm::vec2 tile_min = {x * BLOCK_X, y * BLOCK_Y}; 264 | const glm::vec2 tile_max = {(x + 1) * BLOCK_X - 1, (y + 1) * BLOCK_Y - 1}; 265 | 266 | glm::vec2 max_pos; 267 | float max_opac_factor = max_contrib_power_rect_gaussian_float<BLOCK_X, BLOCK_Y>(co_init, xy_init, tile_min, tile_max, max_pos); 268 | tile_count += (max_opac_factor <= opacity_power_threshold_init); 269 | } 270 | 271 | return tile_count; 272 | } 273 | 274 | #define CHECK_CUDA(A, debug) \ 275 | A; \ 276 | if(debug) { \ 277 | auto ret = cudaDeviceSynchronize(); \ 278 | if (ret != cudaSuccess) { \ 279 | std::cerr << "[CUDA ERROR] in " << __FILE__ \ 280 | << " Line " << __LINE__ << ": " << cudaGetErrorString(ret) << std::endl; \ 281 | throw std::runtime_error(cudaGetErrorString(ret)); \ 282 | } \ 283 | } 284 | 285 | #define TEST_CUDA_MEMORY() \ 286 | do { \ 287 | const int N = 1337, bytes = N * sizeof(float); \ 288 | std::vector<float> cpuvec(N); \ 289 | for (size_t i = 0; i < N; i++) \ 290 | cpuvec[i] = (float)i; \ 291 | float *gpuvec = NULL; \ 292 | CHECK_CUDA(cudaMalloc(&gpuvec, bytes), true); \ 293 | assert(gpuvec != NULL); \ 294 | CHECK_CUDA( \ 295 | cudaMemcpy(gpuvec, cpuvec.data(), bytes, cudaMemcpyHostToDevice), true) \ 296 | CHECK_CUDA( \ 297 | cudaMemcpy(cpuvec.data(), gpuvec, bytes, cudaMemcpyDeviceToHost), true) \ 298 | CHECK_CUDA(cudaFree(gpuvec), true); \ 299 | } while (0); 300 | 301 | #endif 302 | -------------------------------------------------------------------------------- /cuda_rasterizer/backward.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include "backward.h" 13 | #include "auxiliary.h" 14 | #include <cooperative_groups.h> 15 | #include <cooperative_groups/reduce.h> 16 | namespace cg = cooperative_groups; 17 | 18 | // Backward pass for conversion of spherical harmonics to RGB for 19 | // each Gaussian. 20 | __device__ void computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, const bool* clamped, const glm::vec3* dL_dcolor, glm::vec3* dL_dmeans, glm::vec3* dL_dshs) 21 | { 22 | // Compute intermediate values, as it is done during forward 23 | glm::vec3 pos = means[idx]; 24 | glm::vec3 dir_orig = pos - campos; 25 | glm::vec3 dir = dir_orig / glm::length(dir_orig); 26 | 27 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; 28 | 29 | // Use PyTorch rule for clamping: if clamping was applied, 30 | // gradient becomes 0.
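// (Context, mirroring the matching forward kernel in forward.cu: the
// forward pass offsets the SH result by +0.5 and clamps it to be
// non-negative, recording in "clamped" which channels were clipped;
// the multiplies below therefore zero the incoming gradient exactly
// where the forward clamp was active.)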
31 | glm::vec3 dL_dRGB = dL_dcolor[idx]; 32 | dL_dRGB.x *= clamped[3 * idx + 0] ? 0 : 1; 33 | dL_dRGB.y *= clamped[3 * idx + 1] ? 0 : 1; 34 | dL_dRGB.z *= clamped[3 * idx + 2] ? 0 : 1; 35 | 36 | glm::vec3 dRGBdx(0, 0, 0); 37 | glm::vec3 dRGBdy(0, 0, 0); 38 | glm::vec3 dRGBdz(0, 0, 0); 39 | float x = dir.x; 40 | float y = dir.y; 41 | float z = dir.z; 42 | 43 | // Target location for this Gaussian to write SH gradients to 44 | glm::vec3* dL_dsh = dL_dshs + idx * max_coeffs; 45 | 46 | // No tricks here, just high school-level calculus. 47 | float dRGBdsh0 = SH_C0; 48 | dL_dsh[0] = dRGBdsh0 * dL_dRGB; 49 | if (deg > 0) 50 | { 51 | float dRGBdsh1 = -SH_C1 * y; 52 | float dRGBdsh2 = SH_C1 * z; 53 | float dRGBdsh3 = -SH_C1 * x; 54 | dL_dsh[1] = dRGBdsh1 * dL_dRGB; 55 | dL_dsh[2] = dRGBdsh2 * dL_dRGB; 56 | dL_dsh[3] = dRGBdsh3 * dL_dRGB; 57 | 58 | dRGBdx = -SH_C1 * sh[3]; 59 | dRGBdy = -SH_C1 * sh[1]; 60 | dRGBdz = SH_C1 * sh[2]; 61 | 62 | if (deg > 1) 63 | { 64 | float xx = x * x, yy = y * y, zz = z * z; 65 | float xy = x * y, yz = y * z, xz = x * z; 66 | 67 | float dRGBdsh4 = SH_C2[0] * xy; 68 | float dRGBdsh5 = SH_C2[1] * yz; 69 | float dRGBdsh6 = SH_C2[2] * (2.f * zz - xx - yy); 70 | float dRGBdsh7 = SH_C2[3] * xz; 71 | float dRGBdsh8 = SH_C2[4] * (xx - yy); 72 | dL_dsh[4] = dRGBdsh4 * dL_dRGB; 73 | dL_dsh[5] = dRGBdsh5 * dL_dRGB; 74 | dL_dsh[6] = dRGBdsh6 * dL_dRGB; 75 | dL_dsh[7] = dRGBdsh7 * dL_dRGB; 76 | dL_dsh[8] = dRGBdsh8 * dL_dRGB; 77 | 78 | dRGBdx += SH_C2[0] * y * sh[4] + SH_C2[2] * 2.f * -x * sh[6] + SH_C2[3] * z * sh[7] + SH_C2[4] * 2.f * x * sh[8]; 79 | dRGBdy += SH_C2[0] * x * sh[4] + SH_C2[1] * z * sh[5] + SH_C2[2] * 2.f * -y * sh[6] + SH_C2[4] * 2.f * -y * sh[8]; 80 | dRGBdz += SH_C2[1] * y * sh[5] + SH_C2[2] * 2.f * 2.f * z * sh[6] + SH_C2[3] * x * sh[7]; 81 | 82 | if (deg > 2) 83 | { 84 | float dRGBdsh9 = SH_C3[0] * y * (3.f * xx - yy); 85 | float dRGBdsh10 = SH_C3[1] * xy * z; 86 | float dRGBdsh11 = SH_C3[2] * y * (4.f * zz - xx - yy); 87 | float dRGBdsh12 = SH_C3[3] * z * (2.f * zz - 3.f * xx - 3.f * yy); 88 | float dRGBdsh13 = SH_C3[4] * x * (4.f * zz - xx - yy); 89 | float dRGBdsh14 = SH_C3[5] * z * (xx - yy); 90 | float dRGBdsh15 = SH_C3[6] * x * (xx - 3.f * yy); 91 | dL_dsh[9] = dRGBdsh9 * dL_dRGB; 92 | dL_dsh[10] = dRGBdsh10 * dL_dRGB; 93 | dL_dsh[11] = dRGBdsh11 * dL_dRGB; 94 | dL_dsh[12] = dRGBdsh12 * dL_dRGB; 95 | dL_dsh[13] = dRGBdsh13 * dL_dRGB; 96 | dL_dsh[14] = dRGBdsh14 * dL_dRGB; 97 | dL_dsh[15] = dRGBdsh15 * dL_dRGB; 98 | 99 | dRGBdx += ( 100 | SH_C3[0] * sh[9] * 3.f * 2.f * xy + 101 | SH_C3[1] * sh[10] * yz + 102 | SH_C3[2] * sh[11] * -2.f * xy + 103 | SH_C3[3] * sh[12] * -3.f * 2.f * xz + 104 | SH_C3[4] * sh[13] * (-3.f * xx + 4.f * zz - yy) + 105 | SH_C3[5] * sh[14] * 2.f * xz + 106 | SH_C3[6] * sh[15] * 3.f * (xx - yy)); 107 | 108 | dRGBdy += ( 109 | SH_C3[0] * sh[9] * 3.f * (xx - yy) + 110 | SH_C3[1] * sh[10] * xz + 111 | SH_C3[2] * sh[11] * (-3.f * yy + 4.f * zz - xx) + 112 | SH_C3[3] * sh[12] * -3.f * 2.f * yz + 113 | SH_C3[4] * sh[13] * -2.f * xy + 114 | SH_C3[5] * sh[14] * -2.f * yz + 115 | SH_C3[6] * sh[15] * -3.f * 2.f * xy); 116 | 117 | dRGBdz += ( 118 | SH_C3[1] * sh[10] * xy + 119 | SH_C3[2] * sh[11] * 4.f * 2.f * yz + 120 | SH_C3[3] * sh[12] * 3.f * (2.f * zz - xx - yy) + 121 | SH_C3[4] * sh[13] * 4.f * 2.f * xz + 122 | SH_C3[5] * sh[14] * (xx - yy)); 123 | } 124 | } 125 | } 126 | 127 | // The view direction is an input to the computation. 
View direction 128 | // is influenced by the Gaussian's mean, so SHs gradients 129 | // must propagate back into 3D position. 130 | glm::vec3 dL_ddir(glm::dot(dRGBdx, dL_dRGB), glm::dot(dRGBdy, dL_dRGB), glm::dot(dRGBdz, dL_dRGB)); 131 | 132 | // Account for normalization of direction 133 | float3 dL_dmean = dnormvdv(float3{ dir_orig.x, dir_orig.y, dir_orig.z }, float3{ dL_ddir.x, dL_ddir.y, dL_ddir.z }); 134 | 135 | // Gradients of loss w.r.t. Gaussian means, but only the portion 136 | // that is caused because the mean affects the view-dependent color. 137 | // Additional mean gradient is accumulated in below methods. 138 | dL_dmeans[idx] += glm::vec3(dL_dmean.x, dL_dmean.y, dL_dmean.z); 139 | } 140 | 141 | 142 | // Backward pass for conversion of spherical harmonics to RGB for 143 | // each Gaussian. 144 | __device__ void computeColorFromSH_4D(int idx, int deg, int deg_t, int max_coeffs, 145 | const float* shs, const glm::vec3* dirs, const float* dirs_t, const float time_duration, 146 | const glm::vec3* dL_drgb, float* dL_dshs, glm::vec3* dL_ddir, float* dL_ddir_t) 147 | { 148 | // Compute intermediate values, as it is done during forward 149 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; 150 | glm::vec3 dir = dirs[idx]; 151 | const float dir_t = dirs_t[idx]; 152 | 153 | // Use PyTorch rule for clamping: if clamping was applied, 154 | // gradient becomes 0. 155 | glm::vec3 dL_dRGB = dL_drgb[idx]; 156 | 157 | glm::vec3 dRGBdx(0, 0, 0); 158 | glm::vec3 dRGBdy(0, 0, 0); 159 | glm::vec3 dRGBdz(0, 0, 0); 160 | glm::vec3 dRGBdt(0, 0, 0); 161 | 162 | // Target location for this Gaussian to write SH gradients to 163 | glm::vec3* dL_dsh = ((glm::vec3*)dL_dshs) + idx * max_coeffs; 164 | 165 | // No tricks here, just high school-level calculus. 166 | float l0m0 = SH_C0; 167 | 168 | float dRGBdsh0 = l0m0; 169 | dL_dsh[0] = dRGBdsh0 * dL_dRGB; 170 | 171 | if (deg > 0){ 172 | float x = dir.x; 173 | float y = dir.y; 174 | float z = dir.z; 175 | 176 | float l1m1 = -1 * SH_C1 * y; 177 | float l1m0 = SH_C1 * z; 178 | float l1p1 = -1 * SH_C1 * x; 179 | 180 | float dl1m1_dy = -1 * SH_C1; 181 | float dl1m0_dz = SH_C1; 182 | float dl1p1_dx = -1 * SH_C1; 183 | 184 | dL_dsh[1] = l1m1 * dL_dRGB; 185 | dL_dsh[2] = l1m0 * dL_dRGB; 186 | dL_dsh[3] = l1p1 * dL_dRGB; 187 | 188 | dRGBdx = dl1p1_dx * sh[3]; 189 | dRGBdy = dl1m1_dy * sh[1]; 190 | dRGBdz = dl1m0_dz * sh[2]; 191 | 192 | if (deg > 1){ 193 | float xx = x * x, yy = y * y, zz = z * z; 194 | float xy = x * y, yz = y * z, xz = x * z; 195 | 196 | float l2m2 = SH_C2[0] * xy; 197 | float l2m1 = SH_C2[1] * yz; 198 | float l2m0 = SH_C2[2] * (2.0 * zz - xx - yy); 199 | float l2p1 = SH_C2[3] * xz; 200 | float l2p2 = SH_C2[4] * (xx - yy); 201 | 202 | float dl2m2_dx = SH_C2[0] * y; 203 | float dl2m2_dy = SH_C2[0] * x; 204 | float dl2m1_dy = SH_C2[1] * z; 205 | float dl2m1_dz = SH_C2[1] * y; 206 | float dl2m0_dx = -2 * SH_C2[2] * x; 207 | float dl2m0_dy = -2 * SH_C2[2] * y; 208 | float dl2m0_dz = 4 * SH_C2[2] * z; 209 | float dl2p1_dx = SH_C2[3] * z; 210 | float dl2p1_dz = SH_C2[3] * x; 211 | float dl2p2_dx = 2 * SH_C2[4] * x; 212 | float dl2p2_dy = -2 * SH_C2[4] * y; 213 | 214 | dL_dsh[4] = l2m2 * dL_dRGB; 215 | dL_dsh[5] = l2m1 * dL_dRGB; 216 | dL_dsh[6] = l2m0 * dL_dRGB; 217 | dL_dsh[7] = l2p1 * dL_dRGB; 218 | dL_dsh[8] = l2p2 * dL_dRGB; 219 | 220 | dRGBdx += ( 221 | dl2m2_dx * sh[4] + dl2m0_dx * sh[6] + dl2p1_dx * sh[7] + dl2p2_dx * sh[8] 222 | ); 223 | dRGBdy += ( 224 | dl2m2_dy * sh[4] + dl2m1_dy * sh[5] + dl2m0_dy * sh[6] + dl2p2_dy * sh[8] 225 | ); 226 
| dRGBdz += ( 227 | dl2m1_dz * sh[5] + dl2m0_dz * sh[6] + dl2p1_dz * sh[7] 228 | ); 229 | 230 | if (deg > 2){ 231 | float l3m3 = SH_C3[0] * y * (3 * xx - yy); 232 | float l3m2 = SH_C3[1] * xy * z; 233 | float l3m1 = SH_C3[2] * y * (4 * zz - xx - yy); 234 | float l3m0 = SH_C3[3] * z * (2 * zz - 3 * xx - 3 * yy); 235 | float l3p1 = SH_C3[4] * x * (4 * zz - xx - yy); 236 | float l3p2 = SH_C3[5] * z * (xx - yy); 237 | float l3p3 = SH_C3[6] * x * (xx - 3 * yy); 238 | 239 | float dl3m3_dx = SH_C3[0] * y * 6 * x; 240 | float dl3m3_dy = SH_C3[0] * (3 * xx - 3 * yy); 241 | float dl3m2_dx = SH_C3[1] * yz; 242 | float dl3m2_dy = SH_C3[1] * xz; 243 | float dl3m2_dz = SH_C3[1] * xy; 244 | float dl3m1_dx = -SH_C3[2] * y * 2 * x; 245 | float dl3m1_dy = SH_C3[2] * (4 * zz - xx - 3 * yy); 246 | float dl3m1_dz = SH_C3[2] * y * 8 * z; 247 | float dl3m0_dx = -SH_C3[3] * z * 6 * x; 248 | float dl3m0_dy = -SH_C3[3] * z * 6 * y; 249 | float dl3m0_dz = SH_C3[3] * (6 * zz - 3 * xx - 3 * yy); 250 | float dl3p1_dx = SH_C3[4] * (4 * zz - 3 * xx - yy); 251 | float dl3p1_dy = -SH_C3[4] * x * 2 * y; 252 | float dl3p1_dz = SH_C3[4] * x * 8 * z; 253 | float dl3p2_dx = SH_C3[5] * z * 2 * x; 254 | float dl3p2_dy = -SH_C3[5] * z * 2 * y; 255 | float dl3p2_dz = SH_C3[5] * (xx - yy); 256 | float dl3p3_dx = SH_C3[6] * (3 * xx - 3 * yy); 257 | float dl3p3_dy = -SH_C3[6] * x * 6 * y; 258 | 259 | dL_dsh[9] = l3m3 * dL_dRGB; 260 | dL_dsh[10] = l3m2 * dL_dRGB; 261 | dL_dsh[11] = l3m1 * dL_dRGB; 262 | dL_dsh[12] = l3m0 * dL_dRGB; 263 | dL_dsh[13] = l3p1 * dL_dRGB; 264 | dL_dsh[14] = l3p2 * dL_dRGB; 265 | dL_dsh[15] = l3p3 * dL_dRGB; 266 | 267 | dRGBdx += ( 268 | dl3m3_dx * sh[9] + 269 | dl3m2_dx * sh[10] + 270 | dl3m1_dx * sh[11] + 271 | dl3m0_dx * sh[12] + 272 | dl3p1_dx * sh[13] + 273 | dl3p2_dx * sh[14] + 274 | dl3p3_dx * sh[15] 275 | ); 276 | 277 | dRGBdy += ( 278 | dl3m3_dy * sh[9] + 279 | dl3m2_dy * sh[10] + 280 | dl3m1_dy * sh[11] + 281 | dl3m0_dy * sh[12] + 282 | dl3p1_dy * sh[13] + 283 | dl3p2_dy * sh[14] + 284 | dl3p3_dy * sh[15] 285 | ); 286 | 287 | dRGBdz += ( 288 | dl3m2_dz * sh[10] + 289 | dl3m1_dz * sh[11] + 290 | dl3m0_dz * sh[12] + 291 | dl3p1_dz * sh[13] + 292 | dl3p2_dz * sh[14] 293 | ); 294 | 295 | if (deg_t > 0){ 296 | float t1 = cos(2 * MY_PI * dir_t / time_duration); 297 | float dt1_dt = sin(2 * MY_PI * dir_t / time_duration) * 2 * MY_PI / time_duration; 298 | 299 | dL_dsh[16] = t1 * l0m0 * dL_dRGB; 300 | dL_dsh[17] = t1 * l1m1 * dL_dRGB; 301 | dL_dsh[18] = t1 * l1m0 * dL_dRGB; 302 | dL_dsh[19] = t1 * l1p1 * dL_dRGB; 303 | dL_dsh[20] = t1 * l2m2 * dL_dRGB; 304 | dL_dsh[21] = t1 * l2m1 * dL_dRGB; 305 | dL_dsh[22] = t1 * l2m0 * dL_dRGB; 306 | dL_dsh[23] = t1 * l2p1 * dL_dRGB; 307 | dL_dsh[24] = t1 * l2p2 * dL_dRGB; 308 | dL_dsh[25] = t1 * l3m3 * dL_dRGB; 309 | dL_dsh[26] = t1 * l3m2 * dL_dRGB; 310 | dL_dsh[27] = t1 * l3m1 * dL_dRGB; 311 | dL_dsh[28] = t1 * l3m0 * dL_dRGB; 312 | dL_dsh[29] = t1 * l3p1 * dL_dRGB; 313 | dL_dsh[30] = t1 * l3p2 * dL_dRGB; 314 | dL_dsh[31] = t1 * l3p3 * dL_dRGB; 315 | 316 | 317 | dRGBdt += dt1_dt * ( 318 | l0m0 * sh[16] + 319 | l1m1 * sh[17] + 320 | l1m0 * sh[18] + 321 | l1p1 * sh[19] + 322 | l2m2 * sh[20] + 323 | l2m1 * sh[21] + 324 | l2m0 * sh[22] + 325 | l2p1 * sh[23] + 326 | l2p2 * sh[24] + 327 | l3m3 * sh[25] + 328 | l3m2 * sh[26] + 329 | l3m1 * sh[27] + 330 | l3m0 * sh[28] + 331 | l3p1 * sh[29] + 332 | l3p2 * sh[30] + 333 | l3p3 * sh[31]); 334 | 335 | dRGBdx += t1 * ( 336 | dl1p1_dx * sh[19] + 337 | dl2m2_dx * sh[20] + 338 | dl2m0_dx * sh[22] + 339 | dl2p1_dx * sh[23] + 340 | 
dl2p2_dx * sh[24] + 341 | dl3m3_dx * sh[25] + 342 | dl3m2_dx * sh[26] + 343 | dl3m1_dx * sh[27] + 344 | dl3m0_dx * sh[28] + 345 | dl3p1_dx * sh[29] + 346 | dl3p2_dx * sh[30] + 347 | dl3p3_dx * sh[31] 348 | ); 349 | 350 | dRGBdy += t1 * ( 351 | dl1m1_dy * sh[17] + 352 | dl2m2_dy * sh[20] + 353 | dl2m1_dy * sh[21] + 354 | dl2m0_dy * sh[22] + 355 | dl2p2_dy * sh[24] + 356 | dl3m3_dy * sh[25] + 357 | dl3m2_dy * sh[26] + 358 | dl3m1_dy * sh[27] + 359 | dl3m0_dy * sh[28] + 360 | dl3p1_dy * sh[29] + 361 | dl3p2_dy * sh[30] + 362 | dl3p3_dy * sh[31] 363 | ); 364 | 365 | dRGBdz += t1 * ( 366 | dl1m0_dz * sh[18] + 367 | dl2m1_dz * sh[21] + 368 | dl2m0_dz * sh[22] + 369 | dl2p1_dz * sh[23] + 370 | dl3m2_dz * sh[26] + 371 | dl3m1_dz * sh[27] + 372 | dl3m0_dz * sh[28] + 373 | dl3p1_dz * sh[29] + 374 | dl3p2_dz * sh[30] 375 | ); 376 | 377 | if (deg_t > 1){ 378 | float t2 = cos(2 * MY_PI * dir_t * 2 / time_duration); 379 | float dt2_dt = sin(2 * MY_PI * dir_t * 2 / time_duration) * 2 * MY_PI * 2 / time_duration; 380 | 381 | 382 | dL_dsh[32] = t2 * l0m0 * dL_dRGB; 383 | dL_dsh[33] = t2 * l1m1 * dL_dRGB; 384 | dL_dsh[34] = t2 * l1m0 * dL_dRGB; 385 | dL_dsh[35] = t2 * l1p1 * dL_dRGB; 386 | dL_dsh[36] = t2 * l2m2 * dL_dRGB; 387 | dL_dsh[37] = t2 * l2m1 * dL_dRGB; 388 | dL_dsh[38] = t2 * l2m0 * dL_dRGB; 389 | dL_dsh[39] = t2 * l2p1 * dL_dRGB; 390 | dL_dsh[40] = t2 * l2p2 * dL_dRGB; 391 | dL_dsh[41] = t2 * l3m3 * dL_dRGB; 392 | dL_dsh[42] = t2 * l3m2 * dL_dRGB; 393 | dL_dsh[43] = t2 * l3m1 * dL_dRGB; 394 | dL_dsh[44] = t2 * l3m0 * dL_dRGB; 395 | dL_dsh[45] = t2 * l3p1 * dL_dRGB; 396 | dL_dsh[46] = t2 * l3p2 * dL_dRGB; 397 | dL_dsh[47] = t2 * l3p3 * dL_dRGB; 398 | 399 | dRGBdt += dt2_dt * ( 400 | l0m0 * sh[32] + 401 | l1m1 * sh[33] + 402 | l1m0 * sh[34] + 403 | l1p1 * sh[35] + 404 | l2m2 * sh[36] + 405 | l2m1 * sh[37] + 406 | l2m0 * sh[38] + 407 | l2p1 * sh[39] + 408 | l2p2 * sh[40] + 409 | l3m3 * sh[41] + 410 | l3m2 * sh[42] + 411 | l3m1 * sh[43] + 412 | l3m0 * sh[44] + 413 | l3p1 * sh[45] + 414 | l3p2 * sh[46] + 415 | l3p3 * sh[47]); 416 | 417 | dRGBdx += t2 * ( 418 | dl1p1_dx * sh[35] + 419 | dl2m2_dx * sh[36] + 420 | dl2m0_dx * sh[38] + 421 | dl2p1_dx * sh[39] + 422 | dl2p2_dx * sh[40] + 423 | dl3m3_dx * sh[41] + 424 | dl3m2_dx * sh[42] + 425 | dl3m1_dx * sh[43] + 426 | dl3m0_dx * sh[44] + 427 | dl3p1_dx * sh[45] + 428 | dl3p2_dx * sh[46] + 429 | dl3p3_dx * sh[47] 430 | ); 431 | 432 | dRGBdy += t2 * ( 433 | dl1m1_dy * sh[33] + 434 | dl2m2_dy * sh[36] + 435 | dl2m1_dy * sh[37] + 436 | dl2m0_dy * sh[38] + 437 | dl2p2_dy * sh[40] + 438 | dl3m3_dy * sh[41] + 439 | dl3m2_dy * sh[42] + 440 | dl3m1_dy * sh[43] + 441 | dl3m0_dy * sh[44] + 442 | dl3p1_dy * sh[45] + 443 | dl3p2_dy * sh[46] + 444 | dl3p3_dy * sh[47] 445 | ); 446 | 447 | dRGBdz += t2 * ( 448 | dl1m0_dz * sh[34] + 449 | dl2m1_dz * sh[37] + 450 | dl2m0_dz * sh[38] + 451 | dl2p1_dz * sh[39] + 452 | dl3m2_dz * sh[42] + 453 | dl3m1_dz * sh[43] + 454 | dl3m0_dz * sh[44] + 455 | dl3p1_dz * sh[45] + 456 | dl3p2_dz * sh[46] 457 | ); 458 | } 459 | } 460 | } 461 | } 462 | } 463 | 464 | // The view direction is an input to the computation. View direction 465 | // is influenced by the Gaussian's mean, so SHs gradients 466 | // must propagate back into 3D position. 467 | dL_ddir[idx].x = glm::dot(dRGBdx, dL_dRGB); 468 | dL_ddir[idx].y = glm::dot(dRGBdy, dL_dRGB); 469 | dL_ddir[idx].z = glm::dot(dRGBdz, dL_dRGB); 470 | 471 | // Gradients of loss w.r.t. Gaussian means, but only the portion 472 | // that is caused because the mean affects the view-dependent color. 
473 | // Analogously, the gradient w.r.t. the time input dir_t of the 4D SH basis: 474 | dL_ddir_t[idx] = -glm::dot(dRGBdt, dL_dRGB); 475 | } 476 | 477 | __global__ void computeSH4DBackwardCUDA(int P, 478 | int deg, int deg_t, int max_coeffs, 479 | const float* sh, const glm::vec3* dir, const float* dir_t, const float time_duration, 480 | const glm::vec3* dL_drgb, float* dL_dsh, glm::vec3* dL_ddir, float* dL_ddir_t) 481 | { 482 | auto idx = cg::this_grid().thread_rank(); 483 | if (idx >= P) 484 | return; 485 | computeColorFromSH_4D( 486 | idx, 487 | deg, 488 | deg_t, 489 | max_coeffs, 490 | sh, 491 | dir, 492 | dir_t, 493 | time_duration, 494 | dL_drgb, 495 | dL_dsh, 496 | dL_ddir, 497 | dL_ddir_t 498 | ); 499 | } 500 | 501 | 502 | void BACKWARD::computeSH4DBackward( 503 | int P, 504 | int deg, int deg_t, int max_coeffs, 505 | const float* sh, 506 | const glm::vec3* dir, 507 | const float* dir_t, 508 | const float time_duration, 509 | const glm::vec3* dL_drgb, 510 | float* dL_dsh, 511 | glm::vec3* dL_ddir, 512 | float* dL_ddir_t 513 | ) 514 | { 515 | computeSH4DBackwardCUDA << <(P + 255) / 256, 256 >> > ( 516 | P, 517 | deg, 518 | deg_t, 519 | max_coeffs, 520 | sh, 521 | dir, 522 | dir_t, 523 | time_duration, 524 | dL_drgb, 525 | dL_dsh, 526 | dL_ddir, 527 | dL_ddir_t 528 | ); 529 | } 530 | 531 | // Backward version of INVERSE 2D covariance matrix computation 532 | // (due to length launched as separate kernel before other 533 | // backward steps contained in preprocess) 534 | __global__ void computeCov2DCUDA(int P, 535 | const float3* means, 536 | const int* radii, 537 | const float* cov3Ds, 538 | const float h_x, float h_y, 539 | const float tan_fovx, float tan_fovy, 540 | const float* view_matrix, 541 | const float* dL_dconics, 542 | float3* dL_dmeans, 543 | float* dL_dcov) 544 | { 545 | auto idx = cg::this_grid().thread_rank(); 546 | if (idx >= P || !(radii[idx] > 0)) 547 | return; 548 | 549 | // Reading location of 3D covariance for this Gaussian 550 | const float* cov3D = cov3Ds + 6 * idx; 551 | 552 | // Fetch gradients, recompute 2D covariance and relevant 553 | // intermediate forward results needed in the backward. 554 | float3 mean = means[idx]; 555 | float3 dL_dconic = { dL_dconics[4 * idx], dL_dconics[4 * idx + 1], dL_dconics[4 * idx + 3] }; 556 | float3 t = transformPoint4x3(mean, view_matrix); 557 | 558 | const float limx = 1.3f * tan_fovx; 559 | const float limy = 1.3f * tan_fovy; 560 | const float txtz = t.x / t.z; 561 | const float tytz = t.y / t.z; 562 | t.x = min(limx, max(-limx, txtz)) * t.z; 563 | t.y = min(limy, max(-limy, tytz)) * t.z; 564 | 565 | const float x_grad_mul = txtz < -limx || txtz > limx ? 0 : 1; 566 | const float y_grad_mul = tytz < -limy || tytz > limy ? 0 : 1; 567 | 568 | glm::mat3 J = glm::mat3(h_x / t.z, 0.0f, -(h_x * t.x) / (t.z * t.z), 569 | 0.0f, h_y / t.z, -(h_y * t.y) / (t.z * t.z), 570 | 0, 0, 0); 571 | 572 | glm::mat3 W = glm::mat3( 573 | view_matrix[0], view_matrix[4], view_matrix[8], 574 | view_matrix[1], view_matrix[5], view_matrix[9], 575 | view_matrix[2], view_matrix[6], view_matrix[10]); 576 | 577 | glm::mat3 Vrk = glm::mat3( 578 | cov3D[0], cov3D[1], cov3D[2], 579 | cov3D[1], cov3D[3], cov3D[4], 580 | cov3D[2], cov3D[4], cov3D[5]); 581 | 582 | glm::mat3 T = W * J; 583 | 584 | glm::mat3 cov2D = glm::transpose(T) * glm::transpose(Vrk) * T; 585 | 586 | // Use helper variables for 2D covariance entries. More compact.
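// (The 0.3f added to the diagonal below replays the forward pass's
// screen-space dilation of the 2D covariance: it keeps the matrix
// comfortably invertible and ensures each splat covers at least about
// a pixel, so the same offset must be applied when re-deriving
// a, b and c here.)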
587 | float a = cov2D[0][0] += 0.3f; 588 | float b = cov2D[0][1]; 589 | float c = cov2D[1][1] += 0.3f; 590 | 591 | float denom = a * c - b * b; 592 | float dL_da = 0, dL_db = 0, dL_dc = 0; 593 | float denom2inv = 1.0f / ((denom * denom) + 0.0000001f); 594 | 595 | if (denom2inv != 0) 596 | { 597 | // Gradients of loss w.r.t. entries of 2D covariance matrix, 598 | // given gradients of loss w.r.t. conic matrix (inverse covariance matrix). 599 | // e.g., dL / da = dL / d_conic_a * d_conic_a / d_a 600 | dL_da = denom2inv * (-c * c * dL_dconic.x + 2 * b * c * dL_dconic.y + (denom - a * c) * dL_dconic.z); 601 | dL_dc = denom2inv * (-a * a * dL_dconic.z + 2 * a * b * dL_dconic.y + (denom - a * c) * dL_dconic.x); 602 | dL_db = denom2inv * 2 * (b * c * dL_dconic.x - (denom + 2 * b * b) * dL_dconic.y + a * b * dL_dconic.z); 603 | 604 | // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, 605 | // given gradients w.r.t. 2D covariance matrix (diagonal). 606 | // cov2D = transpose(T) * transpose(Vrk) * T; 607 | dL_dcov[6 * idx + 0] = (T[0][0] * T[0][0] * dL_da + T[0][0] * T[1][0] * dL_db + T[1][0] * T[1][0] * dL_dc); 608 | dL_dcov[6 * idx + 3] = (T[0][1] * T[0][1] * dL_da + T[0][1] * T[1][1] * dL_db + T[1][1] * T[1][1] * dL_dc); 609 | dL_dcov[6 * idx + 5] = (T[0][2] * T[0][2] * dL_da + T[0][2] * T[1][2] * dL_db + T[1][2] * T[1][2] * dL_dc); 610 | 611 | // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, 612 | // given gradients w.r.t. 2D covariance matrix (off-diagonal). 613 | // Off-diagonal elements appear twice --> double the gradient. 614 | // cov2D = transpose(T) * transpose(Vrk) * T; 615 | dL_dcov[6 * idx + 1] = 2 * T[0][0] * T[0][1] * dL_da + (T[0][0] * T[1][1] + T[0][1] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][1] * dL_dc; 616 | dL_dcov[6 * idx + 2] = 2 * T[0][0] * T[0][2] * dL_da + (T[0][0] * T[1][2] + T[0][2] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][2] * dL_dc; 617 | dL_dcov[6 * idx + 4] = 2 * T[0][2] * T[0][1] * dL_da + (T[0][1] * T[1][2] + T[0][2] * T[1][1]) * dL_db + 2 * T[1][1] * T[1][2] * dL_dc; 618 | } 619 | else 620 | { 621 | for (int i = 0; i < 6; i++) 622 | dL_dcov[6 * idx + i] = 0; 623 | } 624 | 625 | // Gradients of loss w.r.t. upper 2x3 portion of intermediate matrix T 626 | // cov2D = transpose(T) * transpose(Vrk) * T; 627 | float dL_dT00 = 2 * (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_da + 628 | (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_db; 629 | float dL_dT01 = 2 * (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_da + 630 | (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_db; 631 | float dL_dT02 = 2 * (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_da + 632 | (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_db; 633 | float dL_dT10 = 2 * (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_dc + 634 | (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_db; 635 | float dL_dT11 = 2 * (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_dc + 636 | (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_db; 637 | float dL_dT12 = 2 * (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_dc + 638 | (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_db; 639 | 640 | // Gradients of loss w.r.t. 
upper 3x2 non-zero entries of Jacobian matrix 641 | // T = W * J 642 | float dL_dJ00 = W[0][0] * dL_dT00 + W[0][1] * dL_dT01 + W[0][2] * dL_dT02; 643 | float dL_dJ02 = W[2][0] * dL_dT00 + W[2][1] * dL_dT01 + W[2][2] * dL_dT02; 644 | float dL_dJ11 = W[1][0] * dL_dT10 + W[1][1] * dL_dT11 + W[1][2] * dL_dT12; 645 | float dL_dJ12 = W[2][0] * dL_dT10 + W[2][1] * dL_dT11 + W[2][2] * dL_dT12; 646 | 647 | float tz = 1.f / t.z; 648 | float tz2 = tz * tz; 649 | float tz3 = tz2 * tz; 650 | 651 | // Gradients of loss w.r.t. transformed Gaussian mean t 652 | float dL_dtx = x_grad_mul * -h_x * tz2 * dL_dJ02; 653 | float dL_dty = y_grad_mul * -h_y * tz2 * dL_dJ12; 654 | float dL_dtz = -h_x * tz2 * dL_dJ00 - h_y * tz2 * dL_dJ11 + (2 * h_x * t.x) * tz3 * dL_dJ02 + (2 * h_y * t.y) * tz3 * dL_dJ12; 655 | 656 | // Account for transformation of mean to t 657 | // t = transformPoint4x3(mean, view_matrix); 658 | float3 dL_dmean = transformVec4x3Transpose({ dL_dtx, dL_dty, dL_dtz }, view_matrix); 659 | 660 | // Gradients of loss w.r.t. Gaussian means, but only the portion 661 | // that is caused because the mean affects the covariance matrix. 662 | // Additional mean gradient is accumulated in BACKWARD::preprocess. 663 | dL_dmeans[idx] = dL_dmean; 664 | } 665 | 666 | // Backward pass for the conversion of scale and rotation to a 667 | // 3D covariance matrix for each Gaussian. 668 | __device__ void computeCov3D(int idx, const glm::vec3 scale, float mod, const glm::vec4 rot, const float* dL_dcov, glm::vec3* dL_dscales, glm::vec4* dL_drots) 669 | { 670 | // Recompute (intermediate) results for the 3D covariance computation. 671 | glm::vec4 q = rot;// / glm::length(rot); 672 | float r = q.x; 673 | float x = q.y; 674 | float y = q.z; 675 | float z = q.w; 676 | 677 | glm::mat3 R = glm::mat3( 678 | 1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y), 679 | 2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x), 680 | 2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y) 681 | ); 682 | 683 | glm::mat3 S = glm::mat3(1.0f); 684 | 685 | glm::vec3 s = mod * scale; 686 | S[0][0] = s.x; 687 | S[1][1] = s.y; 688 | S[2][2] = s.z; 689 | 690 | glm::mat3 M = S * R; 691 | 692 | const float* dL_dcov3D = dL_dcov + 6 * idx; 693 | 694 | glm::vec3 dunc(dL_dcov3D[0], dL_dcov3D[3], dL_dcov3D[5]); 695 | glm::vec3 ounc = 0.5f * glm::vec3(dL_dcov3D[1], dL_dcov3D[2], dL_dcov3D[4]); 696 | 697 | // Convert per-element covariance loss gradients to matrix form 698 | glm::mat3 dL_dSigma = glm::mat3( 699 | dL_dcov3D[0], 0.5f * dL_dcov3D[1], 0.5f * dL_dcov3D[2], 700 | 0.5f * dL_dcov3D[1], dL_dcov3D[3], 0.5f * dL_dcov3D[4], 701 | 0.5f * dL_dcov3D[2], 0.5f * dL_dcov3D[4], dL_dcov3D[5] 702 | ); 703 | 704 | // Compute loss gradient w.r.t. matrix M 705 | // dSigma_dM = 2 * M 706 | glm::mat3 dL_dM = 2.0f * M * dL_dSigma; 707 | 708 | glm::mat3 Rt = glm::transpose(R); 709 | glm::mat3 dL_dMt = glm::transpose(dL_dM); 710 | 711 | // Gradients of loss w.r.t. scale 712 | glm::vec3* dL_dscale = dL_dscales + idx; 713 | dL_dscale->x = glm::dot(Rt[0], dL_dMt[0]); 714 | dL_dscale->y = glm::dot(Rt[1], dL_dMt[1]); 715 | dL_dscale->z = glm::dot(Rt[2], dL_dMt[2]); 716 | 717 | dL_dMt[0] *= s.x; 718 | dL_dMt[1] *= s.y; 719 | dL_dMt[2] *= s.z; 720 | 721 | // Gradients of loss w.r.t. 
normalized quaternion 722 | glm::vec4 dL_dq; 723 | dL_dq.x = 2 * z * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * y * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * x * (dL_dMt[1][2] - dL_dMt[2][1]); 724 | dL_dq.y = 2 * y * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * z * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * r * (dL_dMt[1][2] - dL_dMt[2][1]) - 4 * x * (dL_dMt[2][2] + dL_dMt[1][1]); 725 | dL_dq.z = 2 * x * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * r * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * z * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * y * (dL_dMt[2][2] + dL_dMt[0][0]); 726 | dL_dq.w = 2 * r * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * x * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * y * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * z * (dL_dMt[1][1] + dL_dMt[0][0]); 727 | 728 | // Gradients of loss w.r.t. unnormalized quaternion 729 | float4* dL_drot = (float4*)(dL_drots + idx); 730 | *dL_drot = float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w };//dnormvdv(float4{ rot.x, rot.y, rot.z, rot.w }, float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w }); 731 | } 732 | 733 | 734 | __global__ void computeCov3DBackwardCUDA(int P, 735 | const glm::vec3* scaling_xyz, 736 | const glm::vec4* rotation_l, 737 | const float* dL_dcov, 738 | glm::vec3* dL_dscaling_xyz, 739 | glm::vec4* dL_drotation_l) 740 | { 741 | auto idx = cg::this_grid().thread_rank(); 742 | if (idx >= P) 743 | return; 744 | computeCov3D( 745 | idx, 746 | scaling_xyz[idx], 747 | 1.0f, 748 | rotation_l[idx], 749 | // dL_dcov + idx * 6, 750 | // dL_dscaling_xyz + idx, 751 | // dL_drotation_l + idx); 752 | dL_dcov, 753 | dL_dscaling_xyz, 754 | dL_drotation_l); 755 | } 756 | 757 | void BACKWARD::computeCov3DBackward( 758 | int P, 759 | const glm::vec3* scaling_xyz, 760 | const glm::vec4* rotation_l, 761 | const float* dL_dcov, 762 | glm::vec3* dL_dscaling_xyz, 763 | glm::vec4* dL_drotation_l) 764 | { 765 | computeCov3DBackwardCUDA << <(P + 255) / 256, 256 >> > ( 766 | P, 767 | scaling_xyz, 768 | rotation_l, 769 | dL_dcov, 770 | dL_dscaling_xyz, 771 | dL_drotation_l); 772 | } 773 | 774 | // Backward pass for the conversion of scale and rotation to a 775 | // 4D covariance matrix for each Gaussian.
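// Forward quantities re-derived below: R = M_r * M_l is a 4D rotation
// built from the two normalized quaternions rotation_l and rotation_r,
// M = S * R, and Sigma = M^T * M. cov_t = Sigma[3][3] is the temporal
// variance; conditioning the 4D Gaussian on time yields the 3D covariance
// cov11 - cov12 * cov12^T / cov_t and the mean-shift (velocity) term
// ms = cov12 / cov_t, whose incoming gradients arrive here as dL_dcov,
// dL_dms and dL_dcov_t.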
776 | __device__ void computeCov4DBackward( 777 | const glm::vec4 scaling_xyzt, 778 | const glm::vec4 rotation_l, 779 | const glm::vec4 rotation_r, 780 | const float* dL_dcov, 781 | const glm::vec3 dL_dms, 782 | const float dL_dcov_t, 783 | glm::vec4 &dL_dscaling_xyzt, 784 | glm::vec4 &dL_drotation_l, 785 | glm::vec4 &dL_drotation_r) 786 | { 787 | glm::mat4 S = glm::mat4(1.0f); 788 | S[0][0] = scaling_xyzt.x; 789 | S[1][1] = scaling_xyzt.y; 790 | S[2][2] = scaling_xyzt.z; 791 | S[3][3] = scaling_xyzt.w; 792 | 793 | const float l_l = glm::length(rotation_l); 794 | const float a = rotation_l.x / l_l; 795 | const float b = rotation_l.y / l_l; 796 | const float c = rotation_l.z / l_l; 797 | const float d = rotation_l.w / l_l; 798 | 799 | const float l_r = glm::length(rotation_r); 800 | const float p = rotation_r.x / l_r; 801 | const float q = rotation_r.y / l_r; 802 | const float r = rotation_r.z / l_r; 803 | const float s = rotation_r.w / l_r; 804 | 805 | glm::mat4 M_l = glm::mat4( 806 | a, -b, -c, -d, 807 | b, a,-d, c, 808 | c, d, a,-b, 809 | d,-c, b, a 810 | ); 811 | 812 | glm::mat4 M_r = glm::mat4( 813 | p, q, r, s, 814 | -q, p,-s, r, 815 | -r, s, p,-q, 816 | -s,-r, q, p 817 | ); 818 | // glm stores in column major 819 | glm::mat4 R = M_r * M_l; 820 | glm::mat4 M = S * R; 821 | glm::mat4 Sigma = glm::transpose(M) * M; 822 | float cov_t = Sigma[3][3]; 823 | 824 | glm::mat3 cov11 = glm::mat3(Sigma); 825 | glm::vec3 cov12 = glm::vec3(Sigma[0][3], Sigma[1][3], Sigma[2][3]); 826 | 827 | glm::vec3 dL_dcov12 = -glm::vec3( 828 | dL_dcov[0] * cov12[0] + dL_dcov[1] * cov12[1] * 0.5 + dL_dcov[2] * cov12[2] * 0.5, 829 | dL_dcov[1] * cov12[0] * 0.5 + dL_dcov[3] * cov12[1] + dL_dcov[4] * cov12[2] * 0.5, 830 | dL_dcov[2] * cov12[0] * 0.5 + dL_dcov[4] * cov12[1] * 0.5 + dL_dcov[5] * cov12[2] 831 | ) * 2.0f / cov_t; 832 | 833 | dL_dcov12 += dL_dms / cov_t; 834 | 835 | float dL_dcov_t_w_ms_cov = dL_dcov_t; 836 | float dL_dms_dot_cov12 = glm::dot(dL_dms, cov12); 837 | dL_dcov_t_w_ms_cov += -dL_dms_dot_cov12 / (cov_t * cov_t); 838 | dL_dcov_t_w_ms_cov += ( 839 | cov12[0] * cov12[0] * dL_dcov[0] + cov12[0] * cov12[1] * dL_dcov[1] + 840 | cov12[0] * cov12[2] * dL_dcov[2] + cov12[1] * cov12[1] * dL_dcov[3] + 841 | cov12[1] * cov12[2] * dL_dcov[4] + cov12[2] * cov12[2] * dL_dcov[5] 842 | ) / (cov_t * cov_t); 843 | 844 | glm::mat4 dL_dSigma = glm::mat4( 845 | dL_dcov[0], 0.5f * dL_dcov[1], 0.5f * dL_dcov[2], 0.5f * dL_dcov12[0], 846 | 0.5f * dL_dcov[1], dL_dcov[3], 0.5f * dL_dcov[4], 0.5f * dL_dcov12[1], 847 | 0.5f * dL_dcov[2], 0.5f * dL_dcov[4], dL_dcov[5], 0.5f * dL_dcov12[2], 848 | 0.5f * dL_dcov12[0], 0.5f * dL_dcov12[1], 0.5f * dL_dcov12[2], dL_dcov_t_w_ms_cov 849 | ); 850 | // Compute loss gradient w.r.t. matrix M 851 | // dSigma_dM = 2 * M 852 | glm::mat4 dL_dM = 2.0f * M * dL_dSigma; 853 | 854 | glm::mat4 Rt = glm::transpose(R); 855 | glm::mat4 dL_dMt = glm::transpose(dL_dM); 856 | 857 | // Gradients of loss w.r.t. 
scale 858 | dL_dscaling_xyzt.x = glm::dot(Rt[0], dL_dMt[0]); 859 | dL_dscaling_xyzt.y = glm::dot(Rt[1], dL_dMt[1]); 860 | dL_dscaling_xyzt.z = glm::dot(Rt[2], dL_dMt[2]); 861 | dL_dscaling_xyzt.w = glm::dot(Rt[3], dL_dMt[3]); 862 | 863 | dL_dMt[0] *= scaling_xyzt.x; 864 | dL_dMt[1] *= scaling_xyzt.y; 865 | dL_dMt[2] *= scaling_xyzt.z; 866 | dL_dMt[3] *= scaling_xyzt.w; 867 | 868 | glm::mat4 dL_dml_t = dL_dMt * M_r; 869 | glm::vec4 dL_drot_l; 870 | dL_drot_l.x = dL_dml_t[0][0] + dL_dml_t[1][1] + dL_dml_t[2][2] + dL_dml_t[3][3]; 871 | dL_drot_l.y = dL_dml_t[0][1] - dL_dml_t[1][0] + dL_dml_t[2][3] - dL_dml_t[3][2]; 872 | dL_drot_l.z = dL_dml_t[0][2] - dL_dml_t[1][3] - dL_dml_t[2][0] + dL_dml_t[3][1]; 873 | dL_drot_l.w = dL_dml_t[0][3] + dL_dml_t[1][2] - dL_dml_t[2][1] - dL_dml_t[3][0]; 874 | 875 | glm::mat4 dL_dmr_t = M_l * dL_dMt; 876 | glm::vec4 dL_drot_r; 877 | dL_drot_r.x = dL_dmr_t[0][0] + dL_dmr_t[1][1] + dL_dmr_t[2][2] + dL_dmr_t[3][3]; 878 | dL_drot_r.y = -dL_dmr_t[0][1] + dL_dmr_t[1][0] + dL_dmr_t[2][3] - dL_dmr_t[3][2]; 879 | dL_drot_r.z = -dL_dmr_t[0][2] - dL_dmr_t[1][3] + dL_dmr_t[2][0] + dL_dmr_t[3][1]; 880 | dL_drot_r.w = -dL_dmr_t[0][3] + dL_dmr_t[1][2] - dL_dmr_t[2][1] + dL_dmr_t[3][0]; 881 | 882 | float4 dL_drotation_l_f = dnormvdv(float4{rotation_l.x, rotation_l.y, rotation_l.z, rotation_l.w}, float4{dL_drot_l.x, dL_drot_l.y, dL_drot_l.z, dL_drot_l.w}); 883 | float4 dL_drotation_r_f = dnormvdv(float4{rotation_r.x, rotation_r.y, rotation_r.z, rotation_r.w}, float4{dL_drot_r.x, dL_drot_r.y, dL_drot_r.z, dL_drot_r.w}); 884 | dL_drotation_l.x = dL_drotation_l_f.x; 885 | dL_drotation_l.y = dL_drotation_l_f.y; 886 | dL_drotation_l.z = dL_drotation_l_f.z; 887 | dL_drotation_l.w = dL_drotation_l_f.w; 888 | dL_drotation_r.x = dL_drotation_r_f.x; 889 | dL_drotation_r.y = dL_drotation_r_f.y; 890 | dL_drotation_r.z = dL_drotation_r_f.z; 891 | dL_drotation_r.w = dL_drotation_r_f.w; 892 | } 893 | 894 | __global__ void computeCov4DBackwardCUDA(int P, 895 | const glm::vec4* scaling_xyzt, 896 | const glm::vec4* rotation_l, 897 | const glm::vec4* rotation_r, 898 | const float* dL_dcov, 899 | const glm::vec3* dL_dms, 900 | const float* dL_dcov_t, 901 | glm::vec4* dL_dscaling_xyzt, 902 | glm::vec4* dL_drotation_l, 903 | glm::vec4* dL_drotation_r) 904 | { 905 | auto idx = cg::this_grid().thread_rank(); 906 | if (idx >= P) 907 | return; 908 | computeCov4DBackward( 909 | scaling_xyzt[idx], 910 | rotation_l[idx], 911 | rotation_r[idx], 912 | dL_dcov + idx * 6, 913 | dL_dms[idx], 914 | dL_dcov_t[idx], 915 | dL_dscaling_xyzt[idx], 916 | dL_drotation_l[idx], 917 | dL_drotation_r[idx]); 918 | } 919 | 920 | 921 | void BACKWARD::computeCov4DBackward( 922 | int P, 923 | const glm::vec4* scaling_xyzt, 924 | const glm::vec4* rotation_l, 925 | const glm::vec4* rotation_r, 926 | const float* dL_dcov, 927 | const glm::vec3* dL_dms, 928 | const float* dL_dcov_t, 929 | glm::vec4* dL_dscaling_xyzt, 930 | glm::vec4* dL_drotation_l, 931 | glm::vec4* dL_drotation_r) 932 | { 933 | computeCov4DBackwardCUDA << <(P + 255) / 256, 256 >> > ( 934 | P, 935 | scaling_xyzt, 936 | rotation_l, 937 | rotation_r, 938 | dL_dcov, 939 | dL_dms, 940 | dL_dcov_t, 941 | dL_dscaling_xyzt, 942 | dL_drotation_l, 943 | dL_drotation_r); 944 | } 945 | 946 | // Backward pass of the preprocessing steps, except 947 | // for the covariance computation and inversion 948 | // (those are handled by a previous kernel call) 949 | template <int C> 950 | __global__ void preprocessCUDA( 951 | int P, int D, int M, 952 | const float3* means, 953 | const int*
radii, 954 | const float* shs, 955 | const bool* clamped, 956 | const glm::vec3* scales, 957 | const glm::vec4* rotations, 958 | const float scale_modifier, 959 | const float* view, 960 | const float* proj, 961 | const glm::vec3* campos, 962 | const float3* dL_dmean2D, 963 | glm::vec3* dL_dmeans, 964 | float* dL_dcolor, 965 | float* dL_ddepth, 966 | float* dL_dcov3D, 967 | float* dL_dsh, 968 | glm::vec3* dL_dscale, 969 | glm::vec4* dL_drot) 970 | { 971 | auto idx = cg::this_grid().thread_rank(); 972 | if (idx >= P || !(radii[idx] > 0)) 973 | return; 974 | 975 | float3 m = means[idx]; 976 | 977 | // Taking care of gradients from the screenspace points 978 | float4 m_hom = transformPoint4x4(m, proj); 979 | float m_w = 1.0f / (m_hom.w + 0.0000001f); 980 | 981 | // Compute loss gradient w.r.t. 3D means due to gradients of 2D means 982 | // from rendering procedure 983 | glm::vec3 dL_dmean; 984 | float mul1 = (proj[0] * m.x + proj[4] * m.y + proj[8] * m.z + proj[12]) * m_w * m_w; 985 | float mul2 = (proj[1] * m.x + proj[5] * m.y + proj[9] * m.z + proj[13]) * m_w * m_w; 986 | dL_dmean.x = (proj[0] * m_w - proj[3] * mul1) * dL_dmean2D[idx].x + (proj[1] * m_w - proj[3] * mul2) * dL_dmean2D[idx].y; 987 | dL_dmean.y = (proj[4] * m_w - proj[7] * mul1) * dL_dmean2D[idx].x + (proj[5] * m_w - proj[7] * mul2) * dL_dmean2D[idx].y; 988 | dL_dmean.z = (proj[8] * m_w - proj[11] * mul1) * dL_dmean2D[idx].x + (proj[9] * m_w - proj[11] * mul2) * dL_dmean2D[idx].y; 989 | 990 | // That's the second part of the mean gradient. Previous computation 991 | // of cov2D and following SH conversion also affects it. 992 | dL_dmeans[idx] += dL_dmean; 993 | 994 | // the w must be equal to 1 for view^T * [x,y,z,1] 995 | float3 m_view = transformPoint4x3(m, view); 996 | 997 | // Compute loss gradient w.r.t. 3D means due to gradients of depth 998 | // from rendering procedure 999 | glm::vec3 dL_dmean2; 1000 | float mul3 = view[2] * m.x + view[6] * m.y + view[10] * m.z + view[14]; 1001 | dL_dmean2.x = (view[2] - view[3] * mul3) * dL_ddepth[idx]; 1002 | dL_dmean2.y = (view[6] - view[7] * mul3) * dL_ddepth[idx]; 1003 | dL_dmean2.z = (view[10] - view[11] * mul3) * dL_ddepth[idx]; 1004 | 1005 | // That's the third part of the mean gradient. 1006 | dL_dmeans[idx] += dL_dmean2; 1007 | 1008 | // Compute gradient updates due to computing colors from SHs 1009 | if (shs) 1010 | computeColorFromSH(idx, D, M, (glm::vec3*)means, *campos, shs, clamped, (glm::vec3*)dL_dcolor, (glm::vec3*)dL_dmeans, (glm::vec3*)dL_dsh); 1011 | 1012 | // Compute gradient updates due to computing covariance from scale/rotation 1013 | if (scales) 1014 | computeCov3D(idx, scales[idx], scale_modifier, rotations[idx], dL_dcov3D, dL_dscale, dL_drot); 1015 | } 1016 | 1017 | // Backward version of the rendering procedure. 
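// Each pixel thread re-walks its tile's depth-sorted Gaussian list, but
// back-to-front, rebuilding the transmittance incrementally from T_final
// via T = T / (1 - alpha); this way the forward pass does not have to
// store per-pixel, per-Gaussian blending state for the backward.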
1018 | template <uint32_t C> 1019 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y) 1020 | renderCUDA( 1021 | const uint2* __restrict__ ranges, 1022 | const uint32_t* __restrict__ point_list, 1023 | int W, int H, 1024 | const float* __restrict__ bg_color, 1025 | const float2* __restrict__ points_xy_image, 1026 | const float4* __restrict__ conic_opacity, 1027 | const float* __restrict__ colors, 1028 | const float* __restrict__ depths, 1029 | const float* __restrict__ accum_alphas, 1030 | const uint32_t* __restrict__ n_contrib, 1031 | const float* __restrict__ dL_dpixels, 1032 | const float* __restrict__ dL_dpixel_depths, 1033 | const float* __restrict__ dL_dpixel_alphas, 1034 | float3* __restrict__ dL_dmean2D, 1035 | float3* __restrict__ dL_dabsmean2D, 1036 | float4* __restrict__ dL_dconic2D, 1037 | float* __restrict__ dL_dopacity, 1038 | float* __restrict__ dL_dcolors, 1039 | float* __restrict__ dL_ddepths) 1040 | { 1041 | // We rasterize again. Compute necessary block info. 1042 | auto block = cg::this_thread_block(); 1043 | const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; 1044 | const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; 1045 | const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y, H) }; 1046 | const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; 1047 | const uint32_t pix_id = W * pix.y + pix.x; 1048 | const float2 pixf = { (float)pix.x, (float)pix.y }; 1049 | 1050 | const bool inside = pix.x < W && pix.y < H; 1051 | const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; 1052 | 1053 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); 1054 | 1055 | bool done = !inside; 1056 | int toDo = range.y - range.x; 1057 | 1058 | __shared__ int collected_id[BLOCK_SIZE]; 1059 | __shared__ float2 collected_xy[BLOCK_SIZE]; 1060 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; 1061 | __shared__ float collected_colors[C * BLOCK_SIZE]; 1062 | __shared__ float collected_depths[BLOCK_SIZE]; 1063 | 1064 | // In the forward, we stored the final value for T, the 1065 | // product of all (1 - alpha) factors. 1066 | const float T_final = inside ? (1 - accum_alphas[pix_id]) : 0; 1067 | float T = T_final; 1068 | 1069 | // We start from the back. The ID of the last contributing 1070 | // Gaussian is known from each pixel from the forward. 1071 | uint32_t contributor = toDo; 1072 | const int last_contributor = inside ? n_contrib[pix_id] : 0; 1073 | 1074 | float accum_rec[C] = { 0 }; 1075 | float accum_red = 0; 1076 | float accum_rea = 0; 1077 | float dL_dpixel[C]; 1078 | float dL_dpixel_depth; 1079 | float dL_dpixel_alpha; 1080 | if (inside) 1081 | { 1082 | for (int i = 0; i < C; i++) 1083 | dL_dpixel[i] = dL_dpixels[i * H * W + pix_id]; 1084 | dL_dpixel_depth = dL_dpixel_depths[pix_id]; 1085 | dL_dpixel_alpha = dL_dpixel_alphas[pix_id]; 1086 | } 1087 | float last_alpha = 0; 1088 | float last_color[C] = { 0 }; 1089 | float last_depth = 0; 1090 | // Gradient of pixel coordinate w.r.t. normalized 1091 | // screen-space viewport coordinates (-1 to 1) 1092 | const float ddelx_dx = 0.5 * W; 1093 | const float ddely_dy = 0.5 * H; 1094 | 1095 | // Traverse all Gaussians 1096 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) 1097 | { 1098 | // Load auxiliary data into shared memory, start in the BACK 1099 | // and load them in reverse order.
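// (The two block.sync() calls bracketing the staging step below are both
// needed: the first keeps fast threads from overwriting shared entries
// that other threads may still be reading from the previous round, the
// second makes the freshly loaded batch visible to the whole block.)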
1100 | block.sync(); 1101 | const int progress = i * BLOCK_SIZE + block.thread_rank(); 1102 | if (range.x + progress < range.y) 1103 | { 1104 | const int coll_id = point_list[range.y - progress - 1]; 1105 | collected_id[block.thread_rank()] = coll_id; 1106 | collected_xy[block.thread_rank()] = points_xy_image[coll_id]; 1107 | collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; 1108 | for (int i = 0; i < C; i++) 1109 | collected_colors[i * BLOCK_SIZE + block.thread_rank()] = colors[coll_id * C + i]; 1110 | collected_depths[block.thread_rank()] = depths[coll_id]; 1111 | } 1112 | block.sync(); 1113 | 1114 | // Iterate over Gaussians 1115 | for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) 1116 | { 1117 | // Keep track of current Gaussian ID. Skip, if this one 1118 | // is behind the last contributor for this pixel. 1119 | contributor--; 1120 | if (contributor >= last_contributor) 1121 | continue; 1122 | 1123 | // Compute blending values, as before. 1124 | const float2 xy = collected_xy[j]; 1125 | const float2 d = { xy.x - pixf.x, xy.y - pixf.y }; 1126 | const float4 con_o = collected_conic_opacity[j]; 1127 | const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 1128 | if (power > 0.0f) 1129 | continue; 1130 | 1131 | const float G = __expf(power); 1132 | const float alpha = min(0.99f, con_o.w * G); 1133 | if (alpha < 1.0f / 255.0f) 1134 | continue; 1135 | 1136 | T = T / (1.f - alpha); 1137 | const float dchannel_dcolor = alpha * T; 1138 | const float dpixel_depth_ddepth = alpha * T; 1139 | 1140 | // Propagate gradients to per-Gaussian colors and keep 1141 | // gradients w.r.t. alpha (blending factor for a Gaussian/pixel 1142 | // pair). 1143 | float dL_dalpha = 0.0f; 1144 | const int global_id = collected_id[j]; 1145 | for (int ch = 0; ch < C; ch++) 1146 | { 1147 | const float c = collected_colors[ch * BLOCK_SIZE + j]; 1148 | // Update last color (to be used in the next iteration) 1149 | accum_rec[ch] = last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch]; 1150 | last_color[ch] = c; 1151 | 1152 | const float dL_dchannel = dL_dpixel[ch]; 1153 | dL_dalpha += (c - accum_rec[ch]) * dL_dchannel; 1154 | // Update the gradients w.r.t. color of the Gaussian. 1155 | // Atomic, since this pixel is just one of potentially 1156 | // many that were affected by this Gaussian. 
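// (These scattered global atomics are a major cost of the backward pass;
// the renderCUDAShared variant further below first accumulates the same
// gradients into per-block shared buffers (s_dL_*) before committing
// them to global memory in one go, reducing global-memory contention.)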
1157 | atomicAdd(&(dL_dcolors[global_id * C + ch]), dchannel_dcolor * dL_dchannel);
1158 | }
1159 | const float dep = collected_depths[j];
1160 | accum_red = last_alpha * last_depth + (1.f - last_alpha) * accum_red;
1161 | last_depth = dep;
1162 | dL_dalpha += (dep-accum_red) * dL_dpixel_depth;
1163 | atomicAdd(&(dL_ddepths[global_id]), dpixel_depth_ddepth * dL_dpixel_depth);
1164 |
1165 | accum_rea = last_alpha + (1.f - last_alpha) * accum_rea;
1166 | dL_dalpha += (1 - accum_rea) * dL_dpixel_alpha;
1167 |
1168 |
1169 | dL_dalpha *= T;
1170 | // Update last alpha (to be used in the next iteration)
1171 | last_alpha = alpha;
1172 |
1173 | // Account for fact that alpha also influences how much of
1174 | // the background color is added if nothing left to blend
1175 | float bg_dot_dpixel = 0;
1176 | for (int i = 0; i < C; i++)
1177 | bg_dot_dpixel += bg_color[i] * dL_dpixel[i];
1178 | dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel;
1179 |
1180 | // Set background depth value == 0, thus no contribution for
1181 | // dL_dalpha
1182 |
1183 | // Helpful reusable temporary variables
1184 | const float dL_dG = con_o.w * dL_dalpha;
1185 | const float gdx = G * d.x;
1186 | const float gdy = G * d.y;
1187 | const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y;
1188 | const float dG_ddely = -gdy * con_o.z - gdx * con_o.y;
1189 | const float dL_dmean2D_x = dL_dG * dG_ddelx * ddelx_dx;
1190 | const float dL_dmean2D_y = dL_dG * dG_ddely * ddely_dy;
1191 |
1192 | // Update gradients w.r.t. 2D mean position of the Gaussian
1193 | atomicAdd(&dL_dmean2D[global_id].x, dL_dmean2D_x);
1194 | atomicAdd(&dL_dmean2D[global_id].y, dL_dmean2D_y);
1195 | // Update gradients w.r.t. the absolute 2D mean position of the Gaussian
1196 | atomicAdd(&dL_dabsmean2D[global_id].x, abs(dL_dmean2D_x));
1197 | atomicAdd(&dL_dabsmean2D[global_id].y, abs(dL_dmean2D_y));
1198 |
1199 | // Update gradients w.r.t. 2D covariance (2x2 matrix, symmetric)
1200 | atomicAdd(&dL_dconic2D[global_id].x, -0.5f * gdx * d.x * dL_dG);
1201 | atomicAdd(&dL_dconic2D[global_id].y, -0.5f * gdx * d.y * dL_dG);
1202 | atomicAdd(&dL_dconic2D[global_id].w, -0.5f * gdy * d.y * dL_dG);
1203 |
1204 | // Update gradients w.r.t. opacity of the Gaussian
1205 | atomicAdd(&(dL_dopacity[global_id]), G * dL_dalpha);
1206 | }
1207 | }
1208 | }
1209 |
1210 |
1211 | // Backward version of the rendering procedure.
1212 | template <uint32_t C>
1213 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
1214 | renderCUDAShared(
1215 | const uint2* __restrict__ ranges,
1216 | const uint32_t* __restrict__ point_list,
1217 | int W, int H,
1218 | const float* __restrict__ bg_color,
1219 | const float2* __restrict__ points_xy_image,
1220 | const float4* __restrict__ conic_opacity,
1221 | const float* __restrict__ colors,
1222 | const float* __restrict__ depths,
1223 | const float* __restrict__ accum_alphas,
1224 | const uint32_t* __restrict__ n_contrib,
1225 | const float* __restrict__ dL_dpixels,
1226 | const float* __restrict__ dL_dpixel_depths,
1227 | const float* __restrict__ dL_dpixel_alphas,
1228 | float3* __restrict__ dL_dmean2D,
1229 | float3* __restrict__ dL_dabsmean2D,
1230 | float4* __restrict__ dL_dconic2D,
1231 | float* __restrict__ dL_dopacity,
1232 | float* __restrict__ dL_dcolors,
1233 | float* __restrict__ dL_ddepths)
1234 | {
1235 | // We rasterize again. Compute necessary block info.
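// Unlike renderCUDA above, this variant stages all per-Gaussian gradients
// in shared memory (the s_dL_* buffers declared below) and flushes them to
// global memory once per round, so each Gaussian receives one global
// atomicAdd per tile rather than one per contributing pixel.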
1236 | auto block = cg::this_thread_block();
1237 | const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
1238 | const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
1239 | const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
1240 | const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
1241 | const uint32_t pix_id = W * pix.y + pix.x;
1242 | const float2 pixf = { (float)pix.x, (float)pix.y };
1243 |
1244 | const bool inside = pix.x < W&& pix.y < H;
1245 | const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
1246 |
1247 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
1248 |
1249 | bool done = !inside;
1250 | int toDo = range.y - range.x;
1251 |
1252 | __shared__ int collected_id[BLOCK_SIZE];
1253 | __shared__ float2 collected_xy[BLOCK_SIZE];
1254 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE];
1255 | __shared__ float collected_colors[C * BLOCK_SIZE];
1256 | __shared__ float collected_depths[BLOCK_SIZE];
1257 |
1258 | // Heuristic: Gaussians in a tile are likely to be updated by many threads of the same block.
1259 | // It should thus be faster to first aggregate the gradients inside this block and write them to global memory in one go.
1260 | __shared__ float3 s_dL_dmean2D[BLOCK_SIZE];
1261 | __shared__ float3 s_dL_dabsmean2D[BLOCK_SIZE];
1262 | __shared__ float4 s_dL_dconic2D[BLOCK_SIZE];
1263 | __shared__ float s_dL_dopacity[BLOCK_SIZE];
1264 | __shared__ float s_dL_dcolors[C * BLOCK_SIZE];
1265 | __shared__ float s_dL_ddepths[BLOCK_SIZE];
1266 |
1267 | // In the forward, we stored the final value for T, the
1268 | // product of all (1 - alpha) factors.
1269 | const float T_final = inside ? (1 - accum_alphas[pix_id]) : 0;
1270 | float T = T_final;
1271 |
1272 | // We start from the back. The ID of the last contributing
1273 | // Gaussian is known for each pixel from the forward.
1274 | uint32_t contributor = toDo;
1275 | const int last_contributor = inside ? n_contrib[pix_id] : 0;
1276 |
1277 | float accum_rec[C] = { 0 };
1278 | float accum_red = 0;
1279 | float accum_rea = 0;
1280 | float dL_dpixel[C];
1281 | float dL_dpixel_depth;
1282 | float dL_dpixel_alpha;
1283 | if (inside)
1284 | {
1285 | for (int i = 0; i < C; i++)
1286 | dL_dpixel[i] = dL_dpixels[i * H * W + pix_id];
1287 | dL_dpixel_depth = dL_dpixel_depths[pix_id];
1288 | dL_dpixel_alpha = dL_dpixel_alphas[pix_id];
1289 | }
1290 | float last_alpha = 0;
1291 | float last_color[C] = { 0 };
1292 | float last_depth = 0;
1293 | // Gradient of pixel coordinate w.r.t. normalized
1294 | // screen-space viewport coordinates (-1 to 1)
1295 | const float ddelx_dx = 0.5 * W;
1296 | const float ddely_dy = 0.5 * H;
1297 |
1298 | // Traverse all Gaussians
1299 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
1300 | {
1301 | // Load auxiliary data into shared memory, start in the BACK
1302 | // and load them in reverse order.
1303 | block.sync(); 1304 | const int progress = i * BLOCK_SIZE + block.thread_rank(); 1305 | if (range.x + progress < range.y) 1306 | { 1307 | const int coll_id = point_list[range.y - progress - 1]; 1308 | collected_id[block.thread_rank()] = coll_id; 1309 | collected_xy[block.thread_rank()] = points_xy_image[coll_id]; 1310 | collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; 1311 | for (int i = 0; i < C; i++) 1312 | collected_colors[i * BLOCK_SIZE + block.thread_rank()] = colors[coll_id * C + i]; 1313 | collected_depths[block.thread_rank()] = depths[coll_id]; 1314 | 1315 | // Shared gradient accumulation in this block 1316 | s_dL_dmean2D[block.thread_rank()].x = 0.0f; 1317 | s_dL_dmean2D[block.thread_rank()].y = 0.0f; 1318 | s_dL_dabsmean2D[block.thread_rank()].x = 0.0f; 1319 | s_dL_dabsmean2D[block.thread_rank()].y = 0.0f; 1320 | s_dL_dconic2D[block.thread_rank()].x = 0.0f; 1321 | s_dL_dconic2D[block.thread_rank()].y = 0.0f; 1322 | s_dL_dconic2D[block.thread_rank()].w = 0.0f; 1323 | for (int i = 0; i < C; i++) 1324 | s_dL_dcolors[i * BLOCK_SIZE + block.thread_rank()] = 0.0f; 1325 | s_dL_dopacity[block.thread_rank()] = 0.0f; 1326 | s_dL_ddepths[block.thread_rank()] = 0.0f; 1327 | } 1328 | block.sync(); 1329 | 1330 | // Iterate over Gaussians 1331 | for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) 1332 | { 1333 | // Keep track of current Gaussian ID. Skip, if this one 1334 | // is behind the last contributor for this pixel. 1335 | contributor--; 1336 | if (contributor >= last_contributor) 1337 | continue; 1338 | 1339 | // Compute blending values, as before. 1340 | const float2 xy = collected_xy[j]; 1341 | const float2 d = { xy.x - pixf.x, xy.y - pixf.y }; 1342 | const float4 con_o = collected_conic_opacity[j]; 1343 | const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 1344 | if (power > 0.0f) 1345 | continue; 1346 | 1347 | const float G = __expf(power); 1348 | const float alpha = min(0.99f, con_o.w * G); 1349 | if (alpha < 1.0f / 255.0f) 1350 | continue; 1351 | 1352 | T = T / (1.f - alpha); 1353 | const float dchannel_dcolor = alpha * T; 1354 | const float dpixel_depth_ddepth = alpha * T; 1355 | 1356 | // Propagate gradients to per-Gaussian colors and keep 1357 | // gradients w.r.t. alpha (blending factor for a Gaussian/pixel 1358 | // pair). 1359 | float dL_dalpha = 0.0f; 1360 | for (int ch = 0; ch < C; ch++) 1361 | { 1362 | const float c = collected_colors[ch * BLOCK_SIZE + j]; 1363 | // Update last color (to be used in the next iteration) 1364 | accum_rec[ch] = last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch]; 1365 | last_color[ch] = c; 1366 | 1367 | const float dL_dchannel = dL_dpixel[ch]; 1368 | dL_dalpha += (c - accum_rec[ch]) * dL_dchannel; 1369 | // Update the gradients w.r.t. color of the Gaussian. 1370 | // Atomic, since this pixel is just one of potentially 1371 | // many that were affected by this Gaussian. 
1372 | atomicAdd(&(s_dL_dcolors[ch * BLOCK_SIZE + j]), dchannel_dcolor * dL_dchannel); 1373 | } 1374 | const float dep = collected_depths[j]; 1375 | accum_red = last_alpha * last_depth + (1.f - last_alpha) * accum_red; 1376 | last_depth = dep; 1377 | dL_dalpha += (dep-accum_red) * dL_dpixel_depth; 1378 | atomicAdd(&(s_dL_ddepths[j]), dpixel_depth_ddepth * dL_dpixel_depth); 1379 | 1380 | accum_rea = last_alpha + (1.f - last_alpha) * accum_rea; 1381 | dL_dalpha += (1 - accum_rea) * dL_dpixel_alpha; 1382 | 1383 | 1384 | dL_dalpha *= T; 1385 | // Update last alpha (to be used in the next iteration) 1386 | last_alpha = alpha; 1387 | 1388 | // Account for fact that alpha also influences how much of 1389 | // the background color is added if nothing left to blend 1390 | float bg_dot_dpixel = 0; 1391 | for (int i = 0; i < C; i++) 1392 | bg_dot_dpixel += bg_color[i] * dL_dpixel[i]; 1393 | dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel; 1394 | 1395 | // Set background depth value == 0, thus no contribution for 1396 | // dL_dalpha 1397 | 1398 | // Helpful reusable temporary variables 1399 | const float dL_dG = con_o.w * dL_dalpha; 1400 | const float gdx = G * d.x; 1401 | const float gdy = G * d.y; 1402 | const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y; 1403 | const float dG_ddely = -gdy * con_o.z - gdx * con_o.y; 1404 | const float dL_dmean2D_x = dL_dG * dG_ddelx * ddelx_dx; 1405 | const float dL_dmean2D_y = dL_dG * dG_ddely * ddely_dy; 1406 | 1407 | // Update gradients w.r.t. 2D mean position of the Gaussian 1408 | atomicAdd(&s_dL_dmean2D[j].x, dL_dmean2D_x); 1409 | atomicAdd(&s_dL_dmean2D[j].y, dL_dmean2D_y); 1410 | atomicAdd(&s_dL_dabsmean2D[j].x, abs(dL_dmean2D_x)); 1411 | atomicAdd(&s_dL_dabsmean2D[j].y, abs(dL_dmean2D_y)); 1412 | 1413 | // Update gradients w.r.t. 2D covariance (2x2 matrix, symmetric) 1414 | atomicAdd(&s_dL_dconic2D[j].x, -0.5f * gdx * d.x * dL_dG); 1415 | atomicAdd(&s_dL_dconic2D[j].y, -0.5f * gdx * d.y * dL_dG); 1416 | atomicAdd(&s_dL_dconic2D[j].w, -0.5f * gdy * d.y * dL_dG); 1417 | 1418 | // Update gradients w.r.t. 
opacity of the Gaussian
1419 | atomicAdd(&(s_dL_dopacity[j]), G * dL_dalpha);
1420 | }
1421 | block.sync();
1422 |
1423 | if (range.x + progress < range.y && s_dL_dmean2D[block.thread_rank()].x != 0.0) // only flush entries whose staged gradient is not exactly zero
1424 | {
1425 | const int global_id = collected_id[block.thread_rank()];
1426 |
1427 | // Flush the block-aggregated gradients to global memory
1428 | atomicAdd(&dL_dmean2D[global_id].x, s_dL_dmean2D[block.thread_rank()].x);
1429 | atomicAdd(&dL_dmean2D[global_id].y, s_dL_dmean2D[block.thread_rank()].y);
1430 | atomicAdd(&dL_dabsmean2D[global_id].x, s_dL_dabsmean2D[block.thread_rank()].x);
1431 | atomicAdd(&dL_dabsmean2D[global_id].y, s_dL_dabsmean2D[block.thread_rank()].y);
1432 | atomicAdd(&dL_dconic2D[global_id].x, s_dL_dconic2D[block.thread_rank()].x);
1433 | atomicAdd(&dL_dconic2D[global_id].y, s_dL_dconic2D[block.thread_rank()].y);
1434 | atomicAdd(&dL_dconic2D[global_id].w, s_dL_dconic2D[block.thread_rank()].w);
1435 | for (int i = 0; i < C; i++)
1436 | atomicAdd(&(dL_dcolors[global_id * C + i]), s_dL_dcolors[i * BLOCK_SIZE + block.thread_rank()]);
1437 | atomicAdd(&(dL_dopacity[global_id]), s_dL_dopacity[block.thread_rank()]);
1438 | atomicAdd(&(dL_ddepths[global_id]), s_dL_ddepths[block.thread_rank()]);
1439 | }
1440 |
1441 | }
1442 | }
1443 |
1444 |
1445 | __device__ float warpReduceSum(float value) {
1446 | auto warp = cg::coalesced_threads();
1447 | for (int offset = warp.size() / 2; offset > 0; offset /= 2) {
1448 | value += warp.shfl_down(value, offset);
1449 | }
1450 | return value;
1451 | }
1452 |
1453 | // Backward version of the rendering procedure.
1454 | template <uint32_t C>
1455 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
1456 | renderCUDAWarp(
1457 | const uint2* __restrict__ ranges,
1458 | const uint32_t* __restrict__ point_list,
1459 | int W, int H,
1460 | const float* __restrict__ bg_color,
1461 | const float2* __restrict__ points_xy_image,
1462 | const float4* __restrict__ conic_opacity,
1463 | const float* __restrict__ colors,
1464 | const float* __restrict__ depths,
1465 | const float* __restrict__ accum_alphas,
1466 | const uint32_t* __restrict__ n_contrib,
1467 | const float* __restrict__ dL_dpixels,
1468 | const float* __restrict__ dL_dpixel_depths,
1469 | const float* __restrict__ dL_dpixel_alphas,
1470 | float3* __restrict__ dL_dmean2D,
1471 | float3* __restrict__ dL_dabsmean2D,
1472 | float4* __restrict__ dL_dconic2D,
1473 | float* __restrict__ dL_dopacity,
1474 | float* __restrict__ dL_dcolors,
1475 | float* __restrict__ dL_ddepths)
1476 | {
1477 | // We rasterize again. Compute necessary block info.
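// This variant goes one step further than renderCUDAShared: each pixel
// thread's per-Gaussian gradients are first reduced across the warp with
// the shuffle-based warpReduceSum above (the complete sum lands in lane 0),
// and only lane 0 issues global atomicAdds. A minimal usage sketch of the
// helper, assuming all 32 lanes of the warp are active:
//
//   float v = warpReduceSum(local_grad);
//   if (warp.thread_rank() == 0)
//       atomicAdd(&global_grad, v); // one atomic per warp, not per thread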
1478 | auto block = cg::this_thread_block();
1479 | auto warp = cg::coalesced_threads();
1480 |
1481 | const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
1482 | const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
1483 | const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
1484 | // By default, the thread rank would be computed as block.thread_index().x + BLOCK_X * block.thread_index().y,
1485 | // so warps would be arranged in a 2 * 16 (row * col) fashion.
1486 | // We want better locality, so thread ranks are remapped such that each warp is responsible for a smaller 4 * 8 patch.
1487 | // const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
1488 | const int tx = threadIdx.x;
1489 | const int ty = threadIdx.y;
1490 |
1491 | // Compute warp ID within the block.
1492 | // Here we divide the 16x16 block into 8 warps; each warp covers a 4x8 area.
1493 | // Warp IDs are assigned row-wise.
1494 | const int warpId = (ty / 4) * 2 + (tx / 8);
1495 | const int laneId = (ty % 4) * 8 + (tx % 8);
1496 |
1497 | // Compute the thread's position within its warp.
1498 | // Threads are linearly indexed within each warp from 0 to 31, using row-major ordering within the 4x8 block.
1499 | const int local_warp_x = (warpId % 2) * 8;
1500 | const int local_warp_y = (warpId / 2) * 4;
1501 |
1502 | const uint2 pix = {
1503 | pix_min.x + local_warp_x + (laneId % 8),
1504 | pix_min.y + local_warp_y + (laneId / 8)
1505 | };
1506 |
1507 | const uint32_t pix_id = W * pix.y + pix.x;
1508 | const float2 pixf = { (float)pix.x, (float)pix.y };
1509 |
1510 | const bool inside = pix.x < W&& pix.y < H;
1511 | const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
1512 |
1513 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
1514 |
1515 | bool done = !inside;
1516 | int toDo = range.y - range.x;
1517 |
1518 | __shared__ int collected_id[BLOCK_SIZE];
1519 | __shared__ float2 collected_xy[BLOCK_SIZE];
1520 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE];
1521 | __shared__ float collected_colors[C * BLOCK_SIZE];
1522 | __shared__ float collected_depths[BLOCK_SIZE];
1523 |
1524 | // // Heuristic, gaussians are likely to be updated by the same block (same tile)
1525 | // // Thus it should be faster to first aggregate the gradients inside this block and update them to the global memory in just one go
1526 | // __shared__ float3 s_dL_dmean2D[BLOCK_SIZE];
1527 | // __shared__ float3 s_dL_dabsmean2D[BLOCK_SIZE];
1528 | // __shared__ float4 s_dL_dconic2D[BLOCK_SIZE];
1529 | // __shared__ float s_dL_dopacity[BLOCK_SIZE];
1530 | // __shared__ float s_dL_dcolors[C * BLOCK_SIZE];
1531 | // __shared__ float s_dL_ddepths[BLOCK_SIZE];
1532 |
1533 | // In the forward, we stored the final value for T, the
1534 | // product of all (1 - alpha) factors.
1535 | const float T_final = inside ? (1 - accum_alphas[pix_id]) : 0;
1536 | float T = T_final;
1537 |
1538 | // We start from the back. The ID of the last contributing
1539 | // Gaussian is known for each pixel from the forward.
1540 | uint32_t contributor = toDo;
1541 | const int last_contributor = inside ?
n_contrib[pix_id] : 0;
1542 |
1543 | float accum_rec[C] = { 0 };
1544 | float accum_red = 0;
1545 | float accum_rea = 0;
1546 | float dL_dpixel[C];
1547 | float dL_dpixel_depth;
1548 | float dL_dpixel_alpha;
1549 | if (inside)
1550 | {
1551 | for (int i = 0; i < C; i++)
1552 | dL_dpixel[i] = dL_dpixels[i * H * W + pix_id];
1553 | dL_dpixel_depth = dL_dpixel_depths[pix_id];
1554 | dL_dpixel_alpha = dL_dpixel_alphas[pix_id];
1555 | }
1556 | float last_alpha = 0;
1557 | float last_color[C] = { 0 };
1558 | float last_depth = 0;
1559 | // Gradient of pixel coordinate w.r.t. normalized
1560 | // screen-space viewport coordinates (-1 to 1)
1561 | const float ddelx_dx = 0.5 * W;
1562 | const float ddely_dy = 0.5 * H;
1563 |
1564 | auto local_rank = block.thread_rank();
1565 |
1566 | // Traverse all Gaussians
1567 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
1568 | {
1569 | // Load auxiliary data into shared memory, start in the BACK
1570 | // and load them in reverse order.
1571 | block.sync();
1572 | const int progress = i * BLOCK_SIZE + local_rank;
1573 | if (range.x + progress < range.y)
1574 | {
1575 | const int coll_id = point_list[range.y - progress - 1];
1576 | collected_id[local_rank] = coll_id;
1577 | collected_xy[local_rank] = points_xy_image[coll_id];
1578 | collected_conic_opacity[local_rank] = conic_opacity[coll_id];
1579 | for (int i = 0; i < C; i++)
1580 | collected_colors[i * BLOCK_SIZE + local_rank] = colors[coll_id * C + i];
1581 | collected_depths[local_rank] = depths[coll_id];
1582 |
1583 | // // Shared gradient accumulation in this block
1584 | // s_dL_dmean2D[local_rank].x = 0.0f;
1585 | // s_dL_dmean2D[local_rank].y = 0.0f;
1586 | // s_dL_dabsmean2D[local_rank].x = 0.0f;
1587 | // s_dL_dabsmean2D[local_rank].y = 0.0f;
1588 | // s_dL_dconic2D[local_rank].x = 0.0f;
1589 | // s_dL_dconic2D[local_rank].y = 0.0f;
1590 | // s_dL_dconic2D[local_rank].w = 0.0f;
1591 | // for (int i = 0; i < C; i++)
1592 | // s_dL_dcolors[i * BLOCK_SIZE + local_rank] = 0.0f;
1593 | // s_dL_dopacity[local_rank] = 0.0f;
1594 | // s_dL_ddepths[local_rank] = 0.0f;
1595 | }
1596 | block.sync();
1597 |
1598 | // Iterate over Gaussians
1599 | for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
1600 | {
1601 | float2 xy = { 0.0f };
1602 | float2 d = { 0.0f };
1603 | float4 con_o = { 0.0f };
1604 | float power = 0.0f;
1605 | float G = 0.0f;
1606 | float alpha = 0.0f;
1607 | float dchannel_dcolor = 0.0f;
1608 | float dpixel_depth_ddepth = 0.0f;
1609 | float dL_dalpha = 0.0f;
1610 | float dep = 0.0f;
1611 | float bg_dot_dpixel = 0.0f;
1612 | float dL_dG = 0.0f;
1613 | float gdx = 0.0f;
1614 | float gdy = 0.0f;
1615 | float dG_ddelx = 0.0f;
1616 | float dG_ddely = 0.0f;
1617 |
1618 | float w_dL_dcolors[C] = { 0.0f };
1619 | float w_dL_ddepths = 0.0f;
1620 | float2 w_dL_dmean2D = { 0.0f };
1621 | float2 w_dL_dabsmean2D = { 0.0f };
1622 | float4 w_dL_dconic2D = { 0.0f };
1623 | float w_dL_dopacity = 0.0f;
1624 |
1625 | int global_id;
1626 | bool early_stop = false;
1627 |
1628 | // if (done) {
1629 | // early_stop = true;
1630 | // goto reduce;
1631 | // }
1632 |
1633 | // Keep track of current Gaussian ID. Skip, if this one
1634 | // is behind the last contributor for this pixel.
1635 | contributor--;
1636 | if (contributor >= last_contributor) {
1637 | early_stop = true;
1638 | goto reduce;
1639 | }
1640 | // continue;
1641 |
1642 | // Compute blending values, as before.
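// con_o packs the inverse 2D covariance (the conic) in .xyz and the opacity
// in .w, so power = -0.5 * d^T * Conic * d and alpha = min(0.99, opacity *
// exp(power)), exactly as in the forward pass.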
1643 | xy = collected_xy[j]; 1644 | d = { xy.x - pixf.x, xy.y - pixf.y }; 1645 | con_o = collected_conic_opacity[j]; 1646 | power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 1647 | if (power > 0.0f) { 1648 | early_stop = true; 1649 | goto reduce; // early stopping 1650 | } 1651 | // continue; 1652 | 1653 | G = __expf(power); 1654 | alpha = min(0.99f, con_o.w * G); 1655 | if (alpha < 1.0f / 255.0f) { 1656 | early_stop = true; 1657 | goto reduce; // early stopping 1658 | } 1659 | // continue; 1660 | 1661 | T = T / (1.f - alpha); 1662 | dchannel_dcolor = alpha * T; 1663 | dpixel_depth_ddepth = alpha * T; 1664 | 1665 | // Propagate gradients to per-Gaussian colors and keep 1666 | // gradients w.r.t. alpha (blending factor for a Gaussian/pixel 1667 | // pair). 1668 | dL_dalpha = 0.0f; 1669 | for (int ch = 0; ch < C; ch++) 1670 | { 1671 | const float c = collected_colors[ch * BLOCK_SIZE + j]; 1672 | // Update last color (to be used in the next iteration) 1673 | accum_rec[ch] = last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch]; 1674 | last_color[ch] = c; 1675 | 1676 | const float dL_dchannel = dL_dpixel[ch]; 1677 | dL_dalpha += (c - accum_rec[ch]) * dL_dchannel; 1678 | // Update the gradients w.r.t. color of the Gaussian. 1679 | // Atomic, since this pixel is just one of potentially 1680 | // many that were affected by this Gaussian. 1681 | // atomicAdd(&(s_dL_dcolors[ch * BLOCK_SIZE + j]), dchannel_dcolor * dL_dchannel); 1682 | w_dL_dcolors[ch] = dchannel_dcolor * dL_dchannel; 1683 | } 1684 | dep = collected_depths[j]; 1685 | accum_red = last_alpha * last_depth + (1.f - last_alpha) * accum_red; 1686 | last_depth = dep; 1687 | dL_dalpha += (dep-accum_red) * dL_dpixel_depth; 1688 | // atomicAdd(&(s_dL_ddepths[j]), dpixel_depth_ddepth * dL_dpixel_depth); 1689 | w_dL_ddepths = dpixel_depth_ddepth * dL_dpixel_depth; 1690 | 1691 | accum_rea = last_alpha + (1.f - last_alpha) * accum_rea; 1692 | dL_dalpha += (1 - accum_rea) * dL_dpixel_alpha; 1693 | 1694 | dL_dalpha *= T; 1695 | // Update last alpha (to be used in the next iteration) 1696 | last_alpha = alpha; 1697 | 1698 | // Account for fact that alpha also influences how much of 1699 | // the background color is added if nothing left to blend 1700 | bg_dot_dpixel = 0; 1701 | for (int i = 0; i < C; i++) 1702 | bg_dot_dpixel += bg_color[i] * dL_dpixel[i]; 1703 | dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel; 1704 | 1705 | // Set background depth value == 0, thus no contribution for 1706 | // dL_dalpha 1707 | 1708 | // Helpful reusable temporary variables 1709 | dL_dG = con_o.w * dL_dalpha; 1710 | gdx = G * d.x; 1711 | gdy = G * d.y; 1712 | dG_ddelx = -gdx * con_o.x - gdy * con_o.y; 1713 | dG_ddely = -gdy * con_o.z - gdx * con_o.y; 1714 | 1715 | // Update gradients w.r.t. 2D mean position of the Gaussian 1716 | // atomicAdd(&s_dL_dmean2D[j].x, dL_dG * dG_ddelx * ddelx_dx); 1717 | // atomicAdd(&s_dL_dmean2D[j].y, dL_dG * dG_ddely * ddely_dy); 1718 | w_dL_dmean2D.x = dL_dG * dG_ddelx * ddelx_dx; 1719 | w_dL_dmean2D.y = dL_dG * dG_ddely * ddely_dy; 1720 | w_dL_dabsmean2D.x = abs(w_dL_dmean2D.x); 1721 | w_dL_dabsmean2D.y = abs(w_dL_dmean2D.y); 1722 | 1723 | // Update gradients w.r.t. 
2D covariance (2x2 matrix, symmetric) 1724 | // atomicAdd(&s_dL_dconic2D[j].x, -0.5f * gdx * d.x * dL_dG); 1725 | // atomicAdd(&s_dL_dconic2D[j].y, -0.5f * gdx * d.y * dL_dG); 1726 | // atomicAdd(&s_dL_dconic2D[j].w, -0.5f * gdy * d.y * dL_dG); 1727 | w_dL_dconic2D.x = -0.5f * gdx * d.x * dL_dG; 1728 | w_dL_dconic2D.y = -0.5f * gdx * d.y * dL_dG; 1729 | w_dL_dconic2D.w = -0.5f * gdy * d.y * dL_dG; 1730 | 1731 | // Update gradients w.r.t. opacity of the Gaussian 1732 | // atomicAdd(&(s_dL_dopacity[j]), G * dL_dalpha); 1733 | w_dL_dopacity = G * dL_dalpha; 1734 | 1735 | reduce: 1736 | early_stop = warpReduceSum(early_stop); 1737 | 1738 | // If the whole warp votes for early stop, no need to do any further reduction or computation 1739 | if (!early_stop) { 1740 | 1741 | // Call reduce sum and append results to __shared__ memory 1742 | for (int ch = 0; ch < C; ch++) { 1743 | w_dL_dcolors[ch] = warpReduceSum(w_dL_dcolors[ch]); 1744 | } 1745 | w_dL_ddepths = warpReduceSum(w_dL_ddepths); 1746 | w_dL_dmean2D.x = warpReduceSum(w_dL_dmean2D.x); 1747 | w_dL_dmean2D.y = warpReduceSum(w_dL_dmean2D.y); 1748 | w_dL_dabsmean2D.x = warpReduceSum(w_dL_dabsmean2D.x); 1749 | w_dL_dabsmean2D.y = warpReduceSum(w_dL_dabsmean2D.y); 1750 | w_dL_dconic2D.x = warpReduceSum(w_dL_dconic2D.x); 1751 | w_dL_dconic2D.y = warpReduceSum(w_dL_dconic2D.y); 1752 | w_dL_dconic2D.w = warpReduceSum(w_dL_dconic2D.w); 1753 | w_dL_dopacity = warpReduceSum(w_dL_dopacity); 1754 | 1755 | // Use a single thread from each warp to perform block level reduction 1756 | if (local_rank % warp.size() == 0) { 1757 | // for (int ch = 0; ch < C; ch++) { 1758 | // atomicAdd(&(s_dL_dcolors[ch * BLOCK_SIZE + j]), w_dL_dcolors[ch]); 1759 | // } 1760 | // atomicAdd(&(s_dL_ddepths[j]), w_dL_ddepths); 1761 | // atomicAdd(&s_dL_dmean2D[j].x, w_dL_dmean2D.x); 1762 | // atomicAdd(&s_dL_dmean2D[j].y, w_dL_dmean2D.y); 1763 | // atomicAdd(&s_dL_dabsmean2D[j].x, w_dL_dabsmean2D.x); 1764 | // atomicAdd(&s_dL_dabsmean2D[j].y, w_dL_dabsmean2D.y); 1765 | // atomicAdd(&s_dL_dconic2D[j].x, w_dL_dconic2D.x); 1766 | // atomicAdd(&s_dL_dconic2D[j].y, w_dL_dconic2D.y); 1767 | // atomicAdd(&s_dL_dconic2D[j].w, w_dL_dconic2D.w); 1768 | // atomicAdd(&(s_dL_dopacity[j]), w_dL_dopacity); 1769 | global_id = collected_id[j]; 1770 | 1771 | // Shared gradient accumulation in this block 1772 | for (int i = 0; i < C; i++) 1773 | atomicAdd(&(dL_dcolors[global_id * C + i]), w_dL_dcolors[i]); 1774 | atomicAdd(&dL_dmean2D[global_id].x, w_dL_dmean2D.x); 1775 | atomicAdd(&dL_dmean2D[global_id].y, w_dL_dmean2D.y); 1776 | atomicAdd(&dL_dabsmean2D[global_id].x, w_dL_dabsmean2D.x); 1777 | atomicAdd(&dL_dabsmean2D[global_id].y, w_dL_dabsmean2D.y); 1778 | atomicAdd(&dL_dconic2D[global_id].x, w_dL_dconic2D.x); 1779 | atomicAdd(&dL_dconic2D[global_id].y, w_dL_dconic2D.y); 1780 | atomicAdd(&dL_dconic2D[global_id].w, w_dL_dconic2D.w); 1781 | atomicAdd(&(dL_dopacity[global_id]), w_dL_dopacity); 1782 | atomicAdd(&(dL_ddepths[global_id]), w_dL_ddepths); 1783 | } 1784 | } 1785 | } 1786 | // block.sync(); 1787 | 1788 | // if (range.x + progress < range.y) 1789 | // { 1790 | // const int global_id = point_list[range.y - progress - 1]; 1791 | 1792 | // // Shared gradient accumulation in this block 1793 | // atomicAdd(&dL_dmean2D[global_id].x, s_dL_dmean2D[local_rank].x); 1794 | // atomicAdd(&dL_dmean2D[global_id].y, s_dL_dmean2D[local_rank].y); 1795 | // atomicAdd(&dL_dabsmean2D[global_id].x, s_dL_dabsmean2D[local_rank].x); 1796 | // atomicAdd(&dL_dabsmean2D[global_id].y, s_dL_dabsmean2D[local_rank].y); 
1797 | // atomicAdd(&dL_dconic2D[global_id].x, s_dL_dconic2D[local_rank].x);
1798 | // atomicAdd(&dL_dconic2D[global_id].y, s_dL_dconic2D[local_rank].y);
1799 | // atomicAdd(&dL_dconic2D[global_id].w, s_dL_dconic2D[local_rank].w);
1800 | // for (int i = 0; i < C; i++)
1801 | // atomicAdd(&(dL_dcolors[global_id * C + i]), s_dL_dcolors[i * BLOCK_SIZE + local_rank]);
1802 | // atomicAdd(&(dL_dopacity[global_id]), s_dL_dopacity[local_rank]);
1803 | // atomicAdd(&(dL_ddepths[global_id]), s_dL_ddepths[local_rank]);
1804 | // }
1805 |
1806 | }
1807 | }
1808 |
1809 | void BACKWARD::preprocess(
1810 | int P, int D, int M,
1811 | const float3* means3D,
1812 | const int* radii,
1813 | const float* shs,
1814 | const bool* clamped,
1815 | const glm::vec3* scales,
1816 | const glm::vec4* rotations,
1817 | const float scale_modifier,
1818 | const float* cov3Ds,
1819 | const float* viewmatrix,
1820 | const float* projmatrix,
1821 | const float focal_x, float focal_y,
1822 | const float tan_fovx, float tan_fovy,
1823 | const glm::vec3* campos,
1824 | const float3* dL_dmean2D,
1825 | const float* dL_dconic,
1826 | glm::vec3* dL_dmean3D,
1827 | float* dL_dcolor,
1828 | float* dL_ddepth,
1829 | float* dL_dcov3D,
1830 | float* dL_dsh,
1831 | glm::vec3* dL_dscale,
1832 | glm::vec4* dL_drot)
1833 | {
1834 | // Propagate gradients for the path of 2D conic matrix computation.
1835 | // Somewhat long, thus it is its own kernel rather than being part of
1836 | // "preprocess". When done, loss gradient w.r.t. 3D means has been
1837 | // modified and gradient w.r.t. 3D covariance matrix has been computed.
1838 | computeCov2DCUDA << <(P + 255) / 256, 256 >> > (
1839 | P,
1840 | means3D,
1841 | radii,
1842 | cov3Ds,
1843 | focal_x,
1844 | focal_y,
1845 | tan_fovx,
1846 | tan_fovy,
1847 | viewmatrix,
1848 | dL_dconic,
1849 | (float3*)dL_dmean3D,
1850 | dL_dcov3D);
1851 |
1852 | // Propagate gradients for remaining steps: finish 3D mean gradients,
1853 | // propagate color gradients to SH (if desired), propagate 3D covariance
1854 | // matrix gradients to scale and rotation.
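// One thread per Gaussian: P Gaussians are processed by (P + 255) / 256
// blocks of 256 threads, the same launch shape as computeCov2DCUDA above.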
1855 | preprocessCUDA<NUM_CHANNELS> << < (P + 255) / 256, 256 >> > (
1856 | P, D, M,
1857 | (float3*)means3D,
1858 | radii,
1859 | shs,
1860 | clamped,
1861 | (glm::vec3*)scales,
1862 | (glm::vec4*)rotations,
1863 | scale_modifier,
1864 | viewmatrix,
1865 | projmatrix,
1866 | campos,
1867 | (float3*)dL_dmean2D,
1868 | (glm::vec3*)dL_dmean3D,
1869 | dL_dcolor,
1870 | dL_ddepth,
1871 | dL_dcov3D,
1872 | dL_dsh,
1873 | dL_dscale,
1874 | dL_drot);
1875 | }
1876 |
1877 | void BACKWARD::render(
1878 | const dim3 grid, const dim3 block,
1879 | const uint2* ranges,
1880 | const uint32_t* point_list,
1881 | int W, int H,
1882 | const float* bg_color,
1883 | const float2* means2D,
1884 | const float4* conic_opacity,
1885 | const float* colors,
1886 | const float* depths,
1887 | const float* accum_alphas,
1888 | const uint32_t* n_contrib,
1889 | const float* dL_dpixels,
1890 | const float* dL_dpixel_depths,
1891 | const float* dL_dpixel_alphas,
1892 | float3* dL_dmean2D,
1893 | float3* dL_dabsmean2D,
1894 | float4* dL_dconic2D,
1895 | float* dL_dopacity,
1896 | float* dL_dcolors,
1897 | float* dL_ddepths)
1898 | {
1899 | renderCUDA<NUM_CHANNELS> << <grid, block >> > (
1900 | ranges,
1901 | point_list,
1902 | W, H,
1903 | bg_color,
1904 | means2D,
1905 | conic_opacity,
1906 | colors,
1907 | depths,
1908 | accum_alphas,
1909 | n_contrib,
1910 | dL_dpixels,
1911 | dL_dpixel_depths,
1912 | dL_dpixel_alphas,
1913 | dL_dmean2D,
1914 | dL_dabsmean2D,
1915 | dL_dconic2D,
1916 | dL_dopacity,
1917 | dL_dcolors,
1918 | dL_ddepths);
1919 | }
--------------------------------------------------------------------------------
/cuda_rasterizer/backward.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2023, Inria
3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco
4 | * All rights reserved.
5 | *
6 | * This software is free for non-commercial, research and evaluation use
7 | * under the terms of the LICENSE.md file.
8 | *
9 | * For inquiries contact george.drettakis@inria.fr
10 | */
11 |
12 | #ifndef CUDA_RASTERIZER_BACKWARD_H_INCLUDED
13 | #define CUDA_RASTERIZER_BACKWARD_H_INCLUDED
14 |
15 | #include <cuda.h>
16 | #include "cuda_runtime.h"
17 | #include "device_launch_parameters.h"
18 | #define GLM_FORCE_CUDA
19 | #include <glm/glm.hpp>
20 |
21 | namespace BACKWARD
22 | {
23 | void render(
24 | const dim3 grid, dim3 block,
25 | const uint2* ranges,
26 | const uint32_t* point_list,
27 | int W, int H,
28 | const float* bg_color,
29 | const float2* means2D,
30 | const float4* conic_opacity,
31 | const float* colors,
32 | const float* depths,
33 | const float* accum_alphas,
34 | const uint32_t* n_contrib,
35 | const float* dL_dpixels,
36 | const float* dL_dpixel_depths,
37 | const float* dL_dpixel_alphas,
38 | float3* dL_dmean2D,
39 | float3* dL_dabsmean2D,
40 | float4* dL_dconic2D,
41 | float* dL_dopacity,
42 | float* dL_dcolors,
43 | float* dL_ddepths);
44 |
45 | void preprocess(
46 | int P, int D, int M,
47 | const float3* means,
48 | const int* radii,
49 | const float* shs,
50 | const bool* clamped,
51 | const glm::vec3* scales,
52 | const glm::vec4* rotations,
53 | const float scale_modifier,
54 | const float* cov3Ds,
55 | const float* view,
56 | const float* proj,
57 | const float focal_x, float focal_y,
58 | const float tan_fovx, float tan_fovy,
59 | const glm::vec3* campos,
60 | const float3* dL_dmean2D,
61 | const float* dL_dconics,
62 | glm::vec3* dL_dmeans,
63 | float* dL_dcolor,
64 | float* dL_ddepth,
65 | float* dL_dcov3D,
66 | float* dL_dsh,
67 | glm::vec3* dL_dscale,
68 | glm::vec4* dL_drot);
69 |
70 | void computeCov3DBackward(
71 | int P,
72 | const glm::vec3* scaling_xyz,
73 | const glm::vec4* rotation_l,
74 | const float* dL_dcov,
75 | glm::vec3* dL_dscaling_xyz,
76 | glm::vec4* dL_drotation_l);
77 |
78 | void computeCov4DBackward(
79 | int P,
80 | const glm::vec4* scaling_xyzt,
81 | const glm::vec4* rotation_l,
82 | const glm::vec4* rotation_r,
83 | const float* dL_dcov,
84 | const glm::vec3* dL_dms,
85 | const float* dL_dcov_t,
86 | glm::vec4* dL_dscaling_xyzt,
87 | glm::vec4* dL_drotation_l,
88 | glm::vec4* dL_drotation_r);
89 |
90 | void computeSH4DBackward(
91 | int P,
92 | int deg, int deg_t, int max_coeffs,
93 | const float* sh,
94 | const glm::vec3* dir,
95 | const float* dir_t,
96 | const float time_duration,
97 | const glm::vec3* dL_drgb,
98 | float* dL_dsh,
99 | glm::vec3* dL_ddir,
100 | float* dL_ddir_t
101 | );
102 | }
103 |
104 | #endif
--------------------------------------------------------------------------------
/cuda_rasterizer/config.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2023, Inria
3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco
4 | * All rights reserved.
5 | *
6 | * This software is free for non-commercial, research and evaluation use
7 | * under the terms of the LICENSE.md file.
8 | *
9 | * For inquiries contact george.drettakis@inria.fr
10 | */
11 |
12 | #ifndef CUDA_RASTERIZER_CONFIG_H_INCLUDED
13 | #define CUDA_RASTERIZER_CONFIG_H_INCLUDED
14 |
15 | #define NUM_CHANNELS 3 // Default 3, RGB
16 | #define BLOCK_X 16
17 | #define BLOCK_Y 16
18 |
19 | #endif
--------------------------------------------------------------------------------
/cuda_rasterizer/forward.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2023, Inria
3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco
4 | * All rights reserved.
5 | *
6 | * This software is free for non-commercial, research and evaluation use
7 | * under the terms of the LICENSE.md file.
8 | *
9 | * For inquiries contact george.drettakis@inria.fr
10 | */
11 |
12 | #include "forward.h"
13 | #include "auxiliary.h"
14 | #include <cooperative_groups.h>
15 | #include <cooperative_groups/reduce.h>
16 | namespace cg = cooperative_groups;
17 |
18 | // Forward method for converting the input spherical harmonics
19 | // coefficients of each Gaussian to a simple RGB color.
20 | __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped)
21 | {
22 | // The implementation is loosely based on code for
23 | // "Differentiable Point-Based Radiance Fields for
24 | // Efficient View Synthesis" by Zhang et al. (2022)
25 | glm::vec3 pos = means[idx];
26 | glm::vec3 dir = pos - campos;
27 | dir = dir / glm::length(dir);
28 |
29 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
30 | glm::vec3 result = SH_C0 * sh[0];
31 |
32 | if (deg > 0)
33 | {
34 | float x = dir.x;
35 | float y = dir.y;
36 | float z = dir.z;
37 | result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3];
38 |
39 | if (deg > 1)
40 | {
41 | float xx = x * x, yy = y * y, zz = z * z;
42 | float xy = x * y, yz = y * z, xz = x * z;
43 | result = result +
44 | SH_C2[0] * xy * sh[4] +
45 | SH_C2[1] * yz * sh[5] +
46 | SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] +
47 | SH_C2[3] * xz * sh[7] +
48 | SH_C2[4] * (xx - yy) * sh[8];
49 |
50 | if (deg > 2)
51 | {
52 | result = result +
53 | SH_C3[0] * y * (3.0f * xx - yy) * sh[9] +
54 | SH_C3[1] * xy * z * sh[10] +
55 | SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] +
56 | SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] +
57 | SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] +
58 | SH_C3[5] * z * (xx - yy) * sh[14] +
59 | SH_C3[6] * x * (xx - 3.0f * yy) * sh[15];
60 | }
61 | }
62 | }
63 | result += 0.5f;
64 |
65 | // RGB colors are clamped to positive values. If values are
66 | // clamped, we need to keep track of this for the backward pass.
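// The flags recorded below let the backward pass zero the color gradient
// for channels where this clamping was active (a ReLU-style gradient).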
67 | clamped[3 * idx + 0] = (result.x < 0); 68 | clamped[3 * idx + 1] = (result.y < 0); 69 | clamped[3 * idx + 2] = (result.z < 0); 70 | return glm::max(result, 0.0f); 71 | } 72 | 73 | __device__ glm::vec3 eval4DSH(int deg, int deg_t, const glm::vec3* sh, const glm::vec3 dir, const float dir_t, const float time_duration) 74 | { 75 | 76 | float l0m0 = SH_C0; 77 | glm::vec3 result = l0m0 * sh[0]; 78 | 79 | if (deg > 0) 80 | { 81 | float x = dir.x; 82 | float y = dir.y; 83 | float z = dir.z; 84 | 85 | float l1m1 = -1 * SH_C1 * y; 86 | float l1m0 = SH_C1 * z; 87 | float l1p1 = -1 * SH_C1 * x; 88 | 89 | result += 90 | l1m1 * sh[1] + 91 | l1m0 * sh[2] + 92 | l1p1 * sh[3]; 93 | 94 | if (deg > 1) 95 | { 96 | float xx = x * x, yy = y * y, zz = z * z; 97 | float xy = x * y, yz = y * z, xz = x * z; 98 | 99 | float l2m2 = SH_C2[0] * xy; 100 | float l2m1 = SH_C2[1] * yz; 101 | float l2m0 = SH_C2[2] * (2.0 * zz - xx - yy); 102 | float l2p1 = SH_C2[3] * xz; 103 | float l2p2 = SH_C2[4] * (xx - yy); 104 | 105 | result += 106 | l2m2 * sh[4] + 107 | l2m1 * sh[5] + 108 | l2m0 * sh[6] + 109 | l2p1 * sh[7] + 110 | l2p2 * sh[8]; 111 | 112 | if (deg > 2) 113 | { 114 | float l3m3 = SH_C3[0] * y * (3 * xx - yy); 115 | float l3m2 = SH_C3[1] * xy * z; 116 | float l3m1 = SH_C3[2] * y * (4 * zz - xx - yy); 117 | float l3m0 = SH_C3[3] * z * (2 * zz - 3 * xx - 3 * yy); 118 | float l3p1 = SH_C3[4] * x * (4 * zz - xx - yy); 119 | float l3p2 = SH_C3[5] * z * (xx - yy); 120 | float l3p3 = SH_C3[6] * x * (xx - 3 * yy); 121 | 122 | result += 123 | l3m3 * sh[9] + 124 | l3m2 * sh[10] + 125 | l3m1 * sh[11] + 126 | l3m0 * sh[12] + 127 | l3p1 * sh[13] + 128 | l3p2 * sh[14] + 129 | l3p3 * sh[15]; 130 | 131 | if (deg_t > 0){ 132 | float t1 = cos(2 * MY_PI * dir_t / time_duration); 133 | 134 | result += t1 * (l0m0 * sh[16] + 135 | l1m1 * sh[17] + 136 | l1m0 * sh[18] + 137 | l1p1 * sh[19] + 138 | l2m2 * sh[20] + 139 | l2m1 * sh[21] + 140 | l2m0 * sh[22] + 141 | l2p1 * sh[23] + 142 | l2p2 * sh[24] + 143 | l3m3 * sh[25] + 144 | l3m2 * sh[26] + 145 | l3m1 * sh[27] + 146 | l3m0 * sh[28] + 147 | l3p1 * sh[29] + 148 | l3p2 * sh[30] + 149 | l3p3 * sh[31]); 150 | 151 | if (deg_t > 1){ 152 | float t2 = cos(2 * MY_PI * dir_t * 2 / time_duration); 153 | 154 | result += t2 * (l0m0 * sh[32] + 155 | l1m1 * sh[33] + 156 | l1m0 * sh[34] + 157 | l1p1 * sh[35] + 158 | l2m2 * sh[36] + 159 | l2m1 * sh[37] + 160 | l2m0 * sh[38] + 161 | l2p1 * sh[39] + 162 | l2p2 * sh[40] + 163 | l3m3 * sh[41] + 164 | l3m2 * sh[42] + 165 | l3m1 * sh[43] + 166 | l3m0 * sh[44] + 167 | l3p1 * sh[45] + 168 | l3p2 * sh[46] + 169 | l3p3 * sh[47]); 170 | } 171 | 172 | } 173 | } 174 | } 175 | } 176 | result += 0.5f; 177 | 178 | return result; 179 | } 180 | 181 | 182 | __device__ glm::vec3 eval4DSHResidual(int deg, int deg_t, const glm::vec3* sh, const glm::vec3 dir, const float dir_t, const float time_duration) 183 | { 184 | 185 | float l0m0 = SH_C0; 186 | // glm::vec3 result = l0m0 * sh[0]; 187 | glm::vec3 result {0.0f}; 188 | 189 | if (deg > 0) 190 | { 191 | float x = dir.x; 192 | float y = dir.y; 193 | float z = dir.z; 194 | 195 | float l1m1 = -1 * SH_C1 * y; 196 | float l1m0 = SH_C1 * z; 197 | float l1p1 = -1 * SH_C1 * x; 198 | 199 | result += 200 | l1m1 * sh[0] + 201 | l1m0 * sh[1] + 202 | l1p1 * sh[2]; 203 | 204 | if (deg > 1) 205 | { 206 | float xx = x * x, yy = y * y, zz = z * z; 207 | float xy = x * y, yz = y * z, xz = x * z; 208 | 209 | float l2m2 = SH_C2[0] * xy; 210 | float l2m1 = SH_C2[1] * yz; 211 | float l2m0 = SH_C2[2] * (2.0 * zz - xx - yy); 212 | float l2p1 = 
SH_C2[3] * xz; 213 | float l2p2 = SH_C2[4] * (xx - yy); 214 | 215 | result += 216 | l2m2 * sh[3] + 217 | l2m1 * sh[4] + 218 | l2m0 * sh[5] + 219 | l2p1 * sh[6] + 220 | l2p2 * sh[7]; 221 | 222 | if (deg > 2) 223 | { 224 | float l3m3 = SH_C3[0] * y * (3 * xx - yy); 225 | float l3m2 = SH_C3[1] * xy * z; 226 | float l3m1 = SH_C3[2] * y * (4 * zz - xx - yy); 227 | float l3m0 = SH_C3[3] * z * (2 * zz - 3 * xx - 3 * yy); 228 | float l3p1 = SH_C3[4] * x * (4 * zz - xx - yy); 229 | float l3p2 = SH_C3[5] * z * (xx - yy); 230 | float l3p3 = SH_C3[6] * x * (xx - 3 * yy); 231 | 232 | result += 233 | l3m3 * sh[8] + 234 | l3m2 * sh[9] + 235 | l3m1 * sh[10] + 236 | l3m0 * sh[11] + 237 | l3p1 * sh[12] + 238 | l3p2 * sh[13] + 239 | l3p3 * sh[14]; 240 | 241 | if (deg_t > 0){ 242 | float t1 = cos(2 * MY_PI * dir_t / time_duration); 243 | 244 | result += t1 * (l0m0 * sh[15] + 245 | l1m1 * sh[16] + 246 | l1m0 * sh[17] + 247 | l1p1 * sh[18] + 248 | l2m2 * sh[19] + 249 | l2m1 * sh[20] + 250 | l2m0 * sh[21] + 251 | l2p1 * sh[22] + 252 | l2p2 * sh[23] + 253 | l3m3 * sh[24] + 254 | l3m2 * sh[25] + 255 | l3m1 * sh[26] + 256 | l3m0 * sh[27] + 257 | l3p1 * sh[28] + 258 | l3p2 * sh[29] + 259 | l3p3 * sh[30]); 260 | 261 | if (deg_t > 1){ 262 | float t2 = cos(2 * MY_PI * dir_t * 2 / time_duration); 263 | 264 | result += t2 * (l0m0 * sh[31] + 265 | l1m1 * sh[32] + 266 | l1m0 * sh[33] + 267 | l1p1 * sh[34] + 268 | l2m2 * sh[35] + 269 | l2m1 * sh[36] + 270 | l2m0 * sh[37] + 271 | l2p1 * sh[38] + 272 | l2p2 * sh[39] + 273 | l3m3 * sh[40] + 274 | l3m2 * sh[41] + 275 | l3m1 * sh[42] + 276 | l3m0 * sh[43] + 277 | l3p1 * sh[44] + 278 | l3p2 * sh[45] + 279 | l3p3 * sh[46]); 280 | } 281 | 282 | } 283 | } 284 | } 285 | } 286 | // result += 0.5f; 287 | 288 | return result; 289 | } 290 | 291 | __device__ glm::vec3 computeColorFromSH_4D(int idx, int deg, int deg_t, int max_coeffs, const float* shs, const glm::vec3* dirs, const float* dirs_t, const float time_duration) 292 | { 293 | // The implementation is loosely based on code for 294 | // "Differentiable Point-Based Radiance Fields for 295 | // Efficient View Synthesis" by Zhang et al. 
(2022) 296 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; 297 | glm::vec3 dir = dirs[idx]; 298 | const float dir_t = dirs_t[idx]; 299 | return eval4DSH(deg, deg_t, sh, dir, dir_t, time_duration); 300 | } 301 | 302 | 303 | __global__ void computeSH4DCUDA(int P, 304 | int deg, int deg_t, int max_coeffs, 305 | const float* sh, const glm::vec3* dir, const float* dir_t, const float time_duration, glm::vec3* rgb 306 | ) 307 | { 308 | auto idx = cg::this_grid().thread_rank(); 309 | if (idx >= P) 310 | return; 311 | rgb[idx] = computeColorFromSH_4D( 312 | idx, 313 | deg, 314 | deg_t, 315 | max_coeffs, 316 | sh, 317 | dir, 318 | dir_t, 319 | time_duration 320 | ); 321 | } 322 | 323 | 324 | void FORWARD::computeSH4D( 325 | int P, 326 | int deg, int deg_t, int max_coeffs, 327 | const float* sh, 328 | const glm::vec3* dir, 329 | const float* dir_t, 330 | const float time_duration, 331 | glm::vec3* rgb) 332 | { 333 | computeSH4DCUDA << <(P + 255) / 256, 256 >> > ( 334 | P, 335 | deg, 336 | deg_t, 337 | max_coeffs, 338 | sh, 339 | dir, 340 | dir_t, 341 | time_duration, 342 | rgb 343 | ); 344 | } 345 | 346 | // Forward version of 2D covariance matrix computation 347 | __device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix) 348 | { 349 | // The following models the steps outlined by equations 29 350 | // and 31 in "EWA Splatting" (Zwicker et al., 2002). 351 | // Additionally considers aspect / scaling of viewport. 352 | // Transposes used to account for row-/column-major conventions. 353 | float3 t = transformPoint4x3(mean, viewmatrix); 354 | 355 | const float limx = 1.3f * tan_fovx; 356 | const float limy = 1.3f * tan_fovy; 357 | const float txtz = t.x / t.z; 358 | const float tytz = t.y / t.z; 359 | t.x = min(limx, max(-limx, txtz)) * t.z; 360 | t.y = min(limy, max(-limy, tytz)) * t.z; 361 | 362 | glm::mat3 J = glm::mat3( 363 | focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z), 364 | 0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z), 365 | 0, 0, 0); 366 | 367 | glm::mat3 W = glm::mat3( 368 | viewmatrix[0], viewmatrix[4], viewmatrix[8], 369 | viewmatrix[1], viewmatrix[5], viewmatrix[9], 370 | viewmatrix[2], viewmatrix[6], viewmatrix[10]); 371 | 372 | glm::mat3 T = W * J; 373 | 374 | glm::mat3 Vrk = glm::mat3( 375 | cov3D[0], cov3D[1], cov3D[2], 376 | cov3D[1], cov3D[3], cov3D[4], 377 | cov3D[2], cov3D[4], cov3D[5]); 378 | 379 | glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T; 380 | 381 | // compute unblured determinant 382 | // float det_orig = cov[0][0] * cov[1][1] - cov[0][1] * cov[0][1]; 383 | 384 | // Apply low-pass filter: every Gaussian should be at least 385 | // one pixel wide/high. Discard 3rd row and column. 386 | cov[0][0] += 0.3f; 387 | cov[1][1] += 0.3f; 388 | return { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) }; 389 | } 390 | 391 | // Forward method for converting scale and rotation properties of each 392 | // Gaussian to a 3D covariance matrix in world space. Also takes care 393 | // of quaternion normalization. 
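// With M = S * R, the code below forms Sigma = M^T * M = R^T * diag(s^2) * R
// and stores only the six unique entries of the symmetric result. Note that
// in this version the quaternion is used as-is (the division by
// glm::length(rot) is commented out), so inputs are assumed to be
// pre-normalized by the caller.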
394 | __device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D) 395 | { 396 | // Create scaling matrix 397 | glm::mat3 S = glm::mat3(1.0f); 398 | S[0][0] = mod * scale.x; 399 | S[1][1] = mod * scale.y; 400 | S[2][2] = mod * scale.z; 401 | 402 | // Normalize quaternion to get valid rotation 403 | glm::vec4 q = rot;// / glm::length(rot); 404 | float r = q.x; 405 | float x = q.y; 406 | float y = q.z; 407 | float z = q.w; 408 | 409 | // Compute rotation matrix from quaternion 410 | glm::mat3 R = glm::mat3( 411 | 1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y), 412 | 2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x), 413 | 2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y) 414 | ); 415 | 416 | glm::mat3 M = S * R; 417 | 418 | // Compute 3D world covariance matrix Sigma 419 | glm::mat3 Sigma = glm::transpose(M) * M; 420 | 421 | // Covariance is symmetric, only store upper right 422 | cov3D[0] = Sigma[0][0]; 423 | cov3D[1] = Sigma[0][1]; 424 | cov3D[2] = Sigma[0][2]; 425 | cov3D[3] = Sigma[1][1]; 426 | cov3D[4] = Sigma[1][2]; 427 | cov3D[5] = Sigma[2][2]; 428 | } 429 | 430 | __global__ void computeCov3DCUDA(int P, 431 | const glm::vec3* scaling_xyz, 432 | const glm::vec4* rotation_l, 433 | float* cov) 434 | { 435 | auto idx = cg::this_grid().thread_rank(); 436 | if (idx >= P) 437 | return; 438 | computeCov3D( 439 | scaling_xyz[idx], 440 | 1.0f, 441 | rotation_l[idx], 442 | cov + idx * 6); 443 | } 444 | 445 | void FORWARD::computeCov3D( 446 | int P, 447 | const glm::vec3* scaling_xyz, 448 | const glm::vec4* rotation_l, 449 | float* cov) 450 | { 451 | computeCov3DCUDA << <(P + 255) / 256, 256 >> > ( 452 | P, 453 | scaling_xyz, 454 | rotation_l, 455 | cov); 456 | } 457 | 458 | __device__ void computeCov4D(const glm::vec4 scaling_xyzt, const glm::vec4 rotation_l, const glm::vec4 rotation_r, float* cov, glm::vec3 &ms, float &cov_t) 459 | { 460 | // Create scaling matrix 461 | glm::mat4 S = glm::mat4(1.0f); 462 | S[0][0] = scaling_xyzt.x; 463 | S[1][1] = scaling_xyzt.y; 464 | S[2][2] = scaling_xyzt.z; 465 | S[3][3] = scaling_xyzt.w; 466 | 467 | const float l_l = glm::length(rotation_l); 468 | const float a = rotation_l.x / l_l; 469 | const float b = rotation_l.y / l_l; 470 | const float c = rotation_l.z / l_l; 471 | const float d = rotation_l.w / l_l; 472 | 473 | const float l_r = glm::length(rotation_r); 474 | const float p = rotation_r.x / l_r; 475 | const float q = rotation_r.y / l_r; 476 | const float r = rotation_r.z / l_r; 477 | const float s = rotation_r.w / l_r; 478 | 479 | glm::mat4 M_l = glm::mat4( 480 | a, -b, -c, -d, 481 | b, a,-d, c, 482 | c, d, a,-b, 483 | d,-c, b, a 484 | ); 485 | 486 | glm::mat4 M_r = glm::mat4( 487 | p, q, r, s, 488 | -q, p,-s, r, 489 | -r, s, p,-q, 490 | -s,-r, q, p 491 | ); 492 | // glm stores in column major 493 | glm::mat4 R = M_r * M_l; 494 | glm::mat4 M = S * R; 495 | glm::mat4 Sigma = glm::transpose(M) * M; 496 | cov_t = Sigma[3][3]; 497 | 498 | glm::mat3 cov11 = glm::mat3(Sigma); 499 | glm::vec3 cov12 = glm::vec3(Sigma[0][3], Sigma[1][3], Sigma[2][3]); 500 | glm::mat3 cov3D = cov11 - glm::outerProduct(cov12, cov12) / cov_t; 501 | 502 | // Covariance is symmetric, only store upper right 503 | cov[0] = cov3D[0][0]; 504 | cov[1] = cov3D[0][1]; 505 | cov[2] = cov3D[0][2]; 506 | cov[3] = cov3D[1][1]; 507 | cov[4] = cov3D[1][2]; 508 | cov[5] = cov3D[2][2]; 509 | ms = cov12 / cov_t; 510 | } 511 | 512 | 513 | __global__ void computeCov4DCUDA(int P, 514 | 
const glm::vec4* scaling_xyzt,
515 | const glm::vec4* rotation_l,
516 | const glm::vec4* rotation_r,
517 | float* cov,
518 | glm::vec3* ms,
519 | float* cov_t)
520 | {
521 | auto idx = cg::this_grid().thread_rank();
522 | if (idx >= P)
523 | return;
524 | computeCov4D(
525 | scaling_xyzt[idx],
526 | rotation_l[idx],
527 | rotation_r[idx],
528 | cov + idx * 6,
529 | ms[idx],
530 | cov_t[idx]);
531 | }
532 |
533 |
534 | void FORWARD::computeCov4D(
535 | int P,
536 | const glm::vec4* scaling_xyzt,
537 | const glm::vec4* rotation_l,
538 | const glm::vec4* rotation_r,
539 | float* cov,
540 | glm::vec3* ms,
541 | float* cov_t
542 | ) {
543 | computeCov4DCUDA << <(P + 255) / 256, 256 >> > (
544 | P,
545 | scaling_xyzt,
546 | rotation_l,
547 | rotation_r,
548 | cov,
549 | ms,
550 | cov_t);
551 | }
552 |
553 | // Perform initial steps for each Gaussian prior to rasterization.
554 | template <int C>
555 | __global__ void fusedPreprocess4DSparseCUDA(int P,
556 | const int deg,
557 | const int deg_t,
558 | const int M,
559 | const glm::vec3* means3D,
560 | const float* cov,
561 | const glm::vec3* ms,
562 | const float* cov_t,
563 | const float* opacities,
564 | const float* t1,
565 | const glm::vec3* bases,
566 | const float* shs,
567 | const float* t,
568 | const int* sparse,
569 | const float* viewmatrix,
570 | const float* projmatrix,
571 | const float* cam_pos,
572 | const float duration,
573 | bool* mask,
574 | float* occ1,
575 | glm::vec3* xyz3,
576 | glm::vec3* rgb3)
577 | {
578 | auto idx = cg::this_grid().thread_rank();
579 | if (idx >= P)
580 | return;
581 |
582 | // Initialize the validity mask to false. If this isn't changed,
583 | // this Gaussian will not be processed further.
584 | mask[idx] = false;
585 |
586 | // Perform marginalization using the current time
587 | float dt = t[idx] - t1[idx];
588 | float marginal_t = __expf(-0.5 * dt * dt / cov_t[idx]);
589 | if (marginal_t <= 0.05) {
590 | return;
591 | }
592 |
593 | glm::vec3 xyz = means3D[idx] + ms[idx] * dt;
594 |
595 | // Filter by frustum
596 | // Perform near culling, quit if outside.
597 | float3 pos {xyz.x, xyz.y, xyz.z};
598 | if (!check_frustum(pos, viewmatrix, projmatrix) || opacities[idx] < 0.0001f) {
599 | return;
600 | }
601 |
602 | float occ = marginal_t * opacities[idx];
603 | if (occ < 0.0001f) {
604 | return;
605 | }
606 |
607 | mask[idx] = true;
608 | occ1[idx] = occ;
609 | xyz3[idx] = xyz;
610 |
611 | // Handling sparse SH
612 | glm::vec3 rgb = SH_C0 * bases[idx];
613 | rgb3[idx] = min(max(rgb + 0.5f, 0.0f), 1.0f); // zero degree sh
614 | // rgb3[idx] = bases[idx]; // zero degree sh
615 |
616 | if (sparse[idx] == -1) {
617 | return;
618 | }
619 |
620 | // Computing 4D SH using the current time and viewing direction
621 | glm::vec3 dir = xyz - *(glm::vec3*)cam_pos;
622 | dir = dir / glm::length(dir);
623 | const glm::vec3* sh = (glm::vec3*)shs + sparse[idx] * M;
624 | rgb += eval4DSHResidual(deg, deg_t, sh, dir, dt, duration);
625 | rgb3[idx] = min(max(rgb + 0.5f, 0.0f), 1.0f);
626 | }
627 |
628 |
629 | // Perform initial steps for each Gaussian prior to rasterization.
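// The dense variant below follows the same recipe as the sparse one above:
// marginalize the 4D Gaussian over time (marginal_t = exp(-0.5 * dt^2 / cov_t)),
// shift the spatial mean by the conditional term ms * dt, cull against the
// frustum, and mask out low-weight points, but with a looser temporal cutoff
// (0.005 instead of 0.05) and full per-point SH coefficients instead of a
// shared sparse table.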
630 | template <int C>
631 | __global__ void fusedPreprocess4DCUDA(int P,
632 | const int deg,
633 | const int deg_t,
634 | const int M,
635 | const glm::vec3* means3D,
636 | const float* cov,
637 | const glm::vec3* ms,
638 | const float* cov_t,
639 | const float* opacities,
640 | const float* t1,
641 | const float* shs,
642 | const float* t,
643 | const float* viewmatrix,
644 | const float* projmatrix,
645 | const float* cam_pos,
646 | const float duration,
647 | bool* mask,
648 | float* occ1,
649 | glm::vec3* xyz3,
650 | glm::vec3* rgb3)
651 | {
652 | auto idx = cg::this_grid().thread_rank();
653 | if (idx >= P)
654 | return;
655 |
656 | // Initialize the validity mask to false. If this isn't changed,
657 | // this Gaussian will not be processed further.
658 | mask[idx] = false;
659 |
660 | // Perform marginalization using the current time
661 | float dt = t[idx] - t1[idx];
662 | float marginal_t = __expf(-0.5 * dt * dt / cov_t[idx]);
663 | if (marginal_t <= 0.005) {
664 | return;
665 | }
666 |
667 | glm::vec3 xyz = means3D[idx] + ms[idx] * dt;
668 |
669 | // Filter by frustum
670 | // Perform near culling, quit if outside.
671 | float3 pos {xyz.x, xyz.y, xyz.z};
672 | if (!check_frustum(pos, viewmatrix, projmatrix) || opacities[idx] < 0.0001f) {
673 | return;
674 | }
675 |
676 | float occ = marginal_t * opacities[idx];
677 | if (occ < 0.0001f) {
678 | return;
679 | }
680 |
681 | // Computing 4D SH using the current time and viewing direction
682 | glm::vec3 dir = xyz - *(glm::vec3*)cam_pos;
683 | dir = dir / glm::length(dir);
684 | const glm::vec3* sh = ((glm::vec3*)shs) + idx * M;
685 | glm::vec3 rgb = eval4DSH(deg, deg_t, sh, dir, dt, duration);
686 |
687 | // glm::vec3 rgb(0.0f);
688 | // const glm::vec3* sh = (glm::vec3*)shs + idx * M;
689 | // glm::vec3 sh0 = *sh;
690 | // float l0m0 = SH_C0;
691 | // glm::vec3 rgb = l0m0 * sh0;
692 |
693 | mask[idx] = true;
694 | occ1[idx] = occ;
695 | xyz3[idx] = xyz;
696 | rgb3[idx] = rgb;
697 | }
698 |
699 | // Perform initial steps for each Gaussian prior to rasterization.
700 | template<int C>
701 | __global__ void preprocessCUDA(int P, int D, int M,
702 | const float* orig_points,
703 | const glm::vec3* scales,
704 | const float scale_modifier,
705 | const glm::vec4* rotations,
706 | const float* opacities,
707 | const float* shs,
708 | bool* clamped,
709 | const float* cov3D_precomp,
710 | const bool* tile_mask,
711 | const float* colors_precomp,
712 | const float* viewmatrix,
713 | const float* projmatrix,
714 | const glm::vec3* cam_pos,
715 | const int W, int H,
716 | const float tan_fovx, float tan_fovy,
717 | const float focal_x, float focal_y,
718 | int* radii,
719 | float2* points_xy_image,
720 | float* depths,
721 | float* cov3Ds,
722 | float* rgb,
723 | float4* conic_opacity,
724 | const dim3 grid,
725 | uint32_t* tiles_touched,
726 | bool prefiltered)
727 | {
728 | auto idx = cg::this_grid().thread_rank();
729 | if (idx >= P)
730 | return;
731 |
732 | // Initialize radius and touched tiles to 0. If this isn't changed,
733 | // this Gaussian will not be processed further.
734 | radii[idx] = 0;
735 | tiles_touched[idx] = 0;
736 |
737 | // Perform near culling, quit if outside.
738 | float3 p_view; 739 | if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view) || opacities[idx] < 0.0001f) { 740 | return; 741 | } 742 | 743 | // Transform point by projecting 744 | float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; 745 | float4 p_hom = transformPoint4x4(p_orig, projmatrix); 746 | float p_w = 1.0f / (p_hom.w + 0.0000001f); 747 | float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; 748 | 749 | // If 3D covariance matrix is precomputed, use it, otherwise compute 750 | // from scaling and rotation parameters. 751 | const float* cov3D; 752 | if (cov3D_precomp != nullptr) 753 | { 754 | cov3D = cov3D_precomp + idx * 6; 755 | } 756 | else 757 | { 758 | computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6); 759 | cov3D = cov3Ds + idx * 6; 760 | } 761 | 762 | // Compute 2D screen-space covariance matrix and unblurred determinant. 763 | float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix); 764 | 765 | // Invert covariance (EWA algorithm) 766 | float det = (cov.x * cov.z - cov.y * cov.y); 767 | if (det <= 0.0f || cov.x <= 0.0f || cov.z <= 0.0f) { 768 | // Illegal cov matrix, this point should be pruned with zero gradients 769 | radii[idx] = -1.0; 770 | return; 771 | } 772 | float det_inv = 1.f / det; 773 | float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv }; 774 | 775 | // Compute extent in screen space (by finding eigenvalues of 776 | // 2D covariance matrix). Use extent to compute a bounding rectangle 777 | // of screen-space tiles that this Gaussian overlaps with. Quit if 778 | // rectangle covers 0 tiles. 779 | float mid = 0.5f * (cov.x + cov.z); 780 | float lambda1 = mid + sqrt(max(0.1f, mid * mid - det)); 781 | float lambda2 = mid - sqrt(max(0.1f, mid * mid - det)); 782 | if (lambda1 <= 0.01 || lambda2 <= 0.01 || lambda1 < lambda2 || (lambda1 / lambda2) > 10000.0) { 783 | // Illegal cov matrix, this point should be pruned with zero gradients 784 | radii[idx] = -1.0; 785 | return; 786 | } 787 | float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2))); 788 | 789 | // float my_radius = max(0.0f, ceil(3.f * sqrt(max(lambda1, lambda2)))); 790 | float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) }; 791 | uint2 rect_min, rect_max; 792 | getRect(point_image, (int)my_radius, rect_min, rect_max, grid); 793 | 794 | tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); 795 | if (tiles_touched[idx] == 0) { 796 | // Not rendered since outside of all visible tiles 797 | return; 798 | } 799 | 800 | // Perform tile mask check 801 | if (tile_mask != nullptr){ 802 | int touched = 0; 803 | for (int y = rect_min.y; y < rect_max.y; y++) 804 | { 805 | for (int x = rect_min.x; x < rect_max.x; x++) 806 | { 807 | if (tile_mask[y * grid.x + x]) 808 | { 809 | touched += 1; 810 | } 811 | } 812 | } 813 | tiles_touched[idx] = touched; 814 | if (touched == 0) { 815 | // Not rendered since outside of tile mask 816 | // radii[idx] = -1.0; 817 | return; 818 | } 819 | } 820 | 821 | // Inverse 2D covariance and opacity neatly pack into one float4 822 | conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] }; 823 | 824 | // // Perform accurate per-tile culling test 825 | // // As mentioned in: StopThePop: Sorted Gaussian Splatting for View-Consistent Real-time Rendering 826 | // // Slightly higher threshold for tile-based culling; Otherwise, imprecisions could lead to more tiles in preprocess than in 
duplicate 827 | // constexpr float alpha_threshold = 1.0f / 255.0f; 828 | // const float opacity_power_threshold = log(conic_opacity[idx].w / alpha_threshold); 829 | // tiles_touched[idx] = computeTilebasedCullingTileCount(conic_opacity[idx], point_image, opacity_power_threshold, rect_min, rect_max); 830 | 831 | // If colors have been precomputed, use them, otherwise convert 832 | // spherical harmonics coefficients to RGB color. 833 | if (colors_precomp == nullptr) 834 | { 835 | glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped); 836 | rgb[idx * C + 0] = result.x; 837 | rgb[idx * C + 1] = result.y; 838 | rgb[idx * C + 2] = result.z; 839 | } 840 | 841 | // Store some useful helper data for the next steps. 842 | depths[idx] = p_view.z; 843 | radii[idx] = my_radius; 844 | points_xy_image[idx] = point_image; 845 | } 846 | 847 | // Main rasterization method. Collaboratively works on one tile per 848 | // block, each thread treats one pixel. Alternates between fetching 849 | // and rasterizing data. 850 | template <uint32_t CHANNELS> 851 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y) 852 | renderCUDA( 853 | const uint2* __restrict__ ranges, 854 | const uint32_t* __restrict__ point_list, 855 | int W, int H, 856 | const float2* __restrict__ points_xy_image, 857 | const float* __restrict__ features, 858 | const float* __restrict__ depths, 859 | const float4* __restrict__ conic_opacity, 860 | float* __restrict__ out_alpha, 861 | uint32_t* __restrict__ n_contrib, 862 | const float* __restrict__ bg_color, 863 | float* __restrict__ out_color, 864 | float* __restrict__ out_depth) 865 | { 866 | // Identify current tile and associated min/max pixel range. 867 | auto block = cg::this_thread_block(); 868 | uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; 869 | uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; 870 | uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y, H) }; 871 | uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; 872 | uint32_t pix_id = W * pix.y + pix.x; 873 | float2 pixf = { (float)pix.x, (float)pix.y }; 874 | 875 | // Check if this thread is associated with a valid pixel or outside. 876 | bool inside = pix.x < W && pix.y < H; 877 | // Done threads can help with fetching, but don't rasterize 878 | bool done = !inside; 879 | 880 | // Load start/end range of IDs to process in bit-sorted list. 881 | uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; 882 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); 883 | int toDo = range.y - range.x; 884 | 885 | // Allocate storage for batches of collectively fetched data.
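// Each of the BLOCK_SIZE (= BLOCK_X * BLOCK_Y = 256) threads fetches one
// Gaussian per round into the shared arrays below, so a whole batch is
// staged with a single coalesced pass; all threads of the tile then blend
// the same batch into their own pixel, amortizing global-memory traffic
// across the 16x16 pixels of the tile.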
886 | __shared__ int collected_id[BLOCK_SIZE]; 887 | __shared__ float2 collected_xy[BLOCK_SIZE]; 888 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; 889 | 890 | // Initialize helper variables 891 | float T = 1.0f; 892 | uint32_t contributor = 0; 893 | uint32_t last_contributor = 0; 894 | float C[CHANNELS] = { 0 }; 895 | float D = 0; 896 | 897 | // Iterate over batches until all done or range is complete 898 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) 899 | { 900 | // End if entire block votes that it is done rasterizing 901 | int num_done = __syncthreads_count(done); 902 | if (num_done == BLOCK_SIZE) 903 | break; 904 | 905 | // Collectively fetch per-Gaussian data from global to shared 906 | int progress = i * BLOCK_SIZE + block.thread_rank(); 907 | if (range.x + progress < range.y) 908 | { 909 | int coll_id = point_list[range.x + progress]; 910 | collected_id[block.thread_rank()] = coll_id; 911 | collected_xy[block.thread_rank()] = points_xy_image[coll_id]; 912 | collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; 913 | } 914 | block.sync(); 915 | 916 | // Iterate over current batch 917 | for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) 918 | { 919 | // Keep track of current position in range 920 | contributor++; 921 | 922 | // Resample using conic matrix (cf. "Surface 923 | // Splatting" by Zwicker et al., 2001) 924 | float2 xy = collected_xy[j]; 925 | float2 d = { xy.x - pixf.x, xy.y - pixf.y }; 926 | float4 con_o = collected_conic_opacity[j]; 927 | float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 928 | if (power > 0.0f) 929 | continue; 930 | 931 | // Eq. (2) from 3D Gaussian splatting paper. 932 | // Obtain alpha by multiplying with Gaussian opacity 933 | // and its exponential falloff from mean. 934 | // Avoid numerical instabilities (see paper appendix). 935 | float alpha = min(0.99f, con_o.w * __expf(power)); 936 | if (alpha < 1.0f / 255.0f) 937 | continue; 938 | float test_T = T * (1 - alpha); 939 | 940 | // Eq. (3) from 3D Gaussian splatting paper. 941 | for (int ch = 0; ch < CHANNELS; ch++) 942 | C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T; 943 | D += depths[collected_id[j]] * alpha * T; 944 | T = test_T; 945 | 946 | // Keep track of last range entry to update this 947 | // pixel. 948 | last_contributor = contributor; 949 | 950 | // Early stopping 951 | if (test_T < 0.0001f) 952 | { 953 | done = true; 954 | continue; 955 | } 956 | } 957 | } 958 | 959 | // All threads that treat valid pixel write out their final 960 | // rendering data to the frame and auxiliary buffers. 
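// T is the transmittance remaining after all blended Gaussians, so the
// accumulated alpha is 1 - T and the leftover transmittance weights the
// background: e.g. if T = 0.25 survives, the pixel receives 75%
// splatted radiance plus 25% bg_color.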
961 | if (inside) 962 | { 963 | out_alpha[pix_id] = 1 - T; 964 | n_contrib[pix_id] = last_contributor; 965 | for (int ch = 0; ch < CHANNELS; ch++) 966 | out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch]; 967 | out_depth[pix_id] = D; 968 | } 969 | } 970 | 971 | void FORWARD::render( 972 | const dim3 grid, dim3 block, 973 | const uint2* ranges, 974 | const uint32_t* point_list, 975 | int W, int H, 976 | const float2* means2D, 977 | const float* colors, 978 | const float* depths, 979 | const float4* conic_opacity, 980 | float* out_alpha, 981 | uint32_t* n_contrib, 982 | const float* bg_color, 983 | float* out_color, 984 | float* out_depth) 985 | { 986 | renderCUDA<NUM_CHANNELS> << <grid, block >> > ( 987 | ranges, 988 | point_list, 989 | W, H, 990 | means2D, 991 | colors, 992 | depths, 993 | conic_opacity, 994 | out_alpha, 995 | n_contrib, 996 | bg_color, 997 | out_color, 998 | out_depth); 999 | } 1000 | 1001 | void FORWARD::preprocess(int P, int D, int M, 1002 | const float* means3D, 1003 | const glm::vec3* scales, 1004 | const float scale_modifier, 1005 | const glm::vec4* rotations, 1006 | const float* opacities, 1007 | const float* shs, 1008 | bool* clamped, 1009 | const float* cov3D_precomp, 1010 | const bool* tile_mask, 1011 | const float* colors_precomp, 1012 | const float* viewmatrix, 1013 | const float* projmatrix, 1014 | const glm::vec3* cam_pos, 1015 | const int W, int H, 1016 | const float focal_x, float focal_y, 1017 | const float tan_fovx, float tan_fovy, 1018 | int* radii, 1019 | float2* means2D, 1020 | float* depths, 1021 | float* cov3Ds, 1022 | float* rgb, 1023 | float4* conic_opacity, 1024 | const dim3 grid, 1025 | uint32_t* tiles_touched, 1026 | bool prefiltered) 1027 | { 1028 | preprocessCUDA<NUM_CHANNELS> << <(P + 255) / 256, 256 >> > ( 1029 | P, D, M, 1030 | means3D, 1031 | scales, 1032 | scale_modifier, 1033 | rotations, 1034 | opacities, 1035 | shs, 1036 | clamped, 1037 | cov3D_precomp, 1038 | tile_mask, 1039 | colors_precomp, 1040 | viewmatrix, 1041 | projmatrix, 1042 | cam_pos, 1043 | W, H, 1044 | tan_fovx, tan_fovy, 1045 | focal_x, focal_y, 1046 | radii, 1047 | means2D, 1048 | depths, 1049 | cov3Ds, 1050 | rgb, 1051 | conic_opacity, 1052 | grid, 1053 | tiles_touched, 1054 | prefiltered 1055 | ); 1056 | } 1057 | 1058 | void FORWARD::fusedPreprocess4D(int P, 1059 | const int deg, 1060 | const int deg_t, 1061 | const int M, 1062 | const glm::vec3* means3D, 1063 | const float* cov, 1064 | const glm::vec3* ms, 1065 | const float* cov_t, 1066 | const float* opacities, 1067 | const float* t1, 1068 | const float* shs, 1069 | const float* t, 1070 | const float* viewmatrix, 1071 | const float* projmatrix, 1072 | const float* cam_pos, 1073 | const float duration, 1074 | bool* mask, 1075 | float* occ1, 1076 | glm::vec3* xyz3, 1077 | glm::vec3* rgb3) 1078 | { 1079 | fusedPreprocess4DCUDA << <(P + 255) / 256, 256 >> > ( 1080 | P, 1081 | deg, 1082 | deg_t, 1083 | M, 1084 | means3D, 1085 | cov, 1086 | ms, 1087 | cov_t, 1088 | opacities, 1089 | t1, 1090 | shs, 1091 | t, 1092 | viewmatrix, 1093 | projmatrix, 1094 | cam_pos, 1095 | duration, 1096 | mask, 1097 | occ1, 1098 | xyz3, 1099 | rgb3); 1100 | } 1101 | 1102 | void FORWARD::fusedPreprocess4DSparse(int P, 1103 | const int deg, 1104 | const int deg_t, 1105 | const int M, 1106 | const glm::vec3* means3D, 1107 | const float* cov, 1108 | const glm::vec3* ms, 1109 | const float* cov_t, 1110 | const float* opacities, 1111 | const float* t1, 1112 | const glm::vec3* bases, 1113 | const float* shs, 1114 | const float* t, 1115 | const int* inverse, 1116 | const float*
viewmatrix, 1117 | const float* projmatrix, 1118 | const float* cam_pos, 1119 | const float duration, 1120 | bool* mask, 1121 | float* occ1, 1122 | glm::vec3* xyz3, 1123 | glm::vec3* rgb3) 1124 | { 1125 | fusedPreprocess4DSparseCUDA << <(P + 255) / 256, 256 >> > ( 1126 | P, 1127 | deg, 1128 | deg_t, 1129 | M, 1130 | means3D, 1131 | cov, 1132 | ms, 1133 | cov_t, 1134 | opacities, 1135 | t1, 1136 | bases, 1137 | shs, 1138 | t, 1139 | inverse, 1140 | viewmatrix, 1141 | projmatrix, 1142 | cam_pos, 1143 | duration, 1144 | mask, 1145 | occ1, 1146 | xyz3, 1147 | rgb3); 1148 | } 1149 | -------------------------------------------------------------------------------- /cuda_rasterizer/forward.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_FORWARD_H_INCLUDED 13 | #define CUDA_RASTERIZER_FORWARD_H_INCLUDED 14 | 15 | #include <cuda.h> 16 | #include "cuda_runtime.h" 17 | #include "device_launch_parameters.h" 18 | #define GLM_FORCE_CUDA 19 | #include <glm/glm.hpp> 20 | 21 | namespace FORWARD 22 | { 23 | // Perform initial steps for each Gaussian prior to rasterization. 24 | void preprocess(int P, int D, int M, 25 | const float* orig_points, 26 | const glm::vec3* scales, 27 | const float scale_modifier, 28 | const glm::vec4* rotations, 29 | const float* opacities, 30 | const float* shs, 31 | bool* clamped, 32 | const float* cov3D_precomp, 33 | const bool* tile_mask, 34 | const float* colors_precomp, 35 | const float* viewmatrix, 36 | const float* projmatrix, 37 | const glm::vec3* cam_pos, 38 | const int W, int H, 39 | const float focal_x, float focal_y, 40 | const float tan_fovx, float tan_fovy, 41 | int* radii, 42 | float2* points_xy_image, 43 | float* depths, 44 | float* cov3Ds, 45 | float* colors, 46 | float4* conic_opacity, 47 | // float* comp, 48 | const dim3 grid, 49 | uint32_t* tiles_touched, 50 | bool prefiltered); 51 | 52 | // Main rasterization method.
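// Launched with one thread block per image tile (grid) and one thread
// per pixel (block); consumes the per-tile [start, end) ranges and the
// depth-sorted point list produced during binning.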
53 | void render( 54 | const dim3 grid, dim3 block, 55 | const uint2* ranges, 56 | const uint32_t* point_list, 57 | int W, int H, 58 | const float2* points_xy_image, 59 | const float* features, 60 | const float* depths, 61 | const float4* conic_opacity, 62 | float* out_alpha, 63 | uint32_t* n_contrib, 64 | const float* bg_color, 65 | float* out_color, 66 | float* out_depth); 67 | 68 | void fusedPreprocess4D(int P, 69 | const int deg, 70 | const int deg_t, 71 | const int M, 72 | const glm::vec3* means3D, 73 | const float* cov, 74 | const glm::vec3* ms, 75 | const float* cov_t, 76 | const float* opacities, 77 | const float* t1, 78 | const float* shs, 79 | const float* t, 80 | const float* viewmatrix, 81 | const float* projmatrix, 82 | const float* cam_pos, 83 | const float duration, 84 | bool* mask, 85 | float* occ1, 86 | glm::vec3* xyz3, 87 | glm::vec3* rgb3); 88 | 89 | void fusedPreprocess4DSparse(int P, 90 | const int deg, 91 | const int deg_t, 92 | const int M, 93 | const glm::vec3* means3D, 94 | const float* cov, 95 | const glm::vec3* ms, 96 | const float* cov_t, 97 | const float* opacities, 98 | const float* t1, 99 | const glm::vec3* bases, 100 | const float* shs, 101 | const float* t, 102 | const int* inverse, 103 | const float* viewmatrix, 104 | const float* projmatrix, 105 | const float* cam_pos, 106 | const float duration, 107 | bool* mask, 108 | float* occ1, 109 | glm::vec3* xyz3, 110 | glm::vec3* rgb3); 111 | 112 | void computeCov3D(int P, 113 | const glm::vec3* scaling_xyz, 114 | const glm::vec4* rotation_l, 115 | float* cov); 116 | 117 | void computeCov4D(int P, 118 | const glm::vec4* scaling_xyzt, 119 | const glm::vec4* rotation_l, 120 | const glm::vec4* rotation_r, 121 | float* cov, 122 | glm::vec3* ms, 123 | float* cov_t); 124 | 125 | void computeSH4D( 126 | int P, 127 | int deg, int deg_t, int max_coeffs, 128 | const float* sh, 129 | const glm::vec3* dir, 130 | const float* dir_t, 131 | const float time_duration, 132 | glm::vec3* rgb); 133 | } 134 | 135 | 136 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_H_INCLUDED 13 | #define CUDA_RASTERIZER_H_INCLUDED 14 | 15 | #include <vector> 16 | #include <functional> 17 | #include "cuda_fp16.h" 18 | 19 | namespace CudaRasterizer 20 | { 21 | class Rasterizer 22 | { 23 | public: 24 | 25 | static void fusedPreprocess4D( 26 | const int P, 27 | const int deg, 28 | const int deg_t, 29 | const int M, 30 | const float* means3D, 31 | const float* cov, 32 | const float* ms, 33 | const float* cov_t, 34 | const float* opacities, 35 | const float* t1, 36 | const float* sh, 37 | const float* t, 38 | const float* viewmatrix, 39 | const float* projmatrix, 40 | const float* cam_pos, 41 | const float duration, 42 | bool* mask, 43 | float* occ1, 44 | float* xyz3, 45 | float* rgb3); 46 | 47 | static void fusedPreprocess4DSparse( 48 | const int P, 49 | const int deg, 50 | const int deg_t, 51 | const int M, 52 | const float* means3D, 53 | const float* cov, 54 | const float* ms, 55 | const float* cov_t, 56 | const float* opacities, 57 | const float* t1, 58 | const float* base, 59 | const float* sh, 60 | const float* t, 61 | const int* inverse, 62 | const float* viewmatrix, 63 | const float* projmatrix, 64 | const float* cam_pos, 65 | const float duration, 66 | bool* mask, 67 | float* occ1, 68 | float* xyz3, 69 | float* rgb3); 70 | 71 | static void markVisible( 72 | int P, 73 | float* means3D, 74 | float* viewmatrix, 75 | float* projmatrix, 76 | bool* present); 77 | 78 | static void computeCov3D( 79 | int P, 80 | const float* scaling_xyz, 81 | const float* rotation_l, 82 | float* cov); 83 | 84 | static void computeCov3DBackward( 85 | int P, 86 | const float* scaling_xyz, 87 | const float* rotation_l, 88 | const float* dL_dcov, 89 | float* dL_dscaling_xyz, 90 | float* dL_drotation_l); 91 | 92 | static void computeCov4D( 93 | int P, 94 | const float* scaling_xyzt, 95 | const float* rotation_l, 96 | const float* rotation_r, 97 | float* cov, 98 | float* ms, 99 | float* cov_t); 100 | 101 | static void computeSH4D( 102 | int P, 103 | int deg, int deg_t, int max_coeffs, 104 | const float* shs, 105 | const float* dir, 106 | const float* dir_t, 107 | const float time_duration, 108 | float* rgb); 109 | 110 | static void computeSH4DBackward( 111 | int P, 112 | int deg, int deg_t, int max_coeffs, 113 | const float* shs, 114 | const float* dir, 115 | const float* dir_t, 116 | const float time_duration, 117 | const float* dL_drgb, 118 | float* dL_dsh, 119 | float* dL_ddir, 120 | float* dL_ddir_t); 121 | 122 | static void computeCov4DBackward( 123 | int P, 124 | const float* scaling_xyzt, 125 | const float* rotation_l, 126 | const float* rotation_r, 127 | const float* dL_dcov, 128 | const float* dL_dms, 129 | const float* dL_dcov_t, 130 | float* dL_dscaling_xyzt, 131 | float* dL_drotation_l, 132 | float* dL_drotation_r); 133 | 134 | static int forward( 135 | std::function<char* (size_t)> geometryBuffer, 136 | std::function<char* (size_t)> binningBuffer, 137 | std::function<char* (size_t)> imageBuffer, 138 | const int P, int D, int M, 139 | const float* background, 140 | const int width, int height, 141 | const float* means3D, 142 | const float* shs, 143 | const float* colors_precomp, 144 | const float* opacities, 145 | const float* scales, 146 | const float scale_modifier, 147 | const float* rotations, 148 | const float* cov3D_precomp, 149 | const bool* tile_mask, 150 | const float* viewmatrix, 151 | const float* projmatrix, 152 | const float* cam_pos, 153 | const float tan_fovx, const float tan_fovy, 154 | const bool prefiltered, 155 | float* out_color,
156 | float* out_depth, 157 | float* out_alpha, 158 | int* radii = nullptr, 159 | bool debug = false); 160 | 161 | static void backward( 162 | const int P, int D, int M, int R, 163 | const float* background, 164 | const int width, int height, 165 | const float* means3D, 166 | const float* shs, 167 | const float* colors_precomp, 168 | const float* scales, 169 | const float scale_modifier, 170 | const float* rotations, 171 | const float* cov3D_precomp, 172 | const float* viewmatrix, 173 | const float* projmatrix, 174 | const float* campos, 175 | const float tan_fovx, const float tan_fovy, 176 | const int* radii, 177 | char* geom_buffer, 178 | char* binning_buffer, 179 | char* image_buffer, 180 | const float* accum_alphas, 181 | const float* dL_dpix, 182 | const float* dL_dpix_depth, 183 | const float* dL_dpix_alpha, 184 | float* dL_dmean2D, 185 | float* dL_dabsmean2D, 186 | float* dL_dconic, 187 | float* dL_dopacity, 188 | float* dL_dcolor, 189 | float* dL_ddepth, 190 | float* dL_dmean3D, 191 | float* dL_dcov3D, 192 | float* dL_dsh, 193 | float* dL_dscale, 194 | float* dL_drot, 195 | bool debug); 196 | }; 197 | }; 198 | 199 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer_impl.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include "rasterizer_impl.h" 13 | #include <iostream> 14 | #include <fstream> 15 | #include <algorithm> 16 | #include <numeric> 17 | #include <cuda.h> 18 | #include "cuda_runtime.h" 19 | #include "device_launch_parameters.h" 20 | #include <cub/cub.cuh> 21 | #include <cub/device/device_radix_sort.cuh> 22 | #define GLM_FORCE_CUDA 23 | #include <glm/glm.hpp> 24 | 25 | #include <cooperative_groups.h> 26 | #include <cooperative_groups/reduce.h> 27 | namespace cg = cooperative_groups; 28 | 29 | #include "auxiliary.h" 30 | #include "forward.h" 31 | #include "backward.h" 32 | 33 | // Helper function to find, on the CPU, the bit position just above 34 | // the most significant set bit (MSB) of n. 35 | uint32_t getHigherMsb(uint32_t n) 36 | { 37 | uint32_t msb = sizeof(n) * 4; 38 | uint32_t step = msb; 39 | while (step > 1) 40 | { 41 | step /= 2; 42 | if (n >> msb) 43 | msb += step; 44 | else 45 | msb -= step; 46 | } 47 | if (n >> msb) 48 | msb++; 49 | return msb; 50 | } 51 | 52 | // Wrapper method to call auxiliary coarse frustum containment test. 53 | // Mark all Gaussians that pass it. 54 | __global__ void checkFrustum(int P, 55 | const float* orig_points, 56 | const float* viewmatrix, 57 | const float* projmatrix, 58 | bool* present) 59 | { 60 | auto idx = cg::this_grid().thread_rank(); 61 | if (idx >= P) 62 | return; 63 | 64 | float3 p_view; 65 | present[idx] = in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view); 66 | } 67 | 68 | // Generates one key/value pair for all Gaussian / tile overlaps. 69 | // Run once per Gaussian (1:N mapping).
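// The 64-bit key layout is [ tile ID | depth ]: the tile index occupies
// the upper 32 bits and the raw IEEE-754 bits of the positive view-space
// depth the lower 32 bits, so a single radix sort groups instances by
// tile and orders them front-to-back within each tile (positive floats
// compare the same as their bit patterns).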
70 | __global__ void duplicateWithKeys( 71 | int P, 72 | const float2* points_xy, 73 | const float4* conic_opacity, 74 | const float* depths, 75 | const uint32_t* offsets, 76 | uint64_t* gaussian_keys_unsorted, 77 | uint32_t* gaussian_values_unsorted, 78 | int* radii, 79 | dim3 grid, 80 | const bool* tile_mask) 81 | { 82 | auto idx = cg::this_grid().thread_rank(); 83 | if (idx >= P) 84 | return; 85 | 86 | // Generate no key/value pair for invisible Gaussians 87 | if (radii[idx] > 0) 88 | { 89 | // Find this Gaussian's offset in buffer for writing keys/values. 90 | uint32_t off = (idx == 0) ? 0 : offsets[idx - 1]; 91 | uint2 rect_min, rect_max; 92 | 93 | getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid); 94 | 95 | // For each tile that the bounding rect overlaps, emit a 96 | // key/value pair. The key is | tile ID | depth |, 97 | // and the value is the ID of the Gaussian. Sorting the values 98 | // with this key yields Gaussian IDs in a list, such that they 99 | // are first sorted by tile and then by depth. 100 | for (int y = rect_min.y; y < rect_max.y; y++) 101 | { 102 | for (int x = rect_min.x; x < rect_max.x; x++) 103 | { 104 | uint64_t key = y * grid.x + x; 105 | if (tile_mask != nullptr && !tile_mask[key]) 106 | { 107 | continue; 108 | } 109 | else 110 | { 111 | 112 | // constexpr float alpha_threshold = 1.0f / 255.0f; 113 | // const float opacity_power_threshold = log(conic_opacity[idx].w / alpha_threshold); 114 | // glm::vec2 max_pos; 115 | // const glm::vec2 tile_min = {x * BLOCK_X, y * BLOCK_Y}; 116 | // const glm::vec2 tile_max = {(x + 1) * BLOCK_X - 1, (y + 1) * BLOCK_Y - 1}; 117 | // float max_opac_factor = max_contrib_power_rect_gaussian_float(conic_opacity[idx], points_xy[idx], tile_min, tile_max, max_pos); 118 | 119 | // if (max_opac_factor > opacity_power_threshold) { 120 | // continue; 121 | // } 122 | 123 | key <<= 32; 124 | key |= *((uint32_t*)&depths[idx]); 125 | gaussian_keys_unsorted[off] = key; 126 | gaussian_values_unsorted[off] = idx; 127 | off++; 128 | } 129 | } 130 | } 131 | } 132 | } 133 | 134 | // Check keys to see if it is at the start/end of one tile's range in 135 | // the full sorted list. If yes, write start/end of this tile. 136 | // Run once per instanced (duplicated) Gaussian ID. 137 | __global__ void identifyTileRanges(int L, uint64_t* point_list_keys, uint2* ranges) 138 | { 139 | auto idx = cg::this_grid().thread_rank(); 140 | if (idx >= L) 141 | return; 142 | 143 | // Read tile ID from key. Update start/end of tile range if at limit. 
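// E.g. sorted tile IDs [3, 3, 5, 5, 5, 9] produce ranges[3] = {0, 2},
// ranges[5] = {2, 5}, ranges[9] = {5, 6}; tiles that received no
// Gaussians keep the zero-initialized {0, 0} set by the caller.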
144 | uint64_t key = point_list_keys[idx]; 145 | uint32_t currtile = key >> 32; 146 | if (idx == 0) 147 | ranges[currtile].x = 0; 148 | else 149 | { 150 | uint32_t prevtile = point_list_keys[idx - 1] >> 32; 151 | if (currtile != prevtile) 152 | { 153 | ranges[prevtile].y = idx; 154 | ranges[currtile].x = idx; 155 | } 156 | } 157 | if (idx == L - 1) 158 | ranges[currtile].y = L; 159 | } 160 | 161 | void CudaRasterizer::Rasterizer::computeCov3D(int P, 162 | const float* scaling_xyz, 163 | const float* rotation_l, 164 | float* cov) 165 | { 166 | FORWARD::computeCov3D( 167 | P, 168 | (glm::vec3*)scaling_xyz, 169 | (glm::vec4*)rotation_l, 170 | cov); 171 | } 172 | 173 | void CudaRasterizer::Rasterizer::computeCov3DBackward(int P, 174 | const float* scaling_xyz, 175 | const float* rotation_l, 176 | const float* dL_dcov, 177 | float* dL_dscaling_xyz, 178 | float* dL_drotation_l) 179 | { 180 | BACKWARD::computeCov3DBackward( 181 | P, 182 | (glm::vec3*)scaling_xyz, 183 | (glm::vec4*)rotation_l, 184 | dL_dcov, 185 | (glm::vec3*)dL_dscaling_xyz, 186 | (glm::vec4*)dL_drotation_l); 187 | } 188 | 189 | void CudaRasterizer::Rasterizer::computeCov4D(int P, 190 | const float* scaling_xyzt, 191 | const float* rotation_l, 192 | const float* rotation_r, 193 | float* cov, 194 | float* ms, 195 | float* cov_t) 196 | { 197 | FORWARD::computeCov4D( 198 | P, 199 | (glm::vec4*)scaling_xyzt, 200 | (glm::vec4*)rotation_l, 201 | (glm::vec4*)rotation_r, 202 | cov, 203 | (glm::vec3*)ms, 204 | cov_t); 205 | } 206 | 207 | void CudaRasterizer::Rasterizer::computeCov4DBackward(int P, 208 | const float* scaling_xyzt, 209 | const float* rotation_l, 210 | const float* rotation_r, 211 | const float* dL_dcov, 212 | const float* dL_dms, 213 | const float* dL_dcov_t, 214 | float* dL_dscaling_xyzt, 215 | float* dL_drotation_l, 216 | float* dL_drotation_r) 217 | { 218 | BACKWARD::computeCov4DBackward( 219 | P, 220 | (glm::vec4*)scaling_xyzt, 221 | (glm::vec4*)rotation_l, 222 | (glm::vec4*)rotation_r, 223 | dL_dcov, 224 | (glm::vec3*)dL_dms, 225 | dL_dcov_t, 226 | (glm::vec4*)dL_dscaling_xyzt, 227 | (glm::vec4*)dL_drotation_l, 228 | (glm::vec4*)dL_drotation_r); 229 | } 230 | 231 | 232 | void CudaRasterizer::Rasterizer::computeSH4D(int P, 233 | int deg, int deg_t, int max_coeffs, 234 | const float* shs, 235 | const float* dir, 236 | const float* dir_t, 237 | const float time_duration, 238 | float* rgb) 239 | { 240 | FORWARD::computeSH4D( 241 | P, 242 | deg, 243 | deg_t, 244 | max_coeffs, 245 | shs, 246 | (glm::vec3*)dir, 247 | dir_t, 248 | time_duration, 249 | (glm::vec3*)rgb 250 | ); 251 | } 252 | 253 | void CudaRasterizer::Rasterizer::computeSH4DBackward( 254 | int P, 255 | int deg, int deg_t, int max_coeffs, 256 | const float* shs, 257 | const float* dir, 258 | const float* dir_t, 259 | const float time_duration, 260 | const float* dL_drgb, 261 | float* dL_dsh, 262 | float* dL_ddir, 263 | float* dL_ddir_t 264 | ) 265 | { 266 | BACKWARD::computeSH4DBackward( 267 | P, 268 | deg, 269 | deg_t, 270 | max_coeffs, 271 | shs, 272 | (glm::vec3*)dir, 273 | dir_t, 274 | time_duration, 275 | (glm::vec3*)dL_drgb, 276 | dL_dsh, 277 | (glm::vec3*)dL_ddir, 278 | dL_ddir_t 279 | ); 280 | } 281 | 282 | // Marginalization & color computation 283 | void CudaRasterizer::Rasterizer::fusedPreprocess4DSparse( 284 | const int P, 285 | const int deg, 286 | const int deg_t, 287 | const int M, 288 | const float* means3D, 289 | const float* cov, 290 | const float* ms, 291 | const float* cov_t, 292 | const float* opacities, 293 | const float* t1, 294 | const 
float* base, 295 | const float* sh, 296 | const float* t, 297 | const int* inverse, 298 | const float* viewmatrix, 299 | const float* projmatrix, 300 | const float* cam_pos, 301 | const float duration, 302 | bool* mask, 303 | float* occ1, 304 | float* xyz3, 305 | float* rgb3) 306 | { 307 | FORWARD::fusedPreprocess4DSparse( 308 | P, 309 | deg, 310 | deg_t, 311 | M, 312 | (glm::vec3*)means3D, 313 | cov, 314 | (glm::vec3*)ms, 315 | cov_t, 316 | opacities, 317 | t1, 318 | (glm::vec3*)base, 319 | sh, 320 | t, 321 | inverse, 322 | viewmatrix, 323 | projmatrix, 324 | cam_pos, 325 | duration, 326 | mask, 327 | occ1, 328 | (glm::vec3*)xyz3, 329 | (glm::vec3*)rgb3); 330 | } 331 | 332 | // Marginalization & color computation 333 | void CudaRasterizer::Rasterizer::fusedPreprocess4D( 334 | const int P, 335 | const int deg, 336 | const int deg_t, 337 | const int M, 338 | const float* means3D, 339 | const float* cov, 340 | const float* ms, 341 | const float* cov_t, 342 | const float* opacities, 343 | const float* t1, 344 | const float* sh, 345 | const float* t, 346 | const float* viewmatrix, 347 | const float* projmatrix, 348 | const float* cam_pos, 349 | const float duration, 350 | bool* mask, 351 | float* occ1, 352 | float* xyz3, 353 | float* rgb3) 354 | { 355 | FORWARD::fusedPreprocess4D( 356 | P, 357 | deg, 358 | deg_t, 359 | M, 360 | (glm::vec3*)means3D, 361 | cov, 362 | (glm::vec3*)ms, 363 | cov_t, 364 | opacities, 365 | t1, 366 | sh, 367 | t, 368 | viewmatrix, 369 | projmatrix, 370 | cam_pos, 371 | duration, 372 | mask, 373 | occ1, 374 | (glm::vec3*)xyz3, 375 | (glm::vec3*)rgb3); 376 | } 377 | 378 | // Mark Gaussians as visible/invisible, based on view frustum testing 379 | void CudaRasterizer::Rasterizer::markVisible( 380 | int P, 381 | float* means3D, 382 | float* viewmatrix, 383 | float* projmatrix, 384 | bool* present) 385 | { 386 | checkFrustum << <(P + 255) / 256, 256 >> > ( 387 | P, 388 | means3D, 389 | viewmatrix, projmatrix, 390 | present); 391 | } 392 | 393 | CudaRasterizer::GeometryState CudaRasterizer::GeometryState::fromChunk(char*& chunk, size_t P) 394 | { 395 | GeometryState geom; 396 | obtain(chunk, geom.depths, P, 128); 397 | obtain(chunk, geom.clamped, P * 3, 128); 398 | obtain(chunk, geom.internal_radii, P, 128); 399 | obtain(chunk, geom.means2D, P, 128); 400 | obtain(chunk, geom.cov3D, P * 6, 128); 401 | obtain(chunk, geom.conic_opacity, P, 128); 402 | obtain(chunk, geom.rgb, P * 3, 128); 403 | obtain(chunk, geom.tiles_touched, P, 128); 404 | obtain(chunk, geom.point_offsets, P, 128); 405 | cub::DeviceScan::InclusiveSum(nullptr, geom.scan_size, geom.tiles_touched, geom.tiles_touched, P); 406 | obtain(chunk, geom.scanning_space, geom.scan_size, 128); 407 | return geom; 408 | } 409 | 410 | CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N, size_t M) 411 | { 412 | ImageState img; 413 | obtain(chunk, img.n_contrib, N, 128); 414 | obtain(chunk, img.ranges, M, 128); 415 | return img; 416 | } 417 | 418 | CudaRasterizer::BinningState CudaRasterizer::BinningState::fromChunk(char*& chunk, size_t P) 419 | { 420 | BinningState binning; 421 | obtain(chunk, binning.point_list, P, 128); 422 | obtain(chunk, binning.point_list_unsorted, P, 128); 423 | obtain(chunk, binning.point_list_keys, P, 128); 424 | obtain(chunk, binning.point_list_keys_unsorted, P, 128); 425 | cub::DeviceRadixSort::SortPairs( 426 | nullptr, binning.sorting_size, 427 | binning.point_list_keys_unsorted, binning.point_list_keys, 428 | binning.point_list_unsorted, binning.point_list, P); 
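// Note the cub::DeviceRadixSort::SortPairs call above passes nullptr as
// temporary storage: CUB then only writes the required scratch size into
// binning.sorting_size. That scratch buffer is carved out of the chunk
// below, and the actual sort runs later in forward().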
429 | obtain(chunk, binning.list_sorting_space, binning.sorting_size, 128); 430 | return binning; 431 | } 432 | 433 | // Forward rendering procedure for differentiable rasterization 434 | // of Gaussians. 435 | int CudaRasterizer::Rasterizer::forward( 436 | std::function<char* (size_t)> geometryBuffer, 437 | std::function<char* (size_t)> binningBuffer, 438 | std::function<char* (size_t)> imageBuffer, 439 | const int P, int D, int M, 440 | const float* background, 441 | const int width, int height, 442 | const float* means3D, 443 | const float* shs, 444 | const float* colors_precomp, 445 | const float* opacities, 446 | const float* scales, 447 | const float scale_modifier, 448 | const float* rotations, 449 | const float* cov3D_precomp, 450 | const bool* tile_mask, 451 | const float* viewmatrix, 452 | const float* projmatrix, 453 | const float* cam_pos, 454 | const float tan_fovx, const float tan_fovy, 455 | const bool prefiltered, 456 | float* out_color, 457 | float* out_depth, 458 | float* out_alpha, 459 | int* radii, 460 | bool debug) 461 | { 462 | const float focal_y = height / (2.0f * tan_fovy); 463 | const float focal_x = width / (2.0f * tan_fovx); 464 | 465 | size_t chunk_size = required<GeometryState>(P); 466 | char* chunkptr = geometryBuffer(chunk_size); // memory allocation 467 | GeometryState geomState = GeometryState::fromChunk(chunkptr, P); 468 | 469 | if (radii == nullptr) 470 | { 471 | radii = geomState.internal_radii; 472 | } 473 | 474 | const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); 475 | dim3 block(BLOCK_X, BLOCK_Y, 1); 476 | 477 | // Dynamically resize image-based auxiliary buffers during training 478 | size_t img_chunk_size = required<ImageState>(width * height, tile_grid.x * tile_grid.y); 479 | char* img_chunkptr = imageBuffer(img_chunk_size); // memory allocation 480 | ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height, tile_grid.x * tile_grid.y); 481 | 482 | if (NUM_CHANNELS != 3 && colors_precomp == nullptr) 483 | { 484 | throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!"); 485 | } 486 | 487 | // Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB) 488 | CHECK_CUDA(FORWARD::preprocess( 489 | P, D, M, 490 | means3D, 491 | (glm::vec3*)scales, 492 | scale_modifier, 493 | (glm::vec4*)rotations, 494 | opacities, 495 | shs, 496 | geomState.clamped, 497 | cov3D_precomp, 498 | tile_mask, 499 | colors_precomp, 500 | viewmatrix, projmatrix, 501 | (glm::vec3*)cam_pos, 502 | width, height, 503 | focal_x, focal_y, 504 | tan_fovx, tan_fovy, 505 | radii, 506 | geomState.means2D, 507 | geomState.depths, 508 | geomState.cov3D, 509 | geomState.rgb, 510 | geomState.conic_opacity, 511 | tile_grid, 512 | geomState.tiles_touched, 513 | prefiltered 514 | ), debug) 515 | 516 | // Compute prefix sum over full list of touched tile counts by Gaussians 517 | // E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8] 518 | CHECK_CUDA(cub::DeviceScan::InclusiveSum(geomState.scanning_space, geomState.scan_size, geomState.tiles_touched, geomState.point_offsets, P), debug) 519 | 520 | // Retrieve total number of Gaussian instances to launch and resize aux buffers 521 | int num_rendered; 522 | CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1, sizeof(int), cudaMemcpyDeviceToHost), debug); 523 | 524 | size_t binning_chunk_size = required<BinningState>(num_rendered); 525 | char* binning_chunkptr = binningBuffer(binning_chunk_size); // memory allocation 526 | BinningState binState = BinningState::fromChunk(binning_chunkptr, num_rendered); 527 | 528 | // For each
instance to be rendered, produce adequate [ tile | depth ] key 529 | // and corresponding dublicated Gaussian indices to be sorted 530 | duplicateWithKeys << <(P + 255) / 256, 256 >> > ( 531 | P, 532 | geomState.means2D, 533 | geomState.conic_opacity, 534 | geomState.depths, 535 | geomState.point_offsets, 536 | binState.point_list_keys_unsorted, 537 | binState.point_list_unsorted, 538 | radii, 539 | tile_grid, 540 | tile_mask) 541 | CHECK_CUDA(, debug) 542 | 543 | int bit = getHigherMsb(tile_grid.x * tile_grid.y); 544 | 545 | // Sort complete list of (duplicated) Gaussian indices by keys 546 | CHECK_CUDA(cub::DeviceRadixSort::SortPairs( 547 | binState.list_sorting_space, 548 | binState.sorting_size, 549 | binState.point_list_keys_unsorted, binState.point_list_keys, 550 | binState.point_list_unsorted, binState.point_list, 551 | num_rendered, 0, 32 + bit), debug) 552 | 553 | CHECK_CUDA(cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)), debug); 554 | 555 | // Identify start and end of per-tile workloads in sorted list 556 | if (num_rendered > 0) 557 | identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > ( 558 | num_rendered, 559 | binState.point_list_keys, 560 | imgState.ranges); 561 | CHECK_CUDA(, debug) 562 | 563 | // Let each tile blend its range of Gaussians independently in parallel 564 | const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : geomState.rgb; 565 | CHECK_CUDA(FORWARD::render( 566 | tile_grid, 567 | block, 568 | imgState.ranges, 569 | binState.point_list, 570 | width, height, 571 | geomState.means2D, 572 | feature_ptr, 573 | geomState.depths, 574 | geomState.conic_opacity, 575 | out_alpha, 576 | imgState.n_contrib, 577 | background, 578 | out_color, 579 | out_depth), debug) 580 | 581 | return num_rendered; 582 | } 583 | 584 | // Produce necessary gradients for optimization, corresponding 585 | // to forward render pass 586 | void CudaRasterizer::Rasterizer::backward( 587 | const int P, int D, int M, int R, 588 | const float* background, 589 | const int width, int height, 590 | const float* means3D, 591 | const float* shs, 592 | const float* colors_precomp, 593 | const float* scales, 594 | const float scale_modifier, 595 | const float* rotations, 596 | const float* cov3D_precomp, 597 | const float* viewmatrix, 598 | const float* projmatrix, 599 | const float* campos, 600 | const float tan_fovx, const float tan_fovy, 601 | const int* radii, 602 | char* geom_buffer, 603 | char* binning_buffer, 604 | char* img_buffer, 605 | const float* accum_alphas, 606 | const float* dL_dpix, 607 | const float* dL_dpix_depth, 608 | const float* dL_dpix_alpha, 609 | float* dL_dmean2D, 610 | float* dL_dabsmean2D, 611 | float* dL_dconic, 612 | float* dL_dopacity, 613 | float* dL_dcolor, 614 | float* dL_ddepth, 615 | float* dL_dmean3D, 616 | float* dL_dcov3D, 617 | float* dL_dsh, 618 | float* dL_dscale, 619 | float* dL_drot, 620 | bool debug) 621 | { 622 | 623 | const float focal_y = height / (2.0f * tan_fovy); 624 | const float focal_x = width / (2.0f * tan_fovx); 625 | 626 | const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); 627 | const dim3 block(BLOCK_X, BLOCK_Y, 1); 628 | 629 | GeometryState geomState = GeometryState::fromChunk(geom_buffer, P); 630 | BinningState binState = BinningState::fromChunk(binning_buffer, R); 631 | ImageState imgState = ImageState::fromChunk(img_buffer, width * height, tile_grid.x * tile_grid.y); 632 | 633 | if (radii == nullptr) 634 | { 635 | radii = geomState.internal_radii; 
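// The caller kept no radius buffer for the backward pass, so fall back
// to the per-Gaussian radii cached in the geometry state by forward().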
636 | } 637 | 638 | // Compute loss gradients w.r.t. 2D mean position, conic matrix, 639 | // opacity and RGB of Gaussians from per-pixel loss gradients. 640 | // If we were given precomputed colors and not SHs, use them. 641 | const float* color_ptr = (colors_precomp != nullptr) ? colors_precomp : geomState.rgb; 642 | CHECK_CUDA(BACKWARD::render( 643 | tile_grid, 644 | block, 645 | imgState.ranges, 646 | binState.point_list, 647 | width, height, 648 | background, 649 | geomState.means2D, 650 | geomState.conic_opacity, 651 | color_ptr, 652 | geomState.depths, 653 | accum_alphas, 654 | imgState.n_contrib, 655 | dL_dpix, 656 | dL_dpix_depth, 657 | dL_dpix_alpha, 658 | (float3*)dL_dmean2D, 659 | (float3*)dL_dabsmean2D, 660 | (float4*)dL_dconic, 661 | dL_dopacity, 662 | dL_dcolor, 663 | dL_ddepth), debug) 664 | 665 | // Take care of the rest of preprocessing. Was the precomputed covariance 666 | // given to us or a scales/rot pair? If precomputed, pass that. If not, 667 | // use the one we computed ourselves. 668 | const float* cov3D_ptr = (cov3D_precomp != nullptr) ? cov3D_precomp : geomState.cov3D; 669 | CHECK_CUDA(BACKWARD::preprocess(P, D, M, 670 | (float3*)means3D, 671 | radii, 672 | shs, 673 | geomState.clamped, 674 | (glm::vec3*)scales, 675 | (glm::vec4*)rotations, 676 | scale_modifier, 677 | cov3D_ptr, 678 | viewmatrix, 679 | projmatrix, 680 | focal_x, focal_y, 681 | tan_fovx, tan_fovy, 682 | (glm::vec3*)campos, 683 | (float3*)dL_dmean2D, 684 | dL_dconic, 685 | (glm::vec3*)dL_dmean3D, 686 | dL_dcolor, 687 | dL_ddepth, 688 | dL_dcov3D, 689 | dL_dsh, 690 | (glm::vec3*)dL_dscale, 691 | (glm::vec4*)dL_drot), debug) 692 | } -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer_impl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #pragma once 13 | 14 | #include <iostream> 15 | #include <vector> 16 | #include <cstdint> 17 | #include "rasterizer.h" 18 | // #include "auxiliary_half.h" 19 | #include <cuda_runtime_api.h> 20 | 21 | namespace CudaRasterizer 22 | { 23 | template <typename T> 24 | static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment) 25 | { 26 | std::size_t offset = (reinterpret_cast<std::uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1); 27 | ptr = reinterpret_cast<T*>(offset); 28 | chunk = reinterpret_cast<char*>(ptr + count); 29 | } 30 | 31 | struct GeometryState 32 | { 33 | float* depths; 34 | bool* clamped; 35 | int* internal_radii; 36 | float2* means2D; 37 | float* cov3D; 38 | float4* conic_opacity; 39 | float* rgb; 40 | uint32_t* tiles_touched; 41 | uint32_t* point_offsets; 42 | char* scanning_space; 43 | size_t scan_size; 44 | 45 | static GeometryState fromChunk(char*& chunk, size_t P); 46 | }; 47 | 48 | struct ImageState 49 | { 50 | uint32_t* n_contrib; 51 | uint2* ranges; 52 | 53 | static ImageState fromChunk(char*& chunk, size_t N, size_t M); 54 | }; 55 | 56 | struct BinningState 57 | { 58 | uint32_t* point_list; 59 | uint32_t* point_list_unsorted; 60 | uint64_t* point_list_keys; 61 | uint64_t* point_list_keys_unsorted; 62 | char* list_sorting_space; 63 | size_t sorting_size; 64 | 65 | static BinningState fromChunk(char*& chunk, size_t P); 66 | }; 67 | 68 | template <typename T> 69 | size_t required(size_t P) 70 | { 71 | char* size = nullptr; 72 | T::fromChunk(size, P); 73 | return ((size_t)size) + 128; 74 | } 75 | 76 | template <typename T> 77 | size_t required(size_t P, size_t N) 78 | { 79 | char* size = nullptr; 80 | T::fromChunk(size, P, N); 81 | return ((size_t)size) + 128; 82 | } 83 | }; 84 | -------------------------------------------------------------------------------- /diff_gauss/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | from typing import NamedTuple 13 | import torch.nn as nn 14 | import torch 15 | from . 
import _C 16 | 17 | BLOCK_X = 16 18 | BLOCK_Y = 16 19 | 20 | 21 | def fused_preprocess_4d_sparse(xyz3, cov6, ms3, cov_t1, occ1, t1, base, feat, t, inverse, world_view_transform, full_proj_transform, cam_pos, deg, deg_t, duration): 22 | # Mark visible points (based on frustum culling for camera) with a boolean 23 | with torch.no_grad(): 24 | mask, occ1, xyz3, rgb3 = _C.fused_preprocess_4d_sparse(xyz3, cov6, ms3, cov_t1, occ1, t1, base, feat, t, inverse, world_view_transform, full_proj_transform, cam_pos, deg, deg_t, duration) 25 | 26 | return mask, occ1, xyz3, rgb3 # mask and output 27 | 28 | def fused_preprocess_4d(xyz3, cov6, ms3, cov_t1, occ1, t1, feat, t, world_view_transform, full_proj_transform, cam_pos, deg, deg_t, duration): 29 | # Mark visible points (based on frustum culling for camera) with a boolean 30 | with torch.no_grad(): 31 | mask, occ1, xyz3, rgb3 = _C.fused_preprocess_4d(xyz3, cov6, ms3, cov_t1, occ1, t1, feat, t, world_view_transform, full_proj_transform, cam_pos, deg, deg_t, duration) 32 | 33 | return mask, occ1, xyz3, rgb3 # mask and output 34 | 35 | 36 | def cpu_deep_copy_tuple(input_tuple): 37 | copied_tensors = [item.cpu().clone() if isinstance(item, torch.Tensor) else item for item in input_tuple] 38 | return tuple(copied_tensors) 39 | 40 | 41 | def rasterize_gaussians( 42 | means3D, 43 | means2D, 44 | sh, 45 | colors_precomp, 46 | opacities, 47 | scales, 48 | rotations, 49 | cov3Ds_precomp, 50 | tile_mask, 51 | raster_settings, 52 | ): 53 | return _RasterizeGaussians.apply( 54 | means3D, 55 | means2D, 56 | sh, 57 | colors_precomp, 58 | opacities, 59 | scales, 60 | rotations, 61 | cov3Ds_precomp, 62 | tile_mask, 63 | raster_settings, 64 | ) 65 | 66 | 67 | class _RasterizeGaussians(torch.autograd.Function): 68 | @staticmethod 69 | def forward( 70 | ctx, 71 | means3D, 72 | means2D, 73 | sh, 74 | colors_precomp, 75 | opacities, 76 | scales, 77 | rotations, 78 | cov3Ds_precomp, 79 | tile_mask, 80 | raster_settings, 81 | ): 82 | 83 | # Restructure arguments the way that the C++ lib expects them 84 | args = ( 85 | raster_settings.bg, 86 | means3D, 87 | colors_precomp, 88 | opacities, 89 | scales, 90 | rotations, 91 | raster_settings.scale_modifier, 92 | cov3Ds_precomp, 93 | tile_mask, 94 | raster_settings.viewmatrix, 95 | raster_settings.projmatrix, 96 | raster_settings.tanfovx, 97 | raster_settings.tanfovy, 98 | raster_settings.image_height, 99 | raster_settings.image_width, 100 | sh, 101 | raster_settings.sh_degree, 102 | raster_settings.campos, 103 | raster_settings.prefiltered, 104 | raster_settings.debug 105 | ) 106 | 107 | # Invoke C++/CUDA rasterizer 108 | if raster_settings.debug: 109 | cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted 110 | try: 111 | num_rendered, color, depth, alpha, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args) 112 | except Exception as ex: 113 | torch.save(cpu_args, "snapshot_fw.dump") 114 | print("\nAn error occured in forward. 
Please forward snapshot_fw.dump for debugging.") 115 | raise ex 116 | else: 117 | num_rendered, color, depth, alpha, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args) 118 | # Keep relevant tensors for backward 119 | ctx.raster_settings = raster_settings 120 | ctx.num_rendered = num_rendered 121 | ctx.save_for_backward(colors_precomp, means3D, scales, rotations, cov3Ds_precomp, tile_mask, radii, sh, geomBuffer, binningBuffer, imgBuffer, alpha, means2D) 122 | return color, depth, alpha, radii 123 | 124 | @staticmethod 125 | def backward(ctx, grad_out_color, grad_out_depth, grad_out_alpha, _): 126 | 127 | # Restore necessary values from context 128 | num_rendered = ctx.num_rendered 129 | raster_settings = ctx.raster_settings 130 | colors_precomp, means3D, scales, rotations, cov3Ds_precomp, tile_mask, radii, sh, geomBuffer, binningBuffer, imgBuffer, alpha, means2D = ctx.saved_tensors 131 | 132 | # Restructure args as C++ method expects them 133 | args = (raster_settings.bg, 134 | means3D, 135 | radii, 136 | colors_precomp, 137 | scales, 138 | rotations, 139 | raster_settings.scale_modifier, 140 | cov3Ds_precomp, 141 | raster_settings.viewmatrix, 142 | raster_settings.projmatrix, 143 | raster_settings.tanfovx, 144 | raster_settings.tanfovy, 145 | grad_out_color, 146 | grad_out_depth, 147 | grad_out_alpha, 148 | sh, 149 | raster_settings.sh_degree, 150 | raster_settings.campos, 151 | geomBuffer, 152 | num_rendered, 153 | binningBuffer, 154 | imgBuffer, 155 | alpha, 156 | raster_settings.debug) 157 | 158 | # Compute gradients for relevant tensors by invoking backward method 159 | if raster_settings.debug: 160 | cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted 161 | try: 162 | absgrad_means2D, grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations = _C.rasterize_gaussians_backward(*args) 163 | except Exception as ex: 164 | torch.save(cpu_args, "snapshot_bw.dump") 165 | print("\nAn error occured in backward. 
Writing snapshot_bw.dump for debugging.\n") 166 | raise ex 167 | else: 168 | absgrad_means2D, grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations = _C.rasterize_gaussians_backward(*args) 169 | 170 | grads = ( 171 | grad_means3D, 172 | grad_means2D, 173 | grad_sh, 174 | grad_colors_precomp, 175 | grad_opacities, 176 | grad_scales, 177 | grad_rotations, 178 | grad_cov3Ds_precomp, 179 | None, 180 | None, 181 | ) 182 | 183 | means2D.absgrad = absgrad_means2D # let the user select their grad 184 | 185 | return grads 186 | 187 | 188 | class GaussianRasterizationSettings(NamedTuple): 189 | image_height: int 190 | image_width: int 191 | tanfovx: float 192 | tanfovy: float 193 | bg: torch.Tensor 194 | scale_modifier: float 195 | viewmatrix: torch.Tensor 196 | projmatrix: torch.Tensor 197 | sh_degree: int 198 | campos: torch.Tensor 199 | prefiltered: bool 200 | debug: bool 201 | 202 | 203 | class GaussianRasterizer(nn.Module): 204 | def __init__(self, raster_settings): 205 | super().__init__() 206 | self.raster_settings = raster_settings 207 | 208 | def markVisible(self, positions): 209 | # Mark visible points (based on frustum culling for camera) with a boolean 210 | with torch.no_grad(): 211 | raster_settings = self.raster_settings 212 | visible = _C.mark_visible( 213 | positions, 214 | raster_settings.viewmatrix, 215 | raster_settings.projmatrix) 216 | 217 | return visible 218 | 219 | def forward(self, means3D, means2D, opacities, shs=None, colors_precomp=None, scales=None, rotations=None, cov3D_precomp=None, tile_mask=None): 220 | 221 | raster_settings = self.raster_settings 222 | 223 | if (shs is None and colors_precomp is None) or (shs is not None and colors_precomp is not None): 224 | raise Exception('Please provide excatly one of either SHs or precomputed colors!') 225 | 226 | if ((scales is None or rotations is None) and cov3D_precomp is None) or ((scales is not None or rotations is not None) and cov3D_precomp is not None): 227 | raise Exception('Please provide exactly one of either scale/rotation pair or precomputed 3D covariance!') 228 | 229 | if shs is None: 230 | shs = torch.Tensor([]) 231 | if colors_precomp is None: 232 | colors_precomp = torch.Tensor([]) 233 | 234 | if scales is None: 235 | scales = torch.Tensor([]) 236 | if rotations is None: 237 | rotations = torch.Tensor([]) 238 | if cov3D_precomp is None: 239 | cov3D_precomp = torch.Tensor([]) 240 | if tile_mask is None: 241 | tile_mask = torch.Tensor([]).bool() 242 | # TODO: in sampler `typed` will change the type of the tensor 243 | if tile_mask.dtype != torch.bool: 244 | tile_mask = tile_mask.bool() 245 | 246 | # Invoke C++/CUDA rasterization routine 247 | return rasterize_gaussians( 248 | means3D, 249 | means2D, 250 | shs, 251 | colors_precomp, 252 | opacities, 253 | scales, 254 | rotations, 255 | cov3D_precomp, 256 | tile_mask, 257 | raster_settings, 258 | ) 259 | 260 | 261 | def mark_visible(positions: torch.Tensor, viewmatrix: torch.Tensor, projmatrix: torch.Tensor): 262 | # Mark visible points (based on frustum culling for camera) with a boolean 263 | with torch.no_grad(): 264 | visible = _C.mark_visible( 265 | positions, 266 | viewmatrix, 267 | projmatrix) 268 | 269 | return visible 270 | 271 | def compute_cov_3d(scaling_xyz: torch.Tensor, rotation_l: torch.Tensor): 272 | return _ComputeCov3D.apply( 273 | scaling_xyz, 274 | rotation_l) 275 | 276 | class _ComputeCov3D(torch.autograd.Function): 277 | @staticmethod 278 | def forward(ctx, scaling_xyz, 
rotation_l): 279 | cov = _C.compute_cov_3d(scaling_xyz, rotation_l) 280 | ctx.save_for_backward(scaling_xyz, rotation_l) 281 | return cov 282 | 283 | @staticmethod 284 | def backward(ctx, grad_out_cov): 285 | scaling_xyz, rotation_l = ctx.saved_tensors 286 | grad_scaling_xyz, grad_rotation_l = _C.compute_cov_3d_backward(scaling_xyz, rotation_l, grad_out_cov) 287 | return grad_scaling_xyz, grad_rotation_l 288 | 289 | def compute_cov_4d(scaling_xyzt: torch.Tensor, rotation_l: torch.Tensor, rotation_r: torch.Tensor): 290 | return _ComputeCov4D.apply( 291 | scaling_xyzt, 292 | rotation_l, 293 | rotation_r) 294 | 295 | 296 | class _ComputeCov4D(torch.autograd.Function): 297 | @staticmethod 298 | def forward( 299 | ctx, 300 | scaling_xyzt, 301 | rotation_l, 302 | rotation_r 303 | ): 304 | cov, ms, cov_t = _C.compute_cov_4d(scaling_xyzt, rotation_l, rotation_r) 305 | ctx.save_for_backward(scaling_xyzt, rotation_l, rotation_r) 306 | return cov, ms, cov_t 307 | 308 | @staticmethod 309 | def backward(ctx, grad_out_cov, grad_out_ms, grad_out_cov_t): 310 | 311 | # Restore necessary values from context 312 | scaling_xyzt, rotation_l, rotation_r = ctx.saved_tensors 313 | 314 | # Restructure args as C++ method expects them 315 | grad_scaling_xyzt, grad_rotation_l, grad_rotation_r = _C.compute_cov_4d_backward( 316 | scaling_xyzt, 317 | rotation_l, 318 | rotation_r, 319 | grad_out_cov, 320 | grad_out_ms, 321 | grad_out_cov_t, 322 | ) 323 | 324 | grads = ( 325 | grad_scaling_xyzt, 326 | grad_rotation_l, 327 | grad_rotation_r, 328 | ) 329 | 330 | return grads 331 | 332 | 333 | def compute_sh_4d(deg: int, deg_t: int, sh: torch.Tensor, dir: torch.Tensor = None, dir_t: torch.Tensor = None, l: float = None): 334 | if dir is None: 335 | dir = torch.Tensor([]) 336 | if dir_t is None: 337 | dir_t = torch.Tensor([]) 338 | if l is None: 339 | l = 0.0 340 | return _ComputeSH4D.apply( 341 | deg, 342 | deg_t, 343 | sh, 344 | dir, 345 | dir_t, 346 | l) 347 | 348 | 349 | class _ComputeSH4D(torch.autograd.Function): 350 | @staticmethod 351 | def forward( 352 | ctx, 353 | deg, 354 | deg_t, 355 | sh, 356 | dir, 357 | dir_t, 358 | l 359 | ): 360 | rgb = _C.compute_sh_4d(deg, deg_t, sh, dir, dir_t, l) 361 | ctx.deg = deg 362 | ctx.deg_t = deg_t 363 | ctx.l = l 364 | ctx.save_for_backward(sh, dir, dir_t) 365 | return rgb 366 | 367 | @staticmethod 368 | def backward(ctx, grad_out_rgb): 369 | 370 | # Restore necessary values from context 371 | deg = ctx.deg 372 | deg_t = ctx.deg_t 373 | l = ctx.l 374 | sh, dir, dir_t = ctx.saved_tensors 375 | 376 | # Restructure args as C++ method expects them 377 | grad_sh, grad_dir, grad_dir_t = _C.compute_sh_4d_backward( 378 | deg, deg_t, sh, dir, dir_t, l, 379 | grad_out_rgb, 380 | ) 381 | 382 | grads = ( 383 | None, 384 | None, 385 | grad_sh, 386 | grad_dir, 387 | grad_dir_t, 388 | None, 389 | ) 390 | 391 | return grads 392 | 393 | 394 | def align_with(p: int, a: int = 128): 395 | p = (p + a - 1) // a * a 396 | return p 397 | 398 | 399 | def interpret_geomBuffer(geomBuffer: torch.Tensor, N: int): 400 | # N: Number of points rendered 401 | ptr = geomBuffer.data_ptr() 402 | p = align_with(ptr, 128) - ptr 403 | 404 | off = 4 * N 405 | depths = geomBuffer[p:p + off].view(torch.float) 406 | p = align_with(p + off, 128) 407 | 408 | off = 3 * N 409 | clamped = geomBuffer[p:p + off].view(torch.bool).view(N, 3) 410 | p = align_with(p + off, 128) 411 | 412 | off = 4 * N 413 | internal_radii = geomBuffer[p:p + off].view(torch.int) 414 | p = align_with(p + off, 128) 415 | 416 | off = 2 * 4 * N 417 | 
means2D = geomBuffer[p:p + off].view(torch.float).view(N, 2) 418 | p = align_with(p + off, 128) 419 | 420 | off = 6 * 4 * N 421 | cov3D = geomBuffer[p:p + off].view(torch.float).view(N, 6) 422 | p = align_with(p + off, 128) 423 | 424 | off = 4 * 4 * N 425 | conic_opacity = geomBuffer[p:p + off].view(torch.float).view(N, 4) 426 | p = align_with(p + off, 128) 427 | 428 | off = 3 * 4 * N 429 | rgb = geomBuffer[p:p + off].view(torch.float).view(N, 3) 430 | p = align_with(p + off, 128) 431 | 432 | off = 4 * N 433 | tiles_touched = geomBuffer[p:p + off].view(torch.int) 434 | p = align_with(p + off, 128) 435 | 436 | off = 4 * N 437 | point_offsets = geomBuffer[p:p + off].view(torch.int) 438 | 439 | return dict( 440 | depths=depths, 441 | clamped=clamped, 442 | internal_radii=internal_radii, 443 | means2D=means2D, 444 | cov3D=cov3D, 445 | conic_opacity=conic_opacity, 446 | rgb=rgb, 447 | tiles_touched=tiles_touched, 448 | point_offsets=point_offsets 449 | ) 450 | 451 | 452 | def interpret_binningBuffer(binningBuffer: torch.Tensor, N: int): 453 | # N: Number of tile-gaussian pairs 454 | ptr = binningBuffer.data_ptr() 455 | p = align_with(ptr, 128) - ptr 456 | 457 | off = 4 * N 458 | point_list = binningBuffer[p:p + off].view(torch.int) 459 | p = align_with(p + off, 128) 460 | 461 | off = 4 * N 462 | point_list_unsorted = binningBuffer[p:p + off].view(torch.int) 463 | p = align_with(p + off, 128) 464 | 465 | off = 8 * N 466 | point_list_keys = binningBuffer[p:p + off].view(torch.long) 467 | p = align_with(p + off, 128) 468 | 469 | off = 8 * N 470 | point_list_keys_unsorted = binningBuffer[p:p + off].view(torch.long) 471 | p = align_with(p + off, 128) 472 | 473 | return dict( 474 | point_list=point_list, 475 | point_list_unsorted=point_list_unsorted, 476 | point_list_keys=point_list_keys, 477 | point_list_keys_unsorted=point_list_keys_unsorted, 478 | 479 | # Little Endian 480 | depths=point_list_keys.view(torch.float).view(N, 2)[:, 0], 481 | tile_ids=point_list_keys.view(torch.int).view(N, 2)[:, 1], 482 | ) 483 | 484 | 485 | def interpret_imgBuffer(imgBuffer: torch.Tensor, N: int, M: int): 486 | # N: Number of pixels 487 | # M: Number of tiles 488 | ptr = imgBuffer.data_ptr() 489 | p = align_with(ptr, 128) - ptr 490 | 491 | off = 4 * N 492 | n_contrib = imgBuffer[p:p + off].view(torch.int) 493 | p = align_with(p + off, 128) 494 | 495 | off = 2 * 4 * M 496 | ranges = imgBuffer[p:p + off].view(torch.int).view(M, 2) 497 | p = align_with(p + off, 128) 498 | 499 | return dict( 500 | n_contrib=n_contrib, 501 | ranges=ranges 502 | ) 503 | -------------------------------------------------------------------------------- /ext.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include 13 | #include "rasterize_points.h" 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def("rasterize_gaussians", &RasterizeGaussiansCUDA); 17 | m.def("rasterize_gaussians_backward", &RasterizeGaussiansBackwardCUDA); 18 | m.def("mark_visible", &markVisible); 19 | m.def("fused_preprocess_4d", &fusedPreprocess4D); 20 | m.def("fused_preprocess_4d_sparse", &fusedPreprocess4DSparse); 21 | m.def("compute_cov_4d", &computeCov4D); 22 | m.def("compute_cov_4d_backward", &computeCov4DBackward); 23 | m.def("compute_cov_3d", &computeCov3D); 24 | m.def("compute_cov_3d_backward", &computeCov3DBackward); 25 | m.def("compute_sh_4d", &computeSH4D); 26 | m.def("compute_sh_4d_backward", &computeSH4DBackward); 27 | } -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | Gaussian-Splatting License 2 | =========================== 3 | 4 | **Inria** and **the Max Planck Institut for Informatik (MPII)** hold all the ownership rights on the *Software* named **gaussian-splatting**. 5 | The *Software* is in the process of being registered with the Agence pour la Protection des 6 | Programmes (APP). 7 | 8 | The *Software* is still being developed by the *Licensor*. 9 | 10 | *Licensor*'s goal is to allow the research community to use, test and evaluate 11 | the *Software*. 12 | 13 | ## 1. Definitions 14 | 15 | *Licensee* means any person or entity that uses the *Software* and distributes 16 | its *Work*. 17 | 18 | *Licensor* means the owners of the *Software*, i.e Inria and MPII 19 | 20 | *Software* means the original work of authorship made available under this 21 | License ie gaussian-splatting. 22 | 23 | *Work* means the *Software* and any additions to or derivative works of the 24 | *Software* that are made available under this License. 25 | 26 | 27 | ## 2. Purpose 28 | This license is intended to define the rights granted to the *Licensee* by 29 | Licensors under the *Software*. 30 | 31 | ## 3. Rights granted 32 | 33 | For the above reasons Licensors have decided to distribute the *Software*. 34 | Licensors grant non-exclusive rights to use the *Software* for research purposes 35 | to research users (both academic and industrial), free of charge, without right 36 | to sublicense.. The *Software* may be used "non-commercially", i.e., for research 37 | and/or evaluation purposes only. 38 | 39 | Subject to the terms and conditions of this License, you are granted a 40 | non-exclusive, royalty-free, license to reproduce, prepare derivative works of, 41 | publicly display, publicly perform and distribute its *Work* and any resulting 42 | derivative works in any form. 43 | 44 | ## 4. Limitations 45 | 46 | **4.1 Redistribution.** You may reproduce or distribute the *Work* only if (a) you do 47 | so under this License, (b) you include a complete copy of this License with 48 | your distribution, and (c) you retain without modification any copyright, 49 | patent, trademark, or attribution notices that are present in the *Work*. 
50 | 
51 | **4.2 Derivative Works.** You may specify that additional or different terms apply
52 | to the use, reproduction, and distribution of your derivative works of the *Work*
53 | ("Your Terms") only if (a) Your Terms provide that the use limitation in
54 | Section 2 applies to your derivative works, and (b) you identify the specific
55 | derivative works that are subject to Your Terms. Notwithstanding Your Terms,
56 | this License (including the redistribution requirements in Section 3.1) will
57 | continue to apply to the *Work* itself.
58 | 
59 | **4.3** Any other use without prior consent of the Licensors is prohibited. Research
60 | users explicitly acknowledge having received from Licensors all information
61 | allowing them to assess the adequacy of the *Software* for their needs and
62 | to undertake all necessary precautions for its execution and use.
63 | 
64 | **4.4** The *Software* is provided both as a compiled library file and as source
65 | code. When the *Software* is used for a publication or for other results obtained
66 | through its use, users are strongly encouraged to cite the
67 | corresponding publications as explained in the documentation of the *Software*.
68 | 
69 | ## 5. Disclaimer
70 | 
71 | THE USER CANNOT USE, EXPLOIT OR DISTRIBUTE THE *SOFTWARE* FOR COMMERCIAL PURPOSES
72 | WITHOUT PRIOR AND EXPLICIT CONSENT OF LICENSORS. YOU MUST CONTACT INRIA FOR ANY
73 | UNAUTHORIZED USE: stip-sophia.transfert@inria.fr . ANY SUCH ACTION WILL
74 | CONSTITUTE A FORGERY. THIS *SOFTWARE* IS PROVIDED "AS IS" WITHOUT ANY WARRANTIES
75 | OF ANY NATURE AND ANY EXPRESS OR IMPLIED WARRANTIES, WITH REGARDS TO COMMERCIAL
76 | USE, PROFESSIONAL USE, LEGAL OR NOT, OR OTHER, OR COMMERCIALISATION OR
77 | ADAPTATION. UNLESS EXPLICITLY PROVIDED BY LAW, IN NO EVENT, SHALL INRIA OR THE
78 | AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
79 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
80 | GOODS OR SERVICES, LOSS OF USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION)
81 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
82 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING FROM, OUT OF OR
83 | IN CONNECTION WITH THE *SOFTWARE* OR THE USE OR OTHER DEALINGS IN THE *SOFTWARE*.
84 | 
--------------------------------------------------------------------------------
/rasterize_points.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (C) 2023, Inria
3 |  * GRAPHDECO research group, https://team.inria.fr/graphdeco
4 |  * All rights reserved.
5 |  *
6 |  * This software is free for non-commercial, research and evaluation use
7 |  * under the terms of the LICENSE.md file.
8 |  *
9 |  * For inquiries contact  george.drettakis@inria.fr
10 |  */
11 | 
12 | #include <math.h>
13 | #include <torch/extension.h>
14 | #include <cstdio>
15 | #include <sstream>
16 | #include <iostream>
17 | #include <tuple>
18 | #include <stdio.h>
19 | #include <cuda_runtime_api.h>
20 | #include <memory>
21 | #include "cuda_rasterizer/config.h"
22 | #include "cuda_rasterizer/rasterizer.h"
23 | #include <fstream>
24 | #include <string>
25 | #include <functional>
26 | 
27 | std::function<char* (size_t N)> resizeFunctional(torch::Tensor& t) {
28 |     auto lambda = [&t](size_t N) {
29 |         t.resize_({(long long)N});
30 |         return reinterpret_cast<char*>(t.contiguous().data_ptr());
31 |     };
32 |     return lambda;
33 | }
34 | 
35 | std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
36 | RasterizeGaussiansCUDA(
37 |     const torch::Tensor& background,
38 |     const torch::Tensor& means3D,
39 |     const torch::Tensor& colors,
40 |     const torch::Tensor& opacity,
41 |     const torch::Tensor& scales,
42 |     const torch::Tensor& rotations,
43 |     const float scale_modifier,
44 |     const torch::Tensor& cov3D_precomp,
45 |     const torch::Tensor& tile_mask,
46 |     const torch::Tensor& viewmatrix,
47 |     const torch::Tensor& projmatrix,
48 |     const float tan_fovx,
49 |     const float tan_fovy,
50 |     const int image_height,
51 |     const int image_width,
52 |     const torch::Tensor& sh,
53 |     const int degree,
54 |     const torch::Tensor& campos,
55 |     const bool prefiltered,
56 |     const bool debug)
57 | {
58 |     if (means3D.ndimension() != 2 || means3D.size(1) != 3) {
59 |         AT_ERROR("means3D must have dimensions (num_points, 3)");
60 |     }
61 | 
62 |     const int P = means3D.size(0);
63 |     const int H = image_height;
64 |     const int W = image_width;
65 | 
66 |     auto int_opts = means3D.options().dtype(torch::kInt32);
67 |     auto float_opts = means3D.options().dtype(torch::kFloat32);
68 | 
69 |     torch::Tensor out_color = torch::full({NUM_CHANNELS, H, W}, 0.0, float_opts);
70 |     torch::Tensor out_depth = torch::full({1, H, W}, 0.0, float_opts);
71 |     torch::Tensor out_alpha = torch::full({1, H, W}, 0.0, float_opts);
72 |     torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32));
73 | 
74 |     torch::Device device(torch::kCUDA);
75 |     torch::TensorOptions options(torch::kByte);
76 |     torch::Tensor geomBuffer = torch::empty({0}, options.device(device));
77 |     torch::Tensor binningBuffer = torch::empty({0}, options.device(device));
78 |     torch::Tensor imgBuffer = torch::empty({0}, options.device(device));
79 |     std::function<char* (size_t)> geomFunc = resizeFunctional(geomBuffer);
80 |     std::function<char* (size_t)> binningFunc = resizeFunctional(binningBuffer);
81 |     std::function<char* (size_t)> imgFunc = resizeFunctional(imgBuffer);
82 | 
83 |     int rendered = 0;
84 |     if(P != 0)
85 |     {
86 |         int M = 0;
87 |         if(sh.size(0) != 0)
88 |         {
89 |             M = sh.size(1);
90 |         }
91 | 
92 |         rendered = CudaRasterizer::Rasterizer::forward(
93 |             geomFunc,
94 |             binningFunc,
95 |             imgFunc,
96 |             P, degree, M,
97 |             background.contiguous().data_ptr<float>(),
98 |             W, H,
99 |             means3D.contiguous().data_ptr<float>(),
100 |             sh.contiguous().data_ptr<float>(),
101 |             colors.contiguous().data_ptr<float>(),
102 |             opacity.contiguous().data_ptr<float>(),
103 |             scales.contiguous().data_ptr<float>(),
104 |             scale_modifier,
105 |             rotations.contiguous().data_ptr<float>(),
106 |             cov3D_precomp.contiguous().data_ptr<float>(),
107 |             tile_mask.contiguous().data_ptr<bool>(),
108 |             viewmatrix.contiguous().data_ptr<float>(),
109 |             projmatrix.contiguous().data_ptr<float>(),
110 |             campos.contiguous().data_ptr<float>(),
111 |             tan_fovx,
112 |             tan_fovy,
113 |             prefiltered,
114 |             out_color.contiguous().data_ptr<float>(),
115 |             out_depth.contiguous().data_ptr<float>(),
116 |             out_alpha.contiguous().data_ptr<float>(),
117 |             radii.contiguous().data_ptr<int>(),
118 |             debug);
119 |     }
120 |     return std::make_tuple(rendered, out_color, out_depth, out_alpha, radii, geomBuffer, binningBuffer,
imgBuffer);
121 | }
122 | 
123 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
124 | RasterizeGaussiansBackwardCUDA(
125 |     const torch::Tensor& background,
126 |     const torch::Tensor& means3D,
127 |     const torch::Tensor& radii,
128 |     const torch::Tensor& colors,
129 |     const torch::Tensor& scales,
130 |     const torch::Tensor& rotations,
131 |     const float scale_modifier,
132 |     const torch::Tensor& cov3D_precomp,
133 |     const torch::Tensor& viewmatrix,
134 |     const torch::Tensor& projmatrix,
135 |     const float tan_fovx,
136 |     const float tan_fovy,
137 |     const torch::Tensor& dL_dout_color,
138 |     const torch::Tensor& dL_dout_depth,
139 |     const torch::Tensor& dL_dout_alpha,
140 |     const torch::Tensor& sh,
141 |     const int degree,
142 |     const torch::Tensor& campos,
143 |     const torch::Tensor& geomBuffer,
144 |     const int R,
145 |     const torch::Tensor& binningBuffer,
146 |     const torch::Tensor& imageBuffer,
147 |     const torch::Tensor& out_alpha,
148 |     const bool debug)
149 | {
150 |     const int P = means3D.size(0);
151 |     const int H = dL_dout_color.size(1);
152 |     const int W = dL_dout_color.size(2);
153 | 
154 |     int M = 0;
155 |     if(sh.size(0) != 0)
156 |     {
157 |         M = sh.size(1);
158 |     }
159 | 
160 |     torch::Tensor dL_dmeans3D = torch::zeros({P, 3}, means3D.options());
161 |     torch::Tensor dL_dmeans2D = torch::zeros({P, 3}, means3D.options());
162 |     torch::Tensor dL_dabsmeans2D = torch::zeros({P, 3}, means3D.options());
163 |     torch::Tensor dL_dcolors = torch::zeros({P, NUM_CHANNELS}, means3D.options());
164 |     // just for storing intermediate results
165 |     torch::Tensor dL_ddepths = torch::zeros({P, 1}, means3D.options());
166 |     torch::Tensor dL_dconic = torch::zeros({P, 2, 2}, means3D.options());
167 |     torch::Tensor dL_dopacity = torch::zeros({P, 1}, means3D.options());
168 |     torch::Tensor dL_dcov3D = torch::zeros({P, 6}, means3D.options());
169 |     torch::Tensor dL_dsh = torch::zeros({P, M, 3}, means3D.options());
170 |     torch::Tensor dL_dscales = torch::zeros({P, 3}, means3D.options());
171 |     torch::Tensor dL_drotations = torch::zeros({P, 4}, means3D.options());
172 | 
173 |     if(P != 0)
174 |     {
175 |         CudaRasterizer::Rasterizer::backward(P, degree, M, R,
176 |             background.contiguous().data_ptr<float>(),
177 |             W, H,
178 |             means3D.contiguous().data_ptr<float>(),
179 |             sh.contiguous().data_ptr<float>(),
180 |             colors.contiguous().data_ptr<float>(),
181 |             scales.data_ptr<float>(),
182 |             scale_modifier,
183 |             rotations.data_ptr<float>(),
184 |             cov3D_precomp.contiguous().data_ptr<float>(),
185 |             viewmatrix.contiguous().data_ptr<float>(),
186 |             projmatrix.contiguous().data_ptr<float>(),
187 |             campos.contiguous().data_ptr<float>(),
188 |             tan_fovx,
189 |             tan_fovy,
190 |             radii.contiguous().data_ptr<int>(),
191 |             reinterpret_cast<char*>(geomBuffer.contiguous().data_ptr()),
192 |             reinterpret_cast<char*>(binningBuffer.contiguous().data_ptr()),
193 |             reinterpret_cast<char*>(imageBuffer.contiguous().data_ptr()),
194 |             out_alpha.contiguous().data_ptr<float>(),
195 |             dL_dout_color.contiguous().data_ptr<float>(),
196 |             dL_dout_depth.contiguous().data_ptr<float>(),
197 |             dL_dout_alpha.contiguous().data_ptr<float>(),
198 |             dL_dmeans2D.contiguous().data_ptr<float>(),
199 |             dL_dabsmeans2D.contiguous().data_ptr<float>(),
200 |             dL_dconic.contiguous().data_ptr<float>(),
201 |             dL_dopacity.contiguous().data_ptr<float>(),
202 |             dL_dcolors.contiguous().data_ptr<float>(),
203 |             dL_ddepths.contiguous().data_ptr<float>(),
204 |             dL_dmeans3D.contiguous().data_ptr<float>(),
205 |             dL_dcov3D.contiguous().data_ptr<float>(),
206 |             dL_dsh.contiguous().data_ptr<float>(),
207 |             dL_dscales.contiguous().data_ptr<float>(),
208 |             dL_drotations.contiguous().data_ptr<float>(),
209 |             debug);
210 |     }
211 | 
212 |     return std::make_tuple(dL_dabsmeans2D, dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D,
dL_dcov3D, dL_dsh, dL_dscales, dL_drotations);
213 | }
214 | 
215 | torch::Tensor markVisible(
216 |     torch::Tensor& means3D,
217 |     torch::Tensor& viewmatrix,
218 |     torch::Tensor& projmatrix)
219 | {
220 |     const int P = means3D.size(0);
221 | 
222 |     torch::Tensor present = torch::empty({P}, means3D.options().dtype(at::kBool));
223 | 
224 |     if(P != 0)
225 |     {
226 |         CudaRasterizer::Rasterizer::markVisible(P,
227 |             means3D.contiguous().data_ptr<float>(),
228 |             viewmatrix.contiguous().data_ptr<float>(),
229 |             projmatrix.contiguous().data_ptr<float>(),
230 |             present.contiguous().data_ptr<bool>());
231 |     }
232 | 
233 |     return present;
234 | }
235 | 
236 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> fusedPreprocess4D(
237 |     const torch::Tensor& means3D,
238 |     const torch::Tensor& cov,
239 |     const torch::Tensor& ms,
240 |     const torch::Tensor& cov_t,
241 |     const torch::Tensor& opacities,
242 |     const torch::Tensor& t1,
243 |     const torch::Tensor& sh,
244 |     const torch::Tensor& t,
245 |     const torch::Tensor& viewmatrix,
246 |     const torch::Tensor& projmatrix,
247 |     const torch::Tensor& cam_pos,
248 |     const int deg,
249 |     const int deg_t,
250 |     const float duration
251 | )
252 | {
253 |     const int P = means3D.size(0);
254 |     int M = 0;
255 |     if(sh.size(0) != 0) M = sh.size(1);
256 | 
257 |     torch::Tensor mask = torch::empty({P, 1}, means3D.options().dtype(at::kBool));
258 |     torch::Tensor occ1 = torch::empty({P, 1}, means3D.options());
259 |     torch::Tensor xyz3 = torch::empty({P, 3}, means3D.options());
260 |     torch::Tensor rgb3 = torch::empty({P, 3}, means3D.options());
261 | 
262 |     if(P != 0)
263 |     {
264 |         CudaRasterizer::Rasterizer::fusedPreprocess4D(P, deg, deg_t, M,
265 |             means3D.contiguous().data_ptr<float>(),
266 |             cov.contiguous().data_ptr<float>(),
267 |             ms.contiguous().data_ptr<float>(),
268 |             cov_t.contiguous().data_ptr<float>(),
269 |             opacities.contiguous().data_ptr<float>(),
270 |             t1.contiguous().data_ptr<float>(),
271 |             sh.contiguous().data_ptr<float>(),
272 |             t.contiguous().data_ptr<float>(),
273 |             viewmatrix.contiguous().data_ptr<float>(),
274 |             projmatrix.contiguous().data_ptr<float>(),
275 |             cam_pos.contiguous().data_ptr<float>(),
276 |             duration,
277 |             mask.contiguous().data_ptr<bool>(),
278 |             occ1.contiguous().data_ptr<float>(),
279 |             xyz3.contiguous().data_ptr<float>(),
280 |             rgb3.contiguous().data_ptr<float>());
281 |     }
282 |     return std::make_tuple(mask, occ1, xyz3, rgb3);
283 | }
284 | 
285 | 
286 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> fusedPreprocess4DSparse(
287 |     const torch::Tensor& means3D,
288 |     const torch::Tensor& cov,
289 |     const torch::Tensor& ms,
290 |     const torch::Tensor& cov_t,
291 |     const torch::Tensor& opacities,
292 |     const torch::Tensor& t1,
293 |     const torch::Tensor& base,
294 |     const torch::Tensor& sh,
295 |     const torch::Tensor& t,
296 |     const torch::Tensor& inverse,
297 |     const torch::Tensor& viewmatrix,
298 |     const torch::Tensor& projmatrix,
299 |     const torch::Tensor& cam_pos,
300 |     const int deg,
301 |     const int deg_t,
302 |     const float duration
303 | )
304 | {
305 |     const int P = means3D.size(0);
306 |     int M = 0;
307 |     if(sh.size(0) != 0) M = sh.size(1);
308 | 
309 |     torch::Tensor mask = torch::empty({P, 1}, means3D.options().dtype(at::kBool));
310 |     torch::Tensor occ1 = torch::empty({P, 1}, means3D.options());
311 |     torch::Tensor xyz3 = torch::empty({P, 3}, means3D.options());
312 |     torch::Tensor rgb3 = torch::empty({P, 3}, means3D.options());
313 | 
314 |     if(P != 0)
315 |     {
316 |         CudaRasterizer::Rasterizer::fusedPreprocess4DSparse(P, deg, deg_t, M,
317 |             means3D.contiguous().data_ptr<float>(),
318 |             cov.contiguous().data_ptr<float>(),
319 |             ms.contiguous().data_ptr<float>(),
320 |             cov_t.contiguous().data_ptr<float>(),
321 |             opacities.contiguous().data_ptr<float>(),
322 | 
t1.contiguous().data_ptr<float>(),
323 |             base.contiguous().data_ptr<float>(),
324 |             sh.contiguous().data_ptr<float>(),
325 |             t.contiguous().data_ptr<float>(),
326 |             inverse.contiguous().data_ptr<int>(),
327 |             viewmatrix.contiguous().data_ptr<float>(),
328 |             projmatrix.contiguous().data_ptr<float>(),
329 |             cam_pos.contiguous().data_ptr<float>(),
330 |             duration,
331 |             mask.contiguous().data_ptr<bool>(),
332 |             occ1.contiguous().data_ptr<float>(),
333 |             xyz3.contiguous().data_ptr<float>(),
334 |             rgb3.contiguous().data_ptr<float>());
335 |     }
336 |     return std::make_tuple(mask, occ1, xyz3, rgb3);
337 | }
338 | 
339 | torch::Tensor computeCov3D(
340 |     torch::Tensor& scaling_xyz,
341 |     torch::Tensor& rotation_l)
342 | {
343 |     const int P = scaling_xyz.size(0);
344 |     torch::Tensor cov = torch::empty({P, 6}, scaling_xyz.options());
345 | 
346 |     if(P != 0)
347 |     {
348 |         CudaRasterizer::Rasterizer::computeCov3D(P,
349 |             scaling_xyz.contiguous().data_ptr<float>(),
350 |             rotation_l.contiguous().data_ptr<float>(),
351 |             cov.contiguous().data_ptr<float>());
352 |     }
353 | 
354 |     return cov;
355 | }
356 | 
357 | std::tuple<torch::Tensor, torch::Tensor> computeCov3DBackward(
358 |     torch::Tensor& scaling_xyz,
359 |     torch::Tensor& rotation_l,
360 |     torch::Tensor& dL_dcov)
361 | {
362 |     const int P = scaling_xyz.size(0);
363 |     torch::Tensor dL_dscaling_xyz = torch::zeros({P, 3}, scaling_xyz.options());
364 |     torch::Tensor dL_drotation_l = torch::zeros({P, 4}, scaling_xyz.options());
365 | 
366 |     if(P != 0)
367 |     {
368 |         CudaRasterizer::Rasterizer::computeCov3DBackward(P,
369 |             scaling_xyz.contiguous().data_ptr<float>(),
370 |             rotation_l.contiguous().data_ptr<float>(),
371 |             dL_dcov.contiguous().data_ptr<float>(),
372 |             dL_dscaling_xyz.contiguous().data_ptr<float>(),
373 |             dL_drotation_l.contiguous().data_ptr<float>());
374 |     }
375 | 
376 |     return std::make_tuple(dL_dscaling_xyz, dL_drotation_l);
377 | }
378 | 
379 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeCov4D(
380 |     torch::Tensor& scaling_xyzt,
381 |     torch::Tensor& rotation_l,
382 |     torch::Tensor& rotation_r)
383 | {
384 |     const int P = scaling_xyzt.size(0);
385 | 
386 |     torch::Tensor cov = torch::empty({P, 6}, scaling_xyzt.options());
387 |     torch::Tensor ms = torch::empty({P, 3}, scaling_xyzt.options());
388 |     torch::Tensor cov_t = torch::empty({P, 1}, scaling_xyzt.options());
389 | 
390 |     if(P != 0)
391 |     {
392 |         CudaRasterizer::Rasterizer::computeCov4D(P,
393 |             scaling_xyzt.contiguous().data_ptr<float>(),
394 |             rotation_l.contiguous().data_ptr<float>(),
395 |             rotation_r.contiguous().data_ptr<float>(),
396 |             cov.contiguous().data_ptr<float>(),
397 |             ms.contiguous().data_ptr<float>(),
398 |             cov_t.contiguous().data_ptr<float>());
399 |     }
400 | 
401 |     return std::make_tuple(cov, ms, cov_t);
402 | }
403 | 
404 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeCov4DBackward(
405 |     torch::Tensor& scaling_xyzt,
406 |     torch::Tensor& rotation_l,
407 |     torch::Tensor& rotation_r,
408 |     torch::Tensor& dL_dcov,
409 |     torch::Tensor& dL_dms,
410 |     torch::Tensor& dL_dcov_t)
411 | {
412 |     const int P = scaling_xyzt.size(0);
413 | 
414 |     torch::Tensor dL_dscaling_xyzt = torch::zeros({P, 4}, scaling_xyzt.options());
415 |     torch::Tensor dL_drotation_l = torch::zeros({P, 4}, scaling_xyzt.options());
416 |     torch::Tensor dL_drotation_r = torch::zeros({P, 4}, scaling_xyzt.options());
417 | 
418 |     if(P != 0)
419 |     {
420 |         CudaRasterizer::Rasterizer::computeCov4DBackward(P,
421 |             scaling_xyzt.contiguous().data_ptr<float>(),
422 |             rotation_l.contiguous().data_ptr<float>(),
423 |             rotation_r.contiguous().data_ptr<float>(),
424 |             dL_dcov.contiguous().data_ptr<float>(),
425 |             dL_dms.contiguous().data_ptr<float>(),
426 |             dL_dcov_t.contiguous().data_ptr<float>(),
427 |             dL_dscaling_xyzt.contiguous().data_ptr<float>(),
428 |             dL_drotation_l.contiguous().data_ptr<float>(),
429 | 
dL_drotation_r.contiguous().data_ptr<float>());
430 |     }
431 | 
432 |     return std::make_tuple(dL_dscaling_xyzt, dL_drotation_l, dL_drotation_r);
433 | }
434 | 
435 | 
436 | torch::Tensor computeSH4D(
437 |     const int deg,
438 |     const int deg_t,
439 |     torch::Tensor& sh,
440 |     torch::Tensor& dir,
441 |     torch::Tensor& dir_t,
442 |     const float duration
443 | )
444 | {
445 |     const int P = sh.size(0);
446 |     int M = 0;
447 |     if(sh.size(0) != 0) M = sh.size(1);
448 | 
449 |     torch::Tensor rgb = torch::zeros({P, 3}, sh.options());
450 | 
451 |     if(P != 0)
452 |     {
453 |         CudaRasterizer::Rasterizer::computeSH4D(P,
454 |             deg, deg_t, M,
455 |             sh.contiguous().data_ptr<float>(),
456 |             dir.contiguous().data_ptr<float>(),
457 |             dir_t.contiguous().data_ptr<float>(),
458 |             duration,
459 |             rgb.contiguous().data_ptr<float>()
460 |         );
461 |     }
462 | 
463 |     return rgb;
464 | }
465 | 
466 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeSH4DBackward(
467 |     const int deg,
468 |     const int deg_t,
469 |     torch::Tensor& sh,
470 |     torch::Tensor& dir,
471 |     torch::Tensor& dir_t,
472 |     const float duration,
473 |     torch::Tensor& dL_drgb
474 | )
475 | {
476 |     const int P = sh.size(0);
477 |     int M = 0;
478 |     if(sh.size(0) != 0) M = sh.size(1);
479 | 
480 |     torch::Tensor dL_dsh = torch::zeros({P, M, 3}, sh.options());
481 |     torch::Tensor dL_ddir = torch::zeros({P, 3}, sh.options());
482 |     torch::Tensor dL_ddir_t = torch::zeros({P, 1}, sh.options());
483 | 
484 |     if(P != 0)
485 |     {
486 |         CudaRasterizer::Rasterizer::computeSH4DBackward(P,
487 |             deg, deg_t, M,
488 |             sh.contiguous().data_ptr<float>(),
489 |             dir.contiguous().data_ptr<float>(),
490 |             dir_t.contiguous().data_ptr<float>(),
491 |             duration,
492 |             dL_drgb.contiguous().data_ptr<float>(),
493 |             dL_dsh.contiguous().data_ptr<float>(),
494 |             dL_ddir.contiguous().data_ptr<float>(),
495 |             dL_ddir_t.contiguous().data_ptr<float>()
496 |         );
497 |     }
498 | 
499 |     return std::make_tuple(dL_dsh, dL_ddir, dL_ddir_t);
500 | }
--------------------------------------------------------------------------------
/rasterize_points.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (C) 2023, Inria
3 |  * GRAPHDECO research group, https://team.inria.fr/graphdeco
4 |  * All rights reserved.
5 |  *
6 |  * This software is free for non-commercial, research and evaluation use
7 |  * under the terms of the LICENSE.md file.
8 |  *
9 |  * For inquiries contact  george.drettakis@inria.fr
10 |  */
11 | 
12 | #pragma once
13 | #include <torch/extension.h>
14 | #include <cstdio>
15 | #include <tuple>
16 | #include <string>
17 | 
18 | std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
19 | RasterizeGaussiansCUDA(
20 |     const torch::Tensor& background,
21 |     const torch::Tensor& means3D,
22 |     const torch::Tensor& colors,
23 |     const torch::Tensor& opacity,
24 |     const torch::Tensor& scales,
25 |     const torch::Tensor& rotations,
26 |     const float scale_modifier,
27 |     const torch::Tensor& cov3D_precomp,
28 |     const torch::Tensor& tile_mask,
29 |     const torch::Tensor& viewmatrix,
30 |     const torch::Tensor& projmatrix,
31 |     const float tan_fovx,
32 |     const float tan_fovy,
33 |     const int image_height,
34 |     const int image_width,
35 |     const torch::Tensor& sh,
36 |     const int degree,
37 |     const torch::Tensor& campos,
38 |     const bool prefiltered,
39 |     const bool debug);
40 | 
41 | 
42 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
43 | RasterizeGaussiansBackwardCUDA(
44 |     const torch::Tensor& background,
45 |     const torch::Tensor& means3D,
46 |     const torch::Tensor& radii,
47 |     const torch::Tensor& colors,
48 |     const torch::Tensor& scales,
49 |     const torch::Tensor& rotations,
50 |     const float scale_modifier,
51 |     const torch::Tensor& cov3D_precomp,
52 |     const torch::Tensor& viewmatrix,
53 |     const torch::Tensor& projmatrix,
54 |     const float tan_fovx,
55 |     const float tan_fovy,
56 |     const torch::Tensor& dL_dout_color,
57 |     const torch::Tensor& dL_dout_depth,
58 |     const torch::Tensor& dL_dout_alpha,
59 |     const torch::Tensor& sh,
60 |     const int degree,
61 |     const torch::Tensor& campos,
62 |     const torch::Tensor& geomBuffer,
63 |     const int R,
64 |     const torch::Tensor& binningBuffer,
65 |     const torch::Tensor& imageBuffer,
66 |     const torch::Tensor& out_alpha,
67 |     const bool debug);
68 | 
69 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> fusedPreprocess4D(
70 |     const torch::Tensor& means3D,
71 |     const torch::Tensor& cov,
72 |     const torch::Tensor& ms,
73 |     const torch::Tensor& cov_t,
74 |     const torch::Tensor& opacities,
75 |     const torch::Tensor& t1,
76 |     const torch::Tensor& sh,
77 |     const torch::Tensor& t,
78 |     const torch::Tensor& viewmatrix,
79 |     const torch::Tensor& projmatrix,
80 |     const torch::Tensor& cam_pos,
81 |     const int deg,
82 |     const int deg_t,
83 |     const float duration
84 | );
85 | 
86 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> fusedPreprocess4DSparse(
87 |     const torch::Tensor& means3D,
88 |     const torch::Tensor& cov,
89 |     const torch::Tensor& ms,
90 |     const torch::Tensor& cov_t,
91 |     const torch::Tensor& opacities,
92 |     const torch::Tensor& t1,
93 |     const torch::Tensor& base,
94 |     const torch::Tensor& sh,
95 |     const torch::Tensor& t,
96 |     const torch::Tensor& inverse,
97 |     const torch::Tensor& viewmatrix,
98 |     const torch::Tensor& projmatrix,
99 |     const torch::Tensor& cam_pos,
100 |     const int deg,
101 |     const int deg_t,
102 |     const float duration
103 | );
104 | 
105 | torch::Tensor markVisible(
106 |     torch::Tensor& means3D,
107 |     torch::Tensor& viewmatrix,
108 |     torch::Tensor& projmatrix);
109 | 
110 | torch::Tensor computeCov3D(
111 |     torch::Tensor& scaling_xyz,
112 |     torch::Tensor& rotation_l);
113 | 
114 | std::tuple<torch::Tensor, torch::Tensor> computeCov3DBackward(
115 |     torch::Tensor& scaling_xyz,
116 |     torch::Tensor& rotation_l,
117 |     torch::Tensor& dL_dcov);
118 | 
119 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeCov4D(
120 |     torch::Tensor& scaling_xyzt,
121 |     torch::Tensor& rotation_l,
122 |     torch::Tensor& rotation_r);
123 | 
124 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeCov4DBackward(
125 |     torch::Tensor& scaling_xyzt,
126 |     torch::Tensor& rotation_l,
127 |     torch::Tensor& rotation_r,
128 |     torch::Tensor& dL_dcov,
129 |     torch::Tensor& dL_dms,
130 |     torch::Tensor& dL_dcov_t);
131 | 
132 | 
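// Evaluates 4D (space-time) spherical harmonics into per-point RGB.
// Tensor shapes below are inferred from the definitions in
// rasterize_points.cu and noted here for convenience:
//   sh: (P, M, 3) SH coefficients, dir: (P, 3) view directions,
//   dir_t: (P, 1) temporal offsets; returns rgb: (P, 3).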
torch::Tensor computeSH4D(
133 |     const int deg,
134 |     const int deg_t,
135 |     torch::Tensor& sh,
136 |     torch::Tensor& dir,
137 |     torch::Tensor& dir_t,
138 |     const float duration
139 | );
140 | 
141 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeSH4DBackward(
142 |     const int deg,
143 |     const int deg_t,
144 |     torch::Tensor& sh,
145 |     torch::Tensor& dir,
146 |     torch::Tensor& dir_t,
147 |     const float duration,
148 |     torch::Tensor& dL_drgb
149 | );
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Differential Gaussian Rasterization Improved
2 | 
3 | ## Faster Backward Pass
4 | 
5 | This is only faster when there is a large number of semi-transparent (almost transparent) Gaussians to be rendered, since the optimization introduces a small overhead for regular rendering.
6 | 
7 | The original backward implementation uses `atomicAdd` on global CUDA memory.
8 | 
9 | We accelerate this process by using the `__shared__` memory of a thread block to store the temporarily accumulated gradients, just like the original implementation already does for the Gaussian properties it loads per block.
10 | 
11 | No API change is required for this functionality, and you can directly check out what we changed in [backward.cu](cuda_rasterizer/backward.cu#417).
12 | 
13 | The change can be summarized in this pseudo-code:
14 | 
15 | ```c++
16 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
17 | renderCUDA(...) {
18 | 
19 |     __shared__ float3 s_dL_dmean2D[BLOCK_SIZE]; // allocate shared memory
20 |     s_dL_dmean2D[block.thread_rank()] = {0.0f, 0.0f, 0.0f}; // zero-initialize shared memory
21 | 
22 |     for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) { // iterate over the Gaussians that influence this pixel
23 |         // Compute gradients
24 |         ...
25 | 
26 |         // Accumulate gradients w.r.t. the 2D mean position of the Gaussian in shared memory
27 |         atomicAdd(&s_dL_dmean2D[j].x, dL_dG * dG_ddelx * ddelx_dx);
28 |         atomicAdd(&s_dL_dmean2D[j].y, dL_dG * dG_ddely * ddely_dy);
29 |     }
30 | 
31 |     atomicAdd(&dL_dmean2D[global_id].x, s_dL_dmean2D[block.thread_rank()].x); // flush the block's sums to global memory
32 |     atomicAdd(&dL_dmean2D[global_id].y, s_dL_dmean2D[block.thread_rank()].y);
33 | }
34 | ```
35 | 
36 | In an effort to make this process even faster, we've also implemented a warp-reduction-based version of the backward pass on top of the `__shared__` memory optimization.
37 | 
38 | It directly communicates the gradient accumulation within a 32-thread warp using:
39 | 
40 | ```c++
41 | __device__ float warpReduceSum(float value) {
42 |     auto warp = cg::coalesced_threads();
43 |     for (int offset = warp.size() / 2; offset > 0; offset /= 2) {
44 |         value += warp.shfl_down(value, offset);
45 |     }
46 |     return value;
47 | }
48 | ```
49 | 
50 | and later aggregates the warp sums into `__shared__` memory:
51 | 
52 | ```c++
53 | ...
54 | // Use a single thread from each warp to perform block-level reduction
55 | if (block.thread_rank() % warp.size() == 0) {
56 |     for (int ch = 0; ch < C; ch++) {
57 |         atomicAdd(&(s_dL_dcolors[ch * BLOCK_SIZE + j]), w_dL_dcolors[ch]);
58 |     }
59 |     atomicAdd(&(s_dL_ddepths[j]), w_dL_ddepths);
60 |     atomicAdd(&s_dL_dmean2D[j].x, w_dL_dmean2D.x);
61 |     atomicAdd(&s_dL_dmean2D[j].y, w_dL_dmean2D.y);
62 |     atomicAdd(&s_dL_dconic2D[j].x, w_dL_dconic2D.x);
63 |     atomicAdd(&s_dL_dconic2D[j].y, w_dL_dconic2D.y);
64 |     atomicAdd(&s_dL_dconic2D[j].w, w_dL_dconic2D.w);
65 |     atomicAdd(&(s_dL_dopacity[j]), w_dL_dopacity);
66 | }
67 | ...
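// (not shown) after the loop, each thread flushes its shared-memory slot
// to global memory with a single atomicAdd per property, mirroring the
// final flush of the shared-memory-only pseudo-code above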
68 | ```
69 | 
70 | This shaves off another 2-3 ms from the backward pass at the start of training, but curiously the speedup does not persist throughout the whole training process.
71 | 
72 | Thus, by default, only the `__shared__` memory optimization is enabled and in use.
73 | 
74 | Note: this seems slower... See: https://developer.nvidia.com/blog/gpu-pro-tip-fast-histograms-using-shared-atomics-maxwell
75 | 
76 | ## Tile-Based Culling
77 | 
78 | Following [StopThePop: Sorted Gaussian Splatting for View-Consistent Real-time Rendering](https://github.com/r4dl/StopThePop-Rasterization), we borrow their tile-based culling scheme to reduce the computational cost during training and rendering.
79 | 
80 | This section of code is directly adapted from their repository.
81 | 
82 | ```c++
83 | ...
84 | constexpr float alpha_threshold = 1.0f / 255.0f;
85 | const float opacity_power_threshold = log(conic_opacity[idx].w / alpha_threshold);
86 | glm::vec2 max_pos;
87 | const glm::vec2 tile_min = {x * BLOCK_X, y * BLOCK_Y};
88 | const glm::vec2 tile_max = {(x + 1) * BLOCK_X - 1, (y + 1) * BLOCK_Y - 1};
89 | float max_opac_factor = max_contrib_power_rect_gaussian_float(conic_opacity[idx], points_xy[idx], tile_min, tile_max, max_pos);
90 | 
91 | if (max_opac_factor > opacity_power_threshold) {
92 |     continue;
93 | }
94 | ...
95 | ```
96 | 
97 | Note: this seems slower...
98 | 
99 | ## Tile-Mask Rendering
100 | 
101 | **Note: this API hasn't been fully tested yet.**
102 | 
103 | We additionally provide an interface for adding a tile mask to the Gaussian rasterizer.
104 | 
105 | It turns out that the tile-based rasterization pipeline can easily be masked to produce a patch-like rendering result (simulating a NeRF-like ray sampling approach).
106 | 
107 | To implement this as efficiently as possible, we:
108 | 
109 | 1. Mark points that are not to be rendered as early as possible in the `preprocessCUDA` kernel.
110 | 2. Make all subsequent operations faster by excluding masked-out tiles from the sorting and the `renderCUDA` kernel.
111 | 
112 | The tile mask can be defined as:
113 | 
114 | ```python
115 | from diff_gauss import GaussianRasterizationSettings, GaussianRasterizer
116 | raster_settings = GaussianRasterizationSettings(...)
117 | rasterizer = GaussianRasterizer(raster_settings=raster_settings)
118 | 
119 | BLOCK_X, BLOCK_Y = 16, 16
120 | tile_height, tile_width = (raster_settings.image_height + BLOCK_Y - 1) // BLOCK_Y, (raster_settings.image_width + BLOCK_X - 1) // BLOCK_X
121 | tile_mask = torch.ones((tile_height, tile_width), dtype=torch.bool, device='cuda')
122 | 
123 | rendered_image, rendered_depth, rendered_alpha, radii = rasterizer(
124 |     means3D = means3D,
125 |     means2D = means2D,
126 |     shs = shs,
127 |     colors_precomp = colors_precomp,
128 |     opacities = opacity,
129 |     scales = scales,
130 |     rotations = rotations,
131 |     cov3D_precomp = cov3D_precomp,
132 |     tile_mask = tile_mask,
133 | )
134 | ```
135 | 
136 | ## Fixed `ImageState` Buffer Size
137 | 
138 | In the [original implementation](https://github.com/graphdeco-inria/diff-gaussian-rasterization), the size of the `ranges` member of the struct `ImageState` was too large (the same as the number of pixels).
139 | 
140 | In reality, only `number of tiles` entries of `ranges` are needed, since `ranges` stores the per-tile start and end indices into the sorted list of Gaussian splats in the `BinningState` buffer (see the sketch below for the exact tile count).
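
For concreteness, the required number of `ranges` entries is just the padded tile count. A minimal sketch (the `num_tiles` helper is hypothetical; `16` matches the `BLOCK_X`/`BLOCK_Y` defaults from `config.h`):

```python
def num_tiles(width: int, height: int, block_x: int = 16, block_y: int = 16) -> int:
    # One (start, end) range per tile: ceil-divide the image into tiles,
    # matching the tile-grid computation used by the rasterizer.
    tiles_x = (width + block_x - 1) // block_x
    tiles_y = (height + block_y - 1) // block_y
    return tiles_x * tiles_y


# e.g., a 1920x1080 image needs 120 * 68 = 8160 ranges, not 1920 * 1080.
print(num_tiles(1920, 1080))  # -> 8160
```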
141 | 
142 | We fix this by simply replacing the memory allocation of `ImageState` with:
143 | 
144 | ```c++
145 | CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N, size_t M)
146 | {
147 |     ImageState img;
148 |     obtain(chunk, img.n_contrib, N, 128);
149 |     obtain(chunk, img.ranges, M, 128);
150 |     return img;
151 | }
152 | ```
153 | 
154 | ## Fixed Culling
155 | 
156 | The [original repository](https://github.com/graphdeco-inria/diff-gaussian-rasterization)'s implementation of view-space culling wasn't effective (no points were culled).
157 | 
158 | We fixed that with an improved OpenGL-like culling function:
159 | 
160 | ```c++
161 | __forceinline__ __device__ bool in_frustum(int idx,
162 |     const float* orig_points,
163 |     const float* viewmatrix,
164 |     const float* projmatrix,
165 |     bool prefiltered,
166 |     float3& p_view, // reference
167 |     const float padding = 0.01f, // padding in ndc space // TODO: add api for changing this
168 |     const float xy_padding = 0.5f // padding in ndc space // TODO: add api for changing this
169 | )
170 | {
171 |     float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
172 |     p_view = transformPoint4x3(p_orig, viewmatrix); // write this outside
173 |     if (prefiltered) return true;
174 | 
175 |     // Bring points to screen space
176 |     float4 p_hom = transformPoint4x4(p_orig, projmatrix);
177 |     float p_w = 1.0f / (p_hom.w + 0.0000001f);
178 |     float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
179 | 
180 |     return (p_proj.z > -1 - padding) && (p_proj.z < 1 + padding) && (p_proj.x > -1 - xy_padding) && (p_proj.x < 1. + xy_padding) && (p_proj.y > -1 - xy_padding) && (p_proj.y < 1. + xy_padding);
181 | }
182 | ```
183 | 
184 | ## Depth & Alpha Backward
185 | 
186 | **Note: this functionality is directly copied from the [slothfulxtx repository](https://github.com/slothfulxtx/diff-gaussian-rasterization).**
187 | 
188 | In addition to the RGB image, we also support rendering a depth map and an alpha map (both the forward and backward passes), compared with the [original repository](https://github.com/graphdeco-inria/diff-gaussian-rasterization).
189 | 
190 | We renamed the package to **diff_gauss** to avoid dependency conflicts with the original version; it can be installed like any other PyTorch CUDA extension (e.g., with `pip install .` from the repository root).
191 | 
192 | Here's an example of using our modified differential Gaussian rasterization:
193 | ```python
194 | from diff_gauss import GaussianRasterizationSettings, GaussianRasterizer
195 | raster_settings = GaussianRasterizationSettings(...)
196 | rasterizer = GaussianRasterizer(raster_settings=raster_settings)
197 | 
198 | rendered_image, rendered_depth, rendered_alpha, radii = rasterizer(
199 |     means3D = means3D,
200 |     means2D = means2D,
201 |     shs = shs,
202 |     colors_precomp = colors_precomp,
203 |     opacities = opacity,
204 |     scales = scales,
205 |     rotations = rotations,
206 |     cov3D_precomp = cov3D_precomp
207 | )
208 | ```
209 | 
210 | Details: by default, the depth is computed as a 'median depth', where the depth value of each pixel covered by a 3D Gaussian is set to the depth of that Gaussian's center. Thus, numerical errors arise when the scales of the 3D Gaussians are large. However, thanks to the densification scheme, most 3D Gaussians are small, so we currently ignore this numerical error in the depth maps.
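
To make the 'median depth' convention concrete, here is a minimal Python sketch of one common formulation (illustrative only: the function and variable names are hypothetical, and the actual CUDA kernel may differ in details):

```python
def median_depth(sorted_splats, threshold=0.5):
    # sorted_splats: front-to-back (alpha, center_depth) pairs for the
    # Gaussians overlapping one pixel; each depth is the depth of the
    # Gaussian *center*, which is the source of the error noted above.
    T = 1.0  # transmittance accumulated so far
    for alpha, depth in sorted_splats:
        T *= 1.0 - alpha
        if 1.0 - T >= threshold:  # accumulated opacity crossed the threshold
            return depth  # report this Gaussian's center depth
    return 0.0  # pixel never became opaque enough; fall back to background


# Example: two nearly opaque splats; the first one already crosses 0.5.
print(median_depth([(0.8, 1.5), (0.9, 2.0)]))  # -> 1.5
```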
211 | 
212 | ## Differential Gaussian Rasterization
213 | 
214 | **Note: this is the original readme of the [diff-gaussian-rasterization repository](https://github.com/graphdeco-inria/diff-gaussian-rasterization).**
215 | 
216 | Used as the rasterization engine for the paper "3D Gaussian Splatting for Real-Time Radiance Field Rendering". If you can make use of it in your own research, please be so kind to cite us.
217 | 
218 | 
219 | 
220 | ```bibtex
221 | @Article{kerbl3Dgaussians,
222 |       author       = {Kerbl, Bernhard and Kopanas, Georgios and Leimk{\"u}hler, Thomas and Drettakis, George},
223 |       title        = {3D Gaussian Splatting for Real-Time Radiance Field Rendering},
224 |       journal      = {ACM Transactions on Graphics},
225 |       number       = {4},
226 |       volume       = {42},
227 |       month        = {July},
228 |       year         = {2023},
229 |       url          = {https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/}
230 | }
231 | ```
232 | 
233 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | from setuptools import setup 13 | from os.path import dirname, join, abspath 14 | from torch.utils.cpp_extension import CUDAExtension, BuildExtension 15 | 16 | dirname(abspath(__file__)) 17 | 18 | setup( 19 | name="diff_gauss", 20 | packages=['diff_gauss'], 21 | ext_modules=[ 22 | CUDAExtension( 23 | name="diff_gauss._C", 24 | sources=[ 25 | "cuda_rasterizer/rasterizer_impl.cu", 26 | "cuda_rasterizer/forward.cu", 27 | # "cuda_rasterizer/forward_half.cu", 28 | "cuda_rasterizer/backward.cu", 29 | "rasterize_points.cu", 30 | "ext.cpp"], 31 | extra_compile_args={"nvcc": [ 32 | "-O3", 33 | "-Xcompiler", 34 | "-fno-gnu-unique", 35 | # "-G", 36 | "-I" + join(dirname(abspath(__file__)), "third_party/glm/")]}) 37 | ], 38 | cmdclass={ 39 | 'build_ext': BuildExtension 40 | }, 41 | ) 42 | --------------------------------------------------------------------------------
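
For reference, here is a minimal smoke test of the built extension (a sketch: it assumes a CUDA-enabled PyTorch environment and that the package was installed, e.g. with `pip install .` from the repository root):

```python
import torch
from diff_gauss import _C  # the compiled CUDA extension declared in setup.py

P = 8
means3D = torch.rand(P, 3, device='cuda')
viewmatrix = torch.eye(4, device='cuda')  # placeholder camera matrices
projmatrix = torch.eye(4, device='cuda')

# mark_visible performs frustum culling and returns a (P,) bool tensor.
visible = _C.mark_visible(means3D, viewmatrix, projmatrix)
print(visible.sum().item(), 'of', P, 'points visible')
```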