├── .gitignore ├── .gitmodules ├── .vscode └── settings.json ├── CMakeLists.txt ├── cuda_rasterizer ├── auxiliary.h ├── backward.cu ├── backward.h ├── config.h ├── forward.cu ├── forward.h ├── rasterizer.h ├── rasterizer_impl.cu └── rasterizer_impl.h ├── diff_gauss └── __init__.py ├── ext.cpp ├── license.md ├── rasterize_points.cu ├── rasterize_points.h ├── readme.md ├── setup.py └── third_party └── stbi_image_write.h /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | *.egg-info/ 3 | dist/ 4 | __pycache__/ 5 | *.pyd 6 | Makefile 7 | CMakeCache.txt 8 | CMakeFiles 9 | *.a 10 | *.cmake 11 | *.so 12 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/glm"] 2 | path = third_party/glm 3 | url = https://github.com/g-truc/glm.git 4 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "cmath": "cpp", 4 | "stdexcept": "cpp", 5 | "cctype": "cpp", 6 | "clocale": "cpp", 7 | "csignal": "cpp", 8 | "cstdarg": "cpp", 9 | "cstddef": "cpp", 10 | "cstdio": "cpp", 11 | "cstdlib": "cpp", 12 | "cstring": "cpp", 13 | "ctime": "cpp", 14 | "cwchar": "cpp", 15 | "cwctype": "cpp", 16 | "array": "cpp", 17 | "atomic": "cpp", 18 | "strstream": "cpp", 19 | "bit": "cpp", 20 | "*.tcc": "cpp", 21 | "bitset": "cpp", 22 | "chrono": "cpp", 23 | "compare": "cpp", 24 | "complex": "cpp", 25 | "concepts": "cpp", 26 | "condition_variable": "cpp", 27 | "cstdint": "cpp", 28 | "deque": "cpp", 29 | "list": "cpp", 30 | "map": "cpp", 31 | "set": "cpp", 32 | "string": "cpp", 33 | "unordered_map": "cpp", 34 | "unordered_set": "cpp", 35 | "vector": "cpp", 36 | "exception": "cpp", 37 | "algorithm": "cpp", 38 | "functional": "cpp", 39 | "iterator": "cpp", 40 | "memory": "cpp", 41 | "memory_resource": "cpp", 42 | "numeric": "cpp", 43 | "optional": "cpp", 44 | "random": "cpp", 45 | "ratio": "cpp", 46 | "string_view": "cpp", 47 | "system_error": "cpp", 48 | "tuple": "cpp", 49 | "type_traits": "cpp", 50 | "utility": "cpp", 51 | "fstream": "cpp", 52 | "initializer_list": "cpp", 53 | "iomanip": "cpp", 54 | "iosfwd": "cpp", 55 | "iostream": "cpp", 56 | "istream": "cpp", 57 | "limits": "cpp", 58 | "mutex": "cpp", 59 | "new": "cpp", 60 | "numbers": "cpp", 61 | "ostream": "cpp", 62 | "semaphore": "cpp", 63 | "sstream": "cpp", 64 | "stop_token": "cpp", 65 | "streambuf": "cpp", 66 | "thread": "cpp", 67 | "cfenv": "cpp", 68 | "cinttypes": "cpp", 69 | "typeindex": "cpp", 70 | "typeinfo": "cpp", 71 | "valarray": "cpp", 72 | "variant": "cpp", 73 | "*.ipp": "cpp" 74 | } 75 | } -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 
8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | cmake_minimum_required(VERSION 3.20) 13 | 14 | project(DiffRast LANGUAGES CUDA CXX) 15 | 16 | set(CMAKE_CXX_STANDARD 17) 17 | set(CMAKE_CXX_EXTENSIONS OFF) 18 | set(CMAKE_CUDA_STANDARD 17) 19 | 20 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") 21 | 22 | add_library(CudaRasterizer 23 | cuda_rasterizer/backward.h 24 | cuda_rasterizer/backward.cu 25 | cuda_rasterizer/forward.h 26 | cuda_rasterizer/forward.cu 27 | cuda_rasterizer/auxiliary.h 28 | cuda_rasterizer/rasterizer_impl.cu 29 | cuda_rasterizer/rasterizer_impl.h 30 | cuda_rasterizer/rasterizer.h 31 | ) 32 | 33 | set_target_properties(CudaRasterizer PROPERTIES CUDA_ARCHITECTURES "70;75;86") 34 | 35 | target_include_directories(CudaRasterizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/cuda_rasterizer) 36 | target_include_directories(CudaRasterizer PRIVATE third_party/glm ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 37 | -------------------------------------------------------------------------------- /cuda_rasterizer/auxiliary.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_AUXILIARY_H_INCLUDED 13 | #define CUDA_RASTERIZER_AUXILIARY_H_INCLUDED 14 | 15 | #include "config.h" 16 | #include "stdio.h" 17 | #include <cooperative_groups.h> 18 | #include <glm/glm.hpp> 19 | namespace cg = cooperative_groups; 20 | 21 | #define BLOCK_SIZE (BLOCK_X * BLOCK_Y) 22 | #define NUM_WARPS (BLOCK_SIZE/32) 23 | #define MY_PI 3.14159265 24 | 25 | // Spherical harmonics coefficients 26 | __device__ const float SH_C0 = 0.28209479177387814f; 27 | __device__ const float SH_C1 = 0.4886025119029199f; 28 | __device__ const float SH_C2[] = { 29 | 1.0925484305920792f, 30 | -1.0925484305920792f, 31 | 0.31539156525252005f, 32 | -1.0925484305920792f, 33 | 0.5462742152960396f 34 | }; 35 | __device__ const float SH_C3[] = { 36 | -0.5900435899266435f, 37 | 2.890611442640554f, 38 | -0.4570457994644658f, 39 | 0.3731763325901154f, 40 | -0.4570457994644658f, 41 | 1.445305721320277f, 42 | -0.5900435899266435f 43 | }; 44 | 45 | __forceinline__ __device__ float ndc2Pix(float v, int S) 46 | { 47 | return ((v + 1.0) * S - 1.0) * 0.5; 48 | } 49 | 50 | __forceinline__ __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid) 51 | { 52 | rect_min = { 53 | min(grid.x, max((int)0, (int)((p.x - max_radius - 0.5) / BLOCK_X))), 54 | min(grid.y, max((int)0, (int)((p.y - max_radius - 0.5) / BLOCK_Y))) 55 | }; 56 | rect_max = { 57 | min(grid.x, max((int)0, (int)((p.x + max_radius + BLOCK_X - 1 + 0.5) / BLOCK_X))), 58 | min(grid.y, max((int)0, (int)((p.y + max_radius + BLOCK_Y - 1 + 0.5) / BLOCK_Y))) 59 | }; 60 | } 61 | 62 | __forceinline__ __device__ float3 transformPoint4x3(const float3& p, const float* matrix) 63 | { 64 | float3 transformed = { 65 | matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12], 66 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13], 67 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14], 68 | }; 69 | return transformed; 70 | } 71 | 72 | __forceinline__ __device__ float4 transformPoint4x4(const float3& p, const float* matrix) 73 | { 74 | float4 transformed = { 75 |
matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12], 76 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13], 77 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14], 78 | matrix[3] * p.x + matrix[7] * p.y + matrix[11] * p.z + matrix[15] 79 | }; 80 | return transformed; 81 | } 82 | 83 | __forceinline__ __device__ float3 transformVec4x3(const float3& p, const float* matrix) 84 | { 85 | float3 transformed = { 86 | matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z, 87 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z, 88 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z, 89 | }; 90 | return transformed; 91 | } 92 | 93 | __forceinline__ __device__ float3 transformVec4x3Transpose(const float3& p, const float* matrix) 94 | { 95 | float3 transformed = { 96 | matrix[0] * p.x + matrix[1] * p.y + matrix[2] * p.z, 97 | matrix[4] * p.x + matrix[5] * p.y + matrix[6] * p.z, 98 | matrix[8] * p.x + matrix[9] * p.y + matrix[10] * p.z, 99 | }; 100 | return transformed; 101 | } 102 | 103 | __forceinline__ __device__ float dnormvdz(float3 v, float3 dv) 104 | { 105 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; 106 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 107 | float dnormvdz = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32; 108 | return dnormvdz; 109 | } 110 | 111 | __forceinline__ __device__ float3 dnormvdv(float3 v, float3 dv) 112 | { 113 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; 114 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 115 | 116 | float3 dnormvdv; 117 | dnormvdv.x = ((+sum2 - v.x * v.x) * dv.x - v.y * v.x * dv.y - v.z * v.x * dv.z) * invsum32; 118 | dnormvdv.y = (-v.x * v.y * dv.x + (sum2 - v.y * v.y) * dv.y - v.z * v.y * dv.z) * invsum32; 119 | dnormvdv.z = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32; 120 | return dnormvdv; 121 | } 122 | 123 | __forceinline__ __device__ float4 dnormvdv(float4 v, float4 dv) 124 | { 125 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w; 126 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 127 | 128 | float4 vdv = { v.x * dv.x, v.y * dv.y, v.z * dv.z, v.w * dv.w }; 129 | float vdv_sum = vdv.x + vdv.y + vdv.z + vdv.w; 130 | float4 dnormvdv; 131 | dnormvdv.x = ((sum2 - v.x * v.x) * dv.x - v.x * (vdv_sum - vdv.x)) * invsum32; 132 | dnormvdv.y = ((sum2 - v.y * v.y) * dv.y - v.y * (vdv_sum - vdv.y)) * invsum32; 133 | dnormvdv.z = ((sum2 - v.z * v.z) * dv.z - v.z * (vdv_sum - vdv.z)) * invsum32; 134 | dnormvdv.w = ((sum2 - v.w * v.w) * dv.w - v.w * (vdv_sum - vdv.w)) * invsum32; 135 | return dnormvdv; 136 | } 137 | 138 | __forceinline__ __device__ float sigmoid(float x) 139 | { 140 | return 1.0f / (1.0f + expf(-x)); 141 | } 142 | 143 | __forceinline__ __device__ float dist2(float2 d) 144 | { 145 | return d.x * d.x + d.y * d.y; 146 | } 147 | 148 | __forceinline__ __device__ bool in_frustum(int idx, 149 | const float* orig_points, 150 | const float* viewmatrix, 151 | const float* projmatrix, 152 | bool prefiltered, 153 | float3& p_view, // reference 154 | const float padding = 0.01f, // padding in ndc space // TODO: add api for changing this 155 | const float xy_padding = 0.2f // padding in ndc space // TODO: add api for changing this 156 | ) 157 | { 158 | float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; 159 | p_view = transformPoint4x3(p_orig, viewmatrix); // write this outside 160 | if (prefiltered) return true; 161 | 162 | // Bring points to screen space 163 | float4 
p_hom = transformPoint4x4(p_orig, projmatrix); 164 | float p_w = 1.0f / (p_hom.w + 0.0000001f); 165 | float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; 166 | 167 | return (p_proj.z > -1 - padding) && (p_proj.z < 1 + padding) && (p_proj.x > -1 - xy_padding) && (p_proj.x < 1. + xy_padding) && (p_proj.y > -1 - xy_padding) && (p_proj.y < 1. + xy_padding); 168 | } 169 | 170 | __forceinline__ __device__ bool check_frustum( 171 | const float3 p_orig, 172 | const float* viewmatrix, 173 | const float* projmatrix, 174 | const float padding = 0.01f, // padding in ndc space // TODO: add api for changing this 175 | const float xy_padding = 0.2f // padding in ndc space // TODO: add api for changing this 176 | ) 177 | { 178 | float3 p_view = transformPoint4x3(p_orig, viewmatrix); // write this outside 179 | 180 | // Bring points to screen space 181 | float4 p_hom = transformPoint4x4(p_orig, projmatrix); 182 | float p_w = 1.0f / (p_hom.w + 0.0000001f); 183 | float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; 184 | 185 | return (p_proj.z > -1 - padding) && (p_proj.z < 1 + padding) && (p_proj.x > -1 - xy_padding) && (p_proj.x < 1. + xy_padding) && (p_proj.y > -1 - xy_padding) && (p_proj.y < 1. + xy_padding); 186 | } 187 | 188 | 189 | // As mentioned in: StopThePop: Sorted Gaussian Splatting for View-Consistent Real-time Rendering 190 | __device__ inline float evaluate_opacity_factor(const float dx, const float dy, const float4 co) 191 | { 192 | return 0.5f * (co.x * dx * dx + co.z * dy * dy) + co.y * dx * dy; 193 | } 194 | 195 | __device__ inline float evaluate_opacity(const float dx, const float dy, const float4 co) 196 | { 197 | return co.w * expf(-evaluate_opacity_factor(dx, dy, co)); 198 | } 199 | 200 | template <uint32_t PATCH_WIDTH, uint32_t PATCH_HEIGHT> 201 | __device__ inline float max_contrib_power_rect_gaussian_float( 202 | const float4 co, 203 | const float2 mean, 204 | const glm::vec2 rect_min, 205 | const glm::vec2 rect_max, 206 | glm::vec2& max_pos) 207 | { 208 | const float x_min_diff = rect_min.x - mean.x; 209 | const float x_left = x_min_diff > 0.0f; 210 | // const float x_left = mean.x < rect_min.x; 211 | const float not_in_x_range = x_left + (mean.x > rect_max.x); 212 | 213 | const float y_min_diff = rect_min.y - mean.y; 214 | const float y_above = y_min_diff > 0.0f; 215 | // const float y_above = mean.y < rect_min.y; 216 | const float not_in_y_range = y_above + (mean.y > rect_max.y); 217 | 218 | max_pos = {mean.x, mean.y}; 219 | float max_contrib_power = 0.0f; 220 | 221 | if ((not_in_y_range + not_in_x_range) > 0.0f) 222 | { 223 | const float px = x_left * rect_min.x + (1.0f - x_left) * rect_max.x; 224 | const float py = y_above * rect_min.y + (1.0f - y_above) * rect_max.y; 225 | 226 | const float dx = copysign(float(PATCH_WIDTH), x_min_diff); 227 | const float dy = copysign(float(PATCH_HEIGHT), y_min_diff); 228 | 229 | const float diffx = mean.x - px; 230 | const float diffy = mean.y - py; 231 | 232 | const float rcp_dxdxcox = __frcp_rn(PATCH_WIDTH * PATCH_WIDTH * co.x); // = 1.0 / (dx*dx*co.x) 233 | const float rcp_dydycoz = __frcp_rn(PATCH_HEIGHT * PATCH_HEIGHT * co.z); // = 1.0 / (dy*dy*co.z) 234 | 235 | const float tx = not_in_y_range * __saturatef((dx * co.x * diffx + dx * co.y * diffy) * rcp_dxdxcox); 236 | const float ty = not_in_x_range * __saturatef((dy * co.y * diffx + dy * co.z * diffy) * rcp_dydycoz); 237 | max_pos = {px + tx * dx, py + ty * dy}; 238 | 239 | const float2 max_pos_diff = {mean.x - max_pos.x, mean.y - max_pos.y}; 240 | max_contrib_power =
evaluate_opacity_factor(max_pos_diff.x, max_pos_diff.y, co); 241 | } 242 | 243 | return max_contrib_power; 244 | } 245 | 246 | 247 | __device__ inline int computeTilebasedCullingTileCount( 248 | const float4 co_init, 249 | const float2 xy_init, 250 | const float opacity_power_threshold_init, 251 | const uint2 rect_min_init, 252 | const uint2 rect_max_init) 253 | { 254 | const int32_t tile_count_init = (rect_max_init.y - rect_min_init.y) * (rect_max_init.x - rect_min_init.x); 255 | 256 | int tile_count = 0; 257 | const uint32_t rect_width = (rect_max_init.x - rect_min_init.x); 258 | for (int tile_idx = 0; tile_idx < tile_count_init; tile_idx++) 259 | { 260 | const int y = (tile_idx / rect_width) + rect_min_init.y; 261 | const int x = (tile_idx % rect_width) + rect_min_init.x; 262 | 263 | const glm::vec2 tile_min = {x * BLOCK_X, y * BLOCK_Y}; 264 | const glm::vec2 tile_max = {(x + 1) * BLOCK_X - 1, (y + 1) * BLOCK_Y - 1}; 265 | 266 | glm::vec2 max_pos; 267 | float max_opac_factor = max_contrib_power_rect_gaussian_float<BLOCK_X, BLOCK_Y>(co_init, xy_init, tile_min, tile_max, max_pos); 268 | tile_count += (max_opac_factor <= opacity_power_threshold_init); 269 | } 270 | 271 | return tile_count; 272 | } 273 | 274 | #define CHECK_CUDA(A, debug) \ 275 | A; \ 276 | if(debug) { \ 277 | auto ret = cudaDeviceSynchronize(); \ 278 | if (ret != cudaSuccess) { \ 279 | std::cerr << "[CUDA ERROR] in " << __FILE__ \ 280 | << " Line " << __LINE__ << ": " << cudaGetErrorString(ret) << std::endl; \ 281 | throw std::runtime_error(cudaGetErrorString(ret)); \ 282 | } \ 283 | } 284 | 285 | #define TEST_CUDA_MEMORY() \ 286 | do { \ 287 | const int N = 1337, bytes = N * sizeof(float); \ 288 | std::vector<float> cpuvec(N); \ 289 | for (size_t i = 0; i < N; i++) \ 290 | cpuvec[i] = (float)i; \ 291 | float *gpuvec = NULL; \ 292 | CHECK_CUDA(cudaMalloc(&gpuvec, bytes), true); \ 293 | assert(gpuvec != NULL); \ 294 | CHECK_CUDA( \ 295 | cudaMemcpy(gpuvec, cpuvec.data(), bytes, cudaMemcpyHostToDevice), true) \ 296 | CHECK_CUDA( \ 297 | cudaMemcpy(cpuvec.data(), gpuvec, bytes, cudaMemcpyDeviceToHost), true) \ 298 | CHECK_CUDA(cudaFree(gpuvec), true); \ 299 | } while (0); 300 | 301 | #endif 302 | -------------------------------------------------------------------------------- /cuda_rasterizer/backward.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include "backward.h" 13 | #include "auxiliary.h" 14 | #include <cooperative_groups.h> 15 | #include <cooperative_groups/reduce.h> 16 | namespace cg = cooperative_groups; 17 | 18 | // Backward pass for conversion of spherical harmonics to RGB for 19 | // each Gaussian. 20 | __device__ void computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, const bool* clamped, const glm::vec3* dL_dcolor, glm::vec3* dL_dmeans, glm::vec3* dL_dshs) 21 | { 22 | // Compute intermediate values, as it is done during forward 23 | glm::vec3 pos = means[idx]; 24 | glm::vec3 dir_orig = pos - campos; 25 | glm::vec3 dir = dir_orig / glm::length(dir_orig); 26 | 27 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; 28 | 29 | // Use PyTorch rule for clamping: if clamping was applied, 30 | // gradient becomes 0.
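// (Context, mirroring the matching forward kernel in forward.cu: the
// forward pass offsets the SH result by +0.5 and clamps it to be
// non-negative, recording in "clamped" which channels were clipped;
// the multiplies below therefore zero the incoming gradient exactly
// where the forward clamp was active.)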
31 | glm::vec3 dL_dRGB = dL_dcolor[idx]; 32 | dL_dRGB.x *= clamped[3 * idx + 0] ? 0 : 1; 33 | dL_dRGB.y *= clamped[3 * idx + 1] ? 0 : 1; 34 | dL_dRGB.z *= clamped[3 * idx + 2] ? 0 : 1; 35 | 36 | glm::vec3 dRGBdx(0, 0, 0); 37 | glm::vec3 dRGBdy(0, 0, 0); 38 | glm::vec3 dRGBdz(0, 0, 0); 39 | float x = dir.x; 40 | float y = dir.y; 41 | float z = dir.z; 42 | 43 | // Target location for this Gaussian to write SH gradients to 44 | glm::vec3* dL_dsh = dL_dshs + idx * max_coeffs; 45 | 46 | // No tricks here, just high school-level calculus. 47 | float dRGBdsh0 = SH_C0; 48 | dL_dsh[0] = dRGBdsh0 * dL_dRGB; 49 | if (deg > 0) 50 | { 51 | float dRGBdsh1 = -SH_C1 * y; 52 | float dRGBdsh2 = SH_C1 * z; 53 | float dRGBdsh3 = -SH_C1 * x; 54 | dL_dsh[1] = dRGBdsh1 * dL_dRGB; 55 | dL_dsh[2] = dRGBdsh2 * dL_dRGB; 56 | dL_dsh[3] = dRGBdsh3 * dL_dRGB; 57 | 58 | dRGBdx = -SH_C1 * sh[3]; 59 | dRGBdy = -SH_C1 * sh[1]; 60 | dRGBdz = SH_C1 * sh[2]; 61 | 62 | if (deg > 1) 63 | { 64 | float xx = x * x, yy = y * y, zz = z * z; 65 | float xy = x * y, yz = y * z, xz = x * z; 66 | 67 | float dRGBdsh4 = SH_C2[0] * xy; 68 | float dRGBdsh5 = SH_C2[1] * yz; 69 | float dRGBdsh6 = SH_C2[2] * (2.f * zz - xx - yy); 70 | float dRGBdsh7 = SH_C2[3] * xz; 71 | float dRGBdsh8 = SH_C2[4] * (xx - yy); 72 | dL_dsh[4] = dRGBdsh4 * dL_dRGB; 73 | dL_dsh[5] = dRGBdsh5 * dL_dRGB; 74 | dL_dsh[6] = dRGBdsh6 * dL_dRGB; 75 | dL_dsh[7] = dRGBdsh7 * dL_dRGB; 76 | dL_dsh[8] = dRGBdsh8 * dL_dRGB; 77 | 78 | dRGBdx += SH_C2[0] * y * sh[4] + SH_C2[2] * 2.f * -x * sh[6] + SH_C2[3] * z * sh[7] + SH_C2[4] * 2.f * x * sh[8]; 79 | dRGBdy += SH_C2[0] * x * sh[4] + SH_C2[1] * z * sh[5] + SH_C2[2] * 2.f * -y * sh[6] + SH_C2[4] * 2.f * -y * sh[8]; 80 | dRGBdz += SH_C2[1] * y * sh[5] + SH_C2[2] * 2.f * 2.f * z * sh[6] + SH_C2[3] * x * sh[7]; 81 | 82 | if (deg > 2) 83 | { 84 | float dRGBdsh9 = SH_C3[0] * y * (3.f * xx - yy); 85 | float dRGBdsh10 = SH_C3[1] * xy * z; 86 | float dRGBdsh11 = SH_C3[2] * y * (4.f * zz - xx - yy); 87 | float dRGBdsh12 = SH_C3[3] * z * (2.f * zz - 3.f * xx - 3.f * yy); 88 | float dRGBdsh13 = SH_C3[4] * x * (4.f * zz - xx - yy); 89 | float dRGBdsh14 = SH_C3[5] * z * (xx - yy); 90 | float dRGBdsh15 = SH_C3[6] * x * (xx - 3.f * yy); 91 | dL_dsh[9] = dRGBdsh9 * dL_dRGB; 92 | dL_dsh[10] = dRGBdsh10 * dL_dRGB; 93 | dL_dsh[11] = dRGBdsh11 * dL_dRGB; 94 | dL_dsh[12] = dRGBdsh12 * dL_dRGB; 95 | dL_dsh[13] = dRGBdsh13 * dL_dRGB; 96 | dL_dsh[14] = dRGBdsh14 * dL_dRGB; 97 | dL_dsh[15] = dRGBdsh15 * dL_dRGB; 98 | 99 | dRGBdx += ( 100 | SH_C3[0] * sh[9] * 3.f * 2.f * xy + 101 | SH_C3[1] * sh[10] * yz + 102 | SH_C3[2] * sh[11] * -2.f * xy + 103 | SH_C3[3] * sh[12] * -3.f * 2.f * xz + 104 | SH_C3[4] * sh[13] * (-3.f * xx + 4.f * zz - yy) + 105 | SH_C3[5] * sh[14] * 2.f * xz + 106 | SH_C3[6] * sh[15] * 3.f * (xx - yy)); 107 | 108 | dRGBdy += ( 109 | SH_C3[0] * sh[9] * 3.f * (xx - yy) + 110 | SH_C3[1] * sh[10] * xz + 111 | SH_C3[2] * sh[11] * (-3.f * yy + 4.f * zz - xx) + 112 | SH_C3[3] * sh[12] * -3.f * 2.f * yz + 113 | SH_C3[4] * sh[13] * -2.f * xy + 114 | SH_C3[5] * sh[14] * -2.f * yz + 115 | SH_C3[6] * sh[15] * -3.f * 2.f * xy); 116 | 117 | dRGBdz += ( 118 | SH_C3[1] * sh[10] * xy + 119 | SH_C3[2] * sh[11] * 4.f * 2.f * yz + 120 | SH_C3[3] * sh[12] * 3.f * (2.f * zz - xx - yy) + 121 | SH_C3[4] * sh[13] * 4.f * 2.f * xz + 122 | SH_C3[5] * sh[14] * (xx - yy)); 123 | } 124 | } 125 | } 126 | 127 | // The view direction is an input to the computation. 
View direction 128 | // is influenced by the Gaussian's mean, so SHs gradients 129 | // must propagate back into 3D position. 130 | glm::vec3 dL_ddir(glm::dot(dRGBdx, dL_dRGB), glm::dot(dRGBdy, dL_dRGB), glm::dot(dRGBdz, dL_dRGB)); 131 | 132 | // Account for normalization of direction 133 | float3 dL_dmean = dnormvdv(float3{ dir_orig.x, dir_orig.y, dir_orig.z }, float3{ dL_ddir.x, dL_ddir.y, dL_ddir.z }); 134 | 135 | // Gradients of loss w.r.t. Gaussian means, but only the portion 136 | // that is caused because the mean affects the view-dependent color. 137 | // Additional mean gradient is accumulated in below methods. 138 | dL_dmeans[idx] += glm::vec3(dL_dmean.x, dL_dmean.y, dL_dmean.z); 139 | } 140 | 141 | 142 | // Backward pass for conversion of spherical harmonics to RGB for 143 | // each Gaussian. 144 | __device__ void computeColorFromSH_4D(int idx, int deg, int deg_t, int max_coeffs, 145 | const float* shs, const glm::vec3* dirs, const float* dirs_t, const float time_duration, 146 | const glm::vec3* dL_drgb, float* dL_dshs, glm::vec3* dL_ddir, float* dL_ddir_t) 147 | { 148 | // Compute intermediate values, as it is done during forward 149 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; 150 | glm::vec3 dir = dirs[idx]; 151 | const float dir_t = dirs_t[idx]; 152 | 153 | // Use PyTorch rule for clamping: if clamping was applied, 154 | // gradient becomes 0. 155 | glm::vec3 dL_dRGB = dL_drgb[idx]; 156 | 157 | glm::vec3 dRGBdx(0, 0, 0); 158 | glm::vec3 dRGBdy(0, 0, 0); 159 | glm::vec3 dRGBdz(0, 0, 0); 160 | glm::vec3 dRGBdt(0, 0, 0); 161 | 162 | // Target location for this Gaussian to write SH gradients to 163 | glm::vec3* dL_dsh = ((glm::vec3*)dL_dshs) + idx * max_coeffs; 164 | 165 | // No tricks here, just high school-level calculus. 166 | float l0m0 = SH_C0; 167 | 168 | float dRGBdsh0 = l0m0; 169 | dL_dsh[0] = dRGBdsh0 * dL_dRGB; 170 | 171 | if (deg > 0){ 172 | float x = dir.x; 173 | float y = dir.y; 174 | float z = dir.z; 175 | 176 | float l1m1 = -1 * SH_C1 * y; 177 | float l1m0 = SH_C1 * z; 178 | float l1p1 = -1 * SH_C1 * x; 179 | 180 | float dl1m1_dy = -1 * SH_C1; 181 | float dl1m0_dz = SH_C1; 182 | float dl1p1_dx = -1 * SH_C1; 183 | 184 | dL_dsh[1] = l1m1 * dL_dRGB; 185 | dL_dsh[2] = l1m0 * dL_dRGB; 186 | dL_dsh[3] = l1p1 * dL_dRGB; 187 | 188 | dRGBdx = dl1p1_dx * sh[3]; 189 | dRGBdy = dl1m1_dy * sh[1]; 190 | dRGBdz = dl1m0_dz * sh[2]; 191 | 192 | if (deg > 1){ 193 | float xx = x * x, yy = y * y, zz = z * z; 194 | float xy = x * y, yz = y * z, xz = x * z; 195 | 196 | float l2m2 = SH_C2[0] * xy; 197 | float l2m1 = SH_C2[1] * yz; 198 | float l2m0 = SH_C2[2] * (2.0 * zz - xx - yy); 199 | float l2p1 = SH_C2[3] * xz; 200 | float l2p2 = SH_C2[4] * (xx - yy); 201 | 202 | float dl2m2_dx = SH_C2[0] * y; 203 | float dl2m2_dy = SH_C2[0] * x; 204 | float dl2m1_dy = SH_C2[1] * z; 205 | float dl2m1_dz = SH_C2[1] * y; 206 | float dl2m0_dx = -2 * SH_C2[2] * x; 207 | float dl2m0_dy = -2 * SH_C2[2] * y; 208 | float dl2m0_dz = 4 * SH_C2[2] * z; 209 | float dl2p1_dx = SH_C2[3] * z; 210 | float dl2p1_dz = SH_C2[3] * x; 211 | float dl2p2_dx = 2 * SH_C2[4] * x; 212 | float dl2p2_dy = -2 * SH_C2[4] * y; 213 | 214 | dL_dsh[4] = l2m2 * dL_dRGB; 215 | dL_dsh[5] = l2m1 * dL_dRGB; 216 | dL_dsh[6] = l2m0 * dL_dRGB; 217 | dL_dsh[7] = l2p1 * dL_dRGB; 218 | dL_dsh[8] = l2p2 * dL_dRGB; 219 | 220 | dRGBdx += ( 221 | dl2m2_dx * sh[4] + dl2m0_dx * sh[6] + dl2p1_dx * sh[7] + dl2p2_dx * sh[8] 222 | ); 223 | dRGBdy += ( 224 | dl2m2_dy * sh[4] + dl2m1_dy * sh[5] + dl2m0_dy * sh[6] + dl2p2_dy * sh[8] 225 | ); 226 
| dRGBdz += ( 227 | dl2m1_dz * sh[5] + dl2m0_dz * sh[6] + dl2p1_dz * sh[7] 228 | ); 229 | 230 | if (deg > 2){ 231 | float l3m3 = SH_C3[0] * y * (3 * xx - yy); 232 | float l3m2 = SH_C3[1] * xy * z; 233 | float l3m1 = SH_C3[2] * y * (4 * zz - xx - yy); 234 | float l3m0 = SH_C3[3] * z * (2 * zz - 3 * xx - 3 * yy); 235 | float l3p1 = SH_C3[4] * x * (4 * zz - xx - yy); 236 | float l3p2 = SH_C3[5] * z * (xx - yy); 237 | float l3p3 = SH_C3[6] * x * (xx - 3 * yy); 238 | 239 | float dl3m3_dx = SH_C3[0] * y * 6 * x; 240 | float dl3m3_dy = SH_C3[0] * (3 * xx - 3 * yy); 241 | float dl3m2_dx = SH_C3[1] * yz; 242 | float dl3m2_dy = SH_C3[1] * xz; 243 | float dl3m2_dz = SH_C3[1] * xy; 244 | float dl3m1_dx = -SH_C3[2] * y * 2 * x; 245 | float dl3m1_dy = SH_C3[2] * (4 * zz - xx - 3 * yy); 246 | float dl3m1_dz = SH_C3[2] * y * 8 * z; 247 | float dl3m0_dx = -SH_C3[3] * z * 6 * x; 248 | float dl3m0_dy = -SH_C3[3] * z * 6 * y; 249 | float dl3m0_dz = SH_C3[3] * (6 * zz - 3 * xx - 3 * yy); 250 | float dl3p1_dx = SH_C3[4] * (4 * zz - 3 * xx - yy); 251 | float dl3p1_dy = -SH_C3[4] * x * 2 * y; 252 | float dl3p1_dz = SH_C3[4] * x * 8 * z; 253 | float dl3p2_dx = SH_C3[5] * z * 2 * x; 254 | float dl3p2_dy = -SH_C3[5] * z * 2 * y; 255 | float dl3p2_dz = SH_C3[5] * (xx - yy); 256 | float dl3p3_dx = SH_C3[6] * (3 * xx - 3 * yy); 257 | float dl3p3_dy = -SH_C3[6] * x * 6 * y; 258 | 259 | dL_dsh[9] = l3m3 * dL_dRGB; 260 | dL_dsh[10] = l3m2 * dL_dRGB; 261 | dL_dsh[11] = l3m1 * dL_dRGB; 262 | dL_dsh[12] = l3m0 * dL_dRGB; 263 | dL_dsh[13] = l3p1 * dL_dRGB; 264 | dL_dsh[14] = l3p2 * dL_dRGB; 265 | dL_dsh[15] = l3p3 * dL_dRGB; 266 | 267 | dRGBdx += ( 268 | dl3m3_dx * sh[9] + 269 | dl3m2_dx * sh[10] + 270 | dl3m1_dx * sh[11] + 271 | dl3m0_dx * sh[12] + 272 | dl3p1_dx * sh[13] + 273 | dl3p2_dx * sh[14] + 274 | dl3p3_dx * sh[15] 275 | ); 276 | 277 | dRGBdy += ( 278 | dl3m3_dy * sh[9] + 279 | dl3m2_dy * sh[10] + 280 | dl3m1_dy * sh[11] + 281 | dl3m0_dy * sh[12] + 282 | dl3p1_dy * sh[13] + 283 | dl3p2_dy * sh[14] + 284 | dl3p3_dy * sh[15] 285 | ); 286 | 287 | dRGBdz += ( 288 | dl3m2_dz * sh[10] + 289 | dl3m1_dz * sh[11] + 290 | dl3m0_dz * sh[12] + 291 | dl3p1_dz * sh[13] + 292 | dl3p2_dz * sh[14] 293 | ); 294 | 295 | if (deg_t > 0){ 296 | float t1 = cos(2 * MY_PI * dir_t / time_duration); 297 | float dt1_dt = sin(2 * MY_PI * dir_t / time_duration) * 2 * MY_PI / time_duration; 298 | 299 | dL_dsh[16] = t1 * l0m0 * dL_dRGB; 300 | dL_dsh[17] = t1 * l1m1 * dL_dRGB; 301 | dL_dsh[18] = t1 * l1m0 * dL_dRGB; 302 | dL_dsh[19] = t1 * l1p1 * dL_dRGB; 303 | dL_dsh[20] = t1 * l2m2 * dL_dRGB; 304 | dL_dsh[21] = t1 * l2m1 * dL_dRGB; 305 | dL_dsh[22] = t1 * l2m0 * dL_dRGB; 306 | dL_dsh[23] = t1 * l2p1 * dL_dRGB; 307 | dL_dsh[24] = t1 * l2p2 * dL_dRGB; 308 | dL_dsh[25] = t1 * l3m3 * dL_dRGB; 309 | dL_dsh[26] = t1 * l3m2 * dL_dRGB; 310 | dL_dsh[27] = t1 * l3m1 * dL_dRGB; 311 | dL_dsh[28] = t1 * l3m0 * dL_dRGB; 312 | dL_dsh[29] = t1 * l3p1 * dL_dRGB; 313 | dL_dsh[30] = t1 * l3p2 * dL_dRGB; 314 | dL_dsh[31] = t1 * l3p3 * dL_dRGB; 315 | 316 | 317 | dRGBdt += dt1_dt * ( 318 | l0m0 * sh[16] + 319 | l1m1 * sh[17] + 320 | l1m0 * sh[18] + 321 | l1p1 * sh[19] + 322 | l2m2 * sh[20] + 323 | l2m1 * sh[21] + 324 | l2m0 * sh[22] + 325 | l2p1 * sh[23] + 326 | l2p2 * sh[24] + 327 | l3m3 * sh[25] + 328 | l3m2 * sh[26] + 329 | l3m1 * sh[27] + 330 | l3m0 * sh[28] + 331 | l3p1 * sh[29] + 332 | l3p2 * sh[30] + 333 | l3p3 * sh[31]); 334 | 335 | dRGBdx += t1 * ( 336 | dl1p1_dx * sh[19] + 337 | dl2m2_dx * sh[20] + 338 | dl2m0_dx * sh[22] + 339 | dl2p1_dx * sh[23] + 340 | 
dl2p2_dx * sh[24] + 341 | dl3m3_dx * sh[25] + 342 | dl3m2_dx * sh[26] + 343 | dl3m1_dx * sh[27] + 344 | dl3m0_dx * sh[28] + 345 | dl3p1_dx * sh[29] + 346 | dl3p2_dx * sh[30] + 347 | dl3p3_dx * sh[31] 348 | ); 349 | 350 | dRGBdy += t1 * ( 351 | dl1m1_dy * sh[17] + 352 | dl2m2_dy * sh[20] + 353 | dl2m1_dy * sh[21] + 354 | dl2m0_dy * sh[22] + 355 | dl2p2_dy * sh[24] + 356 | dl3m3_dy * sh[25] + 357 | dl3m2_dy * sh[26] + 358 | dl3m1_dy * sh[27] + 359 | dl3m0_dy * sh[28] + 360 | dl3p1_dy * sh[29] + 361 | dl3p2_dy * sh[30] + 362 | dl3p3_dy * sh[31] 363 | ); 364 | 365 | dRGBdz += t1 * ( 366 | dl1m0_dz * sh[18] + 367 | dl2m1_dz * sh[21] + 368 | dl2m0_dz * sh[22] + 369 | dl2p1_dz * sh[23] + 370 | dl3m2_dz * sh[26] + 371 | dl3m1_dz * sh[27] + 372 | dl3m0_dz * sh[28] + 373 | dl3p1_dz * sh[29] + 374 | dl3p2_dz * sh[30] 375 | ); 376 | 377 | if (deg_t > 1){ 378 | float t2 = cos(2 * MY_PI * dir_t * 2 / time_duration); 379 | float dt2_dt = sin(2 * MY_PI * dir_t * 2 / time_duration) * 2 * MY_PI * 2 / time_duration; 380 | 381 | 382 | dL_dsh[32] = t2 * l0m0 * dL_dRGB; 383 | dL_dsh[33] = t2 * l1m1 * dL_dRGB; 384 | dL_dsh[34] = t2 * l1m0 * dL_dRGB; 385 | dL_dsh[35] = t2 * l1p1 * dL_dRGB; 386 | dL_dsh[36] = t2 * l2m2 * dL_dRGB; 387 | dL_dsh[37] = t2 * l2m1 * dL_dRGB; 388 | dL_dsh[38] = t2 * l2m0 * dL_dRGB; 389 | dL_dsh[39] = t2 * l2p1 * dL_dRGB; 390 | dL_dsh[40] = t2 * l2p2 * dL_dRGB; 391 | dL_dsh[41] = t2 * l3m3 * dL_dRGB; 392 | dL_dsh[42] = t2 * l3m2 * dL_dRGB; 393 | dL_dsh[43] = t2 * l3m1 * dL_dRGB; 394 | dL_dsh[44] = t2 * l3m0 * dL_dRGB; 395 | dL_dsh[45] = t2 * l3p1 * dL_dRGB; 396 | dL_dsh[46] = t2 * l3p2 * dL_dRGB; 397 | dL_dsh[47] = t2 * l3p3 * dL_dRGB; 398 | 399 | dRGBdt += dt2_dt * ( 400 | l0m0 * sh[32] + 401 | l1m1 * sh[33] + 402 | l1m0 * sh[34] + 403 | l1p1 * sh[35] + 404 | l2m2 * sh[36] + 405 | l2m1 * sh[37] + 406 | l2m0 * sh[38] + 407 | l2p1 * sh[39] + 408 | l2p2 * sh[40] + 409 | l3m3 * sh[41] + 410 | l3m2 * sh[42] + 411 | l3m1 * sh[43] + 412 | l3m0 * sh[44] + 413 | l3p1 * sh[45] + 414 | l3p2 * sh[46] + 415 | l3p3 * sh[47]); 416 | 417 | dRGBdx += t2 * ( 418 | dl1p1_dx * sh[35] + 419 | dl2m2_dx * sh[36] + 420 | dl2m0_dx * sh[38] + 421 | dl2p1_dx * sh[39] + 422 | dl2p2_dx * sh[40] + 423 | dl3m3_dx * sh[41] + 424 | dl3m2_dx * sh[42] + 425 | dl3m1_dx * sh[43] + 426 | dl3m0_dx * sh[44] + 427 | dl3p1_dx * sh[45] + 428 | dl3p2_dx * sh[46] + 429 | dl3p3_dx * sh[47] 430 | ); 431 | 432 | dRGBdy += t2 * ( 433 | dl1m1_dy * sh[33] + 434 | dl2m2_dy * sh[36] + 435 | dl2m1_dy * sh[37] + 436 | dl2m0_dy * sh[38] + 437 | dl2p2_dy * sh[40] + 438 | dl3m3_dy * sh[41] + 439 | dl3m2_dy * sh[42] + 440 | dl3m1_dy * sh[43] + 441 | dl3m0_dy * sh[44] + 442 | dl3p1_dy * sh[45] + 443 | dl3p2_dy * sh[46] + 444 | dl3p3_dy * sh[47] 445 | ); 446 | 447 | dRGBdz += t2 * ( 448 | dl1m0_dz * sh[34] + 449 | dl2m1_dz * sh[37] + 450 | dl2m0_dz * sh[38] + 451 | dl2p1_dz * sh[39] + 452 | dl3m2_dz * sh[42] + 453 | dl3m1_dz * sh[43] + 454 | dl3m0_dz * sh[44] + 455 | dl3p1_dz * sh[45] + 456 | dl3p2_dz * sh[46] 457 | ); 458 | } 459 | } 460 | } 461 | } 462 | } 463 | 464 | // The view direction is an input to the computation. View direction 465 | // is influenced by the Gaussian's mean, so SHs gradients 466 | // must propagate back into 3D position. 467 | dL_ddir[idx].x = glm::dot(dRGBdx, dL_dRGB); 468 | dL_ddir[idx].y = glm::dot(dRGBdy, dL_dRGB); 469 | dL_ddir[idx].z = glm::dot(dRGBdz, dL_dRGB); 470 | 471 | // Gradients of loss w.r.t. Gaussian means, but only the portion 472 | // that is caused because the mean affects the view-dependent color. 
473 | // Analogously, the gradient w.r.t. the time input dir_t of the 4D SH basis: 474 | dL_ddir_t[idx] = -glm::dot(dRGBdt, dL_dRGB); 475 | } 476 | 477 | __global__ void computeSH4DBackwardCUDA(int P, 478 | int deg, int deg_t, int max_coeffs, 479 | const float* sh, const glm::vec3* dir, const float* dir_t, const float time_duration, 480 | const glm::vec3* dL_drgb, float* dL_dsh, glm::vec3* dL_ddir, float* dL_ddir_t) 481 | { 482 | auto idx = cg::this_grid().thread_rank(); 483 | if (idx >= P) 484 | return; 485 | computeColorFromSH_4D( 486 | idx, 487 | deg, 488 | deg_t, 489 | max_coeffs, 490 | sh, 491 | dir, 492 | dir_t, 493 | time_duration, 494 | dL_drgb, 495 | dL_dsh, 496 | dL_ddir, 497 | dL_ddir_t 498 | ); 499 | } 500 | 501 | 502 | void BACKWARD::computeSH4DBackward( 503 | int P, 504 | int deg, int deg_t, int max_coeffs, 505 | const float* sh, 506 | const glm::vec3* dir, 507 | const float* dir_t, 508 | const float time_duration, 509 | const glm::vec3* dL_drgb, 510 | float* dL_dsh, 511 | glm::vec3* dL_ddir, 512 | float* dL_ddir_t 513 | ) 514 | { 515 | computeSH4DBackwardCUDA << <(P + 255) / 256, 256 >> > ( 516 | P, 517 | deg, 518 | deg_t, 519 | max_coeffs, 520 | sh, 521 | dir, 522 | dir_t, 523 | time_duration, 524 | dL_drgb, 525 | dL_dsh, 526 | dL_ddir, 527 | dL_ddir_t 528 | ); 529 | } 530 | 531 | // Backward version of INVERSE 2D covariance matrix computation 532 | // (due to length launched as separate kernel before other 533 | // backward steps contained in preprocess) 534 | __global__ void computeCov2DCUDA(int P, 535 | const float3* means, 536 | const int* radii, 537 | const float* cov3Ds, 538 | const float h_x, float h_y, 539 | const float tan_fovx, float tan_fovy, 540 | const float* view_matrix, 541 | const float* dL_dconics, 542 | float3* dL_dmeans, 543 | float* dL_dcov) 544 | { 545 | auto idx = cg::this_grid().thread_rank(); 546 | if (idx >= P || !(radii[idx] > 0)) 547 | return; 548 | 549 | // Reading location of 3D covariance for this Gaussian 550 | const float* cov3D = cov3Ds + 6 * idx; 551 | 552 | // Fetch gradients, recompute 2D covariance and relevant 553 | // intermediate forward results needed in the backward. 554 | float3 mean = means[idx]; 555 | float3 dL_dconic = { dL_dconics[4 * idx], dL_dconics[4 * idx + 1], dL_dconics[4 * idx + 3] }; 556 | float3 t = transformPoint4x3(mean, view_matrix); 557 | 558 | const float limx = 1.3f * tan_fovx; 559 | const float limy = 1.3f * tan_fovy; 560 | const float txtz = t.x / t.z; 561 | const float tytz = t.y / t.z; 562 | t.x = min(limx, max(-limx, txtz)) * t.z; 563 | t.y = min(limy, max(-limy, tytz)) * t.z; 564 | 565 | const float x_grad_mul = txtz < -limx || txtz > limx ? 0 : 1; 566 | const float y_grad_mul = tytz < -limy || tytz > limy ? 0 : 1; 567 | 568 | glm::mat3 J = glm::mat3(h_x / t.z, 0.0f, -(h_x * t.x) / (t.z * t.z), 569 | 0.0f, h_y / t.z, -(h_y * t.y) / (t.z * t.z), 570 | 0, 0, 0); 571 | 572 | glm::mat3 W = glm::mat3( 573 | view_matrix[0], view_matrix[4], view_matrix[8], 574 | view_matrix[1], view_matrix[5], view_matrix[9], 575 | view_matrix[2], view_matrix[6], view_matrix[10]); 576 | 577 | glm::mat3 Vrk = glm::mat3( 578 | cov3D[0], cov3D[1], cov3D[2], 579 | cov3D[1], cov3D[3], cov3D[4], 580 | cov3D[2], cov3D[4], cov3D[5]); 581 | 582 | glm::mat3 T = W * J; 583 | 584 | glm::mat3 cov2D = glm::transpose(T) * glm::transpose(Vrk) * T; 585 | 586 | // Use helper variables for 2D covariance entries. More compact.
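// (The 0.3f added to the diagonal below replays the forward pass's
// screen-space dilation of the 2D covariance: it keeps the matrix
// comfortably invertible and ensures each splat covers at least about
// a pixel, so the same offset must be applied when re-deriving
// a, b and c here.)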
587 | float a = cov2D[0][0] += 0.3f; 588 | float b = cov2D[0][1]; 589 | float c = cov2D[1][1] += 0.3f; 590 | 591 | float denom = a * c - b * b; 592 | float dL_da = 0, dL_db = 0, dL_dc = 0; 593 | float denom2inv = 1.0f / ((denom * denom) + 0.0000001f); 594 | 595 | if (denom2inv != 0) 596 | { 597 | // Gradients of loss w.r.t. entries of 2D covariance matrix, 598 | // given gradients of loss w.r.t. conic matrix (inverse covariance matrix). 599 | // e.g., dL / da = dL / d_conic_a * d_conic_a / d_a 600 | dL_da = denom2inv * (-c * c * dL_dconic.x + 2 * b * c * dL_dconic.y + (denom - a * c) * dL_dconic.z); 601 | dL_dc = denom2inv * (-a * a * dL_dconic.z + 2 * a * b * dL_dconic.y + (denom - a * c) * dL_dconic.x); 602 | dL_db = denom2inv * 2 * (b * c * dL_dconic.x - (denom + 2 * b * b) * dL_dconic.y + a * b * dL_dconic.z); 603 | 604 | // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, 605 | // given gradients w.r.t. 2D covariance matrix (diagonal). 606 | // cov2D = transpose(T) * transpose(Vrk) * T; 607 | dL_dcov[6 * idx + 0] = (T[0][0] * T[0][0] * dL_da + T[0][0] * T[1][0] * dL_db + T[1][0] * T[1][0] * dL_dc); 608 | dL_dcov[6 * idx + 3] = (T[0][1] * T[0][1] * dL_da + T[0][1] * T[1][1] * dL_db + T[1][1] * T[1][1] * dL_dc); 609 | dL_dcov[6 * idx + 5] = (T[0][2] * T[0][2] * dL_da + T[0][2] * T[1][2] * dL_db + T[1][2] * T[1][2] * dL_dc); 610 | 611 | // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, 612 | // given gradients w.r.t. 2D covariance matrix (off-diagonal). 613 | // Off-diagonal elements appear twice --> double the gradient. 614 | // cov2D = transpose(T) * transpose(Vrk) * T; 615 | dL_dcov[6 * idx + 1] = 2 * T[0][0] * T[0][1] * dL_da + (T[0][0] * T[1][1] + T[0][1] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][1] * dL_dc; 616 | dL_dcov[6 * idx + 2] = 2 * T[0][0] * T[0][2] * dL_da + (T[0][0] * T[1][2] + T[0][2] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][2] * dL_dc; 617 | dL_dcov[6 * idx + 4] = 2 * T[0][2] * T[0][1] * dL_da + (T[0][1] * T[1][2] + T[0][2] * T[1][1]) * dL_db + 2 * T[1][1] * T[1][2] * dL_dc; 618 | } 619 | else 620 | { 621 | for (int i = 0; i < 6; i++) 622 | dL_dcov[6 * idx + i] = 0; 623 | } 624 | 625 | // Gradients of loss w.r.t. upper 2x3 portion of intermediate matrix T 626 | // cov2D = transpose(T) * transpose(Vrk) * T; 627 | float dL_dT00 = 2 * (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_da + 628 | (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_db; 629 | float dL_dT01 = 2 * (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_da + 630 | (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_db; 631 | float dL_dT02 = 2 * (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_da + 632 | (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_db; 633 | float dL_dT10 = 2 * (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_dc + 634 | (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_db; 635 | float dL_dT11 = 2 * (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_dc + 636 | (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_db; 637 | float dL_dT12 = 2 * (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_dc + 638 | (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_db; 639 | 640 | // Gradients of loss w.r.t. 
upper 3x2 non-zero entries of Jacobian matrix 641 | // T = W * J 642 | float dL_dJ00 = W[0][0] * dL_dT00 + W[0][1] * dL_dT01 + W[0][2] * dL_dT02; 643 | float dL_dJ02 = W[2][0] * dL_dT00 + W[2][1] * dL_dT01 + W[2][2] * dL_dT02; 644 | float dL_dJ11 = W[1][0] * dL_dT10 + W[1][1] * dL_dT11 + W[1][2] * dL_dT12; 645 | float dL_dJ12 = W[2][0] * dL_dT10 + W[2][1] * dL_dT11 + W[2][2] * dL_dT12; 646 | 647 | float tz = 1.f / t.z; 648 | float tz2 = tz * tz; 649 | float tz3 = tz2 * tz; 650 | 651 | // Gradients of loss w.r.t. transformed Gaussian mean t 652 | float dL_dtx = x_grad_mul * -h_x * tz2 * dL_dJ02; 653 | float dL_dty = y_grad_mul * -h_y * tz2 * dL_dJ12; 654 | float dL_dtz = -h_x * tz2 * dL_dJ00 - h_y * tz2 * dL_dJ11 + (2 * h_x * t.x) * tz3 * dL_dJ02 + (2 * h_y * t.y) * tz3 * dL_dJ12; 655 | 656 | // Account for transformation of mean to t 657 | // t = transformPoint4x3(mean, view_matrix); 658 | float3 dL_dmean = transformVec4x3Transpose({ dL_dtx, dL_dty, dL_dtz }, view_matrix); 659 | 660 | // Gradients of loss w.r.t. Gaussian means, but only the portion 661 | // that is caused because the mean affects the covariance matrix. 662 | // Additional mean gradient is accumulated in BACKWARD::preprocess. 663 | dL_dmeans[idx] = dL_dmean; 664 | } 665 | 666 | // Backward pass for the conversion of scale and rotation to a 667 | // 3D covariance matrix for each Gaussian. 668 | __device__ void computeCov3D(int idx, const glm::vec3 scale, float mod, const glm::vec4 rot, const float* dL_dcov, glm::vec3* dL_dscales, glm::vec4* dL_drots) 669 | { 670 | // Recompute (intermediate) results for the 3D covariance computation. 671 | glm::vec4 q = rot;// / glm::length(rot); 672 | float r = q.x; 673 | float x = q.y; 674 | float y = q.z; 675 | float z = q.w; 676 | 677 | glm::mat3 R = glm::mat3( 678 | 1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y), 679 | 2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x), 680 | 2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y) 681 | ); 682 | 683 | glm::mat3 S = glm::mat3(1.0f); 684 | 685 | glm::vec3 s = mod * scale; 686 | S[0][0] = s.x; 687 | S[1][1] = s.y; 688 | S[2][2] = s.z; 689 | 690 | glm::mat3 M = S * R; 691 | 692 | const float* dL_dcov3D = dL_dcov + 6 * idx; 693 | 694 | glm::vec3 dunc(dL_dcov3D[0], dL_dcov3D[3], dL_dcov3D[5]); 695 | glm::vec3 ounc = 0.5f * glm::vec3(dL_dcov3D[1], dL_dcov3D[2], dL_dcov3D[4]); 696 | 697 | // Convert per-element covariance loss gradients to matrix form 698 | glm::mat3 dL_dSigma = glm::mat3( 699 | dL_dcov3D[0], 0.5f * dL_dcov3D[1], 0.5f * dL_dcov3D[2], 700 | 0.5f * dL_dcov3D[1], dL_dcov3D[3], 0.5f * dL_dcov3D[4], 701 | 0.5f * dL_dcov3D[2], 0.5f * dL_dcov3D[4], dL_dcov3D[5] 702 | ); 703 | 704 | // Compute loss gradient w.r.t. matrix M 705 | // dSigma_dM = 2 * M 706 | glm::mat3 dL_dM = 2.0f * M * dL_dSigma; 707 | 708 | glm::mat3 Rt = glm::transpose(R); 709 | glm::mat3 dL_dMt = glm::transpose(dL_dM); 710 | 711 | // Gradients of loss w.r.t. scale 712 | glm::vec3* dL_dscale = dL_dscales + idx; 713 | dL_dscale->x = glm::dot(Rt[0], dL_dMt[0]); 714 | dL_dscale->y = glm::dot(Rt[1], dL_dMt[1]); 715 | dL_dscale->z = glm::dot(Rt[2], dL_dMt[2]); 716 | 717 | dL_dMt[0] *= s.x; 718 | dL_dMt[1] *= s.y; 719 | dL_dMt[2] *= s.z; 720 | 721 | // Gradients of loss w.r.t. 
normalized quaternion 722 | glm::vec4 dL_dq; 723 | dL_dq.x = 2 * z * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * y * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * x * (dL_dMt[1][2] - dL_dMt[2][1]); 724 | dL_dq.y = 2 * y * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * z * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * r * (dL_dMt[1][2] - dL_dMt[2][1]) - 4 * x * (dL_dMt[2][2] + dL_dMt[1][1]); 725 | dL_dq.z = 2 * x * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * r * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * z * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * y * (dL_dMt[2][2] + dL_dMt[0][0]); 726 | dL_dq.w = 2 * r * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * x * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * y * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * z * (dL_dMt[1][1] + dL_dMt[0][0]); 727 | 728 | // Gradients of loss w.r.t. unnormalized quaternion 729 | float4* dL_drot = (float4*)(dL_drots + idx); 730 | *dL_drot = float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w };//dnormvdv(float4{ rot.x, rot.y, rot.z, rot.w }, float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w }); 731 | } 732 | 733 | 734 | __global__ void computeCov3DBackwardCUDA(int P, 735 | const glm::vec3* scaling_xyz, 736 | const glm::vec4* rotation_l, 737 | const float* dL_dcov, 738 | glm::vec3* dL_dscaling_xyz, 739 | glm::vec4* dL_drotation_l) 740 | { 741 | auto idx = cg::this_grid().thread_rank(); 742 | if (idx >= P) 743 | return; 744 | computeCov3D( 745 | idx, 746 | scaling_xyz[idx], 747 | 1.0f, 748 | rotation_l[idx], 749 | // dL_dcov + idx * 6, 750 | // dL_dscaling_xyz + idx, 751 | // dL_drotation_l + idx); 752 | dL_dcov, 753 | dL_dscaling_xyz, 754 | dL_drotation_l); 755 | } 756 | 757 | void BACKWARD::computeCov3DBackward( 758 | int P, 759 | const glm::vec3* scaling_xyz, 760 | const glm::vec4* rotation_l, 761 | const float* dL_dcov, 762 | glm::vec3* dL_dscaling_xyz, 763 | glm::vec4* dL_drotation_l) 764 | { 765 | computeCov3DBackwardCUDA << <(P + 255) / 256, 256 >> > ( 766 | P, 767 | scaling_xyz, 768 | rotation_l, 769 | dL_dcov, 770 | dL_dscaling_xyz, 771 | dL_drotation_l); 772 | } 773 | 774 | // Backward pass for the conversion of scale and rotation to a 775 | // 4D covariance matrix for each Gaussian.
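// Forward quantities re-derived below: R = M_r * M_l is a 4D rotation
// built from the two normalized quaternions rotation_l and rotation_r,
// M = S * R, and Sigma = M^T * M. cov_t = Sigma[3][3] is the temporal
// variance; conditioning the 4D Gaussian on time yields the 3D covariance
// cov11 - cov12 * cov12^T / cov_t and the mean-shift (velocity) term
// ms = cov12 / cov_t, whose incoming gradients arrive here as dL_dcov,
// dL_dms and dL_dcov_t.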
776 | __device__ void computeCov4DBackward( 777 | const glm::vec4 scaling_xyzt, 778 | const glm::vec4 rotation_l, 779 | const glm::vec4 rotation_r, 780 | const float* dL_dcov, 781 | const glm::vec3 dL_dms, 782 | const float dL_dcov_t, 783 | glm::vec4 &dL_dscaling_xyzt, 784 | glm::vec4 &dL_drotation_l, 785 | glm::vec4 &dL_drotation_r) 786 | { 787 | glm::mat4 S = glm::mat4(1.0f); 788 | S[0][0] = scaling_xyzt.x; 789 | S[1][1] = scaling_xyzt.y; 790 | S[2][2] = scaling_xyzt.z; 791 | S[3][3] = scaling_xyzt.w; 792 | 793 | const float l_l = glm::length(rotation_l); 794 | const float a = rotation_l.x / l_l; 795 | const float b = rotation_l.y / l_l; 796 | const float c = rotation_l.z / l_l; 797 | const float d = rotation_l.w / l_l; 798 | 799 | const float l_r = glm::length(rotation_r); 800 | const float p = rotation_r.x / l_r; 801 | const float q = rotation_r.y / l_r; 802 | const float r = rotation_r.z / l_r; 803 | const float s = rotation_r.w / l_r; 804 | 805 | glm::mat4 M_l = glm::mat4( 806 | a, -b, -c, -d, 807 | b, a,-d, c, 808 | c, d, a,-b, 809 | d,-c, b, a 810 | ); 811 | 812 | glm::mat4 M_r = glm::mat4( 813 | p, q, r, s, 814 | -q, p,-s, r, 815 | -r, s, p,-q, 816 | -s,-r, q, p 817 | ); 818 | // glm stores in column major 819 | glm::mat4 R = M_r * M_l; 820 | glm::mat4 M = S * R; 821 | glm::mat4 Sigma = glm::transpose(M) * M; 822 | float cov_t = Sigma[3][3]; 823 | 824 | glm::mat3 cov11 = glm::mat3(Sigma); 825 | glm::vec3 cov12 = glm::vec3(Sigma[0][3], Sigma[1][3], Sigma[2][3]); 826 | 827 | glm::vec3 dL_dcov12 = -glm::vec3( 828 | dL_dcov[0] * cov12[0] + dL_dcov[1] * cov12[1] * 0.5 + dL_dcov[2] * cov12[2] * 0.5, 829 | dL_dcov[1] * cov12[0] * 0.5 + dL_dcov[3] * cov12[1] + dL_dcov[4] * cov12[2] * 0.5, 830 | dL_dcov[2] * cov12[0] * 0.5 + dL_dcov[4] * cov12[1] * 0.5 + dL_dcov[5] * cov12[2] 831 | ) * 2.0f / cov_t; 832 | 833 | dL_dcov12 += dL_dms / cov_t; 834 | 835 | float dL_dcov_t_w_ms_cov = dL_dcov_t; 836 | float dL_dms_dot_cov12 = glm::dot(dL_dms, cov12); 837 | dL_dcov_t_w_ms_cov += -dL_dms_dot_cov12 / (cov_t * cov_t); 838 | dL_dcov_t_w_ms_cov += ( 839 | cov12[0] * cov12[0] * dL_dcov[0] + cov12[0] * cov12[1] * dL_dcov[1] + 840 | cov12[0] * cov12[2] * dL_dcov[2] + cov12[1] * cov12[1] * dL_dcov[3] + 841 | cov12[1] * cov12[2] * dL_dcov[4] + cov12[2] * cov12[2] * dL_dcov[5] 842 | ) / (cov_t * cov_t); 843 | 844 | glm::mat4 dL_dSigma = glm::mat4( 845 | dL_dcov[0], 0.5f * dL_dcov[1], 0.5f * dL_dcov[2], 0.5f * dL_dcov12[0], 846 | 0.5f * dL_dcov[1], dL_dcov[3], 0.5f * dL_dcov[4], 0.5f * dL_dcov12[1], 847 | 0.5f * dL_dcov[2], 0.5f * dL_dcov[4], dL_dcov[5], 0.5f * dL_dcov12[2], 848 | 0.5f * dL_dcov12[0], 0.5f * dL_dcov12[1], 0.5f * dL_dcov12[2], dL_dcov_t_w_ms_cov 849 | ); 850 | // Compute loss gradient w.r.t. matrix M 851 | // dSigma_dM = 2 * M 852 | glm::mat4 dL_dM = 2.0f * M * dL_dSigma; 853 | 854 | glm::mat4 Rt = glm::transpose(R); 855 | glm::mat4 dL_dMt = glm::transpose(dL_dM); 856 | 857 | // Gradients of loss w.r.t. 
scale 858 | dL_dscaling_xyzt.x = glm::dot(Rt[0], dL_dMt[0]); 859 | dL_dscaling_xyzt.y = glm::dot(Rt[1], dL_dMt[1]); 860 | dL_dscaling_xyzt.z = glm::dot(Rt[2], dL_dMt[2]); 861 | dL_dscaling_xyzt.w = glm::dot(Rt[3], dL_dMt[3]); 862 | 863 | dL_dMt[0] *= scaling_xyzt.x; 864 | dL_dMt[1] *= scaling_xyzt.y; 865 | dL_dMt[2] *= scaling_xyzt.z; 866 | dL_dMt[3] *= scaling_xyzt.w; 867 | 868 | glm::mat4 dL_dml_t = dL_dMt * M_r; 869 | glm::vec4 dL_drot_l; 870 | dL_drot_l.x = dL_dml_t[0][0] + dL_dml_t[1][1] + dL_dml_t[2][2] + dL_dml_t[3][3]; 871 | dL_drot_l.y = dL_dml_t[0][1] - dL_dml_t[1][0] + dL_dml_t[2][3] - dL_dml_t[3][2]; 872 | dL_drot_l.z = dL_dml_t[0][2] - dL_dml_t[1][3] - dL_dml_t[2][0] + dL_dml_t[3][1]; 873 | dL_drot_l.w = dL_dml_t[0][3] + dL_dml_t[1][2] - dL_dml_t[2][1] - dL_dml_t[3][0]; 874 | 875 | glm::mat4 dL_dmr_t = M_l * dL_dMt; 876 | glm::vec4 dL_drot_r; 877 | dL_drot_r.x = dL_dmr_t[0][0] + dL_dmr_t[1][1] + dL_dmr_t[2][2] + dL_dmr_t[3][3]; 878 | dL_drot_r.y = -dL_dmr_t[0][1] + dL_dmr_t[1][0] + dL_dmr_t[2][3] - dL_dmr_t[3][2]; 879 | dL_drot_r.z = -dL_dmr_t[0][2] - dL_dmr_t[1][3] + dL_dmr_t[2][0] + dL_dmr_t[3][1]; 880 | dL_drot_r.w = -dL_dmr_t[0][3] + dL_dmr_t[1][2] - dL_dmr_t[2][1] + dL_dmr_t[3][0]; 881 | 882 | float4 dL_drotation_l_f = dnormvdv(float4{rotation_l.x, rotation_l.y, rotation_l.z, rotation_l.w}, float4{dL_drot_l.x, dL_drot_l.y, dL_drot_l.z, dL_drot_l.w}); 883 | float4 dL_drotation_r_f = dnormvdv(float4{rotation_r.x, rotation_r.y, rotation_r.z, rotation_r.w}, float4{dL_drot_r.x, dL_drot_r.y, dL_drot_r.z, dL_drot_r.w}); 884 | dL_drotation_l.x = dL_drotation_l_f.x; 885 | dL_drotation_l.y = dL_drotation_l_f.y; 886 | dL_drotation_l.z = dL_drotation_l_f.z; 887 | dL_drotation_l.w = dL_drotation_l_f.w; 888 | dL_drotation_r.x = dL_drotation_r_f.x; 889 | dL_drotation_r.y = dL_drotation_r_f.y; 890 | dL_drotation_r.z = dL_drotation_r_f.z; 891 | dL_drotation_r.w = dL_drotation_r_f.w; 892 | } 893 | 894 | __global__ void computeCov4DBackwardCUDA(int P, 895 | const glm::vec4* scaling_xyzt, 896 | const glm::vec4* rotation_l, 897 | const glm::vec4* rotation_r, 898 | const float* dL_dcov, 899 | const glm::vec3* dL_dms, 900 | const float* dL_dcov_t, 901 | glm::vec4* dL_dscaling_xyzt, 902 | glm::vec4* dL_drotation_l, 903 | glm::vec4* dL_drotation_r) 904 | { 905 | auto idx = cg::this_grid().thread_rank(); 906 | if (idx >= P) 907 | return; 908 | computeCov4DBackward( 909 | scaling_xyzt[idx], 910 | rotation_l[idx], 911 | rotation_r[idx], 912 | dL_dcov + idx * 6, 913 | dL_dms[idx], 914 | dL_dcov_t[idx], 915 | dL_dscaling_xyzt[idx], 916 | dL_drotation_l[idx], 917 | dL_drotation_r[idx]); 918 | } 919 | 920 | 921 | void BACKWARD::computeCov4DBackward( 922 | int P, 923 | const glm::vec4* scaling_xyzt, 924 | const glm::vec4* rotation_l, 925 | const glm::vec4* rotation_r, 926 | const float* dL_dcov, 927 | const glm::vec3* dL_dms, 928 | const float* dL_dcov_t, 929 | glm::vec4* dL_dscaling_xyzt, 930 | glm::vec4* dL_drotation_l, 931 | glm::vec4* dL_drotation_r) 932 | { 933 | computeCov4DBackwardCUDA << <(P + 255) / 256, 256 >> > ( 934 | P, 935 | scaling_xyzt, 936 | rotation_l, 937 | rotation_r, 938 | dL_dcov, 939 | dL_dms, 940 | dL_dcov_t, 941 | dL_dscaling_xyzt, 942 | dL_drotation_l, 943 | dL_drotation_r); 944 | } 945 | 946 | // Backward pass of the preprocessing steps, except 947 | // for the covariance computation and inversion 948 | // (those are handled by a previous kernel call) 949 | template <int C> 950 | __global__ void preprocessCUDA( 951 | int P, int D, int M, 952 | const float3* means, 953 | const int*
radii, 954 | const float* shs, 955 | const bool* clamped, 956 | const glm::vec3* scales, 957 | const glm::vec4* rotations, 958 | const float scale_modifier, 959 | const float* view, 960 | const float* proj, 961 | const glm::vec3* campos, 962 | const float3* dL_dmean2D, 963 | glm::vec3* dL_dmeans, 964 | float* dL_dcolor, 965 | float* dL_ddepth, 966 | float* dL_dcov3D, 967 | float* dL_dsh, 968 | glm::vec3* dL_dscale, 969 | glm::vec4* dL_drot) 970 | { 971 | auto idx = cg::this_grid().thread_rank(); 972 | if (idx >= P || !(radii[idx] > 0)) 973 | return; 974 | 975 | float3 m = means[idx]; 976 | 977 | // Taking care of gradients from the screenspace points 978 | float4 m_hom = transformPoint4x4(m, proj); 979 | float m_w = 1.0f / (m_hom.w + 0.0000001f); 980 | 981 | // Compute loss gradient w.r.t. 3D means due to gradients of 2D means 982 | // from rendering procedure 983 | glm::vec3 dL_dmean; 984 | float mul1 = (proj[0] * m.x + proj[4] * m.y + proj[8] * m.z + proj[12]) * m_w * m_w; 985 | float mul2 = (proj[1] * m.x + proj[5] * m.y + proj[9] * m.z + proj[13]) * m_w * m_w; 986 | dL_dmean.x = (proj[0] * m_w - proj[3] * mul1) * dL_dmean2D[idx].x + (proj[1] * m_w - proj[3] * mul2) * dL_dmean2D[idx].y; 987 | dL_dmean.y = (proj[4] * m_w - proj[7] * mul1) * dL_dmean2D[idx].x + (proj[5] * m_w - proj[7] * mul2) * dL_dmean2D[idx].y; 988 | dL_dmean.z = (proj[8] * m_w - proj[11] * mul1) * dL_dmean2D[idx].x + (proj[9] * m_w - proj[11] * mul2) * dL_dmean2D[idx].y; 989 | 990 | // That's the second part of the mean gradient. Previous computation 991 | // of cov2D and following SH conversion also affects it. 992 | dL_dmeans[idx] += dL_dmean; 993 | 994 | // the w must be equal to 1 for view^T * [x,y,z,1] 995 | float3 m_view = transformPoint4x3(m, view); 996 | 997 | // Compute loss gradient w.r.t. 3D means due to gradients of depth 998 | // from rendering procedure 999 | glm::vec3 dL_dmean2; 1000 | float mul3 = view[2] * m.x + view[6] * m.y + view[10] * m.z + view[14]; 1001 | dL_dmean2.x = (view[2] - view[3] * mul3) * dL_ddepth[idx]; 1002 | dL_dmean2.y = (view[6] - view[7] * mul3) * dL_ddepth[idx]; 1003 | dL_dmean2.z = (view[10] - view[11] * mul3) * dL_ddepth[idx]; 1004 | 1005 | // That's the third part of the mean gradient. 1006 | dL_dmeans[idx] += dL_dmean2; 1007 | 1008 | // Compute gradient updates due to computing colors from SHs 1009 | if (shs) 1010 | computeColorFromSH(idx, D, M, (glm::vec3*)means, *campos, shs, clamped, (glm::vec3*)dL_dcolor, (glm::vec3*)dL_dmeans, (glm::vec3*)dL_dsh); 1011 | 1012 | // Compute gradient updates due to computing covariance from scale/rotation 1013 | if (scales) 1014 | computeCov3D(idx, scales[idx], scale_modifier, rotations[idx], dL_dcov3D, dL_dscale, dL_drot); 1015 | } 1016 | 1017 | // Backward version of the rendering procedure. 
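// Each pixel thread re-walks its tile's depth-sorted Gaussian list, but
// back-to-front, rebuilding the transmittance incrementally from T_final
// via T = T / (1 - alpha); this way the forward pass does not have to
// store per-pixel, per-Gaussian blending state for the backward.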
1018 | template <uint32_t C> 1019 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y) 1020 | renderCUDA( 1021 | const uint2* __restrict__ ranges, 1022 | const uint32_t* __restrict__ point_list, 1023 | int W, int H, 1024 | const float* __restrict__ bg_color, 1025 | const float2* __restrict__ points_xy_image, 1026 | const float4* __restrict__ conic_opacity, 1027 | const float* __restrict__ colors, 1028 | const float* __restrict__ depths, 1029 | const float* __restrict__ accum_alphas, 1030 | const uint32_t* __restrict__ n_contrib, 1031 | const float* __restrict__ dL_dpixels, 1032 | const float* __restrict__ dL_dpixel_depths, 1033 | const float* __restrict__ dL_dpixel_alphas, 1034 | float3* __restrict__ dL_dmean2D, 1035 | float3* __restrict__ dL_dabsmean2D, 1036 | float4* __restrict__ dL_dconic2D, 1037 | float* __restrict__ dL_dopacity, 1038 | float* __restrict__ dL_dcolors, 1039 | float* __restrict__ dL_ddepths) 1040 | { 1041 | // We rasterize again. Compute necessary block info. 1042 | auto block = cg::this_thread_block(); 1043 | const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; 1044 | const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; 1045 | const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y, H) }; 1046 | const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; 1047 | const uint32_t pix_id = W * pix.y + pix.x; 1048 | const float2 pixf = { (float)pix.x, (float)pix.y }; 1049 | 1050 | const bool inside = pix.x < W && pix.y < H; 1051 | const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; 1052 | 1053 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); 1054 | 1055 | bool done = !inside; 1056 | int toDo = range.y - range.x; 1057 | 1058 | __shared__ int collected_id[BLOCK_SIZE]; 1059 | __shared__ float2 collected_xy[BLOCK_SIZE]; 1060 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; 1061 | __shared__ float collected_colors[C * BLOCK_SIZE]; 1062 | __shared__ float collected_depths[BLOCK_SIZE]; 1063 | 1064 | // In the forward, we stored the final value for T, the 1065 | // product of all (1 - alpha) factors. 1066 | const float T_final = inside ? (1 - accum_alphas[pix_id]) : 0; 1067 | float T = T_final; 1068 | 1069 | // We start from the back. The ID of the last contributing 1070 | // Gaussian is known from each pixel from the forward. 1071 | uint32_t contributor = toDo; 1072 | const int last_contributor = inside ? n_contrib[pix_id] : 0; 1073 | 1074 | float accum_rec[C] = { 0 }; 1075 | float accum_red = 0; 1076 | float accum_rea = 0; 1077 | float dL_dpixel[C]; 1078 | float dL_dpixel_depth; 1079 | float dL_dpixel_alpha; 1080 | if (inside) 1081 | { 1082 | for (int i = 0; i < C; i++) 1083 | dL_dpixel[i] = dL_dpixels[i * H * W + pix_id]; 1084 | dL_dpixel_depth = dL_dpixel_depths[pix_id]; 1085 | dL_dpixel_alpha = dL_dpixel_alphas[pix_id]; 1086 | } 1087 | float last_alpha = 0; 1088 | float last_color[C] = { 0 }; 1089 | float last_depth = 0; 1090 | // Gradient of pixel coordinate w.r.t. normalized 1091 | // screen-space viewport coordinates (-1 to 1) 1092 | const float ddelx_dx = 0.5 * W; 1093 | const float ddely_dy = 0.5 * H; 1094 | 1095 | // Traverse all Gaussians 1096 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) 1097 | { 1098 | // Load auxiliary data into shared memory, start in the BACK 1099 | // and load them in reverse order.
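// (The two block.sync() calls bracketing the staging step below are both
// needed: the first keeps fast threads from overwriting shared entries
// that other threads may still be reading from the previous round, the
// second makes the freshly loaded batch visible to the whole block.)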
1100 | block.sync(); 1101 | const int progress = i * BLOCK_SIZE + block.thread_rank(); 1102 | if (range.x + progress < range.y) 1103 | { 1104 | const int coll_id = point_list[range.y - progress - 1]; 1105 | collected_id[block.thread_rank()] = coll_id; 1106 | collected_xy[block.thread_rank()] = points_xy_image[coll_id]; 1107 | collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; 1108 | for (int i = 0; i < C; i++) 1109 | collected_colors[i * BLOCK_SIZE + block.thread_rank()] = colors[coll_id * C + i]; 1110 | collected_depths[block.thread_rank()] = depths[coll_id]; 1111 | } 1112 | block.sync(); 1113 | 1114 | // Iterate over Gaussians 1115 | for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) 1116 | { 1117 | // Keep track of current Gaussian ID. Skip, if this one 1118 | // is behind the last contributor for this pixel. 1119 | contributor--; 1120 | if (contributor >= last_contributor) 1121 | continue; 1122 | 1123 | // Compute blending values, as before. 1124 | const float2 xy = collected_xy[j]; 1125 | const float2 d = { xy.x - pixf.x, xy.y - pixf.y }; 1126 | const float4 con_o = collected_conic_opacity[j]; 1127 | const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 1128 | if (power > 0.0f) 1129 | continue; 1130 | 1131 | const float G = __expf(power); 1132 | const float alpha = min(0.99f, con_o.w * G); 1133 | if (alpha < 1.0f / 255.0f) 1134 | continue; 1135 | 1136 | T = T / (1.f - alpha); 1137 | const float dchannel_dcolor = alpha * T; 1138 | const float dpixel_depth_ddepth = alpha * T; 1139 | 1140 | // Propagate gradients to per-Gaussian colors and keep 1141 | // gradients w.r.t. alpha (blending factor for a Gaussian/pixel 1142 | // pair). 1143 | float dL_dalpha = 0.0f; 1144 | const int global_id = collected_id[j]; 1145 | for (int ch = 0; ch < C; ch++) 1146 | { 1147 | const float c = collected_colors[ch * BLOCK_SIZE + j]; 1148 | // Update last color (to be used in the next iteration) 1149 | accum_rec[ch] = last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch]; 1150 | last_color[ch] = c; 1151 | 1152 | const float dL_dchannel = dL_dpixel[ch]; 1153 | dL_dalpha += (c - accum_rec[ch]) * dL_dchannel; 1154 | // Update the gradients w.r.t. color of the Gaussian. 1155 | // Atomic, since this pixel is just one of potentially 1156 | // many that were affected by this Gaussian. 
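// (These scattered global atomics are a major cost of the backward pass;
// the renderCUDAShared variant further below first accumulates the same
// gradients into per-block shared buffers (s_dL_*) before committing
// them to global memory in one go, reducing global-memory contention.)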
1157 | atomicAdd(&(dL_dcolors[global_id * C + ch]), dchannel_dcolor * dL_dchannel);
1158 | }
1159 | const float dep = collected_depths[j];
1160 | accum_red = last_alpha * last_depth + (1.f - last_alpha) * accum_red;
1161 | last_depth = dep;
1162 | dL_dalpha += (dep-accum_red) * dL_dpixel_depth;
1163 | atomicAdd(&(dL_ddepths[global_id]), dpixel_depth_ddepth * dL_dpixel_depth);
1164 |
1165 | accum_rea = last_alpha + (1.f - last_alpha) * accum_rea;
1166 | dL_dalpha += (1 - accum_rea) * dL_dpixel_alpha;
1167 |
1168 |
1169 | dL_dalpha *= T;
1170 | // Update last alpha (to be used in the next iteration)
1171 | last_alpha = alpha;
1172 |
1173 | // Account for fact that alpha also influences how much of
1174 | // the background color is added if nothing left to blend
1175 | float bg_dot_dpixel = 0;
1176 | for (int i = 0; i < C; i++)
1177 | bg_dot_dpixel += bg_color[i] * dL_dpixel[i];
1178 | dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel;
1179 |
1180 | // Set background depth value == 0, thus no contribution for
1181 | // dL_dalpha
1182 |
1183 | // Helpful reusable temporary variables
1184 | const float dL_dG = con_o.w * dL_dalpha;
1185 | const float gdx = G * d.x;
1186 | const float gdy = G * d.y;
1187 | const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y;
1188 | const float dG_ddely = -gdy * con_o.z - gdx * con_o.y;
1189 | const float dL_dmean2D_x = dL_dG * dG_ddelx * ddelx_dx;
1190 | const float dL_dmean2D_y = dL_dG * dG_ddely * ddely_dy;
1191 |
1192 | // Update gradients w.r.t. 2D mean position of the Gaussian
1193 | atomicAdd(&dL_dmean2D[global_id].x, dL_dmean2D_x);
1194 | atomicAdd(&dL_dmean2D[global_id].y, dL_dmean2D_y);
1195 | // Update gradients w.r.t. the absolute 2D mean position of the Gaussian
1196 | atomicAdd(&dL_dabsmean2D[global_id].x, abs(dL_dmean2D_x));
1197 | atomicAdd(&dL_dabsmean2D[global_id].y, abs(dL_dmean2D_y));
1198 |
1199 | // Update gradients w.r.t. 2D covariance (2x2 matrix, symmetric)
1200 | atomicAdd(&dL_dconic2D[global_id].x, -0.5f * gdx * d.x * dL_dG);
1201 | atomicAdd(&dL_dconic2D[global_id].y, -0.5f * gdx * d.y * dL_dG);
1202 | atomicAdd(&dL_dconic2D[global_id].w, -0.5f * gdy * d.y * dL_dG);
1203 |
1204 | // Update gradients w.r.t. opacity of the Gaussian
1205 | atomicAdd(&(dL_dopacity[global_id]), G * dL_dalpha);
1206 | }
1207 | }
1208 | }
1209 |
1210 |
1211 | // Backward version of the rendering procedure.
1212 | template <uint32_t C>
1213 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
1214 | renderCUDAShared(
1215 | const uint2* __restrict__ ranges,
1216 | const uint32_t* __restrict__ point_list,
1217 | int W, int H,
1218 | const float* __restrict__ bg_color,
1219 | const float2* __restrict__ points_xy_image,
1220 | const float4* __restrict__ conic_opacity,
1221 | const float* __restrict__ colors,
1222 | const float* __restrict__ depths,
1223 | const float* __restrict__ accum_alphas,
1224 | const uint32_t* __restrict__ n_contrib,
1225 | const float* __restrict__ dL_dpixels,
1226 | const float* __restrict__ dL_dpixel_depths,
1227 | const float* __restrict__ dL_dpixel_alphas,
1228 | float3* __restrict__ dL_dmean2D,
1229 | float3* __restrict__ dL_dabsmean2D,
1230 | float4* __restrict__ dL_dconic2D,
1231 | float* __restrict__ dL_dopacity,
1232 | float* __restrict__ dL_dcolors,
1233 | float* __restrict__ dL_ddepths)
1234 | {
1235 | // We rasterize again. Compute necessary block info.
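// Unlike renderCUDA above, this variant stages all per-Gaussian gradients
// in shared memory (the s_dL_* buffers declared below) and flushes them to
// global memory once per round, so each Gaussian receives one global
// atomicAdd per tile rather than one per contributing pixel.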
1236 | auto block = cg::this_thread_block();
1237 | const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
1238 | const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
1239 | const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
1240 | const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
1241 | const uint32_t pix_id = W * pix.y + pix.x;
1242 | const float2 pixf = { (float)pix.x, (float)pix.y };
1243 |
1244 | const bool inside = pix.x < W&& pix.y < H;
1245 | const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
1246 |
1247 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
1248 |
1249 | bool done = !inside;
1250 | int toDo = range.y - range.x;
1251 |
1252 | __shared__ int collected_id[BLOCK_SIZE];
1253 | __shared__ float2 collected_xy[BLOCK_SIZE];
1254 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE];
1255 | __shared__ float collected_colors[C * BLOCK_SIZE];
1256 | __shared__ float collected_depths[BLOCK_SIZE];
1257 |
1258 | // Heuristic: Gaussians in a tile are likely to be updated by many threads of the same block.
1259 | // It should thus be faster to first aggregate the gradients inside this block and write them to global memory in one go.
1260 | __shared__ float3 s_dL_dmean2D[BLOCK_SIZE];
1261 | __shared__ float3 s_dL_dabsmean2D[BLOCK_SIZE];
1262 | __shared__ float4 s_dL_dconic2D[BLOCK_SIZE];
1263 | __shared__ float s_dL_dopacity[BLOCK_SIZE];
1264 | __shared__ float s_dL_dcolors[C * BLOCK_SIZE];
1265 | __shared__ float s_dL_ddepths[BLOCK_SIZE];
1266 |
1267 | // In the forward, we stored the final value for T, the
1268 | // product of all (1 - alpha) factors.
1269 | const float T_final = inside ? (1 - accum_alphas[pix_id]) : 0;
1270 | float T = T_final;
1271 |
1272 | // We start from the back. The ID of the last contributing
1273 | // Gaussian is known for each pixel from the forward.
1274 | uint32_t contributor = toDo;
1275 | const int last_contributor = inside ? n_contrib[pix_id] : 0;
1276 |
1277 | float accum_rec[C] = { 0 };
1278 | float accum_red = 0;
1279 | float accum_rea = 0;
1280 | float dL_dpixel[C];
1281 | float dL_dpixel_depth;
1282 | float dL_dpixel_alpha;
1283 | if (inside)
1284 | {
1285 | for (int i = 0; i < C; i++)
1286 | dL_dpixel[i] = dL_dpixels[i * H * W + pix_id];
1287 | dL_dpixel_depth = dL_dpixel_depths[pix_id];
1288 | dL_dpixel_alpha = dL_dpixel_alphas[pix_id];
1289 | }
1290 | float last_alpha = 0;
1291 | float last_color[C] = { 0 };
1292 | float last_depth = 0;
1293 | // Gradient of pixel coordinate w.r.t. normalized
1294 | // screen-space viewport coordinates (-1 to 1)
1295 | const float ddelx_dx = 0.5 * W;
1296 | const float ddely_dy = 0.5 * H;
1297 |
1298 | // Traverse all Gaussians
1299 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
1300 | {
1301 | // Load auxiliary data into shared memory, start in the BACK
1302 | // and load them in reverse order.
1303 | block.sync(); 1304 | const int progress = i * BLOCK_SIZE + block.thread_rank(); 1305 | if (range.x + progress < range.y) 1306 | { 1307 | const int coll_id = point_list[range.y - progress - 1]; 1308 | collected_id[block.thread_rank()] = coll_id; 1309 | collected_xy[block.thread_rank()] = points_xy_image[coll_id]; 1310 | collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; 1311 | for (int i = 0; i < C; i++) 1312 | collected_colors[i * BLOCK_SIZE + block.thread_rank()] = colors[coll_id * C + i]; 1313 | collected_depths[block.thread_rank()] = depths[coll_id]; 1314 | 1315 | // Shared gradient accumulation in this block 1316 | s_dL_dmean2D[block.thread_rank()].x = 0.0f; 1317 | s_dL_dmean2D[block.thread_rank()].y = 0.0f; 1318 | s_dL_dabsmean2D[block.thread_rank()].x = 0.0f; 1319 | s_dL_dabsmean2D[block.thread_rank()].y = 0.0f; 1320 | s_dL_dconic2D[block.thread_rank()].x = 0.0f; 1321 | s_dL_dconic2D[block.thread_rank()].y = 0.0f; 1322 | s_dL_dconic2D[block.thread_rank()].w = 0.0f; 1323 | for (int i = 0; i < C; i++) 1324 | s_dL_dcolors[i * BLOCK_SIZE + block.thread_rank()] = 0.0f; 1325 | s_dL_dopacity[block.thread_rank()] = 0.0f; 1326 | s_dL_ddepths[block.thread_rank()] = 0.0f; 1327 | } 1328 | block.sync(); 1329 | 1330 | // Iterate over Gaussians 1331 | for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) 1332 | { 1333 | // Keep track of current Gaussian ID. Skip, if this one 1334 | // is behind the last contributor for this pixel. 1335 | contributor--; 1336 | if (contributor >= last_contributor) 1337 | continue; 1338 | 1339 | // Compute blending values, as before. 1340 | const float2 xy = collected_xy[j]; 1341 | const float2 d = { xy.x - pixf.x, xy.y - pixf.y }; 1342 | const float4 con_o = collected_conic_opacity[j]; 1343 | const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 1344 | if (power > 0.0f) 1345 | continue; 1346 | 1347 | const float G = __expf(power); 1348 | const float alpha = min(0.99f, con_o.w * G); 1349 | if (alpha < 1.0f / 255.0f) 1350 | continue; 1351 | 1352 | T = T / (1.f - alpha); 1353 | const float dchannel_dcolor = alpha * T; 1354 | const float dpixel_depth_ddepth = alpha * T; 1355 | 1356 | // Propagate gradients to per-Gaussian colors and keep 1357 | // gradients w.r.t. alpha (blending factor for a Gaussian/pixel 1358 | // pair). 1359 | float dL_dalpha = 0.0f; 1360 | for (int ch = 0; ch < C; ch++) 1361 | { 1362 | const float c = collected_colors[ch * BLOCK_SIZE + j]; 1363 | // Update last color (to be used in the next iteration) 1364 | accum_rec[ch] = last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch]; 1365 | last_color[ch] = c; 1366 | 1367 | const float dL_dchannel = dL_dpixel[ch]; 1368 | dL_dalpha += (c - accum_rec[ch]) * dL_dchannel; 1369 | // Update the gradients w.r.t. color of the Gaussian. 1370 | // Atomic, since this pixel is just one of potentially 1371 | // many that were affected by this Gaussian. 
1372 | atomicAdd(&(s_dL_dcolors[ch * BLOCK_SIZE + j]), dchannel_dcolor * dL_dchannel); 1373 | } 1374 | const float dep = collected_depths[j]; 1375 | accum_red = last_alpha * last_depth + (1.f - last_alpha) * accum_red; 1376 | last_depth = dep; 1377 | dL_dalpha += (dep-accum_red) * dL_dpixel_depth; 1378 | atomicAdd(&(s_dL_ddepths[j]), dpixel_depth_ddepth * dL_dpixel_depth); 1379 | 1380 | accum_rea = last_alpha + (1.f - last_alpha) * accum_rea; 1381 | dL_dalpha += (1 - accum_rea) * dL_dpixel_alpha; 1382 | 1383 | 1384 | dL_dalpha *= T; 1385 | // Update last alpha (to be used in the next iteration) 1386 | last_alpha = alpha; 1387 | 1388 | // Account for fact that alpha also influences how much of 1389 | // the background color is added if nothing left to blend 1390 | float bg_dot_dpixel = 0; 1391 | for (int i = 0; i < C; i++) 1392 | bg_dot_dpixel += bg_color[i] * dL_dpixel[i]; 1393 | dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel; 1394 | 1395 | // Set background depth value == 0, thus no contribution for 1396 | // dL_dalpha 1397 | 1398 | // Helpful reusable temporary variables 1399 | const float dL_dG = con_o.w * dL_dalpha; 1400 | const float gdx = G * d.x; 1401 | const float gdy = G * d.y; 1402 | const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y; 1403 | const float dG_ddely = -gdy * con_o.z - gdx * con_o.y; 1404 | const float dL_dmean2D_x = dL_dG * dG_ddelx * ddelx_dx; 1405 | const float dL_dmean2D_y = dL_dG * dG_ddely * ddely_dy; 1406 | 1407 | // Update gradients w.r.t. 2D mean position of the Gaussian 1408 | atomicAdd(&s_dL_dmean2D[j].x, dL_dmean2D_x); 1409 | atomicAdd(&s_dL_dmean2D[j].y, dL_dmean2D_y); 1410 | atomicAdd(&s_dL_dabsmean2D[j].x, abs(dL_dmean2D_x)); 1411 | atomicAdd(&s_dL_dabsmean2D[j].y, abs(dL_dmean2D_y)); 1412 | 1413 | // Update gradients w.r.t. 2D covariance (2x2 matrix, symmetric) 1414 | atomicAdd(&s_dL_dconic2D[j].x, -0.5f * gdx * d.x * dL_dG); 1415 | atomicAdd(&s_dL_dconic2D[j].y, -0.5f * gdx * d.y * dL_dG); 1416 | atomicAdd(&s_dL_dconic2D[j].w, -0.5f * gdy * d.y * dL_dG); 1417 | 1418 | // Update gradients w.r.t. 
opacity of the Gaussian
1419 | atomicAdd(&(s_dL_dopacity[j]), G * dL_dalpha);
1420 | }
1421 | block.sync();
1422 |
1423 | if (range.x + progress < range.y && s_dL_dmean2D[block.thread_rank()].x != 0.0) // only flush entries whose staged gradient is not exactly zero
1424 | {
1425 | const int global_id = collected_id[block.thread_rank()];
1426 |
1427 | // Flush the block-aggregated gradients to global memory
1428 | atomicAdd(&dL_dmean2D[global_id].x, s_dL_dmean2D[block.thread_rank()].x);
1429 | atomicAdd(&dL_dmean2D[global_id].y, s_dL_dmean2D[block.thread_rank()].y);
1430 | atomicAdd(&dL_dabsmean2D[global_id].x, s_dL_dabsmean2D[block.thread_rank()].x);
1431 | atomicAdd(&dL_dabsmean2D[global_id].y, s_dL_dabsmean2D[block.thread_rank()].y);
1432 | atomicAdd(&dL_dconic2D[global_id].x, s_dL_dconic2D[block.thread_rank()].x);
1433 | atomicAdd(&dL_dconic2D[global_id].y, s_dL_dconic2D[block.thread_rank()].y);
1434 | atomicAdd(&dL_dconic2D[global_id].w, s_dL_dconic2D[block.thread_rank()].w);
1435 | for (int i = 0; i < C; i++)
1436 | atomicAdd(&(dL_dcolors[global_id * C + i]), s_dL_dcolors[i * BLOCK_SIZE + block.thread_rank()]);
1437 | atomicAdd(&(dL_dopacity[global_id]), s_dL_dopacity[block.thread_rank()]);
1438 | atomicAdd(&(dL_ddepths[global_id]), s_dL_ddepths[block.thread_rank()]);
1439 | }
1440 |
1441 | }
1442 | }
1443 |
1444 |
1445 | __device__ float warpReduceSum(float value) {
1446 | auto warp = cg::coalesced_threads();
1447 | for (int offset = warp.size() / 2; offset > 0; offset /= 2) {
1448 | value += warp.shfl_down(value, offset);
1449 | }
1450 | return value;
1451 | }
1452 |
1453 | // Backward version of the rendering procedure.
1454 | template <uint32_t C>
1455 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
1456 | renderCUDAWarp(
1457 | const uint2* __restrict__ ranges,
1458 | const uint32_t* __restrict__ point_list,
1459 | int W, int H,
1460 | const float* __restrict__ bg_color,
1461 | const float2* __restrict__ points_xy_image,
1462 | const float4* __restrict__ conic_opacity,
1463 | const float* __restrict__ colors,
1464 | const float* __restrict__ depths,
1465 | const float* __restrict__ accum_alphas,
1466 | const uint32_t* __restrict__ n_contrib,
1467 | const float* __restrict__ dL_dpixels,
1468 | const float* __restrict__ dL_dpixel_depths,
1469 | const float* __restrict__ dL_dpixel_alphas,
1470 | float3* __restrict__ dL_dmean2D,
1471 | float3* __restrict__ dL_dabsmean2D,
1472 | float4* __restrict__ dL_dconic2D,
1473 | float* __restrict__ dL_dopacity,
1474 | float* __restrict__ dL_dcolors,
1475 | float* __restrict__ dL_ddepths)
1476 | {
1477 | // We rasterize again. Compute necessary block info.
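// This variant goes one step further than renderCUDAShared: each pixel
// thread's per-Gaussian gradients are first reduced across the warp with
// the shuffle-based warpReduceSum above (the complete sum lands in lane 0),
// and only lane 0 issues global atomicAdds. A minimal usage sketch of the
// helper, assuming all 32 lanes of the warp are active:
//
//   float v = warpReduceSum(local_grad);
//   if (warp.thread_rank() == 0)
//       atomicAdd(&global_grad, v); // one atomic per warp, not per thread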
1478 | auto block = cg::this_thread_block();
1479 | auto warp = cg::coalesced_threads();
1480 |
1481 | const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
1482 | const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
1483 | const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
1484 | // By default, the thread rank would be computed as block.thread_index().x + BLOCK_X * block.thread_index().y,
1485 | // so warps would be arranged in a 2 * 16 (row * col) fashion.
1486 | // We want better locality, so thread ranks are remapped such that each warp is responsible for a smaller 4 * 8 patch.
1487 | // const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
1488 | const int tx = threadIdx.x;
1489 | const int ty = threadIdx.y;
1490 |
1491 | // Compute warp ID within the block.
1492 | // Here we divide the 16x16 block into 8 warps; each warp covers a 4x8 area.
1493 | // Warp IDs are assigned row-wise.
1494 | const int warpId = (ty / 4) * 2 + (tx / 8);
1495 | const int laneId = (ty % 4) * 8 + (tx % 8);
1496 |
1497 | // Compute the thread's position within its warp.
1498 | // Threads are linearly indexed within each warp from 0 to 31, using row-major ordering within the 4x8 block.
1499 | const int local_warp_x = (warpId % 2) * 8;
1500 | const int local_warp_y = (warpId / 2) * 4;
1501 |
1502 | const uint2 pix = {
1503 | pix_min.x + local_warp_x + (laneId % 8),
1504 | pix_min.y + local_warp_y + (laneId / 8)
1505 | };
1506 |
1507 | const uint32_t pix_id = W * pix.y + pix.x;
1508 | const float2 pixf = { (float)pix.x, (float)pix.y };
1509 |
1510 | const bool inside = pix.x < W&& pix.y < H;
1511 | const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
1512 |
1513 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
1514 |
1515 | bool done = !inside;
1516 | int toDo = range.y - range.x;
1517 |
1518 | __shared__ int collected_id[BLOCK_SIZE];
1519 | __shared__ float2 collected_xy[BLOCK_SIZE];
1520 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE];
1521 | __shared__ float collected_colors[C * BLOCK_SIZE];
1522 | __shared__ float collected_depths[BLOCK_SIZE];
1523 |
1524 | // // Heuristic, gaussians are likely to be updated by the same block (same tile)
1525 | // // Thus it should be faster to first aggregate the gradients inside this block and update them to the global memory in just one go
1526 | // __shared__ float3 s_dL_dmean2D[BLOCK_SIZE];
1527 | // __shared__ float3 s_dL_dabsmean2D[BLOCK_SIZE];
1528 | // __shared__ float4 s_dL_dconic2D[BLOCK_SIZE];
1529 | // __shared__ float s_dL_dopacity[BLOCK_SIZE];
1530 | // __shared__ float s_dL_dcolors[C * BLOCK_SIZE];
1531 | // __shared__ float s_dL_ddepths[BLOCK_SIZE];
1532 |
1533 | // In the forward, we stored the final value for T, the
1534 | // product of all (1 - alpha) factors.
1535 | const float T_final = inside ? (1 - accum_alphas[pix_id]) : 0;
1536 | float T = T_final;
1537 |
1538 | // We start from the back. The ID of the last contributing
1539 | // Gaussian is known for each pixel from the forward.
1540 | uint32_t contributor = toDo;
1541 | const int last_contributor = inside ?
n_contrib[pix_id] : 0;
1542 |
1543 | float accum_rec[C] = { 0 };
1544 | float accum_red = 0;
1545 | float accum_rea = 0;
1546 | float dL_dpixel[C];
1547 | float dL_dpixel_depth;
1548 | float dL_dpixel_alpha;
1549 | if (inside)
1550 | {
1551 | for (int i = 0; i < C; i++)
1552 | dL_dpixel[i] = dL_dpixels[i * H * W + pix_id];
1553 | dL_dpixel_depth = dL_dpixel_depths[pix_id];
1554 | dL_dpixel_alpha = dL_dpixel_alphas[pix_id];
1555 | }
1556 | float last_alpha = 0;
1557 | float last_color[C] = { 0 };
1558 | float last_depth = 0;
1559 | // Gradient of pixel coordinate w.r.t. normalized
1560 | // screen-space viewport coordinates (-1 to 1)
1561 | const float ddelx_dx = 0.5 * W;
1562 | const float ddely_dy = 0.5 * H;
1563 |
1564 | auto local_rank = block.thread_rank();
1565 |
1566 | // Traverse all Gaussians
1567 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
1568 | {
1569 | // Load auxiliary data into shared memory, start in the BACK
1570 | // and load them in reverse order.
1571 | block.sync();
1572 | const int progress = i * BLOCK_SIZE + local_rank;
1573 | if (range.x + progress < range.y)
1574 | {
1575 | const int coll_id = point_list[range.y - progress - 1];
1576 | collected_id[local_rank] = coll_id;
1577 | collected_xy[local_rank] = points_xy_image[coll_id];
1578 | collected_conic_opacity[local_rank] = conic_opacity[coll_id];
1579 | for (int i = 0; i < C; i++)
1580 | collected_colors[i * BLOCK_SIZE + local_rank] = colors[coll_id * C + i];
1581 | collected_depths[local_rank] = depths[coll_id];
1582 |
1583 | // // Shared gradient accumulation in this block
1584 | // s_dL_dmean2D[local_rank].x = 0.0f;
1585 | // s_dL_dmean2D[local_rank].y = 0.0f;
1586 | // s_dL_dabsmean2D[local_rank].x = 0.0f;
1587 | // s_dL_dabsmean2D[local_rank].y = 0.0f;
1588 | // s_dL_dconic2D[local_rank].x = 0.0f;
1589 | // s_dL_dconic2D[local_rank].y = 0.0f;
1590 | // s_dL_dconic2D[local_rank].w = 0.0f;
1591 | // for (int i = 0; i < C; i++)
1592 | // s_dL_dcolors[i * BLOCK_SIZE + local_rank] = 0.0f;
1593 | // s_dL_dopacity[local_rank] = 0.0f;
1594 | // s_dL_ddepths[local_rank] = 0.0f;
1595 | }
1596 | block.sync();
1597 |
1598 | // Iterate over Gaussians
1599 | for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
1600 | {
1601 | float2 xy = { 0.0f };
1602 | float2 d = { 0.0f };
1603 | float4 con_o = { 0.0f };
1604 | float power = 0.0f;
1605 | float G = 0.0f;
1606 | float alpha = 0.0f;
1607 | float dchannel_dcolor = 0.0f;
1608 | float dpixel_depth_ddepth = 0.0f;
1609 | float dL_dalpha = 0.0f;
1610 | float dep = 0.0f;
1611 | float bg_dot_dpixel = 0.0f;
1612 | float dL_dG = 0.0f;
1613 | float gdx = 0.0f;
1614 | float gdy = 0.0f;
1615 | float dG_ddelx = 0.0f;
1616 | float dG_ddely = 0.0f;
1617 |
1618 | float w_dL_dcolors[C] = { 0.0f };
1619 | float w_dL_ddepths = 0.0f;
1620 | float2 w_dL_dmean2D = { 0.0f };
1621 | float2 w_dL_dabsmean2D = { 0.0f };
1622 | float4 w_dL_dconic2D = { 0.0f };
1623 | float w_dL_dopacity = 0.0f;
1624 |
1625 | int global_id;
1626 | bool early_stop = false;
1627 |
1628 | // if (done) {
1629 | // early_stop = true;
1630 | // goto reduce;
1631 | // }
1632 |
1633 | // Keep track of current Gaussian ID. Skip, if this one
1634 | // is behind the last contributor for this pixel.
1635 | contributor--;
1636 | if (contributor >= last_contributor) {
1637 | early_stop = true;
1638 | goto reduce;
1639 | }
1640 | // continue;
1641 |
1642 | // Compute blending values, as before.
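// con_o packs the inverse 2D covariance (the conic) in .xyz and the opacity
// in .w, so power = -0.5 * d^T * Conic * d and alpha = min(0.99, opacity *
// exp(power)), exactly as in the forward pass.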
1643 | xy = collected_xy[j]; 1644 | d = { xy.x - pixf.x, xy.y - pixf.y }; 1645 | con_o = collected_conic_opacity[j]; 1646 | power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 1647 | if (power > 0.0f) { 1648 | early_stop = true; 1649 | goto reduce; // early stopping 1650 | } 1651 | // continue; 1652 | 1653 | G = __expf(power); 1654 | alpha = min(0.99f, con_o.w * G); 1655 | if (alpha < 1.0f / 255.0f) { 1656 | early_stop = true; 1657 | goto reduce; // early stopping 1658 | } 1659 | // continue; 1660 | 1661 | T = T / (1.f - alpha); 1662 | dchannel_dcolor = alpha * T; 1663 | dpixel_depth_ddepth = alpha * T; 1664 | 1665 | // Propagate gradients to per-Gaussian colors and keep 1666 | // gradients w.r.t. alpha (blending factor for a Gaussian/pixel 1667 | // pair). 1668 | dL_dalpha = 0.0f; 1669 | for (int ch = 0; ch < C; ch++) 1670 | { 1671 | const float c = collected_colors[ch * BLOCK_SIZE + j]; 1672 | // Update last color (to be used in the next iteration) 1673 | accum_rec[ch] = last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch]; 1674 | last_color[ch] = c; 1675 | 1676 | const float dL_dchannel = dL_dpixel[ch]; 1677 | dL_dalpha += (c - accum_rec[ch]) * dL_dchannel; 1678 | // Update the gradients w.r.t. color of the Gaussian. 1679 | // Atomic, since this pixel is just one of potentially 1680 | // many that were affected by this Gaussian. 1681 | // atomicAdd(&(s_dL_dcolors[ch * BLOCK_SIZE + j]), dchannel_dcolor * dL_dchannel); 1682 | w_dL_dcolors[ch] = dchannel_dcolor * dL_dchannel; 1683 | } 1684 | dep = collected_depths[j]; 1685 | accum_red = last_alpha * last_depth + (1.f - last_alpha) * accum_red; 1686 | last_depth = dep; 1687 | dL_dalpha += (dep-accum_red) * dL_dpixel_depth; 1688 | // atomicAdd(&(s_dL_ddepths[j]), dpixel_depth_ddepth * dL_dpixel_depth); 1689 | w_dL_ddepths = dpixel_depth_ddepth * dL_dpixel_depth; 1690 | 1691 | accum_rea = last_alpha + (1.f - last_alpha) * accum_rea; 1692 | dL_dalpha += (1 - accum_rea) * dL_dpixel_alpha; 1693 | 1694 | dL_dalpha *= T; 1695 | // Update last alpha (to be used in the next iteration) 1696 | last_alpha = alpha; 1697 | 1698 | // Account for fact that alpha also influences how much of 1699 | // the background color is added if nothing left to blend 1700 | bg_dot_dpixel = 0; 1701 | for (int i = 0; i < C; i++) 1702 | bg_dot_dpixel += bg_color[i] * dL_dpixel[i]; 1703 | dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel; 1704 | 1705 | // Set background depth value == 0, thus no contribution for 1706 | // dL_dalpha 1707 | 1708 | // Helpful reusable temporary variables 1709 | dL_dG = con_o.w * dL_dalpha; 1710 | gdx = G * d.x; 1711 | gdy = G * d.y; 1712 | dG_ddelx = -gdx * con_o.x - gdy * con_o.y; 1713 | dG_ddely = -gdy * con_o.z - gdx * con_o.y; 1714 | 1715 | // Update gradients w.r.t. 2D mean position of the Gaussian 1716 | // atomicAdd(&s_dL_dmean2D[j].x, dL_dG * dG_ddelx * ddelx_dx); 1717 | // atomicAdd(&s_dL_dmean2D[j].y, dL_dG * dG_ddely * ddely_dy); 1718 | w_dL_dmean2D.x = dL_dG * dG_ddelx * ddelx_dx; 1719 | w_dL_dmean2D.y = dL_dG * dG_ddely * ddely_dy; 1720 | w_dL_dabsmean2D.x = abs(w_dL_dmean2D.x); 1721 | w_dL_dabsmean2D.y = abs(w_dL_dmean2D.y); 1722 | 1723 | // Update gradients w.r.t. 
2D covariance (2x2 matrix, symmetric) 1724 | // atomicAdd(&s_dL_dconic2D[j].x, -0.5f * gdx * d.x * dL_dG); 1725 | // atomicAdd(&s_dL_dconic2D[j].y, -0.5f * gdx * d.y * dL_dG); 1726 | // atomicAdd(&s_dL_dconic2D[j].w, -0.5f * gdy * d.y * dL_dG); 1727 | w_dL_dconic2D.x = -0.5f * gdx * d.x * dL_dG; 1728 | w_dL_dconic2D.y = -0.5f * gdx * d.y * dL_dG; 1729 | w_dL_dconic2D.w = -0.5f * gdy * d.y * dL_dG; 1730 | 1731 | // Update gradients w.r.t. opacity of the Gaussian 1732 | // atomicAdd(&(s_dL_dopacity[j]), G * dL_dalpha); 1733 | w_dL_dopacity = G * dL_dalpha; 1734 | 1735 | reduce: 1736 | early_stop = warpReduceSum(early_stop); 1737 | 1738 | // If the whole warp votes for early stop, no need to do any further reduction or computation 1739 | if (!early_stop) { 1740 | 1741 | // Call reduce sum and append results to __shared__ memory 1742 | for (int ch = 0; ch < C; ch++) { 1743 | w_dL_dcolors[ch] = warpReduceSum(w_dL_dcolors[ch]); 1744 | } 1745 | w_dL_ddepths = warpReduceSum(w_dL_ddepths); 1746 | w_dL_dmean2D.x = warpReduceSum(w_dL_dmean2D.x); 1747 | w_dL_dmean2D.y = warpReduceSum(w_dL_dmean2D.y); 1748 | w_dL_dabsmean2D.x = warpReduceSum(w_dL_dabsmean2D.x); 1749 | w_dL_dabsmean2D.y = warpReduceSum(w_dL_dabsmean2D.y); 1750 | w_dL_dconic2D.x = warpReduceSum(w_dL_dconic2D.x); 1751 | w_dL_dconic2D.y = warpReduceSum(w_dL_dconic2D.y); 1752 | w_dL_dconic2D.w = warpReduceSum(w_dL_dconic2D.w); 1753 | w_dL_dopacity = warpReduceSum(w_dL_dopacity); 1754 | 1755 | // Use a single thread from each warp to perform block level reduction 1756 | if (local_rank % warp.size() == 0) { 1757 | // for (int ch = 0; ch < C; ch++) { 1758 | // atomicAdd(&(s_dL_dcolors[ch * BLOCK_SIZE + j]), w_dL_dcolors[ch]); 1759 | // } 1760 | // atomicAdd(&(s_dL_ddepths[j]), w_dL_ddepths); 1761 | // atomicAdd(&s_dL_dmean2D[j].x, w_dL_dmean2D.x); 1762 | // atomicAdd(&s_dL_dmean2D[j].y, w_dL_dmean2D.y); 1763 | // atomicAdd(&s_dL_dabsmean2D[j].x, w_dL_dabsmean2D.x); 1764 | // atomicAdd(&s_dL_dabsmean2D[j].y, w_dL_dabsmean2D.y); 1765 | // atomicAdd(&s_dL_dconic2D[j].x, w_dL_dconic2D.x); 1766 | // atomicAdd(&s_dL_dconic2D[j].y, w_dL_dconic2D.y); 1767 | // atomicAdd(&s_dL_dconic2D[j].w, w_dL_dconic2D.w); 1768 | // atomicAdd(&(s_dL_dopacity[j]), w_dL_dopacity); 1769 | global_id = collected_id[j]; 1770 | 1771 | // Shared gradient accumulation in this block 1772 | for (int i = 0; i < C; i++) 1773 | atomicAdd(&(dL_dcolors[global_id * C + i]), w_dL_dcolors[i]); 1774 | atomicAdd(&dL_dmean2D[global_id].x, w_dL_dmean2D.x); 1775 | atomicAdd(&dL_dmean2D[global_id].y, w_dL_dmean2D.y); 1776 | atomicAdd(&dL_dabsmean2D[global_id].x, w_dL_dabsmean2D.x); 1777 | atomicAdd(&dL_dabsmean2D[global_id].y, w_dL_dabsmean2D.y); 1778 | atomicAdd(&dL_dconic2D[global_id].x, w_dL_dconic2D.x); 1779 | atomicAdd(&dL_dconic2D[global_id].y, w_dL_dconic2D.y); 1780 | atomicAdd(&dL_dconic2D[global_id].w, w_dL_dconic2D.w); 1781 | atomicAdd(&(dL_dopacity[global_id]), w_dL_dopacity); 1782 | atomicAdd(&(dL_ddepths[global_id]), w_dL_ddepths); 1783 | } 1784 | } 1785 | } 1786 | // block.sync(); 1787 | 1788 | // if (range.x + progress < range.y) 1789 | // { 1790 | // const int global_id = point_list[range.y - progress - 1]; 1791 | 1792 | // // Shared gradient accumulation in this block 1793 | // atomicAdd(&dL_dmean2D[global_id].x, s_dL_dmean2D[local_rank].x); 1794 | // atomicAdd(&dL_dmean2D[global_id].y, s_dL_dmean2D[local_rank].y); 1795 | // atomicAdd(&dL_dabsmean2D[global_id].x, s_dL_dabsmean2D[local_rank].x); 1796 | // atomicAdd(&dL_dabsmean2D[global_id].y, s_dL_dabsmean2D[local_rank].y); 
1797 | // atomicAdd(&dL_dconic2D[global_id].x, s_dL_dconic2D[local_rank].x);
1798 | // atomicAdd(&dL_dconic2D[global_id].y, s_dL_dconic2D[local_rank].y);
1799 | // atomicAdd(&dL_dconic2D[global_id].w, s_dL_dconic2D[local_rank].w);
1800 | // for (int i = 0; i < C; i++)
1801 | // atomicAdd(&(dL_dcolors[global_id * C + i]), s_dL_dcolors[i * BLOCK_SIZE + local_rank]);
1802 | // atomicAdd(&(dL_dopacity[global_id]), s_dL_dopacity[local_rank]);
1803 | // atomicAdd(&(dL_ddepths[global_id]), s_dL_ddepths[local_rank]);
1804 | // }
1805 |
1806 | }
1807 | }
1808 |
1809 | void BACKWARD::preprocess(
1810 | int P, int D, int M,
1811 | const float3* means3D,
1812 | const int* radii,
1813 | const float* shs,
1814 | const bool* clamped,
1815 | const glm::vec3* scales,
1816 | const glm::vec4* rotations,
1817 | const float scale_modifier,
1818 | const float* cov3Ds,
1819 | const float* viewmatrix,
1820 | const float* projmatrix,
1821 | const float focal_x, float focal_y,
1822 | const float tan_fovx, float tan_fovy,
1823 | const glm::vec3* campos,
1824 | const float3* dL_dmean2D,
1825 | const float* dL_dconic,
1826 | glm::vec3* dL_dmean3D,
1827 | float* dL_dcolor,
1828 | float* dL_ddepth,
1829 | float* dL_dcov3D,
1830 | float* dL_dsh,
1831 | glm::vec3* dL_dscale,
1832 | glm::vec4* dL_drot)
1833 | {
1834 | // Propagate gradients for the path of 2D conic matrix computation.
1835 | // Somewhat long, thus it is its own kernel rather than being part of
1836 | // "preprocess". When done, loss gradient w.r.t. 3D means has been
1837 | // modified and gradient w.r.t. 3D covariance matrix has been computed.
1838 | computeCov2DCUDA << <(P + 255) / 256, 256 >> > (
1839 | P,
1840 | means3D,
1841 | radii,
1842 | cov3Ds,
1843 | focal_x,
1844 | focal_y,
1845 | tan_fovx,
1846 | tan_fovy,
1847 | viewmatrix,
1848 | dL_dconic,
1849 | (float3*)dL_dmean3D,
1850 | dL_dcov3D);
1851 |
1852 | // Propagate gradients for remaining steps: finish 3D mean gradients,
1853 | // propagate color gradients to SH (if desired), propagate 3D covariance
1854 | // matrix gradients to scale and rotation.
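// One thread per Gaussian: P Gaussians are processed by (P + 255) / 256
// blocks of 256 threads, the same launch shape as computeCov2DCUDA above.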
1855 | preprocessCUDA<NUM_CHANNELS> << < (P + 255) / 256, 256 >> > (
1856 | P, D, M,
1857 | (float3*)means3D,
1858 | radii,
1859 | shs,
1860 | clamped,
1861 | (glm::vec3*)scales,
1862 | (glm::vec4*)rotations,
1863 | scale_modifier,
1864 | viewmatrix,
1865 | projmatrix,
1866 | campos,
1867 | (float3*)dL_dmean2D,
1868 | (glm::vec3*)dL_dmean3D,
1869 | dL_dcolor,
1870 | dL_ddepth,
1871 | dL_dcov3D,
1872 | dL_dsh,
1873 | dL_dscale,
1874 | dL_drot);
1875 | }
1876 |
1877 | void BACKWARD::render(
1878 | const dim3 grid, const dim3 block,
1879 | const uint2* ranges,
1880 | const uint32_t* point_list,
1881 | int W, int H,
1882 | const float* bg_color,
1883 | const float2* means2D,
1884 | const float4* conic_opacity,
1885 | const float* colors,
1886 | const float* depths,
1887 | const float* accum_alphas,
1888 | const uint32_t* n_contrib,
1889 | const float* dL_dpixels,
1890 | const float* dL_dpixel_depths,
1891 | const float* dL_dpixel_alphas,
1892 | float3* dL_dmean2D,
1893 | float3* dL_dabsmean2D,
1894 | float4* dL_dconic2D,
1895 | float* dL_dopacity,
1896 | float* dL_dcolors,
1897 | float* dL_ddepths)
1898 | {
1899 | renderCUDA<NUM_CHANNELS> << <grid, block >> > (
1900 | ranges,
1901 | point_list,
1902 | W, H,
1903 | bg_color,
1904 | means2D,
1905 | conic_opacity,
1906 | colors,
1907 | depths,
1908 | accum_alphas,
1909 | n_contrib,
1910 | dL_dpixels,
1911 | dL_dpixel_depths,
1912 | dL_dpixel_alphas,
1913 | dL_dmean2D,
1914 | dL_dabsmean2D,
1915 | dL_dconic2D,
1916 | dL_dopacity,
1917 | dL_dcolors,
1918 | dL_ddepths);
1919 | }
--------------------------------------------------------------------------------
/cuda_rasterizer/backward.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2023, Inria
3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco
4 | * All rights reserved.
5 | *
6 | * This software is free for non-commercial, research and evaluation use
7 | * under the terms of the LICENSE.md file.
8 | *
9 | * For inquiries contact george.drettakis@inria.fr
10 | */
11 |
12 | #ifndef CUDA_RASTERIZER_BACKWARD_H_INCLUDED
13 | #define CUDA_RASTERIZER_BACKWARD_H_INCLUDED
14 |
15 | #include <cuda.h>
16 | #include "cuda_runtime.h"
17 | #include "device_launch_parameters.h"
18 | #define GLM_FORCE_CUDA
19 | #include <glm/glm.hpp>
20 |
21 | namespace BACKWARD
22 | {
23 | void render(
24 | const dim3 grid, dim3 block,
25 | const uint2* ranges,
26 | const uint32_t* point_list,
27 | int W, int H,
28 | const float* bg_color,
29 | const float2* means2D,
30 | const float4* conic_opacity,
31 | const float* colors,
32 | const float* depths,
33 | const float* accum_alphas,
34 | const uint32_t* n_contrib,
35 | const float* dL_dpixels,
36 | const float* dL_dpixel_depths,
37 | const float* dL_dpixel_alphas,
38 | float3* dL_dmean2D,
39 | float3* dL_dabsmean2D,
40 | float4* dL_dconic2D,
41 | float* dL_dopacity,
42 | float* dL_dcolors,
43 | float* dL_ddepths);
44 |
45 | void preprocess(
46 | int P, int D, int M,
47 | const float3* means,
48 | const int* radii,
49 | const float* shs,
50 | const bool* clamped,
51 | const glm::vec3* scales,
52 | const glm::vec4* rotations,
53 | const float scale_modifier,
54 | const float* cov3Ds,
55 | const float* view,
56 | const float* proj,
57 | const float focal_x, float focal_y,
58 | const float tan_fovx, float tan_fovy,
59 | const glm::vec3* campos,
60 | const float3* dL_dmean2D,
61 | const float* dL_dconics,
62 | glm::vec3* dL_dmeans,
63 | float* dL_dcolor,
64 | float* dL_ddepth,
65 | float* dL_dcov3D,
66 | float* dL_dsh,
67 | glm::vec3* dL_dscale,
68 | glm::vec4* dL_drot);
69 |
70 | void computeCov3DBackward(
71 | int P,
72 | const glm::vec3* scaling_xyz,
73 | const glm::vec4* rotation_l,
74 | const float* dL_dcov,
75 | glm::vec3* dL_dscaling_xyz,
76 | glm::vec4* dL_drotation_l);
77 |
78 | void computeCov4DBackward(
79 | int P,
80 | const glm::vec4* scaling_xyzt,
81 | const glm::vec4* rotation_l,
82 | const glm::vec4* rotation_r,
83 | const float* dL_dcov,
84 | const glm::vec3* dL_dms,
85 | const float* dL_dcov_t,
86 | glm::vec4* dL_dscaling_xyzt,
87 | glm::vec4* dL_drotation_l,
88 | glm::vec4* dL_drotation_r);
89 |
90 | void computeSH4DBackward(
91 | int P,
92 | int deg, int deg_t, int max_coeffs,
93 | const float* sh,
94 | const glm::vec3* dir,
95 | const float* dir_t,
96 | const float time_duration,
97 | const glm::vec3* dL_drgb,
98 | float* dL_dsh,
99 | glm::vec3* dL_ddir,
100 | float* dL_ddir_t
101 | );
102 | }
103 |
104 | #endif
--------------------------------------------------------------------------------
/cuda_rasterizer/config.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2023, Inria
3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco
4 | * All rights reserved.
5 | *
6 | * This software is free for non-commercial, research and evaluation use
7 | * under the terms of the LICENSE.md file.
8 | *
9 | * For inquiries contact george.drettakis@inria.fr
10 | */
11 |
12 | #ifndef CUDA_RASTERIZER_CONFIG_H_INCLUDED
13 | #define CUDA_RASTERIZER_CONFIG_H_INCLUDED
14 |
15 | #define NUM_CHANNELS 3 // Default 3, RGB
16 | #define BLOCK_X 16
17 | #define BLOCK_Y 16
18 |
19 | #endif
--------------------------------------------------------------------------------
/cuda_rasterizer/forward.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2023, Inria
3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco
4 | * All rights reserved.
5 | *
6 | * This software is free for non-commercial, research and evaluation use
7 | * under the terms of the LICENSE.md file.
8 | *
9 | * For inquiries contact george.drettakis@inria.fr
10 | */
11 |
12 | #include "forward.h"
13 | #include "auxiliary.h"
14 | #include <cooperative_groups.h>
15 | #include <cooperative_groups/reduce.h>
16 | namespace cg = cooperative_groups;
17 |
18 | // Forward method for converting the input spherical harmonics
19 | // coefficients of each Gaussian to a simple RGB color.
20 | __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped)
21 | {
22 | // The implementation is loosely based on code for
23 | // "Differentiable Point-Based Radiance Fields for
24 | // Efficient View Synthesis" by Zhang et al. (2022)
25 | glm::vec3 pos = means[idx];
26 | glm::vec3 dir = pos - campos;
27 | dir = dir / glm::length(dir);
28 |
29 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
30 | glm::vec3 result = SH_C0 * sh[0];
31 |
32 | if (deg > 0)
33 | {
34 | float x = dir.x;
35 | float y = dir.y;
36 | float z = dir.z;
37 | result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3];
38 |
39 | if (deg > 1)
40 | {
41 | float xx = x * x, yy = y * y, zz = z * z;
42 | float xy = x * y, yz = y * z, xz = x * z;
43 | result = result +
44 | SH_C2[0] * xy * sh[4] +
45 | SH_C2[1] * yz * sh[5] +
46 | SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] +
47 | SH_C2[3] * xz * sh[7] +
48 | SH_C2[4] * (xx - yy) * sh[8];
49 |
50 | if (deg > 2)
51 | {
52 | result = result +
53 | SH_C3[0] * y * (3.0f * xx - yy) * sh[9] +
54 | SH_C3[1] * xy * z * sh[10] +
55 | SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] +
56 | SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] +
57 | SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] +
58 | SH_C3[5] * z * (xx - yy) * sh[14] +
59 | SH_C3[6] * x * (xx - 3.0f * yy) * sh[15];
60 | }
61 | }
62 | }
63 | result += 0.5f;
64 |
65 | // RGB colors are clamped to positive values. If values are
66 | // clamped, we need to keep track of this for the backward pass.
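// The flags recorded below let the backward pass zero the color gradient
// for channels where this clamping was active (a ReLU-style gradient).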
67 | clamped[3 * idx + 0] = (result.x < 0); 68 | clamped[3 * idx + 1] = (result.y < 0); 69 | clamped[3 * idx + 2] = (result.z < 0); 70 | return glm::max(result, 0.0f); 71 | } 72 | 73 | __device__ glm::vec3 eval4DSH(int deg, int deg_t, const glm::vec3* sh, const glm::vec3 dir, const float dir_t, const float time_duration) 74 | { 75 | 76 | float l0m0 = SH_C0; 77 | glm::vec3 result = l0m0 * sh[0]; 78 | 79 | if (deg > 0) 80 | { 81 | float x = dir.x; 82 | float y = dir.y; 83 | float z = dir.z; 84 | 85 | float l1m1 = -1 * SH_C1 * y; 86 | float l1m0 = SH_C1 * z; 87 | float l1p1 = -1 * SH_C1 * x; 88 | 89 | result += 90 | l1m1 * sh[1] + 91 | l1m0 * sh[2] + 92 | l1p1 * sh[3]; 93 | 94 | if (deg > 1) 95 | { 96 | float xx = x * x, yy = y * y, zz = z * z; 97 | float xy = x * y, yz = y * z, xz = x * z; 98 | 99 | float l2m2 = SH_C2[0] * xy; 100 | float l2m1 = SH_C2[1] * yz; 101 | float l2m0 = SH_C2[2] * (2.0 * zz - xx - yy); 102 | float l2p1 = SH_C2[3] * xz; 103 | float l2p2 = SH_C2[4] * (xx - yy); 104 | 105 | result += 106 | l2m2 * sh[4] + 107 | l2m1 * sh[5] + 108 | l2m0 * sh[6] + 109 | l2p1 * sh[7] + 110 | l2p2 * sh[8]; 111 | 112 | if (deg > 2) 113 | { 114 | float l3m3 = SH_C3[0] * y * (3 * xx - yy); 115 | float l3m2 = SH_C3[1] * xy * z; 116 | float l3m1 = SH_C3[2] * y * (4 * zz - xx - yy); 117 | float l3m0 = SH_C3[3] * z * (2 * zz - 3 * xx - 3 * yy); 118 | float l3p1 = SH_C3[4] * x * (4 * zz - xx - yy); 119 | float l3p2 = SH_C3[5] * z * (xx - yy); 120 | float l3p3 = SH_C3[6] * x * (xx - 3 * yy); 121 | 122 | result += 123 | l3m3 * sh[9] + 124 | l3m2 * sh[10] + 125 | l3m1 * sh[11] + 126 | l3m0 * sh[12] + 127 | l3p1 * sh[13] + 128 | l3p2 * sh[14] + 129 | l3p3 * sh[15]; 130 | 131 | if (deg_t > 0){ 132 | float t1 = cos(2 * MY_PI * dir_t / time_duration); 133 | 134 | result += t1 * (l0m0 * sh[16] + 135 | l1m1 * sh[17] + 136 | l1m0 * sh[18] + 137 | l1p1 * sh[19] + 138 | l2m2 * sh[20] + 139 | l2m1 * sh[21] + 140 | l2m0 * sh[22] + 141 | l2p1 * sh[23] + 142 | l2p2 * sh[24] + 143 | l3m3 * sh[25] + 144 | l3m2 * sh[26] + 145 | l3m1 * sh[27] + 146 | l3m0 * sh[28] + 147 | l3p1 * sh[29] + 148 | l3p2 * sh[30] + 149 | l3p3 * sh[31]); 150 | 151 | if (deg_t > 1){ 152 | float t2 = cos(2 * MY_PI * dir_t * 2 / time_duration); 153 | 154 | result += t2 * (l0m0 * sh[32] + 155 | l1m1 * sh[33] + 156 | l1m0 * sh[34] + 157 | l1p1 * sh[35] + 158 | l2m2 * sh[36] + 159 | l2m1 * sh[37] + 160 | l2m0 * sh[38] + 161 | l2p1 * sh[39] + 162 | l2p2 * sh[40] + 163 | l3m3 * sh[41] + 164 | l3m2 * sh[42] + 165 | l3m1 * sh[43] + 166 | l3m0 * sh[44] + 167 | l3p1 * sh[45] + 168 | l3p2 * sh[46] + 169 | l3p3 * sh[47]); 170 | } 171 | 172 | } 173 | } 174 | } 175 | } 176 | result += 0.5f; 177 | 178 | return result; 179 | } 180 | 181 | 182 | __device__ glm::vec3 eval4DSHResidual(int deg, int deg_t, const glm::vec3* sh, const glm::vec3 dir, const float dir_t, const float time_duration) 183 | { 184 | 185 | float l0m0 = SH_C0; 186 | // glm::vec3 result = l0m0 * sh[0]; 187 | glm::vec3 result {0.0f}; 188 | 189 | if (deg > 0) 190 | { 191 | float x = dir.x; 192 | float y = dir.y; 193 | float z = dir.z; 194 | 195 | float l1m1 = -1 * SH_C1 * y; 196 | float l1m0 = SH_C1 * z; 197 | float l1p1 = -1 * SH_C1 * x; 198 | 199 | result += 200 | l1m1 * sh[0] + 201 | l1m0 * sh[1] + 202 | l1p1 * sh[2]; 203 | 204 | if (deg > 1) 205 | { 206 | float xx = x * x, yy = y * y, zz = z * z; 207 | float xy = x * y, yz = y * z, xz = x * z; 208 | 209 | float l2m2 = SH_C2[0] * xy; 210 | float l2m1 = SH_C2[1] * yz; 211 | float l2m0 = SH_C2[2] * (2.0 * zz - xx - yy); 212 | float l2p1 = 
SH_C2[3] * xz; 213 | float l2p2 = SH_C2[4] * (xx - yy); 214 | 215 | result += 216 | l2m2 * sh[3] + 217 | l2m1 * sh[4] + 218 | l2m0 * sh[5] + 219 | l2p1 * sh[6] + 220 | l2p2 * sh[7]; 221 | 222 | if (deg > 2) 223 | { 224 | float l3m3 = SH_C3[0] * y * (3 * xx - yy); 225 | float l3m2 = SH_C3[1] * xy * z; 226 | float l3m1 = SH_C3[2] * y * (4 * zz - xx - yy); 227 | float l3m0 = SH_C3[3] * z * (2 * zz - 3 * xx - 3 * yy); 228 | float l3p1 = SH_C3[4] * x * (4 * zz - xx - yy); 229 | float l3p2 = SH_C3[5] * z * (xx - yy); 230 | float l3p3 = SH_C3[6] * x * (xx - 3 * yy); 231 | 232 | result += 233 | l3m3 * sh[8] + 234 | l3m2 * sh[9] + 235 | l3m1 * sh[10] + 236 | l3m0 * sh[11] + 237 | l3p1 * sh[12] + 238 | l3p2 * sh[13] + 239 | l3p3 * sh[14]; 240 | 241 | if (deg_t > 0){ 242 | float t1 = cos(2 * MY_PI * dir_t / time_duration); 243 | 244 | result += t1 * (l0m0 * sh[15] + 245 | l1m1 * sh[16] + 246 | l1m0 * sh[17] + 247 | l1p1 * sh[18] + 248 | l2m2 * sh[19] + 249 | l2m1 * sh[20] + 250 | l2m0 * sh[21] + 251 | l2p1 * sh[22] + 252 | l2p2 * sh[23] + 253 | l3m3 * sh[24] + 254 | l3m2 * sh[25] + 255 | l3m1 * sh[26] + 256 | l3m0 * sh[27] + 257 | l3p1 * sh[28] + 258 | l3p2 * sh[29] + 259 | l3p3 * sh[30]); 260 | 261 | if (deg_t > 1){ 262 | float t2 = cos(2 * MY_PI * dir_t * 2 / time_duration); 263 | 264 | result += t2 * (l0m0 * sh[31] + 265 | l1m1 * sh[32] + 266 | l1m0 * sh[33] + 267 | l1p1 * sh[34] + 268 | l2m2 * sh[35] + 269 | l2m1 * sh[36] + 270 | l2m0 * sh[37] + 271 | l2p1 * sh[38] + 272 | l2p2 * sh[39] + 273 | l3m3 * sh[40] + 274 | l3m2 * sh[41] + 275 | l3m1 * sh[42] + 276 | l3m0 * sh[43] + 277 | l3p1 * sh[44] + 278 | l3p2 * sh[45] + 279 | l3p3 * sh[46]); 280 | } 281 | 282 | } 283 | } 284 | } 285 | } 286 | // result += 0.5f; 287 | 288 | return result; 289 | } 290 | 291 | __device__ glm::vec3 computeColorFromSH_4D(int idx, int deg, int deg_t, int max_coeffs, const float* shs, const glm::vec3* dirs, const float* dirs_t, const float time_duration) 292 | { 293 | // The implementation is loosely based on code for 294 | // "Differentiable Point-Based Radiance Fields for 295 | // Efficient View Synthesis" by Zhang et al. 
(2022) 296 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; 297 | glm::vec3 dir = dirs[idx]; 298 | const float dir_t = dirs_t[idx]; 299 | return eval4DSH(deg, deg_t, sh, dir, dir_t, time_duration); 300 | } 301 | 302 | 303 | __global__ void computeSH4DCUDA(int P, 304 | int deg, int deg_t, int max_coeffs, 305 | const float* sh, const glm::vec3* dir, const float* dir_t, const float time_duration, glm::vec3* rgb 306 | ) 307 | { 308 | auto idx = cg::this_grid().thread_rank(); 309 | if (idx >= P) 310 | return; 311 | rgb[idx] = computeColorFromSH_4D( 312 | idx, 313 | deg, 314 | deg_t, 315 | max_coeffs, 316 | sh, 317 | dir, 318 | dir_t, 319 | time_duration 320 | ); 321 | } 322 | 323 | 324 | void FORWARD::computeSH4D( 325 | int P, 326 | int deg, int deg_t, int max_coeffs, 327 | const float* sh, 328 | const glm::vec3* dir, 329 | const float* dir_t, 330 | const float time_duration, 331 | glm::vec3* rgb) 332 | { 333 | computeSH4DCUDA << <(P + 255) / 256, 256 >> > ( 334 | P, 335 | deg, 336 | deg_t, 337 | max_coeffs, 338 | sh, 339 | dir, 340 | dir_t, 341 | time_duration, 342 | rgb 343 | ); 344 | } 345 | 346 | // Forward version of 2D covariance matrix computation 347 | __device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix) 348 | { 349 | // The following models the steps outlined by equations 29 350 | // and 31 in "EWA Splatting" (Zwicker et al., 2002). 351 | // Additionally considers aspect / scaling of viewport. 352 | // Transposes used to account for row-/column-major conventions. 353 | float3 t = transformPoint4x3(mean, viewmatrix); 354 | 355 | const float limx = 1.3f * tan_fovx; 356 | const float limy = 1.3f * tan_fovy; 357 | const float txtz = t.x / t.z; 358 | const float tytz = t.y / t.z; 359 | t.x = min(limx, max(-limx, txtz)) * t.z; 360 | t.y = min(limy, max(-limy, tytz)) * t.z; 361 | 362 | glm::mat3 J = glm::mat3( 363 | focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z), 364 | 0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z), 365 | 0, 0, 0); 366 | 367 | glm::mat3 W = glm::mat3( 368 | viewmatrix[0], viewmatrix[4], viewmatrix[8], 369 | viewmatrix[1], viewmatrix[5], viewmatrix[9], 370 | viewmatrix[2], viewmatrix[6], viewmatrix[10]); 371 | 372 | glm::mat3 T = W * J; 373 | 374 | glm::mat3 Vrk = glm::mat3( 375 | cov3D[0], cov3D[1], cov3D[2], 376 | cov3D[1], cov3D[3], cov3D[4], 377 | cov3D[2], cov3D[4], cov3D[5]); 378 | 379 | glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T; 380 | 381 | // compute unblured determinant 382 | // float det_orig = cov[0][0] * cov[1][1] - cov[0][1] * cov[0][1]; 383 | 384 | // Apply low-pass filter: every Gaussian should be at least 385 | // one pixel wide/high. Discard 3rd row and column. 386 | cov[0][0] += 0.3f; 387 | cov[1][1] += 0.3f; 388 | return { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) }; 389 | } 390 | 391 | // Forward method for converting scale and rotation properties of each 392 | // Gaussian to a 3D covariance matrix in world space. Also takes care 393 | // of quaternion normalization. 
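// With M = S * R, the code below forms Sigma = M^T * M = R^T * diag(s^2) * R
// and stores only the six unique entries of the symmetric result. Note that
// in this version the quaternion is used as-is (the division by
// glm::length(rot) is commented out), so inputs are assumed to be
// pre-normalized by the caller.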
394 | __device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D) 395 | { 396 | // Create scaling matrix 397 | glm::mat3 S = glm::mat3(1.0f); 398 | S[0][0] = mod * scale.x; 399 | S[1][1] = mod * scale.y; 400 | S[2][2] = mod * scale.z; 401 | 402 | // Normalize quaternion to get valid rotation 403 | glm::vec4 q = rot;// / glm::length(rot); 404 | float r = q.x; 405 | float x = q.y; 406 | float y = q.z; 407 | float z = q.w; 408 | 409 | // Compute rotation matrix from quaternion 410 | glm::mat3 R = glm::mat3( 411 | 1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y), 412 | 2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x), 413 | 2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y) 414 | ); 415 | 416 | glm::mat3 M = S * R; 417 | 418 | // Compute 3D world covariance matrix Sigma 419 | glm::mat3 Sigma = glm::transpose(M) * M; 420 | 421 | // Covariance is symmetric, only store upper right 422 | cov3D[0] = Sigma[0][0]; 423 | cov3D[1] = Sigma[0][1]; 424 | cov3D[2] = Sigma[0][2]; 425 | cov3D[3] = Sigma[1][1]; 426 | cov3D[4] = Sigma[1][2]; 427 | cov3D[5] = Sigma[2][2]; 428 | } 429 | 430 | __global__ void computeCov3DCUDA(int P, 431 | const glm::vec3* scaling_xyz, 432 | const glm::vec4* rotation_l, 433 | float* cov) 434 | { 435 | auto idx = cg::this_grid().thread_rank(); 436 | if (idx >= P) 437 | return; 438 | computeCov3D( 439 | scaling_xyz[idx], 440 | 1.0f, 441 | rotation_l[idx], 442 | cov + idx * 6); 443 | } 444 | 445 | void FORWARD::computeCov3D( 446 | int P, 447 | const glm::vec3* scaling_xyz, 448 | const glm::vec4* rotation_l, 449 | float* cov) 450 | { 451 | computeCov3DCUDA << <(P + 255) / 256, 256 >> > ( 452 | P, 453 | scaling_xyz, 454 | rotation_l, 455 | cov); 456 | } 457 | 458 | __device__ void computeCov4D(const glm::vec4 scaling_xyzt, const glm::vec4 rotation_l, const glm::vec4 rotation_r, float* cov, glm::vec3 &ms, float &cov_t) 459 | { 460 | // Create scaling matrix 461 | glm::mat4 S = glm::mat4(1.0f); 462 | S[0][0] = scaling_xyzt.x; 463 | S[1][1] = scaling_xyzt.y; 464 | S[2][2] = scaling_xyzt.z; 465 | S[3][3] = scaling_xyzt.w; 466 | 467 | const float l_l = glm::length(rotation_l); 468 | const float a = rotation_l.x / l_l; 469 | const float b = rotation_l.y / l_l; 470 | const float c = rotation_l.z / l_l; 471 | const float d = rotation_l.w / l_l; 472 | 473 | const float l_r = glm::length(rotation_r); 474 | const float p = rotation_r.x / l_r; 475 | const float q = rotation_r.y / l_r; 476 | const float r = rotation_r.z / l_r; 477 | const float s = rotation_r.w / l_r; 478 | 479 | glm::mat4 M_l = glm::mat4( 480 | a, -b, -c, -d, 481 | b, a,-d, c, 482 | c, d, a,-b, 483 | d,-c, b, a 484 | ); 485 | 486 | glm::mat4 M_r = glm::mat4( 487 | p, q, r, s, 488 | -q, p,-s, r, 489 | -r, s, p,-q, 490 | -s,-r, q, p 491 | ); 492 | // glm stores in column major 493 | glm::mat4 R = M_r * M_l; 494 | glm::mat4 M = S * R; 495 | glm::mat4 Sigma = glm::transpose(M) * M; 496 | cov_t = Sigma[3][3]; 497 | 498 | glm::mat3 cov11 = glm::mat3(Sigma); 499 | glm::vec3 cov12 = glm::vec3(Sigma[0][3], Sigma[1][3], Sigma[2][3]); 500 | glm::mat3 cov3D = cov11 - glm::outerProduct(cov12, cov12) / cov_t; 501 | 502 | // Covariance is symmetric, only store upper right 503 | cov[0] = cov3D[0][0]; 504 | cov[1] = cov3D[0][1]; 505 | cov[2] = cov3D[0][2]; 506 | cov[3] = cov3D[1][1]; 507 | cov[4] = cov3D[1][2]; 508 | cov[5] = cov3D[2][2]; 509 | ms = cov12 / cov_t; 510 | } 511 | 512 | 513 | __global__ void computeCov4DCUDA(int P, 514 | 
const glm::vec4* scaling_xyzt,
515 | const glm::vec4* rotation_l,
516 | const glm::vec4* rotation_r,
517 | float* cov,
518 | glm::vec3* ms,
519 | float* cov_t)
520 | {
521 | auto idx = cg::this_grid().thread_rank();
522 | if (idx >= P)
523 | return;
524 | computeCov4D(
525 | scaling_xyzt[idx],
526 | rotation_l[idx],
527 | rotation_r[idx],
528 | cov + idx * 6,
529 | ms[idx],
530 | cov_t[idx]);
531 | }
532 |
533 |
534 | void FORWARD::computeCov4D(
535 | int P,
536 | const glm::vec4* scaling_xyzt,
537 | const glm::vec4* rotation_l,
538 | const glm::vec4* rotation_r,
539 | float* cov,
540 | glm::vec3* ms,
541 | float* cov_t
542 | ) {
543 | computeCov4DCUDA << <(P + 255) / 256, 256 >> > (
544 | P,
545 | scaling_xyzt,
546 | rotation_l,
547 | rotation_r,
548 | cov,
549 | ms,
550 | cov_t);
551 | }
552 |
553 | // Perform initial steps for each Gaussian prior to rasterization.
554 | template <int C>
555 | __global__ void fusedPreprocess4DSparseCUDA(int P,
556 | const int deg,
557 | const int deg_t,
558 | const int M,
559 | const glm::vec3* means3D,
560 | const float* cov,
561 | const glm::vec3* ms,
562 | const float* cov_t,
563 | const float* opacities,
564 | const float* t1,
565 | const glm::vec3* bases,
566 | const float* shs,
567 | const float* t,
568 | const int* sparse,
569 | const float* viewmatrix,
570 | const float* projmatrix,
571 | const float* cam_pos,
572 | const float duration,
573 | bool* mask,
574 | float* occ1,
575 | glm::vec3* xyz3,
576 | glm::vec3* rgb3)
577 | {
578 | auto idx = cg::this_grid().thread_rank();
579 | if (idx >= P)
580 | return;
581 |
582 | // Initialize the validity mask to false. If this isn't changed,
583 | // this Gaussian will not be processed further.
584 | mask[idx] = false;
585 |
586 | // Perform marginalization using the current time
587 | float dt = t[idx] - t1[idx];
588 | float marginal_t = __expf(-0.5 * dt * dt / cov_t[idx]);
589 | if (marginal_t <= 0.05) {
590 | return;
591 | }
592 |
593 | glm::vec3 xyz = means3D[idx] + ms[idx] * dt;
594 |
595 | // Filter by frustum
596 | // Perform near culling, quit if outside.
597 | float3 pos {xyz.x, xyz.y, xyz.z};
598 | if (!check_frustum(pos, viewmatrix, projmatrix) || opacities[idx] < 0.0001f) {
599 | return;
600 | }
601 |
602 | float occ = marginal_t * opacities[idx];
603 | if (occ < 0.0001f) {
604 | return;
605 | }
606 |
607 | mask[idx] = true;
608 | occ1[idx] = occ;
609 | xyz3[idx] = xyz;
610 |
611 | // Handling sparse SH
612 | glm::vec3 rgb = SH_C0 * bases[idx];
613 | rgb3[idx] = min(max(rgb + 0.5f, 0.0f), 1.0f); // zero degree sh
614 | // rgb3[idx] = bases[idx]; // zero degree sh
615 |
616 | if (sparse[idx] == -1) {
617 | return;
618 | }
619 |
620 | // Computing 4D SH using the current time and viewing direction
621 | glm::vec3 dir = xyz - *(glm::vec3*)cam_pos;
622 | dir = dir / glm::length(dir);
623 | const glm::vec3* sh = (glm::vec3*)shs + sparse[idx] * M;
624 | rgb += eval4DSHResidual(deg, deg_t, sh, dir, dt, duration);
625 | rgb3[idx] = min(max(rgb + 0.5f, 0.0f), 1.0f);
626 | }
627 |
628 |
629 | // Perform initial steps for each Gaussian prior to rasterization.
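// The dense variant below follows the same recipe as the sparse one above:
// marginalize the 4D Gaussian over time (marginal_t = exp(-0.5 * dt^2 / cov_t)),
// shift the spatial mean by the conditional term ms * dt, cull against the
// frustum, and mask out low-weight points, but with a looser temporal cutoff
// (0.005 instead of 0.05) and full per-point SH coefficients instead of a
// shared sparse table.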
630 | template <int C>
631 | __global__ void fusedPreprocess4DCUDA(int P,
632 | const int deg,
633 | const int deg_t,
634 | const int M,
635 | const glm::vec3* means3D,
636 | const float* cov,
637 | const glm::vec3* ms,
638 | const float* cov_t,
639 | const float* opacities,
640 | const float* t1,
641 | const float* shs,
642 | const float* t,
643 | const float* viewmatrix,
644 | const float* projmatrix,
645 | const float* cam_pos,
646 | const float duration,
647 | bool* mask,
648 | float* occ1,
649 | glm::vec3* xyz3,
650 | glm::vec3* rgb3)
651 | {
652 | auto idx = cg::this_grid().thread_rank();
653 | if (idx >= P)
654 | return;
655 |
656 | // Initialize the validity mask to false. If this isn't changed,
657 | // this Gaussian will not be processed further.
658 | mask[idx] = false;
659 |
660 | // Perform marginalization using the current time
661 | float dt = t[idx] - t1[idx];
662 | float marginal_t = __expf(-0.5 * dt * dt / cov_t[idx]);
663 | if (marginal_t <= 0.005) {
664 | return;
665 | }
666 |
667 | glm::vec3 xyz = means3D[idx] + ms[idx] * dt;
668 |
669 | // Filter by frustum
670 | // Perform near culling, quit if outside.
671 | float3 pos {xyz.x, xyz.y, xyz.z};
672 | if (!check_frustum(pos, viewmatrix, projmatrix) || opacities[idx] < 0.0001f) {
673 | return;
674 | }
675 |
676 | float occ = marginal_t * opacities[idx];
677 | if (occ < 0.0001f) {
678 | return;
679 | }
680 |
681 | // Computing 4D SH using the current time and viewing direction
682 | glm::vec3 dir = xyz - *(glm::vec3*)cam_pos;
683 | dir = dir / glm::length(dir);
684 | const glm::vec3* sh = ((glm::vec3*)shs) + idx * M;
685 | glm::vec3 rgb = eval4DSH(deg, deg_t, sh, dir, dt, duration);
686 |
687 | // glm::vec3 rgb(0.0f);
688 | // const glm::vec3* sh = (glm::vec3*)shs + idx * M;
689 | // glm::vec3 sh0 = *sh;
690 | // float l0m0 = SH_C0;
691 | // glm::vec3 rgb = l0m0 * sh0;
692 |
693 | mask[idx] = true;
694 | occ1[idx] = occ;
695 | xyz3[idx] = xyz;
696 | rgb3[idx] = rgb;
697 | }
698 |
699 | // Perform initial steps for each Gaussian prior to rasterization.
700 | template<int C>
701 | __global__ void preprocessCUDA(int P, int D, int M,
702 | const float* orig_points,
703 | const glm::vec3* scales,
704 | const float scale_modifier,
705 | const glm::vec4* rotations,
706 | const float* opacities,
707 | const float* shs,
708 | bool* clamped,
709 | const float* cov3D_precomp,
710 | const bool* tile_mask,
711 | const float* colors_precomp,
712 | const float* viewmatrix,
713 | const float* projmatrix,
714 | const glm::vec3* cam_pos,
715 | const int W, int H,
716 | const float tan_fovx, float tan_fovy,
717 | const float focal_x, float focal_y,
718 | int* radii,
719 | float2* points_xy_image,
720 | float* depths,
721 | float* cov3Ds,
722 | float* rgb,
723 | float4* conic_opacity,
724 | const dim3 grid,
725 | uint32_t* tiles_touched,
726 | bool prefiltered)
727 | {
728 | auto idx = cg::this_grid().thread_rank();
729 | if (idx >= P)
730 | return;
731 |
732 | // Initialize radius and touched tiles to 0. If this isn't changed,
733 | // this Gaussian will not be processed further.
734 | radii[idx] = 0;
735 | tiles_touched[idx] = 0;
736 |
737 | // Perform near culling, quit if outside.
738 | float3 p_view; 739 | if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view) || opacities[idx] < 0.0001f) { 740 | return; 741 | } 742 | 743 | // Transform point by projecting 744 | float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; 745 | float4 p_hom = transformPoint4x4(p_orig, projmatrix); 746 | float p_w = 1.0f / (p_hom.w + 0.0000001f); 747 | float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; 748 | 749 | // If 3D covariance matrix is precomputed, use it, otherwise compute 750 | // from scaling and rotation parameters. 751 | const float* cov3D; 752 | if (cov3D_precomp != nullptr) 753 | { 754 | cov3D = cov3D_precomp + idx * 6; 755 | } 756 | else 757 | { 758 | computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6); 759 | cov3D = cov3Ds + idx * 6; 760 | } 761 | 762 | // Compute 2D screen-space covariance matrix and unblurred determinant. 763 | float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix); 764 | 765 | // Invert covariance (EWA algorithm) 766 | float det = (cov.x * cov.z - cov.y * cov.y); 767 | if (det <= 0.0f || cov.x <= 0.0f || cov.z <= 0.0f) { 768 | // Illegal cov matrix, this point should be pruned with zero gradients 769 | radii[idx] = -1.0; 770 | return; 771 | } 772 | float det_inv = 1.f / det; 773 | float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv }; 774 | 775 | // Compute extent in screen space (by finding eigenvalues of 776 | // 2D covariance matrix). Use extent to compute a bounding rectangle 777 | // of screen-space tiles that this Gaussian overlaps with. Quit if 778 | // rectangle covers 0 tiles. 779 | float mid = 0.5f * (cov.x + cov.z); 780 | float lambda1 = mid + sqrt(max(0.1f, mid * mid - det)); 781 | float lambda2 = mid - sqrt(max(0.1f, mid * mid - det)); 782 | if (lambda1 <= 0.01 || lambda2 <= 0.01 || lambda1 < lambda2 || (lambda1 / lambda2) > 10000.0) { 783 | // Illegal cov matrix, this point should be pruned with zero gradients 784 | radii[idx] = -1.0; 785 | return; 786 | } 787 | float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2))); 788 | 789 | // float my_radius = max(0.0f, ceil(3.f * sqrt(max(lambda1, lambda2)))); 790 | float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) }; 791 | uint2 rect_min, rect_max; 792 | getRect(point_image, (int)my_radius, rect_min, rect_max, grid); 793 | 794 | tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); 795 | if (tiles_touched[idx] == 0) { 796 | // Not rendered since outside of all visible tiles 797 | return; 798 | } 799 | 800 | // Perform tile mask check 801 | if (tile_mask != nullptr){ 802 | int touched = 0; 803 | for (int y = rect_min.y; y < rect_max.y; y++) 804 | { 805 | for (int x = rect_min.x; x < rect_max.x; x++) 806 | { 807 | if (tile_mask[y * grid.x + x]) 808 | { 809 | touched += 1; 810 | } 811 | } 812 | } 813 | tiles_touched[idx] = touched; 814 | if (touched == 0) { 815 | // Not rendered since outside of tile mask 816 | // radii[idx] = -1.0; 817 | return; 818 | } 819 | } 820 | 821 | // Inverse 2D covariance and opacity neatly pack into one float4 822 | conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] }; 823 | 824 | // // Perform accurate per-tile culling test 825 | // // As mentioned in: StopThePop: Sorted Gaussian Splatting for View-Consistent Real-time Rendering 826 | // // Slightly higher threshold for tile-based culling; Otherwise, imprecisions could lead to more tiles in preprocess than in 
duplicate 827 | // constexpr float alpha_threshold = 1.0f / 255.0f; 828 | // const float opacity_power_threshold = log(conic_opacity[idx].w / alpha_threshold); 829 | // tiles_touched[idx] = computeTilebasedCullingTileCount(conic_opacity[idx], point_image, opacity_power_threshold, rect_min, rect_max); 830 | 831 | // If colors have been precomputed, use them, otherwise convert 832 | // spherical harmonics coefficients to RGB color. 833 | if (colors_precomp == nullptr) 834 | { 835 | glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped); 836 | rgb[idx * C + 0] = result.x; 837 | rgb[idx * C + 1] = result.y; 838 | rgb[idx * C + 2] = result.z; 839 | } 840 | 841 | // Store some useful helper data for the next steps. 842 | depths[idx] = p_view.z; 843 | radii[idx] = my_radius; 844 | points_xy_image[idx] = point_image; 845 | } 846 | 847 | // Main rasterization method. Collaboratively works on one tile per 848 | // block, each thread treats one pixel. Alternates between fetching 849 | // and rasterizing data. 850 | template <uint32_t CHANNELS> 851 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y) 852 | renderCUDA( 853 | const uint2* __restrict__ ranges, 854 | const uint32_t* __restrict__ point_list, 855 | int W, int H, 856 | const float2* __restrict__ points_xy_image, 857 | const float* __restrict__ features, 858 | const float* __restrict__ depths, 859 | const float4* __restrict__ conic_opacity, 860 | float* __restrict__ out_alpha, 861 | uint32_t* __restrict__ n_contrib, 862 | const float* __restrict__ bg_color, 863 | float* __restrict__ out_color, 864 | float* __restrict__ out_depth) 865 | { 866 | // Identify current tile and associated min/max pixel range. 867 | auto block = cg::this_thread_block(); 868 | uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; 869 | uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; 870 | uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y, H) }; 871 | uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; 872 | uint32_t pix_id = W * pix.y + pix.x; 873 | float2 pixf = { (float)pix.x, (float)pix.y }; 874 | 875 | // Check if this thread is associated with a valid pixel or outside. 876 | bool inside = pix.x < W && pix.y < H; 877 | // Done threads can help with fetching, but don't rasterize 878 | bool done = !inside; 879 | 880 | // Load start/end range of IDs to process in bit-sorted list. 881 | uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; 882 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); 883 | int toDo = range.y - range.x; 884 | 885 | // Allocate storage for batches of collectively fetched data.
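// Each of the BLOCK_SIZE (= BLOCK_X * BLOCK_Y = 256) threads fetches one
// Gaussian per round into the shared arrays below, so a whole batch is
// staged with a single coalesced pass; all threads of the tile then blend
// the same batch into their own pixel, amortizing global-memory traffic
// across the 16x16 pixels of the tile.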
886 | __shared__ int collected_id[BLOCK_SIZE]; 887 | __shared__ float2 collected_xy[BLOCK_SIZE]; 888 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; 889 | 890 | // Initialize helper variables 891 | float T = 1.0f; 892 | uint32_t contributor = 0; 893 | uint32_t last_contributor = 0; 894 | float C[CHANNELS] = { 0 }; 895 | float D = 0; 896 | 897 | // Iterate over batches until all done or range is complete 898 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) 899 | { 900 | // End if entire block votes that it is done rasterizing 901 | int num_done = __syncthreads_count(done); 902 | if (num_done == BLOCK_SIZE) 903 | break; 904 | 905 | // Collectively fetch per-Gaussian data from global to shared 906 | int progress = i * BLOCK_SIZE + block.thread_rank(); 907 | if (range.x + progress < range.y) 908 | { 909 | int coll_id = point_list[range.x + progress]; 910 | collected_id[block.thread_rank()] = coll_id; 911 | collected_xy[block.thread_rank()] = points_xy_image[coll_id]; 912 | collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; 913 | } 914 | block.sync(); 915 | 916 | // Iterate over current batch 917 | for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) 918 | { 919 | // Keep track of current position in range 920 | contributor++; 921 | 922 | // Resample using conic matrix (cf. "Surface 923 | // Splatting" by Zwicker et al., 2001) 924 | float2 xy = collected_xy[j]; 925 | float2 d = { xy.x - pixf.x, xy.y - pixf.y }; 926 | float4 con_o = collected_conic_opacity[j]; 927 | float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 928 | if (power > 0.0f) 929 | continue; 930 | 931 | // Eq. (2) from 3D Gaussian splatting paper. 932 | // Obtain alpha by multiplying with Gaussian opacity 933 | // and its exponential falloff from mean. 934 | // Avoid numerical instabilities (see paper appendix). 935 | float alpha = min(0.99f, con_o.w * __expf(power)); 936 | if (alpha < 1.0f / 255.0f) 937 | continue; 938 | float test_T = T * (1 - alpha); 939 | 940 | // Eq. (3) from 3D Gaussian splatting paper. 941 | for (int ch = 0; ch < CHANNELS; ch++) 942 | C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T; 943 | D += depths[collected_id[j]] * alpha * T; 944 | T = test_T; 945 | 946 | // Keep track of last range entry to update this 947 | // pixel. 948 | last_contributor = contributor; 949 | 950 | // Early stopping 951 | if (test_T < 0.0001f) 952 | { 953 | done = true; 954 | continue; 955 | } 956 | } 957 | } 958 | 959 | // All threads that treat valid pixel write out their final 960 | // rendering data to the frame and auxiliary buffers. 
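// T is the transmittance remaining after all blended Gaussians, so the
// accumulated alpha is 1 - T and the leftover transmittance weights the
// background: e.g. if T = 0.25 survives, the pixel receives 75%
// splatted radiance plus 25% bg_color.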
961 | if (inside) 962 | { 963 | out_alpha[pix_id] = 1 - T; 964 | n_contrib[pix_id] = last_contributor; 965 | for (int ch = 0; ch < CHANNELS; ch++) 966 | out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch]; 967 | out_depth[pix_id] = D; 968 | } 969 | } 970 | 971 | void FORWARD::render( 972 | const dim3 grid, dim3 block, 973 | const uint2* ranges, 974 | const uint32_t* point_list, 975 | int W, int H, 976 | const float2* means2D, 977 | const float* colors, 978 | const float* depths, 979 | const float4* conic_opacity, 980 | float* out_alpha, 981 | uint32_t* n_contrib, 982 | const float* bg_color, 983 | float* out_color, 984 | float* out_depth) 985 | { 986 | renderCUDA<NUM_CHANNELS> << <grid, block >> > ( 987 | ranges, 988 | point_list, 989 | W, H, 990 | means2D, 991 | colors, 992 | depths, 993 | conic_opacity, 994 | out_alpha, 995 | n_contrib, 996 | bg_color, 997 | out_color, 998 | out_depth); 999 | } 1000 | 1001 | void FORWARD::preprocess(int P, int D, int M, 1002 | const float* means3D, 1003 | const glm::vec3* scales, 1004 | const float scale_modifier, 1005 | const glm::vec4* rotations, 1006 | const float* opacities, 1007 | const float* shs, 1008 | bool* clamped, 1009 | const float* cov3D_precomp, 1010 | const bool* tile_mask, 1011 | const float* colors_precomp, 1012 | const float* viewmatrix, 1013 | const float* projmatrix, 1014 | const glm::vec3* cam_pos, 1015 | const int W, int H, 1016 | const float focal_x, float focal_y, 1017 | const float tan_fovx, float tan_fovy, 1018 | int* radii, 1019 | float2* means2D, 1020 | float* depths, 1021 | float* cov3Ds, 1022 | float* rgb, 1023 | float4* conic_opacity, 1024 | const dim3 grid, 1025 | uint32_t* tiles_touched, 1026 | bool prefiltered) 1027 | { 1028 | preprocessCUDA<NUM_CHANNELS> << <(P + 255) / 256, 256 >> > ( 1029 | P, D, M, 1030 | means3D, 1031 | scales, 1032 | scale_modifier, 1033 | rotations, 1034 | opacities, 1035 | shs, 1036 | clamped, 1037 | cov3D_precomp, 1038 | tile_mask, 1039 | colors_precomp, 1040 | viewmatrix, 1041 | projmatrix, 1042 | cam_pos, 1043 | W, H, 1044 | tan_fovx, tan_fovy, 1045 | focal_x, focal_y, 1046 | radii, 1047 | means2D, 1048 | depths, 1049 | cov3Ds, 1050 | rgb, 1051 | conic_opacity, 1052 | grid, 1053 | tiles_touched, 1054 | prefiltered 1055 | ); 1056 | } 1057 | 1058 | void FORWARD::fusedPreprocess4D(int P, 1059 | const int deg, 1060 | const int deg_t, 1061 | const int M, 1062 | const glm::vec3* means3D, 1063 | const float* cov, 1064 | const glm::vec3* ms, 1065 | const float* cov_t, 1066 | const float* opacities, 1067 | const float* t1, 1068 | const float* shs, 1069 | const float* t, 1070 | const float* viewmatrix, 1071 | const float* projmatrix, 1072 | const float* cam_pos, 1073 | const float duration, 1074 | bool* mask, 1075 | float* occ1, 1076 | glm::vec3* xyz3, 1077 | glm::vec3* rgb3) 1078 | { 1079 | fusedPreprocess4DCUDA << <(P + 255) / 256, 256 >> > ( 1080 | P, 1081 | deg, 1082 | deg_t, 1083 | M, 1084 | means3D, 1085 | cov, 1086 | ms, 1087 | cov_t, 1088 | opacities, 1089 | t1, 1090 | shs, 1091 | t, 1092 | viewmatrix, 1093 | projmatrix, 1094 | cam_pos, 1095 | duration, 1096 | mask, 1097 | occ1, 1098 | xyz3, 1099 | rgb3); 1100 | } 1101 | 1102 | void FORWARD::fusedPreprocess4DSparse(int P, 1103 | const int deg, 1104 | const int deg_t, 1105 | const int M, 1106 | const glm::vec3* means3D, 1107 | const float* cov, 1108 | const glm::vec3* ms, 1109 | const float* cov_t, 1110 | const float* opacities, 1111 | const float* t1, 1112 | const glm::vec3* bases, 1113 | const float* shs, 1114 | const float* t, 1115 | const int* inverse, 1116 | const float*
viewmatrix, 1117 | const float* projmatrix, 1118 | const float* cam_pos, 1119 | const float duration, 1120 | bool* mask, 1121 | float* occ1, 1122 | glm::vec3* xyz3, 1123 | glm::vec3* rgb3) 1124 | { 1125 | fusedPreprocess4DSparseCUDA << <(P + 255) / 256, 256 >> > ( 1126 | P, 1127 | deg, 1128 | deg_t, 1129 | M, 1130 | means3D, 1131 | cov, 1132 | ms, 1133 | cov_t, 1134 | opacities, 1135 | t1, 1136 | bases, 1137 | shs, 1138 | t, 1139 | inverse, 1140 | viewmatrix, 1141 | projmatrix, 1142 | cam_pos, 1143 | duration, 1144 | mask, 1145 | occ1, 1146 | xyz3, 1147 | rgb3); 1148 | } 1149 | -------------------------------------------------------------------------------- /cuda_rasterizer/forward.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_FORWARD_H_INCLUDED 13 | #define CUDA_RASTERIZER_FORWARD_H_INCLUDED 14 | 15 | #include <cuda.h> 16 | #include "cuda_runtime.h" 17 | #include "device_launch_parameters.h" 18 | #define GLM_FORCE_CUDA 19 | #include <glm/glm.hpp> 20 | 21 | namespace FORWARD 22 | { 23 | // Perform initial steps for each Gaussian prior to rasterization. 24 | void preprocess(int P, int D, int M, 25 | const float* orig_points, 26 | const glm::vec3* scales, 27 | const float scale_modifier, 28 | const glm::vec4* rotations, 29 | const float* opacities, 30 | const float* shs, 31 | bool* clamped, 32 | const float* cov3D_precomp, 33 | const bool* tile_mask, 34 | const float* colors_precomp, 35 | const float* viewmatrix, 36 | const float* projmatrix, 37 | const glm::vec3* cam_pos, 38 | const int W, int H, 39 | const float focal_x, float focal_y, 40 | const float tan_fovx, float tan_fovy, 41 | int* radii, 42 | float2* points_xy_image, 43 | float* depths, 44 | float* cov3Ds, 45 | float* colors, 46 | float4* conic_opacity, 47 | // float* comp, 48 | const dim3 grid, 49 | uint32_t* tiles_touched, 50 | bool prefiltered); 51 | 52 | // Main rasterization method.
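// Launched with one thread block per image tile (grid) and one thread
// per pixel (block); consumes the per-tile [start, end) ranges and the
// depth-sorted point list produced during binning.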
53 | void render( 54 | const dim3 grid, dim3 block, 55 | const uint2* ranges, 56 | const uint32_t* point_list, 57 | int W, int H, 58 | const float2* points_xy_image, 59 | const float* features, 60 | const float* depths, 61 | const float4* conic_opacity, 62 | float* out_alpha, 63 | uint32_t* n_contrib, 64 | const float* bg_color, 65 | float* out_color, 66 | float* out_depth); 67 | 68 | void fusedPreprocess4D(int P, 69 | const int deg, 70 | const int deg_t, 71 | const int M, 72 | const glm::vec3* means3D, 73 | const float* cov, 74 | const glm::vec3* ms, 75 | const float* cov_t, 76 | const float* opacities, 77 | const float* t1, 78 | const float* shs, 79 | const float* t, 80 | const float* viewmatrix, 81 | const float* projmatrix, 82 | const float* cam_pos, 83 | const float duration, 84 | bool* mask, 85 | float* occ1, 86 | glm::vec3* xyz3, 87 | glm::vec3* rgb3); 88 | 89 | void fusedPreprocess4DSparse(int P, 90 | const int deg, 91 | const int deg_t, 92 | const int M, 93 | const glm::vec3* means3D, 94 | const float* cov, 95 | const glm::vec3* ms, 96 | const float* cov_t, 97 | const float* opacities, 98 | const float* t1, 99 | const glm::vec3* bases, 100 | const float* shs, 101 | const float* t, 102 | const int* inverse, 103 | const float* viewmatrix, 104 | const float* projmatrix, 105 | const float* cam_pos, 106 | const float duration, 107 | bool* mask, 108 | float* occ1, 109 | glm::vec3* xyz3, 110 | glm::vec3* rgb3); 111 | 112 | void computeCov3D(int P, 113 | const glm::vec3* scaling_xyz, 114 | const glm::vec4* rotation_l, 115 | float* cov); 116 | 117 | void computeCov4D(int P, 118 | const glm::vec4* scaling_xyzt, 119 | const glm::vec4* rotation_l, 120 | const glm::vec4* rotation_r, 121 | float* cov, 122 | glm::vec3* ms, 123 | float* cov_t); 124 | 125 | void computeSH4D( 126 | int P, 127 | int deg, int deg_t, int max_coeffs, 128 | const float* sh, 129 | const glm::vec3* dir, 130 | const float* dir_t, 131 | const float time_duration, 132 | glm::vec3* rgb); 133 | } 134 | 135 | 136 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_H_INCLUDED 13 | #define CUDA_RASTERIZER_H_INCLUDED 14 | 15 | #include <vector> 16 | #include <functional> 17 | #include "cuda_fp16.h" 18 | 19 | namespace CudaRasterizer 20 | { 21 | class Rasterizer 22 | { 23 | public: 24 | 25 | static void fusedPreprocess4D( 26 | const int P, 27 | const int deg, 28 | const int deg_t, 29 | const int M, 30 | const float* means3D, 31 | const float* cov, 32 | const float* ms, 33 | const float* cov_t, 34 | const float* opacities, 35 | const float* t1, 36 | const float* sh, 37 | const float* t, 38 | const float* viewmatrix, 39 | const float* projmatrix, 40 | const float* cam_pos, 41 | const float duration, 42 | bool* mask, 43 | float* occ1, 44 | float* xyz3, 45 | float* rgb3); 46 | 47 | static void fusedPreprocess4DSparse( 48 | const int P, 49 | const int deg, 50 | const int deg_t, 51 | const int M, 52 | const float* means3D, 53 | const float* cov, 54 | const float* ms, 55 | const float* cov_t, 56 | const float* opacities, 57 | const float* t1, 58 | const float* base, 59 | const float* sh, 60 | const float* t, 61 | const int* inverse, 62 | const float* viewmatrix, 63 | const float* projmatrix, 64 | const float* cam_pos, 65 | const float duration, 66 | bool* mask, 67 | float* occ1, 68 | float* xyz3, 69 | float* rgb3); 70 | 71 | static void markVisible( 72 | int P, 73 | float* means3D, 74 | float* viewmatrix, 75 | float* projmatrix, 76 | bool* present); 77 | 78 | static void computeCov3D( 79 | int P, 80 | const float* scaling_xyz, 81 | const float* rotation_l, 82 | float* cov); 83 | 84 | static void computeCov3DBackward( 85 | int P, 86 | const float* scaling_xyz, 87 | const float* rotation_l, 88 | const float* dL_dcov, 89 | float* dL_dscaling_xyz, 90 | float* dL_drotation_l); 91 | 92 | static void computeCov4D( 93 | int P, 94 | const float* scaling_xyzt, 95 | const float* rotation_l, 96 | const float* rotation_r, 97 | float* cov, 98 | float* ms, 99 | float* cov_t); 100 | 101 | static void computeSH4D( 102 | int P, 103 | int deg, int deg_t, int max_coeffs, 104 | const float* shs, 105 | const float* dir, 106 | const float* dir_t, 107 | const float time_duration, 108 | float* rgb); 109 | 110 | static void computeSH4DBackward( 111 | int P, 112 | int deg, int deg_t, int max_coeffs, 113 | const float* shs, 114 | const float* dir, 115 | const float* dir_t, 116 | const float time_duration, 117 | const float* dL_drgb, 118 | float* dL_dsh, 119 | float* dL_ddir, 120 | float* dL_ddir_t); 121 | 122 | static void computeCov4DBackward( 123 | int P, 124 | const float* scaling_xyzt, 125 | const float* rotation_l, 126 | const float* rotation_r, 127 | const float* dL_dcov, 128 | const float* dL_dms, 129 | const float* dL_dcov_t, 130 | float* dL_dscaling_xyzt, 131 | float* dL_drotation_l, 132 | float* dL_drotation_r); 133 | 134 | static int forward( 135 | std::function<char* (size_t)> geometryBuffer, 136 | std::function<char* (size_t)> binningBuffer, 137 | std::function<char* (size_t)> imageBuffer, 138 | const int P, int D, int M, 139 | const float* background, 140 | const int width, int height, 141 | const float* means3D, 142 | const float* shs, 143 | const float* colors_precomp, 144 | const float* opacities, 145 | const float* scales, 146 | const float scale_modifier, 147 | const float* rotations, 148 | const float* cov3D_precomp, 149 | const bool* tile_mask, 150 | const float* viewmatrix, 151 | const float* projmatrix, 152 | const float* cam_pos, 153 | const float tan_fovx, const float tan_fovy, 154 | const bool prefiltered, 155 | float* out_color,
156 | float* out_depth, 157 | float* out_alpha, 158 | int* radii = nullptr, 159 | bool debug = false); 160 | 161 | static void backward( 162 | const int P, int D, int M, int R, 163 | const float* background, 164 | const int width, int height, 165 | const float* means3D, 166 | const float* shs, 167 | const float* colors_precomp, 168 | const float* scales, 169 | const float scale_modifier, 170 | const float* rotations, 171 | const float* cov3D_precomp, 172 | const float* viewmatrix, 173 | const float* projmatrix, 174 | const float* campos, 175 | const float tan_fovx, const float tan_fovy, 176 | const int* radii, 177 | char* geom_buffer, 178 | char* binning_buffer, 179 | char* image_buffer, 180 | const float* accum_alphas, 181 | const float* dL_dpix, 182 | const float* dL_dpix_depth, 183 | const float* dL_dpix_alpha, 184 | float* dL_dmean2D, 185 | float* dL_dabsmean2D, 186 | float* dL_dconic, 187 | float* dL_dopacity, 188 | float* dL_dcolor, 189 | float* dL_ddepth, 190 | float* dL_dmean3D, 191 | float* dL_dcov3D, 192 | float* dL_dsh, 193 | float* dL_dscale, 194 | float* dL_drot, 195 | bool debug); 196 | }; 197 | }; 198 | 199 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer_impl.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include "rasterizer_impl.h" 13 | #include <iostream> 14 | #include <fstream> 15 | #include <algorithm> 16 | #include <numeric> 17 | #include <cuda.h> 18 | #include "cuda_runtime.h" 19 | #include "device_launch_parameters.h" 20 | #include <cub/cub.cuh> 21 | #include <cub/device/device_radix_sort.cuh> 22 | #define GLM_FORCE_CUDA 23 | #include <glm/glm.hpp> 24 | 25 | #include <cooperative_groups.h> 26 | #include <cooperative_groups/reduce.h> 27 | namespace cg = cooperative_groups; 28 | 29 | #include "auxiliary.h" 30 | #include "forward.h" 31 | #include "backward.h" 32 | 33 | // Helper function to find, on the CPU, the bit position just above 34 | // the most significant set bit (MSB) of n. 35 | uint32_t getHigherMsb(uint32_t n) 36 | { 37 | uint32_t msb = sizeof(n) * 4; 38 | uint32_t step = msb; 39 | while (step > 1) 40 | { 41 | step /= 2; 42 | if (n >> msb) 43 | msb += step; 44 | else 45 | msb -= step; 46 | } 47 | if (n >> msb) 48 | msb++; 49 | return msb; 50 | } 51 | 52 | // Wrapper method to call auxiliary coarse frustum containment test. 53 | // Mark all Gaussians that pass it. 54 | __global__ void checkFrustum(int P, 55 | const float* orig_points, 56 | const float* viewmatrix, 57 | const float* projmatrix, 58 | bool* present) 59 | { 60 | auto idx = cg::this_grid().thread_rank(); 61 | if (idx >= P) 62 | return; 63 | 64 | float3 p_view; 65 | present[idx] = in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view); 66 | } 67 | 68 | // Generates one key/value pair for all Gaussian / tile overlaps. 69 | // Run once per Gaussian (1:N mapping).
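// The 64-bit key layout is [ tile ID | depth ]: the tile index occupies
// the upper 32 bits and the raw IEEE-754 bits of the positive view-space
// depth the lower 32 bits, so a single radix sort groups instances by
// tile and orders them front-to-back within each tile (positive floats
// compare the same as their bit patterns).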
70 | __global__ void duplicateWithKeys( 71 | int P, 72 | const float2* points_xy, 73 | const float4* conic_opacity, 74 | const float* depths, 75 | const uint32_t* offsets, 76 | uint64_t* gaussian_keys_unsorted, 77 | uint32_t* gaussian_values_unsorted, 78 | int* radii, 79 | dim3 grid, 80 | const bool* tile_mask) 81 | { 82 | auto idx = cg::this_grid().thread_rank(); 83 | if (idx >= P) 84 | return; 85 | 86 | // Generate no key/value pair for invisible Gaussians 87 | if (radii[idx] > 0) 88 | { 89 | // Find this Gaussian's offset in buffer for writing keys/values. 90 | uint32_t off = (idx == 0) ? 0 : offsets[idx - 1]; 91 | uint2 rect_min, rect_max; 92 | 93 | getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid); 94 | 95 | // For each tile that the bounding rect overlaps, emit a 96 | // key/value pair. The key is | tile ID | depth |, 97 | // and the value is the ID of the Gaussian. Sorting the values 98 | // with this key yields Gaussian IDs in a list, such that they 99 | // are first sorted by tile and then by depth. 100 | for (int y = rect_min.y; y < rect_max.y; y++) 101 | { 102 | for (int x = rect_min.x; x < rect_max.x; x++) 103 | { 104 | uint64_t key = y * grid.x + x; 105 | if (tile_mask != nullptr && !tile_mask[key]) 106 | { 107 | continue; 108 | } 109 | else 110 | { 111 | 112 | // constexpr float alpha_threshold = 1.0f / 255.0f; 113 | // const float opacity_power_threshold = log(conic_opacity[idx].w / alpha_threshold); 114 | // glm::vec2 max_pos; 115 | // const glm::vec2 tile_min = {x * BLOCK_X, y * BLOCK_Y}; 116 | // const glm::vec2 tile_max = {(x + 1) * BLOCK_X - 1, (y + 1) * BLOCK_Y - 1}; 117 | // float max_opac_factor = max_contrib_power_rect_gaussian_float(conic_opacity[idx], points_xy[idx], tile_min, tile_max, max_pos); 118 | 119 | // if (max_opac_factor > opacity_power_threshold) { 120 | // continue; 121 | // } 122 | 123 | key <<= 32; 124 | key |= *((uint32_t*)&depths[idx]); 125 | gaussian_keys_unsorted[off] = key; 126 | gaussian_values_unsorted[off] = idx; 127 | off++; 128 | } 129 | } 130 | } 131 | } 132 | } 133 | 134 | // Check keys to see if it is at the start/end of one tile's range in 135 | // the full sorted list. If yes, write start/end of this tile. 136 | // Run once per instanced (duplicated) Gaussian ID. 137 | __global__ void identifyTileRanges(int L, uint64_t* point_list_keys, uint2* ranges) 138 | { 139 | auto idx = cg::this_grid().thread_rank(); 140 | if (idx >= L) 141 | return; 142 | 143 | // Read tile ID from key. Update start/end of tile range if at limit. 
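// E.g. sorted tile IDs [3, 3, 5, 5, 5, 9] produce ranges[3] = {0, 2},
// ranges[5] = {2, 5}, ranges[9] = {5, 6}; tiles that received no
// Gaussians keep the zero-initialized {0, 0} set by the caller.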
144 | uint64_t key = point_list_keys[idx]; 145 | uint32_t currtile = key >> 32; 146 | if (idx == 0) 147 | ranges[currtile].x = 0; 148 | else 149 | { 150 | uint32_t prevtile = point_list_keys[idx - 1] >> 32; 151 | if (currtile != prevtile) 152 | { 153 | ranges[prevtile].y = idx; 154 | ranges[currtile].x = idx; 155 | } 156 | } 157 | if (idx == L - 1) 158 | ranges[currtile].y = L; 159 | } 160 | 161 | void CudaRasterizer::Rasterizer::computeCov3D(int P, 162 | const float* scaling_xyz, 163 | const float* rotation_l, 164 | float* cov) 165 | { 166 | FORWARD::computeCov3D( 167 | P, 168 | (glm::vec3*)scaling_xyz, 169 | (glm::vec4*)rotation_l, 170 | cov); 171 | } 172 | 173 | void CudaRasterizer::Rasterizer::computeCov3DBackward(int P, 174 | const float* scaling_xyz, 175 | const float* rotation_l, 176 | const float* dL_dcov, 177 | float* dL_dscaling_xyz, 178 | float* dL_drotation_l) 179 | { 180 | BACKWARD::computeCov3DBackward( 181 | P, 182 | (glm::vec3*)scaling_xyz, 183 | (glm::vec4*)rotation_l, 184 | dL_dcov, 185 | (glm::vec3*)dL_dscaling_xyz, 186 | (glm::vec4*)dL_drotation_l); 187 | } 188 | 189 | void CudaRasterizer::Rasterizer::computeCov4D(int P, 190 | const float* scaling_xyzt, 191 | const float* rotation_l, 192 | const float* rotation_r, 193 | float* cov, 194 | float* ms, 195 | float* cov_t) 196 | { 197 | FORWARD::computeCov4D( 198 | P, 199 | (glm::vec4*)scaling_xyzt, 200 | (glm::vec4*)rotation_l, 201 | (glm::vec4*)rotation_r, 202 | cov, 203 | (glm::vec3*)ms, 204 | cov_t); 205 | } 206 | 207 | void CudaRasterizer::Rasterizer::computeCov4DBackward(int P, 208 | const float* scaling_xyzt, 209 | const float* rotation_l, 210 | const float* rotation_r, 211 | const float* dL_dcov, 212 | const float* dL_dms, 213 | const float* dL_dcov_t, 214 | float* dL_dscaling_xyzt, 215 | float* dL_drotation_l, 216 | float* dL_drotation_r) 217 | { 218 | BACKWARD::computeCov4DBackward( 219 | P, 220 | (glm::vec4*)scaling_xyzt, 221 | (glm::vec4*)rotation_l, 222 | (glm::vec4*)rotation_r, 223 | dL_dcov, 224 | (glm::vec3*)dL_dms, 225 | dL_dcov_t, 226 | (glm::vec4*)dL_dscaling_xyzt, 227 | (glm::vec4*)dL_drotation_l, 228 | (glm::vec4*)dL_drotation_r); 229 | } 230 | 231 | 232 | void CudaRasterizer::Rasterizer::computeSH4D(int P, 233 | int deg, int deg_t, int max_coeffs, 234 | const float* shs, 235 | const float* dir, 236 | const float* dir_t, 237 | const float time_duration, 238 | float* rgb) 239 | { 240 | FORWARD::computeSH4D( 241 | P, 242 | deg, 243 | deg_t, 244 | max_coeffs, 245 | shs, 246 | (glm::vec3*)dir, 247 | dir_t, 248 | time_duration, 249 | (glm::vec3*)rgb 250 | ); 251 | } 252 | 253 | void CudaRasterizer::Rasterizer::computeSH4DBackward( 254 | int P, 255 | int deg, int deg_t, int max_coeffs, 256 | const float* shs, 257 | const float* dir, 258 | const float* dir_t, 259 | const float time_duration, 260 | const float* dL_drgb, 261 | float* dL_dsh, 262 | float* dL_ddir, 263 | float* dL_ddir_t 264 | ) 265 | { 266 | BACKWARD::computeSH4DBackward( 267 | P, 268 | deg, 269 | deg_t, 270 | max_coeffs, 271 | shs, 272 | (glm::vec3*)dir, 273 | dir_t, 274 | time_duration, 275 | (glm::vec3*)dL_drgb, 276 | dL_dsh, 277 | (glm::vec3*)dL_ddir, 278 | dL_ddir_t 279 | ); 280 | } 281 | 282 | // Marginalization & color computation 283 | void CudaRasterizer::Rasterizer::fusedPreprocess4DSparse( 284 | const int P, 285 | const int deg, 286 | const int deg_t, 287 | const int M, 288 | const float* means3D, 289 | const float* cov, 290 | const float* ms, 291 | const float* cov_t, 292 | const float* opacities, 293 | const float* t1, 294 | const 
float* base, 295 | const float* sh, 296 | const float* t, 297 | const int* inverse, 298 | const float* viewmatrix, 299 | const float* projmatrix, 300 | const float* cam_pos, 301 | const float duration, 302 | bool* mask, 303 | float* occ1, 304 | float* xyz3, 305 | float* rgb3) 306 | { 307 | FORWARD::fusedPreprocess4DSparse( 308 | P, 309 | deg, 310 | deg_t, 311 | M, 312 | (glm::vec3*)means3D, 313 | cov, 314 | (glm::vec3*)ms, 315 | cov_t, 316 | opacities, 317 | t1, 318 | (glm::vec3*)base, 319 | sh, 320 | t, 321 | inverse, 322 | viewmatrix, 323 | projmatrix, 324 | cam_pos, 325 | duration, 326 | mask, 327 | occ1, 328 | (glm::vec3*)xyz3, 329 | (glm::vec3*)rgb3); 330 | } 331 | 332 | // Marginalization & color computation 333 | void CudaRasterizer::Rasterizer::fusedPreprocess4D( 334 | const int P, 335 | const int deg, 336 | const int deg_t, 337 | const int M, 338 | const float* means3D, 339 | const float* cov, 340 | const float* ms, 341 | const float* cov_t, 342 | const float* opacities, 343 | const float* t1, 344 | const float* sh, 345 | const float* t, 346 | const float* viewmatrix, 347 | const float* projmatrix, 348 | const float* cam_pos, 349 | const float duration, 350 | bool* mask, 351 | float* occ1, 352 | float* xyz3, 353 | float* rgb3) 354 | { 355 | FORWARD::fusedPreprocess4D( 356 | P, 357 | deg, 358 | deg_t, 359 | M, 360 | (glm::vec3*)means3D, 361 | cov, 362 | (glm::vec3*)ms, 363 | cov_t, 364 | opacities, 365 | t1, 366 | sh, 367 | t, 368 | viewmatrix, 369 | projmatrix, 370 | cam_pos, 371 | duration, 372 | mask, 373 | occ1, 374 | (glm::vec3*)xyz3, 375 | (glm::vec3*)rgb3); 376 | } 377 | 378 | // Mark Gaussians as visible/invisible, based on view frustum testing 379 | void CudaRasterizer::Rasterizer::markVisible( 380 | int P, 381 | float* means3D, 382 | float* viewmatrix, 383 | float* projmatrix, 384 | bool* present) 385 | { 386 | checkFrustum << <(P + 255) / 256, 256 >> > ( 387 | P, 388 | means3D, 389 | viewmatrix, projmatrix, 390 | present); 391 | } 392 | 393 | CudaRasterizer::GeometryState CudaRasterizer::GeometryState::fromChunk(char*& chunk, size_t P) 394 | { 395 | GeometryState geom; 396 | obtain(chunk, geom.depths, P, 128); 397 | obtain(chunk, geom.clamped, P * 3, 128); 398 | obtain(chunk, geom.internal_radii, P, 128); 399 | obtain(chunk, geom.means2D, P, 128); 400 | obtain(chunk, geom.cov3D, P * 6, 128); 401 | obtain(chunk, geom.conic_opacity, P, 128); 402 | obtain(chunk, geom.rgb, P * 3, 128); 403 | obtain(chunk, geom.tiles_touched, P, 128); 404 | obtain(chunk, geom.point_offsets, P, 128); 405 | cub::DeviceScan::InclusiveSum(nullptr, geom.scan_size, geom.tiles_touched, geom.tiles_touched, P); 406 | obtain(chunk, geom.scanning_space, geom.scan_size, 128); 407 | return geom; 408 | } 409 | 410 | CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N, size_t M) 411 | { 412 | ImageState img; 413 | obtain(chunk, img.n_contrib, N, 128); 414 | obtain(chunk, img.ranges, M, 128); 415 | return img; 416 | } 417 | 418 | CudaRasterizer::BinningState CudaRasterizer::BinningState::fromChunk(char*& chunk, size_t P) 419 | { 420 | BinningState binning; 421 | obtain(chunk, binning.point_list, P, 128); 422 | obtain(chunk, binning.point_list_unsorted, P, 128); 423 | obtain(chunk, binning.point_list_keys, P, 128); 424 | obtain(chunk, binning.point_list_keys_unsorted, P, 128); 425 | cub::DeviceRadixSort::SortPairs( 426 | nullptr, binning.sorting_size, 427 | binning.point_list_keys_unsorted, binning.point_list_keys, 428 | binning.point_list_unsorted, binning.point_list, P); 
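// Note the cub::DeviceRadixSort::SortPairs call above passes nullptr as
// temporary storage: CUB then only writes the required scratch size into
// binning.sorting_size. That scratch buffer is carved out of the chunk
// below, and the actual sort runs later in forward().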
429 | obtain(chunk, binning.list_sorting_space, binning.sorting_size, 128); 430 | return binning; 431 | } 432 | 433 | // Forward rendering procedure for differentiable rasterization 434 | // of Gaussians. 435 | int CudaRasterizer::Rasterizer::forward( 436 | std::function<char* (size_t)> geometryBuffer, 437 | std::function<char* (size_t)> binningBuffer, 438 | std::function<char* (size_t)> imageBuffer, 439 | const int P, int D, int M, 440 | const float* background, 441 | const int width, int height, 442 | const float* means3D, 443 | const float* shs, 444 | const float* colors_precomp, 445 | const float* opacities, 446 | const float* scales, 447 | const float scale_modifier, 448 | const float* rotations, 449 | const float* cov3D_precomp, 450 | const bool* tile_mask, 451 | const float* viewmatrix, 452 | const float* projmatrix, 453 | const float* cam_pos, 454 | const float tan_fovx, const float tan_fovy, 455 | const bool prefiltered, 456 | float* out_color, 457 | float* out_depth, 458 | float* out_alpha, 459 | int* radii, 460 | bool debug) 461 | { 462 | const float focal_y = height / (2.0f * tan_fovy); 463 | const float focal_x = width / (2.0f * tan_fovx); 464 | 465 | size_t chunk_size = required<GeometryState>(P); 466 | char* chunkptr = geometryBuffer(chunk_size); // memory allocation 467 | GeometryState geomState = GeometryState::fromChunk(chunkptr, P); 468 | 469 | if (radii == nullptr) 470 | { 471 | radii = geomState.internal_radii; 472 | } 473 | 474 | const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); 475 | dim3 block(BLOCK_X, BLOCK_Y, 1); 476 | 477 | // Dynamically resize image-based auxiliary buffers during training 478 | size_t img_chunk_size = required<ImageState>(width * height, tile_grid.x * tile_grid.y); 479 | char* img_chunkptr = imageBuffer(img_chunk_size); // memory allocation 480 | ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height, tile_grid.x * tile_grid.y); 481 | 482 | if (NUM_CHANNELS != 3 && colors_precomp == nullptr) 483 | { 484 | throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!"); 485 | } 486 | 487 | // Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB) 488 | CHECK_CUDA(FORWARD::preprocess( 489 | P, D, M, 490 | means3D, 491 | (glm::vec3*)scales, 492 | scale_modifier, 493 | (glm::vec4*)rotations, 494 | opacities, 495 | shs, 496 | geomState.clamped, 497 | cov3D_precomp, 498 | tile_mask, 499 | colors_precomp, 500 | viewmatrix, projmatrix, 501 | (glm::vec3*)cam_pos, 502 | width, height, 503 | focal_x, focal_y, 504 | tan_fovx, tan_fovy, 505 | radii, 506 | geomState.means2D, 507 | geomState.depths, 508 | geomState.cov3D, 509 | geomState.rgb, 510 | geomState.conic_opacity, 511 | tile_grid, 512 | geomState.tiles_touched, 513 | prefiltered 514 | ), debug) 515 | 516 | // Compute prefix sum over full list of touched tile counts by Gaussians 517 | // E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8] 518 | CHECK_CUDA(cub::DeviceScan::InclusiveSum(geomState.scanning_space, geomState.scan_size, geomState.tiles_touched, geomState.point_offsets, P), debug) 519 | 520 | // Retrieve total number of Gaussian instances to launch and resize aux buffers 521 | int num_rendered; 522 | CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1, sizeof(int), cudaMemcpyDeviceToHost), debug); 523 | 524 | size_t binning_chunk_size = required<BinningState>(num_rendered); 525 | char* binning_chunkptr = binningBuffer(binning_chunk_size); // memory allocation 526 | BinningState binState = BinningState::fromChunk(binning_chunkptr, num_rendered); 527 | 528 | // For each
instance to be rendered, produce adequate [ tile | depth ] key 529 | // and corresponding dublicated Gaussian indices to be sorted 530 | duplicateWithKeys << <(P + 255) / 256, 256 >> > ( 531 | P, 532 | geomState.means2D, 533 | geomState.conic_opacity, 534 | geomState.depths, 535 | geomState.point_offsets, 536 | binState.point_list_keys_unsorted, 537 | binState.point_list_unsorted, 538 | radii, 539 | tile_grid, 540 | tile_mask) 541 | CHECK_CUDA(, debug) 542 | 543 | int bit = getHigherMsb(tile_grid.x * tile_grid.y); 544 | 545 | // Sort complete list of (duplicated) Gaussian indices by keys 546 | CHECK_CUDA(cub::DeviceRadixSort::SortPairs( 547 | binState.list_sorting_space, 548 | binState.sorting_size, 549 | binState.point_list_keys_unsorted, binState.point_list_keys, 550 | binState.point_list_unsorted, binState.point_list, 551 | num_rendered, 0, 32 + bit), debug) 552 | 553 | CHECK_CUDA(cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)), debug); 554 | 555 | // Identify start and end of per-tile workloads in sorted list 556 | if (num_rendered > 0) 557 | identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > ( 558 | num_rendered, 559 | binState.point_list_keys, 560 | imgState.ranges); 561 | CHECK_CUDA(, debug) 562 | 563 | // Let each tile blend its range of Gaussians independently in parallel 564 | const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : geomState.rgb; 565 | CHECK_CUDA(FORWARD::render( 566 | tile_grid, 567 | block, 568 | imgState.ranges, 569 | binState.point_list, 570 | width, height, 571 | geomState.means2D, 572 | feature_ptr, 573 | geomState.depths, 574 | geomState.conic_opacity, 575 | out_alpha, 576 | imgState.n_contrib, 577 | background, 578 | out_color, 579 | out_depth), debug) 580 | 581 | return num_rendered; 582 | } 583 | 584 | // Produce necessary gradients for optimization, corresponding 585 | // to forward render pass 586 | void CudaRasterizer::Rasterizer::backward( 587 | const int P, int D, int M, int R, 588 | const float* background, 589 | const int width, int height, 590 | const float* means3D, 591 | const float* shs, 592 | const float* colors_precomp, 593 | const float* scales, 594 | const float scale_modifier, 595 | const float* rotations, 596 | const float* cov3D_precomp, 597 | const float* viewmatrix, 598 | const float* projmatrix, 599 | const float* campos, 600 | const float tan_fovx, const float tan_fovy, 601 | const int* radii, 602 | char* geom_buffer, 603 | char* binning_buffer, 604 | char* img_buffer, 605 | const float* accum_alphas, 606 | const float* dL_dpix, 607 | const float* dL_dpix_depth, 608 | const float* dL_dpix_alpha, 609 | float* dL_dmean2D, 610 | float* dL_dabsmean2D, 611 | float* dL_dconic, 612 | float* dL_dopacity, 613 | float* dL_dcolor, 614 | float* dL_ddepth, 615 | float* dL_dmean3D, 616 | float* dL_dcov3D, 617 | float* dL_dsh, 618 | float* dL_dscale, 619 | float* dL_drot, 620 | bool debug) 621 | { 622 | 623 | const float focal_y = height / (2.0f * tan_fovy); 624 | const float focal_x = width / (2.0f * tan_fovx); 625 | 626 | const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); 627 | const dim3 block(BLOCK_X, BLOCK_Y, 1); 628 | 629 | GeometryState geomState = GeometryState::fromChunk(geom_buffer, P); 630 | BinningState binState = BinningState::fromChunk(binning_buffer, R); 631 | ImageState imgState = ImageState::fromChunk(img_buffer, width * height, tile_grid.x * tile_grid.y); 632 | 633 | if (radii == nullptr) 634 | { 635 | radii = geomState.internal_radii; 
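// The caller kept no radius buffer for the backward pass, so fall back
// to the per-Gaussian radii cached in the geometry state by forward().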
636 | } 637 | 638 | // Compute loss gradients w.r.t. 2D mean position, conic matrix, 639 | // opacity and RGB of Gaussians from per-pixel loss gradients. 640 | // If we were given precomputed colors and not SHs, use them. 641 | const float* color_ptr = (colors_precomp != nullptr) ? colors_precomp : geomState.rgb; 642 | CHECK_CUDA(BACKWARD::render( 643 | tile_grid, 644 | block, 645 | imgState.ranges, 646 | binState.point_list, 647 | width, height, 648 | background, 649 | geomState.means2D, 650 | geomState.conic_opacity, 651 | color_ptr, 652 | geomState.depths, 653 | accum_alphas, 654 | imgState.n_contrib, 655 | dL_dpix, 656 | dL_dpix_depth, 657 | dL_dpix_alpha, 658 | (float3*)dL_dmean2D, 659 | (float3*)dL_dabsmean2D, 660 | (float4*)dL_dconic, 661 | dL_dopacity, 662 | dL_dcolor, 663 | dL_ddepth), debug) 664 | 665 | // Take care of the rest of preprocessing. Was the precomputed covariance 666 | // given to us or a scales/rot pair? If precomputed, pass that. If not, 667 | // use the one we computed ourselves. 668 | const float* cov3D_ptr = (cov3D_precomp != nullptr) ? cov3D_precomp : geomState.cov3D; 669 | CHECK_CUDA(BACKWARD::preprocess(P, D, M, 670 | (float3*)means3D, 671 | radii, 672 | shs, 673 | geomState.clamped, 674 | (glm::vec3*)scales, 675 | (glm::vec4*)rotations, 676 | scale_modifier, 677 | cov3D_ptr, 678 | viewmatrix, 679 | projmatrix, 680 | focal_x, focal_y, 681 | tan_fovx, tan_fovy, 682 | (glm::vec3*)campos, 683 | (float3*)dL_dmean2D, 684 | dL_dconic, 685 | (glm::vec3*)dL_dmean3D, 686 | dL_dcolor, 687 | dL_ddepth, 688 | dL_dcov3D, 689 | dL_dsh, 690 | (glm::vec3*)dL_dscale, 691 | (glm::vec4*)dL_drot), debug) 692 | } -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer_impl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #pragma once 13 | 14 | #include <iostream> 15 | #include <vector> 16 | #include <cstdint> 17 | #include "rasterizer.h" 18 | // #include "auxiliary_half.h" 19 | #include <cuda_runtime_api.h> 20 | 21 | namespace CudaRasterizer 22 | { 23 | template <typename T> 24 | static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment) 25 | { 26 | std::size_t offset = (reinterpret_cast<std::uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1); 27 | ptr = reinterpret_cast<T*>(offset); 28 | chunk = reinterpret_cast<char*>(ptr + count); 29 | } 30 | 31 | struct GeometryState 32 | { 33 | float* depths; 34 | bool* clamped; 35 | int* internal_radii; 36 | float2* means2D; 37 | float* cov3D; 38 | float4* conic_opacity; 39 | float* rgb; 40 | uint32_t* tiles_touched; 41 | uint32_t* point_offsets; 42 | char* scanning_space; 43 | size_t scan_size; 44 | 45 | static GeometryState fromChunk(char*& chunk, size_t P); 46 | }; 47 | 48 | struct ImageState 49 | { 50 | uint32_t* n_contrib; 51 | uint2* ranges; 52 | 53 | static ImageState fromChunk(char*& chunk, size_t N, size_t M); 54 | }; 55 | 56 | struct BinningState 57 | { 58 | uint32_t* point_list; 59 | uint32_t* point_list_unsorted; 60 | uint64_t* point_list_keys; 61 | uint64_t* point_list_keys_unsorted; 62 | char* list_sorting_space; 63 | size_t sorting_size; 64 | 65 | static BinningState fromChunk(char*& chunk, size_t P); 66 | }; 67 | 68 | template <typename T> 69 | size_t required(size_t P) 70 | { 71 | char* size = nullptr; 72 | T::fromChunk(size, P); 73 | return ((size_t)size) + 128; 74 | } 75 | 76 | template <typename T> 77 | size_t required(size_t P, size_t N) 78 | { 79 | char* size = nullptr; 80 | T::fromChunk(size, P, N); 81 | return ((size_t)size) + 128; 82 | } 83 | }; 84 | -------------------------------------------------------------------------------- /diff_gauss/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | from typing import NamedTuple 13 | import torch.nn as nn 14 | import torch 15 | from . 
import _C 16 | 17 | BLOCK_X = 16 18 | BLOCK_Y = 16 19 | 20 | 21 | def fused_preprocess_4d_sparse(xyz3, cov6, ms3, cov_t1, occ1, t1, base, feat, t, inverse, world_view_transform, full_proj_transform, cam_pos, deg, deg_t, duration): 22 | # Mark visible points (based on frustum culling for camera) with a boolean 23 | with torch.no_grad(): 24 | mask, occ1, xyz3, rgb3 = _C.fused_preprocess_4d_sparse(xyz3, cov6, ms3, cov_t1, occ1, t1, base, feat, t, inverse, world_view_transform, full_proj_transform, cam_pos, deg, deg_t, duration) 25 | 26 | return mask, occ1, xyz3, rgb3 # mask and output 27 | 28 | def fused_preprocess_4d(xyz3, cov6, ms3, cov_t1, occ1, t1, feat, t, world_view_transform, full_proj_transform, cam_pos, deg, deg_t, duration): 29 | # Mark visible points (based on frustum culling for camera) with a boolean 30 | with torch.no_grad(): 31 | mask, occ1, xyz3, rgb3 = _C.fused_preprocess_4d(xyz3, cov6, ms3, cov_t1, occ1, t1, feat, t, world_view_transform, full_proj_transform, cam_pos, deg, deg_t, duration) 32 | 33 | return mask, occ1, xyz3, rgb3 # mask and output 34 | 35 | 36 | def cpu_deep_copy_tuple(input_tuple): 37 | copied_tensors = [item.cpu().clone() if isinstance(item, torch.Tensor) else item for item in input_tuple] 38 | return tuple(copied_tensors) 39 | 40 | 41 | def rasterize_gaussians( 42 | means3D, 43 | means2D, 44 | sh, 45 | colors_precomp, 46 | opacities, 47 | scales, 48 | rotations, 49 | cov3Ds_precomp, 50 | tile_mask, 51 | raster_settings, 52 | ): 53 | return _RasterizeGaussians.apply( 54 | means3D, 55 | means2D, 56 | sh, 57 | colors_precomp, 58 | opacities, 59 | scales, 60 | rotations, 61 | cov3Ds_precomp, 62 | tile_mask, 63 | raster_settings, 64 | ) 65 | 66 | 67 | class _RasterizeGaussians(torch.autograd.Function): 68 | @staticmethod 69 | def forward( 70 | ctx, 71 | means3D, 72 | means2D, 73 | sh, 74 | colors_precomp, 75 | opacities, 76 | scales, 77 | rotations, 78 | cov3Ds_precomp, 79 | tile_mask, 80 | raster_settings, 81 | ): 82 | 83 | # Restructure arguments the way that the C++ lib expects them 84 | args = ( 85 | raster_settings.bg, 86 | means3D, 87 | colors_precomp, 88 | opacities, 89 | scales, 90 | rotations, 91 | raster_settings.scale_modifier, 92 | cov3Ds_precomp, 93 | tile_mask, 94 | raster_settings.viewmatrix, 95 | raster_settings.projmatrix, 96 | raster_settings.tanfovx, 97 | raster_settings.tanfovy, 98 | raster_settings.image_height, 99 | raster_settings.image_width, 100 | sh, 101 | raster_settings.sh_degree, 102 | raster_settings.campos, 103 | raster_settings.prefiltered, 104 | raster_settings.debug 105 | ) 106 | 107 | # Invoke C++/CUDA rasterizer 108 | if raster_settings.debug: 109 | cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted 110 | try: 111 | num_rendered, color, depth, alpha, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args) 112 | except Exception as ex: 113 | torch.save(cpu_args, "snapshot_fw.dump") 114 | print("\nAn error occured in forward. 
Please forward snapshot_fw.dump for debugging.") 115 | raise ex 116 | else: 117 | num_rendered, color, depth, alpha, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args) 118 | # Keep relevant tensors for backward 119 | ctx.raster_settings = raster_settings 120 | ctx.num_rendered = num_rendered 121 | ctx.save_for_backward(colors_precomp, means3D, scales, rotations, cov3Ds_precomp, tile_mask, radii, sh, geomBuffer, binningBuffer, imgBuffer, alpha, means2D) 122 | return color, depth, alpha, radii 123 | 124 | @staticmethod 125 | def backward(ctx, grad_out_color, grad_out_depth, grad_out_alpha, _): 126 | 127 | # Restore necessary values from context 128 | num_rendered = ctx.num_rendered 129 | raster_settings = ctx.raster_settings 130 | colors_precomp, means3D, scales, rotations, cov3Ds_precomp, tile_mask, radii, sh, geomBuffer, binningBuffer, imgBuffer, alpha, means2D = ctx.saved_tensors 131 | 132 | # Restructure args as C++ method expects them 133 | args = (raster_settings.bg, 134 | means3D, 135 | radii, 136 | colors_precomp, 137 | scales, 138 | rotations, 139 | raster_settings.scale_modifier, 140 | cov3Ds_precomp, 141 | raster_settings.viewmatrix, 142 | raster_settings.projmatrix, 143 | raster_settings.tanfovx, 144 | raster_settings.tanfovy, 145 | grad_out_color, 146 | grad_out_depth, 147 | grad_out_alpha, 148 | sh, 149 | raster_settings.sh_degree, 150 | raster_settings.campos, 151 | geomBuffer, 152 | num_rendered, 153 | binningBuffer, 154 | imgBuffer, 155 | alpha, 156 | raster_settings.debug) 157 | 158 | # Compute gradients for relevant tensors by invoking backward method 159 | if raster_settings.debug: 160 | cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted 161 | try: 162 | absgrad_means2D, grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations = _C.rasterize_gaussians_backward(*args) 163 | except Exception as ex: 164 | torch.save(cpu_args, "snapshot_bw.dump") 165 | print("\nAn error occured in backward. 
Writing snapshot_bw.dump for debugging.\n") 166 | raise ex 167 | else: 168 | absgrad_means2D, grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations = _C.rasterize_gaussians_backward(*args) 169 | 170 | grads = ( 171 | grad_means3D, 172 | grad_means2D, 173 | grad_sh, 174 | grad_colors_precomp, 175 | grad_opacities, 176 | grad_scales, 177 | grad_rotations, 178 | grad_cov3Ds_precomp, 179 | None, 180 | None, 181 | ) 182 | 183 | means2D.absgrad = absgrad_means2D # let the user select their grad 184 | 185 | return grads 186 | 187 | 188 | class GaussianRasterizationSettings(NamedTuple): 189 | image_height: int 190 | image_width: int 191 | tanfovx: float 192 | tanfovy: float 193 | bg: torch.Tensor 194 | scale_modifier: float 195 | viewmatrix: torch.Tensor 196 | projmatrix: torch.Tensor 197 | sh_degree: int 198 | campos: torch.Tensor 199 | prefiltered: bool 200 | debug: bool 201 | 202 | 203 | class GaussianRasterizer(nn.Module): 204 | def __init__(self, raster_settings): 205 | super().__init__() 206 | self.raster_settings = raster_settings 207 | 208 | def markVisible(self, positions): 209 | # Mark visible points (based on frustum culling for camera) with a boolean 210 | with torch.no_grad(): 211 | raster_settings = self.raster_settings 212 | visible = _C.mark_visible( 213 | positions, 214 | raster_settings.viewmatrix, 215 | raster_settings.projmatrix) 216 | 217 | return visible 218 | 219 | def forward(self, means3D, means2D, opacities, shs=None, colors_precomp=None, scales=None, rotations=None, cov3D_precomp=None, tile_mask=None): 220 | 221 | raster_settings = self.raster_settings 222 | 223 | if (shs is None and colors_precomp is None) or (shs is not None and colors_precomp is not None): 224 | raise Exception('Please provide excatly one of either SHs or precomputed colors!') 225 | 226 | if ((scales is None or rotations is None) and cov3D_precomp is None) or ((scales is not None or rotations is not None) and cov3D_precomp is not None): 227 | raise Exception('Please provide exactly one of either scale/rotation pair or precomputed 3D covariance!') 228 | 229 | if shs is None: 230 | shs = torch.Tensor([]) 231 | if colors_precomp is None: 232 | colors_precomp = torch.Tensor([]) 233 | 234 | if scales is None: 235 | scales = torch.Tensor([]) 236 | if rotations is None: 237 | rotations = torch.Tensor([]) 238 | if cov3D_precomp is None: 239 | cov3D_precomp = torch.Tensor([]) 240 | if tile_mask is None: 241 | tile_mask = torch.Tensor([]).bool() 242 | # TODO: in sampler `typed` will change the type of the tensor 243 | if tile_mask.dtype != torch.bool: 244 | tile_mask = tile_mask.bool() 245 | 246 | # Invoke C++/CUDA rasterization routine 247 | return rasterize_gaussians( 248 | means3D, 249 | means2D, 250 | shs, 251 | colors_precomp, 252 | opacities, 253 | scales, 254 | rotations, 255 | cov3D_precomp, 256 | tile_mask, 257 | raster_settings, 258 | ) 259 | 260 | 261 | def mark_visible(positions: torch.Tensor, viewmatrix: torch.Tensor, projmatrix: torch.Tensor): 262 | # Mark visible points (based on frustum culling for camera) with a boolean 263 | with torch.no_grad(): 264 | visible = _C.mark_visible( 265 | positions, 266 | viewmatrix, 267 | projmatrix) 268 | 269 | return visible 270 | 271 | def compute_cov_3d(scaling_xyz: torch.Tensor, rotation_l: torch.Tensor): 272 | return _ComputeCov3D.apply( 273 | scaling_xyz, 274 | rotation_l) 275 | 276 | class _ComputeCov3D(torch.autograd.Function): 277 | @staticmethod 278 | def forward(ctx, scaling_xyz, 
rotation_l): 279 | cov = _C.compute_cov_3d(scaling_xyz, rotation_l) 280 | ctx.save_for_backward(scaling_xyz, rotation_l) 281 | return cov 282 | 283 | @staticmethod 284 | def backward(ctx, grad_out_cov): 285 | scaling_xyz, rotation_l = ctx.saved_tensors 286 | grad_scaling_xyz, grad_rotation_l = _C.compute_cov_3d_backward(scaling_xyz, rotation_l, grad_out_cov) 287 | return grad_scaling_xyz, grad_rotation_l 288 | 289 | def compute_cov_4d(scaling_xyzt: torch.Tensor, rotation_l: torch.Tensor, rotation_r: torch.Tensor): 290 | return _ComputeCov4D.apply( 291 | scaling_xyzt, 292 | rotation_l, 293 | rotation_r) 294 | 295 | 296 | class _ComputeCov4D(torch.autograd.Function): 297 | @staticmethod 298 | def forward( 299 | ctx, 300 | scaling_xyzt, 301 | rotation_l, 302 | rotation_r 303 | ): 304 | cov, ms, cov_t = _C.compute_cov_4d(scaling_xyzt, rotation_l, rotation_r) 305 | ctx.save_for_backward(scaling_xyzt, rotation_l, rotation_r) 306 | return cov, ms, cov_t 307 | 308 | @staticmethod 309 | def backward(ctx, grad_out_cov, grad_out_ms, grad_out_cov_t): 310 | 311 | # Restore necessary values from context 312 | scaling_xyzt, rotation_l, rotation_r = ctx.saved_tensors 313 | 314 | # Restructure args as C++ method expects them 315 | grad_scaling_xyzt, grad_rotation_l, grad_rotation_r = _C.compute_cov_4d_backward( 316 | scaling_xyzt, 317 | rotation_l, 318 | rotation_r, 319 | grad_out_cov, 320 | grad_out_ms, 321 | grad_out_cov_t, 322 | ) 323 | 324 | grads = ( 325 | grad_scaling_xyzt, 326 | grad_rotation_l, 327 | grad_rotation_r, 328 | ) 329 | 330 | return grads 331 | 332 | 333 | def compute_sh_4d(deg: int, deg_t: int, sh: torch.Tensor, dir: torch.Tensor = None, dir_t: torch.Tensor = None, l: float = None): 334 | if dir is None: 335 | dir = torch.Tensor([]) 336 | if dir_t is None: 337 | dir_t = torch.Tensor([]) 338 | if l is None: 339 | l = 0.0 340 | return _ComputeSH4D.apply( 341 | deg, 342 | deg_t, 343 | sh, 344 | dir, 345 | dir_t, 346 | l) 347 | 348 | 349 | class _ComputeSH4D(torch.autograd.Function): 350 | @staticmethod 351 | def forward( 352 | ctx, 353 | deg, 354 | deg_t, 355 | sh, 356 | dir, 357 | dir_t, 358 | l 359 | ): 360 | rgb = _C.compute_sh_4d(deg, deg_t, sh, dir, dir_t, l) 361 | ctx.deg = deg 362 | ctx.deg_t = deg_t 363 | ctx.l = l 364 | ctx.save_for_backward(sh, dir, dir_t) 365 | return rgb 366 | 367 | @staticmethod 368 | def backward(ctx, grad_out_rgb): 369 | 370 | # Restore necessary values from context 371 | deg = ctx.deg 372 | deg_t = ctx.deg_t 373 | l = ctx.l 374 | sh, dir, dir_t = ctx.saved_tensors 375 | 376 | # Restructure args as C++ method expects them 377 | grad_sh, grad_dir, grad_dir_t = _C.compute_sh_4d_backward( 378 | deg, deg_t, sh, dir, dir_t, l, 379 | grad_out_rgb, 380 | ) 381 | 382 | grads = ( 383 | None, 384 | None, 385 | grad_sh, 386 | grad_dir, 387 | grad_dir_t, 388 | None, 389 | ) 390 | 391 | return grads 392 | 393 | 394 | def align_with(p: int, a: int = 128): 395 | p = (p + a - 1) // a * a 396 | return p 397 | 398 | 399 | def interpret_geomBuffer(geomBuffer: torch.Tensor, N: int): 400 | # N: Number of points rendered 401 | ptr = geomBuffer.data_ptr() 402 | p = align_with(ptr, 128) - ptr 403 | 404 | off = 4 * N 405 | depths = geomBuffer[p:p + off].view(torch.float) 406 | p = align_with(p + off, 128) 407 | 408 | off = 3 * N 409 | clamped = geomBuffer[p:p + off].view(torch.bool).view(N, 3) 410 | p = align_with(p + off, 128) 411 | 412 | off = 4 * N 413 | internal_radii = geomBuffer[p:p + off].view(torch.int) 414 | p = align_with(p + off, 128) 415 | 416 | off = 2 * 4 * N 417 | 
means2D = geomBuffer[p:p + off].view(torch.float).view(N, 2) 418 | p = align_with(p + off, 128) 419 | 420 | off = 6 * 4 * N 421 | cov3D = geomBuffer[p:p + off].view(torch.float).view(N, 6) 422 | p = align_with(p + off, 128) 423 | 424 | off = 4 * 4 * N 425 | conic_opacity = geomBuffer[p:p + off].view(torch.float).view(N, 4) 426 | p = align_with(p + off, 128) 427 | 428 | off = 3 * 4 * N 429 | rgb = geomBuffer[p:p + off].view(torch.float).view(N, 3) 430 | p = align_with(p + off, 128) 431 | 432 | off = 4 * N 433 | tiles_touched = geomBuffer[p:p + off].view(torch.int) 434 | p = align_with(p + off, 128) 435 | 436 | off = 4 * N 437 | point_offsets = geomBuffer[p:p + off].view(torch.int) 438 | 439 | return dict( 440 | depths=depths, 441 | clamped=clamped, 442 | internal_radii=internal_radii, 443 | means2D=means2D, 444 | cov3D=cov3D, 445 | conic_opacity=conic_opacity, 446 | rgb=rgb, 447 | tiles_touched=tiles_touched, 448 | point_offsets=point_offsets 449 | ) 450 | 451 | 452 | def interpret_binningBuffer(binningBuffer: torch.Tensor, N: int): 453 | # N: Number of tile-gaussian pairs 454 | ptr = binningBuffer.data_ptr() 455 | p = align_with(ptr, 128) - ptr 456 | 457 | off = 4 * N 458 | point_list = binningBuffer[p:p + off].view(torch.int) 459 | p = align_with(p + off, 128) 460 | 461 | off = 4 * N 462 | point_list_unsorted = binningBuffer[p:p + off].view(torch.int) 463 | p = align_with(p + off, 128) 464 | 465 | off = 8 * N 466 | point_list_keys = binningBuffer[p:p + off].view(torch.long) 467 | p = align_with(p + off, 128) 468 | 469 | off = 8 * N 470 | point_list_keys_unsorted = binningBuffer[p:p + off].view(torch.long) 471 | p = align_with(p + off, 128) 472 | 473 | return dict( 474 | point_list=point_list, 475 | point_list_unsorted=point_list_unsorted, 476 | point_list_keys=point_list_keys, 477 | point_list_keys_unsorted=point_list_keys_unsorted, 478 | 479 | # Little Endian 480 | depths=point_list_keys.view(torch.float).view(N, 2)[:, 0], 481 | tile_ids=point_list_keys.view(torch.int).view(N, 2)[:, 1], 482 | ) 483 | 484 | 485 | def interpret_imgBuffer(imgBuffer: torch.Tensor, N: int, M: int): 486 | # N: Number of pixels 487 | # M: Number of tiles 488 | ptr = imgBuffer.data_ptr() 489 | p = align_with(ptr, 128) - ptr 490 | 491 | off = 4 * N 492 | n_contrib = imgBuffer[p:p + off].view(torch.int) 493 | p = align_with(p + off, 128) 494 | 495 | off = 2 * 4 * M 496 | ranges = imgBuffer[p:p + off].view(torch.int).view(M, 2) 497 | p = align_with(p + off, 128) 498 | 499 | return dict( 500 | n_contrib=n_contrib, 501 | ranges=ranges 502 | ) 503 | -------------------------------------------------------------------------------- /ext.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include 13 | #include "rasterize_points.h" 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def("rasterize_gaussians", &RasterizeGaussiansCUDA); 17 | m.def("rasterize_gaussians_backward", &RasterizeGaussiansBackwardCUDA); 18 | m.def("mark_visible", &markVisible); 19 | m.def("fused_preprocess_4d", &fusedPreprocess4D); 20 | m.def("fused_preprocess_4d_sparse", &fusedPreprocess4DSparse); 21 | m.def("compute_cov_4d", &computeCov4D); 22 | m.def("compute_cov_4d_backward", &computeCov4DBackward); 23 | m.def("compute_cov_3d", &computeCov3D); 24 | m.def("compute_cov_3d_backward", &computeCov3DBackward); 25 | m.def("compute_sh_4d", &computeSH4D); 26 | m.def("compute_sh_4d_backward", &computeSH4DBackward); 27 | } -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | Gaussian-Splatting License 2 | =========================== 3 | 4 | **Inria** and **the Max Planck Institut for Informatik (MPII)** hold all the ownership rights on the *Software* named **gaussian-splatting**. 5 | The *Software* is in the process of being registered with the Agence pour la Protection des 6 | Programmes (APP). 7 | 8 | The *Software* is still being developed by the *Licensor*. 9 | 10 | *Licensor*'s goal is to allow the research community to use, test and evaluate 11 | the *Software*. 12 | 13 | ## 1. Definitions 14 | 15 | *Licensee* means any person or entity that uses the *Software* and distributes 16 | its *Work*. 17 | 18 | *Licensor* means the owners of the *Software*, i.e Inria and MPII 19 | 20 | *Software* means the original work of authorship made available under this 21 | License ie gaussian-splatting. 22 | 23 | *Work* means the *Software* and any additions to or derivative works of the 24 | *Software* that are made available under this License. 25 | 26 | 27 | ## 2. Purpose 28 | This license is intended to define the rights granted to the *Licensee* by 29 | Licensors under the *Software*. 30 | 31 | ## 3. Rights granted 32 | 33 | For the above reasons Licensors have decided to distribute the *Software*. 34 | Licensors grant non-exclusive rights to use the *Software* for research purposes 35 | to research users (both academic and industrial), free of charge, without right 36 | to sublicense.. The *Software* may be used "non-commercially", i.e., for research 37 | and/or evaluation purposes only. 38 | 39 | Subject to the terms and conditions of this License, you are granted a 40 | non-exclusive, royalty-free, license to reproduce, prepare derivative works of, 41 | publicly display, publicly perform and distribute its *Work* and any resulting 42 | derivative works in any form. 43 | 44 | ## 4. Limitations 45 | 46 | **4.1 Redistribution.** You may reproduce or distribute the *Work* only if (a) you do 47 | so under this License, (b) you include a complete copy of this License with 48 | your distribution, and (c) you retain without modification any copyright, 49 | patent, trademark, or attribution notices that are present in the *Work*. 
50 | 
51 | **4.2 Derivative Works.** You may specify that additional or different terms apply
52 | to the use, reproduction, and distribution of your derivative works of the *Work*
53 | ("Your Terms") only if (a) Your Terms provide that the use limitation in
54 | Section 2 applies to your derivative works, and (b) you identify the specific
55 | derivative works that are subject to Your Terms. Notwithstanding Your Terms,
56 | this License (including the redistribution requirements in Section 3.1) will
57 | continue to apply to the *Work* itself.
58 | 
59 | **4.3** Any other use without prior consent of the Licensors is prohibited. Research
60 | users explicitly acknowledge having received from Licensors all information
61 | allowing them to assess the adequacy of the *Software* for their needs and
62 | to undertake all necessary precautions for its execution and use.
63 | 
64 | **4.4** The *Software* is provided both as a compiled library file and as source
65 | code. When the *Software* is used for a publication or for other results obtained
66 | through its use, users are strongly encouraged to cite the
67 | corresponding publications as explained in the documentation of the *Software*.
68 | 
69 | ## 5. Disclaimer
70 | 
71 | THE USER CANNOT USE, EXPLOIT OR DISTRIBUTE THE *SOFTWARE* FOR COMMERCIAL PURPOSES
72 | WITHOUT PRIOR AND EXPLICIT CONSENT OF LICENSORS. YOU MUST CONTACT INRIA FOR ANY
73 | UNAUTHORIZED USE: stip-sophia.transfert@inria.fr . ANY SUCH ACTION WILL
74 | CONSTITUTE A FORGERY. THIS *SOFTWARE* IS PROVIDED "AS IS" WITHOUT ANY WARRANTIES
75 | OF ANY NATURE AND ANY EXPRESS OR IMPLIED WARRANTIES, WITH REGARDS TO COMMERCIAL
76 | USE, PROFESSIONAL USE, LEGAL OR NOT, OR OTHER, OR COMMERCIALISATION OR
77 | ADAPTATION. UNLESS EXPLICITLY PROVIDED BY LAW, IN NO EVENT, SHALL INRIA OR THE
78 | AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
79 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
80 | GOODS OR SERVICES, LOSS OF USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION)
81 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
82 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING FROM, OUT OF OR
83 | IN CONNECTION WITH THE *SOFTWARE* OR THE USE OR OTHER DEALINGS IN THE *SOFTWARE*.
84 | 
--------------------------------------------------------------------------------
/rasterize_points.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (C) 2023, Inria
3 |  * GRAPHDECO research group, https://team.inria.fr/graphdeco
4 |  * All rights reserved.
5 |  *
6 |  * This software is free for non-commercial, research and evaluation use
7 |  * under the terms of the LICENSE.md file.
8 |  *
9 |  * For inquiries contact  george.drettakis@inria.fr
10 |  */
11 | 
12 | #include <math.h>
13 | #include <torch/extension.h>
14 | #include <cstdio>
15 | #include <sstream>
16 | #include <iostream>
17 | #include <tuple>
18 | #include <stdio.h>
19 | #include <cuda_runtime_api.h>
20 | #include <memory>
21 | #include "cuda_rasterizer/config.h"
22 | #include "cuda_rasterizer/rasterizer.h"
23 | #include <fstream>
24 | #include <string>
25 | #include <functional>
26 | 
27 | std::function<char* (size_t N)> resizeFunctional(torch::Tensor& t) {
28 |     auto lambda = [&t](size_t N) {
29 |         t.resize_({(long long)N});
30 |         return reinterpret_cast<char*>(t.contiguous().data_ptr());
31 |     };
32 |     return lambda;
33 | }
34 | 
35 | std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
36 | RasterizeGaussiansCUDA(
37 |     const torch::Tensor& background,
38 |     const torch::Tensor& means3D,
39 |     const torch::Tensor& colors,
40 |     const torch::Tensor& opacity,
41 |     const torch::Tensor& scales,
42 |     const torch::Tensor& rotations,
43 |     const float scale_modifier,
44 |     const torch::Tensor& cov3D_precomp,
45 |     const torch::Tensor& tile_mask,
46 |     const torch::Tensor& viewmatrix,
47 |     const torch::Tensor& projmatrix,
48 |     const float tan_fovx,
49 |     const float tan_fovy,
50 |     const int image_height,
51 |     const int image_width,
52 |     const torch::Tensor& sh,
53 |     const int degree,
54 |     const torch::Tensor& campos,
55 |     const bool prefiltered,
56 |     const bool debug)
57 | {
58 |     if (means3D.ndimension() != 2 || means3D.size(1) != 3) {
59 |         AT_ERROR("means3D must have dimensions (num_points, 3)");
60 |     }
61 | 
62 |     const int P = means3D.size(0);
63 |     const int H = image_height;
64 |     const int W = image_width;
65 | 
66 |     auto int_opts = means3D.options().dtype(torch::kInt32);
67 |     auto float_opts = means3D.options().dtype(torch::kFloat32);
68 | 
69 |     torch::Tensor out_color = torch::full({NUM_CHANNELS, H, W}, 0.0, float_opts);
70 |     torch::Tensor out_depth = torch::full({1, H, W}, 0.0, float_opts);
71 |     torch::Tensor out_alpha = torch::full({1, H, W}, 0.0, float_opts);
72 |     torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32));
73 | 
74 |     torch::Device device(torch::kCUDA);
75 |     torch::TensorOptions options(torch::kByte);
76 |     torch::Tensor geomBuffer = torch::empty({0}, options.device(device));
77 |     torch::Tensor binningBuffer = torch::empty({0}, options.device(device));
78 |     torch::Tensor imgBuffer = torch::empty({0}, options.device(device));
79 |     std::function<char* (size_t)> geomFunc = resizeFunctional(geomBuffer);
80 |     std::function<char* (size_t)> binningFunc = resizeFunctional(binningBuffer);
81 |     std::function<char* (size_t)> imgFunc = resizeFunctional(imgBuffer);
82 | 
83 |     int rendered = 0;
84 |     if(P != 0)
85 |     {
86 |         int M = 0;
87 |         if(sh.size(0) != 0)
88 |         {
89 |             M = sh.size(1);
90 |         }
91 | 
92 |         rendered = CudaRasterizer::Rasterizer::forward(
93 |             geomFunc,
94 |             binningFunc,
95 |             imgFunc,
96 |             P, degree, M,
97 |             background.contiguous().data_ptr<float>(),
98 |             W, H,
99 |             means3D.contiguous().data_ptr<float>(),
100 |             sh.contiguous().data_ptr<float>(),
101 |             colors.contiguous().data_ptr<float>(),
102 |             opacity.contiguous().data_ptr<float>(),
103 |             scales.contiguous().data_ptr<float>(),
104 |             scale_modifier,
105 |             rotations.contiguous().data_ptr<float>(),
106 |             cov3D_precomp.contiguous().data_ptr<float>(),
107 |             tile_mask.contiguous().data_ptr<bool>(),
108 |             viewmatrix.contiguous().data_ptr<float>(),
109 |             projmatrix.contiguous().data_ptr<float>(),
110 |             campos.contiguous().data_ptr<float>(),
111 |             tan_fovx,
112 |             tan_fovy,
113 |             prefiltered,
114 |             out_color.contiguous().data_ptr<float>(),
115 |             out_depth.contiguous().data_ptr<float>(),
116 |             out_alpha.contiguous().data_ptr<float>(),
117 |             radii.contiguous().data_ptr<int>(),
118 |             debug);
119 |     }
120 |     return std::make_tuple(rendered, out_color, out_depth, out_alpha, radii, geomBuffer, binningBuffer,
imgBuffer);
121 | }
122 | 
123 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
124 | RasterizeGaussiansBackwardCUDA(
125 |     const torch::Tensor& background,
126 |     const torch::Tensor& means3D,
127 |     const torch::Tensor& radii,
128 |     const torch::Tensor& colors,
129 |     const torch::Tensor& scales,
130 |     const torch::Tensor& rotations,
131 |     const float scale_modifier,
132 |     const torch::Tensor& cov3D_precomp,
133 |     const torch::Tensor& viewmatrix,
134 |     const torch::Tensor& projmatrix,
135 |     const float tan_fovx,
136 |     const float tan_fovy,
137 |     const torch::Tensor& dL_dout_color,
138 |     const torch::Tensor& dL_dout_depth,
139 |     const torch::Tensor& dL_dout_alpha,
140 |     const torch::Tensor& sh,
141 |     const int degree,
142 |     const torch::Tensor& campos,
143 |     const torch::Tensor& geomBuffer,
144 |     const int R,
145 |     const torch::Tensor& binningBuffer,
146 |     const torch::Tensor& imageBuffer,
147 |     const torch::Tensor& out_alpha,
148 |     const bool debug)
149 | {
150 |     const int P = means3D.size(0);
151 |     const int H = dL_dout_color.size(1);
152 |     const int W = dL_dout_color.size(2);
153 | 
154 |     int M = 0;
155 |     if(sh.size(0) != 0)
156 |     {
157 |         M = sh.size(1);
158 |     }
159 | 
160 |     torch::Tensor dL_dmeans3D = torch::zeros({P, 3}, means3D.options());
161 |     torch::Tensor dL_dmeans2D = torch::zeros({P, 3}, means3D.options());
162 |     torch::Tensor dL_dabsmeans2D = torch::zeros({P, 3}, means3D.options());
163 |     torch::Tensor dL_dcolors = torch::zeros({P, NUM_CHANNELS}, means3D.options());
164 |     // just for storing intermediate results
165 |     torch::Tensor dL_ddepths = torch::zeros({P, 1}, means3D.options());
166 |     torch::Tensor dL_dconic = torch::zeros({P, 2, 2}, means3D.options());
167 |     torch::Tensor dL_dopacity = torch::zeros({P, 1}, means3D.options());
168 |     torch::Tensor dL_dcov3D = torch::zeros({P, 6}, means3D.options());
169 |     torch::Tensor dL_dsh = torch::zeros({P, M, 3}, means3D.options());
170 |     torch::Tensor dL_dscales = torch::zeros({P, 3}, means3D.options());
171 |     torch::Tensor dL_drotations = torch::zeros({P, 4}, means3D.options());
172 | 
173 |     if(P != 0)
174 |     {
175 |         CudaRasterizer::Rasterizer::backward(P, degree, M, R,
176 |             background.contiguous().data_ptr<float>(),
177 |             W, H,
178 |             means3D.contiguous().data_ptr<float>(),
179 |             sh.contiguous().data_ptr<float>(),
180 |             colors.contiguous().data_ptr<float>(),
181 |             scales.data_ptr<float>(),
182 |             scale_modifier,
183 |             rotations.data_ptr<float>(),
184 |             cov3D_precomp.contiguous().data_ptr<float>(),
185 |             viewmatrix.contiguous().data_ptr<float>(),
186 |             projmatrix.contiguous().data_ptr<float>(),
187 |             campos.contiguous().data_ptr<float>(),
188 |             tan_fovx,
189 |             tan_fovy,
190 |             radii.contiguous().data_ptr<int>(),
191 |             reinterpret_cast<char*>(geomBuffer.contiguous().data_ptr()),
192 |             reinterpret_cast<char*>(binningBuffer.contiguous().data_ptr()),
193 |             reinterpret_cast<char*>(imageBuffer.contiguous().data_ptr()),
194 |             out_alpha.contiguous().data_ptr<float>(),
195 |             dL_dout_color.contiguous().data_ptr<float>(),
196 |             dL_dout_depth.contiguous().data_ptr<float>(),
197 |             dL_dout_alpha.contiguous().data_ptr<float>(),
198 |             dL_dmeans2D.contiguous().data_ptr<float>(),
199 |             dL_dabsmeans2D.contiguous().data_ptr<float>(),
200 |             dL_dconic.contiguous().data_ptr<float>(),
201 |             dL_dopacity.contiguous().data_ptr<float>(),
202 |             dL_dcolors.contiguous().data_ptr<float>(),
203 |             dL_ddepths.contiguous().data_ptr<float>(),
204 |             dL_dmeans3D.contiguous().data_ptr<float>(),
205 |             dL_dcov3D.contiguous().data_ptr<float>(),
206 |             dL_dsh.contiguous().data_ptr<float>(),
207 |             dL_dscales.contiguous().data_ptr<float>(),
208 |             dL_drotations.contiguous().data_ptr<float>(),
209 |             debug);
210 |     }
211 | 
212 |     return std::make_tuple(dL_dabsmeans2D, dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D,
dL_dcov3D, dL_dsh, dL_dscales, dL_drotations);
213 | }
214 | 
215 | torch::Tensor markVisible(
216 |     torch::Tensor& means3D,
217 |     torch::Tensor& viewmatrix,
218 |     torch::Tensor& projmatrix)
219 | {
220 |     const int P = means3D.size(0);
221 | 
222 |     torch::Tensor present = torch::empty({P}, means3D.options().dtype(at::kBool));
223 | 
224 |     if(P != 0)
225 |     {
226 |         CudaRasterizer::Rasterizer::markVisible(P,
227 |             means3D.contiguous().data_ptr<float>(),
228 |             viewmatrix.contiguous().data_ptr<float>(),
229 |             projmatrix.contiguous().data_ptr<float>(),
230 |             present.contiguous().data_ptr<bool>());
231 |     }
232 | 
233 |     return present;
234 | }
235 | 
236 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> fusedPreprocess4D(
237 |     const torch::Tensor& means3D,
238 |     const torch::Tensor& cov,
239 |     const torch::Tensor& ms,
240 |     const torch::Tensor& cov_t,
241 |     const torch::Tensor& opacities,
242 |     const torch::Tensor& t1,
243 |     const torch::Tensor& sh,
244 |     const torch::Tensor& t,
245 |     const torch::Tensor& viewmatrix,
246 |     const torch::Tensor& projmatrix,
247 |     const torch::Tensor& cam_pos,
248 |     const int deg,
249 |     const int deg_t,
250 |     const float duration
251 | )
252 | {
253 |     const int P = means3D.size(0);
254 |     int M = 0;
255 |     if(sh.size(0) != 0) M = sh.size(1);
256 | 
257 |     torch::Tensor mask = torch::empty({P, 1}, means3D.options().dtype(at::kBool));
258 |     torch::Tensor occ1 = torch::empty({P, 1}, means3D.options());
259 |     torch::Tensor xyz3 = torch::empty({P, 3}, means3D.options());
260 |     torch::Tensor rgb3 = torch::empty({P, 3}, means3D.options());
261 | 
262 |     if(P != 0)
263 |     {
264 |         CudaRasterizer::Rasterizer::fusedPreprocess4D(P, deg, deg_t, M,
265 |             means3D.contiguous().data_ptr<float>(),
266 |             cov.contiguous().data_ptr<float>(),
267 |             ms.contiguous().data_ptr<float>(),
268 |             cov_t.contiguous().data_ptr<float>(),
269 |             opacities.contiguous().data_ptr<float>(),
270 |             t1.contiguous().data_ptr<float>(),
271 |             sh.contiguous().data_ptr<float>(),
272 |             t.contiguous().data_ptr<float>(),
273 |             viewmatrix.contiguous().data_ptr<float>(),
274 |             projmatrix.contiguous().data_ptr<float>(),
275 |             cam_pos.contiguous().data_ptr<float>(),
276 |             duration,
277 |             mask.contiguous().data_ptr<bool>(),
278 |             occ1.contiguous().data_ptr<float>(),
279 |             xyz3.contiguous().data_ptr<float>(),
280 |             rgb3.contiguous().data_ptr<float>());
281 |     }
282 |     return std::make_tuple(mask, occ1, xyz3, rgb3);
283 | }
284 | 
285 | 
286 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> fusedPreprocess4DSparse(
287 |     const torch::Tensor& means3D,
288 |     const torch::Tensor& cov,
289 |     const torch::Tensor& ms,
290 |     const torch::Tensor& cov_t,
291 |     const torch::Tensor& opacities,
292 |     const torch::Tensor& t1,
293 |     const torch::Tensor& base,
294 |     const torch::Tensor& sh,
295 |     const torch::Tensor& t,
296 |     const torch::Tensor& inverse,
297 |     const torch::Tensor& viewmatrix,
298 |     const torch::Tensor& projmatrix,
299 |     const torch::Tensor& cam_pos,
300 |     const int deg,
301 |     const int deg_t,
302 |     const float duration
303 | )
304 | {
305 |     const int P = means3D.size(0);
306 |     int M = 0;
307 |     if(sh.size(0) != 0) M = sh.size(1);
308 | 
309 |     torch::Tensor mask = torch::empty({P, 1}, means3D.options().dtype(at::kBool));
310 |     torch::Tensor occ1 = torch::empty({P, 1}, means3D.options());
311 |     torch::Tensor xyz3 = torch::empty({P, 3}, means3D.options());
312 |     torch::Tensor rgb3 = torch::empty({P, 3}, means3D.options());
313 | 
314 |     if(P != 0)
315 |     {
316 |         CudaRasterizer::Rasterizer::fusedPreprocess4DSparse(P, deg, deg_t, M,
317 |             means3D.contiguous().data_ptr<float>(),
318 |             cov.contiguous().data_ptr<float>(),
319 |             ms.contiguous().data_ptr<float>(),
320 |             cov_t.contiguous().data_ptr<float>(),
321 |             opacities.contiguous().data_ptr<float>(),
322 | 
t1.contiguous().data_ptr<float>(),
323 |             base.contiguous().data_ptr<float>(),
324 |             sh.contiguous().data_ptr<float>(),
325 |             t.contiguous().data_ptr<float>(),
326 |             inverse.contiguous().data_ptr<int>(),
327 |             viewmatrix.contiguous().data_ptr<float>(),
328 |             projmatrix.contiguous().data_ptr<float>(),
329 |             cam_pos.contiguous().data_ptr<float>(),
330 |             duration,
331 |             mask.contiguous().data_ptr<bool>(),
332 |             occ1.contiguous().data_ptr<float>(),
333 |             xyz3.contiguous().data_ptr<float>(),
334 |             rgb3.contiguous().data_ptr<float>());
335 |     }
336 |     return std::make_tuple(mask, occ1, xyz3, rgb3);
337 | }
338 | 
339 | torch::Tensor computeCov3D(
340 |     torch::Tensor& scaling_xyz,
341 |     torch::Tensor& rotation_l)
342 | {
343 |     const int P = scaling_xyz.size(0);
344 |     torch::Tensor cov = torch::empty({P, 6}, scaling_xyz.options());
345 | 
346 |     if(P != 0)
347 |     {
348 |         CudaRasterizer::Rasterizer::computeCov3D(P,
349 |             scaling_xyz.contiguous().data_ptr<float>(),
350 |             rotation_l.contiguous().data_ptr<float>(),
351 |             cov.contiguous().data_ptr<float>());
352 |     }
353 | 
354 |     return cov;
355 | }
356 | 
357 | std::tuple<torch::Tensor, torch::Tensor> computeCov3DBackward(
358 |     torch::Tensor& scaling_xyz,
359 |     torch::Tensor& rotation_l,
360 |     torch::Tensor& dL_dcov)
361 | {
362 |     const int P = scaling_xyz.size(0);
363 |     torch::Tensor dL_dscaling_xyz = torch::zeros({P, 3}, scaling_xyz.options());
364 |     torch::Tensor dL_drotation_l = torch::zeros({P, 4}, scaling_xyz.options());
365 | 
366 |     if(P != 0)
367 |     {
368 |         CudaRasterizer::Rasterizer::computeCov3DBackward(P,
369 |             scaling_xyz.contiguous().data_ptr<float>(),
370 |             rotation_l.contiguous().data_ptr<float>(),
371 |             dL_dcov.contiguous().data_ptr<float>(),
372 |             dL_dscaling_xyz.contiguous().data_ptr<float>(),
373 |             dL_drotation_l.contiguous().data_ptr<float>());
374 |     }
375 | 
376 |     return std::make_tuple(dL_dscaling_xyz, dL_drotation_l);
377 | }
378 | 
379 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeCov4D(
380 |     torch::Tensor& scaling_xyzt,
381 |     torch::Tensor& rotation_l,
382 |     torch::Tensor& rotation_r)
383 | {
384 |     const int P = scaling_xyzt.size(0);
385 | 
386 |     torch::Tensor cov = torch::empty({P, 6}, scaling_xyzt.options());
387 |     torch::Tensor ms = torch::empty({P, 3}, scaling_xyzt.options());
388 |     torch::Tensor cov_t = torch::empty({P, 1}, scaling_xyzt.options());
389 | 
390 |     if(P != 0)
391 |     {
392 |         CudaRasterizer::Rasterizer::computeCov4D(P,
393 |             scaling_xyzt.contiguous().data_ptr<float>(),
394 |             rotation_l.contiguous().data_ptr<float>(),
395 |             rotation_r.contiguous().data_ptr<float>(),
396 |             cov.contiguous().data_ptr<float>(),
397 |             ms.contiguous().data_ptr<float>(),
398 |             cov_t.contiguous().data_ptr<float>());
399 |     }
400 | 
401 |     return std::make_tuple(cov, ms, cov_t);
402 | }
403 | 
404 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeCov4DBackward(
405 |     torch::Tensor& scaling_xyzt,
406 |     torch::Tensor& rotation_l,
407 |     torch::Tensor& rotation_r,
408 |     torch::Tensor& dL_dcov,
409 |     torch::Tensor& dL_dms,
410 |     torch::Tensor& dL_dcov_t)
411 | {
412 |     const int P = scaling_xyzt.size(0);
413 | 
414 |     torch::Tensor dL_dscaling_xyzt = torch::zeros({P, 4}, scaling_xyzt.options());
415 |     torch::Tensor dL_drotation_l = torch::zeros({P, 4}, scaling_xyzt.options());
416 |     torch::Tensor dL_drotation_r = torch::zeros({P, 4}, scaling_xyzt.options());
417 | 
418 |     if(P != 0)
419 |     {
420 |         CudaRasterizer::Rasterizer::computeCov4DBackward(P,
421 |             scaling_xyzt.contiguous().data_ptr<float>(),
422 |             rotation_l.contiguous().data_ptr<float>(),
423 |             rotation_r.contiguous().data_ptr<float>(),
424 |             dL_dcov.contiguous().data_ptr<float>(),
425 |             dL_dms.contiguous().data_ptr<float>(),
426 |             dL_dcov_t.contiguous().data_ptr<float>(),
427 |             dL_dscaling_xyzt.contiguous().data_ptr<float>(),
428 |             dL_drotation_l.contiguous().data_ptr<float>(),
429 | 
dL_drotation_r.contiguous().data_ptr<float>());
430 |     }
431 | 
432 |     return std::make_tuple(dL_dscaling_xyzt, dL_drotation_l, dL_drotation_r);
433 | }
434 | 
435 | 
436 | torch::Tensor computeSH4D(
437 |     const int deg,
438 |     const int deg_t,
439 |     torch::Tensor& sh,
440 |     torch::Tensor& dir,
441 |     torch::Tensor& dir_t,
442 |     const float duration
443 | )
444 | {
445 |     const int P = sh.size(0);
446 |     int M = 0;
447 |     if(sh.size(0) != 0) M = sh.size(1);
448 | 
449 |     torch::Tensor rgb = torch::zeros({P, 3}, sh.options());
450 | 
451 |     if(P != 0)
452 |     {
453 |         CudaRasterizer::Rasterizer::computeSH4D(P,
454 |             deg, deg_t, M,
455 |             sh.contiguous().data_ptr<float>(),
456 |             dir.contiguous().data_ptr<float>(),
457 |             dir_t.contiguous().data_ptr<float>(),
458 |             duration,
459 |             rgb.contiguous().data_ptr<float>()
460 |         );
461 |     }
462 | 
463 |     return rgb;
464 | }
465 | 
466 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeSH4DBackward(
467 |     const int deg,
468 |     const int deg_t,
469 |     torch::Tensor& sh,
470 |     torch::Tensor& dir,
471 |     torch::Tensor& dir_t,
472 |     const float duration,
473 |     torch::Tensor& dL_drgb
474 | )
475 | {
476 |     const int P = sh.size(0);
477 |     int M = 0;
478 |     if(sh.size(0) != 0) M = sh.size(1);
479 | 
480 |     torch::Tensor dL_dsh = torch::zeros({P, M, 3}, sh.options());
481 |     torch::Tensor dL_ddir = torch::zeros({P, 3}, sh.options());
482 |     torch::Tensor dL_ddir_t = torch::zeros({P, 1}, sh.options());
483 | 
484 |     if(P != 0)
485 |     {
486 |         CudaRasterizer::Rasterizer::computeSH4DBackward(P,
487 |             deg, deg_t, M,
488 |             sh.contiguous().data_ptr<float>(),
489 |             dir.contiguous().data_ptr<float>(),
490 |             dir_t.contiguous().data_ptr<float>(),
491 |             duration,
492 |             dL_drgb.contiguous().data_ptr<float>(),
493 |             dL_dsh.contiguous().data_ptr<float>(),
494 |             dL_ddir.contiguous().data_ptr<float>(),
495 |             dL_ddir_t.contiguous().data_ptr<float>()
496 |         );
497 |     }
498 | 
499 |     return std::make_tuple(dL_dsh, dL_ddir, dL_ddir_t);
500 | }
--------------------------------------------------------------------------------
/rasterize_points.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (C) 2023, Inria
3 |  * GRAPHDECO research group, https://team.inria.fr/graphdeco
4 |  * All rights reserved.
5 |  *
6 |  * This software is free for non-commercial, research and evaluation use
7 |  * under the terms of the LICENSE.md file.
8 |  *
9 |  * For inquiries contact  george.drettakis@inria.fr
10 |  */
11 | 
12 | #pragma once
13 | #include <torch/extension.h>
14 | #include <cstdio>
15 | #include <tuple>
16 | #include <string>
17 | 
18 | std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
19 | RasterizeGaussiansCUDA(
20 |     const torch::Tensor& background,
21 |     const torch::Tensor& means3D,
22 |     const torch::Tensor& colors,
23 |     const torch::Tensor& opacity,
24 |     const torch::Tensor& scales,
25 |     const torch::Tensor& rotations,
26 |     const float scale_modifier,
27 |     const torch::Tensor& cov3D_precomp,
28 |     const torch::Tensor& tile_mask,
29 |     const torch::Tensor& viewmatrix,
30 |     const torch::Tensor& projmatrix,
31 |     const float tan_fovx,
32 |     const float tan_fovy,
33 |     const int image_height,
34 |     const int image_width,
35 |     const torch::Tensor& sh,
36 |     const int degree,
37 |     const torch::Tensor& campos,
38 |     const bool prefiltered,
39 |     const bool debug);
40 | 
41 | 
42 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
43 | RasterizeGaussiansBackwardCUDA(
44 |     const torch::Tensor& background,
45 |     const torch::Tensor& means3D,
46 |     const torch::Tensor& radii,
47 |     const torch::Tensor& colors,
48 |     const torch::Tensor& scales,
49 |     const torch::Tensor& rotations,
50 |     const float scale_modifier,
51 |     const torch::Tensor& cov3D_precomp,
52 |     const torch::Tensor& viewmatrix,
53 |     const torch::Tensor& projmatrix,
54 |     const float tan_fovx,
55 |     const float tan_fovy,
56 |     const torch::Tensor& dL_dout_color,
57 |     const torch::Tensor& dL_dout_depth,
58 |     const torch::Tensor& dL_dout_alpha,
59 |     const torch::Tensor& sh,
60 |     const int degree,
61 |     const torch::Tensor& campos,
62 |     const torch::Tensor& geomBuffer,
63 |     const int R,
64 |     const torch::Tensor& binningBuffer,
65 |     const torch::Tensor& imageBuffer,
66 |     const torch::Tensor& out_alpha,
67 |     const bool debug);
68 | 
69 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> fusedPreprocess4D(
70 |     const torch::Tensor& means3D,
71 |     const torch::Tensor& cov,
72 |     const torch::Tensor& ms,
73 |     const torch::Tensor& cov_t,
74 |     const torch::Tensor& opacities,
75 |     const torch::Tensor& t1,
76 |     const torch::Tensor& sh,
77 |     const torch::Tensor& t,
78 |     const torch::Tensor& viewmatrix,
79 |     const torch::Tensor& projmatrix,
80 |     const torch::Tensor& cam_pos,
81 |     const int deg,
82 |     const int deg_t,
83 |     const float duration
84 | );
85 | 
86 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> fusedPreprocess4DSparse(
87 |     const torch::Tensor& means3D,
88 |     const torch::Tensor& cov,
89 |     const torch::Tensor& ms,
90 |     const torch::Tensor& cov_t,
91 |     const torch::Tensor& opacities,
92 |     const torch::Tensor& t1,
93 |     const torch::Tensor& base,
94 |     const torch::Tensor& sh,
95 |     const torch::Tensor& t,
96 |     const torch::Tensor& inverse,
97 |     const torch::Tensor& viewmatrix,
98 |     const torch::Tensor& projmatrix,
99 |     const torch::Tensor& cam_pos,
100 |     const int deg,
101 |     const int deg_t,
102 |     const float duration
103 | );
104 | 
105 | torch::Tensor markVisible(
106 |     torch::Tensor& means3D,
107 |     torch::Tensor& viewmatrix,
108 |     torch::Tensor& projmatrix);
109 | 
110 | torch::Tensor computeCov3D(
111 |     torch::Tensor& scaling_xyz,
112 |     torch::Tensor& rotation_l);
113 | 
114 | std::tuple<torch::Tensor, torch::Tensor> computeCov3DBackward(
115 |     torch::Tensor& scaling_xyz,
116 |     torch::Tensor& rotation_l,
117 |     torch::Tensor& dL_dcov);
118 | 
119 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeCov4D(
120 |     torch::Tensor& scaling_xyzt,
121 |     torch::Tensor& rotation_l,
122 |     torch::Tensor& rotation_r);
123 | 
124 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeCov4DBackward(
125 |     torch::Tensor& scaling_xyzt,
126 |     torch::Tensor& rotation_l,
127 |     torch::Tensor& rotation_r,
128 |     torch::Tensor& dL_dcov,
129 |     torch::Tensor& dL_dms,
130 |     torch::Tensor& dL_dcov_t);
131 | 
132 | 
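// Evaluates 4D (space-time) spherical harmonics into per-point RGB.
// Tensor shapes below are inferred from the definitions in
// rasterize_points.cu and noted here for convenience:
//   sh: (P, M, 3) SH coefficients, dir: (P, 3) view directions,
//   dir_t: (P, 1) temporal offsets; returns rgb: (P, 3).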
torch::Tensor computeSH4D(
133 |     const int deg,
134 |     const int deg_t,
135 |     torch::Tensor& sh,
136 |     torch::Tensor& dir,
137 |     torch::Tensor& dir_t,
138 |     const float duration
139 | );
140 | 
141 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> computeSH4DBackward(
142 |     const int deg,
143 |     const int deg_t,
144 |     torch::Tensor& sh,
145 |     torch::Tensor& dir,
146 |     torch::Tensor& dir_t,
147 |     const float duration,
148 |     torch::Tensor& dL_drgb
149 | );
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Differential Gaussian Rasterization Improved
2 | 
3 | ## Faster Backward Pass
4 | 
5 | This is only faster when there is a large number of semi-transparent (almost transparent) Gaussians to be rendered, since the optimization introduces a small overhead for regular rendering.
6 | 
7 | The original backward implementation uses `atomicAdd` on global CUDA memory.
8 | 
9 | We accelerate this process by using the `__shared__` memory of a thread block to store the temporarily accumulated gradients, just like the original implementation already does for the Gaussian properties it loads per block.
10 | 
11 | No API change is required for this functionality, and you can directly check out what we changed in [backward.cu](cuda_rasterizer/backward.cu#417).
12 | 
13 | The change can be summarized in this pseudo-code:
14 | 
15 | ```c++
16 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
17 | renderCUDA(...) {
18 | 
19 |     __shared__ float3 s_dL_dmean2D[BLOCK_SIZE]; // allocate shared memory
20 |     s_dL_dmean2D[block.thread_rank()] = {0.0f, 0.0f, 0.0f}; // zero-initialize shared memory
21 | 
22 |     for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) { // iterate over the Gaussians that influence this pixel
23 |         // Compute gradients
24 |         ...
25 | 
26 |         // Accumulate gradients w.r.t. the 2D mean position of the Gaussian in shared memory
27 |         atomicAdd(&s_dL_dmean2D[j].x, dL_dG * dG_ddelx * ddelx_dx);
28 |         atomicAdd(&s_dL_dmean2D[j].y, dL_dG * dG_ddely * ddely_dy);
29 |     }
30 | 
31 |     atomicAdd(&dL_dmean2D[global_id].x, s_dL_dmean2D[block.thread_rank()].x); // flush the block's sums to global memory
32 |     atomicAdd(&dL_dmean2D[global_id].y, s_dL_dmean2D[block.thread_rank()].y);
33 | }
34 | ```
35 | 
36 | In an effort to make this process even faster, we've also implemented a warp-reduction-based version of the backward pass on top of the `__shared__` memory optimization.
37 | 
38 | It directly communicates the gradient accumulation within a 32-thread warp using:
39 | 
40 | ```c++
41 | __device__ float warpReduceSum(float value) {
42 |     auto warp = cg::coalesced_threads();
43 |     for (int offset = warp.size() / 2; offset > 0; offset /= 2) {
44 |         value += warp.shfl_down(value, offset);
45 |     }
46 |     return value;
47 | }
48 | ```
49 | 
50 | and later aggregates the warp sums into `__shared__` memory:
51 | 
52 | ```c++
53 | ...
54 | // Use a single thread from each warp to perform block-level reduction
55 | if (block.thread_rank() % warp.size() == 0) {
56 |     for (int ch = 0; ch < C; ch++) {
57 |         atomicAdd(&(s_dL_dcolors[ch * BLOCK_SIZE + j]), w_dL_dcolors[ch]);
58 |     }
59 |     atomicAdd(&(s_dL_ddepths[j]), w_dL_ddepths);
60 |     atomicAdd(&s_dL_dmean2D[j].x, w_dL_dmean2D.x);
61 |     atomicAdd(&s_dL_dmean2D[j].y, w_dL_dmean2D.y);
62 |     atomicAdd(&s_dL_dconic2D[j].x, w_dL_dconic2D.x);
63 |     atomicAdd(&s_dL_dconic2D[j].y, w_dL_dconic2D.y);
64 |     atomicAdd(&s_dL_dconic2D[j].w, w_dL_dconic2D.w);
65 |     atomicAdd(&(s_dL_dopacity[j]), w_dL_dopacity);
66 | }
67 | ...
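// (not shown) after the loop, each thread flushes its shared-memory slot
// to global memory with a single atomicAdd per property, mirroring the
// final flush of the shared-memory-only pseudo-code above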
68 | ```
69 | 
70 | This shaves off another 2-3 ms from the backward pass at the start of training, but curiously the speedup does not persist throughout the whole training process.
71 | 
72 | Thus, by default, only the `__shared__` memory optimization is enabled and in use.
73 | 
74 | Note: this seems slower... See: https://developer.nvidia.com/blog/gpu-pro-tip-fast-histograms-using-shared-atomics-maxwell
75 | 
76 | ## Tile-Based Culling
77 | 
78 | Following [StopThePop: Sorted Gaussian Splatting for View-Consistent Real-time Rendering](https://github.com/r4dl/StopThePop-Rasterization), we borrow their tile-based culling scheme to reduce the computational cost during training and rendering.
79 | 
80 | This section of code is directly adapted from their repository.
81 | 
82 | ```c++
83 | ...
84 | constexpr float alpha_threshold = 1.0f / 255.0f;
85 | const float opacity_power_threshold = log(conic_opacity[idx].w / alpha_threshold);
86 | glm::vec2 max_pos;
87 | const glm::vec2 tile_min = {x * BLOCK_X, y * BLOCK_Y};
88 | const glm::vec2 tile_max = {(x + 1) * BLOCK_X - 1, (y + 1) * BLOCK_Y - 1};
89 | float max_opac_factor = max_contrib_power_rect_gaussian_float(conic_opacity[idx], points_xy[idx], tile_min, tile_max, max_pos);
90 | 
91 | if (max_opac_factor > opacity_power_threshold) {
92 |     continue;
93 | }
94 | ...
95 | ```
96 | 
97 | Note: this seems slower...
98 | 
99 | ## Tile-Mask Rendering
100 | 
101 | **Note: this API hasn't been fully tested yet.**
102 | 
103 | We additionally provide an interface for adding a tile mask to the Gaussian rasterizer.
104 | 
105 | It turns out that the tile-based rasterization pipeline can easily be masked to produce a patch-like rendering result (simulating a NeRF-like ray sampling approach).
106 | 
107 | To implement this as efficiently as possible, we:
108 | 
109 | 1. Mark points that are not to be rendered as early as possible in the `preprocessCUDA` kernel.
110 | 2. Make all subsequent operations faster by excluding masked-out tiles from the sorting and the `renderCUDA` kernel.
111 | 
112 | The tile mask can be defined as:
113 | 
114 | ```python
115 | from diff_gauss import GaussianRasterizationSettings, GaussianRasterizer
116 | raster_settings = GaussianRasterizationSettings(...)
117 | rasterizer = GaussianRasterizer(raster_settings=raster_settings)
118 | 
119 | BLOCK_X, BLOCK_Y = 16, 16
120 | tile_height, tile_width = (raster_settings.image_height + BLOCK_Y - 1) // BLOCK_Y, (raster_settings.image_width + BLOCK_X - 1) // BLOCK_X
121 | tile_mask = torch.ones((tile_height, tile_width), dtype=torch.bool, device='cuda')
122 | 
123 | rendered_image, rendered_depth, rendered_alpha, radii = rasterizer(
124 |     means3D = means3D,
125 |     means2D = means2D,
126 |     shs = shs,
127 |     colors_precomp = colors_precomp,
128 |     opacities = opacity,
129 |     scales = scales,
130 |     rotations = rotations,
131 |     cov3D_precomp = cov3D_precomp,
132 |     tile_mask = tile_mask,
133 | )
134 | ```
135 | 
136 | ## Fixed `ImageState` Buffer Size
137 | 
138 | In the [original implementation](https://github.com/graphdeco-inria/diff-gaussian-rasterization), the size of the `ranges` member of the struct `ImageState` was too large (the same as the number of pixels).
139 | 
140 | In reality, only `number of tiles` entries of `ranges` are needed, since `ranges` stores the per-tile start and end indices into the sorted list of Gaussian splats in the `BinningState` buffer (see the sketch below for the exact tile count).
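
For concreteness, the required number of `ranges` entries is just the padded tile count. A minimal sketch (the `num_tiles` helper is hypothetical; `16` matches the `BLOCK_X`/`BLOCK_Y` defaults from `config.h`):

```python
def num_tiles(width: int, height: int, block_x: int = 16, block_y: int = 16) -> int:
    # One (start, end) range per tile: ceil-divide the image into tiles,
    # matching the tile-grid computation used by the rasterizer.
    tiles_x = (width + block_x - 1) // block_x
    tiles_y = (height + block_y - 1) // block_y
    return tiles_x * tiles_y


# e.g., a 1920x1080 image needs 120 * 68 = 8160 ranges, not 1920 * 1080.
print(num_tiles(1920, 1080))  # -> 8160
```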
141 | 
142 | We fix this by simply replacing the memory allocation of `ImageState` with:
143 | 
144 | ```c++
145 | CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N, size_t M)
146 | {
147 |     ImageState img;
148 |     obtain(chunk, img.n_contrib, N, 128);
149 |     obtain(chunk, img.ranges, M, 128);
150 |     return img;
151 | }
152 | ```
153 | 
154 | ## Fixed Culling
155 | 
156 | The [original repository](https://github.com/graphdeco-inria/diff-gaussian-rasterization)'s implementation of view-space culling wasn't effective (no points were culled).
157 | 
158 | We fixed that with an improved OpenGL-like culling function:
159 | 
160 | ```c++
161 | __forceinline__ __device__ bool in_frustum(int idx,
162 |     const float* orig_points,
163 |     const float* viewmatrix,
164 |     const float* projmatrix,
165 |     bool prefiltered,
166 |     float3& p_view, // reference
167 |     const float padding = 0.01f, // padding in ndc space // TODO: add api for changing this
168 |     const float xy_padding = 0.5f // padding in ndc space // TODO: add api for changing this
169 | )
170 | {
171 |     float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
172 |     p_view = transformPoint4x3(p_orig, viewmatrix); // write this outside
173 |     if (prefiltered) return true;
174 | 
175 |     // Bring points to screen space
176 |     float4 p_hom = transformPoint4x4(p_orig, projmatrix);
177 |     float p_w = 1.0f / (p_hom.w + 0.0000001f);
178 |     float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
179 | 
180 |     return (p_proj.z > -1 - padding) && (p_proj.z < 1 + padding) && (p_proj.x > -1 - xy_padding) && (p_proj.x < 1. + xy_padding) && (p_proj.y > -1 - xy_padding) && (p_proj.y < 1. + xy_padding);
181 | }
182 | ```
183 | 
184 | ## Depth & Alpha Backward
185 | 
186 | **Note: this functionality is directly copied from the [slothfulxtx repository](https://github.com/slothfulxtx/diff-gaussian-rasterization).**
187 | 
188 | In addition to the RGB image, we also support rendering a depth map and an alpha map (both the forward and backward passes), compared with the [original repository](https://github.com/graphdeco-inria/diff-gaussian-rasterization).
189 | 
190 | We renamed the package to **diff_gauss** to avoid dependency conflicts with the original version; it can be installed like any other PyTorch CUDA extension (e.g., with `pip install .` from the repository root).
191 | 
192 | Here's an example of using our modified differential Gaussian rasterization:
193 | ```python
194 | from diff_gauss import GaussianRasterizationSettings, GaussianRasterizer
195 | raster_settings = GaussianRasterizationSettings(...)
196 | rasterizer = GaussianRasterizer(raster_settings=raster_settings)
197 | 
198 | rendered_image, rendered_depth, rendered_alpha, radii = rasterizer(
199 |     means3D = means3D,
200 |     means2D = means2D,
201 |     shs = shs,
202 |     colors_precomp = colors_precomp,
203 |     opacities = opacity,
204 |     scales = scales,
205 |     rotations = rotations,
206 |     cov3D_precomp = cov3D_precomp
207 | )
208 | ```
209 | 
210 | Details: by default, the depth is computed as a 'median depth', where the depth value of each pixel covered by a 3D Gaussian is set to the depth of that Gaussian's center. Thus, numerical errors arise when the scales of the 3D Gaussians are large. However, thanks to the densification scheme, most 3D Gaussians are small, so we currently ignore this numerical error in the depth maps.
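
To make the 'median depth' convention concrete, here is a minimal Python sketch of one common formulation (illustrative only: the function and variable names are hypothetical, and the actual CUDA kernel may differ in details):

```python
def median_depth(sorted_splats, threshold=0.5):
    # sorted_splats: front-to-back (alpha, center_depth) pairs for the
    # Gaussians overlapping one pixel; each depth is the depth of the
    # Gaussian *center*, which is the source of the error noted above.
    T = 1.0  # transmittance accumulated so far
    for alpha, depth in sorted_splats:
        T *= 1.0 - alpha
        if 1.0 - T >= threshold:  # accumulated opacity crossed the threshold
            return depth  # report this Gaussian's center depth
    return 0.0  # pixel never became opaque enough; fall back to background


# Example: two nearly opaque splats; the first one already crosses 0.5.
print(median_depth([(0.8, 1.5), (0.9, 2.0)]))  # -> 1.5
```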
211 | 
212 | ## Differential Gaussian Rasterization
213 | 
214 | **Note: this is the original readme of the [diff-gaussian-rasterization repository](https://github.com/graphdeco-inria/diff-gaussian-rasterization).**
215 | 
216 | Used as the rasterization engine for the paper "3D Gaussian Splatting for Real-Time Radiance Field Rendering". If you can make use of it in your own research, please be so kind to cite us.
217 | 
218 | 
219 | 
220 | ```bibtex
221 | @Article{kerbl3Dgaussians,
222 |       author       = {Kerbl, Bernhard and Kopanas, Georgios and Leimk{\"u}hler, Thomas and Drettakis, George},
223 |       title        = {3D Gaussian Splatting for Real-Time Radiance Field Rendering},
224 |       journal      = {ACM Transactions on Graphics},
225 |       number       = {4},
226 |       volume       = {42},
227 |       month        = {July},
228 |       year         = {2023},
229 |       url          = {https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/}
230 | }
231 | ```
232 | 
233 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | from setuptools import setup 13 | from os.path import dirname, join, abspath 14 | from torch.utils.cpp_extension import CUDAExtension, BuildExtension 15 | 16 | dirname(abspath(__file__)) 17 | 18 | setup( 19 | name="diff_gauss", 20 | packages=['diff_gauss'], 21 | ext_modules=[ 22 | CUDAExtension( 23 | name="diff_gauss._C", 24 | sources=[ 25 | "cuda_rasterizer/rasterizer_impl.cu", 26 | "cuda_rasterizer/forward.cu", 27 | # "cuda_rasterizer/forward_half.cu", 28 | "cuda_rasterizer/backward.cu", 29 | "rasterize_points.cu", 30 | "ext.cpp"], 31 | extra_compile_args={"nvcc": [ 32 | "-O3", 33 | "-Xcompiler", 34 | "-fno-gnu-unique", 35 | # "-G", 36 | "-I" + join(dirname(abspath(__file__)), "third_party/glm/")]}) 37 | ], 38 | cmdclass={ 39 | 'build_ext': BuildExtension 40 | }, 41 | ) 42 | --------------------------------------------------------------------------------
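
For reference, here is a minimal smoke test of the built extension (a sketch: it assumes a CUDA-enabled PyTorch environment and that the package was installed, e.g. with `pip install .` from the repository root):

```python
import torch
from diff_gauss import _C  # the compiled CUDA extension declared in setup.py

P = 8
means3D = torch.rand(P, 3, device='cuda')
viewmatrix = torch.eye(4, device='cuda')  # placeholder camera matrices
projmatrix = torch.eye(4, device='cuda')

# mark_visible performs frustum culling and returns a (P,) bool tensor.
visible = _C.mark_visible(means3D, viewmatrix, projmatrix)
print(visible.sum().item(), 'of', P, 'points visible')
```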