├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE.md ├── README.md ├── cuda_rasterizer ├── auxiliary.h ├── backward.cu ├── backward.h ├── config.h ├── forward.cu ├── forward.h ├── helper_math.h ├── math.h ├── rasterizer.h ├── rasterizer_impl.cu └── rasterizer_impl.h ├── diff_gaussian_rasterization └── __init__.py ├── ext.cpp ├── rasterize_points.cu ├── rasterize_points.h ├── setup.py └── third_party └── stbi_image_write.h /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | diff_gaussian_rasterization.egg-info/ 3 | dist/ 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/glm"] 2 | path = third_party/glm 3 | url = https://github.com/g-truc/glm.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | cmake_minimum_required(VERSION 3.20) 13 | 14 | project(DiffRast LANGUAGES CUDA CXX) 15 | 16 | set(CMAKE_CXX_STANDARD 17) 17 | set(CMAKE_CXX_EXTENSIONS OFF) 18 | set(CMAKE_CUDA_STANDARD 17) 19 | 20 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") 21 | 22 | add_library(CudaRasterizer 23 | cuda_rasterizer/backward.h 24 | cuda_rasterizer/backward.cu 25 | cuda_rasterizer/forward.h 26 | cuda_rasterizer/forward.cu 27 | cuda_rasterizer/auxiliary.h 28 | cuda_rasterizer/rasterizer_impl.cu 29 | cuda_rasterizer/rasterizer_impl.h 30 | cuda_rasterizer/rasterizer.h 31 | ) 32 | 33 | set_target_properties(CudaRasterizer PROPERTIES CUDA_ARCHITECTURES "70;75;86") 34 | 35 | target_include_directories(CudaRasterizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/cuda_rasterizer) 36 | target_include_directories(CudaRasterizer PRIVATE third_party/glm ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 37 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Gaussian-Splatting License 2 | =========================== 3 | 4 | **Inria** and **the Max Planck Institut for Informatik (MPII)** hold all the ownership rights on the *Software* named **gaussian-splatting**. 5 | The *Software* is in the process of being registered with the Agence pour la Protection des 6 | Programmes (APP). 7 | 8 | The *Software* is still being developed by the *Licensor*. 9 | 10 | *Licensor*'s goal is to allow the research community to use, test and evaluate 11 | the *Software*. 12 | 13 | ## 1. Definitions 14 | 15 | *Licensee* means any person or entity that uses the *Software* and distributes 16 | its *Work*. 17 | 18 | *Licensor* means the owners of the *Software*, i.e Inria and MPII 19 | 20 | *Software* means the original work of authorship made available under this 21 | License ie gaussian-splatting. 22 | 23 | *Work* means the *Software* and any additions to or derivative works of the 24 | *Software* that are made available under this License. 25 | 26 | 27 | ## 2. 
Purpose 28 | This license is intended to define the rights granted to the *Licensee* by 29 | Licensors under the *Software*. 30 | 31 | ## 3. Rights granted 32 | 33 | For the above reasons Licensors have decided to distribute the *Software*. 34 | Licensors grant non-exclusive rights to use the *Software* for research purposes 35 | to research users (both academic and industrial), free of charge, without right 36 | to sublicense.. The *Software* may be used "non-commercially", i.e., for research 37 | and/or evaluation purposes only. 38 | 39 | Subject to the terms and conditions of this License, you are granted a 40 | non-exclusive, royalty-free, license to reproduce, prepare derivative works of, 41 | publicly display, publicly perform and distribute its *Work* and any resulting 42 | derivative works in any form. 43 | 44 | ## 4. Limitations 45 | 46 | **4.1 Redistribution.** You may reproduce or distribute the *Work* only if (a) you do 47 | so under this License, (b) you include a complete copy of this License with 48 | your distribution, and (c) you retain without modification any copyright, 49 | patent, trademark, or attribution notices that are present in the *Work*. 50 | 51 | **4.2 Derivative Works.** You may specify that additional or different terms apply 52 | to the use, reproduction, and distribution of your derivative works of the *Work* 53 | ("Your Terms") only if (a) Your Terms provide that the use limitation in 54 | Section 2 applies to your derivative works, and (b) you identify the specific 55 | derivative works that are subject to Your Terms. Notwithstanding Your Terms, 56 | this License (including the redistribution requirements in Section 3.1) will 57 | continue to apply to the *Work* itself. 58 | 59 | **4.3** Any other use without of prior consent of Licensors is prohibited. Research 60 | users explicitly acknowledge having received from Licensors all information 61 | allowing to appreciate the adequacy between of the *Software* and their needs and 62 | to undertake all necessary precautions for its execution and use. 63 | 64 | **4.4** The *Software* is provided both as a compiled library file and as source 65 | code. In case of using the *Software* for a publication or other results obtained 66 | through the use of the *Software*, users are strongly encouraged to cite the 67 | corresponding publications as explained in the documentation of the *Software*. 68 | 69 | ## 5. Disclaimer 70 | 71 | THE USER CANNOT USE, EXPLOIT OR DISTRIBUTE THE *SOFTWARE* FOR COMMERCIAL PURPOSES 72 | WITHOUT PRIOR AND EXPLICIT CONSENT OF LICENSORS. YOU MUST CONTACT INRIA FOR ANY 73 | UNAUTHORIZED USE: stip-sophia.transfert@inria.fr . ANY SUCH ACTION WILL 74 | CONSTITUTE A FORGERY. THIS *SOFTWARE* IS PROVIDED "AS IS" WITHOUT ANY WARRANTIES 75 | OF ANY NATURE AND ANY EXPRESS OR IMPLIED WARRANTIES, WITH REGARDS TO COMMERCIAL 76 | USE, PROFESSIONNAL USE, LEGAL OR NOT, OR OTHER, OR COMMERCIALISATION OR 77 | ADAPTATION. UNLESS EXPLICITLY PROVIDED BY LAW, IN NO EVENT, SHALL INRIA OR THE 78 | AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 79 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 80 | GOODS OR SERVICES, LOSS OF USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION) 81 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 82 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING FROM, OUT OF OR 83 | IN CONNECTION WITH THE *SOFTWARE* OR THE USE OR OTHER DEALINGS IN THE *SOFTWARE*. 
84 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Differential Gaussian Rasterization with Camera Pose Jacobians 2 | 3 | This software is used as the rasterization engine in the paper ["Gaussian Splatting SLAM"](https://arxiv.org/abs/2312.06741), and supports: 4 | 5 | * Analytical gradients for SE(3) camera poses. 6 | * Analytical gradients for rendered depth. 7 | 8 | The code is built on top of the original [Differential Gaussian Rasterization](https://github.com/graphdeco-inria/diff-gaussian-rasterization) used in "3D Gaussian Splatting for Real-Time Radiance Field Rendering". 9 | 10 | If you make use of it in your own research, please be so kind as to cite both papers. 11 | 12 | 13 |
14 | 
15 | BibTeX:
16 | 
@Article{kerbl3Dgaussians,
17 |       author       = {Kerbl, Bernhard and Kopanas, Georgios and Leimk{\"u}hler, Thomas and Drettakis, George},
18 |       title        = {3D Gaussian Splatting for Real-Time Radiance Field Rendering},
19 |       journal      = {ACM Transactions on Graphics},
20 |       number       = {4},
21 |       volume       = {42},
22 |       month        = {July},
23 |       year         = {2023},
24 |       url          = {https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/}
25 | }
26 | 
27 | 
@inproceedings{Matsuki:Murai:etal:CVPR2024,
28 |   title={{G}aussian {S}platting {SLAM}},
29 |   author={Hidenobu Matsuki and Riku Murai and Paul H. J. Kelly and Andrew J. Davison},
30 |   booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
31 |   year={2024}
32 | }
33 | 
34 | 
35 |
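The camera pose gradient `dL_dtau` computed by the backward pass is a per-Gaussian 6-vector whose first three components are translational (`rho`) and last three rotational (`theta`), following the left-perturbation convention implied by the `dp_C_d_rho` / `dp_C_d_theta` Jacobians in `cuda_rasterizer/backward.cu`. Below is a minimal sketch of those point Jacobians (illustrative only; `Vec3` and `dpC_dtheta_cols` are not part of this code base):

```cpp
// For a camera-frame point p_C = T_CW * p_W and a left-multiplied se(3)
// perturbation tau = [rho; theta] of T_CW:
//   d p_C / d rho   = I           (identity)
//   d p_C / d theta = -skew(p_C)  (negative cross-product matrix)
struct Vec3 { float x, y, z; };

// Columns of d p_C / d theta, matching -mat33::skew_symmetric(p_C) in
// backward.cu. Illustrative helper, not part of the library.
inline void dpC_dtheta_cols(const Vec3& p, Vec3 cols[3])
{
    cols[0] = { 0.f,  -p.z,  p.y };
    cols[1] = { p.z,   0.f, -p.x };
    cols[2] = { -p.y,  p.x,  0.f };
}
```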
36 | 37 | -------------------------------------------------------------------------------- /cuda_rasterizer/auxiliary.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_AUXILIARY_H_INCLUDED 13 | #define CUDA_RASTERIZER_AUXILIARY_H_INCLUDED 14 | 15 | #include "config.h" 16 | #include "stdio.h" 17 | 18 | #define BLOCK_SIZE (BLOCK_X * BLOCK_Y) 19 | #define NUM_WARPS (BLOCK_SIZE/32) 20 | 21 | // Spherical harmonics coefficients 22 | __device__ const float SH_C0 = 0.28209479177387814f; 23 | __device__ const float SH_C1 = 0.4886025119029199f; 24 | __device__ const float SH_C2[] = { 25 | 1.0925484305920792f, 26 | -1.0925484305920792f, 27 | 0.31539156525252005f, 28 | -1.0925484305920792f, 29 | 0.5462742152960396f 30 | }; 31 | __device__ const float SH_C3[] = { 32 | -0.5900435899266435f, 33 | 2.890611442640554f, 34 | -0.4570457994644658f, 35 | 0.3731763325901154f, 36 | -0.4570457994644658f, 37 | 1.445305721320277f, 38 | -0.5900435899266435f 39 | }; 40 | 41 | __forceinline__ __device__ float ndc2Pix(float v, int S) 42 | { 43 | return ((v + 1.0) * S - 1.0) * 0.5; 44 | } 45 | 46 | __forceinline__ __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid) 47 | { 48 | rect_min = { 49 | min(grid.x, max((int)0, (int)((p.x - max_radius) / BLOCK_X))), 50 | min(grid.y, max((int)0, (int)((p.y - max_radius) / BLOCK_Y))) 51 | }; 52 | rect_max = { 53 | min(grid.x, max((int)0, (int)((p.x + max_radius + BLOCK_X - 1) / BLOCK_X))), 54 | min(grid.y, max((int)0, (int)((p.y + max_radius + BLOCK_Y - 1) / BLOCK_Y))) 55 | }; 56 | } 57 | 58 | __forceinline__ __device__ float3 transformPoint4x3(const float3& p, const float* matrix) 59 | { 60 | float3 transformed = { 61 | matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12], 62 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13], 63 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14], 64 | }; 65 | return transformed; 66 | } 67 | 68 | __forceinline__ __device__ float4 transformPoint4x4(const float3& p, const float* matrix) 69 | { 70 | float4 transformed = { 71 | matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12], 72 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13], 73 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14], 74 | matrix[3] * p.x + matrix[7] * p.y + matrix[11] * p.z + matrix[15] 75 | }; 76 | return transformed; 77 | } 78 | 79 | __forceinline__ __device__ float3 transformVec4x3(const float3& p, const float* matrix) 80 | { 81 | float3 transformed = { 82 | matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z, 83 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z, 84 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z, 85 | }; 86 | return transformed; 87 | } 88 | 89 | __forceinline__ __device__ float3 transformVec4x3Transpose(const float3& p, const float* matrix) 90 | { 91 | float3 transformed = { 92 | matrix[0] * p.x + matrix[1] * p.y + matrix[2] * p.z, 93 | matrix[4] * p.x + matrix[5] * p.y + matrix[6] * p.z, 94 | matrix[8] * p.x + matrix[9] * p.y + matrix[10] * p.z, 95 | }; 96 | return transformed; 97 | } 98 | 99 | __forceinline__ __device__ 
float dnormvdz(float3 v, float3 dv) 100 | { 101 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; 102 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 103 | float dnormvdz = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32; 104 | return dnormvdz; 105 | } 106 | 107 | __forceinline__ __device__ float3 dnormvdv(float3 v, float3 dv) 108 | { 109 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; 110 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 111 | 112 | float3 dnormvdv; 113 | dnormvdv.x = ((+sum2 - v.x * v.x) * dv.x - v.y * v.x * dv.y - v.z * v.x * dv.z) * invsum32; 114 | dnormvdv.y = (-v.x * v.y * dv.x + (sum2 - v.y * v.y) * dv.y - v.z * v.y * dv.z) * invsum32; 115 | dnormvdv.z = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32; 116 | return dnormvdv; 117 | } 118 | 119 | __forceinline__ __device__ float4 dnormvdv(float4 v, float4 dv) 120 | { 121 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w; 122 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 123 | 124 | float4 vdv = { v.x * dv.x, v.y * dv.y, v.z * dv.z, v.w * dv.w }; 125 | float vdv_sum = vdv.x + vdv.y + vdv.z + vdv.w; 126 | float4 dnormvdv; 127 | dnormvdv.x = ((sum2 - v.x * v.x) * dv.x - v.x * (vdv_sum - vdv.x)) * invsum32; 128 | dnormvdv.y = ((sum2 - v.y * v.y) * dv.y - v.y * (vdv_sum - vdv.y)) * invsum32; 129 | dnormvdv.z = ((sum2 - v.z * v.z) * dv.z - v.z * (vdv_sum - vdv.z)) * invsum32; 130 | dnormvdv.w = ((sum2 - v.w * v.w) * dv.w - v.w * (vdv_sum - vdv.w)) * invsum32; 131 | return dnormvdv; 132 | } 133 | 134 | __forceinline__ __device__ float sigmoid(float x) 135 | { 136 | return 1.0f / (1.0f + expf(-x)); 137 | } 138 | 139 | __forceinline__ __device__ bool in_frustum(int idx, 140 | const float* orig_points, 141 | const float* viewmatrix, 142 | const float* projmatrix, 143 | bool prefiltered, 144 | float3& p_view) 145 | { 146 | float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; 147 | 148 | // Bring points to screen space 149 | float4 p_hom = transformPoint4x4(p_orig, projmatrix); 150 | float p_w = 1.0f / (p_hom.w + 0.0000001f); 151 | float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; 152 | p_view = transformPoint4x3(p_orig, viewmatrix); 153 | 154 | if (p_view.z <= 0.2f)// || ((p_proj.x < -1.3 || p_proj.x > 1.3 || p_proj.y < -1.3 || p_proj.y > 1.3))) 155 | { 156 | if (prefiltered) 157 | { 158 | printf("Point is filtered although prefiltered is set. This shouldn't happen!"); 159 | __trap(); 160 | } 161 | return false; 162 | } 163 | return true; 164 | } 165 | 166 | #define CHECK_CUDA(A, debug) \ 167 | A; if(debug) { \ 168 | auto ret = cudaDeviceSynchronize(); \ 169 | if (ret != cudaSuccess) { \ 170 | std::cerr << "\n[CUDA ERROR] in " << __FILE__ << "\nLine " << __LINE__ << ": " << cudaGetErrorString(ret); \ 171 | throw std::runtime_error(cudaGetErrorString(ret)); \ 172 | } \ 173 | } 174 | 175 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/backward.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include "backward.h" 13 | #include "auxiliary.h" 14 | #include "math.h" 15 | #include 16 | #include 17 | namespace cg = cooperative_groups; 18 | 19 | // Backward pass for conversion of spherical harmonics to RGB for 20 | // each Gaussian. 21 | __device__ void computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, const bool* clamped, const glm::vec3* dL_dcolor, glm::vec3* dL_dmeans, glm::vec3* dL_dshs, float *dL_dtau) 22 | { 23 | // Compute intermediate values, as it is done during forward 24 | glm::vec3 pos = means[idx]; 25 | glm::vec3 dir_orig = pos - campos; 26 | glm::vec3 dir = dir_orig / glm::length(dir_orig); 27 | 28 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; 29 | 30 | // Use PyTorch rule for clamping: if clamping was applied, 31 | // gradient becomes 0. 32 | glm::vec3 dL_dRGB = dL_dcolor[idx]; 33 | dL_dRGB.x *= clamped[3 * idx + 0] ? 0 : 1; 34 | dL_dRGB.y *= clamped[3 * idx + 1] ? 0 : 1; 35 | dL_dRGB.z *= clamped[3 * idx + 2] ? 0 : 1; 36 | 37 | glm::vec3 dRGBdx(0, 0, 0); 38 | glm::vec3 dRGBdy(0, 0, 0); 39 | glm::vec3 dRGBdz(0, 0, 0); 40 | float x = dir.x; 41 | float y = dir.y; 42 | float z = dir.z; 43 | 44 | // Target location for this Gaussian to write SH gradients to 45 | glm::vec3* dL_dsh = dL_dshs + idx * max_coeffs; 46 | 47 | // No tricks here, just high school-level calculus. 48 | float dRGBdsh0 = SH_C0; 49 | dL_dsh[0] = dRGBdsh0 * dL_dRGB; 50 | if (deg > 0) 51 | { 52 | float dRGBdsh1 = -SH_C1 * y; 53 | float dRGBdsh2 = SH_C1 * z; 54 | float dRGBdsh3 = -SH_C1 * x; 55 | dL_dsh[1] = dRGBdsh1 * dL_dRGB; 56 | dL_dsh[2] = dRGBdsh2 * dL_dRGB; 57 | dL_dsh[3] = dRGBdsh3 * dL_dRGB; 58 | 59 | dRGBdx = -SH_C1 * sh[3]; 60 | dRGBdy = -SH_C1 * sh[1]; 61 | dRGBdz = SH_C1 * sh[2]; 62 | 63 | if (deg > 1) 64 | { 65 | float xx = x * x, yy = y * y, zz = z * z; 66 | float xy = x * y, yz = y * z, xz = x * z; 67 | 68 | float dRGBdsh4 = SH_C2[0] * xy; 69 | float dRGBdsh5 = SH_C2[1] * yz; 70 | float dRGBdsh6 = SH_C2[2] * (2.f * zz - xx - yy); 71 | float dRGBdsh7 = SH_C2[3] * xz; 72 | float dRGBdsh8 = SH_C2[4] * (xx - yy); 73 | dL_dsh[4] = dRGBdsh4 * dL_dRGB; 74 | dL_dsh[5] = dRGBdsh5 * dL_dRGB; 75 | dL_dsh[6] = dRGBdsh6 * dL_dRGB; 76 | dL_dsh[7] = dRGBdsh7 * dL_dRGB; 77 | dL_dsh[8] = dRGBdsh8 * dL_dRGB; 78 | 79 | dRGBdx += SH_C2[0] * y * sh[4] + SH_C2[2] * 2.f * -x * sh[6] + SH_C2[3] * z * sh[7] + SH_C2[4] * 2.f * x * sh[8]; 80 | dRGBdy += SH_C2[0] * x * sh[4] + SH_C2[1] * z * sh[5] + SH_C2[2] * 2.f * -y * sh[6] + SH_C2[4] * 2.f * -y * sh[8]; 81 | dRGBdz += SH_C2[1] * y * sh[5] + SH_C2[2] * 2.f * 2.f * z * sh[6] + SH_C2[3] * x * sh[7]; 82 | 83 | if (deg > 2) 84 | { 85 | float dRGBdsh9 = SH_C3[0] * y * (3.f * xx - yy); 86 | float dRGBdsh10 = SH_C3[1] * xy * z; 87 | float dRGBdsh11 = SH_C3[2] * y * (4.f * zz - xx - yy); 88 | float dRGBdsh12 = SH_C3[3] * z * (2.f * zz - 3.f * xx - 3.f * yy); 89 | float dRGBdsh13 = SH_C3[4] * x * (4.f * zz - xx - yy); 90 | float dRGBdsh14 = SH_C3[5] * z * (xx - yy); 91 | float dRGBdsh15 = SH_C3[6] * x * (xx - 3.f * yy); 92 | dL_dsh[9] = dRGBdsh9 * dL_dRGB; 93 | dL_dsh[10] = dRGBdsh10 * dL_dRGB; 94 | dL_dsh[11] = dRGBdsh11 * dL_dRGB; 95 | dL_dsh[12] = dRGBdsh12 * dL_dRGB; 96 | dL_dsh[13] = dRGBdsh13 * dL_dRGB; 97 | dL_dsh[14] = dRGBdsh14 * dL_dRGB; 98 | dL_dsh[15] = dRGBdsh15 * dL_dRGB; 99 | 100 | dRGBdx += ( 101 | SH_C3[0] * sh[9] * 3.f * 2.f * xy + 102 | SH_C3[1] * sh[10] * yz + 103 | SH_C3[2] * sh[11] * -2.f * xy + 104 | 
SH_C3[3] * sh[12] * -3.f * 2.f * xz + 105 | SH_C3[4] * sh[13] * (-3.f * xx + 4.f * zz - yy) + 106 | SH_C3[5] * sh[14] * 2.f * xz + 107 | SH_C3[6] * sh[15] * 3.f * (xx - yy)); 108 | 109 | dRGBdy += ( 110 | SH_C3[0] * sh[9] * 3.f * (xx - yy) + 111 | SH_C3[1] * sh[10] * xz + 112 | SH_C3[2] * sh[11] * (-3.f * yy + 4.f * zz - xx) + 113 | SH_C3[3] * sh[12] * -3.f * 2.f * yz + 114 | SH_C3[4] * sh[13] * -2.f * xy + 115 | SH_C3[5] * sh[14] * -2.f * yz + 116 | SH_C3[6] * sh[15] * -3.f * 2.f * xy); 117 | 118 | dRGBdz += ( 119 | SH_C3[1] * sh[10] * xy + 120 | SH_C3[2] * sh[11] * 4.f * 2.f * yz + 121 | SH_C3[3] * sh[12] * 3.f * (2.f * zz - xx - yy) + 122 | SH_C3[4] * sh[13] * 4.f * 2.f * xz + 123 | SH_C3[5] * sh[14] * (xx - yy)); 124 | } 125 | } 126 | } 127 | 128 | // The view direction is an input to the computation. View direction 129 | // is influenced by the Gaussian's mean, so SHs gradients 130 | // must propagate back into 3D position. 131 | glm::vec3 dL_ddir(glm::dot(dRGBdx, dL_dRGB), glm::dot(dRGBdy, dL_dRGB), glm::dot(dRGBdz, dL_dRGB)); 132 | 133 | // Account for normalization of direction 134 | float3 dL_dmean = dnormvdv(float3{ dir_orig.x, dir_orig.y, dir_orig.z }, float3{ dL_ddir.x, dL_ddir.y, dL_ddir.z }); 135 | 136 | // Gradients of loss w.r.t. Gaussian means, but only the portion 137 | // that is caused because the mean affects the view-dependent color. 138 | // Additional mean gradient is accumulated in below methods. 139 | dL_dmeans[idx] += glm::vec3(dL_dmean.x, dL_dmean.y, dL_dmean.z); 140 | 141 | dL_dtau[6 * idx + 0] += -dL_dmean.x; 142 | dL_dtau[6 * idx + 1] += -dL_dmean.y; 143 | dL_dtau[6 * idx + 2] += -dL_dmean.z; 144 | 145 | } 146 | 147 | // Backward version of INVERSE 2D covariance matrix computation 148 | // (due to length launched as separate kernel before other 149 | // backward steps contained in preprocess) 150 | __global__ void computeCov2DCUDA(int P, 151 | const float3* means, 152 | const int* radii, 153 | const float* cov3Ds, 154 | const float h_x, float h_y, 155 | const float tan_fovx, float tan_fovy, 156 | const float* view_matrix, 157 | const float* dL_dconics, 158 | float3* dL_dmeans, 159 | float* dL_dcov, 160 | float *dL_dtau) 161 | { 162 | auto idx = cg::this_grid().thread_rank(); 163 | if (idx >= P || !(radii[idx] > 0)) 164 | return; 165 | 166 | // Reading location of 3D covariance for this Gaussian 167 | const float* cov3D = cov3Ds + 6 * idx; 168 | 169 | // Fetch gradients, recompute 2D covariance and relevant 170 | // intermediate forward results needed in the backward. 171 | float3 mean = means[idx]; 172 | float3 dL_dconic = { dL_dconics[4 * idx], dL_dconics[4 * idx + 1], dL_dconics[4 * idx + 3] }; 173 | float3 t = transformPoint4x3(mean, view_matrix); 174 | 175 | const float limx = 1.3f * tan_fovx; 176 | const float limy = 1.3f * tan_fovy; 177 | const float txtz = t.x / t.z; 178 | const float tytz = t.y / t.z; 179 | t.x = min(limx, max(-limx, txtz)) * t.z; 180 | t.y = min(limy, max(-limy, tytz)) * t.z; 181 | 182 | const float x_grad_mul = txtz < -limx || txtz > limx ? 0 : 1; 183 | const float y_grad_mul = tytz < -limy || tytz > limy ? 
0 : 1; 184 | 185 | glm::mat3 J = glm::mat3(h_x / t.z, 0.0f, -(h_x * t.x) / (t.z * t.z), 186 | 0.0f, h_y / t.z, -(h_y * t.y) / (t.z * t.z), 187 | 0, 0, 0); 188 | 189 | glm::mat3 W = glm::mat3( 190 | view_matrix[0], view_matrix[4], view_matrix[8], 191 | view_matrix[1], view_matrix[5], view_matrix[9], 192 | view_matrix[2], view_matrix[6], view_matrix[10]); 193 | 194 | glm::mat3 Vrk = glm::mat3( 195 | cov3D[0], cov3D[1], cov3D[2], 196 | cov3D[1], cov3D[3], cov3D[4], 197 | cov3D[2], cov3D[4], cov3D[5]); 198 | 199 | glm::mat3 T = W * J; 200 | 201 | glm::mat3 cov2D = glm::transpose(T) * glm::transpose(Vrk) * T; 202 | 203 | // Use helper variables for 2D covariance entries. More compact. 204 | float a = cov2D[0][0] += 0.3f; 205 | float b = cov2D[0][1]; 206 | float c = cov2D[1][1] += 0.3f; 207 | 208 | float denom = a * c - b * b; 209 | float dL_da = 0, dL_db = 0, dL_dc = 0; 210 | float denom2inv = 1.0f / ((denom * denom) + 0.0000001f); 211 | 212 | if (denom2inv != 0) 213 | { 214 | // Gradients of loss w.r.t. entries of 2D covariance matrix, 215 | // given gradients of loss w.r.t. conic matrix (inverse covariance matrix). 216 | // e.g., dL / da = dL / d_conic_a * d_conic_a / d_a 217 | dL_da = denom2inv * (-c * c * dL_dconic.x + 2 * b * c * dL_dconic.y + (denom - a * c) * dL_dconic.z); 218 | dL_dc = denom2inv * (-a * a * dL_dconic.z + 2 * a * b * dL_dconic.y + (denom - a * c) * dL_dconic.x); 219 | dL_db = denom2inv * 2 * (b * c * dL_dconic.x - (denom + 2 * b * b) * dL_dconic.y + a * b * dL_dconic.z); 220 | 221 | // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, 222 | // given gradients w.r.t. 2D covariance matrix (diagonal). 223 | // cov2D = transpose(T) * transpose(Vrk) * T; 224 | dL_dcov[6 * idx + 0] = (T[0][0] * T[0][0] * dL_da + T[0][0] * T[1][0] * dL_db + T[1][0] * T[1][0] * dL_dc); 225 | dL_dcov[6 * idx + 3] = (T[0][1] * T[0][1] * dL_da + T[0][1] * T[1][1] * dL_db + T[1][1] * T[1][1] * dL_dc); 226 | dL_dcov[6 * idx + 5] = (T[0][2] * T[0][2] * dL_da + T[0][2] * T[1][2] * dL_db + T[1][2] * T[1][2] * dL_dc); 227 | 228 | // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, 229 | // given gradients w.r.t. 2D covariance matrix (off-diagonal). 230 | // Off-diagonal elements appear twice --> double the gradient. 231 | // cov2D = transpose(T) * transpose(Vrk) * T; 232 | dL_dcov[6 * idx + 1] = 2 * T[0][0] * T[0][1] * dL_da + (T[0][0] * T[1][1] + T[0][1] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][1] * dL_dc; 233 | dL_dcov[6 * idx + 2] = 2 * T[0][0] * T[0][2] * dL_da + (T[0][0] * T[1][2] + T[0][2] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][2] * dL_dc; 234 | dL_dcov[6 * idx + 4] = 2 * T[0][2] * T[0][1] * dL_da + (T[0][1] * T[1][2] + T[0][2] * T[1][1]) * dL_db + 2 * T[1][1] * T[1][2] * dL_dc; 235 | } 236 | else 237 | { 238 | for (int i = 0; i < 6; i++) 239 | dL_dcov[6 * idx + i] = 0; 240 | } 241 | 242 | // Gradients of loss w.r.t. 
upper 2x3 portion of intermediate matrix T 243 | // cov2D = transpose(T) * transpose(Vrk) * T; 244 | float dL_dT00 = 2 * (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_da + 245 | (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_db; 246 | float dL_dT01 = 2 * (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_da + 247 | (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_db; 248 | float dL_dT02 = 2 * (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_da + 249 | (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_db; 250 | float dL_dT10 = 2 * (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_dc + 251 | (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_db; 252 | float dL_dT11 = 2 * (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_dc + 253 | (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_db; 254 | float dL_dT12 = 2 * (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_dc + 255 | (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_db; 256 | 257 | // Gradients of loss w.r.t. upper 3x2 non-zero entries of Jacobian matrix 258 | // T = W * J 259 | float dL_dJ00 = W[0][0] * dL_dT00 + W[0][1] * dL_dT01 + W[0][2] * dL_dT02; 260 | float dL_dJ02 = W[2][0] * dL_dT00 + W[2][1] * dL_dT01 + W[2][2] * dL_dT02; 261 | float dL_dJ11 = W[1][0] * dL_dT10 + W[1][1] * dL_dT11 + W[1][2] * dL_dT12; 262 | float dL_dJ12 = W[2][0] * dL_dT10 + W[2][1] * dL_dT11 + W[2][2] * dL_dT12; 263 | 264 | float tz = 1.f / t.z; 265 | float tz2 = tz * tz; 266 | float tz3 = tz2 * tz; 267 | 268 | // Gradients of loss w.r.t. transformed Gaussian mean t 269 | float dL_dtx = x_grad_mul * -h_x * tz2 * dL_dJ02; 270 | float dL_dty = y_grad_mul * -h_y * tz2 * dL_dJ12; 271 | float dL_dtz = -h_x * tz2 * dL_dJ00 - h_y * tz2 * dL_dJ11 + (2 * h_x * t.x) * tz3 * dL_dJ02 + (2 * h_y * t.y) * tz3 * dL_dJ12; 272 | 273 | SE3 T_CW(view_matrix); 274 | mat33 R = T_CW.R().data(); 275 | mat33 RT = R.transpose(); 276 | float3 t_ = T_CW.t(); 277 | mat33 dpC_drho = mat33::identity(); 278 | mat33 dpC_dtheta = -mat33::skew_symmetric(t); 279 | float dL_dt[6]; 280 | for (int i = 0; i < 3; i++) { 281 | float3 c_rho = dpC_drho.cols[i]; 282 | float3 c_theta = dpC_dtheta.cols[i]; 283 | dL_dt[i] = dL_dtx * c_rho.x + dL_dty * c_rho.y + dL_dtz * c_rho.z; 284 | dL_dt[i + 3] = dL_dtx * c_theta.x + dL_dty * c_theta.y + dL_dtz * c_theta.z; 285 | } 286 | for (int i = 0; i < 6; i++) { 287 | dL_dtau[6 * idx + i] += dL_dt[i]; 288 | } 289 | 290 | // Account for transformation of mean to t 291 | // t = transformPoint4x3(mean, view_matrix); 292 | float3 dL_dmean = transformVec4x3Transpose({ dL_dtx, dL_dty, dL_dtz }, view_matrix); 293 | 294 | // Gradients of loss w.r.t. Gaussian means, but only the portion 295 | // that is caused because the mean affects the covariance matrix. 296 | // Additional mean gradient is accumulated in BACKWARD::preprocess. 
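// Since t = R * mean + t_cw, where R is the rotation block of view_matrix,
// d t / d mean = R; the world-space gradient computed above via
// transformVec4x3Transpose is therefore R^T * (dL_dtx, dL_dty, dL_dtz).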
297 | dL_dmeans[idx] = dL_dmean; 298 | 299 | float dL_dW00 = J[0][0] * dL_dT00; 300 | float dL_dW01 = J[0][0] * dL_dT01; 301 | float dL_dW02 = J[0][0] * dL_dT02; 302 | float dL_dW10 = J[1][1] * dL_dT10; 303 | float dL_dW11 = J[1][1] * dL_dT11; 304 | float dL_dW12 = J[1][1] * dL_dT12; 305 | float dL_dW20 = J[0][2] * dL_dT00 + J[1][2] * dL_dT10; 306 | float dL_dW21 = J[0][2] * dL_dT01 + J[1][2] * dL_dT11; 307 | float dL_dW22 = J[0][2] * dL_dT02 + J[1][2] * dL_dT12; 308 | 309 | float3 c1 = R.cols[0]; 310 | float3 c2 = R.cols[1]; 311 | float3 c3 = R.cols[2]; 312 | 313 | float dL_dW_data[9]; 314 | dL_dW_data[0] = dL_dW00; 315 | dL_dW_data[3] = dL_dW01; 316 | dL_dW_data[6] = dL_dW02; 317 | dL_dW_data[1] = dL_dW10; 318 | dL_dW_data[4] = dL_dW11; 319 | dL_dW_data[7] = dL_dW12; 320 | dL_dW_data[2] = dL_dW20; 321 | dL_dW_data[5] = dL_dW21; 322 | dL_dW_data[8] = dL_dW22; 323 | 324 | mat33 dL_dW(dL_dW_data); 325 | float3 dL_dWc1 = dL_dW.cols[0]; 326 | float3 dL_dWc2 = dL_dW.cols[1]; 327 | float3 dL_dWc3 = dL_dW.cols[2]; 328 | 329 | mat33 n_W1_x = -mat33::skew_symmetric(c1); 330 | mat33 n_W2_x = -mat33::skew_symmetric(c2); 331 | mat33 n_W3_x = -mat33::skew_symmetric(c3); 332 | 333 | float3 dL_dtheta = {}; 334 | dL_dtheta.x = dot(dL_dWc1, n_W1_x.cols[0]) + dot(dL_dWc2, n_W2_x.cols[0]) + 335 | dot(dL_dWc3, n_W3_x.cols[0]); 336 | dL_dtheta.y = dot(dL_dWc1, n_W1_x.cols[1]) + dot(dL_dWc2, n_W2_x.cols[1]) + 337 | dot(dL_dWc3, n_W3_x.cols[1]); 338 | dL_dtheta.z = dot(dL_dWc1, n_W1_x.cols[2]) + dot(dL_dWc2, n_W2_x.cols[2]) + 339 | dot(dL_dWc3, n_W3_x.cols[2]); 340 | 341 | dL_dtau[6 * idx + 3] += dL_dtheta.x; 342 | dL_dtau[6 * idx + 4] += dL_dtheta.y; 343 | dL_dtau[6 * idx + 5] += dL_dtheta.z; 344 | 345 | 346 | } 347 | 348 | // Backward pass for the conversion of scale and rotation to a 349 | // 3D covariance matrix for each Gaussian. 350 | __device__ void computeCov3D(int idx, const glm::vec3 scale, float mod, const glm::vec4 rot, const float* dL_dcov3Ds, glm::vec3* dL_dscales, glm::vec4* dL_drots) 351 | { 352 | // Recompute (intermediate) results for the 3D covariance computation. 353 | glm::vec4 q = rot;// / glm::length(rot); 354 | float r = q.x; 355 | float x = q.y; 356 | float y = q.z; 357 | float z = q.w; 358 | 359 | glm::mat3 R = glm::mat3( 360 | 1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y), 361 | 2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x), 362 | 2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y) 363 | ); 364 | 365 | glm::mat3 S = glm::mat3(1.0f); 366 | 367 | glm::vec3 s = mod * scale; 368 | S[0][0] = s.x; 369 | S[1][1] = s.y; 370 | S[2][2] = s.z; 371 | 372 | glm::mat3 M = S * R; 373 | 374 | const float* dL_dcov3D = dL_dcov3Ds + 6 * idx; 375 | 376 | glm::vec3 dunc(dL_dcov3D[0], dL_dcov3D[3], dL_dcov3D[5]); 377 | glm::vec3 ounc = 0.5f * glm::vec3(dL_dcov3D[1], dL_dcov3D[2], dL_dcov3D[4]); 378 | 379 | // Convert per-element covariance loss gradients to matrix form 380 | glm::mat3 dL_dSigma = glm::mat3( 381 | dL_dcov3D[0], 0.5f * dL_dcov3D[1], 0.5f * dL_dcov3D[2], 382 | 0.5f * dL_dcov3D[1], dL_dcov3D[3], 0.5f * dL_dcov3D[4], 383 | 0.5f * dL_dcov3D[2], 0.5f * dL_dcov3D[4], dL_dcov3D[5] 384 | ); 385 | 386 | // Compute loss gradient w.r.t. matrix M 387 | // dSigma_dM = 2 * M 388 | glm::mat3 dL_dM = 2.0f * M * dL_dSigma; 389 | 390 | glm::mat3 Rt = glm::transpose(R); 391 | glm::mat3 dL_dMt = glm::transpose(dL_dM); 392 | 393 | // Gradients of loss w.r.t. 
scale 394 | glm::vec3* dL_dscale = dL_dscales + idx; 395 | dL_dscale->x = glm::dot(Rt[0], dL_dMt[0]); 396 | dL_dscale->y = glm::dot(Rt[1], dL_dMt[1]); 397 | dL_dscale->z = glm::dot(Rt[2], dL_dMt[2]); 398 | 399 | dL_dMt[0] *= s.x; 400 | dL_dMt[1] *= s.y; 401 | dL_dMt[2] *= s.z; 402 | 403 | // Gradients of loss w.r.t. normalized quaternion 404 | glm::vec4 dL_dq; 405 | dL_dq.x = 2 * z * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * y * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * x * (dL_dMt[1][2] - dL_dMt[2][1]); 406 | dL_dq.y = 2 * y * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * z * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * r * (dL_dMt[1][2] - dL_dMt[2][1]) - 4 * x * (dL_dMt[2][2] + dL_dMt[1][1]); 407 | dL_dq.z = 2 * x * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * r * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * z * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * y * (dL_dMt[2][2] + dL_dMt[0][0]); 408 | dL_dq.w = 2 * r * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * x * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * y * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * z * (dL_dMt[1][1] + dL_dMt[0][0]); 409 | 410 | // Gradients of loss w.r.t. unnormalized quaternion 411 | float4* dL_drot = (float4*)(dL_drots + idx); 412 | *dL_drot = float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w };//dnormvdv(float4{ rot.x, rot.y, rot.z, rot.w }, float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w }); 413 | } 414 | 415 | // Backward pass of the preprocessing steps, except 416 | // for the covariance computation and inversion 417 | // (those are handled by a previous kernel call) 418 | template 419 | __global__ void preprocessCUDA( 420 | int P, int D, int M, 421 | const float3* means, 422 | const int* radii, 423 | const float* shs, 424 | const bool* clamped, 425 | const glm::vec3* scales, 426 | const glm::vec4* rotations, 427 | const float scale_modifier, 428 | const float *viewmatrix, 429 | const float* proj, 430 | const float *proj_raw, 431 | const glm::vec3* campos, 432 | const float3* dL_dmean2D, 433 | glm::vec3* dL_dmeans, 434 | float* dL_dcolor, 435 | float *dL_ddepth, 436 | float* dL_dcov3D, 437 | float* dL_dsh, 438 | glm::vec3* dL_dscale, 439 | glm::vec4* dL_drot, 440 | float *dL_dtau) 441 | { 442 | auto idx = cg::this_grid().thread_rank(); 443 | if (idx >= P || !(radii[idx] > 0)) 444 | return; 445 | 446 | float3 m = means[idx]; 447 | 448 | // Taking care of gradients from the screenspace points 449 | float4 m_hom = transformPoint4x4(m, proj); 450 | float m_w = 1.0f / (m_hom.w + 0.0000001f); 451 | 452 | // Compute loss gradient w.r.t. 3D means due to gradients of 2D means 453 | // from rendering procedure 454 | glm::vec3 dL_dmean; 455 | float mul1 = (proj[0] * m.x + proj[4] * m.y + proj[8] * m.z + proj[12]) * m_w * m_w; 456 | float mul2 = (proj[1] * m.x + proj[5] * m.y + proj[9] * m.z + proj[13]) * m_w * m_w; 457 | dL_dmean.x = (proj[0] * m_w - proj[3] * mul1) * dL_dmean2D[idx].x + (proj[1] * m_w - proj[3] * mul2) * dL_dmean2D[idx].y; 458 | dL_dmean.y = (proj[4] * m_w - proj[7] * mul1) * dL_dmean2D[idx].x + (proj[5] * m_w - proj[7] * mul2) * dL_dmean2D[idx].y; 459 | dL_dmean.z = (proj[8] * m_w - proj[11] * mul1) * dL_dmean2D[idx].x + (proj[9] * m_w - proj[11] * mul2) * dL_dmean2D[idx].y; 460 | 461 | // That's the second part of the mean gradient. Previous computation 462 | // of cov2D and following SH conversion also affects it. 
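// Derivation for the block above: with m_hom = proj * m and m_w = 1 / m_hom.w,
// the projected coordinate is x = m_hom.x * m_w, so by the quotient rule
//   dx/dm_j = proj_xj * m_w - m_hom.x * m_w^2 * proj_wj,
// which is the (proj[..] * m_w - proj[3|7|11] * mul1) pattern, with
// mul1 = m_hom.x * m_w * m_w and mul2 = m_hom.y * m_w * m_w.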
463 | dL_dmeans[idx] += dL_dmean; 464 | 465 | float alpha = 1.0f * m_w; 466 | float beta = -m_hom.x * m_w * m_w; 467 | float gamma = -m_hom.y * m_w * m_w; 468 | 469 | float a = proj_raw[0]; 470 | float b = proj_raw[5]; 471 | float c = proj_raw[10]; 472 | float d = proj_raw[14]; 473 | float e = proj_raw[11]; 474 | 475 | SE3 T_CW(viewmatrix); 476 | mat33 R = T_CW.R().data(); 477 | mat33 RT = R.transpose(); 478 | float3 t = T_CW.t(); 479 | float3 p_C = T_CW * m; 480 | mat33 dp_C_d_rho = mat33::identity(); 481 | mat33 dp_C_d_theta = -mat33::skew_symmetric(p_C); 482 | 483 | float3 d_proj_dp_C1 = make_float3(alpha * a, 0.f, beta * e); 484 | float3 d_proj_dp_C2 = make_float3(0.f, alpha * b, gamma * e); 485 | 486 | float3 d_proj_dp_C1_d_rho = dp_C_d_rho.transpose() * d_proj_dp_C1; // x.T A = A.T x 487 | float3 d_proj_dp_C2_d_rho = dp_C_d_rho.transpose() * d_proj_dp_C2; 488 | float3 d_proj_dp_C1_d_theta = dp_C_d_theta.transpose() * d_proj_dp_C1; 489 | float3 d_proj_dp_C2_d_theta = dp_C_d_theta.transpose() * d_proj_dp_C2; 490 | 491 | float2 dmean2D_dtau[6]; 492 | dmean2D_dtau[0].x = d_proj_dp_C1_d_rho.x; 493 | dmean2D_dtau[1].x = d_proj_dp_C1_d_rho.y; 494 | dmean2D_dtau[2].x = d_proj_dp_C1_d_rho.z; 495 | dmean2D_dtau[3].x = d_proj_dp_C1_d_theta.x; 496 | dmean2D_dtau[4].x = d_proj_dp_C1_d_theta.y; 497 | dmean2D_dtau[5].x = d_proj_dp_C1_d_theta.z; 498 | 499 | dmean2D_dtau[0].y = d_proj_dp_C2_d_rho.x; 500 | dmean2D_dtau[1].y = d_proj_dp_C2_d_rho.y; 501 | dmean2D_dtau[2].y = d_proj_dp_C2_d_rho.z; 502 | dmean2D_dtau[3].y = d_proj_dp_C2_d_theta.x; 503 | dmean2D_dtau[4].y = d_proj_dp_C2_d_theta.y; 504 | dmean2D_dtau[5].y = d_proj_dp_C2_d_theta.z; 505 | 506 | float dL_dt[6]; 507 | for (int i = 0; i < 6; i++) { 508 | dL_dt[i] = dL_dmean2D[idx].x * dmean2D_dtau[i].x + dL_dmean2D[idx].y * dmean2D_dtau[i].y; 509 | } 510 | for (int i = 0; i < 6; i++) { 511 | dL_dtau[6 * idx + i] += dL_dt[i]; 512 | } 513 | 514 | // Compute gradient update due to computing depths 515 | // p_orig = m 516 | // p_view = transformPoint4x3(p_orig, viewmatrix); 517 | // depth = p_view.z; 518 | float dL_dpCz = dL_ddepth[idx]; 519 | dL_dmeans[idx].x += dL_dpCz * viewmatrix[2]; 520 | dL_dmeans[idx].y += dL_dpCz * viewmatrix[6]; 521 | dL_dmeans[idx].z += dL_dpCz * viewmatrix[10]; 522 | 523 | for (int i = 0; i < 3; i++) { 524 | float3 c_rho = dp_C_d_rho.cols[i]; 525 | float3 c_theta = dp_C_d_theta.cols[i]; 526 | dL_dtau[6 * idx + i] += dL_dpCz * c_rho.z; 527 | dL_dtau[6 * idx + i + 3] += dL_dpCz * c_theta.z; 528 | } 529 | 530 | 531 | 532 | // Compute gradient updates due to computing colors from SHs 533 | if (shs) 534 | computeColorFromSH(idx, D, M, (glm::vec3*)means, *campos, shs, clamped, (glm::vec3*)dL_dcolor, (glm::vec3*)dL_dmeans, (glm::vec3*)dL_dsh, dL_dtau); 535 | 536 | // Compute gradient updates due to computing covariance from scale/rotation 537 | if (scales) 538 | computeCov3D(idx, scales[idx], scale_modifier, rotations[idx], dL_dcov3D, dL_dscale, dL_drot); 539 | } 540 | 541 | template 542 | __device__ void inline reduce_helper(int lane, int i, T *data) { 543 | if (lane < i) { 544 | data[lane] += data[lane + i]; 545 | } 546 | } 547 | 548 | template 549 | __device__ void render_cuda_reduce_sum(group_t g, Lists... lists) { 550 | int lane = g.thread_rank(); 551 | g.sync(); 552 | 553 | for (int i = g.size() / 2; i > 0; i /= 2) { 554 | (..., 555 | reduce_helper( 556 | lane, i, lists)); // Fold expression: apply reduce_helper for each list 557 | g.sync(); 558 | } 559 | } 560 | 561 | 562 | // Backward version of the rendering procedure. 
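// Walking back-to-front, the loop below recovers the transmittance in front of
// each Gaussian via T <- T / (1 - alpha) and maintains accum_rec, the blended
// contribution of everything behind it, so that per channel
//   dL/dalpha += (c - accum_rec) * T * dL/dpixel,
// plus analogous depth and background terms. Per-pixel gradients are then
// reduced across the thread block (render_cuda_reduce_sum), so only thread 0
// issues atomicAdds into the per-Gaussian global buffers.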
563 | template 564 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y) 565 | renderCUDA( 566 | const uint2* __restrict__ ranges, 567 | const uint32_t* __restrict__ point_list, 568 | int W, int H, 569 | const float* __restrict__ bg_color, 570 | const float2* __restrict__ points_xy_image, 571 | const float4* __restrict__ conic_opacity, 572 | const float* __restrict__ colors, 573 | const float* __restrict__ depths, 574 | const float* __restrict__ final_Ts, 575 | const uint32_t* __restrict__ n_contrib, 576 | const float* __restrict__ dL_dpixels, 577 | const float* __restrict__ dL_dpixels_depth, 578 | float3* __restrict__ dL_dmean2D, 579 | float4* __restrict__ dL_dconic2D, 580 | float* __restrict__ dL_dopacity, 581 | float* __restrict__ dL_dcolors, 582 | float* __restrict__ dL_ddepths) 583 | { 584 | // We rasterize again. Compute necessary block info. 585 | auto block = cg::this_thread_block(); 586 | auto tid = block.thread_rank(); 587 | 588 | const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; 589 | const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; 590 | const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) }; 591 | const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; 592 | const uint32_t pix_id = W * pix.y + pix.x; 593 | const float2 pixf = { (float)pix.x, (float)pix.y }; 594 | 595 | const bool inside = pix.x < W&& pix.y < H; 596 | const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; 597 | 598 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); 599 | 600 | bool done = !inside; 601 | int toDo = range.y - range.x; 602 | 603 | __shared__ int collected_id[BLOCK_SIZE]; 604 | __shared__ float2 collected_xy[BLOCK_SIZE]; 605 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; 606 | __shared__ float collected_colors[C * BLOCK_SIZE]; 607 | __shared__ float collected_depths[BLOCK_SIZE]; 608 | 609 | __shared__ float2 dL_dmean2D_shared[BLOCK_SIZE]; 610 | __shared__ float3 dL_dcolors_shared[BLOCK_SIZE]; 611 | __shared__ float dL_ddepths_shared[BLOCK_SIZE]; 612 | __shared__ float dL_dopacity_shared[BLOCK_SIZE]; 613 | __shared__ float4 dL_dconic2D_shared[BLOCK_SIZE]; 614 | 615 | // In the forward, we stored the final value for T, the 616 | // product of all (1 - alpha) factors. 617 | const float T_final = inside ? final_Ts[pix_id] : 0; 618 | float T = T_final; 619 | 620 | // We start from the back. The ID of the last contributing 621 | // Gaussian is known from each pixel from the forward. 622 | uint32_t contributor = toDo; 623 | const int last_contributor = inside ? n_contrib[pix_id] : 0; 624 | 625 | float accum_rec[C] = { 0 }; 626 | float dL_dpixel[C] = { 0 }; 627 | float accum_rec_depth = 0; 628 | float dL_dpixel_depth = 0; 629 | if (inside) { 630 | #pragma unroll 631 | for (int i = 0; i < C; i++) { 632 | dL_dpixel[i] = dL_dpixels[i * H * W + pix_id]; 633 | } 634 | dL_dpixel_depth = dL_dpixels_depth[pix_id]; 635 | } 636 | 637 | float last_alpha = 0.f; 638 | float last_color[C] = { 0.f }; 639 | float last_depth = 0.f; 640 | 641 | // Gradient of pixel coordinate w.r.t. 
normalized 642 | // screen-space viewport corrdinates (-1 to 1) 643 | const float ddelx_dx = 0.5f * W; 644 | const float ddely_dy = 0.5f * H; 645 | __shared__ int skip_counter; 646 | 647 | // Traverse all Gaussians 648 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) 649 | { 650 | // Load auxiliary data into shared memory, start in the BACK 651 | // and load them in revers order. 652 | // block.sync(); 653 | const int progress = i * BLOCK_SIZE + tid; 654 | if (range.x + progress < range.y) 655 | { 656 | const int coll_id = point_list[range.y - progress - 1]; 657 | collected_id[tid] = coll_id; 658 | collected_xy[tid] = points_xy_image[coll_id]; 659 | collected_conic_opacity[tid] = conic_opacity[coll_id]; 660 | #pragma unroll 661 | for (int i = 0; i < C; i++) { 662 | collected_colors[i * BLOCK_SIZE + tid] = colors[coll_id * C + i]; 663 | 664 | } 665 | collected_depths[tid] = depths[coll_id]; 666 | } 667 | for (int j = 0; j < min(BLOCK_SIZE, toDo); j++) { 668 | block.sync(); 669 | if (tid == 0) { 670 | skip_counter = 0; 671 | } 672 | block.sync(); 673 | 674 | // Keep track of current Gaussian ID. Skip, if this one 675 | // is behind the last contributor for this pixel. 676 | bool skip = done; 677 | contributor = done ? contributor : contributor - 1; 678 | skip |= contributor >= last_contributor; 679 | 680 | // Compute blending values, as before. 681 | const float2 xy = collected_xy[j]; 682 | const float2 d = { xy.x - pixf.x, xy.y - pixf.y }; 683 | const float4 con_o = collected_conic_opacity[j]; 684 | const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 685 | skip |= power > 0.0f; 686 | 687 | const float G = exp(power); 688 | const float alpha = min(0.99f, con_o.w * G); 689 | skip |= alpha < 1.0f / 255.0f; 690 | 691 | if (skip) { 692 | atomicAdd(&skip_counter, 1); 693 | } 694 | block.sync(); 695 | if (skip_counter == BLOCK_SIZE) { 696 | continue; 697 | } 698 | 699 | 700 | T = skip ? T : T / (1.f - alpha); 701 | const float dchannel_dcolor = alpha * T; 702 | 703 | // Propagate gradients to per-Gaussian colors and keep 704 | // gradients w.r.t. alpha (blending factor for a Gaussian/pixel 705 | // pair). 706 | float dL_dalpha = 0.0f; 707 | const int global_id = collected_id[j]; 708 | float local_dL_dcolors[3]; 709 | #pragma unroll 710 | for (int ch = 0; ch < C; ch++) 711 | { 712 | const float c = collected_colors[ch * BLOCK_SIZE + j]; 713 | // Update last color (to be used in the next iteration) 714 | accum_rec[ch] = skip ? accum_rec[ch] : last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch]; 715 | last_color[ch] = skip ? last_color[ch] : c; 716 | 717 | const float dL_dchannel = dL_dpixel[ch]; 718 | dL_dalpha += (c - accum_rec[ch]) * dL_dchannel; 719 | local_dL_dcolors[ch] = skip ? 0.0f : dchannel_dcolor * dL_dchannel; 720 | } 721 | dL_dcolors_shared[tid].x = local_dL_dcolors[0]; 722 | dL_dcolors_shared[tid].y = local_dL_dcolors[1]; 723 | dL_dcolors_shared[tid].z = local_dL_dcolors[2]; 724 | 725 | const float depth = collected_depths[j]; 726 | accum_rec_depth = skip ? accum_rec_depth : last_alpha * last_depth + (1.f - last_alpha) * accum_rec_depth; 727 | last_depth = skip ? last_depth : depth; 728 | dL_dalpha += (depth - accum_rec_depth) * dL_dpixel_depth; 729 | dL_ddepths_shared[tid] = skip ? 0.f : dchannel_dcolor * dL_dpixel_depth; 730 | 731 | 732 | dL_dalpha *= T; 733 | // Update last alpha (to be used in the next iteration) 734 | last_alpha = skip ? 
last_alpha : alpha; 735 | 736 | // Account for fact that alpha also influences how much of 737 | // the background color is added if nothing left to blend 738 | float bg_dot_dpixel = 0.f; 739 | #pragma unroll 740 | for (int i = 0; i < C; i++) { 741 | bg_dot_dpixel += bg_color[i] * dL_dpixel[i]; 742 | } 743 | dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel; 744 | 745 | // Helpful reusable temporary variables 746 | const float dL_dG = con_o.w * dL_dalpha; 747 | const float gdx = G * d.x; 748 | const float gdy = G * d.y; 749 | const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y; 750 | const float dG_ddely = -gdy * con_o.z - gdx * con_o.y; 751 | 752 | dL_dmean2D_shared[tid].x = skip ? 0.f : dL_dG * dG_ddelx * ddelx_dx; 753 | dL_dmean2D_shared[tid].y = skip ? 0.f : dL_dG * dG_ddely * ddely_dy; 754 | dL_dconic2D_shared[tid].x = skip ? 0.f : -0.5f * gdx * d.x * dL_dG; 755 | dL_dconic2D_shared[tid].y = skip ? 0.f : -0.5f * gdx * d.y * dL_dG; 756 | dL_dconic2D_shared[tid].w = skip ? 0.f : -0.5f * gdy * d.y * dL_dG; 757 | dL_dopacity_shared[tid] = skip ? 0.f : G * dL_dalpha; 758 | 759 | render_cuda_reduce_sum(block, 760 | dL_dmean2D_shared, 761 | dL_dconic2D_shared, 762 | dL_dopacity_shared, 763 | dL_dcolors_shared, 764 | dL_ddepths_shared 765 | ); 766 | 767 | if (tid == 0) { 768 | float2 dL_dmean2D_acc = dL_dmean2D_shared[0]; 769 | float4 dL_dconic2D_acc = dL_dconic2D_shared[0]; 770 | float dL_dopacity_acc = dL_dopacity_shared[0]; 771 | float3 dL_dcolors_acc = dL_dcolors_shared[0]; 772 | float dL_ddepths_acc = dL_ddepths_shared[0]; 773 | 774 | atomicAdd(&dL_dmean2D[global_id].x, dL_dmean2D_acc.x); 775 | atomicAdd(&dL_dmean2D[global_id].y, dL_dmean2D_acc.y); 776 | atomicAdd(&dL_dconic2D[global_id].x, dL_dconic2D_acc.x); 777 | atomicAdd(&dL_dconic2D[global_id].y, dL_dconic2D_acc.y); 778 | atomicAdd(&dL_dconic2D[global_id].w, dL_dconic2D_acc.w); 779 | atomicAdd(&dL_dopacity[global_id], dL_dopacity_acc); 780 | atomicAdd(&dL_dcolors[global_id * C + 0], dL_dcolors_acc.x); 781 | atomicAdd(&dL_dcolors[global_id * C + 1], dL_dcolors_acc.y); 782 | atomicAdd(&dL_dcolors[global_id * C + 2], dL_dcolors_acc.z); 783 | atomicAdd(&dL_ddepths[global_id], dL_ddepths_acc); 784 | } 785 | } 786 | } 787 | } 788 | 789 | void BACKWARD::preprocess( 790 | int P, int D, int M, 791 | const float3* means3D, 792 | const int* radii, 793 | const float* shs, 794 | const bool* clamped, 795 | const glm::vec3* scales, 796 | const glm::vec4* rotations, 797 | const float scale_modifier, 798 | const float* cov3Ds, 799 | const float* viewmatrix, 800 | const float* projmatrix, 801 | const float* projmatrix_raw, 802 | const float focal_x, float focal_y, 803 | const float tan_fovx, float tan_fovy, 804 | const glm::vec3* campos, 805 | const float3* dL_dmean2D, 806 | const float* dL_dconic, 807 | glm::vec3* dL_dmean3D, 808 | float* dL_dcolor, 809 | float* dL_ddepth, 810 | float* dL_dcov3D, 811 | float* dL_dsh, 812 | glm::vec3* dL_dscale, 813 | glm::vec4* dL_drot, 814 | float* dL_dtau) 815 | { 816 | // Propagate gradients for the path of 2D conic matrix computation. 817 | // Somewhat long, thus it is its own kernel rather than being part of 818 | // "preprocess". When done, loss gradient w.r.t. 3D means has been 819 | // modified and gradient w.r.t. 3D covariance matrix has been computed. 
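// Note on accumulation: computeCov2DCUDA assigns dL_dmean3D, while the
// preprocess kernel below only ever adds to it; dL_dtau is only ever added to
// by both kernels and is therefore expected to arrive zero-initialized.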
820 | computeCov2DCUDA << <(P + 255) / 256, 256 >> > ( 821 | P, 822 | means3D, 823 | radii, 824 | cov3Ds, 825 | focal_x, 826 | focal_y, 827 | tan_fovx, 828 | tan_fovy, 829 | viewmatrix, 830 | dL_dconic, 831 | (float3*)dL_dmean3D, 832 | dL_dcov3D, 833 | dL_dtau); 834 | 835 | // Propagate gradients for remaining steps: finish 3D mean gradients, 836 | // propagate color gradients to SH (if desireD), propagate 3D covariance 837 | // matrix gradients to scale and rotation. 838 | preprocessCUDA << < (P + 255) / 256, 256 >> > ( 839 | P, D, M, 840 | (float3*)means3D, 841 | radii, 842 | shs, 843 | clamped, 844 | (glm::vec3*)scales, 845 | (glm::vec4*)rotations, 846 | scale_modifier, 847 | viewmatrix, 848 | projmatrix, 849 | projmatrix_raw, 850 | campos, 851 | (float3*)dL_dmean2D, 852 | (glm::vec3*)dL_dmean3D, 853 | dL_dcolor, 854 | dL_ddepth, 855 | dL_dcov3D, 856 | dL_dsh, 857 | dL_dscale, 858 | dL_drot, 859 | dL_dtau); 860 | } 861 | 862 | void BACKWARD::render( 863 | const dim3 grid, const dim3 block, 864 | const uint2* ranges, 865 | const uint32_t* point_list, 866 | int W, int H, 867 | const float* bg_color, 868 | const float2* means2D, 869 | const float4* conic_opacity, 870 | const float* colors, 871 | const float* depths, 872 | const float* final_Ts, 873 | const uint32_t* n_contrib, 874 | const float* dL_dpixels, 875 | const float* dL_dpixels_depth, 876 | float3* dL_dmean2D, 877 | float4* dL_dconic2D, 878 | float* dL_dopacity, 879 | float* dL_dcolors, 880 | float* dL_ddepths) 881 | { 882 | renderCUDA << > >( 883 | ranges, 884 | point_list, 885 | W, H, 886 | bg_color, 887 | means2D, 888 | conic_opacity, 889 | colors, 890 | depths, 891 | final_Ts, 892 | n_contrib, 893 | dL_dpixels, 894 | dL_dpixels_depth, 895 | dL_dmean2D, 896 | dL_dconic2D, 897 | dL_dopacity, 898 | dL_dcolors, 899 | dL_ddepths 900 | ); 901 | } -------------------------------------------------------------------------------- /cuda_rasterizer/backward.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_BACKWARD_H_INCLUDED 13 | #define CUDA_RASTERIZER_BACKWARD_H_INCLUDED 14 | 15 | #include 16 | #include "cuda_runtime.h" 17 | #include "device_launch_parameters.h" 18 | #define GLM_FORCE_CUDA 19 | #include 20 | 21 | namespace BACKWARD 22 | { 23 | void render( 24 | const dim3 grid, const dim3 block, 25 | const uint2* ranges, 26 | const uint32_t* point_list, 27 | int W, int H, 28 | const float* bg_color, 29 | const float2* means2D, 30 | const float4* conic_opacity, 31 | const float* colors, 32 | const float* depths, 33 | const float* final_Ts, 34 | const uint32_t* n_contrib, 35 | const float* dL_dpixels, 36 | const float* dL_dpixels_depth, 37 | float3* dL_dmean2D, 38 | float4* dL_dconic2D, 39 | float* dL_dopacity, 40 | float* dL_dcolors, 41 | float* dL_ddepths); 42 | 43 | void preprocess( 44 | int P, int D, int M, 45 | const float3* means, 46 | const int* radii, 47 | const float* shs, 48 | const bool* clamped, 49 | const glm::vec3* scales, 50 | const glm::vec4* rotations, 51 | const float scale_modifier, 52 | const float* cov3Ds, 53 | const float* view, 54 | const float* proj, 55 | const float* proj_raw, 56 | const float focal_x, float focal_y, 57 | const float tan_fovx, float tan_fovy, 58 | const glm::vec3* campos, 59 | const float3* dL_dmean2D, 60 | const float* dL_dconics, 61 | glm::vec3* dL_dmeans, 62 | float* dL_dcolor, 63 | float* dL_ddepth, 64 | float* dL_dcov3D, 65 | float* dL_dsh, 66 | glm::vec3* dL_dscale, 67 | glm::vec4* dL_drot, 68 | float* dL_dtau); 69 | } 70 | 71 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/config.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_CONFIG_H_INCLUDED 13 | #define CUDA_RASTERIZER_CONFIG_H_INCLUDED 14 | 15 | #define NUM_CHANNELS 3 // Default 3, RGB 16 | #define BLOCK_X 16 17 | #define BLOCK_Y 16 18 | 19 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/forward.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include "forward.h" 13 | #include "auxiliary.h" 14 | #include "helper_math.h" 15 | #include "math.h" 16 | #include 17 | #include 18 | namespace cg = cooperative_groups; 19 | 20 | // Forward method for converting the input spherical harmonics 21 | // coefficients of each Gaussian to a simple RGB color. 22 | __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped) 23 | { 24 | // The implementation is loosely based on code for 25 | // "Differentiable Point-Based Radiance Fields for 26 | // Efficient View Synthesis" by Zhang et al. 
(2022) 27 | glm::vec3 pos = means[idx]; 28 | glm::vec3 dir = pos - campos; 29 | dir = dir / glm::length(dir); 30 | 31 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; 32 | glm::vec3 result = SH_C0 * sh[0]; 33 | 34 | if (deg > 0) 35 | { 36 | float x = dir.x; 37 | float y = dir.y; 38 | float z = dir.z; 39 | result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3]; 40 | 41 | if (deg > 1) 42 | { 43 | float xx = x * x, yy = y * y, zz = z * z; 44 | float xy = x * y, yz = y * z, xz = x * z; 45 | result = result + 46 | SH_C2[0] * xy * sh[4] + 47 | SH_C2[1] * yz * sh[5] + 48 | SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] + 49 | SH_C2[3] * xz * sh[7] + 50 | SH_C2[4] * (xx - yy) * sh[8]; 51 | 52 | if (deg > 2) 53 | { 54 | result = result + 55 | SH_C3[0] * y * (3.0f * xx - yy) * sh[9] + 56 | SH_C3[1] * xy * z * sh[10] + 57 | SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] + 58 | SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] + 59 | SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] + 60 | SH_C3[5] * z * (xx - yy) * sh[14] + 61 | SH_C3[6] * x * (xx - 3.0f * yy) * sh[15]; 62 | } 63 | } 64 | } 65 | result += 0.5f; 66 | 67 | // RGB colors are clamped to positive values. If values are 68 | // clamped, we need to keep track of this for the backward pass. 69 | clamped[3 * idx + 0] = (result.x < 0); 70 | clamped[3 * idx + 1] = (result.y < 0); 71 | clamped[3 * idx + 2] = (result.z < 0); 72 | return glm::max(result, 0.0f); 73 | } 74 | 75 | // Forward version of 2D covariance matrix computation 76 | __device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix) 77 | { 78 | // The following models the steps outlined by equations 29 79 | // and 31 in "EWA Splatting" (Zwicker et al., 2002). 80 | // Additionally considers aspect / scaling of viewport. 81 | // Transposes used to account for row-/column-major conventions. 82 | float3 t = transformPoint4x3(mean, viewmatrix); 83 | 84 | const float limx = 1.3f * tan_fovx; 85 | const float limy = 1.3f * tan_fovy; 86 | const float txtz = t.x / t.z; 87 | const float tytz = t.y / t.z; 88 | t.x = min(limx, max(-limx, txtz)) * t.z; 89 | t.y = min(limy, max(-limy, tytz)) * t.z; 90 | 91 | glm::mat3 J = glm::mat3( 92 | focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z), 93 | 0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z), 94 | 0, 0, 0); 95 | 96 | glm::mat3 W = glm::mat3( 97 | viewmatrix[0], viewmatrix[4], viewmatrix[8], 98 | viewmatrix[1], viewmatrix[5], viewmatrix[9], 99 | viewmatrix[2], viewmatrix[6], viewmatrix[10]); 100 | 101 | glm::mat3 T = W * J; 102 | 103 | glm::mat3 Vrk = glm::mat3( 104 | cov3D[0], cov3D[1], cov3D[2], 105 | cov3D[1], cov3D[3], cov3D[4], 106 | cov3D[2], cov3D[4], cov3D[5]); 107 | 108 | glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T; 109 | 110 | // Apply low-pass filter: every Gaussian should be at least 111 | // one pixel wide/high. Discard 3rd row and column. 112 | cov[0][0] += 0.3f; 113 | cov[1][1] += 0.3f; 114 | return { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) }; 115 | } 116 | 117 | // Forward method for converting scale and rotation properties of each 118 | // Gaussian to a 3D covariance matrix in world space. Also takes care 119 | // of quaternion normalization. 
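// Note: despite the comment above, the quaternion is used as-is here; the
// normalization (/ glm::length(rot)) is commented out below, so rot is assumed
// to already be normalized by the caller.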
120 | __device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D) 121 | { 122 | // Create scaling matrix 123 | glm::mat3 S = glm::mat3(1.0f); 124 | S[0][0] = mod * scale.x; 125 | S[1][1] = mod * scale.y; 126 | S[2][2] = mod * scale.z; 127 | 128 | // Use quaternion (assumed pre-normalized) to get a valid rotation 129 | glm::vec4 q = rot;// / glm::length(rot); 130 | float r = q.x; 131 | float x = q.y; 132 | float y = q.z; 133 | float z = q.w; 134 | 135 | // Compute rotation matrix from quaternion 136 | glm::mat3 R = glm::mat3( 137 | 1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y), 138 | 2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x), 139 | 2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y) 140 | ); 141 | 142 | glm::mat3 M = S * R; 143 | 144 | // Compute 3D world covariance matrix Sigma 145 | glm::mat3 Sigma = glm::transpose(M) * M; 146 | 147 | // Covariance is symmetric, only store upper right 148 | cov3D[0] = Sigma[0][0]; 149 | cov3D[1] = Sigma[0][1]; 150 | cov3D[2] = Sigma[0][2]; 151 | cov3D[3] = Sigma[1][1]; 152 | cov3D[4] = Sigma[1][2]; 153 | cov3D[5] = Sigma[2][2]; 154 | } 155 | 156 | // Perform initial steps for each Gaussian prior to rasterization. 157 | template<int C> 158 | __global__ void preprocessCUDA(int P, int D, int M, 159 | const float* orig_points, 160 | const glm::vec3* scales, 161 | const float scale_modifier, 162 | const glm::vec4* rotations, 163 | const float* opacities, 164 | const float* shs, 165 | bool* clamped, 166 | const float* cov3D_precomp, 167 | const float* colors_precomp, 168 | const float* viewmatrix, 169 | const float* projmatrix, 170 | const glm::vec3* cam_pos, 171 | const int W, int H, 172 | const float tan_fovx, float tan_fovy, 173 | const float focal_x, float focal_y, 174 | int* radii, 175 | float2* points_xy_image, 176 | float* depths, 177 | float* cov3Ds, 178 | float* rgb, 179 | float4* conic_opacity, 180 | const dim3 grid, 181 | uint32_t* tiles_touched, 182 | bool prefiltered) 183 | { 184 | auto idx = cg::this_grid().thread_rank(); 185 | if (idx >= P) 186 | return; 187 | 188 | // Initialize radius and touched tiles to 0. If this isn't changed, 189 | // this Gaussian will not be processed further. 190 | radii[idx] = 0; 191 | tiles_touched[idx] = 0; 192 | 193 | // Perform near culling, quit if outside. 194 | float3 p_view; 195 | if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) 196 | return; 197 | 198 | // Transform point by projecting 199 | float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; 200 | float4 p_hom = transformPoint4x4(p_orig, projmatrix); 201 | float p_w = 1.0f / (p_hom.w + 0.0000001f); 202 | float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; 203 | 204 | // If 3D covariance matrix is precomputed, use it, otherwise compute 205 | // from scaling and rotation parameters.
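// Layout note (inferred from computeCov3D above): each symmetric 3x3
// covariance is packed as 6 floats -- the upper triangle in row-major
// order, { c00, c01, c02, c11, c12, c22 } -- which is why both branches
// below index the covariance buffers with a stride of idx * 6.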
206 | const float* cov3D; 207 | if (cov3D_precomp != nullptr) 208 | { 209 | cov3D = cov3D_precomp + idx * 6; 210 | } 211 | else 212 | { 213 | computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6); 214 | cov3D = cov3Ds + idx * 6; 215 | } 216 | 217 | // Compute 2D screen-space covariance matrix 218 | float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix); 219 | 220 | // Invert covariance (EWA algorithm) 221 | float det = (cov.x * cov.z - cov.y * cov.y); 222 | if (det == 0.0f) 223 | return; 224 | float det_inv = 1.f / det; 225 | float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv }; 226 | 227 | // Compute extent in screen space (by finding eigenvalues of 228 | // 2D covariance matrix). Use extent to compute a bounding rectangle 229 | // of screen-space tiles that this Gaussian overlaps with. Quit if 230 | // rectangle covers 0 tiles. 231 | float mid = 0.5f * (cov.x + cov.z); 232 | float lambda1 = mid + sqrt(max(0.1f, mid * mid - det)); 233 | float lambda2 = mid - sqrt(max(0.1f, mid * mid - det)); 234 | float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2))); 235 | float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) }; 236 | uint2 rect_min, rect_max; 237 | getRect(point_image, my_radius, rect_min, rect_max, grid); 238 | if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0) 239 | return; 240 | 241 | // If colors have been precomputed, use them, otherwise convert 242 | // spherical harmonics coefficients to RGB color. 243 | if (colors_precomp == nullptr) 244 | { 245 | glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped); 246 | rgb[idx * C + 0] = result.x; 247 | rgb[idx * C + 1] = result.y; 248 | rgb[idx * C + 2] = result.z; 249 | } 250 | 251 | // Store some useful helper data for the next steps. 252 | depths[idx] = p_view.z; 253 | radii[idx] = my_radius; 254 | points_xy_image[idx] = point_image; 255 | // Inverse 2D covariance and opacity neatly pack into one float4 256 | conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] }; 257 | tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); 258 | } 259 | 260 | // Main rasterization method. Collaboratively works on one tile per 261 | // block, each thread treats one pixel. Alternates between fetching 262 | // and rasterizing data. 263 | template <uint32_t CHANNELS> 264 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y) 265 | renderCUDA( 266 | const uint2* __restrict__ ranges, 267 | const uint32_t* __restrict__ point_list, 268 | int W, int H, 269 | const float2* __restrict__ points_xy_image, 270 | const float* __restrict__ features, 271 | const float4* __restrict__ conic_opacity, 272 | float* __restrict__ final_T, 273 | uint32_t* __restrict__ n_contrib, 274 | const float* __restrict__ bg_color, 275 | float* __restrict__ out_color, 276 | const float* __restrict__ depth, 277 | float* __restrict__ out_depth, 278 | float* __restrict__ out_opacity, 279 | int* __restrict__ n_touched) 280 | { 281 | // Identify current tile and associated min/max pixel range. 282 | auto block = cg::this_thread_block(); 283 | uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; 284 | // uint32_t horizontal_blocks = gridDim.x; // TODO: maybe it's different?
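// Regarding the TODO above: FORWARD::render (further below) launches this
// kernel with the tile grid supplied by the caller; if that grid is sized
// as ((W + BLOCK_X - 1) / BLOCK_X, (H + BLOCK_Y - 1) / BLOCK_Y), as the
// rasterizer is expected to do, gridDim.x equals horizontal_blocks.
// Worked example: W = 1000, BLOCK_X = 16 -> (1000 + 15) / 16 = 63 tiles
// per row, so the tile at (tx, ty) reads ranges[ty * 63 + tx].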
285 | uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; 286 | uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y, H) }; 287 | uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; 288 | uint32_t pix_id = W * pix.y + pix.x; 289 | float2 pixf = { (float)pix.x, (float)pix.y }; 290 | 291 | // Check if this thread is associated with a valid pixel or outside. 292 | bool inside = pix.x < W && pix.y < H; 293 | // Done threads can help with fetching, but don't rasterize 294 | bool done = !inside; 295 | 296 | // Load start/end range of IDs to process in bit sorted list. 297 | uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; 298 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); 299 | int toDo = range.y - range.x; 300 | 301 | // Allocate storage for batches of collectively fetched data. 302 | __shared__ int collected_id[BLOCK_SIZE]; 303 | __shared__ float2 collected_xy[BLOCK_SIZE]; 304 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; 305 | __shared__ float collected_depth[BLOCK_SIZE]; 306 | 307 | // Initialize helper variables 308 | float T = 1.0f; 309 | uint32_t contributor = 0; 310 | uint32_t last_contributor = 0; 311 | float C[CHANNELS] = { 0 }; 312 | float D = 0.0f; 313 | 314 | // Iterate over batches until all done or range is complete 315 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) 316 | { 317 | // End if entire block votes that it is done rasterizing 318 | int num_done = __syncthreads_count(done); 319 | if (num_done == BLOCK_SIZE) 320 | break; 321 | 322 | // Collectively fetch per-Gaussian data from global to shared 323 | int progress = i * BLOCK_SIZE + block.thread_rank(); 324 | if (range.x + progress < range.y) 325 | { 326 | int coll_id = point_list[range.x + progress]; 327 | collected_id[block.thread_rank()] = coll_id; 328 | collected_xy[block.thread_rank()] = points_xy_image[coll_id]; 329 | collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; 330 | collected_depth[block.thread_rank()] = depth[coll_id]; 331 | } 332 | block.sync(); 333 | 334 | // Iterate over current batch 335 | for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) 336 | { 337 | // Keep track of current position in range 338 | contributor++; 339 | 340 | // Resample using conic matrix (cf. "Surface 341 | // Splatting" by Zwicker et al., 2001) 342 | float2 xy = collected_xy[j]; 343 | float2 d = { xy.x - pixf.x, xy.y - pixf.y }; 344 | float4 con_o = collected_conic_opacity[j]; 345 | float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 346 | if (power > 0.0f) 347 | continue; 348 | 349 | // Eq. (2) from 3D Gaussian splatting paper. 350 | // Obtain alpha by multiplying with Gaussian opacity 351 | // and its exponential falloff from mean. 352 | // Avoid numerical instabilities (see paper appendix). 353 | float alpha = min(0.99f, con_o.w * exp(power)); 354 | if (alpha < 1.0f / 255.0f) { 355 | continue; 356 | } 357 | float test_T = T * (1 - alpha); 358 | if (test_T < 0.0001f) 359 | { 360 | done = true; 361 | continue; 362 | } 363 | // Eq. (3) from 3D Gaussian splatting paper. 364 | for (int ch = 0; ch < CHANNELS; ch++) { 365 | C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T; 366 | } 367 | D += collected_depth[j] * alpha * T; 368 | // Keep track of how many pixels touched this Gaussian.
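// (A Gaussian counts a pixel as "touched" only while the transmittance
// remaining after its own contribution, test_T, stays above 0.5 -- i.e.
// the splat is still clearly visible at this pixel rather than buried
// behind earlier, closer splats. n_touched is accumulated atomically,
// since many pixels across many tiles may hit the same Gaussian.)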
369 | if (test_T > 0.5f) { 370 | atomicAdd(&(n_touched[collected_id[j]]), 1); 371 | } 372 | T = test_T; 373 | 374 | // Keep track of last range entry to update this 375 | // pixel. 376 | last_contributor = contributor; 377 | } 378 | } 379 | 380 | // All threads that treat a valid pixel write out their final 381 | // rendering data to the frame and auxiliary buffers. 382 | if (inside) 383 | { 384 | final_T[pix_id] = T; 385 | n_contrib[pix_id] = last_contributor; 386 | for (int ch = 0; ch < CHANNELS; ch++) { 387 | out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch]; 388 | } 389 | out_depth[pix_id] = D; 390 | out_opacity[pix_id] = 1 - T; 391 | } 392 | } 393 | 394 | void FORWARD::render( 395 | const dim3 grid, dim3 block, 396 | const uint2* ranges, 397 | const uint32_t* point_list, 398 | int W, int H, 399 | const float2* means2D, 400 | const float* colors, 401 | const float4* conic_opacity, 402 | float* final_T, 403 | uint32_t* n_contrib, 404 | const float* bg_color, 405 | float* out_color, 406 | const float* depth, 407 | float* out_depth, 408 | float* out_opacity, 409 | int* n_touched) 410 | { 411 | renderCUDA<NUM_CHANNELS> <<<grid, block>>> ( 412 | ranges, 413 | point_list, 414 | W, H, 415 | means2D, 416 | colors, 417 | conic_opacity, 418 | final_T, 419 | n_contrib, 420 | bg_color, 421 | out_color, 422 | depth, 423 | out_depth, 424 | out_opacity, 425 | n_touched); 426 | } 427 | 428 | void FORWARD::preprocess(int P, int D, int M, 429 | const float* means3D, 430 | const glm::vec3* scales, 431 | const float scale_modifier, 432 | const glm::vec4* rotations, 433 | const float* opacities, 434 | const float* shs, 435 | bool* clamped, 436 | const float* cov3D_precomp, 437 | const float* colors_precomp, 438 | const float* viewmatrix, 439 | const float* projmatrix, 440 | const glm::vec3* cam_pos, 441 | const int W, int H, 442 | const float focal_x, float focal_y, 443 | const float tan_fovx, float tan_fovy, 444 | int* radii, 445 | float2* means2D, 446 | float* depths, 447 | float* cov3Ds, 448 | float* rgb, 449 | float4* conic_opacity, 450 | const dim3 grid, 451 | uint32_t* tiles_touched, 452 | bool prefiltered) 453 | { 454 | preprocessCUDA<NUM_CHANNELS> <<<(P + 255) / 256, 256>>> ( 455 | P, D, M, 456 | means3D, 457 | scales, 458 | scale_modifier, 459 | rotations, 460 | opacities, 461 | shs, 462 | clamped, 463 | cov3D_precomp, 464 | colors_precomp, 465 | viewmatrix, 466 | projmatrix, 467 | cam_pos, 468 | W, H, 469 | tan_fovx, tan_fovy, 470 | focal_x, focal_y, 471 | radii, 472 | means2D, 473 | depths, 474 | cov3Ds, 475 | rgb, 476 | conic_opacity, 477 | grid, 478 | tiles_touched, 479 | prefiltered 480 | ); 481 | } -------------------------------------------------------------------------------- /cuda_rasterizer/forward.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_FORWARD_H_INCLUDED 13 | #define CUDA_RASTERIZER_FORWARD_H_INCLUDED 14 | 15 | #include <cuda.h> 16 | #include "cuda_runtime.h" 17 | #include "device_launch_parameters.h" 18 | #define GLM_FORCE_CUDA 19 | #include <glm/glm.hpp> 20 | 21 | namespace FORWARD 22 | { 23 | // Perform initial steps for each Gaussian prior to rasterization.
24 | void preprocess(int P, int D, int M, 25 | const float* orig_points, 26 | const glm::vec3* scales, 27 | const float scale_modifier, 28 | const glm::vec4* rotations, 29 | const float* opacities, 30 | const float* shs, 31 | bool* clamped, 32 | const float* cov3D_precomp, 33 | const float* colors_precomp, 34 | const float* viewmatrix, 35 | const float* projmatrix, 36 | const glm::vec3* cam_pos, 37 | const int W, int H, 38 | const float focal_x, float focal_y, 39 | const float tan_fovx, float tan_fovy, 40 | int* radii, 41 | float2* points_xy_image, 42 | float* depths, 43 | float* cov3Ds, 44 | float* colors, 45 | float4* conic_opacity, 46 | const dim3 grid, 47 | uint32_t* tiles_touched, 48 | bool prefiltered); 49 | 50 | // Main rasterization method. 51 | void render( 52 | const dim3 grid, dim3 block, 53 | const uint2* ranges, 54 | const uint32_t* point_list, 55 | int W, int H, 56 | const float2* points_xy_image, 57 | const float* features, 58 | const float4* conic_opacity, 59 | float* final_T, 60 | uint32_t* n_contrib, 61 | const float* bg_color, 62 | float* out_color, 63 | const float* depth, 64 | float* out_depth, 65 | float* out_opacity, 66 | int* n_touched); 67 | } 68 | 69 | 70 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/helper_math.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | /* 29 | * This file implements common mathematical operations on vector types 30 | * (float3, float4 etc.) since these are not provided as standard by CUDA. 31 | * 32 | * The syntax is modeled on the Cg standard library. 33 | * 34 | * This is part of the Helper library includes 35 | * 36 | * Thanks to Linh Hah for additions and fixes. 
37 | */ 38 | 39 | #ifndef HELPER_MATH_H 40 | #define HELPER_MATH_H 41 | 42 | #include "cuda_runtime.h" 43 | 44 | typedef unsigned int uint; 45 | typedef unsigned short ushort; 46 | 47 | #ifndef EXIT_WAIVED 48 | #define EXIT_WAIVED 2 49 | #endif 50 | 51 | #ifndef __CUDACC__ 52 | #include <math.h> 53 | 54 | //////////////////////////////////////////////////////////////////////////////// 55 | // host implementations of CUDA functions 56 | //////////////////////////////////////////////////////////////////////////////// 57 | 58 | inline float fminf(float a, float b) { return a < b ? a : b; } 59 | 60 | inline float fmaxf(float a, float b) { return a > b ? a : b; } 61 | 62 | inline int max(int a, int b) { return a > b ? a : b; } 63 | 64 | inline int min(int a, int b) { return a < b ? a : b; } 65 | 66 | inline float rsqrtf(float x) { return 1.0f / sqrtf(x); } 67 | #endif 68 | 69 | //////////////////////////////////////////////////////////////////////////////// 70 | // constructors 71 | //////////////////////////////////////////////////////////////////////////////// 72 | 73 | inline __host__ __device__ float2 make_float2(float s) { return make_float2(s, s); } 74 | inline __host__ __device__ float2 make_float2(float3 a) { return make_float2(a.x, a.y); } 75 | inline __host__ __device__ float2 make_float2(int2 a) { 76 | return make_float2(float(a.x), float(a.y)); 77 | } 78 | inline __host__ __device__ float2 make_float2(uint2 a) { 79 | return make_float2(float(a.x), float(a.y)); 80 | } 81 | 82 | inline __host__ __device__ int2 make_int2(int s) { return make_int2(s, s); } 83 | inline __host__ __device__ int2 make_int2(int3 a) { return make_int2(a.x, a.y); } 84 | inline __host__ __device__ int2 make_int2(uint2 a) { return make_int2(int(a.x), int(a.y)); } 85 | inline __host__ __device__ int2 make_int2(float2 a) { return make_int2(int(a.x), int(a.y)); } 86 | 87 | inline __host__ __device__ uint2 make_uint2(uint s) { return make_uint2(s, s); } 88 | inline __host__ __device__ uint2 make_uint2(uint3 a) { return make_uint2(a.x, a.y); } 89 | inline __host__ __device__ uint2 make_uint2(int2 a) { return make_uint2(uint(a.x), uint(a.y)); } 90 | 91 | inline __host__ __device__ float3 make_float3(float s) { return make_float3(s, s, s); } 92 | inline __host__ __device__ float3 make_float3(float2 a) { return make_float3(a.x, a.y, 0.0f); } 93 | inline __host__ __device__ float3 make_float3(float2 a, float s) { 94 | return make_float3(a.x, a.y, s); 95 | } 96 | inline __host__ __device__ float3 make_float3(float4 a) { return make_float3(a.x, a.y, a.z); } 97 | inline __host__ __device__ float3 make_float3(int3 a) { 98 | return make_float3(float(a.x), float(a.y), float(a.z)); 99 | } 100 | inline __host__ __device__ float3 make_float3(uint3 a) { 101 | return make_float3(float(a.x), float(a.y), float(a.z)); 102 | } 103 | 104 | inline __host__ __device__ int3 make_int3(int s) { return make_int3(s, s, s); } 105 | inline __host__ __device__ int3 make_int3(int2 a) { return make_int3(a.x, a.y, 0); } 106 | inline __host__ __device__ int3 make_int3(int2 a, int s) { return make_int3(a.x, a.y, s); } 107 | inline __host__ __device__ int3 make_int3(uint3 a) { 108 | return make_int3(int(a.x), int(a.y), int(a.z)); 109 | } 110 | inline __host__ __device__ int3 make_int3(float3 a) { 111 | return make_int3(int(a.x), int(a.y), int(a.z)); 112 | } 113 | 114 | inline __host__ __device__ uint3 make_uint3(uint s) { return make_uint3(s, s, s); } 115 | inline __host__ __device__ uint3 make_uint3(uint2 a) { return make_uint3(a.x, a.y, 0); } 116 | inline
__host__ __device__ uint3 make_uint3(uint2 a, uint s) { return make_uint3(a.x, a.y, s); } 117 | inline __host__ __device__ uint3 make_uint3(uint4 a) { return make_uint3(a.x, a.y, a.z); } 118 | inline __host__ __device__ uint3 make_uint3(int3 a) { 119 | return make_uint3(uint(a.x), uint(a.y), uint(a.z)); 120 | } 121 | 122 | inline __host__ __device__ float4 make_float4(float s) { return make_float4(s, s, s, s); } 123 | inline __host__ __device__ float4 make_float4(float3 a) { return make_float4(a.x, a.y, a.z, 0.0f); } 124 | inline __host__ __device__ float4 make_float4(float3 a, float w) { 125 | return make_float4(a.x, a.y, a.z, w); 126 | } 127 | inline __host__ __device__ float4 make_float4(int4 a) { 128 | return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); 129 | } 130 | inline __host__ __device__ float4 make_float4(uint4 a) { 131 | return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); 132 | } 133 | 134 | inline __host__ __device__ int4 make_int4(int s) { return make_int4(s, s, s, s); } 135 | inline __host__ __device__ int4 make_int4(int3 a) { return make_int4(a.x, a.y, a.z, 0); } 136 | inline __host__ __device__ int4 make_int4(int3 a, int w) { return make_int4(a.x, a.y, a.z, w); } 137 | inline __host__ __device__ int4 make_int4(uint4 a) { 138 | return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); 139 | } 140 | inline __host__ __device__ int4 make_int4(float4 a) { 141 | return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); 142 | } 143 | 144 | inline __host__ __device__ uint4 make_uint4(uint s) { return make_uint4(s, s, s, s); } 145 | inline __host__ __device__ uint4 make_uint4(uint3 a) { return make_uint4(a.x, a.y, a.z, 0); } 146 | inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) { 147 | return make_uint4(a.x, a.y, a.z, w); 148 | } 149 | inline __host__ __device__ uint4 make_uint4(int4 a) { 150 | return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); 151 | } 152 | 153 | //////////////////////////////////////////////////////////////////////////////// 154 | // negate 155 | //////////////////////////////////////////////////////////////////////////////// 156 | 157 | inline __host__ __device__ float2 operator-(float2 &a) { return make_float2(-a.x, -a.y); } 158 | inline __host__ __device__ int2 operator-(int2 &a) { return make_int2(-a.x, -a.y); } 159 | inline __host__ __device__ float3 operator-(float3 &a) { return make_float3(-a.x, -a.y, -a.z); } 160 | inline __host__ __device__ int3 operator-(int3 &a) { return make_int3(-a.x, -a.y, -a.z); } 161 | inline __host__ __device__ float4 operator-(float4 &a) { 162 | return make_float4(-a.x, -a.y, -a.z, -a.w); 163 | } 164 | inline __host__ __device__ int4 operator-(int4 &a) { return make_int4(-a.x, -a.y, -a.z, -a.w); } 165 | 166 | //////////////////////////////////////////////////////////////////////////////// 167 | // addition 168 | //////////////////////////////////////////////////////////////////////////////// 169 | 170 | inline __host__ __device__ float2 operator+(float2 a, float2 b) { 171 | return make_float2(a.x + b.x, a.y + b.y); 172 | } 173 | inline __host__ __device__ void operator+=(float2 &a, float2 b) { 174 | a.x += b.x; 175 | a.y += b.y; 176 | } 177 | inline __host__ __device__ float2 operator+(float2 a, float b) { 178 | return make_float2(a.x + b, a.y + b); 179 | } 180 | inline __host__ __device__ float2 operator+(float b, float2 a) { 181 | return make_float2(a.x + b, a.y + b); 182 | } 183 | inline __host__ __device__ void operator+=(float2 &a, float b) { 184 | a.x += b; 185 | a.y 
+= b; 186 | } 187 | 188 | inline __host__ __device__ int2 operator+(int2 a, int2 b) { 189 | return make_int2(a.x + b.x, a.y + b.y); 190 | } 191 | inline __host__ __device__ void operator+=(int2 &a, int2 b) { 192 | a.x += b.x; 193 | a.y += b.y; 194 | } 195 | inline __host__ __device__ int2 operator+(int2 a, int b) { return make_int2(a.x + b, a.y + b); } 196 | inline __host__ __device__ int2 operator+(int b, int2 a) { return make_int2(a.x + b, a.y + b); } 197 | inline __host__ __device__ void operator+=(int2 &a, int b) { 198 | a.x += b; 199 | a.y += b; 200 | } 201 | 202 | inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) { 203 | return make_uint2(a.x + b.x, a.y + b.y); 204 | } 205 | inline __host__ __device__ void operator+=(uint2 &a, uint2 b) { 206 | a.x += b.x; 207 | a.y += b.y; 208 | } 209 | inline __host__ __device__ uint2 operator+(uint2 a, uint b) { return make_uint2(a.x + b, a.y + b); } 210 | inline __host__ __device__ uint2 operator+(uint b, uint2 a) { return make_uint2(a.x + b, a.y + b); } 211 | inline __host__ __device__ void operator+=(uint2 &a, uint b) { 212 | a.x += b; 213 | a.y += b; 214 | } 215 | 216 | inline __host__ __device__ float3 operator+(float3 a, float3 b) { 217 | return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); 218 | } 219 | inline __host__ __device__ void operator+=(float3 &a, float3 b) { 220 | a.x += b.x; 221 | a.y += b.y; 222 | a.z += b.z; 223 | } 224 | inline __host__ __device__ float3 operator+(float3 a, float b) { 225 | return make_float3(a.x + b, a.y + b, a.z + b); 226 | } 227 | inline __host__ __device__ void operator+=(float3 &a, float b) { 228 | a.x += b; 229 | a.y += b; 230 | a.z += b; 231 | } 232 | 233 | inline __host__ __device__ int3 operator+(int3 a, int3 b) { 234 | return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); 235 | } 236 | inline __host__ __device__ void operator+=(int3 &a, int3 b) { 237 | a.x += b.x; 238 | a.y += b.y; 239 | a.z += b.z; 240 | } 241 | inline __host__ __device__ int3 operator+(int3 a, int b) { 242 | return make_int3(a.x + b, a.y + b, a.z + b); 243 | } 244 | inline __host__ __device__ void operator+=(int3 &a, int b) { 245 | a.x += b; 246 | a.y += b; 247 | a.z += b; 248 | } 249 | 250 | inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) { 251 | return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); 252 | } 253 | inline __host__ __device__ void operator+=(uint3 &a, uint3 b) { 254 | a.x += b.x; 255 | a.y += b.y; 256 | a.z += b.z; 257 | } 258 | inline __host__ __device__ uint3 operator+(uint3 a, uint b) { 259 | return make_uint3(a.x + b, a.y + b, a.z + b); 260 | } 261 | inline __host__ __device__ void operator+=(uint3 &a, uint b) { 262 | a.x += b; 263 | a.y += b; 264 | a.z += b; 265 | } 266 | 267 | inline __host__ __device__ int3 operator+(int b, int3 a) { 268 | return make_int3(a.x + b, a.y + b, a.z + b); 269 | } 270 | inline __host__ __device__ uint3 operator+(uint b, uint3 a) { 271 | return make_uint3(a.x + b, a.y + b, a.z + b); 272 | } 273 | inline __host__ __device__ float3 operator+(float b, float3 a) { 274 | return make_float3(a.x + b, a.y + b, a.z + b); 275 | } 276 | 277 | inline __host__ __device__ float4 operator+(float4 a, float4 b) { 278 | return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 279 | } 280 | inline __host__ __device__ void operator+=(float4 &a, float4 b) { 281 | a.x += b.x; 282 | a.y += b.y; 283 | a.z += b.z; 284 | a.w += b.w; 285 | } 286 | inline __host__ __device__ float4 operator+(float4 a, float b) { 287 | return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); 288 | } 289 | 
inline __host__ __device__ float4 operator+(float b, float4 a) { 290 | return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); 291 | } 292 | inline __host__ __device__ void operator+=(float4 &a, float b) { 293 | a.x += b; 294 | a.y += b; 295 | a.z += b; 296 | a.w += b; 297 | } 298 | 299 | inline __host__ __device__ int4 operator+(int4 a, int4 b) { 300 | return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 301 | } 302 | inline __host__ __device__ void operator+=(int4 &a, int4 b) { 303 | a.x += b.x; 304 | a.y += b.y; 305 | a.z += b.z; 306 | a.w += b.w; 307 | } 308 | inline __host__ __device__ int4 operator+(int4 a, int b) { 309 | return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); 310 | } 311 | inline __host__ __device__ int4 operator+(int b, int4 a) { 312 | return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); 313 | } 314 | inline __host__ __device__ void operator+=(int4 &a, int b) { 315 | a.x += b; 316 | a.y += b; 317 | a.z += b; 318 | a.w += b; 319 | } 320 | 321 | inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) { 322 | return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 323 | } 324 | inline __host__ __device__ void operator+=(uint4 &a, uint4 b) { 325 | a.x += b.x; 326 | a.y += b.y; 327 | a.z += b.z; 328 | a.w += b.w; 329 | } 330 | inline __host__ __device__ uint4 operator+(uint4 a, uint b) { 331 | return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); 332 | } 333 | inline __host__ __device__ uint4 operator+(uint b, uint4 a) { 334 | return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); 335 | } 336 | inline __host__ __device__ void operator+=(uint4 &a, uint b) { 337 | a.x += b; 338 | a.y += b; 339 | a.z += b; 340 | a.w += b; 341 | } 342 | 343 | //////////////////////////////////////////////////////////////////////////////// 344 | // subtract 345 | //////////////////////////////////////////////////////////////////////////////// 346 | 347 | inline __host__ __device__ float2 operator-(float2 a, float2 b) { 348 | return make_float2(a.x - b.x, a.y - b.y); 349 | } 350 | inline __host__ __device__ void operator-=(float2 &a, float2 b) { 351 | a.x -= b.x; 352 | a.y -= b.y; 353 | } 354 | inline __host__ __device__ float2 operator-(float2 a, float b) { 355 | return make_float2(a.x - b, a.y - b); 356 | } 357 | inline __host__ __device__ float2 operator-(float b, float2 a) { 358 | return make_float2(b - a.x, b - a.y); 359 | } 360 | inline __host__ __device__ void operator-=(float2 &a, float b) { 361 | a.x -= b; 362 | a.y -= b; 363 | } 364 | 365 | inline __host__ __device__ int2 operator-(int2 a, int2 b) { 366 | return make_int2(a.x - b.x, a.y - b.y); 367 | } 368 | inline __host__ __device__ void operator-=(int2 &a, int2 b) { 369 | a.x -= b.x; 370 | a.y -= b.y; 371 | } 372 | inline __host__ __device__ int2 operator-(int2 a, int b) { return make_int2(a.x - b, a.y - b); } 373 | inline __host__ __device__ int2 operator-(int b, int2 a) { return make_int2(b - a.x, b - a.y); } 374 | inline __host__ __device__ void operator-=(int2 &a, int b) { 375 | a.x -= b; 376 | a.y -= b; 377 | } 378 | 379 | inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) { 380 | return make_uint2(a.x - b.x, a.y - b.y); 381 | } 382 | inline __host__ __device__ void operator-=(uint2 &a, uint2 b) { 383 | a.x -= b.x; 384 | a.y -= b.y; 385 | } 386 | inline __host__ __device__ uint2 operator-(uint2 a, uint b) { return make_uint2(a.x - b, a.y - b); } 387 | inline __host__ __device__ uint2 operator-(uint b, uint2 a) { return make_uint2(b - a.x, b - a.y); } 388 | inline __host__ __device__ void 
operator-=(uint2 &a, uint b) { 389 | a.x -= b; 390 | a.y -= b; 391 | } 392 | 393 | inline __host__ __device__ float3 operator-(float3 a, float3 b) { 394 | return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); 395 | } 396 | inline __host__ __device__ void operator-=(float3 &a, float3 b) { 397 | a.x -= b.x; 398 | a.y -= b.y; 399 | a.z -= b.z; 400 | } 401 | inline __host__ __device__ float3 operator-(float3 a, float b) { 402 | return make_float3(a.x - b, a.y - b, a.z - b); 403 | } 404 | inline __host__ __device__ float3 operator-(float b, float3 a) { 405 | return make_float3(b - a.x, b - a.y, b - a.z); 406 | } 407 | inline __host__ __device__ void operator-=(float3 &a, float b) { 408 | a.x -= b; 409 | a.y -= b; 410 | a.z -= b; 411 | } 412 | 413 | inline __host__ __device__ int3 operator-(int3 a, int3 b) { 414 | return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); 415 | } 416 | inline __host__ __device__ void operator-=(int3 &a, int3 b) { 417 | a.x -= b.x; 418 | a.y -= b.y; 419 | a.z -= b.z; 420 | } 421 | inline __host__ __device__ int3 operator-(int3 a, int b) { 422 | return make_int3(a.x - b, a.y - b, a.z - b); 423 | } 424 | inline __host__ __device__ int3 operator-(int b, int3 a) { 425 | return make_int3(b - a.x, b - a.y, b - a.z); 426 | } 427 | inline __host__ __device__ void operator-=(int3 &a, int b) { 428 | a.x -= b; 429 | a.y -= b; 430 | a.z -= b; 431 | } 432 | 433 | inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) { 434 | return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); 435 | } 436 | inline __host__ __device__ void operator-=(uint3 &a, uint3 b) { 437 | a.x -= b.x; 438 | a.y -= b.y; 439 | a.z -= b.z; 440 | } 441 | inline __host__ __device__ uint3 operator-(uint3 a, uint b) { 442 | return make_uint3(a.x - b, a.y - b, a.z - b); 443 | } 444 | inline __host__ __device__ uint3 operator-(uint b, uint3 a) { 445 | return make_uint3(b - a.x, b - a.y, b - a.z); 446 | } 447 | inline __host__ __device__ void operator-=(uint3 &a, uint b) { 448 | a.x -= b; 449 | a.y -= b; 450 | a.z -= b; 451 | } 452 | 453 | inline __host__ __device__ float4 operator-(float4 a, float4 b) { 454 | return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 455 | } 456 | inline __host__ __device__ void operator-=(float4 &a, float4 b) { 457 | a.x -= b.x; 458 | a.y -= b.y; 459 | a.z -= b.z; 460 | a.w -= b.w; 461 | } 462 | inline __host__ __device__ float4 operator-(float4 a, float b) { 463 | return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); 464 | } 465 | inline __host__ __device__ void operator-=(float4 &a, float b) { 466 | a.x -= b; 467 | a.y -= b; 468 | a.z -= b; 469 | a.w -= b; 470 | } 471 | 472 | inline __host__ __device__ int4 operator-(int4 a, int4 b) { 473 | return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 474 | } 475 | inline __host__ __device__ void operator-=(int4 &a, int4 b) { 476 | a.x -= b.x; 477 | a.y -= b.y; 478 | a.z -= b.z; 479 | a.w -= b.w; 480 | } 481 | inline __host__ __device__ int4 operator-(int4 a, int b) { 482 | return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); 483 | } 484 | inline __host__ __device__ int4 operator-(int b, int4 a) { 485 | return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); 486 | } 487 | inline __host__ __device__ void operator-=(int4 &a, int b) { 488 | a.x -= b; 489 | a.y -= b; 490 | a.z -= b; 491 | a.w -= b; 492 | } 493 | 494 | inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) { 495 | return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 496 | } 497 | inline __host__ __device__ void operator-=(uint4 &a, uint4 b) { 498 | 
a.x -= b.x; 499 | a.y -= b.y; 500 | a.z -= b.z; 501 | a.w -= b.w; 502 | } 503 | inline __host__ __device__ uint4 operator-(uint4 a, uint b) { 504 | return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); 505 | } 506 | inline __host__ __device__ uint4 operator-(uint b, uint4 a) { 507 | return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); 508 | } 509 | inline __host__ __device__ void operator-=(uint4 &a, uint b) { 510 | a.x -= b; 511 | a.y -= b; 512 | a.z -= b; 513 | a.w -= b; 514 | } 515 | 516 | //////////////////////////////////////////////////////////////////////////////// 517 | // multiply 518 | //////////////////////////////////////////////////////////////////////////////// 519 | 520 | inline __host__ __device__ float2 operator*(float2 a, float2 b) { 521 | return make_float2(a.x * b.x, a.y * b.y); 522 | } 523 | inline __host__ __device__ void operator*=(float2 &a, float2 b) { 524 | a.x *= b.x; 525 | a.y *= b.y; 526 | } 527 | inline __host__ __device__ float2 operator*(float2 a, float b) { 528 | return make_float2(a.x * b, a.y * b); 529 | } 530 | inline __host__ __device__ float2 operator*(float b, float2 a) { 531 | return make_float2(b * a.x, b * a.y); 532 | } 533 | inline __host__ __device__ void operator*=(float2 &a, float b) { 534 | a.x *= b; 535 | a.y *= b; 536 | } 537 | 538 | inline __host__ __device__ int2 operator*(int2 a, int2 b) { 539 | return make_int2(a.x * b.x, a.y * b.y); 540 | } 541 | inline __host__ __device__ void operator*=(int2 &a, int2 b) { 542 | a.x *= b.x; 543 | a.y *= b.y; 544 | } 545 | inline __host__ __device__ int2 operator*(int2 a, int b) { return make_int2(a.x * b, a.y * b); } 546 | inline __host__ __device__ int2 operator*(int b, int2 a) { return make_int2(b * a.x, b * a.y); } 547 | inline __host__ __device__ void operator*=(int2 &a, int b) { 548 | a.x *= b; 549 | a.y *= b; 550 | } 551 | 552 | inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) { 553 | return make_uint2(a.x * b.x, a.y * b.y); 554 | } 555 | inline __host__ __device__ void operator*=(uint2 &a, uint2 b) { 556 | a.x *= b.x; 557 | a.y *= b.y; 558 | } 559 | inline __host__ __device__ uint2 operator*(uint2 a, uint b) { return make_uint2(a.x * b, a.y * b); } 560 | inline __host__ __device__ uint2 operator*(uint b, uint2 a) { return make_uint2(b * a.x, b * a.y); } 561 | inline __host__ __device__ void operator*=(uint2 &a, uint b) { 562 | a.x *= b; 563 | a.y *= b; 564 | } 565 | 566 | inline __host__ __device__ float3 operator*(float3 a, float3 b) { 567 | return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); 568 | } 569 | inline __host__ __device__ void operator*=(float3 &a, float3 b) { 570 | a.x *= b.x; 571 | a.y *= b.y; 572 | a.z *= b.z; 573 | } 574 | inline __host__ __device__ float3 operator*(float3 a, float b) { 575 | return make_float3(a.x * b, a.y * b, a.z * b); 576 | } 577 | inline __host__ __device__ float3 operator*(float b, float3 a) { 578 | return make_float3(b * a.x, b * a.y, b * a.z); 579 | } 580 | inline __host__ __device__ void operator*=(float3 &a, float b) { 581 | a.x *= b; 582 | a.y *= b; 583 | a.z *= b; 584 | } 585 | 586 | inline __host__ __device__ int3 operator*(int3 a, int3 b) { 587 | return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); 588 | } 589 | inline __host__ __device__ void operator*=(int3 &a, int3 b) { 590 | a.x *= b.x; 591 | a.y *= b.y; 592 | a.z *= b.z; 593 | } 594 | inline __host__ __device__ int3 operator*(int3 a, int b) { 595 | return make_int3(a.x * b, a.y * b, a.z * b); 596 | } 597 | inline __host__ __device__ int3 operator*(int b, int3 a) { 598 | return 
make_int3(b * a.x, b * a.y, b * a.z); 599 | } 600 | inline __host__ __device__ void operator*=(int3 &a, int b) { 601 | a.x *= b; 602 | a.y *= b; 603 | a.z *= b; 604 | } 605 | 606 | inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) { 607 | return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); 608 | } 609 | inline __host__ __device__ void operator*=(uint3 &a, uint3 b) { 610 | a.x *= b.x; 611 | a.y *= b.y; 612 | a.z *= b.z; 613 | } 614 | inline __host__ __device__ uint3 operator*(uint3 a, uint b) { 615 | return make_uint3(a.x * b, a.y * b, a.z * b); 616 | } 617 | inline __host__ __device__ uint3 operator*(uint b, uint3 a) { 618 | return make_uint3(b * a.x, b * a.y, b * a.z); 619 | } 620 | inline __host__ __device__ void operator*=(uint3 &a, uint b) { 621 | a.x *= b; 622 | a.y *= b; 623 | a.z *= b; 624 | } 625 | 626 | inline __host__ __device__ float4 operator*(float4 a, float4 b) { 627 | return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 628 | } 629 | inline __host__ __device__ void operator*=(float4 &a, float4 b) { 630 | a.x *= b.x; 631 | a.y *= b.y; 632 | a.z *= b.z; 633 | a.w *= b.w; 634 | } 635 | inline __host__ __device__ float4 operator*(float4 a, float b) { 636 | return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); 637 | } 638 | inline __host__ __device__ float4 operator*(float b, float4 a) { 639 | return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); 640 | } 641 | inline __host__ __device__ void operator*=(float4 &a, float b) { 642 | a.x *= b; 643 | a.y *= b; 644 | a.z *= b; 645 | a.w *= b; 646 | } 647 | 648 | inline __host__ __device__ int4 operator*(int4 a, int4 b) { 649 | return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 650 | } 651 | inline __host__ __device__ void operator*=(int4 &a, int4 b) { 652 | a.x *= b.x; 653 | a.y *= b.y; 654 | a.z *= b.z; 655 | a.w *= b.w; 656 | } 657 | inline __host__ __device__ int4 operator*(int4 a, int b) { 658 | return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); 659 | } 660 | inline __host__ __device__ int4 operator*(int b, int4 a) { 661 | return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); 662 | } 663 | inline __host__ __device__ void operator*=(int4 &a, int b) { 664 | a.x *= b; 665 | a.y *= b; 666 | a.z *= b; 667 | a.w *= b; 668 | } 669 | 670 | inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) { 671 | return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 672 | } 673 | inline __host__ __device__ void operator*=(uint4 &a, uint4 b) { 674 | a.x *= b.x; 675 | a.y *= b.y; 676 | a.z *= b.z; 677 | a.w *= b.w; 678 | } 679 | inline __host__ __device__ uint4 operator*(uint4 a, uint b) { 680 | return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); 681 | } 682 | inline __host__ __device__ uint4 operator*(uint b, uint4 a) { 683 | return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); 684 | } 685 | inline __host__ __device__ void operator*=(uint4 &a, uint b) { 686 | a.x *= b; 687 | a.y *= b; 688 | a.z *= b; 689 | a.w *= b; 690 | } 691 | 692 | //////////////////////////////////////////////////////////////////////////////// 693 | // divide 694 | //////////////////////////////////////////////////////////////////////////////// 695 | 696 | inline __host__ __device__ float2 operator/(float2 a, float2 b) { 697 | return make_float2(a.x / b.x, a.y / b.y); 698 | } 699 | inline __host__ __device__ void operator/=(float2 &a, float2 b) { 700 | a.x /= b.x; 701 | a.y /= b.y; 702 | } 703 | inline __host__ __device__ float2 operator/(float2 a, float b) { 704 | return make_float2(a.x / b, a.y / b); 705 | } 706 | 
inline __host__ __device__ void operator/=(float2 &a, float b) { 707 | a.x /= b; 708 | a.y /= b; 709 | } 710 | inline __host__ __device__ float2 operator/(float b, float2 a) { 711 | return make_float2(b / a.x, b / a.y); 712 | } 713 | 714 | inline __host__ __device__ float3 operator/(float3 a, float3 b) { 715 | return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); 716 | } 717 | inline __host__ __device__ void operator/=(float3 &a, float3 b) { 718 | a.x /= b.x; 719 | a.y /= b.y; 720 | a.z /= b.z; 721 | } 722 | inline __host__ __device__ float3 operator/(float3 a, float b) { 723 | return make_float3(a.x / b, a.y / b, a.z / b); 724 | } 725 | inline __host__ __device__ void operator/=(float3 &a, float b) { 726 | a.x /= b; 727 | a.y /= b; 728 | a.z /= b; 729 | } 730 | inline __host__ __device__ float3 operator/(float b, float3 a) { 731 | return make_float3(b / a.x, b / a.y, b / a.z); 732 | } 733 | 734 | inline __host__ __device__ float4 operator/(float4 a, float4 b) { 735 | return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); 736 | } 737 | inline __host__ __device__ void operator/=(float4 &a, float4 b) { 738 | a.x /= b.x; 739 | a.y /= b.y; 740 | a.z /= b.z; 741 | a.w /= b.w; 742 | } 743 | inline __host__ __device__ float4 operator/(float4 a, float b) { 744 | return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); 745 | } 746 | inline __host__ __device__ void operator/=(float4 &a, float b) { 747 | a.x /= b; 748 | a.y /= b; 749 | a.z /= b; 750 | a.w /= b; 751 | } 752 | inline __host__ __device__ float4 operator/(float b, float4 a) { 753 | return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); 754 | } 755 | 756 | //////////////////////////////////////////////////////////////////////////////// 757 | // min 758 | //////////////////////////////////////////////////////////////////////////////// 759 | 760 | inline __host__ __device__ float2 fminf(float2 a, float2 b) { 761 | return make_float2(fminf(a.x, b.x), fminf(a.y, b.y)); 762 | } 763 | inline __host__ __device__ float3 fminf(float3 a, float3 b) { 764 | return make_float3(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z)); 765 | } 766 | inline __host__ __device__ float4 fminf(float4 a, float4 b) { 767 | return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); 768 | } 769 | 770 | inline __host__ __device__ int2 min(int2 a, int2 b) { 771 | return make_int2(min(a.x, b.x), min(a.y, b.y)); 772 | } 773 | inline __host__ __device__ int3 min(int3 a, int3 b) { 774 | return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); 775 | } 776 | inline __host__ __device__ int4 min(int4 a, int4 b) { 777 | return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); 778 | } 779 | 780 | inline __host__ __device__ uint2 min(uint2 a, uint2 b) { 781 | return make_uint2(min(a.x, b.x), min(a.y, b.y)); 782 | } 783 | inline __host__ __device__ uint3 min(uint3 a, uint3 b) { 784 | return make_uint3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); 785 | } 786 | inline __host__ __device__ uint4 min(uint4 a, uint4 b) { 787 | return make_uint4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); 788 | } 789 | 790 | //////////////////////////////////////////////////////////////////////////////// 791 | // max 792 | //////////////////////////////////////////////////////////////////////////////// 793 | 794 | inline __host__ __device__ float2 fmaxf(float2 a, float2 b) { 795 | return make_float2(fmaxf(a.x, b.x), fmaxf(a.y, b.y)); 796 | } 797 | inline __host__ __device__ float3 fmaxf(float3 a, float3 b) { 798 | return 
make_float3(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z)); 799 | } 800 | inline __host__ __device__ float4 fmaxf(float4 a, float4 b) { 801 | return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); 802 | } 803 | 804 | inline __host__ __device__ int2 max(int2 a, int2 b) { 805 | return make_int2(max(a.x, b.x), max(a.y, b.y)); 806 | } 807 | inline __host__ __device__ int3 max(int3 a, int3 b) { 808 | return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); 809 | } 810 | inline __host__ __device__ int4 max(int4 a, int4 b) { 811 | return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); 812 | } 813 | 814 | inline __host__ __device__ uint2 max(uint2 a, uint2 b) { 815 | return make_uint2(max(a.x, b.x), max(a.y, b.y)); 816 | } 817 | inline __host__ __device__ uint3 max(uint3 a, uint3 b) { 818 | return make_uint3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); 819 | } 820 | inline __host__ __device__ uint4 max(uint4 a, uint4 b) { 821 | return make_uint4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); 822 | } 823 | 824 | //////////////////////////////////////////////////////////////////////////////// 825 | // lerp 826 | // - linear interpolation between a and b, based on value t in [0, 1] range 827 | //////////////////////////////////////////////////////////////////////////////// 828 | 829 | inline __device__ __host__ float lerp(float a, float b, float t) { return a + t * (b - a); } 830 | inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) { return a + t * (b - a); } 831 | inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) { return a + t * (b - a); } 832 | inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) { return a + t * (b - a); } 833 | 834 | //////////////////////////////////////////////////////////////////////////////// 835 | // clamp 836 | // - clamp the value v to be in the range [a, b] 837 | //////////////////////////////////////////////////////////////////////////////// 838 | 839 | inline __device__ __host__ float clamp(float f, float a, float b) { return fmaxf(a, fminf(f, b)); } 840 | inline __device__ __host__ int clamp(int f, int a, int b) { return max(a, min(f, b)); } 841 | inline __device__ __host__ uint clamp(uint f, uint a, uint b) { return max(a, min(f, b)); } 842 | 843 | inline __device__ __host__ float2 clamp(float2 v, float a, float b) { 844 | return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); 845 | } 846 | inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) { 847 | return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 848 | } 849 | inline __device__ __host__ float3 clamp(float3 v, float a, float b) { 850 | return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 851 | } 852 | inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) { 853 | return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 854 | } 855 | inline __device__ __host__ float4 clamp(float4 v, float a, float b) { 856 | return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 857 | } 858 | inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) { 859 | return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), 860 | clamp(v.w, a.w, b.w)); 861 | } 862 | 863 | inline __device__ __host__ int2 clamp(int2 v, int a, int b) { 864 | return make_int2(clamp(v.x, a, b), clamp(v.y, a, b)); 865 | } 866 | inline __device__ __host__ int2 
clamp(int2 v, int2 a, int2 b) { 867 | return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 868 | } 869 | inline __device__ __host__ int3 clamp(int3 v, int a, int b) { 870 | return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 871 | } 872 | inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) { 873 | return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 874 | } 875 | inline __device__ __host__ int4 clamp(int4 v, int a, int b) { 876 | return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 877 | } 878 | inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) { 879 | return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), 880 | clamp(v.w, a.w, b.w)); 881 | } 882 | 883 | inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) { 884 | return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b)); 885 | } 886 | inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) { 887 | return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 888 | } 889 | inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) { 890 | return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 891 | } 892 | inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) { 893 | return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 894 | } 895 | inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) { 896 | return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 897 | } 898 | inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) { 899 | return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), 900 | clamp(v.w, a.w, b.w)); 901 | } 902 | 903 | //////////////////////////////////////////////////////////////////////////////// 904 | // dot product 905 | //////////////////////////////////////////////////////////////////////////////// 906 | 907 | inline __host__ __device__ float dot(float2 a, float2 b) { return a.x * b.x + a.y * b.y; } 908 | inline __host__ __device__ float dot(float3 a, float3 b) { 909 | return a.x * b.x + a.y * b.y + a.z * b.z; 910 | } 911 | inline __host__ __device__ float dot(float4 a, float4 b) { 912 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 913 | } 914 | 915 | inline __host__ __device__ int dot(int2 a, int2 b) { return a.x * b.x + a.y * b.y; } 916 | inline __host__ __device__ int dot(int3 a, int3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; } 917 | inline __host__ __device__ int dot(int4 a, int4 b) { 918 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 919 | } 920 | 921 | inline __host__ __device__ uint dot(uint2 a, uint2 b) { return a.x * b.x + a.y * b.y; } 922 | inline __host__ __device__ uint dot(uint3 a, uint3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; } 923 | inline __host__ __device__ uint dot(uint4 a, uint4 b) { 924 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 925 | } 926 | 927 | //////////////////////////////////////////////////////////////////////////////// 928 | // length 929 | //////////////////////////////////////////////////////////////////////////////// 930 | 931 | inline __host__ __device__ float length(float2 v) { return sqrtf(dot(v, v)); } 932 | inline __host__ __device__ float length(float3 v) { return sqrtf(dot(v, v)); } 933 | inline __host__ __device__ float length(float4 v) { return sqrtf(dot(v, v)); } 934 | 935 | 
//////////////////////////////////////////////////////////////////////////////// 936 | // normalize 937 | //////////////////////////////////////////////////////////////////////////////// 938 | 939 | inline __host__ __device__ float2 normalize(float2 v) { 940 | float invLen = rsqrtf(dot(v, v)); 941 | return v * invLen; 942 | } 943 | inline __host__ __device__ float3 normalize(float3 v) { 944 | float invLen = rsqrtf(dot(v, v)); 945 | return v * invLen; 946 | } 947 | inline __host__ __device__ float4 normalize(float4 v) { 948 | float invLen = rsqrtf(dot(v, v)); 949 | return v * invLen; 950 | } 951 | 952 | //////////////////////////////////////////////////////////////////////////////// 953 | // floor 954 | //////////////////////////////////////////////////////////////////////////////// 955 | 956 | inline __host__ __device__ float2 floorf(float2 v) { return make_float2(floorf(v.x), floorf(v.y)); } 957 | inline __host__ __device__ float3 floorf(float3 v) { 958 | return make_float3(floorf(v.x), floorf(v.y), floorf(v.z)); 959 | } 960 | inline __host__ __device__ float4 floorf(float4 v) { 961 | return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w)); 962 | } 963 | 964 | //////////////////////////////////////////////////////////////////////////////// 965 | // frac - returns the fractional portion of a scalar or each vector component 966 | //////////////////////////////////////////////////////////////////////////////// 967 | 968 | inline __host__ __device__ float fracf(float v) { return v - floorf(v); } 969 | inline __host__ __device__ float2 fracf(float2 v) { return make_float2(fracf(v.x), fracf(v.y)); } 970 | inline __host__ __device__ float3 fracf(float3 v) { 971 | return make_float3(fracf(v.x), fracf(v.y), fracf(v.z)); 972 | } 973 | inline __host__ __device__ float4 fracf(float4 v) { 974 | return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w)); 975 | } 976 | 977 | //////////////////////////////////////////////////////////////////////////////// 978 | // fmod 979 | //////////////////////////////////////////////////////////////////////////////// 980 | 981 | inline __host__ __device__ float2 fmodf(float2 a, float2 b) { 982 | return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y)); 983 | } 984 | inline __host__ __device__ float3 fmodf(float3 a, float3 b) { 985 | return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z)); 986 | } 987 | inline __host__ __device__ float4 fmodf(float4 a, float4 b) { 988 | return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w)); 989 | } 990 | 991 | //////////////////////////////////////////////////////////////////////////////// 992 | // absolute value 993 | //////////////////////////////////////////////////////////////////////////////// 994 | 995 | inline __host__ __device__ float2 fabs(float2 v) { return make_float2(fabs(v.x), fabs(v.y)); } 996 | inline __host__ __device__ float3 fabs(float3 v) { 997 | return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); 998 | } 999 | inline __host__ __device__ float4 fabs(float4 v) { 1000 | return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w)); 1001 | } 1002 | 1003 | inline __host__ __device__ int2 abs(int2 v) { return make_int2(abs(v.x), abs(v.y)); } 1004 | inline __host__ __device__ int3 abs(int3 v) { return make_int3(abs(v.x), abs(v.y), abs(v.z)); } 1005 | inline __host__ __device__ int4 abs(int4 v) { 1006 | return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w)); 1007 | } 1008 | 1009 | 
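//
// Note on fracf above: it computes v - floorf(v), so results always lie in
// [0, 1), including for negative inputs -- e.g. fracf(-0.25f) == 0.75f --
// unlike the truncation-based fmodf(v, 1.0f), which would return -0.25f.
//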
//////////////////////////////////////////////////////////////////////////////// 1010 | // reflect 1011 | // - returns reflection of incident ray I around surface normal N 1012 | // - N should be normalized, reflected vector's length is equal to length of I 1013 | //////////////////////////////////////////////////////////////////////////////// 1014 | 1015 | inline __host__ __device__ float3 reflect(float3 i, float3 n) { return i - 2.0f * n * dot(n, i); } 1016 | 1017 | //////////////////////////////////////////////////////////////////////////////// 1018 | // cross product 1019 | //////////////////////////////////////////////////////////////////////////////// 1020 | 1021 | inline __host__ __device__ float3 cross(float3 a, float3 b) { 1022 | return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); 1023 | } 1024 | 1025 | //////////////////////////////////////////////////////////////////////////////// 1026 | // smoothstep 1027 | // - returns 0 if x < a 1028 | // - returns 1 if x > b 1029 | // - otherwise returns smooth interpolation between 0 and 1 based on x 1030 | //////////////////////////////////////////////////////////////////////////////// 1031 | 1032 | inline __device__ __host__ float smoothstep(float a, float b, float x) { 1033 | float y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1034 | return (y * y * (3.0f - (2.0f * y))); 1035 | } 1036 | inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) { 1037 | float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1038 | return (y * y * (make_float2(3.0f) - (make_float2(2.0f) * y))); 1039 | } 1040 | inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) { 1041 | float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1042 | return (y * y * (make_float3(3.0f) - (make_float3(2.0f) * y))); 1043 | } 1044 | inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) { 1045 | float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1046 | return (y * y * (make_float4(3.0f) - (make_float4(2.0f) * y))); 1047 | } 1048 | 1049 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/math.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "helper_math.h" 3 | 4 | struct mat33 { 5 | float3 cols[3]; 6 | 7 | __host__ __device__ mat33() {} 8 | __host__ __device__ mat33(const float3 &c0, 9 | const float3 &c1, 10 | const float3 &c2) { 11 | cols[0] = c0; 12 | cols[1] = c1; 13 | cols[2] = c2; 14 | } 15 | __host__ __device__ mat33(const float *data) { 16 | cols[0] = make_float3(data[0], data[1], data[2]); 17 | cols[1] = make_float3(data[3], data[4], data[5]); 18 | cols[2] = make_float3(data[6], data[7], data[8]); 19 | } 20 | 21 | __host__ __device__ static mat33 identity() { 22 | return mat33(make_float3(1, 0, 0), 23 | make_float3(0, 1, 0), 24 | make_float3(0, 0, 1)); 25 | } 26 | 27 | __host__ __device__ static mat33 skew_symmetric(const float3 &v) { 28 | return mat33(make_float3(0, v.z, -v.y), 29 | make_float3(-v.z, 0, v.x), 30 | make_float3(v.y, -v.x, 0)); 31 | } 32 | 33 | __host__ __device__ mat33 transpose() const { 34 | float3 c0 = cols[0]; 35 | float3 c1 = cols[1]; 36 | float3 c2 = cols[2]; 37 | return mat33(make_float3(c0.x, c1.x, c2.x), 38 | make_float3(c0.y, c1.y, c2.y), 39 | make_float3(c0.z, c1.z, c2.z)); 40 | } 41 | 42 | __host__ __device__ float &operator[](int i) { 43 | float3 &col = cols[i / 3]; 44 | return (&col.x)[i % 3]; 45 | } 46 | 47 | __host__ __device__ const mat33 
operator+(const mat33 &m) const { 48 | float3 c0 = cols[0] + m.cols[0]; 49 | float3 c1 = cols[1] + m.cols[1]; 50 | float3 c2 = cols[2] + m.cols[2]; 51 | return mat33(c0, c1, c2); 52 | } 53 | 54 | __host__ __device__ const mat33 operator*(const mat33 &m) const { 55 | float3 c0 = cols[0]; 56 | float3 c1 = cols[1]; 57 | float3 c2 = cols[2]; 58 | float3 m0 = m.cols[0]; 59 | float3 m1 = m.cols[1]; 60 | float3 m2 = m.cols[2]; 61 | 62 | float3 n0 = make_float3(c0.x * m0.x + c1.x * m0.y + c2.x * m0.z, 63 | c0.y * m0.x + c1.y * m0.y + c2.y * m0.z, 64 | c0.z * m0.x + c1.z * m0.y + c2.z * m0.z); 65 | float3 n1 = make_float3(c0.x * m1.x + c1.x * m1.y + c2.x * m1.z, 66 | c0.y * m1.x + c1.y * m1.y + c2.y * m1.z, 67 | c0.z * m1.x + c1.z * m1.y + c2.z * m1.z); 68 | float3 n2 = make_float3(c0.x * m2.x + c1.x * m2.y + c2.x * m2.z, 69 | c0.y * m2.x + c1.y * m2.y + c2.y * m2.z, 70 | c0.z * m2.x + c1.z * m2.y + c2.z * m2.z); 71 | return mat33(n0, n1, n2); 72 | } 73 | 74 | __host__ __device__ const mat33 operator*(const float &s) const { 75 | float3 c0 = cols[0]; 76 | float3 c1 = cols[1]; 77 | float3 c2 = cols[2]; 78 | return mat33(c0 * s, c1 * s, c2 * s); 79 | } 80 | 81 | __host__ __device__ const float3 operator*(const float3 &v) const { 82 | float3 c0 = cols[0]; 83 | float3 c1 = cols[1]; 84 | float3 c2 = cols[2]; 85 | return make_float3(c0.x * v.x + c1.x * v.y + c2.x * v.z, 86 | c0.y * v.x + c1.y * v.y + c2.y * v.z, 87 | c0.z * v.x + c1.z * v.y + c2.z * v.z); 88 | } 89 | 90 | __host__ __device__ const mat33 operator-() const { 91 | float3 c0 = cols[0]; 92 | float3 c1 = cols[1]; 93 | float3 c2 = cols[2]; 94 | return mat33(-c0, -c1, -c2); 95 | } 96 | 97 | friend __host__ __device__ mat33 operator*(const float &s, const mat33 &m) { 98 | return m * s; 99 | } 100 | }; 101 | 102 | 103 | 104 | struct mat34 { 105 | float3 cols[4]; 106 | __host__ __device__ mat34() {} 107 | __host__ __device__ mat34(const float3 &c0, 108 | const float3 &c1, 109 | const float3 &c2, 110 | const float3 &c3) { 111 | cols[0] = c0; 112 | cols[1] = c1; 113 | cols[2] = c2; 114 | cols[3] = c3; 115 | } 116 | __host__ __device__ mat34(const float *data) { 117 | cols[0] = make_float3(data[0], data[1], data[2]); 118 | cols[1] = make_float3(data[3], data[4], data[5]); 119 | cols[2] = make_float3(data[6], data[7], data[8]); 120 | cols[3] = make_float3(data[9], data[10], data[11]); 121 | } 122 | __host__ __device__ mat34(const mat33 &m, const float3 &v) { 123 | cols[0] = m.cols[0]; 124 | cols[1] = m.cols[1]; 125 | cols[2] = m.cols[2]; 126 | cols[3] = v; 127 | } 128 | 129 | __host__ __device__ float &operator[](int i) { 130 | float3 &col = cols[i / 3]; 131 | return (&col.x)[i % 3]; 132 | } 133 | 134 | __host__ __device__ const mat34 operator+(const mat34 &m) const { 135 | float3 c0 = cols[0] + m.cols[0]; 136 | float3 c1 = cols[1] + m.cols[1]; 137 | float3 c2 = cols[2] + m.cols[2]; 138 | float3 c3 = cols[3] + m.cols[3]; 139 | return mat34(c0, c1, c2, c3); 140 | } 141 | }; 142 | 143 | struct mat44 { 144 | float4 cols[4]; 145 | __host__ __device__ mat44() {} 146 | __host__ __device__ mat44(const float4 &c0, const float4 &c1, const float4 &c2, const float4 &c3) { 147 | cols[0] = c0; cols[1] = c1; cols[2] = c2; cols[3] = c3; 148 | } 149 | __host__ __device__ mat44(const float *data) { 150 | cols[0] = make_float4(data[0], data[1], data[2], data[3]); 151 | cols[1] = make_float4(data[4], data[5], data[6], data[7]); 152 | cols[2] = make_float4(data[8], data[9], data[10], data[11]); 153 | cols[3] = make_float4(data[12], data[13], data[14], data[15]); 154 | }
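// Storage is column-major, matching mat33: cols[j] holds column j of the
// matrix, and operator[] (further below) indexes linearly down the columns.
// A small illustration, assuming a mat44 M built from a 16-float
// column-major array `data`:
//
//   mat44 M(data);
//   float m01 = M[4];  // cols[4 / 4 = 1], component 4 % 4 = 0: row 0, column 1
//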
155 | __host__ __device__ mat44(const mat33 &m, const float3 &v) { 156 | cols[0] = make_float4(m.cols[0], 0); 157 | cols[1] = make_float4(m.cols[1], 0); 158 | cols[2] = make_float4(m.cols[2], 0); 159 | cols[3] = make_float4(v, 1); 160 | } 161 | __host__ __device__ mat44(const mat34 &m) { 162 | cols[0] = make_float4(m.cols[0], 0); 163 | cols[1] = make_float4(m.cols[1], 0); 164 | cols[2] = make_float4(m.cols[2], 0); 165 | cols[3] = make_float4(m.cols[3], 1); 166 | } 167 | 168 | __host__ __device__ float &operator[](int i) { 169 | float4 &col = cols[i / 4]; 170 | return (&col.x)[i % 4]; 171 | } 172 | 173 | __host__ __device__ mat44 operator+(const mat44 &m) const { 174 | float4 c0 = cols[0] + m.cols[0]; 175 | float4 c1 = cols[1] + m.cols[1]; 176 | float4 c2 = cols[2] + m.cols[2]; 177 | float4 c3 = cols[3] + m.cols[3]; 178 | return mat44(c0, c1, c2, c3); 179 | } 180 | 181 | __host__ __device__ mat44 operator*(const mat44 &m) const { 182 | float4 c0 = cols[0]; 183 | float4 c1 = cols[1]; 184 | float4 c2 = cols[2]; 185 | float4 c3 = cols[3]; 186 | float4 m0 = m.cols[0]; 187 | float4 m1 = m.cols[1]; 188 | float4 m2 = m.cols[2]; 189 | float4 m3 = m.cols[3]; 190 | 191 | float4 n0 = make_float4(c0.x * m0.x + c1.x * m0.y + c2.x * m0.z + c3.x * m0.w, 192 | c0.y * m0.x + c1.y * m0.y + c2.y * m0.z + c3.y * m0.w, 193 | c0.z * m0.x + c1.z * m0.y + c2.z * m0.z + c3.z * m0.w, 194 | c0.w * m0.x + c1.w * m0.y + c2.w * m0.z + c3.w * m0.w); 195 | float4 n1 = make_float4(c0.x * m1.x + c1.x * m1.y + c2.x * m1.z + c3.x * m1.w, 196 | c0.y * m1.x + c1.y * m1.y + c2.y * m1.z + c3.y * m1.w, 197 | c0.z * m1.x + c1.z * m1.y + c2.z * m1.z + c3.z * m1.w, 198 | c0.w * m1.x + c1.w * m1.y + c2.w * m1.z + c3.w * m1.w); 199 | float4 n2 = make_float4(c0.x * m2.x + c1.x * m2.y + c2.x * m2.z + c3.x * m2.w, 200 | c0.y * m2.x + c1.y * m2.y + c2.y * m2.z + c3.y * m2.w, 201 | c0.z * m2.x + c1.z * m2.y + c2.z * m2.z + c3.z * m2.w, 202 | c0.w * m2.x + c1.w * m2.y + c2.w * m2.z + c3.w * m2.w); 203 | float4 n3 = make_float4(c0.x * m3.x + c1.x * m3.y + c2.x * m3.z + c3.x * m3.w, 204 | c0.y * m3.x + c1.y * m3.y + c2.y * m3.z + c3.y * m3.w, 205 | c0.z * m3.x + c1.z * m3.y + c2.z * m3.z + c3.z * m3.w, 206 | c0.w * m3.x + c1.w * m3.y + c2.w * m3.z + c3.w * m3.w); 207 | return mat44(n0, n1, n2, n3); 208 | 209 | } 210 | 211 | }; 212 | 213 | __forceinline__ __host__ __device__ float norm(const float3 &v) { 214 | return length(v); 215 | } 216 | 217 | struct SO3 { 218 | mat33 data_; 219 | __host__ __device__ SO3() {} 220 | __host__ __device__ SO3(const float3 &theta) { 221 | data_ = SO3::Exp(theta).data(); 222 | } 223 | __host__ __device__ SO3(const mat33 &data) { 224 | data_ = data; 225 | } 226 | __host__ __device__ mat33 data() const { 227 | return data_; 228 | } 229 | 230 | __host__ __device__ mat33 static hat(const float3 &theta) { 231 | return mat33::skew_symmetric(theta); 232 | } 233 | 234 | __host__ __device__ SO3 static Exp(const float3 &theta) { 235 | mat33 W = SO3::hat(theta); 236 | mat33 W2 = W * W; 237 | float angle = norm(theta); 238 | mat33 I = mat33::identity(); 239 | if (angle < 1e-5) { 240 | return SO3(I + W + 0.5f * W2); 241 | } 242 | else { 243 | return SO3(I + sin(angle) / angle * W + ((1 - cos(angle)) / (angle * angle)) * W2); 244 | } 245 | } 246 | __host__ __device__ float3 operator*(const float3 &v) const { 247 | return data_ * v; 248 | } 249 | 250 | __host__ __device__ SO3 operator*(const SO3 &R) const { 251 | return SO3(data_ * R.data_); 252 | } 253 | 254 | __host__ __device__ SO3 inverse() const { 255 | return 
SO3(data_.transpose()); 256 | } 257 | }; 258 | 259 | struct SE3 { 260 | SO3 R_data_; 261 | float3 t_data_; 262 | 263 | __host__ __device__ SE3() {} 264 | __host__ __device__ SE3(const float3 &rho, const float3 &theta) { 265 | SE3 T = SE3::Exp(rho, theta); 266 | t_data_ = T.t(); 267 | R_data_ = T.R(); 268 | } 269 | 270 | __host__ __device__ SE3(const float3 &t, const SO3 &R) { 271 | t_data_ = t; 272 | R_data_ = R; 273 | } 274 | 275 | __host__ __device__ SE3(const float *data) { 276 | mat44 T(data); 277 | t_data_ = make_float3(T.cols[3]); 278 | R_data_ = SO3(mat33( 279 | make_float3(T.cols[0]), make_float3(T.cols[1]), make_float3(T.cols[2])) 280 | ); 281 | } 282 | 283 | __host__ __device__ SE3(const mat44 &data) { 284 | t_data_ = make_float3(data.cols[3]); 285 | R_data_ = SO3(mat33( 286 | make_float3(data.cols[0]), make_float3(data.cols[1]), make_float3(data.cols[2])) 287 | ); 288 | } 289 | 290 | __host__ __device__ SO3 R() const { 291 | return R_data_; 292 | } 293 | 294 | __host__ __device__ float3 t() const { 295 | return t_data_; 296 | } 297 | 298 | __host__ __device__ mat44 data() const { 299 | return mat44(R_data_.data(), t_data_); 300 | } 301 | 302 | __host__ __device__ static mat44 hat(const float3 &rho, const float3 &theta) { 303 | mat33 W = SO3::hat(theta); 304 | mat44 T(W, rho); 305 | T.cols[3].w = 0; 306 | return T; 307 | } 308 | 309 | __host__ __device__ static SE3 Exp(const float3 &rho, const float3 &theta) { 310 | mat33 W = SO3::hat(theta); 311 | mat33 W2 = W * W; 312 | SO3 R = SO3::Exp(theta); 313 | float angle = norm(theta); 314 | mat33 I = mat33::identity(); 315 | mat33 V; 316 | if (angle < 1e-5) { 317 | V = I + 0.5f * W + 1.f / 6.f * W2; 318 | } 319 | else { 320 | V = I + W * ((1 - cos(angle)) / (angle * angle)) 321 | + W2 * ((angle - sin(angle)) / (angle * angle * angle)); 322 | } 323 | float3 t = V * rho; 324 | return SE3(t, R); 325 | } 326 | 327 | __host__ __device__ float3 operator*(const float3 &v) const { 328 | return R_data_ * v + t_data_; 329 | } 330 | 331 | __host__ __device__ SE3 operator*(const SE3 &T) const { 332 | return SE3(t_data_ + R_data_ * T.t_data_, R_data_ * T.R_data_); 333 | } 334 | 335 | __host__ __device__ SE3 inverse() const { 336 | SO3 R_inv = R_data_.inverse(); 337 | float3 t = R_inv * t_data_; 338 | return SE3(-t, R_inv); 339 | } 340 | }; -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_H_INCLUDED 13 | #define CUDA_RASTERIZER_H_INCLUDED 14 | 15 | #include <vector> 16 | #include <functional> 17 | 18 | namespace CudaRasterizer 19 | { 20 | class Rasterizer 21 | { 22 | public: 23 | 24 | static void markVisible( 25 | int P, 26 | float* means3D, 27 | float* viewmatrix, 28 | float* projmatrix, 29 | bool* present); 30 | 31 | static int forward( 32 | std::function<char* (size_t)> geometryBuffer, 33 | std::function<char* (size_t)> binningBuffer, 34 | std::function<char* (size_t)> imageBuffer, 35 | const int P, int D, int M, 36 | const float* background, 37 | const int width, int height, 38 | const float* means3D, 39 | const float* shs, 40 | const float* colors_precomp, 41 | const float* opacities, 42 | const float* scales, 43 | const float scale_modifier, 44 | const float* rotations, 45 | const float* cov3D_precomp, 46 | const float* viewmatrix, 47 | const float* projmatrix, 48 | const float* cam_pos, 49 | const float tan_fovx, float tan_fovy, 50 | const bool prefiltered, 51 | float* out_color, 52 | float* out_depth, 53 | float* out_opacity, 54 | int* radii = nullptr, 55 | int* n_touched = nullptr, 56 | bool debug = false); 57 | 58 | static void backward( 59 | const int P, int D, int M, int R, 60 | const float* background, 61 | const int width, int height, 62 | const float* means3D, 63 | const float* shs, 64 | const float* colors_precomp, 65 | const float* scales, 66 | const float scale_modifier, 67 | const float* rotations, 68 | const float* cov3D_precomp, 69 | const float* viewmatrix, 70 | const float* projmatrix, 71 | const float* projmatrix_raw, 72 | const float* campos, 73 | const float tan_fovx, float tan_fovy, 74 | const int* radii, 75 | char* geom_buffer, 76 | char* binning_buffer, 77 | char* image_buffer, 78 | const float* dL_dpix, 79 | const float* dL_dpix_depth, 80 | float* dL_dmean2D, 81 | float* dL_dconic, 82 | float* dL_dopacity, 83 | float* dL_dcolor, 84 | float* dL_ddepths, 85 | float* dL_dmean3D, 86 | float* dL_dcov3D, 87 | float* dL_dsh, 88 | float* dL_dscale, 89 | float* dL_drot, 90 | float* dL_dtau, 91 | bool debug); 92 | }; 93 | }; 94 | 95 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer_impl.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include "rasterizer_impl.h" 13 | #include <iostream> 14 | #include <fstream> 15 | #include <algorithm> 16 | #include <numeric> 17 | #include <cuda.h> 18 | #include "cuda_runtime.h" 19 | #include "device_launch_parameters.h" 20 | #include <cub/cub.cuh> 21 | #include <cub/device/device_radix_sort.cuh> 22 | #define GLM_FORCE_CUDA 23 | #include <glm/glm.hpp> 24 | 25 | #include <cooperative_groups.h> 26 | #include <cooperative_groups/reduce.h> 27 | namespace cg = cooperative_groups; 28 | 29 | #include "auxiliary.h" 30 | #include "forward.h" 31 | #include "backward.h" 32 | 33 | // Helper function to find the next-highest bit of the MSB 34 | // on the CPU. 35 | uint32_t getHigherMsb(uint32_t n) 36 | { 37 | uint32_t msb = sizeof(n) * 4; 38 | uint32_t step = msb; 39 | while (step > 1) 40 | { 41 | step /= 2; 42 | if (n >> msb) 43 | msb += step; 44 | else 45 | msb -= step; 46 | } 47 | if (n >> msb) 48 | msb++; 49 | return msb; 50 | } 51 | 52 | // Wrapper method to call auxiliary coarse frustum containment test.
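// (The test itself is in_frustum() from auxiliary.h; p_view receives the
// candidate point in view space as a by-reference output.)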
53 | // Mark all Gaussians that pass it. 54 | __global__ void checkFrustum(int P, 55 | const float* orig_points, 56 | const float* viewmatrix, 57 | const float* projmatrix, 58 | bool* present) 59 | { 60 | auto idx = cg::this_grid().thread_rank(); 61 | if (idx >= P) 62 | return; 63 | 64 | float3 p_view; 65 | present[idx] = in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view); 66 | } 67 | 68 | // Generates one key/value pair for all Gaussian / tile overlaps. 69 | // Run once per Gaussian (1:N mapping). 70 | __global__ void duplicateWithKeys( 71 | int P, 72 | const float2* points_xy, 73 | const float* depths, 74 | const uint32_t* offsets, 75 | uint64_t* gaussian_keys_unsorted, 76 | uint32_t* gaussian_values_unsorted, 77 | int* radii, 78 | dim3 grid) 79 | { 80 | auto idx = cg::this_grid().thread_rank(); 81 | if (idx >= P) 82 | return; 83 | 84 | // Generate no key/value pair for invisible Gaussians 85 | if (radii[idx] > 0) 86 | { 87 | // Find this Gaussian's offset in buffer for writing keys/values. 88 | uint32_t off = (idx == 0) ? 0 : offsets[idx - 1]; 89 | uint2 rect_min, rect_max; 90 | 91 | getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid); 92 | 93 | // For each tile that the bounding rect overlaps, emit a 94 | // key/value pair. The key is | tile ID | depth |, 95 | // and the value is the ID of the Gaussian. Sorting the values 96 | // with this key yields Gaussian IDs in a list, such that they 97 | // are first sorted by tile and then by depth. 98 | for (int y = rect_min.y; y < rect_max.y; y++) 99 | { 100 | for (int x = rect_min.x; x < rect_max.x; x++) 101 | { 102 | uint64_t key = y * grid.x + x; 103 | key <<= 32; 104 | key |= *((uint32_t*)&depths[idx]); 105 | gaussian_keys_unsorted[off] = key; 106 | gaussian_values_unsorted[off] = idx; 107 | off++; 108 | } 109 | } 110 | } 111 | } 112 | 113 | // Check keys to see if it is at the start/end of one tile's range in 114 | // the full sorted list. If yes, write start/end of this tile. 115 | // Run once per instanced (duplicated) Gaussian ID. 116 | __global__ void identifyTileRanges(int L, uint64_t* point_list_keys, uint2* ranges) 117 | { 118 | auto idx = cg::this_grid().thread_rank(); 119 | if (idx >= L) 120 | return; 121 | 122 | // Read tile ID from key. Update start/end of tile range if at limit. 
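// A worked example of the logic below: for sorted tile IDs [0, 0, 1, 1, 1, 3]
// (L = 6), this writes ranges[0] = {0, 2}, ranges[1] = {2, 5} and
// ranges[3] = {5, 6}; tile 2, which no key mentions, keeps the zeroed
// {0, 0} range set by the cudaMemset in forward().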
123 | uint64_t key = point_list_keys[idx]; 124 | uint32_t currtile = key >> 32; 125 | if (idx == 0) 126 | ranges[currtile].x = 0; 127 | else 128 | { 129 | uint32_t prevtile = point_list_keys[idx - 1] >> 32; 130 | if (currtile != prevtile) 131 | { 132 | ranges[prevtile].y = idx; 133 | ranges[currtile].x = idx; 134 | } 135 | } 136 | if (idx == L - 1) 137 | ranges[currtile].y = L; 138 | } 139 | 140 | // Mark Gaussians as visible/invisible, based on view frustum testing 141 | void CudaRasterizer::Rasterizer::markVisible( 142 | int P, 143 | float* means3D, 144 | float* viewmatrix, 145 | float* projmatrix, 146 | bool* present) 147 | { 148 | checkFrustum << <(P + 255) / 256, 256 >> > ( 149 | P, 150 | means3D, 151 | viewmatrix, projmatrix, 152 | present); 153 | } 154 | 155 | CudaRasterizer::GeometryState CudaRasterizer::GeometryState::fromChunk(char*& chunk, size_t P) 156 | { 157 | GeometryState geom; 158 | obtain(chunk, geom.depths, P, 128); 159 | obtain(chunk, geom.clamped, P * 3, 128); 160 | obtain(chunk, geom.internal_radii, P, 128); 161 | obtain(chunk, geom.means2D, P, 128); 162 | obtain(chunk, geom.cov3D, P * 6, 128); 163 | obtain(chunk, geom.conic_opacity, P, 128); 164 | obtain(chunk, geom.rgb, P * 3, 128); 165 | obtain(chunk, geom.tiles_touched, P, 128); 166 | cub::DeviceScan::InclusiveSum(nullptr, geom.scan_size, geom.tiles_touched, geom.tiles_touched, P); 167 | obtain(chunk, geom.scanning_space, geom.scan_size, 128); 168 | obtain(chunk, geom.point_offsets, P, 128); 169 | return geom; 170 | } 171 | 172 | CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N) 173 | { 174 | ImageState img; 175 | obtain(chunk, img.accum_alpha, N, 128); 176 | obtain(chunk, img.n_contrib, N, 128); 177 | obtain(chunk, img.ranges, N, 128); 178 | return img; 179 | } 180 | 181 | CudaRasterizer::BinningState CudaRasterizer::BinningState::fromChunk(char*& chunk, size_t P) 182 | { 183 | BinningState binning; 184 | obtain(chunk, binning.point_list, P, 128); 185 | obtain(chunk, binning.point_list_unsorted, P, 128); 186 | obtain(chunk, binning.point_list_keys, P, 128); 187 | obtain(chunk, binning.point_list_keys_unsorted, P, 128); 188 | cub::DeviceRadixSort::SortPairs( 189 | nullptr, binning.sorting_size, 190 | binning.point_list_keys_unsorted, binning.point_list_keys, 191 | binning.point_list_unsorted, binning.point_list, P); 192 | obtain(chunk, binning.list_sorting_space, binning.sorting_size, 128); 193 | return binning; 194 | } 195 | 196 | // Forward rendering procedure for differentiable rasterization 197 | // of Gaussians. 
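// The three std::function parameters are resize callbacks: handed a byte
// count, each must return a pointer to at least that much device memory
// (resizeFunctional() in rasterize_points.cu builds them on top of torch
// tensors). A minimal non-torch sketch, assuming the caller owns a raw
// CUDA allocation `buf`:
//
//   auto grow = [&](size_t N) { cudaFree(buf); cudaMalloc(&buf, N); return (char*)buf; };
//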
198 | int CudaRasterizer::Rasterizer::forward( 199 | std::function<char* (size_t)> geometryBuffer, 200 | std::function<char* (size_t)> binningBuffer, 201 | std::function<char* (size_t)> imageBuffer, 202 | const int P, int D, int M, 203 | const float* background, 204 | const int width, int height, 205 | const float* means3D, 206 | const float* shs, 207 | const float* colors_precomp, 208 | const float* opacities, 209 | const float* scales, 210 | const float scale_modifier, 211 | const float* rotations, 212 | const float* cov3D_precomp, 213 | const float* viewmatrix, 214 | const float* projmatrix, 215 | const float* cam_pos, 216 | const float tan_fovx, float tan_fovy, 217 | const bool prefiltered, 218 | float* out_color, 219 | float* out_depth, 220 | float* out_opacity, 221 | int* radii, 222 | int* n_touched, 223 | bool debug) 224 | { 225 | const float focal_y = height / (2.0f * tan_fovy); 226 | const float focal_x = width / (2.0f * tan_fovx); 227 | 228 | size_t chunk_size = required<GeometryState>(P); 229 | char* chunkptr = geometryBuffer(chunk_size); 230 | GeometryState geomState = GeometryState::fromChunk(chunkptr, P); 231 | 232 | if (radii == nullptr) 233 | { 234 | radii = geomState.internal_radii; 235 | } 236 | 237 | dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); 238 | dim3 block(BLOCK_X, BLOCK_Y, 1); 239 | 240 | // Dynamically resize image-based auxiliary buffers during training 241 | size_t img_chunk_size = required<ImageState>(width * height); 242 | char* img_chunkptr = imageBuffer(img_chunk_size); 243 | ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height); 244 | 245 | if (NUM_CHANNELS != 3 && colors_precomp == nullptr) 246 | { 247 | throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!"); 248 | } 249 | 250 | // Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB) 251 | CHECK_CUDA(FORWARD::preprocess( 252 | P, D, M, 253 | means3D, 254 | (glm::vec3*)scales, 255 | scale_modifier, 256 | (glm::vec4*)rotations, 257 | opacities, 258 | shs, 259 | geomState.clamped, 260 | cov3D_precomp, 261 | colors_precomp, 262 | viewmatrix, projmatrix, 263 | (glm::vec3*)cam_pos, 264 | width, height, 265 | focal_x, focal_y, 266 | tan_fovx, tan_fovy, 267 | radii, 268 | geomState.means2D, 269 | geomState.depths, 270 | geomState.cov3D, 271 | geomState.rgb, 272 | geomState.conic_opacity, 273 | tile_grid, 274 | geomState.tiles_touched, 275 | prefiltered 276 | ), debug) 277 | 278 | // Compute prefix sum over full list of touched tile counts by Gaussians 279 | // E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8] 280 | CHECK_CUDA(cub::DeviceScan::InclusiveSum(geomState.scanning_space, geomState.scan_size, geomState.tiles_touched, geomState.point_offsets, P), debug) 281 | 282 | // Retrieve total number of Gaussian instances to launch and resize aux buffers 283 | int num_rendered; 284 | CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1, sizeof(int), cudaMemcpyDeviceToHost), debug); 285 | 286 | size_t binning_chunk_size = required<BinningState>(num_rendered); 287 | char* binning_chunkptr = binningBuffer(binning_chunk_size); 288 | BinningState binningState = BinningState::fromChunk(binning_chunkptr, num_rendered); 289 | 290 | // For each instance to be rendered, produce adequate [ tile | depth ] key 291 | // and corresponding duplicated Gaussian indices to be sorted 292 | duplicateWithKeys << <(P + 255) / 256, 256 >> > ( 293 | P, 294 | geomState.means2D, 295 | geomState.depths, 296 | geomState.point_offsets, 297 | binningState.point_list_keys_unsorted, 298 |
binningState.point_list_unsorted, 299 | radii, 300 | tile_grid) 301 | CHECK_CUDA(, debug) 302 | 303 | int bit = getHigherMsb(tile_grid.x * tile_grid.y); 304 | 305 | // Sort complete list of (duplicated) Gaussian indices by keys 306 | CHECK_CUDA(cub::DeviceRadixSort::SortPairs( 307 | binningState.list_sorting_space, 308 | binningState.sorting_size, 309 | binningState.point_list_keys_unsorted, binningState.point_list_keys, 310 | binningState.point_list_unsorted, binningState.point_list, 311 | num_rendered, 0, 32 + bit), debug) 312 | 313 | CHECK_CUDA(cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)), debug); 314 | 315 | // Identify start and end of per-tile workloads in sorted list 316 | if (num_rendered > 0) 317 | identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > ( 318 | num_rendered, 319 | binningState.point_list_keys, 320 | imgState.ranges); 321 | CHECK_CUDA(, debug) 322 | 323 | // Let each tile blend its range of Gaussians independently in parallel 324 | const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : geomState.rgb; 325 | CHECK_CUDA(FORWARD::render( 326 | tile_grid, block, 327 | imgState.ranges, 328 | binningState.point_list, 329 | width, height, 330 | geomState.means2D, 331 | feature_ptr, 332 | geomState.conic_opacity, 333 | imgState.accum_alpha, 334 | imgState.n_contrib, 335 | background, 336 | out_color, 337 | geomState.depths, 338 | out_depth, 339 | out_opacity, 340 | n_touched 341 | ), debug) 342 | 343 | return num_rendered; 344 | } 345 | 346 | // Produce necessary gradients for optimization, corresponding 347 | // to forward render pass 348 | void CudaRasterizer::Rasterizer::backward( 349 | const int P, int D, int M, int R, 350 | const float* background, 351 | const int width, int height, 352 | const float* means3D, 353 | const float* shs, 354 | const float* colors_precomp, 355 | const float* scales, 356 | const float scale_modifier, 357 | const float* rotations, 358 | const float* cov3D_precomp, 359 | const float* viewmatrix, 360 | const float* projmatrix, 361 | const float* projmatrix_raw, 362 | const float* campos, 363 | const float tan_fovx, float tan_fovy, 364 | const int* radii, 365 | char* geom_buffer, 366 | char* binning_buffer, 367 | char* img_buffer, 368 | const float* dL_dpix, 369 | const float* dL_dpix_depth, 370 | float* dL_dmean2D, 371 | float* dL_dconic, 372 | float* dL_dopacity, 373 | float* dL_dcolor, 374 | float* dL_ddepth, 375 | float* dL_dmean3D, 376 | float* dL_dcov3D, 377 | float* dL_dsh, 378 | float* dL_dscale, 379 | float* dL_drot, 380 | float* dL_dtau, 381 | bool debug) 382 | { 383 | GeometryState geomState = GeometryState::fromChunk(geom_buffer, P); 384 | BinningState binningState = BinningState::fromChunk(binning_buffer, R); 385 | ImageState imgState = ImageState::fromChunk(img_buffer, width * height); 386 | 387 | if (radii == nullptr) 388 | { 389 | radii = geomState.internal_radii; 390 | } 391 | 392 | const float focal_y = height / (2.0f * tan_fovy); 393 | const float focal_x = width / (2.0f * tan_fovx); 394 | 395 | const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); 396 | const dim3 block(BLOCK_X, BLOCK_Y, 1); 397 | 398 | // Compute loss gradients w.r.t. 2D mean position, conic matrix, 399 | // opacity and RGB of Gaussians from per-pixel loss gradients. 400 | // If we were given precomputed colors and not SHs, use them. 401 | const float* color_ptr = (colors_precomp != nullptr) ? 
colors_precomp : geomState.rgb; 402 | const float* depth_ptr = geomState.depths; 403 | 404 | CHECK_CUDA(BACKWARD::render( 405 | tile_grid, 406 | block, 407 | imgState.ranges, 408 | binningState.point_list, 409 | width, height, 410 | background, 411 | geomState.means2D, 412 | geomState.conic_opacity, 413 | color_ptr, 414 | depth_ptr, 415 | imgState.accum_alpha, 416 | imgState.n_contrib, 417 | dL_dpix, 418 | dL_dpix_depth, 419 | (float3*)dL_dmean2D, 420 | (float4*)dL_dconic, 421 | dL_dopacity, 422 | dL_dcolor, 423 | dL_ddepth 424 | ), debug) 425 | 426 | // Take care of the rest of preprocessing. Was the precomputed covariance 427 | // given to us or a scales/rot pair? If precomputed, pass that. If not, 428 | // use the one we computed ourselves. 429 | const float* cov3D_ptr = (cov3D_precomp != nullptr) ? cov3D_precomp : geomState.cov3D; 430 | CHECK_CUDA(BACKWARD::preprocess(P, D, M, 431 | (float3*)means3D, 432 | radii, 433 | shs, 434 | geomState.clamped, 435 | (glm::vec3*)scales, 436 | (glm::vec4*)rotations, 437 | scale_modifier, 438 | cov3D_ptr, 439 | viewmatrix, 440 | projmatrix, 441 | projmatrix_raw, 442 | focal_x, focal_y, 443 | tan_fovx, tan_fovy, 444 | (glm::vec3*)campos, 445 | (float3*)dL_dmean2D, 446 | dL_dconic, 447 | (glm::vec3*)dL_dmean3D, 448 | dL_dcolor, 449 | dL_ddepth, 450 | dL_dcov3D, 451 | dL_dsh, 452 | (glm::vec3*)dL_dscale, 453 | (glm::vec4*)dL_drot, 454 | dL_dtau), debug) 455 | } -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer_impl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #pragma once 13 | 14 | #include <iostream> 15 | #include <vector> 16 | #include "rasterizer.h" 17 | #include <cuda_runtime_api.h> 18 | 19 | namespace CudaRasterizer 20 | { 21 | template <typename T> 22 | static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment) 23 | { 24 | std::size_t offset = (reinterpret_cast<std::uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1); 25 | ptr = reinterpret_cast<T*>(offset); 26 | chunk = reinterpret_cast<char*>(ptr + count); 27 | } 28 | 29 | struct GeometryState 30 | { 31 | size_t scan_size; 32 | float* depths; 33 | char* scanning_space; 34 | bool* clamped; 35 | int* internal_radii; 36 | float2* means2D; 37 | float* cov3D; 38 | float4* conic_opacity; 39 | float* rgb; 40 | uint32_t* point_offsets; 41 | uint32_t* tiles_touched; 42 | 43 | static GeometryState fromChunk(char*& chunk, size_t P); 44 | }; 45 | 46 | struct ImageState 47 | { 48 | uint2* ranges; 49 | uint32_t* n_contrib; 50 | float* accum_alpha; 51 | 52 | static ImageState fromChunk(char*& chunk, size_t N); 53 | }; 54 | 55 | struct BinningState 56 | { 57 | size_t sorting_size; 58 | uint64_t* point_list_keys_unsorted; 59 | uint64_t* point_list_keys; 60 | uint32_t* point_list_unsorted; 61 | uint32_t* point_list; 62 | char* list_sorting_space; 63 | 64 | static BinningState fromChunk(char*& chunk, size_t P); 65 | }; 66 | 67 | template <typename T> 68 | size_t required(size_t P) 69 | { 70 | char* size = nullptr; 71 | T::fromChunk(size, P); 72 | return ((size_t)size) + 128; 73 | } 74 | }; -------------------------------------------------------------------------------- /diff_gaussian_rasterization/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | from typing import NamedTuple 13 | import torch.nn as nn 14 | import torch
15 | from . import _C 16 | 17 | def cpu_deep_copy_tuple(input_tuple): 18 | copied_tensors = [item.cpu().clone() if isinstance(item, torch.Tensor) else item for item in input_tuple] 19 | return tuple(copied_tensors) 20 | 21 | def rasterize_gaussians( 22 | means3D, 23 | means2D, 24 | sh, 25 | colors_precomp, 26 | opacities, 27 | scales, 28 | rotations, 29 | cov3Ds_precomp, 30 | theta, 31 | rho, 32 | raster_settings, 33 | ): 34 | return _RasterizeGaussians.apply( 35 | means3D, 36 | means2D, 37 | sh, 38 | colors_precomp, 39 | opacities, 40 | scales, 41 | rotations, 42 | cov3Ds_precomp, 43 | theta, 44 | rho, 45 | raster_settings, 46 | ) 47 | 48 | class _RasterizeGaussians(torch.autograd.Function): 49 | @staticmethod 50 | def forward( 51 | ctx, 52 | means3D, 53 | means2D, 54 | sh, 55 | colors_precomp, 56 | opacities, 57 | scales, 58 | rotations, 59 | cov3Ds_precomp, 60 | theta, 61 | rho, 62 | raster_settings, 63 | ): 64 | 65 | # Restructure arguments the way that the C++ lib expects them 66 | args = ( 67 | raster_settings.bg, 68 | means3D, 69 | colors_precomp, 70 | opacities, 71 | scales, 72 | rotations, 73 | raster_settings.scale_modifier, 74 | cov3Ds_precomp, 75 | raster_settings.viewmatrix, 76 | raster_settings.projmatrix, 77 | raster_settings.projmatrix_raw, 78 | raster_settings.tanfovx, 79 | raster_settings.tanfovy, 80 | raster_settings.image_height, 81 | raster_settings.image_width, 82 | sh, 83 | raster_settings.sh_degree, 84 | raster_settings.campos, 85 | raster_settings.prefiltered, 86 | raster_settings.debug, 87 | ) 88 | 89 | # Invoke C++/CUDA rasterizer 90 | if raster_settings.debug: 91 | cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted 92 | try: 93 | num_rendered, color, radii, geomBuffer, binningBuffer, imgBuffer, depth, opacity, n_touched = _C.rasterize_gaussians(*args) 94 | except Exception as ex: 95 | torch.save(cpu_args, "snapshot_fw.dump") 96 | print("\nAn error occurred in forward. Please forward snapshot_fw.dump for debugging.")
97 | raise ex 98 | else: 99 | num_rendered, color, radii, geomBuffer, binningBuffer, imgBuffer, depth, opacity, n_touched = _C.rasterize_gaussians(*args) 100 | 101 | # Keep relevant tensors for backward 102 | ctx.raster_settings = raster_settings 103 | ctx.num_rendered = num_rendered 104 | ctx.save_for_backward(colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, sh, geomBuffer, binningBuffer, imgBuffer) 105 | return color, radii, depth, opacity, n_touched 106 | 107 | @staticmethod 108 | def backward(ctx, grad_out_color, grad_out_radii, grad_out_depth, grad_out_opacity, grad_n_touched): 109 | 110 | # Restore necessary values from context 111 | num_rendered = ctx.num_rendered 112 | raster_settings = ctx.raster_settings 113 | colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, sh, geomBuffer, binningBuffer, imgBuffer = ctx.saved_tensors 114 | 115 | # Restructure args as C++ method expects them 116 | args = (raster_settings.bg, 117 | means3D, 118 | radii, 119 | colors_precomp, 120 | scales, 121 | rotations, 122 | raster_settings.scale_modifier, 123 | cov3Ds_precomp, 124 | raster_settings.viewmatrix, 125 | raster_settings.projmatrix, 126 | raster_settings.projmatrix_raw, 127 | raster_settings.tanfovx, 128 | raster_settings.tanfovy, 129 | grad_out_color, 130 | grad_out_depth, 131 | sh, 132 | raster_settings.sh_degree, 133 | raster_settings.campos, 134 | geomBuffer, 135 | num_rendered, 136 | binningBuffer, 137 | imgBuffer, 138 | raster_settings.debug) 139 | 140 | # Compute gradients for relevant tensors by invoking backward method 141 | if raster_settings.debug: 142 | cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted 143 | try: 144 | grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations, grad_tau = _C.rasterize_gaussians_backward(*args) 145 | except Exception as ex: 146 | torch.save(cpu_args, "snapshot_bw.dump") 147 | print("\nAn error occurred in backward. Writing snapshot_bw.dump for debugging.\n")
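# The snapshot files written above are plain torch.save archives of the
# CPU-copied inputs, so a failing call can be re-examined offline, e.g.
# (an illustrative sketch): args = torch.load("snapshot_bw.dump")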
148 | raise ex 149 | else: 150 | grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations, grad_tau = _C.rasterize_gaussians_backward(*args) 151 | 152 | grad_tau = torch.sum(grad_tau.view(-1, 6), dim=0) 153 | grad_rho = grad_tau[:3].view(1, -1) 154 | grad_theta = grad_tau[3:].view(1, -1) 155 | 156 | 157 | grads = ( 158 | grad_means3D, 159 | grad_means2D, 160 | grad_sh, 161 | grad_colors_precomp, 162 | grad_opacities, 163 | grad_scales, 164 | grad_rotations, 165 | grad_cov3Ds_precomp, 166 | grad_theta, 167 | grad_rho, 168 | None, 169 | ) 170 | 171 | return grads 172 | 173 | class GaussianRasterizationSettings(NamedTuple): 174 | image_height: int 175 | image_width: int 176 | tanfovx : float 177 | tanfovy : float 178 | bg : torch.Tensor 179 | scale_modifier : float 180 | viewmatrix : torch.Tensor 181 | projmatrix : torch.Tensor 182 | projmatrix_raw : torch.Tensor 183 | sh_degree : int 184 | campos : torch.Tensor 185 | prefiltered : bool 186 | debug : bool 187 | 188 | class GaussianRasterizer(nn.Module): 189 | def __init__(self, raster_settings): 190 | super().__init__() 191 | self.raster_settings = raster_settings 192 | 193 | def markVisible(self, positions): 194 | # Mark visible points (based on frustum culling for camera) with a boolean 195 | with torch.no_grad(): 196 | raster_settings = self.raster_settings 197 | visible = _C.mark_visible( 198 | positions, 199 | raster_settings.viewmatrix, 200 | raster_settings.projmatrix) 201 | 202 | return visible 203 | 204 | def forward(self, means3D, means2D, opacities, shs = None, colors_precomp = None, scales = None, rotations = None, cov3D_precomp = None, theta=None, rho=None): 205 | 206 | raster_settings = self.raster_settings 207 | 208 | if (shs is None and colors_precomp is None) or (shs is not None and colors_precomp is not None): 209 | raise Exception('Please provide exactly one of either SHs or precomputed colors!') 210 | 211 | if ((scales is None or rotations is None) and cov3D_precomp is None) or ((scales is not None or rotations is not None) and cov3D_precomp is not None): 212 | raise Exception('Please provide exactly one of either scale/rotation pair or precomputed 3D covariance!') 213 | 214 | if shs is None: 215 | shs = torch.Tensor([]) 216 | if colors_precomp is None: 217 | colors_precomp = torch.Tensor([]) 218 | 219 | if scales is None: 220 | scales = torch.Tensor([]) 221 | if rotations is None: 222 | rotations = torch.Tensor([]) 223 | if cov3D_precomp is None: 224 | cov3D_precomp = torch.Tensor([]) 225 | if theta is None: 226 | theta = torch.Tensor([]) 227 | if rho is None: 228 | rho = torch.Tensor([]) 229 | 230 | 231 | # Invoke C++/CUDA rasterization routine 232 | return rasterize_gaussians( 233 | means3D, 234 | means2D, 235 | shs, 236 | colors_precomp, 237 | opacities, 238 | scales, 239 | rotations, 240 | cov3D_precomp, 241 | theta, 242 | rho, 243 | raster_settings, 244 | ) 245 | 246 | -------------------------------------------------------------------------------- /ext.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file.
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include <torch/extension.h> 13 | #include "rasterize_points.h" 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def("rasterize_gaussians", &RasterizeGaussiansCUDA); 17 | m.def("rasterize_gaussians_backward", &RasterizeGaussiansBackwardCUDA); 18 | m.def("mark_visible", &markVisible); 19 | } -------------------------------------------------------------------------------- /rasterize_points.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include <math.h> 13 | #include <torch/extension.h> 14 | #include <cstdio> 15 | #include <sstream> 16 | #include <iostream> 17 | #include <tuple> 18 | #include <stdio.h> 19 | #include <cuda_runtime_api.h> 20 | #include <memory> 21 | #include "cuda_rasterizer/config.h" 22 | #include "cuda_rasterizer/rasterizer.h" 23 | #include <fstream> 24 | #include <string> 25 | #include <functional> 26 | 27 | std::function<char* (size_t)> resizeFunctional(torch::Tensor& t) { 28 | auto lambda = [&t](size_t N) { 29 | t.resize_({(long long)N}); 30 | return reinterpret_cast<char*>(t.contiguous().data_ptr()); 31 | }; 32 | return lambda; 33 | } 34 | 35 | std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 36 | RasterizeGaussiansCUDA( 37 | const torch::Tensor& background, 38 | const torch::Tensor& means3D, 39 | const torch::Tensor& colors, 40 | const torch::Tensor& opacity, 41 | const torch::Tensor& scales, 42 | const torch::Tensor& rotations, 43 | const float scale_modifier, 44 | const torch::Tensor& cov3D_precomp, 45 | const torch::Tensor& viewmatrix, 46 | const torch::Tensor& projmatrix, 47 | const torch::Tensor& projmatrix_raw, 48 | const float tan_fovx, 49 | const float tan_fovy, 50 | const int image_height, 51 | const int image_width, 52 | const torch::Tensor& sh, 53 | const int degree, 54 | const torch::Tensor& campos, 55 | const bool prefiltered, 56 | const bool debug) 57 | { 58 | if (means3D.ndimension() != 2 || means3D.size(1) != 3) { 59 | AT_ERROR("means3D must have dimensions (num_points, 3)"); 60 | } 61 | 62 | const int P = means3D.size(0); 63 | const int H = image_height; 64 | const int W = image_width; 65 | 66 | auto int_opts = means3D.options().dtype(torch::kInt32); 67 | auto float_opts = means3D.options().dtype(torch::kFloat32); 68 | 69 | torch::Tensor out_color = torch::full({NUM_CHANNELS, H, W}, 0.0, float_opts); 70 | torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32)); 71 | torch::Tensor n_touched = torch::full({P}, 0, means3D.options().dtype(torch::kInt32)); 72 | torch::Tensor out_depth = torch::full({1, H, W}, 0.0, float_opts); 73 | torch::Tensor out_opacity = torch::full({1, H, W}, 0.0, float_opts); 74 | 75 | torch::Device device(torch::kCUDA); 76 | torch::TensorOptions options(torch::kByte); 77 | torch::Tensor geomBuffer = torch::empty({0}, options.device(device)); 78 | torch::Tensor binningBuffer = torch::empty({0}, options.device(device)); 79 | torch::Tensor imgBuffer = torch::empty({0}, options.device(device)); 80 | std::function<char* (size_t)> geomFunc = resizeFunctional(geomBuffer); 81 | std::function<char* (size_t)> binningFunc = resizeFunctional(binningBuffer); 82 | std::function<char* (size_t)> imgFunc = resizeFunctional(imgBuffer); 83 | 84 | int rendered = 0; 85 | if(P != 0) 86 | { 87 | int M = 0; 88 | if(sh.size(0) != 0) 89 | { 90 | M = sh.size(1); 91 | } 92 | 93 | rendered = CudaRasterizer::Rasterizer::forward( 94 | geomFunc,
95 | binningFunc, 96 | imgFunc, 97 | P, degree, M, 98 | background.contiguous().data<float>(), 99 | W, H, 100 | means3D.contiguous().data<float>(), 101 | sh.contiguous().data_ptr<float>(), 102 | colors.contiguous().data<float>(), 103 | opacity.contiguous().data<float>(), 104 | scales.contiguous().data_ptr<float>(), 105 | scale_modifier, 106 | rotations.contiguous().data_ptr<float>(), 107 | cov3D_precomp.contiguous().data<float>(), 108 | viewmatrix.contiguous().data<float>(), 109 | projmatrix.contiguous().data<float>(), 110 | campos.contiguous().data<float>(), 111 | tan_fovx, 112 | tan_fovy, 113 | prefiltered, 114 | out_color.contiguous().data<float>(), 115 | out_depth.contiguous().data<float>(), 116 | out_opacity.contiguous().data<float>(), 117 | radii.contiguous().data<int>(), 118 | n_touched.contiguous().data<int>(), 119 | debug); 120 | } 121 | return std::make_tuple(rendered, out_color, radii, geomBuffer, binningBuffer, imgBuffer, out_depth, out_opacity, n_touched); 122 | } 123 | 124 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 125 | RasterizeGaussiansBackwardCUDA( 126 | const torch::Tensor& background, 127 | const torch::Tensor& means3D, 128 | const torch::Tensor& radii, 129 | const torch::Tensor& colors, 130 | const torch::Tensor& scales, 131 | const torch::Tensor& rotations, 132 | const float scale_modifier, 133 | const torch::Tensor& cov3D_precomp, 134 | const torch::Tensor& viewmatrix, 135 | const torch::Tensor& projmatrix, 136 | const torch::Tensor& projmatrix_raw, 137 | const float tan_fovx, 138 | const float tan_fovy, 139 | const torch::Tensor& dL_dout_color, 140 | const torch::Tensor& dL_dout_depths, 141 | const torch::Tensor& sh, 142 | const int degree, 143 | const torch::Tensor& campos, 144 | const torch::Tensor& geomBuffer, 145 | const int R, 146 | const torch::Tensor& binningBuffer, 147 | const torch::Tensor& imageBuffer, 148 | const bool debug) 149 | { 150 | const int P = means3D.size(0); 151 | const int H = dL_dout_color.size(1); 152 | const int W = dL_dout_color.size(2); 153 | 154 | int M = 0; 155 | if(sh.size(0) != 0) 156 | { 157 | M = sh.size(1); 158 | } 159 | 160 | torch::Tensor dL_dmeans3D = torch::zeros({P, 3}, means3D.options()); 161 | torch::Tensor dL_dmeans2D = torch::zeros({P, 3}, means3D.options()); 162 | torch::Tensor dL_dcolors = torch::zeros({P, NUM_CHANNELS}, means3D.options()); 163 | torch::Tensor dL_ddepths = torch::zeros({P, 1}, means3D.options()); 164 | torch::Tensor dL_dconic = torch::zeros({P, 2, 2}, means3D.options()); 165 | torch::Tensor dL_dopacity = torch::zeros({P, 1}, means3D.options()); 166 | torch::Tensor dL_dcov3D = torch::zeros({P, 6}, means3D.options()); 167 | torch::Tensor dL_dsh = torch::zeros({P, M, 3}, means3D.options()); 168 | torch::Tensor dL_dscales = torch::zeros({P, 3}, means3D.options()); 169 | torch::Tensor dL_drotations = torch::zeros({P, 4}, means3D.options()); 170 | torch::Tensor dL_dtau = torch::zeros({P,6}, means3D.options()); 171 | 172 | if(P != 0) 173 | { 174 | CudaRasterizer::Rasterizer::backward(P, degree, M, R, 175 | background.contiguous().data<float>(), 176 | W, H, 177 | means3D.contiguous().data<float>(), 178 | sh.contiguous().data<float>(), 179 | colors.contiguous().data<float>(), 180 | scales.data_ptr<float>(), 181 | scale_modifier, 182 | rotations.data_ptr<float>(), 183 | cov3D_precomp.contiguous().data<float>(), 184 | viewmatrix.contiguous().data<float>(), 185 | projmatrix.contiguous().data<float>(), 186 | projmatrix_raw.contiguous().data<float>(), 187 | campos.contiguous().data<float>(), 188 | tan_fovx, 189 | tan_fovy, 190 | radii.contiguous().data<int>(), 191 | reinterpret_cast<char*>(geomBuffer.contiguous().data_ptr()), 192 | reinterpret_cast<char*>(binningBuffer.contiguous().data_ptr()),
193 | reinterpret_cast<char*>(imageBuffer.contiguous().data_ptr()), 194 | dL_dout_color.contiguous().data<float>(), 195 | dL_dout_depths.contiguous().data<float>(), 196 | dL_dmeans2D.contiguous().data<float>(), 197 | dL_dconic.contiguous().data<float>(), 198 | dL_dopacity.contiguous().data<float>(), 199 | dL_dcolors.contiguous().data<float>(), 200 | dL_ddepths.contiguous().data<float>(), 201 | dL_dmeans3D.contiguous().data<float>(), 202 | dL_dcov3D.contiguous().data<float>(), 203 | dL_dsh.contiguous().data<float>(), 204 | dL_dscales.contiguous().data<float>(), 205 | dL_drotations.contiguous().data<float>(), 206 | dL_dtau.contiguous().data<float>(), 207 | debug); 208 | } 209 | 210 | return std::make_tuple(dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D, dL_dcov3D, dL_dsh, dL_dscales, dL_drotations, dL_dtau); 211 | } 212 | 213 | torch::Tensor markVisible( 214 | torch::Tensor& means3D, 215 | torch::Tensor& viewmatrix, 216 | torch::Tensor& projmatrix) 217 | { 218 | const int P = means3D.size(0); 219 | 220 | torch::Tensor present = torch::full({P}, false, means3D.options().dtype(at::kBool)); 221 | 222 | if(P != 0) 223 | { 224 | CudaRasterizer::Rasterizer::markVisible(P, 225 | means3D.contiguous().data<float>(), 226 | viewmatrix.contiguous().data<float>(), 227 | projmatrix.contiguous().data<float>(), 228 | present.contiguous().data<bool>()); 229 | } 230 | 231 | return present; 232 | } -------------------------------------------------------------------------------- /rasterize_points.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | #include <cstdio> 15 | #include <tuple> 16 | #include <string> 17 | 18 | std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 19 | RasterizeGaussiansCUDA( 20 | const torch::Tensor& background, 21 | const torch::Tensor& means3D, 22 | const torch::Tensor& colors, 23 | const torch::Tensor& opacity, 24 | const torch::Tensor& scales, 25 | const torch::Tensor& rotations, 26 | const float scale_modifier, 27 | const torch::Tensor& cov3D_precomp, 28 | const torch::Tensor& viewmatrix, 29 | const torch::Tensor& projmatrix, 30 | const torch::Tensor& projmatrix_raw, 31 | const float tan_fovx, 32 | const float tan_fovy, 33 | const int image_height, 34 | const int image_width, 35 | const torch::Tensor& sh, 36 | const int degree, 37 | const torch::Tensor& campos, 38 | const bool prefiltered, 39 | const bool debug); 40 | 41 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 42 | RasterizeGaussiansBackwardCUDA( 43 | const torch::Tensor& background, 44 | const torch::Tensor& means3D, 45 | const torch::Tensor& radii, 46 | const torch::Tensor& colors, 47 | const torch::Tensor& scales, 48 | const torch::Tensor& rotations, 49 | const float scale_modifier, 50 | const torch::Tensor& cov3D_precomp, 51 | const torch::Tensor& viewmatrix, 52 | const torch::Tensor& projmatrix, 53 | const torch::Tensor& projmatrix_raw, 54 | const float tan_fovx, 55 | const float tan_fovy, 56 | const torch::Tensor& dL_dout_color, 57 | const torch::Tensor& dL_dout_depth, 58 | const torch::Tensor& sh, 59 | const int degree, 60 | const torch::Tensor& campos, 61 | const torch::Tensor& geomBuffer, 62 | const int R, 63 | const torch::Tensor& binningBuffer, 64 | const torch::Tensor& imageBuffer, 65 | const bool debug); 66 | 67 | torch::Tensor markVisible( 68 | torch::Tensor& means3D, 69 | torch::Tensor& viewmatrix, 70 | torch::Tensor& projmatrix);
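// These three declarations are what ext.cpp binds into the Python module;
// from the torch side, for example, a visibility query goes through
// GaussianRasterizer.markVisible in __init__.py, which boils down to:
//
//   visible = _C.mark_visible(means3D, viewmatrix, projmatrix)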
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | from setuptools import setup 13 | from torch.utils.cpp_extension import CUDAExtension, BuildExtension 14 | import os 15 | os.path.dirname(os.path.abspath(__file__)) 16 | 17 | setup( 18 | name="diff_gaussian_rasterization", 19 | packages=['diff_gaussian_rasterization'], 20 | ext_modules=[ 21 | CUDAExtension( 22 | name="diff_gaussian_rasterization._C", 23 | sources=[ 24 | "cuda_rasterizer/rasterizer_impl.cu", 25 | "cuda_rasterizer/forward.cu", 26 | "cuda_rasterizer/backward.cu", 27 | "rasterize_points.cu", 28 | "ext.cpp"], 29 | extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")]}) 30 | ], 31 | cmdclass={ 32 | 'build_ext': BuildExtension 33 | } 34 | ) 35 | --------------------------------------------------------------------------------
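# Typical build, as a usage sketch (assuming a CUDA-enabled PyTorch install):
# fetch the glm submodule first, then install the extension in place:
#
#   git submodule update --init third_party/glm
#   pip install .
#
# This compiles diff_gaussian_rasterization._C from the sources listed above.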