├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE.md ├── README.md ├── cuda_rasterizer ├── auxiliary.h ├── backward.cu ├── backward.h ├── config.h ├── forward.cu ├── forward.h ├── helper_math.h ├── math.h ├── rasterizer.h ├── rasterizer_impl.cu └── rasterizer_impl.h ├── diff_gaussian_rasterization └── __init__.py ├── ext.cpp ├── rasterize_points.cu ├── rasterize_points.h ├── setup.py └── third_party └── stbi_image_write.h /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | diff_gaussian_rasterization.egg-info/ 3 | dist/ 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/glm"] 2 | path = third_party/glm 3 | url = https://github.com/g-truc/glm.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | cmake_minimum_required(VERSION 3.20) 13 | 14 | project(DiffRast LANGUAGES CUDA CXX) 15 | 16 | set(CMAKE_CXX_STANDARD 17) 17 | set(CMAKE_CXX_EXTENSIONS OFF) 18 | set(CMAKE_CUDA_STANDARD 17) 19 | 20 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") 21 | 22 | add_library(CudaRasterizer 23 | cuda_rasterizer/backward.h 24 | cuda_rasterizer/backward.cu 25 | cuda_rasterizer/forward.h 26 | cuda_rasterizer/forward.cu 27 | cuda_rasterizer/auxiliary.h 28 | cuda_rasterizer/rasterizer_impl.cu 29 | cuda_rasterizer/rasterizer_impl.h 30 | cuda_rasterizer/rasterizer.h 31 | ) 32 | 33 | set_target_properties(CudaRasterizer PROPERTIES CUDA_ARCHITECTURES "70;75;86") 34 | 35 | target_include_directories(CudaRasterizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/cuda_rasterizer) 36 | target_include_directories(CudaRasterizer PRIVATE third_party/glm ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 37 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Gaussian-Splatting License 2 | =========================== 3 | 4 | **Inria** and **the Max Planck Institut for Informatik (MPII)** hold all the ownership rights on the *Software* named **gaussian-splatting**. 5 | The *Software* is in the process of being registered with the Agence pour la Protection des 6 | Programmes (APP). 7 | 8 | The *Software* is still being developed by the *Licensor*. 9 | 10 | *Licensor*'s goal is to allow the research community to use, test and evaluate 11 | the *Software*. 12 | 13 | ## 1. Definitions 14 | 15 | *Licensee* means any person or entity that uses the *Software* and distributes 16 | its *Work*. 17 | 18 | *Licensor* means the owners of the *Software*, i.e Inria and MPII 19 | 20 | *Software* means the original work of authorship made available under this 21 | License ie gaussian-splatting. 22 | 23 | *Work* means the *Software* and any additions to or derivative works of the 24 | *Software* that are made available under this License. 25 | 26 | 27 | ## 2. 
Purpose 28 | This license is intended to define the rights granted to the *Licensee* by 29 | Licensors under the *Software*. 30 | 31 | ## 3. Rights granted 32 | 33 | For the above reasons Licensors have decided to distribute the *Software*. 34 | Licensors grant non-exclusive rights to use the *Software* for research purposes 35 | to research users (both academic and industrial), free of charge, without right 36 | to sublicense.. The *Software* may be used "non-commercially", i.e., for research 37 | and/or evaluation purposes only. 38 | 39 | Subject to the terms and conditions of this License, you are granted a 40 | non-exclusive, royalty-free, license to reproduce, prepare derivative works of, 41 | publicly display, publicly perform and distribute its *Work* and any resulting 42 | derivative works in any form. 43 | 44 | ## 4. Limitations 45 | 46 | **4.1 Redistribution.** You may reproduce or distribute the *Work* only if (a) you do 47 | so under this License, (b) you include a complete copy of this License with 48 | your distribution, and (c) you retain without modification any copyright, 49 | patent, trademark, or attribution notices that are present in the *Work*. 50 | 51 | **4.2 Derivative Works.** You may specify that additional or different terms apply 52 | to the use, reproduction, and distribution of your derivative works of the *Work* 53 | ("Your Terms") only if (a) Your Terms provide that the use limitation in 54 | Section 2 applies to your derivative works, and (b) you identify the specific 55 | derivative works that are subject to Your Terms. Notwithstanding Your Terms, 56 | this License (including the redistribution requirements in Section 3.1) will 57 | continue to apply to the *Work* itself. 58 | 59 | **4.3** Any other use without of prior consent of Licensors is prohibited. Research 60 | users explicitly acknowledge having received from Licensors all information 61 | allowing to appreciate the adequacy between of the *Software* and their needs and 62 | to undertake all necessary precautions for its execution and use. 63 | 64 | **4.4** The *Software* is provided both as a compiled library file and as source 65 | code. In case of using the *Software* for a publication or other results obtained 66 | through the use of the *Software*, users are strongly encouraged to cite the 67 | corresponding publications as explained in the documentation of the *Software*. 68 | 69 | ## 5. Disclaimer 70 | 71 | THE USER CANNOT USE, EXPLOIT OR DISTRIBUTE THE *SOFTWARE* FOR COMMERCIAL PURPOSES 72 | WITHOUT PRIOR AND EXPLICIT CONSENT OF LICENSORS. YOU MUST CONTACT INRIA FOR ANY 73 | UNAUTHORIZED USE: stip-sophia.transfert@inria.fr . ANY SUCH ACTION WILL 74 | CONSTITUTE A FORGERY. THIS *SOFTWARE* IS PROVIDED "AS IS" WITHOUT ANY WARRANTIES 75 | OF ANY NATURE AND ANY EXPRESS OR IMPLIED WARRANTIES, WITH REGARDS TO COMMERCIAL 76 | USE, PROFESSIONNAL USE, LEGAL OR NOT, OR OTHER, OR COMMERCIALISATION OR 77 | ADAPTATION. UNLESS EXPLICITLY PROVIDED BY LAW, IN NO EVENT, SHALL INRIA OR THE 78 | AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 79 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 80 | GOODS OR SERVICES, LOSS OF USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION) 81 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 82 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING FROM, OUT OF OR 83 | IN CONNECTION WITH THE *SOFTWARE* OR THE USE OR OTHER DEALINGS IN THE *SOFTWARE*. 
84 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Differential Gaussian Rasterization with Camera Pose Jacobians 2 | 3 | This software is used as the rasterization engine in the paper ["Gaussian Splatting SLAM"](https://arxiv.org/abs/2312.06741), and supports: 4 | 5 | * Analytical gradients for SE(3) camera poses. 6 | * Analytical gradients for rendered depth. 7 | 8 | The code is built on top of the original [Differential Gaussian Rasterization](https://github.com/graphdeco-inria/diff-gaussian-rasterization) used in "3D Gaussian Splatting for Real-Time Radiance Field Rendering". 9 | 10 | If you make use of it in your own research, please be so kind as to cite both papers. 11 | 12 | 13 |
14 | 
15 | BibTeX:
16 | 
@Article{kerbl3Dgaussians,
17 |       author       = {Kerbl, Bernhard and Kopanas, Georgios and Leimk{\"u}hler, Thomas and Drettakis, George},
18 |       title        = {3D Gaussian Splatting for Real-Time Radiance Field Rendering},
19 |       journal      = {ACM Transactions on Graphics},
20 |       number       = {4},
21 |       volume       = {42},
22 |       month        = {July},
23 |       year         = {2023},
24 |       url          = {https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/}
25 | }
26 | 
27 | 
@inproceedings{Matsuki:Murai:etal:CVPR2024,
28 |   title={{G}aussian {S}platting {SLAM}},
29 |   author={Hidenobu Matsuki and Riku Murai and Paul H. J. Kelly and Andrew J. Davison},
30 |   booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
31 |   year={2024}
32 | }
33 | 
34 | 
35 |
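The camera pose gradient `dL_dtau` computed by the backward pass is a per-Gaussian 6-vector whose first three components are translational (`rho`) and last three rotational (`theta`), following the left-perturbation convention implied by the `dp_C_d_rho` / `dp_C_d_theta` Jacobians in `cuda_rasterizer/backward.cu`. Below is a minimal sketch of those point Jacobians (illustrative only; `Vec3` and `dpC_dtheta_cols` are not part of this code base):

```cpp
// For a camera-frame point p_C = T_CW * p_W and a left-multiplied se(3)
// perturbation tau = [rho; theta] of T_CW:
//   d p_C / d rho   = I           (identity)
//   d p_C / d theta = -skew(p_C)  (negative cross-product matrix)
struct Vec3 { float x, y, z; };

// Columns of d p_C / d theta, matching -mat33::skew_symmetric(p_C) in
// backward.cu. Illustrative helper, not part of the library.
inline void dpC_dtheta_cols(const Vec3& p, Vec3 cols[3])
{
    cols[0] = { 0.f,  -p.z,  p.y };
    cols[1] = { p.z,   0.f, -p.x };
    cols[2] = { -p.y,  p.x,  0.f };
}
```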
36 | 37 | -------------------------------------------------------------------------------- /cuda_rasterizer/auxiliary.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_AUXILIARY_H_INCLUDED 13 | #define CUDA_RASTERIZER_AUXILIARY_H_INCLUDED 14 | 15 | #include "config.h" 16 | #include "stdio.h" 17 | 18 | #define BLOCK_SIZE (BLOCK_X * BLOCK_Y) 19 | #define NUM_WARPS (BLOCK_SIZE/32) 20 | 21 | // Spherical harmonics coefficients 22 | __device__ const float SH_C0 = 0.28209479177387814f; 23 | __device__ const float SH_C1 = 0.4886025119029199f; 24 | __device__ const float SH_C2[] = { 25 | 1.0925484305920792f, 26 | -1.0925484305920792f, 27 | 0.31539156525252005f, 28 | -1.0925484305920792f, 29 | 0.5462742152960396f 30 | }; 31 | __device__ const float SH_C3[] = { 32 | -0.5900435899266435f, 33 | 2.890611442640554f, 34 | -0.4570457994644658f, 35 | 0.3731763325901154f, 36 | -0.4570457994644658f, 37 | 1.445305721320277f, 38 | -0.5900435899266435f 39 | }; 40 | 41 | __forceinline__ __device__ float ndc2Pix(float v, int S) 42 | { 43 | return ((v + 1.0) * S - 1.0) * 0.5; 44 | } 45 | 46 | __forceinline__ __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid) 47 | { 48 | rect_min = { 49 | min(grid.x, max((int)0, (int)((p.x - max_radius) / BLOCK_X))), 50 | min(grid.y, max((int)0, (int)((p.y - max_radius) / BLOCK_Y))) 51 | }; 52 | rect_max = { 53 | min(grid.x, max((int)0, (int)((p.x + max_radius + BLOCK_X - 1) / BLOCK_X))), 54 | min(grid.y, max((int)0, (int)((p.y + max_radius + BLOCK_Y - 1) / BLOCK_Y))) 55 | }; 56 | } 57 | 58 | __forceinline__ __device__ float3 transformPoint4x3(const float3& p, const float* matrix) 59 | { 60 | float3 transformed = { 61 | matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12], 62 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13], 63 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14], 64 | }; 65 | return transformed; 66 | } 67 | 68 | __forceinline__ __device__ float4 transformPoint4x4(const float3& p, const float* matrix) 69 | { 70 | float4 transformed = { 71 | matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12], 72 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13], 73 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14], 74 | matrix[3] * p.x + matrix[7] * p.y + matrix[11] * p.z + matrix[15] 75 | }; 76 | return transformed; 77 | } 78 | 79 | __forceinline__ __device__ float3 transformVec4x3(const float3& p, const float* matrix) 80 | { 81 | float3 transformed = { 82 | matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z, 83 | matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z, 84 | matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z, 85 | }; 86 | return transformed; 87 | } 88 | 89 | __forceinline__ __device__ float3 transformVec4x3Transpose(const float3& p, const float* matrix) 90 | { 91 | float3 transformed = { 92 | matrix[0] * p.x + matrix[1] * p.y + matrix[2] * p.z, 93 | matrix[4] * p.x + matrix[5] * p.y + matrix[6] * p.z, 94 | matrix[8] * p.x + matrix[9] * p.y + matrix[10] * p.z, 95 | }; 96 | return transformed; 97 | } 98 | 99 | __forceinline__ __device__ 
float dnormvdz(float3 v, float3 dv) 100 | { 101 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; 102 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 103 | float dnormvdz = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32; 104 | return dnormvdz; 105 | } 106 | 107 | __forceinline__ __device__ float3 dnormvdv(float3 v, float3 dv) 108 | { 109 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; 110 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 111 | 112 | float3 dnormvdv; 113 | dnormvdv.x = ((+sum2 - v.x * v.x) * dv.x - v.y * v.x * dv.y - v.z * v.x * dv.z) * invsum32; 114 | dnormvdv.y = (-v.x * v.y * dv.x + (sum2 - v.y * v.y) * dv.y - v.z * v.y * dv.z) * invsum32; 115 | dnormvdv.z = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32; 116 | return dnormvdv; 117 | } 118 | 119 | __forceinline__ __device__ float4 dnormvdv(float4 v, float4 dv) 120 | { 121 | float sum2 = v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w; 122 | float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); 123 | 124 | float4 vdv = { v.x * dv.x, v.y * dv.y, v.z * dv.z, v.w * dv.w }; 125 | float vdv_sum = vdv.x + vdv.y + vdv.z + vdv.w; 126 | float4 dnormvdv; 127 | dnormvdv.x = ((sum2 - v.x * v.x) * dv.x - v.x * (vdv_sum - vdv.x)) * invsum32; 128 | dnormvdv.y = ((sum2 - v.y * v.y) * dv.y - v.y * (vdv_sum - vdv.y)) * invsum32; 129 | dnormvdv.z = ((sum2 - v.z * v.z) * dv.z - v.z * (vdv_sum - vdv.z)) * invsum32; 130 | dnormvdv.w = ((sum2 - v.w * v.w) * dv.w - v.w * (vdv_sum - vdv.w)) * invsum32; 131 | return dnormvdv; 132 | } 133 | 134 | __forceinline__ __device__ float sigmoid(float x) 135 | { 136 | return 1.0f / (1.0f + expf(-x)); 137 | } 138 | 139 | __forceinline__ __device__ bool in_frustum(int idx, 140 | const float* orig_points, 141 | const float* viewmatrix, 142 | const float* projmatrix, 143 | bool prefiltered, 144 | float3& p_view) 145 | { 146 | float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; 147 | 148 | // Bring points to screen space 149 | float4 p_hom = transformPoint4x4(p_orig, projmatrix); 150 | float p_w = 1.0f / (p_hom.w + 0.0000001f); 151 | float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; 152 | p_view = transformPoint4x3(p_orig, viewmatrix); 153 | 154 | if (p_view.z <= 0.2f)// || ((p_proj.x < -1.3 || p_proj.x > 1.3 || p_proj.y < -1.3 || p_proj.y > 1.3))) 155 | { 156 | if (prefiltered) 157 | { 158 | printf("Point is filtered although prefiltered is set. This shouldn't happen!"); 159 | __trap(); 160 | } 161 | return false; 162 | } 163 | return true; 164 | } 165 | 166 | #define CHECK_CUDA(A, debug) \ 167 | A; if(debug) { \ 168 | auto ret = cudaDeviceSynchronize(); \ 169 | if (ret != cudaSuccess) { \ 170 | std::cerr << "\n[CUDA ERROR] in " << __FILE__ << "\nLine " << __LINE__ << ": " << cudaGetErrorString(ret); \ 171 | throw std::runtime_error(cudaGetErrorString(ret)); \ 172 | } \ 173 | } 174 | 175 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/backward.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include "backward.h" 13 | #include "auxiliary.h" 14 | #include "math.h" 15 | #include 16 | #include 17 | namespace cg = cooperative_groups; 18 | 19 | // Backward pass for conversion of spherical harmonics to RGB for 20 | // each Gaussian. 21 | __device__ void computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, const bool* clamped, const glm::vec3* dL_dcolor, glm::vec3* dL_dmeans, glm::vec3* dL_dshs, float *dL_dtau) 22 | { 23 | // Compute intermediate values, as it is done during forward 24 | glm::vec3 pos = means[idx]; 25 | glm::vec3 dir_orig = pos - campos; 26 | glm::vec3 dir = dir_orig / glm::length(dir_orig); 27 | 28 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; 29 | 30 | // Use PyTorch rule for clamping: if clamping was applied, 31 | // gradient becomes 0. 32 | glm::vec3 dL_dRGB = dL_dcolor[idx]; 33 | dL_dRGB.x *= clamped[3 * idx + 0] ? 0 : 1; 34 | dL_dRGB.y *= clamped[3 * idx + 1] ? 0 : 1; 35 | dL_dRGB.z *= clamped[3 * idx + 2] ? 0 : 1; 36 | 37 | glm::vec3 dRGBdx(0, 0, 0); 38 | glm::vec3 dRGBdy(0, 0, 0); 39 | glm::vec3 dRGBdz(0, 0, 0); 40 | float x = dir.x; 41 | float y = dir.y; 42 | float z = dir.z; 43 | 44 | // Target location for this Gaussian to write SH gradients to 45 | glm::vec3* dL_dsh = dL_dshs + idx * max_coeffs; 46 | 47 | // No tricks here, just high school-level calculus. 48 | float dRGBdsh0 = SH_C0; 49 | dL_dsh[0] = dRGBdsh0 * dL_dRGB; 50 | if (deg > 0) 51 | { 52 | float dRGBdsh1 = -SH_C1 * y; 53 | float dRGBdsh2 = SH_C1 * z; 54 | float dRGBdsh3 = -SH_C1 * x; 55 | dL_dsh[1] = dRGBdsh1 * dL_dRGB; 56 | dL_dsh[2] = dRGBdsh2 * dL_dRGB; 57 | dL_dsh[3] = dRGBdsh3 * dL_dRGB; 58 | 59 | dRGBdx = -SH_C1 * sh[3]; 60 | dRGBdy = -SH_C1 * sh[1]; 61 | dRGBdz = SH_C1 * sh[2]; 62 | 63 | if (deg > 1) 64 | { 65 | float xx = x * x, yy = y * y, zz = z * z; 66 | float xy = x * y, yz = y * z, xz = x * z; 67 | 68 | float dRGBdsh4 = SH_C2[0] * xy; 69 | float dRGBdsh5 = SH_C2[1] * yz; 70 | float dRGBdsh6 = SH_C2[2] * (2.f * zz - xx - yy); 71 | float dRGBdsh7 = SH_C2[3] * xz; 72 | float dRGBdsh8 = SH_C2[4] * (xx - yy); 73 | dL_dsh[4] = dRGBdsh4 * dL_dRGB; 74 | dL_dsh[5] = dRGBdsh5 * dL_dRGB; 75 | dL_dsh[6] = dRGBdsh6 * dL_dRGB; 76 | dL_dsh[7] = dRGBdsh7 * dL_dRGB; 77 | dL_dsh[8] = dRGBdsh8 * dL_dRGB; 78 | 79 | dRGBdx += SH_C2[0] * y * sh[4] + SH_C2[2] * 2.f * -x * sh[6] + SH_C2[3] * z * sh[7] + SH_C2[4] * 2.f * x * sh[8]; 80 | dRGBdy += SH_C2[0] * x * sh[4] + SH_C2[1] * z * sh[5] + SH_C2[2] * 2.f * -y * sh[6] + SH_C2[4] * 2.f * -y * sh[8]; 81 | dRGBdz += SH_C2[1] * y * sh[5] + SH_C2[2] * 2.f * 2.f * z * sh[6] + SH_C2[3] * x * sh[7]; 82 | 83 | if (deg > 2) 84 | { 85 | float dRGBdsh9 = SH_C3[0] * y * (3.f * xx - yy); 86 | float dRGBdsh10 = SH_C3[1] * xy * z; 87 | float dRGBdsh11 = SH_C3[2] * y * (4.f * zz - xx - yy); 88 | float dRGBdsh12 = SH_C3[3] * z * (2.f * zz - 3.f * xx - 3.f * yy); 89 | float dRGBdsh13 = SH_C3[4] * x * (4.f * zz - xx - yy); 90 | float dRGBdsh14 = SH_C3[5] * z * (xx - yy); 91 | float dRGBdsh15 = SH_C3[6] * x * (xx - 3.f * yy); 92 | dL_dsh[9] = dRGBdsh9 * dL_dRGB; 93 | dL_dsh[10] = dRGBdsh10 * dL_dRGB; 94 | dL_dsh[11] = dRGBdsh11 * dL_dRGB; 95 | dL_dsh[12] = dRGBdsh12 * dL_dRGB; 96 | dL_dsh[13] = dRGBdsh13 * dL_dRGB; 97 | dL_dsh[14] = dRGBdsh14 * dL_dRGB; 98 | dL_dsh[15] = dRGBdsh15 * dL_dRGB; 99 | 100 | dRGBdx += ( 101 | SH_C3[0] * sh[9] * 3.f * 2.f * xy + 102 | SH_C3[1] * sh[10] * yz + 103 | SH_C3[2] * sh[11] * -2.f * xy + 104 | 
SH_C3[3] * sh[12] * -3.f * 2.f * xz + 105 | SH_C3[4] * sh[13] * (-3.f * xx + 4.f * zz - yy) + 106 | SH_C3[5] * sh[14] * 2.f * xz + 107 | SH_C3[6] * sh[15] * 3.f * (xx - yy)); 108 | 109 | dRGBdy += ( 110 | SH_C3[0] * sh[9] * 3.f * (xx - yy) + 111 | SH_C3[1] * sh[10] * xz + 112 | SH_C3[2] * sh[11] * (-3.f * yy + 4.f * zz - xx) + 113 | SH_C3[3] * sh[12] * -3.f * 2.f * yz + 114 | SH_C3[4] * sh[13] * -2.f * xy + 115 | SH_C3[5] * sh[14] * -2.f * yz + 116 | SH_C3[6] * sh[15] * -3.f * 2.f * xy); 117 | 118 | dRGBdz += ( 119 | SH_C3[1] * sh[10] * xy + 120 | SH_C3[2] * sh[11] * 4.f * 2.f * yz + 121 | SH_C3[3] * sh[12] * 3.f * (2.f * zz - xx - yy) + 122 | SH_C3[4] * sh[13] * 4.f * 2.f * xz + 123 | SH_C3[5] * sh[14] * (xx - yy)); 124 | } 125 | } 126 | } 127 | 128 | // The view direction is an input to the computation. View direction 129 | // is influenced by the Gaussian's mean, so SHs gradients 130 | // must propagate back into 3D position. 131 | glm::vec3 dL_ddir(glm::dot(dRGBdx, dL_dRGB), glm::dot(dRGBdy, dL_dRGB), glm::dot(dRGBdz, dL_dRGB)); 132 | 133 | // Account for normalization of direction 134 | float3 dL_dmean = dnormvdv(float3{ dir_orig.x, dir_orig.y, dir_orig.z }, float3{ dL_ddir.x, dL_ddir.y, dL_ddir.z }); 135 | 136 | // Gradients of loss w.r.t. Gaussian means, but only the portion 137 | // that is caused because the mean affects the view-dependent color. 138 | // Additional mean gradient is accumulated in below methods. 139 | dL_dmeans[idx] += glm::vec3(dL_dmean.x, dL_dmean.y, dL_dmean.z); 140 | 141 | dL_dtau[6 * idx + 0] += -dL_dmean.x; 142 | dL_dtau[6 * idx + 1] += -dL_dmean.y; 143 | dL_dtau[6 * idx + 2] += -dL_dmean.z; 144 | 145 | } 146 | 147 | // Backward version of INVERSE 2D covariance matrix computation 148 | // (due to length launched as separate kernel before other 149 | // backward steps contained in preprocess) 150 | __global__ void computeCov2DCUDA(int P, 151 | const float3* means, 152 | const int* radii, 153 | const float* cov3Ds, 154 | const float h_x, float h_y, 155 | const float tan_fovx, float tan_fovy, 156 | const float* view_matrix, 157 | const float* dL_dconics, 158 | float3* dL_dmeans, 159 | float* dL_dcov, 160 | float *dL_dtau) 161 | { 162 | auto idx = cg::this_grid().thread_rank(); 163 | if (idx >= P || !(radii[idx] > 0)) 164 | return; 165 | 166 | // Reading location of 3D covariance for this Gaussian 167 | const float* cov3D = cov3Ds + 6 * idx; 168 | 169 | // Fetch gradients, recompute 2D covariance and relevant 170 | // intermediate forward results needed in the backward. 171 | float3 mean = means[idx]; 172 | float3 dL_dconic = { dL_dconics[4 * idx], dL_dconics[4 * idx + 1], dL_dconics[4 * idx + 3] }; 173 | float3 t = transformPoint4x3(mean, view_matrix); 174 | 175 | const float limx = 1.3f * tan_fovx; 176 | const float limy = 1.3f * tan_fovy; 177 | const float txtz = t.x / t.z; 178 | const float tytz = t.y / t.z; 179 | t.x = min(limx, max(-limx, txtz)) * t.z; 180 | t.y = min(limy, max(-limy, tytz)) * t.z; 181 | 182 | const float x_grad_mul = txtz < -limx || txtz > limx ? 0 : 1; 183 | const float y_grad_mul = tytz < -limy || tytz > limy ? 
0 : 1; 184 | 185 | glm::mat3 J = glm::mat3(h_x / t.z, 0.0f, -(h_x * t.x) / (t.z * t.z), 186 | 0.0f, h_y / t.z, -(h_y * t.y) / (t.z * t.z), 187 | 0, 0, 0); 188 | 189 | glm::mat3 W = glm::mat3( 190 | view_matrix[0], view_matrix[4], view_matrix[8], 191 | view_matrix[1], view_matrix[5], view_matrix[9], 192 | view_matrix[2], view_matrix[6], view_matrix[10]); 193 | 194 | glm::mat3 Vrk = glm::mat3( 195 | cov3D[0], cov3D[1], cov3D[2], 196 | cov3D[1], cov3D[3], cov3D[4], 197 | cov3D[2], cov3D[4], cov3D[5]); 198 | 199 | glm::mat3 T = W * J; 200 | 201 | glm::mat3 cov2D = glm::transpose(T) * glm::transpose(Vrk) * T; 202 | 203 | // Use helper variables for 2D covariance entries. More compact. 204 | float a = cov2D[0][0] += 0.3f; 205 | float b = cov2D[0][1]; 206 | float c = cov2D[1][1] += 0.3f; 207 | 208 | float denom = a * c - b * b; 209 | float dL_da = 0, dL_db = 0, dL_dc = 0; 210 | float denom2inv = 1.0f / ((denom * denom) + 0.0000001f); 211 | 212 | if (denom2inv != 0) 213 | { 214 | // Gradients of loss w.r.t. entries of 2D covariance matrix, 215 | // given gradients of loss w.r.t. conic matrix (inverse covariance matrix). 216 | // e.g., dL / da = dL / d_conic_a * d_conic_a / d_a 217 | dL_da = denom2inv * (-c * c * dL_dconic.x + 2 * b * c * dL_dconic.y + (denom - a * c) * dL_dconic.z); 218 | dL_dc = denom2inv * (-a * a * dL_dconic.z + 2 * a * b * dL_dconic.y + (denom - a * c) * dL_dconic.x); 219 | dL_db = denom2inv * 2 * (b * c * dL_dconic.x - (denom + 2 * b * b) * dL_dconic.y + a * b * dL_dconic.z); 220 | 221 | // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, 222 | // given gradients w.r.t. 2D covariance matrix (diagonal). 223 | // cov2D = transpose(T) * transpose(Vrk) * T; 224 | dL_dcov[6 * idx + 0] = (T[0][0] * T[0][0] * dL_da + T[0][0] * T[1][0] * dL_db + T[1][0] * T[1][0] * dL_dc); 225 | dL_dcov[6 * idx + 3] = (T[0][1] * T[0][1] * dL_da + T[0][1] * T[1][1] * dL_db + T[1][1] * T[1][1] * dL_dc); 226 | dL_dcov[6 * idx + 5] = (T[0][2] * T[0][2] * dL_da + T[0][2] * T[1][2] * dL_db + T[1][2] * T[1][2] * dL_dc); 227 | 228 | // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, 229 | // given gradients w.r.t. 2D covariance matrix (off-diagonal). 230 | // Off-diagonal elements appear twice --> double the gradient. 231 | // cov2D = transpose(T) * transpose(Vrk) * T; 232 | dL_dcov[6 * idx + 1] = 2 * T[0][0] * T[0][1] * dL_da + (T[0][0] * T[1][1] + T[0][1] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][1] * dL_dc; 233 | dL_dcov[6 * idx + 2] = 2 * T[0][0] * T[0][2] * dL_da + (T[0][0] * T[1][2] + T[0][2] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][2] * dL_dc; 234 | dL_dcov[6 * idx + 4] = 2 * T[0][2] * T[0][1] * dL_da + (T[0][1] * T[1][2] + T[0][2] * T[1][1]) * dL_db + 2 * T[1][1] * T[1][2] * dL_dc; 235 | } 236 | else 237 | { 238 | for (int i = 0; i < 6; i++) 239 | dL_dcov[6 * idx + i] = 0; 240 | } 241 | 242 | // Gradients of loss w.r.t. 
upper 2x3 portion of intermediate matrix T 243 | // cov2D = transpose(T) * transpose(Vrk) * T; 244 | float dL_dT00 = 2 * (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_da + 245 | (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_db; 246 | float dL_dT01 = 2 * (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_da + 247 | (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_db; 248 | float dL_dT02 = 2 * (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_da + 249 | (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_db; 250 | float dL_dT10 = 2 * (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_dc + 251 | (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_db; 252 | float dL_dT11 = 2 * (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_dc + 253 | (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_db; 254 | float dL_dT12 = 2 * (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_dc + 255 | (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_db; 256 | 257 | // Gradients of loss w.r.t. upper 3x2 non-zero entries of Jacobian matrix 258 | // T = W * J 259 | float dL_dJ00 = W[0][0] * dL_dT00 + W[0][1] * dL_dT01 + W[0][2] * dL_dT02; 260 | float dL_dJ02 = W[2][0] * dL_dT00 + W[2][1] * dL_dT01 + W[2][2] * dL_dT02; 261 | float dL_dJ11 = W[1][0] * dL_dT10 + W[1][1] * dL_dT11 + W[1][2] * dL_dT12; 262 | float dL_dJ12 = W[2][0] * dL_dT10 + W[2][1] * dL_dT11 + W[2][2] * dL_dT12; 263 | 264 | float tz = 1.f / t.z; 265 | float tz2 = tz * tz; 266 | float tz3 = tz2 * tz; 267 | 268 | // Gradients of loss w.r.t. transformed Gaussian mean t 269 | float dL_dtx = x_grad_mul * -h_x * tz2 * dL_dJ02; 270 | float dL_dty = y_grad_mul * -h_y * tz2 * dL_dJ12; 271 | float dL_dtz = -h_x * tz2 * dL_dJ00 - h_y * tz2 * dL_dJ11 + (2 * h_x * t.x) * tz3 * dL_dJ02 + (2 * h_y * t.y) * tz3 * dL_dJ12; 272 | 273 | SE3 T_CW(view_matrix); 274 | mat33 R = T_CW.R().data(); 275 | mat33 RT = R.transpose(); 276 | float3 t_ = T_CW.t(); 277 | mat33 dpC_drho = mat33::identity(); 278 | mat33 dpC_dtheta = -mat33::skew_symmetric(t); 279 | float dL_dt[6]; 280 | for (int i = 0; i < 3; i++) { 281 | float3 c_rho = dpC_drho.cols[i]; 282 | float3 c_theta = dpC_dtheta.cols[i]; 283 | dL_dt[i] = dL_dtx * c_rho.x + dL_dty * c_rho.y + dL_dtz * c_rho.z; 284 | dL_dt[i + 3] = dL_dtx * c_theta.x + dL_dty * c_theta.y + dL_dtz * c_theta.z; 285 | } 286 | for (int i = 0; i < 6; i++) { 287 | dL_dtau[6 * idx + i] += dL_dt[i]; 288 | } 289 | 290 | // Account for transformation of mean to t 291 | // t = transformPoint4x3(mean, view_matrix); 292 | float3 dL_dmean = transformVec4x3Transpose({ dL_dtx, dL_dty, dL_dtz }, view_matrix); 293 | 294 | // Gradients of loss w.r.t. Gaussian means, but only the portion 295 | // that is caused because the mean affects the covariance matrix. 296 | // Additional mean gradient is accumulated in BACKWARD::preprocess. 
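// Since t = R * mean + t_cw, where R is the rotation block of view_matrix,
// d t / d mean = R; the world-space gradient computed above via
// transformVec4x3Transpose is therefore R^T * (dL_dtx, dL_dty, dL_dtz).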
297 | dL_dmeans[idx] = dL_dmean; 298 | 299 | float dL_dW00 = J[0][0] * dL_dT00; 300 | float dL_dW01 = J[0][0] * dL_dT01; 301 | float dL_dW02 = J[0][0] * dL_dT02; 302 | float dL_dW10 = J[1][1] * dL_dT10; 303 | float dL_dW11 = J[1][1] * dL_dT11; 304 | float dL_dW12 = J[1][1] * dL_dT12; 305 | float dL_dW20 = J[0][2] * dL_dT00 + J[1][2] * dL_dT10; 306 | float dL_dW21 = J[0][2] * dL_dT01 + J[1][2] * dL_dT11; 307 | float dL_dW22 = J[0][2] * dL_dT02 + J[1][2] * dL_dT12; 308 | 309 | float3 c1 = R.cols[0]; 310 | float3 c2 = R.cols[1]; 311 | float3 c3 = R.cols[2]; 312 | 313 | float dL_dW_data[9]; 314 | dL_dW_data[0] = dL_dW00; 315 | dL_dW_data[3] = dL_dW01; 316 | dL_dW_data[6] = dL_dW02; 317 | dL_dW_data[1] = dL_dW10; 318 | dL_dW_data[4] = dL_dW11; 319 | dL_dW_data[7] = dL_dW12; 320 | dL_dW_data[2] = dL_dW20; 321 | dL_dW_data[5] = dL_dW21; 322 | dL_dW_data[8] = dL_dW22; 323 | 324 | mat33 dL_dW(dL_dW_data); 325 | float3 dL_dWc1 = dL_dW.cols[0]; 326 | float3 dL_dWc2 = dL_dW.cols[1]; 327 | float3 dL_dWc3 = dL_dW.cols[2]; 328 | 329 | mat33 n_W1_x = -mat33::skew_symmetric(c1); 330 | mat33 n_W2_x = -mat33::skew_symmetric(c2); 331 | mat33 n_W3_x = -mat33::skew_symmetric(c3); 332 | 333 | float3 dL_dtheta = {}; 334 | dL_dtheta.x = dot(dL_dWc1, n_W1_x.cols[0]) + dot(dL_dWc2, n_W2_x.cols[0]) + 335 | dot(dL_dWc3, n_W3_x.cols[0]); 336 | dL_dtheta.y = dot(dL_dWc1, n_W1_x.cols[1]) + dot(dL_dWc2, n_W2_x.cols[1]) + 337 | dot(dL_dWc3, n_W3_x.cols[1]); 338 | dL_dtheta.z = dot(dL_dWc1, n_W1_x.cols[2]) + dot(dL_dWc2, n_W2_x.cols[2]) + 339 | dot(dL_dWc3, n_W3_x.cols[2]); 340 | 341 | dL_dtau[6 * idx + 3] += dL_dtheta.x; 342 | dL_dtau[6 * idx + 4] += dL_dtheta.y; 343 | dL_dtau[6 * idx + 5] += dL_dtheta.z; 344 | 345 | 346 | } 347 | 348 | // Backward pass for the conversion of scale and rotation to a 349 | // 3D covariance matrix for each Gaussian. 350 | __device__ void computeCov3D(int idx, const glm::vec3 scale, float mod, const glm::vec4 rot, const float* dL_dcov3Ds, glm::vec3* dL_dscales, glm::vec4* dL_drots) 351 | { 352 | // Recompute (intermediate) results for the 3D covariance computation. 353 | glm::vec4 q = rot;// / glm::length(rot); 354 | float r = q.x; 355 | float x = q.y; 356 | float y = q.z; 357 | float z = q.w; 358 | 359 | glm::mat3 R = glm::mat3( 360 | 1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y), 361 | 2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x), 362 | 2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y) 363 | ); 364 | 365 | glm::mat3 S = glm::mat3(1.0f); 366 | 367 | glm::vec3 s = mod * scale; 368 | S[0][0] = s.x; 369 | S[1][1] = s.y; 370 | S[2][2] = s.z; 371 | 372 | glm::mat3 M = S * R; 373 | 374 | const float* dL_dcov3D = dL_dcov3Ds + 6 * idx; 375 | 376 | glm::vec3 dunc(dL_dcov3D[0], dL_dcov3D[3], dL_dcov3D[5]); 377 | glm::vec3 ounc = 0.5f * glm::vec3(dL_dcov3D[1], dL_dcov3D[2], dL_dcov3D[4]); 378 | 379 | // Convert per-element covariance loss gradients to matrix form 380 | glm::mat3 dL_dSigma = glm::mat3( 381 | dL_dcov3D[0], 0.5f * dL_dcov3D[1], 0.5f * dL_dcov3D[2], 382 | 0.5f * dL_dcov3D[1], dL_dcov3D[3], 0.5f * dL_dcov3D[4], 383 | 0.5f * dL_dcov3D[2], 0.5f * dL_dcov3D[4], dL_dcov3D[5] 384 | ); 385 | 386 | // Compute loss gradient w.r.t. matrix M 387 | // dSigma_dM = 2 * M 388 | glm::mat3 dL_dM = 2.0f * M * dL_dSigma; 389 | 390 | glm::mat3 Rt = glm::transpose(R); 391 | glm::mat3 dL_dMt = glm::transpose(dL_dM); 392 | 393 | // Gradients of loss w.r.t. 
scale 394 | glm::vec3* dL_dscale = dL_dscales + idx; 395 | dL_dscale->x = glm::dot(Rt[0], dL_dMt[0]); 396 | dL_dscale->y = glm::dot(Rt[1], dL_dMt[1]); 397 | dL_dscale->z = glm::dot(Rt[2], dL_dMt[2]); 398 | 399 | dL_dMt[0] *= s.x; 400 | dL_dMt[1] *= s.y; 401 | dL_dMt[2] *= s.z; 402 | 403 | // Gradients of loss w.r.t. normalized quaternion 404 | glm::vec4 dL_dq; 405 | dL_dq.x = 2 * z * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * y * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * x * (dL_dMt[1][2] - dL_dMt[2][1]); 406 | dL_dq.y = 2 * y * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * z * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * r * (dL_dMt[1][2] - dL_dMt[2][1]) - 4 * x * (dL_dMt[2][2] + dL_dMt[1][1]); 407 | dL_dq.z = 2 * x * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * r * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * z * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * y * (dL_dMt[2][2] + dL_dMt[0][0]); 408 | dL_dq.w = 2 * r * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * x * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * y * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * z * (dL_dMt[1][1] + dL_dMt[0][0]); 409 | 410 | // Gradients of loss w.r.t. unnormalized quaternion 411 | float4* dL_drot = (float4*)(dL_drots + idx); 412 | *dL_drot = float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w };//dnormvdv(float4{ rot.x, rot.y, rot.z, rot.w }, float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w }); 413 | } 414 | 415 | // Backward pass of the preprocessing steps, except 416 | // for the covariance computation and inversion 417 | // (those are handled by a previous kernel call) 418 | template 419 | __global__ void preprocessCUDA( 420 | int P, int D, int M, 421 | const float3* means, 422 | const int* radii, 423 | const float* shs, 424 | const bool* clamped, 425 | const glm::vec3* scales, 426 | const glm::vec4* rotations, 427 | const float scale_modifier, 428 | const float *viewmatrix, 429 | const float* proj, 430 | const float *proj_raw, 431 | const glm::vec3* campos, 432 | const float3* dL_dmean2D, 433 | glm::vec3* dL_dmeans, 434 | float* dL_dcolor, 435 | float *dL_ddepth, 436 | float* dL_dcov3D, 437 | float* dL_dsh, 438 | glm::vec3* dL_dscale, 439 | glm::vec4* dL_drot, 440 | float *dL_dtau) 441 | { 442 | auto idx = cg::this_grid().thread_rank(); 443 | if (idx >= P || !(radii[idx] > 0)) 444 | return; 445 | 446 | float3 m = means[idx]; 447 | 448 | // Taking care of gradients from the screenspace points 449 | float4 m_hom = transformPoint4x4(m, proj); 450 | float m_w = 1.0f / (m_hom.w + 0.0000001f); 451 | 452 | // Compute loss gradient w.r.t. 3D means due to gradients of 2D means 453 | // from rendering procedure 454 | glm::vec3 dL_dmean; 455 | float mul1 = (proj[0] * m.x + proj[4] * m.y + proj[8] * m.z + proj[12]) * m_w * m_w; 456 | float mul2 = (proj[1] * m.x + proj[5] * m.y + proj[9] * m.z + proj[13]) * m_w * m_w; 457 | dL_dmean.x = (proj[0] * m_w - proj[3] * mul1) * dL_dmean2D[idx].x + (proj[1] * m_w - proj[3] * mul2) * dL_dmean2D[idx].y; 458 | dL_dmean.y = (proj[4] * m_w - proj[7] * mul1) * dL_dmean2D[idx].x + (proj[5] * m_w - proj[7] * mul2) * dL_dmean2D[idx].y; 459 | dL_dmean.z = (proj[8] * m_w - proj[11] * mul1) * dL_dmean2D[idx].x + (proj[9] * m_w - proj[11] * mul2) * dL_dmean2D[idx].y; 460 | 461 | // That's the second part of the mean gradient. Previous computation 462 | // of cov2D and following SH conversion also affects it. 
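// Derivation for the block above: with m_hom = proj * m and m_w = 1 / m_hom.w,
// the projected coordinate is x = m_hom.x * m_w, so by the quotient rule
//   dx/dm_j = proj_xj * m_w - m_hom.x * m_w^2 * proj_wj,
// which is the (proj[..] * m_w - proj[3|7|11] * mul1) pattern, with
// mul1 = m_hom.x * m_w * m_w and mul2 = m_hom.y * m_w * m_w.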
463 | dL_dmeans[idx] += dL_dmean; 464 | 465 | float alpha = 1.0f * m_w; 466 | float beta = -m_hom.x * m_w * m_w; 467 | float gamma = -m_hom.y * m_w * m_w; 468 | 469 | float a = proj_raw[0]; 470 | float b = proj_raw[5]; 471 | float c = proj_raw[10]; 472 | float d = proj_raw[14]; 473 | float e = proj_raw[11]; 474 | 475 | SE3 T_CW(viewmatrix); 476 | mat33 R = T_CW.R().data(); 477 | mat33 RT = R.transpose(); 478 | float3 t = T_CW.t(); 479 | float3 p_C = T_CW * m; 480 | mat33 dp_C_d_rho = mat33::identity(); 481 | mat33 dp_C_d_theta = -mat33::skew_symmetric(p_C); 482 | 483 | float3 d_proj_dp_C1 = make_float3(alpha * a, 0.f, beta * e); 484 | float3 d_proj_dp_C2 = make_float3(0.f, alpha * b, gamma * e); 485 | 486 | float3 d_proj_dp_C1_d_rho = dp_C_d_rho.transpose() * d_proj_dp_C1; // x.T A = A.T x 487 | float3 d_proj_dp_C2_d_rho = dp_C_d_rho.transpose() * d_proj_dp_C2; 488 | float3 d_proj_dp_C1_d_theta = dp_C_d_theta.transpose() * d_proj_dp_C1; 489 | float3 d_proj_dp_C2_d_theta = dp_C_d_theta.transpose() * d_proj_dp_C2; 490 | 491 | float2 dmean2D_dtau[6]; 492 | dmean2D_dtau[0].x = d_proj_dp_C1_d_rho.x; 493 | dmean2D_dtau[1].x = d_proj_dp_C1_d_rho.y; 494 | dmean2D_dtau[2].x = d_proj_dp_C1_d_rho.z; 495 | dmean2D_dtau[3].x = d_proj_dp_C1_d_theta.x; 496 | dmean2D_dtau[4].x = d_proj_dp_C1_d_theta.y; 497 | dmean2D_dtau[5].x = d_proj_dp_C1_d_theta.z; 498 | 499 | dmean2D_dtau[0].y = d_proj_dp_C2_d_rho.x; 500 | dmean2D_dtau[1].y = d_proj_dp_C2_d_rho.y; 501 | dmean2D_dtau[2].y = d_proj_dp_C2_d_rho.z; 502 | dmean2D_dtau[3].y = d_proj_dp_C2_d_theta.x; 503 | dmean2D_dtau[4].y = d_proj_dp_C2_d_theta.y; 504 | dmean2D_dtau[5].y = d_proj_dp_C2_d_theta.z; 505 | 506 | float dL_dt[6]; 507 | for (int i = 0; i < 6; i++) { 508 | dL_dt[i] = dL_dmean2D[idx].x * dmean2D_dtau[i].x + dL_dmean2D[idx].y * dmean2D_dtau[i].y; 509 | } 510 | for (int i = 0; i < 6; i++) { 511 | dL_dtau[6 * idx + i] += dL_dt[i]; 512 | } 513 | 514 | // Compute gradient update due to computing depths 515 | // p_orig = m 516 | // p_view = transformPoint4x3(p_orig, viewmatrix); 517 | // depth = p_view.z; 518 | float dL_dpCz = dL_ddepth[idx]; 519 | dL_dmeans[idx].x += dL_dpCz * viewmatrix[2]; 520 | dL_dmeans[idx].y += dL_dpCz * viewmatrix[6]; 521 | dL_dmeans[idx].z += dL_dpCz * viewmatrix[10]; 522 | 523 | for (int i = 0; i < 3; i++) { 524 | float3 c_rho = dp_C_d_rho.cols[i]; 525 | float3 c_theta = dp_C_d_theta.cols[i]; 526 | dL_dtau[6 * idx + i] += dL_dpCz * c_rho.z; 527 | dL_dtau[6 * idx + i + 3] += dL_dpCz * c_theta.z; 528 | } 529 | 530 | 531 | 532 | // Compute gradient updates due to computing colors from SHs 533 | if (shs) 534 | computeColorFromSH(idx, D, M, (glm::vec3*)means, *campos, shs, clamped, (glm::vec3*)dL_dcolor, (glm::vec3*)dL_dmeans, (glm::vec3*)dL_dsh, dL_dtau); 535 | 536 | // Compute gradient updates due to computing covariance from scale/rotation 537 | if (scales) 538 | computeCov3D(idx, scales[idx], scale_modifier, rotations[idx], dL_dcov3D, dL_dscale, dL_drot); 539 | } 540 | 541 | template 542 | __device__ void inline reduce_helper(int lane, int i, T *data) { 543 | if (lane < i) { 544 | data[lane] += data[lane + i]; 545 | } 546 | } 547 | 548 | template 549 | __device__ void render_cuda_reduce_sum(group_t g, Lists... lists) { 550 | int lane = g.thread_rank(); 551 | g.sync(); 552 | 553 | for (int i = g.size() / 2; i > 0; i /= 2) { 554 | (..., 555 | reduce_helper( 556 | lane, i, lists)); // Fold expression: apply reduce_helper for each list 557 | g.sync(); 558 | } 559 | } 560 | 561 | 562 | // Backward version of the rendering procedure. 
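// Walking back-to-front, the loop below recovers the transmittance in front of
// each Gaussian via T <- T / (1 - alpha) and maintains accum_rec, the blended
// contribution of everything behind it, so that per channel
//   dL/dalpha += (c - accum_rec) * T * dL/dpixel,
// plus analogous depth and background terms. Per-pixel gradients are then
// reduced across the thread block (render_cuda_reduce_sum), so only thread 0
// issues atomicAdds into the per-Gaussian global buffers.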
563 | template 564 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y) 565 | renderCUDA( 566 | const uint2* __restrict__ ranges, 567 | const uint32_t* __restrict__ point_list, 568 | int W, int H, 569 | const float* __restrict__ bg_color, 570 | const float2* __restrict__ points_xy_image, 571 | const float4* __restrict__ conic_opacity, 572 | const float* __restrict__ colors, 573 | const float* __restrict__ depths, 574 | const float* __restrict__ final_Ts, 575 | const uint32_t* __restrict__ n_contrib, 576 | const float* __restrict__ dL_dpixels, 577 | const float* __restrict__ dL_dpixels_depth, 578 | float3* __restrict__ dL_dmean2D, 579 | float4* __restrict__ dL_dconic2D, 580 | float* __restrict__ dL_dopacity, 581 | float* __restrict__ dL_dcolors, 582 | float* __restrict__ dL_ddepths) 583 | { 584 | // We rasterize again. Compute necessary block info. 585 | auto block = cg::this_thread_block(); 586 | auto tid = block.thread_rank(); 587 | 588 | const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; 589 | const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; 590 | const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) }; 591 | const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; 592 | const uint32_t pix_id = W * pix.y + pix.x; 593 | const float2 pixf = { (float)pix.x, (float)pix.y }; 594 | 595 | const bool inside = pix.x < W&& pix.y < H; 596 | const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; 597 | 598 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); 599 | 600 | bool done = !inside; 601 | int toDo = range.y - range.x; 602 | 603 | __shared__ int collected_id[BLOCK_SIZE]; 604 | __shared__ float2 collected_xy[BLOCK_SIZE]; 605 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; 606 | __shared__ float collected_colors[C * BLOCK_SIZE]; 607 | __shared__ float collected_depths[BLOCK_SIZE]; 608 | 609 | __shared__ float2 dL_dmean2D_shared[BLOCK_SIZE]; 610 | __shared__ float3 dL_dcolors_shared[BLOCK_SIZE]; 611 | __shared__ float dL_ddepths_shared[BLOCK_SIZE]; 612 | __shared__ float dL_dopacity_shared[BLOCK_SIZE]; 613 | __shared__ float4 dL_dconic2D_shared[BLOCK_SIZE]; 614 | 615 | // In the forward, we stored the final value for T, the 616 | // product of all (1 - alpha) factors. 617 | const float T_final = inside ? final_Ts[pix_id] : 0; 618 | float T = T_final; 619 | 620 | // We start from the back. The ID of the last contributing 621 | // Gaussian is known from each pixel from the forward. 622 | uint32_t contributor = toDo; 623 | const int last_contributor = inside ? n_contrib[pix_id] : 0; 624 | 625 | float accum_rec[C] = { 0 }; 626 | float dL_dpixel[C] = { 0 }; 627 | float accum_rec_depth = 0; 628 | float dL_dpixel_depth = 0; 629 | if (inside) { 630 | #pragma unroll 631 | for (int i = 0; i < C; i++) { 632 | dL_dpixel[i] = dL_dpixels[i * H * W + pix_id]; 633 | } 634 | dL_dpixel_depth = dL_dpixels_depth[pix_id]; 635 | } 636 | 637 | float last_alpha = 0.f; 638 | float last_color[C] = { 0.f }; 639 | float last_depth = 0.f; 640 | 641 | // Gradient of pixel coordinate w.r.t. 
normalized 642 | // screen-space viewport corrdinates (-1 to 1) 643 | const float ddelx_dx = 0.5f * W; 644 | const float ddely_dy = 0.5f * H; 645 | __shared__ int skip_counter; 646 | 647 | // Traverse all Gaussians 648 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) 649 | { 650 | // Load auxiliary data into shared memory, start in the BACK 651 | // and load them in revers order. 652 | // block.sync(); 653 | const int progress = i * BLOCK_SIZE + tid; 654 | if (range.x + progress < range.y) 655 | { 656 | const int coll_id = point_list[range.y - progress - 1]; 657 | collected_id[tid] = coll_id; 658 | collected_xy[tid] = points_xy_image[coll_id]; 659 | collected_conic_opacity[tid] = conic_opacity[coll_id]; 660 | #pragma unroll 661 | for (int i = 0; i < C; i++) { 662 | collected_colors[i * BLOCK_SIZE + tid] = colors[coll_id * C + i]; 663 | 664 | } 665 | collected_depths[tid] = depths[coll_id]; 666 | } 667 | for (int j = 0; j < min(BLOCK_SIZE, toDo); j++) { 668 | block.sync(); 669 | if (tid == 0) { 670 | skip_counter = 0; 671 | } 672 | block.sync(); 673 | 674 | // Keep track of current Gaussian ID. Skip, if this one 675 | // is behind the last contributor for this pixel. 676 | bool skip = done; 677 | contributor = done ? contributor : contributor - 1; 678 | skip |= contributor >= last_contributor; 679 | 680 | // Compute blending values, as before. 681 | const float2 xy = collected_xy[j]; 682 | const float2 d = { xy.x - pixf.x, xy.y - pixf.y }; 683 | const float4 con_o = collected_conic_opacity[j]; 684 | const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 685 | skip |= power > 0.0f; 686 | 687 | const float G = exp(power); 688 | const float alpha = min(0.99f, con_o.w * G); 689 | skip |= alpha < 1.0f / 255.0f; 690 | 691 | if (skip) { 692 | atomicAdd(&skip_counter, 1); 693 | } 694 | block.sync(); 695 | if (skip_counter == BLOCK_SIZE) { 696 | continue; 697 | } 698 | 699 | 700 | T = skip ? T : T / (1.f - alpha); 701 | const float dchannel_dcolor = alpha * T; 702 | 703 | // Propagate gradients to per-Gaussian colors and keep 704 | // gradients w.r.t. alpha (blending factor for a Gaussian/pixel 705 | // pair). 706 | float dL_dalpha = 0.0f; 707 | const int global_id = collected_id[j]; 708 | float local_dL_dcolors[3]; 709 | #pragma unroll 710 | for (int ch = 0; ch < C; ch++) 711 | { 712 | const float c = collected_colors[ch * BLOCK_SIZE + j]; 713 | // Update last color (to be used in the next iteration) 714 | accum_rec[ch] = skip ? accum_rec[ch] : last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch]; 715 | last_color[ch] = skip ? last_color[ch] : c; 716 | 717 | const float dL_dchannel = dL_dpixel[ch]; 718 | dL_dalpha += (c - accum_rec[ch]) * dL_dchannel; 719 | local_dL_dcolors[ch] = skip ? 0.0f : dchannel_dcolor * dL_dchannel; 720 | } 721 | dL_dcolors_shared[tid].x = local_dL_dcolors[0]; 722 | dL_dcolors_shared[tid].y = local_dL_dcolors[1]; 723 | dL_dcolors_shared[tid].z = local_dL_dcolors[2]; 724 | 725 | const float depth = collected_depths[j]; 726 | accum_rec_depth = skip ? accum_rec_depth : last_alpha * last_depth + (1.f - last_alpha) * accum_rec_depth; 727 | last_depth = skip ? last_depth : depth; 728 | dL_dalpha += (depth - accum_rec_depth) * dL_dpixel_depth; 729 | dL_ddepths_shared[tid] = skip ? 0.f : dchannel_dcolor * dL_dpixel_depth; 730 | 731 | 732 | dL_dalpha *= T; 733 | // Update last alpha (to be used in the next iteration) 734 | last_alpha = skip ? 
last_alpha : alpha; 735 | 736 | // Account for fact that alpha also influences how much of 737 | // the background color is added if nothing left to blend 738 | float bg_dot_dpixel = 0.f; 739 | #pragma unroll 740 | for (int i = 0; i < C; i++) { 741 | bg_dot_dpixel += bg_color[i] * dL_dpixel[i]; 742 | } 743 | dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel; 744 | 745 | // Helpful reusable temporary variables 746 | const float dL_dG = con_o.w * dL_dalpha; 747 | const float gdx = G * d.x; 748 | const float gdy = G * d.y; 749 | const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y; 750 | const float dG_ddely = -gdy * con_o.z - gdx * con_o.y; 751 | 752 | dL_dmean2D_shared[tid].x = skip ? 0.f : dL_dG * dG_ddelx * ddelx_dx; 753 | dL_dmean2D_shared[tid].y = skip ? 0.f : dL_dG * dG_ddely * ddely_dy; 754 | dL_dconic2D_shared[tid].x = skip ? 0.f : -0.5f * gdx * d.x * dL_dG; 755 | dL_dconic2D_shared[tid].y = skip ? 0.f : -0.5f * gdx * d.y * dL_dG; 756 | dL_dconic2D_shared[tid].w = skip ? 0.f : -0.5f * gdy * d.y * dL_dG; 757 | dL_dopacity_shared[tid] = skip ? 0.f : G * dL_dalpha; 758 | 759 | render_cuda_reduce_sum(block, 760 | dL_dmean2D_shared, 761 | dL_dconic2D_shared, 762 | dL_dopacity_shared, 763 | dL_dcolors_shared, 764 | dL_ddepths_shared 765 | ); 766 | 767 | if (tid == 0) { 768 | float2 dL_dmean2D_acc = dL_dmean2D_shared[0]; 769 | float4 dL_dconic2D_acc = dL_dconic2D_shared[0]; 770 | float dL_dopacity_acc = dL_dopacity_shared[0]; 771 | float3 dL_dcolors_acc = dL_dcolors_shared[0]; 772 | float dL_ddepths_acc = dL_ddepths_shared[0]; 773 | 774 | atomicAdd(&dL_dmean2D[global_id].x, dL_dmean2D_acc.x); 775 | atomicAdd(&dL_dmean2D[global_id].y, dL_dmean2D_acc.y); 776 | atomicAdd(&dL_dconic2D[global_id].x, dL_dconic2D_acc.x); 777 | atomicAdd(&dL_dconic2D[global_id].y, dL_dconic2D_acc.y); 778 | atomicAdd(&dL_dconic2D[global_id].w, dL_dconic2D_acc.w); 779 | atomicAdd(&dL_dopacity[global_id], dL_dopacity_acc); 780 | atomicAdd(&dL_dcolors[global_id * C + 0], dL_dcolors_acc.x); 781 | atomicAdd(&dL_dcolors[global_id * C + 1], dL_dcolors_acc.y); 782 | atomicAdd(&dL_dcolors[global_id * C + 2], dL_dcolors_acc.z); 783 | atomicAdd(&dL_ddepths[global_id], dL_ddepths_acc); 784 | } 785 | } 786 | } 787 | } 788 | 789 | void BACKWARD::preprocess( 790 | int P, int D, int M, 791 | const float3* means3D, 792 | const int* radii, 793 | const float* shs, 794 | const bool* clamped, 795 | const glm::vec3* scales, 796 | const glm::vec4* rotations, 797 | const float scale_modifier, 798 | const float* cov3Ds, 799 | const float* viewmatrix, 800 | const float* projmatrix, 801 | const float* projmatrix_raw, 802 | const float focal_x, float focal_y, 803 | const float tan_fovx, float tan_fovy, 804 | const glm::vec3* campos, 805 | const float3* dL_dmean2D, 806 | const float* dL_dconic, 807 | glm::vec3* dL_dmean3D, 808 | float* dL_dcolor, 809 | float* dL_ddepth, 810 | float* dL_dcov3D, 811 | float* dL_dsh, 812 | glm::vec3* dL_dscale, 813 | glm::vec4* dL_drot, 814 | float* dL_dtau) 815 | { 816 | // Propagate gradients for the path of 2D conic matrix computation. 817 | // Somewhat long, thus it is its own kernel rather than being part of 818 | // "preprocess". When done, loss gradient w.r.t. 3D means has been 819 | // modified and gradient w.r.t. 3D covariance matrix has been computed. 
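// Note on accumulation: computeCov2DCUDA assigns dL_dmean3D, while the
// preprocess kernel below only ever adds to it; dL_dtau is only ever added to
// by both kernels and is therefore expected to arrive zero-initialized.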
820 | computeCov2DCUDA << <(P + 255) / 256, 256 >> > ( 821 | P, 822 | means3D, 823 | radii, 824 | cov3Ds, 825 | focal_x, 826 | focal_y, 827 | tan_fovx, 828 | tan_fovy, 829 | viewmatrix, 830 | dL_dconic, 831 | (float3*)dL_dmean3D, 832 | dL_dcov3D, 833 | dL_dtau); 834 | 835 | // Propagate gradients for remaining steps: finish 3D mean gradients, 836 | // propagate color gradients to SH (if desireD), propagate 3D covariance 837 | // matrix gradients to scale and rotation. 838 | preprocessCUDA << < (P + 255) / 256, 256 >> > ( 839 | P, D, M, 840 | (float3*)means3D, 841 | radii, 842 | shs, 843 | clamped, 844 | (glm::vec3*)scales, 845 | (glm::vec4*)rotations, 846 | scale_modifier, 847 | viewmatrix, 848 | projmatrix, 849 | projmatrix_raw, 850 | campos, 851 | (float3*)dL_dmean2D, 852 | (glm::vec3*)dL_dmean3D, 853 | dL_dcolor, 854 | dL_ddepth, 855 | dL_dcov3D, 856 | dL_dsh, 857 | dL_dscale, 858 | dL_drot, 859 | dL_dtau); 860 | } 861 | 862 | void BACKWARD::render( 863 | const dim3 grid, const dim3 block, 864 | const uint2* ranges, 865 | const uint32_t* point_list, 866 | int W, int H, 867 | const float* bg_color, 868 | const float2* means2D, 869 | const float4* conic_opacity, 870 | const float* colors, 871 | const float* depths, 872 | const float* final_Ts, 873 | const uint32_t* n_contrib, 874 | const float* dL_dpixels, 875 | const float* dL_dpixels_depth, 876 | float3* dL_dmean2D, 877 | float4* dL_dconic2D, 878 | float* dL_dopacity, 879 | float* dL_dcolors, 880 | float* dL_ddepths) 881 | { 882 | renderCUDA << > >( 883 | ranges, 884 | point_list, 885 | W, H, 886 | bg_color, 887 | means2D, 888 | conic_opacity, 889 | colors, 890 | depths, 891 | final_Ts, 892 | n_contrib, 893 | dL_dpixels, 894 | dL_dpixels_depth, 895 | dL_dmean2D, 896 | dL_dconic2D, 897 | dL_dopacity, 898 | dL_dcolors, 899 | dL_ddepths 900 | ); 901 | } -------------------------------------------------------------------------------- /cuda_rasterizer/backward.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_BACKWARD_H_INCLUDED 13 | #define CUDA_RASTERIZER_BACKWARD_H_INCLUDED 14 | 15 | #include 16 | #include "cuda_runtime.h" 17 | #include "device_launch_parameters.h" 18 | #define GLM_FORCE_CUDA 19 | #include 20 | 21 | namespace BACKWARD 22 | { 23 | void render( 24 | const dim3 grid, const dim3 block, 25 | const uint2* ranges, 26 | const uint32_t* point_list, 27 | int W, int H, 28 | const float* bg_color, 29 | const float2* means2D, 30 | const float4* conic_opacity, 31 | const float* colors, 32 | const float* depths, 33 | const float* final_Ts, 34 | const uint32_t* n_contrib, 35 | const float* dL_dpixels, 36 | const float* dL_dpixels_depth, 37 | float3* dL_dmean2D, 38 | float4* dL_dconic2D, 39 | float* dL_dopacity, 40 | float* dL_dcolors, 41 | float* dL_ddepths); 42 | 43 | void preprocess( 44 | int P, int D, int M, 45 | const float3* means, 46 | const int* radii, 47 | const float* shs, 48 | const bool* clamped, 49 | const glm::vec3* scales, 50 | const glm::vec4* rotations, 51 | const float scale_modifier, 52 | const float* cov3Ds, 53 | const float* view, 54 | const float* proj, 55 | const float* proj_raw, 56 | const float focal_x, float focal_y, 57 | const float tan_fovx, float tan_fovy, 58 | const glm::vec3* campos, 59 | const float3* dL_dmean2D, 60 | const float* dL_dconics, 61 | glm::vec3* dL_dmeans, 62 | float* dL_dcolor, 63 | float* dL_ddepth, 64 | float* dL_dcov3D, 65 | float* dL_dsh, 66 | glm::vec3* dL_dscale, 67 | glm::vec4* dL_drot, 68 | float* dL_dtau); 69 | } 70 | 71 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/config.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_CONFIG_H_INCLUDED 13 | #define CUDA_RASTERIZER_CONFIG_H_INCLUDED 14 | 15 | #define NUM_CHANNELS 3 // Default 3, RGB 16 | #define BLOCK_X 16 17 | #define BLOCK_Y 16 18 | 19 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/forward.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include "forward.h" 13 | #include "auxiliary.h" 14 | #include "helper_math.h" 15 | #include "math.h" 16 | #include 17 | #include 18 | namespace cg = cooperative_groups; 19 | 20 | // Forward method for converting the input spherical harmonics 21 | // coefficients of each Gaussian to a simple RGB color. 22 | __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped) 23 | { 24 | // The implementation is loosely based on code for 25 | // "Differentiable Point-Based Radiance Fields for 26 | // Efficient View Synthesis" by Zhang et al. 
(2022) 27 | glm::vec3 pos = means[idx]; 28 | glm::vec3 dir = pos - campos; 29 | dir = dir / glm::length(dir); 30 | 31 | glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; 32 | glm::vec3 result = SH_C0 * sh[0]; 33 | 34 | if (deg > 0) 35 | { 36 | float x = dir.x; 37 | float y = dir.y; 38 | float z = dir.z; 39 | result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3]; 40 | 41 | if (deg > 1) 42 | { 43 | float xx = x * x, yy = y * y, zz = z * z; 44 | float xy = x * y, yz = y * z, xz = x * z; 45 | result = result + 46 | SH_C2[0] * xy * sh[4] + 47 | SH_C2[1] * yz * sh[5] + 48 | SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] + 49 | SH_C2[3] * xz * sh[7] + 50 | SH_C2[4] * (xx - yy) * sh[8]; 51 | 52 | if (deg > 2) 53 | { 54 | result = result + 55 | SH_C3[0] * y * (3.0f * xx - yy) * sh[9] + 56 | SH_C3[1] * xy * z * sh[10] + 57 | SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] + 58 | SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] + 59 | SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] + 60 | SH_C3[5] * z * (xx - yy) * sh[14] + 61 | SH_C3[6] * x * (xx - 3.0f * yy) * sh[15]; 62 | } 63 | } 64 | } 65 | result += 0.5f; 66 | 67 | // RGB colors are clamped to positive values. If values are 68 | // clamped, we need to keep track of this for the backward pass. 69 | clamped[3 * idx + 0] = (result.x < 0); 70 | clamped[3 * idx + 1] = (result.y < 0); 71 | clamped[3 * idx + 2] = (result.z < 0); 72 | return glm::max(result, 0.0f); 73 | } 74 | 75 | // Forward version of 2D covariance matrix computation 76 | __device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix) 77 | { 78 | // The following models the steps outlined by equations 29 79 | // and 31 in "EWA Splatting" (Zwicker et al., 2002). 80 | // Additionally considers aspect / scaling of viewport. 81 | // Transposes used to account for row-/column-major conventions. 82 | float3 t = transformPoint4x3(mean, viewmatrix); 83 | 84 | const float limx = 1.3f * tan_fovx; 85 | const float limy = 1.3f * tan_fovy; 86 | const float txtz = t.x / t.z; 87 | const float tytz = t.y / t.z; 88 | t.x = min(limx, max(-limx, txtz)) * t.z; 89 | t.y = min(limy, max(-limy, tytz)) * t.z; 90 | 91 | glm::mat3 J = glm::mat3( 92 | focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z), 93 | 0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z), 94 | 0, 0, 0); 95 | 96 | glm::mat3 W = glm::mat3( 97 | viewmatrix[0], viewmatrix[4], viewmatrix[8], 98 | viewmatrix[1], viewmatrix[5], viewmatrix[9], 99 | viewmatrix[2], viewmatrix[6], viewmatrix[10]); 100 | 101 | glm::mat3 T = W * J; 102 | 103 | glm::mat3 Vrk = glm::mat3( 104 | cov3D[0], cov3D[1], cov3D[2], 105 | cov3D[1], cov3D[3], cov3D[4], 106 | cov3D[2], cov3D[4], cov3D[5]); 107 | 108 | glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T; 109 | 110 | // Apply low-pass filter: every Gaussian should be at least 111 | // one pixel wide/high. Discard 3rd row and column. 112 | cov[0][0] += 0.3f; 113 | cov[1][1] += 0.3f; 114 | return { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) }; 115 | } 116 | 117 | // Forward method for converting scale and rotation properties of each 118 | // Gaussian to a 3D covariance matrix in world space. Also takes care 119 | // of quaternion normalization. 
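// Note: despite the comment above, the quaternion is used as-is here; the
// normalization (/ glm::length(rot)) is commented out below, so rot is assumed
// to already be normalized by the caller.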
120 | __device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D) 121 | { 122 | // Create scaling matrix 123 | glm::mat3 S = glm::mat3(1.0f); 124 | S[0][0] = mod * scale.x; 125 | S[1][1] = mod * scale.y; 126 | S[2][2] = mod * scale.z; 127 | 128 | // Use quaternion (assumed pre-normalized) to get a valid rotation 129 | glm::vec4 q = rot;// / glm::length(rot); 130 | float r = q.x; 131 | float x = q.y; 132 | float y = q.z; 133 | float z = q.w; 134 | 135 | // Compute rotation matrix from quaternion 136 | glm::mat3 R = glm::mat3( 137 | 1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y), 138 | 2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x), 139 | 2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y) 140 | ); 141 | 142 | glm::mat3 M = S * R; 143 | 144 | // Compute 3D world covariance matrix Sigma 145 | glm::mat3 Sigma = glm::transpose(M) * M; 146 | 147 | // Covariance is symmetric, only store upper right 148 | cov3D[0] = Sigma[0][0]; 149 | cov3D[1] = Sigma[0][1]; 150 | cov3D[2] = Sigma[0][2]; 151 | cov3D[3] = Sigma[1][1]; 152 | cov3D[4] = Sigma[1][2]; 153 | cov3D[5] = Sigma[2][2]; 154 | } 155 | 156 | // Perform initial steps for each Gaussian prior to rasterization. 157 | template<int C> 158 | __global__ void preprocessCUDA(int P, int D, int M, 159 | const float* orig_points, 160 | const glm::vec3* scales, 161 | const float scale_modifier, 162 | const glm::vec4* rotations, 163 | const float* opacities, 164 | const float* shs, 165 | bool* clamped, 166 | const float* cov3D_precomp, 167 | const float* colors_precomp, 168 | const float* viewmatrix, 169 | const float* projmatrix, 170 | const glm::vec3* cam_pos, 171 | const int W, int H, 172 | const float tan_fovx, float tan_fovy, 173 | const float focal_x, float focal_y, 174 | int* radii, 175 | float2* points_xy_image, 176 | float* depths, 177 | float* cov3Ds, 178 | float* rgb, 179 | float4* conic_opacity, 180 | const dim3 grid, 181 | uint32_t* tiles_touched, 182 | bool prefiltered) 183 | { 184 | auto idx = cg::this_grid().thread_rank(); 185 | if (idx >= P) 186 | return; 187 | 188 | // Initialize radius and touched tiles to 0. If this isn't changed, 189 | // this Gaussian will not be processed further. 190 | radii[idx] = 0; 191 | tiles_touched[idx] = 0; 192 | 193 | // Perform near culling, quit if outside. 194 | float3 p_view; 195 | if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) 196 | return; 197 | 198 | // Transform point by projecting 199 | float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; 200 | float4 p_hom = transformPoint4x4(p_orig, projmatrix); 201 | float p_w = 1.0f / (p_hom.w + 0.0000001f); 202 | float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; 203 | 204 | // If 3D covariance matrix is precomputed, use it, otherwise compute 205 | // from scaling and rotation parameters.
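// Layout note (inferred from computeCov3D above): each symmetric 3x3
// covariance is packed as 6 floats -- the upper triangle in row-major
// order, { c00, c01, c02, c11, c12, c22 } -- which is why both branches
// below index the covariance buffers with a stride of idx * 6.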
206 | const float* cov3D; 207 | if (cov3D_precomp != nullptr) 208 | { 209 | cov3D = cov3D_precomp + idx * 6; 210 | } 211 | else 212 | { 213 | computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6); 214 | cov3D = cov3Ds + idx * 6; 215 | } 216 | 217 | // Compute 2D screen-space covariance matrix 218 | float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix); 219 | 220 | // Invert covariance (EWA algorithm) 221 | float det = (cov.x * cov.z - cov.y * cov.y); 222 | if (det == 0.0f) 223 | return; 224 | float det_inv = 1.f / det; 225 | float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv }; 226 | 227 | // Compute extent in screen space (by finding eigenvalues of 228 | // 2D covariance matrix). Use extent to compute a bounding rectangle 229 | // of screen-space tiles that this Gaussian overlaps with. Quit if 230 | // rectangle covers 0 tiles. 231 | float mid = 0.5f * (cov.x + cov.z); 232 | float lambda1 = mid + sqrt(max(0.1f, mid * mid - det)); 233 | float lambda2 = mid - sqrt(max(0.1f, mid * mid - det)); 234 | float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2))); 235 | float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) }; 236 | uint2 rect_min, rect_max; 237 | getRect(point_image, my_radius, rect_min, rect_max, grid); 238 | if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0) 239 | return; 240 | 241 | // If colors have been precomputed, use them, otherwise convert 242 | // spherical harmonics coefficients to RGB color. 243 | if (colors_precomp == nullptr) 244 | { 245 | glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped); 246 | rgb[idx * C + 0] = result.x; 247 | rgb[idx * C + 1] = result.y; 248 | rgb[idx * C + 2] = result.z; 249 | } 250 | 251 | // Store some useful helper data for the next steps. 252 | depths[idx] = p_view.z; 253 | radii[idx] = my_radius; 254 | points_xy_image[idx] = point_image; 255 | // Inverse 2D covariance and opacity neatly pack into one float4 256 | conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] }; 257 | tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); 258 | } 259 | 260 | // Main rasterization method. Collaboratively works on one tile per 261 | // block, each thread treats one pixel. Alternates between fetching 262 | // and rasterizing data. 263 | template <uint32_t CHANNELS> 264 | __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y) 265 | renderCUDA( 266 | const uint2* __restrict__ ranges, 267 | const uint32_t* __restrict__ point_list, 268 | int W, int H, 269 | const float2* __restrict__ points_xy_image, 270 | const float* __restrict__ features, 271 | const float4* __restrict__ conic_opacity, 272 | float* __restrict__ final_T, 273 | uint32_t* __restrict__ n_contrib, 274 | const float* __restrict__ bg_color, 275 | float* __restrict__ out_color, 276 | const float* __restrict__ depth, 277 | float* __restrict__ out_depth, 278 | float* __restrict__ out_opacity, 279 | int* __restrict__ n_touched) 280 | { 281 | // Identify current tile and associated min/max pixel range. 282 | auto block = cg::this_thread_block(); 283 | uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; 284 | // uint32_t horizontal_blocks = gridDim.x; // TODO: maybe it's different?
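// Regarding the TODO above: FORWARD::render (further below) launches this
// kernel with the tile grid supplied by the caller; if that grid is sized
// as ((W + BLOCK_X - 1) / BLOCK_X, (H + BLOCK_Y - 1) / BLOCK_Y), as the
// rasterizer is expected to do, gridDim.x equals horizontal_blocks.
// Worked example: W = 1000, BLOCK_X = 16 -> (1000 + 15) / 16 = 63 tiles
// per row, so the tile at (tx, ty) reads ranges[ty * 63 + tx].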
285 | uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; 286 | uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y, H) }; 287 | uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; 288 | uint32_t pix_id = W * pix.y + pix.x; 289 | float2 pixf = { (float)pix.x, (float)pix.y }; 290 | 291 | // Check if this thread is associated with a valid pixel or outside. 292 | bool inside = pix.x < W && pix.y < H; 293 | // Done threads can help with fetching, but don't rasterize 294 | bool done = !inside; 295 | 296 | // Load start/end range of IDs to process in bit sorted list. 297 | uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; 298 | const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); 299 | int toDo = range.y - range.x; 300 | 301 | // Allocate storage for batches of collectively fetched data. 302 | __shared__ int collected_id[BLOCK_SIZE]; 303 | __shared__ float2 collected_xy[BLOCK_SIZE]; 304 | __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; 305 | __shared__ float collected_depth[BLOCK_SIZE]; 306 | 307 | // Initialize helper variables 308 | float T = 1.0f; 309 | uint32_t contributor = 0; 310 | uint32_t last_contributor = 0; 311 | float C[CHANNELS] = { 0 }; 312 | float D = 0.0f; 313 | 314 | // Iterate over batches until all done or range is complete 315 | for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) 316 | { 317 | // End if entire block votes that it is done rasterizing 318 | int num_done = __syncthreads_count(done); 319 | if (num_done == BLOCK_SIZE) 320 | break; 321 | 322 | // Collectively fetch per-Gaussian data from global to shared 323 | int progress = i * BLOCK_SIZE + block.thread_rank(); 324 | if (range.x + progress < range.y) 325 | { 326 | int coll_id = point_list[range.x + progress]; 327 | collected_id[block.thread_rank()] = coll_id; 328 | collected_xy[block.thread_rank()] = points_xy_image[coll_id]; 329 | collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; 330 | collected_depth[block.thread_rank()] = depth[coll_id]; 331 | } 332 | block.sync(); 333 | 334 | // Iterate over current batch 335 | for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) 336 | { 337 | // Keep track of current position in range 338 | contributor++; 339 | 340 | // Resample using conic matrix (cf. "Surface 341 | // Splatting" by Zwicker et al., 2001) 342 | float2 xy = collected_xy[j]; 343 | float2 d = { xy.x - pixf.x, xy.y - pixf.y }; 344 | float4 con_o = collected_conic_opacity[j]; 345 | float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; 346 | if (power > 0.0f) 347 | continue; 348 | 349 | // Eq. (2) from 3D Gaussian splatting paper. 350 | // Obtain alpha by multiplying with Gaussian opacity 351 | // and its exponential falloff from mean. 352 | // Avoid numerical instabilities (see paper appendix). 353 | float alpha = min(0.99f, con_o.w * exp(power)); 354 | if (alpha < 1.0f / 255.0f) { 355 | continue; 356 | } 357 | float test_T = T * (1 - alpha); 358 | if (test_T < 0.0001f) 359 | { 360 | done = true; 361 | continue; 362 | } 363 | // Eq. (3) from 3D Gaussian splatting paper. 364 | for (int ch = 0; ch < CHANNELS; ch++) { 365 | C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T; 366 | } 367 | D += collected_depth[j] * alpha * T; 368 | // Keep track of how many pixels touched this Gaussian.
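// (A Gaussian counts a pixel as "touched" only while the transmittance
// remaining after its own contribution, test_T, stays above 0.5 -- i.e.
// the splat is still clearly visible at this pixel rather than buried
// behind earlier, closer splats. n_touched is accumulated atomically,
// since many pixels across many tiles may hit the same Gaussian.)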
369 | if (test_T > 0.5f) { 370 | atomicAdd(&(n_touched[collected_id[j]]), 1); 371 | } 372 | T = test_T; 373 | 374 | // Keep track of last range entry to update this 375 | // pixel. 376 | last_contributor = contributor; 377 | } 378 | } 379 | 380 | // All threads that treat a valid pixel write out their final 381 | // rendering data to the frame and auxiliary buffers. 382 | if (inside) 383 | { 384 | final_T[pix_id] = T; 385 | n_contrib[pix_id] = last_contributor; 386 | for (int ch = 0; ch < CHANNELS; ch++) { 387 | out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch]; 388 | } 389 | out_depth[pix_id] = D; 390 | out_opacity[pix_id] = 1 - T; 391 | } 392 | } 393 | 394 | void FORWARD::render( 395 | const dim3 grid, dim3 block, 396 | const uint2* ranges, 397 | const uint32_t* point_list, 398 | int W, int H, 399 | const float2* means2D, 400 | const float* colors, 401 | const float4* conic_opacity, 402 | float* final_T, 403 | uint32_t* n_contrib, 404 | const float* bg_color, 405 | float* out_color, 406 | const float* depth, 407 | float* out_depth, 408 | float* out_opacity, 409 | int* n_touched) 410 | { 411 | renderCUDA<NUM_CHANNELS> <<<grid, block>>> ( 412 | ranges, 413 | point_list, 414 | W, H, 415 | means2D, 416 | colors, 417 | conic_opacity, 418 | final_T, 419 | n_contrib, 420 | bg_color, 421 | out_color, 422 | depth, 423 | out_depth, 424 | out_opacity, 425 | n_touched); 426 | } 427 | 428 | void FORWARD::preprocess(int P, int D, int M, 429 | const float* means3D, 430 | const glm::vec3* scales, 431 | const float scale_modifier, 432 | const glm::vec4* rotations, 433 | const float* opacities, 434 | const float* shs, 435 | bool* clamped, 436 | const float* cov3D_precomp, 437 | const float* colors_precomp, 438 | const float* viewmatrix, 439 | const float* projmatrix, 440 | const glm::vec3* cam_pos, 441 | const int W, int H, 442 | const float focal_x, float focal_y, 443 | const float tan_fovx, float tan_fovy, 444 | int* radii, 445 | float2* means2D, 446 | float* depths, 447 | float* cov3Ds, 448 | float* rgb, 449 | float4* conic_opacity, 450 | const dim3 grid, 451 | uint32_t* tiles_touched, 452 | bool prefiltered) 453 | { 454 | preprocessCUDA<NUM_CHANNELS> <<<(P + 255) / 256, 256>>> ( 455 | P, D, M, 456 | means3D, 457 | scales, 458 | scale_modifier, 459 | rotations, 460 | opacities, 461 | shs, 462 | clamped, 463 | cov3D_precomp, 464 | colors_precomp, 465 | viewmatrix, 466 | projmatrix, 467 | cam_pos, 468 | W, H, 469 | tan_fovx, tan_fovy, 470 | focal_x, focal_y, 471 | radii, 472 | means2D, 473 | depths, 474 | cov3Ds, 475 | rgb, 476 | conic_opacity, 477 | grid, 478 | tiles_touched, 479 | prefiltered 480 | ); 481 | } -------------------------------------------------------------------------------- /cuda_rasterizer/forward.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_FORWARD_H_INCLUDED 13 | #define CUDA_RASTERIZER_FORWARD_H_INCLUDED 14 | 15 | #include <cuda.h> 16 | #include "cuda_runtime.h" 17 | #include "device_launch_parameters.h" 18 | #define GLM_FORCE_CUDA 19 | #include <glm/glm.hpp> 20 | 21 | namespace FORWARD 22 | { 23 | // Perform initial steps for each Gaussian prior to rasterization.
24 | void preprocess(int P, int D, int M, 25 | const float* orig_points, 26 | const glm::vec3* scales, 27 | const float scale_modifier, 28 | const glm::vec4* rotations, 29 | const float* opacities, 30 | const float* shs, 31 | bool* clamped, 32 | const float* cov3D_precomp, 33 | const float* colors_precomp, 34 | const float* viewmatrix, 35 | const float* projmatrix, 36 | const glm::vec3* cam_pos, 37 | const int W, int H, 38 | const float focal_x, float focal_y, 39 | const float tan_fovx, float tan_fovy, 40 | int* radii, 41 | float2* points_xy_image, 42 | float* depths, 43 | float* cov3Ds, 44 | float* colors, 45 | float4* conic_opacity, 46 | const dim3 grid, 47 | uint32_t* tiles_touched, 48 | bool prefiltered); 49 | 50 | // Main rasterization method. 51 | void render( 52 | const dim3 grid, dim3 block, 53 | const uint2* ranges, 54 | const uint32_t* point_list, 55 | int W, int H, 56 | const float2* points_xy_image, 57 | const float* features, 58 | const float4* conic_opacity, 59 | float* final_T, 60 | uint32_t* n_contrib, 61 | const float* bg_color, 62 | float* out_color, 63 | const float* depth, 64 | float* out_depth, 65 | float* out_opacity, 66 | int* n_touched); 67 | } 68 | 69 | 70 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/helper_math.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | /* 29 | * This file implements common mathematical operations on vector types 30 | * (float3, float4 etc.) since these are not provided as standard by CUDA. 31 | * 32 | * The syntax is modeled on the Cg standard library. 33 | * 34 | * This is part of the Helper library includes 35 | * 36 | * Thanks to Linh Hah for additions and fixes. 
37 | */ 38 | 39 | #ifndef HELPER_MATH_H 40 | #define HELPER_MATH_H 41 | 42 | #include "cuda_runtime.h" 43 | 44 | typedef unsigned int uint; 45 | typedef unsigned short ushort; 46 | 47 | #ifndef EXIT_WAIVED 48 | #define EXIT_WAIVED 2 49 | #endif 50 | 51 | #ifndef __CUDACC__ 52 | #include <math.h> 53 | 54 | //////////////////////////////////////////////////////////////////////////////// 55 | // host implementations of CUDA functions 56 | //////////////////////////////////////////////////////////////////////////////// 57 | 58 | inline float fminf(float a, float b) { return a < b ? a : b; } 59 | 60 | inline float fmaxf(float a, float b) { return a > b ? a : b; } 61 | 62 | inline int max(int a, int b) { return a > b ? a : b; } 63 | 64 | inline int min(int a, int b) { return a < b ? a : b; } 65 | 66 | inline float rsqrtf(float x) { return 1.0f / sqrtf(x); } 67 | #endif 68 | 69 | //////////////////////////////////////////////////////////////////////////////// 70 | // constructors 71 | //////////////////////////////////////////////////////////////////////////////// 72 | 73 | inline __host__ __device__ float2 make_float2(float s) { return make_float2(s, s); } 74 | inline __host__ __device__ float2 make_float2(float3 a) { return make_float2(a.x, a.y); } 75 | inline __host__ __device__ float2 make_float2(int2 a) { 76 | return make_float2(float(a.x), float(a.y)); 77 | } 78 | inline __host__ __device__ float2 make_float2(uint2 a) { 79 | return make_float2(float(a.x), float(a.y)); 80 | } 81 | 82 | inline __host__ __device__ int2 make_int2(int s) { return make_int2(s, s); } 83 | inline __host__ __device__ int2 make_int2(int3 a) { return make_int2(a.x, a.y); } 84 | inline __host__ __device__ int2 make_int2(uint2 a) { return make_int2(int(a.x), int(a.y)); } 85 | inline __host__ __device__ int2 make_int2(float2 a) { return make_int2(int(a.x), int(a.y)); } 86 | 87 | inline __host__ __device__ uint2 make_uint2(uint s) { return make_uint2(s, s); } 88 | inline __host__ __device__ uint2 make_uint2(uint3 a) { return make_uint2(a.x, a.y); } 89 | inline __host__ __device__ uint2 make_uint2(int2 a) { return make_uint2(uint(a.x), uint(a.y)); } 90 | 91 | inline __host__ __device__ float3 make_float3(float s) { return make_float3(s, s, s); } 92 | inline __host__ __device__ float3 make_float3(float2 a) { return make_float3(a.x, a.y, 0.0f); } 93 | inline __host__ __device__ float3 make_float3(float2 a, float s) { 94 | return make_float3(a.x, a.y, s); 95 | } 96 | inline __host__ __device__ float3 make_float3(float4 a) { return make_float3(a.x, a.y, a.z); } 97 | inline __host__ __device__ float3 make_float3(int3 a) { 98 | return make_float3(float(a.x), float(a.y), float(a.z)); 99 | } 100 | inline __host__ __device__ float3 make_float3(uint3 a) { 101 | return make_float3(float(a.x), float(a.y), float(a.z)); 102 | } 103 | 104 | inline __host__ __device__ int3 make_int3(int s) { return make_int3(s, s, s); } 105 | inline __host__ __device__ int3 make_int3(int2 a) { return make_int3(a.x, a.y, 0); } 106 | inline __host__ __device__ int3 make_int3(int2 a, int s) { return make_int3(a.x, a.y, s); } 107 | inline __host__ __device__ int3 make_int3(uint3 a) { 108 | return make_int3(int(a.x), int(a.y), int(a.z)); 109 | } 110 | inline __host__ __device__ int3 make_int3(float3 a) { 111 | return make_int3(int(a.x), int(a.y), int(a.z)); 112 | } 113 | 114 | inline __host__ __device__ uint3 make_uint3(uint s) { return make_uint3(s, s, s); } 115 | inline __host__ __device__ uint3 make_uint3(uint2 a) { return make_uint3(a.x, a.y, 0); } 116 | inline
__host__ __device__ uint3 make_uint3(uint2 a, uint s) { return make_uint3(a.x, a.y, s); } 117 | inline __host__ __device__ uint3 make_uint3(uint4 a) { return make_uint3(a.x, a.y, a.z); } 118 | inline __host__ __device__ uint3 make_uint3(int3 a) { 119 | return make_uint3(uint(a.x), uint(a.y), uint(a.z)); 120 | } 121 | 122 | inline __host__ __device__ float4 make_float4(float s) { return make_float4(s, s, s, s); } 123 | inline __host__ __device__ float4 make_float4(float3 a) { return make_float4(a.x, a.y, a.z, 0.0f); } 124 | inline __host__ __device__ float4 make_float4(float3 a, float w) { 125 | return make_float4(a.x, a.y, a.z, w); 126 | } 127 | inline __host__ __device__ float4 make_float4(int4 a) { 128 | return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); 129 | } 130 | inline __host__ __device__ float4 make_float4(uint4 a) { 131 | return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); 132 | } 133 | 134 | inline __host__ __device__ int4 make_int4(int s) { return make_int4(s, s, s, s); } 135 | inline __host__ __device__ int4 make_int4(int3 a) { return make_int4(a.x, a.y, a.z, 0); } 136 | inline __host__ __device__ int4 make_int4(int3 a, int w) { return make_int4(a.x, a.y, a.z, w); } 137 | inline __host__ __device__ int4 make_int4(uint4 a) { 138 | return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); 139 | } 140 | inline __host__ __device__ int4 make_int4(float4 a) { 141 | return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); 142 | } 143 | 144 | inline __host__ __device__ uint4 make_uint4(uint s) { return make_uint4(s, s, s, s); } 145 | inline __host__ __device__ uint4 make_uint4(uint3 a) { return make_uint4(a.x, a.y, a.z, 0); } 146 | inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) { 147 | return make_uint4(a.x, a.y, a.z, w); 148 | } 149 | inline __host__ __device__ uint4 make_uint4(int4 a) { 150 | return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); 151 | } 152 | 153 | //////////////////////////////////////////////////////////////////////////////// 154 | // negate 155 | //////////////////////////////////////////////////////////////////////////////// 156 | 157 | inline __host__ __device__ float2 operator-(float2 &a) { return make_float2(-a.x, -a.y); } 158 | inline __host__ __device__ int2 operator-(int2 &a) { return make_int2(-a.x, -a.y); } 159 | inline __host__ __device__ float3 operator-(float3 &a) { return make_float3(-a.x, -a.y, -a.z); } 160 | inline __host__ __device__ int3 operator-(int3 &a) { return make_int3(-a.x, -a.y, -a.z); } 161 | inline __host__ __device__ float4 operator-(float4 &a) { 162 | return make_float4(-a.x, -a.y, -a.z, -a.w); 163 | } 164 | inline __host__ __device__ int4 operator-(int4 &a) { return make_int4(-a.x, -a.y, -a.z, -a.w); } 165 | 166 | //////////////////////////////////////////////////////////////////////////////// 167 | // addition 168 | //////////////////////////////////////////////////////////////////////////////// 169 | 170 | inline __host__ __device__ float2 operator+(float2 a, float2 b) { 171 | return make_float2(a.x + b.x, a.y + b.y); 172 | } 173 | inline __host__ __device__ void operator+=(float2 &a, float2 b) { 174 | a.x += b.x; 175 | a.y += b.y; 176 | } 177 | inline __host__ __device__ float2 operator+(float2 a, float b) { 178 | return make_float2(a.x + b, a.y + b); 179 | } 180 | inline __host__ __device__ float2 operator+(float b, float2 a) { 181 | return make_float2(a.x + b, a.y + b); 182 | } 183 | inline __host__ __device__ void operator+=(float2 &a, float b) { 184 | a.x += b; 185 | a.y 
+= b; 186 | } 187 | 188 | inline __host__ __device__ int2 operator+(int2 a, int2 b) { 189 | return make_int2(a.x + b.x, a.y + b.y); 190 | } 191 | inline __host__ __device__ void operator+=(int2 &a, int2 b) { 192 | a.x += b.x; 193 | a.y += b.y; 194 | } 195 | inline __host__ __device__ int2 operator+(int2 a, int b) { return make_int2(a.x + b, a.y + b); } 196 | inline __host__ __device__ int2 operator+(int b, int2 a) { return make_int2(a.x + b, a.y + b); } 197 | inline __host__ __device__ void operator+=(int2 &a, int b) { 198 | a.x += b; 199 | a.y += b; 200 | } 201 | 202 | inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) { 203 | return make_uint2(a.x + b.x, a.y + b.y); 204 | } 205 | inline __host__ __device__ void operator+=(uint2 &a, uint2 b) { 206 | a.x += b.x; 207 | a.y += b.y; 208 | } 209 | inline __host__ __device__ uint2 operator+(uint2 a, uint b) { return make_uint2(a.x + b, a.y + b); } 210 | inline __host__ __device__ uint2 operator+(uint b, uint2 a) { return make_uint2(a.x + b, a.y + b); } 211 | inline __host__ __device__ void operator+=(uint2 &a, uint b) { 212 | a.x += b; 213 | a.y += b; 214 | } 215 | 216 | inline __host__ __device__ float3 operator+(float3 a, float3 b) { 217 | return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); 218 | } 219 | inline __host__ __device__ void operator+=(float3 &a, float3 b) { 220 | a.x += b.x; 221 | a.y += b.y; 222 | a.z += b.z; 223 | } 224 | inline __host__ __device__ float3 operator+(float3 a, float b) { 225 | return make_float3(a.x + b, a.y + b, a.z + b); 226 | } 227 | inline __host__ __device__ void operator+=(float3 &a, float b) { 228 | a.x += b; 229 | a.y += b; 230 | a.z += b; 231 | } 232 | 233 | inline __host__ __device__ int3 operator+(int3 a, int3 b) { 234 | return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); 235 | } 236 | inline __host__ __device__ void operator+=(int3 &a, int3 b) { 237 | a.x += b.x; 238 | a.y += b.y; 239 | a.z += b.z; 240 | } 241 | inline __host__ __device__ int3 operator+(int3 a, int b) { 242 | return make_int3(a.x + b, a.y + b, a.z + b); 243 | } 244 | inline __host__ __device__ void operator+=(int3 &a, int b) { 245 | a.x += b; 246 | a.y += b; 247 | a.z += b; 248 | } 249 | 250 | inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) { 251 | return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); 252 | } 253 | inline __host__ __device__ void operator+=(uint3 &a, uint3 b) { 254 | a.x += b.x; 255 | a.y += b.y; 256 | a.z += b.z; 257 | } 258 | inline __host__ __device__ uint3 operator+(uint3 a, uint b) { 259 | return make_uint3(a.x + b, a.y + b, a.z + b); 260 | } 261 | inline __host__ __device__ void operator+=(uint3 &a, uint b) { 262 | a.x += b; 263 | a.y += b; 264 | a.z += b; 265 | } 266 | 267 | inline __host__ __device__ int3 operator+(int b, int3 a) { 268 | return make_int3(a.x + b, a.y + b, a.z + b); 269 | } 270 | inline __host__ __device__ uint3 operator+(uint b, uint3 a) { 271 | return make_uint3(a.x + b, a.y + b, a.z + b); 272 | } 273 | inline __host__ __device__ float3 operator+(float b, float3 a) { 274 | return make_float3(a.x + b, a.y + b, a.z + b); 275 | } 276 | 277 | inline __host__ __device__ float4 operator+(float4 a, float4 b) { 278 | return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 279 | } 280 | inline __host__ __device__ void operator+=(float4 &a, float4 b) { 281 | a.x += b.x; 282 | a.y += b.y; 283 | a.z += b.z; 284 | a.w += b.w; 285 | } 286 | inline __host__ __device__ float4 operator+(float4 a, float b) { 287 | return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); 288 | } 289 | 
inline __host__ __device__ float4 operator+(float b, float4 a) { 290 | return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); 291 | } 292 | inline __host__ __device__ void operator+=(float4 &a, float b) { 293 | a.x += b; 294 | a.y += b; 295 | a.z += b; 296 | a.w += b; 297 | } 298 | 299 | inline __host__ __device__ int4 operator+(int4 a, int4 b) { 300 | return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 301 | } 302 | inline __host__ __device__ void operator+=(int4 &a, int4 b) { 303 | a.x += b.x; 304 | a.y += b.y; 305 | a.z += b.z; 306 | a.w += b.w; 307 | } 308 | inline __host__ __device__ int4 operator+(int4 a, int b) { 309 | return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); 310 | } 311 | inline __host__ __device__ int4 operator+(int b, int4 a) { 312 | return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); 313 | } 314 | inline __host__ __device__ void operator+=(int4 &a, int b) { 315 | a.x += b; 316 | a.y += b; 317 | a.z += b; 318 | a.w += b; 319 | } 320 | 321 | inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) { 322 | return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 323 | } 324 | inline __host__ __device__ void operator+=(uint4 &a, uint4 b) { 325 | a.x += b.x; 326 | a.y += b.y; 327 | a.z += b.z; 328 | a.w += b.w; 329 | } 330 | inline __host__ __device__ uint4 operator+(uint4 a, uint b) { 331 | return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); 332 | } 333 | inline __host__ __device__ uint4 operator+(uint b, uint4 a) { 334 | return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); 335 | } 336 | inline __host__ __device__ void operator+=(uint4 &a, uint b) { 337 | a.x += b; 338 | a.y += b; 339 | a.z += b; 340 | a.w += b; 341 | } 342 | 343 | //////////////////////////////////////////////////////////////////////////////// 344 | // subtract 345 | //////////////////////////////////////////////////////////////////////////////// 346 | 347 | inline __host__ __device__ float2 operator-(float2 a, float2 b) { 348 | return make_float2(a.x - b.x, a.y - b.y); 349 | } 350 | inline __host__ __device__ void operator-=(float2 &a, float2 b) { 351 | a.x -= b.x; 352 | a.y -= b.y; 353 | } 354 | inline __host__ __device__ float2 operator-(float2 a, float b) { 355 | return make_float2(a.x - b, a.y - b); 356 | } 357 | inline __host__ __device__ float2 operator-(float b, float2 a) { 358 | return make_float2(b - a.x, b - a.y); 359 | } 360 | inline __host__ __device__ void operator-=(float2 &a, float b) { 361 | a.x -= b; 362 | a.y -= b; 363 | } 364 | 365 | inline __host__ __device__ int2 operator-(int2 a, int2 b) { 366 | return make_int2(a.x - b.x, a.y - b.y); 367 | } 368 | inline __host__ __device__ void operator-=(int2 &a, int2 b) { 369 | a.x -= b.x; 370 | a.y -= b.y; 371 | } 372 | inline __host__ __device__ int2 operator-(int2 a, int b) { return make_int2(a.x - b, a.y - b); } 373 | inline __host__ __device__ int2 operator-(int b, int2 a) { return make_int2(b - a.x, b - a.y); } 374 | inline __host__ __device__ void operator-=(int2 &a, int b) { 375 | a.x -= b; 376 | a.y -= b; 377 | } 378 | 379 | inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) { 380 | return make_uint2(a.x - b.x, a.y - b.y); 381 | } 382 | inline __host__ __device__ void operator-=(uint2 &a, uint2 b) { 383 | a.x -= b.x; 384 | a.y -= b.y; 385 | } 386 | inline __host__ __device__ uint2 operator-(uint2 a, uint b) { return make_uint2(a.x - b, a.y - b); } 387 | inline __host__ __device__ uint2 operator-(uint b, uint2 a) { return make_uint2(b - a.x, b - a.y); } 388 | inline __host__ __device__ void 
operator-=(uint2 &a, uint b) { 389 | a.x -= b; 390 | a.y -= b; 391 | } 392 | 393 | inline __host__ __device__ float3 operator-(float3 a, float3 b) { 394 | return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); 395 | } 396 | inline __host__ __device__ void operator-=(float3 &a, float3 b) { 397 | a.x -= b.x; 398 | a.y -= b.y; 399 | a.z -= b.z; 400 | } 401 | inline __host__ __device__ float3 operator-(float3 a, float b) { 402 | return make_float3(a.x - b, a.y - b, a.z - b); 403 | } 404 | inline __host__ __device__ float3 operator-(float b, float3 a) { 405 | return make_float3(b - a.x, b - a.y, b - a.z); 406 | } 407 | inline __host__ __device__ void operator-=(float3 &a, float b) { 408 | a.x -= b; 409 | a.y -= b; 410 | a.z -= b; 411 | } 412 | 413 | inline __host__ __device__ int3 operator-(int3 a, int3 b) { 414 | return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); 415 | } 416 | inline __host__ __device__ void operator-=(int3 &a, int3 b) { 417 | a.x -= b.x; 418 | a.y -= b.y; 419 | a.z -= b.z; 420 | } 421 | inline __host__ __device__ int3 operator-(int3 a, int b) { 422 | return make_int3(a.x - b, a.y - b, a.z - b); 423 | } 424 | inline __host__ __device__ int3 operator-(int b, int3 a) { 425 | return make_int3(b - a.x, b - a.y, b - a.z); 426 | } 427 | inline __host__ __device__ void operator-=(int3 &a, int b) { 428 | a.x -= b; 429 | a.y -= b; 430 | a.z -= b; 431 | } 432 | 433 | inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) { 434 | return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); 435 | } 436 | inline __host__ __device__ void operator-=(uint3 &a, uint3 b) { 437 | a.x -= b.x; 438 | a.y -= b.y; 439 | a.z -= b.z; 440 | } 441 | inline __host__ __device__ uint3 operator-(uint3 a, uint b) { 442 | return make_uint3(a.x - b, a.y - b, a.z - b); 443 | } 444 | inline __host__ __device__ uint3 operator-(uint b, uint3 a) { 445 | return make_uint3(b - a.x, b - a.y, b - a.z); 446 | } 447 | inline __host__ __device__ void operator-=(uint3 &a, uint b) { 448 | a.x -= b; 449 | a.y -= b; 450 | a.z -= b; 451 | } 452 | 453 | inline __host__ __device__ float4 operator-(float4 a, float4 b) { 454 | return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 455 | } 456 | inline __host__ __device__ void operator-=(float4 &a, float4 b) { 457 | a.x -= b.x; 458 | a.y -= b.y; 459 | a.z -= b.z; 460 | a.w -= b.w; 461 | } 462 | inline __host__ __device__ float4 operator-(float4 a, float b) { 463 | return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); 464 | } 465 | inline __host__ __device__ void operator-=(float4 &a, float b) { 466 | a.x -= b; 467 | a.y -= b; 468 | a.z -= b; 469 | a.w -= b; 470 | } 471 | 472 | inline __host__ __device__ int4 operator-(int4 a, int4 b) { 473 | return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 474 | } 475 | inline __host__ __device__ void operator-=(int4 &a, int4 b) { 476 | a.x -= b.x; 477 | a.y -= b.y; 478 | a.z -= b.z; 479 | a.w -= b.w; 480 | } 481 | inline __host__ __device__ int4 operator-(int4 a, int b) { 482 | return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); 483 | } 484 | inline __host__ __device__ int4 operator-(int b, int4 a) { 485 | return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); 486 | } 487 | inline __host__ __device__ void operator-=(int4 &a, int b) { 488 | a.x -= b; 489 | a.y -= b; 490 | a.z -= b; 491 | a.w -= b; 492 | } 493 | 494 | inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) { 495 | return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 496 | } 497 | inline __host__ __device__ void operator-=(uint4 &a, uint4 b) { 498 | 
a.x -= b.x; 499 | a.y -= b.y; 500 | a.z -= b.z; 501 | a.w -= b.w; 502 | } 503 | inline __host__ __device__ uint4 operator-(uint4 a, uint b) { 504 | return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); 505 | } 506 | inline __host__ __device__ uint4 operator-(uint b, uint4 a) { 507 | return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); 508 | } 509 | inline __host__ __device__ void operator-=(uint4 &a, uint b) { 510 | a.x -= b; 511 | a.y -= b; 512 | a.z -= b; 513 | a.w -= b; 514 | } 515 | 516 | //////////////////////////////////////////////////////////////////////////////// 517 | // multiply 518 | //////////////////////////////////////////////////////////////////////////////// 519 | 520 | inline __host__ __device__ float2 operator*(float2 a, float2 b) { 521 | return make_float2(a.x * b.x, a.y * b.y); 522 | } 523 | inline __host__ __device__ void operator*=(float2 &a, float2 b) { 524 | a.x *= b.x; 525 | a.y *= b.y; 526 | } 527 | inline __host__ __device__ float2 operator*(float2 a, float b) { 528 | return make_float2(a.x * b, a.y * b); 529 | } 530 | inline __host__ __device__ float2 operator*(float b, float2 a) { 531 | return make_float2(b * a.x, b * a.y); 532 | } 533 | inline __host__ __device__ void operator*=(float2 &a, float b) { 534 | a.x *= b; 535 | a.y *= b; 536 | } 537 | 538 | inline __host__ __device__ int2 operator*(int2 a, int2 b) { 539 | return make_int2(a.x * b.x, a.y * b.y); 540 | } 541 | inline __host__ __device__ void operator*=(int2 &a, int2 b) { 542 | a.x *= b.x; 543 | a.y *= b.y; 544 | } 545 | inline __host__ __device__ int2 operator*(int2 a, int b) { return make_int2(a.x * b, a.y * b); } 546 | inline __host__ __device__ int2 operator*(int b, int2 a) { return make_int2(b * a.x, b * a.y); } 547 | inline __host__ __device__ void operator*=(int2 &a, int b) { 548 | a.x *= b; 549 | a.y *= b; 550 | } 551 | 552 | inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) { 553 | return make_uint2(a.x * b.x, a.y * b.y); 554 | } 555 | inline __host__ __device__ void operator*=(uint2 &a, uint2 b) { 556 | a.x *= b.x; 557 | a.y *= b.y; 558 | } 559 | inline __host__ __device__ uint2 operator*(uint2 a, uint b) { return make_uint2(a.x * b, a.y * b); } 560 | inline __host__ __device__ uint2 operator*(uint b, uint2 a) { return make_uint2(b * a.x, b * a.y); } 561 | inline __host__ __device__ void operator*=(uint2 &a, uint b) { 562 | a.x *= b; 563 | a.y *= b; 564 | } 565 | 566 | inline __host__ __device__ float3 operator*(float3 a, float3 b) { 567 | return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); 568 | } 569 | inline __host__ __device__ void operator*=(float3 &a, float3 b) { 570 | a.x *= b.x; 571 | a.y *= b.y; 572 | a.z *= b.z; 573 | } 574 | inline __host__ __device__ float3 operator*(float3 a, float b) { 575 | return make_float3(a.x * b, a.y * b, a.z * b); 576 | } 577 | inline __host__ __device__ float3 operator*(float b, float3 a) { 578 | return make_float3(b * a.x, b * a.y, b * a.z); 579 | } 580 | inline __host__ __device__ void operator*=(float3 &a, float b) { 581 | a.x *= b; 582 | a.y *= b; 583 | a.z *= b; 584 | } 585 | 586 | inline __host__ __device__ int3 operator*(int3 a, int3 b) { 587 | return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); 588 | } 589 | inline __host__ __device__ void operator*=(int3 &a, int3 b) { 590 | a.x *= b.x; 591 | a.y *= b.y; 592 | a.z *= b.z; 593 | } 594 | inline __host__ __device__ int3 operator*(int3 a, int b) { 595 | return make_int3(a.x * b, a.y * b, a.z * b); 596 | } 597 | inline __host__ __device__ int3 operator*(int b, int3 a) { 598 | return 
make_int3(b * a.x, b * a.y, b * a.z); 599 | } 600 | inline __host__ __device__ void operator*=(int3 &a, int b) { 601 | a.x *= b; 602 | a.y *= b; 603 | a.z *= b; 604 | } 605 | 606 | inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) { 607 | return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); 608 | } 609 | inline __host__ __device__ void operator*=(uint3 &a, uint3 b) { 610 | a.x *= b.x; 611 | a.y *= b.y; 612 | a.z *= b.z; 613 | } 614 | inline __host__ __device__ uint3 operator*(uint3 a, uint b) { 615 | return make_uint3(a.x * b, a.y * b, a.z * b); 616 | } 617 | inline __host__ __device__ uint3 operator*(uint b, uint3 a) { 618 | return make_uint3(b * a.x, b * a.y, b * a.z); 619 | } 620 | inline __host__ __device__ void operator*=(uint3 &a, uint b) { 621 | a.x *= b; 622 | a.y *= b; 623 | a.z *= b; 624 | } 625 | 626 | inline __host__ __device__ float4 operator*(float4 a, float4 b) { 627 | return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 628 | } 629 | inline __host__ __device__ void operator*=(float4 &a, float4 b) { 630 | a.x *= b.x; 631 | a.y *= b.y; 632 | a.z *= b.z; 633 | a.w *= b.w; 634 | } 635 | inline __host__ __device__ float4 operator*(float4 a, float b) { 636 | return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); 637 | } 638 | inline __host__ __device__ float4 operator*(float b, float4 a) { 639 | return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); 640 | } 641 | inline __host__ __device__ void operator*=(float4 &a, float b) { 642 | a.x *= b; 643 | a.y *= b; 644 | a.z *= b; 645 | a.w *= b; 646 | } 647 | 648 | inline __host__ __device__ int4 operator*(int4 a, int4 b) { 649 | return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 650 | } 651 | inline __host__ __device__ void operator*=(int4 &a, int4 b) { 652 | a.x *= b.x; 653 | a.y *= b.y; 654 | a.z *= b.z; 655 | a.w *= b.w; 656 | } 657 | inline __host__ __device__ int4 operator*(int4 a, int b) { 658 | return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); 659 | } 660 | inline __host__ __device__ int4 operator*(int b, int4 a) { 661 | return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); 662 | } 663 | inline __host__ __device__ void operator*=(int4 &a, int b) { 664 | a.x *= b; 665 | a.y *= b; 666 | a.z *= b; 667 | a.w *= b; 668 | } 669 | 670 | inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) { 671 | return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 672 | } 673 | inline __host__ __device__ void operator*=(uint4 &a, uint4 b) { 674 | a.x *= b.x; 675 | a.y *= b.y; 676 | a.z *= b.z; 677 | a.w *= b.w; 678 | } 679 | inline __host__ __device__ uint4 operator*(uint4 a, uint b) { 680 | return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); 681 | } 682 | inline __host__ __device__ uint4 operator*(uint b, uint4 a) { 683 | return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); 684 | } 685 | inline __host__ __device__ void operator*=(uint4 &a, uint b) { 686 | a.x *= b; 687 | a.y *= b; 688 | a.z *= b; 689 | a.w *= b; 690 | } 691 | 692 | //////////////////////////////////////////////////////////////////////////////// 693 | // divide 694 | //////////////////////////////////////////////////////////////////////////////// 695 | 696 | inline __host__ __device__ float2 operator/(float2 a, float2 b) { 697 | return make_float2(a.x / b.x, a.y / b.y); 698 | } 699 | inline __host__ __device__ void operator/=(float2 &a, float2 b) { 700 | a.x /= b.x; 701 | a.y /= b.y; 702 | } 703 | inline __host__ __device__ float2 operator/(float2 a, float b) { 704 | return make_float2(a.x / b, a.y / b); 705 | } 706 | 
inline __host__ __device__ void operator/=(float2 &a, float b) { 707 | a.x /= b; 708 | a.y /= b; 709 | } 710 | inline __host__ __device__ float2 operator/(float b, float2 a) { 711 | return make_float2(b / a.x, b / a.y); 712 | } 713 | 714 | inline __host__ __device__ float3 operator/(float3 a, float3 b) { 715 | return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); 716 | } 717 | inline __host__ __device__ void operator/=(float3 &a, float3 b) { 718 | a.x /= b.x; 719 | a.y /= b.y; 720 | a.z /= b.z; 721 | } 722 | inline __host__ __device__ float3 operator/(float3 a, float b) { 723 | return make_float3(a.x / b, a.y / b, a.z / b); 724 | } 725 | inline __host__ __device__ void operator/=(float3 &a, float b) { 726 | a.x /= b; 727 | a.y /= b; 728 | a.z /= b; 729 | } 730 | inline __host__ __device__ float3 operator/(float b, float3 a) { 731 | return make_float3(b / a.x, b / a.y, b / a.z); 732 | } 733 | 734 | inline __host__ __device__ float4 operator/(float4 a, float4 b) { 735 | return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); 736 | } 737 | inline __host__ __device__ void operator/=(float4 &a, float4 b) { 738 | a.x /= b.x; 739 | a.y /= b.y; 740 | a.z /= b.z; 741 | a.w /= b.w; 742 | } 743 | inline __host__ __device__ float4 operator/(float4 a, float b) { 744 | return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); 745 | } 746 | inline __host__ __device__ void operator/=(float4 &a, float b) { 747 | a.x /= b; 748 | a.y /= b; 749 | a.z /= b; 750 | a.w /= b; 751 | } 752 | inline __host__ __device__ float4 operator/(float b, float4 a) { 753 | return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); 754 | } 755 | 756 | //////////////////////////////////////////////////////////////////////////////// 757 | // min 758 | //////////////////////////////////////////////////////////////////////////////// 759 | 760 | inline __host__ __device__ float2 fminf(float2 a, float2 b) { 761 | return make_float2(fminf(a.x, b.x), fminf(a.y, b.y)); 762 | } 763 | inline __host__ __device__ float3 fminf(float3 a, float3 b) { 764 | return make_float3(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z)); 765 | } 766 | inline __host__ __device__ float4 fminf(float4 a, float4 b) { 767 | return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); 768 | } 769 | 770 | inline __host__ __device__ int2 min(int2 a, int2 b) { 771 | return make_int2(min(a.x, b.x), min(a.y, b.y)); 772 | } 773 | inline __host__ __device__ int3 min(int3 a, int3 b) { 774 | return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); 775 | } 776 | inline __host__ __device__ int4 min(int4 a, int4 b) { 777 | return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); 778 | } 779 | 780 | inline __host__ __device__ uint2 min(uint2 a, uint2 b) { 781 | return make_uint2(min(a.x, b.x), min(a.y, b.y)); 782 | } 783 | inline __host__ __device__ uint3 min(uint3 a, uint3 b) { 784 | return make_uint3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); 785 | } 786 | inline __host__ __device__ uint4 min(uint4 a, uint4 b) { 787 | return make_uint4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); 788 | } 789 | 790 | //////////////////////////////////////////////////////////////////////////////// 791 | // max 792 | //////////////////////////////////////////////////////////////////////////////// 793 | 794 | inline __host__ __device__ float2 fmaxf(float2 a, float2 b) { 795 | return make_float2(fmaxf(a.x, b.x), fmaxf(a.y, b.y)); 796 | } 797 | inline __host__ __device__ float3 fmaxf(float3 a, float3 b) { 798 | return 
make_float3(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z)); 799 | } 800 | inline __host__ __device__ float4 fmaxf(float4 a, float4 b) { 801 | return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); 802 | } 803 | 804 | inline __host__ __device__ int2 max(int2 a, int2 b) { 805 | return make_int2(max(a.x, b.x), max(a.y, b.y)); 806 | } 807 | inline __host__ __device__ int3 max(int3 a, int3 b) { 808 | return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); 809 | } 810 | inline __host__ __device__ int4 max(int4 a, int4 b) { 811 | return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); 812 | } 813 | 814 | inline __host__ __device__ uint2 max(uint2 a, uint2 b) { 815 | return make_uint2(max(a.x, b.x), max(a.y, b.y)); 816 | } 817 | inline __host__ __device__ uint3 max(uint3 a, uint3 b) { 818 | return make_uint3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); 819 | } 820 | inline __host__ __device__ uint4 max(uint4 a, uint4 b) { 821 | return make_uint4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); 822 | } 823 | 824 | //////////////////////////////////////////////////////////////////////////////// 825 | // lerp 826 | // - linear interpolation between a and b, based on value t in [0, 1] range 827 | //////////////////////////////////////////////////////////////////////////////// 828 | 829 | inline __device__ __host__ float lerp(float a, float b, float t) { return a + t * (b - a); } 830 | inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) { return a + t * (b - a); } 831 | inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) { return a + t * (b - a); } 832 | inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) { return a + t * (b - a); } 833 | 834 | //////////////////////////////////////////////////////////////////////////////// 835 | // clamp 836 | // - clamp the value v to be in the range [a, b] 837 | //////////////////////////////////////////////////////////////////////////////// 838 | 839 | inline __device__ __host__ float clamp(float f, float a, float b) { return fmaxf(a, fminf(f, b)); } 840 | inline __device__ __host__ int clamp(int f, int a, int b) { return max(a, min(f, b)); } 841 | inline __device__ __host__ uint clamp(uint f, uint a, uint b) { return max(a, min(f, b)); } 842 | 843 | inline __device__ __host__ float2 clamp(float2 v, float a, float b) { 844 | return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); 845 | } 846 | inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) { 847 | return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 848 | } 849 | inline __device__ __host__ float3 clamp(float3 v, float a, float b) { 850 | return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 851 | } 852 | inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) { 853 | return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 854 | } 855 | inline __device__ __host__ float4 clamp(float4 v, float a, float b) { 856 | return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 857 | } 858 | inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) { 859 | return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), 860 | clamp(v.w, a.w, b.w)); 861 | } 862 | 863 | inline __device__ __host__ int2 clamp(int2 v, int a, int b) { 864 | return make_int2(clamp(v.x, a, b), clamp(v.y, a, b)); 865 | } 866 | inline __device__ __host__ int2 
clamp(int2 v, int2 a, int2 b) { 867 | return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 868 | } 869 | inline __device__ __host__ int3 clamp(int3 v, int a, int b) { 870 | return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 871 | } 872 | inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) { 873 | return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 874 | } 875 | inline __device__ __host__ int4 clamp(int4 v, int a, int b) { 876 | return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 877 | } 878 | inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) { 879 | return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), 880 | clamp(v.w, a.w, b.w)); 881 | } 882 | 883 | inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) { 884 | return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b)); 885 | } 886 | inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) { 887 | return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 888 | } 889 | inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) { 890 | return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 891 | } 892 | inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) { 893 | return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 894 | } 895 | inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) { 896 | return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 897 | } 898 | inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) { 899 | return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), 900 | clamp(v.w, a.w, b.w)); 901 | } 902 | 903 | //////////////////////////////////////////////////////////////////////////////// 904 | // dot product 905 | //////////////////////////////////////////////////////////////////////////////// 906 | 907 | inline __host__ __device__ float dot(float2 a, float2 b) { return a.x * b.x + a.y * b.y; } 908 | inline __host__ __device__ float dot(float3 a, float3 b) { 909 | return a.x * b.x + a.y * b.y + a.z * b.z; 910 | } 911 | inline __host__ __device__ float dot(float4 a, float4 b) { 912 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 913 | } 914 | 915 | inline __host__ __device__ int dot(int2 a, int2 b) { return a.x * b.x + a.y * b.y; } 916 | inline __host__ __device__ int dot(int3 a, int3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; } 917 | inline __host__ __device__ int dot(int4 a, int4 b) { 918 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 919 | } 920 | 921 | inline __host__ __device__ uint dot(uint2 a, uint2 b) { return a.x * b.x + a.y * b.y; } 922 | inline __host__ __device__ uint dot(uint3 a, uint3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; } 923 | inline __host__ __device__ uint dot(uint4 a, uint4 b) { 924 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 925 | } 926 | 927 | //////////////////////////////////////////////////////////////////////////////// 928 | // length 929 | //////////////////////////////////////////////////////////////////////////////// 930 | 931 | inline __host__ __device__ float length(float2 v) { return sqrtf(dot(v, v)); } 932 | inline __host__ __device__ float length(float3 v) { return sqrtf(dot(v, v)); } 933 | inline __host__ __device__ float length(float4 v) { return sqrtf(dot(v, v)); } 934 | 935 | 
//////////////////////////////////////////////////////////////////////////////// 936 | // normalize 937 | //////////////////////////////////////////////////////////////////////////////// 938 | 939 | inline __host__ __device__ float2 normalize(float2 v) { 940 | float invLen = rsqrtf(dot(v, v)); 941 | return v * invLen; 942 | } 943 | inline __host__ __device__ float3 normalize(float3 v) { 944 | float invLen = rsqrtf(dot(v, v)); 945 | return v * invLen; 946 | } 947 | inline __host__ __device__ float4 normalize(float4 v) { 948 | float invLen = rsqrtf(dot(v, v)); 949 | return v * invLen; 950 | } 951 | 952 | //////////////////////////////////////////////////////////////////////////////// 953 | // floor 954 | //////////////////////////////////////////////////////////////////////////////// 955 | 956 | inline __host__ __device__ float2 floorf(float2 v) { return make_float2(floorf(v.x), floorf(v.y)); } 957 | inline __host__ __device__ float3 floorf(float3 v) { 958 | return make_float3(floorf(v.x), floorf(v.y), floorf(v.z)); 959 | } 960 | inline __host__ __device__ float4 floorf(float4 v) { 961 | return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w)); 962 | } 963 | 964 | //////////////////////////////////////////////////////////////////////////////// 965 | // frac - returns the fractional portion of a scalar or each vector component 966 | //////////////////////////////////////////////////////////////////////////////// 967 | 968 | inline __host__ __device__ float fracf(float v) { return v - floorf(v); } 969 | inline __host__ __device__ float2 fracf(float2 v) { return make_float2(fracf(v.x), fracf(v.y)); } 970 | inline __host__ __device__ float3 fracf(float3 v) { 971 | return make_float3(fracf(v.x), fracf(v.y), fracf(v.z)); 972 | } 973 | inline __host__ __device__ float4 fracf(float4 v) { 974 | return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w)); 975 | } 976 | 977 | //////////////////////////////////////////////////////////////////////////////// 978 | // fmod 979 | //////////////////////////////////////////////////////////////////////////////// 980 | 981 | inline __host__ __device__ float2 fmodf(float2 a, float2 b) { 982 | return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y)); 983 | } 984 | inline __host__ __device__ float3 fmodf(float3 a, float3 b) { 985 | return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z)); 986 | } 987 | inline __host__ __device__ float4 fmodf(float4 a, float4 b) { 988 | return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w)); 989 | } 990 | 991 | //////////////////////////////////////////////////////////////////////////////// 992 | // absolute value 993 | //////////////////////////////////////////////////////////////////////////////// 994 | 995 | inline __host__ __device__ float2 fabs(float2 v) { return make_float2(fabs(v.x), fabs(v.y)); } 996 | inline __host__ __device__ float3 fabs(float3 v) { 997 | return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); 998 | } 999 | inline __host__ __device__ float4 fabs(float4 v) { 1000 | return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w)); 1001 | } 1002 | 1003 | inline __host__ __device__ int2 abs(int2 v) { return make_int2(abs(v.x), abs(v.y)); } 1004 | inline __host__ __device__ int3 abs(int3 v) { return make_int3(abs(v.x), abs(v.y), abs(v.z)); } 1005 | inline __host__ __device__ int4 abs(int4 v) { 1006 | return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w)); 1007 | } 1008 | 1009 | 
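//
// Note on fracf above: it computes v - floorf(v), so results always lie in
// [0, 1), including for negative inputs -- e.g. fracf(-0.25f) == 0.75f --
// unlike the truncation-based fmodf(v, 1.0f), which would return -0.25f.
//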
//////////////////////////////////////////////////////////////////////////////// 1010 | // reflect 1011 | // - returns reflection of incident ray I around surface normal N 1012 | // - N should be normalized, reflected vector's length is equal to length of I 1013 | //////////////////////////////////////////////////////////////////////////////// 1014 | 1015 | inline __host__ __device__ float3 reflect(float3 i, float3 n) { return i - 2.0f * n * dot(n, i); } 1016 | 1017 | //////////////////////////////////////////////////////////////////////////////// 1018 | // cross product 1019 | //////////////////////////////////////////////////////////////////////////////// 1020 | 1021 | inline __host__ __device__ float3 cross(float3 a, float3 b) { 1022 | return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); 1023 | } 1024 | 1025 | //////////////////////////////////////////////////////////////////////////////// 1026 | // smoothstep 1027 | // - returns 0 if x < a 1028 | // - returns 1 if x > b 1029 | // - otherwise returns smooth interpolation between 0 and 1 based on x 1030 | //////////////////////////////////////////////////////////////////////////////// 1031 | 1032 | inline __device__ __host__ float smoothstep(float a, float b, float x) { 1033 | float y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1034 | return (y * y * (3.0f - (2.0f * y))); 1035 | } 1036 | inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) { 1037 | float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1038 | return (y * y * (make_float2(3.0f) - (make_float2(2.0f) * y))); 1039 | } 1040 | inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) { 1041 | float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1042 | return (y * y * (make_float3(3.0f) - (make_float3(2.0f) * y))); 1043 | } 1044 | inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) { 1045 | float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1046 | return (y * y * (make_float4(3.0f) - (make_float4(2.0f) * y))); 1047 | } 1048 | 1049 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/math.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "helper_math.h" 3 | 4 | struct mat33 { 5 | float3 cols[3]; 6 | 7 | __host__ __device__ mat33() {} 8 | __host__ __device__ mat33(const float3 &c0, 9 | const float3 &c1, 10 | const float3 &c2) { 11 | cols[0] = c0; 12 | cols[1] = c1; 13 | cols[2] = c2; 14 | } 15 | __host__ __device__ mat33(const float *data) { 16 | cols[0] = make_float3(data[0], data[1], data[2]); 17 | cols[1] = make_float3(data[3], data[4], data[5]); 18 | cols[2] = make_float3(data[6], data[7], data[8]); 19 | } 20 | 21 | __host__ __device__ static mat33 identity() { 22 | return mat33(make_float3(1, 0, 0), 23 | make_float3(0, 1, 0), 24 | make_float3(0, 0, 1)); 25 | } 26 | 27 | __host__ __device__ static mat33 skew_symmetric(const float3 &v) { 28 | return mat33(make_float3(0, v.z, -v.y), 29 | make_float3(-v.z, 0, v.x), 30 | make_float3(v.y, -v.x, 0)); 31 | } 32 | 33 | __host__ __device__ mat33 transpose() const { 34 | float3 c0 = cols[0]; 35 | float3 c1 = cols[1]; 36 | float3 c2 = cols[2]; 37 | return mat33(make_float3(c0.x, c1.x, c2.x), 38 | make_float3(c0.y, c1.y, c2.y), 39 | make_float3(c0.z, c1.z, c2.z)); 40 | } 41 | 42 | __host__ __device__ float &operator[](int i) { 43 | float3 &col = cols[i / 3]; 44 | return (&col.x)[i % 3]; 45 | } 46 | 47 | __host__ __device__ const mat33 
operator+(const mat33 &m) const { 48 | float3 c0 = cols[0] + m.cols[0]; 49 | float3 c1 = cols[1] + m.cols[1]; 50 | float3 c2 = cols[2] + m.cols[2]; 51 | return mat33(c0, c1, c2); 52 | } 53 | 54 | __host__ __device__ const mat33 operator*(const mat33 &m) const { 55 | float3 c0 = cols[0]; 56 | float3 c1 = cols[1]; 57 | float3 c2 = cols[2]; 58 | float3 m0 = m.cols[0]; 59 | float3 m1 = m.cols[1]; 60 | float3 m2 = m.cols[2]; 61 | 62 | float3 n0 = make_float3(c0.x * m0.x + c1.x * m0.y + c2.x * m0.z, 63 | c0.y * m0.x + c1.y * m0.y + c2.y * m0.z, 64 | c0.z * m0.x + c1.z * m0.y + c2.z * m0.z); 65 | float3 n1 = make_float3(c0.x * m1.x + c1.x * m1.y + c2.x * m1.z, 66 | c0.y * m1.x + c1.y * m1.y + c2.y * m1.z, 67 | c0.z * m1.x + c1.z * m1.y + c2.z * m1.z); 68 | float3 n2 = make_float3(c0.x * m2.x + c1.x * m2.y + c2.x * m2.z, 69 | c0.y * m2.x + c1.y * m2.y + c2.y * m2.z, 70 | c0.z * m2.x + c1.z * m2.y + c2.z * m2.z); 71 | return mat33(n0, n1, n2); 72 | } 73 | 74 | __host__ __device__ const mat33 operator*(const float &s) const { 75 | float3 c0 = cols[0]; 76 | float3 c1 = cols[1]; 77 | float3 c2 = cols[2]; 78 | return mat33(c0 * s, c1 * s, c2 * s); 79 | } 80 | 81 | __host__ __device__ const float3 operator*(const float3 &v) const { 82 | float3 c0 = cols[0]; 83 | float3 c1 = cols[1]; 84 | float3 c2 = cols[2]; 85 | return make_float3(c0.x * v.x + c1.x * v.y + c2.x * v.z, 86 | c0.y * v.x + c1.y * v.y + c2.y * v.z, 87 | c0.z * v.x + c1.z * v.y + c2.z * v.z); 88 | } 89 | 90 | __host__ __device__ const mat33 operator-() const { 91 | float3 c0 = cols[0]; 92 | float3 c1 = cols[1]; 93 | float3 c2 = cols[2]; 94 | return mat33(-c0, -c1, -c2); 95 | } 96 | 97 | friend __host__ __device__ mat33 operator*(const float &s, const mat33 &m) { 98 | return m * s; 99 | } 100 | }; 101 | 102 | 103 | 104 | struct mat34 { 105 | float3 cols[4]; 106 | __host__ __device__ mat34() {} 107 | __host__ __device__ mat34(const float3 &c0, 108 | const float3 &c1, 109 | const float3 &c2, 110 | const float3 &c3) { 111 | cols[0] = c0; 112 | cols[1] = c1; 113 | cols[2] = c2; 114 | cols[3] = c3; 115 | } 116 | __host__ __device__ mat34(const float *data) { 117 | cols[0] = make_float3(data[0], data[1], data[2]); 118 | cols[1] = make_float3(data[3], data[4], data[5]); 119 | cols[2] = make_float3(data[6], data[7], data[8]); 120 | cols[3] = make_float3(data[9], data[10], data[11]); 121 | } 122 | __host__ __device__ mat34(const mat33 &m, const float3 &v) { 123 | cols[0] = m.cols[0]; 124 | cols[1] = m.cols[1]; 125 | cols[2] = m.cols[2]; 126 | cols[3] = v; 127 | } 128 | 129 | __host__ __device__ float &operator[](int i) { 130 | float3 &col = cols[i / 3]; 131 | return (&col.x)[i % 3]; 132 | } 133 | 134 | __host__ __device__ const mat34 operator+(const mat34 &m) const { 135 | float3 c0 = cols[0] + m.cols[0]; 136 | float3 c1 = cols[1] + m.cols[1]; 137 | float3 c2 = cols[2] + m.cols[2]; 138 | float3 c3 = cols[3] + m.cols[3]; 139 | return mat34(c0, c1, c2, c3); 140 | } 141 | }; 142 | 143 | struct mat44 { 144 | float4 cols[4]; 145 | __host__ __device__ mat44() {} 146 | __host__ __device__ mat44(const float4 &c0, const float4 &c1, const float4 &c2, const float4 &c3) { 147 | cols[0] = c0; cols[1] = c1; cols[2] = c2; cols[3] = c3; 148 | } 149 | __host__ __device__ mat44(const float *data) { 150 | cols[0] = make_float4(data[0], data[1], data[2], data[3]); 151 | cols[1] = make_float4(data[4], data[5], data[6], data[7]); 152 | cols[2] = make_float4(data[8], data[9], data[10], data[11]); 153 | cols[3] = make_float4(data[12], data[13], data[14], data[15]); 154 | }
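// Storage is column-major, matching mat33: cols[j] holds column j of the
// matrix, and operator[] (further below) indexes linearly down the columns.
// A small illustration, assuming a mat44 M built from a 16-float
// column-major array `data`:
//
//   mat44 M(data);
//   float m01 = M[4];  // cols[4 / 4 = 1], component 4 % 4 = 0: row 0, column 1
//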
155 | __host__ __device__ mat44(const mat33 &m, const float3 &v) { 156 | cols[0] = make_float4(m.cols[0], 0); 157 | cols[1] = make_float4(m.cols[1], 0); 158 | cols[2] = make_float4(m.cols[2], 0); 159 | cols[3] = make_float4(v, 1); 160 | } 161 | __host__ __device__ mat44(const mat34 &m) { 162 | cols[0] = make_float4(m.cols[0], 0); 163 | cols[1] = make_float4(m.cols[1], 0); 164 | cols[2] = make_float4(m.cols[2], 0); 165 | cols[3] = make_float4(m.cols[3], 1); 166 | } 167 | 168 | __host__ __device__ float &operator[](int i) { 169 | float4 &col = cols[i / 4]; 170 | return (&col.x)[i % 4]; 171 | } 172 | 173 | __host__ __device__ mat44 operator+(const mat44 &m) const { 174 | float4 c0 = cols[0] + m.cols[0]; 175 | float4 c1 = cols[1] + m.cols[1]; 176 | float4 c2 = cols[2] + m.cols[2]; 177 | float4 c3 = cols[3] + m.cols[3]; 178 | return mat44(c0, c1, c2, c3); 179 | } 180 | 181 | __host__ __device__ mat44 operator*(const mat44 &m) const { 182 | float4 c0 = cols[0]; 183 | float4 c1 = cols[1]; 184 | float4 c2 = cols[2]; 185 | float4 c3 = cols[3]; 186 | float4 m0 = m.cols[0]; 187 | float4 m1 = m.cols[1]; 188 | float4 m2 = m.cols[2]; 189 | float4 m3 = m.cols[3]; 190 | 191 | float4 n0 = make_float4(c0.x * m0.x + c1.x * m0.y + c2.x * m0.z + c3.x * m0.w, 192 | c0.y * m0.x + c1.y * m0.y + c2.y * m0.z + c3.y * m0.w, 193 | c0.z * m0.x + c1.z * m0.y + c2.z * m0.z + c3.z * m0.w, 194 | c0.w * m0.x + c1.w * m0.y + c2.w * m0.z + c3.w * m0.w); 195 | float4 n1 = make_float4(c0.x * m1.x + c1.x * m1.y + c2.x * m1.z + c3.x * m1.w, 196 | c0.y * m1.x + c1.y * m1.y + c2.y * m1.z + c3.y * m1.w, 197 | c0.z * m1.x + c1.z * m1.y + c2.z * m1.z + c3.z * m1.w, 198 | c0.w * m1.x + c1.w * m1.y + c2.w * m1.z + c3.w * m1.w); 199 | float4 n2 = make_float4(c0.x * m2.x + c1.x * m2.y + c2.x * m2.z + c3.x * m2.w, 200 | c0.y * m2.x + c1.y * m2.y + c2.y * m2.z + c3.y * m2.w, 201 | c0.z * m2.x + c1.z * m2.y + c2.z * m2.z + c3.z * m2.w, 202 | c0.w * m2.x + c1.w * m2.y + c2.w * m2.z + c3.w * m2.w); 203 | float4 n3 = make_float4(c0.x * m3.x + c1.x * m3.y + c2.x * m3.z + c3.x * m3.w, 204 | c0.y * m3.x + c1.y * m3.y + c2.y * m3.z + c3.y * m3.w, 205 | c0.z * m3.x + c1.z * m3.y + c2.z * m3.z + c3.z * m3.w, 206 | c0.w * m3.x + c1.w * m3.y + c2.w * m3.z + c3.w * m3.w); 207 | return mat44(n0, n1, n2, n3); 208 | 209 | } 210 | 211 | }; 212 | 213 | __forceinline__ __host__ __device__ float norm(const float3 &v) { 214 | return length(v); 215 | } 216 | 217 | struct SO3 { 218 | mat33 data_; 219 | __host__ __device__ SO3() {} 220 | __host__ __device__ SO3(const float3 &theta) { 221 | data_ = SO3::Exp(theta).data(); 222 | } 223 | __host__ __device__ SO3(const mat33 &data) { 224 | data_ = data; 225 | } 226 | __host__ __device__ mat33 data() const { 227 | return data_; 228 | } 229 | 230 | __host__ __device__ mat33 static hat(const float3 &theta) { 231 | return mat33::skew_symmetric(theta); 232 | } 233 | 234 | __host__ __device__ SO3 static Exp(const float3 &theta) { 235 | mat33 W = SO3::hat(theta); 236 | mat33 W2 = W * W; 237 | float angle = norm(theta); 238 | mat33 I = mat33::identity(); 239 | if (angle < 1e-5) { 240 | return SO3(I + W + 0.5f * W2); 241 | } 242 | else { 243 | return SO3(I + sin(angle) / angle * W + ((1 - cos(angle)) / (angle * angle)) * W2); 244 | } 245 | } 246 | __host__ __device__ float3 operator*(const float3 &v) const { 247 | return data_ * v; 248 | } 249 | 250 | __host__ __device__ SO3 operator*(const SO3 &R) const { 251 | return SO3(data_ * R.data_); 252 | } 253 | 254 | __host__ __device__ SO3 inverse() const { 255 | return 
SO3(data_.transpose()); 256 | } 257 | }; 258 | 259 | struct SE3 { 260 | SO3 R_data_; 261 | float3 t_data_; 262 | 263 | __host__ __device__ SE3() {} 264 | __host__ __device__ SE3(const float3 &rho, const float3 &theta) { 265 | SE3 T = SE3::Exp(rho, theta); 266 | t_data_ = T.t(); 267 | R_data_ = T.R(); 268 | } 269 | 270 | __host__ __device__ SE3(const float3 &t, const SO3 &R) { 271 | t_data_ = t; 272 | R_data_ = R; 273 | } 274 | 275 | __host__ __device__ SE3(const float *data) { 276 | mat44 T(data); 277 | t_data_ = make_float3(T.cols[3]); 278 | R_data_ = SO3(mat33( 279 | make_float3(T.cols[0]), make_float3(T.cols[1]), make_float3(T.cols[2])) 280 | ); 281 | } 282 | 283 | __host__ __device__ SE3(const mat44 &data) { 284 | t_data_ = make_float3(data.cols[3]); 285 | R_data_ = SO3(mat33( 286 | make_float3(data.cols[0]), make_float3(data.cols[1]), make_float3(data.cols[2])) 287 | ); 288 | } 289 | 290 | __host__ __device__ SO3 R() const { 291 | return R_data_; 292 | } 293 | 294 | __host__ __device__ float3 t() const { 295 | return t_data_; 296 | } 297 | 298 | __host__ __device__ mat44 data() const { 299 | return mat44(R_data_.data(), t_data_); 300 | } 301 | 302 | __host__ __device__ static mat44 hat(const float3 &rho, const float3 &theta) { 303 | mat33 W = SO3::hat(theta); 304 | mat44 T(W, rho); 305 | T.cols[3].w = 0; 306 | return T; 307 | } 308 | 309 | __host__ __device__ static SE3 Exp(const float3 &rho, const float3 &theta) { 310 | mat33 W = SO3::hat(theta); 311 | mat33 W2 = W * W; 312 | SO3 R = SO3::Exp(theta); 313 | float angle = norm(theta); 314 | mat33 I = mat33::identity(); 315 | mat33 V; 316 | if (angle < 1e-5) { 317 | V = I + 0.5f * W + 1.f / 6.f * W2; 318 | } 319 | else { 320 | V = I + W * ((1 - cos(angle)) / (angle * angle)) 321 | + W2 * ((angle - sin(angle)) / (angle * angle * angle)); 322 | } 323 | float3 t = V * rho; 324 | return SE3(t, R); 325 | } 326 | 327 | __host__ __device__ float3 operator*(const float3 &v) const { 328 | return R_data_ * v + t_data_; 329 | } 330 | 331 | __host__ __device__ SE3 operator*(const SE3 &T) const { 332 | return SE3(t_data_ + R_data_ * T.t_data_, R_data_ * T.R_data_); 333 | } 334 | 335 | __host__ __device__ SE3 inverse() const { 336 | SO3 R_inv = R_data_.inverse(); 337 | float3 t = R_inv * t_data_; 338 | return SE3(-t, R_inv); 339 | } 340 | }; -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #ifndef CUDA_RASTERIZER_H_INCLUDED 13 | #define CUDA_RASTERIZER_H_INCLUDED 14 | 15 | #include <vector> 16 | #include <functional> 17 | 18 | namespace CudaRasterizer 19 | { 20 | class Rasterizer 21 | { 22 | public: 23 | 24 | static void markVisible( 25 | int P, 26 | float* means3D, 27 | float* viewmatrix, 28 | float* projmatrix, 29 | bool* present); 30 | 31 | static int forward( 32 | std::function<char* (size_t)> geometryBuffer, 33 | std::function<char* (size_t)> binningBuffer, 34 | std::function<char* (size_t)> imageBuffer, 35 | const int P, int D, int M, 36 | const float* background, 37 | const int width, int height, 38 | const float* means3D, 39 | const float* shs, 40 | const float* colors_precomp, 41 | const float* opacities, 42 | const float* scales, 43 | const float scale_modifier, 44 | const float* rotations, 45 | const float* cov3D_precomp, 46 | const float* viewmatrix, 47 | const float* projmatrix, 48 | const float* cam_pos, 49 | const float tan_fovx, float tan_fovy, 50 | const bool prefiltered, 51 | float* out_color, 52 | float* out_depth, 53 | float* out_opacity, 54 | int* radii = nullptr, 55 | int* n_touched = nullptr, 56 | bool debug = false); 57 | 58 | static void backward( 59 | const int P, int D, int M, int R, 60 | const float* background, 61 | const int width, int height, 62 | const float* means3D, 63 | const float* shs, 64 | const float* colors_precomp, 65 | const float* scales, 66 | const float scale_modifier, 67 | const float* rotations, 68 | const float* cov3D_precomp, 69 | const float* viewmatrix, 70 | const float* projmatrix, 71 | const float* projmatrix_raw, 72 | const float* campos, 73 | const float tan_fovx, float tan_fovy, 74 | const int* radii, 75 | char* geom_buffer, 76 | char* binning_buffer, 77 | char* image_buffer, 78 | const float* dL_dpix, 79 | const float* dL_dpix_depth, 80 | float* dL_dmean2D, 81 | float* dL_dconic, 82 | float* dL_dopacity, 83 | float* dL_dcolor, 84 | float* dL_ddepths, 85 | float* dL_dmean3D, 86 | float* dL_dcov3D, 87 | float* dL_dsh, 88 | float* dL_dscale, 89 | float* dL_drot, 90 | float* dL_dtau, 91 | bool debug); 92 | }; 93 | }; 94 | 95 | #endif -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer_impl.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include "rasterizer_impl.h" 13 | #include <iostream> 14 | #include <fstream> 15 | #include <algorithm> 16 | #include <numeric> 17 | #include <cuda.h> 18 | #include "cuda_runtime.h" 19 | #include "device_launch_parameters.h" 20 | #include <cub/cub.cuh> 21 | #include <cub/device/device_radix_sort.cuh> 22 | #define GLM_FORCE_CUDA 23 | #include <glm/glm.hpp> 24 | 25 | #include <cooperative_groups.h> 26 | #include <cooperative_groups/reduce.h> 27 | namespace cg = cooperative_groups; 28 | 29 | #include "auxiliary.h" 30 | #include "forward.h" 31 | #include "backward.h" 32 | 33 | // Helper function to find the next-highest bit of the MSB 34 | // on the CPU. 35 | uint32_t getHigherMsb(uint32_t n) 36 | { 37 | uint32_t msb = sizeof(n) * 4; 38 | uint32_t step = msb; 39 | while (step > 1) 40 | { 41 | step /= 2; 42 | if (n >> msb) 43 | msb += step; 44 | else 45 | msb -= step; 46 | } 47 | if (n >> msb) 48 | msb++; 49 | return msb; 50 | } 51 | 52 | // Wrapper method to call auxiliary coarse frustum containment test.
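// (The test itself is in_frustum() from auxiliary.h; p_view receives the
// candidate point in view space as a by-reference output.)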
53 | // Mark all Gaussians that pass it. 54 | __global__ void checkFrustum(int P, 55 | const float* orig_points, 56 | const float* viewmatrix, 57 | const float* projmatrix, 58 | bool* present) 59 | { 60 | auto idx = cg::this_grid().thread_rank(); 61 | if (idx >= P) 62 | return; 63 | 64 | float3 p_view; 65 | present[idx] = in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view); 66 | } 67 | 68 | // Generates one key/value pair for all Gaussian / tile overlaps. 69 | // Run once per Gaussian (1:N mapping). 70 | __global__ void duplicateWithKeys( 71 | int P, 72 | const float2* points_xy, 73 | const float* depths, 74 | const uint32_t* offsets, 75 | uint64_t* gaussian_keys_unsorted, 76 | uint32_t* gaussian_values_unsorted, 77 | int* radii, 78 | dim3 grid) 79 | { 80 | auto idx = cg::this_grid().thread_rank(); 81 | if (idx >= P) 82 | return; 83 | 84 | // Generate no key/value pair for invisible Gaussians 85 | if (radii[idx] > 0) 86 | { 87 | // Find this Gaussian's offset in buffer for writing keys/values. 88 | uint32_t off = (idx == 0) ? 0 : offsets[idx - 1]; 89 | uint2 rect_min, rect_max; 90 | 91 | getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid); 92 | 93 | // For each tile that the bounding rect overlaps, emit a 94 | // key/value pair. The key is | tile ID | depth |, 95 | // and the value is the ID of the Gaussian. Sorting the values 96 | // with this key yields Gaussian IDs in a list, such that they 97 | // are first sorted by tile and then by depth. 98 | for (int y = rect_min.y; y < rect_max.y; y++) 99 | { 100 | for (int x = rect_min.x; x < rect_max.x; x++) 101 | { 102 | uint64_t key = y * grid.x + x; 103 | key <<= 32; 104 | key |= *((uint32_t*)&depths[idx]); 105 | gaussian_keys_unsorted[off] = key; 106 | gaussian_values_unsorted[off] = idx; 107 | off++; 108 | } 109 | } 110 | } 111 | } 112 | 113 | // Check keys to see if it is at the start/end of one tile's range in 114 | // the full sorted list. If yes, write start/end of this tile. 115 | // Run once per instanced (duplicated) Gaussian ID. 116 | __global__ void identifyTileRanges(int L, uint64_t* point_list_keys, uint2* ranges) 117 | { 118 | auto idx = cg::this_grid().thread_rank(); 119 | if (idx >= L) 120 | return; 121 | 122 | // Read tile ID from key. Update start/end of tile range if at limit. 
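// A worked example of the logic below: for sorted tile IDs [0, 0, 1, 1, 1, 3]
// (L = 6), this writes ranges[0] = {0, 2}, ranges[1] = {2, 5} and
// ranges[3] = {5, 6}; tile 2, which no key mentions, keeps the zeroed
// {0, 0} range set by the cudaMemset in forward().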
123 | uint64_t key = point_list_keys[idx]; 124 | uint32_t currtile = key >> 32; 125 | if (idx == 0) 126 | ranges[currtile].x = 0; 127 | else 128 | { 129 | uint32_t prevtile = point_list_keys[idx - 1] >> 32; 130 | if (currtile != prevtile) 131 | { 132 | ranges[prevtile].y = idx; 133 | ranges[currtile].x = idx; 134 | } 135 | } 136 | if (idx == L - 1) 137 | ranges[currtile].y = L; 138 | } 139 | 140 | // Mark Gaussians as visible/invisible, based on view frustum testing 141 | void CudaRasterizer::Rasterizer::markVisible( 142 | int P, 143 | float* means3D, 144 | float* viewmatrix, 145 | float* projmatrix, 146 | bool* present) 147 | { 148 | checkFrustum << <(P + 255) / 256, 256 >> > ( 149 | P, 150 | means3D, 151 | viewmatrix, projmatrix, 152 | present); 153 | } 154 | 155 | CudaRasterizer::GeometryState CudaRasterizer::GeometryState::fromChunk(char*& chunk, size_t P) 156 | { 157 | GeometryState geom; 158 | obtain(chunk, geom.depths, P, 128); 159 | obtain(chunk, geom.clamped, P * 3, 128); 160 | obtain(chunk, geom.internal_radii, P, 128); 161 | obtain(chunk, geom.means2D, P, 128); 162 | obtain(chunk, geom.cov3D, P * 6, 128); 163 | obtain(chunk, geom.conic_opacity, P, 128); 164 | obtain(chunk, geom.rgb, P * 3, 128); 165 | obtain(chunk, geom.tiles_touched, P, 128); 166 | cub::DeviceScan::InclusiveSum(nullptr, geom.scan_size, geom.tiles_touched, geom.tiles_touched, P); 167 | obtain(chunk, geom.scanning_space, geom.scan_size, 128); 168 | obtain(chunk, geom.point_offsets, P, 128); 169 | return geom; 170 | } 171 | 172 | CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N) 173 | { 174 | ImageState img; 175 | obtain(chunk, img.accum_alpha, N, 128); 176 | obtain(chunk, img.n_contrib, N, 128); 177 | obtain(chunk, img.ranges, N, 128); 178 | return img; 179 | } 180 | 181 | CudaRasterizer::BinningState CudaRasterizer::BinningState::fromChunk(char*& chunk, size_t P) 182 | { 183 | BinningState binning; 184 | obtain(chunk, binning.point_list, P, 128); 185 | obtain(chunk, binning.point_list_unsorted, P, 128); 186 | obtain(chunk, binning.point_list_keys, P, 128); 187 | obtain(chunk, binning.point_list_keys_unsorted, P, 128); 188 | cub::DeviceRadixSort::SortPairs( 189 | nullptr, binning.sorting_size, 190 | binning.point_list_keys_unsorted, binning.point_list_keys, 191 | binning.point_list_unsorted, binning.point_list, P); 192 | obtain(chunk, binning.list_sorting_space, binning.sorting_size, 128); 193 | return binning; 194 | } 195 | 196 | // Forward rendering procedure for differentiable rasterization 197 | // of Gaussians. 
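// The three std::function parameters are resize callbacks: handed a byte
// count, each must return a pointer to at least that much device memory
// (resizeFunctional() in rasterize_points.cu builds them on top of torch
// tensors). A minimal non-torch sketch, assuming the caller owns a raw
// CUDA allocation `buf`:
//
//   auto grow = [&](size_t N) { cudaFree(buf); cudaMalloc(&buf, N); return (char*)buf; };
//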
198 | int CudaRasterizer::Rasterizer::forward( 199 | std::function<char* (size_t)> geometryBuffer, 200 | std::function<char* (size_t)> binningBuffer, 201 | std::function<char* (size_t)> imageBuffer, 202 | const int P, int D, int M, 203 | const float* background, 204 | const int width, int height, 205 | const float* means3D, 206 | const float* shs, 207 | const float* colors_precomp, 208 | const float* opacities, 209 | const float* scales, 210 | const float scale_modifier, 211 | const float* rotations, 212 | const float* cov3D_precomp, 213 | const float* viewmatrix, 214 | const float* projmatrix, 215 | const float* cam_pos, 216 | const float tan_fovx, float tan_fovy, 217 | const bool prefiltered, 218 | float* out_color, 219 | float* out_depth, 220 | float* out_opacity, 221 | int* radii, 222 | int* n_touched, 223 | bool debug) 224 | { 225 | const float focal_y = height / (2.0f * tan_fovy); 226 | const float focal_x = width / (2.0f * tan_fovx); 227 | 228 | size_t chunk_size = required<GeometryState>(P); 229 | char* chunkptr = geometryBuffer(chunk_size); 230 | GeometryState geomState = GeometryState::fromChunk(chunkptr, P); 231 | 232 | if (radii == nullptr) 233 | { 234 | radii = geomState.internal_radii; 235 | } 236 | 237 | dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); 238 | dim3 block(BLOCK_X, BLOCK_Y, 1); 239 | 240 | // Dynamically resize image-based auxiliary buffers during training 241 | size_t img_chunk_size = required<ImageState>(width * height); 242 | char* img_chunkptr = imageBuffer(img_chunk_size); 243 | ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height); 244 | 245 | if (NUM_CHANNELS != 3 && colors_precomp == nullptr) 246 | { 247 | throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!"); 248 | } 249 | 250 | // Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB) 251 | CHECK_CUDA(FORWARD::preprocess( 252 | P, D, M, 253 | means3D, 254 | (glm::vec3*)scales, 255 | scale_modifier, 256 | (glm::vec4*)rotations, 257 | opacities, 258 | shs, 259 | geomState.clamped, 260 | cov3D_precomp, 261 | colors_precomp, 262 | viewmatrix, projmatrix, 263 | (glm::vec3*)cam_pos, 264 | width, height, 265 | focal_x, focal_y, 266 | tan_fovx, tan_fovy, 267 | radii, 268 | geomState.means2D, 269 | geomState.depths, 270 | geomState.cov3D, 271 | geomState.rgb, 272 | geomState.conic_opacity, 273 | tile_grid, 274 | geomState.tiles_touched, 275 | prefiltered 276 | ), debug) 277 | 278 | // Compute prefix sum over full list of touched tile counts by Gaussians 279 | // E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8] 280 | CHECK_CUDA(cub::DeviceScan::InclusiveSum(geomState.scanning_space, geomState.scan_size, geomState.tiles_touched, geomState.point_offsets, P), debug) 281 | 282 | // Retrieve total number of Gaussian instances to launch and resize aux buffers 283 | int num_rendered; 284 | CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1, sizeof(int), cudaMemcpyDeviceToHost), debug); 285 | 286 | size_t binning_chunk_size = required<BinningState>(num_rendered); 287 | char* binning_chunkptr = binningBuffer(binning_chunk_size); 288 | BinningState binningState = BinningState::fromChunk(binning_chunkptr, num_rendered); 289 | 290 | // For each instance to be rendered, produce adequate [ tile | depth ] key 291 | // and corresponding duplicated Gaussian indices to be sorted 292 | duplicateWithKeys << <(P + 255) / 256, 256 >> > ( 293 | P, 294 | geomState.means2D, 295 | geomState.depths, 296 | geomState.point_offsets, 297 | binningState.point_list_keys_unsorted, 298 |
binningState.point_list_unsorted, 299 | radii, 300 | tile_grid) 301 | CHECK_CUDA(, debug) 302 | 303 | int bit = getHigherMsb(tile_grid.x * tile_grid.y); 304 | 305 | // Sort complete list of (duplicated) Gaussian indices by keys 306 | CHECK_CUDA(cub::DeviceRadixSort::SortPairs( 307 | binningState.list_sorting_space, 308 | binningState.sorting_size, 309 | binningState.point_list_keys_unsorted, binningState.point_list_keys, 310 | binningState.point_list_unsorted, binningState.point_list, 311 | num_rendered, 0, 32 + bit), debug) 312 | 313 | CHECK_CUDA(cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)), debug); 314 | 315 | // Identify start and end of per-tile workloads in sorted list 316 | if (num_rendered > 0) 317 | identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > ( 318 | num_rendered, 319 | binningState.point_list_keys, 320 | imgState.ranges); 321 | CHECK_CUDA(, debug) 322 | 323 | // Let each tile blend its range of Gaussians independently in parallel 324 | const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : geomState.rgb; 325 | CHECK_CUDA(FORWARD::render( 326 | tile_grid, block, 327 | imgState.ranges, 328 | binningState.point_list, 329 | width, height, 330 | geomState.means2D, 331 | feature_ptr, 332 | geomState.conic_opacity, 333 | imgState.accum_alpha, 334 | imgState.n_contrib, 335 | background, 336 | out_color, 337 | geomState.depths, 338 | out_depth, 339 | out_opacity, 340 | n_touched 341 | ), debug) 342 | 343 | return num_rendered; 344 | } 345 | 346 | // Produce necessary gradients for optimization, corresponding 347 | // to forward render pass 348 | void CudaRasterizer::Rasterizer::backward( 349 | const int P, int D, int M, int R, 350 | const float* background, 351 | const int width, int height, 352 | const float* means3D, 353 | const float* shs, 354 | const float* colors_precomp, 355 | const float* scales, 356 | const float scale_modifier, 357 | const float* rotations, 358 | const float* cov3D_precomp, 359 | const float* viewmatrix, 360 | const float* projmatrix, 361 | const float* projmatrix_raw, 362 | const float* campos, 363 | const float tan_fovx, float tan_fovy, 364 | const int* radii, 365 | char* geom_buffer, 366 | char* binning_buffer, 367 | char* img_buffer, 368 | const float* dL_dpix, 369 | const float* dL_dpix_depth, 370 | float* dL_dmean2D, 371 | float* dL_dconic, 372 | float* dL_dopacity, 373 | float* dL_dcolor, 374 | float* dL_ddepth, 375 | float* dL_dmean3D, 376 | float* dL_dcov3D, 377 | float* dL_dsh, 378 | float* dL_dscale, 379 | float* dL_drot, 380 | float* dL_dtau, 381 | bool debug) 382 | { 383 | GeometryState geomState = GeometryState::fromChunk(geom_buffer, P); 384 | BinningState binningState = BinningState::fromChunk(binning_buffer, R); 385 | ImageState imgState = ImageState::fromChunk(img_buffer, width * height); 386 | 387 | if (radii == nullptr) 388 | { 389 | radii = geomState.internal_radii; 390 | } 391 | 392 | const float focal_y = height / (2.0f * tan_fovy); 393 | const float focal_x = width / (2.0f * tan_fovx); 394 | 395 | const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); 396 | const dim3 block(BLOCK_X, BLOCK_Y, 1); 397 | 398 | // Compute loss gradients w.r.t. 2D mean position, conic matrix, 399 | // opacity and RGB of Gaussians from per-pixel loss gradients. 400 | // If we were given precomputed colors and not SHs, use them. 401 | const float* color_ptr = (colors_precomp != nullptr) ? 
colors_precomp : geomState.rgb; 402 | const float* depth_ptr = geomState.depths; 403 | 404 | CHECK_CUDA(BACKWARD::render( 405 | tile_grid, 406 | block, 407 | imgState.ranges, 408 | binningState.point_list, 409 | width, height, 410 | background, 411 | geomState.means2D, 412 | geomState.conic_opacity, 413 | color_ptr, 414 | depth_ptr, 415 | imgState.accum_alpha, 416 | imgState.n_contrib, 417 | dL_dpix, 418 | dL_dpix_depth, 419 | (float3*)dL_dmean2D, 420 | (float4*)dL_dconic, 421 | dL_dopacity, 422 | dL_dcolor, 423 | dL_ddepth 424 | ), debug) 425 | 426 | // Take care of the rest of preprocessing. Was the precomputed covariance 427 | // given to us or a scales/rot pair? If precomputed, pass that. If not, 428 | // use the one we computed ourselves. 429 | const float* cov3D_ptr = (cov3D_precomp != nullptr) ? cov3D_precomp : geomState.cov3D; 430 | CHECK_CUDA(BACKWARD::preprocess(P, D, M, 431 | (float3*)means3D, 432 | radii, 433 | shs, 434 | geomState.clamped, 435 | (glm::vec3*)scales, 436 | (glm::vec4*)rotations, 437 | scale_modifier, 438 | cov3D_ptr, 439 | viewmatrix, 440 | projmatrix, 441 | projmatrix_raw, 442 | focal_x, focal_y, 443 | tan_fovx, tan_fovy, 444 | (glm::vec3*)campos, 445 | (float3*)dL_dmean2D, 446 | dL_dconic, 447 | (glm::vec3*)dL_dmean3D, 448 | dL_dcolor, 449 | dL_ddepth, 450 | dL_dcov3D, 451 | dL_dsh, 452 | (glm::vec3*)dL_dscale, 453 | (glm::vec4*)dL_drot, 454 | dL_dtau), debug) 455 | } -------------------------------------------------------------------------------- /cuda_rasterizer/rasterizer_impl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #pragma once 13 | 14 | #include <iostream> 15 | #include <vector> 16 | #include "rasterizer.h" 17 | #include <cuda_runtime_api.h> 18 | 19 | namespace CudaRasterizer 20 | { 21 | template <typename T> 22 | static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment) 23 | { 24 | std::size_t offset = (reinterpret_cast<std::uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1); 25 | ptr = reinterpret_cast<T*>(offset); 26 | chunk = reinterpret_cast<char*>(ptr + count); 27 | } 28 | 29 | struct GeometryState 30 | { 31 | size_t scan_size; 32 | float* depths; 33 | char* scanning_space; 34 | bool* clamped; 35 | int* internal_radii; 36 | float2* means2D; 37 | float* cov3D; 38 | float4* conic_opacity; 39 | float* rgb; 40 | uint32_t* point_offsets; 41 | uint32_t* tiles_touched; 42 | 43 | static GeometryState fromChunk(char*& chunk, size_t P); 44 | }; 45 | 46 | struct ImageState 47 | { 48 | uint2* ranges; 49 | uint32_t* n_contrib; 50 | float* accum_alpha; 51 | 52 | static ImageState fromChunk(char*& chunk, size_t N); 53 | }; 54 | 55 | struct BinningState 56 | { 57 | size_t sorting_size; 58 | uint64_t* point_list_keys_unsorted; 59 | uint64_t* point_list_keys; 60 | uint32_t* point_list_unsorted; 61 | uint32_t* point_list; 62 | char* list_sorting_space; 63 | 64 | static BinningState fromChunk(char*& chunk, size_t P); 65 | }; 66 | 67 | template <typename T> 68 | size_t required(size_t P) 69 | { 70 | char* size = nullptr; 71 | T::fromChunk(size, P); 72 | return ((size_t)size) + 128; 73 | } 74 | }; -------------------------------------------------------------------------------- /diff_gaussian_rasterization/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | from typing import NamedTuple 13 | import torch.nn as nn 14 | import torch
15 | from . import _C 16 | 17 | def cpu_deep_copy_tuple(input_tuple): 18 | copied_tensors = [item.cpu().clone() if isinstance(item, torch.Tensor) else item for item in input_tuple] 19 | return tuple(copied_tensors) 20 | 21 | def rasterize_gaussians( 22 | means3D, 23 | means2D, 24 | sh, 25 | colors_precomp, 26 | opacities, 27 | scales, 28 | rotations, 29 | cov3Ds_precomp, 30 | theta, 31 | rho, 32 | raster_settings, 33 | ): 34 | return _RasterizeGaussians.apply( 35 | means3D, 36 | means2D, 37 | sh, 38 | colors_precomp, 39 | opacities, 40 | scales, 41 | rotations, 42 | cov3Ds_precomp, 43 | theta, 44 | rho, 45 | raster_settings, 46 | ) 47 | 48 | class _RasterizeGaussians(torch.autograd.Function): 49 | @staticmethod 50 | def forward( 51 | ctx, 52 | means3D, 53 | means2D, 54 | sh, 55 | colors_precomp, 56 | opacities, 57 | scales, 58 | rotations, 59 | cov3Ds_precomp, 60 | theta, 61 | rho, 62 | raster_settings, 63 | ): 64 | 65 | # Restructure arguments the way that the C++ lib expects them 66 | args = ( 67 | raster_settings.bg, 68 | means3D, 69 | colors_precomp, 70 | opacities, 71 | scales, 72 | rotations, 73 | raster_settings.scale_modifier, 74 | cov3Ds_precomp, 75 | raster_settings.viewmatrix, 76 | raster_settings.projmatrix, 77 | raster_settings.projmatrix_raw, 78 | raster_settings.tanfovx, 79 | raster_settings.tanfovy, 80 | raster_settings.image_height, 81 | raster_settings.image_width, 82 | sh, 83 | raster_settings.sh_degree, 84 | raster_settings.campos, 85 | raster_settings.prefiltered, 86 | raster_settings.debug, 87 | ) 88 | 89 | # Invoke C++/CUDA rasterizer 90 | if raster_settings.debug: 91 | cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted 92 | try: 93 | num_rendered, color, radii, geomBuffer, binningBuffer, imgBuffer, depth, opacity, n_touched = _C.rasterize_gaussians(*args) 94 | except Exception as ex: 95 | torch.save(cpu_args, "snapshot_fw.dump") 96 | print("\nAn error occurred in forward. Please forward snapshot_fw.dump for debugging.")
97 | raise ex 98 | else: 99 | num_rendered, color, radii, geomBuffer, binningBuffer, imgBuffer, depth, opacity, n_touched = _C.rasterize_gaussians(*args) 100 | 101 | # Keep relevant tensors for backward 102 | ctx.raster_settings = raster_settings 103 | ctx.num_rendered = num_rendered 104 | ctx.save_for_backward(colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, sh, geomBuffer, binningBuffer, imgBuffer) 105 | return color, radii, depth, opacity, n_touched 106 | 107 | @staticmethod 108 | def backward(ctx, grad_out_color, grad_out_radii, grad_out_depth, grad_out_opacity, grad_n_touched): 109 | 110 | # Restore necessary values from context 111 | num_rendered = ctx.num_rendered 112 | raster_settings = ctx.raster_settings 113 | colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, sh, geomBuffer, binningBuffer, imgBuffer = ctx.saved_tensors 114 | 115 | # Restructure args as C++ method expects them 116 | args = (raster_settings.bg, 117 | means3D, 118 | radii, 119 | colors_precomp, 120 | scales, 121 | rotations, 122 | raster_settings.scale_modifier, 123 | cov3Ds_precomp, 124 | raster_settings.viewmatrix, 125 | raster_settings.projmatrix, 126 | raster_settings.projmatrix_raw, 127 | raster_settings.tanfovx, 128 | raster_settings.tanfovy, 129 | grad_out_color, 130 | grad_out_depth, 131 | sh, 132 | raster_settings.sh_degree, 133 | raster_settings.campos, 134 | geomBuffer, 135 | num_rendered, 136 | binningBuffer, 137 | imgBuffer, 138 | raster_settings.debug) 139 | 140 | # Compute gradients for relevant tensors by invoking backward method 141 | if raster_settings.debug: 142 | cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted 143 | try: 144 | grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations, grad_tau = _C.rasterize_gaussians_backward(*args) 145 | except Exception as ex: 146 | torch.save(cpu_args, "snapshot_bw.dump") 147 | print("\nAn error occurred in backward. Writing snapshot_bw.dump for debugging.\n")
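# The snapshot files written above are plain torch.save archives of the
# CPU-copied inputs, so a failing call can be re-examined offline, e.g.
# (an illustrative sketch): args = torch.load("snapshot_bw.dump")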
148 | raise ex 149 | else: 150 | grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations, grad_tau = _C.rasterize_gaussians_backward(*args) 151 | 152 | grad_tau = torch.sum(grad_tau.view(-1, 6), dim=0) 153 | grad_rho = grad_tau[:3].view(1, -1) 154 | grad_theta = grad_tau[3:].view(1, -1) 155 | 156 | 157 | grads = ( 158 | grad_means3D, 159 | grad_means2D, 160 | grad_sh, 161 | grad_colors_precomp, 162 | grad_opacities, 163 | grad_scales, 164 | grad_rotations, 165 | grad_cov3Ds_precomp, 166 | grad_theta, 167 | grad_rho, 168 | None, 169 | ) 170 | 171 | return grads 172 | 173 | class GaussianRasterizationSettings(NamedTuple): 174 | image_height: int 175 | image_width: int 176 | tanfovx : float 177 | tanfovy : float 178 | bg : torch.Tensor 179 | scale_modifier : float 180 | viewmatrix : torch.Tensor 181 | projmatrix : torch.Tensor 182 | projmatrix_raw : torch.Tensor 183 | sh_degree : int 184 | campos : torch.Tensor 185 | prefiltered : bool 186 | debug : bool 187 | 188 | class GaussianRasterizer(nn.Module): 189 | def __init__(self, raster_settings): 190 | super().__init__() 191 | self.raster_settings = raster_settings 192 | 193 | def markVisible(self, positions): 194 | # Mark visible points (based on frustum culling for camera) with a boolean 195 | with torch.no_grad(): 196 | raster_settings = self.raster_settings 197 | visible = _C.mark_visible( 198 | positions, 199 | raster_settings.viewmatrix, 200 | raster_settings.projmatrix) 201 | 202 | return visible 203 | 204 | def forward(self, means3D, means2D, opacities, shs = None, colors_precomp = None, scales = None, rotations = None, cov3D_precomp = None, theta=None, rho=None): 205 | 206 | raster_settings = self.raster_settings 207 | 208 | if (shs is None and colors_precomp is None) or (shs is not None and colors_precomp is not None): 209 | raise Exception('Please provide exactly one of either SHs or precomputed colors!') 210 | 211 | if ((scales is None or rotations is None) and cov3D_precomp is None) or ((scales is not None or rotations is not None) and cov3D_precomp is not None): 212 | raise Exception('Please provide exactly one of either scale/rotation pair or precomputed 3D covariance!') 213 | 214 | if shs is None: 215 | shs = torch.Tensor([]) 216 | if colors_precomp is None: 217 | colors_precomp = torch.Tensor([]) 218 | 219 | if scales is None: 220 | scales = torch.Tensor([]) 221 | if rotations is None: 222 | rotations = torch.Tensor([]) 223 | if cov3D_precomp is None: 224 | cov3D_precomp = torch.Tensor([]) 225 | if theta is None: 226 | theta = torch.Tensor([]) 227 | if rho is None: 228 | rho = torch.Tensor([]) 229 | 230 | 231 | # Invoke C++/CUDA rasterization routine 232 | return rasterize_gaussians( 233 | means3D, 234 | means2D, 235 | shs, 236 | colors_precomp, 237 | opacities, 238 | scales, 239 | rotations, 240 | cov3D_precomp, 241 | theta, 242 | rho, 243 | raster_settings, 244 | ) 245 | 246 | -------------------------------------------------------------------------------- /ext.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file.
8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include <torch/extension.h> 13 | #include "rasterize_points.h" 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def("rasterize_gaussians", &RasterizeGaussiansCUDA); 17 | m.def("rasterize_gaussians_backward", &RasterizeGaussiansBackwardCUDA); 18 | m.def("mark_visible", &markVisible); 19 | } -------------------------------------------------------------------------------- /rasterize_points.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #include <math.h> 13 | #include <torch/extension.h> 14 | #include <cstdio> 15 | #include <sstream> 16 | #include <iostream> 17 | #include <tuple> 18 | #include <stdio.h> 19 | #include <cuda_runtime_api.h> 20 | #include <memory> 21 | #include "cuda_rasterizer/config.h" 22 | #include "cuda_rasterizer/rasterizer.h" 23 | #include <fstream> 24 | #include <string> 25 | #include <functional> 26 | 27 | std::function<char* (size_t)> resizeFunctional(torch::Tensor& t) { 28 | auto lambda = [&t](size_t N) { 29 | t.resize_({(long long)N}); 30 | return reinterpret_cast<char*>(t.contiguous().data_ptr()); 31 | }; 32 | return lambda; 33 | } 34 | 35 | std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 36 | RasterizeGaussiansCUDA( 37 | const torch::Tensor& background, 38 | const torch::Tensor& means3D, 39 | const torch::Tensor& colors, 40 | const torch::Tensor& opacity, 41 | const torch::Tensor& scales, 42 | const torch::Tensor& rotations, 43 | const float scale_modifier, 44 | const torch::Tensor& cov3D_precomp, 45 | const torch::Tensor& viewmatrix, 46 | const torch::Tensor& projmatrix, 47 | const torch::Tensor& projmatrix_raw, 48 | const float tan_fovx, 49 | const float tan_fovy, 50 | const int image_height, 51 | const int image_width, 52 | const torch::Tensor& sh, 53 | const int degree, 54 | const torch::Tensor& campos, 55 | const bool prefiltered, 56 | const bool debug) 57 | { 58 | if (means3D.ndimension() != 2 || means3D.size(1) != 3) { 59 | AT_ERROR("means3D must have dimensions (num_points, 3)"); 60 | } 61 | 62 | const int P = means3D.size(0); 63 | const int H = image_height; 64 | const int W = image_width; 65 | 66 | auto int_opts = means3D.options().dtype(torch::kInt32); 67 | auto float_opts = means3D.options().dtype(torch::kFloat32); 68 | 69 | torch::Tensor out_color = torch::full({NUM_CHANNELS, H, W}, 0.0, float_opts); 70 | torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32)); 71 | torch::Tensor n_touched = torch::full({P}, 0, means3D.options().dtype(torch::kInt32)); 72 | torch::Tensor out_depth = torch::full({1, H, W}, 0.0, float_opts); 73 | torch::Tensor out_opacity = torch::full({1, H, W}, 0.0, float_opts); 74 | 75 | torch::Device device(torch::kCUDA); 76 | torch::TensorOptions options(torch::kByte); 77 | torch::Tensor geomBuffer = torch::empty({0}, options.device(device)); 78 | torch::Tensor binningBuffer = torch::empty({0}, options.device(device)); 79 | torch::Tensor imgBuffer = torch::empty({0}, options.device(device)); 80 | std::function<char* (size_t)> geomFunc = resizeFunctional(geomBuffer); 81 | std::function<char* (size_t)> binningFunc = resizeFunctional(binningBuffer); 82 | std::function<char* (size_t)> imgFunc = resizeFunctional(imgBuffer); 83 | 84 | int rendered = 0; 85 | if(P != 0) 86 | { 87 | int M = 0; 88 | if(sh.size(0) != 0) 89 | { 90 | M = sh.size(1); 91 | } 92 | 93 | rendered = CudaRasterizer::Rasterizer::forward( 94 | geomFunc,
95 | binningFunc, 96 | imgFunc, 97 | P, degree, M, 98 | background.contiguous().data<float>(), 99 | W, H, 100 | means3D.contiguous().data<float>(), 101 | sh.contiguous().data_ptr<float>(), 102 | colors.contiguous().data<float>(), 103 | opacity.contiguous().data<float>(), 104 | scales.contiguous().data_ptr<float>(), 105 | scale_modifier, 106 | rotations.contiguous().data_ptr<float>(), 107 | cov3D_precomp.contiguous().data<float>(), 108 | viewmatrix.contiguous().data<float>(), 109 | projmatrix.contiguous().data<float>(), 110 | campos.contiguous().data<float>(), 111 | tan_fovx, 112 | tan_fovy, 113 | prefiltered, 114 | out_color.contiguous().data<float>(), 115 | out_depth.contiguous().data<float>(), 116 | out_opacity.contiguous().data<float>(), 117 | radii.contiguous().data<int>(), 118 | n_touched.contiguous().data<int>(), 119 | debug); 120 | } 121 | return std::make_tuple(rendered, out_color, radii, geomBuffer, binningBuffer, imgBuffer, out_depth, out_opacity, n_touched); 122 | } 123 | 124 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 125 | RasterizeGaussiansBackwardCUDA( 126 | const torch::Tensor& background, 127 | const torch::Tensor& means3D, 128 | const torch::Tensor& radii, 129 | const torch::Tensor& colors, 130 | const torch::Tensor& scales, 131 | const torch::Tensor& rotations, 132 | const float scale_modifier, 133 | const torch::Tensor& cov3D_precomp, 134 | const torch::Tensor& viewmatrix, 135 | const torch::Tensor& projmatrix, 136 | const torch::Tensor& projmatrix_raw, 137 | const float tan_fovx, 138 | const float tan_fovy, 139 | const torch::Tensor& dL_dout_color, 140 | const torch::Tensor& dL_dout_depths, 141 | const torch::Tensor& sh, 142 | const int degree, 143 | const torch::Tensor& campos, 144 | const torch::Tensor& geomBuffer, 145 | const int R, 146 | const torch::Tensor& binningBuffer, 147 | const torch::Tensor& imageBuffer, 148 | const bool debug) 149 | { 150 | const int P = means3D.size(0); 151 | const int H = dL_dout_color.size(1); 152 | const int W = dL_dout_color.size(2); 153 | 154 | int M = 0; 155 | if(sh.size(0) != 0) 156 | { 157 | M = sh.size(1); 158 | } 159 | 160 | torch::Tensor dL_dmeans3D = torch::zeros({P, 3}, means3D.options()); 161 | torch::Tensor dL_dmeans2D = torch::zeros({P, 3}, means3D.options()); 162 | torch::Tensor dL_dcolors = torch::zeros({P, NUM_CHANNELS}, means3D.options()); 163 | torch::Tensor dL_ddepths = torch::zeros({P, 1}, means3D.options()); 164 | torch::Tensor dL_dconic = torch::zeros({P, 2, 2}, means3D.options()); 165 | torch::Tensor dL_dopacity = torch::zeros({P, 1}, means3D.options()); 166 | torch::Tensor dL_dcov3D = torch::zeros({P, 6}, means3D.options()); 167 | torch::Tensor dL_dsh = torch::zeros({P, M, 3}, means3D.options()); 168 | torch::Tensor dL_dscales = torch::zeros({P, 3}, means3D.options()); 169 | torch::Tensor dL_drotations = torch::zeros({P, 4}, means3D.options()); 170 | torch::Tensor dL_dtau = torch::zeros({P,6}, means3D.options()); 171 | 172 | if(P != 0) 173 | { 174 | CudaRasterizer::Rasterizer::backward(P, degree, M, R, 175 | background.contiguous().data<float>(), 176 | W, H, 177 | means3D.contiguous().data<float>(), 178 | sh.contiguous().data<float>(), 179 | colors.contiguous().data<float>(), 180 | scales.data_ptr<float>(), 181 | scale_modifier, 182 | rotations.data_ptr<float>(), 183 | cov3D_precomp.contiguous().data<float>(), 184 | viewmatrix.contiguous().data<float>(), 185 | projmatrix.contiguous().data<float>(), 186 | projmatrix_raw.contiguous().data<float>(), 187 | campos.contiguous().data<float>(), 188 | tan_fovx, 189 | tan_fovy, 190 | radii.contiguous().data<int>(), 191 | reinterpret_cast<char*>(geomBuffer.contiguous().data_ptr()), 192 | reinterpret_cast<char*>(binningBuffer.contiguous().data_ptr()),
193 | reinterpret_cast<char*>(imageBuffer.contiguous().data_ptr()), 194 | dL_dout_color.contiguous().data<float>(), 195 | dL_dout_depths.contiguous().data<float>(), 196 | dL_dmeans2D.contiguous().data<float>(), 197 | dL_dconic.contiguous().data<float>(), 198 | dL_dopacity.contiguous().data<float>(), 199 | dL_dcolors.contiguous().data<float>(), 200 | dL_ddepths.contiguous().data<float>(), 201 | dL_dmeans3D.contiguous().data<float>(), 202 | dL_dcov3D.contiguous().data<float>(), 203 | dL_dsh.contiguous().data<float>(), 204 | dL_dscales.contiguous().data<float>(), 205 | dL_drotations.contiguous().data<float>(), 206 | dL_dtau.contiguous().data<float>(), 207 | debug); 208 | } 209 | 210 | return std::make_tuple(dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D, dL_dcov3D, dL_dsh, dL_dscales, dL_drotations, dL_dtau); 211 | } 212 | 213 | torch::Tensor markVisible( 214 | torch::Tensor& means3D, 215 | torch::Tensor& viewmatrix, 216 | torch::Tensor& projmatrix) 217 | { 218 | const int P = means3D.size(0); 219 | 220 | torch::Tensor present = torch::full({P}, false, means3D.options().dtype(at::kBool)); 221 | 222 | if(P != 0) 223 | { 224 | CudaRasterizer::Rasterizer::markVisible(P, 225 | means3D.contiguous().data<float>(), 226 | viewmatrix.contiguous().data<float>(), 227 | projmatrix.contiguous().data<float>(), 228 | present.contiguous().data<bool>()); 229 | } 230 | 231 | return present; 232 | } -------------------------------------------------------------------------------- /rasterize_points.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023, Inria 3 | * GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | * All rights reserved. 5 | * 6 | * This software is free for non-commercial, research and evaluation use 7 | * under the terms of the LICENSE.md file. 8 | * 9 | * For inquiries contact george.drettakis@inria.fr 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | #include <cstdio> 15 | #include <tuple> 16 | #include <string> 17 | 18 | std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 19 | RasterizeGaussiansCUDA( 20 | const torch::Tensor& background, 21 | const torch::Tensor& means3D, 22 | const torch::Tensor& colors, 23 | const torch::Tensor& opacity, 24 | const torch::Tensor& scales, 25 | const torch::Tensor& rotations, 26 | const float scale_modifier, 27 | const torch::Tensor& cov3D_precomp, 28 | const torch::Tensor& viewmatrix, 29 | const torch::Tensor& projmatrix, 30 | const torch::Tensor& projmatrix_raw, 31 | const float tan_fovx, 32 | const float tan_fovy, 33 | const int image_height, 34 | const int image_width, 35 | const torch::Tensor& sh, 36 | const int degree, 37 | const torch::Tensor& campos, 38 | const bool prefiltered, 39 | const bool debug); 40 | 41 | std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 42 | RasterizeGaussiansBackwardCUDA( 43 | const torch::Tensor& background, 44 | const torch::Tensor& means3D, 45 | const torch::Tensor& radii, 46 | const torch::Tensor& colors, 47 | const torch::Tensor& scales, 48 | const torch::Tensor& rotations, 49 | const float scale_modifier, 50 | const torch::Tensor& cov3D_precomp, 51 | const torch::Tensor& viewmatrix, 52 | const torch::Tensor& projmatrix, 53 | const torch::Tensor& projmatrix_raw, 54 | const float tan_fovx, 55 | const float tan_fovy, 56 | const torch::Tensor& dL_dout_color, 57 | const torch::Tensor& dL_dout_depth, 58 | const torch::Tensor& sh, 59 | const int degree, 60 | const torch::Tensor& campos, 61 | const torch::Tensor& geomBuffer, 62 | const int R, 63 | const torch::Tensor& binningBuffer, 64 | const torch::Tensor& imageBuffer, 65 | const bool debug); 66 | 67 | torch::Tensor markVisible( 68 | torch::Tensor& means3D, 69 | torch::Tensor& viewmatrix, 70 | torch::Tensor& projmatrix);
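// These three declarations are what ext.cpp binds into the Python module;
// from the torch side, for example, a visibility query goes through
// GaussianRasterizer.markVisible in __init__.py, which boils down to:
//
//   visible = _C.mark_visible(means3D, viewmatrix, projmatrix)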
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023, Inria 3 | # GRAPHDECO research group, https://team.inria.fr/graphdeco 4 | # All rights reserved. 5 | # 6 | # This software is free for non-commercial, research and evaluation use 7 | # under the terms of the LICENSE.md file. 8 | # 9 | # For inquiries contact george.drettakis@inria.fr 10 | # 11 | 12 | from setuptools import setup 13 | from torch.utils.cpp_extension import CUDAExtension, BuildExtension 14 | import os 15 | os.path.dirname(os.path.abspath(__file__)) 16 | 17 | setup( 18 | name="diff_gaussian_rasterization", 19 | packages=['diff_gaussian_rasterization'], 20 | ext_modules=[ 21 | CUDAExtension( 22 | name="diff_gaussian_rasterization._C", 23 | sources=[ 24 | "cuda_rasterizer/rasterizer_impl.cu", 25 | "cuda_rasterizer/forward.cu", 26 | "cuda_rasterizer/backward.cu", 27 | "rasterize_points.cu", 28 | "ext.cpp"], 29 | extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")]}) 30 | ], 31 | cmdclass={ 32 | 'build_ext': BuildExtension 33 | } 34 | ) 35 | --------------------------------------------------------------------------------
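# Typical build, as a usage sketch (assuming a CUDA-enabled PyTorch install):
# fetch the glm submodule first, then install the extension in place:
#
#   git submodule update --init third_party/glm
#   pip install .
#
# This compiles diff_gaussian_rasterization._C from the sources listed above.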