├── gsplat2d
│   ├── gsplat2d
│   │   ├── version.py
│   │   ├── cuda
│   │   │   ├── csrc
│   │   │   │   ├── ext.cpp
│   │   │   │   ├── config.h
│   │   │   │   ├── backward.cuh
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   ├── forward.cuh
│   │   │   │   ├── helpers.cuh
│   │   │   │   ├── bindings.h
│   │   │   │   ├── forward.cu
│   │   │   │   ├── backward.cu
│   │   │   │   └── bindings.cu
│   │   │   ├── __init__.py
│   │   │   └── _backend.py
│   │   ├── project_gaussians.py
│   │   ├── utils.py
│   │   ├── __init__.py
│   │   └── rasterize.py
│   └── setup.py
├── .gitignore
├── requirements.txt
├── README.md
├── gaussianlig.py
├── utils.py
├── train.py
├── optimizer.py
└── LICENSE

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.11"
2 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.gif
3 | *.png
4 | *.jpg
5 | *.egg
6 | *.egg-info
7 | 
8 | checkpoints*
9 | data/*
10 | dataset/*
11 | results/*
12 | figure/*
13 | *build*
14 | 

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | constriction
2 | numpy
3 | pandas
4 | pillow
5 | pytorch-msssim
6 | PyYAML
7 | tqdm
8 | vector-quantize-pytorch
9 | torch
10 | torchvision
11 | ninja
12 | 

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/cuda/csrc/ext.cpp:
--------------------------------------------------------------------------------
1 | #include "bindings.h"
2 | #include <torch/extension.h>
3 | 
4 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
5 |     m.def("rasterize_forward", &rasterize_forward_tensor);
6 |     m.def("rasterize_backward", &rasterize_backward_tensor);
7 |     m.def("project_gaussians_forward", &project_gaussians_forward_tensor);
8 |     m.def("project_gaussians_backward", &project_gaussians_backward_tensor);
9 | 
10 |     m.def("compute_cov2d_bounds", &compute_cov2d_bounds_tensor);
11 |     m.def("map_gaussian_to_intersects", &map_gaussian_to_intersects_tensor);
12 |     m.def("get_tile_bin_edges", &get_tile_bin_edges_tensor);
13 | }
14 | 

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/cuda/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 | 
3 | 
4 | def _make_lazy_cuda_func(name: str) -> Callable:
5 |     def call_cuda(*args, **kwargs):
6 |         # pylint: disable=import-outside-toplevel
7 |         from ._backend import _C
8 | 
9 |         return getattr(_C, name)(*args, **kwargs)
10 | 
11 |     return call_cuda
12 | 
13 | 
14 | rasterize_forward = _make_lazy_cuda_func("rasterize_forward")
15 | rasterize_backward = _make_lazy_cuda_func("rasterize_backward")
16 | compute_cov2d_bounds = _make_lazy_cuda_func("compute_cov2d_bounds")
17 | project_gaussians_forward = _make_lazy_cuda_func("project_gaussians_forward")
18 | project_gaussians_backward = _make_lazy_cuda_func("project_gaussians_backward")
19 | map_gaussian_to_intersects = _make_lazy_cuda_func("map_gaussian_to_intersects")
20 | get_tile_bin_edges = _make_lazy_cuda_func("get_tile_bin_edges")
21 | 

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/cuda/csrc/config.h:
--------------------------------------------------------------------------------
1 | #define MAX_BLOCK_SIZE ( 16 * 16 )
2 | #define N_THREADS 256
3 | 
4 | #define MAX_REGISTER_CHANNELS 3
5 | 
6 | #define CUDA_CALL(x) \
7 |     do { \
8 |         if ((x) != cudaSuccess) { \
9 |             printf( \
10 |                 "Error at %s:%d - %s\n", \
11 |                 __FILE__, \
12 |                 __LINE__, \
13 |                 cudaGetErrorString(cudaGetLastError()) \
14 |             ); \
15 |             exit(EXIT_FAILURE); \
16 |         } \
17 |     } while (0)
18 | 

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/cuda/csrc/backward.cuh:
--------------------------------------------------------------------------------
1 | #include <cuda.h>
2 | #include <cuda_runtime.h>
3 | #include <cstdint>
4 | 
5 | // for f : R(n) -> R(m), J in R(m, n),
6 | // v is cotangent in R(m), e.g. dL/df in R(m),
7 | // compute vjp i.e. vT J -> R(n)
8 | __global__ void project_gaussians_backward_kernel(
9 |     const int num_points,
10 |     const int* __restrict__ radii,
11 |     const float3* __restrict__ conics,
12 |     const float2* __restrict__ v_xy,
13 |     const float3* __restrict__ v_conic,
14 |     float3* __restrict__ v_cov2d,
15 |     float2* __restrict__ v_mean2d
16 | );
17 | 
18 | __global__ void rasterize_backward_kernel(
19 |     const dim3 tile_bounds,
20 |     const dim3 img_size,
21 |     const int32_t* __restrict__ gaussian_ids_sorted,
22 |     const int2* __restrict__ tile_bins,
23 |     const float2* __restrict__ xys,
24 |     const float3* __restrict__ conics,
25 |     const float3* __restrict__ rgbs,
26 |     const int* __restrict__ final_index,
27 |     const float3* __restrict__ v_output,
28 |     float2* __restrict__ v_xy,
29 |     float2* __restrict__ v_xy_abs,
30 |     float3* __restrict__ v_cov,
31 |     float3* __restrict__ v_rgb
32 | );

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/cuda/csrc/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.12) # You can adjust the minimum required version
2 | set(CMAKE_CUDA_ARCHITECTURES 70 75 89) # RTX 2080 Ti uses 75. V100 uses 70. RTX 4090 uses 89.
3 | 
4 | project(gsplat2d CXX CUDA)
5 | set(CMAKE_CXX_STANDARD 17)
6 | set(CMAKE_CXX_EXTENSIONS OFF)
7 | set(CMAKE_CUDA_STANDARD 17)
8 | 
9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
10 | 
11 | # our library
12 | add_library(gsplat2d forward.cu backward.cu helpers.cuh)
13 | target_link_libraries(gsplat2d PUBLIC cuda)
14 | target_include_directories(gsplat2d PRIVATE
15 |     ${PROJECT_SOURCE_DIR}/third_party/glm
16 |     ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
17 | )
18 | set_target_properties(gsplat2d PROPERTIES CUDA_ARCHITECTURES "70;75;89")
19 | 
20 | # # To add an executable that uses the gsplat2d library,
21 | # # follow example in the comments for a script `run_forward.cpp`
22 | # # Add the executable
23 | # add_executable(run_forward run_forward.cpp)
24 | 
25 | # # Link against CUDA runtime library
26 | # target_link_libraries(run_forward PUBLIC cuda gsplat2d)
27 | 
28 | # # Include directories for the header-only library
29 | # target_include_directories(run_forward PRIVATE
30 | #     ${PROJECT_SOURCE_DIR}/third_party/glm
31 | # )
32 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Large Images are Gaussians: High-Quality Large Image Representation with Levels of 2D Gaussian Splatting
2 | 
3 | This is the official code for https://arxiv.org/abs/2502.09039.
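4 | 
5 | At its core, the code fits a set of 2D Gaussians to an image by projecting them to screen space and rasterizing them differentiably. A minimal sketch of one optimization step (illustrative only: the shapes and initialization follow `gaussianlig.py`, a random target stands in for a real image, and an installed `gsplat2d` plus a CUDA device are assumed):
6 | 
7 | ```python
8 | import torch
9 | from gsplat2d.project_gaussians import project_gaussians
10 | from gsplat2d.rasterize import rasterize_gaussians
11 | 
12 | H, W, N, BLOCK = 256, 256, 1000, 16
13 | means = torch.rand(N, 2, device="cuda") * torch.tensor([W, H], device="cuda")
14 | cov2d = torch.rand(N, 3, device="cuda")  # upper-triangular 2x2 covariances (a, b, c)
15 | rgbs = torch.zeros(N, 3, device="cuda")
16 | for p in (means, cov2d, rgbs):
17 |     p.requires_grad_(True)
18 | 
19 | xys, radii, conics, num_tiles_hit = project_gaussians(cov2d, means, H, W, BLOCK)
20 | img = rasterize_gaussians(xys, radii, conics, num_tiles_hit, rgbs, H, W, BLOCK)
21 | loss = (img.clamp(0, 1) - torch.rand(H, W, 3, device="cuda")).pow(2).mean()
22 | loss.backward()  # gradients reach means, cov2d, and rgbs
23 | ```
24 | 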
25 | ## Installation
26 | 
27 | Clone this repository and install the packages:
28 | ```
29 | git clone git@github.com:HKU-MedAI/LIG.git
30 | cd LIG
31 | conda create -n lig python=3.10
32 | conda activate lig
33 | pip install -r requirements.txt
34 | cd gsplat2d/gsplat2d/cuda/csrc
35 | mkdir third_party
36 | cd third_party
37 | git clone https://github.com/g-truc/glm.git
38 | cd ../../../..
39 | python setup.py build
40 | python setup.py install
41 | cd ..
42 | ```
43 | 
44 | ## Dataset
45 | 
46 | Download the STimage data [here](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/ltzhu99_connect_hku_hk/ETT5fZwxKUNPuvevfgdMXkcBKft_yCVnY1mZ7qS_LEMRxg?e=3ahAVm) and the DIV2K validation data (HR images) [here](https://data.vision.ee.ethz.ch/cvl/DIV2K/).
47 | 
48 | The dataset folder is organized as follows.
49 | 
50 | ```bash
51 | ├── dataset
52 | │   ├── STimage
53 | │   │   ├── Human_Heart_0.png
54 | │   │   ├── Human_Heart_1.png
55 | │   │   ├── ...
56 | │   ├── DIV2K_valid_HR
57 | │   │   ├── 0801.png
58 | │   │   ├── 0802.png
59 | │   │   ├── ...
60 | ```
61 | 
62 | ## Training
63 | 
64 | Run `python train.py` to start training. The dataset and the parameters can be edited in the script. The metrics, the learned representations, and the rendered images will be saved.
65 | 
66 | ## Acknowledgement
67 | 
68 | The codebase is developed based on [GaussianImage](https://github.com/Xinjie-Q/GaussianImage) and [gsplat](https://github.com/nerfstudio-project/gsplat).
69 | 
70 | ## Citation
71 | 
72 | If you find our work useful, please kindly cite it as:
73 | ```
74 | @article{zhu2025large,
75 |   title={Large Images are Gaussians: High-Quality Large Image Representation with Levels of 2D Gaussian Splatting},
76 |   author={Zhu, Lingting and Lin, Guying and Chen, Jinnan and Zhang, Xinjie and Jin, Zhenchao and Wang, Zhao and Yu, Lequan},
77 |   journal={arXiv preprint arXiv:2502.09039},
78 |   year={2025}
79 | }
80 | ```
81 | 

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/cuda/csrc/forward.cuh:
--------------------------------------------------------------------------------
1 | #include <cuda.h>
2 | #include <cuda_runtime.h>
3 | #include <cstdint>
4 | 
5 | // compute screen-space conics, radii, and tile hit counts from the 2d gaussian parameters
6 | __global__ void project_gaussians_forward_kernel(
7 |     const int num_points,
8 |     const float3* __restrict__ cov2d,
9 |     const float2* __restrict__ means2d,
10 |     const dim3 tile_bounds,
11 |     const unsigned block_width,
12 |     float2* __restrict__ xys,
13 |     int* __restrict__ radii,
14 |     float3* __restrict__ conics,
15 |     int32_t* __restrict__ num_tiles_hit
16 | );
17 | 
18 | // compute output color image from binned and sorted gaussians
19 | __global__ void rasterize_forward(
20 |     const dim3 tile_bounds,
21 |     const dim3 img_size,
22 |     const int32_t* __restrict__ gaussian_ids_sorted,
23 |     const int2* __restrict__ tile_bins,
24 |     const float2* __restrict__ xys,
25 |     const float3* __restrict__ conics,
26 |     const float3* __restrict__ colors,
27 |     int* __restrict__ final_index,
28 |     float3* __restrict__ out_img
29 | );
30 | 
31 | __global__ void map_gaussian_to_intersects(
32 |     const int num_points,
33 |     const float2* __restrict__ xys,
34 |     const float* __restrict__ depths,
35 |     const int* __restrict__ radii,
36 |     const int32_t* __restrict__ cum_tiles_hit,
37 |     const dim3 tile_bounds,
38 |     const unsigned block_width,
39 |     int64_t* __restrict__ isect_ids,
40 |     int32_t* __restrict__ gaussian_ids
41 | );
42 | 
43 | __global__ void get_tile_bin_edges(
44 |     const int num_intersects, const int64_t* __restrict__ isect_ids_sorted, int2* __restrict__ tile_bins
45 | );
46 | 
--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/project_gaussians.py:
--------------------------------------------------------------------------------
1 | """Python bindings for 2D gaussian projection"""
2 | 
3 | from typing import Tuple
4 | 
5 | from jaxtyping import Float
6 | from torch import Tensor
7 | from torch.autograd import Function
8 | 
9 | import gsplat2d.cuda as _C
10 | 
11 | 
12 | def project_gaussians(
13 |     cov2d: Float[Tensor, "*batch 3"],
14 |     means2d: Float[Tensor, "*batch 2"],
15 |     img_height: int,
16 |     img_width: int,
17 |     block_width: int,
18 | ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
19 |     assert block_width > 1 and block_width <= 16, "block_width must be between 2 and 16"
20 |     return _ProjectGaussians.apply(
21 |         cov2d.contiguous(),
22 |         means2d.contiguous(),
23 |         img_height,
24 |         img_width,
25 |         block_width,
26 |     )
27 | 
28 | 
29 | class _ProjectGaussians(Function):
30 | 
31 |     @staticmethod
32 |     def forward(
33 |         ctx,
34 |         cov2d: Float[Tensor, "*batch 3"],
35 |         means2d: Float[Tensor, "*batch 2"],
36 |         img_height: int,
37 |         img_width: int,
38 |         block_width: int,
39 |     ):
40 |         num_points = cov2d.shape[-2]
41 |         if num_points < 1 or cov2d.shape[-1] != 3:
42 |             raise ValueError(f"Invalid shape for cov2d: {cov2d.shape}")
43 | 
44 |         (
45 |             xys,
46 |             radii,
47 |             conics,
48 |             num_tiles_hit,
49 |         ) = _C.project_gaussians_forward(
50 |             num_points,
51 |             cov2d,
52 |             means2d,
53 |             img_height,
54 |             img_width,
55 |             block_width,
56 |         )
57 | 
58 |         # Save non-tensors.
59 |         ctx.img_height = img_height
60 |         ctx.img_width = img_width
61 |         ctx.num_points = num_points
62 | 
63 |         # Save tensors.
64 |         ctx.save_for_backward(
65 |             radii,
66 |             conics,
67 |         )
68 | 
69 |         return (xys, radii, conics, num_tiles_hit)
70 | 
71 |     @staticmethod
72 |     def backward(
73 |         ctx,
74 |         v_xys,
75 |         v_radii,
76 |         v_conics,
77 |         v_num_tiles_hit
78 |     ):
79 |         (
80 |             radii,
81 |             conics,
82 |         ) = ctx.saved_tensors
83 | 
84 |         v_cov2d, v_mean2d = _C.project_gaussians_backward(
85 |             ctx.num_points,
86 |             radii,
87 |             conics,
88 |             v_xys,
89 |             v_conics,
90 |         )
91 |         # Gradients flow only to cov2d and means2d; the size arguments get None.
92 |         return (
93 |             v_cov2d,
94 |             v_mean2d,
95 |             None,
96 |             None,
97 |             None,
98 |         )
99 | 

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/cuda/csrc/helpers.cuh:
--------------------------------------------------------------------------------
1 | #include "config.h"
2 | #include <cuda_runtime.h>
3 | #include "third_party/glm/glm/glm.hpp"
4 | #include "third_party/glm/glm/gtc/type_ptr.hpp"
5 | #include <iostream>
6 | 
7 | inline __device__ void get_bbox(
8 |     const float2 center,
9 |     const float2 dims,
10 |     const dim3 img_size,
11 |     uint2 &bb_min,
12 |     uint2 &bb_max
13 | ) {
14 |     // get bounding box with center and dims, within bounds
15 |     // bounding box coords returned in tile coords, inclusive min, exclusive max
16 |     // clamp between 0 and tile bounds
17 |     bb_min.x = min(max(0, (int)(center.x - dims.x)), img_size.x);
18 |     bb_max.x = min(max(0, (int)(center.x + dims.x + 1)), img_size.x);
19 |     bb_min.y = min(max(0, (int)(center.y - dims.y)), img_size.y);
20 |     bb_max.y = min(max(0, (int)(center.y + dims.y + 1)), img_size.y);
21 | }
22 | 
23 | inline __device__ void get_tile_bbox(
24 |     const float2 pix_center,
25 |     const float pix_radius,
26 |     const dim3 tile_bounds,
27 |     uint2 &tile_min,
28 |     uint2 &tile_max,
29 |     const int block_size
30 | ) {
31 |     // gets gaussian dimensions in tile space, i.e. the span of a gaussian in
32 |     // tile_grid (image divided into tiles)
33 |     float2 tile_center = {
34 |         pix_center.x / (float)block_size, pix_center.y / (float)block_size
35 |     };
36 |     float2 tile_radius = {
37 |         pix_radius / (float)block_size, pix_radius / (float)block_size
38 |     };
39 |     get_bbox(tile_center, tile_radius, tile_bounds, tile_min, tile_max);
40 | }
41 | 
42 | inline __device__ bool
43 | compute_cov2d_bounds(const float3 cov2d, float3 &conic, float &radius) {
44 |     // find eigenvalues of 2d covariance matrix
45 |     // expects upper triangular values of cov matrix as float3
46 |     // then compute the radius and conic dimensions
47 |     // the conic is the inverse cov2d matrix, represented here with upper
48 |     // triangular values.
49 |     float det = cov2d.x * cov2d.z - cov2d.y * cov2d.y;
50 |     if (det == 0.f)
51 |         return false;
52 |     float inv_det = 1.f / det;
53 | 
54 |     // inverse of 2x2 cov2d matrix
55 |     conic.x = cov2d.z * inv_det;
56 |     conic.y = -cov2d.y * inv_det;
57 |     conic.z = cov2d.x * inv_det;
58 | 
59 |     float b = 0.5f * (cov2d.x + cov2d.z);
60 |     float v1 = b + sqrt(max(0.1f, b * b - det));
61 |     float v2 = b - sqrt(max(0.1f, b * b - det));
62 |     // take 3 sigma of covariance
63 |     radius = ceil(3.f * sqrt(max(v1, v2)));
64 |     return true;
65 | }
66 | 
67 | // compute vjp from df/d_conic to df/d_cov2d
68 | inline __device__ void cov2d_to_conic_vjp(
69 |     const float3 &conic, const float3 &v_conic, float3 &v_cov2d
70 | ) {
71 |     glm::mat2 X = glm::mat2(conic.x, conic.y, conic.y, conic.z);
72 |     glm::mat2 G = glm::mat2(v_conic.x, v_conic.y / 2.f, v_conic.y / 2.f, v_conic.z);
73 |     glm::mat2 v_Sigma = -X * G * X;
74 |     v_cov2d.x = v_Sigma[0][0];
75 |     v_cov2d.y = v_Sigma[1][0] + v_Sigma[0][1];
76 |     v_cov2d.z = v_Sigma[1][1];
77 | }
78 | 

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/utils.py:
--------------------------------------------------------------------------------
1 | """Python bindings for binning and sorting gaussians"""
2 | 
3 | from typing import Tuple
4 | 
5 | import torch
6 | from jaxtyping import Float, Int
7 | from torch import Tensor
8 | 
9 | import gsplat2d.cuda as _C
10 | 
11 | 
12 | def map_gaussian_to_intersects(
13 |     num_points: int,
14 |     num_intersects: int,
15 |     xys: Float[Tensor, "batch 2"],
16 |     depths: Float[Tensor, "batch 1"],
17 |     radii: Float[Tensor, "batch 1"],
18 |     cum_tiles_hit: Float[Tensor, "batch 1"],
19 |     tile_bounds: Tuple[int, int, int],
20 |     block_size: int,
21 | ) -> Tuple[Float[Tensor, "cum_tiles_hit 1"], Float[Tensor, "cum_tiles_hit 1"]]:
22 | 
23 |     isect_ids, gaussian_ids = _C.map_gaussian_to_intersects(
24 |         num_points,
25 |         num_intersects,
26 |         xys.contiguous(),
27 |         depths.contiguous(),
28 |         radii.contiguous(),
29 |         cum_tiles_hit.contiguous(),
30 |         tile_bounds,
31 |         block_size,
32 |     )
33 |     return (isect_ids, gaussian_ids)
34 | 
35 | 
36 | def get_tile_bin_edges(
37 |     num_intersects: int,
38 |     isect_ids_sorted: Int[Tensor, "num_intersects 1"],
39 |     tile_bounds: Tuple[int, int, int],
40 | ) -> Int[Tensor, "num_intersects 2"]:
41 | 
42 |     return _C.get_tile_bin_edges(
43 |         num_intersects, isect_ids_sorted.contiguous(), tile_bounds
44 |     )
45 | 
46 | 
47 | def compute_cov2d_bounds(
48 |     cov2d: Float[Tensor, "batch 3"]
49 | ) -> Tuple[Float[Tensor, "batch_conics 3"], Float[Tensor, "batch_radii 1"]]:
50 | 
51 |     assert (
52 |         cov2d.shape[-1] == 3
53 |     ), f"Expected input cov2d to be of shape (*batch, 3) (upper triangular values), but got {tuple(cov2d.shape)}"
54 |     num_pts = cov2d.shape[0]
55 |     assert num_pts > 0
56 |     return _C.compute_cov2d_bounds(num_pts, cov2d.contiguous())
57 | 
58 | 
59 | def compute_cumulative_intersects(
60 |     num_tiles_hit: Float[Tensor, "batch 1"]
61 | ) -> Tuple[int, Float[Tensor, "batch 1"]]:
62 | 
63 |     cum_tiles_hit = torch.cumsum(num_tiles_hit, dim=0, dtype=torch.int32)
64 |     num_intersects = cum_tiles_hit[-1].item()
65 |     return num_intersects, cum_tiles_hit
66 | 
67 | 
68 | def bin_and_sort_gaussians(
69 |     num_points: int,
70 |     num_intersects: int,
71 |     xys: Float[Tensor, "batch 2"],
72 |     depths: Float[Tensor, "batch 1"],
73 |     radii: Float[Tensor, "batch 1"],
74 |     cum_tiles_hit: Float[Tensor, "batch 1"],
75 |     tile_bounds: Tuple[int, int, int],
76 |     block_size: int,
77 | ) -> Tuple[
78 |     Float[Tensor, "num_intersects 1"],
79 |     Float[Tensor, "num_intersects 1"],
80 |     Float[Tensor, "num_intersects 1"],
81 |     Float[Tensor, "num_intersects 1"],
82 |     Float[Tensor, "num_intersects 2"],
83 | ]:
84 | 
85 |     isect_ids, gaussian_ids = map_gaussian_to_intersects(
86 |         num_points,
87 |         num_intersects,
88 |         xys,
89 |         depths,
90 |         radii,
91 |         cum_tiles_hit,
92 |         tile_bounds,
93 |         block_size,
94 |     )
95 |     isect_ids_sorted, sorted_indices = torch.sort(isect_ids)
96 |     gaussian_ids_sorted = torch.gather(gaussian_ids, 0, sorted_indices)
97 |     tile_bins = get_tile_bin_edges(num_intersects, isect_ids_sorted, tile_bounds)
98 |     return isect_ids, gaussian_ids, isect_ids_sorted, gaussian_ids_sorted, tile_bins
99 | 

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/cuda/csrc/bindings.h:
--------------------------------------------------------------------------------
1 | #include "cuda_runtime.h"
2 | #include "forward.cuh"
3 | #include <c10/cuda/CUDAGuard.h>
4 | #include <cstdio>
5 | #include <iostream>
6 | #include <math.h>
7 | #include <torch/extension.h>
8 | #include <tuple>
9 | 
10 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
11 | #define CHECK_CONTIGUOUS(x) \
12 |     TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
13 | #define CHECK_INPUT(x) \
14 |     CHECK_CUDA(x); \
15 |     CHECK_CONTIGUOUS(x)
16 | #define DEVICE_GUARD(_ten) \
17 |     const at::cuda::OptionalCUDAGuard device_guard(device_of(_ten));
18 | 
19 | std::tuple<
20 |     torch::Tensor, // output conics
21 |     torch::Tensor> // output radii
22 | compute_cov2d_bounds_tensor(const int num_pts, torch::Tensor &A);
23 | 
24 | std::tuple<
25 |     torch::Tensor,
26 |     torch::Tensor,
27 |     torch::Tensor,
28 |     torch::Tensor>
29 | project_gaussians_forward_tensor(
30 |     const int num_points,
31 |     torch::Tensor &cov2d,
32 |     torch::Tensor &means2d,
33 |     const unsigned img_height,
34 |     const unsigned img_width,
35 |     const unsigned block_width
36 | );
37 | 
38 | std::tuple<
39 |     torch::Tensor,
40 |     torch::Tensor>
41 | project_gaussians_backward_tensor(
42 |     const int num_points,
43 |     torch::Tensor &radii,
44 |     torch::Tensor &conics,
45 |     torch::Tensor &v_xy,
46 |     torch::Tensor &v_conic
47 | );
48 | 
49 | 
50 | std::tuple<torch::Tensor, torch::Tensor> map_gaussian_to_intersects_tensor(
51 |     const int num_points,
52 |     const int num_intersects,
53 |     const torch::Tensor &xys,
54 |     const torch::Tensor &depths,
55 |     const torch::Tensor &radii,
56 |     const torch::Tensor &cum_tiles_hit,
57 |     const std::tuple<int, int, int> tile_bounds,
58 |     const unsigned block_width
59 | );
60 | 
61 | torch::Tensor get_tile_bin_edges_tensor(
62 |     int num_intersects,
63 |     const torch::Tensor &isect_ids_sorted,
64 |     const std::tuple<int, int, int> tile_bounds
65 | );
66 | 
67 | std::tuple<
68 |     torch::Tensor,
69 |     torch::Tensor
70 | > rasterize_forward_tensor(
71 |     const std::tuple<int, int, int> tile_bounds,
72 |     const std::tuple<int, int, int> block,
73 |     const std::tuple<int, int, int> img_size,
74 |     const torch::Tensor &gaussian_ids_sorted,
75 |     const torch::Tensor &tile_bins,
76 |     const torch::Tensor &xys,
77 |     const torch::Tensor &conics,
78 |     const torch::Tensor &colors
79 | );
80 | 
81 | std::
82 |     tuple<
83 |         torch::Tensor, // dL_dxy
84 |         torch::Tensor, // dL_dxy_abs
85 |         torch::Tensor, // dL_dconic
86 |         torch::Tensor  // dL_dcolors
87 |         >
88 |     rasterize_backward_tensor(
89 |         const unsigned img_height,
90 |         const unsigned img_width,
91 |         const unsigned block_width,
92 |         const torch::Tensor &gaussians_ids_sorted,
93 |         const torch::Tensor &tile_bins,
94 |         const torch::Tensor &xys,
95 |         const torch::Tensor &conics,
96 |         const torch::Tensor &colors,
97 |         const torch::Tensor &final_idx,
98 |         const torch::Tensor &v_output
99 |     );
100 | 
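The declarations above are wrapped one-to-one by `gsplat2d/utils.py`. A rough sketch of the binning flow those wrappers drive (toy sizes and a CUDA device are assumed; `rasterize.py` runs exactly this sequence internally):

```python
import torch
from gsplat2d.project_gaussians import project_gaussians
from gsplat2d.utils import compute_cumulative_intersects, bin_and_sort_gaussians

H = W = 64
block = 16
tile_bounds = ((W + block - 1) // block, (H + block - 1) // block, 1)

cov2d = torch.tensor([[4.0, 0.0, 4.0]], device="cuda").repeat(8, 1)  # isotropic blobs
means = torch.rand(8, 2, device="cuda") * W
xys, radii, conics, num_tiles_hit = project_gaussians(cov2d, means, H, W, block)

depths = torch.zeros_like(xys[..., 0])  # purely 2D: all depths are zero
num_isects, cum_tiles_hit = compute_cumulative_intersects(num_tiles_hit)
# Each isect id packs (tile_id << 32 | depth bits); sorting by it groups the
# intersections tile by tile, and tile_bins then holds one [start, end) per tile.
*_, gaussian_ids_sorted, tile_bins = bin_and_sort_gaussians(
    8, num_isects, xys, depths, radii, cum_tiles_hit, tile_bounds, block
)
```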
--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/cuda/_backend.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import json
3 | import os
4 | import shutil
5 | from subprocess import DEVNULL, call
6 | 
7 | from rich.console import Console
8 | from torch.utils.cpp_extension import _get_build_directory, load
9 | 
10 | PATH = os.path.dirname(os.path.abspath(__file__))
11 | 
12 | 
13 | def cuda_toolkit_available():
14 |     """Check if nvcc is available on the machine."""
15 |     try:
16 |         call(["nvcc"], stdout=DEVNULL, stderr=DEVNULL)
17 |         return True
18 |     except FileNotFoundError:
19 |         return False
20 | 
21 | 
22 | def cuda_toolkit_version():
23 |     """Get the cuda toolkit version."""
24 |     cuda_home = os.path.join(os.path.dirname(shutil.which("nvcc")), "..")
25 |     if os.path.exists(os.path.join(cuda_home, "version.txt")):
26 |         with open(os.path.join(cuda_home, "version.txt")) as f:
27 |             cuda_version = f.read().strip().split()[-1]
28 |     elif os.path.exists(os.path.join(cuda_home, "version.json")):
29 |         with open(os.path.join(cuda_home, "version.json")) as f:
30 |             cuda_version = json.load(f)["cuda"]["version"]
31 |     else:
32 |         raise RuntimeError("Cannot find the cuda version.")
33 |     return cuda_version
34 | 
35 | 
36 | name = "gsplat2d_cuda"
37 | build_dir = _get_build_directory(name, verbose=False)
38 | extra_include_paths = [os.path.join(PATH, "csrc/third_party/glm")]
39 | extra_cflags = ["-O3"]
40 | extra_cuda_cflags = ["-O3"]
41 | 
42 | _C = None
43 | sources = list(glob.glob(os.path.join(PATH, "csrc/*.cu"))) + list(
44 |     glob.glob(os.path.join(PATH, "csrc/*.cpp"))
45 | )
46 | 
47 | try:
48 |     # try to import the compiled module (via setup.py)
49 |     from gsplat2d import csrc as _C
50 | except ImportError:
51 |     # if that failed, try JIT compilation
52 |     if cuda_toolkit_available():
53 |         # If JIT is interrupted it might leave a lock in the build directory.
54 |         # We don't want it to exist in any case.
55 |         try:
56 |             os.remove(os.path.join(build_dir, "lock"))
57 |         except OSError:
58 |             pass
59 | 
60 |         if os.path.exists(os.path.join(build_dir, "gsplat2d_cuda.so")) or os.path.exists(
61 |             os.path.join(build_dir, "gsplat2d_cuda.lib")
62 |         ):
63 |             # If the build exists, we assume the extension has been built
64 |             # and we can load it.
65 | 
66 |             _C = load(
67 |                 name=name,
68 |                 sources=sources,
69 |                 extra_cflags=extra_cflags,
70 |                 extra_cuda_cflags=extra_cuda_cflags,
71 |                 extra_include_paths=extra_include_paths,
72 |             )
73 |         else:
74 |             # Build from scratch. Remove the build directory just to be safe: the
75 |             # PyTorch JIT might get stuck if it exists with a lock file in it.
76 |             shutil.rmtree(build_dir)
77 |             with Console().status(
78 |                 "[bold yellow]gsplat: Setting up CUDA (This may take a few minutes the first time)",
79 |                 spinner="bouncingBall",
80 |             ):
81 |                 _C = load(
82 |                     name=name,
83 |                     sources=sources,
84 |                     extra_cflags=extra_cflags,
85 |                     extra_cuda_cflags=extra_cuda_cflags,
86 |                     extra_include_paths=extra_include_paths,
87 |                 )
88 |     else:
89 |         Console().print(
90 |             "[yellow]gsplat: No CUDA toolkit found. gsplat will be disabled.[/yellow]"
91 |         )
92 | 
93 | 
94 | __all__ = ["_C"]
95 | 

--------------------------------------------------------------------------------
/gaussianlig.py:
--------------------------------------------------------------------------------
1 | from gsplat2d.project_gaussians import project_gaussians
2 | from gsplat2d.rasterize import rasterize_gaussians
3 | from utils import *
4 | import torch
5 | import torch.nn as nn
6 | import math
7 | from optimizer import Adan
8 | 
9 | class LIG(nn.Module):
10 |     def __init__(self, loss_type="L2", **kwargs):
11 |         super().__init__()
12 |         self.loss_type = loss_type
13 |         self.init_num_points = kwargs["num_points"]
14 |         self.H, self.W = kwargs["H"], kwargs["W"]
15 | 
16 |         self.n_scales = kwargs["n_scales"]
17 |         self.allo_ratio = kwargs["allo_ratio"]
18 |         self.level_models = []
19 | 
20 |         self.store_min = []
21 |         self.store_max = []
22 | 
23 |         for s in range(self.n_scales):
24 | 
25 |             # each level works at half the resolution of the next one
26 |             H = int(self.H * pow(2.0, -self.n_scales + s + 1))
27 |             W = int(self.W * pow(2.0, -self.n_scales + s + 1))
28 | 
29 |             # coarser levels take a share of the point budget proportional to
30 |             # their area times allo_ratio; the finest level gets the remainder
31 |             if s != self.n_scales - 1:
32 |                 num_points = int(kwargs["num_points"] * pow(2.0, (-self.n_scales + s + 1)*2) * self.allo_ratio)
33 |             else:
34 |                 num_points = kwargs["num_points"]
35 |                 for i in range(s):
36 |                     num_points -= int(kwargs["num_points"] * pow(2.0, (-self.n_scales + i + 1)*2) * self.allo_ratio)
37 | 
38 |             self.level_models.append(Gaussian2D(loss_type="L2", opt_type=kwargs['opt_type'], num_points=num_points,
39 |                                                 H=H, W=W, BLOCK_H=kwargs['BLOCK_H'], BLOCK_W=kwargs['BLOCK_W'],
40 |                                                 device=kwargs['device'], lr=kwargs['lr']))
41 | 
42 | class Gaussian2D(nn.Module):
43 |     def __init__(self, loss_type="L2", **kwargs):
44 |         super().__init__()
45 |         self.loss_type = loss_type
46 |         self.init_num_points = kwargs["num_points"]
47 |         self.H, self.W = kwargs["H"], kwargs["W"]
48 | 
49 |         self.B_SIZE = 16
50 | 
51 |         self.device = kwargs["device"]
52 | 
53 |         self.last_size = (self.H, self.W)
54 | 
55 |         w_init = torch.rand(self.init_num_points, 1, device=self.device) * self.W
56 |         h_init = torch.rand(self.init_num_points, 1, device=self.device) * self.H
57 |         self.means = nn.Parameter(torch.cat((w_init, h_init), dim=1))
58 | 
59 |         self.cov2d = nn.Parameter(torch.rand(self.init_num_points, 3, device=self.device))
60 |         d = 3
61 |         self.rgbs = nn.Parameter(torch.zeros(self.init_num_points, d, device=self.device))
62 | 
63 |         self.means.requires_grad = True
64 |         self.cov2d.requires_grad = True
65 |         self.rgbs.requires_grad = True
66 | 
67 |         if kwargs["opt_type"] == "adam":
68 |             self.optimizer = torch.optim.Adam([self.rgbs, self.means, self.cov2d], lr=kwargs["lr"])
69 |         else:
70 |             self.optimizer = Adan([self.rgbs, self.means, self.cov2d], lr=kwargs["lr"])
71 |         self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=70000, gamma=0.7)
72 | 
73 |     def forward(self):
74 |         (
75 |             xys,
76 |             radii,
77 |             conics,
78 |             num_tiles_hit,
79 |         ) = project_gaussians(
80 |             self.cov2d,
81 |             self.means,
82 |             self.H,
83 |             self.W,
84 |             self.B_SIZE,
85 |         )
86 |         out_img = rasterize_gaussians(
87 |             xys,
88 |             radii,
89 |             conics,
90 |             num_tiles_hit,
91 |             self.rgbs,
92 |             self.H,
93 |             self.W,
94 |             self.B_SIZE,
95 |         )[..., :3]
96 | 
97 |         out_img = torch.clamp(out_img, 0, 1)
98 |         out_img = out_img.view(-1, self.H, self.W, 3).permute(0, 3, 1, 2).contiguous()
99 |         return {"render": out_img}
100 | 
101 |     def train_iter(self, gt_image):
102 |         render_pkg = self.forward()
103 |         image = render_pkg["render"]
104 |         loss = loss_fn(image, gt_image, self.loss_type, lambda_value=0.7)
105 |         loss.backward()
106 |         with torch.no_grad():
107 |             mse_loss = F.mse_loss(image, gt_image)
108 |             psnr = 10 * math.log10(1.0 / mse_loss.item())
109 |         self.optimizer.step()
110 |         self.optimizer.zero_grad(set_to_none=True)
111 | 
112 |         self.scheduler.step()
113 |         return loss, psnr
114 | 

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 | import torch
3 | from .project_gaussians import project_gaussians
4 | from .rasterize import rasterize_gaussians
5 | from .utils import (
6 |     map_gaussian_to_intersects,
7 |     bin_and_sort_gaussians,
8 |     compute_cumulative_intersects,
9 |     compute_cov2d_bounds,
10 |     get_tile_bin_edges,
11 | )
12 | from .version import __version__
13 | import warnings
14 | 
15 | 
16 | __all__ = [
17 |     "__version__",
18 |     "project_gaussians",
19 |     "rasterize_gaussians",
20 |     "bin_and_sort_gaussians",
21 |     "compute_cumulative_intersects",
22 |     "compute_cov2d_bounds",
23 |     "get_tile_bin_edges",
24 |     "map_gaussian_to_intersects",
25 |     "ProjectGaussians",
26 |     "RasterizeGaussians",
27 |     "BinAndSortGaussians",
28 |     "ComputeCumulativeIntersects",
29 |     "ComputeCov2dBounds",
30 |     "GetTileBinEdges",
31 |     "MapGaussiansToIntersects",
32 | ]
33 | 
34 | # Define these for backwards compatibility
35 | 
36 | 
37 | class MapGaussiansToIntersects(torch.autograd.Function):
38 |     @staticmethod
39 |     def forward(ctx, *args, **kwargs):
40 |         warnings.warn(
41 |             "MapGaussiansToIntersects is deprecated, use map_gaussian_to_intersects instead",
42 |             DeprecationWarning,
43 |         )
44 |         return map_gaussian_to_intersects(*args, **kwargs)
45 | 
46 |     @staticmethod
47 |     def backward(ctx: Any, *grad_outputs: Any) -> Any:
48 |         raise NotImplementedError
49 | 
50 | 
51 | class ComputeCumulativeIntersects(torch.autograd.Function):
52 |     @staticmethod
53 |     def forward(ctx, *args, **kwargs):
54 |         warnings.warn(
55 |             "ComputeCumulativeIntersects is deprecated, use compute_cumulative_intersects instead",
56 |             DeprecationWarning,
57 |         )
58 |         return compute_cumulative_intersects(*args, **kwargs)
59 | 
60 |     @staticmethod
61 |     def backward(ctx: Any, *grad_outputs: Any) -> Any:
62 |         raise NotImplementedError
63 | 
64 | 
65 | class ComputeCov2dBounds(torch.autograd.Function):
66 |     @staticmethod
67 |     def forward(ctx, *args, **kwargs):
68 |         warnings.warn(
69 |             "ComputeCov2dBounds is deprecated, use compute_cov2d_bounds instead",
70 |             DeprecationWarning,
71 |         )
72 |         return compute_cov2d_bounds(*args, **kwargs)
73 | 
74 |     @staticmethod
75 |     def backward(ctx: Any, *grad_outputs: Any) -> Any:
76 |         raise NotImplementedError
77 | 
78 | 
79 | class GetTileBinEdges(torch.autograd.Function):
80 |     @staticmethod
81 |     def forward(ctx, *args, **kwargs):
82 |         warnings.warn(
83 |             "GetTileBinEdges is deprecated, use get_tile_bin_edges instead",
84 |             DeprecationWarning,
85 |         )
86 |         return get_tile_bin_edges(*args, **kwargs)
87 | 
88 |     @staticmethod
89 |     def backward(ctx: Any, *grad_outputs: Any) -> Any:
90 |         raise NotImplementedError
91 | 
92 | 
93 | class BinAndSortGaussians(torch.autograd.Function):
94 |     @staticmethod
95 |     def forward(ctx, *args, **kwargs):
96 |         warnings.warn(
97 |             "BinAndSortGaussians is deprecated, use bin_and_sort_gaussians instead",
98 |             DeprecationWarning,
99 |         )
100 |         return bin_and_sort_gaussians(*args, **kwargs)
101 | 
102 |     @staticmethod
103 |     def backward(ctx: Any, *grad_outputs: Any) -> Any:
104 |         raise NotImplementedError
105 | 
106 | 
107 | class ProjectGaussians(torch.autograd.Function):
108 |     @staticmethod
109 |     def forward(ctx, *args, **kwargs):
110 |         warnings.warn(
111 |             "ProjectGaussians is deprecated, use project_gaussians instead",
112 |             DeprecationWarning,
113 |         )
114 |         return project_gaussians(*args, **kwargs)
115 | 
116 |     @staticmethod
117 |     def backward(ctx: Any, *grad_outputs: Any) -> Any:
118 |         raise NotImplementedError
119 | 
120 | 
121 | class RasterizeGaussians(torch.autograd.Function):
122 |     @staticmethod
123 |     def forward(ctx, *args, **kwargs):
124 |         warnings.warn(
125 |             "RasterizeGaussians is deprecated, use rasterize_gaussians instead",
126 |             DeprecationWarning,
127 |         )
128 |         return rasterize_gaussians(*args, **kwargs)
129 | 
130 |     @staticmethod
131 |     def backward(ctx: Any, *grad_outputs: Any) -> Any:
132 |         raise NotImplementedError

--------------------------------------------------------------------------------
/gsplat2d/setup.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import os.path as osp
4 | import platform
5 | import sys
6 | 
7 | from setuptools import find_packages, setup
8 | 
9 | __version__ = None
10 | exec(open("gsplat2d/version.py", "r").read())
11 | 
12 | BUILD_NO_CUDA = os.getenv("BUILD_NO_CUDA", "0") == "1"
13 | WITH_SYMBOLS = os.getenv("WITH_SYMBOLS", "0") == "1"
14 | LINE_INFO = os.getenv("LINE_INFO", "0") == "1"
15 | 
16 | 
17 | def get_ext():
18 |     from torch.utils.cpp_extension import BuildExtension
19 | 
20 |     return BuildExtension.with_options(no_python_abi_suffix=True, use_ninja=False)
21 | 
22 | 
23 | def get_extensions():
24 |     import torch
25 |     from torch.__config__ import parallel_info
26 |     from torch.utils.cpp_extension import CUDAExtension
27 | 
28 |     extensions_dir = osp.join("gsplat2d", "cuda", "csrc")
29 |     sources = glob.glob(osp.join(extensions_dir, "*.cu")) + glob.glob(
30 |         osp.join(extensions_dir, "*.cpp")
31 |     )
32 | 
33 |     sources = [path for path in sources if "hip" not in path]
34 | 
35 |     undef_macros = []
36 |     define_macros = []
37 | 
38 |     if sys.platform == "win32":
39 |         define_macros += [("gsplat_EXPORTS", None)]
40 | 
41 |     extra_compile_args = {"cxx": ["-O3"]}
42 |     if not os.name == "nt":  # Not on Windows:
43 |         extra_compile_args["cxx"] += ["-Wno-sign-compare"]
44 |     extra_link_args = [] if WITH_SYMBOLS else ["-s"]
45 | 
46 |     info = parallel_info()
47 |     if (
48 |         "backend: OpenMP" in info
49 |         and "OpenMP not found" not in info
50 |         and sys.platform != "darwin"
51 |     ):
52 |         extra_compile_args["cxx"] += ["-DAT_PARALLEL_OPENMP"]
53 |         if sys.platform == "win32":
54 |             extra_compile_args["cxx"] += ["/openmp"]
55 |         else:
56 |             extra_compile_args["cxx"] += ["-fopenmp"]
57 |     else:
58 |         print("Compiling without OpenMP...")
59 | 
60 |     # Compile for mac arm64
61 |     if sys.platform == "darwin" and platform.machine() == "arm64":
62 |         extra_compile_args["cxx"] += ["-arch", "arm64"]
63 |         extra_link_args += ["-arch", "arm64"]
64 | 
65 |     nvcc_flags = os.getenv("NVCC_FLAGS", "")
66 |     nvcc_flags = [] if nvcc_flags == "" else nvcc_flags.split(" ")
67 |     nvcc_flags += ["-O3", "--use_fast_math"]
68 |     if LINE_INFO:
69 |         nvcc_flags += ["-lineinfo"]
70 |     if torch.version.hip:
71 |         # USE_ROCM was added to later versions of PyTorch.
72 |         # Define here to support older PyTorch versions as well:
73 |         define_macros += [("USE_ROCM", None)]
74 |         undef_macros += ["__HIP_NO_HALF_CONVERSIONS__"]
75 |     else:
76 |         nvcc_flags += ["--expt-relaxed-constexpr"]
77 |     extra_compile_args["nvcc"] = nvcc_flags
78 |     if sys.platform == "win32":
79 |         extra_compile_args["nvcc"] += ["-DWIN32_LEAN_AND_MEAN"]
80 | 
81 |     extension = CUDAExtension(
82 |         "gsplat2d.csrc",
83 |         sources,
84 |         include_dirs=[osp.join(extensions_dir, "third_party", "glm")],
85 |         define_macros=define_macros,
86 |         undef_macros=undef_macros,
87 |         extra_compile_args=extra_compile_args,
88 |         extra_link_args=extra_link_args,
89 |     )
90 | 
91 |     return [extension]
92 | 
93 | 
94 | setup(
95 |     name="gsplat2d",
96 |     version=__version__,
97 |     description="Python package for differentiable rasterization of gaussians",
98 |     keywords="gaussian, splatting, cuda",
99 |     python_requires=">=3.7",
100 |     install_requires=[
101 |         "jaxtyping",
102 |         "rich>=12",
103 |         "torch",
104 |         "typing_extensions; python_version<'3.8'",
105 |     ],
106 |     extras_require={
107 |         # dev dependencies. Install them by `pip install gsplat[dev]`
108 |         "dev": [
109 |             "black[jupyter]==22.3.0",
110 |             "isort==5.10.1",
111 |             "pylint==2.13.4",
112 |             "pytest==7.1.2",
113 |             "pytest-xdist==2.5.0",
114 |             "typeguard>=2.13.3",
115 |             "pyyaml==6.0",
116 |             "build",
117 |             "twine",
118 |             "ninja",
119 |         ],
120 |     },
121 |     ext_modules=get_extensions() if not BUILD_NO_CUDA else [],
122 |     cmdclass={"build_ext": get_ext()} if not BUILD_NO_CUDA else {},
123 |     packages=find_packages(),
124 |     # https://github.com/pypa/setuptools/issues/1461#issuecomment-954725244
125 |     include_package_data=True,
126 | )
127 | 

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch.nn.functional as F
3 | from pytorch_msssim import ms_ssim, ssim
4 | import torch
5 | 
6 | class LogWriter:
7 |     def __init__(self, file_path, train=True):
8 |         os.makedirs(file_path, exist_ok=True)
9 |         self.file_path = os.path.join(file_path, "train.txt" if train else "test.txt")
10 | 
11 |     def write(self, text):
12 |         print(text)
13 |         with open(self.file_path, 'a') as file:
14 |             file.write(text + '\n')
15 | 
16 | 
17 | def loss_fn(pred, target, loss_type='L2', lambda_value=0.7):
18 |     target = target.detach()
19 |     pred = pred.float()
20 |     target = target.float()
21 |     if loss_type == 'L2':
22 |         loss = F.mse_loss(pred, target)
23 |     elif loss_type == 'L1':
24 |         loss = F.l1_loss(pred, target)
25 |     elif loss_type == 'SSIM':
26 |         loss = 1 - ssim(pred, target, data_range=1, size_average=True)
27 |     elif loss_type == 'Fusion1':
28 |         loss = lambda_value * F.mse_loss(pred, target) + (1-lambda_value) * (1 - ssim(pred, target, data_range=1, size_average=True))
29 |     elif loss_type == 'Fusion2':
30 |         loss = lambda_value * F.l1_loss(pred, target) + (1-lambda_value) * (1 - ssim(pred, target, data_range=1, size_average=True))
31 |     elif loss_type == 'Fusion3':
32 |         loss = lambda_value * F.mse_loss(pred, target) + (1-lambda_value) * F.l1_loss(pred, target)
33 |     elif loss_type == 'Fusion4':
34 |         loss = lambda_value * F.l1_loss(pred, target) + (1-lambda_value) * (1 - ms_ssim(pred, target, data_range=1, size_average=True))
35 |     elif loss_type == 'Fusion_hinerv':
36 |         loss = lambda_value * F.l1_loss(pred, target) + (1-lambda_value) * (1 - ms_ssim(pred, target, data_range=1, size_average=True, win_size=5))
37 |     return loss
38 | 
39 | def strip_lowerdiag(L):
40 |     if L.shape[1] == 3:
41 |         uncertainty = torch.zeros((L.shape[0], 6), dtype=torch.float, device="cuda")
42 |         uncertainty[:, 0] = L[:, 0, 0]
43 |         uncertainty[:, 1] = L[:, 0, 1]
44 |         uncertainty[:, 2] = L[:, 0, 2]
45 |         uncertainty[:, 3] = L[:, 1, 1]
46 |         uncertainty[:, 4] = L[:, 1, 2]
47 |         uncertainty[:, 5] = L[:, 2, 2]
48 | 
49 |     elif L.shape[1] == 2:
50 |         uncertainty = torch.zeros((L.shape[0], 3), dtype=torch.float, device="cuda")
51 |         uncertainty[:, 0] = L[:, 0, 0]
52 |         uncertainty[:, 1] = L[:, 0, 1]
53 |         uncertainty[:, 2] = L[:, 1, 1]
54 |     return uncertainty
55 | 
56 | def strip_symmetric(sym):
57 |     return strip_lowerdiag(sym)
58 | 
59 | def build_rotation(r):
60 |     norm = torch.sqrt(r[:,0]*r[:,0] + r[:,1]*r[:,1] + r[:,2]*r[:,2] + r[:,3]*r[:,3])
61 | 
62 |     q = r / norm[:, None]
63 | 
64 |     R = torch.zeros((q.size(0), 3, 3), device='cuda')
65 | 
66 |     r = q[:, 0]
67 |     x = q[:, 1]
68 |     y = q[:, 2]
69 |     z = q[:, 3]
70 | 
71 |     R[:, 0, 0] = 1 - 2 * (y*y + z*z)
72 |     R[:, 0, 1] = 2 * (x*y - r*z)
73 |     R[:, 0, 2] = 2 * (x*z + r*y)
74 |     R[:, 1, 0] = 2 * (x*y + r*z)
75 |     R[:, 1, 1] = 1 - 2 * (x*x + z*z)
76 |     R[:, 1, 2] = 2 * (y*z - r*x)
77 |     R[:, 2, 0] = 2 * (x*z - r*y)
78 |     R[:, 2, 1] = 2 * (y*z + r*x)
79 |     R[:, 2, 2] = 1 - 2 * (x*x + y*y)
80 |     return R
81 | 
82 | def build_scaling_rotation(s, r):
83 |     L = torch.zeros((s.shape[0], 3, 3), dtype=torch.float, device="cuda")
84 |     R = build_rotation(r)
85 | 
86 |     L[:,0,0] = s[:,0]
87 |     L[:,1,1] = s[:,1]
88 |     L[:,2,2] = s[:,2]
89 | 
90 |     L = R @ L
91 |     return L
92 | 
93 | def build_rotation_2d(r, device='cuda'):
94 |     '''
95 |     Build rotation matrix in 2D.
96 |     '''
97 |     R = torch.zeros((r.size(0), 2, 2), device=device)
98 |     R[:, 0, 0] = torch.cos(r)[:, 0]
99 |     R[:, 0, 1] = -torch.sin(r)[:, 0]
100 |     R[:, 1, 0] = torch.sin(r)[:, 0]
101 |     R[:, 1, 1] = torch.cos(r)[:, 0]
102 |     return R
103 | 
104 | def build_scaling_rotation_2d(s, r, device):
105 |     L = torch.zeros((s.shape[0], 2, 2), dtype=torch.float, device=device)
106 |     R = build_rotation_2d(r, device)
107 |     L[:,0,0] = s[:,0]
108 |     L[:,1,1] = s[:,1]
109 |     L = R @ L
110 |     return L
111 | 
112 | def build_covariance_from_scaling_rotation_2d(scaling, scaling_modifier, rotation, device):
113 |     '''
114 |     Build covariance matrix from rotation and scale matrices.
115 |     '''
116 |     L = build_scaling_rotation_2d(scaling_modifier * scaling, rotation, device)
117 |     actual_covariance = L @ L.transpose(1, 2)
118 |     return actual_covariance
119 | 
120 | def build_triangular(r):
121 |     R = torch.zeros((r.size(0), 2, 2), device=r.device)
122 |     R[:, 0, 0] = r[:, 0]
123 |     R[:, 1, 0] = r[:, 1]
124 |     R[:, 1, 1] = r[:, 2]
125 |     return R

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/rasterize.py:
--------------------------------------------------------------------------------
1 | """Python bindings for custom Cuda functions"""
2 | 
3 | import torch
4 | from jaxtyping import Float, Int
5 | from torch import Tensor
6 | from torch.autograd import Function
7 | 
8 | import gsplat2d.cuda as _C
9 | 
10 | from .utils import bin_and_sort_gaussians, compute_cumulative_intersects
11 | 
12 | def rasterize_gaussians(
13 |     xys: Float[Tensor, "*batch 2"],
14 |     radii: Float[Tensor, "*batch 1"],
15 |     conics: Float[Tensor, "*batch 3"],
16 |     num_tiles_hit: Int[Tensor, "*batch 1"],
17 |     colors: Float[Tensor, "*batch channels"],
18 |     img_height: int,
19 |     img_width: int,
20 |     block_width: int,
21 | ) -> Tensor:
22 | 
23 |     assert block_width > 1 and block_width <= 16, "block_width must be between 2 and 16"
24 |     if colors.dtype == torch.uint8:
25 |         colors = colors.float() / 255
26 | 
27 |     if xys.ndimension() != 2 or xys.size(1) != 2:
28 |         raise ValueError("xys must have dimensions (N, 2)")
29 | 
30 |     if colors.ndimension() != 2:
31 |         raise ValueError("colors must have dimensions (N, D)")
32 | 
33 |     return _RasterizeGaussians.apply(
34 |         xys.contiguous(),
35 |         radii.contiguous(),
36 |         conics.contiguous(),
37 |         num_tiles_hit.contiguous(),
38 |         colors.contiguous(),
39 |         img_height,
40 |         img_width,
41 |         block_width,
42 |     )
43 | 
44 | 
45 | class _RasterizeGaussians(Function):
46 |     """Rasterizes 2D gaussians"""
47 | 
48 |     @staticmethod
49 |     def forward(
50 |         ctx,
51 |         xys: Float[Tensor, "*batch 2"],
52 |         radii: Float[Tensor, "*batch 1"],
53 |         conics: Float[Tensor, "*batch 3"],
54 |         num_tiles_hit: Int[Tensor, "*batch 1"],
55 |         colors: Float[Tensor, "*batch channels"],
56 |         img_height: int,
57 |         img_width: int,
58 |         block_width: int,
59 |     ) -> Tensor:
60 |         num_points = xys.size(0)
61 |         tile_bounds = (
62 |             (img_width + block_width - 1) // block_width,
63 |             (img_height + block_width - 1) // block_width,
64 |             1,
65 |         )
66 |         block = (block_width, block_width, 1)
67 |         img_size = (img_width, img_height, 1)
68 | 
69 |         depths = torch.zeros_like(xys[..., 0], device=xys.device)
70 | 
71 |         num_intersects, cum_tiles_hit = compute_cumulative_intersects(num_tiles_hit)
72 | 
73 |         if num_intersects < 1:
74 |             out_img = (
75 |                 torch.ones(img_height, img_width, colors.shape[-1], device=xys.device)
76 |             )
77 |             gaussian_ids_sorted = torch.zeros(0, 1, device=xys.device)
78 |             tile_bins = torch.zeros(0, 2, device=xys.device)
79 |             final_idx = torch.zeros(img_height, img_width, device=xys.device)
80 |         else:
81 |             (
82 |                 isect_ids_unsorted,
83 |                 gaussian_ids_unsorted,
84 |                 isect_ids_sorted,
85 |                 gaussian_ids_sorted,
86 |                 tile_bins,
87 |             ) = bin_and_sort_gaussians(
88 |                 num_points,
89 |                 num_intersects,
90 |                 xys,
91 |                 depths,
92 |                 radii,
93 |                 cum_tiles_hit,
94 |                 tile_bounds,
95 |                 block_width,
96 |             )
97 |             rasterize_fn = _C.rasterize_forward
98 | 
99 |             out_img, final_idx = rasterize_fn(
100 |                 tile_bounds,
101 |                 block,
102 |                 img_size,
103 |                 gaussian_ids_sorted,
104 |                 tile_bins,
105 |                 xys,
106 |                 conics,
107 |                 colors,
108 |             )
109 | 
110 |         ctx.img_width = img_width
111 |         ctx.img_height = img_height
112 |         ctx.num_intersects = num_intersects
113 |         ctx.block_width = block_width
114 |         ctx.save_for_backward(
115 |             gaussian_ids_sorted,
116 |             tile_bins,
117 |             xys,
118 |             conics,
119 |             colors,
120 |             final_idx,
121 |         )
122 | 
123 |         return out_img
124 | 
125 |     @staticmethod
126 |     def backward(ctx, v_out_img):
127 |         img_height = ctx.img_height
128 |         img_width = ctx.img_width
129 |         num_intersects = ctx.num_intersects
130 | 
131 |         (
132 |             gaussian_ids_sorted,
133 |             tile_bins,
134 |             xys,
135 |             conics,
136 |             colors,
137 |             final_idx,
138 |         ) = ctx.saved_tensors
139 | 
140 |         if num_intersects < 1:
141 |             v_xy = torch.zeros_like(xys)
142 |             v_xy_abs = torch.zeros_like(xys)
143 |             v_conic = torch.zeros_like(conics)
144 |             v_colors = torch.zeros_like(colors)
145 | 
146 |         else:
147 |             rasterize_fn = _C.rasterize_backward
148 | 
149 |             v_xy, v_xy_abs, v_conic, v_colors = rasterize_fn(
150 |                 img_height,
151 |                 img_width,
152 |                 ctx.block_width,
153 |                 gaussian_ids_sorted,
154 |                 tile_bins,
155 |                 xys,
156 |                 conics,
157 |                 colors,
158 |                 final_idx,
159 |                 v_out_img,
160 |             )
161 | 
162 |         xys.absgrad = v_xy_abs
163 | 
164 |         return (
165 |             v_xy,  # xys
166 |             None,  # radii
167 |             v_conic,  # conics
168 |             None,  # num_tiles_hit
169 |             v_colors,  # colors
170 |             None,  # img_height
171 |             None,  # img_width
172 |             None,  # block_width
173 |         )
174 | 

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/cuda/csrc/forward.cu:
--------------------------------------------------------------------------------
1 | #include "forward.cuh"
2 | #include "helpers.cuh"
3 | #include <algorithm>
4 | #include <cooperative_groups.h>
5 | #include <cooperative_groups/reduce.h>
6 | #include <cub/cub.cuh>
7 | #include <iostream>
8 | 
9 | namespace cg = cooperative_groups;
10 | 
11 | // kernel function for projecting each gaussian on device
12 | // each thread processes one gaussian
13 | __global__ void project_gaussians_forward_kernel(
14 |     const int num_points,
15 |     const float3* __restrict__ cov2d,
16 |     const float2* __restrict__ means2d,
17 |     const dim3 tile_bounds,
18 |     const unsigned block_width,
19 |     float2* __restrict__ xys,
20 |     int* __restrict__ radii,
21 |     float3* __restrict__ conics,
22 |     int32_t* __restrict__ num_tiles_hit
23 | ) {
24 |     unsigned idx = cg::this_grid().thread_rank(); // idx of thread within grid
25 |     if (idx >= num_points) {
26 |         return;
27 |     }
28 |     radii[idx] = 0;
29 |     num_tiles_hit[idx] = 0;
30 | 
31 |     float3 conic;
32 |     float radius;
33 |     float3 cov2d_f = cov2d[idx];
34 |     bool ok = compute_cov2d_bounds(cov2d_f, conic, radius);
35 |     if (!ok)
36 |         return; // zero determinant
37 | 
38 |     conics[idx] = conic;
39 | 
40 |     float2 center = means2d[idx];
41 |     uint2 tile_min, tile_max;
42 |     get_tile_bbox(center, radius, tile_bounds, tile_min, tile_max, block_width);
43 |     int32_t tile_area = (tile_max.x - tile_min.x) * (tile_max.y - tile_min.y);
44 |     if (tile_area <= 0) {
45 |         return;
46 |     }
47 | 
48 |     num_tiles_hit[idx] = tile_area;
49 |     radii[idx] = (int)radius;
50 |     xys[idx] = center;
51 | }
52 | 
53 | // kernel to map each intersection from tile ID and depth to a gaussian
54 | // writes output to isect_ids and gaussian_ids
55 | __global__ void map_gaussian_to_intersects(
56 |     const int num_points,
57 |     const float2* __restrict__ xys,
58 |     const float* __restrict__ depths,
59 |     const int* __restrict__ radii,
60 |     const int32_t* __restrict__ cum_tiles_hit,
61 |     const dim3 tile_bounds,
62 |     const unsigned block_width,
63 |     int64_t* __restrict__ isect_ids,
64 |     int32_t* __restrict__ gaussian_ids
65 | ) {
66 |     unsigned idx = cg::this_grid().thread_rank();
67 |     if (idx >= num_points)
68 |         return;
69 |     if (radii[idx] <= 0)
70 |         return;
71 |     // get the tile bbox for gaussian
72 |     uint2 tile_min, tile_max;
73 |     float2 center = xys[idx];
74 |     get_tile_bbox(center, radii[idx], tile_bounds, tile_min, tile_max, block_width);
75 | 
76 |     // update the intersection info for all tiles this gaussian hits
77 |     int32_t cur_idx = (idx == 0) ? 0 : cum_tiles_hit[idx - 1];
78 |     // printf("point %d starting at %d\n", idx, cur_idx);
79 |     int64_t depth_id = (int64_t) * (int32_t *)&(depths[idx]);
80 |     for (int i = tile_min.y; i < tile_max.y; ++i) {
81 |         for (int j = tile_min.x; j < tile_max.x; ++j) {
82 |             // isect_id is tile ID and depth as int32
83 |             int64_t tile_id = i * tile_bounds.x + j; // tile within image
84 |             isect_ids[cur_idx] = (tile_id << 32) | depth_id; // tile | depth id
85 |             gaussian_ids[cur_idx] = idx; // 2D gaussian id
86 |             ++cur_idx; // handles gaussians that hit more than one tile
87 |         }
88 |     }
89 | }
90 | 
91 | // kernel to map sorted intersection IDs to tile bins
92 | // expect that intersection IDs are sorted by increasing tile ID
93 | // i.e. intersections of a tile are in contiguous chunks
94 | __global__ void get_tile_bin_edges(
95 |     const int num_intersects, const int64_t* __restrict__ isect_ids_sorted, int2* __restrict__ tile_bins
96 | ) {
97 |     unsigned idx = cg::this_grid().thread_rank();
98 |     if (idx >= num_intersects)
99 |         return;
100 |     // save the indices where the tile_id changes
101 |     int32_t cur_tile_idx = (int32_t)(isect_ids_sorted[idx] >> 32);
102 |     if (idx == 0 || idx == num_intersects - 1) {
103 |         if (idx == 0)
104 |             tile_bins[cur_tile_idx].x = 0;
105 |         if (idx == num_intersects - 1)
106 |             tile_bins[cur_tile_idx].y = num_intersects;
107 |     }
108 |     if (idx == 0)
109 |         return;
110 |     int32_t prev_tile_idx = (int32_t)(isect_ids_sorted[idx - 1] >> 32);
111 |     if (prev_tile_idx != cur_tile_idx) {
112 |         tile_bins[prev_tile_idx].y = idx;
113 |         tile_bins[cur_tile_idx].x = idx;
114 |         return;
115 |     }
116 | }
117 | 
118 | 
119 | __global__ void rasterize_forward(
120 |     const dim3 tile_bounds,
121 |     const dim3 img_size,
122 |     const int32_t* __restrict__ gaussian_ids_sorted,
123 |     const int2* __restrict__ tile_bins,
124 |     const float2* __restrict__ xys,
125 |     const float3* __restrict__ conics,
126 |     const float3* __restrict__ colors,
127 |     int* __restrict__ final_index,
128 |     float3* __restrict__ out_img
129 | ) {
130 | 
131 |     auto block = cg::this_thread_block();
132 |     int32_t tile_id =
133 |         block.group_index().y * tile_bounds.x + block.group_index().x;
134 |     unsigned i =
135 |         block.group_index().y * block.group_dim().y + block.thread_index().y;
136 |     unsigned j =
137 |         block.group_index().x * block.group_dim().x + block.thread_index().x;
138 | 
139 |     float px = (float)j + 0.5;
140 |     float py = (float)i + 0.5;
141 |     int32_t pix_id = i * img_size.x + j;
142 | 
143 |     // return if out of bounds
144 |     // keep not rasterizing threads around for reading data
145 |     bool inside = (i < img_size.y && j < img_size.x);
146 |     bool done = !inside;
147 | 
148 |     // have all threads in tile process the same gaussians in batches
149 |     // first collect gaussians between range.x and range.y in batches
150 |     // which gaussians to look through in this tile
151 |     int2 range = tile_bins[tile_id];
152 |     const int block_size = block.size();
153 |     int num_batches = (range.y - range.x + block_size - 1) / block_size;
154 | 
155 |     __shared__ int32_t id_batch[MAX_BLOCK_SIZE];
156 |     // __shared__ float3 xy_opacity_batch[MAX_BLOCK_SIZE];
157 |     __shared__ float2 xy_batch[MAX_BLOCK_SIZE];
158 |     __shared__ float3 conic_batch[MAX_BLOCK_SIZE];
159 | 
160 |     int cur_idx = 0;
161 | 
162 |     // collect and process batches of gaussians
163 |     // each thread loads one gaussian at a time before rasterizing its
164 |     // designated pixel
165 |     int tr = block.thread_rank();
166 |     float3 pix_out = {0.f, 0.f, 0.f};
167 |     for (int b = 0; b < num_batches; ++b) {
168 |         // resync all threads before beginning next batch
169 |         // end early if entire tile is done
170 |         if (__syncthreads_count(done) >= block_size) {
171 |             break;
172 |         }
173 | 
174 |         // each thread fetch 1 gaussian from front to back
175 |         // index of gaussian to load
176 |         int batch_start = range.x + block_size * b;
177 |         int idx = batch_start + tr;
178 |         if (idx < range.y) {
179 |             int32_t g_id = gaussian_ids_sorted[idx];
180 |             id_batch[tr] = g_id;
181 |             xy_batch[tr] = xys[g_id];
182 |             conic_batch[tr] = conics[g_id];
183 |         }
184 | 
185 |         // wait for other threads to collect the gaussians in batch
186 |         block.sync();
187 | 
188 |         // process gaussians in the current batch for this pixel
189 |         int batch_size = min(block_size, range.y - batch_start);
190 |         for (int t = 0; (t < batch_size) && !done; ++t) {
191 |             const float3 conic = conic_batch[t];
192 |             const float2 xy = xy_batch[t];
193 |             const float2 delta = {xy.x - px, xy.y - py};
194 |             const float sigma = 0.5f * (conic.x * delta.x * delta.x +
195 |                                         conic.z * delta.y * delta.y) +
196 |                                 conic.y * delta.x * delta.y;
197 |             const float alpha = min(0.999f, __expf(-sigma));
198 |             if (sigma < 0.f || alpha < 1.f / 255.f) {
199 |                 continue;
200 |             }
201 | 
202 |             int32_t g = id_batch[t];
203 |             const float vis = alpha;
204 |             const float3 c = colors[g];
205 |             pix_out.x = pix_out.x + c.x * vis;
206 |             pix_out.y = pix_out.y + c.y * vis;
207 |             pix_out.z = pix_out.z + c.z * vis;
208 |             cur_idx = batch_start + t;
209 |         }
210 |     }
211 | 
212 |     if (inside) {
213 |         final_index[pix_id] =
214 |             cur_idx; // index in bin of last gaussian in this pixel
215 |         float3 final_color;
216 | 
217 |         final_color.x = pix_out.x;
218 |         final_color.y = pix_out.y;
219 |         final_color.z = pix_out.z;
220 |         out_img[pix_id] = final_color;
221 |     }
222 | }
223 | 

--------------------------------------------------------------------------------
/gsplat2d/gsplat2d/cuda/csrc/backward.cu:
--------------------------------------------------------------------------------
1 | #include "backward.cuh"
2 | #include "helpers.cuh"
3 | #include <cooperative_groups.h>
4 | #include <cooperative_groups/reduce.h>
5 | #include <cuda_runtime.h>
6 | namespace cg = cooperative_groups;
7 | 
8 | inline __device__ void warpSum3(float3& val, cg::thread_block_tile<32>& tile){
9 |     val.x = cg::reduce(tile, val.x, cg::plus<float>());
10 |     val.y = cg::reduce(tile, val.y, cg::plus<float>());
11 |     val.z = cg::reduce(tile, val.z, cg::plus<float>());
12 | }
13 | 
14 | inline __device__ void warpSum2(float2& val, cg::thread_block_tile<32>& tile){
15 |     val.x = cg::reduce(tile, val.x, cg::plus<float>());
16 |     val.y = cg::reduce(tile, val.y, cg::plus<float>());
17 | }
18 | 
19 | inline __device__ void warpSum(float& val, cg::thread_block_tile<32>& tile){
20 |     val = cg::reduce(tile, val, cg::plus<float>());
21 | }
22 | 
23 | __global__ void rasterize_backward_kernel(
24 |     const dim3 tile_bounds,
25 |     const dim3 img_size,
26 |     const int32_t* __restrict__ gaussian_ids_sorted,
27 |     const int2* __restrict__ tile_bins,
28 |     const float2* __restrict__ xys,
29 |     const float3* __restrict__ conics,
30 |     const float3* __restrict__ rgbs,
31 |     const int* __restrict__ final_index,
32 |     const float3* __restrict__ v_output,
33 |     float2* __restrict__ v_xy,
34 |     float2* __restrict__ v_xy_abs,
35 |     float3* __restrict__ v_conic,
36 |     float3* __restrict__ v_rgb
37 | ) {
38 |     auto block = cg::this_thread_block();
39 |     int32_t tile_id =
40 |         block.group_index().y * tile_bounds.x + block.group_index().x;
41 |     unsigned i =
42 |         block.group_index().y * block.group_dim().y + block.thread_index().y;
43 |     unsigned j =
44 |         block.group_index().x * block.group_dim().x + block.thread_index().x;
45 | 
46 |     const float px = (float)j + 0.5;
47 |     const float py = (float)i + 0.5;
48 |     // clamp this value to the last pixel
49 |     const int32_t pix_id = min(i * img_size.x + j, img_size.x * img_size.y - 1);
50 | 
51 |     // keep not rasterizing threads around for reading data
52 |     const bool inside = (i < img_size.y && j < img_size.x);
53 | 
54 |     // the contribution from gaussians behind the current one
55 |     // float3 buffer = {0.f, 0.f, 0.f};
56 |     // index of last gaussian to contribute to this pixel
57 |     const int bin_final = inside ? final_index[pix_id] : 0;
58 | 
59 |     // have all threads in tile process the same gaussians in batches
60 |     // first collect gaussians between range.x and range.y in batches
61 |     // which gaussians to look through in this tile
62 |     const int2 range = tile_bins[tile_id];
63 |     const int block_size = block.size();
64 |     const int num_batches = (range.y - range.x + block_size - 1) / block_size;
65 | 
66 |     __shared__ int32_t id_batch[MAX_BLOCK_SIZE];
67 |     __shared__ float2 xy_batch[MAX_BLOCK_SIZE];
68 |     __shared__ float3 conic_batch[MAX_BLOCK_SIZE];
69 |     __shared__ float3 rgbs_batch[MAX_BLOCK_SIZE];
70 | 
71 |     // df/d_out for this pixel
72 |     const float3 v_out = v_output[pix_id];
73 | 
74 |     // collect and process batches of gaussians
75 |     // each thread loads one gaussian at a time before rasterizing
76 |     const int tr = block.thread_rank();
77 |     cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
78 |     const int warp_bin_final = cg::reduce(warp, bin_final, cg::greater<int>());
79 |     for (int b = 0; b < num_batches; ++b) {
80 |         // resync all threads before writing next batch of shared mem
81 |         block.sync();
82 | 
83 |         // each thread fetch 1 gaussian from back to front
84 |         // 0 index will be furthest back in batch
85 |         // index of gaussian to load
86 |         // batch end is the index of the last gaussian in the batch
87 |         const int batch_end = range.y - 1 - block_size * b;
88 |         int batch_size = min(block_size, batch_end + 1 - range.x);
89 |         const int idx = batch_end - tr;
90 |         if (idx >= range.x) {
91 |             int32_t g_id = gaussian_ids_sorted[idx];
92 |             id_batch[tr] = g_id;
93 |             xy_batch[tr] = xys[g_id];
94 |             conic_batch[tr] = conics[g_id];
95 |             rgbs_batch[tr] = rgbs[g_id];
96 |         }
97 |         // wait for other threads to collect the gaussians in batch
98 |         block.sync();
99 |         // process gaussians in the current batch for this pixel
100 |         // 0 index is the furthest back gaussian in the batch
101 |         for (int t = max(0, batch_end - warp_bin_final); t < batch_size; ++t) {
102 |             int valid = inside;
103 |             if (batch_end - t > bin_final) {
104 |                 valid = 0;
105 |             }
106 |             float alpha;
107 |             float2 delta;
108 |             float3 conic;
109 |             float vis;
110 |             if (valid) {
111 |                 conic = conic_batch[t];
112 |                 float2 xy = xy_batch[t];
113 |                 delta = {xy.x - px, xy.y - py};
114 |                 float sigma = 0.5f * (conic.x * delta.x * delta.x +
115 |                                       conic.z * delta.y * delta.y) +
116 |                               conic.y * delta.x * delta.y;
117 |                 vis = __expf(-sigma);
118 |                 alpha = min(0.99f, vis);
119 |                 if (sigma < 0.f || alpha < 1.f / 255.f) {
120 |                     valid = 0;
121 |                 }
122 |             }
123 |             // if all threads are inactive in this warp, skip this loop
124 |             if (!warp.any(valid)) {
125 |                 continue;
126 |             }
v_rgb_local = {0.f, 0.f, 0.f}; 128 | float3 v_conic_local = {0.f, 0.f, 0.f}; 129 | float2 v_xy_local = {0.f, 0.f}; 130 | float2 v_xy_abs_local = {0.f, 0.f}; 131 | //initialize everything to 0, only set if the lane is valid 132 | if(valid){ 133 | 134 | const float fac = alpha; 135 | float v_alpha = 0.f; 136 | v_rgb_local = {fac * v_out.x, fac * v_out.y, fac * v_out.z}; 137 | 138 | const float3 rgb = rgbs_batch[t]; 139 | 140 | v_alpha += rgb.x * v_out.x; 141 | v_alpha += rgb.y * v_out.y; 142 | v_alpha += rgb.z * v_out.z; 143 | 144 | const float v_sigma = - vis * v_alpha; 145 | v_conic_local = {0.5f * v_sigma * delta.x * delta.x, 146 | v_sigma * delta.x * delta.y, 147 | 0.5f * v_sigma * delta.y * delta.y}; 148 | 149 | v_xy_local = {v_sigma * (conic.x * delta.x + conic.y * delta.y), 150 | v_sigma * (conic.y * delta.x + conic.z * delta.y)}; 151 | v_xy_abs_local = {abs(v_xy_local.x), abs(v_xy_local.y)}; 152 | } 153 | warpSum3(v_rgb_local, warp); 154 | warpSum3(v_conic_local, warp); 155 | warpSum2(v_xy_local, warp); 156 | warpSum2(v_xy_abs_local, warp); 157 | if (warp.thread_rank() == 0) { 158 | int32_t g = id_batch[t]; 159 | float* v_rgb_ptr = (float*)(v_rgb); 160 | atomicAdd(v_rgb_ptr + 3*g + 0, v_rgb_local.x); 161 | atomicAdd(v_rgb_ptr + 3*g + 1, v_rgb_local.y); 162 | atomicAdd(v_rgb_ptr + 3*g + 2, v_rgb_local.z); 163 | 164 | float* v_conic_ptr = (float*)(v_conic); 165 | atomicAdd(v_conic_ptr + 3*g + 0, v_conic_local.x); 166 | atomicAdd(v_conic_ptr + 3*g + 1, v_conic_local.y); 167 | atomicAdd(v_conic_ptr + 3*g + 2, v_conic_local.z); 168 | 169 | float* v_xy_ptr = (float*)(v_xy); 170 | atomicAdd(v_xy_ptr + 2*g + 0, v_xy_local.x); 171 | atomicAdd(v_xy_ptr + 2*g + 1, v_xy_local.y); 172 | 173 | float* v_xy_abs_ptr = (float*)(v_xy_abs); 174 | atomicAdd(v_xy_abs_ptr + 2*g + 0, v_xy_abs_local.x); 175 | atomicAdd(v_xy_abs_ptr + 2*g + 1, v_xy_abs_local.y); 176 | 177 | } 178 | } 179 | } 180 | } 181 | 182 | __global__ void project_gaussians_backward_kernel( 183 | const int num_points, 184 | const int* __restrict__ radii, 185 | const float3* __restrict__ conics, 186 | const float2* __restrict__ v_xy, 187 | const float3* __restrict__ v_conic, 188 | float3* __restrict__ v_cov2d, 189 | float2* __restrict__ v_mean2d 190 | ) { 191 | unsigned idx = cg::this_grid().thread_rank(); // idx of thread within grid 192 | if (idx >= num_points || radii[idx] <= 0) { 193 | return; 194 | } 195 | 196 | v_mean2d[idx].x = v_xy[idx].x; 197 | v_mean2d[idx].y = v_xy[idx].y; 198 | 199 | // get v_cov2d 200 | cov2d_to_conic_vjp(conics[idx], v_conic[idx], v_cov2d[idx]); 201 | } 202 | -------------------------------------------------------------------------------- /gsplat2d/gsplat2d/cuda/csrc/bindings.cu: -------------------------------------------------------------------------------- 1 | #include "backward.cuh" 2 | #include "bindings.h" 3 | #include "forward.cuh" 4 | #include "helpers.cuh" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace cg = cooperative_groups; 17 | 18 | __global__ void compute_cov2d_bounds_kernel( 19 | const unsigned num_pts, const float* __restrict__ covs2d, float* __restrict__ conics, float* __restrict__ radii 20 | ) { 21 | unsigned row = cg::this_grid().thread_rank(); 22 | if (row >= num_pts) { 23 | return; 24 | } 25 | int index = row * 3; 26 | float3 conic; 27 | float radius; 28 | float3 cov2d{ 29 | (float)covs2d[index], (float)covs2d[index + 1], (float)covs2d[index + 2] 30 | }; 31 | 
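// compute_cov2d_bounds (defined in helpers.cuh) inverts the packed 2x2
// covariance (a, b, c) into the conic and derives a conservative screen-space
// radius for the splat; in the reference gsplat implementation this is a
// 3-sigma bound taken from the larger eigenvalue of the covariance.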
compute_cov2d_bounds(cov2d, conic, radius); 32 | conics[index] = conic.x; 33 | conics[index + 1] = conic.y; 34 | conics[index + 2] = conic.z; 35 | radii[row] = radius; 36 | } 37 | 38 | std::tuple< 39 | torch::Tensor, // output conics 40 | torch::Tensor> // output radii 41 | compute_cov2d_bounds_tensor(const int num_pts, torch::Tensor &covs2d) { 42 | DEVICE_GUARD(covs2d); 43 | CHECK_INPUT(covs2d); 44 | torch::Tensor conics = torch::zeros( 45 | {num_pts, covs2d.size(1)}, covs2d.options().dtype(torch::kFloat32) 46 | ); 47 | torch::Tensor radii = 48 | torch::zeros({num_pts, 1}, covs2d.options().dtype(torch::kFloat32)); 49 | 50 | int blocks = (num_pts + N_THREADS - 1) / N_THREADS; 51 | 52 | compute_cov2d_bounds_kernel<<>>( 53 | num_pts, 54 | covs2d.contiguous().data_ptr(), 55 | conics.contiguous().data_ptr(), 56 | radii.contiguous().data_ptr() 57 | ); 58 | return std::make_tuple(conics, radii); 59 | } 60 | 61 | std::tuple< 62 | torch::Tensor, 63 | torch::Tensor, 64 | torch::Tensor, 65 | torch::Tensor> 66 | project_gaussians_forward_tensor( 67 | const int num_points, 68 | torch::Tensor &cov2d, 69 | torch::Tensor &means2d, 70 | const unsigned img_height, 71 | const unsigned img_width, 72 | const unsigned block_width 73 | ) { 74 | DEVICE_GUARD(cov2d); 75 | 76 | dim3 tile_bounds_dim3; 77 | tile_bounds_dim3.x = int((img_width + block_width - 1) / block_width); 78 | tile_bounds_dim3.y = int((img_height + block_width - 1) / block_width); 79 | tile_bounds_dim3.z = 1; 80 | 81 | torch::Tensor xys_d = 82 | torch::zeros({num_points, 2}, cov2d.options().dtype(torch::kFloat32)); 83 | 84 | torch::Tensor radii_d = 85 | torch::zeros({num_points}, cov2d.options().dtype(torch::kInt32)); 86 | torch::Tensor conics_d = 87 | torch::zeros({num_points, 3}, cov2d.options().dtype(torch::kFloat32)); 88 | torch::Tensor num_tiles_hit_d = 89 | torch::zeros({num_points}, cov2d.options().dtype(torch::kInt32)); 90 | 91 | project_gaussians_forward_kernel<<< 92 | (num_points + N_THREADS - 1) / N_THREADS, 93 | N_THREADS>>>( 94 | num_points, 95 | (float3 *)cov2d.contiguous().data_ptr(), 96 | (float2 *)means2d.contiguous().data_ptr(), 97 | tile_bounds_dim3, 98 | block_width, 99 | (float2 *)xys_d.contiguous().data_ptr(), 100 | radii_d.contiguous().data_ptr(), 101 | (float3 *)conics_d.contiguous().data_ptr(), 102 | num_tiles_hit_d.contiguous().data_ptr() 103 | ); 104 | 105 | return std::make_tuple( 106 | xys_d, radii_d, conics_d, num_tiles_hit_d 107 | ); 108 | } 109 | 110 | std::tuple< 111 | torch::Tensor, 112 | torch::Tensor> 113 | project_gaussians_backward_tensor( 114 | const int num_points, 115 | torch::Tensor &radii, 116 | torch::Tensor &conics, 117 | torch::Tensor &v_xy, 118 | torch::Tensor &v_conic 119 | ){ 120 | DEVICE_GUARD(conics); 121 | // Triangular covariance. 122 | torch::Tensor v_cov2d = 123 | torch::zeros({num_points, 3}, conics.options().dtype(torch::kFloat32)); 124 | 125 | torch::Tensor v_mean2d = 126 | torch::zeros({num_points, 2}, conics.options().dtype(torch::kFloat32)); 127 | 128 | project_gaussians_backward_kernel<<< 129 | (num_points + N_THREADS - 1) / N_THREADS, 130 | N_THREADS>>>( 131 | num_points, 132 | radii.contiguous().data_ptr(), 133 | (float3 *)conics.contiguous().data_ptr(), 134 | (float2 *)v_xy.contiguous().data_ptr(), 135 | (float3 *)v_conic.contiguous().data_ptr(), 136 | // Outputs. 
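// v_mean2d is a straight pass-through of the screen-space gradient v_xy,
// while v_cov2d comes from cov2d_to_conic_vjp (helpers.cuh): the conic C is
// the inverse of the symmetric 2x2 covariance S, so from d(S^-1) = -S^-1 dS S^-1
// the pullback of dL/dC is dL/dS = -C (dL/dC) C, adapted to the packed
// (a, b, c) storage both matrices use.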
137 | (float3 *)v_cov2d.contiguous().data_ptr(), 138 | (float2 *)v_mean2d.contiguous().data_ptr() 139 | ); 140 | 141 | return std::make_tuple(v_cov2d, v_mean2d); 142 | } 143 | 144 | std::tuple map_gaussian_to_intersects_tensor( 145 | const int num_points, 146 | const int num_intersects, 147 | const torch::Tensor &xys, 148 | const torch::Tensor &depths, 149 | const torch::Tensor &radii, 150 | const torch::Tensor &cum_tiles_hit, 151 | const std::tuple tile_bounds, 152 | const unsigned block_width 153 | ) { 154 | DEVICE_GUARD(xys); 155 | CHECK_INPUT(xys); 156 | CHECK_INPUT(depths); 157 | CHECK_INPUT(radii); 158 | CHECK_INPUT(cum_tiles_hit); 159 | 160 | dim3 tile_bounds_dim3; 161 | tile_bounds_dim3.x = std::get<0>(tile_bounds); 162 | tile_bounds_dim3.y = std::get<1>(tile_bounds); 163 | tile_bounds_dim3.z = std::get<2>(tile_bounds); 164 | 165 | torch::Tensor gaussian_ids_unsorted = 166 | torch::zeros({num_intersects}, xys.options().dtype(torch::kInt32)); 167 | torch::Tensor isect_ids_unsorted = 168 | torch::zeros({num_intersects}, xys.options().dtype(torch::kInt64)); 169 | 170 | map_gaussian_to_intersects<<< 171 | (num_points + N_THREADS - 1) / N_THREADS, 172 | N_THREADS>>>( 173 | num_points, 174 | (float2 *)xys.contiguous().data_ptr(), 175 | depths.contiguous().data_ptr(), 176 | radii.contiguous().data_ptr(), 177 | cum_tiles_hit.contiguous().data_ptr(), 178 | tile_bounds_dim3, 179 | block_width, 180 | // Outputs. 181 | isect_ids_unsorted.contiguous().data_ptr(), 182 | gaussian_ids_unsorted.contiguous().data_ptr() 183 | ); 184 | 185 | return std::make_tuple(isect_ids_unsorted, gaussian_ids_unsorted); 186 | } 187 | 188 | torch::Tensor get_tile_bin_edges_tensor( 189 | int num_intersects, const torch::Tensor &isect_ids_sorted, 190 | const std::tuple tile_bounds 191 | ) { 192 | DEVICE_GUARD(isect_ids_sorted); 193 | CHECK_INPUT(isect_ids_sorted); 194 | int num_tiles = std::get<0>(tile_bounds) * std::get<1>(tile_bounds); 195 | torch::Tensor tile_bins = torch::zeros( 196 | {num_tiles, 2}, isect_ids_sorted.options().dtype(torch::kInt32) 197 | ); 198 | get_tile_bin_edges<<< 199 | (num_intersects + N_THREADS - 1) / N_THREADS, 200 | N_THREADS>>>( 201 | num_intersects, 202 | isect_ids_sorted.contiguous().data_ptr(), 203 | (int2 *)tile_bins.contiguous().data_ptr() 204 | ); 205 | return tile_bins; 206 | } 207 | 208 | std::tuple 209 | rasterize_forward_tensor( 210 | const std::tuple tile_bounds, 211 | const std::tuple block, 212 | const std::tuple img_size, 213 | const torch::Tensor &gaussian_ids_sorted, 214 | const torch::Tensor &tile_bins, 215 | const torch::Tensor &xys, 216 | const torch::Tensor &conics, 217 | const torch::Tensor &colors //, 218 | ) { 219 | DEVICE_GUARD(xys); 220 | CHECK_INPUT(gaussian_ids_sorted); 221 | CHECK_INPUT(tile_bins); 222 | CHECK_INPUT(xys); 223 | CHECK_INPUT(conics); 224 | CHECK_INPUT(colors); 225 | 226 | dim3 tile_bounds_dim3; 227 | tile_bounds_dim3.x = std::get<0>(tile_bounds); 228 | tile_bounds_dim3.y = std::get<1>(tile_bounds); 229 | tile_bounds_dim3.z = std::get<2>(tile_bounds); 230 | 231 | dim3 block_dim3; 232 | block_dim3.x = std::get<0>(block); 233 | block_dim3.y = std::get<1>(block); 234 | block_dim3.z = std::get<2>(block); 235 | 236 | dim3 img_size_dim3; 237 | img_size_dim3.x = std::get<0>(img_size); 238 | img_size_dim3.y = std::get<1>(img_size); 239 | img_size_dim3.z = std::get<2>(img_size); 240 | 241 | const int channels = colors.size(1); 242 | const int img_width = img_size_dim3.x; 243 | const int img_height = img_size_dim3.y; 244 | 245 | torch::Tensor out_img = 
torch::zeros( 246 | {img_height, img_width, channels}, xys.options().dtype(torch::kFloat32) 247 | ); 248 | torch::Tensor final_idx = torch::zeros( 249 | {img_height, img_width}, xys.options().dtype(torch::kInt32) 250 | ); 251 | 252 | rasterize_forward<<>>( 253 | tile_bounds_dim3, 254 | img_size_dim3, 255 | gaussian_ids_sorted.contiguous().data_ptr(), 256 | (int2 *)tile_bins.contiguous().data_ptr(), 257 | (float2 *)xys.contiguous().data_ptr(), 258 | (float3 *)conics.contiguous().data_ptr(), 259 | (float3 *)colors.contiguous().data_ptr(), 260 | final_idx.contiguous().data_ptr(), 261 | (float3 *)out_img.contiguous().data_ptr() //, 262 | ); 263 | 264 | return std::make_tuple(out_img, final_idx); 265 | } 266 | 267 | std:: 268 | tuple< 269 | torch::Tensor, // dL_dxy 270 | torch::Tensor, // dL_dxy_abs 271 | torch::Tensor, // dL_dconic 272 | torch::Tensor // dL_dcolors 273 | > 274 | rasterize_backward_tensor( 275 | const unsigned img_height, 276 | const unsigned img_width, 277 | const unsigned block_width, 278 | const torch::Tensor &gaussians_ids_sorted, 279 | const torch::Tensor &tile_bins, 280 | const torch::Tensor &xys, 281 | const torch::Tensor &conics, 282 | const torch::Tensor &colors, 283 | const torch::Tensor &final_idx, 284 | const torch::Tensor &v_output //, // dL_dout_color 285 | ) { 286 | DEVICE_GUARD(xys); 287 | CHECK_INPUT(xys); 288 | CHECK_INPUT(colors); 289 | 290 | if (xys.ndimension() != 2 || xys.size(1) != 2) { 291 | AT_ERROR("xys must have dimensions (num_points, 2)"); 292 | } 293 | 294 | if (colors.ndimension() != 2 || colors.size(1) != 3) { 295 | AT_ERROR("colors must have 2 dimensions"); 296 | } 297 | 298 | const int num_points = xys.size(0); 299 | const dim3 tile_bounds = { 300 | (img_width + block_width - 1) / block_width, 301 | (img_height + block_width - 1) / block_width, 302 | 1 303 | }; 304 | const dim3 block(block_width, block_width, 1); 305 | const dim3 img_size = {img_width, img_height, 1}; 306 | const int channels = colors.size(1); 307 | 308 | torch::Tensor v_xy = torch::zeros({num_points, 2}, xys.options()); 309 | torch::Tensor v_xy_abs = torch::zeros({num_points, 2}, xys.options()); 310 | torch::Tensor v_conic = torch::zeros({num_points, 3}, xys.options()); 311 | torch::Tensor v_colors = 312 | torch::zeros({num_points, channels}, xys.options()); 313 | 314 | rasterize_backward_kernel<<>>( 315 | tile_bounds, 316 | img_size, 317 | gaussians_ids_sorted.contiguous().data_ptr(), 318 | (int2 *)tile_bins.contiguous().data_ptr(), 319 | (float2 *)xys.contiguous().data_ptr(), 320 | (float3 *)conics.contiguous().data_ptr(), 321 | (float3 *)colors.contiguous().data_ptr(), 322 | final_idx.contiguous().data_ptr(), 323 | (float3 *)v_output.contiguous().data_ptr(), 324 | (float2 *)v_xy.contiguous().data_ptr(), 325 | (float2 *)v_xy_abs.contiguous().data_ptr(), 326 | (float3 *)v_conic.contiguous().data_ptr(), 327 | (float3 *)v_colors.contiguous().data_ptr() 328 | ); 329 | 330 | return std::make_tuple(v_xy, v_xy_abs, v_conic, v_colors); 331 | } 332 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | from pathlib import Path 4 | import argparse 5 | import yaml 6 | import numpy as np 7 | import torch 8 | import sys 9 | from PIL import Image 10 | import torch.nn.functional as F 11 | from pytorch_msssim import ms_ssim 12 | from utils import * 13 | from tqdm import tqdm 14 | import random 15 | import torchvision.transforms as 
transforms 16 | 17 | class SimpleTrainer2d: 18 | """Trains random 2d gaussians to fit an image.""" 19 | def __init__( 20 | self, 21 | image_path: Path, 22 | log_path: str, 23 | num_points: int = 2000, 24 | model_name:str = "LIG", 25 | iterations:int = 30000, 26 | model_path = None, 27 | args = None, 28 | ): 29 | self.device = torch.device("cuda:0") 30 | self.gt_image = image_path_to_tensor(image_path).to(self.device) 31 | 32 | self.num_points = num_points 33 | image_path = Path(image_path) 34 | self.image_name = image_path.stem 35 | 36 | BLOCK_H, BLOCK_W = 16, 16 37 | self.H, self.W = self.gt_image.shape[2], self.gt_image.shape[3] 38 | self.iterations = iterations 39 | self.save_imgs = args.save_imgs 40 | self.log_dir = Path(log_path + '/' + self.image_name) 41 | 42 | if model_name == "LIG": 43 | from gaussianlig import LIG 44 | self.gaussian_model = LIG(loss_type="L2", opt_type="adam", 45 | num_points=self.num_points, n_scales=args.n_scales, allo_ratio=args.allo_ratio, 46 | H=self.H, W=self.W, BLOCK_H=BLOCK_H, BLOCK_W=BLOCK_W, 47 | device=self.device, lr=args.lr).to(self.device) 48 | 49 | self.logwriter = LogWriter(self.log_dir) 50 | 51 | if model_path is not None: 52 | print(f"loading model path:{model_path}") 53 | if not hasattr(self.gaussian_model, 'n_scales'): 54 | checkpoint = torch.load(model_path, map_location=self.device) 55 | model_dict = self.gaussian_model.state_dict() 56 | pretrained_dict = {k: v for k, v in checkpoint.items() if k in model_dict} 57 | model_dict.update(pretrained_dict) 58 | self.gaussian_model.load_state_dict(model_dict) 59 | else: 60 | checkpoint = torch.load(model_path, map_location=self.device) 61 | for level in range(self.gaussian_model.n_scales): 62 | model_dict = self.gaussian_model.level_models[level].state_dict() 63 | pretrained_dict = {k: v for k, v in checkpoint['state_dict'][level].items() if k in model_dict} 64 | model_dict.update(pretrained_dict) 65 | self.gaussian_model.level_models[level].load_state_dict(model_dict) 66 | self.gaussian_model.store_max = checkpoint['store_max'] 67 | self.gaussian_model.store_min = checkpoint['store_min'] 68 | 69 | def train(self): 70 | psnr_list, iter_list = [], [] 71 | if not hasattr(self.gaussian_model, 'n_scales'): 72 | progress_bar = tqdm(range(1, self.iterations+1), desc="Training progress") 73 | self.gaussian_model.train() 74 | start_time = time.time() 75 | for iter in range(1, self.iterations+1): 76 | loss, psnr = self.gaussian_model.train_iter(self.gt_image) 77 | psnr_list.append(psnr) 78 | iter_list.append(iter) 79 | with torch.no_grad(): 80 | if iter % 10 == 0: 81 | progress_bar.set_postfix({f"Loss":f"{loss.item():.{7}f}", "PSNR":f"{psnr:.{4}f},"}) 82 | progress_bar.update(10) 83 | end_time = time.time() - start_time 84 | progress_bar.close() 85 | psnr_value, ms_ssim_value = self.test() 86 | with torch.no_grad(): 87 | self.gaussian_model.eval() 88 | test_start_time = time.time() 89 | for i in range(100): 90 | _ = self.gaussian_model() 91 | test_end_time = (time.time() - test_start_time)/100 92 | else: 93 | start_time = time.time() 94 | for scale_idx in range(self.gaussian_model.n_scales): 95 | if scale_idx != self.gaussian_model.n_scales - 1 and self.gaussian_model.n_scales > 1: 96 | img_target = torch.nn.functional.interpolate(self.gt_image, 97 | scale_factor=pow(2.0, -self.gaussian_model.n_scales+scale_idx+1), 98 | mode='area') 99 | else: 100 | img_target = self.gt_image 101 | 102 | if scale_idx != 0: 103 | im_estim_prev = torch.nn.functional.interpolate(im_estim, 104 | size = (img_target.shape[2], 
img_target.shape[3]), 105 | mode='bilinear') 106 | del im_estim 107 | if self.save_imgs: 108 | transform = transforms.ToPILImage() 109 | im_estim_prev_img = transform(torch.clamp(im_estim_prev, 0, 1).squeeze(0)) 110 | name = self.image_name + f"_fitting_{scale_idx-1}.png" 111 | im_estim_prev_img.save(str(self.log_dir / name)) 112 | 113 | img_target = img_target - im_estim_prev 114 | im_estim_prev = im_estim_prev.cpu() 115 | img_target += 0.5 116 | 117 | if self.save_imgs: 118 | transform = transforms.ToPILImage() 119 | img_target_img = transform(torch.clamp(img_target, 0, 1).squeeze(0)) 120 | name = self.image_name + f"_residual_{scale_idx-1}.png" 121 | img_target_img.save(str(self.log_dir / name)) 122 | 123 | 124 | store_min = torch.min(img_target) 125 | store_max = torch.max(img_target) 126 | img_target = (img_target - store_min) / (store_max - store_min) 127 | 128 | if self.save_imgs: 129 | transform = transforms.ToPILImage() 130 | img_target_img = transform(torch.clamp(img_target, 0, 1).squeeze(0)) 131 | name = self.image_name + f"_residual_scale_{scale_idx-1}.png" 132 | img_target_img.save(str(self.log_dir / name)) 133 | 134 | self.gaussian_model.store_min.append(store_min) 135 | self.gaussian_model.store_max.append(store_max) 136 | 137 | progress_bar = tqdm(range(1, self.iterations+1), desc="Training progress") 138 | self.gaussian_model.level_models[scale_idx].train() 139 | for iter in range(1, self.iterations+1): 140 | # affect memory and speed 141 | torch.cuda.empty_cache() 142 | loss, psnr = self.gaussian_model.level_models[scale_idx].train_iter(img_target) 143 | psnr_list.append(psnr) 144 | iter_list.append(iter) 145 | with torch.no_grad(): 146 | if iter % 10 == 0: 147 | progress_bar.set_postfix({f"Loss":f"{loss.item():.{7}f}", "PSNR":f"{psnr:.{4}f},"}) 148 | progress_bar.update(10) 149 | 150 | with torch.no_grad(): 151 | if scale_idx == 0: 152 | im_estim = self.gaussian_model.level_models[scale_idx]()["render"].float() 153 | else: 154 | im_estim = self.gaussian_model.level_models[scale_idx]()["render"].float()*(store_max-store_min) + im_estim_prev.to(self.device) - 0.5 + store_min 155 | 156 | im_estim = im_estim.detach() 157 | self.gaussian_model.level_models[scale_idx] = self.gaussian_model.level_models[scale_idx].to("cpu") 158 | 159 | end_time = time.time() - start_time 160 | progress_bar.close() 161 | psnr_value, ms_ssim_value = self.test() 162 | 163 | with torch.no_grad(): 164 | self.gaussian_model.eval() 165 | test_start_time = time.time() 166 | for i in range(100): 167 | for scale_idx in range(self.gaussian_model.n_scales): 168 | _ = self.gaussian_model.level_models[scale_idx]() 169 | test_end_time = (time.time() - test_start_time)/100 170 | 171 | self.logwriter.write("Training Complete in {:.4f}s, Eval time:{:.8f}s, FPS:{:.4f}".format(end_time, test_end_time, 1/test_end_time)) 172 | if not hasattr(self.gaussian_model, 'n_scales'): 173 | torch.save(self.gaussian_model.state_dict(), self.log_dir / "gaussian_model.pth.tar") 174 | else: 175 | torch.save({'state_dict':[self.gaussian_model.level_models[scale_idx].state_dict() for scale_idx in range(self.gaussian_model.n_scales)], 176 | 'store_max': self.gaussian_model.store_max, 'store_min': self.gaussian_model.store_min}, self.log_dir / "gaussian_model.pth.tar") 177 | np.save(self.log_dir / "training.npy", {"iterations": iter_list, "training_psnr": psnr_list, "training_time": end_time, 178 | "psnr": psnr_value, "ms-ssim": ms_ssim_value, "rendering_time": test_end_time, "rendering_fps": 1/test_end_time}) 179 | return 
psnr_value, ms_ssim_value, end_time, test_end_time, 1/test_end_time
180 | 
181 |     def test(self):
182 |         if not hasattr(self.gaussian_model, 'n_scales'):
183 |             self.gaussian_model.eval()
184 |             with torch.no_grad():
185 |                 out = self.gaussian_model()["render"].float()
186 |         else:
187 |             for scale_idx in range(self.gaussian_model.n_scales):
188 |                 self.gaussian_model.level_models[scale_idx].to(self.device)
189 |                 self.gaussian_model.level_models[scale_idx].eval()
190 |                 with torch.no_grad():
191 |                     if scale_idx == 0:
192 |                         out = self.gaussian_model.level_models[scale_idx]()["render"].float()
193 |                     else:
194 |                         next_estim = self.gaussian_model.level_models[scale_idx]()["render"].float() * (self.gaussian_model.store_max[scale_idx-1] - self.gaussian_model.store_min[scale_idx-1]) - 0.5 + self.gaussian_model.store_min[scale_idx-1]
195 |                         out = torch.nn.functional.interpolate(out, size=(next_estim.shape[2], next_estim.shape[3]), mode='bilinear')
196 |                         out = out + next_estim
197 |         out = torch.clamp(out, 0, 1)
198 |         mse_loss = F.mse_loss(out, self.gt_image.float())
199 |         psnr = 10 * math.log10(1.0 / mse_loss.item())
200 |         ms_ssim_value = ms_ssim(out, self.gt_image.float(), data_range=1, size_average=True).item()
201 |         self.logwriter.write("Test PSNR:{:.4f}, MS_SSIM:{:.6f}".format(psnr, ms_ssim_value))
202 |         if self.save_imgs:
203 |             transform = transforms.ToPILImage()
204 |             img = transform(out.squeeze(0))
205 |             name = self.image_name + "_fitting.png"
206 |             img.save(str(self.log_dir / name))
207 |         return psnr, ms_ssim_value
208 | 
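In the multiscale case, test() reconstructs the image coarse-to-fine: each level's render is de-normalized with that level's stored min/max, the running estimate is upsampled bilinearly to the level's resolution, and the residual is added on top. The same rule in isolation, as a small sketch (here `renders`, `mins` and `maxs` are illustrative stand-ins for the per-level renders and the stored store_min/store_max values):

import torch
import torch.nn.functional as F

def reconstruct(renders, mins, maxs):
    """Coarse-to-fine reconstruction as in test(): renders[0] is the base
    level; every later render encodes a normalized residual."""
    out = renders[0]
    for render, lo, hi in zip(renders[1:], mins, maxs):
        residual = render * (hi - lo) - 0.5 + lo   # undo min/max normalization and the +0.5 shift
        out = F.interpolate(out, size=residual.shape[2:], mode='bilinear')
        out = out + residual
    return out.clamp(0, 1)

Training (in train() above) is the mirror image: each level is fitted to the normalized residual between the target and the upsampled estimate of the coarser levels.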
209 | def image_path_to_tensor(image_path: Path):
210 |     img = Image.open(image_path)
211 |     transform = transforms.ToTensor()
212 |     img_tensor = transform(img).unsqueeze(0)
213 |     return img_tensor
214 | 
215 | def parse_args(argv):
216 |     parser = argparse.ArgumentParser(description="Example training script.")
217 |     parser.add_argument(
218 |         "-d", "--dataset", type=str, default='./dataset/DIV2K_valid_HR', help="Training dataset"
219 |     )
220 |     parser.add_argument(
221 |         "--data_name", type=str, default='DIV2K_valid_HR', help="Dataset name"
222 |     )
223 |     parser.add_argument(
224 |         "--iterations", type=int, default=30000, help="number of training iterations (default: %(default)s)"
225 |     )
226 |     parser.add_argument(
227 |         "--model_name", type=str, default="LIG", help="model selection (this code implements LIG)"
228 |     )
229 |     parser.add_argument(
230 |         "--num_points",
231 |         type=int,
232 |         default=500000,
233 |         help="number of 2D GS points (default: %(default)s)",
234 |     )
235 |     parser.add_argument("--n_scales", type=int, default=2)
236 |     parser.add_argument("--allo_ratio", type=float, default=0.5)
237 |     parser.add_argument("--model_path", type=str, default=None, help="Path to a checkpoint")
238 |     parser.add_argument("--seed", type=int, default=1, help="Set random seed for reproducibility")
239 |     parser.add_argument("--save_imgs", action="store_true", help="Save image")
240 |     parser.add_argument(
241 |         "--lr",
242 |         type=float,
243 |         default=0.018,
244 |         help="Learning rate (default: %(default)s)",
245 |     )
246 |     args = parser.parse_args(argv)
247 |     return args
248 | 
249 | def main(argv):
250 |     args = parse_args(argv)
251 |     args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)
252 | 
253 |     if args.seed is not None:
254 |         torch.manual_seed(args.seed)
255 |         random.seed(args.seed)
256 |         torch.cuda.manual_seed(args.seed)
257 |         torch.backends.cudnn.deterministic = True
258 |         torch.backends.cudnn.benchmark = False
259 |         np.random.seed(args.seed)
260 | 
261 |     if args.n_scales == 1:
262 |         log_path = f"./checkpoints/{args.data_name}/{args.model_name}_{args.iterations}_{args.num_points}_{args.n_scales}"
263 |     elif args.n_scales > 1:
264 |         log_path = f"./checkpoints/{args.data_name}/{args.model_name}_{args.iterations}_{args.num_points}_{args.n_scales}_{args.allo_ratio}"
265 | 
266 |     logwriter = LogWriter(log_path)
267 |     psnrs, ms_ssims, training_times, eval_times, eval_fpses = [], [], [], [], []
268 |     image_h, image_w = 0, 0
269 |     if args.data_name == "kodak":
270 |         image_length, start = 24, 0
271 |     elif args.data_name == "DIV2K_valid_LRX2" or args.data_name == "DIV2K_valid_HR":
272 |         image_length, start = 100, 800
273 |     elif args.data_name == "STimage":
274 |         image_length, start = 15, 0
275 |     elif args.data_name == "GF1":
276 |         image_length, start = 4, 0
277 |     for i in range(start, start+image_length):
278 |         if args.data_name == "kodak":
279 |             image_path = Path(args.dataset) / f'kodim{i+1:02}.png'
280 |         elif args.data_name == "DIV2K_valid_LRX2":
281 |             image_path = Path(args.dataset) / f'{i+1:04}x2.png'
282 |         elif args.data_name == "DIV2K_valid_HR":
283 |             image_path = Path(args.dataset) / f'{i+1:04}.png'
284 |         elif args.data_name == "STimage":
285 |             image_path = Path(args.dataset) / f'Human_Heart_{i}.png'
286 |         elif args.data_name == "GF1":
287 |             image_path = Path(args.dataset) / f'GF1_{i}.png'
288 | 
289 |         torch.cuda.empty_cache()
290 |         trainer = SimpleTrainer2d(image_path=image_path, log_path=log_path, num_points=args.num_points,
291 |                                   iterations=args.iterations, model_name=args.model_name, args=args, model_path=args.model_path)
292 |         psnr, ms_ssim_value, training_time, eval_time, eval_fps = trainer.train()
293 |         psnrs.append(psnr)
294 |         ms_ssims.append(ms_ssim_value)
295 |         training_times.append(training_time)
296 |         eval_times.append(eval_time)
297 |         eval_fpses.append(eval_fps)
298 |         image_h += trainer.H
299 |         image_w += trainer.W
300 |         image_name = image_path.stem
301 |         logwriter.write("{}: {}x{}, PSNR:{:.4f}, MS-SSIM:{:.4f}, Training:{:.4f}s, Eval:{:.8f}s, FPS:{:.4f}".format(
302 |             image_name, trainer.H, trainer.W, psnr, ms_ssim_value, training_time, eval_time, eval_fps))
303 | 
304 |     avg_psnr = torch.tensor(psnrs).mean().item()
305 |     avg_ms_ssim = torch.tensor(ms_ssims).mean().item()
306 |     avg_training_time = torch.tensor(training_times).mean().item()
307 |     avg_eval_time = torch.tensor(eval_times).mean().item()
308 |     avg_eval_fps = torch.tensor(eval_fpses).mean().item()
309 |     avg_h = image_h//image_length
310 |     avg_w = image_w//image_length
311 | 
312 |     logwriter.write("Average: {}x{}, PSNR:{:.4f}, MS-SSIM:{:.4f}, Training:{:.4f}s, Eval:{:.8f}s, FPS:{:.4f}".format(
313 |         avg_h, avg_w, avg_psnr, avg_ms_ssim, avg_training_time, avg_eval_time, avg_eval_fps))
314 | 
315 | if __name__ == "__main__":
316 |     main(sys.argv[1:])
317 | 
--------------------------------------------------------------------------------
/optimizer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Garena Online Private Limited
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import math
16 | from typing import List
17 | 
18 | import torch
19 | from torch import Tensor
20 | from torch.optim.optimizer import Optimizer
21 | 
22 | 
23 | class MultiTensorApply(object):
24 |     available = False
25 |     warned = False
26 | 
27 |     def __init__(self, chunk_size):
28 |         try:  # vestigial upstream guard: nothing in this block can raise ImportError
29 |             MultiTensorApply.available = True
30 |             self.chunk_size = chunk_size
31 |         except ImportError as err:
32 |             MultiTensorApply.available = False
33 |             MultiTensorApply.import_err = err
34 | 
35 |     def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
36 |         return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
37 | 
38 | 
39 | class Adan(Optimizer):
40 |     """
41 |     Implements a PyTorch variant of Adan.
42 |     Adan was proposed in
43 |     Adan: Adaptive Nesterov Momentum Algorithm for
44 |     Faster Optimizing Deep Models. arXiv preprint arXiv:2208.06677, 2022.
45 |     https://arxiv.org/abs/2208.06677
46 |     Arguments:
47 |         params (iterable): iterable of parameters to optimize or
48 |             dicts defining parameter groups.
49 |         lr (float, optional): learning rate. (default: 1e-3)
50 |         betas (Tuple[float, float, float], optional): coefficients used for
51 |             first- and second-order moments. (default: (0.98, 0.92, 0.99))
52 |         eps (float, optional): term added to the denominator to improve
53 |             numerical stability. (default: 1e-8)
54 |         weight_decay (float, optional): decoupled weight decay
55 |             (L2 penalty) (default: 0)
56 |         max_grad_norm (float, optional): value used to clip the
57 |             global grad norm (default: 0.0, i.e. no clipping)
58 |         no_prox (bool): how to perform the decoupled weight decay
59 |             (default: False)
60 |         foreach (bool): if True, uses the torch._foreach implementation.
61 |             It's faster but uses slightly more memory. (default: True)
62 |         fused (bool, optional): whether the fused implementation is used.
63 |             (default: False)
64 |     """
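Condensed to one dense parameter, the update that step() performs below is the following (a NumPy sketch, illustrative only: the bias corrections are folded in directly, and gradient clipping, restarts and the foreach/fused paths are omitted):

import numpy as np

def adan_step(p, g, g_prev, m, diff, n, step,
              lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, weight_decay=0.0):
    """One Adan update for a single parameter array (step counts from 1)."""
    b1, b2, b3 = betas
    m = b1 * m + (1 - b1) * g                      # first moment of the gradient
    diff = b2 * diff + (1 - b2) * (g - g_prev)     # moment of the gradient difference
    u = g + b2 * (g - g_prev)                      # Nesterov-style update direction
    n = b3 * n + (1 - b3) * u * u                  # second moment of u
    denom = np.sqrt(n / (1 - b3 ** step)) + eps
    p = p - lr * (m / (1 - b1 ** step)) / denom
    p = p - lr * b2 * (diff / (1 - b2 ** step)) / denom
    p = p / (1 + lr * weight_decay)                # proximal weight decay (no_prox=False)
    return p, m, diff, n

With no_prox=True the weight decay is instead applied multiplicatively before the two addcdiv updates, matching the two branches at the end of _single_tensor_adan below.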
65 |     def __init__(self,
66 |                  params,
67 |                  lr=1e-3,
68 |                  betas=(0.98, 0.92, 0.99),
69 |                  eps=1e-8,
70 |                  weight_decay=0.0,
71 |                  max_grad_norm=0.0,
72 |                  no_prox=False,
73 |                  foreach: bool = True,
74 |                  fused: bool = False):
75 |         if not 0.0 <= max_grad_norm:
76 |             raise ValueError('Invalid max_grad_norm: {}'.format(max_grad_norm))
77 |         if not 0.0 <= lr:
78 |             raise ValueError('Invalid learning rate: {}'.format(lr))
79 |         if not 0.0 <= eps:
80 |             raise ValueError('Invalid epsilon value: {}'.format(eps))
81 |         if not 0.0 <= betas[0] < 1.0:
82 |             raise ValueError('Invalid beta parameter at index 0: {}'.format(
83 |                 betas[0]))
84 |         if not 0.0 <= betas[1] < 1.0:
85 |             raise ValueError('Invalid beta parameter at index 1: {}'.format(
86 |                 betas[1]))
87 |         if not 0.0 <= betas[2] < 1.0:
88 |             raise ValueError('Invalid beta parameter at index 2: {}'.format(
89 |                 betas[2]))
90 |         if fused:
91 |             _check_fused_available()
92 | 
93 |         defaults = dict(lr=lr,
94 |                         betas=betas,
95 |                         eps=eps,
96 |                         weight_decay=weight_decay,
97 |                         max_grad_norm=max_grad_norm,
98 |                         no_prox=no_prox,
99 |                         foreach=foreach,
100 |                         fused=fused)
101 |         super().__init__(params, defaults)
102 | 
103 |     def __setstate__(self, state):
104 |         super(Adan, self).__setstate__(state)
105 |         for group in self.param_groups:
106 |             group.setdefault('no_prox', False)
107 | 
108 |     @torch.no_grad()
109 |     def restart_opt(self):
110 |         for group in self.param_groups:
111 |             group['step'] = 0
112 |             for p in group['params']:
113 |                 if p.requires_grad:
114 |                     state = self.state[p]
115 |                     # State initialization
116 | 
117 |                     # Exponential moving average of gradient values
118 |                     state['exp_avg'] = torch.zeros_like(p)
119 |                     # Exponential moving average of squared gradient values
120 |                     state['exp_avg_sq'] = torch.zeros_like(p)
121 |                     # Exponential moving average of gradient difference
122 |                     state['exp_avg_diff'] = torch.zeros_like(p)
123 | 
124 |     @torch.no_grad()
125 |     def step(self, closure=None):
126 |         """Performs a single optimization step."""
127 | 
128 |         loss = None
129 |         if closure is not None:
130 |             with torch.enable_grad():
131 |                 loss = closure()
132 | 
133 |         if self.defaults['max_grad_norm'] > 0:
134 |             device = self.param_groups[0]['params'][0].device
135 |             global_grad_norm = torch.zeros(1, device=device)
136 | 
137 |             max_grad_norm = torch.tensor(self.defaults['max_grad_norm'],
138 |                                          device=device)
139 |             for group in self.param_groups:
140 | 
141 |                 for p in group['params']:
142 |                     if p.grad is not None:
143 |                         grad = p.grad
144 |                         global_grad_norm.add_(grad.pow(2).sum())
145 | 
146 |             global_grad_norm = torch.sqrt(global_grad_norm)
147 | 
148 |             clip_global_grad_norm = torch.clamp(
149 |                 max_grad_norm / (global_grad_norm + group['eps']),
150 |                 max=1.0).item()
151 |         else:
152 |             clip_global_grad_norm = 1.0
153 | 
154 |         for group in self.param_groups:
155 |             params_with_grad = []
156 |             grads = []
157 |             exp_avgs = []
158 |             exp_avg_sqs = []
159 |             exp_avg_diffs = []
160 |             neg_pre_grads = []
161 | 
162 |             beta1, beta2, beta3 = group['betas']
163 |             # assume the same step across the group for now to simplify things;
164 |             # per-parameter steps could be supported by making it a tensor
165 |             # or passing a list into the kernel
166 |             if 'step' in group:
167 |                 group['step'] += 1
168 |             else:
169 |                 group['step'] = 1
170 | 
171 |             bias_correction1 = 1.0 - beta1**group['step']
172 |             bias_correction2 = 1.0 - beta2**group['step']
173 |             bias_correction3 = 1.0 - beta3**group['step']
174 | 
175 |             for p in group['params']:
176 |                 if p.grad is None:
177 |                     continue
178 |                 params_with_grad.append(p)
179 | 
grads.append(p.grad) 180 | 181 | state = self.state[p] 182 | if len(state) == 0: 183 | state['exp_avg'] = torch.zeros_like(p) 184 | state['exp_avg_sq'] = torch.zeros_like(p) 185 | state['exp_avg_diff'] = torch.zeros_like(p) 186 | 187 | if 'neg_pre_grad' not in state or group['step'] == 1: 188 | state['neg_pre_grad'] = p.grad.clone().mul_( 189 | -clip_global_grad_norm) 190 | 191 | exp_avgs.append(state['exp_avg']) 192 | exp_avg_sqs.append(state['exp_avg_sq']) 193 | exp_avg_diffs.append(state['exp_avg_diff']) 194 | neg_pre_grads.append(state['neg_pre_grad']) 195 | 196 | if not params_with_grad: 197 | continue 198 | 199 | kwargs = dict( 200 | params=params_with_grad, 201 | grads=grads, 202 | exp_avgs=exp_avgs, 203 | exp_avg_sqs=exp_avg_sqs, 204 | exp_avg_diffs=exp_avg_diffs, 205 | neg_pre_grads=neg_pre_grads, 206 | beta1=beta1, 207 | beta2=beta2, 208 | beta3=beta3, 209 | bias_correction1=bias_correction1, 210 | bias_correction2=bias_correction2, 211 | bias_correction3_sqrt=math.sqrt(bias_correction3), 212 | lr=group['lr'], 213 | weight_decay=group['weight_decay'], 214 | eps=group['eps'], 215 | no_prox=group['no_prox'], 216 | clip_global_grad_norm=clip_global_grad_norm, 217 | ) 218 | 219 | if group['foreach']: 220 | if group['fused']: 221 | if torch.cuda.is_available(): 222 | _fused_adan_multi_tensor(**kwargs) 223 | else: 224 | raise ValueError('Fused Adan does not support CPU') 225 | else: 226 | _multi_tensor_adan(**kwargs) 227 | elif group['fused']: 228 | if torch.cuda.is_available(): 229 | _fused_adan_single_tensor(**kwargs) 230 | else: 231 | raise ValueError('Fused Adan does not support CPU') 232 | else: 233 | _single_tensor_adan(**kwargs) 234 | 235 | return loss 236 | 237 | 238 | def _single_tensor_adan( 239 | params: List[Tensor], 240 | grads: List[Tensor], 241 | exp_avgs: List[Tensor], 242 | exp_avg_sqs: List[Tensor], 243 | exp_avg_diffs: List[Tensor], 244 | neg_pre_grads: List[Tensor], 245 | *, 246 | beta1: float, 247 | beta2: float, 248 | beta3: float, 249 | bias_correction1: float, 250 | bias_correction2: float, 251 | bias_correction3_sqrt: float, 252 | lr: float, 253 | weight_decay: float, 254 | eps: float, 255 | no_prox: bool, 256 | clip_global_grad_norm: Tensor, 257 | ): 258 | for i, param in enumerate(params): 259 | grad = grads[i] 260 | exp_avg = exp_avgs[i] 261 | exp_avg_sq = exp_avg_sqs[i] 262 | exp_avg_diff = exp_avg_diffs[i] 263 | neg_grad_or_diff = neg_pre_grads[i] 264 | 265 | grad.mul_(clip_global_grad_norm) 266 | 267 | # for memory saving, we use `neg_grad_or_diff` 268 | # to get some temp variable in a inplace way 269 | neg_grad_or_diff.add_(grad) 270 | 271 | exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t 272 | exp_avg_diff.mul_(beta2).add_(neg_grad_or_diff, 273 | alpha=1 - beta2) # diff_t 274 | 275 | neg_grad_or_diff.mul_(beta2).add_(grad) 276 | exp_avg_sq.mul_(beta3).addcmul_(neg_grad_or_diff, 277 | neg_grad_or_diff, 278 | value=1 - beta3) # n_t 279 | 280 | denom = ((exp_avg_sq).sqrt() / bias_correction3_sqrt).add_(eps) 281 | step_size_diff = lr * beta2 / bias_correction2 282 | step_size = lr / bias_correction1 283 | 284 | if no_prox: 285 | param.mul_(1 - lr * weight_decay) 286 | param.addcdiv_(exp_avg, denom, value=-step_size) 287 | param.addcdiv_(exp_avg_diff, denom, value=-step_size_diff) 288 | else: 289 | param.addcdiv_(exp_avg, denom, value=-step_size) 290 | param.addcdiv_(exp_avg_diff, denom, value=-step_size_diff) 291 | param.div_(1 + lr * weight_decay) 292 | 293 | neg_grad_or_diff.zero_().add_(grad, alpha=-1.0) 294 | 295 | 296 | def _multi_tensor_adan( 
297 | params: List[Tensor], 298 | grads: List[Tensor], 299 | exp_avgs: List[Tensor], 300 | exp_avg_sqs: List[Tensor], 301 | exp_avg_diffs: List[Tensor], 302 | neg_pre_grads: List[Tensor], 303 | *, 304 | beta1: float, 305 | beta2: float, 306 | beta3: float, 307 | bias_correction1: float, 308 | bias_correction2: float, 309 | bias_correction3_sqrt: float, 310 | lr: float, 311 | weight_decay: float, 312 | eps: float, 313 | no_prox: bool, 314 | clip_global_grad_norm: Tensor, 315 | ): 316 | if len(params) == 0: 317 | return 318 | 319 | torch._foreach_mul_(grads, clip_global_grad_norm) 320 | 321 | # for memory saving, we use `neg_pre_grads` 322 | # to get some temp variable in a inplace way 323 | torch._foreach_add_(neg_pre_grads, grads) 324 | 325 | torch._foreach_mul_(exp_avgs, beta1) 326 | torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1) # m_t 327 | 328 | torch._foreach_mul_(exp_avg_diffs, beta2) 329 | torch._foreach_add_(exp_avg_diffs, neg_pre_grads, 330 | alpha=1 - beta2) # diff_t 331 | 332 | torch._foreach_mul_(neg_pre_grads, beta2) 333 | torch._foreach_add_(neg_pre_grads, grads) 334 | torch._foreach_mul_(exp_avg_sqs, beta3) 335 | torch._foreach_addcmul_(exp_avg_sqs, 336 | neg_pre_grads, 337 | neg_pre_grads, 338 | value=1 - beta3) # n_t 339 | 340 | denom = torch._foreach_sqrt(exp_avg_sqs) 341 | torch._foreach_div_(denom, bias_correction3_sqrt) 342 | torch._foreach_add_(denom, eps) 343 | 344 | step_size_diff = lr * beta2 / bias_correction2 345 | step_size = lr / bias_correction1 346 | 347 | if no_prox: 348 | torch._foreach_mul_(params, 1 - lr * weight_decay) 349 | torch._foreach_addcdiv_(params, exp_avgs, denom, value=-step_size) 350 | torch._foreach_addcdiv_(params, 351 | exp_avg_diffs, 352 | denom, 353 | value=-step_size_diff) 354 | else: 355 | torch._foreach_addcdiv_(params, exp_avgs, denom, value=-step_size) 356 | torch._foreach_addcdiv_(params, 357 | exp_avg_diffs, 358 | denom, 359 | value=-step_size_diff) 360 | torch._foreach_div_(params, 1 + lr * weight_decay) 361 | torch._foreach_zero_(neg_pre_grads) 362 | torch._foreach_add_(neg_pre_grads, grads, alpha=-1.0) 363 | 364 | 365 | def _fused_adan_multi_tensor( 366 | params: List[Tensor], 367 | grads: List[Tensor], 368 | exp_avgs: List[Tensor], 369 | exp_avg_sqs: List[Tensor], 370 | exp_avg_diffs: List[Tensor], 371 | neg_pre_grads: List[Tensor], 372 | *, 373 | beta1: float, 374 | beta2: float, 375 | beta3: float, 376 | bias_correction1: float, 377 | bias_correction2: float, 378 | bias_correction3_sqrt: float, 379 | lr: float, 380 | weight_decay: float, 381 | eps: float, 382 | no_prox: bool, 383 | clip_global_grad_norm: Tensor, 384 | ): 385 | import fused_adan 386 | multi_tensor_applier = MultiTensorApply(2048 * 32) 387 | _dummy_overflow_buf = torch.cuda.IntTensor([0]) 388 | multi_tensor_applier( 389 | fused_adan.adan_multi_tensor, _dummy_overflow_buf, 390 | [params, grads, exp_avgs, exp_avg_sqs, exp_avg_diffs, neg_pre_grads], 391 | beta1, beta2, beta3, bias_correction1, bias_correction2, 392 | bias_correction3_sqrt, lr, weight_decay, eps, no_prox, 393 | clip_global_grad_norm) 394 | torch._foreach_zero_(neg_pre_grads) 395 | torch._foreach_add_(neg_pre_grads, grads, alpha=-1.0) 396 | 397 | 398 | def _fused_adan_single_tensor( 399 | params: List[Tensor], 400 | grads: List[Tensor], 401 | exp_avgs: List[Tensor], 402 | exp_avg_sqs: List[Tensor], 403 | exp_avg_diffs: List[Tensor], 404 | neg_pre_grads: List[Tensor], 405 | *, 406 | beta1: float, 407 | beta2: float, 408 | beta3: float, 409 | bias_correction1: float, 410 | bias_correction2: 
float, 411 | bias_correction3_sqrt: float, 412 | lr: float, 413 | weight_decay: float, 414 | eps: float, 415 | no_prox: bool, 416 | clip_global_grad_norm: Tensor, 417 | ): 418 | for i, param in enumerate(params): 419 | p_data_fp32 = param.data.float() 420 | out_p = param.data 421 | grad = grads[i] 422 | exp_avg = exp_avgs[i] 423 | exp_avg_sq = exp_avg_sqs[i] 424 | exp_avg_diff = exp_avg_diffs[i] 425 | neg_grad = neg_pre_grads[i] 426 | with torch.cuda.device(param.device): 427 | import fused_adan 428 | fused_adan.adan_single_tensor( 429 | p_data_fp32, 430 | out_p, 431 | grad, 432 | exp_avg, 433 | exp_avg_sq, 434 | exp_avg_diff, 435 | neg_grad, 436 | beta1, 437 | beta2, 438 | beta3, 439 | bias_correction1, 440 | bias_correction2, 441 | bias_correction3_sqrt, 442 | lr, 443 | weight_decay, 444 | eps, 445 | no_prox, 446 | clip_global_grad_norm, 447 | ) 448 | neg_grad.zero_().add_(grad, alpha=-1.0) 449 | 450 | 451 | def _check_fused_available(): 452 | try: 453 | import fused_adan 454 | except ImportError as exc: 455 | if torch.cuda.is_available(): 456 | # The module should be available but isn't. Try to 457 | # help the user in this case. 458 | raise ImportError(( 459 | str(exc) 460 | + ( 461 | '\nThis could be caused by not having compiled ' 462 | 'the CUDA extension during package installation. ' 463 | 'Please try to re-install the package with ' 464 | 'the environment flag `FORCE_CUDA=1` set.' 465 | ) 466 | )) 467 | else: 468 | raise ImportError( 469 | str(exc) + '\nFused Adan does not support CPU.') -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. 
You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 
102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 
163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. 
This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 
344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 
408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. 
For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 
520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year> <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | <program> Copyright (C) <year> <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/philosophy/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------