├── util
│   ├── __init__.py
│   ├── dataset.py
│   ├── scannet_v2.py
│   ├── logger.py
│   ├── s3dis.py
│   ├── voxelize.py
│   ├── vis_util.py
│   ├── config.py
│   └── lr.py
├── lib
│   ├── pointops
│   │   ├── __init__.py
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   ├── knnquery
│   │   │   │   ├── __init__.py
│   │   │   │   ├── knnquery_cuda_kernel.h
│   │   │   │   ├── knnquery_cuda.cpp
│   │   │   │   └── knnquery_cuda_kernel.cu
│   │   │   ├── knnquery_heap
│   │   │   │   ├── __init__.py
│   │   │   │   ├── knnquery_heap_cuda_kernel.h
│   │   │   │   ├── knnquery_heap_cuda.cpp
│   │   │   │   └── knnquery_heap_cuda_kernel.cu
│   │   │   ├── ballquery
│   │   │   │   ├── ballquery_cuda_kernel.h
│   │   │   │   ├── ballquery_cuda.cpp
│   │   │   │   └── ballquery_cuda_kernel.cu
│   │   │   ├── grouping_int
│   │   │   │   ├── grouping_int_cuda_kernel.h
│   │   │   │   ├── grouping_int_cuda.cpp
│   │   │   │   └── grouping_int_cuda_kernel.cu
│   │   │   ├── cuda_utils.h
│   │   │   ├── sampling
│   │   │   │   ├── sampling_cuda_kernel.h
│   │   │   │   └── sampling_cuda.cpp
│   │   │   ├── grouping
│   │   │   │   ├── grouping_cuda_kernel.h
│   │   │   │   ├── grouping_cuda.cpp
│   │   │   │   └── grouping_cuda_kernel.cu
│   │   │   ├── featuredistribute
│   │   │   │   ├── featuredistribute_cuda_kernel.h
│   │   │   │   ├── featuredistribute_cuda.cpp
│   │   │   │   └── featuredistribute_cuda_kernel.cu
│   │   │   ├── labelstat
│   │   │   │   ├── labelstat_cuda_kernel.h
│   │   │   │   ├── labelstat_cuda.cpp
│   │   │   │   └── labelstat_cuda_kernel.cu
│   │   │   ├── interpolation
│   │   │   │   ├── interpolation_cuda_kernel.h
│   │   │   │   └── interpolation_cuda.cpp
│   │   │   └── pointops_api.cpp
│   │   ├── functions
│   │   │   └── __init__.py
│   │   └── setup.py
│   ├── pointops2
│   │   ├── __init__.py
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   ├── sampling
│   │   │   │   ├── sampling_cuda_kernel.h
│   │   │   │   ├── sampling_cuda.cpp
│   │   │   │   └── sampling_cuda_kernel.cu
│   │   │   ├── knnquery
│   │   │   │   ├── knnquery_cuda_kernel.h
│   │   │   │   ├── knnquery_cuda.cpp
│   │   │   │   └── knnquery_cuda_kernel.cu
│   │   │   ├── cuda_utils.h
│   │   │   ├── grouping
│   │   │   │   ├── grouping_cuda_kernel.h
│   │   │   │   ├── grouping_cuda.cpp
│   │   │   │   └── grouping_cuda_kernel.cu
│   │   │   ├── interpolation
│   │   │   │   ├── interpolation_cuda_kernel.h
│   │   │   │   ├── interpolation_cuda.cpp
│   │   │   │   └── interpolation_cuda_kernel.cu
│   │   │   ├── subtraction
│   │   │   │   ├── subtraction_cuda_kernel.h
│   │   │   │   ├── subtraction_cuda.cpp
│   │   │   │   └── subtraction_cuda_kernel.cu
│   │   │   ├── aggregation
│   │   │   │   ├── aggregation_cuda_kernel.h
│   │   │   │   ├── aggregation_cuda.cpp
│   │   │   │   └── aggregation_cuda_kernel.cu
│   │   │   ├── attention
│   │   │   │   ├── attention_cuda_kernel.h
│   │   │   │   ├── attention_cuda.cpp
│   │   │   │   └── attention_cuda_kernel.cu
│   │   │   ├── attention_v2
│   │   │   │   ├── attention_cuda_kernel_v2.h
│   │   │   │   └── attention_cuda_v2.cpp
│   │   │   ├── rpe
│   │   │   │   ├── relative_pos_encoding_cuda_kernel.h
│   │   │   │   ├── relative_pos_encoding_cuda.cpp
│   │   │   │   └── relative_pos_encoding_cuda_kernel.cu
│   │   │   ├── pointops_api.cpp
│   │   │   └── rpe_v2
│   │   │       └── relative_pos_encoding_cuda_kernel_v2.h
│   │   ├── functions
│   │   │   ├── __init__.py
│   │   │   ├── test_attention_op_step2.py
│   │   │   ├── test_relative_pos_encoding_op_step1.py
│   │   │   ├── test_relative_pos_encoding_op_step1_v2.py
│   │   │   ├── test_relative_pos_encoding_op_step2.py
│   │   │   ├── test_attention_op_step1.py
│   │   │   ├── test_relative_pos_encoding_op_step2_v2.py
│   │   │   ├── test_relative_pos_encoding_op_step1_v3.py
│   │   │   └── test_attention_op_step1_v2.py
│   │   └── setup.py
│   └── cpp_wrappers
│       ├── compile_wrappers.sh
│       ├── cpp_subsampling
│       │   ├── setup.py
│       │   └── grid_subsampling
│       │       ├── grid_subsampling.h
│       │       └── grid_subsampling.cpp
│       └── cpp_utils
│           └── cloud
│               ├── cloud.cpp
│               └── cloud.h
├── figs
│   └── fig.jpg
├── requirements.txt
├── LICENSE.md
├── .gitignore
├── config
│   ├── s3dis
│   │   ├── s3dis_swin3d_transformer.yaml
│   │   └── s3dis_stratified_transformer.yaml
│   └── scannetv2
│       ├── scannetv2_stratified_transformer.yaml
│       └── scannetv2_swin3d_transformer.yaml
└── README.md

/util/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops/src/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops2/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops/functions/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops/src/knnquery/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops2/functions/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops/src/knnquery_heap/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/figs/fig.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/Stratified-Transformer/HEAD/figs/fig.jpg
--------------------------------------------------------------------------------
/lib/cpp_wrappers/compile_wrappers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Compile cpp subsampling
4 | cd cpp_subsampling
5 | python3 setup.py build_ext --inplace
6 | cd ..
7 | 
8 | 
--------------------------------------------------------------------------------
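The script above builds the grid-subsampling extension in place. A quick import check, sketched under the assumption that it is run from the repository root; the module name grid_subsampling comes from m_name in cpp_wrappers/cpp_subsampling/setup.py further down, but the symbols it exports are defined in wrapper.cpp, which is not included in this dump:

    # Sketch: verify that the in-place build produced a loadable module.
    import sys
    sys.path.append("lib/cpp_wrappers/cpp_subsampling")

    import grid_subsampling  # module name taken from m_name in setup.py

    print(grid_subsampling.__file__)  # path of the compiled shared object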
/requirements.txt:
--------------------------------------------------------------------------------
1 | h5py==3.2.1
2 | matplotlib==3.4.2
3 | numpy==1.19.5
4 | Pillow==9.1.0
5 | PyYAML==6.0
6 | scipy==1.6.3
7 | setuptools==50.3.1.post20201107
8 | SharedArray==3.2.1
9 | tensorboardX==2.5
10 | termcolor==1.1.0
11 | timm==0.4.9
12 | torch==1.7.1
13 | torch_geometric==1.7.0
14 | torch_points3d==1.3.0
15 | torch_points_kernels==0.6.10
16 | torch_scatter==2.0.6
17 | torchvision==0.8.2
18 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/sampling/sampling_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _SAMPLING_CUDA_KERNEL
2 | #define _SAMPLING_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor);
8 | 
9 | #ifdef __cplusplus
10 | extern "C" {
11 | #endif
12 | 
13 | void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx);
14 | 
15 | #ifdef __cplusplus
16 | }
17 | #endif
18 | #endif
19 | 
--------------------------------------------------------------------------------
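The header above is the batched furthest point sampling entry point: points from all clouds are packed into one (n, 3) tensor and batches are delimited by cumulative counts in offset/new_offset. A sketch of driving it directly, assuming the extension is built as pointops2_cuda (the name in lib/pointops2/setup.py) and that pointops_api.cpp binds the function under the same name; the 1e10 fill for tmp follows the usual pointnet2-style FPS convention and is an assumption here:

    import torch
    import pointops2_cuda  # assumed binding; normally wrapped by lib/pointops2/functions

    xyz = torch.rand(8000, 3).cuda()                                   # two clouds packed together
    offset = torch.tensor([5000, 8000], dtype=torch.int32).cuda()      # cumulative input sizes
    new_offset = torch.tensor([1250, 2000], dtype=torch.int32).cuda()  # cumulative sample counts

    b, n, m = offset.shape[0], xyz.shape[0], int(new_offset[-1])
    idx = torch.zeros(m, dtype=torch.int32, device="cuda")             # sampled indices (output)
    tmp = torch.full((n,), 1e10, dtype=torch.float32, device="cuda")   # running min distances
    pointops2_cuda.furthestsampling_cuda(b, n, xyz, offset, new_offset, tmp, idx)
    sampled = xyz[idx.long()]                                          # (m, 3)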
/lib/pointops/src/knnquery/knnquery_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _KNNQUERY_CUDA_KERNEL
2 | #define _KNNQUERY_CUDA_KERNEL
3 | 
4 | #include <torch/serialize/tensor.h>
5 | #include <vector>
6 | #include <ATen/cuda/CUDAContext.h>
7 | 
8 | void knnquery_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void knnquery_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream);
15 | 
16 | #ifdef __cplusplus
17 | }
18 | #endif
19 | 
20 | #endif
--------------------------------------------------------------------------------
/lib/pointops/src/knnquery_heap/knnquery_heap_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _KNNQUERY_HEAP_CUDA_KERNEL
2 | #define _KNNQUERY_HEAP_CUDA_KERNEL
3 | 
4 | #include <torch/serialize/tensor.h>
5 | #include <vector>
6 | #include <ATen/cuda/CUDAContext.h>
7 | 
8 | void knnquery_heap_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void knnquery_heap_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream);
15 | 
16 | #ifdef __cplusplus
17 | }
18 | #endif
19 | 
20 | #endif
--------------------------------------------------------------------------------
/lib/pointops2/src/knnquery/knnquery_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _KNNQUERY_CUDA_KERNEL
2 | #define _KNNQUERY_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
8 | 
9 | #ifdef __cplusplus
10 | extern "C" {
11 | #endif
12 | 
13 | void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2);
14 | 
15 | #ifdef __cplusplus
16 | }
17 | #endif
18 | #endif
19 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/sampling/sampling_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "sampling_cuda_kernel.h"
6 | 
7 | 
8 | void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor)
9 | {
10 |     const float *xyz = xyz_tensor.data_ptr<float>();
11 |     const int *offset = offset_tensor.data_ptr<int>();
12 |     const int *new_offset = new_offset_tensor.data_ptr<int>();
13 |     float *tmp = tmp_tensor.data_ptr<float>();
14 |     int *idx = idx_tensor.data_ptr<int>();
15 |     furthestsampling_cuda_launcher(b, n, xyz, offset, new_offset, tmp, idx);
16 | }
17 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/cuda_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef _CUDA_UTILS_H
2 | #define _CUDA_UTILS_H
3 | 
4 | #include <cmath>
5 | #include <algorithm>
6 | 
7 | #define TOTAL_THREADS 1024
8 | #define THREADS_PER_BLOCK 256
9 | #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
10 | 
11 | inline int opt_n_threads(int work_size) {
12 |     const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
13 |     return std::max(std::min(1 << pow_2, TOTAL_THREADS), 1);
14 | }
15 | 
16 | inline dim3 opt_block_config(int x, int y) {
17 |     const int x_threads = opt_n_threads(x);
18 |     const int y_threads = std::max(std::min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
19 |     dim3 block_config(x_threads, y_threads, 1);
20 |     return block_config;
21 | }
22 | 
23 | #endif
24 | 
--------------------------------------------------------------------------------
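The helpers in cuda_utils.h above choose kernel launch sizes. The same arithmetic in Python, just to make the rounding behaviour concrete:

    import math

    TOTAL_THREADS = 1024

    def divup(m: int, n: int) -> int:
        # DIVUP: ceiling division, e.g. divup(1000, 256) == 4 blocks of 256 threads.
        return m // n + (1 if m % n else 0)

    def opt_n_threads(work_size: int) -> int:
        # Largest power of two <= work_size, clamped to [1, TOTAL_THREADS].
        pow_2 = int(math.log(work_size) / math.log(2.0))
        return max(min(1 << pow_2, TOTAL_THREADS), 1)

    assert divup(1000, 256) == 4
    assert opt_n_threads(300) == 256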
/lib/cpp_wrappers/cpp_subsampling/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup, Extension
2 | import numpy.distutils.misc_util
3 | 
4 | # Adding OpenCV to project
5 | # ************************
6 | 
7 | # Adding sources of the project
8 | # *****************************
9 | 
10 | m_name = "grid_subsampling"
11 | 
12 | SOURCES = ["../cpp_utils/cloud/cloud.cpp",
13 |            "grid_subsampling/grid_subsampling.cpp",
14 |            "wrapper.cpp"]
15 | 
16 | module = Extension(m_name,
17 |                    sources=SOURCES,
18 |                    extra_compile_args=['-std=c++11',
19 |                                        '-D_GLIBCXX_USE_CXX11_ABI=0'])
20 | 
21 | setup(ext_modules=[module], include_dirs=numpy.distutils.misc_util.get_numpy_include_dirs())
22 | 
23 | 
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
30 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/grouping/grouping_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _GROUPING_CUDA_KERNEL
2 | #define _GROUPING_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
8 | void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output);
15 | void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input);
16 | 
17 | #ifdef __cplusplus
18 | }
19 | #endif
20 | #endif
21 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/knnquery/knnquery_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "knnquery_cuda_kernel.h"
6 | 
7 | 
8 | void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
9 | {
10 |     const float *xyz = xyz_tensor.data_ptr<float>();
11 |     const float *new_xyz = new_xyz_tensor.data_ptr<float>();
12 |     const int *offset = offset_tensor.data_ptr<int>();
13 |     const int *new_offset = new_offset_tensor.data_ptr<int>();
14 |     int *idx = idx_tensor.data_ptr<int>();
15 |     float *dist2 = dist2_tensor.data_ptr<float>();
16 |     knnquery_cuda_launcher(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
17 | }
18 | 
--------------------------------------------------------------------------------
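A sketch of calling the offset-based kNN op above directly, under the same assumptions as the sampling example earlier (extension built as pointops2_cuda, binding name unchanged in pointops_api.cpp). Each of the m query points gets nsample neighbour indices and squared distances:

    import torch
    import pointops2_cuda  # assumed binding

    nsample = 16
    xyz = torch.rand(8000, 3).cuda()                              # support points, two clouds packed
    new_xyz = xyz                                                 # query the same set
    offset = torch.tensor([5000, 8000], dtype=torch.int32).cuda()
    new_offset = offset.clone()

    m = new_xyz.shape[0]
    idx = torch.zeros(m, nsample, dtype=torch.int32, device="cuda")
    dist2 = torch.zeros(m, nsample, dtype=torch.float32, device="cuda")
    pointops2_cuda.knnquery_cuda(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2)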
/lib/pointops/src/ballquery/ballquery_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _BALLQUERY_CUDA_KERNEL
2 | #define _BALLQUERY_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void ballquery_cuda(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
8 | 
9 | void ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
10 | 
11 | #ifdef __cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | void ballquery_cuda_launcher(int b, int n, int m, float radius, int nsample, const float *xyz, const float *new_xyz, int *idx);
16 | 
17 | void ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream);
18 | 
19 | #ifdef __cplusplus
20 | }
21 | #endif
22 | 
23 | #endif
24 | 
--------------------------------------------------------------------------------
/lib/pointops/src/grouping_int/grouping_int_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _GROUPING_INT_CUDA_KERNEL
2 | #define _GROUPING_INT_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void grouping_int_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out);
8 | 
9 | void grouping_int_forward_cuda_fast(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
10 | 
11 | #ifdef __cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | void grouping_int_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out);
16 | 
17 | void grouping_int_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const long int *points, const int *idx, long int *out);
18 | 
19 | #ifdef __cplusplus
20 | }
21 | #endif
22 | #endif
23 | 
--------------------------------------------------------------------------------
/lib/pointops/src/cuda_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef _CUDA_UTILS_H
2 | #define _CUDA_UTILS_H
3 | 
4 | #include <cmath>
5 | 
6 | #define TOTAL_THREADS 1024
7 | 
8 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
9 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
10 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
11 | 
12 | #define THREADS_PER_BLOCK 256
13 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
14 | 
15 | inline int opt_n_threads(int work_size) {
16 |     const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
17 |     return max(min(1 << pow_2, TOTAL_THREADS), 1);
18 | }
19 | 
20 | inline dim3 opt_block_config(int x, int y) {
21 |     const int x_threads = opt_n_threads(x);
22 |     const int y_threads = max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
23 |     dim3 block_config(x_threads, y_threads, 1);
24 |     return block_config;
25 | }
26 | 
27 | #endif
--------------------------------------------------------------------------------
/lib/pointops2/src/interpolation/interpolation_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _INTERPOLATION_CUDA_KERNEL
2 | #define _INTERPOLATION_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor);
8 | void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output);
15 | void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input);
16 | 
17 | #ifdef __cplusplus
18 | }
19 | #endif
20 | #endif
21 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/subtraction/subtraction_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _SUBTRACTION_CUDA_KERNEL
2 | #define _SUBTRACTION_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
8 | void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output);
15 | void subtraction_backward_cuda_launcher(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2);
16 | 
17 | #ifdef __cplusplus
18 | }
19 | #endif
20 | #endif
21 | 
--------------------------------------------------------------------------------
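The subtraction kernels themselves are not part of this dump, so the following reference is an assumption based on the point-transformer convention these ops follow: each query point subtracts the features of its nsample neighbours.

    import torch

    def subtraction_reference(input1, input2, idx):
        # Assumed semantics: input1 (n, c), input2 (n, c), idx (n, nsample)
        # -> output (n, nsample, c) with output[i, j] = input1[i] - input2[idx[i, j]].
        return input1.unsqueeze(1) - input2[idx.long()]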
/lib/pointops2/src/grouping/grouping_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "grouping_cuda_kernel.h"
6 | 
7 | 
8 | void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
9 | {
10 |     const float *input = input_tensor.data_ptr<float>();
11 |     const int *idx = idx_tensor.data_ptr<int>();
12 |     float *output = output_tensor.data_ptr<float>();
13 |     grouping_forward_cuda_launcher(m, nsample, c, input, idx, output);
14 | }
15 | 
16 | void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor)
17 | {
18 |     const float *grad_output = grad_output_tensor.data_ptr<float>();
19 |     const int *idx = idx_tensor.data_ptr<int>();
20 |     float *grad_input = grad_input_tensor.data_ptr<float>();
21 |     grouping_backward_cuda_launcher(m, nsample, c, grad_output, idx, grad_input);
22 | }
23 | 
--------------------------------------------------------------------------------
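grouping_forward_cuda is a plain gather; the shape comments in grouping_cuda_kernel.cu later in this dump (input (n, c), idx (m, nsample), output (m, nsample, c)) reduce it to one line of fancy indexing:

    import torch

    def grouping_reference(feats, idx):
        # feats: (n, c), idx: (m, nsample) -> (m, nsample, c)
        return feats[idx.long()]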
/lib/pointops/src/grouping_int/grouping_int_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | 
6 | #include "grouping_int_cuda_kernel.h"
7 | 
8 | extern THCState *state;
9 | 
10 | void grouping_int_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
11 | {
12 |     const long int *points = points_tensor.data<long int>();
13 |     const int *idx = idx_tensor.data<int>();
14 |     long int *out = out_tensor.data<long int>();
15 |     grouping_int_forward_cuda_launcher(b, c, n, m, nsample, points, idx, out);
16 | }
17 | 
18 | void grouping_int_forward_cuda_fast(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
19 | {
20 |     const long int *points = points_tensor.data<long int>();
21 |     const int *idx = idx_tensor.data<int>();
22 |     long int *out = out_tensor.data<long int>();
23 |     grouping_int_forward_cuda_launcher_fast(b, c, n, m, nsample, points, idx, out);
24 | }
--------------------------------------------------------------------------------
/lib/pointops/src/sampling/sampling_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _SAMPLING_CUDA_KERNEL
2 | #define _SAMPLING_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void gathering_forward_cuda(int b, int c, int n, int m, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
8 | void gathering_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
9 | void furthestsampling_cuda(int b, int n, int m, at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
10 | 
11 | #ifdef __cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | void gathering_forward_cuda_launcher(int b, int c, int n, int m, const float *points, const int *idx, float *out);
16 | void gathering_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, float *grad_points);
17 | void furthestsampling_cuda_launcher(int b, int n, int m, const float *dataset, float *temp, int *idxs);
18 | 
19 | #ifdef __cplusplus
20 | }
21 | #endif
22 | #endif
23 | 
--------------------------------------------------------------------------------
/lib/pointops/src/knnquery/knnquery_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | 
6 | #include "knnquery_cuda_kernel.h"
7 | 
8 | extern THCState *state;
9 | 
10 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
11 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
12 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
13 | 
14 | 
15 | void knnquery_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
16 | {
17 |     CHECK_INPUT(new_xyz_tensor);
18 |     CHECK_INPUT(xyz_tensor);
19 | 
20 |     const float *new_xyz = new_xyz_tensor.data<float>();
21 |     const float *xyz = xyz_tensor.data<float>();
22 |     int *idx = idx_tensor.data<int>();
23 |     float *dist2 = dist2_tensor.data<float>();
24 | 
25 |     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
26 | 
27 |     knnquery_cuda_launcher(b, n, m, nsample, xyz, new_xyz, idx, dist2, stream);
28 | }
29 | 
--------------------------------------------------------------------------------
/lib/pointops/src/knnquery_heap/knnquery_heap_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | 
6 | #include "knnquery_heap_cuda_kernel.h"
7 | 
8 | extern THCState *state;
9 | 
10 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
11 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
12 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
13 | 
14 | 
15 | void knnquery_heap_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
16 | {
17 |     CHECK_INPUT(new_xyz_tensor);
18 |     CHECK_INPUT(xyz_tensor);
19 | 
20 |     const float *new_xyz = new_xyz_tensor.data<float>();
21 |     const float *xyz = xyz_tensor.data<float>();
22 |     int *idx = idx_tensor.data<int>();
23 |     float *dist2 = dist2_tensor.data<float>();
24 | 
25 |     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
26 | 
27 |     knnquery_heap_cuda_launcher(b, n, m, nsample, xyz, new_xyz, idx, dist2, stream);
28 | }
29 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 DV Lab
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/lib/pointops/src/grouping/grouping_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _GROUPING_CUDA_KERNEL
2 | #define _GROUPING_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void grouping_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out);
8 | void grouping_backward_cuda(int b, int c, int n, int m, int nsample, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
9 | 
10 | void grouping_forward_cuda_fast(int b, int c, int n, int npoints, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
11 | 
12 | #ifdef __cplusplus
13 | extern "C" {
14 | #endif
15 | 
16 | void grouping_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out);
17 | void grouping_backward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points);
18 | 
19 | void grouping_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out);
20 | 
21 | #ifdef __cplusplus
22 | }
23 | #endif
24 | #endif
25 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/interpolation/interpolation_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "interpolation_cuda_kernel.h"
6 | 
7 | 
8 | void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor)
9 | {
10 |     const float *input = input_tensor.data_ptr<float>();
11 |     const int *idx = idx_tensor.data_ptr<int>();
12 |     const float *weight = weight_tensor.data_ptr<float>();
13 |     float *output = output_tensor.data_ptr<float>();
14 |     interpolation_forward_cuda_launcher(n, c, k, input, idx, weight, output);
15 | }
16 | 
17 | void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor)
18 | {
19 |     const float *grad_output = grad_output_tensor.data_ptr<float>();
20 |     const int *idx = idx_tensor.data_ptr<int>();
21 |     const float *weight = weight_tensor.data_ptr<float>();
22 |     float *grad_input = grad_input_tensor.data_ptr<float>();
23 |     interpolation_backward_cuda_launcher(n, c, k, grad_output, idx, weight, grad_input);
24 | }
25 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/aggregation/aggregation_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _AGGREGATION_CUDA_KERNEL
2 | #define _AGGREGATION_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
8 | void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output);
15 | void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight);
16 | 
17 | #ifdef __cplusplus
18 | }
19 | #endif
20 | #endif
21 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/subtraction/subtraction_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "subtraction_cuda_kernel.h"
6 | 
7 | 
8 | void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
9 | {
10 |     const float *input1 = input1_tensor.data_ptr<float>();
11 |     const float *input2 = input2_tensor.data_ptr<float>();
12 |     const int *idx = idx_tensor.data_ptr<int>();
13 |     float *output = output_tensor.data_ptr<float>();
14 |     subtraction_forward_cuda_launcher(n, nsample, c, input1, input2, idx, output);
15 | }
16 | 
17 | void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor)
18 | {
19 |     const int *idx = idx_tensor.data_ptr<int>();
20 |     const float *grad_output = grad_output_tensor.data_ptr<float>();
21 |     float *grad_input1 = grad_input1_tensor.data_ptr<float>();
22 |     float *grad_input2 = grad_input2_tensor.data_ptr<float>();
23 |     subtraction_backward_cuda_launcher(n, nsample, c, idx, grad_output, grad_input1, grad_input2);
24 | }
25 | 
--------------------------------------------------------------------------------
/lib/cpp_wrappers/cpp_utils/cloud/cloud.cpp:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //      0==========================0
4 | //      |    Local feature test    |
5 | //      0==========================0
6 | //
7 | //      version 1.0 :
8 | //          >
9 | //
10 | //---------------------------------------------------
11 | //
12 | //      Cloud source :
13 | //      Define useful Functions/Methods
14 | //
15 | //----------------------------------------------------
16 | //
17 | //      Hugues THOMAS - 10/02/2017
18 | //
19 | 
20 | 
21 | #include "cloud.h"
22 | 
23 | 
24 | // Getters
25 | // *******
26 | 
27 | PointXYZ max_point(std::vector<PointXYZ> points)
28 | {
29 |     // Initiate limits
30 |     PointXYZ maxP(points[0]);
31 | 
32 |     // Loop over all points
33 |     for (auto p : points)
34 |     {
35 |         if (p.x > maxP.x)
36 |             maxP.x = p.x;
37 | 
38 |         if (p.y > maxP.y)
39 |             maxP.y = p.y;
40 | 
41 |         if (p.z > maxP.z)
42 |             maxP.z = p.z;
43 |     }
44 | 
45 |     return maxP;
46 | }
47 | 
48 | PointXYZ min_point(std::vector<PointXYZ> points)
49 | {
50 |     // Initiate limits
51 |     PointXYZ minP(points[0]);
52 | 
53 |     // Loop over all points
54 |     for (auto p : points)
55 |     {
56 |         if (p.x < minP.x)
57 |             minP.x = p.x;
58 | 
59 |         if (p.y < minP.y)
60 |             minP.y = p.y;
61 | 
62 |         if (p.z < minP.z)
63 |             minP.z = p.z;
64 |     }
65 | 
66 |     return minP;
67 | }
--------------------------------------------------------------------------------
/lib/pointops/src/featuredistribute/featuredistribute_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _FEATUREDISTRIBUTE_CUDA_KERNEL
2 | #define _FEATUREDISTRIBUTE_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void featuredistribute_cuda(int b, int n, int m, at::Tensor max_xyz_tensor, at::Tensor xyz_tensor, at::Tensor distribute_idx_tensor);
8 | void featuregather_forward_cuda(int b, int n, int m, int c, at::Tensor max_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor distribute_feature_tensor);
9 | void featuregather_backward_cuda(int b, int n, int m, int c, at::Tensor grad_distribute_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor grad_max_feature_tensor);
10 | 
11 | #ifdef __cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | void featuredistribute_cuda_launcher(int b, int n, int m, const float *max_xyz, const float *xyz, int *distribute_idx, cudaStream_t stream);
16 | void featuregather_forward_cuda_launcher(int b, int n, int m, int c, const float *max_feature, const int *distribute_idx, float *distribute_feature, cudaStream_t stream);
17 | void featuregather_backward_cuda_launcher(int b, int n, int m, int c, const float *grad_distribute_feature, const int *distribute_idx, float *grad_max_feature, cudaStream_t stream);
18 | 
19 | #ifdef __cplusplus
20 | }
21 | #endif
22 | 
23 | #endif
24 | 
--------------------------------------------------------------------------------
/lib/pointops/src/sampling/sampling_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | #include "sampling_cuda_kernel.h"
6 | 
7 | extern THCState *state;
8 | 
9 | void gathering_forward_cuda(int b, int c, int n, int m, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
10 | {
11 |     const float *points = points_tensor.data<float>();
12 |     const int *idx = idx_tensor.data<int>();
13 |     float *out = out_tensor.data<float>();
14 |     gathering_forward_cuda_launcher(b, c, n, m, points, idx, out);
15 | }
16 | 
17 | void gathering_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor)
18 | {
19 | 
20 |     const float *grad_out = grad_out_tensor.data<float>();
21 |     const int *idx = idx_tensor.data<int>();
22 |     float *grad_points = grad_points_tensor.data<float>();
23 |     gathering_backward_cuda_launcher(b, c, n, m, grad_out, idx, grad_points);
24 | }
25 | 
26 | void furthestsampling_cuda(int b, int n, int m, at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor)
27 | {
28 |     const float *points = points_tensor.data<float>();
29 |     float *temp = temp_tensor.data<float>();
30 |     int *idx = idx_tensor.data<int>();
31 |     furthestsampling_cuda_launcher(b, n, m, points, temp, idx);
32 | }
33 | 
--------------------------------------------------------------------------------
/lib/pointops/src/ballquery/ballquery_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | 
6 | #include "ballquery_cuda_kernel.h"
7 | 
8 | extern THCState *state;
9 | 
10 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
11 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
12 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
13 | 
14 | void ballquery_cuda(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor)
15 | {
16 |     const float *new_xyz = new_xyz_tensor.data<float>();
17 |     const float *xyz = xyz_tensor.data<float>();
18 |     int *idx = idx_tensor.data<int>();
19 | 
20 |     ballquery_cuda_launcher(b, n, m, radius, nsample, new_xyz, xyz, idx);
21 | }
22 | 
23 | 
24 | void ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor)
25 | {
26 |     CHECK_INPUT(new_xyz_tensor);
27 |     CHECK_INPUT(xyz_tensor);
28 | 
29 |     const float *new_xyz = new_xyz_tensor.data<float>();
30 |     const float *xyz = xyz_tensor.data<float>();
31 |     int *idx = idx_tensor.data<int>();
32 | 
33 |     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
34 | 
35 |     ballquery_cuda_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx, stream);
36 | }
37 | 
--------------------------------------------------------------------------------
/lib/pointops/src/grouping/grouping_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | 
6 | #include "grouping_cuda_kernel.h"
7 | 
8 | extern THCState *state;
9 | 
10 | void grouping_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
11 | {
12 |     const float *points = points_tensor.data<float>();
13 |     const int *idx = idx_tensor.data<int>();
14 |     float *out = out_tensor.data<float>();
15 |     grouping_forward_cuda_launcher(b, c, n, m, nsample, points, idx, out);
16 | }
17 | 
18 | void grouping_backward_cuda(int b, int c, int n, int m, int nsample, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor)
19 | {
20 |     float *grad_points = grad_points_tensor.data<float>();
21 |     const int *idx = idx_tensor.data<int>();
22 |     const float *grad_out = grad_out_tensor.data<float>();
23 |     grouping_backward_cuda_launcher(b, c, n, m, nsample, grad_out, idx, grad_points);
24 | }
25 | 
26 | void grouping_forward_cuda_fast(int b, int c, int n, int npoints, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
27 | 
28 |     const float *points = points_tensor.data<float>();
29 |     const int *idx = idx_tensor.data<int>();
30 |     float *out = out_tensor.data<float>();
31 |     grouping_forward_cuda_launcher_fast(b, c, n, npoints, nsample, points, idx, out);
32 | }
--------------------------------------------------------------------------------
/lib/pointops/setup.py:
--------------------------------------------------------------------------------
1 | #python3 setup.py install
2 | from setuptools import setup
3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
4 | 
5 | setup(
6 |     name='pointops',
7 |     ext_modules=[
8 |         CUDAExtension('pointops_cuda', [
9 |             'src/pointops_api.cpp',
10 | 
11 |             'src/ballquery/ballquery_cuda.cpp',
12 |             'src/ballquery/ballquery_cuda_kernel.cu',
13 |             'src/knnquery/knnquery_cuda.cpp',
14 |             'src/knnquery/knnquery_cuda_kernel.cu',
15 |             'src/knnquery_heap/knnquery_heap_cuda.cpp',
16 |             'src/knnquery_heap/knnquery_heap_cuda_kernel.cu',
17 |             'src/grouping/grouping_cuda.cpp',
18 |             'src/grouping/grouping_cuda_kernel.cu',
19 |             'src/grouping_int/grouping_int_cuda.cpp',
20 |             'src/grouping_int/grouping_int_cuda_kernel.cu',
21 |             'src/interpolation/interpolation_cuda.cpp',
22 |             'src/interpolation/interpolation_cuda_kernel.cu',
23 |             'src/sampling/sampling_cuda.cpp',
24 |             'src/sampling/sampling_cuda_kernel.cu',
25 | 
26 |             'src/labelstat/labelstat_cuda.cpp',
27 |             'src/labelstat/labelstat_cuda_kernel.cu',
28 | 
29 |             'src/featuredistribute/featuredistribute_cuda.cpp',
30 |             'src/featuredistribute/featuredistribute_cuda_kernel.cu'
31 |         ],
32 |         extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']})
33 |     ],
34 |     cmdclass={'build_ext': BuildExtension})
35 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## General
2 | 
3 | 
4 | # Compiled Object files
5 | *.slo
6 | *.lo
7 | *.o
8 | *.cuo
9 | 
10 | # Compiled Dynamic libraries
11 | *.so
12 | *.dylib
13 | 
14 | # Compiled Static libraries
15 | *.lai
16 | *.la
17 | *.a
18 | 
19 | # Compiled protocol buffers
20 | *.pb.h
21 | *.pb.cc
22 | *_pb2.py
23 | 
24 | # Compiled python
25 | *.pyc
26 | 
27 | # Compiled MATLAB
28 | *.mex*
29 | 
30 | # IPython notebook checkpoints
31 | .ipynb_checkpoints
32 | 
33 | # Editor temporaries
34 | *.swp
35 | *~
36 | 
37 | # Sublime Text settings
38 | *.sublime-workspace
39 | *.sublime-project
40 | 
41 | # Eclipse Project settings
42 | *.*project
43 | .settings
44 | 
45 | # QtCreator files
46 | *.user
47 | 
48 | # PyCharm files
49 | .idea
50 | 
51 | # Visual Studio Code files
52 | .vscode
53 | 
54 | # OSX dir files
55 | .DS_Store
56 | 
57 | # personal
58 | __pycache__/
59 | exp/
60 | *.egg-info/
61 | build/
62 | dist/
63 | 
64 | *.tsv
65 | *.npy
66 | *.zip
67 | dataset/scannetv2/train
68 | dataset/scannetv2/val
69 | dataset/scannetv2/test
70 | dataset/scannetv2/val_gt
71 | dataset/scannet_tf/training*
72 | dataset/scannet_tf/val*
73 | temp
74 | 
75 | dataset/s3dis/s3dis
76 | dataset/s3dis/Stanford3dDataset_v1.2_Aligned_Version
77 | 
78 | dataset/sem3d/test
79 | dataset/sem3d/train
80 | dataset/sem3d/train_subsampling
81 | 
82 | exp/
83 | 
84 | dataset/scannet_v2/train*
85 | dataset/scannet_v2/test*
86 | dataset/scannet_v2/val*
87 | 
88 | runs/*/events*
89 | runs/
90 | torch_points3d/
91 | 
92 | output/*
93 | *.pth
94 | */__pycache__
95 | data/
96 | 
--------------------------------------------------------------------------------
/lib/pointops/src/labelstat/labelstat_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _LABELSTAT_CUDA_KERNEL
2 | #define _LABELSTAT_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void labelstat_and_ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, int nclass,
8 |     at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor idx_tensor, at::Tensor new_label_stat_tensor);
9 | 
10 | void labelstat_ballrange_cuda_fast(int b, int n, int m, float radius, int nclass,
11 |     at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor new_label_stat_tensor);
12 | 
13 | void labelstat_idx_cuda_fast(int b, int n, int m, int nsample, int nclass,
14 |     at::Tensor label_stat_tensor, at::Tensor idx_tensor, at::Tensor new_label_stat_tensor);
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | void labelstat_and_ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, int nclass, \
21 |     const float *new_xyz, const float *xyz, const int *label_stat, int *idx, int *new_label_stat, cudaStream_t stream);
22 | 
23 | void labelstat_ballrange_cuda_launcher_fast(int b, int n, int m, float radius, int nclass, \
24 |     const float *new_xyz, const float *xyz, const int *label_stat, int *new_label_stat, cudaStream_t stream);
25 | 
26 | void labelstat_idx_cuda_launcher_fast(int b, int n, int m, int nsample, int nclass, \
27 |     const int *label_stat, const int *idx, int *new_label_stat, cudaStream_t stream);
28 | 
29 | #ifdef __cplusplus
30 | }
31 | #endif
32 | 
33 | #endif
34 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/aggregation/aggregation_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "aggregation_cuda_kernel.h"
6 | 
7 | 
8 | void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
9 | {
10 |     const float *input = input_tensor.data_ptr<float>();
11 |     const float *position = position_tensor.data_ptr<float>();
12 |     const float *weight = weight_tensor.data_ptr<float>();
13 |     const int *idx = idx_tensor.data_ptr<int>();
14 |     float *output = output_tensor.data_ptr<float>();
15 |     aggregation_forward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, output);
16 | }
17 | 
18 | void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor)
19 | {
20 |     const float *input = input_tensor.data_ptr<float>();
21 |     const float *position = position_tensor.data_ptr<float>();
22 |     const float *weight = weight_tensor.data_ptr<float>();
23 |     const int *idx = idx_tensor.data_ptr<int>();
24 |     const float *grad_output = grad_output_tensor.data_ptr<float>();
25 |     float *grad_input = grad_input_tensor.data_ptr<float>();
26 |     float *grad_position = grad_position_tensor.data_ptr<float>();
27 |     float *grad_weight = grad_weight_tensor.data_ptr<float>();
28 |     aggregation_backward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight);
29 | }
30 | 
--------------------------------------------------------------------------------
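The aggregation kernels are not included in this dump, so this reference is an assumption following the point-transformer convention: the c feature channels are split into w_c groups that each share one weight, and the nsample neighbours are summed.

    import torch

    def aggregation_reference(feats, position, weight, idx):
        # Assumed shapes: feats (n, c), position (n, nsample, c),
        # weight (n, nsample, w_c), idx (n, nsample) -> output (n, c).
        n, nsample, c = position.shape
        w_c = weight.shape[-1]
        w = weight.repeat_interleave(c // w_c, dim=-1)      # expand weights to all c channels
        return ((feats[idx.long()] + position) * w).sum(1)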
/lib/pointops2/setup.py:
--------------------------------------------------------------------------------
1 | #python3 setup.py install
2 | from setuptools import setup
3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
4 | import os
5 | from distutils.sysconfig import get_config_vars
6 | 
7 | (opt,) = get_config_vars('OPT')
8 | os.environ['OPT'] = " ".join(
9 |     flag for flag in opt.split() if flag != '-Wstrict-prototypes'
10 | )
11 | 
12 | setup(
13 |     name='pointops2',
14 |     ext_modules=[
15 |         CUDAExtension('pointops2_cuda', [
16 |             'src/pointops_api.cpp',
17 |             'src/knnquery/knnquery_cuda.cpp',
18 |             'src/knnquery/knnquery_cuda_kernel.cu',
19 |             'src/sampling/sampling_cuda.cpp',
20 |             'src/sampling/sampling_cuda_kernel.cu',
21 |             'src/grouping/grouping_cuda.cpp',
22 |             'src/grouping/grouping_cuda_kernel.cu',
23 |             'src/interpolation/interpolation_cuda.cpp',
24 |             'src/interpolation/interpolation_cuda_kernel.cu',
25 |             'src/subtraction/subtraction_cuda.cpp',
26 |             'src/subtraction/subtraction_cuda_kernel.cu',
27 |             'src/aggregation/aggregation_cuda.cpp',
28 |             'src/aggregation/aggregation_cuda_kernel.cu',
29 |             'src/attention/attention_cuda.cpp',
30 |             'src/attention/attention_cuda_kernel.cu',
31 |             'src/rpe/relative_pos_encoding_cuda.cpp',
32 |             'src/rpe/relative_pos_encoding_cuda_kernel.cu',
33 |             'src/attention_v2/attention_cuda_v2.cpp',
34 |             'src/attention_v2/attention_cuda_kernel_v2.cu',
35 |             'src/rpe_v2/relative_pos_encoding_cuda_v2.cpp',
36 |             'src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.cu',
37 |         ],
38 |         extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']}
39 |         )
40 |     ],
41 |     cmdclass={'build_ext': BuildExtension}
42 | )
43 | 
--------------------------------------------------------------------------------
/lib/pointops/src/interpolation/interpolation_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _INTERPOLATION_CUDA_KERNEL
2 | #define _INTERPOLATION_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void nearestneighbor_cuda(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
8 | void interpolation_forward_cuda(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
9 | void interpolation_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
10 | 
11 | void nearestneighbor_cuda_fast(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
12 | void interpolation_forward_cuda_fast(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
13 | 
14 | #ifdef __cplusplus
15 | extern "C" {
16 | #endif
17 | 
18 | void nearestneighbor_cuda_launcher(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx);
19 | void interpolation_forward_cuda_launcher(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out);
20 | void interpolation_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points);
21 | 
22 | void nearestneighbor_cuda_launcher_fast(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx);
23 | void interpolation_forward_cuda_launcher_fast(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out);
24 | 
25 | #ifdef __cplusplus
26 | }
27 | #endif
28 | #endif
29 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/attention/attention_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _ATTENTION_CUDA_KERNEL
2 | #define _ATTENTION_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void attention_step1_forward_cuda(int N, int M, int h, int C, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor);
8 | void attention_step1_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor);
9 | 
10 | void attention_step2_forward_cuda(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor);
11 | void attention_step2_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor);
12 | 
13 | #ifdef __cplusplus
14 | extern "C" {
15 | #endif
16 | 
17 | void attention_step1_forward_cuda_launcher(int N, int M, int h, int C, const float *q, const float *k, const int *index0, const int *index1, float *attn);
18 | void attention_step1_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *q, const float *k, float *grad_q, float *grad_k);
19 | 
20 | void attention_step2_forward_cuda_launcher(int N, int M, int h, int C, const float *attn, const float *v, const int *index0, const int *index1, float *output);
21 | void attention_step2_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, float *grad_attn, float *grad_v);
22 | 
23 | #ifdef __cplusplus
24 | }
25 | #endif
26 | #endif
27 | 
--------------------------------------------------------------------------------
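The two-step attention above splits sparse attention into score computation (step1) and value aggregation (step2) over M (query, key) pairs given by index0/index1. Step2's reference appears, commented out, in the test that follows; step1 is sketched here by analogy and should be read as an assumption:

    import torch
    from torch_scatter import scatter_sum

    def attention_step1_reference(q, k, index0, index1):
        # q, k: (N, h, hdim) -> attn: (M, h), a per-pair, per-head dot product.
        return (q[index0.long()] * k[index1.long()]).sum(-1)

    def attention_step2_reference(attn, v, index0, index1, N):
        # attn: (M, h), v: (N, h, hdim) -> output: (N, h * hdim);
        # matches the commented-out reference in test_attention_op_step2.py below.
        M = attn.shape[0]
        x = (attn.unsqueeze(-1) * v[index1.long()]).reshape(M, -1)
        return scatter_sum(src=x, index=index0.long(), dim=0, dim_size=N)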
/lib/pointops2/functions/test_attention_op_step2.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pointops
3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum
4 | 
5 | torch.manual_seed(1)
6 | 
7 | M = 800000
8 | N = 35000
9 | C = 96
10 | h = 6
11 | softmax_attn_flat = torch.rand(M, h).cuda()
12 | value = torch.rand(N, h, C//h).cuda()
13 | 
14 | index_0 = torch.rand(M)
15 | index_0[index_0 < 0] = 0
16 | index_0 = (index_0*N).long().cuda()
17 | 
18 | index_1 = torch.rand(M)
19 | index_1[index_1 < 0] = 0
20 | index_1 = (index_1*N).long().cuda()
21 | 
22 | softmax_attn_flat.requires_grad = True
23 | value.requires_grad = True
24 | 
25 | # Reference implementation, kept active so the comparison below has an `x`.
26 | value_flat = value[index_1] #[M, num_heads, C // num_heads]
27 | x = (softmax_attn_flat.unsqueeze(-1) * value_flat).reshape(M, C)
28 | x = scatter_sum(src=x, index=index_0, dim=0, dim_size=N) #[N, C]
29 | loss = x.sum()
30 | loss.backward()
31 | 
32 | print("x.shape: {}, x[:5,:10]: {}".format(x.shape, x[:5,:10]))
33 | print("softmax_attn_flat.grad[:5, :10]: ", softmax_attn_flat.grad[:5, :10])
34 | print("value.grad[:5, :3, :5]: ", value.grad[:5, :3, :5])
35 | 
36 | # Reset the gradients so the custom op's backward is checked on its own.
37 | softmax_attn_flat.grad = None
38 | value.grad = None
39 | 
40 | print("softmax_attn_flat.is_contiguous(): ", softmax_attn_flat.is_contiguous())
41 | print("value.is_contiguous(): ", value.is_contiguous())
42 | print("index_0.is_contiguous(): ", index_0.is_contiguous())
43 | print("index_1.is_contiguous(): ", index_1.is_contiguous())
44 | 
45 | x_v2 = pointops.attention_step2(softmax_attn_flat.float(), value.float(), index_0.int(), index_1.int())
46 | x_v2 = x_v2.view(N, C)
47 | loss = x_v2.sum()
48 | loss.backward()
49 | 
50 | print("x_v2.shape: {}, x_v2[:5,:10]: {}".format(x_v2.shape, x_v2[:5,:10]))
51 | 
52 | print("softmax_attn_flat.grad[:5, :10]: ", softmax_attn_flat.grad[:5, :10])
53 | print("value.grad[:5, :3, :5]: ", value.grad[:5, :3, :5])
54 | 
55 | print("((x-x_v2)**2 < 1e-8).all(): ", ((x-x_v2)**2 < 1e-8).all())
56 | 
57 | print("torch.max((x-x_v2)**2): ", torch.max((x-x_v2)**2))
58 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/attention_v2/attention_cuda_kernel_v2.h:
--------------------------------------------------------------------------------
1 | #ifndef _ATTENTION_V2_CUDA_KERNEL
2 | #define _ATTENTION_V2_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void attention_step1_forward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor attn_tensor);
8 | void attention_step1_backward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor grad_out_tensor, at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor);
9 | 
10 | void attention_step2_forward_cuda_v2(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor);
11 | void attention_step2_backward_cuda_v2(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor);
12 | 
13 | #ifdef __cplusplus
14 | extern "C" {
15 | #endif
16 | 
17 | void attention_step1_forward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, const float *q, const float *k, const int *index0_offsets, const int *index1, float *attn);
18 | void attention_step1_backward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, const float *grad_out, const int *index0_offsets, const int *index1, const float *q, const float *k, float *grad_q, float *grad_k);
19 | 
20 | void attention_step2_forward_cuda_launcher_v2(int N, int M, int h, int C, const float *attn, const float *v, const int *index0, const int *index1, float *output);
21 | void attention_step2_backward_cuda_launcher_v2(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, float *grad_attn, float *grad_v);
22 | 
23 | #ifdef __cplusplus
24 | }
25 | #endif
26 | #endif
27 | 
--------------------------------------------------------------------------------
/lib/pointops2/functions/test_relative_pos_encoding_op_step1.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pointops
3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum
4 | 
5 | torch.manual_seed(1)
6 | 
7 | M = 80000
8 | N = 3500
9 | hdim = 16
10 | h = 6
11 | L = 31
12 | query = torch.rand(N, h, hdim).cuda()
13 | table = torch.rand(L, h, hdim, 3).cuda()
14 | 
15 | index = torch.rand(M)
16 | index[index < 0] = 0
17 | index = (index*N).long().cuda()
18 | 
19 | rel_index = torch.rand(M, 3)
20 | rel_index[rel_index < 0] = 0
21 | rel_index = (rel_index*L).long().cuda()
22 | 
23 | query.requires_grad = True
24 | table.requires_grad = True
25 | 
26 | # query_flat = query[index] #[M, h, hdim]
27 | # table_x, table_y, table_z = table[:,:,:,0], table[:,:,:,1], table[:,:,:,2] #[L, h, hdim]
28 | # rel_index_x, rel_index_y, rel_index_z = rel_index[:,0], rel_index[:,1], rel_index[:,2] #[M]
29 | # rel_pos_encoding = table_x[rel_index_x] + table_y[rel_index_y] + table_z[rel_index_z] #[M, h, hdim]
30 | # output = (query_flat * rel_pos_encoding).sum(-1) #[M, h]
31 | # loss = output.mean()
32 | # loss.backward()
33 | 
34 | # print("output.shape: {}, output[:5,:10]: {}".format(output.shape, output[:5,:10]))
35 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5])
36 | # print("table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2])
37 | # input()
38 | 
39 | # print("query.is_contiguous(): ", query.is_contiguous())
40 | # print("key.is_contiguous(): ", key.is_contiguous())
41 | # print("index_0.is_contiguous(): ", index_0.is_contiguous())
42 | # print("index_1.is_contiguous(): ", index_1.is_contiguous())
43 | 
44 | output_v2 = pointops.dot_prod_with_idx(query, index.int(), table, rel_index.int())
45 | loss = output_v2.mean()
46 | loss.backward()
47 | 
48 | print("output_v2.shape: {}, output_v2[:5,:10]: {}".format(output_v2.shape, output_v2[:5,:10]))
49 | print("v2: query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5])
50 | print("v2: table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2])
51 | input()
52 | 
53 | # print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max())
54 | 
55 | # print("torch.max((attn_flat-attn_flat_v2)**2): ", torch.max((attn_flat-attn_flat_v2)**2))
56 | 
57 | 
--------------------------------------------------------------------------------
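For readers who want the commented-out reference from the test above as a callable, here it is, verbatim apart from packaging into a function:

    import torch

    def dot_prod_with_idx_reference(query, index, table, rel_index):
        # query: (N, h, hdim), index: (M,), table: (L, h, hdim, 3), rel_index: (M, 3).
        query_flat = query[index.long()]                        # (M, h, hdim)
        table_x, table_y, table_z = table[..., 0], table[..., 1], table[..., 2]
        rel = (table_x[rel_index[:, 0].long()] +
               table_y[rel_index[:, 1].long()] +
               table_z[rel_index[:, 2].long()])                 # (M, h, hdim)
        return (query_flat * rel).sum(-1)                       # (M, h)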
/lib/pointops2/src/grouping/grouping_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "grouping_cuda_kernel.h" 3 | 4 | 5 | __global__ void grouping_forward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ input, const int *__restrict__ idx, float *__restrict__ output) { 6 | // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c) 7 | int index = blockIdx.x * blockDim.x + threadIdx.x; 8 | if (index >= m * nsample * c) return; 9 | const int c_idx = index % c; 10 | const int nsample_idx = (index / c) % nsample; 11 | const int m_idx = index / nsample / c; 12 | const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx; 13 | output[index] = input[input_idx]; 14 | } 15 | 16 | __global__ void grouping_backward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ grad_output, const int *__restrict__ idx, float *__restrict__ grad_input) { 17 | // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c) 18 | int index = blockIdx.x * blockDim.x + threadIdx.x; 19 | if (index >= m * nsample * c) return; 20 | const int c_idx = index % c; 21 | const int nsample_idx = (index / c) % nsample; 22 | const int m_idx = index / nsample / c; 23 | const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx; 24 | atomicAdd(grad_input + input_idx, grad_output[index]); 25 | } 26 | 27 | void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output) { 28 | // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c) 29 | dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK)); 30 | dim3 threads(THREADS_PER_BLOCK); 31 | grouping_forward_cuda_kernel<<<blocks, threads, 0>>>(m, nsample, c, input, idx, output); 32 | } 33 | 34 | void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input) 35 | { 36 | // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c) 37 | dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK)); 38 | dim3 threads(THREADS_PER_BLOCK); 39 | grouping_backward_cuda_kernel<<<blocks, threads, 0>>>(m, nsample, c, grad_output, idx, grad_input); 40 | } 41 | -------------------------------------------------------------------------------- /lib/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _RPE_CUDA_KERNEL 2 | #define _RPE_CUDA_KERNEL 3 | #include <vector> 4 | #include <torch/serialize/tensor.h> 5 | #include <ATen/cuda/CUDAContext.h> 6 | 7 | void dot_prod_with_idx_forward_cuda(int N, int M, int h, int hdim, at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); 8 | void dot_prod_with_idx_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_q_tensor, at::Tensor grad_table_tensor); 9 | 10 | void attention_step2_with_rel_pos_value_forward_cuda(int N, int M, int h, int hdim, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); 11 | void attention_step2_with_rel_pos_value_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor); 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | void dot_prod_with_idx_forward_cuda_launcher(int N, int M, int h, int hdim, const float *q, const int *index, const float *table, const int *rel_idx, float *output); 18 | void dot_prod_with_idx_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const float *q, const int *index, const float *table, const int *rel_idx, float *grad_q, float *grad_table); 19 | 20 | void attention_step2_with_rel_pos_value_forward_cuda_launcher(int N, int M, int h, int hdim, const float *attn, const float *v, const int *index0, const int *index1, const float *table, const int *rel_idx, float *output); 21 | void attention_step2_with_rel_pos_value_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table); 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | #endif 27 | -------------------------------------------------------------------------------- /lib/pointops2/src/interpolation/interpolation_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "interpolation_cuda_kernel.h" 3 | 4 | 5 | __global__ void interpolation_forward_cuda_kernel(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output) 6 | { 7 | // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c) 8 | int index = blockIdx.x * blockDim.x + threadIdx.x; 9 | if (index >= n * c) return; 10 | int c_idx = index % c; 11 | int n_idx = index / c; 12 | for (int i = 0; i < k; i++) 13 | { 14 | int idx_idx = n_idx * k + i; 15 | int input_idx = idx[idx_idx] * c + c_idx; 16 | output[index] += input[input_idx] * weight[idx_idx]; 17 | } 18 | } 19 | 20 | __global__ void interpolation_backward_cuda_kernel(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input) 21 | { 22 | // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c) 23 | int index = blockIdx.x * blockDim.x + threadIdx.x; 24 | if (index >= n * c) return; 25 | int c_idx = index % c; 26 | int n_idx = index / c; 27 | for (int i = 0; i < k; i++) 28 | { 29 | int idx_idx = n_idx * k + i; 30 | int input_idx = idx[idx_idx] * c + c_idx; 31 | atomicAdd(grad_input + input_idx, grad_output[index] * weight[idx_idx]); 32 | } 33 | } 34 | 35 | void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output) { 36 | // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c) 37 | dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); 38 | dim3 threads(THREADS_PER_BLOCK); 39 | interpolation_forward_cuda_kernel<<<blocks, threads, 0>>>(n, c, k, input, idx, weight, output); 40 | } 41 | 42 | void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input) { 43 | // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c) 44 | dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); 45 | dim3 threads(THREADS_PER_BLOCK); 46 | interpolation_backward_cuda_kernel<<<blocks, threads, 0>>>(n, c, k, grad_output, idx, weight, grad_input); 47 | } 48 | 
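A note on the interpolation kernels above: the forward pass is a weighted k-NN gather, output[i] = sum_j weight[i, j] * input[idx[i, j]], and the backward pass scatters grad_output back into grad_input with atomicAdd. A minimal pure-PyTorch sketch of the same computation for reference (interpolation_forward_reference is a name introduced here, not a library function):

import torch

def interpolation_forward_reference(feats, idx, weight):
    # feats: (m, c) source features; idx: (n, k) neighbor indices into feats;
    # weight: (n, k) interpolation weights; returns (n, c).
    gathered = feats[idx.long()]                         # (n, k, c)
    return (weight.unsqueeze(-1) * gathered).sum(dim=1)  # (n, c)

if __name__ == "__main__":
    m, n, k, c = 50, 20, 3, 8
    out = interpolation_forward_reference(torch.rand(m, c),
                                          torch.randint(0, m, (n, k)),
                                          torch.rand(n, k))
    assert out.shape == (n, c)

Autograd through this reference yields the same gradient as the atomicAdd kernel: grad w.r.t. feats is a scatter-add of weight-scaled grad_output over idx.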
-------------------------------------------------------------------------------- /util/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import numpy as np 4 | 5 | from torch.utils.data import Dataset 6 | 7 | 8 | def make_dataset(split='train', data_root=None, data_list=None): 9 | if not os.path.isfile(data_list): 10 | raise (RuntimeError("Point list file does not exist: " + data_list + "\n")) 11 | point_list = [] 12 | list_read = open(data_list).readlines() 13 | print("Found {} samples in the {} set.".format(len(list_read), split)) 14 | for line in list_read: 15 | point_list.append(os.path.join(data_root, line.strip())) 16 | return point_list 17 | 18 | 19 | class PointData(Dataset): 20 | def __init__(self, split='train', data_root=None, data_list=None, transform=None, num_point=None, random_index=False): 21 | assert split in ['train', 'val', 'test'] 22 | self.split = split 23 | self.data_list = make_dataset(split, data_root, data_list) 24 | self.transform = transform 25 | self.num_point = num_point 26 | self.random_index = random_index 27 | 28 | def __len__(self): 29 | return len(self.data_list) 30 | 31 | def __getitem__(self, index): 32 | data_path = self.data_list[index] 33 | f = h5py.File(data_path, 'r') 34 | data = f['data'][:] 35 | if self.split == 'test': 36 | label = np.array([255]) # placeholder: the test split ships without labels 37 | else: 38 | label = f['label'][:] 39 | f.close() 40 | if self.num_point is None: 41 | self.num_point = data.shape[0] 42 | idxs = np.arange(data.shape[0]) 43 | if self.random_index: 44 | np.random.shuffle(idxs) 45 | idxs = idxs[0:self.num_point] 46 | data = data[idxs, :] 47 | if label.size != 1: # seg data 48 | label = label[idxs] 49 | if self.transform is not None: 50 | data, label = self.transform(data, label) 51 | return data, label 52 | 53 | 54 | if __name__ == '__main__': 55 | data_root = 'dataset/modelnet40' 56 | data_list = 'dataset/modelnet40/list/val.txt' 57 | point_data = PointData('train', data_root, data_list) 58 | print('point data size:', point_data.__len__()) 59 | print('point data 0 shape:', point_data.__getitem__(0)[0].shape) 60 | print('point label 0 shape:', point_data.__getitem__(0)[1].shape) 61 | -------------------------------------------------------------------------------- /lib/cpp_wrappers/cpp_subsampling/grid_subsampling/grid_subsampling.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include "../../cpp_utils/cloud/cloud.h" 4 | 5 | #include <set> 6 | #include <cstdint> 7 | 8 | using namespace std; 9 | 10 | class SampledData 11 | { 12 | public: 13 | 14 | // Elements 15 | // ******** 16 | 17 | int count; 18 | PointXYZ point; 19 | vector<float> features; 20 | vector<unordered_map<int, int>> labels; 21 | 22 | 23 | // Methods 24 | // ******* 25 | 26 | // Constructor 27 | SampledData() 28 | { 29 | count = 0; 30 | point = PointXYZ(); 31 | } 32 | 33 | SampledData(const size_t fdim, const size_t ldim) 34 | { 35 | count = 0; 36 | point = PointXYZ(); 37 | features = vector<float>(fdim); 38 | labels = vector<unordered_map<int, int>>(ldim); 39 | } 40 | 41 | // Method Update 42 | void update_all(const PointXYZ p, vector<float>::iterator f_begin, vector<int>::iterator l_begin) 43 | { 44 | count += 1; 45 | point += p; 46 | transform (features.begin(), features.end(), f_begin, features.begin(), plus<float>()); 47 | int i = 0; 48 | for(vector<int>::iterator it = l_begin; it != l_begin + labels.size(); ++it) 49 | { 50 | labels[i][*it] += 1; 51 | i++; 52 | } 53 | return; 54 | } 55 | void update_features(const PointXYZ p, vector<float>::iterator f_begin) 56 | { 57 | count += 1; 58 | point += p; 59 | transform (features.begin(), features.end(), f_begin, features.begin(), plus<float>()); // accumulate running sums; the caller divides by count to average 60 | return; 61 | } 62 | void update_classes(const PointXYZ p, vector<int>::iterator l_begin) 63 | { 64 | count += 1; 65 | point += p; 66 | int i = 0; 67 | for(vector<int>::iterator it = l_begin; it != l_begin + labels.size(); ++it) 68 | { 69 | labels[i][*it] += 1; 70 | i++; 71 | } 72 | return; 73 | } 74 | void update_points(const PointXYZ p) 75 | { 76 | count += 1; 77 | point += p; 78 | return; 79 | } 80 | }; 81 | 82 | 83 | 84 | void grid_subsampling(vector<PointXYZ>& original_points, 85 | vector<PointXYZ>& subsampled_points, 86 | vector<float>& original_features, 87 | vector<float>& subsampled_features, 88 | vector<int>& original_classes, 89 | vector<int>& subsampled_classes, 90 | float sampleDl, 91 | int verbose); 92 | 93 | -------------------------------------------------------------------------------- /lib/pointops/src/pointops_api.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/serialize/tensor.h> 2 | #include <torch/extension.h> 3 | 4 | #include "ballquery/ballquery_cuda_kernel.h" 5 | #include "grouping/grouping_cuda_kernel.h" 6 | #include "grouping_int/grouping_int_cuda_kernel.h" 7 | #include "sampling/sampling_cuda_kernel.h" 8 | #include "interpolation/interpolation_cuda_kernel.h" 9 | #include "knnquery/knnquery_cuda_kernel.h" 10 | #include "knnquery_heap/knnquery_heap_cuda_kernel.h" 11 | 12 | #include "labelstat/labelstat_cuda_kernel.h" 13 | #include "featuredistribute/featuredistribute_cuda_kernel.h" 14 | 15 | 16 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 17 | m.def("ballquery_cuda", &ballquery_cuda_fast, "ballquery_cuda_fast"); // name in python, cpp function address, docs 18 | 19 | m.def("knnquery_cuda", &knnquery_cuda, "knnquery_cuda"); 20 | m.def("knnquery_heap_cuda", &knnquery_heap_cuda, "knnquery_heap_cuda"); 21 | 22 | m.def("grouping_forward_cuda", &grouping_forward_cuda_fast, "grouping_forward_cuda_fast"); 23 | m.def("grouping_backward_cuda", &grouping_backward_cuda, "grouping_backward_cuda"); 24 | 25 | m.def("grouping_int_forward_cuda", &grouping_int_forward_cuda_fast, "grouping_int_forward_cuda_fast"); 26 | 27 | m.def("gathering_forward_cuda", &gathering_forward_cuda, "gathering_forward_cuda"); 28 | m.def("gathering_backward_cuda", &gathering_backward_cuda, "gathering_backward_cuda"); 29 | m.def("furthestsampling_cuda", &furthestsampling_cuda, "furthestsampling_cuda"); 30 | 31 | m.def("nearestneighbor_cuda", &nearestneighbor_cuda_fast, "nearestneighbor_cuda_fast"); 32 | m.def("interpolation_forward_cuda", &interpolation_forward_cuda_fast, "interpolation_forward_cuda_fast"); 33 | m.def("interpolation_backward_cuda", &interpolation_backward_cuda, "interpolation_backward_cuda"); 34 | 35 | m.def("labelstat_idx_cuda", &labelstat_idx_cuda_fast, "labelstat_idx_cuda_fast"); 36 | m.def("labelstat_ballrange_cuda", &labelstat_ballrange_cuda_fast, "labelstat_ballrange_cuda_fast"); 37 | m.def("labelstat_and_ballquery_cuda", &labelstat_and_ballquery_cuda_fast, "labelstat_and_ballquery_cuda_fast"); 38 | 39 | m.def("featuredistribute_cuda", &featuredistribute_cuda, "featuredistribute_cuda"); 40 | m.def("featuregather_forward_cuda", &featuregather_forward_cuda, "featuregather_forward_cuda"); 41 | m.def("featuregather_backward_cuda", &featuregather_backward_cuda, "featuregather_backward_cuda"); 42 | } 43 | -------------------------------------------------------------------------------- /lib/pointops2/functions/test_relative_pos_encoding_op_step1_v2.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 80000 8 | N = 3500 9 | hdim = 16 10 | h = 6 11 | L = 31 12 | query = torch.rand(N, h, hdim).cuda() 13 | table_q = torch.rand(L, h, hdim, 3).cuda() 14 | key = torch.rand(N, h, hdim).cuda() 15 | table_k = torch.rand(L, h, hdim, 3).cuda() 16 | 17 | index_q = torch.rand(M) 18 | index_q[index_q < 0] = 0 19 | index_q = (index_q*N).long().cuda() 20 | 21 | index_k = torch.rand(M) 22 | index_k[index_k < 0] = 0 23 | index_k = (index_k*N).long().cuda() 24 | 25 | rel_index = torch.rand(M, 3) 26 | rel_index[rel_index < 0] = 0 27 | rel_index = (rel_index*L).long().cuda() 28 | 29 | query.requires_grad = True 30 | table_q.requires_grad = True 31 | key.requires_grad = True 32 | table_k.requires_grad = True 33 | 34 | output1 = pointops.dot_prod_with_idx(query, index_q.int(), table_q, rel_index.int()) 35 | output2 = pointops.dot_prod_with_idx(key, index_k.int(), table_k, rel_index.int()) 36 | output = output1 + output2 37 | # loss = output.mean() 38 | # loss.backward() 39 | 40 | # print("output.shape: {}, output[:5,:10]: {}".format(output.shape, output[:5,:10])) 41 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 42 | # print("table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) 43 | # print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 44 | # print("table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) 45 | # input() 46 | 47 | # print("query.is_contiguous(): ", query.is_contiguous()) 48 | # print("key.is_contiguous(): ", key.is_contiguous()) 49 | # print("index_0.is_contiguous(): ", index_0.is_contiguous()) 50 | # print("index_1.is_contiguous(): ", index_1.is_contiguous()) 51 | 52 | output_v2 = pointops.dot_prod_with_idx_v2(query, index_q.int(), key, index_k.int(), table_q, table_k, rel_index.int()) 53 | loss = output_v2.mean() 54 | loss.backward() 55 | 56 | print("output_v2.shape: {}, output_v2[:5,:10]: {}".format(output_v2.shape, output_v2[:5,:10])) 57 | print("v2 query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 58 | print("v2 table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) 59 | print("v2 key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 60 | print("v2 table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) 61 | # input() 62 | 63 | print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) 64 | 65 | -------------------------------------------------------------------------------- /lib/pointops/src/featuredistribute/featuredistribute_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "featuredistribute_cuda_kernel.h" 7 | 8 | extern THCState *state; 9 | 10 | #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") 11 | #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") 12 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) 13 | 14 | 15 | void featuredistribute_cuda(int b, int n, int m, at::Tensor max_xyz_tensor, at::Tensor xyz_tensor, at::Tensor distribute_idx_tensor) 16 | { 17 | CHECK_INPUT(max_xyz_tensor); 18 | CHECK_INPUT(xyz_tensor); 19 | 20 | const float *max_xyz = max_xyz_tensor.data(); 21 | const float *xyz = xyz_tensor.data(); 22 | int *distribute_idx = distribute_idx_tensor.data(); 23 | 24 | cudaStream_t stream = 
THCState_getCurrentStream(state); 25 | 26 | featuredistribute_cuda_launcher(b, n, m, max_xyz, xyz, distribute_idx, stream); 27 | } 28 | 29 | 30 | void featuregather_forward_cuda(int b, int n, int m, int c, at::Tensor max_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor distribute_feature_tensor) 31 | { 32 | CHECK_INPUT(max_feature_tensor); 33 | CHECK_INPUT(distribute_idx_tensor); 34 | 35 | const float *max_feature = max_feature_tensor.data(); 36 | const int *distribute_idx = distribute_idx_tensor.data(); 37 | float *distribute_feature = distribute_feature_tensor.data(); 38 | 39 | cudaStream_t stream = THCState_getCurrentStream(state); 40 | 41 | featuregather_forward_cuda_launcher(b, n, m, c, max_feature, distribute_idx, distribute_feature, stream); 42 | } 43 | 44 | 45 | void featuregather_backward_cuda(int b, int n, int m, int c, at::Tensor grad_distribute_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor grad_max_feature_tensor) 46 | { 47 | CHECK_INPUT(grad_distribute_feature_tensor); 48 | CHECK_INPUT(distribute_idx_tensor); 49 | 50 | const float *grad_distribute_feature = grad_distribute_feature_tensor.data(); 51 | const int *distribute_idx = distribute_idx_tensor.data(); 52 | float *grad_max_feature = grad_max_feature_tensor.data(); 53 | 54 | cudaStream_t stream = THCState_getCurrentStream(state); 55 | 56 | featuregather_backward_cuda_launcher(b, n, m, c, grad_distribute_feature, distribute_idx, grad_max_feature, stream); 57 | } -------------------------------------------------------------------------------- /config/s3dis/s3dis_swin3d_transformer.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | data_name: s3dis 3 | data_root: # Fill in the pre-processed data path (which contains the .npy files) 4 | test_area: 5 5 | classes: 13 6 | fea_dim: 6 7 | voxel_size: 0.04 8 | voxel_max: 80000 9 | loop: 30 10 | 11 | TRAIN: 12 | #arch 13 | arch: swin3d_transformer 14 | stem_transformer: True 15 | use_xyz: True 16 | sync_bn: True # adopt sync_bn or not 17 | rel_query: True 18 | rel_key: True 19 | rel_value: True 20 | quant_size: 0.01 21 | num_layers: 4 22 | patch_size: 1 23 | window_size: 4 24 | depths: [2, 2, 6, 2] 25 | channels: [48, 96, 192, 384] 26 | num_heads: [3, 6, 12, 24] 27 | up_k: 3 28 | drop_path_rate: 0.3 29 | concat_xyz: True 30 | grid_size: 0.04 31 | max_batch_points: 140000 32 | max_num_neighbors: 34 # For KPConv 33 | ratio: 0.25 34 | k: 16 35 | 36 | # training 37 | aug: True 38 | transformer_lr_scale: 0.1 39 | jitter_sigma: 0.005 40 | jitter_clip: 0.02 41 | scheduler_update: epoch 42 | scheduler: MultiStep 43 | warmup: linear 44 | warmup_iters: 1500 45 | warmup_ratio: 0.000001 46 | use_amp: True 47 | optimizer: AdamW 48 | ignore_label: 255 49 | train_gpu: [0, 1, 2, 3] 50 | workers: 16 # data loader workers 51 | batch_size: 8 # batch size for training 52 | batch_size_val: 4 # batch size for validation during training, memory and speed tradeoff 53 | base_lr: 0.006 54 | epochs: 100 55 | start_epoch: 0 56 | step_epoch: 30 57 | multiplier: 0.1 58 | momentum: 0.9 59 | weight_decay: 0.01 60 | drop_rate: 0.5 61 | manual_seed: 123 62 | print_freq: 1 63 | save_freq: 1 64 | save_path: runs/s3dis_swin3d_transformer 65 | weight: # path to initial weight (default: none) 66 | resume: # path to latest checkpoint (default: none) 67 | evaluate: True # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend 68 | eval_freq: 1 69 | Distributed: 70 | dist_url: tcp://127.0.0.1:6789 71 | 
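# Note: the Distributed fields below follow the standard PyTorch DDP launch recipe;
# they are presumably forwarded to torch.distributed.init_process_group(backend=dist_backend,
# init_method=dist_url, ...), with world_size and rank recomputed per spawned process
# when multiprocessing_distributed is True.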
dist_backend: 'nccl' 72 | multiprocessing_distributed: True 73 | world_size: 1 74 | rank: 0 75 | 76 | TEST: 77 | test_list: dataset/s3dis/list/val5.txt 78 | test_list_full: dataset/s3dis/list/val5_full.txt 79 | split: val # split in [train, val and test] 80 | test_gpu: [0] 81 | test_workers: 4 82 | batch_size_test: 4 83 | model_path: # Fill the path of the trained .pth file model 84 | save_folder: # Fill the path to store the .npy files for each scene 85 | names_path: data/s3dis/s3dis_names.txt 86 | -------------------------------------------------------------------------------- /config/s3dis/s3dis_stratified_transformer.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | data_name: s3dis 3 | data_root: # Fill in the pre-processed data path (which contains the .npy files) 4 | test_area: 5 5 | classes: 13 6 | fea_dim: 6 7 | voxel_size: 0.04 8 | voxel_max: 80000 9 | loop: 30 10 | 11 | TRAIN: 12 | #arch 13 | arch: stratified_transformer 14 | stem_transformer: True 15 | use_xyz: True 16 | sync_bn: True # adopt sync_bn or not 17 | rel_query: True 18 | rel_key: True 19 | rel_value: True 20 | quant_size: 0.01 21 | downsample_scale: 8 22 | num_layers: 4 23 | patch_size: 1 24 | window_size: 4 25 | depths: [2, 2, 6, 2] 26 | channels: [48, 96, 192, 384] 27 | num_heads: [3, 6, 12, 24] 28 | up_k: 3 29 | drop_path_rate: 0.3 30 | concat_xyz: True 31 | grid_size: 0.04 32 | max_batch_points: 140000 33 | max_num_neighbors: 34 # For KPConv 34 | ratio: 0.25 35 | k: 16 36 | 37 | # training 38 | aug: True 39 | transformer_lr_scale: 0.1 40 | jitter_sigma: 0.005 41 | jitter_clip: 0.02 42 | scheduler_update: epoch 43 | scheduler: MultiStep 44 | warmup: linear 45 | warmup_iters: 1500 46 | warmup_ratio: 0.000001 47 | use_amp: True 48 | optimizer: AdamW 49 | ignore_label: 255 50 | train_gpu: [0, 1, 2, 3] 51 | workers: 16 # data loader workers 52 | batch_size: 8 # batch size for training 53 | batch_size_val: 4 # batch size for validation during training, memory and speed tradeoff 54 | base_lr: 0.006 55 | epochs: 100 56 | start_epoch: 0 57 | step_epoch: 30 58 | multiplier: 0.1 59 | momentum: 0.9 60 | weight_decay: 0.01 61 | drop_rate: 0.5 62 | manual_seed: 123 63 | print_freq: 1 64 | save_freq: 1 65 | save_path: runs/s3dis_stratified_transformer 66 | weight: # path to initial weight (default: none) 67 | resume: # path to latest checkpoint (default: none) 68 | evaluate: True # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend 69 | eval_freq: 1 70 | Distributed: 71 | dist_url: tcp://127.0.0.1:6789 72 | dist_backend: 'nccl' 73 | multiprocessing_distributed: True 74 | world_size: 1 75 | rank: 0 76 | 77 | TEST: 78 | test_list: dataset/s3dis/list/val5.txt 79 | test_list_full: dataset/s3dis/list/val5_full.txt 80 | split: val # split in [train, val and test] 81 | test_gpu: [0] 82 | test_workers: 4 83 | batch_size_test: 4 84 | model_path: # Fill the path of the trained .pth file model 85 | save_folder: # Fill the path to store the .npy files for each scene 86 | names_path: data/s3dis/s3dis_names.txt 87 | -------------------------------------------------------------------------------- /config/scannetv2/scannetv2_stratified_transformer.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | data_name: scannetv2 3 | data_root: # Fill in the pre-processed data path (which contains the 'train', 'val', 'test' directories) 4 | classes: 20 5 | fea_dim: 6 6 | voxel_size: 0.02 7 | voxel_max: 120000 8 | 
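# loop appears to multiply the dataset length, i.e. each epoch traverses every scene
# this many times (6 here vs. 30 for S3DIS, which has far fewer scenes).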
loop: 6 9 | 10 | TRAIN: 11 | # arch 12 | arch: stratified_transformer 13 | stem_transformer: False 14 | use_xyz: True 15 | sync_bn: True # adopt sync_bn or not 16 | rel_query: True 17 | rel_key: True 18 | rel_value: True 19 | quant_size: 0.005 20 | downsample_scale: 4 21 | num_layers: 5 22 | patch_size: 1 23 | window_size: 5 24 | depths: [3,3,9,3,3] 25 | channels: [48, 96, 192, 384, 384] 26 | num_heads: [3, 6, 12, 24, 24] 27 | up_k: 3 28 | drop_path_rate: 0.3 29 | concat_xyz: True 30 | grid_size: 0.02 31 | max_batch_points: 250000 32 | max_num_neighbors: 34 # For KPConv 33 | ratio: 0.25 34 | k: 16 35 | 36 | # training 37 | aug: True 38 | transformer_lr_scale: 0.1 39 | scheduler_update: step 40 | scheduler: MultiStepWithWarmup 41 | warmup: linear 42 | warmup_iters: 3000 43 | warmup_ratio: 0.000001 44 | use_amp: True 45 | optimizer: AdamW #SGD 46 | train_gpu: [0, 1, 2, 3] 47 | workers: 16 # data loader workers 48 | batch_size: 8 # batch size for training 49 | batch_size_val: 4 # batch size for validation during training, memory and speed tradeoff 50 | base_lr: 0.006 51 | epochs: 100 52 | start_epoch: 0 53 | step_epoch: 30 54 | multiplier: 0.1 55 | momentum: 0.9 56 | weight_decay: 0.05 57 | drop_rate: 0.5 58 | 59 | ignore_label: -100 #255 60 | manual_seed: 123 61 | print_freq: 1 62 | save_freq: 1 63 | save_path: runs/sacnnetv2_stratified_transformer 64 | weight: # path to initial weight (default: none) 65 | resume: # path to latest checkpoint (default: none) 66 | evaluate: True # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend 67 | eval_freq: 1 68 | Distributed: 69 | dist_url: tcp://127.0.0.1:6789 70 | dist_backend: 'nccl' 71 | multiprocessing_distributed: True 72 | world_size: 1 73 | rank: 0 74 | 75 | TEST: 76 | data_root_val: # Fill the path that contains the scenes of the validation set (e.g., "[YOUR PATH]/val") 77 | split: val # split in [train, val and test] 78 | test_gpu: [0] 79 | test_workers: 4 80 | batch_size_test: 4 81 | model_path: # Fill the path of the trained .pth file model 82 | save_folder: # Fill the path to store the .npy files for each scene 83 | names_path: data/scannet/scannet_names.txt 84 | -------------------------------------------------------------------------------- /config/scannetv2/scannetv2_swin3d_transformer.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | data_name: scannetv2 3 | data_root: # Fill in the pre-processed data path (which contains the 'train', 'val', 'test' directories) 4 | classes: 20 5 | fea_dim: 6 6 | voxel_size: 0.02 7 | voxel_max: 120000 8 | loop: 6 9 | 10 | TRAIN: 11 | # arch 12 | arch: swin3d_transformer 13 | stem_transformer: False 14 | use_xyz: True 15 | sync_bn: True # adopt sync_bn or not 16 | rel_query: True 17 | rel_key: True 18 | rel_value: True 19 | quant_size: 0.005 20 | num_layers: 5 21 | patch_size: 1 22 | window_size: 5 23 | depths: [3,3,9,3,3] 24 | channels: [48, 96, 192, 384, 384] 25 | num_heads: [3, 6, 12, 24, 24] 26 | up_k: 3 27 | drop_path_rate: 0.3 28 | concat_xyz: True 29 | grid_size: 0.02 30 | max_batch_points: 250000 31 | max_num_neighbors: 34 # For KPConv 32 | ratio: 0.25 33 | k: 16 34 | 35 | # training 36 | aug: True 37 | transformer_lr_scale: 0.1 38 | scheduler_update: step 39 | scheduler: MultiStepWithWarmup 40 | warmup: linear 41 | warmup_iters: 3000 42 | warmup_ratio: 0.000001 43 | use_amp: True 44 | optimizer: AdamW #SGD 45 | train_gpu: [0, 1, 2, 3] 46 | workers: 16 # data loader workers 47 | batch_size: 8 # batch size for 
training 48 | batch_size_val: 4 # batch size for validation during training, memory and speed tradeoff 49 | base_lr: 0.006 50 | epochs: 100 51 | start_epoch: 0 52 | step_epoch: 30 53 | multiplier: 0.1 54 | momentum: 0.9 55 | weight_decay: 0.05 56 | drop_rate: 0.5 57 | 58 | ignore_label: -100 #255 59 | manual_seed: 123 60 | print_freq: 1 61 | save_freq: 1 62 | save_path: runs/sacnnetv2_swin3d_transformer 63 | weight: # path to initial weight (default: none) 64 | resume: # path to latest checkpoint (default: none) 65 | evaluate: True # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend 66 | eval_freq: 1 67 | Distributed: 68 | dist_url: tcp://127.0.0.1:6789 69 | dist_backend: 'nccl' 70 | multiprocessing_distributed: True 71 | world_size: 1 72 | rank: 0 73 | 74 | TEST: 75 | data_root_val: # Fill the path that contains the scenes of the validation set (e.g., "[YOUR PATH]/val") 76 | test_list: 77 | test_list_full: 78 | split: val # split in [train, val and test] 79 | test_gpu: [0] 80 | test_workers: 4 81 | batch_size_test: 4 82 | model_path: # Fill the path of the trained .pth file model 83 | save_folder: # Fill the path to store the .npy files for each scene 84 | names_path: data/scannet/scannet_names.txt 85 | -------------------------------------------------------------------------------- /lib/pointops2/src/subtraction/subtraction_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "subtraction_cuda_kernel.h" 3 | 4 | 5 | __global__ void subtraction_forward_cuda_kernel(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) { 6 | // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c) 7 | int index = blockIdx.x * blockDim.x + threadIdx.x; 8 | if (index >= n * nsample * c) return; 9 | const int c_idx = index % c; 10 | const int nsample_idx = (index / c) % nsample; 11 | const int n_idx = index / nsample / c; 12 | const int idx_idx = n_idx * nsample + nsample_idx; 13 | const int input1_idx = n_idx * c + c_idx; 14 | const int input2_idx = idx[idx_idx] * c + c_idx; 15 | output[index] = input1[input1_idx] - input2[input2_idx]; 16 | } 17 | 18 | __global__ void subtraction_backward_cuda_kernel(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) { 19 | // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) 20 | int index = blockIdx.x * blockDim.x + threadIdx.x; 21 | if (index >= n * nsample * c) return; 22 | const int c_idx = index % c; 23 | const int nsample_idx = (index / c) % nsample; 24 | const int n_idx = index / nsample / c; 25 | const int idx_idx = n_idx * nsample + nsample_idx; 26 | const int input1_idx = n_idx * c + c_idx; 27 | const int input2_idx = idx[idx_idx] * c + c_idx; 28 | atomicAdd(grad_input1 + input1_idx, grad_output[index]); 29 | atomicAdd(grad_input2 + input2_idx, -grad_output[index]); 30 | } 31 | 32 | void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) { 33 | // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c) 34 | dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK)); 35 | dim3 threads(THREADS_PER_BLOCK); 36 | subtraction_forward_cuda_kernel<<>>(n, nsample, c, input1, input2, idx, output); 37 | } 38 | 39 | void subtraction_backward_cuda_launcher(int n, int nsample, int c, 
const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) { 40 | // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) 41 | dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK)); 42 | dim3 threads(THREADS_PER_BLOCK); 43 | subtraction_backward_cuda_kernel<<>>(n, nsample, c, idx, grad_output, grad_input1, grad_input2); 44 | } 45 | -------------------------------------------------------------------------------- /lib/pointops/src/interpolation/interpolation_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "interpolation_cuda_kernel.h" 6 | 7 | extern THCState *state; 8 | 9 | void nearestneighbor_cuda(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) 10 | { 11 | const float *unknown = unknown_tensor.data(); 12 | const float *known = known_tensor.data(); 13 | float *dist2 = dist2_tensor.data(); 14 | int *idx = idx_tensor.data(); 15 | nearestneighbor_cuda_launcher(b, n, m, unknown, known, dist2, idx); 16 | } 17 | 18 | void interpolation_forward_cuda(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor) 19 | { 20 | const float *points = points_tensor.data(); 21 | const float *weight = weight_tensor.data(); 22 | float *out = out_tensor.data(); 23 | const int *idx = idx_tensor.data(); 24 | interpolation_forward_cuda_launcher(b, c, m, n, points, idx, weight, out); 25 | } 26 | 27 | void interpolation_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor) 28 | { 29 | const float *grad_out = grad_out_tensor.data(); 30 | const float *weight = weight_tensor.data(); 31 | float *grad_points = grad_points_tensor.data(); 32 | const int *idx = idx_tensor.data(); 33 | interpolation_backward_cuda_launcher(b, c, n, m, grad_out, idx, weight, grad_points); 34 | } 35 | 36 | void nearestneighbor_cuda_fast(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) { 37 | const float *unknown = unknown_tensor.data(); 38 | const float *known = known_tensor.data(); 39 | float *dist2 = dist2_tensor.data(); 40 | int *idx = idx_tensor.data(); 41 | nearestneighbor_cuda_launcher_fast(b, n, m, unknown, known, dist2, idx); 42 | } 43 | 44 | void interpolation_forward_cuda_fast(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor) { 45 | 46 | const float *points = points_tensor.data(); 47 | const float *weight = weight_tensor.data(); 48 | float *out = out_tensor.data(); 49 | const int *idx = idx_tensor.data(); 50 | interpolation_forward_cuda_launcher_fast(b, c, m, n, points, idx, weight, out); 51 | } -------------------------------------------------------------------------------- /lib/pointops2/functions/test_relative_pos_encoding_op_step2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 80000 8 | N = 3500 9 | hdim = 16 10 | h = 6 11 | L = 31 12 | attn = torch.rand(M, h).cuda() 13 | v = torch.rand(N, h, hdim).cuda() 14 | table = torch.rand(L, h, hdim, 3).cuda() 15 | 16 | index_0 = torch.rand(M) 17 
| index_0[index_0 < 0] = 0 18 | index_0 = (index_0*N).long().cuda() 19 | 20 | index_1 = torch.rand(M) 21 | index_1[index_1 < 0] = 0 22 | index_1 = (index_1*N).long().cuda() 23 | 24 | rel_index = torch.rand(M, 3) 25 | rel_index[rel_index < 0] = 0 26 | rel_index = (rel_index*L).long().cuda() 27 | 28 | attn.requires_grad = True 29 | v.requires_grad = True 30 | table.requires_grad = True 31 | 32 | v_flat = v[index_1] #[M, h, hdim] 33 | table_x, table_y, table_z = table[:,:,:,0], table[:,:,:,1], table[:,:,:,2] #[L, h, hdim] 34 | rel_index_x, rel_index_y, rel_index_z = rel_index[:,0], rel_index[:,1], rel_index[:,2] #[M] 35 | rel_pos_encoding = table_x[rel_index_x] + table_y[rel_index_y] + table_z[rel_index_z] #[M, h, hdim] 36 | v_flat_new = v_flat + rel_pos_encoding #[M, h, hdim] 37 | output = attn.unsqueeze(-1) * v_flat_new #[M, h, hdim] 38 | output = scatter_sum(src=output, index=index_0, dim=0, dim_size=N) #[N, h, hdim] 39 | loss = output.mean() 40 | loss.backward() 41 | 42 | print("output.shape: {}, output[:5,:10,:5]: {}".format(output.shape, output[:5,:10, :5])) 43 | print("attn.grad[:5, :3]: ", attn.grad[:5, :3]) 44 | print("v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) 45 | print("table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) 46 | input() 47 | 48 | # print("query.is_contiguous(): ", query.is_contiguous()) 49 | # print("key.is_contiguous(): ", key.is_contiguous()) 50 | # print("index_0.is_contiguous(): ", index_0.is_contiguous()) 51 | # print("index_1.is_contiguous(): ", index_1.is_contiguous()) 52 | 53 | # output_v2 = pointops.attention_step2_with_rel_pos_value(attn, v, index_0.int(), index_1.int(), table, rel_index.int()) 54 | # loss = output_v2.mean() 55 | # loss.backward() 56 | 57 | # print("output_v2.shape: {}, output_v2[:5,:10,:5]: {}".format(output_v2.shape, output_v2[:5,:10,:5])) 58 | # print("v2 attn.grad[:5, :3]: ", attn.grad[:5, :3]) 59 | # print("v2 v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) 60 | # print("v2 table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) 61 | # input() 62 | 63 | # print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) 64 | 65 | # print("torch.max((attn_flat-attn_flat_v2)**2): ", torch.max((attn_flat-attn_flat_v2)**2)) 66 | 67 | -------------------------------------------------------------------------------- /lib/pointops/src/knnquery/knnquery_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "knnquery_cuda_kernel.h" 3 | 4 | // input: xyz (b, n, 3) new_xyz (b, m, 3) 5 | // output: idx (b, m, nsample) dist2 (b, m, nsample) 6 | __global__ void knnquery_cuda_kernel(int b, int n, int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, int *__restrict__ idx, float *__restrict__ dist2) { 7 | int bs_idx = blockIdx.y; 8 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 9 | if (bs_idx >= b || pt_idx >= m) return; 10 | 11 | new_xyz += bs_idx * m * 3 + pt_idx * 3; 12 | xyz += bs_idx * n * 3; 13 | idx += bs_idx * m * nsample + pt_idx * nsample; 14 | 15 | float new_x = new_xyz[0]; 16 | float new_y = new_xyz[1]; 17 | float new_z = new_xyz[2]; 18 | 19 | //double* best = new double[nsample]; 20 | //int* besti = new int[nsample]; 21 | double best[200]; 22 | int besti[200]; 23 | for(int i = 0; i < nsample; i++){ 24 | best[i] = 1e40; 25 | besti[i] = 0; 26 | } 27 | for(int k = 0; k < n; k++){ 28 | float x = xyz[k * 3 + 0]; 29 | float y = xyz[k * 3 + 1]; 30 | float z = xyz[k * 3 + 2]; 31 | float d2 = (new_x - x) * (new_x 
- x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 32 | for(int j = 0; j < nsample; j++){ 33 | if(d2 < best[j]){ 34 | for(int i = nsample - 1; i > j; i--){ 35 | best[i] = best[i - 1]; 36 | besti[i] = besti[i - 1]; 37 | } 38 | best[j] = d2; 39 | besti[j] = k; 40 | break; 41 | } 42 | } 43 | } 44 | for(int i = 0; i < nsample; i++){ 45 | idx[i] = besti[i]; 46 | dist2[i] = best[i]; 47 | } 48 | //delete []best; 49 | //delete []besti; 50 | } 51 | 52 | 53 | void knnquery_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream) { 54 | // param new_xyz: (B, m, 3) 55 | // param xyz: (B, n, 3) 56 | // param idx: (B, m, nsample) 57 | 58 | cudaError_t err; 59 | 60 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 61 | dim3 threads(THREADS_PER_BLOCK); 62 | 63 | knnquery_cuda_kernel<<>>(b, n, m, nsample, xyz, new_xyz, idx, dist2); 64 | // cudaDeviceSynchronize(); // for using printf in kernel function 65 | 66 | err = cudaGetLastError(); 67 | if (cudaSuccess != err) { 68 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 69 | exit(-1); 70 | } 71 | } -------------------------------------------------------------------------------- /lib/pointops/src/grouping_int/grouping_int_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "grouping_int_cuda_kernel.h" 3 | 4 | // input: points(b, c, n) idx(b, m, nsample) 5 | // output: out(b, c, m, nsample) 6 | __global__ void grouping_int_forward_cuda_kernel(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out) 7 | { 8 | int batch_index = blockIdx.x; 9 | points += batch_index * n * c; 10 | idx += batch_index * m * nsample; 11 | out += batch_index * m * nsample * c; 12 | const int index = threadIdx.y * blockDim.x + threadIdx.x; 13 | const int stride = blockDim.y * blockDim.x; 14 | for (int i = index; i < c * m; i += stride) 15 | { 16 | const int l = i / m; 17 | const int j = i % m; 18 | for (int k = 0; k < nsample; ++k) 19 | { 20 | int ii = idx[j * nsample + k]; 21 | out[(l * m + j) * nsample + k] = points[l * n + ii]; 22 | } 23 | } 24 | } 25 | 26 | 27 | void grouping_int_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out) 28 | { 29 | grouping_int_forward_cuda_kernel<<>>(b, c, n, m, nsample, points, idx, out); 30 | } 31 | 32 | 33 | __global__ void grouping_int_forward_cuda_kernel_fast(int b, int c, int n, int npoints, int nsample, const long int *__restrict__ points, const int *__restrict__ idx, long int *__restrict__ out) 34 | { 35 | int bs_idx = blockIdx.z; 36 | int c_idx = blockIdx.y; 37 | int index = blockIdx.x * blockDim.x + threadIdx.x; 38 | int pt_idx = index / nsample; 39 | if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; 40 | 41 | int sample_idx = index % nsample; 42 | 43 | idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; 44 | int in_idx = bs_idx * c * n + c_idx * n + idx[0]; 45 | int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; 46 | 47 | out[out_idx] = points[in_idx]; 48 | } 49 | 50 | 51 | void grouping_int_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const long int *points, const int *idx, long int *out) 52 | { 53 | cudaError_t err; 54 | 55 | dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // 
blockIdx.x(col), blockIdx.y(row) 56 | dim3 threads(THREADS_PER_BLOCK); 57 | 58 | grouping_int_forward_cuda_kernel_fast<<>>(b, c, n, npoints, nsample, points, idx, out); 59 | // cudaDeviceSynchronize(); // for using printf in kernel function 60 | err = cudaGetLastError(); 61 | if (cudaSuccess != err) { 62 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 63 | exit(-1); 64 | } 65 | } -------------------------------------------------------------------------------- /lib/pointops/src/labelstat/labelstat_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "labelstat_cuda_kernel.h" 7 | 8 | extern THCState *state; 9 | 10 | #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") 11 | #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") 12 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) 13 | 14 | void labelstat_idx_cuda_fast(int b, int n, int m, int nsample, int nclass, 15 | at::Tensor label_stat_tensor, at::Tensor idx_tensor, at::Tensor new_label_stat_tensor) 16 | { 17 | CHECK_INPUT(label_stat_tensor); 18 | CHECK_INPUT(idx_tensor); 19 | 20 | const int *label_stat = label_stat_tensor.data(); 21 | const int *idx = idx_tensor.data(); 22 | int *new_label_stat = new_label_stat_tensor.data(); 23 | 24 | cudaStream_t stream = THCState_getCurrentStream(state); 25 | 26 | labelstat_idx_cuda_launcher_fast(b, n, m, nsample, nclass, label_stat, idx, new_label_stat, stream); 27 | } 28 | 29 | void labelstat_ballrange_cuda_fast(int b, int n, int m, float radius, int nclass, 30 | at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor new_label_stat_tensor) 31 | { 32 | CHECK_INPUT(new_xyz_tensor); 33 | CHECK_INPUT(xyz_tensor); 34 | CHECK_INPUT(label_stat_tensor); 35 | 36 | const float *new_xyz = new_xyz_tensor.data(); 37 | const float *xyz = xyz_tensor.data(); 38 | const int *label_stat = label_stat_tensor.data(); 39 | int *new_label_stat = new_label_stat_tensor.data(); 40 | 41 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(state); 42 | 43 | labelstat_ballrange_cuda_launcher_fast(b, n, m, radius, nclass, new_xyz, xyz, label_stat, new_label_stat, stream); 44 | } 45 | 46 | void labelstat_and_ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, int nclass, 47 | at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor idx_tensor, at::Tensor new_label_stat_tensor) 48 | { 49 | CHECK_INPUT(new_xyz_tensor); 50 | CHECK_INPUT(xyz_tensor); 51 | CHECK_INPUT(label_stat_tensor); 52 | CHECK_INPUT(idx_tensor); 53 | 54 | const float *new_xyz = new_xyz_tensor.data(); 55 | const float *xyz = xyz_tensor.data(); 56 | const int *label_stat = label_stat_tensor.data(); 57 | int *idx = idx_tensor.data(); 58 | int *new_label_stat = new_label_stat_tensor.data(); 59 | 60 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(state); 61 | 62 | labelstat_and_ballquery_cuda_launcher_fast(b, n, m, radius, nsample, nclass, new_xyz, xyz, label_stat, idx, new_label_stat, stream); 63 | } 64 | -------------------------------------------------------------------------------- /lib/pointops2/src/attention/attention_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "attention_cuda_kernel.h" 6 | 7 | void attention_step1_forward_cuda(int N, int M, int h, int C, 
at::Tensor q_tensor, at::Tensor k_tensor, 8 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor) 9 | { 10 | const float *q = q_tensor.data_ptr(); 11 | const float *k = k_tensor.data_ptr(); 12 | const int *index0 = index0_tensor.data_ptr(); 13 | const int *index1 = index1_tensor.data_ptr(); 14 | float *attn = attn_tensor.data_ptr(); 15 | attention_step1_forward_cuda_launcher(N, M, h, C, q, k, index0, index1, attn); 16 | } 17 | 18 | void attention_step1_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, 19 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, 20 | at::Tensor grad_q_tensor, at::Tensor grad_k_tensor) 21 | { 22 | const float *grad_out = grad_out_tensor.data_ptr(); 23 | const int *index0 = index0_tensor.data_ptr(); 24 | const int *index1 = index1_tensor.data_ptr(); 25 | const float *q = q_tensor.data_ptr(); 26 | const float *k = k_tensor.data_ptr(); 27 | float *grad_q = grad_q_tensor.data_ptr(); 28 | float *grad_k = grad_k_tensor.data_ptr(); 29 | attention_step1_backward_cuda_launcher(N, M, h, C, grad_out, index0, index1, q, k, grad_q, grad_k); 30 | } 31 | 32 | void attention_step2_forward_cuda(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, 33 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor) 34 | { 35 | const float *attn = attn_tensor.data_ptr(); 36 | const float *v = v_tensor.data_ptr(); 37 | const int *index0 = index0_tensor.data_ptr(); 38 | const int *index1 = index1_tensor.data_ptr(); 39 | float *output = output_tensor.data_ptr(); 40 | attention_step2_forward_cuda_launcher(N, M, h, C, attn, v, index0, index1, output); 41 | } 42 | 43 | 44 | void attention_step2_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, 45 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, 46 | at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor) 47 | { 48 | const float *grad_out = grad_out_tensor.data_ptr(); 49 | const int *index0 = index0_tensor.data_ptr(); 50 | const int *index1 = index1_tensor.data_ptr(); 51 | const float *attn = attn_tensor.data_ptr(); 52 | const float *v = v_tensor.data_ptr(); 53 | float *grad_attn = grad_attn_tensor.data_ptr(); 54 | float *grad_v = grad_v_tensor.data_ptr(); 55 | attention_step2_backward_cuda_launcher(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); 56 | } 57 | -------------------------------------------------------------------------------- /lib/pointops2/functions/test_attention_op_step1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 800000 8 | N = 35000 9 | C = 96 10 | h = 6 11 | query = torch.rand(N, h, C//h).cuda() 12 | key = torch.rand(N, h, C//h).cuda() 13 | 14 | index_0 = torch.rand(M) 15 | index_0[index_0 < 0] = 0 16 | index_0 = (index_0*N).long().cuda() 17 | 18 | index_1 = torch.rand(M) 19 | index_1[index_1 < 0] = 0 20 | index_1 = (index_1*N).long().cuda() 21 | 22 | query.requires_grad = True 23 | key.requires_grad = True 24 | 25 | # rearrange index for acceleration 26 | index_0, indices = torch.sort(index_0) #[M,] 27 | index_1 = index_1[indices] #[M,] 28 | index_0_counts = index_0.bincount() 29 | 30 | print("index_0_counts.shape: ", index_0_counts.shape) 31 | 32 | n_max = index_0_counts.max() 33 | index_0_offsets = 
index_0_counts.cumsum(dim=-1) #[N] 34 | 35 | print("v1 index_0_offsets.shape: ", index_0_offsets.shape) 36 | 37 | index_0_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_0_offsets], 0) #[N+1] 38 | 39 | # print("index_0[:100]: ", index_0[:100]) 40 | print("n_max: ", n_max) 41 | print("index_0_offsets.shape: ", index_0_offsets.shape) 42 | # input() 43 | 44 | print("index_0_offsets[:100]: ", index_0_offsets[:100]) 45 | print("index_1[300:320]: ", index_1[300:320]) 46 | 47 | 48 | attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) 49 | # loss = attn_flat.sum() 50 | # loss.backward() 51 | print("attn_flat.shape: {}, attn_flat[300:320,:10]: {}".format(attn_flat.shape, attn_flat[300:320,:10])) 52 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 53 | # print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 54 | # input() 55 | 56 | print("query.is_contiguous(): ", query.is_contiguous()) 57 | print("key.is_contiguous(): ", key.is_contiguous()) 58 | print("index_0.is_contiguous(): ", index_0.is_contiguous()) 59 | print("index_1.is_contiguous(): ", index_1.is_contiguous()) 60 | 61 | attn_flat_v2 = pointops.attention_step1_v2(query.float(), key.float(), index_1.int(), index_0_offsets.int(), n_max) 62 | # loss = attn_flat_v2.sum() 63 | # loss.backward() 64 | print("attn_flat_v2.shape: {}, attn_flat_v2[300:320,:10]: {}".format(attn_flat_v2.shape, attn_flat_v2[300:320,:10])) 65 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 66 | # print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 67 | # input() 68 | 69 | mask = attn_flat_v2.sum(-1) != 0 70 | print("mask.sum(): ", mask.sum()) 71 | print("attn_flat_v2[mask] - attn_flat[mask]: ", ((attn_flat_v2[mask] - attn_flat[mask])**2).max()) 72 | 73 | 74 | print("((attn_flat-attn_flat_v2)**2 < 1e-8).all(): ", ((attn_flat-attn_flat_v2)**2 < 1e-8).all()) 75 | 76 | selected = 10000 77 | print("torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0): ", torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0)) 78 | 79 | -------------------------------------------------------------------------------- /lib/pointops2/src/attention_v2/attention_cuda_v2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "attention_cuda_kernel_v2.h" 6 | 7 | void attention_step1_forward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor q_tensor, at::Tensor k_tensor, 8 | at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor attn_tensor) 9 | { 10 | const float *q = q_tensor.data_ptr(); 11 | const float *k = k_tensor.data_ptr(); 12 | const int *index0_offsets = index0_tensor_offsets.data_ptr(); 13 | const int *index1 = index1_tensor.data_ptr(); 14 | float *attn = attn_tensor.data_ptr(); 15 | attention_step1_forward_cuda_launcher_v2(N, M, h, C, n_max, q, k, index0_offsets, index1, attn); 16 | } 17 | 18 | void attention_step1_backward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor grad_out_tensor, 19 | at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, 20 | at::Tensor grad_q_tensor, at::Tensor grad_k_tensor) 21 | { 22 | const float *grad_out = grad_out_tensor.data_ptr(); 23 | const int *index0_offsets = index0_tensor_offsets.data_ptr(); 24 | const int *index1 = index1_tensor.data_ptr(); 25 | const float *q = q_tensor.data_ptr(); 26 | const float *k = k_tensor.data_ptr(); 27 | float 
*grad_q = grad_q_tensor.data_ptr(); 28 | float *grad_k = grad_k_tensor.data_ptr(); 29 | attention_step1_backward_cuda_launcher_v2(N, M, h, C, n_max, grad_out, index0_offsets, index1, q, k, grad_q, grad_k); 30 | } 31 | 32 | void attention_step2_forward_cuda_v2(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, 33 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor) 34 | { 35 | const float *attn = attn_tensor.data_ptr(); 36 | const float *v = v_tensor.data_ptr(); 37 | const int *index0 = index0_tensor.data_ptr(); 38 | const int *index1 = index1_tensor.data_ptr(); 39 | float *output = output_tensor.data_ptr(); 40 | attention_step2_forward_cuda_launcher_v2(N, M, h, C, attn, v, index0, index1, output); 41 | } 42 | 43 | 44 | void attention_step2_backward_cuda_v2(int N, int M, int h, int C, at::Tensor grad_out_tensor, 45 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, 46 | at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor) 47 | { 48 | const float *grad_out = grad_out_tensor.data_ptr(); 49 | const int *index0 = index0_tensor.data_ptr(); 50 | const int *index1 = index1_tensor.data_ptr(); 51 | const float *attn = attn_tensor.data_ptr(); 52 | const float *v = v_tensor.data_ptr(); 53 | float *grad_attn = grad_attn_tensor.data_ptr(); 54 | float *grad_v = grad_v_tensor.data_ptr(); 55 | attention_step2_backward_cuda_launcher_v2(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); 56 | } 57 | -------------------------------------------------------------------------------- /lib/pointops2/functions/test_relative_pos_encoding_op_step2_v2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 80000 8 | N = 3500 9 | hdim = 16 10 | h = 6 11 | L = 31 12 | attn = torch.rand(M, h).cuda() 13 | v = torch.rand(N, h, hdim).cuda() 14 | table = torch.rand(L, h, hdim, 3).cuda() 15 | 16 | index_0 = torch.rand(M) 17 | index_0[index_0 < 0] = 0 18 | index_0 = (index_0*N).long().cuda() 19 | 20 | index_1 = torch.rand(M) 21 | index_1[index_1 < 0] = 0 22 | index_1 = (index_1*N).long().cuda() 23 | 24 | rel_index = torch.rand(M, 3) 25 | rel_index[rel_index < 0] = 0 26 | rel_index = (rel_index*L).long().cuda() 27 | 28 | 29 | # rearrange index for acceleration 30 | index_0, indices = torch.sort(index_0) #[M,] 31 | index_1 = index_1[indices] #[M,] 32 | rel_index = rel_index[indices] 33 | index_0_counts = index_0.bincount() 34 | 35 | print("index_0_counts.shape: ", index_0_counts.shape) 36 | 37 | n_max = index_0_counts.max() 38 | index_0_offsets = index_0_counts.cumsum(dim=-1) #[N] 39 | 40 | print("v1 index_0_offsets.shape: ", index_0_offsets.shape) 41 | 42 | index_0_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_0_offsets], 0) #[N+1] 43 | 44 | 45 | attn.requires_grad = True 46 | v.requires_grad = True 47 | table.requires_grad = True 48 | 49 | 50 | output = pointops.attention_step2_with_rel_pos_value(attn, v, index_0.int(), index_1.int(), table, rel_index.int()) 51 | loss = output.mean() 52 | loss.backward() 53 | 54 | print("output.shape: {}, output[:5,:10,:5]: {}".format(output.shape, output[:5,:10, :5])) 55 | print("attn.grad[:5, :3]: ", attn.grad[:5, :3]) 56 | print("v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) 57 | print("table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) 58 | # input() 59 
| 60 | attn_grad = attn.grad.clone() 61 | v_grad = v.grad.clone() 62 | table_grad = table.grad.clone() 63 | 64 | attn.grad.zero_() 65 | v.grad.zero_() 66 | table.grad.zero_() 67 | 68 | # print("query.is_contiguous(): ", query.is_contiguous()) 69 | # print("key.is_contiguous(): ", key.is_contiguous()) 70 | # print("index_0.is_contiguous(): ", index_0.is_contiguous()) 71 | # print("index_1.is_contiguous(): ", index_1.is_contiguous()) 72 | 73 | output_v2 = pointops.attention_step2_with_rel_pos_value_v2(attn, v, index_0_offsets.int(), n_max, index_1.int(), table, rel_index.int()) 74 | loss = output_v2.mean() 75 | loss.backward() 76 | 77 | print("output_v2.shape: {}, output_v2[:5,:10,:5]: {}".format(output_v2.shape, output_v2[:5,:10,:5])) 78 | print("v2 attn.grad[:5, :3]: ", attn.grad[:5, :3]) 79 | print("v2 v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) 80 | print("v2 table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) 81 | # input() 82 | 83 | print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) 84 | 85 | print("((attn_grad-attn.grad)**2).max(): ", ((attn_grad-attn.grad)**2).max()) 86 | 87 | print("((v_grad-v.grad)**2).max(): ", ((v_grad-v.grad)**2).max()) 88 | 89 | print("((table_grad-table.grad)**2).max(): ", ((table_grad-table.grad)**2).max()) 90 | 91 | # print("torch.max((attn_flat-attn_flat_v2)**2): ", torch.max((attn_flat-attn_flat_v2)**2)) 92 | 93 | -------------------------------------------------------------------------------- /lib/cpp_wrappers/cpp_utils/cloud/cloud.h: -------------------------------------------------------------------------------- 1 | // 2 | // 3 | // 0==========================0 4 | // | Local feature test | 5 | // 0==========================0 6 | // 7 | // version 1.0 : 8 | // > 9 | // 10 | //--------------------------------------------------- 11 | // 12 | // Cloud header 13 | // 14 | //---------------------------------------------------- 15 | // 16 | // Hugues THOMAS - 10/02/2017 17 | // 18 | 19 | 20 | # pragma once 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include 32 | 33 | 34 | 35 | 36 | // Point class 37 | // *********** 38 | 39 | 40 | class PointXYZ 41 | { 42 | public: 43 | 44 | // Elements 45 | // ******** 46 | 47 | float x, y, z; 48 | 49 | 50 | // Methods 51 | // ******* 52 | 53 | // Constructor 54 | PointXYZ() { x = 0; y = 0; z = 0; } 55 | PointXYZ(float x0, float y0, float z0) { x = x0; y = y0; z = z0; } 56 | 57 | // array type accessor 58 | float operator [] (int i) const 59 | { 60 | if (i == 0) return x; 61 | else if (i == 1) return y; 62 | else return z; 63 | } 64 | 65 | // opperations 66 | float dot(const PointXYZ P) const 67 | { 68 | return x * P.x + y * P.y + z * P.z; 69 | } 70 | 71 | float sq_norm() 72 | { 73 | return x*x + y*y + z*z; 74 | } 75 | 76 | PointXYZ cross(const PointXYZ P) const 77 | { 78 | return PointXYZ(y*P.z - z*P.y, z*P.x - x*P.z, x*P.y - y*P.x); 79 | } 80 | 81 | PointXYZ& operator+=(const PointXYZ& P) 82 | { 83 | x += P.x; 84 | y += P.y; 85 | z += P.z; 86 | return *this; 87 | } 88 | 89 | PointXYZ& operator-=(const PointXYZ& P) 90 | { 91 | x -= P.x; 92 | y -= P.y; 93 | z -= P.z; 94 | return *this; 95 | } 96 | 97 | PointXYZ& operator*=(const float& a) 98 | { 99 | x *= a; 100 | y *= a; 101 | z *= a; 102 | return *this; 103 | } 104 | }; 105 | 106 | 107 | // Point Opperations 108 | // ***************** 109 | 110 | inline PointXYZ operator + (const PointXYZ A, const PointXYZ B) 111 | { 112 | return PointXYZ(A.x + B.x, A.y + B.y, A.z 
+ B.z); 113 | } 114 | 115 | inline PointXYZ operator - (const PointXYZ A, const PointXYZ B) 116 | { 117 | return PointXYZ(A.x - B.x, A.y - B.y, A.z - B.z); 118 | } 119 | 120 | inline PointXYZ operator * (const PointXYZ P, const float a) 121 | { 122 | return PointXYZ(P.x * a, P.y * a, P.z * a); 123 | } 124 | 125 | inline PointXYZ operator * (const float a, const PointXYZ P) 126 | { 127 | return PointXYZ(P.x * a, P.y * a, P.z * a); 128 | } 129 | 130 | inline std::ostream& operator << (std::ostream& os, const PointXYZ P) 131 | { 132 | return os << "[" << P.x << ", " << P.y << ", " << P.z << "]"; 133 | } 134 | 135 | inline bool operator == (const PointXYZ A, const PointXYZ B) 136 | { 137 | return A.x == B.x && A.y == B.y && A.z == B.z; 138 | } 139 | 140 | inline PointXYZ floor(const PointXYZ P) 141 | { 142 | return PointXYZ(std::floor(P.x), std::floor(P.y), std::floor(P.z)); 143 | } 144 | 145 | 146 | PointXYZ max_point(std::vector<PointXYZ> points); 147 | PointXYZ min_point(std::vector<PointXYZ> points); 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /lib/pointops2/functions/test_relative_pos_encoding_op_step1_v3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 80000 8 | N = 3500 9 | # M = 80 10 | # N = 5 11 | hdim = 16 12 | h = 6 13 | L = 31 14 | query = torch.rand(N, h, hdim).cuda() 15 | table_q = torch.rand(L, h, hdim, 3).cuda() 16 | key = torch.rand(N, h, hdim).cuda() 17 | table_k = torch.rand(L, h, hdim, 3).cuda() 18 | 19 | index_q = torch.rand(M) 20 | index_q[index_q < 0] = 0 21 | index_q = (index_q*N).long().cuda() 22 | 23 | index_k = torch.rand(M) 24 | index_k[index_k < 0] = 0 25 | index_k = (index_k*N).long().cuda() 26 | 27 | rel_index = torch.rand(M, 3) 28 | rel_index[rel_index < 0] = 0 29 | rel_index = (rel_index*L).long().cuda() 30 | 31 | 32 | # rearrange index for acceleration 33 | index_q, indices = torch.sort(index_q) #[M,] 34 | index_k = index_k[indices] #[M,] 35 | rel_index = rel_index[indices] 36 | index_q_counts = index_q.bincount() 37 | 38 | print("index_q_counts.shape: ", index_q_counts.shape) 39 | 40 | n_max = index_q_counts.max() 41 | index_q_offsets = index_q_counts.cumsum(dim=-1) #[N] 42 | 43 | print("v1 index_q_offsets.shape: ", index_q_offsets.shape) 44 | 45 | index_q_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_q_offsets], 0) #[N+1] 46 | 47 | # print("index_q[:100]: ", index_q[:100]) 48 | print("n_max: ", n_max) 49 | print("index_q_offsets.shape: ", index_q_offsets.shape) 50 | # input() 51 | 52 | print("index_q_offsets[:100]: ", index_q_offsets[:100]) 53 | print("index_k[:20]: ", index_k[:20]) 54 | 55 | query.requires_grad = True 56 | table_q.requires_grad = True 57 | key.requires_grad = True 58 | table_k.requires_grad = True 59 | 60 | output1 = pointops.dot_prod_with_idx(query, index_q.int(), table_q, rel_index.int()) 61 | output2 = pointops.dot_prod_with_idx(key, index_k.int(), table_k, rel_index.int()) 62 | output = output1 + output2 63 | loss = output.mean() 64 | loss.backward() 65 | 66 | # print("output.shape: {}, output[:5,:10]: {}".format(output.shape, output[:5,:10])) 67 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 68 | # print("table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) 69 | # print("key.grad[:5, :3, :5]: ", 
key.grad[:5, :3, :5]) 70 | # print("table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) 71 | # input() 72 | 73 | # print("query.is_contiguous(): ", query.is_contiguous()) 74 | # print("key.is_contiguous(): ", key.is_contiguous()) 75 | # print("index_q.is_contiguous(): ", index_q.is_contiguous()) 76 | # print("index_k.is_contiguous(): ", index_k.is_contiguous()) 77 | 78 | output_v2 = pointops.dot_prod_with_idx_v3(query, index_q_offsets.int(), n_max, key, index_k.int(), table_q, table_k, rel_index.int()) 79 | # loss = output_v2.mean() 80 | # loss.backward() 81 | 82 | # print("output_v2.shape: {}, output_v2[:5,:10]: {}".format(output_v2.shape, output_v2[:5,:10])) 83 | # print("v2 query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 84 | # print("v2 table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) 85 | # print("v2 key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 86 | # print("v2 table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) 87 | # input() 88 | 89 | print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) 90 | 91 | -------------------------------------------------------------------------------- /util/scannet_v2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import SharedArray as SA 4 | 5 | import torch 6 | from torch.utils.data import Dataset 7 | 8 | from util.voxelize import voxelize 9 | from util.data_util import sa_create, collate_fn 10 | from util.data_util import data_prepare_scannet as data_prepare 11 | import glob 12 | 13 | class Scannetv2(Dataset): 14 | def __init__(self, split='train', data_root='trainval', voxel_size=0.04, voxel_max=None, transform=None, shuffle_index=False, loop=1): 15 | super().__init__() 16 | 17 | self.split = split 18 | self.data_root = data_root 19 | self.voxel_size = voxel_size 20 | self.voxel_max = voxel_max 21 | self.transform = transform 22 | self.shuffle_index = shuffle_index 23 | self.loop = loop 24 | 25 | if split == "train" or split == 'val': 26 | self.data_list = glob.glob(os.path.join(data_root, split, "*.pth")) 27 | elif split == 'trainval': 28 | self.data_list = glob.glob(os.path.join(data_root, "train", "*.pth")) + glob.glob(os.path.join(data_root, "val", "*.pth")) 29 | else: 30 | raise ValueError("no such split: {}".format(split)) 31 | 32 | print("voxel_size: ", voxel_size) 33 | print("Totally {} samples in {} set.".format(len(self.data_list), split)) 34 | 35 | def __getitem__(self, idx): 36 | # data_idx = self.data_idx[idx % len(self.data_idx)] 37 | 38 | # data = SA.attach("shm://{}".format(self.data_list[data_idx])).copy() 39 | data_idx = idx % len(self.data_list) 40 | data_path = self.data_list[data_idx] 41 | data = torch.load(data_path) 42 | 43 | coord, feat = data[0], data[1] 44 | if self.split != 'test': 45 | label = data[2] 46 | 47 | coord, feat, label = data_prepare(coord, feat, label, self.split, self.voxel_size, self.voxel_max, self.transform, self.shuffle_index) 48 | return coord, feat, label 49 | 50 | def __len__(self): 51 | # return len(self.data_idx) * self.loop 52 | return len(self.data_list) * self.loop 53 | 54 | 55 | if __name__ == '__main__': 56 | data_root = '/home/share/Dataset/s3dis' 57 | test_area, voxel_size, voxel_max = 5, 0.04, 80000 58 | 59 | point_data = Scannetv2(split='train', data_root=data_root, voxel_size=voxel_size, voxel_max=voxel_max) 60 | print('point data size:', point_data.__len__()) 61 | import torch, time, random 62 | manual_seed = 123 63 | 
random.seed(manual_seed) 64 | np.random.seed(manual_seed) 65 | torch.manual_seed(manual_seed) 66 | torch.cuda.manual_seed_all(manual_seed) 67 | def worker_init_fn(worker_id): 68 | random.seed(manual_seed + worker_id) 69 | train_loader = torch.utils.data.DataLoader(point_data, batch_size=1, shuffle=False, num_workers=0, pin_memory=True, collate_fn=collate_fn) 70 | for idx in range(1): 71 | end = time.time() 72 | voxel_num = [] 73 | for i, (coord, feat, label, offset) in enumerate(train_loader): 74 | print('time: {}/{}--{}'.format(i+1, len(train_loader), time.time() - end)) 75 | print('tag', coord.shape, feat.shape, label.shape, offset.shape, torch.unique(label)) 76 | voxel_num.append(label.shape[0]) 77 | end = time.time() 78 | print(np.sort(np.array(voxel_num))) 79 | -------------------------------------------------------------------------------- /util/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import functools 4 | import sys 5 | from termcolor import colored 6 | 7 | class _ColorfulFormatter(logging.Formatter): 8 | def __init__(self, *args, **kwargs): 9 | self._root_name = kwargs.pop("root_name") + "." 10 | self._abbrev_name = kwargs.pop("abbrev_name", "") 11 | if len(self._abbrev_name): 12 | self._abbrev_name = self._abbrev_name + "." 13 | super(_ColorfulFormatter, self).__init__(*args, **kwargs) 14 | 15 | def formatMessage(self, record): 16 | record.name = record.name.replace(self._root_name, self._abbrev_name) 17 | log = super(_ColorfulFormatter, self).formatMessage(record) 18 | if record.levelno == logging.WARNING: 19 | prefix = colored("WARNING", "red", attrs=["blink"]) 20 | elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL: 21 | prefix = colored("ERROR", "red", attrs=["blink", "underline"]) 22 | else: 23 | return log 24 | return prefix + " " + log 25 | 26 | 27 | # so that calling get_logger multiple times won't add many handlers 28 | @functools.lru_cache() 29 | def get_logger( 30 | output=None, color=True, name="main-logger", abbrev_name=None 31 | ): 32 | """ 33 | Initialize the logger and set its verbosity level to DEBUG. 34 | Args: 35 | output (str): a file name or a directory to save log. If None, will not save log file. 36 | If ends with ".txt" or ".log", assumed to be a file name. 37 | Otherwise, logs will be saved to `output/log.txt`. 
38 | name (str): the root module name of this logger 39 | Returns: 40 | logging.Logger: a logger 41 | """ 42 | logger = logging.getLogger(name) 43 | logger.setLevel(logging.DEBUG) 44 | logger.propagate = False 45 | 46 | if abbrev_name is None: 47 | abbrev_name = name 48 | 49 | plain_formatter = logging.Formatter( 50 | "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S" 51 | ) 52 | # stdout logging: master only 53 | ch = logging.StreamHandler(stream=sys.stdout) 54 | ch.setLevel(logging.DEBUG) 55 | if color: 56 | formatter = _ColorfulFormatter( 57 | colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s", 58 | datefmt="%m/%d %H:%M:%S", 59 | root_name=name, 60 | abbrev_name=str(abbrev_name), 61 | ) 62 | else: 63 | formatter = plain_formatter 64 | ch.setFormatter(formatter) 65 | logger.addHandler(ch) 66 | 67 | # file logging: also master only 68 | if output is not None: 69 | if output.endswith(".txt") or output.endswith(".log"): 70 | filename = output 71 | else: 72 | filename = os.path.join(output, "log.txt") 73 | os.makedirs(os.path.dirname(filename), exist_ok=True) 74 | 75 | fh = logging.StreamHandler(_cached_log_stream(filename)) 76 | fh.setLevel(logging.DEBUG) 77 | fh.setFormatter(plain_formatter) 78 | logger.addHandler(fh) 79 | 80 | return logger 81 | 82 | # cache the opened file object, so that different calls to `get_logger` 83 | # with the same file name can safely write to the same file. 84 | @functools.lru_cache(maxsize=None) 85 | def _cached_log_stream(filename): 86 | return open(filename, "a") -------------------------------------------------------------------------------- /util/s3dis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import SharedArray as SA 4 | 5 | import torch 6 | from torch.utils.data import Dataset 7 | 8 | from util.voxelize import voxelize 9 | from util.data_util import sa_create, collate_fn 10 | from util.data_util import data_prepare_v101 as data_prepare 11 | 12 | 13 | 14 | class S3DIS(Dataset): 15 | def __init__(self, split='train', data_root='trainval', test_area=5, voxel_size=0.04, voxel_max=None, transform=None, shuffle_index=False, loop=1): 16 | super().__init__() 17 | self.split, self.voxel_size, self.transform, self.voxel_max, self.shuffle_index, self.loop = split, voxel_size, transform, voxel_max, shuffle_index, loop 18 | data_list = sorted(os.listdir(data_root)) 19 | data_list = [item[:-4] for item in data_list if 'Area_' in item] 20 | if split == 'train': 21 | self.data_list = [item for item in data_list if not 'Area_{}'.format(test_area) in item] 22 | else: 23 | self.data_list = [item for item in data_list if 'Area_{}'.format(test_area) in item] 24 | self.data_root = data_root 25 | # for item in self.data_list: 26 | # if not os.path.exists("/dev/shm/{}".format(item)): 27 | # data_path = os.path.join(data_root, item + '.npy') 28 | # data = np.load(data_path) # xyzrgbl, N*7 29 | # sa_create("shm://{}".format(item), data) 30 | self.data_idx = np.arange(len(self.data_list)) 31 | print("Totally {} samples in {} set.".format(len(self.data_idx), split)) 32 | 33 | def __getitem__(self, idx): 34 | data_idx = self.data_idx[idx % len(self.data_idx)] 35 | 36 | # data = SA.attach("shm://{}".format(self.data_list[data_idx])).copy() 37 | item = self.data_list[data_idx] 38 | data_path = os.path.join(self.data_root, item + '.npy') 39 | data = np.load(data_path) 40 | 41 | coord, feat, label = data[:, 0:3], data[:, 3:6], data[:, 6] 42 | coord, feat, label = 
data_prepare(coord, feat, label, self.split, self.voxel_size, self.voxel_max, self.transform, self.shuffle_index) 43 | return coord, feat, label 44 | 45 | def __len__(self): 46 | return len(self.data_idx) * self.loop 47 | 48 | 49 | if __name__ == '__main__': 50 | data_root = '/home/share/Dataset/s3dis' 51 | test_area, voxel_size, voxel_max = 5, 0.04, 80000 52 | 53 | point_data = S3DIS(split='train', data_root=data_root, test_area=test_area, voxel_size=voxel_size, voxel_max=voxel_max) 54 | print('point data size:', point_data.__len__()) 55 | import torch, time, random 56 | manual_seed = 123 57 | random.seed(manual_seed) 58 | np.random.seed(manual_seed) 59 | torch.manual_seed(manual_seed) 60 | torch.cuda.manual_seed_all(manual_seed) 61 | def worker_init_fn(worker_id): 62 | random.seed(manual_seed + worker_id) 63 | train_loader = torch.utils.data.DataLoader(point_data, batch_size=1, shuffle=False, num_workers=0, pin_memory=True, collate_fn=collate_fn) 64 | for idx in range(1): 65 | end = time.time() 66 | voxel_num = [] 67 | for i, (coord, feat, label, offset) in enumerate(train_loader): 68 | print('time: {}/{}--{}'.format(i+1, len(train_loader), time.time() - end)) 69 | print('tag', coord.shape, feat.shape, label.shape, offset.shape, torch.unique(label)) 70 | voxel_num.append(label.shape[0]) 71 | end = time.time() 72 | print(np.sort(np.array(voxel_num))) 73 | -------------------------------------------------------------------------------- /lib/pointops2/src/aggregation/aggregation_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "aggregation_cuda_kernel.h" 3 | 4 | 5 | __global__ void aggregation_forward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) { 6 | // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c) 7 | int index = blockIdx.x * blockDim.x + threadIdx.x; 8 | if (index >= n * c) return; 9 | const int c_idx = index % c; 10 | const int n_idx = index / c; 11 | const int w_c_idx = c_idx % w_c; 12 | for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++) 13 | { 14 | int idx_idx = n_idx * nsample + nsample_idx; 15 | int input_idx = idx[idx_idx] * c + c_idx; 16 | int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx; 17 | int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx; 18 | output[index] += (input[input_idx] + position[position_idx]) * weight[weight_idx]; 19 | } 20 | } 21 | 22 | __global__ void aggregation_backward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) { 23 | // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c) 24 | int index = blockIdx.x * blockDim.x + threadIdx.x; 25 | if (index >= n * c) return; 26 | const int c_idx = index % c; 27 | const int n_idx = index / c; 28 | const int w_c_idx = c_idx % w_c; 29 | for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++) 30 | { 31 | int idx_idx = n_idx * nsample + nsample_idx; 32 | int input_idx = idx[idx_idx] * c + c_idx; 33 | int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx; 34 | int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx; 35 | atomicAdd(grad_input + input_idx, 
grad_output[index] * weight[weight_idx]); 36 | grad_position[position_idx] = grad_output[index] * weight[weight_idx]; 37 | atomicAdd(grad_weight + weight_idx, grad_output[index] * (input[input_idx] + position[position_idx])); 38 | } 39 | } 40 | 41 | void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) { 42 | // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c) 43 | dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); 44 | dim3 threads(THREADS_PER_BLOCK); 45 | aggregation_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, nsample, c, w_c, input, position, weight, idx, output); 46 | } 47 | 48 | void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) { 49 | // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c) 50 | dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); 51 | dim3 threads(THREADS_PER_BLOCK); 52 | aggregation_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight); 53 | } 54 | -------------------------------------------------------------------------------- /lib/pointops2/src/rpe/relative_pos_encoding_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/serialize/tensor.h> 2 | #include <vector> 3 | #include <torch/types.h> 4 | #include <ATen/cuda/CUDAContext.h> 5 | #include "relative_pos_encoding_cuda_kernel.h" 6 | 7 | void dot_prod_with_idx_forward_cuda(int N, int M, int h, int hdim, at::Tensor q_tensor, at::Tensor index_tensor, 8 | at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor) 9 | { 10 | const float *q = q_tensor.data_ptr<float>(); 11 | const float *table = table_tensor.data_ptr<float>(); 12 | const int *index = index_tensor.data_ptr<int>(); 13 | const int *rel_idx = rel_idx_tensor.data_ptr<int>(); 14 | float *output = output_tensor.data_ptr<float>(); 15 | dot_prod_with_idx_forward_cuda_launcher(N, M, h, hdim, q, index, table, rel_idx, output); 16 | } 17 | 18 | void dot_prod_with_idx_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, 19 | at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, 20 | at::Tensor grad_q_tensor, at::Tensor grad_table_tensor) 21 | { 22 | const float *grad_out = grad_out_tensor.data_ptr<float>(); 23 | const float *q = q_tensor.data_ptr<float>(); 24 | const int *index = index_tensor.data_ptr<int>(); 25 | const float *table = table_tensor.data_ptr<float>(); 26 | const int *rel_idx = rel_idx_tensor.data_ptr<int>(); 27 | float *grad_q = grad_q_tensor.data_ptr<float>(); 28 | float *grad_table = grad_table_tensor.data_ptr<float>(); 29 | dot_prod_with_idx_backward_cuda_launcher(N, M, h, hdim, grad_out, q, index, table, rel_idx, grad_q, grad_table); 30 | } 31 | 32 | void attention_step2_with_rel_pos_value_forward_cuda(int N, int M, int h, int hdim, at::Tensor attn_tensor, at::Tensor v_tensor, 33 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor) 34 | { 35 | const float *attn = attn_tensor.data_ptr<float>(); 36 | const float *v = v_tensor.data_ptr<float>(); 37 | const int *index0 = index0_tensor.data_ptr<int>(); 38 | const int *index1 = index1_tensor.data_ptr<int>(); 39 | const float *table = table_tensor.data_ptr<float>(); 40 | const int *rel_idx = 
rel_idx_tensor.data_ptr<int>(); 41 | float *output = output_tensor.data_ptr<float>(); 42 | attention_step2_with_rel_pos_value_forward_cuda_launcher(N, M, h, hdim, attn, v, index0, index1, table, rel_idx, output); 43 | } 44 | 45 | void attention_step2_with_rel_pos_value_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, 46 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, 47 | at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor) 48 | { 49 | const float *grad_out = grad_out_tensor.data_ptr<float>(); 50 | const int *index0 = index0_tensor.data_ptr<int>(); 51 | const int *index1 = index1_tensor.data_ptr<int>(); 52 | const float *attn = attn_tensor.data_ptr<float>(); 53 | const float *v = v_tensor.data_ptr<float>(); 54 | const float *table = table_tensor.data_ptr<float>(); 55 | const int *rel_idx = rel_idx_tensor.data_ptr<int>(); 56 | float *grad_attn = grad_attn_tensor.data_ptr<float>(); 57 | float *grad_v = grad_v_tensor.data_ptr<float>(); 58 | float *grad_table = grad_table_tensor.data_ptr<float>(); 59 | attention_step2_with_rel_pos_value_backward_cuda_launcher(N, M, h, hdim, grad_out, index0, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); 60 | } 61 | -------------------------------------------------------------------------------- /lib/pointops2/src/knnquery/knnquery_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "knnquery_cuda_kernel.h" 3 | 4 | 5 | __device__ void swap_float(float *x, float *y) 6 | { 7 | float tmp = *x; 8 | *x = *y; 9 | *y = tmp; 10 | } 11 | 12 | 13 | __device__ void swap_int(int *x, int *y) 14 | { 15 | int tmp = *x; 16 | *x = *y; 17 | *y = tmp; 18 | } 19 | 20 | 21 | __device__ void reheap(float *dist, int *idx, int k) 22 | { 23 | int root = 0; 24 | int child = root * 2 + 1; 25 | while (child < k) 26 | { 27 | if(child + 1 < k && dist[child+1] > dist[child]) 28 | child++; 29 | if(dist[root] > dist[child]) 30 | return; 31 | swap_float(&dist[root], &dist[child]); 32 | swap_int(&idx[root], &idx[child]); 33 | root = child; 34 | child = root * 2 + 1; 35 | } 36 | } 37 | 38 | 39 | __device__ void heap_sort(float *dist, int *idx, int k) 40 | { 41 | int i; 42 | for (i = k - 1; i > 0; i--) 43 | { 44 | swap_float(&dist[0], &dist[i]); 45 | swap_int(&idx[0], &idx[i]); 46 | reheap(dist, idx, i); 47 | } 48 | } 49 | 50 | 51 | __device__ int get_bt_idx(int idx, const int *offset) 52 | { 53 | int i = 0; 54 | while (1) 55 | { 56 | if (idx < offset[i]) 57 | break; 58 | else 59 | i++; 60 | } 61 | return i; 62 | } 63 | 64 | 65 | __global__ void knnquery_cuda_kernel(int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, const int *__restrict__ offset, const int *__restrict__ new_offset, int *__restrict__ idx, float *__restrict__ dist2) { 66 | // input: xyz (n, 3) new_xyz (m, 3) 67 | // output: idx (m, nsample) dist2 (m, nsample) 68 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 69 | if (pt_idx >= m) return; 70 | 71 | new_xyz += pt_idx * 3; 72 | idx += pt_idx * nsample; 73 | dist2 += pt_idx * nsample; 74 | int bt_idx = get_bt_idx(pt_idx, new_offset); 75 | int start; 76 | if (bt_idx == 0) 77 | start = 0; 78 | else 79 | start = offset[bt_idx - 1]; 80 | int end = offset[bt_idx]; 81 | 82 | float new_x = new_xyz[0]; 83 | float new_y = new_xyz[1]; 84 | float new_z = new_xyz[2]; 85 | 86 | float best_dist[100]; 87 | int best_idx[100]; 88 | for(int i = 0; i < nsample; i++){ 89 | 
best_dist[i] = 1e10; 90 | best_idx[i] = start; 91 | } 92 | for(int i = start; i < end; i++){ 93 | float x = xyz[i * 3 + 0]; 94 | float y = xyz[i * 3 + 1]; 95 | float z = xyz[i * 3 + 2]; 96 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 97 | if (d2 < best_dist[0]){ 98 | best_dist[0] = d2; 99 | best_idx[0] = i; 100 | reheap(best_dist, best_idx, nsample); 101 | } 102 | } 103 | heap_sort(best_dist, best_idx, nsample); 104 | for(int i = 0; i < nsample; i++){ 105 | idx[i] = best_idx[i]; 106 | dist2[i] = best_dist[i]; 107 | } 108 | } 109 | 110 | 111 | void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2) { 112 | // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample) 113 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK)); 114 | dim3 threads(THREADS_PER_BLOCK); 115 | knnquery_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2); 116 | } 117 | -------------------------------------------------------------------------------- /lib/pointops/src/knnquery_heap/knnquery_heap_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "knnquery_heap_cuda_kernel.h" 3 | 4 | 5 | __device__ void swap_float(float *x, float *y) 6 | { 7 | float tmp = *x; 8 | *x = *y; 9 | *y = tmp; 10 | } 11 | 12 | 13 | __device__ void swap_int(int *x, int *y) 14 | { 15 | int tmp = *x; 16 | *x = *y; 17 | *y = tmp; 18 | } 19 | 20 | 21 | __device__ void reheap(float *dist, int *idx, int k) 22 | { 23 | int root = 0; 24 | int child = root * 2 + 1; 25 | while (child < k) 26 | { 27 | if(child + 1 < k && dist[child+1] > dist[child]) 28 | child++; 29 | if(dist[root] > dist[child]) 30 | return; 31 | swap_float(&dist[root], &dist[child]); 32 | swap_int(&idx[root], &idx[child]); 33 | root = child; 34 | child = root * 2 + 1; 35 | } 36 | } 37 | 38 | 39 | __device__ void heap_sort(float *dist, int *idx, int k) 40 | { 41 | int i; 42 | for (i = k - 1; i > 0; i--) 43 | { 44 | swap_float(&dist[0], &dist[i]); 45 | swap_int(&idx[0], &idx[i]); 46 | reheap(dist, idx, i); 47 | } 48 | } 49 | 50 | 51 | // input: xyz (b, n, 3) new_xyz (b, m, 3) 52 | // output: idx (b, m, nsample) dist2 (b, m, nsample) 53 | __global__ void knnquery_heap_cuda_kernel(int b, int n, int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, int *__restrict__ idx, float *__restrict__ dist2) { 54 | int bs_idx = blockIdx.y; 55 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 56 | if (bs_idx >= b || pt_idx >= m) return; 57 | 58 | new_xyz += bs_idx * m * 3 + pt_idx * 3; 59 | xyz += bs_idx * n * 3; 60 | idx += bs_idx * m * nsample + pt_idx * nsample; 61 | dist2 += bs_idx * m * nsample + pt_idx * nsample; 62 | 63 | float new_x = new_xyz[0]; 64 | float new_y = new_xyz[1]; 65 | float new_z = new_xyz[2]; 66 | 67 | float best_dist[100]; 68 | int best_idx[100]; 69 | for(int i = 0; i < nsample; i++){ 70 | best_dist[i] = 1e10; 71 | best_idx[i] = 0; 72 | } 73 | for(int i = 0; i < n; i++){ 74 | float x = xyz[i * 3 + 0]; 75 | float y = xyz[i * 3 + 1]; 76 | float z = xyz[i * 3 + 2]; 77 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 78 | if (d2 < best_dist[0]){ 79 | best_dist[0] = d2; 80 | best_idx[0] = i; 81 | reheap(best_dist, best_idx, nsample); 82 | } 83 | } 84 | heap_sort(best_dist, best_idx, nsample); 85 | for(int i = 0; i < nsample; i++){ 86 | idx[i] = best_idx[i]; 87 | 
dist2[i] = best_dist[i]; 88 | } 89 | } 90 | 91 | 92 | void knnquery_heap_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream) { 93 | // param new_xyz: (B, m, 3) 94 | // param xyz: (B, n, 3) 95 | // param idx: (B, m, nsample) 96 | 97 | cudaError_t err; 98 | 99 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 100 | dim3 threads(THREADS_PER_BLOCK); 101 | 102 | knnquery_heap_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, nsample, xyz, new_xyz, idx, dist2); 103 | // cudaDeviceSynchronize(); // for using printf in kernel function 104 | 105 | err = cudaGetLastError(); 106 | if (cudaSuccess != err) { 107 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 108 | exit(-1); 109 | } 110 | } -------------------------------------------------------------------------------- /lib/pointops2/functions/test_attention_op_step1_v2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 800000 8 | N = 35000 9 | C = 96 10 | h = 6 11 | query = torch.rand(N, h, C//h).cuda() 12 | key = torch.rand(N, h, C//h).cuda() 13 | 14 | index_0 = torch.rand(M) 15 | index_0[index_0 < 0] = 0 16 | index_0 = (index_0*N).long().cuda() 17 | 18 | index_1 = torch.rand(M) 19 | index_1[index_1 < 0] = 0 20 | index_1 = (index_1*N).long().cuda() 21 | 22 | query.requires_grad = True 23 | key.requires_grad = True 24 | 25 | 26 | attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) 27 | loss = attn_flat.sum() 28 | loss.backward() 29 | print("attn_flat.shape: {}, attn_flat[:20,:10]: {}".format(attn_flat.shape, attn_flat[:20,:10])) 30 | print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 31 | print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 32 | input() 33 | 34 | 35 | 36 | # rearrange index for acceleration 37 | index_0, indices = torch.sort(index_0) #[M,] 38 | index_1 = index_1[indices] #[M,] 39 | index_0_counts = index_0.bincount() 40 | 41 | print("index_0_counts.shape: ", index_0_counts.shape) 42 | 43 | n_max = index_0_counts.max() 44 | index_0_offsets = index_0_counts.cumsum(dim=-1) #[N] 45 | 46 | print("v1 index_0_offsets.shape: ", index_0_offsets.shape) 47 | 48 | index_0_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_0_offsets], 0) #[N+1] 49 | 50 | # print("index_0[:100]: ", index_0[:100]) 51 | print("n_max: ", n_max) 52 | print("index_0_offsets.shape: ", index_0_offsets.shape) 53 | # input() 54 | 55 | print("index_0_offsets[:100]: ", index_0_offsets[:100]) 56 | print("index_1[:20]: ", index_1[:20]) 57 | 58 | 59 | attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) 60 | # loss = attn_flat.sum() 61 | # loss.backward() 62 | # # attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) 63 | # # loss = attn_flat.sum() 64 | # # loss.backward() 65 | # print("attn_flat.shape: {}, attn_flat[:20,:10]: {}".format(attn_flat.shape, attn_flat[:20,:10])) 66 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 67 | # print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 68 | # input() 69 | 70 | print("query.is_contiguous(): ", query.is_contiguous()) 71 | print("key.is_contiguous(): ", key.is_contiguous()) 72 | print("index_0.is_contiguous(): ", index_0.is_contiguous()) 73 | 
print("index_1.is_contiguous(): ", index_1.is_contiguous()) 74 | 75 | attn_flat_v2 = pointops.attention_step1_v2(query.float(), key.float(), index_1.int(), index_0_offsets.int(), n_max) 76 | loss = attn_flat_v2.sum() 77 | loss.backward() 78 | 79 | # attn_flat_v2 = pointops.attention_step1_v2(query.float(), key.float(), index_1.int(), index_0_offsets.int(), n_max) 80 | # loss = attn_flat_v2.sum() 81 | # loss.backward() 82 | 83 | print("attn_flat_v2.shape: {}, attn_flat_v2[:20,:10]: {}".format(attn_flat_v2.shape, attn_flat_v2[:20,:10])) 84 | print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 85 | print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 86 | # input() 87 | 88 | # mask = attn_flat_v2.sum(-1) != 0 89 | # print("mask.sum(): ", mask.sum()) 90 | # print("attn_flat_v2[mask] - attn_flat[mask]: ", ((attn_flat_v2[mask] - attn_flat[mask])**2).max()) 91 | 92 | 93 | print("((attn_flat-attn_flat_v2)**2 < 1e-8).all(): ", ((attn_flat-attn_flat_v2)**2 < 1e-8).all()) 94 | 95 | selected = 10000 96 | print("torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0): ", torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0)) 97 | 98 | -------------------------------------------------------------------------------- /util/voxelize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Sequence 3 | import torch 4 | from torch_geometric.nn import voxel_grid 5 | 6 | def grid_sample(pos, batch_index, size, start=None, return_p2v=True): 7 | # pos: float [N, 3] 8 | # batch_szie: long int 9 | # size: float [3, ] 10 | # start: float [3, ] / None 11 | 12 | # print("pos.shape: {}, batch.shape: {}".format(pos.shape, batch.shape)) 13 | # print("size: ", size) 14 | 15 | # batch [N, ] 16 | batch = torch.zeros(pos.shape[0]) 17 | for i in range (1, len(batch_index)): 18 | batch[batch_index[i-1]:batch_index[i]] = i 19 | 20 | cluster = voxel_grid(pos, batch, size, start=start) #[N, ] 21 | 22 | if return_p2v == False: 23 | unique, cluster = torch.unique(cluster, sorted=True, return_inverse=True) 24 | return cluster 25 | 26 | unique, cluster, counts = torch.unique(cluster, sorted=True, return_inverse=True, return_counts=True) 27 | 28 | # print("unique.shape: {}, cluster.shape: {}, counts.shape: {}".format(unique.shape, cluster.shape, counts.shape)) 29 | 30 | # input() 31 | 32 | # obtain p2v_map 33 | n = unique.shape[0] 34 | k = counts.max().item() 35 | p2v_map = cluster.new_zeros(n, k) #[n, k] 36 | mask = torch.arange(k).cuda().unsqueeze(0) < counts.unsqueeze(-1) #[n, k] 37 | p2v_map[mask] = torch.argsort(cluster) 38 | # max_point 39 | max_point = 48 40 | if k > max_point: 41 | counts = torch.where(counts > max_point, max_point, counts) 42 | p2v_map = p2v_map[:,0:max_point] 43 | 44 | return cluster, p2v_map, counts 45 | 46 | def fnv_hash_vec(arr): 47 | """ 48 | FNV64-1A 49 | """ 50 | assert arr.ndim == 2 51 | # Floor first for negative coordinates 52 | arr = arr.copy() 53 | arr = arr.astype(np.uint64, copy=False) 54 | hashed_arr = np.uint64(14695981039346656037) * np.ones(arr.shape[0], dtype=np.uint64) 55 | for j in range(arr.shape[1]): 56 | hashed_arr *= np.uint64(1099511628211) 57 | hashed_arr = np.bitwise_xor(hashed_arr, arr[:, j]) 58 | return hashed_arr 59 | 60 | 61 | def ravel_hash_vec(arr): 62 | """ 63 | Ravel the coordinates after subtracting the min coordinates. 
64 | """ 65 | assert arr.ndim == 2 66 | arr = arr.copy() 67 | arr -= arr.min(0) 68 | arr = arr.astype(np.uint64, copy=False) 69 | arr_max = arr.max(0).astype(np.uint64) + 1 70 | 71 | keys = np.zeros(arr.shape[0], dtype=np.uint64) 72 | # Fortran style indexing 73 | for j in range(arr.shape[1] - 1): 74 | keys += arr[:, j] 75 | keys *= arr_max[j + 1] 76 | keys += arr[:, -1] 77 | return keys 78 | 79 | 80 | def voxelize(coord, voxel_size=0.05, hash_type='fnv', mode=0): 81 | discrete_coord = np.floor(coord / np.array(voxel_size)) 82 | if hash_type == 'ravel': 83 | key = ravel_hash_vec(discrete_coord) 84 | else: 85 | key = fnv_hash_vec(discrete_coord) 86 | 87 | idx_sort = np.argsort(key) 88 | key_sort = key[idx_sort] 89 | _, count = np.unique(key_sort, return_counts=True) 90 | if mode == 0: # train mode 91 | idx_select = np.cumsum(np.insert(count, 0, 0)[0:-1]) + np.random.randint(0, count.max(), count.size) % count 92 | idx_unique = idx_sort[idx_select] 93 | return idx_unique 94 | else: # val mode 95 | return idx_sort, count 96 | 97 | ''' 98 | #_, idx = np.unique(key, return_index=True) 99 | #return idx 100 | 101 | idx_sort = np.argsort(key) 102 | key_sort = key[idx_sort] 103 | _, idx_start, count = np.unique(key_sort, return_counts=True, return_index=True) 104 | idx_list = np.split(idx_sort, idx_start[1:]) 105 | return idx_list 106 | ''' 107 | -------------------------------------------------------------------------------- /lib/pointops/src/ballquery/ballquery_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "ballquery_cuda_kernel.h" 3 | 4 | // input: new_xyz(b, m, 3) xyz(b, n, 3) 5 | // output: idx(b, m, nsample) 6 | __global__ void ballquery_cuda_kernel(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx) 7 | { 8 | int batch_index = blockIdx.x; 9 | xyz += batch_index * n * 3; 10 | new_xyz += batch_index * m * 3; 11 | idx += m * nsample * batch_index; 12 | int index = threadIdx.x; 13 | int stride = blockDim.x; 14 | 15 | float radius2 = radius * radius; 16 | for (int j = index; j < m; j += stride) 17 | { 18 | float new_x = new_xyz[j * 3 + 0]; 19 | float new_y = new_xyz[j * 3 + 1]; 20 | float new_z = new_xyz[j * 3 + 2]; 21 | for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) 22 | { 23 | float x = xyz[k * 3 + 0]; 24 | float y = xyz[k * 3 + 1]; 25 | float z = xyz[k * 3 + 2]; 26 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 27 | if (d2 < radius2) 28 | { 29 | if (cnt == 0) 30 | { 31 | for (int l = 0; l < nsample; ++l) 32 | idx[j * nsample + l] = k; 33 | } 34 | idx[j * nsample + cnt] = k; 35 | ++cnt; 36 | } 37 | } 38 | } 39 | } 40 | 41 | void ballquery_cuda_launcher(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx) 42 | { 43 | ballquery_cuda_kernel<<>>(b, n, m, radius, nsample, new_xyz, xyz, idx); 44 | } 45 | 46 | 47 | __global__ void ballquery_cuda_kernel_fast(int b, int n, int m, float radius, int nsample, const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) { 48 | int bs_idx = blockIdx.y; 49 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 50 | if (bs_idx >= b || pt_idx >= m) return; 51 | 52 | new_xyz += bs_idx * m * 3 + pt_idx * 3; 53 | xyz += bs_idx * n * 3; 54 | idx += bs_idx * m * nsample + pt_idx * nsample; 55 | 56 | float radius2 = radius * radius; 57 | float new_x = new_xyz[0]; 58 | float new_y = new_xyz[1]; 59 | 
float new_z = new_xyz[2]; 60 | 61 | int cnt = 0; 62 | for (int k = 0; k < n; ++k) { 63 | float x = xyz[k * 3 + 0]; 64 | float y = xyz[k * 3 + 1]; 65 | float z = xyz[k * 3 + 2]; 66 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 67 | if (d2 < radius2){ 68 | if (cnt == 0){ 69 | for (int l = 0; l < nsample; ++l) { 70 | idx[l] = k; 71 | } 72 | } 73 | idx[cnt] = k; 74 | ++cnt; 75 | if (cnt >= nsample){ 76 | break; 77 | } 78 | } 79 | } 80 | } 81 | 82 | 83 | void ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) { 84 | // param new_xyz: (B, m, 3) 85 | // param xyz: (B, n, 3) 86 | // param idx: (B, m, nsample) 87 | 88 | cudaError_t err; 89 | 90 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 91 | dim3 threads(THREADS_PER_BLOCK); 92 | 93 | ballquery_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample, new_xyz, xyz, idx); 94 | // cudaDeviceSynchronize(); // for using printf in kernel function 95 | 96 | err = cudaGetLastError(); 97 | if (cudaSuccess != err) { 98 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 99 | exit(-1); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /lib/pointops2/src/pointops_api.cpp: -------------------------------------------------------------------------------- 1 | #include <Python.h> 2 | #include <torch/script.h> 3 | 4 | #include "knnquery/knnquery_cuda_kernel.h" 5 | #include "sampling/sampling_cuda_kernel.h" 6 | #include "grouping/grouping_cuda_kernel.h" 7 | #include "interpolation/interpolation_cuda_kernel.h" 8 | #include "aggregation/aggregation_cuda_kernel.h" 9 | #include "subtraction/subtraction_cuda_kernel.h" 10 | #include "attention/attention_cuda_kernel.h" 11 | #include "rpe/relative_pos_encoding_cuda_kernel.h" 12 | #include "attention_v2/attention_cuda_kernel_v2.h" 13 | #include "rpe_v2/relative_pos_encoding_cuda_kernel_v2.h" 14 | 15 | 16 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 17 | m.def("knnquery_cuda", &knnquery_cuda, "knnquery_cuda"); 18 | m.def("furthestsampling_cuda", &furthestsampling_cuda, "furthestsampling_cuda"); 19 | m.def("grouping_forward_cuda", &grouping_forward_cuda, "grouping_forward_cuda"); 20 | m.def("grouping_backward_cuda", &grouping_backward_cuda, "grouping_backward_cuda"); 21 | m.def("interpolation_forward_cuda", &interpolation_forward_cuda, "interpolation_forward_cuda"); 22 | m.def("interpolation_backward_cuda", &interpolation_backward_cuda, "interpolation_backward_cuda"); 23 | m.def("subtraction_forward_cuda", &subtraction_forward_cuda, "subtraction_forward_cuda"); 24 | m.def("subtraction_backward_cuda", &subtraction_backward_cuda, "subtraction_backward_cuda"); 25 | m.def("aggregation_forward_cuda", &aggregation_forward_cuda, "aggregation_forward_cuda"); 26 | m.def("aggregation_backward_cuda", &aggregation_backward_cuda, "aggregation_backward_cuda"); 27 | m.def("attention_step1_forward_cuda", &attention_step1_forward_cuda, "attention_step1_forward_cuda"); 28 | m.def("attention_step1_backward_cuda", &attention_step1_backward_cuda, "attention_step1_backward_cuda"); 29 | m.def("attention_step2_forward_cuda", &attention_step2_forward_cuda, "attention_step2_forward_cuda"); 30 | m.def("attention_step2_backward_cuda", &attention_step2_backward_cuda, "attention_step2_backward_cuda"); 31 | m.def("dot_prod_with_idx_forward_cuda", &dot_prod_with_idx_forward_cuda, "dot_prod_with_idx_forward_cuda"); 32 | 
m.def("dot_prod_with_idx_backward_cuda", &dot_prod_with_idx_backward_cuda, "dot_prod_with_idx_backward_cuda"); 33 | m.def("attention_step2_with_rel_pos_value_forward_cuda", &attention_step2_with_rel_pos_value_forward_cuda, "attention_step2_with_rel_pos_value_forward_cuda"); 34 | m.def("attention_step2_with_rel_pos_value_backward_cuda", &attention_step2_with_rel_pos_value_backward_cuda, "attention_step2_with_rel_pos_value_backward_cuda"); 35 | m.def("attention_step1_forward_cuda_v2", &attention_step1_forward_cuda_v2, "attention_step1_forward_cuda_v2"); 36 | m.def("attention_step1_backward_cuda_v2", &attention_step1_backward_cuda_v2, "attention_step1_backward_cuda_v2"); 37 | m.def("attention_step2_forward_cuda_v2", &attention_step2_forward_cuda_v2, "attention_step2_forward_cuda_v2"); 38 | m.def("attention_step2_backward_cuda_v2", &attention_step2_backward_cuda_v2, "attention_step2_backward_cuda_v2"); 39 | m.def("dot_prod_with_idx_forward_cuda_v2", &dot_prod_with_idx_forward_cuda_v2, "dot_prod_with_idx_forward_cuda_v2"); 40 | m.def("dot_prod_with_idx_backward_cuda_v2", &dot_prod_with_idx_backward_cuda_v2, "dot_prod_with_idx_backward_cuda_v2"); 41 | m.def("attention_step2_with_rel_pos_value_forward_cuda_v2", &attention_step2_with_rel_pos_value_forward_cuda_v2, "attention_step2_with_rel_pos_value_forward_cuda_v2"); 42 | m.def("attention_step2_with_rel_pos_value_backward_cuda_v2", &attention_step2_with_rel_pos_value_backward_cuda_v2, "attention_step2_with_rel_pos_value_backward_cuda_v2"); 43 | m.def("dot_prod_with_idx_forward_cuda_v3", &dot_prod_with_idx_forward_cuda_v3, "dot_prod_with_idx_forward_cuda_v3"); 44 | m.def("dot_prod_with_idx_backward_cuda_v3", &dot_prod_with_idx_backward_cuda_v3, "dot_prod_with_idx_backward_cuda_v3"); 45 | } 46 | -------------------------------------------------------------------------------- /lib/cpp_wrappers/cpp_subsampling/grid_subsampling/grid_subsampling.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "grid_subsampling.h" 3 | 4 | 5 | void grid_subsampling(vector& original_points, 6 | vector& subsampled_points, 7 | vector& original_features, 8 | vector& subsampled_features, 9 | vector& original_classes, 10 | vector& subsampled_classes, 11 | float sampleDl, 12 | int verbose) { 13 | 14 | // Initiate variables 15 | // ****************** 16 | 17 | // Number of points in the cloud 18 | size_t N = original_points.size(); 19 | 20 | // Dimension of the features 21 | size_t fdim = original_features.size() / N; 22 | size_t ldim = original_classes.size() / N; 23 | 24 | // Limits of the cloud 25 | PointXYZ minCorner = min_point(original_points); 26 | PointXYZ maxCorner = max_point(original_points); 27 | PointXYZ originCorner = floor(minCorner * (1/sampleDl)) * sampleDl; 28 | 29 | // Dimensions of the grid 30 | size_t sampleNX = (size_t)floor((maxCorner.x - originCorner.x) / sampleDl) + 1; 31 | size_t sampleNY = (size_t)floor((maxCorner.y - originCorner.y) / sampleDl) + 1; 32 | //size_t sampleNZ = (size_t)floor((maxCorner.z - originCorner.z) / sampleDl) + 1; 33 | 34 | // Check if features and classes need to be processed 35 | bool use_feature = original_features.size() > 0; 36 | bool use_classes = original_classes.size() > 0; 37 | 38 | 39 | // Create the sampled map 40 | // ********************** 41 | 42 | // Verbose parameters 43 | int i = 0; 44 | int nDisp = N / 100; 45 | 46 | // Initiate variables 47 | size_t iX, iY, iZ, mapIdx; 48 | unordered_map data; 49 | 50 | for (auto& p : original_points) 51 | { 52 | // Position 
of point in sample map 53 | iX = (size_t)floor((p.x - originCorner.x) / sampleDl); 54 | iY = (size_t)floor((p.y - originCorner.y) / sampleDl); 55 | iZ = (size_t)floor((p.z - originCorner.z) / sampleDl); 56 | mapIdx = iX + sampleNX*iY + sampleNX*sampleNY*iZ; 57 | 58 | // If not already created, create key 59 | if (data.count(mapIdx) < 1) 60 | data.emplace(mapIdx, SampledData(fdim, ldim)); 61 | 62 | // Fill the sample map 63 | if (use_feature && use_classes) 64 | data[mapIdx].update_all(p, original_features.begin() + i * fdim, original_classes.begin() + i * ldim); 65 | else if (use_feature) 66 | data[mapIdx].update_features(p, original_features.begin() + i * fdim); 67 | else if (use_classes) 68 | data[mapIdx].update_classes(p, original_classes.begin() + i * ldim); 69 | else 70 | data[mapIdx].update_points(p); 71 | 72 | // Display 73 | i++; 74 | if (verbose > 1 && i%nDisp == 0) 75 | std::cout << "\rSampled Map : " << std::setw(3) << i / nDisp << "%"; 76 | 77 | } 78 | 79 | // Divide for barycentre and transfer to a vector 80 | subsampled_points.reserve(data.size()); 81 | if (use_feature) 82 | subsampled_features.reserve(data.size() * fdim); 83 | if (use_classes) 84 | subsampled_classes.reserve(data.size() * ldim); 85 | for (auto& v : data) 86 | { 87 | subsampled_points.push_back(v.second.point * (1.0 / v.second.count)); 88 | if (use_feature) 89 | { 90 | float count = (float)v.second.count; 91 | transform(v.second.features.begin(), 92 | v.second.features.end(), 93 | v.second.features.begin(), 94 | [count](float f) { return f / count;}); 95 | subsampled_features.insert(subsampled_features.end(),v.second.features.begin(),v.second.features.end()); 96 | } 97 | if (use_classes) 98 | { 99 | for (int i = 0; i < ldim; i++) 100 | subsampled_classes.push_back(max_element(v.second.labels[i].begin(), v.second.labels[i].end(), 101 | [](const pair<int, int>&a, const pair<int, int>&b){return a.second < b.second;})->first); 102 | } 103 | } 104 | 105 | return; 106 | } 107 | -------------------------------------------------------------------------------- /lib/pointops/src/grouping/grouping_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "grouping_cuda_kernel.h" 3 | 4 | // input: points(b, c, n) idx(b, m, nsample) 5 | // output: out(b, c, m, nsample) 6 | __global__ void grouping_forward_cuda_kernel(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out) 7 | { 8 | int batch_index = blockIdx.x; 9 | points += batch_index * n * c; 10 | idx += batch_index * m * nsample; 11 | out += batch_index * m * nsample * c; 12 | const int index = threadIdx.y * blockDim.x + threadIdx.x; 13 | const int stride = blockDim.y * blockDim.x; 14 | for (int i = index; i < c * m; i += stride) 15 | { 16 | const int l = i / m; 17 | const int j = i % m; 18 | for (int k = 0; k < nsample; ++k) 19 | { 20 | int ii = idx[j * nsample + k]; 21 | out[(l * m + j) * nsample + k] = points[l * n + ii]; 22 | } 23 | } 24 | } 25 | 26 | // input: grad_out(b, c, m, nsample), idx(b, m, nsample) 27 | // output: grad_points(b, c, n) 28 | __global__ void grouping_backward_cuda_kernel(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points) 29 | { 30 | int batch_index = blockIdx.x; 31 | grad_out += batch_index * m * nsample * c; 32 | idx += batch_index * m * nsample; 33 | grad_points += batch_index * n * c; 34 | const int index = threadIdx.y * blockDim.x + threadIdx.x; 35 | const int stride = blockDim.y * blockDim.x; 
36 | for (int i = index; i < c * m; i += stride) 37 | { 38 | const int l = i / m; 39 | const int j = i % m; 40 | for (int k = 0; k < nsample; ++k) 41 | { 42 | int ii = idx[j * nsample + k]; 43 | atomicAdd(grad_points + l * n + ii, grad_out[(l * m + j) * nsample + k]); 44 | } 45 | } 46 | } 47 | 48 | void grouping_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out) 49 | { 50 | grouping_forward_cuda_kernel<<<b, opt_block_config(m, c)>>>(b, c, n, m, nsample, points, idx, out); 51 | } 52 | 53 | void grouping_backward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points) 54 | { 55 | grouping_backward_cuda_kernel<<<b, opt_block_config(m, c)>>>(b, c, n, m, nsample, grad_out, idx, grad_points); 56 | } 57 | 58 | // input: points(b, c, n) idx(b, npoints, nsample) 59 | // output: out(b, c, npoints, nsample) 60 | __global__ void grouping_forward_cuda_kernel_fast(int b, int c, int n, int npoints, int nsample, const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) { 61 | int bs_idx = blockIdx.z; 62 | int c_idx = blockIdx.y; 63 | int index = blockIdx.x * blockDim.x + threadIdx.x; 64 | int pt_idx = index / nsample; 65 | if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; 66 | 67 | int sample_idx = index % nsample; 68 | 69 | idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; 70 | int in_idx = bs_idx * c * n + c_idx * n + idx[0]; 71 | int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; 72 | 73 | out[out_idx] = points[in_idx]; 74 | } 75 | 76 | // input: points(b, c, n) idx(b, npoints, nsample) 77 | // output: out(b, c, npoints, nsample) 78 | void grouping_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out) { 79 | 80 | cudaError_t err; 81 | 82 | dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) 83 | dim3 threads(THREADS_PER_BLOCK); 84 | 85 | grouping_forward_cuda_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, points, idx, out); 86 | // cudaDeviceSynchronize(); // for using printf in kernel function 87 | err = cudaGetLastError(); 88 | if (cudaSuccess != err) { 89 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 90 | exit(-1); 91 | } 92 | } 93 | 94 | 95 | -------------------------------------------------------------------------------- /lib/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.h: -------------------------------------------------------------------------------- 1 | #ifndef _RPE_V2_CUDA_KERNEL 2 | #define _RPE_V2_CUDA_KERNEL 3 | #include <vector> 4 | #include <torch/serialize/tensor.h> 5 | #include <ATen/cuda/CUDAContext.h> 6 | 7 | void dot_prod_with_idx_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, at::Tensor q_tensor, at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, at::Tensor sort_indices_tensor, at::Tensor output_tensor); 8 | void dot_prod_with_idx_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, at::Tensor sort_indices_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor, at::Tensor grad_table_q_tensor, at::Tensor 
grad_table_k_tensor); 9 | 10 | void dot_prod_with_idx_forward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor q_tensor, at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); 11 | void dot_prod_with_idx_backward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor, at::Tensor grad_table_q_tensor, at::Tensor grad_table_k_tensor); 12 | 13 | void attention_step2_with_rel_pos_value_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); 14 | void attention_step2_with_rel_pos_value_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor); 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | void dot_prod_with_idx_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T, const float *q, const int *index_q, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *output); 21 | void dot_prod_with_idx_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T, const float *grad_out, const float *q, const int *index_q, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k); 22 | 23 | void dot_prod_with_idx_forward_cuda_launcher_v3(int N, int M, int h, int hdim, int n_max, const float *q, const int *index_q_offsets, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, float *output); 24 | void dot_prod_with_idx_backward_cuda_launcher_v3(int N, int M, int h, int hdim, int n_max, const float *grad_out, const float *q, const int *index_q_offsets, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k); 25 | 26 | void attention_step2_with_rel_pos_value_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *attn, const float *v, const int *index0_offsets, const int *index1, const float *table, const int *rel_idx, float *output); 27 | void attention_step2_with_rel_pos_value_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *grad_out, const int *index0_offsets, const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | #endif 33 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Stratified Transformer for 3D Point Cloud Segmentation 2 | *Xin Lai\*, Jianhui Liu\*, Li Jiang, Liwei Wang, Hengshuang Zhao, Shu Liu, Xiaojuan Qi, Jiaya Jia* 3 | 4 | This is the official PyTorch implementation of our paper [**Stratified Transformer for 3D Point Cloud Segmentation**](https://arxiv.org/pdf/2203.14508.pdf) that has been accepted to CVPR 2022. [\[arXiv\]](https://arxiv.org/pdf/2203.14508.pdf) [\[CVF\]](https://openaccess.thecvf.com/content/CVPR2022/papers/Lai_Stratified_Transformer_for_3D_Point_Cloud_Segmentation_CVPR_2022_paper.pdf) 5 | 6 | <div align="center">
7 |   <img src="figs/fig.jpg"/> 8 | </div>
9 | 10 | # Highlight 11 | 1. Our method (*Stratified Transformer*) achieves state-of-the-art performance on 3D point cloud semantic segmentation on both the S3DIS and ScanNetv2 datasets. **It is the first time a point-based method outperforms voxel-based ones**, such as SparseConvNet and MinkowskiNet; 12 | 2. *Stratified Transformer* is point-based, and is built on Transformer blocks with standard multi-head self-attention, enjoying a large receptive field, robust generalization ability, and competitive performance; 13 | 3. This repository develops a memory-efficient implementation with several CUDA kernels to combat the issue of **variable-length tokens**, avoiding unnecessary memory occupation by vacant tokens. We also use shared memory for further acceleration. 14 | 15 | # Get Started 16 | 17 | ## Environment 18 | 19 | 1. Install dependencies 20 | 21 | ``` 22 | pip install -r requirements.txt 23 | ``` 24 | 25 | If you have any problem with the above command, you can also install the packages individually: 26 | 27 | ``` 28 | pip install torch_sparse==0.6.12 29 | pip install torch_points3d==1.3.0 30 | pip install tensorboard timm termcolor tensorboardX 31 | ``` 32 | 33 | 2. Compile pointops 34 | 35 | Make sure `gcc` and CUDA are installed and that `nvcc` works (note that installing CUDA via conda does not provide `nvcc`, so in that case install CUDA manually). Then compile and install pointops2 as follows. (We have tested with gcc==7.5.0 and cuda==10.1.) 36 | ``` 37 | cd lib/pointops2 38 | python3 setup.py install 39 | ``` 40 | 41 | ## Datasets Preparation 42 | 43 | ### S3DIS 44 | Please refer to https://github.com/yanx27/Pointnet_Pointnet2_pytorch for S3DIS preprocessing. Then modify the `data_root` entry in the .yaml configuration file. 45 | 46 | ### ScanNetv2 47 | Please refer to https://github.com/dvlab-research/PointGroup for the ScanNetv2 preprocessing. Then change the `data_root` entry in the .yaml configuration file accordingly. 48 | 49 | ## Training 50 | 51 | ### S3DIS 52 | - Stratified Transformer 53 | ``` 54 | python3 train.py --config config/s3dis/s3dis_stratified_transformer.yaml 55 | ``` 56 | 57 | - 3DSwin Transformer (the vanilla version shown in our paper) 58 | ``` 59 | python3 train.py --config config/s3dis/s3dis_swin3d_transformer.yaml 60 | ``` 61 | 62 | ### ScanNetv2 63 | - Stratified Transformer 64 | ``` 65 | python3 train.py --config config/scannetv2/scannetv2_stratified_transformer.yaml 66 | ``` 67 | 68 | - 3DSwin Transformer (the vanilla version shown in our paper) 69 | ``` 70 | python3 train.py --config config/scannetv2/scannetv2_swin3d_transformer.yaml 71 | ``` 72 | 73 | Note: It is normal for the results on S3DIS to fluctuate between -0.5\% and +0.5\% mIoU, likely because S3DIS is relatively small; the results on ScanNetv2 are comparatively stable. 74 | 75 | ## Testing 76 | For testing, first change the `model_path`, `save_folder` and `data_root_val` (if applicable) entries accordingly. Then, run the following command. 77 | ``` 78 | python3 test.py --config [YOUR_CONFIG_PATH] 79 | ``` 80 | 81 | ## Pre-trained Models 82 | 83 | For your convenience, you can download the pre-trained models and training/testing logs from [Here](https://mycuhk-my.sharepoint.com/:f:/g/personal/1155154502_link_cuhk_edu_hk/EihXWr_HEnJIvR_M0_YRbSgBV-6VEIhmbOA9TMyCmKH35Q?e=hLAPNi). 
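
## Sanity Check for the CUDA Ops

To verify that the compiled extension works, you can run the test scripts bundled in `lib/pointops2/functions` (e.g. `test_attention_op_step1_v2.py`). The snippet below is a minimal sketch along the same lines: it compares the plain attention kernel against the memory-efficient v2 kernel, which expects attention pairs sorted by query index together with CSR-style per-query offsets. The tensor sizes here are illustrative only, not the ones used in training.

```python
import torch
import pointops  # built via `cd lib/pointops2 && python3 setup.py install`

torch.manual_seed(1)
M, N, h, hdim = 8000, 350, 6, 16                 # illustrative sizes
query = torch.rand(N, h, hdim).cuda()
key = torch.rand(N, h, hdim).cuda()
index_0 = (torch.rand(M) * N).long().cuda()      # query index of each attention pair
index_1 = (torch.rand(M) * N).long().cuda()      # key index of each attention pair

# the v2 kernel requires pairs sorted by query index, plus per-query offsets
index_0, order = torch.sort(index_0)
index_1 = index_1[order]
counts = index_0.bincount(minlength=N)
offsets = torch.cat([counts.new_zeros(1), counts.cumsum(0)])  # [N+1]

attn = pointops.attention_step1(query.float(), key.float(),
                                index_0.int(), index_1.int())
attn_v2 = pointops.attention_step1_v2(query.float(), key.float(),
                                      index_1.int(), offsets.int(), counts.max())
print("max squared difference:", ((attn - attn_v2) ** 2).max())  # expect ~0
```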
84 | 85 | 86 | # Citation 87 | If you find this project useful, please consider citing: 88 | 89 | ``` 90 | @inproceedings{lai2022stratified, 91 | title={Stratified Transformer for 3D Point Cloud Segmentation}, 92 | author={Lai, Xin and Liu, Jianhui and Jiang, Li and Wang, Liwei and Zhao, Hengshuang and Liu, Shu and Qi, Xiaojuan and Jia, Jiaya}, 93 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 94 | pages={8500--8509}, 95 | year={2022} 96 | } 97 | ``` 98 | -------------------------------------------------------------------------------- /util/vis_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import matplotlib.pyplot as pyplot 5 | 6 | colors = {'ceiling':[0,255,0], 7 | 'floor':[0,0,255], 8 | 'wall':[0,255,255], 9 | 'beam':[255,255,0], 10 | 'column':[255,0,255], 11 | 'window':[100,100,255], 12 | 'door':[200,200,100], 13 | 'table':[170,120,200], 14 | 'chair':[255,0,0], 15 | 'sofa':[200,100,100], 16 | 'bookcase':[10,200,100], 17 | 'board':[200,200,200], 18 | 'clutter':[50,50,50]} 19 | colors = list(colors.values()) 20 | 21 | colors2 = [[50,50,50]] 22 | 23 | colors7 = [[255, 0, 0], [255, 125, 0], [255, 255, 0], [0, 255, 0], [0, 255, 255], [0, 0, 255], [255, 0, 255]] 24 | 25 | colors72 = [[242,183,176], [183,205,225], [210,234,200], [219,204,226], [249,218,173], [255,255,209], [227,216,192]] 26 | 27 | colors40 = [[88,170,108], [174,105,226], [78,194,83], [198,62,165], [133,188,52], [97,101,219], [190,177,52], [139,65,168], [75,202,137], [225,66,129], 28 | [68,135,42], [226,116,210], [146,186,98], [68,105,201], [219,148,53], [85,142,235], [212,85,42], [78,176,223], [221,63,77], [68,195,195], 29 | [175,58,119], [81,175,144], [184,70,74], [40,116,79], [184,134,219], [130,137,46], [110,89,164], [92,135,74], [220,140,190], [94,103,39], 30 | [144,154,219], [160,86,40], [67,107,165], [194,170,104], [162,95,150], [143,110,44], [146,72,105], [225,142,106], [162,83,86], [227,124,143]] 31 | 32 | def write_ply_color(points, labels, out_filename, num_classes=None): 33 | """ Color (N,3) points with integer labels (N,) in range 0 ~ num_classes-1, written as OBJ-style 'v x y z r g b' lines """ 34 | labels = labels.astype(int) 35 | N = points.shape[0] 36 | if num_classes is None: 37 | num_classes = np.max(labels) + 1 38 | else: 39 | assert (num_classes > np.max(labels)) 40 | fout = open(out_filename, 'w') 41 | # colors = [pyplot.cm.hsv(i/float(num_classes)) for i in range(num_classes)] 42 | # colors = [pyplot.cm.jet(i / float(num_classes)) for i in range(num_classes)] 43 | for i in range(N): 44 | #c = colors[labels[i]] 45 | #c = [int(x * 255) for x in c] 46 | c = colors[labels[i]] 47 | fout.write('v %f %f %f %d %d %d\n' % (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) 48 | fout.close() 49 | 50 | 51 | def write_ply_rgb(points, rgb, out_filename, num_classes=None): 52 | """ Color (N,3) points with per-point RGB values (N,3), written as OBJ-style 'v x y z r g b' lines """ 53 | N = points.shape[0] 54 | fout = open(out_filename, 'w') 55 | # colors = [pyplot.cm.hsv(i/float(num_classes)) for i in range(num_classes)] 56 | # colors = [pyplot.cm.jet(i / float(num_classes)) for i in range(num_classes)] 57 | for i in range(N): 58 | #c = colors[labels[i]] 59 | #c = [int(x * 255) for x in c] 60 | c = rgb[i] 61 | fout.write('v %f %f %f %d %d %d\n' % (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) 62 | fout.close() 63 | 64 | 65 | def write_ply_color_modelnet40(points, out_filename, num_classes=None): 66 |
""" Color (N,3) points with labels (N) within range 0 ~ num_classes-1 as OBJ file """ 67 | #labels = labels.astype(int) 68 | N = points.shape[0] 69 | #if num_classes is None: 70 | # num_classes = np.max(labels) + 1 71 | #else: 72 | # assert (num_classes > np.max(labels)) 73 | fout = open(out_filename, 'w') 74 | # colors = [pyplot.cm.hsv(i/float(num_classes)) for i in range(num_classes)] 75 | # colors = [pyplot.cm.jet(i / float(num_classes)) for i in range(num_classes)] 76 | for i in range(N): 77 | #c = colors[labels[i]] 78 | #c = [int(x * 255) for x in c] 79 | c = colors2[0] 80 | fout.write('v %f %f %f %d %d %d\n' % (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) 81 | fout.close() 82 | 83 | 84 | def write_ply_color_shapenet(points, labels, out_filename, num_classes=None): 85 | """ Color (N,3) points with labels (N) within range 0 ~ num_classes-1 as OBJ file """ 86 | labels = labels.astype(int) 87 | N = points.shape[0] 88 | if num_classes is None: 89 | num_classes = np.max(labels) + 1 90 | else: 91 | assert (num_classes > np.max(labels)) 92 | fout = open(out_filename, 'w') 93 | # colors = [pyplot.cm.hsv(i/float(num_classes)) for i in range(num_classes)] 94 | # colors = [pyplot.cm.jet(i / float(num_classes)) for i in range(num_classes)] 95 | for i in range(N): 96 | #c = colors[labels[i]] 97 | #c = [int(x * 255) for x in c] 98 | c = colors7[labels[i]%7] 99 | fout.write('v %f %f %f %d %d %d\n' % (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) 100 | fout.close() -------------------------------------------------------------------------------- /lib/pointops/src/featuredistribute/featuredistribute_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "featuredistribute_cuda_kernel.h" 3 | 4 | __global__ void featuredistribute_cuda_kernel(int b, int n, int m, const float *max_xyz, const float *xyz, int *distribute_idx) { 5 | int bs_idx = blockIdx.y; 6 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (bs_idx >= b || pt_idx >= m) return; 8 | 9 | max_xyz += bs_idx * n * 3; 10 | xyz += bs_idx * m * 3 + pt_idx * 3; 11 | distribute_idx += bs_idx * m + pt_idx; 12 | 13 | float x = xyz[0]; 14 | float y = xyz[1]; 15 | float z = xyz[2]; 16 | 17 | float min_dist2 = 100000; 18 | int min_dist_idx = -1; 19 | for (int k = 0; k < n; ++k) { 20 | float max_x = max_xyz[k * 3 + 0]; 21 | float max_y = max_xyz[k * 3 + 1]; 22 | float max_z = max_xyz[k * 3 + 2]; 23 | float d2 = (max_x - x) * (max_x - x) + (max_y - y) * (max_y - y) + (max_z - z) * (max_z - z); 24 | if (d2 < min_dist2){ 25 | min_dist_idx = k; 26 | min_dist2 = d2; 27 | } 28 | } 29 | distribute_idx[0] = min_dist_idx; 30 | } 31 | 32 | 33 | void featuredistribute_cuda_launcher(int b, int n, int m, const float *max_xyz, const float *xyz, int *distribute_idx, cudaStream_t stream) { 34 | // param max_xyz: (b, n, 3) 35 | // param xyz: (b, m, 3) 36 | // return distribute_idx: (b, m) 37 | 38 | cudaError_t err; 39 | 40 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 41 | dim3 threads(THREADS_PER_BLOCK); 42 | 43 | featuredistribute_cuda_kernel<<>>(b, n, m, max_xyz, xyz, distribute_idx); 44 | // cudaDeviceSynchronize(); // for using printf in kernel function 45 | 46 | err = cudaGetLastError(); 47 | if (cudaSuccess != err) { 48 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 49 | exit(-1); 50 | } 51 | } 52 | 53 | __global__ void featuregather_forward_cuda_kernel(int b, int n, int m, int c, 
const float *max_feature, const int *distribute_idx, float *distribute_feature) { 54 | int bs_idx = blockIdx.z; 55 | int c_idx = blockIdx.y; 56 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 57 | if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; 58 | 59 | max_feature += bs_idx * c * n + c_idx * n; 60 | distribute_idx += bs_idx * m + pt_idx; 61 | distribute_feature += bs_idx * c * m + c_idx * m + pt_idx; 62 | 63 | int idx = distribute_idx[0]; 64 | distribute_feature[0] = max_feature[idx]; 65 | } 66 | 67 | 68 | void featuregather_forward_cuda_launcher(int b, int n, int m, int c, const float *max_feature, const int *distribute_idx, float *distribute_feature, cudaStream_t stream){ 69 | // param max_feature: (b, c, n) 70 | // param distribute_idx: (b, m) 71 | // return distribute_feature: (b, c, m) 72 | 73 | cudaError_t err; 74 | 75 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), c, b); // blockIdx.x(point), blockIdx.y(channel), blockIdx.z(batch) 76 | dim3 threads(THREADS_PER_BLOCK); 77 | 78 | featuregather_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, c, max_feature, distribute_idx, distribute_feature); 79 | // cudaDeviceSynchronize(); // for using printf in kernel function 80 | 81 | err = cudaGetLastError(); 82 | if (cudaSuccess != err) { 83 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 84 | exit(-1); 85 | } 86 | } 87 | 88 | 89 | __global__ void featuregather_backward_cuda_kernel(int b, int n, int m, int c, const float *grad_distribute_feature, const int *distribute_idx, float *grad_max_feature){ 90 | int bs_idx = blockIdx.z; 91 | int c_idx = blockIdx.y; 92 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 93 | if(bs_idx >= b || c_idx >= c || pt_idx >= m) return; 94 | 95 | grad_distribute_feature += bs_idx * c * m + c_idx * m + pt_idx; 96 | distribute_idx += bs_idx * m + pt_idx; 97 | grad_max_feature += bs_idx * c * n + c_idx * n; 98 | 99 | int idx = distribute_idx[0]; 100 | atomicAdd(grad_max_feature + idx, grad_distribute_feature[0]); 101 | } 102 | 103 | 104 | void featuregather_backward_cuda_launcher(int b, int n, int m, int c, const float *grad_distribute_feature, const int *distribute_idx, float *grad_max_feature, cudaStream_t stream){ 105 | // param grad_distribute_feature: (b, c, m) 106 | // param distribute_idx: (b, m) 107 | // return grad_max_feature: (b, c, n) 108 | 109 | cudaError_t err; 110 | 111 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), c, b); // blockIdx.x(point), blockIdx.y(channel), blockIdx.z(batch) 112 | dim3 threads(THREADS_PER_BLOCK); 113 | 114 | featuregather_backward_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, c, grad_distribute_feature, distribute_idx, grad_max_feature); 115 | // cudaDeviceSynchronize(); // for using printf in kernel function 116 | 117 | err = cudaGetLastError(); 118 | if (cudaSuccess != err) { 119 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 120 | exit(-1); 121 | } 122 | } -------------------------------------------------------------------------------- /lib/pointops2/src/attention/attention_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | /* written by Xin Lai.
Email: xinlai@cse.cuhk.edu.hk */ 2 | 3 | #include "../cuda_utils.h" 4 | #include "attention_cuda_kernel.h" 5 | 6 | 7 | __global__ void attention_step1_forward_cuda_kernel( // M, h, C//h 8 | int N, int M, int h, int C, const float *q, const float *k, 9 | const int *index0, const int *index1, float *attn) { 10 | 11 | int c_idx = blockIdx.z; 12 | int h_idx = blockIdx.y; 13 | int m_idx = blockIdx.x * blockDim.x + threadIdx.x; 14 | if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; 15 | 16 | int idx0 = index0[m_idx]; 17 | int idx1 = index1[m_idx]; 18 | float val = q[idx0*C+h_idx*C/h+c_idx] * k[idx1*C+h_idx*C/h+c_idx]; 19 | atomicAdd(attn+m_idx*h+h_idx, val); 20 | } 21 | 22 | __global__ void attention_step1_backward_cuda_kernel( // M, h, C//h 23 | int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *q, const float *k, 24 | float *grad_q, float *grad_k) { 25 | 26 | int c_idx = blockIdx.z; 27 | int h_idx = blockIdx.y; 28 | int m_idx = blockIdx.x * blockDim.x + threadIdx.x; 29 | if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; 30 | 31 | int idx0 = index0[m_idx]; 32 | int idx1 = index1[m_idx]; 33 | int grad_out_idx = m_idx*h+h_idx; 34 | int q_idx = idx0*C+h_idx*C/h+c_idx; 35 | int k_idx = idx1*C+h_idx*C/h+c_idx; 36 | atomicAdd(grad_q+q_idx, grad_out[grad_out_idx] * k[k_idx]); 37 | atomicAdd(grad_k+k_idx, grad_out[grad_out_idx] * q[q_idx]); 38 | } 39 | 40 | void attention_step1_forward_cuda_launcher(int N, int M, int h, int C, const float *q, const float *k, 41 | const int *index0, const int *index1, float *attn) { 42 | // input: q: (N, h, C/h), k: (N, h, C/h), index0: (M, ), index1: (M, ); output: attn: (M, h) 43 | //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); 44 | dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); 45 | dim3 threads(THREADS_PER_BLOCK); 46 | attention_step1_forward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, C, q, k, index0, index1, attn); 47 | } 48 | 49 | void attention_step1_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, 50 | const float *q, const float *k, float *grad_q, float *grad_k) { 51 | // input: grad_out: (M, h); output: grad_q: (N, h, C/h), grad_k: (N, h, C/h) 52 | //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); 53 | dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); 54 | dim3 threads(THREADS_PER_BLOCK); 55 | attention_step1_backward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, C, grad_out, index0, index1, q, k, grad_q, grad_k); 56 | } 57 | 58 | __global__ void attention_step2_forward_cuda_kernel( // M, h, C//h 59 | int N, int M, int h, int C, const float *attn, const float *v, 60 | const int *index0, const int *index1, float *output) { 61 | 62 | int c_idx = blockIdx.z; 63 | int h_idx = blockIdx.y; 64 | int m_idx = blockIdx.x * blockDim.x + threadIdx.x; 65 | if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; 66 | 67 | int idx1 = index1[m_idx]; 68 | float val = attn[m_idx*h+h_idx] * v[idx1*C+h_idx*C/h+c_idx]; 69 | int idx0 = index0[m_idx]; 70 | atomicAdd(output+idx0*C+h_idx*C/h+c_idx, val); 71 | } 72 | 73 | __global__ void attention_step2_backward_cuda_kernel( // M, h, C//h 74 | int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, 75 | float *grad_attn, float *grad_v) { 76 | 77 | int c_idx = blockIdx.z; 78 | int h_idx = blockIdx.y; 79 | int m_idx = blockIdx.x * blockDim.x + threadIdx.x; 80 | if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; 81 | 82 | int idx0 = index0[m_idx]; 83 | int idx1 = index1[m_idx]; 84 |
int grad_out_idx = idx0*C+h_idx*C/h+c_idx; 85 | atomicAdd(grad_attn+m_idx*h+h_idx, grad_out[grad_out_idx] * v[idx1*C+h_idx*C/h+c_idx]); 86 | atomicAdd(grad_v+idx1*C+h_idx*C/h+c_idx, grad_out[grad_out_idx] * attn[m_idx*h+h_idx]); 87 | } 88 | 89 | void attention_step2_forward_cuda_launcher(int N, int M, int h, int C, const float *attn, const float *v, 90 | const int *index0, const int *index1, float *output) { 91 | // input: attn: (M, h), v: (N, h, C/h), index0: (M, ), index1: (M, ); output: (N, h, C/h) 92 | //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); 93 | dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); 94 | dim3 threads(THREADS_PER_BLOCK); 95 | attention_step2_forward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, C, attn, v, index0, index1, output); 96 | } 97 | 98 | void attention_step2_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, 99 | const float *attn, const float *v, float *grad_attn, float *grad_v) { 100 | // input: grad_out: (N, h, C/h); output: grad_attn: (M, h), grad_v: (N, h, C/h) 101 | //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); 102 | dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); 103 | dim3 threads(THREADS_PER_BLOCK); 104 | attention_step2_backward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); 105 | } 106 | -------------------------------------------------------------------------------- /util/config.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # Functions for parsing args 3 | # ----------------------------------------------------------------------------- 4 | import yaml 5 | import os 6 | from ast import literal_eval 7 | import copy 8 | import logging 9 | 10 | class CfgNode(dict): 11 | """ 12 | CfgNode represents an internal node in the configuration tree. It's a simple 13 | dict-like container that allows for attribute-based access to keys.
14 | """ 15 | 16 | def __init__(self, init_dict=None, key_list=None, new_allowed=False): 17 | # Recursively convert nested dictionaries in init_dict into CfgNodes 18 | init_dict = {} if init_dict is None else init_dict 19 | key_list = [] if key_list is None else key_list 20 | for k, v in init_dict.items(): 21 | if type(v) is dict: 22 | # Convert dict to CfgNode 23 | init_dict[k] = CfgNode(v, key_list=key_list + [k]) 24 | super(CfgNode, self).__init__(init_dict) 25 | 26 | def __getattr__(self, name): 27 | if name in self: 28 | return self[name] 29 | else: 30 | raise AttributeError(name) 31 | 32 | def __setattr__(self, name, value): 33 | self[name] = value 34 | 35 | def __str__(self): 36 | def _indent(s_, num_spaces): 37 | s = s_.split("\n") 38 | if len(s) == 1: 39 | return s_ 40 | first = s.pop(0) 41 | s = [(num_spaces * " ") + line for line in s] 42 | s = "\n".join(s) 43 | s = first + "\n" + s 44 | return s 45 | 46 | r = "" 47 | s = [] 48 | for k, v in sorted(self.items()): 49 | seperator = "\n" if isinstance(v, CfgNode) else " " 50 | attr_str = "{}:{}{}".format(str(k), seperator, str(v)) 51 | attr_str = _indent(attr_str, 2) 52 | s.append(attr_str) 53 | r += "\n".join(s) 54 | return r 55 | 56 | def __repr__(self): 57 | return "{}({})".format(self.__class__.__name__, super(CfgNode, self).__repr__()) 58 | 59 | 60 | def load_cfg_from_cfg_file(file): 61 | cfg = {} 62 | assert os.path.isfile(file) and file.endswith('.yaml'), \ 63 | '{} is not a yaml file'.format(file) 64 | 65 | with open(file, 'r') as f: 66 | cfg_from_file = yaml.safe_load(f) 67 | 68 | for key in cfg_from_file: 69 | for k, v in cfg_from_file[key].items(): 70 | cfg[k] = v 71 | 72 | cfg = CfgNode(cfg) 73 | return cfg 74 | 75 | 76 | def merge_cfg_from_list(cfg, cfg_list): 77 | new_cfg = copy.deepcopy(cfg) 78 | assert len(cfg_list) % 2 == 0 79 | for full_key, v in zip(cfg_list[0::2], cfg_list[1::2]): 80 | subkey = full_key.split('.')[-1] 81 | assert subkey in cfg, 'Non-existent key: {}'.format(full_key) 82 | value = _decode_cfg_value(v) 83 | value = _check_and_coerce_cfg_value_type( 84 | value, cfg[subkey], subkey, full_key 85 | ) 86 | setattr(new_cfg, subkey, value) 87 | 88 | return new_cfg 89 | 90 | 91 | def _decode_cfg_value(v): 92 | """Decodes a raw config value (e.g., from a yaml config files or command 93 | line argument) into a Python object. 94 | """ 95 | # All remaining processing is only applied to strings 96 | if not isinstance(v, str): 97 | return v 98 | # Try to interpret `v` as a: 99 | # string, number, tuple, list, dict, boolean, or None 100 | try: 101 | v = literal_eval(v) 102 | # The following two excepts allow v to pass through when it represents a 103 | # string. 104 | # 105 | # Longer explanation: 106 | # The type of v is always a string (before calling literal_eval), but 107 | # sometimes it *represents* a string and other times a data structure, like 108 | # a list. In the case that v represents a string, what we got back from the 109 | # yaml parser is 'foo' *without quotes* (so, not '"foo"'). literal_eval is 110 | # ok with '"foo"', but will raise a ValueError if given 'foo'. In other 111 | # cases, like paths (v = 'foo/bar' and not v = '"foo/bar"'), literal_eval 112 | # will raise a SyntaxError. 113 | except ValueError: 114 | pass 115 | except SyntaxError: 116 | pass 117 | return v 118 | 119 | 120 | def _check_and_coerce_cfg_value_type(replacement, original, key, full_key): 121 | """Checks that `replacement`, which is intended to replace `original` is of 122 | the right type. 
The type is correct if it matches exactly or is one of a few 123 | cases in which the type can be easily coerced. 124 | """ 125 | original_type = type(original) 126 | replacement_type = type(replacement) 127 | 128 | # The types must match (with some exceptions) 129 | if replacement_type == original_type or original is None: 130 | return replacement 131 | 132 | # Cast replacement from from_type to to_type if the replacement and original 133 | # types match from_type and to_type 134 | def conditional_cast(from_type, to_type): 135 | if replacement_type == from_type and original_type == to_type: 136 | return True, to_type(replacement) 137 | else: 138 | return False, None 139 | 140 | # Conditionally casts 141 | # list <-> tuple 142 | casts = [(tuple, list), (list, tuple)] 143 | # For py2: allow converting from str (bytes) to a unicode string 144 | try: 145 | casts.append((str, unicode)) # noqa: F821 146 | except Exception: 147 | pass 148 | 149 | for (from_type, to_type) in casts: 150 | converted, converted_value = conditional_cast(from_type, to_type) 151 | if converted: 152 | return converted_value 153 | 154 | raise ValueError( 155 | "Type mismatch ({} vs. {}) with values ({} vs. {}) for config " 156 | "key: {}".format( 157 | original_type, replacement_type, original, replacement, full_key 158 | ) 159 | ) 160 | 161 | 162 | def _assert_with_logging(cond, msg): 163 | if not cond: 164 | logging.getLogger(__name__).debug(msg) # a bare `logger` name was undefined here; use the stdlib logging module 165 | assert cond, msg 166 | -------------------------------------------------------------------------------- /lib/pointops2/src/sampling/sampling_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "sampling_cuda_kernel.h" 3 | 4 | 5 | __device__ void __update(float *dists, int *dists_i, int idx1, int idx2) { 6 | const float v1 = dists[idx1], v2 = dists[idx2]; 7 | const int i1 = dists_i[idx1], i2 = dists_i[idx2]; 8 | dists[idx1] = max(v1, v2); 9 | dists_i[idx1] = v2 > v1 ? i2 : i1; 10 | } 11 | 12 | // input: xyz: (n, 3), tmp: (b, n_max) 13 | // output: idx (m) 14 | template <unsigned int block_size> 15 | __global__ void furthestsampling_cuda_kernel(const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx) 16 | { 17 | __shared__ float dists[block_size]; 18 | __shared__ int dists_i[block_size]; 19 | 20 | int bid = blockIdx.x; 21 | int start_n, end_n, start_m, end_m, old; 22 | if (bid == 0) { 23 | start_n = 0; 24 | end_n = offset[0]; 25 | start_m = 0; 26 | end_m = new_offset[0]; 27 | old = 0; 28 | } 29 | else { 30 | start_n = offset[bid - 1]; 31 | end_n = offset[bid]; 32 | start_m = new_offset[bid - 1]; 33 | end_m = new_offset[bid]; 34 | old = offset[bid - 1]; 35 | } 36 | 37 | const int stride = block_size; 38 | int tid = threadIdx.x; 39 | if (tid == 0) idx[start_m] = start_n; 40 | 41 | __syncthreads(); 42 | for (int j = start_m + 1; j < end_m; j++) 43 | { 44 | int besti = start_n; 45 | float best = -1; 46 | float x1 = xyz[old * 3 + 0]; 47 | float y1 = xyz[old * 3 + 1]; 48 | float z1 = xyz[old * 3 + 2]; 49 | for (int k = start_n + tid; k < end_n; k += stride) 50 | { 51 | float x2 = xyz[k * 3 + 0]; 52 | float y2 = xyz[k * 3 + 1]; 53 | float z2 = xyz[k * 3 + 2]; 54 | float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); 55 | float d2 = min(d, tmp[k]); 56 | tmp[k] = d2; 57 | besti = d2 > best ? k : besti; 58 | best = d2 > best ?
d2 : best; 59 | } 60 | dists[tid] = best; 61 | dists_i[tid] = besti; 62 | __syncthreads(); 63 | 64 | if (block_size >= 1024) { 65 | if (tid < 512) { 66 | __update(dists, dists_i, tid, tid + 512); 67 | } 68 | __syncthreads(); 69 | } 70 | if (block_size >= 512) { 71 | if (tid < 256) { 72 | __update(dists, dists_i, tid, tid + 256); 73 | } 74 | __syncthreads(); 75 | } 76 | if (block_size >= 256) { 77 | if (tid < 128) { 78 | __update(dists, dists_i, tid, tid + 128); 79 | } 80 | __syncthreads(); 81 | } 82 | if (block_size >= 128) { 83 | if (tid < 64) { 84 | __update(dists, dists_i, tid, tid + 64); 85 | } 86 | __syncthreads(); 87 | } 88 | if (block_size >= 64) { 89 | if (tid < 32) { 90 | __update(dists, dists_i, tid, tid + 32); 91 | } 92 | __syncthreads(); 93 | } 94 | if (block_size >= 32) { 95 | if (tid < 16) { 96 | __update(dists, dists_i, tid, tid + 16); 97 | } 98 | __syncthreads(); 99 | } 100 | if (block_size >= 16) { 101 | if (tid < 8) { 102 | __update(dists, dists_i, tid, tid + 8); 103 | } 104 | __syncthreads(); 105 | } 106 | if (block_size >= 8) { 107 | if (tid < 4) { 108 | __update(dists, dists_i, tid, tid + 4); 109 | } 110 | __syncthreads(); 111 | } 112 | if (block_size >= 4) { 113 | if (tid < 2) { 114 | __update(dists, dists_i, tid, tid + 2); 115 | } 116 | __syncthreads(); 117 | } 118 | if (block_size >= 2) { 119 | if (tid < 1) { 120 | __update(dists, dists_i, tid, tid + 1); 121 | } 122 | __syncthreads(); 123 | } 124 | 125 | old = dists_i[0]; 126 | if (tid == 0) 127 | idx[j] = old; 128 | } 129 | } 130 | 131 | void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx) 132 | { 133 | unsigned int n_threads = opt_n_threads(n); 134 | switch (n_threads) { 135 | case 1024: 136 | furthestsampling_cuda_kernel<1024><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 137 | break; 138 | case 512: 139 | furthestsampling_cuda_kernel<512><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 140 | break; 141 | case 256: 142 | furthestsampling_cuda_kernel<256><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 143 | break; 144 | case 128: 145 | furthestsampling_cuda_kernel<128><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 146 | break; 147 | case 64: 148 | furthestsampling_cuda_kernel<64><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 149 | break; 150 | case 32: 151 | furthestsampling_cuda_kernel<32><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 152 | break; 153 | case 16: 154 | furthestsampling_cuda_kernel<16><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 155 | break; 156 | case 8: 157 | furthestsampling_cuda_kernel<8><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 158 | break; 159 | case 4: 160 | furthestsampling_cuda_kernel<4><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 161 | break; 162 | case 2: 163 | furthestsampling_cuda_kernel<2><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 164 | break; 165 | case 1: 166 | furthestsampling_cuda_kernel<1><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 167 | break; 168 | default: 169 | furthestsampling_cuda_kernel<512><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /util/lr.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import LambdaLR, StepLR, OneCycleLR 2 | import torch.optim as optim 3 | 4 | class LambdaStepLR(LambdaLR): 5 | 6 | def __init__(self, optimizer, lr_lambda, last_step=-1): 7 | super(LambdaStepLR, self).__init__(optimizer, lr_lambda, last_step) 8 | 9 | @property 10 | def last_step(self): 11 |
"""Use last_epoch for the step counter""" 12 | return self.last_epoch 13 | 14 | @last_step.setter 15 | def last_step(self, v): 16 | self.last_epoch = v 17 | 18 | 19 | class PolyLRwithWarmup(LambdaStepLR): 20 | """DeepLab learning rate policy""" 21 | 22 | def __init__(self, optimizer, max_iter, warmup='linear', warmup_iters=1500, warmup_ratio=1e-6, power=1.0, last_step=-1): 23 | 24 | assert warmup == 'linear' 25 | def poly_with_warmup(s): 26 | coeff = (1 - s / (max_iter+1)) ** power 27 | if s <= warmup_iters: 28 | warmup_coeff = 1 - (1 - s / warmup_iters) * (1 - warmup_ratio) 29 | else: 30 | warmup_coeff = 1.0 31 | return coeff * warmup_coeff 32 | 33 | super(PolyLRwithWarmup, self).__init__(optimizer, poly_with_warmup, last_step) 34 | # torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=-1, verbose=False) 35 | # lr_lambda: A function which computes a multiplicative factor given an integer parameter epoch, or a list of such functions, one for each group in optimizer.param_groups. 36 | 37 | 38 | class MultiStepWithWarmup(LambdaStepLR): 39 | def __init__(self, optimizer, milestones, gamma=0.1, warmup='linear', warmup_iters=1500, warmup_ratio=1e-6, last_step=-1): 40 | 41 | assert warmup == 'linear' 42 | def multi_step_with_warmup(s): 43 | factor = 1.0 44 | for i in range(len(milestones)): 45 | if s < milestones[i]: 46 | break 47 | factor *= gamma 48 | 49 | if s <= warmup_iters: 50 | warmup_coeff = 1 - (1 - s / warmup_iters) * (1 - warmup_ratio) 51 | else: 52 | warmup_coeff = 1.0 53 | return warmup_coeff * factor 54 | 55 | super(MultiStepWithWarmup, self).__init__(optimizer, multi_step_with_warmup, last_step) 56 | 57 | 58 | class PolyLR(LambdaStepLR): 59 | """DeepLab learning rate policy""" 60 | 61 | def __init__(self, optimizer, max_iter, power=0.9, last_step=-1): 62 | super(PolyLR, self).__init__(optimizer, lambda s: (1 - s / (max_iter + 1))**power, last_step) 63 | # torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=-1, verbose=False) 64 | # lr_lambda: A function which computes a multiplicative factor given an integer parameter epoch, or a list of such functions, one for each group in optimizer.param_groups. 
65 | 66 | 67 | class SquaredLR(LambdaStepLR): 68 | """ Used for SGD Lars""" 69 | 70 | def __init__(self, optimizer, max_iter, last_step=-1): 71 | super(SquaredLR, self).__init__(optimizer, lambda s: (1 - s / (max_iter + 1))**2, last_step) 72 | 73 | 74 | class ExpLR(LambdaStepLR): 75 | 76 | def __init__(self, optimizer, step_size, gamma=0.9, last_step=-1): 77 | # (0.9 ** 21.854) = 0.1, (0.95 ** 44.8906) = 0.1 78 | # To get 0.1 every N using gamma 0.9, N * log(0.9)/log(0.1) = 0.04575749 N 79 | # To get 0.1 every N using gamma g, g ** N = 0.1 -> N * log(g) = log(0.1) -> g = np.exp(log(0.1) / N) 80 | super(ExpLR, self).__init__(optimizer, lambda s: gamma**(s / step_size), last_step) 81 | 82 | 83 | def initialize_scheduler(optimizer, config, last_epoch=-1, scheduler_epoch=True, logger=None): 84 | # scheduler_epoch: the step_size are given in epoch num 85 | last_step = -1 if last_epoch < 0 else config.iter_per_epoch_train * (last_epoch + 1) - 1 86 | if scheduler_epoch: 87 | config.step_size = config.iter_per_epoch_train * config.step_size 88 | config.exp_step_size = config.iter_per_epoch_train * config.exp_step_size 89 | 90 | if config.scheduler == 'StepLR': 91 | return StepLR(optimizer, step_size=config.step_size, gamma=config.step_gamma, last_epoch=last_step) 92 | elif config.scheduler == 'PolyLR': 93 | return PolyLR(optimizer, max_iter=config.max_iter, power=config.poly_power, last_step=last_step) 94 | elif config.scheduler == 'PolyLRwithWarmup': 95 | return PolyLRwithWarmup(optimizer, max_iter=config.max_iter, warmup=config.warmup, warmup_iters=config.warmup_iters, warmup_ratio=config.warmup_ratio, power=config.poly_power, last_step=last_step) 96 | elif config.scheduler == 'SquaredLR': 97 | return SquaredLR(optimizer, max_iter=config.max_iter, last_step=last_step) 98 | elif config.scheduler == 'ExpLR': 99 | return ExpLR(optimizer, step_size=config.exp_step_size, gamma=config.exp_gamma, last_step=last_step) 100 | elif config.scheduler == 'OneCycleLR': 101 | return OneCycleLR(optimizer, max_lr=config.oc_max_lr, total_steps=config.max_iter, pct_start=config.oc_pct_start, 102 | anneal_strategy=config.oc_anneal_strategy, div_factor=config.oc_div_factor, 103 | final_div_factor=config.oc_final_div_factor, last_epoch=last_step) 104 | # (optimizer, max_lr, total_steps=None, epochs=None, steps_per_epoch=None, pct_start=0.3, anneal_strategy='cos', cycle_momentum=True, base_momentum=0.85, max_momentum=0.95, div_factor=25.0, final_div_factor=10000.0, last_epoch=-1) 105 | else: 106 | if logger is not None: 107 | logger.info('Scheduler not supported') 108 | else: print('Scheduler not supported') 109 | 110 | 111 | if __name__ == '__main__': 112 | import torchvision.models as models 113 | model = models.vgg16() 114 | optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001) 115 | optimizer.param_groups[0]['initial_lr'] = 0.2 / 25.0 116 | optimizer.param_groups[0]['max_lr'] = 0.2 117 | optimizer.param_groups[0]['min_lr'] = 0.2 / 10000.0 118 | optimizer.param_groups[0]['max_momentum'] = 0.95 119 | optimizer.param_groups[0]['base_momentum'] = 0.85 120 | last_step = 2 121 | max_iter = 100 122 | # scheduler = PolyLR(optimizer, max_iter=max_iter, power=0.9, last_step=last_step) 123 | scheduler = OneCycleLR(optimizer, max_lr=0.2, total_steps=max_iter, pct_start=0.1, anneal_strategy='cos', div_factor=25.0, 124 | final_div_factor=10000.0, last_epoch=last_step) 125 | lr_list = [] 126 | for epoch in range(max(last_step + 1, 0), min(max_iter, 100)): 127 | lrs = ', '.join(['{:.5e}'.format(x) 
for x in scheduler.get_last_lr()]) 128 | print('epoch {} lrs {}'.format(epoch, lrs)) 129 | lr_list.append(scheduler.get_last_lr()[0]) 130 | scheduler.step() 131 | 132 | import numpy as np 133 | import matplotlib.pyplot as plt 134 | x = np.arange(max(last_step + 1, 0), min(max_iter, 100), 1) 135 | plt.title("function") 136 | plt.plot(x, lr_list) 137 | plt.show() -------------------------------------------------------------------------------- /lib/pointops/src/labelstat/labelstat_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "labelstat_cuda_kernel.h" 3 | 4 | // input: new_xyz(b, m, 3) xyz(b, n, 3) label_stat(b, n, nclass) 5 | // output: idx(b, m, nsample) new_label_stat(b, m, nclass) 6 | __global__ void labelstat_and_ballquery_cuda_kernel_fast(int b, int n, int m, float radius, int nsample, int nclass, 7 | const float *new_xyz, const float *xyz, const int *label_stat, int *idx, int *new_label_stat) { 8 | int bs_idx = blockIdx.y; 9 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 10 | if (bs_idx >= b || pt_idx >= m) return; 11 | 12 | new_xyz += bs_idx * m * 3 + pt_idx * 3; 13 | xyz += bs_idx * n * 3; 14 | idx += bs_idx * m * nsample + pt_idx * nsample; 15 | label_stat += bs_idx * n * nclass; 16 | new_label_stat += bs_idx * m * nclass + pt_idx * nclass; 17 | 18 | for(int i = 0; i < nclass; i++){ 19 | new_label_stat[i] = 0; 20 | } 21 | 22 | float radius2 = radius * radius; 23 | float new_x = new_xyz[0]; 24 | float new_y = new_xyz[1]; 25 | float new_z = new_xyz[2]; 26 | 27 | int cnt = 0; 28 | for (int k = 0; k < n; ++k) { 29 | float x = xyz[k * 3 + 0]; 30 | float y = xyz[k * 3 + 1]; 31 | float z = xyz[k * 3 + 2]; 32 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 33 | if (d2 < radius2){ 34 | for(int i = 0; i < nclass; i++){ 35 | new_label_stat[i] += label_stat[k * nclass + i]; 36 | } 37 | if (cnt == 0){ 38 | for (int l = 0; l < nsample; ++l) { 39 | idx[l] = k; 40 | } 41 | } 42 | idx[cnt] = k; 43 | ++cnt; 44 | if (cnt >= nsample){ 45 | break; 46 | } 47 | } 48 | } 49 | } 50 | 51 | void labelstat_and_ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, int nclass, 52 | const float *new_xyz, const float *xyz, const int *label_stat, int *idx, int *new_label_stat, cudaStream_t stream) { 53 | // param new_xyz: (B, m, 3) 54 | // param xyz: (B, n, 3) 55 | // param idx: (B, m, nsample) 56 | 57 | cudaError_t err; 58 | 59 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 60 | dim3 threads(THREADS_PER_BLOCK); 61 | 62 | labelstat_and_ballquery_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample, nclass, new_xyz, xyz, label_stat, idx, new_label_stat); 63 | // cudaDeviceSynchronize(); // for using printf in kernel function 64 | 65 | err = cudaGetLastError(); 66 | if (cudaSuccess != err) { 67 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 68 | exit(-1); 69 | } 70 | } 71 | 72 | // input: new_xyz(b, m, 3) xyz(b, n, 3) label_stat(b, n, nclass) 73 | // output: new_label_stat(b, m, nclass) 74 | __global__ void labelstat_ballrange_cuda_kernel_fast(int b, int n, int m, float radius, int nclass, 75 | const float *new_xyz, const float *xyz, const int *label_stat, int *new_label_stat) { 76 | int bs_idx = blockIdx.y; 77 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 78 | if (bs_idx >= b || pt_idx >= m) return; 79 | 80 | new_xyz += bs_idx * m * 3 + pt_idx * 3; 81 | xyz += bs_idx * n * 3; 82 |
label_stat += bs_idx * n * nclass; 83 | new_label_stat += bs_idx * m * nclass + pt_idx * nclass; 84 | 85 | for(int i = 0; i < nclass; i++){ 86 | new_label_stat[i] = 0; 87 | } 88 | 89 | float radius2 = radius * radius; 90 | float new_x = new_xyz[0]; 91 | float new_y = new_xyz[1]; 92 | float new_z = new_xyz[2]; 93 | 94 | for (int k = 0; k < n; ++k) { 95 | float x = xyz[k * 3 + 0]; 96 | float y = xyz[k * 3 + 1]; 97 | float z = xyz[k * 3 + 2]; 98 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 99 | if (d2 < radius2){ 100 | for(int i = 0; i < nclass; i++){ 101 | new_label_stat[i] += label_stat[k * nclass + i]; 102 | } 103 | } 104 | } 105 | } 106 | 107 | 108 | void labelstat_ballrange_cuda_launcher_fast(int b, int n, int m, float radius, int nclass, 109 | const float *new_xyz, const float *xyz, const int *label_stat, int *new_label_stat, cudaStream_t stream) { 110 | // param new_xyz: (B, m, 3), xyz: (B, n, 3) 111 | // param label_stat: (B, n, nclass) 112 | // return new_label_stat: (B, m, nclass) 113 | 114 | cudaError_t err; 115 | 116 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 117 | dim3 threads(THREADS_PER_BLOCK); 118 | 119 | labelstat_ballrange_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, radius, nclass, new_xyz, xyz, label_stat, new_label_stat); 120 | // cudaDeviceSynchronize(); // for using printf in kernel function 121 | 122 | err = cudaGetLastError(); 123 | if (cudaSuccess != err) { 124 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 125 | exit(-1); 126 | } 127 | } 128 | 129 | // input: idx(b, m, nsample) label_stat(b, n, nclass) 130 | // output: new_label_stat(b, m, nclass) 131 | __global__ void labelstat_idx_cuda_kernel_fast(int b, int n, int m, int nsample, int nclass, 132 | const int *label_stat, const int *idx, int *new_label_stat) { 133 | int bs_idx = blockIdx.y; 134 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 135 | if (bs_idx >= b || pt_idx >= m) return; 136 | 137 | idx += bs_idx * m * nsample + pt_idx * nsample; 138 | label_stat += bs_idx * n * nclass; 139 | new_label_stat += bs_idx * m * nclass + pt_idx * nclass; 140 | 141 | for(int i = 0; i < nclass; i++){ 142 | new_label_stat[i] = 0; 143 | } 144 | 145 | for(int k = 0; k < nsample; k++){ 146 | const int *label_stat_k = label_stat + idx[k] * nclass; 147 | for(int i = 0; i < nclass; i++){ 148 | new_label_stat[i] += label_stat_k[i]; 149 | } 150 | } 151 | } 152 | 153 | 154 | void labelstat_idx_cuda_launcher_fast(int b, int n, int m, int nsample, int nclass, 155 | const int *label_stat, const int *idx, int *new_label_stat, cudaStream_t stream) { 156 | // param label_stat: (B, n, nclass) 157 | // param idx: (B, m, nsample) 158 | // return new_label_stat: (B, m, nclass) 159 | 160 | cudaError_t err; 161 | 162 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 163 | dim3 threads(THREADS_PER_BLOCK); 164 | 165 | labelstat_idx_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, nsample, nclass, label_stat, idx, new_label_stat); 166 | // cudaDeviceSynchronize(); // for using printf in kernel function 167 | 168 | err = cudaGetLastError(); 169 | if (cudaSuccess != err) { 170 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 171 | exit(-1); 172 | } 173 | } -------------------------------------------------------------------------------- /lib/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | /* written by Xin Lai.
Email: xinlai@cse.cuhk.edu.hk */ 2 | 3 | #include "../cuda_utils.h" 4 | #include "relative_pos_encoding_cuda_kernel.h" 5 | 6 | 7 | __global__ void dot_prod_with_idx_forward_cuda_kernel( // M, h, hdim 8 | int N, int M, int h, int hdim, const float *q, const int *index, 9 | const float *table, const int *rel_idx, float *output) { 10 | // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3), output: (M, h) 11 | 12 | int c_idx = blockIdx.z; 13 | int h_idx = blockIdx.y; 14 | int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; 15 | if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; 16 | 17 | int dim = thread_idx % 3; 18 | int m_idx = thread_idx / 3; 19 | 20 | int q_idx = index[m_idx]; 21 | int rel_idx_dim = rel_idx[thread_idx]; 22 | float rel_table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; 23 | float val = q[q_idx*h*hdim+h_idx*hdim+c_idx] * rel_table_val; 24 | atomicAdd(output+m_idx*h+h_idx, val); 25 | } 26 | 27 | __global__ void dot_prod_with_idx_backward_cuda_kernel( // M, h, hdim 28 | int N, int M, int h, int hdim, const float *grad_out, const float *q, const int *index, 29 | const float *table, const int *rel_idx, float *grad_q, float *grad_table) { 30 | 31 | int c_idx = blockIdx.z; 32 | int h_idx = blockIdx.y; 33 | int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; 34 | if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; 35 | 36 | int dim = thread_idx % 3; 37 | int m_idx = thread_idx / 3; 38 | 39 | int q_idx = index[m_idx]; 40 | int rel_idx_dim = rel_idx[thread_idx]; 41 | int grad_out_idx = m_idx*h+h_idx; 42 | float grad_out_value = grad_out[grad_out_idx]; 43 | 44 | float rel_table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; 45 | atomicAdd(grad_q+q_idx*h*hdim+h_idx*hdim+c_idx, grad_out_value * rel_table_val); 46 | 47 | float q_value = q[q_idx*h*hdim+h_idx*hdim+c_idx]; 48 | atomicAdd(grad_table+rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim, grad_out_value * q_value); 49 | } 50 | 51 | void dot_prod_with_idx_forward_cuda_launcher(int N, int M, int h, int hdim, const float *q, const int *index, 52 | const float *table, const int *rel_idx, float *output) { 53 | // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3) 54 | //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); 55 | dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); 56 | dim3 threads(THREADS_PER_BLOCK); 57 | dot_prod_with_idx_forward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, hdim, q, index, table, rel_idx, output); 58 | } 59 | 60 | void dot_prod_with_idx_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, 61 | const float *q, const int *index, const float *table, const int *rel_idx, float *grad_q, float *grad_table) { 62 | // input: grad_out: (M, h), output: grad_q: (N, h, hdim), grad_table: (L, h, hdim, 3) 63 | //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); 64 | dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); 65 | dim3 threads(THREADS_PER_BLOCK); 66 | dot_prod_with_idx_backward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, hdim, grad_out, q, index, table, rel_idx, grad_q, grad_table); 67 | } 68 | 69 | __global__ void attention_step2_with_rel_pos_value_forward_cuda_kernel( // M, h, hdim 70 | int N, int M, int h, int hdim, const float *attn, const float *v, 71 | const int *index0, const int *index1, const float *table, const int *rel_idx, float *output) { 72 | // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) 73 | 74 | int c_idx = blockIdx.z; 75 | int h_idx =
blockIdx.y; 76 | int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; 77 | if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; 78 | 79 | int dim = thread_idx % 3; 80 | int m_idx = thread_idx / 3; 81 | 82 | int idx1 = index1[m_idx]; 83 | 84 | int rel_idx_dim = rel_idx[thread_idx]; 85 | float table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; 86 | 87 | float val = attn[m_idx*h+h_idx] * (v[idx1*h*hdim+h_idx*hdim+c_idx] / 3.0 + table_val); 88 | 89 | int idx0 = index0[m_idx]; 90 | atomicAdd(output+idx0*h*hdim+h_idx*hdim+c_idx, val); 91 | } 92 | 93 | 94 | __global__ void attention_step2_with_rel_pos_value_backward_cuda_kernel( // M, h, hdim 95 | int N, int M, int h, int hdim, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, const float *table, 96 | const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) { 97 | // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) 98 | 99 | int c_idx = blockIdx.z; 100 | int h_idx = blockIdx.y; 101 | int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; 102 | if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; 103 | 104 | int dim = thread_idx % 3; 105 | int m_idx = thread_idx / 3; 106 | 107 | int idx0 = index0[m_idx]; 108 | int idx1 = index1[m_idx]; 109 | int grad_out_idx = idx0*h*hdim+h_idx*hdim+c_idx; 110 | 111 | int rel_idx_dim = rel_idx[thread_idx]; 112 | float table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; 113 | float grad_out_value = grad_out[grad_out_idx]; 114 | 115 | atomicAdd(grad_attn+m_idx*h+h_idx, grad_out_value * (v[idx1*h*hdim+h_idx*hdim+c_idx]/3 + table_val)); 116 | atomicAdd(grad_v+idx1*h*hdim+h_idx*hdim+c_idx, grad_out_value * attn[m_idx*h+h_idx]/3); 117 | atomicAdd(grad_table+rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim, grad_out_value * attn[m_idx*h+h_idx]); 118 | } 119 | 120 | void attention_step2_with_rel_pos_value_forward_cuda_launcher(int N, int M, int h, int hdim, const float *attn, const float *v, const int *index0, 121 | const int *index1, const float *table, const int *rel_idx, float *output) { 122 | // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) 123 | //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); 124 | dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); 125 | dim3 threads(THREADS_PER_BLOCK); 126 | attention_step2_with_rel_pos_value_forward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, hdim, attn, v, index0, index1, table, rel_idx, output); 127 | } 128 | 129 | void attention_step2_with_rel_pos_value_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const int *index0, 130 | const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) { 131 | // input: grad_out: (N, h, hdim); output: grad_attn: (M, h), grad_v: (N, h, hdim), grad_table: (L, h, hdim, 3) 132 | //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); 133 | dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); 134 | dim3 threads(THREADS_PER_BLOCK); 135 | attention_step2_with_rel_pos_value_backward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, hdim, grad_out, index0, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); 136 | } 137 | --------------------------------------------------------------------------------
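For reference, the fused `attention_step2_with_rel_pos_value` forward kernel above can be sanity-checked against a dense PyTorch sketch reconstructed from its indexing (this reference is not code shipped with the repository). Because the kernel accumulates `attn * (v/3 + table_val)` once per coordinate axis, `v` contributes exactly once in total:

```python
import torch

def attention_step2_with_rel_pos_value_ref(attn, v, index0, index1, table, rel_idx, N):
    # attn: (M, h), v: (N, h, hdim), index0/index1: (M,) long,
    # table: (L, h, hdim, 3), rel_idx: (M, 3) long
    M, h = attn.shape
    hdim = v.shape[2]
    # Sum the per-axis positional-encoding lookups over the 3 axes.
    pos = torch.zeros(M, h, hdim, dtype=v.dtype, device=v.device)
    for d in range(3):
        pos += table[rel_idx[:, d], :, :, d]          # (M, h, hdim)
    contrib = attn.unsqueeze(-1) * (v[index1] + pos)  # (M, h, hdim)
    out = torch.zeros(N, h, hdim, dtype=v.dtype, device=v.device)
    out.index_add_(0, index0, contrib)                # scatter-add onto query rows
    return out
```

Scattering with `index_add_` mirrors the kernel's `atomicAdd` accumulation onto the query rows indexed by `index0`.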