├── util
│   ├── __init__.py
│   ├── dataset.py
│   ├── scannet_v2.py
│   ├── logger.py
│   ├── s3dis.py
│   ├── voxelize.py
│   ├── vis_util.py
│   ├── config.py
│   └── lr.py
├── lib
│   ├── pointops
│   │   ├── __init__.py
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   ├── knnquery
│   │   │   │   ├── __init__.py
│   │   │   │   ├── knnquery_cuda_kernel.h
│   │   │   │   ├── knnquery_cuda.cpp
│   │   │   │   └── knnquery_cuda_kernel.cu
│   │   │   ├── knnquery_heap
│   │   │   │   ├── __init__.py
│   │   │   │   ├── knnquery_heap_cuda_kernel.h
│   │   │   │   ├── knnquery_heap_cuda.cpp
│   │   │   │   └── knnquery_heap_cuda_kernel.cu
│   │   │   ├── ballquery
│   │   │   │   ├── ballquery_cuda_kernel.h
│   │   │   │   ├── ballquery_cuda.cpp
│   │   │   │   └── ballquery_cuda_kernel.cu
│   │   │   ├── grouping_int
│   │   │   │   ├── grouping_int_cuda_kernel.h
│   │   │   │   ├── grouping_int_cuda.cpp
│   │   │   │   └── grouping_int_cuda_kernel.cu
│   │   │   ├── cuda_utils.h
│   │   │   ├── sampling
│   │   │   │   ├── sampling_cuda_kernel.h
│   │   │   │   └── sampling_cuda.cpp
│   │   │   ├── grouping
│   │   │   │   ├── grouping_cuda_kernel.h
│   │   │   │   ├── grouping_cuda.cpp
│   │   │   │   └── grouping_cuda_kernel.cu
│   │   │   ├── featuredistribute
│   │   │   │   ├── featuredistribute_cuda_kernel.h
│   │   │   │   ├── featuredistribute_cuda.cpp
│   │   │   │   └── featuredistribute_cuda_kernel.cu
│   │   │   ├── labelstat
│   │   │   │   ├── labelstat_cuda_kernel.h
│   │   │   │   ├── labelstat_cuda.cpp
│   │   │   │   └── labelstat_cuda_kernel.cu
│   │   │   ├── interpolation
│   │   │   │   ├── interpolation_cuda_kernel.h
│   │   │   │   └── interpolation_cuda.cpp
│   │   │   └── pointops_api.cpp
│   │   ├── functions
│   │   │   └── __init__.py
│   │   └── setup.py
│   ├── pointops2
│   │   ├── __init__.py
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   ├── sampling
│   │   │   │   ├── sampling_cuda_kernel.h
│   │   │   │   ├── sampling_cuda.cpp
│   │   │   │   └── sampling_cuda_kernel.cu
│   │   │   ├── knnquery
│   │   │   │   ├── knnquery_cuda_kernel.h
│   │   │   │   ├── knnquery_cuda.cpp
│   │   │   │   └── knnquery_cuda_kernel.cu
│   │   │   ├── cuda_utils.h
│   │   │   ├── grouping
│   │   │   │   ├── grouping_cuda_kernel.h
│   │   │   │   ├── grouping_cuda.cpp
│   │   │   │   └── grouping_cuda_kernel.cu
│   │   │   ├── interpolation
│   │   │   │   ├── interpolation_cuda_kernel.h
│   │   │   │   ├── interpolation_cuda.cpp
│   │   │   │   └── interpolation_cuda_kernel.cu
│   │   │   ├── subtraction
│   │   │   │   ├── subtraction_cuda_kernel.h
│   │   │   │   ├── subtraction_cuda.cpp
│   │   │   │   └── subtraction_cuda_kernel.cu
│   │   │   ├── aggregation
│   │   │   │   ├── aggregation_cuda_kernel.h
│   │   │   │   ├── aggregation_cuda.cpp
│   │   │   │   └── aggregation_cuda_kernel.cu
│   │   │   ├── attention
│   │   │   │   ├── attention_cuda_kernel.h
│   │   │   │   ├── attention_cuda.cpp
│   │   │   │   └── attention_cuda_kernel.cu
│   │   │   ├── attention_v2
│   │   │   │   ├── attention_cuda_kernel_v2.h
│   │   │   │   └── attention_cuda_v2.cpp
│   │   │   ├── rpe
│   │   │   │   ├── relative_pos_encoding_cuda_kernel.h
│   │   │   │   ├── relative_pos_encoding_cuda.cpp
│   │   │   │   └── relative_pos_encoding_cuda_kernel.cu
│   │   │   ├── pointops_api.cpp
│   │   │   └── rpe_v2
│   │   │       └── relative_pos_encoding_cuda_kernel_v2.h
│   │   ├── functions
│   │   │   ├── __init__.py
│   │   │   ├── test_attention_op_step2.py
│   │   │   ├── test_relative_pos_encoding_op_step1.py
│   │   │   ├── test_relative_pos_encoding_op_step1_v2.py
│   │   │   ├── test_relative_pos_encoding_op_step2.py
│   │   │   ├── test_attention_op_step1.py
│   │   │   ├── test_relative_pos_encoding_op_step2_v2.py
│   │   │   ├── test_relative_pos_encoding_op_step1_v3.py
│   │   │   └── test_attention_op_step1_v2.py
│   │   └── setup.py
│   └── cpp_wrappers
│       ├── compile_wrappers.sh
│       ├── cpp_subsampling
│       │   ├── setup.py
│       │   └── grid_subsampling
│       │       ├── grid_subsampling.h
│       │       └── grid_subsampling.cpp
│       └── cpp_utils
│           └── cloud
│               ├── cloud.cpp
│               └── cloud.h
├── figs
│   └── fig.jpg
├── requirements.txt
├── LICENSE.md
├── .gitignore
├── config
│   ├── s3dis
│   │   ├── s3dis_swin3d_transformer.yaml
│   │   └── s3dis_stratified_transformer.yaml
│   └── scannetv2
│       ├── scannetv2_stratified_transformer.yaml
│       └── scannetv2_swin3d_transformer.yaml
└── README.md

/util/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops/src/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops2/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops/functions/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops/src/knnquery/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops2/functions/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pointops/src/knnquery_heap/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/figs/fig.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/Stratified-Transformer/HEAD/figs/fig.jpg
--------------------------------------------------------------------------------
/lib/cpp_wrappers/compile_wrappers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Compile cpp subsampling
4 | cd cpp_subsampling
5 | python3 setup.py build_ext --inplace
6 | cd ..
7 | 
8 | 
--------------------------------------------------------------------------------
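The script above builds the grid-subsampling extension in place. A quick import check, sketched under the assumption that it is run from the repository root; the module name grid_subsampling comes from m_name in cpp_wrappers/cpp_subsampling/setup.py further down, but the symbols it exports are defined in wrapper.cpp, which is not included in this dump:

    # Sketch: verify that the in-place build produced a loadable module.
    import sys
    sys.path.append("lib/cpp_wrappers/cpp_subsampling")

    import grid_subsampling  # module name taken from m_name in setup.py

    print(grid_subsampling.__file__)  # path of the compiled shared object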
/requirements.txt:
--------------------------------------------------------------------------------
1 | h5py==3.2.1
2 | matplotlib==3.4.2
3 | numpy==1.19.5
4 | Pillow==9.1.0
5 | PyYAML==6.0
6 | scipy==1.6.3
7 | setuptools==50.3.1.post20201107
8 | SharedArray==3.2.1
9 | tensorboardX==2.5
10 | termcolor==1.1.0
11 | timm==0.4.9
12 | torch==1.7.1
13 | torch_geometric==1.7.0
14 | torch_points3d==1.3.0
15 | torch_points_kernels==0.6.10
16 | torch_scatter==2.0.6
17 | torchvision==0.8.2
18 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/sampling/sampling_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _SAMPLING_CUDA_KERNEL
2 | #define _SAMPLING_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor);
8 | 
9 | #ifdef __cplusplus
10 | extern "C" {
11 | #endif
12 | 
13 | void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx);
14 | 
15 | #ifdef __cplusplus
16 | }
17 | #endif
18 | #endif
19 | 
--------------------------------------------------------------------------------
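The header above is the batched furthest point sampling entry point: points from all clouds are packed into one (n, 3) tensor and batches are delimited by cumulative counts in offset/new_offset. A sketch of driving it directly, assuming the extension is built as pointops2_cuda (the name in lib/pointops2/setup.py) and that pointops_api.cpp binds the function under the same name; the 1e10 fill for tmp follows the usual pointnet2-style FPS convention and is an assumption here:

    import torch
    import pointops2_cuda  # assumed binding; normally wrapped by lib/pointops2/functions

    xyz = torch.rand(8000, 3).cuda()                                   # two clouds packed together
    offset = torch.tensor([5000, 8000], dtype=torch.int32).cuda()      # cumulative input sizes
    new_offset = torch.tensor([1250, 2000], dtype=torch.int32).cuda()  # cumulative sample counts

    b, n, m = offset.shape[0], xyz.shape[0], int(new_offset[-1])
    idx = torch.zeros(m, dtype=torch.int32, device="cuda")             # sampled indices (output)
    tmp = torch.full((n,), 1e10, dtype=torch.float32, device="cuda")   # running min distances
    pointops2_cuda.furthestsampling_cuda(b, n, xyz, offset, new_offset, tmp, idx)
    sampled = xyz[idx.long()]                                          # (m, 3)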
/lib/pointops/src/knnquery/knnquery_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _KNNQUERY_CUDA_KERNEL
2 | #define _KNNQUERY_CUDA_KERNEL
3 | 
4 | #include <torch/serialize/tensor.h>
5 | #include <vector>
6 | #include <ATen/cuda/CUDAContext.h>
7 | 
8 | void knnquery_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void knnquery_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream);
15 | 
16 | #ifdef __cplusplus
17 | }
18 | #endif
19 | 
20 | #endif
--------------------------------------------------------------------------------
/lib/pointops/src/knnquery_heap/knnquery_heap_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _KNNQUERY_HEAP_CUDA_KERNEL
2 | #define _KNNQUERY_HEAP_CUDA_KERNEL
3 | 
4 | #include <torch/serialize/tensor.h>
5 | #include <vector>
6 | #include <ATen/cuda/CUDAContext.h>
7 | 
8 | void knnquery_heap_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void knnquery_heap_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream);
15 | 
16 | #ifdef __cplusplus
17 | }
18 | #endif
19 | 
20 | #endif
--------------------------------------------------------------------------------
/lib/pointops2/src/knnquery/knnquery_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _KNNQUERY_CUDA_KERNEL
2 | #define _KNNQUERY_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
8 | 
9 | #ifdef __cplusplus
10 | extern "C" {
11 | #endif
12 | 
13 | void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2);
14 | 
15 | #ifdef __cplusplus
16 | }
17 | #endif
18 | #endif
19 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/sampling/sampling_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "sampling_cuda_kernel.h"
6 | 
7 | 
8 | void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor)
9 | {
10 |     const float *xyz = xyz_tensor.data_ptr<float>();
11 |     const int *offset = offset_tensor.data_ptr<int>();
12 |     const int *new_offset = new_offset_tensor.data_ptr<int>();
13 |     float *tmp = tmp_tensor.data_ptr<float>();
14 |     int *idx = idx_tensor.data_ptr<int>();
15 |     furthestsampling_cuda_launcher(b, n, xyz, offset, new_offset, tmp, idx);
16 | }
17 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/cuda_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef _CUDA_UTILS_H
2 | #define _CUDA_UTILS_H
3 | 
4 | #include <cmath>
5 | #include <algorithm>
6 | 
7 | #define TOTAL_THREADS 1024
8 | #define THREADS_PER_BLOCK 256
9 | #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
10 | 
11 | inline int opt_n_threads(int work_size) {
12 |     const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
13 |     return std::max(std::min(1 << pow_2, TOTAL_THREADS), 1);
14 | }
15 | 
16 | inline dim3 opt_block_config(int x, int y) {
17 |     const int x_threads = opt_n_threads(x);
18 |     const int y_threads = std::max(std::min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
19 |     dim3 block_config(x_threads, y_threads, 1);
20 |     return block_config;
21 | }
22 | 
23 | #endif
24 | 
--------------------------------------------------------------------------------
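The helpers in cuda_utils.h above choose kernel launch sizes. The same arithmetic in Python, just to make the rounding behaviour concrete:

    import math

    TOTAL_THREADS = 1024

    def divup(m: int, n: int) -> int:
        # DIVUP: ceiling division, e.g. divup(1000, 256) == 4 blocks of 256 threads.
        return m // n + (1 if m % n else 0)

    def opt_n_threads(work_size: int) -> int:
        # Largest power of two <= work_size, clamped to [1, TOTAL_THREADS].
        pow_2 = int(math.log(work_size) / math.log(2.0))
        return max(min(1 << pow_2, TOTAL_THREADS), 1)

    assert divup(1000, 256) == 4
    assert opt_n_threads(300) == 256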
/lib/cpp_wrappers/cpp_subsampling/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup, Extension
2 | import numpy.distutils.misc_util
3 | 
4 | # Adding OpenCV to project
5 | # ************************
6 | 
7 | # Adding sources of the project
8 | # *****************************
9 | 
10 | m_name = "grid_subsampling"
11 | 
12 | SOURCES = ["../cpp_utils/cloud/cloud.cpp",
13 |            "grid_subsampling/grid_subsampling.cpp",
14 |            "wrapper.cpp"]
15 | 
16 | module = Extension(m_name,
17 |                    sources=SOURCES,
18 |                    extra_compile_args=['-std=c++11',
19 |                                        '-D_GLIBCXX_USE_CXX11_ABI=0'])
20 | 
21 | setup(ext_modules=[module], include_dirs=numpy.distutils.misc_util.get_numpy_include_dirs())
22 | 
23 | 
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
30 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/grouping/grouping_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _GROUPING_CUDA_KERNEL
2 | #define _GROUPING_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
8 | void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output);
15 | void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input);
16 | 
17 | #ifdef __cplusplus
18 | }
19 | #endif
20 | #endif
21 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/knnquery/knnquery_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "knnquery_cuda_kernel.h"
6 | 
7 | 
8 | void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
9 | {
10 |     const float *xyz = xyz_tensor.data_ptr<float>();
11 |     const float *new_xyz = new_xyz_tensor.data_ptr<float>();
12 |     const int *offset = offset_tensor.data_ptr<int>();
13 |     const int *new_offset = new_offset_tensor.data_ptr<int>();
14 |     int *idx = idx_tensor.data_ptr<int>();
15 |     float *dist2 = dist2_tensor.data_ptr<float>();
16 |     knnquery_cuda_launcher(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
17 | }
18 | 
--------------------------------------------------------------------------------
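A sketch of calling the offset-based kNN op above directly, under the same assumptions as the sampling example earlier (extension built as pointops2_cuda, binding name unchanged in pointops_api.cpp). Each of the m query points gets nsample neighbour indices and squared distances:

    import torch
    import pointops2_cuda  # assumed binding

    nsample = 16
    xyz = torch.rand(8000, 3).cuda()                              # support points, two clouds packed
    new_xyz = xyz                                                 # query the same set
    offset = torch.tensor([5000, 8000], dtype=torch.int32).cuda()
    new_offset = offset.clone()

    m = new_xyz.shape[0]
    idx = torch.zeros(m, nsample, dtype=torch.int32, device="cuda")
    dist2 = torch.zeros(m, nsample, dtype=torch.float32, device="cuda")
    pointops2_cuda.knnquery_cuda(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2)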
/lib/pointops/src/ballquery/ballquery_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _BALLQUERY_CUDA_KERNEL
2 | #define _BALLQUERY_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void ballquery_cuda(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
8 | 
9 | void ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
10 | 
11 | #ifdef __cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | void ballquery_cuda_launcher(int b, int n, int m, float radius, int nsample, const float *xyz, const float *new_xyz, int *idx);
16 | 
17 | void ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream);
18 | 
19 | #ifdef __cplusplus
20 | }
21 | #endif
22 | 
23 | #endif
24 | 
--------------------------------------------------------------------------------
/lib/pointops/src/grouping_int/grouping_int_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _GROUPING_INT_CUDA_KERNEL
2 | #define _GROUPING_INT_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void grouping_int_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out);
8 | 
9 | void grouping_int_forward_cuda_fast(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
10 | 
11 | #ifdef __cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | void grouping_int_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out);
16 | 
17 | void grouping_int_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const long int *points, const int *idx, long int *out);
18 | 
19 | #ifdef __cplusplus
20 | }
21 | #endif
22 | #endif
23 | 
--------------------------------------------------------------------------------
/lib/pointops/src/cuda_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef _CUDA_UTILS_H
2 | #define _CUDA_UTILS_H
3 | 
4 | #include <cmath>
5 | 
6 | #define TOTAL_THREADS 1024
7 | 
8 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
9 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
10 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
11 | 
12 | #define THREADS_PER_BLOCK 256
13 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
14 | 
15 | inline int opt_n_threads(int work_size) {
16 |     const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
17 |     return max(min(1 << pow_2, TOTAL_THREADS), 1);
18 | }
19 | 
20 | inline dim3 opt_block_config(int x, int y) {
21 |     const int x_threads = opt_n_threads(x);
22 |     const int y_threads = max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
23 |     dim3 block_config(x_threads, y_threads, 1);
24 |     return block_config;
25 | }
26 | 
27 | #endif
--------------------------------------------------------------------------------
/lib/pointops2/src/interpolation/interpolation_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _INTERPOLATION_CUDA_KERNEL
2 | #define _INTERPOLATION_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor);
8 | void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output);
15 | void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input);
16 | 
17 | #ifdef __cplusplus
18 | }
19 | #endif
20 | #endif
21 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/subtraction/subtraction_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _SUBTRACTION_CUDA_KERNEL
2 | #define _SUBTRACTION_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
8 | void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output);
15 | void subtraction_backward_cuda_launcher(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2);
16 | 
17 | #ifdef __cplusplus
18 | }
19 | #endif
20 | #endif
21 | 
--------------------------------------------------------------------------------
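The subtraction kernels themselves are not part of this dump, so the following reference is an assumption based on the point-transformer convention these ops follow: each query point subtracts the features of its nsample neighbours.

    import torch

    def subtraction_reference(input1, input2, idx):
        # Assumed semantics: input1 (n, c), input2 (n, c), idx (n, nsample)
        # -> output (n, nsample, c) with output[i, j] = input1[i] - input2[idx[i, j]].
        return input1.unsqueeze(1) - input2[idx.long()]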
/lib/pointops2/src/grouping/grouping_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "grouping_cuda_kernel.h"
6 | 
7 | 
8 | void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
9 | {
10 |     const float *input = input_tensor.data_ptr<float>();
11 |     const int *idx = idx_tensor.data_ptr<int>();
12 |     float *output = output_tensor.data_ptr<float>();
13 |     grouping_forward_cuda_launcher(m, nsample, c, input, idx, output);
14 | }
15 | 
16 | void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor)
17 | {
18 |     const float *grad_output = grad_output_tensor.data_ptr<float>();
19 |     const int *idx = idx_tensor.data_ptr<int>();
20 |     float *grad_input = grad_input_tensor.data_ptr<float>();
21 |     grouping_backward_cuda_launcher(m, nsample, c, grad_output, idx, grad_input);
22 | }
23 | 
--------------------------------------------------------------------------------
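grouping_forward_cuda is a plain gather; the shape comments in grouping_cuda_kernel.cu later in this dump (input (n, c), idx (m, nsample), output (m, nsample, c)) reduce it to one line of fancy indexing:

    import torch

    def grouping_reference(feats, idx):
        # feats: (n, c), idx: (m, nsample) -> (m, nsample, c)
        return feats[idx.long()]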
/lib/pointops/src/grouping_int/grouping_int_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | 
6 | #include "grouping_int_cuda_kernel.h"
7 | 
8 | extern THCState *state;
9 | 
10 | void grouping_int_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
11 | {
12 |     const long int *points = points_tensor.data<long int>();
13 |     const int *idx = idx_tensor.data<int>();
14 |     long int *out = out_tensor.data<long int>();
15 |     grouping_int_forward_cuda_launcher(b, c, n, m, nsample, points, idx, out);
16 | }
17 | 
18 | void grouping_int_forward_cuda_fast(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
19 | {
20 |     const long int *points = points_tensor.data<long int>();
21 |     const int *idx = idx_tensor.data<int>();
22 |     long int *out = out_tensor.data<long int>();
23 |     grouping_int_forward_cuda_launcher_fast(b, c, n, m, nsample, points, idx, out);
24 | }
--------------------------------------------------------------------------------
/lib/pointops/src/sampling/sampling_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _SAMPLING_CUDA_KERNEL
2 | #define _SAMPLING_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void gathering_forward_cuda(int b, int c, int n, int m, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
8 | void gathering_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
9 | void furthestsampling_cuda(int b, int n, int m, at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
10 | 
11 | #ifdef __cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | void gathering_forward_cuda_launcher(int b, int c, int n, int m, const float *points, const int *idx, float *out);
16 | void gathering_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, float *grad_points);
17 | void furthestsampling_cuda_launcher(int b, int n, int m, const float *dataset, float *temp, int *idxs);
18 | 
19 | #ifdef __cplusplus
20 | }
21 | #endif
22 | #endif
23 | 
--------------------------------------------------------------------------------
/lib/pointops/src/knnquery/knnquery_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | 
6 | #include "knnquery_cuda_kernel.h"
7 | 
8 | extern THCState *state;
9 | 
10 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
11 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
12 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
13 | 
14 | 
15 | void knnquery_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
16 | {
17 |     CHECK_INPUT(new_xyz_tensor);
18 |     CHECK_INPUT(xyz_tensor);
19 | 
20 |     const float *new_xyz = new_xyz_tensor.data<float>();
21 |     const float *xyz = xyz_tensor.data<float>();
22 |     int *idx = idx_tensor.data<int>();
23 |     float *dist2 = dist2_tensor.data<float>();
24 | 
25 |     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
26 | 
27 |     knnquery_cuda_launcher(b, n, m, nsample, xyz, new_xyz, idx, dist2, stream);
28 | }
29 | 
--------------------------------------------------------------------------------
/lib/pointops/src/knnquery_heap/knnquery_heap_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | 
6 | #include "knnquery_heap_cuda_kernel.h"
7 | 
8 | extern THCState *state;
9 | 
10 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
11 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
12 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
13 | 
14 | 
15 | void knnquery_heap_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
16 | {
17 |     CHECK_INPUT(new_xyz_tensor);
18 |     CHECK_INPUT(xyz_tensor);
19 | 
20 |     const float *new_xyz = new_xyz_tensor.data<float>();
21 |     const float *xyz = xyz_tensor.data<float>();
22 |     int *idx = idx_tensor.data<int>();
23 |     float *dist2 = dist2_tensor.data<float>();
24 | 
25 |     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
26 | 
27 |     knnquery_heap_cuda_launcher(b, n, m, nsample, xyz, new_xyz, idx, dist2, stream);
28 | }
29 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 DV Lab
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/lib/pointops/src/grouping/grouping_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _GROUPING_CUDA_KERNEL
2 | #define _GROUPING_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void grouping_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out);
8 | void grouping_backward_cuda(int b, int c, int n, int m, int nsample, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
9 | 
10 | void grouping_forward_cuda_fast(int b, int c, int n, int npoints, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
11 | 
12 | #ifdef __cplusplus
13 | extern "C" {
14 | #endif
15 | 
16 | void grouping_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out);
17 | void grouping_backward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points);
18 | 
19 | void grouping_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out);
20 | 
21 | #ifdef __cplusplus
22 | }
23 | #endif
24 | #endif
25 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/interpolation/interpolation_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "interpolation_cuda_kernel.h"
6 | 
7 | 
8 | void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor)
9 | {
10 |     const float *input = input_tensor.data_ptr<float>();
11 |     const int *idx = idx_tensor.data_ptr<int>();
12 |     const float *weight = weight_tensor.data_ptr<float>();
13 |     float *output = output_tensor.data_ptr<float>();
14 |     interpolation_forward_cuda_launcher(n, c, k, input, idx, weight, output);
15 | }
16 | 
17 | void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor)
18 | {
19 |     const float *grad_output = grad_output_tensor.data_ptr<float>();
20 |     const int *idx = idx_tensor.data_ptr<int>();
21 |     const float *weight = weight_tensor.data_ptr<float>();
22 |     float *grad_input = grad_input_tensor.data_ptr<float>();
23 |     interpolation_backward_cuda_launcher(n, c, k, grad_output, idx, weight, grad_input);
24 | }
25 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/aggregation/aggregation_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _AGGREGATION_CUDA_KERNEL
2 | #define _AGGREGATION_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
8 | void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor);
9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
14 | void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output);
15 | void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight);
16 | 
17 | #ifdef __cplusplus
18 | }
19 | #endif
20 | #endif
21 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/subtraction/subtraction_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "subtraction_cuda_kernel.h"
6 | 
7 | 
8 | void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
9 | {
10 |     const float *input1 = input1_tensor.data_ptr<float>();
11 |     const float *input2 = input2_tensor.data_ptr<float>();
12 |     const int *idx = idx_tensor.data_ptr<int>();
13 |     float *output = output_tensor.data_ptr<float>();
14 |     subtraction_forward_cuda_launcher(n, nsample, c, input1, input2, idx, output);
15 | }
16 | 
17 | void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor)
18 | {
19 |     const int *idx = idx_tensor.data_ptr<int>();
20 |     const float *grad_output = grad_output_tensor.data_ptr<float>();
21 |     float *grad_input1 = grad_input1_tensor.data_ptr<float>();
22 |     float *grad_input2 = grad_input2_tensor.data_ptr<float>();
23 |     subtraction_backward_cuda_launcher(n, nsample, c, idx, grad_output, grad_input1, grad_input2);
24 | }
25 | 
--------------------------------------------------------------------------------
/lib/cpp_wrappers/cpp_utils/cloud/cloud.cpp:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //      0==========================0
4 | //      |    Local feature test    |
5 | //      0==========================0
6 | //
7 | //      version 1.0 :
8 | //          >
9 | //
10 | //---------------------------------------------------
11 | //
12 | //      Cloud source :
13 | //      Define useful Functions/Methods
14 | //
15 | //----------------------------------------------------
16 | //
17 | //      Hugues THOMAS - 10/02/2017
18 | //
19 | 
20 | 
21 | #include "cloud.h"
22 | 
23 | 
24 | // Getters
25 | // *******
26 | 
27 | PointXYZ max_point(std::vector<PointXYZ> points)
28 | {
29 |     // Initiate limits
30 |     PointXYZ maxP(points[0]);
31 | 
32 |     // Loop over all points
33 |     for (auto p : points)
34 |     {
35 |         if (p.x > maxP.x)
36 |             maxP.x = p.x;
37 | 
38 |         if (p.y > maxP.y)
39 |             maxP.y = p.y;
40 | 
41 |         if (p.z > maxP.z)
42 |             maxP.z = p.z;
43 |     }
44 | 
45 |     return maxP;
46 | }
47 | 
48 | PointXYZ min_point(std::vector<PointXYZ> points)
49 | {
50 |     // Initiate limits
51 |     PointXYZ minP(points[0]);
52 | 
53 |     // Loop over all points
54 |     for (auto p : points)
55 |     {
56 |         if (p.x < minP.x)
57 |             minP.x = p.x;
58 | 
59 |         if (p.y < minP.y)
60 |             minP.y = p.y;
61 | 
62 |         if (p.z < minP.z)
63 |             minP.z = p.z;
64 |     }
65 | 
66 |     return minP;
67 | }
--------------------------------------------------------------------------------
/lib/pointops/src/featuredistribute/featuredistribute_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _FEATUREDISTRIBUTE_CUDA_KERNEL
2 | #define _FEATUREDISTRIBUTE_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void featuredistribute_cuda(int b, int n, int m, at::Tensor max_xyz_tensor, at::Tensor xyz_tensor, at::Tensor distribute_idx_tensor);
8 | void featuregather_forward_cuda(int b, int n, int m, int c, at::Tensor max_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor distribute_feature_tensor);
9 | void featuregather_backward_cuda(int b, int n, int m, int c, at::Tensor grad_distribute_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor grad_max_feature_tensor);
10 | 
11 | #ifdef __cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | void featuredistribute_cuda_launcher(int b, int n, int m, const float *max_xyz, const float *xyz, int *distribute_idx, cudaStream_t stream);
16 | void featuregather_forward_cuda_launcher(int b, int n, int m, int c, const float *max_feature, const int *distribute_idx, float *distribute_feature, cudaStream_t stream);
17 | void featuregather_backward_cuda_launcher(int b, int n, int m, int c, const float *grad_distribute_feature, const int *distribute_idx, float *grad_max_feature, cudaStream_t stream);
18 | 
19 | #ifdef __cplusplus
20 | }
21 | #endif
22 | 
23 | #endif
24 | 
--------------------------------------------------------------------------------
/lib/pointops/src/sampling/sampling_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | #include "sampling_cuda_kernel.h"
6 | 
7 | extern THCState *state;
8 | 
9 | void gathering_forward_cuda(int b, int c, int n, int m, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
10 | {
11 |     const float *points = points_tensor.data<float>();
12 |     const int *idx = idx_tensor.data<int>();
13 |     float *out = out_tensor.data<float>();
14 |     gathering_forward_cuda_launcher(b, c, n, m, points, idx, out);
15 | }
16 | 
17 | void gathering_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor)
18 | {
19 | 
20 |     const float *grad_out = grad_out_tensor.data<float>();
21 |     const int *idx = idx_tensor.data<int>();
22 |     float *grad_points = grad_points_tensor.data<float>();
23 |     gathering_backward_cuda_launcher(b, c, n, m, grad_out, idx, grad_points);
24 | }
25 | 
26 | void furthestsampling_cuda(int b, int n, int m, at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor)
27 | {
28 |     const float *points = points_tensor.data<float>();
29 |     float *temp = temp_tensor.data<float>();
30 |     int *idx = idx_tensor.data<int>();
31 |     furthestsampling_cuda_launcher(b, n, m, points, temp, idx);
32 | }
33 | 
--------------------------------------------------------------------------------
/lib/pointops/src/ballquery/ballquery_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | 
6 | #include "ballquery_cuda_kernel.h"
7 | 
8 | extern THCState *state;
9 | 
10 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
11 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
12 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
13 | 
14 | void ballquery_cuda(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor)
15 | {
16 |     const float *new_xyz = new_xyz_tensor.data<float>();
17 |     const float *xyz = xyz_tensor.data<float>();
18 |     int *idx = idx_tensor.data<int>();
19 | 
20 |     ballquery_cuda_launcher(b, n, m, radius, nsample, new_xyz, xyz, idx);
21 | }
22 | 
23 | 
24 | void ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor)
25 | {
26 |     CHECK_INPUT(new_xyz_tensor);
27 |     CHECK_INPUT(xyz_tensor);
28 | 
29 |     const float *new_xyz = new_xyz_tensor.data<float>();
30 |     const float *xyz = xyz_tensor.data<float>();
31 |     int *idx = idx_tensor.data<int>();
32 | 
33 |     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
34 | 
35 |     ballquery_cuda_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx, stream);
36 | }
37 | 
--------------------------------------------------------------------------------
/lib/pointops/src/grouping/grouping_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/serialize/tensor.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <vector>
4 | #include <THC/THC.h>
5 | 
6 | #include "grouping_cuda_kernel.h"
7 | 
8 | extern THCState *state;
9 | 
10 | void grouping_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
11 | {
12 |     const float *points = points_tensor.data<float>();
13 |     const int *idx = idx_tensor.data<int>();
14 |     float *out = out_tensor.data<float>();
15 |     grouping_forward_cuda_launcher(b, c, n, m, nsample, points, idx, out);
16 | }
17 | 
18 | void grouping_backward_cuda(int b, int c, int n, int m, int nsample, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor)
19 | {
20 |     float *grad_points = grad_points_tensor.data<float>();
21 |     const int *idx = idx_tensor.data<int>();
22 |     const float *grad_out = grad_out_tensor.data<float>();
23 |     grouping_backward_cuda_launcher(b, c, n, m, nsample, grad_out, idx, grad_points);
24 | }
25 | 
26 | void grouping_forward_cuda_fast(int b, int c, int n, int npoints, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
27 | 
28 |     const float *points = points_tensor.data<float>();
29 |     const int *idx = idx_tensor.data<int>();
30 |     float *out = out_tensor.data<float>();
31 |     grouping_forward_cuda_launcher_fast(b, c, n, npoints, nsample, points, idx, out);
32 | }
--------------------------------------------------------------------------------
/lib/pointops/setup.py:
--------------------------------------------------------------------------------
1 | #python3 setup.py install
2 | from setuptools import setup
3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
4 | 
5 | setup(
6 |     name='pointops',
7 |     ext_modules=[
8 |         CUDAExtension('pointops_cuda', [
9 |             'src/pointops_api.cpp',
10 | 
11 |             'src/ballquery/ballquery_cuda.cpp',
12 |             'src/ballquery/ballquery_cuda_kernel.cu',
13 |             'src/knnquery/knnquery_cuda.cpp',
14 |             'src/knnquery/knnquery_cuda_kernel.cu',
15 |             'src/knnquery_heap/knnquery_heap_cuda.cpp',
16 |             'src/knnquery_heap/knnquery_heap_cuda_kernel.cu',
17 |             'src/grouping/grouping_cuda.cpp',
18 |             'src/grouping/grouping_cuda_kernel.cu',
19 |             'src/grouping_int/grouping_int_cuda.cpp',
20 |             'src/grouping_int/grouping_int_cuda_kernel.cu',
21 |             'src/interpolation/interpolation_cuda.cpp',
22 |             'src/interpolation/interpolation_cuda_kernel.cu',
23 |             'src/sampling/sampling_cuda.cpp',
24 |             'src/sampling/sampling_cuda_kernel.cu',
25 | 
26 |             'src/labelstat/labelstat_cuda.cpp',
27 |             'src/labelstat/labelstat_cuda_kernel.cu',
28 | 
29 |             'src/featuredistribute/featuredistribute_cuda.cpp',
30 |             'src/featuredistribute/featuredistribute_cuda_kernel.cu'
31 |         ],
32 |         extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']})
33 |     ],
34 |     cmdclass={'build_ext': BuildExtension})
35 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## General
2 | 
3 | 
4 | # Compiled Object files
5 | *.slo
6 | *.lo
7 | *.o
8 | *.cuo
9 | 
10 | # Compiled Dynamic libraries
11 | *.so
12 | *.dylib
13 | 
14 | # Compiled Static libraries
15 | *.lai
16 | *.la
17 | *.a
18 | 
19 | # Compiled protocol buffers
20 | *.pb.h
21 | *.pb.cc
22 | *_pb2.py
23 | 
24 | # Compiled python
25 | *.pyc
26 | 
27 | # Compiled MATLAB
28 | *.mex*
29 | 
30 | # IPython notebook checkpoints
31 | .ipynb_checkpoints
32 | 
33 | # Editor temporaries
34 | *.swp
35 | *~
36 | 
37 | # Sublime Text settings
38 | *.sublime-workspace
39 | *.sublime-project
40 | 
41 | # Eclipse Project settings
42 | *.*project
43 | .settings
44 | 
45 | # QtCreator files
46 | *.user
47 | 
48 | # PyCharm files
49 | .idea
50 | 
51 | # Visual Studio Code files
52 | .vscode
53 | 
54 | # OSX dir files
55 | .DS_Store
56 | 
57 | # personal
58 | __pycache__/
59 | exp/
60 | *.egg-info/
61 | build/
62 | dist/
63 | 
64 | *.tsv
65 | *.npy
66 | *.zip
67 | dataset/scannetv2/train
68 | dataset/scannetv2/val
69 | dataset/scannetv2/test
70 | dataset/scannetv2/val_gt
71 | dataset/scannet_tf/training*
72 | dataset/scannet_tf/val*
73 | temp
74 | 
75 | dataset/s3dis/s3dis
76 | dataset/s3dis/Stanford3dDataset_v1.2_Aligned_Version
77 | 
78 | dataset/sem3d/test
79 | dataset/sem3d/train
80 | dataset/sem3d/train_subsampling
81 | 
82 | exp/
83 | 
84 | dataset/scannet_v2/train*
85 | dataset/scannet_v2/test*
86 | dataset/scannet_v2/val*
87 | 
88 | runs/*/events*
89 | runs/
90 | torch_points3d/
91 | 
92 | output/*
93 | *.pth
94 | */__pycache__
95 | data/
96 | 
--------------------------------------------------------------------------------
/lib/pointops/src/labelstat/labelstat_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _LABELSTAT_CUDA_KERNEL
2 | #define _LABELSTAT_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void labelstat_and_ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, int nclass,
8 |     at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor idx_tensor, at::Tensor new_label_stat_tensor);
9 | 
10 | void labelstat_ballrange_cuda_fast(int b, int n, int m, float radius, int nclass,
11 |     at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor new_label_stat_tensor);
12 | 
13 | void labelstat_idx_cuda_fast(int b, int n, int m, int nsample, int nclass,
14 |     at::Tensor label_stat_tensor, at::Tensor idx_tensor, at::Tensor new_label_stat_tensor);
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | void labelstat_and_ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, int nclass, \
21 |     const float *new_xyz, const float *xyz, const int *label_stat, int *idx, int *new_label_stat, cudaStream_t stream);
22 | 
23 | void labelstat_ballrange_cuda_launcher_fast(int b, int n, int m, float radius, int nclass, \
24 |     const float *new_xyz, const float *xyz, const int *label_stat, int *new_label_stat, cudaStream_t stream);
25 | 
26 | void labelstat_idx_cuda_launcher_fast(int b, int n, int m, int nsample, int nclass, \
27 |     const int *label_stat, const int *idx, int *new_label_stat, cudaStream_t stream);
28 | 
29 | #ifdef __cplusplus
30 | }
31 | #endif
32 | 
33 | #endif
34 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/aggregation/aggregation_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <torch/serialize/tensor.h>
3 | #include <torch/extension.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 | #include "aggregation_cuda_kernel.h"
6 | 
7 | 
8 | void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
9 | {
10 |     const float *input = input_tensor.data_ptr<float>();
11 |     const float *position = position_tensor.data_ptr<float>();
12 |     const float *weight = weight_tensor.data_ptr<float>();
13 |     const int *idx = idx_tensor.data_ptr<int>();
14 |     float *output = output_tensor.data_ptr<float>();
15 |     aggregation_forward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, output);
16 | }
17 | 
18 | void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor)
19 | {
20 |     const float *input = input_tensor.data_ptr<float>();
21 |     const float *position = position_tensor.data_ptr<float>();
22 |     const float *weight = weight_tensor.data_ptr<float>();
23 |     const int *idx = idx_tensor.data_ptr<int>();
24 |     const float *grad_output = grad_output_tensor.data_ptr<float>();
25 |     float *grad_input = grad_input_tensor.data_ptr<float>();
26 |     float *grad_position = grad_position_tensor.data_ptr<float>();
27 |     float *grad_weight = grad_weight_tensor.data_ptr<float>();
28 |     aggregation_backward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight);
29 | }
30 | 
--------------------------------------------------------------------------------
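The aggregation kernels are not included in this dump, so this reference is an assumption following the point-transformer convention: the c feature channels are split into w_c groups that each share one weight, and the nsample neighbours are summed.

    import torch

    def aggregation_reference(feats, position, weight, idx):
        # Assumed shapes: feats (n, c), position (n, nsample, c),
        # weight (n, nsample, w_c), idx (n, nsample) -> output (n, c).
        n, nsample, c = position.shape
        w_c = weight.shape[-1]
        w = weight.repeat_interleave(c // w_c, dim=-1)      # expand weights to all c channels
        return ((feats[idx.long()] + position) * w).sum(1)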
/lib/pointops2/setup.py:
--------------------------------------------------------------------------------
1 | #python3 setup.py install
2 | from setuptools import setup
3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
4 | import os
5 | from distutils.sysconfig import get_config_vars
6 | 
7 | (opt,) = get_config_vars('OPT')
8 | os.environ['OPT'] = " ".join(
9 |     flag for flag in opt.split() if flag != '-Wstrict-prototypes'
10 | )
11 | 
12 | setup(
13 |     name='pointops2',
14 |     ext_modules=[
15 |         CUDAExtension('pointops2_cuda', [
16 |             'src/pointops_api.cpp',
17 |             'src/knnquery/knnquery_cuda.cpp',
18 |             'src/knnquery/knnquery_cuda_kernel.cu',
19 |             'src/sampling/sampling_cuda.cpp',
20 |             'src/sampling/sampling_cuda_kernel.cu',
21 |             'src/grouping/grouping_cuda.cpp',
22 |             'src/grouping/grouping_cuda_kernel.cu',
23 |             'src/interpolation/interpolation_cuda.cpp',
24 |             'src/interpolation/interpolation_cuda_kernel.cu',
25 |             'src/subtraction/subtraction_cuda.cpp',
26 |             'src/subtraction/subtraction_cuda_kernel.cu',
27 |             'src/aggregation/aggregation_cuda.cpp',
28 |             'src/aggregation/aggregation_cuda_kernel.cu',
29 |             'src/attention/attention_cuda.cpp',
30 |             'src/attention/attention_cuda_kernel.cu',
31 |             'src/rpe/relative_pos_encoding_cuda.cpp',
32 |             'src/rpe/relative_pos_encoding_cuda_kernel.cu',
33 |             'src/attention_v2/attention_cuda_v2.cpp',
34 |             'src/attention_v2/attention_cuda_kernel_v2.cu',
35 |             'src/rpe_v2/relative_pos_encoding_cuda_v2.cpp',
36 |             'src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.cu',
37 |         ],
38 |         extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']}
39 |         )
40 |     ],
41 |     cmdclass={'build_ext': BuildExtension}
42 | )
43 | 
--------------------------------------------------------------------------------
/lib/pointops/src/interpolation/interpolation_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _INTERPOLATION_CUDA_KERNEL
2 | #define _INTERPOLATION_CUDA_KERNEL
3 | #include <torch/serialize/tensor.h>
4 | #include <vector>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void nearestneighbor_cuda(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
8 | void interpolation_forward_cuda(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
9 | void interpolation_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
10 | 
11 | void nearestneighbor_cuda_fast(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
12 | void interpolation_forward_cuda_fast(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
13 | 
14 | #ifdef __cplusplus
15 | extern "C" {
16 | #endif
17 | 
18 | void nearestneighbor_cuda_launcher(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx);
19 | void interpolation_forward_cuda_launcher(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out);
20 | void interpolation_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points);
21 | 
22 | void nearestneighbor_cuda_launcher_fast(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx);
23 | void interpolation_forward_cuda_launcher_fast(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out);
24 | 
25 | #ifdef __cplusplus
26 | }
27 | #endif
28 | #endif
29 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/attention/attention_cuda_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _ATTENTION_CUDA_KERNEL
2 | #define _ATTENTION_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void attention_step1_forward_cuda(int N, int M, int h, int C, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor);
8 | void attention_step1_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor);
9 | 
10 | void attention_step2_forward_cuda(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor);
11 | void attention_step2_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor);
12 | 
13 | #ifdef __cplusplus
14 | extern "C" {
15 | #endif
16 | 
17 | void attention_step1_forward_cuda_launcher(int N, int M, int h, int C, const float *q, const float *k, const int *index0, const int *index1, float *attn);
18 | void attention_step1_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *q, const float *k, float *grad_q, float *grad_k);
19 | 
20 | void attention_step2_forward_cuda_launcher(int N, int M, int h, int C, const float *attn, const float *v, const int *index0, const int *index1, float *output);
21 | void attention_step2_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, float *grad_attn, float *grad_v);
22 | 
23 | #ifdef __cplusplus
24 | }
25 | #endif
26 | #endif
27 | 
--------------------------------------------------------------------------------
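The two-step attention above splits sparse attention into score computation (step1) and value aggregation (step2) over M (query, key) pairs given by index0/index1. Step2's reference appears, commented out, in the test that follows; step1 is sketched here by analogy and should be read as an assumption:

    import torch
    from torch_scatter import scatter_sum

    def attention_step1_reference(q, k, index0, index1):
        # q, k: (N, h, hdim) -> attn: (M, h), a per-pair, per-head dot product.
        return (q[index0.long()] * k[index1.long()]).sum(-1)

    def attention_step2_reference(attn, v, index0, index1, N):
        # attn: (M, h), v: (N, h, hdim) -> output: (N, h * hdim);
        # matches the commented-out reference in test_attention_op_step2.py below.
        M = attn.shape[0]
        x = (attn.unsqueeze(-1) * v[index1.long()]).reshape(M, -1)
        return scatter_sum(src=x, index=index0.long(), dim=0, dim_size=N)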
/lib/pointops2/functions/test_attention_op_step2.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pointops
3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum
4 | 
5 | torch.manual_seed(1)
6 | 
7 | M = 800000
8 | N = 35000
9 | C = 96
10 | h = 6
11 | softmax_attn_flat = torch.rand(M, h).cuda()
12 | value = torch.rand(N, h, C//h).cuda()
13 | 
14 | index_0 = torch.rand(M)
15 | index_0[index_0 < 0] = 0
16 | index_0 = (index_0*N).long().cuda()
17 | 
18 | index_1 = torch.rand(M)
19 | index_1[index_1 < 0] = 0
20 | index_1 = (index_1*N).long().cuda()
21 | 
22 | softmax_attn_flat.requires_grad = True
23 | value.requires_grad = True
24 | 
25 | # Reference implementation, kept active so the comparison below has an `x`.
26 | value_flat = value[index_1] #[M, num_heads, C // num_heads]
27 | x = (softmax_attn_flat.unsqueeze(-1) * value_flat).reshape(M, C)
28 | x = scatter_sum(src=x, index=index_0, dim=0, dim_size=N) #[N, C]
29 | loss = x.sum()
30 | loss.backward()
31 | 
32 | print("x.shape: {}, x[:5,:10]: {}".format(x.shape, x[:5,:10]))
33 | print("softmax_attn_flat.grad[:5, :10]: ", softmax_attn_flat.grad[:5, :10])
34 | print("value.grad[:5, :3, :5]: ", value.grad[:5, :3, :5])
35 | 
36 | # Reset the gradients so the custom op's backward is checked on its own.
37 | softmax_attn_flat.grad = None
38 | value.grad = None
39 | 
40 | print("softmax_attn_flat.is_contiguous(): ", softmax_attn_flat.is_contiguous())
41 | print("value.is_contiguous(): ", value.is_contiguous())
42 | print("index_0.is_contiguous(): ", index_0.is_contiguous())
43 | print("index_1.is_contiguous(): ", index_1.is_contiguous())
44 | 
45 | x_v2 = pointops.attention_step2(softmax_attn_flat.float(), value.float(), index_0.int(), index_1.int())
46 | x_v2 = x_v2.view(N, C)
47 | loss = x_v2.sum()
48 | loss.backward()
49 | 
50 | print("x_v2.shape: {}, x_v2[:5,:10]: {}".format(x_v2.shape, x_v2[:5,:10]))
51 | 
52 | print("softmax_attn_flat.grad[:5, :10]: ", softmax_attn_flat.grad[:5, :10])
53 | print("value.grad[:5, :3, :5]: ", value.grad[:5, :3, :5])
54 | 
55 | print("((x-x_v2)**2 < 1e-8).all(): ", ((x-x_v2)**2 < 1e-8).all())
56 | 
57 | print("torch.max((x-x_v2)**2): ", torch.max((x-x_v2)**2))
58 | 
--------------------------------------------------------------------------------
/lib/pointops2/src/attention_v2/attention_cuda_kernel_v2.h:
--------------------------------------------------------------------------------
1 | #ifndef _ATTENTION_V2_CUDA_KERNEL
2 | #define _ATTENTION_V2_CUDA_KERNEL
3 | #include <vector>
4 | #include <torch/serialize/tensor.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | 
7 | void attention_step1_forward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor attn_tensor);
8 | void attention_step1_backward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor grad_out_tensor, at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor);
9 | 
10 | void attention_step2_forward_cuda_v2(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor);
11 | void attention_step2_backward_cuda_v2(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor);
12 | 
13 | #ifdef __cplusplus
14 | extern "C" {
15 | #endif
16 | 
17 | void attention_step1_forward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, const float *q, const float *k, const int *index0_offsets, const int *index1, float *attn);
18 | void attention_step1_backward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, const float *grad_out, const int *index0_offsets, const int *index1, const float *q, const float *k, float *grad_q, float *grad_k);
19 | 
20 | void attention_step2_forward_cuda_launcher_v2(int N, int M, int h, int C, const float *attn, const float *v, const int *index0, const int *index1, float *output);
21 | void attention_step2_backward_cuda_launcher_v2(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, float *grad_attn, float *grad_v);
22 | 
23 | #ifdef __cplusplus
24 | }
25 | #endif
26 | #endif
27 | 
--------------------------------------------------------------------------------
/lib/pointops2/functions/test_relative_pos_encoding_op_step1.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pointops
3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum
4 | 
5 | torch.manual_seed(1)
6 | 
7 | M = 80000
8 | N = 3500
9 | hdim = 16
10 | h = 6
11 | L = 31
12 | query = torch.rand(N, h, hdim).cuda()
13 | table = torch.rand(L, h, hdim, 3).cuda()
14 | 
15 | index = torch.rand(M)
16 | index[index < 0] = 0
17 | index = (index*N).long().cuda()
18 | 
19 | rel_index = torch.rand(M, 3)
20 | rel_index[rel_index < 0] = 0
21 | rel_index = (rel_index*L).long().cuda()
22 | 
23 | query.requires_grad = True
24 | table.requires_grad = True
25 | 
26 | # query_flat = query[index] #[M, h, hdim]
27 | # table_x, table_y, table_z = table[:,:,:,0], table[:,:,:,1], table[:,:,:,2] #[L, h, hdim]
28 | # rel_index_x, rel_index_y, rel_index_z = rel_index[:,0], rel_index[:,1], rel_index[:,2] #[M]
29 | # rel_pos_encoding = table_x[rel_index_x] + table_y[rel_index_y] + table_z[rel_index_z] #[M, h, hdim]
30 | # output = (query_flat * rel_pos_encoding).sum(-1) #[M, h]
31 | # loss = output.mean()
32 | # loss.backward()
33 | 
34 | # print("output.shape: {}, output[:5,:10]: {}".format(output.shape, output[:5,:10]))
35 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5])
36 | # print("table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2])
37 | # input()
38 | 
39 | # print("query.is_contiguous(): ", query.is_contiguous())
40 | # print("key.is_contiguous(): ", key.is_contiguous())
41 | # print("index_0.is_contiguous(): ", index_0.is_contiguous())
42 | # print("index_1.is_contiguous(): ", index_1.is_contiguous())
43 | 
44 | output_v2 = pointops.dot_prod_with_idx(query, index.int(), table, rel_index.int())
45 | loss = output_v2.mean()
46 | loss.backward()
47 | 
48 | print("output_v2.shape: {}, output_v2[:5,:10]: {}".format(output_v2.shape, output_v2[:5,:10]))
49 | print("v2: query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5])
50 | print("v2: table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2])
51 | input()
52 | 
53 | # print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max())
54 | 
55 | # print("torch.max((attn_flat-attn_flat_v2)**2): ", torch.max((attn_flat-attn_flat_v2)**2))
56 | 
57 | 
--------------------------------------------------------------------------------
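For readers who want the commented-out reference from the test above as a callable, here it is, verbatim apart from packaging into a function:

    import torch

    def dot_prod_with_idx_reference(query, index, table, rel_index):
        # query: (N, h, hdim), index: (M,), table: (L, h, hdim, 3), rel_index: (M, 3).
        query_flat = query[index.long()]                        # (M, h, hdim)
        table_x, table_y, table_z = table[..., 0], table[..., 1], table[..., 2]
        rel = (table_x[rel_index[:, 0].long()] +
               table_y[rel_index[:, 1].long()] +
               table_z[rel_index[:, 2].long()])                 # (M, h, hdim)
        return (query_flat * rel).sum(-1)                       # (M, h)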
/lib/pointops2/src/grouping/grouping_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "grouping_cuda_kernel.h" 3 | 4 | 5 | __global__ void grouping_forward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ input, const int *__restrict__ idx, float *__restrict__ output) { 6 | // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c) 7 | int index = blockIdx.x * blockDim.x + threadIdx.x; 8 | if (index >= m * nsample * c) return; 9 | const int c_idx = index % c; 10 | const int nsample_idx = (index / c) % nsample; 11 | const int m_idx = index / nsample / c; 12 | const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx; 13 | output[index] = input[input_idx]; 14 | } 15 | 16 | __global__ void grouping_backward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ grad_output, const int *__restrict__ idx, float *__restrict__ grad_input) { 17 | // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c) 18 | int index = blockIdx.x * blockDim.x + threadIdx.x; 19 | if (index >= m * nsample * c) return; 20 | const int c_idx = index % c; 21 | const int nsample_idx = (index / c) % nsample; 22 | const int m_idx = index / nsample / c; 23 | const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx; 24 | atomicAdd(grad_input + input_idx, grad_output[index]); 25 | } 26 | 27 | void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output) { 28 | // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c) 29 | dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK)); 30 | dim3 threads(THREADS_PER_BLOCK); 31 | grouping_forward_cuda_kernel<<<blocks, threads, 0>>>(m, nsample, c, input, idx, output); 32 | } 33 | 34 | void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input) 35 | { 36 | // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c) 37 | dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK)); 38 | dim3 threads(THREADS_PER_BLOCK); 39 | grouping_backward_cuda_kernel<<<blocks, threads, 0>>>(m, nsample, c, grad_output, idx, grad_input); 40 | } 41 | -------------------------------------------------------------------------------- /lib/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _RPE_CUDA_KERNEL 2 | #define _RPE_CUDA_KERNEL 3 | #include <vector> 4 | #include <torch/serialize/tensor.h> 5 | #include <ATen/cuda/CUDAContext.h> 6 | 7 | void dot_prod_with_idx_forward_cuda(int N, int M, int h, int hdim, at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); 8 | void dot_prod_with_idx_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_q_tensor, at::Tensor grad_table_tensor); 9 | 10 | void attention_step2_with_rel_pos_value_forward_cuda(int N, int M, int h, int hdim, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); 11 | void attention_step2_with_rel_pos_value_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor); 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | void dot_prod_with_idx_forward_cuda_launcher(int N, int M, int h, int hdim, const float *q, const int *index, const float *table, const int *rel_idx, float *output); 18 | void dot_prod_with_idx_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const float *q, const int *index, const float *table, const int *rel_idx, float *grad_q, float *grad_table); 19 | 20 | void attention_step2_with_rel_pos_value_forward_cuda_launcher(int N, int M, int h, int hdim, const float *attn, const float *v, const int *index0, const int *index1, const float *table, const int *rel_idx, float *output); 21 | void attention_step2_with_rel_pos_value_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table); 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | #endif 27 | -------------------------------------------------------------------------------- /lib/pointops2/src/interpolation/interpolation_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "interpolation_cuda_kernel.h" 3 | 4 | 5 | __global__ void interpolation_forward_cuda_kernel(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output) 6 | { 7 | // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c) 8 | int index = blockIdx.x * blockDim.x + threadIdx.x; 9 | if (index >= n * c) return; 10 | int c_idx = index % c; 11 | int n_idx = index / c; 12 | for (int i = 0; i < k; i++) 13 | { 14 | int idx_idx = n_idx * k + i; 15 | int input_idx = idx[idx_idx] * c + c_idx; 16 | output[index] += input[input_idx] * weight[idx_idx]; 17 | } 18 | } 19 | 20 | __global__ void interpolation_backward_cuda_kernel(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input) 21 | { 22 | // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c) 23 | int index = blockIdx.x * blockDim.x + threadIdx.x; 24 | if (index >= n * c) return; 25 | int c_idx = index % c; 26 | int n_idx = index / c; 27 | for (int i = 0; i < k; i++) 28 | { 29 | int idx_idx = n_idx * k + i; 30 | int input_idx = idx[idx_idx] * c + c_idx; 31 | atomicAdd(grad_input + input_idx, grad_output[index] * weight[idx_idx]); 32 | } 33 | } 34 | 35 | void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output) { 36 | // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c) 37 | dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); 38 | dim3 threads(THREADS_PER_BLOCK); 39 | interpolation_forward_cuda_kernel<<<blocks, threads, 0>>>(n, c, k, input, idx, weight, output); 40 | } 41 | 42 | void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input) { 43 | // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c) 44 | dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); 45 | dim3 threads(THREADS_PER_BLOCK); 46 | interpolation_backward_cuda_kernel<<<blocks, threads, 0>>>(n, c, k, grad_output, idx, weight, grad_input); 47 | } 48 | 
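A note on the interpolation kernels above: the forward pass is a weighted k-NN gather, output[i] = sum_j weight[i, j] * input[idx[i, j]], and the backward pass scatters grad_output back into grad_input with atomicAdd. A minimal pure-PyTorch sketch of the same computation for reference (interpolation_forward_reference is a name introduced here, not a library function):

import torch

def interpolation_forward_reference(feats, idx, weight):
    # feats: (m, c) source features; idx: (n, k) neighbor indices into feats;
    # weight: (n, k) interpolation weights; returns (n, c).
    gathered = feats[idx.long()]                         # (n, k, c)
    return (weight.unsqueeze(-1) * gathered).sum(dim=1)  # (n, c)

if __name__ == "__main__":
    m, n, k, c = 50, 20, 3, 8
    out = interpolation_forward_reference(torch.rand(m, c),
                                          torch.randint(0, m, (n, k)),
                                          torch.rand(n, k))
    assert out.shape == (n, c)

Autograd through this reference yields the same gradient as the atomicAdd kernel: grad w.r.t. feats is a scatter-add of weight-scaled grad_output over idx.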
-------------------------------------------------------------------------------- /util/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import numpy as np 4 | 5 | from torch.utils.data import Dataset 6 | 7 | 8 | def make_dataset(split='train', data_root=None, data_list=None): 9 | if not os.path.isfile(data_list): 10 | raise (RuntimeError("Point list file does not exist: " + data_list + "\n")) 11 | point_list = [] 12 | list_read = open(data_list).readlines() 13 | print("Found {} samples in the {} set.".format(len(list_read), split)) 14 | for line in list_read: 15 | point_list.append(os.path.join(data_root, line.strip())) 16 | return point_list 17 | 18 | 19 | class PointData(Dataset): 20 | def __init__(self, split='train', data_root=None, data_list=None, transform=None, num_point=None, random_index=False): 21 | assert split in ['train', 'val', 'test'] 22 | self.split = split 23 | self.data_list = make_dataset(split, data_root, data_list) 24 | self.transform = transform 25 | self.num_point = num_point 26 | self.random_index = random_index 27 | 28 | def __len__(self): 29 | return len(self.data_list) 30 | 31 | def __getitem__(self, index): 32 | data_path = self.data_list[index] 33 | f = h5py.File(data_path, 'r') 34 | data = f['data'][:] 35 | if self.split == 'test': 36 | label = np.array([255]) # placeholder: the test split ships without labels 37 | else: 38 | label = f['label'][:] 39 | f.close() 40 | if self.num_point is None: 41 | self.num_point = data.shape[0] 42 | idxs = np.arange(data.shape[0]) 43 | if self.random_index: 44 | np.random.shuffle(idxs) 45 | idxs = idxs[0:self.num_point] 46 | data = data[idxs, :] 47 | if label.size != 1: # seg data 48 | label = label[idxs] 49 | if self.transform is not None: 50 | data, label = self.transform(data, label) 51 | return data, label 52 | 53 | 54 | if __name__ == '__main__': 55 | data_root = 'dataset/modelnet40' 56 | data_list = 'dataset/modelnet40/list/val.txt' 57 | point_data = PointData('train', data_root, data_list) 58 | print('point data size:', point_data.__len__()) 59 | print('point data 0 shape:', point_data.__getitem__(0)[0].shape) 60 | print('point label 0 shape:', point_data.__getitem__(0)[1].shape) 61 | -------------------------------------------------------------------------------- /lib/cpp_wrappers/cpp_subsampling/grid_subsampling/grid_subsampling.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include "../../cpp_utils/cloud/cloud.h" 4 | 5 | #include <set> 6 | #include <cstdint> 7 | 8 | using namespace std; 9 | 10 | class SampledData 11 | { 12 | public: 13 | 14 | // Elements 15 | // ******** 16 | 17 | int count; 18 | PointXYZ point; 19 | vector<float> features; 20 | vector<unordered_map<int, int>> labels; 21 | 22 | 23 | // Methods 24 | // ******* 25 | 26 | // Constructor 27 | SampledData() 28 | { 29 | count = 0; 30 | point = PointXYZ(); 31 | } 32 | 33 | SampledData(const size_t fdim, const size_t ldim) 34 | { 35 | count = 0; 36 | point = PointXYZ(); 37 | features = vector<float>(fdim); 38 | labels = vector<unordered_map<int, int>>(ldim); 39 | } 40 | 41 | // Method Update 42 | void update_all(const PointXYZ p, vector<float>::iterator f_begin, vector<int>::iterator l_begin) 43 | { 44 | count += 1; 45 | point += p; 46 | transform (features.begin(), features.end(), f_begin, features.begin(), plus<float>()); 47 | int i = 0; 48 | for(vector<int>::iterator it = l_begin; it != l_begin + labels.size(); ++it) 49 | { 50 | labels[i][*it] += 1; 51 | i++; 52 | } 53 | return; 54 | } 55 | void update_features(const PointXYZ p, vector<float>::iterator f_begin) 56 | { 57 | count += 1; 58 | point += p; 59 | transform (features.begin(), features.end(), f_begin, features.begin(), plus<float>()); // accumulate running sums; the caller divides by count to average 60 | return; 61 | } 62 | void update_classes(const PointXYZ p, vector<int>::iterator l_begin) 63 | { 64 | count += 1; 65 | point += p; 66 | int i = 0; 67 | for(vector<int>::iterator it = l_begin; it != l_begin + labels.size(); ++it) 68 | { 69 | labels[i][*it] += 1; 70 | i++; 71 | } 72 | return; 73 | } 74 | void update_points(const PointXYZ p) 75 | { 76 | count += 1; 77 | point += p; 78 | return; 79 | } 80 | }; 81 | 82 | 83 | 84 | void grid_subsampling(vector<PointXYZ>& original_points, 85 | vector<PointXYZ>& subsampled_points, 86 | vector<float>& original_features, 87 | vector<float>& subsampled_features, 88 | vector<int>& original_classes, 89 | vector<int>& subsampled_classes, 90 | float sampleDl, 91 | int verbose); 92 | 93 | -------------------------------------------------------------------------------- /lib/pointops/src/pointops_api.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/serialize/tensor.h> 2 | #include <torch/extension.h> 3 | 4 | #include "ballquery/ballquery_cuda_kernel.h" 5 | #include "grouping/grouping_cuda_kernel.h" 6 | #include "grouping_int/grouping_int_cuda_kernel.h" 7 | #include "sampling/sampling_cuda_kernel.h" 8 | #include "interpolation/interpolation_cuda_kernel.h" 9 | #include "knnquery/knnquery_cuda_kernel.h" 10 | #include "knnquery_heap/knnquery_heap_cuda_kernel.h" 11 | 12 | #include "labelstat/labelstat_cuda_kernel.h" 13 | #include "featuredistribute/featuredistribute_cuda_kernel.h" 14 | 15 | 16 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 17 | m.def("ballquery_cuda", &ballquery_cuda_fast, "ballquery_cuda_fast"); // name in python, cpp function address, docs 18 | 19 | m.def("knnquery_cuda", &knnquery_cuda, "knnquery_cuda"); 20 | m.def("knnquery_heap_cuda", &knnquery_heap_cuda, "knnquery_heap_cuda"); 21 | 22 | m.def("grouping_forward_cuda", &grouping_forward_cuda_fast, "grouping_forward_cuda_fast"); 23 | m.def("grouping_backward_cuda", &grouping_backward_cuda, "grouping_backward_cuda"); 24 | 25 | m.def("grouping_int_forward_cuda", &grouping_int_forward_cuda_fast, "grouping_int_forward_cuda_fast"); 26 | 27 | m.def("gathering_forward_cuda", &gathering_forward_cuda, "gathering_forward_cuda"); 28 | m.def("gathering_backward_cuda", &gathering_backward_cuda, "gathering_backward_cuda"); 29 | m.def("furthestsampling_cuda", &furthestsampling_cuda, "furthestsampling_cuda"); 30 | 31 | m.def("nearestneighbor_cuda", &nearestneighbor_cuda_fast, "nearestneighbor_cuda_fast"); 32 | m.def("interpolation_forward_cuda", &interpolation_forward_cuda_fast, "interpolation_forward_cuda_fast"); 33 | m.def("interpolation_backward_cuda", &interpolation_backward_cuda, "interpolation_backward_cuda"); 34 | 35 | m.def("labelstat_idx_cuda", &labelstat_idx_cuda_fast, "labelstat_idx_cuda_fast"); 36 | m.def("labelstat_ballrange_cuda", &labelstat_ballrange_cuda_fast, "labelstat_ballrange_cuda_fast"); 37 | m.def("labelstat_and_ballquery_cuda", &labelstat_and_ballquery_cuda_fast, "labelstat_and_ballquery_cuda_fast"); 38 | 39 | m.def("featuredistribute_cuda", &featuredistribute_cuda, "featuredistribute_cuda"); 40 | m.def("featuregather_forward_cuda", &featuregather_forward_cuda, "featuregather_forward_cuda"); 41 | m.def("featuregather_backward_cuda", &featuregather_backward_cuda, "featuregather_backward_cuda"); 42 | } 43 | -------------------------------------------------------------------------------- /lib/pointops2/functions/test_relative_pos_encoding_op_step1_v2.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 80000 8 | N = 3500 9 | hdim = 16 10 | h = 6 11 | L = 31 12 | query = torch.rand(N, h, hdim).cuda() 13 | table_q = torch.rand(L, h, hdim, 3).cuda() 14 | key = torch.rand(N, h, hdim).cuda() 15 | table_k = torch.rand(L, h, hdim, 3).cuda() 16 | 17 | index_q = torch.rand(M) 18 | index_q[index_q < 0] = 0 19 | index_q = (index_q*N).long().cuda() 20 | 21 | index_k = torch.rand(M) 22 | index_k[index_k < 0] = 0 23 | index_k = (index_k*N).long().cuda() 24 | 25 | rel_index = torch.rand(M, 3) 26 | rel_index[rel_index < 0] = 0 27 | rel_index = (rel_index*L).long().cuda() 28 | 29 | query.requires_grad = True 30 | table_q.requires_grad = True 31 | key.requires_grad = True 32 | table_k.requires_grad = True 33 | 34 | output1 = pointops.dot_prod_with_idx(query, index_q.int(), table_q, rel_index.int()) 35 | output2 = pointops.dot_prod_with_idx(key, index_k.int(), table_k, rel_index.int()) 36 | output = output1 + output2 37 | # loss = output.mean() 38 | # loss.backward() 39 | 40 | # print("output.shape: {}, output[:5,:10]: {}".format(output.shape, output[:5,:10])) 41 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 42 | # print("table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) 43 | # print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 44 | # print("table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) 45 | # input() 46 | 47 | # print("query.is_contiguous(): ", query.is_contiguous()) 48 | # print("key.is_contiguous(): ", key.is_contiguous()) 49 | # print("index_0.is_contiguous(): ", index_0.is_contiguous()) 50 | # print("index_1.is_contiguous(): ", index_1.is_contiguous()) 51 | 52 | output_v2 = pointops.dot_prod_with_idx_v2(query, index_q.int(), key, index_k.int(), table_q, table_k, rel_index.int()) 53 | loss = output_v2.mean() 54 | loss.backward() 55 | 56 | print("output_v2.shape: {}, output_v2[:5,:10]: {}".format(output_v2.shape, output_v2[:5,:10])) 57 | print("v2 query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 58 | print("v2 table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) 59 | print("v2 key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 60 | print("v2 table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) 61 | # input() 62 | 63 | print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) 64 | 65 | -------------------------------------------------------------------------------- /lib/pointops/src/featuredistribute/featuredistribute_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "featuredistribute_cuda_kernel.h" 7 | 8 | extern THCState *state; 9 | 10 | #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") 11 | #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") 12 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) 13 | 14 | 15 | void featuredistribute_cuda(int b, int n, int m, at::Tensor max_xyz_tensor, at::Tensor xyz_tensor, at::Tensor distribute_idx_tensor) 16 | { 17 | CHECK_INPUT(max_xyz_tensor); 18 | CHECK_INPUT(xyz_tensor); 19 | 20 | const float *max_xyz = max_xyz_tensor.data(); 21 | const float *xyz = xyz_tensor.data(); 22 | int *distribute_idx = distribute_idx_tensor.data(); 23 | 24 | cudaStream_t stream = 
THCState_getCurrentStream(state); 25 | 26 | featuredistribute_cuda_launcher(b, n, m, max_xyz, xyz, distribute_idx, stream); 27 | } 28 | 29 | 30 | void featuregather_forward_cuda(int b, int n, int m, int c, at::Tensor max_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor distribute_feature_tensor) 31 | { 32 | CHECK_INPUT(max_feature_tensor); 33 | CHECK_INPUT(distribute_idx_tensor); 34 | 35 | const float *max_feature = max_feature_tensor.data(); 36 | const int *distribute_idx = distribute_idx_tensor.data(); 37 | float *distribute_feature = distribute_feature_tensor.data(); 38 | 39 | cudaStream_t stream = THCState_getCurrentStream(state); 40 | 41 | featuregather_forward_cuda_launcher(b, n, m, c, max_feature, distribute_idx, distribute_feature, stream); 42 | } 43 | 44 | 45 | void featuregather_backward_cuda(int b, int n, int m, int c, at::Tensor grad_distribute_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor grad_max_feature_tensor) 46 | { 47 | CHECK_INPUT(grad_distribute_feature_tensor); 48 | CHECK_INPUT(distribute_idx_tensor); 49 | 50 | const float *grad_distribute_feature = grad_distribute_feature_tensor.data(); 51 | const int *distribute_idx = distribute_idx_tensor.data(); 52 | float *grad_max_feature = grad_max_feature_tensor.data(); 53 | 54 | cudaStream_t stream = THCState_getCurrentStream(state); 55 | 56 | featuregather_backward_cuda_launcher(b, n, m, c, grad_distribute_feature, distribute_idx, grad_max_feature, stream); 57 | } -------------------------------------------------------------------------------- /config/s3dis/s3dis_swin3d_transformer.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | data_name: s3dis 3 | data_root: # Fill in the pre-processed data path (which contains the .npy files) 4 | test_area: 5 5 | classes: 13 6 | fea_dim: 6 7 | voxel_size: 0.04 8 | voxel_max: 80000 9 | loop: 30 10 | 11 | TRAIN: 12 | #arch 13 | arch: swin3d_transformer 14 | stem_transformer: True 15 | use_xyz: True 16 | sync_bn: True # adopt sync_bn or not 17 | rel_query: True 18 | rel_key: True 19 | rel_value: True 20 | quant_size: 0.01 21 | num_layers: 4 22 | patch_size: 1 23 | window_size: 4 24 | depths: [2, 2, 6, 2] 25 | channels: [48, 96, 192, 384] 26 | num_heads: [3, 6, 12, 24] 27 | up_k: 3 28 | drop_path_rate: 0.3 29 | concat_xyz: True 30 | grid_size: 0.04 31 | max_batch_points: 140000 32 | max_num_neighbors: 34 # For KPConv 33 | ratio: 0.25 34 | k: 16 35 | 36 | # training 37 | aug: True 38 | transformer_lr_scale: 0.1 39 | jitter_sigma: 0.005 40 | jitter_clip: 0.02 41 | scheduler_update: epoch 42 | scheduler: MultiStep 43 | warmup: linear 44 | warmup_iters: 1500 45 | warmup_ratio: 0.000001 46 | use_amp: True 47 | optimizer: AdamW 48 | ignore_label: 255 49 | train_gpu: [0, 1, 2, 3] 50 | workers: 16 # data loader workers 51 | batch_size: 8 # batch size for training 52 | batch_size_val: 4 # batch size for validation during training, memory and speed tradeoff 53 | base_lr: 0.006 54 | epochs: 100 55 | start_epoch: 0 56 | step_epoch: 30 57 | multiplier: 0.1 58 | momentum: 0.9 59 | weight_decay: 0.01 60 | drop_rate: 0.5 61 | manual_seed: 123 62 | print_freq: 1 63 | save_freq: 1 64 | save_path: runs/s3dis_swin3d_transformer 65 | weight: # path to initial weight (default: none) 66 | resume: # path to latest checkpoint (default: none) 67 | evaluate: True # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend 68 | eval_freq: 1 69 | Distributed: 70 | dist_url: tcp://127.0.0.1:6789 71 | 
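# Note: the Distributed fields below follow the standard PyTorch DDP launch recipe;
# they are presumably forwarded to torch.distributed.init_process_group(backend=dist_backend,
# init_method=dist_url, ...), with world_size and rank recomputed per spawned process
# when multiprocessing_distributed is True.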
dist_backend: 'nccl' 72 | multiprocessing_distributed: True 73 | world_size: 1 74 | rank: 0 75 | 76 | TEST: 77 | test_list: dataset/s3dis/list/val5.txt 78 | test_list_full: dataset/s3dis/list/val5_full.txt 79 | split: val # split in [train, val and test] 80 | test_gpu: [0] 81 | test_workers: 4 82 | batch_size_test: 4 83 | model_path: # Fill the path of the trained .pth file model 84 | save_folder: # Fill the path to store the .npy files for each scene 85 | names_path: data/s3dis/s3dis_names.txt 86 | -------------------------------------------------------------------------------- /config/s3dis/s3dis_stratified_transformer.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | data_name: s3dis 3 | data_root: # Fill in the pre-processed data path (which contains the .npy files) 4 | test_area: 5 5 | classes: 13 6 | fea_dim: 6 7 | voxel_size: 0.04 8 | voxel_max: 80000 9 | loop: 30 10 | 11 | TRAIN: 12 | #arch 13 | arch: stratified_transformer 14 | stem_transformer: True 15 | use_xyz: True 16 | sync_bn: True # adopt sync_bn or not 17 | rel_query: True 18 | rel_key: True 19 | rel_value: True 20 | quant_size: 0.01 21 | downsample_scale: 8 22 | num_layers: 4 23 | patch_size: 1 24 | window_size: 4 25 | depths: [2, 2, 6, 2] 26 | channels: [48, 96, 192, 384] 27 | num_heads: [3, 6, 12, 24] 28 | up_k: 3 29 | drop_path_rate: 0.3 30 | concat_xyz: True 31 | grid_size: 0.04 32 | max_batch_points: 140000 33 | max_num_neighbors: 34 # For KPConv 34 | ratio: 0.25 35 | k: 16 36 | 37 | # training 38 | aug: True 39 | transformer_lr_scale: 0.1 40 | jitter_sigma: 0.005 41 | jitter_clip: 0.02 42 | scheduler_update: epoch 43 | scheduler: MultiStep 44 | warmup: linear 45 | warmup_iters: 1500 46 | warmup_ratio: 0.000001 47 | use_amp: True 48 | optimizer: AdamW 49 | ignore_label: 255 50 | train_gpu: [0, 1, 2, 3] 51 | workers: 16 # data loader workers 52 | batch_size: 8 # batch size for training 53 | batch_size_val: 4 # batch size for validation during training, memory and speed tradeoff 54 | base_lr: 0.006 55 | epochs: 100 56 | start_epoch: 0 57 | step_epoch: 30 58 | multiplier: 0.1 59 | momentum: 0.9 60 | weight_decay: 0.01 61 | drop_rate: 0.5 62 | manual_seed: 123 63 | print_freq: 1 64 | save_freq: 1 65 | save_path: runs/s3dis_stratified_transformer 66 | weight: # path to initial weight (default: none) 67 | resume: # path to latest checkpoint (default: none) 68 | evaluate: True # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend 69 | eval_freq: 1 70 | Distributed: 71 | dist_url: tcp://127.0.0.1:6789 72 | dist_backend: 'nccl' 73 | multiprocessing_distributed: True 74 | world_size: 1 75 | rank: 0 76 | 77 | TEST: 78 | test_list: dataset/s3dis/list/val5.txt 79 | test_list_full: dataset/s3dis/list/val5_full.txt 80 | split: val # split in [train, val and test] 81 | test_gpu: [0] 82 | test_workers: 4 83 | batch_size_test: 4 84 | model_path: # Fill the path of the trained .pth file model 85 | save_folder: # Fill the path to store the .npy files for each scene 86 | names_path: data/s3dis/s3dis_names.txt 87 | -------------------------------------------------------------------------------- /config/scannetv2/scannetv2_stratified_transformer.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | data_name: scannetv2 3 | data_root: # Fill in the pre-processed data path (which contains the 'train', 'val', 'test' directories) 4 | classes: 20 5 | fea_dim: 6 6 | voxel_size: 0.02 7 | voxel_max: 120000 8 | 
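# loop appears to multiply the dataset length, i.e. each epoch traverses every scene
# this many times (6 here vs. 30 for S3DIS, which has far fewer scenes).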
loop: 6 9 | 10 | TRAIN: 11 | # arch 12 | arch: stratified_transformer 13 | stem_transformer: False 14 | use_xyz: True 15 | sync_bn: True # adopt sync_bn or not 16 | rel_query: True 17 | rel_key: True 18 | rel_value: True 19 | quant_size: 0.005 20 | downsample_scale: 4 21 | num_layers: 5 22 | patch_size: 1 23 | window_size: 5 24 | depths: [3,3,9,3,3] 25 | channels: [48, 96, 192, 384, 384] 26 | num_heads: [3, 6, 12, 24, 24] 27 | up_k: 3 28 | drop_path_rate: 0.3 29 | concat_xyz: True 30 | grid_size: 0.02 31 | max_batch_points: 250000 32 | max_num_neighbors: 34 # For KPConv 33 | ratio: 0.25 34 | k: 16 35 | 36 | # training 37 | aug: True 38 | transformer_lr_scale: 0.1 39 | scheduler_update: step 40 | scheduler: MultiStepWithWarmup 41 | warmup: linear 42 | warmup_iters: 3000 43 | warmup_ratio: 0.000001 44 | use_amp: True 45 | optimizer: AdamW #SGD 46 | train_gpu: [0, 1, 2, 3] 47 | workers: 16 # data loader workers 48 | batch_size: 8 # batch size for training 49 | batch_size_val: 4 # batch size for validation during training, memory and speed tradeoff 50 | base_lr: 0.006 51 | epochs: 100 52 | start_epoch: 0 53 | step_epoch: 30 54 | multiplier: 0.1 55 | momentum: 0.9 56 | weight_decay: 0.05 57 | drop_rate: 0.5 58 | 59 | ignore_label: -100 #255 60 | manual_seed: 123 61 | print_freq: 1 62 | save_freq: 1 63 | save_path: runs/sacnnetv2_stratified_transformer 64 | weight: # path to initial weight (default: none) 65 | resume: # path to latest checkpoint (default: none) 66 | evaluate: True # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend 67 | eval_freq: 1 68 | Distributed: 69 | dist_url: tcp://127.0.0.1:6789 70 | dist_backend: 'nccl' 71 | multiprocessing_distributed: True 72 | world_size: 1 73 | rank: 0 74 | 75 | TEST: 76 | data_root_val: # Fill the path that contains the scenes of the validation set (e.g., "[YOUR PATH]/val") 77 | split: val # split in [train, val and test] 78 | test_gpu: [0] 79 | test_workers: 4 80 | batch_size_test: 4 81 | model_path: # Fill the path of the trained .pth file model 82 | save_folder: # Fill the path to store the .npy files for each scene 83 | names_path: data/scannet/scannet_names.txt 84 | -------------------------------------------------------------------------------- /config/scannetv2/scannetv2_swin3d_transformer.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | data_name: scannetv2 3 | data_root: # Fill in the pre-processed data path (which contains the 'train', 'val', 'test' directories) 4 | classes: 20 5 | fea_dim: 6 6 | voxel_size: 0.02 7 | voxel_max: 120000 8 | loop: 6 9 | 10 | TRAIN: 11 | # arch 12 | arch: swin3d_transformer 13 | stem_transformer: False 14 | use_xyz: True 15 | sync_bn: True # adopt sync_bn or not 16 | rel_query: True 17 | rel_key: True 18 | rel_value: True 19 | quant_size: 0.005 20 | num_layers: 5 21 | patch_size: 1 22 | window_size: 5 23 | depths: [3,3,9,3,3] 24 | channels: [48, 96, 192, 384, 384] 25 | num_heads: [3, 6, 12, 24, 24] 26 | up_k: 3 27 | drop_path_rate: 0.3 28 | concat_xyz: True 29 | grid_size: 0.02 30 | max_batch_points: 250000 31 | max_num_neighbors: 34 # For KPConv 32 | ratio: 0.25 33 | k: 16 34 | 35 | # training 36 | aug: True 37 | transformer_lr_scale: 0.1 38 | scheduler_update: step 39 | scheduler: MultiStepWithWarmup 40 | warmup: linear 41 | warmup_iters: 3000 42 | warmup_ratio: 0.000001 43 | use_amp: True 44 | optimizer: AdamW #SGD 45 | train_gpu: [0, 1, 2, 3] 46 | workers: 16 # data loader workers 47 | batch_size: 8 # batch size for 
training 48 | batch_size_val: 4 # batch size for validation during training, memory and speed tradeoff 49 | base_lr: 0.006 50 | epochs: 100 51 | start_epoch: 0 52 | step_epoch: 30 53 | multiplier: 0.1 54 | momentum: 0.9 55 | weight_decay: 0.05 56 | drop_rate: 0.5 57 | 58 | ignore_label: -100 #255 59 | manual_seed: 123 60 | print_freq: 1 61 | save_freq: 1 62 | save_path: runs/sacnnetv2_swin3d_transformer 63 | weight: # path to initial weight (default: none) 64 | resume: # path to latest checkpoint (default: none) 65 | evaluate: True # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend 66 | eval_freq: 1 67 | Distributed: 68 | dist_url: tcp://127.0.0.1:6789 69 | dist_backend: 'nccl' 70 | multiprocessing_distributed: True 71 | world_size: 1 72 | rank: 0 73 | 74 | TEST: 75 | data_root_val: # Fill the path that contains the scenes of the validation set (e.g., "[YOUR PATH]/val") 76 | test_list: 77 | test_list_full: 78 | split: val # split in [train, val and test] 79 | test_gpu: [0] 80 | test_workers: 4 81 | batch_size_test: 4 82 | model_path: # Fill the path of the trained .pth file model 83 | save_folder: # Fill the path to store the .npy files for each scene 84 | names_path: data/scannet/scannet_names.txt 85 | -------------------------------------------------------------------------------- /lib/pointops2/src/subtraction/subtraction_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "subtraction_cuda_kernel.h" 3 | 4 | 5 | __global__ void subtraction_forward_cuda_kernel(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) { 6 | // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c) 7 | int index = blockIdx.x * blockDim.x + threadIdx.x; 8 | if (index >= n * nsample * c) return; 9 | const int c_idx = index % c; 10 | const int nsample_idx = (index / c) % nsample; 11 | const int n_idx = index / nsample / c; 12 | const int idx_idx = n_idx * nsample + nsample_idx; 13 | const int input1_idx = n_idx * c + c_idx; 14 | const int input2_idx = idx[idx_idx] * c + c_idx; 15 | output[index] = input1[input1_idx] - input2[input2_idx]; 16 | } 17 | 18 | __global__ void subtraction_backward_cuda_kernel(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) { 19 | // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) 20 | int index = blockIdx.x * blockDim.x + threadIdx.x; 21 | if (index >= n * nsample * c) return; 22 | const int c_idx = index % c; 23 | const int nsample_idx = (index / c) % nsample; 24 | const int n_idx = index / nsample / c; 25 | const int idx_idx = n_idx * nsample + nsample_idx; 26 | const int input1_idx = n_idx * c + c_idx; 27 | const int input2_idx = idx[idx_idx] * c + c_idx; 28 | atomicAdd(grad_input1 + input1_idx, grad_output[index]); 29 | atomicAdd(grad_input2 + input2_idx, -grad_output[index]); 30 | } 31 | 32 | void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) { 33 | // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c) 34 | dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK)); 35 | dim3 threads(THREADS_PER_BLOCK); 36 | subtraction_forward_cuda_kernel<<>>(n, nsample, c, input1, input2, idx, output); 37 | } 38 | 39 | void subtraction_backward_cuda_launcher(int n, int nsample, int c, 
const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) { 40 | // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) 41 | dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK)); 42 | dim3 threads(THREADS_PER_BLOCK); 43 | subtraction_backward_cuda_kernel<<>>(n, nsample, c, idx, grad_output, grad_input1, grad_input2); 44 | } 45 | -------------------------------------------------------------------------------- /lib/pointops/src/interpolation/interpolation_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "interpolation_cuda_kernel.h" 6 | 7 | extern THCState *state; 8 | 9 | void nearestneighbor_cuda(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) 10 | { 11 | const float *unknown = unknown_tensor.data(); 12 | const float *known = known_tensor.data(); 13 | float *dist2 = dist2_tensor.data(); 14 | int *idx = idx_tensor.data(); 15 | nearestneighbor_cuda_launcher(b, n, m, unknown, known, dist2, idx); 16 | } 17 | 18 | void interpolation_forward_cuda(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor) 19 | { 20 | const float *points = points_tensor.data(); 21 | const float *weight = weight_tensor.data(); 22 | float *out = out_tensor.data(); 23 | const int *idx = idx_tensor.data(); 24 | interpolation_forward_cuda_launcher(b, c, m, n, points, idx, weight, out); 25 | } 26 | 27 | void interpolation_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor) 28 | { 29 | const float *grad_out = grad_out_tensor.data(); 30 | const float *weight = weight_tensor.data(); 31 | float *grad_points = grad_points_tensor.data(); 32 | const int *idx = idx_tensor.data(); 33 | interpolation_backward_cuda_launcher(b, c, n, m, grad_out, idx, weight, grad_points); 34 | } 35 | 36 | void nearestneighbor_cuda_fast(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) { 37 | const float *unknown = unknown_tensor.data(); 38 | const float *known = known_tensor.data(); 39 | float *dist2 = dist2_tensor.data(); 40 | int *idx = idx_tensor.data(); 41 | nearestneighbor_cuda_launcher_fast(b, n, m, unknown, known, dist2, idx); 42 | } 43 | 44 | void interpolation_forward_cuda_fast(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor) { 45 | 46 | const float *points = points_tensor.data(); 47 | const float *weight = weight_tensor.data(); 48 | float *out = out_tensor.data(); 49 | const int *idx = idx_tensor.data(); 50 | interpolation_forward_cuda_launcher_fast(b, c, m, n, points, idx, weight, out); 51 | } -------------------------------------------------------------------------------- /lib/pointops2/functions/test_relative_pos_encoding_op_step2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 80000 8 | N = 3500 9 | hdim = 16 10 | h = 6 11 | L = 31 12 | attn = torch.rand(M, h).cuda() 13 | v = torch.rand(N, h, hdim).cuda() 14 | table = torch.rand(L, h, hdim, 3).cuda() 15 | 16 | index_0 = torch.rand(M) 17 
| index_0[index_0 < 0] = 0 18 | index_0 = (index_0*N).long().cuda() 19 | 20 | index_1 = torch.rand(M) 21 | index_1[index_1 < 0] = 0 22 | index_1 = (index_1*N).long().cuda() 23 | 24 | rel_index = torch.rand(M, 3) 25 | rel_index[rel_index < 0] = 0 26 | rel_index = (rel_index*L).long().cuda() 27 | 28 | attn.requires_grad = True 29 | v.requires_grad = True 30 | table.requires_grad = True 31 | 32 | v_flat = v[index_1] #[M, h, hdim] 33 | table_x, table_y, table_z = table[:,:,:,0], table[:,:,:,1], table[:,:,:,2] #[L, h, hdim] 34 | rel_index_x, rel_index_y, rel_index_z = rel_index[:,0], rel_index[:,1], rel_index[:,2] #[M] 35 | rel_pos_encoding = table_x[rel_index_x] + table_y[rel_index_y] + table_z[rel_index_z] #[M, h, hdim] 36 | v_flat_new = v_flat + rel_pos_encoding #[M, h, hdim] 37 | output = attn.unsqueeze(-1) * v_flat_new #[M, h, hdim] 38 | output = scatter_sum(src=output, index=index_0, dim=0, dim_size=N) #[N, h, hdim] 39 | loss = output.mean() 40 | loss.backward() 41 | 42 | print("output.shape: {}, output[:5,:10,:5]: {}".format(output.shape, output[:5,:10, :5])) 43 | print("attn.grad[:5, :3]: ", attn.grad[:5, :3]) 44 | print("v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) 45 | print("table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) 46 | input() 47 | 48 | # print("query.is_contiguous(): ", query.is_contiguous()) 49 | # print("key.is_contiguous(): ", key.is_contiguous()) 50 | # print("index_0.is_contiguous(): ", index_0.is_contiguous()) 51 | # print("index_1.is_contiguous(): ", index_1.is_contiguous()) 52 | 53 | # output_v2 = pointops.attention_step2_with_rel_pos_value(attn, v, index_0.int(), index_1.int(), table, rel_index.int()) 54 | # loss = output_v2.mean() 55 | # loss.backward() 56 | 57 | # print("output_v2.shape: {}, output_v2[:5,:10,:5]: {}".format(output_v2.shape, output_v2[:5,:10,:5])) 58 | # print("v2 attn.grad[:5, :3]: ", attn.grad[:5, :3]) 59 | # print("v2 v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) 60 | # print("v2 table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) 61 | # input() 62 | 63 | # print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) 64 | 65 | # print("torch.max((attn_flat-attn_flat_v2)**2): ", torch.max((attn_flat-attn_flat_v2)**2)) 66 | 67 | -------------------------------------------------------------------------------- /lib/pointops/src/knnquery/knnquery_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "knnquery_cuda_kernel.h" 3 | 4 | // input: xyz (b, n, 3) new_xyz (b, m, 3) 5 | // output: idx (b, m, nsample) dist2 (b, m, nsample) 6 | __global__ void knnquery_cuda_kernel(int b, int n, int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, int *__restrict__ idx, float *__restrict__ dist2) { 7 | int bs_idx = blockIdx.y; 8 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 9 | if (bs_idx >= b || pt_idx >= m) return; 10 | 11 | new_xyz += bs_idx * m * 3 + pt_idx * 3; 12 | xyz += bs_idx * n * 3; 13 | idx += bs_idx * m * nsample + pt_idx * nsample; 14 | 15 | float new_x = new_xyz[0]; 16 | float new_y = new_xyz[1]; 17 | float new_z = new_xyz[2]; 18 | 19 | //double* best = new double[nsample]; 20 | //int* besti = new int[nsample]; 21 | double best[200]; 22 | int besti[200]; 23 | for(int i = 0; i < nsample; i++){ 24 | best[i] = 1e40; 25 | besti[i] = 0; 26 | } 27 | for(int k = 0; k < n; k++){ 28 | float x = xyz[k * 3 + 0]; 29 | float y = xyz[k * 3 + 1]; 30 | float z = xyz[k * 3 + 2]; 31 | float d2 = (new_x - x) * (new_x 
- x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 32 | for(int j = 0; j < nsample; j++){ 33 | if(d2 < best[j]){ 34 | for(int i = nsample - 1; i > j; i--){ 35 | best[i] = best[i - 1]; 36 | besti[i] = besti[i - 1]; 37 | } 38 | best[j] = d2; 39 | besti[j] = k; 40 | break; 41 | } 42 | } 43 | } 44 | for(int i = 0; i < nsample; i++){ 45 | idx[i] = besti[i]; 46 | dist2[i] = best[i]; 47 | } 48 | //delete []best; 49 | //delete []besti; 50 | } 51 | 52 | 53 | void knnquery_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream) { 54 | // param new_xyz: (B, m, 3) 55 | // param xyz: (B, n, 3) 56 | // param idx: (B, m, nsample) 57 | 58 | cudaError_t err; 59 | 60 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 61 | dim3 threads(THREADS_PER_BLOCK); 62 | 63 | knnquery_cuda_kernel<<>>(b, n, m, nsample, xyz, new_xyz, idx, dist2); 64 | // cudaDeviceSynchronize(); // for using printf in kernel function 65 | 66 | err = cudaGetLastError(); 67 | if (cudaSuccess != err) { 68 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 69 | exit(-1); 70 | } 71 | } -------------------------------------------------------------------------------- /lib/pointops/src/grouping_int/grouping_int_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "grouping_int_cuda_kernel.h" 3 | 4 | // input: points(b, c, n) idx(b, m, nsample) 5 | // output: out(b, c, m, nsample) 6 | __global__ void grouping_int_forward_cuda_kernel(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out) 7 | { 8 | int batch_index = blockIdx.x; 9 | points += batch_index * n * c; 10 | idx += batch_index * m * nsample; 11 | out += batch_index * m * nsample * c; 12 | const int index = threadIdx.y * blockDim.x + threadIdx.x; 13 | const int stride = blockDim.y * blockDim.x; 14 | for (int i = index; i < c * m; i += stride) 15 | { 16 | const int l = i / m; 17 | const int j = i % m; 18 | for (int k = 0; k < nsample; ++k) 19 | { 20 | int ii = idx[j * nsample + k]; 21 | out[(l * m + j) * nsample + k] = points[l * n + ii]; 22 | } 23 | } 24 | } 25 | 26 | 27 | void grouping_int_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out) 28 | { 29 | grouping_int_forward_cuda_kernel<<>>(b, c, n, m, nsample, points, idx, out); 30 | } 31 | 32 | 33 | __global__ void grouping_int_forward_cuda_kernel_fast(int b, int c, int n, int npoints, int nsample, const long int *__restrict__ points, const int *__restrict__ idx, long int *__restrict__ out) 34 | { 35 | int bs_idx = blockIdx.z; 36 | int c_idx = blockIdx.y; 37 | int index = blockIdx.x * blockDim.x + threadIdx.x; 38 | int pt_idx = index / nsample; 39 | if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; 40 | 41 | int sample_idx = index % nsample; 42 | 43 | idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; 44 | int in_idx = bs_idx * c * n + c_idx * n + idx[0]; 45 | int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; 46 | 47 | out[out_idx] = points[in_idx]; 48 | } 49 | 50 | 51 | void grouping_int_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const long int *points, const int *idx, long int *out) 52 | { 53 | cudaError_t err; 54 | 55 | dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // 
blockIdx.x(col), blockIdx.y(row) 56 | dim3 threads(THREADS_PER_BLOCK); 57 | 58 | grouping_int_forward_cuda_kernel_fast<<>>(b, c, n, npoints, nsample, points, idx, out); 59 | // cudaDeviceSynchronize(); // for using printf in kernel function 60 | err = cudaGetLastError(); 61 | if (cudaSuccess != err) { 62 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 63 | exit(-1); 64 | } 65 | } -------------------------------------------------------------------------------- /lib/pointops/src/labelstat/labelstat_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "labelstat_cuda_kernel.h" 7 | 8 | extern THCState *state; 9 | 10 | #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") 11 | #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") 12 | #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) 13 | 14 | void labelstat_idx_cuda_fast(int b, int n, int m, int nsample, int nclass, 15 | at::Tensor label_stat_tensor, at::Tensor idx_tensor, at::Tensor new_label_stat_tensor) 16 | { 17 | CHECK_INPUT(label_stat_tensor); 18 | CHECK_INPUT(idx_tensor); 19 | 20 | const int *label_stat = label_stat_tensor.data(); 21 | const int *idx = idx_tensor.data(); 22 | int *new_label_stat = new_label_stat_tensor.data(); 23 | 24 | cudaStream_t stream = THCState_getCurrentStream(state); 25 | 26 | labelstat_idx_cuda_launcher_fast(b, n, m, nsample, nclass, label_stat, idx, new_label_stat, stream); 27 | } 28 | 29 | void labelstat_ballrange_cuda_fast(int b, int n, int m, float radius, int nclass, 30 | at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor new_label_stat_tensor) 31 | { 32 | CHECK_INPUT(new_xyz_tensor); 33 | CHECK_INPUT(xyz_tensor); 34 | CHECK_INPUT(label_stat_tensor); 35 | 36 | const float *new_xyz = new_xyz_tensor.data(); 37 | const float *xyz = xyz_tensor.data(); 38 | const int *label_stat = label_stat_tensor.data(); 39 | int *new_label_stat = new_label_stat_tensor.data(); 40 | 41 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(state); 42 | 43 | labelstat_ballrange_cuda_launcher_fast(b, n, m, radius, nclass, new_xyz, xyz, label_stat, new_label_stat, stream); 44 | } 45 | 46 | void labelstat_and_ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, int nclass, 47 | at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor idx_tensor, at::Tensor new_label_stat_tensor) 48 | { 49 | CHECK_INPUT(new_xyz_tensor); 50 | CHECK_INPUT(xyz_tensor); 51 | CHECK_INPUT(label_stat_tensor); 52 | CHECK_INPUT(idx_tensor); 53 | 54 | const float *new_xyz = new_xyz_tensor.data(); 55 | const float *xyz = xyz_tensor.data(); 56 | const int *label_stat = label_stat_tensor.data(); 57 | int *idx = idx_tensor.data(); 58 | int *new_label_stat = new_label_stat_tensor.data(); 59 | 60 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(state); 61 | 62 | labelstat_and_ballquery_cuda_launcher_fast(b, n, m, radius, nsample, nclass, new_xyz, xyz, label_stat, idx, new_label_stat, stream); 63 | } 64 | -------------------------------------------------------------------------------- /lib/pointops2/src/attention/attention_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "attention_cuda_kernel.h" 6 | 7 | void attention_step1_forward_cuda(int N, int M, int h, int C, 
at::Tensor q_tensor, at::Tensor k_tensor, 8 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor) 9 | { 10 | const float *q = q_tensor.data_ptr(); 11 | const float *k = k_tensor.data_ptr(); 12 | const int *index0 = index0_tensor.data_ptr(); 13 | const int *index1 = index1_tensor.data_ptr(); 14 | float *attn = attn_tensor.data_ptr(); 15 | attention_step1_forward_cuda_launcher(N, M, h, C, q, k, index0, index1, attn); 16 | } 17 | 18 | void attention_step1_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, 19 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, 20 | at::Tensor grad_q_tensor, at::Tensor grad_k_tensor) 21 | { 22 | const float *grad_out = grad_out_tensor.data_ptr(); 23 | const int *index0 = index0_tensor.data_ptr(); 24 | const int *index1 = index1_tensor.data_ptr(); 25 | const float *q = q_tensor.data_ptr(); 26 | const float *k = k_tensor.data_ptr(); 27 | float *grad_q = grad_q_tensor.data_ptr(); 28 | float *grad_k = grad_k_tensor.data_ptr(); 29 | attention_step1_backward_cuda_launcher(N, M, h, C, grad_out, index0, index1, q, k, grad_q, grad_k); 30 | } 31 | 32 | void attention_step2_forward_cuda(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, 33 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor) 34 | { 35 | const float *attn = attn_tensor.data_ptr(); 36 | const float *v = v_tensor.data_ptr(); 37 | const int *index0 = index0_tensor.data_ptr(); 38 | const int *index1 = index1_tensor.data_ptr(); 39 | float *output = output_tensor.data_ptr(); 40 | attention_step2_forward_cuda_launcher(N, M, h, C, attn, v, index0, index1, output); 41 | } 42 | 43 | 44 | void attention_step2_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, 45 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, 46 | at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor) 47 | { 48 | const float *grad_out = grad_out_tensor.data_ptr(); 49 | const int *index0 = index0_tensor.data_ptr(); 50 | const int *index1 = index1_tensor.data_ptr(); 51 | const float *attn = attn_tensor.data_ptr(); 52 | const float *v = v_tensor.data_ptr(); 53 | float *grad_attn = grad_attn_tensor.data_ptr(); 54 | float *grad_v = grad_v_tensor.data_ptr(); 55 | attention_step2_backward_cuda_launcher(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); 56 | } 57 | -------------------------------------------------------------------------------- /lib/pointops2/functions/test_attention_op_step1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 800000 8 | N = 35000 9 | C = 96 10 | h = 6 11 | query = torch.rand(N, h, C//h).cuda() 12 | key = torch.rand(N, h, C//h).cuda() 13 | 14 | index_0 = torch.rand(M) 15 | index_0[index_0 < 0] = 0 16 | index_0 = (index_0*N).long().cuda() 17 | 18 | index_1 = torch.rand(M) 19 | index_1[index_1 < 0] = 0 20 | index_1 = (index_1*N).long().cuda() 21 | 22 | query.requires_grad = True 23 | key.requires_grad = True 24 | 25 | # rearrange index for acceleration 26 | index_0, indices = torch.sort(index_0) #[M,] 27 | index_1 = index_1[indices] #[M,] 28 | index_0_counts = index_0.bincount() 29 | 30 | print("index_0_counts.shape: ", index_0_counts.shape) 31 | 32 | n_max = index_0_counts.max() 33 | index_0_offsets = 
index_0_counts.cumsum(dim=-1) #[N] 34 | 35 | print("v1 index_0_offsets.shape: ", index_0_offsets.shape) 36 | 37 | index_0_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_0_offsets], 0) #[N+1] 38 | 39 | # print("index_0[:100]: ", index_0[:100]) 40 | print("n_max: ", n_max) 41 | print("index_0_offsets.shape: ", index_0_offsets.shape) 42 | # input() 43 | 44 | print("index_0_offsets[:100]: ", index_0_offsets[:100]) 45 | print("index_1[300:320]: ", index_1[300:320]) 46 | 47 | 48 | attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) 49 | # loss = attn_flat.sum() 50 | # loss.backward() 51 | print("attn_flat.shape: {}, attn_flat[300:320,:10]: {}".format(attn_flat.shape, attn_flat[300:320,:10])) 52 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 53 | # print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 54 | # input() 55 | 56 | print("query.is_contiguous(): ", query.is_contiguous()) 57 | print("key.is_contiguous(): ", key.is_contiguous()) 58 | print("index_0.is_contiguous(): ", index_0.is_contiguous()) 59 | print("index_1.is_contiguous(): ", index_1.is_contiguous()) 60 | 61 | attn_flat_v2 = pointops.attention_step1_v2(query.float(), key.float(), index_1.int(), index_0_offsets.int(), n_max) 62 | # loss = attn_flat_v2.sum() 63 | # loss.backward() 64 | print("attn_flat_v2.shape: {}, attn_flat_v2[300:320,:10]: {}".format(attn_flat_v2.shape, attn_flat_v2[300:320,:10])) 65 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 66 | # print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 67 | # input() 68 | 69 | mask = attn_flat_v2.sum(-1) != 0 70 | print("mask.sum(): ", mask.sum()) 71 | print("attn_flat_v2[mask] - attn_flat[mask]: ", ((attn_flat_v2[mask] - attn_flat[mask])**2).max()) 72 | 73 | 74 | print("((attn_flat-attn_flat_v2)**2 < 1e-8).all(): ", ((attn_flat-attn_flat_v2)**2 < 1e-8).all()) 75 | 76 | selected = 10000 77 | print("torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0): ", torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0)) 78 | 79 | -------------------------------------------------------------------------------- /lib/pointops2/src/attention_v2/attention_cuda_v2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "attention_cuda_kernel_v2.h" 6 | 7 | void attention_step1_forward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor q_tensor, at::Tensor k_tensor, 8 | at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor attn_tensor) 9 | { 10 | const float *q = q_tensor.data_ptr(); 11 | const float *k = k_tensor.data_ptr(); 12 | const int *index0_offsets = index0_tensor_offsets.data_ptr(); 13 | const int *index1 = index1_tensor.data_ptr(); 14 | float *attn = attn_tensor.data_ptr(); 15 | attention_step1_forward_cuda_launcher_v2(N, M, h, C, n_max, q, k, index0_offsets, index1, attn); 16 | } 17 | 18 | void attention_step1_backward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor grad_out_tensor, 19 | at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, 20 | at::Tensor grad_q_tensor, at::Tensor grad_k_tensor) 21 | { 22 | const float *grad_out = grad_out_tensor.data_ptr(); 23 | const int *index0_offsets = index0_tensor_offsets.data_ptr(); 24 | const int *index1 = index1_tensor.data_ptr(); 25 | const float *q = q_tensor.data_ptr(); 26 | const float *k = k_tensor.data_ptr(); 27 | float 
*grad_q = grad_q_tensor.data_ptr(); 28 | float *grad_k = grad_k_tensor.data_ptr(); 29 | attention_step1_backward_cuda_launcher_v2(N, M, h, C, n_max, grad_out, index0_offsets, index1, q, k, grad_q, grad_k); 30 | } 31 | 32 | void attention_step2_forward_cuda_v2(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, 33 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor) 34 | { 35 | const float *attn = attn_tensor.data_ptr(); 36 | const float *v = v_tensor.data_ptr(); 37 | const int *index0 = index0_tensor.data_ptr(); 38 | const int *index1 = index1_tensor.data_ptr(); 39 | float *output = output_tensor.data_ptr(); 40 | attention_step2_forward_cuda_launcher_v2(N, M, h, C, attn, v, index0, index1, output); 41 | } 42 | 43 | 44 | void attention_step2_backward_cuda_v2(int N, int M, int h, int C, at::Tensor grad_out_tensor, 45 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, 46 | at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor) 47 | { 48 | const float *grad_out = grad_out_tensor.data_ptr(); 49 | const int *index0 = index0_tensor.data_ptr(); 50 | const int *index1 = index1_tensor.data_ptr(); 51 | const float *attn = attn_tensor.data_ptr(); 52 | const float *v = v_tensor.data_ptr(); 53 | float *grad_attn = grad_attn_tensor.data_ptr(); 54 | float *grad_v = grad_v_tensor.data_ptr(); 55 | attention_step2_backward_cuda_launcher_v2(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); 56 | } 57 | -------------------------------------------------------------------------------- /lib/pointops2/functions/test_relative_pos_encoding_op_step2_v2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 80000 8 | N = 3500 9 | hdim = 16 10 | h = 6 11 | L = 31 12 | attn = torch.rand(M, h).cuda() 13 | v = torch.rand(N, h, hdim).cuda() 14 | table = torch.rand(L, h, hdim, 3).cuda() 15 | 16 | index_0 = torch.rand(M) 17 | index_0[index_0 < 0] = 0 18 | index_0 = (index_0*N).long().cuda() 19 | 20 | index_1 = torch.rand(M) 21 | index_1[index_1 < 0] = 0 22 | index_1 = (index_1*N).long().cuda() 23 | 24 | rel_index = torch.rand(M, 3) 25 | rel_index[rel_index < 0] = 0 26 | rel_index = (rel_index*L).long().cuda() 27 | 28 | 29 | # rearrange index for acceleration 30 | index_0, indices = torch.sort(index_0) #[M,] 31 | index_1 = index_1[indices] #[M,] 32 | rel_index = rel_index[indices] 33 | index_0_counts = index_0.bincount() 34 | 35 | print("index_0_counts.shape: ", index_0_counts.shape) 36 | 37 | n_max = index_0_counts.max() 38 | index_0_offsets = index_0_counts.cumsum(dim=-1) #[N] 39 | 40 | print("v1 index_0_offsets.shape: ", index_0_offsets.shape) 41 | 42 | index_0_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_0_offsets], 0) #[N+1] 43 | 44 | 45 | attn.requires_grad = True 46 | v.requires_grad = True 47 | table.requires_grad = True 48 | 49 | 50 | output = pointops.attention_step2_with_rel_pos_value(attn, v, index_0.int(), index_1.int(), table, rel_index.int()) 51 | loss = output.mean() 52 | loss.backward() 53 | 54 | print("output.shape: {}, output[:5,:10,:5]: {}".format(output.shape, output[:5,:10, :5])) 55 | print("attn.grad[:5, :3]: ", attn.grad[:5, :3]) 56 | print("v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) 57 | print("table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) 58 | # input() 59 
| 60 | attn_grad = attn.grad.clone() 61 | v_grad = v.grad.clone() 62 | table_grad = table.grad.clone() 63 | 64 | attn.grad.zero_() 65 | v.grad.zero_() 66 | table.grad.zero_() 67 | 68 | # print("query.is_contiguous(): ", query.is_contiguous()) 69 | # print("key.is_contiguous(): ", key.is_contiguous()) 70 | # print("index_0.is_contiguous(): ", index_0.is_contiguous()) 71 | # print("index_1.is_contiguous(): ", index_1.is_contiguous()) 72 | 73 | output_v2 = pointops.attention_step2_with_rel_pos_value_v2(attn, v, index_0_offsets.int(), n_max, index_1.int(), table, rel_index.int()) 74 | loss = output_v2.mean() 75 | loss.backward() 76 | 77 | print("output_v2.shape: {}, output_v2[:5,:10,:5]: {}".format(output_v2.shape, output_v2[:5,:10,:5])) 78 | print("v2 attn.grad[:5, :3]: ", attn.grad[:5, :3]) 79 | print("v2 v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) 80 | print("v2 table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) 81 | # input() 82 | 83 | print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) 84 | 85 | print("((attn_grad-attn.grad)**2).max(): ", ((attn_grad-attn.grad)**2).max()) 86 | 87 | print("((v_grad-v.grad)**2).max(): ", ((v_grad-v.grad)**2).max()) 88 | 89 | print("((table_grad-table.grad)**2).max(): ", ((table_grad-table.grad)**2).max()) 90 | 91 | # print("torch.max((attn_flat-attn_flat_v2)**2): ", torch.max((attn_flat-attn_flat_v2)**2)) 92 | 93 | -------------------------------------------------------------------------------- /lib/cpp_wrappers/cpp_utils/cloud/cloud.h: -------------------------------------------------------------------------------- 1 | // 2 | // 3 | // 0==========================0 4 | // | Local feature test | 5 | // 0==========================0 6 | // 7 | // version 1.0 : 8 | // > 9 | // 10 | //--------------------------------------------------- 11 | // 12 | // Cloud header 13 | // 14 | //---------------------------------------------------- 15 | // 16 | // Hugues THOMAS - 10/02/2017 17 | // 18 | 19 | 20 | # pragma once 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include 32 | 33 | 34 | 35 | 36 | // Point class 37 | // *********** 38 | 39 | 40 | class PointXYZ 41 | { 42 | public: 43 | 44 | // Elements 45 | // ******** 46 | 47 | float x, y, z; 48 | 49 | 50 | // Methods 51 | // ******* 52 | 53 | // Constructor 54 | PointXYZ() { x = 0; y = 0; z = 0; } 55 | PointXYZ(float x0, float y0, float z0) { x = x0; y = y0; z = z0; } 56 | 57 | // array type accessor 58 | float operator [] (int i) const 59 | { 60 | if (i == 0) return x; 61 | else if (i == 1) return y; 62 | else return z; 63 | } 64 | 65 | // opperations 66 | float dot(const PointXYZ P) const 67 | { 68 | return x * P.x + y * P.y + z * P.z; 69 | } 70 | 71 | float sq_norm() 72 | { 73 | return x*x + y*y + z*z; 74 | } 75 | 76 | PointXYZ cross(const PointXYZ P) const 77 | { 78 | return PointXYZ(y*P.z - z*P.y, z*P.x - x*P.z, x*P.y - y*P.x); 79 | } 80 | 81 | PointXYZ& operator+=(const PointXYZ& P) 82 | { 83 | x += P.x; 84 | y += P.y; 85 | z += P.z; 86 | return *this; 87 | } 88 | 89 | PointXYZ& operator-=(const PointXYZ& P) 90 | { 91 | x -= P.x; 92 | y -= P.y; 93 | z -= P.z; 94 | return *this; 95 | } 96 | 97 | PointXYZ& operator*=(const float& a) 98 | { 99 | x *= a; 100 | y *= a; 101 | z *= a; 102 | return *this; 103 | } 104 | }; 105 | 106 | 107 | // Point Opperations 108 | // ***************** 109 | 110 | inline PointXYZ operator + (const PointXYZ A, const PointXYZ B) 111 | { 112 | return PointXYZ(A.x + B.x, A.y + B.y, A.z 
+ B.z); 113 | } 114 | 115 | inline PointXYZ operator - (const PointXYZ A, const PointXYZ B) 116 | { 117 | return PointXYZ(A.x - B.x, A.y - B.y, A.z - B.z); 118 | } 119 | 120 | inline PointXYZ operator * (const PointXYZ P, const float a) 121 | { 122 | return PointXYZ(P.x * a, P.y * a, P.z * a); 123 | } 124 | 125 | inline PointXYZ operator * (const float a, const PointXYZ P) 126 | { 127 | return PointXYZ(P.x * a, P.y * a, P.z * a); 128 | } 129 | 130 | inline std::ostream& operator << (std::ostream& os, const PointXYZ P) 131 | { 132 | return os << "[" << P.x << ", " << P.y << ", " << P.z << "]"; 133 | } 134 | 135 | inline bool operator == (const PointXYZ A, const PointXYZ B) 136 | { 137 | return A.x == B.x && A.y == B.y && A.z == B.z; 138 | } 139 | 140 | inline PointXYZ floor(const PointXYZ P) 141 | { 142 | return PointXYZ(std::floor(P.x), std::floor(P.y), std::floor(P.z)); 143 | } 144 | 145 | 146 | PointXYZ max_point(std::vector<PointXYZ> points); 147 | PointXYZ min_point(std::vector<PointXYZ> points); 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /lib/pointops2/functions/test_relative_pos_encoding_op_step1_v3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 80000 8 | N = 3500 9 | # M = 80 10 | # N = 5 11 | hdim = 16 12 | h = 6 13 | L = 31 14 | query = torch.rand(N, h, hdim).cuda() 15 | table_q = torch.rand(L, h, hdim, 3).cuda() 16 | key = torch.rand(N, h, hdim).cuda() 17 | table_k = torch.rand(L, h, hdim, 3).cuda() 18 | 19 | index_q = torch.rand(M) 20 | index_q[index_q < 0] = 0 21 | index_q = (index_q*N).long().cuda() 22 | 23 | index_k = torch.rand(M) 24 | index_k[index_k < 0] = 0 25 | index_k = (index_k*N).long().cuda() 26 | 27 | rel_index = torch.rand(M, 3) 28 | rel_index[rel_index < 0] = 0 29 | rel_index = (rel_index*L).long().cuda() 30 | 31 | 32 | # rearrange index for acceleration 33 | index_q, indices = torch.sort(index_q) #[M,] 34 | index_k = index_k[indices] #[M,] 35 | rel_index = rel_index[indices] 36 | index_q_counts = index_q.bincount() 37 | 38 | print("index_q_counts.shape: ", index_q_counts.shape) 39 | 40 | n_max = index_q_counts.max() 41 | index_q_offsets = index_q_counts.cumsum(dim=-1) #[N] 42 | 43 | print("v1 index_q_offsets.shape: ", index_q_offsets.shape) 44 | 45 | index_q_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_q_offsets], 0) #[N+1] 46 | 47 | # print("index_q[:100]: ", index_q[:100]) 48 | print("n_max: ", n_max) 49 | print("index_q_offsets.shape: ", index_q_offsets.shape) 50 | # input() 51 | 52 | print("index_q_offsets[:100]: ", index_q_offsets[:100]) 53 | print("index_k[:20]: ", index_k[:20]) 54 | 55 | query.requires_grad = True 56 | table_q.requires_grad = True 57 | key.requires_grad = True 58 | table_k.requires_grad = True 59 | 60 | output1 = pointops.dot_prod_with_idx(query, index_q.int(), table_q, rel_index.int()) 61 | output2 = pointops.dot_prod_with_idx(key, index_k.int(), table_k, rel_index.int()) 62 | output = output1 + output2 63 | loss = output.mean() 64 | loss.backward() 65 | 66 | # print("output.shape: {}, output[:5,:10]: {}".format(output.shape, output[:5,:10])) 67 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 68 | # print("table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) 69 | # print("key.grad[:5, :3, :5]: ", 
key.grad[:5, :3, :5]) 70 | # print("table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) 71 | # input() 72 | 73 | # print("query.is_contiguous(): ", query.is_contiguous()) 74 | # print("key.is_contiguous(): ", key.is_contiguous()) 75 | # print("index_q.is_contiguous(): ", index_q.is_contiguous()) 76 | # print("index_k.is_contiguous(): ", index_k.is_contiguous()) 77 | 78 | output_v2 = pointops.dot_prod_with_idx_v3(query, index_q_offsets.int(), n_max, key, index_k.int(), table_q, table_k, rel_index.int()) 79 | # loss = output_v2.mean() 80 | # loss.backward() 81 | 82 | # print("output_v2.shape: {}, output_v2[:5,:10]: {}".format(output_v2.shape, output_v2[:5,:10])) 83 | # print("v2 query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 84 | # print("v2 table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) 85 | # print("v2 key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 86 | # print("v2 table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) 87 | # input() 88 | 89 | print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) 90 | 91 | -------------------------------------------------------------------------------- /util/scannet_v2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import SharedArray as SA 4 | 5 | import torch 6 | from torch.utils.data import Dataset 7 | 8 | from util.voxelize import voxelize 9 | from util.data_util import sa_create, collate_fn 10 | from util.data_util import data_prepare_scannet as data_prepare 11 | import glob 12 | 13 | class Scannetv2(Dataset): 14 | def __init__(self, split='train', data_root='trainval', voxel_size=0.04, voxel_max=None, transform=None, shuffle_index=False, loop=1): 15 | super().__init__() 16 | 17 | self.split = split 18 | self.data_root = data_root 19 | self.voxel_size = voxel_size 20 | self.voxel_max = voxel_max 21 | self.transform = transform 22 | self.shuffle_index = shuffle_index 23 | self.loop = loop 24 | 25 | if split == "train" or split == 'val': 26 | self.data_list = glob.glob(os.path.join(data_root, split, "*.pth")) 27 | elif split == 'trainval': 28 | self.data_list = glob.glob(os.path.join(data_root, "train", "*.pth")) + glob.glob(os.path.join(data_root, "val", "*.pth")) 29 | else: 30 | raise ValueError("no such split: {}".format(split)) 31 | 32 | print("voxel_size: ", voxel_size) 33 | print("Totally {} samples in {} set.".format(len(self.data_list), split)) 34 | 35 | def __getitem__(self, idx): 36 | # data_idx = self.data_idx[idx % len(self.data_idx)] 37 | 38 | # data = SA.attach("shm://{}".format(self.data_list[data_idx])).copy() 39 | data_idx = idx % len(self.data_list) 40 | data_path = self.data_list[data_idx] 41 | data = torch.load(data_path) 42 | 43 | coord, feat = data[0], data[1] 44 | if self.split != 'test': 45 | label = data[2] 46 | 47 | coord, feat, label = data_prepare(coord, feat, label, self.split, self.voxel_size, self.voxel_max, self.transform, self.shuffle_index) 48 | return coord, feat, label 49 | 50 | def __len__(self): 51 | # return len(self.data_idx) * self.loop 52 | return len(self.data_list) * self.loop 53 | 54 | 55 | if __name__ == '__main__': 56 | data_root = '/home/share/Dataset/s3dis' 57 | test_area, voxel_size, voxel_max = 5, 0.04, 80000 58 | 59 | point_data = Scannetv2(split='train', data_root=data_root, voxel_size=voxel_size, voxel_max=voxel_max) 60 | print('point data size:', point_data.__len__()) 61 | import torch, time, random 62 | manual_seed = 123 63 | 
random.seed(manual_seed) 64 | np.random.seed(manual_seed) 65 | torch.manual_seed(manual_seed) 66 | torch.cuda.manual_seed_all(manual_seed) 67 | def worker_init_fn(worker_id): 68 | random.seed(manual_seed + worker_id) 69 | train_loader = torch.utils.data.DataLoader(point_data, batch_size=1, shuffle=False, num_workers=0, pin_memory=True, collate_fn=collate_fn) 70 | for idx in range(1): 71 | end = time.time() 72 | voxel_num = [] 73 | for i, (coord, feat, label, offset) in enumerate(train_loader): 74 | print('time: {}/{}--{}'.format(i+1, len(train_loader), time.time() - end)) 75 | print('tag', coord.shape, feat.shape, label.shape, offset.shape, torch.unique(label)) 76 | voxel_num.append(label.shape[0]) 77 | end = time.time() 78 | print(np.sort(np.array(voxel_num))) 79 | -------------------------------------------------------------------------------- /util/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import functools 4 | import sys 5 | from termcolor import colored 6 | 7 | class _ColorfulFormatter(logging.Formatter): 8 | def __init__(self, *args, **kwargs): 9 | self._root_name = kwargs.pop("root_name") + "." 10 | self._abbrev_name = kwargs.pop("abbrev_name", "") 11 | if len(self._abbrev_name): 12 | self._abbrev_name = self._abbrev_name + "." 13 | super(_ColorfulFormatter, self).__init__(*args, **kwargs) 14 | 15 | def formatMessage(self, record): 16 | record.name = record.name.replace(self._root_name, self._abbrev_name) 17 | log = super(_ColorfulFormatter, self).formatMessage(record) 18 | if record.levelno == logging.WARNING: 19 | prefix = colored("WARNING", "red", attrs=["blink"]) 20 | elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL: 21 | prefix = colored("ERROR", "red", attrs=["blink", "underline"]) 22 | else: 23 | return log 24 | return prefix + " " + log 25 | 26 | 27 | # so that calling get_logger multiple times won't add many handlers 28 | @functools.lru_cache() 29 | def get_logger( 30 | output=None, color=True, name="main-logger", abbrev_name=None 31 | ): 32 | """ 33 | Initialize the logger and set its verbosity level to DEBUG. 34 | Args: 35 | output (str): a file name or a directory to save log. If None, will not save log file. 36 | If ends with ".txt" or ".log", assumed to be a file name. 37 | Otherwise, logs will be saved to `output/log.txt`. 
38 | name (str): the root module name of this logger 39 | Returns: 40 | logging.Logger: a logger 41 | """ 42 | logger = logging.getLogger(name) 43 | logger.setLevel(logging.DEBUG) 44 | logger.propagate = False 45 | 46 | if abbrev_name is None: 47 | abbrev_name = name 48 | 49 | plain_formatter = logging.Formatter( 50 | "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S" 51 | ) 52 | # stdout logging: master only 53 | ch = logging.StreamHandler(stream=sys.stdout) 54 | ch.setLevel(logging.DEBUG) 55 | if color: 56 | formatter = _ColorfulFormatter( 57 | colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s", 58 | datefmt="%m/%d %H:%M:%S", 59 | root_name=name, 60 | abbrev_name=str(abbrev_name), 61 | ) 62 | else: 63 | formatter = plain_formatter 64 | ch.setFormatter(formatter) 65 | logger.addHandler(ch) 66 | 67 | # file logging: also master only 68 | if output is not None: 69 | if output.endswith(".txt") or output.endswith(".log"): 70 | filename = output 71 | else: 72 | filename = os.path.join(output, "log.txt") 73 | os.makedirs(os.path.dirname(filename), exist_ok=True) 74 | 75 | fh = logging.StreamHandler(_cached_log_stream(filename)) 76 | fh.setLevel(logging.DEBUG) 77 | fh.setFormatter(plain_formatter) 78 | logger.addHandler(fh) 79 | 80 | return logger 81 | 82 | # cache the opened file object, so that different calls to `get_logger` 83 | # with the same file name can safely write to the same file. 84 | @functools.lru_cache(maxsize=None) 85 | def _cached_log_stream(filename): 86 | return open(filename, "a") -------------------------------------------------------------------------------- /util/s3dis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import SharedArray as SA 4 | 5 | import torch 6 | from torch.utils.data import Dataset 7 | 8 | from util.voxelize import voxelize 9 | from util.data_util import sa_create, collate_fn 10 | from util.data_util import data_prepare_v101 as data_prepare 11 | 12 | 13 | 14 | class S3DIS(Dataset): 15 | def __init__(self, split='train', data_root='trainval', test_area=5, voxel_size=0.04, voxel_max=None, transform=None, shuffle_index=False, loop=1): 16 | super().__init__() 17 | self.split, self.voxel_size, self.transform, self.voxel_max, self.shuffle_index, self.loop = split, voxel_size, transform, voxel_max, shuffle_index, loop 18 | data_list = sorted(os.listdir(data_root)) 19 | data_list = [item[:-4] for item in data_list if 'Area_' in item] 20 | if split == 'train': 21 | self.data_list = [item for item in data_list if not 'Area_{}'.format(test_area) in item] 22 | else: 23 | self.data_list = [item for item in data_list if 'Area_{}'.format(test_area) in item] 24 | self.data_root = data_root 25 | # for item in self.data_list: 26 | # if not os.path.exists("/dev/shm/{}".format(item)): 27 | # data_path = os.path.join(data_root, item + '.npy') 28 | # data = np.load(data_path) # xyzrgbl, N*7 29 | # sa_create("shm://{}".format(item), data) 30 | self.data_idx = np.arange(len(self.data_list)) 31 | print("Totally {} samples in {} set.".format(len(self.data_idx), split)) 32 | 33 | def __getitem__(self, idx): 34 | data_idx = self.data_idx[idx % len(self.data_idx)] 35 | 36 | # data = SA.attach("shm://{}".format(self.data_list[data_idx])).copy() 37 | item = self.data_list[data_idx] 38 | data_path = os.path.join(self.data_root, item + '.npy') 39 | data = np.load(data_path) 40 | 41 | coord, feat, label = data[:, 0:3], data[:, 3:6], data[:, 6] 42 | coord, feat, label = 
data_prepare(coord, feat, label, self.split, self.voxel_size, self.voxel_max, self.transform, self.shuffle_index) 43 | return coord, feat, label 44 | 45 | def __len__(self): 46 | return len(self.data_idx) * self.loop 47 | 48 | 49 | if __name__ == '__main__': 50 | data_root = '/home/share/Dataset/s3dis' 51 | test_area, voxel_size, voxel_max = 5, 0.04, 80000 52 | 53 | point_data = S3DIS(split='train', data_root=data_root, test_area=test_area, voxel_size=voxel_size, voxel_max=voxel_max) 54 | print('point data size:', point_data.__len__()) 55 | import torch, time, random 56 | manual_seed = 123 57 | random.seed(manual_seed) 58 | np.random.seed(manual_seed) 59 | torch.manual_seed(manual_seed) 60 | torch.cuda.manual_seed_all(manual_seed) 61 | def worker_init_fn(worker_id): 62 | random.seed(manual_seed + worker_id) 63 | train_loader = torch.utils.data.DataLoader(point_data, batch_size=1, shuffle=False, num_workers=0, pin_memory=True, collate_fn=collate_fn) 64 | for idx in range(1): 65 | end = time.time() 66 | voxel_num = [] 67 | for i, (coord, feat, label, offset) in enumerate(train_loader): 68 | print('time: {}/{}--{}'.format(i+1, len(train_loader), time.time() - end)) 69 | print('tag', coord.shape, feat.shape, label.shape, offset.shape, torch.unique(label)) 70 | voxel_num.append(label.shape[0]) 71 | end = time.time() 72 | print(np.sort(np.array(voxel_num))) 73 | -------------------------------------------------------------------------------- /lib/pointops2/src/aggregation/aggregation_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "aggregation_cuda_kernel.h" 3 | 4 | 5 | __global__ void aggregation_forward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) { 6 | // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c) 7 | int index = blockIdx.x * blockDim.x + threadIdx.x; 8 | if (index >= n * c) return; 9 | const int c_idx = index % c; 10 | const int n_idx = index / c; 11 | const int w_c_idx = c_idx % w_c; 12 | for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++) 13 | { 14 | int idx_idx = n_idx * nsample + nsample_idx; 15 | int input_idx = idx[idx_idx] * c + c_idx; 16 | int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx; 17 | int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx; 18 | output[index] += (input[input_idx] + position[position_idx]) * weight[weight_idx]; 19 | } 20 | } 21 | 22 | __global__ void aggregation_backward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) { 23 | // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c) 24 | int index = blockIdx.x * blockDim.x + threadIdx.x; 25 | if (index >= n * c) return; 26 | const int c_idx = index % c; 27 | const int n_idx = index / c; 28 | const int w_c_idx = c_idx % w_c; 29 | for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++) 30 | { 31 | int idx_idx = n_idx * nsample + nsample_idx; 32 | int input_idx = idx[idx_idx] * c + c_idx; 33 | int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx; 34 | int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx; 35 | atomicAdd(grad_input + input_idx, 
grad_output[index] * weight[weight_idx]); 36 | grad_position[position_idx] = grad_output[index] * weight[weight_idx]; 37 | atomicAdd(grad_weight + weight_idx, grad_output[index] * (input[input_idx] + position[position_idx])); 38 | } 39 | } 40 | 41 | void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) { 42 | // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c) 43 | dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); 44 | dim3 threads(THREADS_PER_BLOCK); 45 | aggregation_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, nsample, c, w_c, input, position, weight, idx, output); 46 | } 47 | 48 | void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) { 49 | // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c) 50 | dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); 51 | dim3 threads(THREADS_PER_BLOCK); 52 | aggregation_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight); 53 | } 54 | -------------------------------------------------------------------------------- /lib/pointops2/src/rpe/relative_pos_encoding_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/serialize/tensor.h> 2 | #include <vector> 3 | #include <torch/types.h> 4 | #include <ATen/cuda/CUDAContext.h> 5 | #include "relative_pos_encoding_cuda_kernel.h" 6 | 7 | void dot_prod_with_idx_forward_cuda(int N, int M, int h, int hdim, at::Tensor q_tensor, at::Tensor index_tensor, 8 | at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor) 9 | { 10 | const float *q = q_tensor.data_ptr<float>(); 11 | const float *table = table_tensor.data_ptr<float>(); 12 | const int *index = index_tensor.data_ptr<int>(); 13 | const int *rel_idx = rel_idx_tensor.data_ptr<int>(); 14 | float *output = output_tensor.data_ptr<float>(); 15 | dot_prod_with_idx_forward_cuda_launcher(N, M, h, hdim, q, index, table, rel_idx, output); 16 | } 17 | 18 | void dot_prod_with_idx_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, 19 | at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, 20 | at::Tensor grad_q_tensor, at::Tensor grad_table_tensor) 21 | { 22 | const float *grad_out = grad_out_tensor.data_ptr<float>(); 23 | const float *q = q_tensor.data_ptr<float>(); 24 | const int *index = index_tensor.data_ptr<int>(); 25 | const float *table = table_tensor.data_ptr<float>(); 26 | const int *rel_idx = rel_idx_tensor.data_ptr<int>(); 27 | float *grad_q = grad_q_tensor.data_ptr<float>(); 28 | float *grad_table = grad_table_tensor.data_ptr<float>(); 29 | dot_prod_with_idx_backward_cuda_launcher(N, M, h, hdim, grad_out, q, index, table, rel_idx, grad_q, grad_table); 30 | } 31 | 32 | void attention_step2_with_rel_pos_value_forward_cuda(int N, int M, int h, int hdim, at::Tensor attn_tensor, at::Tensor v_tensor, 33 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor) 34 | { 35 | const float *attn = attn_tensor.data_ptr<float>(); 36 | const float *v = v_tensor.data_ptr<float>(); 37 | const int *index0 = index0_tensor.data_ptr<int>(); 38 | const int *index1 = index1_tensor.data_ptr<int>(); 39 | const float *table = table_tensor.data_ptr<float>(); 40 | const int *rel_idx = 
rel_idx_tensor.data_ptr<int>(); 41 | float *output = output_tensor.data_ptr<float>(); 42 | attention_step2_with_rel_pos_value_forward_cuda_launcher(N, M, h, hdim, attn, v, index0, index1, table, rel_idx, output); 43 | } 44 | 45 | void attention_step2_with_rel_pos_value_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, 46 | at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, 47 | at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor) 48 | { 49 | const float *grad_out = grad_out_tensor.data_ptr<float>(); 50 | const int *index0 = index0_tensor.data_ptr<int>(); 51 | const int *index1 = index1_tensor.data_ptr<int>(); 52 | const float *attn = attn_tensor.data_ptr<float>(); 53 | const float *v = v_tensor.data_ptr<float>(); 54 | const float *table = table_tensor.data_ptr<float>(); 55 | const int *rel_idx = rel_idx_tensor.data_ptr<int>(); 56 | float *grad_attn = grad_attn_tensor.data_ptr<float>(); 57 | float *grad_v = grad_v_tensor.data_ptr<float>(); 58 | float *grad_table = grad_table_tensor.data_ptr<float>(); 59 | attention_step2_with_rel_pos_value_backward_cuda_launcher(N, M, h, hdim, grad_out, index0, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); 60 | } 61 | -------------------------------------------------------------------------------- /lib/pointops2/src/knnquery/knnquery_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "knnquery_cuda_kernel.h" 3 | 4 | 5 | __device__ void swap_float(float *x, float *y) 6 | { 7 | float tmp = *x; 8 | *x = *y; 9 | *y = tmp; 10 | } 11 | 12 | 13 | __device__ void swap_int(int *x, int *y) 14 | { 15 | int tmp = *x; 16 | *x = *y; 17 | *y = tmp; 18 | } 19 | 20 | 21 | __device__ void reheap(float *dist, int *idx, int k) 22 | { 23 | int root = 0; 24 | int child = root * 2 + 1; 25 | while (child < k) 26 | { 27 | if(child + 1 < k && dist[child+1] > dist[child]) 28 | child++; 29 | if(dist[root] > dist[child]) 30 | return; 31 | swap_float(&dist[root], &dist[child]); 32 | swap_int(&idx[root], &idx[child]); 33 | root = child; 34 | child = root * 2 + 1; 35 | } 36 | } 37 | 38 | 39 | __device__ void heap_sort(float *dist, int *idx, int k) 40 | { 41 | int i; 42 | for (i = k - 1; i > 0; i--) 43 | { 44 | swap_float(&dist[0], &dist[i]); 45 | swap_int(&idx[0], &idx[i]); 46 | reheap(dist, idx, i); 47 | } 48 | } 49 | 50 | 51 | __device__ int get_bt_idx(int idx, const int *offset) 52 | { 53 | int i = 0; 54 | while (1) 55 | { 56 | if (idx < offset[i]) 57 | break; 58 | else 59 | i++; 60 | } 61 | return i; 62 | } 63 | 64 | 65 | __global__ void knnquery_cuda_kernel(int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, const int *__restrict__ offset, const int *__restrict__ new_offset, int *__restrict__ idx, float *__restrict__ dist2) { 66 | // input: xyz (n, 3) new_xyz (m, 3) 67 | // output: idx (m, nsample) dist2 (m, nsample) 68 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 69 | if (pt_idx >= m) return; 70 | 71 | new_xyz += pt_idx * 3; 72 | idx += pt_idx * nsample; 73 | dist2 += pt_idx * nsample; 74 | int bt_idx = get_bt_idx(pt_idx, new_offset); 75 | int start; 76 | if (bt_idx == 0) 77 | start = 0; 78 | else 79 | start = offset[bt_idx - 1]; 80 | int end = offset[bt_idx]; 81 | 82 | float new_x = new_xyz[0]; 83 | float new_y = new_xyz[1]; 84 | float new_z = new_xyz[2]; 85 | 86 | float best_dist[100]; 87 | int best_idx[100]; 88 | for(int i = 0; i < nsample; i++){ 89 | 
best_dist[i] = 1e10; 90 | best_idx[i] = start; 91 | } 92 | for(int i = start; i < end; i++){ 93 | float x = xyz[i * 3 + 0]; 94 | float y = xyz[i * 3 + 1]; 95 | float z = xyz[i * 3 + 2]; 96 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 97 | if (d2 < best_dist[0]){ 98 | best_dist[0] = d2; 99 | best_idx[0] = i; 100 | reheap(best_dist, best_idx, nsample); 101 | } 102 | } 103 | heap_sort(best_dist, best_idx, nsample); 104 | for(int i = 0; i < nsample; i++){ 105 | idx[i] = best_idx[i]; 106 | dist2[i] = best_dist[i]; 107 | } 108 | } 109 | 110 | 111 | void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2) { 112 | // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample) 113 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK)); 114 | dim3 threads(THREADS_PER_BLOCK); 115 | knnquery_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2); 116 | } 117 | -------------------------------------------------------------------------------- /lib/pointops/src/knnquery_heap/knnquery_heap_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "knnquery_heap_cuda_kernel.h" 3 | 4 | 5 | __device__ void swap_float(float *x, float *y) 6 | { 7 | float tmp = *x; 8 | *x = *y; 9 | *y = tmp; 10 | } 11 | 12 | 13 | __device__ void swap_int(int *x, int *y) 14 | { 15 | int tmp = *x; 16 | *x = *y; 17 | *y = tmp; 18 | } 19 | 20 | 21 | __device__ void reheap(float *dist, int *idx, int k) 22 | { 23 | int root = 0; 24 | int child = root * 2 + 1; 25 | while (child < k) 26 | { 27 | if(child + 1 < k && dist[child+1] > dist[child]) 28 | child++; 29 | if(dist[root] > dist[child]) 30 | return; 31 | swap_float(&dist[root], &dist[child]); 32 | swap_int(&idx[root], &idx[child]); 33 | root = child; 34 | child = root * 2 + 1; 35 | } 36 | } 37 | 38 | 39 | __device__ void heap_sort(float *dist, int *idx, int k) 40 | { 41 | int i; 42 | for (i = k - 1; i > 0; i--) 43 | { 44 | swap_float(&dist[0], &dist[i]); 45 | swap_int(&idx[0], &idx[i]); 46 | reheap(dist, idx, i); 47 | } 48 | } 49 | 50 | 51 | // input: xyz (b, n, 3) new_xyz (b, m, 3) 52 | // output: idx (b, m, nsample) dist2 (b, m, nsample) 53 | __global__ void knnquery_heap_cuda_kernel(int b, int n, int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, int *__restrict__ idx, float *__restrict__ dist2) { 54 | int bs_idx = blockIdx.y; 55 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 56 | if (bs_idx >= b || pt_idx >= m) return; 57 | 58 | new_xyz += bs_idx * m * 3 + pt_idx * 3; 59 | xyz += bs_idx * n * 3; 60 | idx += bs_idx * m * nsample + pt_idx * nsample; 61 | dist2 += bs_idx * m * nsample + pt_idx * nsample; 62 | 63 | float new_x = new_xyz[0]; 64 | float new_y = new_xyz[1]; 65 | float new_z = new_xyz[2]; 66 | 67 | float best_dist[100]; 68 | int best_idx[100]; 69 | for(int i = 0; i < nsample; i++){ 70 | best_dist[i] = 1e10; 71 | best_idx[i] = 0; 72 | } 73 | for(int i = 0; i < n; i++){ 74 | float x = xyz[i * 3 + 0]; 75 | float y = xyz[i * 3 + 1]; 76 | float z = xyz[i * 3 + 2]; 77 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 78 | if (d2 < best_dist[0]){ 79 | best_dist[0] = d2; 80 | best_idx[0] = i; 81 | reheap(best_dist, best_idx, nsample); 82 | } 83 | } 84 | heap_sort(best_dist, best_idx, nsample); 85 | for(int i = 0; i < nsample; i++){ 86 | idx[i] = best_idx[i]; 87 | 
dist2[i] = best_dist[i]; 88 | } 89 | } 90 | 91 | 92 | void knnquery_heap_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream) { 93 | // param new_xyz: (B, m, 3) 94 | // param xyz: (B, n, 3) 95 | // param idx: (B, m, nsample) 96 | 97 | cudaError_t err; 98 | 99 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 100 | dim3 threads(THREADS_PER_BLOCK); 101 | 102 | knnquery_heap_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, nsample, xyz, new_xyz, idx, dist2); 103 | // cudaDeviceSynchronize(); // for using printf in kernel function 104 | 105 | err = cudaGetLastError(); 106 | if (cudaSuccess != err) { 107 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 108 | exit(-1); 109 | } 110 | } -------------------------------------------------------------------------------- /lib/pointops2/functions/test_attention_op_step1_v2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pointops 3 | from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum 4 | 5 | torch.manual_seed(1) 6 | 7 | M = 800000 8 | N = 35000 9 | C = 96 10 | h = 6 11 | query = torch.rand(N, h, C//h).cuda() 12 | key = torch.rand(N, h, C//h).cuda() 13 | 14 | index_0 = torch.rand(M) 15 | index_0[index_0 < 0] = 0 16 | index_0 = (index_0*N).long().cuda() 17 | 18 | index_1 = torch.rand(M) 19 | index_1[index_1 < 0] = 0 20 | index_1 = (index_1*N).long().cuda() 21 | 22 | query.requires_grad = True 23 | key.requires_grad = True 24 | 25 | 26 | attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) 27 | loss = attn_flat.sum() 28 | loss.backward() 29 | print("attn_flat.shape: {}, attn_flat[:20,:10]: {}".format(attn_flat.shape, attn_flat[:20,:10])) 30 | print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 31 | print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 32 | input() 33 | 34 | 35 | 36 | # rearrange index for acceleration 37 | index_0, indices = torch.sort(index_0) #[M,] 38 | index_1 = index_1[indices] #[M,] 39 | index_0_counts = index_0.bincount() 40 | 41 | print("index_0_counts.shape: ", index_0_counts.shape) 42 | 43 | n_max = index_0_counts.max() 44 | index_0_offsets = index_0_counts.cumsum(dim=-1) #[N] 45 | 46 | print("v1 index_0_offsets.shape: ", index_0_offsets.shape) 47 | 48 | index_0_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_0_offsets], 0) #[N+1] 49 | 50 | # print("index_0[:100]: ", index_0[:100]) 51 | print("n_max: ", n_max) 52 | print("index_0_offsets.shape: ", index_0_offsets.shape) 53 | # input() 54 | 55 | print("index_0_offsets[:100]: ", index_0_offsets[:100]) 56 | print("index_1[:20]: ", index_1[:20]) 57 | 58 | 59 | attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) 60 | # loss = attn_flat.sum() 61 | # loss.backward() 62 | # # attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) 63 | # # loss = attn_flat.sum() 64 | # # loss.backward() 65 | # print("attn_flat.shape: {}, attn_flat[:20,:10]: {}".format(attn_flat.shape, attn_flat[:20,:10])) 66 | # print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 67 | # print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 68 | # input() 69 | 70 | print("query.is_contiguous(): ", query.is_contiguous()) 71 | print("key.is_contiguous(): ", key.is_contiguous()) 72 | print("index_0.is_contiguous(): ", index_0.is_contiguous()) 73 | 
print("index_1.is_contiguous(): ", index_1.is_contiguous()) 74 | 75 | attn_flat_v2 = pointops.attention_step1_v2(query.float(), key.float(), index_1.int(), index_0_offsets.int(), n_max) 76 | loss = attn_flat_v2.sum() 77 | loss.backward() 78 | 79 | # attn_flat_v2 = pointops.attention_step1_v2(query.float(), key.float(), index_1.int(), index_0_offsets.int(), n_max) 80 | # loss = attn_flat_v2.sum() 81 | # loss.backward() 82 | 83 | print("attn_flat_v2.shape: {}, attn_flat_v2[:20,:10]: {}".format(attn_flat_v2.shape, attn_flat_v2[:20,:10])) 84 | print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) 85 | print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) 86 | # input() 87 | 88 | # mask = attn_flat_v2.sum(-1) != 0 89 | # print("mask.sum(): ", mask.sum()) 90 | # print("attn_flat_v2[mask] - attn_flat[mask]: ", ((attn_flat_v2[mask] - attn_flat[mask])**2).max()) 91 | 92 | 93 | print("((attn_flat-attn_flat_v2)**2 < 1e-8).all(): ", ((attn_flat-attn_flat_v2)**2 < 1e-8).all()) 94 | 95 | selected = 10000 96 | print("torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0): ", torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0)) 97 | 98 | -------------------------------------------------------------------------------- /util/voxelize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Sequence 3 | import torch 4 | from torch_geometric.nn import voxel_grid 5 | 6 | def grid_sample(pos, batch_index, size, start=None, return_p2v=True): 7 | # pos: float [N, 3] 8 | # batch_szie: long int 9 | # size: float [3, ] 10 | # start: float [3, ] / None 11 | 12 | # print("pos.shape: {}, batch.shape: {}".format(pos.shape, batch.shape)) 13 | # print("size: ", size) 14 | 15 | # batch [N, ] 16 | batch = torch.zeros(pos.shape[0]) 17 | for i in range (1, len(batch_index)): 18 | batch[batch_index[i-1]:batch_index[i]] = i 19 | 20 | cluster = voxel_grid(pos, batch, size, start=start) #[N, ] 21 | 22 | if return_p2v == False: 23 | unique, cluster = torch.unique(cluster, sorted=True, return_inverse=True) 24 | return cluster 25 | 26 | unique, cluster, counts = torch.unique(cluster, sorted=True, return_inverse=True, return_counts=True) 27 | 28 | # print("unique.shape: {}, cluster.shape: {}, counts.shape: {}".format(unique.shape, cluster.shape, counts.shape)) 29 | 30 | # input() 31 | 32 | # obtain p2v_map 33 | n = unique.shape[0] 34 | k = counts.max().item() 35 | p2v_map = cluster.new_zeros(n, k) #[n, k] 36 | mask = torch.arange(k).cuda().unsqueeze(0) < counts.unsqueeze(-1) #[n, k] 37 | p2v_map[mask] = torch.argsort(cluster) 38 | # max_point 39 | max_point = 48 40 | if k > max_point: 41 | counts = torch.where(counts > max_point, max_point, counts) 42 | p2v_map = p2v_map[:,0:max_point] 43 | 44 | return cluster, p2v_map, counts 45 | 46 | def fnv_hash_vec(arr): 47 | """ 48 | FNV64-1A 49 | """ 50 | assert arr.ndim == 2 51 | # Floor first for negative coordinates 52 | arr = arr.copy() 53 | arr = arr.astype(np.uint64, copy=False) 54 | hashed_arr = np.uint64(14695981039346656037) * np.ones(arr.shape[0], dtype=np.uint64) 55 | for j in range(arr.shape[1]): 56 | hashed_arr *= np.uint64(1099511628211) 57 | hashed_arr = np.bitwise_xor(hashed_arr, arr[:, j]) 58 | return hashed_arr 59 | 60 | 61 | def ravel_hash_vec(arr): 62 | """ 63 | Ravel the coordinates after subtracting the min coordinates. 
64 | """ 65 | assert arr.ndim == 2 66 | arr = arr.copy() 67 | arr -= arr.min(0) 68 | arr = arr.astype(np.uint64, copy=False) 69 | arr_max = arr.max(0).astype(np.uint64) + 1 70 | 71 | keys = np.zeros(arr.shape[0], dtype=np.uint64) 72 | # Fortran style indexing 73 | for j in range(arr.shape[1] - 1): 74 | keys += arr[:, j] 75 | keys *= arr_max[j + 1] 76 | keys += arr[:, -1] 77 | return keys 78 | 79 | 80 | def voxelize(coord, voxel_size=0.05, hash_type='fnv', mode=0): 81 | discrete_coord = np.floor(coord / np.array(voxel_size)) 82 | if hash_type == 'ravel': 83 | key = ravel_hash_vec(discrete_coord) 84 | else: 85 | key = fnv_hash_vec(discrete_coord) 86 | 87 | idx_sort = np.argsort(key) 88 | key_sort = key[idx_sort] 89 | _, count = np.unique(key_sort, return_counts=True) 90 | if mode == 0: # train mode 91 | idx_select = np.cumsum(np.insert(count, 0, 0)[0:-1]) + np.random.randint(0, count.max(), count.size) % count 92 | idx_unique = idx_sort[idx_select] 93 | return idx_unique 94 | else: # val mode 95 | return idx_sort, count 96 | 97 | ''' 98 | #_, idx = np.unique(key, return_index=True) 99 | #return idx 100 | 101 | idx_sort = np.argsort(key) 102 | key_sort = key[idx_sort] 103 | _, idx_start, count = np.unique(key_sort, return_counts=True, return_index=True) 104 | idx_list = np.split(idx_sort, idx_start[1:]) 105 | return idx_list 106 | ''' 107 | -------------------------------------------------------------------------------- /lib/pointops/src/ballquery/ballquery_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "ballquery_cuda_kernel.h" 3 | 4 | // input: new_xyz(b, m, 3) xyz(b, n, 3) 5 | // output: idx(b, m, nsample) 6 | __global__ void ballquery_cuda_kernel(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx) 7 | { 8 | int batch_index = blockIdx.x; 9 | xyz += batch_index * n * 3; 10 | new_xyz += batch_index * m * 3; 11 | idx += m * nsample * batch_index; 12 | int index = threadIdx.x; 13 | int stride = blockDim.x; 14 | 15 | float radius2 = radius * radius; 16 | for (int j = index; j < m; j += stride) 17 | { 18 | float new_x = new_xyz[j * 3 + 0]; 19 | float new_y = new_xyz[j * 3 + 1]; 20 | float new_z = new_xyz[j * 3 + 2]; 21 | for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) 22 | { 23 | float x = xyz[k * 3 + 0]; 24 | float y = xyz[k * 3 + 1]; 25 | float z = xyz[k * 3 + 2]; 26 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 27 | if (d2 < radius2) 28 | { 29 | if (cnt == 0) 30 | { 31 | for (int l = 0; l < nsample; ++l) 32 | idx[j * nsample + l] = k; 33 | } 34 | idx[j * nsample + cnt] = k; 35 | ++cnt; 36 | } 37 | } 38 | } 39 | } 40 | 41 | void ballquery_cuda_launcher(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx) 42 | { 43 | ballquery_cuda_kernel<<>>(b, n, m, radius, nsample, new_xyz, xyz, idx); 44 | } 45 | 46 | 47 | __global__ void ballquery_cuda_kernel_fast(int b, int n, int m, float radius, int nsample, const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) { 48 | int bs_idx = blockIdx.y; 49 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 50 | if (bs_idx >= b || pt_idx >= m) return; 51 | 52 | new_xyz += bs_idx * m * 3 + pt_idx * 3; 53 | xyz += bs_idx * n * 3; 54 | idx += bs_idx * m * nsample + pt_idx * nsample; 55 | 56 | float radius2 = radius * radius; 57 | float new_x = new_xyz[0]; 58 | float new_y = new_xyz[1]; 59 | 
float new_z = new_xyz[2]; 60 | 61 | int cnt = 0; 62 | for (int k = 0; k < n; ++k) { 63 | float x = xyz[k * 3 + 0]; 64 | float y = xyz[k * 3 + 1]; 65 | float z = xyz[k * 3 + 2]; 66 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 67 | if (d2 < radius2){ 68 | if (cnt == 0){ 69 | for (int l = 0; l < nsample; ++l) { 70 | idx[l] = k; 71 | } 72 | } 73 | idx[cnt] = k; 74 | ++cnt; 75 | if (cnt >= nsample){ 76 | break; 77 | } 78 | } 79 | } 80 | } 81 | 82 | 83 | void ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) { 84 | // param new_xyz: (B, m, 3) 85 | // param xyz: (B, n, 3) 86 | // param idx: (B, m, nsample) 87 | 88 | cudaError_t err; 89 | 90 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 91 | dim3 threads(THREADS_PER_BLOCK); 92 | 93 | ballquery_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample, new_xyz, xyz, idx); 94 | // cudaDeviceSynchronize(); // for using printf in kernel function 95 | 96 | err = cudaGetLastError(); 97 | if (cudaSuccess != err) { 98 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 99 | exit(-1); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /lib/pointops2/src/pointops_api.cpp: -------------------------------------------------------------------------------- 1 | #include <Python.h> 2 | #include <torch/script.h> 3 | 4 | #include "knnquery/knnquery_cuda_kernel.h" 5 | #include "sampling/sampling_cuda_kernel.h" 6 | #include "grouping/grouping_cuda_kernel.h" 7 | #include "interpolation/interpolation_cuda_kernel.h" 8 | #include "aggregation/aggregation_cuda_kernel.h" 9 | #include "subtraction/subtraction_cuda_kernel.h" 10 | #include "attention/attention_cuda_kernel.h" 11 | #include "rpe/relative_pos_encoding_cuda_kernel.h" 12 | #include "attention_v2/attention_cuda_kernel_v2.h" 13 | #include "rpe_v2/relative_pos_encoding_cuda_kernel_v2.h" 14 | 15 | 16 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 17 | m.def("knnquery_cuda", &knnquery_cuda, "knnquery_cuda"); 18 | m.def("furthestsampling_cuda", &furthestsampling_cuda, "furthestsampling_cuda"); 19 | m.def("grouping_forward_cuda", &grouping_forward_cuda, "grouping_forward_cuda"); 20 | m.def("grouping_backward_cuda", &grouping_backward_cuda, "grouping_backward_cuda"); 21 | m.def("interpolation_forward_cuda", &interpolation_forward_cuda, "interpolation_forward_cuda"); 22 | m.def("interpolation_backward_cuda", &interpolation_backward_cuda, "interpolation_backward_cuda"); 23 | m.def("subtraction_forward_cuda", &subtraction_forward_cuda, "subtraction_forward_cuda"); 24 | m.def("subtraction_backward_cuda", &subtraction_backward_cuda, "subtraction_backward_cuda"); 25 | m.def("aggregation_forward_cuda", &aggregation_forward_cuda, "aggregation_forward_cuda"); 26 | m.def("aggregation_backward_cuda", &aggregation_backward_cuda, "aggregation_backward_cuda"); 27 | m.def("attention_step1_forward_cuda", &attention_step1_forward_cuda, "attention_step1_forward_cuda"); 28 | m.def("attention_step1_backward_cuda", &attention_step1_backward_cuda, "attention_step1_backward_cuda"); 29 | m.def("attention_step2_forward_cuda", &attention_step2_forward_cuda, "attention_step2_forward_cuda"); 30 | m.def("attention_step2_backward_cuda", &attention_step2_backward_cuda, "attention_step2_backward_cuda"); 31 | m.def("dot_prod_with_idx_forward_cuda", &dot_prod_with_idx_forward_cuda, "dot_prod_with_idx_forward_cuda"); 32 | 
m.def("dot_prod_with_idx_backward_cuda", &dot_prod_with_idx_backward_cuda, "dot_prod_with_idx_backward_cuda"); 33 | m.def("attention_step2_with_rel_pos_value_forward_cuda", &attention_step2_with_rel_pos_value_forward_cuda, "attention_step2_with_rel_pos_value_forward_cuda"); 34 | m.def("attention_step2_with_rel_pos_value_backward_cuda", &attention_step2_with_rel_pos_value_backward_cuda, "attention_step2_with_rel_pos_value_backward_cuda"); 35 | m.def("attention_step1_forward_cuda_v2", &attention_step1_forward_cuda_v2, "attention_step1_forward_cuda_v2"); 36 | m.def("attention_step1_backward_cuda_v2", &attention_step1_backward_cuda_v2, "attention_step1_backward_cuda_v2"); 37 | m.def("attention_step2_forward_cuda_v2", &attention_step2_forward_cuda_v2, "attention_step2_forward_cuda_v2"); 38 | m.def("attention_step2_backward_cuda_v2", &attention_step2_backward_cuda_v2, "attention_step2_backward_cuda_v2"); 39 | m.def("dot_prod_with_idx_forward_cuda_v2", &dot_prod_with_idx_forward_cuda_v2, "dot_prod_with_idx_forward_cuda_v2"); 40 | m.def("dot_prod_with_idx_backward_cuda_v2", &dot_prod_with_idx_backward_cuda_v2, "dot_prod_with_idx_backward_cuda_v2"); 41 | m.def("attention_step2_with_rel_pos_value_forward_cuda_v2", &attention_step2_with_rel_pos_value_forward_cuda_v2, "attention_step2_with_rel_pos_value_forward_cuda_v2"); 42 | m.def("attention_step2_with_rel_pos_value_backward_cuda_v2", &attention_step2_with_rel_pos_value_backward_cuda_v2, "attention_step2_with_rel_pos_value_backward_cuda_v2"); 43 | m.def("dot_prod_with_idx_forward_cuda_v3", &dot_prod_with_idx_forward_cuda_v3, "dot_prod_with_idx_forward_cuda_v3"); 44 | m.def("dot_prod_with_idx_backward_cuda_v3", &dot_prod_with_idx_backward_cuda_v3, "dot_prod_with_idx_backward_cuda_v3"); 45 | } 46 | -------------------------------------------------------------------------------- /lib/cpp_wrappers/cpp_subsampling/grid_subsampling/grid_subsampling.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "grid_subsampling.h" 3 | 4 | 5 | void grid_subsampling(vector& original_points, 6 | vector& subsampled_points, 7 | vector& original_features, 8 | vector& subsampled_features, 9 | vector& original_classes, 10 | vector& subsampled_classes, 11 | float sampleDl, 12 | int verbose) { 13 | 14 | // Initiate variables 15 | // ****************** 16 | 17 | // Number of points in the cloud 18 | size_t N = original_points.size(); 19 | 20 | // Dimension of the features 21 | size_t fdim = original_features.size() / N; 22 | size_t ldim = original_classes.size() / N; 23 | 24 | // Limits of the cloud 25 | PointXYZ minCorner = min_point(original_points); 26 | PointXYZ maxCorner = max_point(original_points); 27 | PointXYZ originCorner = floor(minCorner * (1/sampleDl)) * sampleDl; 28 | 29 | // Dimensions of the grid 30 | size_t sampleNX = (size_t)floor((maxCorner.x - originCorner.x) / sampleDl) + 1; 31 | size_t sampleNY = (size_t)floor((maxCorner.y - originCorner.y) / sampleDl) + 1; 32 | //size_t sampleNZ = (size_t)floor((maxCorner.z - originCorner.z) / sampleDl) + 1; 33 | 34 | // Check if features and classes need to be processed 35 | bool use_feature = original_features.size() > 0; 36 | bool use_classes = original_classes.size() > 0; 37 | 38 | 39 | // Create the sampled map 40 | // ********************** 41 | 42 | // Verbose parameters 43 | int i = 0; 44 | int nDisp = N / 100; 45 | 46 | // Initiate variables 47 | size_t iX, iY, iZ, mapIdx; 48 | unordered_map data; 49 | 50 | for (auto& p : original_points) 51 | { 52 | // Position 
of point in sample map 53 | iX = (size_t)floor((p.x - originCorner.x) / sampleDl); 54 | iY = (size_t)floor((p.y - originCorner.y) / sampleDl); 55 | iZ = (size_t)floor((p.z - originCorner.z) / sampleDl); 56 | mapIdx = iX + sampleNX*iY + sampleNX*sampleNY*iZ; 57 | 58 | // If not already created, create key 59 | if (data.count(mapIdx) < 1) 60 | data.emplace(mapIdx, SampledData(fdim, ldim)); 61 | 62 | // Fill the sample map 63 | if (use_feature && use_classes) 64 | data[mapIdx].update_all(p, original_features.begin() + i * fdim, original_classes.begin() + i * ldim); 65 | else if (use_feature) 66 | data[mapIdx].update_features(p, original_features.begin() + i * fdim); 67 | else if (use_classes) 68 | data[mapIdx].update_classes(p, original_classes.begin() + i * ldim); 69 | else 70 | data[mapIdx].update_points(p); 71 | 72 | // Display 73 | i++; 74 | if (verbose > 1 && i%nDisp == 0) 75 | std::cout << "\rSampled Map : " << std::setw(3) << i / nDisp << "%"; 76 | 77 | } 78 | 79 | // Divide for barycentre and transfer to a vector 80 | subsampled_points.reserve(data.size()); 81 | if (use_feature) 82 | subsampled_features.reserve(data.size() * fdim); 83 | if (use_classes) 84 | subsampled_classes.reserve(data.size() * ldim); 85 | for (auto& v : data) 86 | { 87 | subsampled_points.push_back(v.second.point * (1.0 / v.second.count)); 88 | if (use_feature) 89 | { 90 | float count = (float)v.second.count; 91 | transform(v.second.features.begin(), 92 | v.second.features.end(), 93 | v.second.features.begin(), 94 | [count](float f) { return f / count;}); 95 | subsampled_features.insert(subsampled_features.end(),v.second.features.begin(),v.second.features.end()); 96 | } 97 | if (use_classes) 98 | { 99 | for (int i = 0; i < ldim; i++) 100 | subsampled_classes.push_back(max_element(v.second.labels[i].begin(), v.second.labels[i].end(), 101 | [](const pair<int, int>&a, const pair<int, int>&b){return a.second < b.second;})->first); 102 | } 103 | } 104 | 105 | return; 106 | } 107 | -------------------------------------------------------------------------------- /lib/pointops/src/grouping/grouping_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "grouping_cuda_kernel.h" 3 | 4 | // input: points(b, c, n) idx(b, m, nsample) 5 | // output: out(b, c, m, nsample) 6 | __global__ void grouping_forward_cuda_kernel(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out) 7 | { 8 | int batch_index = blockIdx.x; 9 | points += batch_index * n * c; 10 | idx += batch_index * m * nsample; 11 | out += batch_index * m * nsample * c; 12 | const int index = threadIdx.y * blockDim.x + threadIdx.x; 13 | const int stride = blockDim.y * blockDim.x; 14 | for (int i = index; i < c * m; i += stride) 15 | { 16 | const int l = i / m; 17 | const int j = i % m; 18 | for (int k = 0; k < nsample; ++k) 19 | { 20 | int ii = idx[j * nsample + k]; 21 | out[(l * m + j) * nsample + k] = points[l * n + ii]; 22 | } 23 | } 24 | } 25 | 26 | // input: grad_out(b, c, m, nsample), idx(b, m, nsample) 27 | // output: grad_points(b, c, n) 28 | __global__ void grouping_backward_cuda_kernel(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points) 29 | { 30 | int batch_index = blockIdx.x; 31 | grad_out += batch_index * m * nsample * c; 32 | idx += batch_index * m * nsample; 33 | grad_points += batch_index * n * c; 34 | const int index = threadIdx.y * blockDim.x + threadIdx.x; 35 | const int stride = blockDim.y * blockDim.x; 
36 | for (int i = index; i < c * m; i += stride) 37 | { 38 | const int l = i / m; 39 | const int j = i % m; 40 | for (int k = 0; k < nsample; ++k) 41 | { 42 | int ii = idx[j * nsample + k]; 43 | atomicAdd(grad_points + l * n + ii, grad_out[(l * m + j) * nsample + k]); 44 | } 45 | } 46 | } 47 | 48 | void grouping_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out) 49 | { 50 | grouping_forward_cuda_kernel<<<b, opt_block_config(m, c)>>>(b, c, n, m, nsample, points, idx, out); 51 | } 52 | 53 | void grouping_backward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points) 54 | { 55 | grouping_backward_cuda_kernel<<<b, opt_block_config(m, c)>>>(b, c, n, m, nsample, grad_out, idx, grad_points); 56 | } 57 | 58 | // input: points(b, c, n) idx(b, npoints, nsample) 59 | // output: out(b, c, npoints, nsample) 60 | __global__ void grouping_forward_cuda_kernel_fast(int b, int c, int n, int npoints, int nsample, const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) { 61 | int bs_idx = blockIdx.z; 62 | int c_idx = blockIdx.y; 63 | int index = blockIdx.x * blockDim.x + threadIdx.x; 64 | int pt_idx = index / nsample; 65 | if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; 66 | 67 | int sample_idx = index % nsample; 68 | 69 | idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; 70 | int in_idx = bs_idx * c * n + c_idx * n + idx[0]; 71 | int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; 72 | 73 | out[out_idx] = points[in_idx]; 74 | } 75 | 76 | // input: points(b, c, n) idx(b, npoints, nsample) 77 | // output: out(b, c, npoints, nsample) 78 | void grouping_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out) { 79 | 80 | cudaError_t err; 81 | 82 | dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) 83 | dim3 threads(THREADS_PER_BLOCK); 84 | 85 | grouping_forward_cuda_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, points, idx, out); 86 | // cudaDeviceSynchronize(); // for using printf in kernel function 87 | err = cudaGetLastError(); 88 | if (cudaSuccess != err) { 89 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 90 | exit(-1); 91 | } 92 | } 93 | 94 | 95 | -------------------------------------------------------------------------------- /lib/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.h: -------------------------------------------------------------------------------- 1 | #ifndef _RPE_V2_CUDA_KERNEL 2 | #define _RPE_V2_CUDA_KERNEL 3 | #include <vector> 4 | #include <torch/serialize/tensor.h> 5 | #include <ATen/cuda/CUDAContext.h> 6 | 7 | void dot_prod_with_idx_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, at::Tensor q_tensor, at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, at::Tensor sort_indices_tensor, at::Tensor output_tensor); 8 | void dot_prod_with_idx_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, at::Tensor sort_indices_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor, at::Tensor grad_table_q_tensor, at::Tensor 
grad_table_k_tensor); 9 | 10 | void dot_prod_with_idx_forward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor q_tensor, at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); 11 | void dot_prod_with_idx_backward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor, at::Tensor grad_table_q_tensor, at::Tensor grad_table_k_tensor); 12 | 13 | void attention_step2_with_rel_pos_value_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); 14 | void attention_step2_with_rel_pos_value_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor); 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | void dot_prod_with_idx_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T, const float *q, const int *index_q, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *output); 21 | void dot_prod_with_idx_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T, const float *grad_out, const float *q, const int *index_q, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k); 22 | 23 | void dot_prod_with_idx_forward_cuda_launcher_v3(int N, int M, int h, int hdim, int n_max, const float *q, const int *index_q_offsets, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, float *output); 24 | void dot_prod_with_idx_backward_cuda_launcher_v3(int N, int M, int h, int hdim, int n_max, const float *grad_out, const float *q, const int *index_q_offsets, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k); 25 | 26 | void attention_step2_with_rel_pos_value_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *attn, const float *v, const int *index0_offsets, const int *index1, const float *table, const int *rel_idx, float *output); 27 | void attention_step2_with_rel_pos_value_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *grad_out, const int *index0_offsets, const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | #endif 33 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Stratified Transformer for 3D Point Cloud Segmentation 2 | *Xin Lai\*, Jianhui Liu\*, Li Jiang, Liwei Wang, Hengshuang Zhao, Shu Liu, Xiaojuan Qi, Jiaya Jia* 3 | 4 | This is the official PyTorch implementation of our paper [**Stratified Transformer for 3D Point Cloud Segmentation**](https://arxiv.org/pdf/2203.14508.pdf) that has been accepted to CVPR 2022. [\[arXiv\]](https://arxiv.org/pdf/2203.14508.pdf) [\[CVF\]](https://openaccess.thecvf.com/content/CVPR2022/papers/Lai_Stratified_Transformer_for_3D_Point_Cloud_Segmentation_CVPR_2022_paper.pdf) 5 | 6 | <div align="center">
7 |   <img src="figs/fig.jpg"/> 8 | </div>
9 | 10 | # Highlight 11 | 1. Our method (*Stratified Transformer*) achieves state-of-the-art performance on 3D point cloud semantic segmentation on both the S3DIS and ScanNetv2 datasets. **It is the first time a point-based method outperforms voxel-based ones**, such as SparseConvNet and MinkowskiNet; 12 | 2. *Stratified Transformer* is point-based, and is built on Transformer blocks with standard multi-head self-attention, enjoying a large receptive field, robust generalization ability, and competitive performance; 13 | 3. This repository develops a memory-efficient implementation with several CUDA kernels to combat the issue of **variable-length tokens**, avoiding unnecessary memory occupation by vacant tokens. We also use shared memory for further acceleration. 14 | 15 | # Get Started 16 | 17 | ## Environment 18 | 19 | 1. Install dependencies 20 | 21 | ``` 22 | pip install -r requirements.txt 23 | ``` 24 | 25 | If you have any problem with the above command, you can also install the packages individually: 26 | 27 | ``` 28 | pip install torch_sparse==0.6.12 29 | pip install torch_points3d==1.3.0 30 | pip install tensorboard timm termcolor tensorboardX 31 | ``` 32 | 33 | 2. Compile pointops 34 | 35 | Make sure `gcc` and CUDA are installed and that `nvcc` works (note that installing CUDA via conda does not provide `nvcc`, so in that case install CUDA manually). Then compile and install pointops2 as follows. (We have tested with gcc==7.5.0 and cuda==10.1.) 36 | ``` 37 | cd lib/pointops2 38 | python3 setup.py install 39 | ``` 40 | 41 | ## Datasets Preparation 42 | 43 | ### S3DIS 44 | Please refer to https://github.com/yanx27/Pointnet_Pointnet2_pytorch for S3DIS preprocessing. Then modify the `data_root` entry in the .yaml configuration file. 45 | 46 | ### ScanNetv2 47 | Please refer to https://github.com/dvlab-research/PointGroup for the ScanNetv2 preprocessing. Then change the `data_root` entry in the .yaml configuration file accordingly. 48 | 49 | ## Training 50 | 51 | ### S3DIS 52 | - Stratified Transformer 53 | ``` 54 | python3 train.py --config config/s3dis/s3dis_stratified_transformer.yaml 55 | ``` 56 | 57 | - 3DSwin Transformer (the vanilla version shown in our paper) 58 | ``` 59 | python3 train.py --config config/s3dis/s3dis_swin3d_transformer.yaml 60 | ``` 61 | 62 | ### ScanNetv2 63 | - Stratified Transformer 64 | ``` 65 | python3 train.py --config config/scannetv2/scannetv2_stratified_transformer.yaml 66 | ``` 67 | 68 | - 3DSwin Transformer (the vanilla version shown in our paper) 69 | ``` 70 | python3 train.py --config config/scannetv2/scannetv2_swin3d_transformer.yaml 71 | ``` 72 | 73 | Note: It is normal for the results on S3DIS to fluctuate between -0.5\% and +0.5\% mIoU, likely because S3DIS is relatively small; the results on ScanNetv2 are comparatively stable. 74 | 75 | ## Testing 76 | For testing, first change the `model_path`, `save_folder` and `data_root_val` (if applicable) entries accordingly. Then, run the following command. 77 | ``` 78 | python3 test.py --config [YOUR_CONFIG_PATH] 79 | ``` 80 | 81 | ## Pre-trained Models 82 | 83 | For your convenience, you can download the pre-trained models and training/testing logs from [Here](https://mycuhk-my.sharepoint.com/:f:/g/personal/1155154502_link_cuhk_edu_hk/EihXWr_HEnJIvR_M0_YRbSgBV-6VEIhmbOA9TMyCmKH35Q?e=hLAPNi). 
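
## Sanity Check for the CUDA Ops

To verify that the compiled extension works, you can run the test scripts bundled in `lib/pointops2/functions` (e.g. `test_attention_op_step1_v2.py`). The snippet below is a minimal sketch along the same lines: it compares the plain attention kernel against the memory-efficient v2 kernel, which expects attention pairs sorted by query index together with CSR-style per-query offsets. The tensor sizes here are illustrative only, not the ones used in training.

```python
import torch
import pointops  # built via `cd lib/pointops2 && python3 setup.py install`

torch.manual_seed(1)
M, N, h, hdim = 8000, 350, 6, 16                 # illustrative sizes
query = torch.rand(N, h, hdim).cuda()
key = torch.rand(N, h, hdim).cuda()
index_0 = (torch.rand(M) * N).long().cuda()      # query index of each attention pair
index_1 = (torch.rand(M) * N).long().cuda()      # key index of each attention pair

# the v2 kernel requires pairs sorted by query index, plus per-query offsets
index_0, order = torch.sort(index_0)
index_1 = index_1[order]
counts = index_0.bincount(minlength=N)
offsets = torch.cat([counts.new_zeros(1), counts.cumsum(0)])  # [N+1]

attn = pointops.attention_step1(query.float(), key.float(),
                                index_0.int(), index_1.int())
attn_v2 = pointops.attention_step1_v2(query.float(), key.float(),
                                      index_1.int(), offsets.int(), counts.max())
print("max squared difference:", ((attn - attn_v2) ** 2).max())  # expect ~0
```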
84 | 85 | 86 | # Citation 87 | If you find this project useful, please consider citing: 88 | 89 | ``` 90 | @inproceedings{lai2022stratified, 91 | title={Stratified Transformer for 3D Point Cloud Segmentation}, 92 | author={Lai, Xin and Liu, Jianhui and Jiang, Li and Wang, Liwei and Zhao, Hengshuang and Liu, Shu and Qi, Xiaojuan and Jia, Jiaya}, 93 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 94 | pages={8500--8509}, 95 | year={2022} 96 | } 97 | ``` 98 | -------------------------------------------------------------------------------- /util/vis_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import matplotlib.pyplot as pyplot 5 | 6 | colors = {'ceiling':[0,255,0], 7 | 'floor':[0,0,255], 8 | 'wall':[0,255,255], 9 | 'beam':[255,255,0], 10 | 'column':[255,0,255], 11 | 'window':[100,100,255], 12 | 'door':[200,200,100], 13 | 'table':[170,120,200], 14 | 'chair':[255,0,0], 15 | 'sofa':[200,100,100], 16 | 'bookcase':[10,200,100], 17 | 'board':[200,200,200], 18 | 'clutter':[50,50,50]} 19 | colors = list(colors.values()) 20 | 21 | colors2 = [[50,50,50]] 22 | 23 | colors7 = [[255, 0, 0], [255, 125, 0], [255, 255, 0], [0, 255, 0], [0, 255, 255], [0, 0, 255], [255, 0, 255]] 24 | 25 | colors72 = [[242,183,176], [183,205,225], [210,234,200], [219,204,226], [249,218,173], [255,255,209], [227,216,192]] 26 | 27 | colors40 = [[88,170,108], [174,105,226], [78,194,83], [198,62,165], [133,188,52], [97,101,219], [190,177,52], [139,65,168], [75,202,137], [225,66,129], 28 | [68,135,42], [226,116,210], [146,186,98], [68,105,201], [219,148,53], [85,142,235], [212,85,42], [78,176,223], [221,63,77], [68,195,195], 29 | [175,58,119], [81,175,144], [184,70,74], [40,116,79], [184,134,219], [130,137,46], [110,89,164], [92,135,74], [220,140,190], [94,103,39], 30 | [144,154,219], [160,86,40], [67,107,165], [194,170,104], [162,95,150], [143,110,44], [146,72,105], [225,142,106], [162,83,86], [227,124,143]] 31 | 32 | def write_ply_color(points, labels, out_filename, num_classes=None): 33 | """ Color (N,3) points with integer labels (N,) in range 0 ~ num_classes-1, written as OBJ-style 'v x y z r g b' lines """ 34 | labels = labels.astype(int) 35 | N = points.shape[0] 36 | if num_classes is None: 37 | num_classes = np.max(labels) + 1 38 | else: 39 | assert (num_classes > np.max(labels)) 40 | fout = open(out_filename, 'w') 41 | # colors = [pyplot.cm.hsv(i/float(num_classes)) for i in range(num_classes)] 42 | # colors = [pyplot.cm.jet(i / float(num_classes)) for i in range(num_classes)] 43 | for i in range(N): 44 | #c = colors[labels[i]] 45 | #c = [int(x * 255) for x in c] 46 | c = colors[labels[i]] 47 | fout.write('v %f %f %f %d %d %d\n' % (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) 48 | fout.close() 49 | 50 | 51 | def write_ply_rgb(points, rgb, out_filename, num_classes=None): 52 | """ Color (N,3) points with per-point RGB values (N,3), written as OBJ-style 'v x y z r g b' lines """ 53 | N = points.shape[0] 54 | fout = open(out_filename, 'w') 55 | # colors = [pyplot.cm.hsv(i/float(num_classes)) for i in range(num_classes)] 56 | # colors = [pyplot.cm.jet(i / float(num_classes)) for i in range(num_classes)] 57 | for i in range(N): 58 | #c = colors[labels[i]] 59 | #c = [int(x * 255) for x in c] 60 | c = rgb[i] 61 | fout.write('v %f %f %f %d %d %d\n' % (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) 62 | fout.close() 63 | 64 | 65 | def write_ply_color_modelnet40(points, out_filename, num_classes=None): 66 |
""" Color (N,3) points with labels (N) within range 0 ~ num_classes-1 as OBJ file """ 67 | #labels = labels.astype(int) 68 | N = points.shape[0] 69 | #if num_classes is None: 70 | # num_classes = np.max(labels) + 1 71 | #else: 72 | # assert (num_classes > np.max(labels)) 73 | fout = open(out_filename, 'w') 74 | # colors = [pyplot.cm.hsv(i/float(num_classes)) for i in range(num_classes)] 75 | # colors = [pyplot.cm.jet(i / float(num_classes)) for i in range(num_classes)] 76 | for i in range(N): 77 | #c = colors[labels[i]] 78 | #c = [int(x * 255) for x in c] 79 | c = colors2[0] 80 | fout.write('v %f %f %f %d %d %d\n' % (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) 81 | fout.close() 82 | 83 | 84 | def write_ply_color_shapenet(points, labels, out_filename, num_classes=None): 85 | """ Color (N,3) points with labels (N) within range 0 ~ num_classes-1 as OBJ file """ 86 | labels = labels.astype(int) 87 | N = points.shape[0] 88 | if num_classes is None: 89 | num_classes = np.max(labels) + 1 90 | else: 91 | assert (num_classes > np.max(labels)) 92 | fout = open(out_filename, 'w') 93 | # colors = [pyplot.cm.hsv(i/float(num_classes)) for i in range(num_classes)] 94 | # colors = [pyplot.cm.jet(i / float(num_classes)) for i in range(num_classes)] 95 | for i in range(N): 96 | #c = colors[labels[i]] 97 | #c = [int(x * 255) for x in c] 98 | c = colors7[labels[i]%7] 99 | fout.write('v %f %f %f %d %d %d\n' % (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) 100 | fout.close() -------------------------------------------------------------------------------- /lib/pointops/src/featuredistribute/featuredistribute_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "featuredistribute_cuda_kernel.h" 3 | 4 | __global__ void featuredistribute_cuda_kernel(int b, int n, int m, const float *max_xyz, const float *xyz, int *distribute_idx) { 5 | int bs_idx = blockIdx.y; 6 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (bs_idx >= b || pt_idx >= m) return; 8 | 9 | max_xyz += bs_idx * n * 3; 10 | xyz += bs_idx * m * 3 + pt_idx * 3; 11 | distribute_idx += bs_idx * m + pt_idx; 12 | 13 | float x = xyz[0]; 14 | float y = xyz[1]; 15 | float z = xyz[2]; 16 | 17 | float min_dist2 = 100000; 18 | int min_dist_idx = -1; 19 | for (int k = 0; k < n; ++k) { 20 | float max_x = max_xyz[k * 3 + 0]; 21 | float max_y = max_xyz[k * 3 + 1]; 22 | float max_z = max_xyz[k * 3 + 2]; 23 | float d2 = (max_x - x) * (max_x - x) + (max_y - y) * (max_y - y) + (max_z - z) * (max_z - z); 24 | if (d2 < min_dist2){ 25 | min_dist_idx = k; 26 | min_dist2 = d2; 27 | } 28 | } 29 | distribute_idx[0] = min_dist_idx; 30 | } 31 | 32 | 33 | void featuredistribute_cuda_launcher(int b, int n, int m, const float *max_xyz, const float *xyz, int *distribute_idx, cudaStream_t stream) { 34 | // param max_xyz: (b, n, 3) 35 | // param xyz: (b, m, 3) 36 | // return distribute_idx: (b, m) 37 | 38 | cudaError_t err; 39 | 40 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 41 | dim3 threads(THREADS_PER_BLOCK); 42 | 43 | featuredistribute_cuda_kernel<<>>(b, n, m, max_xyz, xyz, distribute_idx); 44 | // cudaDeviceSynchronize(); // for using printf in kernel function 45 | 46 | err = cudaGetLastError(); 47 | if (cudaSuccess != err) { 48 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 49 | exit(-1); 50 | } 51 | } 52 | 53 | __global__ void featuregather_forward_cuda_kernel(int b, int n, int m, int c, 
const float *max_feature, const int *distribute_idx, float *distribute_feature) { 54 | int bs_idx = blockIdx.z; 55 | int c_idx = blockIdx.y; 56 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 57 | if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; 58 | 59 | max_feature += bs_idx * c * n + c_idx * n; 60 | distribute_idx += bs_idx * m + pt_idx; 61 | distribute_feature += bs_idx * c * m + c_idx * m + pt_idx; 62 | 63 | int idx = distribute_idx[0]; 64 | distribute_feature[0] = max_feature[idx]; 65 | } 66 | 67 | 68 | void featuregather_forward_cuda_launcher(int b, int n, int m, int c, const float *max_feature, const int *distribute_idx, float *distribute_feature, cudaStream_t stream){ 69 | // param max_feature: (b, c, n) 70 | // param distribute_idx: (b, m) 71 | // return distribute_feature: (b, c, m) 72 | 73 | cudaError_t err; 74 | 75 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), c, b); // blockIdx.x(point), blockIdx.y(channel), blockIdx.z(batch) 76 | dim3 threads(THREADS_PER_BLOCK); 77 | 78 | featuregather_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, c, max_feature, distribute_idx, distribute_feature); 79 | // cudaDeviceSynchronize(); // for using printf in kernel function 80 | 81 | err = cudaGetLastError(); 82 | if (cudaSuccess != err) { 83 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 84 | exit(-1); 85 | } 86 | } 87 | 88 | 89 | __global__ void featuregather_backward_cuda_kernel(int b, int n, int m, int c, const float *grad_distribute_feature, const int *distribute_idx, float *grad_max_feature){ 90 | int bs_idx = blockIdx.z; 91 | int c_idx = blockIdx.y; 92 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 93 | if(bs_idx >= b || c_idx >= c || pt_idx >= m) return; 94 | 95 | grad_distribute_feature += bs_idx * c * m + c_idx * m + pt_idx; 96 | distribute_idx += bs_idx * m + pt_idx; 97 | grad_max_feature += bs_idx * c * n + c_idx * n; 98 | 99 | int idx = distribute_idx[0]; 100 | atomicAdd(grad_max_feature + idx, grad_distribute_feature[0]); 101 | } 102 | 103 | 104 | void featuregather_backward_cuda_launcher(int b, int n, int m, int c, const float *grad_distribute_feature, const int *distribute_idx, float *grad_max_feature, cudaStream_t stream){ 105 | // param grad_distribute_feature: (b, c, m) 106 | // param distribute_idx: (b, m) 107 | // return grad_max_feature: (b, c, n) 108 | 109 | cudaError_t err; 110 | 111 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), c, b); // blockIdx.x(point), blockIdx.y(channel), blockIdx.z(batch) 112 | dim3 threads(THREADS_PER_BLOCK); 113 | 114 | featuregather_backward_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, c, grad_distribute_feature, distribute_idx, grad_max_feature); 115 | // cudaDeviceSynchronize(); // for using printf in kernel function 116 | 117 | err = cudaGetLastError(); 118 | if (cudaSuccess != err) { 119 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 120 | exit(-1); 121 | } 122 | } -------------------------------------------------------------------------------- /lib/pointops2/src/attention/attention_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | /* written by Xin Lai.
Email: xinlai@cse.cuhk.edu.hk */ 2 | 3 | #include "../cuda_utils.h" 4 | #include "attention_cuda_kernel.h" 5 | 6 | 7 | __global__ void attention_step1_forward_cuda_kernel( // M, h, C//h 8 | int N, int M, int h, int C, const float *q, const float *k, 9 | const int *index0, const int *index1, float *attn) { 10 | 11 | int c_idx = blockIdx.z; 12 | int h_idx = blockIdx.y; 13 | int m_idx = blockIdx.x * blockDim.x + threadIdx.x; 14 | if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; 15 | 16 | int idx0 = index0[m_idx]; 17 | int idx1 = index1[m_idx]; 18 | float val = q[idx0*C+h_idx*C/h+c_idx] * k[idx1*C+h_idx*C/h+c_idx]; 19 | atomicAdd(attn+m_idx*h+h_idx, val); 20 | } 21 | 22 | __global__ void attention_step1_backward_cuda_kernel( // M, h, C//h 23 | int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *q, const float *k, 24 | float *grad_q, float *grad_k) { 25 | 26 | int c_idx = blockIdx.z; 27 | int h_idx = blockIdx.y; 28 | int m_idx = blockIdx.x * blockDim.x + threadIdx.x; 29 | if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; 30 | 31 | int idx0 = index0[m_idx]; 32 | int idx1 = index1[m_idx]; 33 | int grad_out_idx = m_idx*h+h_idx; 34 | int q_idx = idx0*C+h_idx*C/h+c_idx; 35 | int k_idx = idx1*C+h_idx*C/h+c_idx; 36 | atomicAdd(grad_q+q_idx, grad_out[grad_out_idx] * k[k_idx]); 37 | atomicAdd(grad_k+k_idx, grad_out[grad_out_idx] * q[q_idx]); 38 | } 39 | 40 | void attention_step1_forward_cuda_launcher(int N, int M, int h, int C, const float *q, const float *k, 41 | const int *index0, const int *index1, float *attn) { 42 | // input: q: (N, h, C/h), k: (N, h, C/h), index0: (M, ), index1: (M, ); output: attn: (M, h) 43 | //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); 44 | dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); 45 | dim3 threads(THREADS_PER_BLOCK); 46 | attention_step1_forward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, C, q, k, index0, index1, attn); 47 | } 48 | 49 | void attention_step1_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, 50 | const float *q, const float *k, float *grad_q, float *grad_k) { 51 | // input: grad_out: (M, h); output: grad_q: (N, h, C/h), grad_k: (N, h, C/h) 52 | //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); 53 | dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); 54 | dim3 threads(THREADS_PER_BLOCK); 55 | attention_step1_backward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, C, grad_out, index0, index1, q, k, grad_q, grad_k); 56 | } 57 | 58 | __global__ void attention_step2_forward_cuda_kernel( // M, h, C//h 59 | int N, int M, int h, int C, const float *attn, const float *v, 60 | const int *index0, const int *index1, float *output) { 61 | 62 | int c_idx = blockIdx.z; 63 | int h_idx = blockIdx.y; 64 | int m_idx = blockIdx.x * blockDim.x + threadIdx.x; 65 | if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; 66 | 67 | int idx1 = index1[m_idx]; 68 | float val = attn[m_idx*h+h_idx] * v[idx1*C+h_idx*C/h+c_idx]; 69 | int idx0 = index0[m_idx]; 70 | atomicAdd(output+idx0*C+h_idx*C/h+c_idx, val); 71 | } 72 | 73 | __global__ void attention_step2_backward_cuda_kernel( // M, h, C//h 74 | int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, 75 | float *grad_attn, float *grad_v) { 76 | 77 | int c_idx = blockIdx.z; 78 | int h_idx = blockIdx.y; 79 | int m_idx = blockIdx.x * blockDim.x + threadIdx.x; 80 | if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; 81 | 82 | int idx0 = index0[m_idx]; 83 | int idx1 = index1[m_idx]; 84 |
int grad_out_idx = idx0*C+h_idx*C/h+c_idx; 85 | atomicAdd(grad_attn+m_idx*h+h_idx, grad_out[grad_out_idx] * v[idx1*C+h_idx*C/h+c_idx]); 86 | atomicAdd(grad_v+idx1*C+h_idx*C/h+c_idx, grad_out[grad_out_idx] * attn[m_idx*h+h_idx]); 87 | } 88 | 89 | void attention_step2_forward_cuda_launcher(int N, int M, int h, int C, const float *attn, const float *v, 90 | const int *index0, const int *index1, float *output) { 91 | // input: attn: (M, h), v: (N, h, C/h), index0: (M, ), index1: (M, ); output: (N, h, C/h) 92 | //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); 93 | dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); 94 | dim3 threads(THREADS_PER_BLOCK); 95 | attention_step2_forward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, C, attn, v, index0, index1, output); 96 | } 97 | 98 | void attention_step2_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, 99 | const float *attn, const float *v, float *grad_attn, float *grad_v) { 100 | // input: grad_out: (N, h, C/h); output: grad_attn: (M, h), grad_v: (N, h, C/h) 101 | //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); 102 | dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); 103 | dim3 threads(THREADS_PER_BLOCK); 104 | attention_step2_backward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); 105 | } 106 | -------------------------------------------------------------------------------- /util/config.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # Functions for parsing args 3 | # ----------------------------------------------------------------------------- 4 | import yaml 5 | import os 6 | from ast import literal_eval 7 | import copy 8 | import logging 9 | 10 | class CfgNode(dict): 11 | """ 12 | CfgNode represents an internal node in the configuration tree. It's a simple 13 | dict-like container that allows for attribute-based access to keys.
14 | """ 15 | 16 | def __init__(self, init_dict=None, key_list=None, new_allowed=False): 17 | # Recursively convert nested dictionaries in init_dict into CfgNodes 18 | init_dict = {} if init_dict is None else init_dict 19 | key_list = [] if key_list is None else key_list 20 | for k, v in init_dict.items(): 21 | if type(v) is dict: 22 | # Convert dict to CfgNode 23 | init_dict[k] = CfgNode(v, key_list=key_list + [k]) 24 | super(CfgNode, self).__init__(init_dict) 25 | 26 | def __getattr__(self, name): 27 | if name in self: 28 | return self[name] 29 | else: 30 | raise AttributeError(name) 31 | 32 | def __setattr__(self, name, value): 33 | self[name] = value 34 | 35 | def __str__(self): 36 | def _indent(s_, num_spaces): 37 | s = s_.split("\n") 38 | if len(s) == 1: 39 | return s_ 40 | first = s.pop(0) 41 | s = [(num_spaces * " ") + line for line in s] 42 | s = "\n".join(s) 43 | s = first + "\n" + s 44 | return s 45 | 46 | r = "" 47 | s = [] 48 | for k, v in sorted(self.items()): 49 | seperator = "\n" if isinstance(v, CfgNode) else " " 50 | attr_str = "{}:{}{}".format(str(k), seperator, str(v)) 51 | attr_str = _indent(attr_str, 2) 52 | s.append(attr_str) 53 | r += "\n".join(s) 54 | return r 55 | 56 | def __repr__(self): 57 | return "{}({})".format(self.__class__.__name__, super(CfgNode, self).__repr__()) 58 | 59 | 60 | def load_cfg_from_cfg_file(file): 61 | cfg = {} 62 | assert os.path.isfile(file) and file.endswith('.yaml'), \ 63 | '{} is not a yaml file'.format(file) 64 | 65 | with open(file, 'r') as f: 66 | cfg_from_file = yaml.safe_load(f) 67 | 68 | for key in cfg_from_file: 69 | for k, v in cfg_from_file[key].items(): 70 | cfg[k] = v 71 | 72 | cfg = CfgNode(cfg) 73 | return cfg 74 | 75 | 76 | def merge_cfg_from_list(cfg, cfg_list): 77 | new_cfg = copy.deepcopy(cfg) 78 | assert len(cfg_list) % 2 == 0 79 | for full_key, v in zip(cfg_list[0::2], cfg_list[1::2]): 80 | subkey = full_key.split('.')[-1] 81 | assert subkey in cfg, 'Non-existent key: {}'.format(full_key) 82 | value = _decode_cfg_value(v) 83 | value = _check_and_coerce_cfg_value_type( 84 | value, cfg[subkey], subkey, full_key 85 | ) 86 | setattr(new_cfg, subkey, value) 87 | 88 | return new_cfg 89 | 90 | 91 | def _decode_cfg_value(v): 92 | """Decodes a raw config value (e.g., from a yaml config files or command 93 | line argument) into a Python object. 94 | """ 95 | # All remaining processing is only applied to strings 96 | if not isinstance(v, str): 97 | return v 98 | # Try to interpret `v` as a: 99 | # string, number, tuple, list, dict, boolean, or None 100 | try: 101 | v = literal_eval(v) 102 | # The following two excepts allow v to pass through when it represents a 103 | # string. 104 | # 105 | # Longer explanation: 106 | # The type of v is always a string (before calling literal_eval), but 107 | # sometimes it *represents* a string and other times a data structure, like 108 | # a list. In the case that v represents a string, what we got back from the 109 | # yaml parser is 'foo' *without quotes* (so, not '"foo"'). literal_eval is 110 | # ok with '"foo"', but will raise a ValueError if given 'foo'. In other 111 | # cases, like paths (v = 'foo/bar' and not v = '"foo/bar"'), literal_eval 112 | # will raise a SyntaxError. 113 | except ValueError: 114 | pass 115 | except SyntaxError: 116 | pass 117 | return v 118 | 119 | 120 | def _check_and_coerce_cfg_value_type(replacement, original, key, full_key): 121 | """Checks that `replacement`, which is intended to replace `original` is of 122 | the right type. 
The type is correct if it matches exactly or is one of a few 123 | cases in which the type can be easily coerced. 124 | """ 125 | original_type = type(original) 126 | replacement_type = type(replacement) 127 | 128 | # The types must match (with some exceptions) 129 | if replacement_type == original_type or original is None: 130 | return replacement 131 | 132 | # Cast replacement from from_type to to_type if the replacement and original 133 | # types match from_type and to_type 134 | def conditional_cast(from_type, to_type): 135 | if replacement_type == from_type and original_type == to_type: 136 | return True, to_type(replacement) 137 | else: 138 | return False, None 139 | 140 | # Conditionally casts 141 | # list <-> tuple 142 | casts = [(tuple, list), (list, tuple)] 143 | # For py2: allow converting from str (bytes) to a unicode string 144 | try: 145 | casts.append((str, unicode)) # noqa: F821 146 | except Exception: 147 | pass 148 | 149 | for (from_type, to_type) in casts: 150 | converted, converted_value = conditional_cast(from_type, to_type) 151 | if converted: 152 | return converted_value 153 | 154 | raise ValueError( 155 | "Type mismatch ({} vs. {}) with values ({} vs. {}) for config " 156 | "key: {}".format( 157 | original_type, replacement_type, original, replacement, full_key 158 | ) 159 | ) 160 | 161 | 162 | def _assert_with_logging(cond, msg): 163 | if not cond: 164 | logging.getLogger(__name__).debug(msg) # a bare `logger` name was undefined here; use the stdlib logging module 165 | assert cond, msg 166 | -------------------------------------------------------------------------------- /lib/pointops2/src/sampling/sampling_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "sampling_cuda_kernel.h" 3 | 4 | 5 | __device__ void __update(float *dists, int *dists_i, int idx1, int idx2) { 6 | const float v1 = dists[idx1], v2 = dists[idx2]; 7 | const int i1 = dists_i[idx1], i2 = dists_i[idx2]; 8 | dists[idx1] = max(v1, v2); 9 | dists_i[idx1] = v2 > v1 ? i2 : i1; 10 | } 11 | 12 | // input: xyz: (n, 3), tmp: (b, n_max) 13 | // output: idx (m) 14 | template <unsigned int block_size> 15 | __global__ void furthestsampling_cuda_kernel(const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx) 16 | { 17 | __shared__ float dists[block_size]; 18 | __shared__ int dists_i[block_size]; 19 | 20 | int bid = blockIdx.x; 21 | int start_n, end_n, start_m, end_m, old; 22 | if (bid == 0) { 23 | start_n = 0; 24 | end_n = offset[0]; 25 | start_m = 0; 26 | end_m = new_offset[0]; 27 | old = 0; 28 | } 29 | else { 30 | start_n = offset[bid - 1]; 31 | end_n = offset[bid]; 32 | start_m = new_offset[bid - 1]; 33 | end_m = new_offset[bid]; 34 | old = offset[bid - 1]; 35 | } 36 | 37 | const int stride = block_size; 38 | int tid = threadIdx.x; 39 | if (tid == 0) idx[start_m] = start_n; 40 | 41 | __syncthreads(); 42 | for (int j = start_m + 1; j < end_m; j++) 43 | { 44 | int besti = start_n; 45 | float best = -1; 46 | float x1 = xyz[old * 3 + 0]; 47 | float y1 = xyz[old * 3 + 1]; 48 | float z1 = xyz[old * 3 + 2]; 49 | for (int k = start_n + tid; k < end_n; k += stride) 50 | { 51 | float x2 = xyz[k * 3 + 0]; 52 | float y2 = xyz[k * 3 + 1]; 53 | float z2 = xyz[k * 3 + 2]; 54 | float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); 55 | float d2 = min(d, tmp[k]); 56 | tmp[k] = d2; 57 | besti = d2 > best ? k : besti; 58 | best = d2 > best ?
d2 : best; 59 | } 60 | dists[tid] = best; 61 | dists_i[tid] = besti; 62 | __syncthreads(); 63 | 64 | if (block_size >= 1024) { 65 | if (tid < 512) { 66 | __update(dists, dists_i, tid, tid + 512); 67 | } 68 | __syncthreads(); 69 | } 70 | if (block_size >= 512) { 71 | if (tid < 256) { 72 | __update(dists, dists_i, tid, tid + 256); 73 | } 74 | __syncthreads(); 75 | } 76 | if (block_size >= 256) { 77 | if (tid < 128) { 78 | __update(dists, dists_i, tid, tid + 128); 79 | } 80 | __syncthreads(); 81 | } 82 | if (block_size >= 128) { 83 | if (tid < 64) { 84 | __update(dists, dists_i, tid, tid + 64); 85 | } 86 | __syncthreads(); 87 | } 88 | if (block_size >= 64) { 89 | if (tid < 32) { 90 | __update(dists, dists_i, tid, tid + 32); 91 | } 92 | __syncthreads(); 93 | } 94 | if (block_size >= 32) { 95 | if (tid < 16) { 96 | __update(dists, dists_i, tid, tid + 16); 97 | } 98 | __syncthreads(); 99 | } 100 | if (block_size >= 16) { 101 | if (tid < 8) { 102 | __update(dists, dists_i, tid, tid + 8); 103 | } 104 | __syncthreads(); 105 | } 106 | if (block_size >= 8) { 107 | if (tid < 4) { 108 | __update(dists, dists_i, tid, tid + 4); 109 | } 110 | __syncthreads(); 111 | } 112 | if (block_size >= 4) { 113 | if (tid < 2) { 114 | __update(dists, dists_i, tid, tid + 2); 115 | } 116 | __syncthreads(); 117 | } 118 | if (block_size >= 2) { 119 | if (tid < 1) { 120 | __update(dists, dists_i, tid, tid + 1); 121 | } 122 | __syncthreads(); 123 | } 124 | 125 | old = dists_i[0]; 126 | if (tid == 0) 127 | idx[j] = old; 128 | } 129 | } 130 | 131 | void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx) 132 | { 133 | unsigned int n_threads = opt_n_threads(n); 134 | switch (n_threads) { 135 | case 1024: 136 | furthestsampling_cuda_kernel<1024><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 137 | break; 138 | case 512: 139 | furthestsampling_cuda_kernel<512><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 140 | break; 141 | case 256: 142 | furthestsampling_cuda_kernel<256><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 143 | break; 144 | case 128: 145 | furthestsampling_cuda_kernel<128><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 146 | break; 147 | case 64: 148 | furthestsampling_cuda_kernel<64><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 149 | break; 150 | case 32: 151 | furthestsampling_cuda_kernel<32><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 152 | break; 153 | case 16: 154 | furthestsampling_cuda_kernel<16><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 155 | break; 156 | case 8: 157 | furthestsampling_cuda_kernel<8><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 158 | break; 159 | case 4: 160 | furthestsampling_cuda_kernel<4><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 161 | break; 162 | case 2: 163 | furthestsampling_cuda_kernel<2><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 164 | break; 165 | case 1: 166 | furthestsampling_cuda_kernel<1><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 167 | break; 168 | default: 169 | furthestsampling_cuda_kernel<512><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx); 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /util/lr.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import LambdaLR, StepLR, OneCycleLR 2 | import torch.optim as optim 3 | 4 | class LambdaStepLR(LambdaLR): 5 | 6 | def __init__(self, optimizer, lr_lambda, last_step=-1): 7 | super(LambdaStepLR, self).__init__(optimizer, lr_lambda, last_step) 8 | 9 | @property 10 | def last_step(self): 11 |
"""Use last_epoch for the step counter""" 12 | return self.last_epoch 13 | 14 | @last_step.setter 15 | def last_step(self, v): 16 | self.last_epoch = v 17 | 18 | 19 | class PolyLRwithWarmup(LambdaStepLR): 20 | """DeepLab learning rate policy""" 21 | 22 | def __init__(self, optimizer, max_iter, warmup='linear', warmup_iters=1500, warmup_ratio=1e-6, power=1.0, last_step=-1): 23 | 24 | assert warmup == 'linear' 25 | def poly_with_warmup(s): 26 | coeff = (1 - s / (max_iter+1)) ** power 27 | if s <= warmup_iters: 28 | warmup_coeff = 1 - (1 - s / warmup_iters) * (1 - warmup_ratio) 29 | else: 30 | warmup_coeff = 1.0 31 | return coeff * warmup_coeff 32 | 33 | super(PolyLRwithWarmup, self).__init__(optimizer, poly_with_warmup, last_step) 34 | # torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=-1, verbose=False) 35 | # lr_lambda: A function which computes a multiplicative factor given an integer parameter epoch, or a list of such functions, one for each group in optimizer.param_groups. 36 | 37 | 38 | class MultiStepWithWarmup(LambdaStepLR): 39 | def __init__(self, optimizer, milestones, gamma=0.1, warmup='linear', warmup_iters=1500, warmup_ratio=1e-6, last_step=-1): 40 | 41 | assert warmup == 'linear' 42 | def multi_step_with_warmup(s): 43 | factor = 1.0 44 | for i in range(len(milestones)): 45 | if s < milestones[i]: 46 | break 47 | factor *= gamma 48 | 49 | if s <= warmup_iters: 50 | warmup_coeff = 1 - (1 - s / warmup_iters) * (1 - warmup_ratio) 51 | else: 52 | warmup_coeff = 1.0 53 | return warmup_coeff * factor 54 | 55 | super(MultiStepWithWarmup, self).__init__(optimizer, multi_step_with_warmup, last_step) 56 | 57 | 58 | class PolyLR(LambdaStepLR): 59 | """DeepLab learning rate policy""" 60 | 61 | def __init__(self, optimizer, max_iter, power=0.9, last_step=-1): 62 | super(PolyLR, self).__init__(optimizer, lambda s: (1 - s / (max_iter + 1))**power, last_step) 63 | # torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=-1, verbose=False) 64 | # lr_lambda: A function which computes a multiplicative factor given an integer parameter epoch, or a list of such functions, one for each group in optimizer.param_groups. 
65 | 66 | 67 | class SquaredLR(LambdaStepLR): 68 | """ Used for SGD Lars""" 69 | 70 | def __init__(self, optimizer, max_iter, last_step=-1): 71 | super(SquaredLR, self).__init__(optimizer, lambda s: (1 - s / (max_iter + 1))**2, last_step) 72 | 73 | 74 | class ExpLR(LambdaStepLR): 75 | 76 | def __init__(self, optimizer, step_size, gamma=0.9, last_step=-1): 77 | # (0.9 ** 21.854) = 0.1, (0.95 ** 44.8906) = 0.1 78 | # To get 0.1 every N using gamma 0.9, N * log(0.9)/log(0.1) = 0.04575749 N 79 | # To get 0.1 every N using gamma g, g ** N = 0.1 -> N * log(g) = log(0.1) -> g = np.exp(log(0.1) / N) 80 | super(ExpLR, self).__init__(optimizer, lambda s: gamma**(s / step_size), last_step) 81 | 82 | 83 | def initialize_scheduler(optimizer, config, last_epoch=-1, scheduler_epoch=True, logger=None): 84 | # scheduler_epoch: the step_size are given in epoch num 85 | last_step = -1 if last_epoch < 0 else config.iter_per_epoch_train * (last_epoch + 1) - 1 86 | if scheduler_epoch: 87 | config.step_size = config.iter_per_epoch_train * config.step_size 88 | config.exp_step_size = config.iter_per_epoch_train * config.exp_step_size 89 | 90 | if config.scheduler == 'StepLR': 91 | return StepLR(optimizer, step_size=config.step_size, gamma=config.step_gamma, last_epoch=last_step) 92 | elif config.scheduler == 'PolyLR': 93 | return PolyLR(optimizer, max_iter=config.max_iter, power=config.poly_power, last_step=last_step) 94 | elif config.scheduler == 'PolyLRwithWarmup': 95 | return PolyLRwithWarmup(optimizer, max_iter=config.max_iter, warmup=config.warmup, warmup_iters=config.warmup_iters, warmup_ratio=config.warmup_ratio, power=config.poly_power, last_step=last_step) 96 | elif config.scheduler == 'SquaredLR': 97 | return SquaredLR(optimizer, max_iter=config.max_iter, last_step=last_step) 98 | elif config.scheduler == 'ExpLR': 99 | return ExpLR(optimizer, step_size=config.exp_step_size, gamma=config.exp_gamma, last_step=last_step) 100 | elif config.scheduler == 'OneCycleLR': 101 | return OneCycleLR(optimizer, max_lr=config.oc_max_lr, total_steps=config.max_iter, pct_start=config.oc_pct_start, 102 | anneal_strategy=config.oc_anneal_strategy, div_factor=config.oc_div_factor, 103 | final_div_factor=config.oc_final_div_factor, last_epoch=last_step) 104 | # (optimizer, max_lr, total_steps=None, epochs=None, steps_per_epoch=None, pct_start=0.3, anneal_strategy='cos', cycle_momentum=True, base_momentum=0.85, max_momentum=0.95, div_factor=25.0, final_div_factor=10000.0, last_epoch=-1) 105 | else: 106 | if logger is not None: 107 | logger.info('Scheduler not supported') 108 | else: print('Scheduler not supported') 109 | 110 | 111 | if __name__ == '__main__': 112 | import torchvision.models as models 113 | model = models.vgg16() 114 | optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001) 115 | optimizer.param_groups[0]['initial_lr'] = 0.2 / 25.0 116 | optimizer.param_groups[0]['max_lr'] = 0.2 117 | optimizer.param_groups[0]['min_lr'] = 0.2 / 10000.0 118 | optimizer.param_groups[0]['max_momentum'] = 0.95 119 | optimizer.param_groups[0]['base_momentum'] = 0.85 120 | last_step = 2 121 | max_iter = 100 122 | # scheduler = PolyLR(optimizer, max_iter=max_iter, power=0.9, last_step=last_step) 123 | scheduler = OneCycleLR(optimizer, max_lr=0.2, total_steps=max_iter, pct_start=0.1, anneal_strategy='cos', div_factor=25.0, 124 | final_div_factor=10000.0, last_epoch=last_step) 125 | lr_list = [] 126 | for epoch in range(max(last_step + 1, 0), min(max_iter, 100)): 127 | lrs = ', '.join(['{:.5e}'.format(x) 
for x in scheduler.get_last_lr()]) 128 | print('epoch {} lrs {}'.format(epoch, lrs)) 129 | lr_list.append(scheduler.get_last_lr()[0]) 130 | scheduler.step() 131 | 132 | import numpy as np 133 | import matplotlib.pyplot as plt 134 | x = np.arange(max(last_step + 1, 0), min(max_iter, 100), 1) 135 | plt.title("function") 136 | plt.plot(x, lr_list) 137 | plt.show() -------------------------------------------------------------------------------- /lib/pointops/src/labelstat/labelstat_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "../cuda_utils.h" 2 | #include "labelstat_cuda_kernel.h" 3 | 4 | // input: new_xyz(b, m, 3) xyz(b, n, 3) label_stat(b, n, nclass) 5 | // output: idx(b, m, nsample) new_label_stat(b, m, nclass) 6 | __global__ void labelstat_and_ballquery_cuda_kernel_fast(int b, int n, int m, float radius, int nsample, int nclass, 7 | const float *new_xyz, const float *xyz, const int *label_stat, int *idx, int *new_label_stat) { 8 | int bs_idx = blockIdx.y; 9 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 10 | if (bs_idx >= b || pt_idx >= m) return; 11 | 12 | new_xyz += bs_idx * m * 3 + pt_idx * 3; 13 | xyz += bs_idx * n * 3; 14 | idx += bs_idx * m * nsample + pt_idx * nsample; 15 | label_stat += bs_idx * n * nclass; 16 | new_label_stat += bs_idx * m * nclass + pt_idx * nclass; 17 | 18 | for(int i = 0; i < nclass; i++){ 19 | new_label_stat[i] = 0; 20 | } 21 | 22 | float radius2 = radius * radius; 23 | float new_x = new_xyz[0]; 24 | float new_y = new_xyz[1]; 25 | float new_z = new_xyz[2]; 26 | 27 | int cnt = 0; 28 | for (int k = 0; k < n; ++k) { 29 | float x = xyz[k * 3 + 0]; 30 | float y = xyz[k * 3 + 1]; 31 | float z = xyz[k * 3 + 2]; 32 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 33 | if (d2 < radius2){ 34 | for(int i = 0; i < nclass; i++){ 35 | new_label_stat[i] += label_stat[k * nclass + i]; 36 | } 37 | if (cnt == 0){ 38 | for (int l = 0; l < nsample; ++l) { 39 | idx[l] = k; 40 | } 41 | } 42 | idx[cnt] = k; 43 | ++cnt; 44 | if (cnt >= nsample){ 45 | break; 46 | } 47 | } 48 | } 49 | } 50 | 51 | void labelstat_and_ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, int nclass, 52 | const float *new_xyz, const float *xyz, const int *label_stat, int *idx, int *new_label_stat, cudaStream_t stream) { 53 | // param new_xyz: (B, m, 3) 54 | // param xyz: (B, n, 3) 55 | // param idx: (B, m, nsample) 56 | 57 | cudaError_t err; 58 | 59 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 60 | dim3 threads(THREADS_PER_BLOCK); 61 | 62 | labelstat_and_ballquery_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample, nclass, new_xyz, xyz, label_stat, idx, new_label_stat); 63 | // cudaDeviceSynchronize(); // for using printf in kernel function 64 | 65 | err = cudaGetLastError(); 66 | if (cudaSuccess != err) { 67 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 68 | exit(-1); 69 | } 70 | } 71 | 72 | // input: new_xyz(b, m, 3) xyz(b, n, 3) label_stat(b, n, nclass) 73 | // output: new_label_stat(b, m, nclass) 74 | __global__ void labelstat_ballrange_cuda_kernel_fast(int b, int n, int m, float radius, int nclass, 75 | const float *new_xyz, const float *xyz, const int *label_stat, int *new_label_stat) { 76 | int bs_idx = blockIdx.y; 77 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 78 | if (bs_idx >= b || pt_idx >= m) return; 79 | 80 | new_xyz += bs_idx * m * 3 + pt_idx * 3; 81 | xyz += bs_idx * n * 3; 82 |
label_stat += bs_idx * n * nclass; 83 | new_label_stat += bs_idx * m * nclass + pt_idx * nclass; 84 | 85 | for(int i = 0; i < nclass; i++){ 86 | new_label_stat[i] = 0; 87 | } 88 | 89 | float radius2 = radius * radius; 90 | float new_x = new_xyz[0]; 91 | float new_y = new_xyz[1]; 92 | float new_z = new_xyz[2]; 93 | 94 | for (int k = 0; k < n; ++k) { 95 | float x = xyz[k * 3 + 0]; 96 | float y = xyz[k * 3 + 1]; 97 | float z = xyz[k * 3 + 2]; 98 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 99 | if (d2 < radius2){ 100 | for(int i = 0; i < nclass; i++){ 101 | new_label_stat[i] += label_stat[k * nclass + i]; 102 | } 103 | } 104 | } 105 | } 106 | 107 | 108 | void labelstat_ballrange_cuda_launcher_fast(int b, int n, int m, float radius, int nclass, 109 | const float *new_xyz, const float *xyz, const int *label_stat, int *new_label_stat, cudaStream_t stream) { 110 | // param new_xyz: (B, m, 3), xyz: (B, n, 3) 111 | // param label_stat: (B, n, nclass) 112 | // return new_label_stat: (B, m, nclass) 113 | 114 | cudaError_t err; 115 | 116 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 117 | dim3 threads(THREADS_PER_BLOCK); 118 | 119 | labelstat_ballrange_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, radius, nclass, new_xyz, xyz, label_stat, new_label_stat); 120 | // cudaDeviceSynchronize(); // for using printf in kernel function 121 | 122 | err = cudaGetLastError(); 123 | if (cudaSuccess != err) { 124 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 125 | exit(-1); 126 | } 127 | } 128 | 129 | // input: idx(b, m, nsample) label_stat(b, n, nclass) 130 | // output: new_label_stat(b, m, nclass) 131 | __global__ void labelstat_idx_cuda_kernel_fast(int b, int n, int m, int nsample, int nclass, 132 | const int *label_stat, const int *idx, int *new_label_stat) { 133 | int bs_idx = blockIdx.y; 134 | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; 135 | if (bs_idx >= b || pt_idx >= m) return; 136 | 137 | idx += bs_idx * m * nsample + pt_idx * nsample; 138 | label_stat += bs_idx * n * nclass; 139 | new_label_stat += bs_idx * m * nclass + pt_idx * nclass; 140 | 141 | for(int i = 0; i < nclass; i++){ 142 | new_label_stat[i] = 0; 143 | } 144 | 145 | for(int k = 0; k < nsample; k++){ 146 | const int *label_stat_k = label_stat + idx[k] * nclass; 147 | for(int i = 0; i < nclass; i++){ 148 | new_label_stat[i] += label_stat_k[i]; 149 | } 150 | } 151 | } 152 | 153 | 154 | void labelstat_idx_cuda_launcher_fast(int b, int n, int m, int nsample, int nclass, 155 | const int *label_stat, const int *idx, int *new_label_stat, cudaStream_t stream) { 156 | // param label_stat: (B, n, nclass) 157 | // param idx: (B, m, nsample) 158 | // return new_label_stat: (B, m, nclass) 159 | 160 | cudaError_t err; 161 | 162 | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) 163 | dim3 threads(THREADS_PER_BLOCK); 164 | 165 | labelstat_idx_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, nsample, nclass, label_stat, idx, new_label_stat); 166 | // cudaDeviceSynchronize(); // for using printf in kernel function 167 | 168 | err = cudaGetLastError(); 169 | if (cudaSuccess != err) { 170 | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); 171 | exit(-1); 172 | } 173 | } -------------------------------------------------------------------------------- /lib/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | /* written by Xin Lai.
Email: xinlai@cse.cuhk.edu.hk */ 2 | 3 | #include "../cuda_utils.h" 4 | #include "relative_pos_encoding_cuda_kernel.h" 5 | 6 | 7 | __global__ void dot_prod_with_idx_forward_cuda_kernel( // M, h, hdim 8 | int N, int M, int h, int hdim, const float *q, const int *index, 9 | const float *table, const int *rel_idx, float *output) { 10 | // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3), output: (M, h) 11 | 12 | int c_idx = blockIdx.z; 13 | int h_idx = blockIdx.y; 14 | int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; 15 | if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; 16 | 17 | int dim = thread_idx % 3; 18 | int m_idx = thread_idx / 3; 19 | 20 | int q_idx = index[m_idx]; 21 | int rel_idx_dim = rel_idx[thread_idx]; 22 | float rel_table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; 23 | float val = q[q_idx*h*hdim+h_idx*hdim+c_idx] * rel_table_val; 24 | atomicAdd(output+m_idx*h+h_idx, val); 25 | } 26 | 27 | __global__ void dot_prod_with_idx_backward_cuda_kernel( // M, h, hdim 28 | int N, int M, int h, int hdim, const float *grad_out, const float *q, const int *index, 29 | const float *table, const int *rel_idx, float *grad_q, float *grad_table) { 30 | 31 | int c_idx = blockIdx.z; 32 | int h_idx = blockIdx.y; 33 | int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; 34 | if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; 35 | 36 | int dim = thread_idx % 3; 37 | int m_idx = thread_idx / 3; 38 | 39 | int q_idx = index[m_idx]; 40 | int rel_idx_dim = rel_idx[thread_idx]; 41 | int grad_out_idx = m_idx*h+h_idx; 42 | float grad_out_value = grad_out[grad_out_idx]; 43 | 44 | float rel_table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; 45 | atomicAdd(grad_q+q_idx*h*hdim+h_idx*hdim+c_idx, grad_out_value * rel_table_val); 46 | 47 | float q_value = q[q_idx*h*hdim+h_idx*hdim+c_idx]; 48 | atomicAdd(grad_table+rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim, grad_out_value * q_value); 49 | } 50 | 51 | void dot_prod_with_idx_forward_cuda_launcher(int N, int M, int h, int hdim, const float *q, const int *index, 52 | const float *table, const int *rel_idx, float *output) { 53 | // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3) 54 | //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); 55 | dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); 56 | dim3 threads(THREADS_PER_BLOCK); 57 | dot_prod_with_idx_forward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, hdim, q, index, table, rel_idx, output); 58 | } 59 | 60 | void dot_prod_with_idx_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, 61 | const float *q, const int *index, const float *table, const int *rel_idx, float *grad_q, float *grad_table) { 62 | // input: grad_out: (M, h), output: grad_q: (N, h, hdim), grad_table: (L, h, hdim, 3) 63 | //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); 64 | dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); 65 | dim3 threads(THREADS_PER_BLOCK); 66 | dot_prod_with_idx_backward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, hdim, grad_out, q, index, table, rel_idx, grad_q, grad_table); 67 | } 68 | 69 | __global__ void attention_step2_with_rel_pos_value_forward_cuda_kernel( // M, h, hdim 70 | int N, int M, int h, int hdim, const float *attn, const float *v, 71 | const int *index0, const int *index1, const float *table, const int *rel_idx, float *output) { 72 | // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) 73 | 74 | int c_idx = blockIdx.z; 75 | int h_idx =
blockIdx.y; 76 | int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; 77 | if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; 78 | 79 | int dim = thread_idx % 3; 80 | int m_idx = thread_idx / 3; 81 | 82 | int idx1 = index1[m_idx]; 83 | 84 | int rel_idx_dim = rel_idx[thread_idx]; 85 | float table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; 86 | 87 | float val = attn[m_idx*h+h_idx] * (v[idx1*h*hdim+h_idx*hdim+c_idx] / 3.0 + table_val); 88 | 89 | int idx0 = index0[m_idx]; 90 | atomicAdd(output+idx0*h*hdim+h_idx*hdim+c_idx, val); 91 | } 92 | 93 | 94 | __global__ void attention_step2_with_rel_pos_value_backward_cuda_kernel( // M, h, hdim 95 | int N, int M, int h, int hdim, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, const float *table, 96 | const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) { 97 | // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) 98 | 99 | int c_idx = blockIdx.z; 100 | int h_idx = blockIdx.y; 101 | int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; 102 | if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; 103 | 104 | int dim = thread_idx % 3; 105 | int m_idx = thread_idx / 3; 106 | 107 | int idx0 = index0[m_idx]; 108 | int idx1 = index1[m_idx]; 109 | int grad_out_idx = idx0*h*hdim+h_idx*hdim+c_idx; 110 | 111 | int rel_idx_dim = rel_idx[thread_idx]; 112 | float table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; 113 | float grad_out_value = grad_out[grad_out_idx]; 114 | 115 | atomicAdd(grad_attn+m_idx*h+h_idx, grad_out_value * (v[idx1*h*hdim+h_idx*hdim+c_idx]/3 + table_val)); 116 | atomicAdd(grad_v+idx1*h*hdim+h_idx*hdim+c_idx, grad_out_value * attn[m_idx*h+h_idx]/3); 117 | atomicAdd(grad_table+rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim, grad_out_value * attn[m_idx*h+h_idx]); 118 | } 119 | 120 | void attention_step2_with_rel_pos_value_forward_cuda_launcher(int N, int M, int h, int hdim, const float *attn, const float *v, const int *index0, 121 | const int *index1, const float *table, const int *rel_idx, float *output) { 122 | // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) 123 | //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); 124 | dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); 125 | dim3 threads(THREADS_PER_BLOCK); 126 | attention_step2_with_rel_pos_value_forward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, hdim, attn, v, index0, index1, table, rel_idx, output); 127 | } 128 | 129 | void attention_step2_with_rel_pos_value_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const int *index0, 130 | const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) { 131 | // input: grad_out: (N, h, hdim); output: grad_attn: (M, h), grad_v: (N, h, hdim), grad_table: (L, h, hdim, 3) 132 | //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); 133 | dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); 134 | dim3 threads(THREADS_PER_BLOCK); 135 | attention_step2_with_rel_pos_value_backward_cuda_kernel<<<blocks, threads, 0>>>(N, M, h, hdim, grad_out, index0, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); 136 | } 137 | --------------------------------------------------------------------------------
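For reference, the fused `attention_step2_with_rel_pos_value` forward kernel above can be sanity-checked against a dense PyTorch sketch reconstructed from its indexing (this reference is not code shipped with the repository). Because the kernel accumulates `attn * (v/3 + table_val)` once per coordinate axis, `v` contributes exactly once in total:

```python
import torch

def attention_step2_with_rel_pos_value_ref(attn, v, index0, index1, table, rel_idx, N):
    # attn: (M, h), v: (N, h, hdim), index0/index1: (M,) long,
    # table: (L, h, hdim, 3), rel_idx: (M, 3) long
    M, h = attn.shape
    hdim = v.shape[2]
    # Sum the per-axis positional-encoding lookups over the 3 axes.
    pos = torch.zeros(M, h, hdim, dtype=v.dtype, device=v.device)
    for d in range(3):
        pos += table[rel_idx[:, d], :, :, d]          # (M, h, hdim)
    contrib = attn.unsqueeze(-1) * (v[index1] + pos)  # (M, h, hdim)
    out = torch.zeros(N, h, hdim, dtype=v.dtype, device=v.device)
    out.index_add_(0, index0, contrib)                # scatter-add onto query rows
    return out
```

Scattering with `index_add_` mirrors the kernel's `atomicAdd` accumulation onto the query rows indexed by `index0`.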