├── conda
│   └── torch-points-kernels
│       ├── run_test.sh
│       ├── build.sh
│       ├── run_test.bat
│       ├── README.md
│       ├── bld.bat
│       ├── meta.yaml
│       └── build_conda.sh
├── cpu
│   ├── include
│   │   ├── compat.h
│   │   ├── fps.h
│   │   ├── knn.h
│   │   ├── utils.h
│   │   ├── interpolate.h
│   │   ├── ball_query.h
│   │   ├── neighbors.h
│   │   └── cloud.h
│   └── src
│       ├── fps.cpp
│       ├── knn.cpp
│       ├── interpolate.cpp
│       ├── bindings.cpp
│       ├── ball_query.cpp
│       └── neighbors.cpp
├── cuda
│   ├── include
│   │   ├── compat.h
│   │   ├── sampling.h
│   │   ├── metrics.h
│   │   ├── interpolate.h
│   │   ├── ball_query.h
│   │   ├── chamfer_dist.h
│   │   ├── gridding.h
│   │   ├── cubic_feature_sampling.h
│   │   ├── utils.h
│   │   └── cuda_utils.h
│   └── src
│       ├── chamfer_dist.cpp
│       ├── bindings.cpp
│       ├── gridding.cpp
│       ├── sampling.cpp
│       ├── cubic_feature_sampling.cpp
│       ├── metrics.cpp
│       ├── metrics_gpu.cu
│       ├── interpolate.cpp
│       ├── ball_query.cpp
│       ├── ball_query_gpu.cu
│       ├── interpolate_gpu.cu
│       ├── sampling_gpu.cu
│       ├── cubic_feature_sampling_gpu.cu
│       ├── chamfer_dist_gpu.cu
│       └── gridding_gpu.cu
├── MANIFEST.in
├── .github
│   └── workflows
│       ├── cuda
│       │   ├── cu101-Linux-env.sh
│       │   ├── cu102-Linux-env.sh
│       │   ├── cu116-Linux-env.sh
│       │   ├── cu111-Linux-env.sh
│       │   ├── cu113-Linux-env.sh
│       │   ├── cu115-Linux-env.sh
│       │   ├── cu116-Windows-env.sh
│       │   ├── cu115-Windows-env.sh
│       │   ├── cu101-Windows-env.sh
│       │   ├── cu102-Windows-env.sh
│       │   ├── cu111-Windows-env.sh
│       │   ├── cu113-Windows-env.sh
│       │   ├── cu116-Linux.sh
│       │   ├── cu111-Linux.sh
│       │   ├── cu113-Linux.sh
│       │   ├── cu115-Linux.sh
│       │   ├── cu102-Linux.sh
│       │   ├── cu101-Linux.sh
│       │   ├── cu111-Windows.sh
│       │   ├── cu101-Windows.sh
│       │   ├── cu102-Windows.sh
│       │   ├── cu113-Windows.sh
│       │   ├── cu116-Windows.sh
│       │   └── cu115-Windows.sh
│       ├── deploy.yaml
│       ├── tests.yaml
│       └── building-conda.yml
├── test
│   ├── __init__.py
│   ├── test_fps.py
│   ├── test_gridding.py
│   ├── test_chamfer_dist.py
│   ├── test_knn.py
│   ├── test_interpolate.py
│   ├── test_grouping.py
│   ├── test_cluster.py
│   ├── test_cubic_feature_sampling.py
│   ├── speed_radius.py
│   ├── test_metrics.py
│   └── test_ballquerry.py
├── .gitignore
├── torch_points_kernels
│   ├── __init__.py
│   ├── knn.py
│   ├── chamfer_dist.py
│   ├── gridding.py
│   ├── cubic_feature_sampling.py
│   ├── cluster.py
│   ├── metrics.py
│   └── torchpoints.py
├── .black.toml
├── .pre-commit-config.yaml
├── LICENSE
├── benchmark
│   └── region_cluster.py
├── .devcontainer
│   ├── devcontainer.json
│   └── Dockerfile
├── CHANGELOG.md
├── .clang-format
├── setup.py
└── README.md

/conda/torch-points-kernels/run_test.sh:
--------------------------------------------------------------------------------
1 | $PYTHON -m unittest
--------------------------------------------------------------------------------
/conda/torch-points-kernels/build.sh:
--------------------------------------------------------------------------------
1 | $PYTHON -m pip install .
2 | -------------------------------------------------------------------------------- /conda/torch-points-kernels/run_test.bat: -------------------------------------------------------------------------------- 1 | "%PYTHON%" -m unittest 2 | if errorlevel 1 exit 1 -------------------------------------------------------------------------------- /cpu/include/compat.h: -------------------------------------------------------------------------------- 1 | #ifdef VERSION_GE_1_3 2 | #define DATA_PTR data_ptr 3 | #else 4 | #define DATA_PTR data 5 | #endif 6 | -------------------------------------------------------------------------------- /cuda/include/compat.h: -------------------------------------------------------------------------------- 1 | #ifdef VERSION_GE_1_3 2 | #define DATA_PTR data_ptr 3 | #else 4 | #define DATA_PTR data 5 | #endif 6 | -------------------------------------------------------------------------------- /cpu/include/fps.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | at::Tensor fps(at::Tensor points, const int nsamples, bool random = true); 4 | -------------------------------------------------------------------------------- /cuda/include/sampling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples); 5 | -------------------------------------------------------------------------------- /cpu/include/knn.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | std::pair dense_knn(at::Tensor support, at::Tensor query, int k); 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include CHANGELOG.md 4 | include pyproject.toml 5 | 6 | recursive-exclude test * 7 | recursive-include cpu * 8 | recursive-include cuda * 9 | -------------------------------------------------------------------------------- /cpu/include/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #define CHECK_CPU(x) AT_ASSERTM(!x.is_cuda(), #x " must be a CPU tensor") 5 | 6 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be a contiguous tensor") 7 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu101-Linux-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-10.1 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu102-Linux-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-10.2 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu116-Linux-env.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-11.6 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" -------------------------------------------------------------------------------- /.github/workflows/cuda/cu111-Linux-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-11.1 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu113-Linux-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-11.3 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu115-Linux-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-11.5 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 9 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def run_if_cuda(func): 5 | def wrapped_func(*args, **kwargs): 6 | if torch.cuda.is_available(): 7 | return func(*args, **kwargs) 8 | else: 9 | return 10 | 11 | return wrapped_func 12 | -------------------------------------------------------------------------------- /conda/torch-points-kernels/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ./build_conda.sh 3.9 1.9.0 cu111 # python, pytorch and cuda version 3 | ``` 4 | 5 | These conda scripts are based off of [pytorch_sparse](https://github.com/rusty1s/pytorch_sparse/tree/master/conda/pytorch-sparse), track changes there to update. 
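A CPU-only conda package can be produced with the same script by passing `cpu` as the CUDA argument; this is a usage sketch based on the `cpu` branch of `build_conda.sh` (the Python and PyTorch versions here are only an example):

```
./build_conda.sh 3.8 1.12.0 cpu  # python, pytorch version, and "cpu" for a build without CUDA
```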
-------------------------------------------------------------------------------- /cpu/include/interpolate.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor knn_interpolate(at::Tensor features, at::Tensor idx, at::Tensor weight); 5 | 6 | at::Tensor knn_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight, 7 | const int m); 8 | -------------------------------------------------------------------------------- /cuda/include/metrics.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor instance_iou_cuda(at::Tensor instance_idx, at::Tensor instance_offsets, 5 | at::Tensor gt_instances, at::Tensor gt_instance_sizes, 6 | at::Tensor num_gt_instances, at::Tensor batch); 7 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu116-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="6.0+PTX" -------------------------------------------------------------------------------- /.github/workflows/cuda/cu115-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="6.0+PTX" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu101-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v10.1 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu102-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v10.2 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu111-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.1 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 9 | -------------------------------------------------------------------------------- 
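The `*-env.sh` files in this folder only export toolchain variables (`CUDA_HOME`, `PATH`, `FORCE_CUDA`, `TORCH_CUDA_ARCH_LIST`); the workflow steps that consume them are not part of this listing. A plausible sketch, assuming the wheel-building jobs source the file matching their CUDA matrix entry before compiling:

```
source .github/workflows/cuda/cu111-Linux-env.sh
python setup.py bdist_wheel  # the build picks up FORCE_CUDA and TORCH_CUDA_ARCH_LIST from the environment
```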
/.github/workflows/cuda/cu113-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 9 | -------------------------------------------------------------------------------- /conda/torch-points-kernels/bld.bat: -------------------------------------------------------------------------------- 1 | copy "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Tools\\MSVC\\14.29.30133\\lib\\x64\\metis.lib" %LIBRARY_LIB% 2 | if errorlevel 1 exit 1 3 | copy "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Tools\\MSVC\\14.29.30133\\include\\metis.h" %LIBRARY_INC% 4 | if errorlevel 1 exit 1 5 | 6 | "%PYTHON%" -m pip install . 7 | if errorlevel 1 exit 1 -------------------------------------------------------------------------------- /cuda/include/interpolate.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | std::vector three_nn(at::Tensor unknowns, at::Tensor knows); 7 | at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, at::Tensor weight); 8 | at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight, 9 | const int m); 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | build 35 | *.pyc 36 | 37 | .vscode/ 38 | dist/ 39 | torch_points_kernels.egg-info/ 40 | -------------------------------------------------------------------------------- /torch_points_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | from .torchpoints import * 2 | from .knn import knn 3 | from .cluster import region_grow 4 | from .metrics import instance_iou 5 | from .cubic_feature_sampling import cubic_feature_sampling 6 | 7 | __all__ = [ 8 | "ball_query", 9 | "furthest_point_sample", 10 | "grouping_operation", 11 | "three_interpolate", 12 | "three_nn", 13 | "knn", 14 | "region_grow", 15 | "instance_iou", 16 | "chamfer_dist", 17 | "cubic_feature_sampling", 18 | "gridding", 19 | ] 20 | -------------------------------------------------------------------------------- /cuda/src/chamfer_dist.cpp: -------------------------------------------------------------------------------- 1 | #include "chamfer_dist.h" 2 | 3 | std::vector chamfer_dist(torch::Tensor xyz1, torch::Tensor xyz2) 4 | { 5 | return chamfer_dist_kernel_wrapper(xyz1, xyz2); 6 | } 7 | 8 | std::vector chamfer_dist_grad(torch::Tensor xyz1, torch::Tensor xyz2, 9 | torch::Tensor idx1, torch::Tensor idx2, 10 | torch::Tensor grad_dist1, torch::Tensor grad_dist2) 11 | { 12 | return chamfer_dist_grad_kernel_wrapper(xyz1, xyz2, idx1, idx2, grad_dist1, grad_dist2); 13 
| } 14 | -------------------------------------------------------------------------------- /cuda/include/ball_query.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | std::pair ball_query_dense(at::Tensor new_xyz, at::Tensor xyz, 5 | const float radius, const int nsample); 6 | 7 | std::pair ball_query_partial_dense(at::Tensor x, at::Tensor y, 8 | at::Tensor batch_x, at::Tensor batch_y, 9 | const float radius, const int nsample); 10 | 11 | at::Tensor degree(at::Tensor row, int64_t num_nodes); 12 | -------------------------------------------------------------------------------- /.black.toml: -------------------------------------------------------------------------------- 1 | # Example configuration for Black. 2 | 3 | # NOTE: you have to use single-quoted strings in TOML for regular expressions. 4 | # It's the equivalent of r-strings in Python. Multiline strings are treated as 5 | # verbose regular expressions by Black. Use [ ] to denote a significant space 6 | # character. 7 | 8 | [tool.black] 9 | line-length = 120 10 | target-version = ['py36', 'py37', 'py38'] 11 | include = '\.pyi?$' 12 | exclude = ''' 13 | /( 14 | \.eggs 15 | | \.git 16 | | \.hg 17 | | \.mypy_cache 18 | | \.tox 19 | | \.venv 20 | | _build 21 | | buck-out 22 | | build 23 | | dist 24 | )/ 25 | ''' 26 | -------------------------------------------------------------------------------- /torch_points_kernels/knn.py: -------------------------------------------------------------------------------- 1 | import torch_points_kernels.points_cpu as tpcpu 2 | 3 | 4 | def knn(pos_support, pos, k): 5 | """Dense knn serach 6 | Arguments: 7 | pos_support - [B,N,3] support points 8 | pos - [B,M,3] centre of queries 9 | k - number of neighboors, needs to be > N 10 | 11 | Returns: 12 | idx - [B,M,k] 13 | dist2 - [B,M,k] squared distances 14 | """ 15 | assert pos_support.dim() == 3 and pos.dim() == 3 16 | if pos_support.is_cuda: 17 | raise ValueError("CUDA version not implemented, use pytorch geometric") 18 | return tpcpu.dense_knn(pos_support, pos, k) 19 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: "build|egg-info|dist" 2 | 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.3.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: check-added-large-files 9 | - id: end-of-file-fixer 10 | 11 | - repo: https://github.com/psf/black 12 | rev: 22.3.0 13 | hooks: 14 | - id: black 15 | language_version: python3.7 16 | args: ["--config", ".black.toml"] 17 | - repo: local 18 | hooks: 19 | - id: clang-format 20 | name: Run clang-format 21 | entry: clang-format -i 22 | types: [text] 23 | files: '.*\.cpp$|.*\.h$|.*\.cu$|.*\.hpp$' 24 | language: system 25 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu116-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb 9 | sudo apt-key add 
/var/cuda-repo-${OS}-11-6-local/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-11-6 cuda-libraries-dev-11-6 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb -------------------------------------------------------------------------------- /.github/workflows/cuda/cu111-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda-repo-${OS}-11-1-local_11.1.1-455.32.00-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-11-1-local_11.1.1-455.32.00-1_amd64.deb 9 | sudo apt-key add /var/cuda-repo-${OS}-11-1-local/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-11-1 cuda-libraries-dev-11-1 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda-repo-${OS}-11-1-local_11.1.1-455.32.00-1_amd64.deb 16 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu113-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/11.3.0/local_installers/cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb 9 | sudo apt-key add /var/cuda-repo-${OS}-11-3-local/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-11-3 cuda-libraries-dev-11-3 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/11.3.0/local_installers/cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb 16 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu115-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb 9 | sudo apt-key add /var/cuda-repo-${OS}-11-5-local/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-11-5 cuda-libraries-dev-11-5 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb 16 | -------------------------------------------------------------------------------- /test/test_fps.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import os 4 | import sys 5 | 6 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 7 | sys.path.insert(0, 
ROOT) 8 | 9 | from torch_points_kernels.points_cpu import fps 10 | 11 | 12 | class TestFps(unittest.TestCase): 13 | def test_simplecpu(self): 14 | points = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]], [[-1, 1, 0], [0, 0, 10], [0, 0, 2]]]).float() 15 | idx = fps(points, 2, False) 16 | torch.testing.assert_allclose(idx, torch.tensor([[0, 2], [0, 1]])) 17 | 18 | def test_random(self): 19 | points = torch.randn(10, 100, 3) 20 | idx = fps(points, 2, True) 21 | self.assertNotEqual(idx[0][0], 0) 22 | 23 | 24 | if __name__ == "__main__": 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu102-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb 9 | sudo apt-key add /var/cuda-repo-10-2-local-10.2.89-440.33.01/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-10-2 cuda-libraries-dev-10-2 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb 16 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu101-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-${OS}-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb 9 | sudo apt-key add /var/cuda-repo-10-1-local-10.1.243-418.87.00/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-10-1 cuda-libraries-dev-10-1 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-${OS}-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb 16 | -------------------------------------------------------------------------------- /cpu/include/ball_query.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | std::pair ball_query(at::Tensor query, at::Tensor support, float radius, 4 | int max_num, int mode, bool sorted); 5 | 6 | std::pair batch_ball_query(at::Tensor query, at::Tensor support, 7 | at::Tensor query_batch, at::Tensor support_batch, 8 | float radius, int max_num, int mode, 9 | bool sorted); 10 | 11 | std::pair dense_ball_query(at::Tensor query, at::Tensor support, 12 | float radius, int max_num, int mode, 13 | bool sorted); 14 | -------------------------------------------------------------------------------- /test/test_gridding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import torch 5 | import unittest 6 | 7 | from torch.autograd import gradcheck 8 
| 9 | from . import run_if_cuda 10 | 11 | 12 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 13 | sys.path.insert(0, ROOT) 14 | 15 | from torch_points_kernels.gridding import GriddingFunction 16 | 17 | 18 | class TestGridding(unittest.TestCase): 19 | @run_if_cuda 20 | def test_gridding_function_32pts(self): 21 | x = torch.rand(1, 32, 3) 22 | x.requires_grad = True 23 | self.assertTrue(gradcheck(GriddingFunction.apply, [x.double().cuda(), 4])) 24 | 25 | @run_if_cuda 26 | def test_gridding_function_64pts(self): 27 | x = torch.rand(1, 64, 3) 28 | x.requires_grad = True 29 | self.assertTrue(gradcheck(GriddingFunction.apply, [x.double().cuda(), 8])) 30 | -------------------------------------------------------------------------------- /cuda/include/chamfer_dist.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | std::vector chamfer_dist(torch::Tensor xyz1, torch::Tensor xyz2); 5 | 6 | std::vector chamfer_dist_grad(torch::Tensor xyz1, torch::Tensor xyz2, 7 | torch::Tensor idx1, torch::Tensor idx2, 8 | torch::Tensor grad_dist1, torch::Tensor grad_dist2); 9 | 10 | std::vector chamfer_dist_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2); 11 | 12 | std::vector chamfer_dist_grad_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2, 13 | torch::Tensor idx1, torch::Tensor idx2, 14 | torch::Tensor grad_dist1, 15 | torch::Tensor grad_dist2); 16 | -------------------------------------------------------------------------------- /cuda/include/gridding.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | std::vector gridding_kernel_warpper(float min_x, float max_x, float min_y, 7 | float max_y, float min_z, float max_z, 8 | torch::Tensor ptcloud, cudaStream_t stream); 9 | 10 | torch::Tensor gridding_grad_kernel_warpper(torch::Tensor grid_pt_weights, 11 | torch::Tensor grid_pt_indexes, torch::Tensor grad_grid, 12 | cudaStream_t stream); 13 | 14 | std::vector gridding(float min_x, float max_x, float min_y, float max_y, float min_z, 15 | float max_z, torch::Tensor ptcloud); 16 | 17 | torch::Tensor gridding_grad(torch::Tensor grid_pt_weights, torch::Tensor grid_pt_indexes, 18 | torch::Tensor grad_grid); 19 | -------------------------------------------------------------------------------- /cpu/include/neighbors.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include "cloud.h" 4 | #include "nanoflann.hpp" 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | template 11 | int nanoflann_neighbors(vector& queries, vector& supports, 12 | vector& neighbors_indices, vector& dists, float radius, 13 | int max_num, int mode, bool sorted); 14 | 15 | template 16 | int batch_nanoflann_neighbors(vector& queries, vector& supports, 17 | vector& q_batches, vector& s_batches, 18 | vector& neighbors_indices, vector& dists, 19 | float radius, int max_num, int mode, bool sorted); 20 | 21 | template 22 | void nanoflann_knn_neighbors(vector& queries, vector& supports, 23 | vector& neighbors_indices, vector& dists, int k); 24 | -------------------------------------------------------------------------------- /cuda/src/bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "ball_query.h" 2 | #include "chamfer_dist.h" 3 | #include "cubic_feature_sampling.h" 4 | #include "gridding.h" 5 | #include "interpolate.h" 6 | #include "metrics.h" 7 | #include 
"sampling.h" 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 10 | { 11 | m.def("furthest_point_sampling", &furthest_point_sampling); 12 | 13 | m.def("three_nn", &three_nn); 14 | m.def("three_interpolate", &three_interpolate); 15 | m.def("three_interpolate_grad", &three_interpolate_grad); 16 | 17 | m.def("ball_query_dense", &ball_query_dense); 18 | m.def("ball_query_partial_dense", &ball_query_partial_dense); 19 | 20 | m.def("instance_iou_cuda", &instance_iou_cuda); 21 | 22 | m.def("chamfer_dist", &chamfer_dist); 23 | m.def("chamfer_dist_grad", &chamfer_dist_grad); 24 | 25 | m.def("cubic_feature_sampling", &cubic_feature_sampling); 26 | m.def("cubic_feature_sampling_grad", &cubic_feature_sampling_grad); 27 | 28 | m.def("gridding", &gridding); 29 | m.def("gridding_grad", &gridding_grad); 30 | } 31 | -------------------------------------------------------------------------------- /cuda/src/gridding.cpp: -------------------------------------------------------------------------------- 1 | #include "gridding.h" 2 | #include "utils.h" 3 | 4 | std::vector gridding(float min_x, float max_x, float min_y, float max_y, float min_z, 5 | float max_z, torch::Tensor ptcloud) 6 | { 7 | CHECK_CUDA(ptcloud); 8 | CHECK_CONTIGUOUS(ptcloud); 9 | 10 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 11 | return gridding_kernel_warpper(min_x, max_x, min_y, max_y, min_z, max_z, ptcloud, stream); 12 | } 13 | 14 | torch::Tensor gridding_grad(torch::Tensor grid_pt_weights, torch::Tensor grid_pt_indexes, 15 | torch::Tensor grad_grid) 16 | { 17 | CHECK_CUDA(grid_pt_weights); 18 | CHECK_CONTIGUOUS(grid_pt_weights); 19 | CHECK_CUDA(grid_pt_indexes); 20 | CHECK_CONTIGUOUS(grid_pt_indexes); 21 | CHECK_CUDA(grad_grid); 22 | CHECK_CONTIGUOUS(grad_grid); 23 | 24 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 25 | return gridding_grad_kernel_warpper(grid_pt_weights, grid_pt_indexes, grad_grid, stream); 26 | } 27 | -------------------------------------------------------------------------------- /cuda/src/sampling.cpp: -------------------------------------------------------------------------------- 1 | #include "sampling.h" 2 | #include "compat.h" 3 | #include "utils.h" 4 | 5 | void furthest_point_sampling_kernel_wrapper(int b, int n, int m, const float* dataset, float* temp, 6 | int* idxs); 7 | 8 | at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) 9 | { 10 | CHECK_CONTIGUOUS(points); 11 | CHECK_IS_FLOAT(points); 12 | CHECK_CUDA(points); 13 | 14 | at::Tensor output = torch::zeros({points.size(0), nsamples}, 15 | at::device(points.device()).dtype(at::ScalarType::Int)); 16 | 17 | at::Tensor tmp = torch::full({points.size(0), points.size(1)}, 1e10, 18 | at::device(points.device()).dtype(at::ScalarType::Float)); 19 | 20 | furthest_point_sampling_kernel_wrapper(points.size(0), points.size(1), nsamples, 21 | points.DATA_PTR(), tmp.DATA_PTR(), 22 | output.DATA_PTR()); 23 | 24 | return output; 25 | } 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Nicolas Chaulet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit 
persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu111-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install NVIDIA drivers, see: 4 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 5 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 6 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 7 | 8 | export CUDA_SHORT=11.1 9 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.1/local_installers 10 | export CUDA_FILE=cuda_${CUDA_SHORT}.1_456.81_win10.exe 11 | 12 | # Install CUDA: 13 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 14 | echo "" 15 | echo "Installing from ${CUDA_FILE}..." 16 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 17 | echo "Done!" 18 | rm -f "${CUDA_FILE}" 19 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu101-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install NVIDIA drivers, see: 4 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 5 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 6 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 7 | 8 | export CUDA_SHORT=10.1 9 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers/ 10 | export CUDA_FILE=cuda_${CUDA_SHORT}.243_426.00_win10.exe 11 | 12 | # Install CUDA: 13 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 14 | echo "" 15 | echo "Installing from ${CUDA_FILE}..." 16 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 17 | echo "Done!" 
18 | rm -f "${CUDA_FILE}" 19 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu102-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install NVIDIA drivers, see: 4 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 5 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 6 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 7 | 8 | export CUDA_SHORT=10.2 9 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers 10 | export CUDA_FILE=cuda_${CUDA_SHORT}.89_441.22_win10.exe 11 | 12 | # Install CUDA: 13 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 14 | echo "" 15 | echo "Installing from ${CUDA_FILE}..." 16 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 17 | echo "Done!" 18 | rm -f "${CUDA_FILE}" 19 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu113-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install NVIDIA drivers, see: 4 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 5 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 6 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 7 | 8 | export CUDA_SHORT=11.3 9 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers 10 | export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe 11 | 12 | # Install CUDA: 13 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 14 | echo "" 15 | echo "Installing from ${CUDA_FILE}..." 16 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 17 | echo "Done!" 
18 | rm -f "${CUDA_FILE}" 19 | -------------------------------------------------------------------------------- /cpu/src/fps.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "compat.h" 4 | #include "utils.h" 5 | 6 | at::Tensor get_dist(at::Tensor x, ptrdiff_t index) 7 | { 8 | return (x - x[index]).norm(2, 1); 9 | } 10 | 11 | at::Tensor fps(at::Tensor points, const int nsamples, bool random) 12 | { 13 | CHECK_CONTIGUOUS(points); 14 | 15 | auto out_options = torch::TensorOptions().dtype(torch::kLong).device(torch::kCPU); 16 | auto batch_size = points.size(0); 17 | auto out = torch::empty({batch_size, nsamples}, out_options); 18 | auto out_a = out.accessor(); 19 | 20 | for (ptrdiff_t b = 0; b < batch_size; b++) 21 | { 22 | auto y = points[b]; 23 | ptrdiff_t start = 0; 24 | if (random) 25 | start = at::randperm(y.size(0), out_options).DATA_PTR()[0]; 26 | 27 | out_a[b][0] = start; 28 | auto dist = get_dist(y, start); 29 | for (ptrdiff_t i = 1; i < nsamples; i++) 30 | { 31 | ptrdiff_t argmax = dist.argmax().DATA_PTR()[0]; 32 | out_a[b][i] = argmax; 33 | dist = at::min(dist, get_dist(y, argmax)); 34 | } 35 | } 36 | return out; 37 | } 38 | -------------------------------------------------------------------------------- /conda/torch-points-kernels/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: torch-points-kernels 3 | version: 0.7.1 4 | 5 | source: 6 | path: ../.. 7 | 8 | requirements: 9 | build: 10 | - {{ compiler('c') }} # [win] 11 | 12 | host: 13 | - pip 14 | - python {{ environ.get('PYTHON_VERSION') }} 15 | - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} 16 | - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }} 17 | 18 | run: 19 | - python {{ environ.get('PYTHON_VERSION') }} 20 | - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} 21 | - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }} 22 | - numpy 23 | - numba 24 | - scikit-learn 25 | 26 | build: 27 | string: py{{ environ.get('PYTHON_VERSION').replace('.', '') }}_torch_{{ environ['TORCH_VERSION'] }}_{{ environ['CUDA_VERSION'] }} 28 | script_env: 29 | - FORCE_CUDA 30 | - TORCH_CUDA_ARCH_LIST 31 | preserve_egg_dir: True 32 | 33 | test: 34 | source_files: 35 | - test 36 | 37 | about: 38 | home: https://github.com/torch-points3d/torch-points-kernels 39 | license: MIT 40 | summary: Pytorch CPU and CUDA kernels for spatial search and interpolation for 3D point clouds. 
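# Worked example (not part of the original recipe): with the invocation shown in
# conda/torch-points-kernels/README.md, ./build_conda.sh 3.9 1.9.0 cu111, the
# build string template above resolves to "py39_torch_1.9.0_cu111".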
41 | -------------------------------------------------------------------------------- /benchmark/region_cluster.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import os 4 | import sys 5 | import time 6 | import random 7 | 8 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 9 | sys.path.insert(0, ROOT) 10 | 11 | from torch_points_kernels.cluster import grow_proximity 12 | 13 | torch.manual_seed(0) 14 | 15 | num_points = 100000 16 | pos1 = torch.rand((num_points, 3)) 17 | pos2 = torch.rand((num_points, 3)) + 2 18 | pos3 = torch.rand((num_points, 3)) + 4 19 | labels1 = torch.ones(num_points).long() 20 | labels2 = torch.ones(num_points).long() 21 | labels3 = torch.ones(num_points).long() 22 | pos = torch.cat([pos1, pos2, pos3], 0) 23 | label = torch.cat([labels1, labels2, labels3], 0) 24 | batch = torch.ones((3 * num_points)).long() 25 | cl = grow_proximity(pos, batch, radius=0.5) 26 | 27 | 28 | import cProfile, pstats, io 29 | from pstats import SortKey 30 | 31 | pr = cProfile.Profile() 32 | pr.enable() 33 | t_start = time.perf_counter() 34 | grow_proximity(pos, batch) 35 | print(time.perf_counter() - t_start) 36 | pr.disable() 37 | s = io.StringIO() 38 | sortby = SortKey.CUMULATIVE 39 | ps = pstats.Stats(pr, stream=s).sort_stats(sortby) 40 | ps.print_stats() 41 | print(s.getvalue()) 42 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.101.1/containers/python-3 3 | { 4 | "name": "Python 3", 5 | "context": "..", 6 | "dockerFile": "Dockerfile", 7 | // Set *default* container specific settings.json values on container create. 8 | "settings": { 9 | "terminal.integrated.shell.linux": "/bin/bash", 10 | "python.pythonPath": "/usr/local/bin/python", 11 | "python.linting.enabled": true, 12 | "python.linting.pylintEnabled": true, 13 | "python.linting.pylintPath": "/usr/local/bin/pylint" 14 | }, 15 | // Add the IDs of extensions you want installed when the container is created. 16 | "extensions": [ 17 | "ms-python.python", 18 | "ms-vscode.cpptools" 19 | ] 20 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 21 | // "forwardPorts": [], 22 | // Use 'postCreateCommand' to run commands after the container is created. 23 | // "postCreateCommand": "pip install -r requirements.txt", 24 | // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root. 
25 | // "remoteUser": "vscode" 26 | } 27 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu116-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # TODO We currently use CUDA 11.3 to build CUDA 11.5 Windows wheels 4 | 5 | # Install NVIDIA drivers, see: 6 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 7 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 8 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 9 | 10 | export CUDA_SHORT=11.3 11 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers 12 | export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe 13 | 14 | # Install CUDA: 15 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 16 | echo "" 17 | echo "Installing from ${CUDA_FILE}..." 18 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 19 | echo "Done!" 20 | rm -f "${CUDA_FILE}" -------------------------------------------------------------------------------- /.github/workflows/cuda/cu115-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # TODO We currently use CUDA 11.3 to build CUDA 11.5 Windows wheels 4 | 5 | # Install NVIDIA drivers, see: 6 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 7 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 8 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 9 | 10 | export CUDA_SHORT=11.3 11 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers 12 | export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe 13 | 14 | # Install CUDA: 15 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 16 | echo "" 17 | echo "Installing from ${CUDA_FILE}..." 18 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 19 | echo "Done!" 
20 | rm -f "${CUDA_FILE}" 21 | -------------------------------------------------------------------------------- /cuda/include/cubic_feature_sampling.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | std::vector cubic_feature_sampling(int scale, int neighborhood_size, 7 | torch::Tensor ptcloud, 8 | torch::Tensor cubic_features); 9 | 10 | std::vector cubic_feature_sampling_grad(int scale, int neighborhood_size, 11 | torch::Tensor grad_point_features, 12 | torch::Tensor grid_pt_indexes); 13 | 14 | std::vector cubic_feature_sampling_kernel_wrapper(int scale, int neighborhood_size, 15 | torch::Tensor ptcloud, 16 | torch::Tensor cubic_features, 17 | cudaStream_t stream); 18 | 19 | std::vector 20 | cubic_feature_sampling_grad_kernel_wrapper(int scale, int neighborhood_size, 21 | torch::Tensor grad_point_features, 22 | torch::Tensor grid_pt_indexes, cudaStream_t stream); 23 | -------------------------------------------------------------------------------- /conda/torch-points-kernels/build_conda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHON_VERSION=$1 4 | export TORCH_VERSION=$2 5 | export CUDA_VERSION=$3 6 | 7 | export CONDA_PYTORCH_CONSTRAINT="pytorch==${TORCH_VERSION%.*}.*" 8 | 9 | if [ "${CUDA_VERSION}" = "cpu" ]; then 10 | export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly # [not osx]" 11 | else 12 | case $CUDA_VERSION in 13 | cu116) 14 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.6.*" 15 | ;; 16 | cu115) 17 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.5.*" 18 | ;; 19 | cu113) 20 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.3.*" 21 | ;; 22 | cu111) 23 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.1.*" 24 | ;; 25 | cu102) 26 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.2.*" 27 | ;; 28 | cu101) 29 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.1.*" 30 | ;; 31 | *) 32 | echo "Unrecognized CUDA_VERSION=$CUDA_VERSION" 33 | exit 1 34 | ;; 35 | esac 36 | fi 37 | 38 | echo "PyTorch $TORCH_VERSION+$CUDA_VERSION" 39 | echo "- $CONDA_PYTORCH_CONSTRAINT" 40 | echo "- $CONDA_CUDATOOLKIT_CONSTRAINT" 41 | 42 | if [ "${CUDA_VERSION}" = "cu116" ]; then 43 | conda build . -c pytorch -c default -c nvidia -c conda-forge --output-folder "$HOME/conda-bld" 44 | else 45 | conda build . 
-c pytorch -c default -c nvidia --output-folder "$HOME/conda-bld" 46 | fi -------------------------------------------------------------------------------- /cuda/src/cubic_feature_sampling.cpp: -------------------------------------------------------------------------------- 1 | #include "cubic_feature_sampling.h" 2 | #include "utils.h" 3 | 4 | std::vector cubic_feature_sampling(int scale, int neighborhood_size, 5 | torch::Tensor ptcloud, 6 | torch::Tensor cubic_features) 7 | { 8 | CHECK_CUDA(ptcloud); 9 | CHECK_CONTIGUOUS(ptcloud); 10 | CHECK_CUDA(cubic_features); 11 | CHECK_CONTIGUOUS(cubic_features); 12 | 13 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 14 | return cubic_feature_sampling_kernel_wrapper(scale, neighborhood_size, ptcloud, cubic_features, 15 | stream); 16 | } 17 | 18 | std::vector cubic_feature_sampling_grad(int scale, int neighborhood_size, 19 | torch::Tensor grad_point_features, 20 | torch::Tensor grid_pt_indexes) 21 | { 22 | CHECK_CUDA(grad_point_features); 23 | CHECK_CONTIGUOUS(grad_point_features); 24 | CHECK_CUDA(grid_pt_indexes); 25 | CHECK_CONTIGUOUS(grid_pt_indexes); 26 | 27 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 28 | return cubic_feature_sampling_grad_kernel_wrapper(scale, neighborhood_size, grad_point_features, 29 | grid_pt_indexes, stream); 30 | } 31 | -------------------------------------------------------------------------------- /test/test_chamfer_dist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import torch 5 | import unittest 6 | 7 | from torch.autograd import gradcheck 8 | 9 | from . import run_if_cuda 10 | 11 | 12 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 13 | sys.path.insert(0, ROOT) 14 | 15 | from torch_points_kernels.chamfer_dist import ChamferFunction, chamfer_dist 16 | 17 | 18 | class TestChamferDistance(unittest.TestCase): 19 | @run_if_cuda 20 | def test_chamfer_dist_grad(self): 21 | x = torch.rand(4, 64, 3).double() 22 | y = torch.rand(4, 128, 3).double() 23 | x.requires_grad = True 24 | y.requires_grad = True 25 | test = gradcheck(ChamferFunction.apply, [x.cuda(), y.cuda()]) 26 | 27 | @run_if_cuda 28 | def test_chamfer_dist(self): 29 | xyz1 = torch.from_numpy(np.array([[[0, 0, 0], [1, 1, 1], [2, 0, 1]]])).float() 30 | xyz2 = torch.from_numpy(np.array([[[1, 0, 0], [1, 2, 1]]])).float() 31 | dist = chamfer_dist(xyz1.cuda(), xyz2.cuda()) 32 | self.assertAlmostEqual(dist.item(), 2.333333, places=5) 33 | 34 | @run_if_cuda 35 | def test_chamfer_dist_ignore_zeros(self): 36 | xyz1 = torch.from_numpy(np.array([[[0, 0, 0], [1, 1, 1], [2, 0, 1]]])).float() 37 | xyz2 = torch.from_numpy(np.array([[[1, 0, 0], [1, 2, 1]]])).float() 38 | dist = chamfer_dist(xyz1.cuda(), xyz2.cuda(), True) 39 | self.assertAlmostEqual(dist.item(), 3.0, places=5) 40 | 41 | 42 | if __name__ == "__main__": 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /test/test_knn.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import os 4 | import sys 5 | 6 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 7 | sys.path.insert(0, ROOT) 8 | 9 | from torch_points_kernels import three_nn, knn 10 | from . 
import run_if_cuda 11 | 12 | 13 | class TestKnn(unittest.TestCase): 14 | def test_cpu(self): 15 | support = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]]]).float() 16 | query = torch.tensor([[[0, 0, 0]]]).float() 17 | 18 | idx, dist = knn(support, query, 3) 19 | torch.testing.assert_allclose(idx, torch.tensor([[[0, 1, 2]]])) 20 | torch.testing.assert_allclose(dist, torch.tensor([[[0.0, 1.0, 4.0]]])) 21 | 22 | idx, dist = knn(support, query, 2) 23 | torch.testing.assert_allclose(idx, torch.tensor([[[0, 1]]])) 24 | 25 | with self.assertRaises(RuntimeError): 26 | knn(support, query, 5) 27 | 28 | def test_larger_cpu(self): 29 | support = torch.randn((2, 10, 3)) 30 | query = torch.randn((2, 10, 3)) 31 | 32 | idx, dist = knn(support, query, 3) 33 | 34 | 35 | class TestThreeNN(unittest.TestCase): 36 | @run_if_cuda 37 | def test_cpugpu(self): 38 | b = 20 39 | n = 1000 40 | known = torch.randn((b, 2 * n, 3)) 41 | unknown = torch.randn((b, n, 3)) 42 | 43 | dist_cpu, cpu_idx = three_nn(unknown, known) 44 | dist_cuda, cuda_idx = three_nn(unknown.cuda(), known.cuda()) 45 | 46 | torch.testing.assert_allclose(dist_cpu, dist_cuda.cpu()) 47 | 48 | 49 | if __name__ == "__main__": 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | tags: 4 | - 'v*' 5 | 6 | name: Deploy 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 3.6 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: 3.6 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install torch "numpy<=1.21" scikit-learn flake8 setuptools wheel twine numba 21 | - name: Build package 22 | run: | 23 | python setup.py build_ext --inplace 24 | - name: Lint with flake8 25 | run: | 26 | # stop the build if there are Python syntax errors or undefined names 27 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 28 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 29 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 30 | - name: Test with unittest 31 | run: | 32 | python -m unittest -v 33 | - name: Build package 34 | run: | 35 | python setup.py sdist 36 | - name: Publish package 37 | uses: pypa/gh-action-pypi-publish@master 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.PYPI_PASSWORD }} 41 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.7.1 2 | ## Additions 3 | - Add support for anaconda installation 4 | 5 | # 0.7.0 6 | ## Change 7 | - Added some extra compilation flags: FORCE_CUDA=1 for CUDA install and FORCE_ONLY_CPU=1 for CPU only install. 
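  For example (an illustrative sketch, not part of the original entry; assumes the package is installed from PyPI under the name `torch-points-kernels`):
  ```
  FORCE_CUDA=1 pip install torch-points-kernels      # build the CUDA kernels even if no GPU is visible at install time
  FORCE_ONLY_CPU=1 pip install torch-points-kernels  # skip the CUDA kernels even if a CUDA toolkit is present
  ```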
8 | 9 | # 0.6.9 10 | ## Additions 11 | 12 | - Cubic feature sampling kernel as proposed in https://arxiv.org/pdf/2006.03761 13 | 14 | # 0.6.8 15 | ## Bug fix 16 | - Comilation with older GPU architecture 17 | 18 | 19 | # 0.6.7 20 | ## Additions 21 | - Chamfer distance introduced in https://arxiv.org/pdf/1612.00603 for dense batches 22 | 23 | # 0.6.6 24 | ## Additions 25 | - Windows support 26 | 27 | 28 | ## Change 29 | - Develop with python 3.7 30 | 31 | ## Bug fix 32 | - Fixed bug in region growing related to batching 33 | - Ball query for partial dense data on GPU was returning only the first point. Fixed now 34 | 35 | 36 | # 0.6.5 37 | 38 | ## Additions 39 | - Clustering algorithm for [PointGroup](https://arxiv.org/pdf/2004.01658.pdf) 40 | - Instance IoU computation on CPU and GPU 41 | 42 | ## Change 43 | - Force no ninja for the compilation 44 | 45 | # 0.6.4 46 | 47 | ## Bug fix 48 | - CPU version works for MacOS 49 | 50 | # 0.6.2 51 | 52 | ## Bug fix 53 | - Fix install with pip > 19 54 | 55 | # 0.6.1 56 | 57 | ## Bug fix 58 | - Random memory access on cpu radius search in the degree function 59 | 60 | # 0.6.0 61 | 62 | ## Bug fix 63 | - Require pytorch implicitely and log nice message when missing 64 | 65 | # 0.5.3 66 | 67 | ## Update 68 | - ball query returns squared distance instead of distance 69 | - leaner Point Cloud struct that avoids copying data 70 | 71 | ## Bug fix 72 | - Package would not install if pytorch is not already installed 73 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 4 | #------------------------------------------------------------------------------------------------------------- 5 | 6 | FROM ubuntu:bionic 7 | 8 | # Avoid warnings by switching to noninteractive 9 | ENV DEBIAN_FRONTEND=noninteractive 10 | 11 | # This Dockerfile adds a non-root user with sudo access. Use the "remoteUser" 12 | # property in devcontainer.json to use it. On Linux, the container user's GID/UIDs 13 | # will be updated to match your local UID/GID (when using the dockerFile property). 14 | # See https://aka.ms/vscode-remote/containers/non-root-user for details. 15 | ARG USERNAME=vscode 16 | ARG USER_UID=1000 17 | ARG USER_GID=$USER_UID 18 | 19 | # Uncomment the following COPY line and the corresponding lines in the `RUN` command if you wish to 20 | # include your requirements in the image itself. It is suggested that you only do this if your 21 | # requirements rarely (if ever) change. 
22 | 23 | RUN apt-get update \ 24 | && apt-get install -y --fix-missing --no-install-recommends\ 25 | libffi-dev libssl-dev build-essential \ 26 | python3-pip python3-dev python3-venv python3-setuptools\ 27 | git iproute2 procps lsb-release clang-format \ 28 | && apt-get clean \ 29 | && rm -rf /var/lib/apt/lists/* 30 | 31 | RUN pip3 install -U pip 32 | RUN pip3 install torch numpy scikit-learn flake8 setuptools numba 33 | RUN pip3 install torch_cluster torch_sparse torch_scatter torch_geometric 34 | -------------------------------------------------------------------------------- /test/test_interpolate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | from torch.autograd import gradcheck 4 | from torch_points_kernels import three_interpolate, three_nn 5 | 6 | from . import run_if_cuda 7 | 8 | 9 | class TestInterpolate(unittest.TestCase): 10 | @run_if_cuda 11 | def test_gpu(self): 12 | pos = torch.randn([16, 100, 3]).cuda() 13 | pos_skip = torch.randn([16, 500, 3]).cuda() 14 | x = torch.randn([16, 30, 100], requires_grad=True).cuda() 15 | 16 | dist, idx = three_nn(pos_skip, pos) 17 | dist_recip = 1.0 / (dist + 1e-8) 18 | norm = torch.sum(dist_recip, dim=2, keepdim=True) 19 | weight = dist_recip / norm 20 | interpolated_feats = three_interpolate(x, idx, weight) 21 | 22 | dist, idx = three_nn(pos_skip.cpu(), pos.cpu()) 23 | dist_recip = 1.0 / (dist + 1e-8) 24 | norm = torch.sum(dist_recip, dim=2, keepdim=True) 25 | weight = dist_recip / norm 26 | interpolated_feats_cpu = three_interpolate(x.cpu(), idx, weight) 27 | 28 | torch.testing.assert_allclose(interpolated_feats_cpu, interpolated_feats.cpu()) 29 | 30 | def test_grad(self): 31 | b, n, k = (2, 10, 3) 32 | pos = torch.randn([b, n, k]).double() 33 | pos_skip = torch.randn([b, 2 * n, k]).double() 34 | x = torch.randn([b, 30, n], requires_grad=True).double() 35 | dist, idx = three_nn(pos_skip, pos) 36 | dist_recip = 1.0 / (dist + 1e-8) 37 | norm = torch.sum(dist_recip, dim=2, keepdim=True) 38 | weight = dist_recip / norm 39 | input = (x, idx, weight.double()) 40 | test = gradcheck(three_interpolate, input, eps=1e-6, atol=1e-4) 41 | 42 | 43 | if __name__ == "__main__": 44 | unittest.main() 45 | -------------------------------------------------------------------------------- /cuda/include/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | #define CHECK_CUDA(x) \ 6 | do \ 7 | { \ 8 | TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor"); \ 9 | } while (0) 10 | 11 | #define CHECK_CONTIGUOUS(x) \ 12 | do \ 13 | { \ 14 | TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor"); \ 15 | } while (0) 16 | 17 | #define CHECK_IS_INT(x) \ 18 | do \ 19 | { \ 20 | TORCH_CHECK(isIntegralType(x.scalar_type(), false), #x " must be an int tensor"); \ 21 | } while (0) 22 | 23 | #define CHECK_IS_FLOAT(x) \ 24 | do \ 25 | { \ 26 | TORCH_CHECK(isFloatingType(x.scalar_type()), #x " must be a float tensor"); \ 27 | } while (0) 28 | -------------------------------------------------------------------------------- /test/test_grouping.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import numpy as np 4 | import numpy.testing as npt 5 | from torch_points_kernels import grouping_operation 6 | 7 | 8 | class TestGroup(unittest.TestCase): 9 | 10 | # input: points(b, c, n) idx(b, npoints, nsample) 11 | # output: out(b, c, 
npoints, nsample) 12 | def test_simple(self): 13 | features = torch.tensor( 14 | [ 15 | [[0, 10, 0], [1, 11, 0], [2, 12, 0]], 16 | [ 17 | [100, 110, 120], 18 | [101, 111, 121], 19 | [102, 112, 122], 20 | ], # x-coordinates # y-coordinates # z-coordinates 21 | ] 22 | ).type(torch.float) 23 | idx = torch.tensor([[[1, 0], [0, 0]], [[0, 1], [1, 2]]]).type(torch.long) 24 | 25 | expected = np.array( 26 | [ 27 | [[[10, 0], [0, 0]], [[11, 1], [1, 1]], [[12, 2], [2, 2]]], 28 | [ # 2nd batch 29 | [ # x-coordinates 30 | [100, 110], # x-coordinates of samples for point 0 31 | [110, 120], # x-coordinates of samples for point 1 32 | ], 33 | [[101, 111], [111, 121]], # y-coordinates 34 | [[102, 112], [112, 122]], # z-coordinates 35 | ], 36 | ] 37 | ) 38 | 39 | cpu_output = grouping_operation(features, idx).detach().cpu().numpy() 40 | 41 | npt.assert_array_equal(expected, cpu_output) 42 | 43 | if torch.cuda.is_available(): 44 | npt.assert_array_equal( 45 | grouping_operation(features.cuda(), idx.cuda()).detach().cpu().numpy(), 46 | expected, 47 | ) 48 | 49 | 50 | if __name__ == "__main__": 51 | unittest.main() 52 | -------------------------------------------------------------------------------- /test/test_cluster.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import os 4 | import sys 5 | 6 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 7 | sys.path.insert(0, ROOT) 8 | 9 | from torch_points_kernels.cluster import grow_proximity, region_grow 10 | 11 | 12 | class TestGrow(unittest.TestCase): 13 | def setUp(self): 14 | self.pos = torch.tensor( 15 | [ 16 | [0, 0, 0], 17 | [1, 0, 0], 18 | [2, 0, 0], 19 | [10, 0, 0], 20 | [0, 0, 0], 21 | [1, 0, 0], 22 | [2, 0, 0], 23 | [10, 0, 0], 24 | ] 25 | ) 26 | self.batch = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]) 27 | self.labels = torch.tensor([0, 0, 1, 1, 0, 1, 1, 10]) 28 | 29 | def test_simple(self): 30 | clusters = grow_proximity(self.pos, self.batch, radius=2, min_cluster_size=1) 31 | self.assertEqual(clusters, [[0, 1, 2], [3], [4, 5, 6], [7]]) 32 | 33 | clusters = grow_proximity(self.pos, self.batch, radius=2, min_cluster_size=3) 34 | self.assertEqual(clusters, [[0, 1, 2], [4, 5, 6]]) 35 | 36 | def test_region_grow(self): 37 | cluster_idx = region_grow(self.pos, self.labels, self.batch, radius=2, min_cluster_size=1) 38 | self.assertEqual(len(cluster_idx), 6) 39 | torch.testing.assert_allclose(cluster_idx[0], torch.tensor([0, 1])) 40 | torch.testing.assert_allclose(cluster_idx[1], torch.tensor([4])) 41 | torch.testing.assert_allclose(cluster_idx[2], torch.tensor([2])) 42 | torch.testing.assert_allclose(cluster_idx[3], torch.tensor([3])) 43 | torch.testing.assert_allclose(cluster_idx[4], torch.tensor([5, 6])) 44 | torch.testing.assert_allclose(cluster_idx[5], torch.tensor([7])) 45 | 46 | 47 | if __name__ == "__main__": 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Unittests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - master 8 | 9 | jobs: 10 | unittests: 11 | strategy: 12 | matrix: 13 | os: [ubuntu-latest, macos-latest, windows-latest] 14 | python-version: [3.7, 3.8] 15 | torch-version: [1.10.0, 1.11.0, 1.12.0] 16 | runs-on: ${{ matrix.os }} 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: 
actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install "numpy<=1.21" scikit-learn flake8 setuptools numba 27 | 28 | - name: Install torch ${{ matrix.torch-version }} windows + linux 29 | if: ${{matrix.os != 'macos-latest'}} 30 | run: pip install torch==${{ matrix.torch-version }}+cpu -f https://download.pytorch.org/whl/torch_stable.html 31 | - name: Install torch ${{ matrix.torch-version }} macos 32 | if: ${{matrix.os == 'macos-latest'}} 33 | run: pip install torch==${{ matrix.torch-version }} 34 | 35 | - name: Build package 36 | run: | 37 | python setup.py build_ext --inplace 38 | - name: Lint with flake8 39 | run: | 40 | # stop the build if there are Python syntax errors or undefined names 41 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 42 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 43 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 44 | - name: Test with unittest 45 | run: | 46 | python -m unittest -v 47 | -------------------------------------------------------------------------------- /test/test_cubic_feature_sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import torch 5 | import unittest 6 | 7 | from torch.autograd import gradcheck 8 | 9 | from . import run_if_cuda 10 | 11 | 12 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 13 | sys.path.insert(0, ROOT) 14 | 15 | from torch_points_kernels.cubic_feature_sampling import CubicFeatureSamplingFunction, cubic_feature_sampling 16 | 17 | 18 | class TestCubicFeatureSampling(unittest.TestCase): 19 | @run_if_cuda 20 | def test_neighborhood_size_1(self): 21 | ptcloud = torch.rand(2, 64, 3) * 2 - 1 22 | cubic_features = torch.rand(2, 4, 8, 8, 8) 23 | ptcloud.requires_grad = True 24 | cubic_features.requires_grad = True 25 | self.assertTrue( 26 | gradcheck( 27 | CubicFeatureSamplingFunction.apply, 28 | [ptcloud.double().cuda(), cubic_features.double().cuda()], 29 | ) 30 | ) 31 | 32 | @run_if_cuda 33 | def test_neighborhood_size_2(self): 34 | ptcloud = torch.rand(2, 32, 3) * 2 - 1 35 | cubic_features = torch.rand(2, 2, 8, 8, 8) 36 | ptcloud.requires_grad = True 37 | cubic_features.requires_grad = True 38 | self.assertTrue( 39 | gradcheck( 40 | CubicFeatureSamplingFunction.apply, 41 | [ptcloud.double().cuda(), cubic_features.double().cuda(), 2], 42 | ) 43 | ) 44 | 45 | @run_if_cuda 46 | def test_neighborhood_size_3(self): 47 | ptcloud = torch.rand(1, 32, 3) * 2 - 1 48 | cubic_features = torch.rand(1, 2, 16, 16, 16) 49 | ptcloud.requires_grad = True 50 | cubic_features.requires_grad = True 51 | self.assertTrue( 52 | gradcheck( 53 | CubicFeatureSamplingFunction.apply, 54 | [ptcloud.double().cuda(), cubic_features.double().cuda(), 3], 55 | ) 56 | ) 57 | 58 | 59 | if __name__ == "__main__": 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /torch_points_kernels/chamfer_dist.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | if torch.cuda.is_available(): 4 | import torch_points_kernels.points_cuda as tpcuda 5 | 6 | 7 | class ChamferFunction(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, xyz1, xyz2): 10 | if not torch.cuda.is_available(): 11 | raise 
NotImplementedError("CPU version is not available for Chamfer Distance") 12 | 13 | dist1, dist2, idx1, idx2 = tpcuda.chamfer_dist(xyz1, xyz2) 14 | ctx.save_for_backward(xyz1, xyz2, idx1, idx2) 15 | 16 | return dist1, dist2 17 | 18 | @staticmethod 19 | def backward(ctx, grad_dist1, grad_dist2): 20 | xyz1, xyz2, idx1, idx2 = ctx.saved_tensors 21 | grad_xyz1, grad_xyz2 = tpcuda.chamfer_dist_grad(xyz1, xyz2, idx1, idx2, grad_dist1, grad_dist2) 22 | return grad_xyz1, grad_xyz2 23 | 24 | 25 | def chamfer_dist(xyz1, xyz2, ignore_zeros=False): 26 | r""" 27 | Calcuates the distance between B pairs of point clouds 28 | 29 | Parameters 30 | ---------- 31 | xyz1 : torch.Tensor (dtype=torch.float32) 32 | (B, n1, 3) B point clouds containing n1 points 33 | xyz2 : torch.Tensor (dtype=torch.float32) 34 | (B, n2, 3) B point clouds containing n2 points 35 | ignore_zeros : bool 36 | ignore the point whose coordinate is (0, 0, 0) or not 37 | 38 | Returns 39 | ------- 40 | dist: torch.Tensor 41 | (B, ): the distances between B pairs of point clouds 42 | """ 43 | if len(xyz1.shape) != 3 or xyz1.size(2) != 3 or len(xyz2.shape) != 3 or xyz2.size(2) != 3: 44 | raise ValueError("The input point cloud should be of size (B, n_pts, 3)") 45 | 46 | batch_size = xyz1.size(0) 47 | if batch_size == 1 and ignore_zeros: 48 | non_zeros1 = torch.sum(xyz1, dim=2).ne(0) 49 | non_zeros2 = torch.sum(xyz2, dim=2).ne(0) 50 | xyz1 = xyz1[non_zeros1].unsqueeze(dim=0) 51 | xyz2 = xyz2[non_zeros2].unsqueeze(dim=0) 52 | 53 | dist1, dist2 = ChamferFunction.apply(xyz1, xyz2) 54 | return torch.mean(dist1) + torch.mean(dist2) 55 | -------------------------------------------------------------------------------- /test/speed_radius.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import sys 4 | import numpy.testing as npt 5 | import numpy as np 6 | from sklearn.neighbors import KDTree 7 | import unittest 8 | import time 9 | 10 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 11 | sys.path.insert(0, ROOT) 12 | 13 | from torch_points_kernels import ball_query 14 | 15 | 16 | class TestRadiusSpeed(unittest.TestCase): 17 | def test_speed(self): 18 | start = time.time() 19 | a = torch.randn(50000, 3).to(torch.float) 20 | b = torch.randn(10000, 3).to(torch.float) 21 | batch_a = torch.tensor([0 for i in range(a.shape[0] // 2)] + [1 for i in range(a.shape[0] // 2, a.shape[0])]) 22 | batch_b = torch.tensor([0 for i in range(b.shape[0] // 2)] + [1 for i in range(b.shape[0] // 2, b.shape[0])]) 23 | R = 1 24 | samples = 50 25 | 26 | idx, dist = ball_query( 27 | R, 28 | samples, 29 | a, 30 | b, 31 | mode="PARTIAL_DENSE", 32 | batch_x=batch_a, 33 | batch_y=batch_b, 34 | sort=True, 35 | ) 36 | idx1, dist = ball_query( 37 | R, 38 | samples, 39 | a, 40 | b, 41 | mode="PARTIAL_DENSE", 42 | batch_x=batch_a, 43 | batch_y=batch_b, 44 | sort=True, 45 | ) 46 | print(time.time() - start) 47 | torch.testing.assert_allclose(idx1, idx) 48 | 49 | self.assertEqual(idx.shape[0], b.shape[0]) 50 | self.assertEqual(dist.shape[0], b.shape[0]) 51 | self.assertLessEqual(idx.max().item(), len(batch_a)) 52 | 53 | # # Comparison to see if we have the same result 54 | # tree = KDTree(a.detach().numpy()) 55 | # idx3_sk = tree.query_radius(b.detach().numpy(), r=R) 56 | # i = np.random.randint(len(batch_b)) 57 | # for p in idx[i].detach().numpy(): 58 | # if p >= 0 and p < len(batch_a): 59 | # assert p in idx3_sk[i] 60 | 61 | 62 | if __name__ == "__main__": 63 | unittest.main() 64 | 
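# The benchmark above exercises the partial-dense ball query. The helper below is an
# illustrative sketch (never called by the tests) of how its output is typically consumed:
# entries of `idx` equal to -1 mark missing neighbours, and the distances that come back
# are squared (see the 0.5.3 changelog entry).
def _ball_query_usage_sketch():
    x = torch.randn(100, 3)
    y = torch.randn(20, 3)
    batch_x = torch.zeros(100, dtype=torch.long)
    batch_y = torch.zeros(20, dtype=torch.long)
    idx, sq_dist = ball_query(0.5, 8, x, y, mode="PARTIAL_DENSE", batch_x=batch_x, batch_y=batch_y)
    valid = idx != -1  # drop the -1 padding before using the neighbour indices or distances
    return idx[valid], sq_dist[valid]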
-------------------------------------------------------------------------------- /torch_points_kernels/gridding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | if torch.cuda.is_available(): 4 | import torch_points_kernels.points_cuda as tpcuda 5 | 6 | 7 | class GriddingFunction(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, ptcloud, scale): 10 | if not torch.cuda.is_available(): 11 | raise NotImplementedError("CPU version is not available for Chamfer Distance") 12 | 13 | grid, grid_pt_weights, grid_pt_indexes = tpcuda.gridding( 14 | -scale, scale - 1, -scale, scale - 1, -scale, scale - 1, ptcloud 15 | ) 16 | # print(grid.size()) # torch.Size(batch_size, n_grid_vertices) 17 | # print(grid_pt_weights.size()) # torch.Size(batch_size, n_pts, 8, 3) 18 | # print(grid_pt_indexes.size()) # torch.Size(batch_size, n_pts, 8) 19 | ctx.save_for_backward(grid_pt_weights, grid_pt_indexes) 20 | 21 | return grid 22 | 23 | @staticmethod 24 | def backward(ctx, grad_grid): 25 | grid_pt_weights, grid_pt_indexes = ctx.saved_tensors 26 | grad_ptcloud = tpcuda.gridding_grad(grid_pt_weights, grid_pt_indexes, grad_grid) 27 | # print(grad_ptcloud.size()) # torch.Size(batch_size, n_pts, 3) 28 | 29 | return grad_ptcloud, None 30 | 31 | 32 | def gridding(ptcloud, scale): 33 | r""" 34 | Converts the input point clouds into 3D grids by trilinear interpolcation. 35 | Please refer to https://arxiv.org/pdf/2006.03761 for more information 36 | 37 | Parameters 38 | ---------- 39 | ptcloud : torch.Tensor (dtype=torch.float32) 40 | (B, n_pts, 3) B point clouds containing n_pts points 41 | scale : Int 42 | the resolution of the 3D grid 43 | 44 | Returns 45 | ------- 46 | grid: torch.Tensor 47 | (B, scale, scale, scale): the grid of the resolution of scale * scale * scale 48 | """ 49 | if len(ptcloud.shape) != 3 or ptcloud.size(2) != 3: 50 | raise ValueError("The input point cloud should be of size (B, n_pts, 3)") 51 | 52 | ptcloud = ptcloud * scale 53 | _ptcloud = torch.split(ptcloud, 1, dim=0) 54 | grids = [] 55 | for p in _ptcloud: 56 | non_zeros = torch.sum(p, dim=2).ne(0) 57 | p = p[non_zeros].unsqueeze(dim=0) 58 | grids.append(GriddingFunction.apply(p, scale)) 59 | 60 | return torch.cat(grids, dim=0).contiguous() 61 | -------------------------------------------------------------------------------- /cpu/src/knn.cpp: -------------------------------------------------------------------------------- 1 | #include "compat.h" 2 | #include "neighbors.cpp" 3 | #include "neighbors.h" 4 | #include "utils.h" 5 | #include 6 | #include 7 | 8 | std::pair _single_batch_knn(at::Tensor support, at::Tensor query, int k) 9 | { 10 | CHECK_CONTIGUOUS(support); 11 | CHECK_CONTIGUOUS(query); 12 | if (support.size(0) < k) 13 | TORCH_CHECK(false, 14 | "Not enough points in support to find " + std::to_string(k) + " neighboors") 15 | std::vector neighbors_indices(query.size(0) * k, -1); 16 | std::vector neighbors_dists(query.size(0) * k, -1); 17 | 18 | auto options = torch::TensorOptions().dtype(torch::kLong).device(torch::kCPU); 19 | auto options_dist = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU); 20 | AT_DISPATCH_ALL_TYPES(query.scalar_type(), "knn", [&] { 21 | auto data_q = query.DATA_PTR(); 22 | auto data_s = support.DATA_PTR(); 23 | std::vector queries_stl = 24 | std::vector(data_q, data_q + query.size(0) * query.size(1)); 25 | std::vector supports_stl = 26 | std::vector(data_s, data_s + support.size(0) * support.size(1)); 27 | 28 | 
nanoflann_knn_neighbors(queries_stl, supports_stl, neighbors_indices, 29 | neighbors_dists, k); 30 | }); 31 | auto neighbors_dists_ptr = neighbors_dists.data(); 32 | int64_t* neighbors_indices_ptr = neighbors_indices.data(); 33 | auto out = torch::from_blob(neighbors_indices_ptr, {query.size(0), k}, options = options); 34 | auto out_dists = 35 | torch::from_blob(neighbors_dists_ptr, {query.size(0), k}, options = options_dist); 36 | 37 | return std::make_pair(out.clone(), out_dists.clone()); 38 | } 39 | 40 | std::pair dense_knn(at::Tensor support, at::Tensor query, int k) 41 | { 42 | CHECK_CONTIGUOUS(support); 43 | CHECK_CONTIGUOUS(query); 44 | CHECK_CPU(query); 45 | CHECK_CPU(support); 46 | 47 | int b = query.size(0); 48 | vector batch_idx; 49 | vector batch_dist; 50 | for (int i = 0; i < b; i++) 51 | { 52 | auto out_pair = _single_batch_knn(support[i], query[i], k); 53 | batch_idx.push_back(out_pair.first); 54 | batch_dist.push_back(out_pair.second); 55 | } 56 | auto out_idx = torch::stack(batch_idx); 57 | auto out_dist = torch::stack(batch_dist); 58 | return std::make_pair(out_idx, out_dist); 59 | } 60 | -------------------------------------------------------------------------------- /cuda/src/metrics.cpp: -------------------------------------------------------------------------------- 1 | #include "metrics.h" 2 | #include "compat.h" 3 | #include "utils.h" 4 | 5 | void instance_iou_kernel_wrapper(int64_t total_gt_instances, int64_t max_gt_instances, 6 | const int64_t* nInstance, int nProposal, 7 | const int64_t* proposals_idx, const int64_t* proposals_offset, 8 | const int64_t* instance_labels, 9 | const int64_t* offset_num_gt_instances, const int64_t* batch, 10 | const int64_t* instance_pointnum, float* proposals_iou); 11 | 12 | at::Tensor instance_iou_cuda(at::Tensor instance_idx, at::Tensor instance_offsets, 13 | at::Tensor gt_instances, at::Tensor gt_instance_sizes, 14 | at::Tensor num_gt_instances, at::Tensor batch) 15 | { 16 | CHECK_CONTIGUOUS(instance_idx); 17 | CHECK_CONTIGUOUS(instance_offsets); 18 | CHECK_CONTIGUOUS(gt_instances); 19 | CHECK_CONTIGUOUS(gt_instance_sizes); 20 | CHECK_CONTIGUOUS(num_gt_instances); 21 | CHECK_CONTIGUOUS(batch); 22 | 23 | CHECK_CUDA(instance_idx); 24 | CHECK_CUDA(instance_offsets); 25 | CHECK_CUDA(gt_instances); 26 | CHECK_CUDA(gt_instance_sizes); 27 | 28 | cudaSetDevice(instance_idx.get_device()); 29 | int64_t num_proposed_instances = instance_offsets.size(0) - 1; 30 | auto total_gt_instances = (int64_t*)malloc(sizeof(int64_t)); 31 | cudaMemcpy(total_gt_instances, num_gt_instances.sum().DATA_PTR(), sizeof(int64_t), 32 | cudaMemcpyDeviceToHost); 33 | auto max_gt_instances = (int64_t*)malloc(sizeof(int64_t)); 34 | cudaMemcpy(max_gt_instances, num_gt_instances.max().DATA_PTR(), sizeof(int64_t), 35 | cudaMemcpyDeviceToHost); 36 | 37 | at::Tensor output = 38 | torch::zeros({num_proposed_instances, total_gt_instances[0]}, 39 | at::device(gt_instances.device()).dtype(at::ScalarType::Float)); 40 | 41 | at::Tensor offset_num_gt_instances = 42 | at::cat({at::zeros(1, num_gt_instances.options()), num_gt_instances.cumsum(0)}, 0); 43 | instance_iou_kernel_wrapper( 44 | total_gt_instances[0], max_gt_instances[0], num_gt_instances.DATA_PTR(), 45 | num_proposed_instances, instance_idx.DATA_PTR(), 46 | instance_offsets.DATA_PTR(), gt_instances.DATA_PTR(), 47 | offset_num_gt_instances.DATA_PTR(), batch.DATA_PTR(), 48 | gt_instance_sizes.DATA_PTR(), output.DATA_PTR()); 49 | 50 | return output; 51 | } 52 | 
-------------------------------------------------------------------------------- /cuda/include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef _CUDA_UTILS_H 2 | #define _CUDA_UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #define TOTAL_THREADS_DENSE 512 14 | #define TOTAL_THREADS_SPARSE 1024 15 | 16 | inline int opt_n_threads(int work_size) 17 | { 18 | const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); 19 | 20 | return max(min(1 << pow_2, TOTAL_THREADS_DENSE), 1); 21 | } 22 | 23 | inline dim3 opt_block_config(int x, int y) 24 | { 25 | const int x_threads = opt_n_threads(x); 26 | const int y_threads = max(min(opt_n_threads(y), TOTAL_THREADS_DENSE / x_threads), 1); 27 | dim3 block_config(x_threads, y_threads, 1); 28 | 29 | return block_config; 30 | } 31 | 32 | // from https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions 33 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 34 | #else 35 | __device__ double atomicAdd(double* address, double val) 36 | { 37 | unsigned long long int* address_as_ull = (unsigned long long int*)address; 38 | unsigned long long int old = *address_as_ull, assumed; 39 | 40 | do 41 | { 42 | assumed = old; 43 | old = atomicCAS(address_as_ull, assumed, 44 | __double_as_longlong(val + __longlong_as_double(assumed))); 45 | 46 | // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) 47 | } while (assumed != old); 48 | 49 | return __longlong_as_double(old); 50 | } 51 | #endif 52 | 53 | #define CUDA_CHECK_ERRORS() \ 54 | do \ 55 | { \ 56 | cudaError_t err = cudaGetLastError(); \ 57 | if (cudaSuccess != err) \ 58 | { \ 59 | fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \ 60 | cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, __FILE__); \ 61 | exit(-1); \ 62 | } \ 63 | } while (0) 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /torch_points_kernels/cubic_feature_sampling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | if torch.cuda.is_available(): 4 | import torch_points_kernels.points_cuda as tpcuda 5 | 6 | 7 | class CubicFeatureSamplingFunction(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, ptcloud, cubic_features, neighborhood_size=1): 10 | scale = cubic_features.size(2) 11 | if not torch.cuda.is_available(): 12 | raise NotImplementedError("CPU version is not available for Cubic Feature Sampling") 13 | 14 | point_features, grid_pt_indexes = tpcuda.cubic_feature_sampling( 15 | scale, neighborhood_size, ptcloud, cubic_features 16 | ) 17 | ctx.save_for_backward(torch.Tensor([scale]), torch.Tensor([neighborhood_size]), grid_pt_indexes) 18 | return point_features 19 | 20 | @staticmethod 21 | def backward(ctx, grad_point_features): 22 | scale, neighborhood_size, grid_pt_indexes = ctx.saved_tensors 23 | scale = int(scale.item()) 24 | neighborhood_size = int(neighborhood_size.item()) 25 | grad_point_features = grad_point_features.contiguous() 26 | grad_ptcloud, grad_cubic_features = tpcuda.cubic_feature_sampling_grad( 27 | scale, neighborhood_size, grad_point_features, grid_pt_indexes 28 | ) 29 | return grad_ptcloud, grad_cubic_features, None 30 | 31 | 32 | def cubic_feature_sampling(ptcloud, cubic_features, neighborhood_size=1): 33 | r""" 34 | Sample the features of points from 3D feature maps that the point lies in. 
35 | Please refer to https://arxiv.org/pdf/2006.03761 for more information 36 | 37 | Parameters 38 | ---------- 39 | ptcloud : torch.Tensor (dtype=torch.float32) 40 | (B, n_pts, 3) point clouds containing n_pts points 41 | cubic_features : torch.Tensor (dtype=torch.float32) 42 | (B, c, m, m, m) 3D feature maps of sizes m x m x m and c channels 43 | neighborhood_size : int 44 | The neighborhood cubes to sample. 45 | neighborhood_size = 1 means to sample the cube that point lies in. 46 | neighborhood_size = 2 means to sample surrouding cubes (step = 1) of 47 | the cube that point lies in. 48 | 49 | Returns 50 | ------- 51 | dist: torch.Tensor 52 | (B, n_pts, n_vertices, c), where n_vertices = (neighborhood_size * 2)^3 53 | """ 54 | if len(ptcloud.shape) != 3 or ptcloud.shape[2] != 3: 55 | raise ValueError("The input point cloud should be of size (B, n_pts, 3).") 56 | 57 | h_scale = cubic_features.size(2) / 2 58 | ptcloud = ptcloud * h_scale + h_scale 59 | return CubicFeatureSamplingFunction.apply(ptcloud, cubic_features, neighborhood_size) 60 | -------------------------------------------------------------------------------- /cpu/src/interpolate.cpp: -------------------------------------------------------------------------------- 1 | #include "compat.h" 2 | #include "utils.h" 3 | #include 4 | #include 5 | 6 | at::Tensor knn_interpolate(at::Tensor features, at::Tensor idx, at::Tensor weight) 7 | { 8 | CHECK_CONTIGUOUS(features); 9 | CHECK_CONTIGUOUS(idx); 10 | CHECK_CONTIGUOUS(weight); 11 | CHECK_CPU(idx); 12 | CHECK_CPU(features); 13 | CHECK_CPU(weight); 14 | 15 | at::Tensor output = torch::zeros({features.size(0), features.size(1), idx.size(1)}, 16 | at::device(features.device()).dtype(features.scalar_type())); 17 | 18 | AT_DISPATCH_ALL_TYPES(features.scalar_type(), "knn_interpolate", [&] { 19 | auto output_a = output.accessor(); 20 | auto features_a = features.accessor(); 21 | auto weight_a = weight.accessor(); 22 | auto idx_a = idx.accessor(); 23 | 24 | auto batch_size = idx.size(0); 25 | for (auto b = 0; b < batch_size; b++) 26 | { 27 | for (auto p = 0; p < idx.size(1); p++) 28 | { 29 | for (auto c = 0; c < features.size(1); c++) 30 | { 31 | output_a[b][c][p] = 0; 32 | for (int i = 0; i < idx.size(2); i++) 33 | { 34 | auto new_idx = idx_a[b][p][i]; 35 | output_a[b][c][p] += features_a[b][c][new_idx] * weight_a[b][p][i]; 36 | } 37 | } 38 | } 39 | } 40 | }); 41 | return output; 42 | } 43 | 44 | at::Tensor knn_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight, const int m) 45 | { 46 | CHECK_CPU(grad_out); 47 | at::Tensor output = torch::zeros({grad_out.size(0), grad_out.size(1), m}, 48 | at::device(grad_out.device()).dtype(grad_out.scalar_type())); 49 | 50 | AT_DISPATCH_ALL_TYPES(grad_out.scalar_type(), "knn_interpolate_grad", [&] { 51 | auto output_a = output.accessor(); 52 | auto grad_out_a = grad_out.accessor(); 53 | auto weight_a = weight.accessor(); 54 | auto idx_a = idx.accessor(); 55 | 56 | auto batch_size = idx.size(0); 57 | for (auto b = 0; b < batch_size; b++) 58 | { 59 | for (auto p = 0; p < idx.size(1); p++) 60 | { 61 | for (auto c = 0; c < grad_out.size(1); c++) 62 | { 63 | for (int i = 0; i < idx.size(2); i++) 64 | { 65 | auto new_idx = idx_a[b][p][i]; 66 | output_a[b][c][new_idx] += grad_out_a[b][c][p] * weight_a[b][p][i]; 67 | } 68 | } 69 | } 70 | } 71 | }); 72 | return output; 73 | } 74 | -------------------------------------------------------------------------------- /test/test_metrics.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import os 4 | import sys 5 | import numpy as np 6 | import random 7 | 8 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 9 | sys.path.insert(0, ROOT) 10 | 11 | from torch_points_kernels.metrics import instance_iou 12 | from test import run_if_cuda 13 | 14 | 15 | class TestInstanceIou(unittest.TestCase): 16 | def test_simple(self, cuda=False): 17 | gt_instances = torch.tensor([1, 2, 1, 2, 2, 3, 0]) 18 | proposed_instances = [ 19 | torch.tensor([0, 2]), # 100% instance 1 20 | torch.tensor([1, 4]), # 2/3 of instance 2 21 | torch.tensor([3, 5]), # 1/3 of instance 2 and 1/1 of instance 3 22 | ] 23 | if cuda: 24 | proposed_instances = [c.cuda() for c in proposed_instances] 25 | gt_instances = gt_instances.cuda() 26 | ious = instance_iou(proposed_instances, gt_instances) 27 | torch.testing.assert_allclose( 28 | ious.cpu(), 29 | torch.tensor([[1, 0, 0], [0, 2 / 3.0, 0], [0, 1.0 / 4.0, 1.0 / 2.0]]), 30 | ) 31 | 32 | def test_batch(self, cuda=False): 33 | gt_instances = torch.tensor([1, 2, 1, 2, 2, 3, 0]) 34 | batch = torch.tensor([0, 0, 1, 1, 1, 1, 1]) 35 | proposed_instances = [ 36 | torch.tensor([0, 1]), # 50% instance 1, 50% instance 2 of sample 1 37 | torch.tensor([3, 4]), # 100% instance 2 of sample 2 38 | torch.tensor([5]), # 100% of instance 3 of sample 2 39 | ] 40 | if cuda: 41 | proposed_instances = [c.cuda() for c in proposed_instances] 42 | gt_instances = gt_instances.cuda() 43 | batch = batch.cuda() 44 | ious = instance_iou(proposed_instances, gt_instances, batch=batch) 45 | torch.testing.assert_allclose( 46 | ious.cpu(), 47 | torch.tensor( 48 | [ 49 | [0.5, 0.5, 0, 0, 0], 50 | [0, 0, 0, 1, 0], 51 | [0, 0, 0, 0, 1], 52 | ] 53 | ), 54 | ) 55 | 56 | @run_if_cuda 57 | def test_simple_cuda(self): 58 | self.test_simple(cuda=True) 59 | 60 | @run_if_cuda 61 | def test_batch_cuda(self): 62 | self.test_batch(cuda=True) 63 | 64 | @run_if_cuda 65 | def test_same(self): 66 | gt_instances = torch.randint(0, 10, (1000,)) 67 | proposed_instances = [] 68 | for i in range(20): 69 | instance_size = random.randint(5, 50) 70 | proposed_instances.append(torch.randint(0, 1000, (instance_size,))) 71 | 72 | ious = instance_iou(proposed_instances, gt_instances) 73 | proposed_instances_cuda = [i.cuda() for i in proposed_instances] 74 | ious_cuda = instance_iou(proposed_instances_cuda, gt_instances.cuda()) 75 | 76 | torch.testing.assert_allclose(ious, ious_cuda.cpu()) 77 | 78 | 79 | if __name__ == "__main__": 80 | unittest.main() 81 | -------------------------------------------------------------------------------- /cuda/src/metrics_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "cuda_utils.h" 6 | 7 | #define THREADS 512 8 | 9 | __global__ void instance_iou_cuda_kernel( 10 | int64_t total_gt_instances, const int64_t* __restrict__ nInstance, int nProposal, 11 | const int64_t* __restrict__ proposals_idx, const int64_t* __restrict__ proposals_offset, 12 | const int64_t* __restrict__ instance_labels, 13 | const int64_t* __restrict__ offset_num_gt_instances, const int64_t* __restrict__ batch, 14 | const int64_t* __restrict__ instance_pointnum, float* proposals_iou) 15 | { 16 | for (int proposal_id = blockIdx.x; proposal_id < nProposal; proposal_id += gridDim.x) 17 | { 18 | int start = proposals_offset[proposal_id]; 19 | int end = proposals_offset[proposal_id + 1]; 20 | int 
sampleIdx = batch[proposals_idx[start]]; 21 | int sampleNInstances = nInstance[sampleIdx]; 22 | int instanceOffset = offset_num_gt_instances[sampleIdx]; 23 | int proposal_total = end - start; 24 | for (int instance_id = threadIdx.x; instance_id < sampleNInstances; 25 | instance_id += blockDim.x) 26 | { 27 | int instance_total = instance_pointnum[instanceOffset + instance_id]; 28 | int intersection = 0; 29 | for (int i = start; i < end; i++) 30 | { 31 | int idx = proposals_idx[i]; 32 | if ((int)instance_labels[idx] == instance_id + 1) 33 | { // 0 is reserved for "no instance" 34 | intersection += 1; 35 | } 36 | } 37 | 38 | proposals_iou[instanceOffset + instance_id + proposal_id * total_gt_instances] = 39 | (float)intersection / 40 | ((float)(proposal_total + instance_total - intersection) + 1e-5); 41 | } 42 | } 43 | } 44 | 45 | // input: proposals_idx (sumNPoint), int 46 | // input: proposals_offset (nProposal + 1), int 47 | // input: instance_labels (N), int64_t, 0~total_nInst-1, -100 48 | // input: instance_pointnum (total_nInst), int 49 | // output: proposals_iou (nProposal, total_nInst), float 50 | void instance_iou_kernel_wrapper(int64_t total_gt_instances, int64_t max_gt_instances, 51 | const int64_t* nInstance, int nProposal, 52 | const int64_t* proposals_idx, const int64_t* proposals_offset, 53 | const int64_t* instance_labels, 54 | const int64_t* offset_num_gt_instances, const int64_t* batch, 55 | const int64_t* instance_pointnum, float* proposals_iou) 56 | { 57 | auto stream = at::cuda::getCurrentCUDAStream(); 58 | instance_iou_cuda_kernel<<>>( 60 | total_gt_instances, nInstance, nProposal, proposals_idx, proposals_offset, instance_labels, 61 | offset_num_gt_instances, batch, instance_pointnum, proposals_iou); 62 | } 63 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -4 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlinesLeft: false 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: Empty 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: false 20 | AlwaysBreakTemplateDeclarations: false 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | BeforeCatch: false 33 | BeforeElse: false 34 | IndentBraces: false 35 | BreakBeforeBinaryOperators: None 36 | BreakBeforeBraces: Allman 37 | BreakBeforeTernaryOperators: true 38 | BreakConstructorInitializersBeforeComma: false 39 | BreakAfterJavaFieldAnnotations: false 40 | BreakStringLiterals: true 41 | ColumnLimit: 100 42 | CommentPragmas: "^ IWYU pragma:" 43 | BreakBeforeInheritanceComma: false 44 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 45 | ConstructorInitializerIndentWidth: 4 46 | ContinuationIndentWidth: 4 47 | Cpp11BracedListStyle: true 48 | 
DerivePointerAlignment: false 49 | DisableFormat: false 50 | ExperimentalAutoDetectBinPacking: false 51 | FixNamespaceComments: true 52 | ForEachMacros: [foreach, Q_FOREACH, BOOST_FOREACH] 53 | IncludeCategories: 54 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 55 | Priority: 2 56 | - Regex: '^(<|"(gtest|isl|json)/)' 57 | Priority: 3 58 | - Regex: ".*" 59 | Priority: 1 60 | IncludeIsMainRegex: "$" 61 | IndentCaseLabels: false 62 | IndentWidth: 4 63 | IndentWrappedFunctionNames: false 64 | JavaScriptQuotes: Leave 65 | JavaScriptWrapImports: true 66 | KeepEmptyLinesAtTheStartOfBlocks: true 67 | MacroBlockBegin: "" 68 | MacroBlockEnd: "" 69 | MaxEmptyLinesToKeep: 1 70 | NamespaceIndentation: None 71 | ObjCBlockIndentWidth: 2 72 | ObjCSpaceAfterProperty: false 73 | ObjCSpaceBeforeProtocolList: true 74 | PenaltyBreakBeforeFirstCallParameter: 19 75 | PenaltyBreakComment: 300 76 | PenaltyBreakFirstLessLess: 120 77 | PenaltyBreakString: 1000 78 | PenaltyExcessCharacter: 1000000 79 | PenaltyReturnTypeOnItsOwnLine: 60 80 | PointerAlignment: Left 81 | ReflowComments: true 82 | SortIncludes: true 83 | SpaceAfterCStyleCast: false 84 | SpaceAfterTemplateKeyword: true 85 | SpaceBeforeAssignmentOperators: true 86 | SpaceBeforeParens: ControlStatements 87 | SpaceInEmptyParentheses: false 88 | SpacesBeforeTrailingComments: 1 89 | SpacesInAngles: false 90 | SpacesInContainerLiterals: true 91 | SpacesInCStyleCastParentheses: false 92 | SpacesInParentheses: false 93 | SpacesInSquareBrackets: false 94 | Standard: Cpp11 95 | TabWidth: 4 96 | UseTab: Never 97 | -------------------------------------------------------------------------------- /cpu/include/cloud.h: -------------------------------------------------------------------------------- 1 | // 2 | // 3 | // 0==========================0 4 | // | Local feature test | 5 | // 0==========================0 6 | // 7 | // version 1.0 : 8 | // > 9 | // 10 | //--------------------------------------------------- 11 | // 12 | // Cloud header 13 | // 14 | //---------------------------------------------------- 15 | // 16 | // Hugues THOMAS - 10/02/2017 17 | // 18 | 19 | #pragma once 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include 31 | 32 | template struct PointCloud 33 | { 34 | void set(const std::vector& new_pts) 35 | { 36 | pts = new_pts.data(); 37 | length = new_pts.size() / 3; 38 | } 39 | void set_batch(const std::vector& new_pts, int begin, int end) 40 | { 41 | pts = new_pts.data(); 42 | int start = begin * 3; 43 | pts += start; 44 | length = (end - begin); 45 | } 46 | 47 | // Must return the number of data points 48 | inline size_t kdtree_get_point_count() const 49 | { 50 | return get_point_count(); 51 | } 52 | 53 | // Must return the number of data points 54 | inline size_t get_point_count() const 55 | { 56 | return length; 57 | } 58 | 59 | // Returns the dim'th component of the idx'th point in the class: 60 | // Since this is inlined and the "dim" argument is typically an immediate 61 | // value, the 62 | // "if/else's" are actually solved at compile time. 63 | inline scalar_t kdtree_get_pt(const size_t idx, const size_t dim) const 64 | { 65 | if (dim == 0) 66 | return pts[idx * 3]; 67 | else if (dim == 1) 68 | return pts[idx * 3 + 1]; 69 | else 70 | return pts[idx * 3 + 2]; 71 | } 72 | 73 | // Optional bounding-box computation: return false to default to a standard 74 | // bbox computation loop. 
75 | // Return true if the BBOX was already computed by the class and returned in 76 | // "bb" so it can be avoided to redo it again. Look at bb.size() to find out 77 | // the expected dimensionality (e.g. 2 or 3 for point clouds) 78 | template bool kdtree_get_bbox(BBOX& /* bb */) const 79 | { 80 | return false; 81 | } 82 | 83 | const scalar_t* get_point_ptr(const int i) const 84 | { 85 | return pts + i * 3; 86 | } 87 | 88 | std::array operator[](const size_t index) const 89 | { 90 | return {pts[index * 3], pts[index * 3 + 1], pts[index * 3 + 2]}; 91 | } 92 | 93 | private: 94 | const scalar_t* pts; 95 | size_t length; 96 | }; 97 | 98 | template 99 | inline std::ostream& operator<<(std::ostream& os, const PointCloud& P) 100 | { 101 | for (size_t i = 0; i < P.get_point_count(); i++) 102 | { 103 | auto p = P[i]; 104 | os << "[" << p[0] << ", " << p[1] << ", " << p[2] << "];"; 105 | } 106 | return os; 107 | } 108 | -------------------------------------------------------------------------------- /cuda/src/interpolate.cpp: -------------------------------------------------------------------------------- 1 | #include "interpolate.h" 2 | #include "compat.h" 3 | #include "utils.h" 4 | 5 | void three_nn_kernel_wrapper(int b, int n, int m, const float* unknown, const float* known, 6 | float* dist2, int* idx); 7 | void three_interpolate_kernel_wrapper(int b, int c, int m, int n, const float* points, 8 | const int* idx, const float* weight, float* out); 9 | void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, const float* grad_out, 10 | const int* idx, const float* weight, float* grad_points); 11 | 12 | std::vector three_nn(at::Tensor unknowns, at::Tensor knows) 13 | { 14 | CHECK_CONTIGUOUS(unknowns); 15 | CHECK_CONTIGUOUS(knows); 16 | CHECK_IS_FLOAT(unknowns); 17 | CHECK_IS_FLOAT(knows); 18 | 19 | CHECK_CUDA(knows); 20 | CHECK_CUDA(unknowns); 21 | 22 | at::Tensor idx = torch::zeros({unknowns.size(0), unknowns.size(1), 3}, 23 | at::device(unknowns.device()).dtype(at::ScalarType::Int)); 24 | at::Tensor dist2 = torch::zeros({unknowns.size(0), unknowns.size(1), 3}, 25 | at::device(unknowns.device()).dtype(at::ScalarType::Float)); 26 | 27 | three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1), 28 | unknowns.DATA_PTR(), knows.DATA_PTR(), 29 | dist2.DATA_PTR(), idx.DATA_PTR()); 30 | 31 | return {dist2, idx}; 32 | } 33 | 34 | at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, at::Tensor weight) 35 | { 36 | CHECK_CONTIGUOUS(points); 37 | CHECK_CONTIGUOUS(idx); 38 | CHECK_CONTIGUOUS(weight); 39 | CHECK_IS_FLOAT(points); 40 | CHECK_IS_INT(idx); 41 | CHECK_IS_FLOAT(weight); 42 | 43 | CHECK_CUDA(idx); 44 | CHECK_CUDA(weight); 45 | 46 | at::Tensor output = torch::zeros({points.size(0), points.size(1), idx.size(1)}, 47 | at::device(points.device()).dtype(at::ScalarType::Float)); 48 | 49 | three_interpolate_kernel_wrapper(points.size(0), points.size(1), points.size(2), idx.size(1), 50 | points.DATA_PTR(), idx.DATA_PTR(), 51 | weight.DATA_PTR(), output.DATA_PTR()); 52 | 53 | return output; 54 | } 55 | at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight, 56 | const int m) 57 | { 58 | CHECK_CONTIGUOUS(grad_out); 59 | CHECK_CONTIGUOUS(idx); 60 | CHECK_CONTIGUOUS(weight); 61 | CHECK_IS_FLOAT(grad_out); 62 | CHECK_IS_INT(idx); 63 | CHECK_IS_FLOAT(weight); 64 | CHECK_CUDA(idx); 65 | CHECK_CUDA(weight); 66 | CHECK_CUDA(grad_out); 67 | 68 | at::Tensor output = torch::zeros({grad_out.size(0), grad_out.size(1), m}, 69 | 
at::device(grad_out.device()).dtype(at::ScalarType::Float)); 70 | 71 | three_interpolate_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), grad_out.size(2), m, 72 | grad_out.DATA_PTR(), idx.DATA_PTR(), 73 | weight.DATA_PTR(), output.DATA_PTR()); 74 | 75 | return output; 76 | } 77 | -------------------------------------------------------------------------------- /.github/workflows/building-conda.yml: -------------------------------------------------------------------------------- 1 | name: Building Conda 2 | 3 | on: [workflow_dispatch] 4 | 5 | jobs: 6 | 7 | conda-build: 8 | runs-on: ${{ matrix.os }} 9 | 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | os: [ubuntu-18.04, macos-10.15] #, windows-2019] 14 | python-version: ['3.7', '3.8', '3.9'] #, '3.10'] 15 | torch-version: [1.10.0, 1.11.0, 1.12.0] 16 | cuda-version: ['cpu', 'cu102', 'cu113', 'cu115', 'cu116'] 17 | exclude: 18 | - torch-version: 1.10.0 19 | cuda-version: 'cu116' 20 | - torch-version: 1.11.0 21 | cuda-version: 'cu116' 22 | - torch-version: 1.12.0 23 | cuda-version: 'cu115' 24 | - torch-version: 1.10.0 25 | cuda-version: 'cu115' 26 | - torch-version: 1.10.0 27 | python-version: '3.10' 28 | - os: windows-2019 29 | torch-version: 1.11.0 30 | cuda-version: 'cu102' 31 | - os: macos-10.15 32 | cuda-version: 'cu102' 33 | - os: macos-10.15 34 | cuda-version: 'cu113' 35 | - os: macos-10.15 36 | cuda-version: 'cu115' 37 | - os: macos-10.15 38 | cuda-version: 'cu116' 39 | - os: macos-10.15 40 | python-version: '3.10' # this is unhappy 41 | - os: ubuntu-18.04 42 | cuda-version: 'cu115' 43 | - os: windows-2019 44 | cuda-version: 'cu102' 45 | - os: windows-2019 # Complains about CUDA mismatch. 46 | python-version: '3.7' 47 | 48 | steps: 49 | - uses: actions/checkout@v2 50 | - name: Set up Conda for Python ${{ matrix.python-version }} 51 | uses: conda-incubator/setup-miniconda@v2 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | 55 | - name: Free up disk space 56 | if: ${{ runner.os == 'Linux' }} 57 | run: | 58 | sudo rm -rf /usr/share/dotnet 59 | 60 | - name: Free up disk space 61 | if: ${{ runner.os == 'Linux' }} 62 | run: | 63 | sudo rm -rf /usr/share/dotnet 64 | 65 | - name: Install Conda packages 66 | run: | 67 | conda install conda-build conda-verify --yes 68 | shell: 69 | bash -l {0} 70 | 71 | - name: Install CUDA ${{ matrix.cuda-version }} 72 | if: ${{ matrix.cuda-version != 'cpu' }} 73 | run: | 74 | bash .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}.sh 75 | shell: 76 | bash 77 | 78 | - name: Build Conda package for CPU 79 | if: ${{ matrix.cuda-version == 'cpu' }} 80 | run: | 81 | FORCE_CUDA=0 TORCH_CUDA_ARCH_LIST=0 ./conda/torch-points-kernels/build_conda.sh ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }} 82 | shell: 83 | bash -l {0} 84 | 85 | - name: Build Conda package for GPU 86 | if: ${{ matrix.cuda-version != 'cpu' }} 87 | run: | 88 | source .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}-env.sh 89 | ./conda/torch-points-kernels/build_conda.sh ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }} 90 | shell: 91 | bash -l {0} 92 | 93 | - name: Publish Conda package 94 | run: | 95 | conda install anaconda-client --yes 96 | anaconda upload --force --label main $HOME/conda-bld/*/*.tar.bz2 97 | env: 98 | ANACONDA_API_TOKEN: ${{ secrets.CONDA_TOKEN }} 99 | shell: 100 | bash -l {0} 101 | -------------------------------------------------------------------------------- /cuda/src/ball_query.cpp: 
-------------------------------------------------------------------------------- 1 | #include "ball_query.h" 2 | #include "compat.h" 3 | #include "utils.h" 4 | 5 | void query_ball_point_kernel_dense_wrapper(int b, int n, int m, float radius, int nsample, 6 | const float* new_xyz, const float* xyz, int64_t* idx, 7 | float* dist_out); 8 | 9 | void query_ball_point_kernel_partial_wrapper(int64_t batch_size, int size_x, int size_y, 10 | float radius, int nsample, const float* x, 11 | const float* y, const int64_t* batch_x, 12 | const int64_t* batch_y, int64_t* idx_out, 13 | float* dist_out); 14 | 15 | std::pair ball_query_dense(at::Tensor new_xyz, at::Tensor xyz, 16 | const float radius, const int nsample) 17 | { 18 | CHECK_CONTIGUOUS(new_xyz); 19 | CHECK_CONTIGUOUS(xyz); 20 | CHECK_IS_FLOAT(new_xyz); 21 | CHECK_IS_FLOAT(xyz); 22 | 23 | CHECK_CUDA(xyz); 24 | CHECK_CUDA(new_xyz); 25 | 26 | at::Tensor idx = torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample}, 27 | at::device(new_xyz.device()).dtype(at::ScalarType::Long)); 28 | at::Tensor dist = torch::full({new_xyz.size(0), new_xyz.size(1), nsample}, -1, 29 | at::device(new_xyz.device()).dtype(at::ScalarType::Float)); 30 | 31 | query_ball_point_kernel_dense_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), radius, 32 | nsample, new_xyz.DATA_PTR(), xyz.DATA_PTR(), 33 | idx.DATA_PTR(), dist.DATA_PTR()); 34 | 35 | return std::make_pair(idx, dist); 36 | } 37 | 38 | at::Tensor degree(at::Tensor row, int64_t num_nodes) 39 | { 40 | auto zero = at::zeros(num_nodes, row.options()); 41 | auto one = at::ones(row.size(0), row.options()); 42 | return zero.scatter_add_(0, row, one); 43 | } 44 | 45 | std::pair ball_query_partial_dense(at::Tensor x, at::Tensor y, 46 | at::Tensor batch_x, at::Tensor batch_y, 47 | const float radius, const int nsample) 48 | { 49 | CHECK_CONTIGUOUS(x); 50 | CHECK_CONTIGUOUS(y); 51 | CHECK_IS_FLOAT(x); 52 | CHECK_IS_FLOAT(y); 53 | CHECK_CUDA(x); 54 | CHECK_CUDA(y); 55 | CHECK_CUDA(batch_x); 56 | CHECK_CUDA(batch_y); 57 | 58 | at::Tensor idx = 59 | torch::full({y.size(0), nsample}, -1, at::device(y.device()).dtype(at::ScalarType::Long)); 60 | 61 | at::Tensor dist = 62 | torch::full({y.size(0), nsample}, -1, at::device(y.device()).dtype(at::ScalarType::Float)); 63 | 64 | cudaSetDevice(x.get_device()); 65 | auto batch_sizes = (int64_t*)malloc(sizeof(int64_t)); 66 | cudaMemcpy(batch_sizes, batch_x[-1].DATA_PTR(), sizeof(int64_t), 67 | cudaMemcpyDeviceToHost); 68 | auto batch_size = batch_sizes[0] + 1; 69 | 70 | batch_x = degree(batch_x, batch_size); 71 | batch_x = at::cat({at::zeros(1, batch_x.options()), batch_x.cumsum(0)}, 0); 72 | batch_y = degree(batch_y, batch_size); 73 | batch_y = at::cat({at::zeros(1, batch_y.options()), batch_y.cumsum(0)}, 0); 74 | 75 | query_ball_point_kernel_partial_wrapper( 76 | batch_size, x.size(0), y.size(0), radius, nsample, x.DATA_PTR(), y.DATA_PTR(), 77 | batch_x.DATA_PTR(), batch_y.DATA_PTR(), idx.DATA_PTR(), 78 | dist.DATA_PTR()); 79 | 80 | return std::make_pair(idx, dist); 81 | } 82 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | import glob 4 | 5 | try: 6 | import torch 7 | from torch.utils.cpp_extension import ( 8 | BuildExtension, 9 | CUDAExtension, 10 | CUDA_HOME, 11 | CppExtension, 12 | ) 13 | except: 14 | raise ModuleNotFoundError("Please install pytorch >= 1.1 before proceeding.") 15 | 16 | 
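# Build-mode selection: by default the CUDA extension is compiled only when torch reports a
# CUDA device and a CUDA toolkit is found (CUDA_HOME), while the CPU extension is always
# compiled. The environment variables checked below override this behaviour:
#   FORCE_CUDA=1      -> also build the CUDA extension (e.g. for Docker builds without a visible GPU)
#   FORCE_ONLY_CUDA=1 -> build only the CUDA extension and skip the CPU one
#   FORCE_ONLY_CPU=1  -> build only the CPU extension and skip the CUDA one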
WITH_CUDA = torch.cuda.is_available() and CUDA_HOME is not None 17 | WITH_CPU = True 18 | if os.getenv("FORCE_CUDA", "0") == "1": 19 | WITH_CUDA = True 20 | if os.getenv("FORCE_ONLY_CUDA", "0") == "1": 21 | WITH_CUDA = True 22 | WITH_CPU = False 23 | if os.getenv("FORCE_ONLY_CPU", "0") == "1": 24 | WITH_CUDA = False 25 | WITH_CPU = True 26 | 27 | 28 | def get_ext_modules(): 29 | TORCH_MAJOR = int(torch.__version__.split(".")[0]) 30 | TORCH_MINOR = int(torch.__version__.split(".")[1]) 31 | extra_compile_args = {"cxx": ["-O3"]} 32 | if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): 33 | extra_compile_args["cxx"] += ["-DVERSION_GE_1_3"] 34 | 35 | ext_src_root = "cuda" 36 | ext_sources = glob.glob("{}/src/*.cpp".format(ext_src_root)) + glob.glob( 37 | "{}/src/*.cu".format(ext_src_root) 38 | ) 39 | 40 | ext_modules = [] 41 | if WITH_CUDA: 42 | nvcc_flags = os.getenv("NVCC_FLAGS", "") 43 | nvcc_flags = [] if nvcc_flags == "" else nvcc_flags.split(" ") 44 | nvcc_flags += ["-arch=sm_35", "--expt-relaxed-constexpr", "-O2"] 45 | extra_compile_args["nvcc"] = nvcc_flags 46 | 47 | ext_modules.append( 48 | CUDAExtension( 49 | name="torch_points_kernels.points_cuda", 50 | sources=ext_sources, 51 | include_dirs=["{}/include".format(ext_src_root)], 52 | extra_compile_args=extra_compile_args, 53 | ) 54 | ) 55 | 56 | cpu_ext_src_root = "cpu" 57 | cpu_ext_sources = glob.glob("{}/src/*.cpp".format(cpu_ext_src_root)) 58 | 59 | if WITH_CPU: 60 | ext_modules.append( 61 | CppExtension( 62 | name="torch_points_kernels.points_cpu", 63 | sources=cpu_ext_sources, 64 | include_dirs=["{}/include".format(cpu_ext_src_root)], 65 | extra_compile_args=extra_compile_args, 66 | ) 67 | ) 68 | return ext_modules 69 | 70 | 71 | class CustomBuildExtension(BuildExtension): 72 | def __init__(self, *args, **kwargs): 73 | super().__init__(*args, no_python_abi_suffix=True, use_ninja=False, **kwargs) 74 | 75 | 76 | def get_cmdclass(): 77 | return {"build_ext": CustomBuildExtension} 78 | 79 | 80 | this_directory = os.path.abspath(os.path.dirname(__file__)) 81 | with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as f: 82 | long_description = f.read() 83 | 84 | requirements = ["torch>=1.1.0", "numba", "numpy<=1.21", "scikit-learn"] 85 | 86 | url = "https://github.com/nicolas-chaulet/torch-points-kernels" 87 | __version__ = "0.7.1" 88 | setup( 89 | name="torch-points-kernels", 90 | version=__version__, 91 | author="Nicolas Chaulet", 92 | packages=find_packages(), 93 | description="PyTorch kernels for spatial operations on point clouds", 94 | url=url, 95 | download_url="{}/archive/{}.tar.gz".format(url, __version__), 96 | install_requires=requirements, 97 | ext_modules=get_ext_modules(), 98 | cmdclass=get_cmdclass(), 99 | long_description=long_description, 100 | long_description_content_type="text/markdown", 101 | classifiers=[ 102 | "Programming Language :: Python :: 3", 103 | "License :: OSI Approved :: MIT License", 104 | ], 105 | ) 106 | -------------------------------------------------------------------------------- /torch_points_kernels/cluster.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .torchpoints import ball_query_partial_dense 3 | import numpy as np 4 | import numba 5 | from typing import List 6 | 7 | 8 | @numba.jit(nopython=True) 9 | def _grow_proximity_core(neighbours, min_cluster_size): 10 | num_points = int(neighbours.shape[0]) 11 | visited = np.zeros((num_points,), dtype=numba.types.bool_) 12 | clusters = [] 13 | for i in 
range(num_points): 14 | if visited[i]: 15 | continue 16 | 17 | cluster = [] 18 | queue = [] 19 | visited[i] = True 20 | queue.append(i) 21 | cluster.append(i) 22 | while len(queue): 23 | k = queue.pop() 24 | k_neighbours = neighbours[k] 25 | for nei in k_neighbours: 26 | if nei.item() == -1: 27 | break 28 | 29 | if not visited[nei]: 30 | visited[nei] = True 31 | queue.append(nei.item()) 32 | cluster.append(nei.item()) 33 | 34 | if len(cluster) >= min_cluster_size: 35 | clusters.append(cluster) 36 | 37 | return clusters 38 | 39 | 40 | def grow_proximity(pos, batch, nsample=16, radius=0.02, min_cluster_size=32): 41 | """Grow based on proximity only 42 | Neighbour search is done on device while the cluster assignement is done on cpu""" 43 | assert pos.shape[0] == batch.shape[0] 44 | neighbours = ball_query_partial_dense(radius, nsample, pos, pos, batch, batch)[0].cpu().numpy() 45 | return _grow_proximity_core(neighbours, min_cluster_size) 46 | 47 | 48 | def region_grow( 49 | pos, labels, batch, ignore_labels=[], nsample=16, radius=0.02, min_cluster_size=32 50 | ) -> List[torch.Tensor]: 51 | """Region growing clustering algorithm proposed in 52 | PointGroup: Dual-Set Point Grouping for 3D Instance Segmentation 53 | https://arxiv.org/pdf/2004.01658.pdf 54 | for instance segmentation 55 | 56 | Parameters 57 | ---------- 58 | pos: torch.Tensor [N, 3] 59 | Location of the points 60 | labels: torch.Tensor [N,] 61 | labels of each point 62 | ignore_labels: 63 | Labels that should be ignored, no region growing will be performed on those 64 | nsample: 65 | maximum number of neighbours to consider 66 | radius: 67 | radius for the neighbour search 68 | min_cluster_size: 69 | Number of points above which a cluster is considered valid 70 | """ 71 | assert labels.dim() == 1 72 | assert pos.dim() == 2 73 | assert pos.shape[0] == labels.shape[0] 74 | 75 | unique_labels = torch.unique(labels) 76 | clusters = [] 77 | ind = torch.arange(0, pos.shape[0]) 78 | for l in unique_labels: 79 | if l in ignore_labels: 80 | continue 81 | 82 | # Build clusters for a given label (ignore other points) 83 | label_mask = labels == l 84 | local_ind = ind[label_mask] 85 | 86 | # Remap batch to a continuous sequence 87 | label_batch = batch[label_mask] 88 | unique_in_batch = torch.unique(label_batch) 89 | remaped_batch = torch.empty_like(label_batch) 90 | for new, old in enumerate(unique_in_batch): 91 | mask = label_batch == old 92 | remaped_batch[mask] = new 93 | 94 | # Cluster 95 | label_clusters = grow_proximity( 96 | pos[label_mask, :], 97 | remaped_batch, 98 | nsample=nsample, 99 | radius=radius, 100 | min_cluster_size=min_cluster_size, 101 | ) 102 | 103 | # Remap indices to original coordinates 104 | if len(label_clusters): 105 | for cluster in label_clusters: 106 | cluster = torch.tensor(cluster).to(pos.device) 107 | clusters.append(local_ind[cluster]) 108 | 109 | return clusters 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 3D Point Cloud Kernels 2 | Pytorch CPU and CUDA kernels for spatial search and interpolation for 3D point clouds. 
3 | 4 | [![PyPI version](https://badge.fury.io/py/torch-points-kernels.svg)](https://badge.fury.io/py/torch-points-kernels) [![Deploy](https://github.com/torch-points3d/torch-points-kernels/actions/workflows/deploy.yaml/badge.svg)](https://github.com/torch-points3d/torch-points-kernels/actions/workflows/deploy.yaml) [![Unittests](https://github.com/torch-points3d/torch-points-kernels/actions/workflows/tests.yaml/badge.svg)](https://github.com/torch-points3d/torch-points-kernels/actions/workflows/tests.yaml) 5 | 6 | ## Installation 7 | **Update:** we now provide precompiled Conda packages for the latest PyTorch/CUDA combinations (PyTorch >= 1.10.0). To install with conda: 8 | ``` 9 | conda install -c torch-points3d torch-points-kernels 10 | ``` 11 | 12 | Or, you can compile the wheel yourself for any PyTorch/CUDA combination (must have a matching installation of CUDA toolkit): 13 | ``` 14 | pip install torch-points-kernels 15 | ``` 16 | 17 | To force CUDA installation (for example on Docker builds) please use the flag `FORCE_CUDA`: 18 | ``` 19 | FORCE_CUDA=1 pip install torch-points-kernels 20 | ``` 21 | 22 | ## Usage 23 | ``` 24 | import torch 25 | import torch_points_kernels.points_cuda 26 | ``` 27 | 28 | ## Build and test 29 | ``` 30 | python setup.py build_ext --inplace 31 | python -m unittest 32 | ``` 33 | 34 | ## Troubleshooting 35 | 36 | ### Compilation issues 37 | Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, e.g.: 38 | ``` 39 | $ python -c "import torch; print(torch.__version__)" 40 | >>> 1.4.0 41 | 42 | $ echo $PATH 43 | >>> /usr/local/cuda/bin:... 44 | 45 | $ echo $CPATH 46 | >>> /usr/local/cuda/include:... 47 | ``` 48 | 49 | On the compilation, if you have this error: 50 | ```error: cannot call member function 'void std::basic_string<_CharT, _Traits, _Alloc>::_Rep::_M_set_sharable()``` 51 | it means that your nvcc version is too old. The version must be at least 10.1.168. 52 | To check the version: 53 | ``` 54 | nvcc --version 55 | >>> V10.1.168 56 | ``` 57 | 58 | ### Windows compilation 59 | On Windows you may have this error when compiling: 60 | ``` 61 | error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized 62 | error: member "torch::jit::detail::ParameterPolicy::all_slots" may not be initialized 63 | error: member "torch::jit::detail::BufferPolicy::all_slots" may not be initialized 64 | error: member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized 65 | ``` 66 | This requires you to edit some of your pytorch header files, use [this script](https://github.com/rusty1s/pytorch_scatter/blob/master/script/torch.sh) as a guide. 67 | 68 | ### CUDA kernel failed : no kernel image is available for execution on the device 69 | 70 | This can happen when trying to run the code on a different GPU than the one used to compile the `torch-points-kernels` library. Uninstall `torch-points-kernels`, clear cache, and reinstall after setting the `TORCH_CUDA_ARCH_LIST` environment variable. For example, for compiling with a Tesla T4 (Turing 7.5) and running the code on a Tesla V100 (Volta 7.0) use: 71 | ``` 72 | export TORCH_CUDA_ARCH_LIST="7.0;7.5" 73 | ``` 74 | See [this useful chart](http://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for more architecture compatibility. 75 | 76 | 77 | ## Projects using those kernels. 
78 | 79 | [```Pytorch Point Cloud Benchmark```](https://github.com/nicolas-chaulet/deeppointcloud-benchmarks) 80 | 81 | ## Credit 82 | 83 | * [```Pointnet2_Tensorflow```](https://github.com/charlesq34/pointnet2) by [Charles R. Qi](https://github.com/charlesq34) 84 | 85 | * [```Pointnet2_PyTorch```](https://github.com/erikwijmans/Pointnet2_PyTorch) by [Erik Wijmans](https://github.com/erikwijmans) 86 | 87 | * [```GRNet```](https://github.com/hzxie/GRNet) by [Haozhe Xie](https://github.com/hzxie) 88 | -------------------------------------------------------------------------------- /torch_points_kernels/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import List, Optional 3 | import numpy as np 4 | import numba 5 | 6 | if torch.cuda.is_available(): 7 | import torch_points_kernels.points_cuda as tpcuda 8 | 9 | 10 | @numba.jit(nopython=True, parallel=True) 11 | def _instance_iou_cpu( 12 | instance_idx, 13 | instance_offsets, 14 | gt_instances, 15 | gt_instance_sizes, 16 | num_gt_instances: np.array, 17 | batch: np.array, 18 | ): 19 | num_proposed_instances = len(instance_offsets) - 1 20 | iou = np.zeros((num_proposed_instances, num_gt_instances.sum())) 21 | offset_num_gt_instances = np.concatenate((np.array([0]), num_gt_instances.cumsum())) 22 | for proposed_instance in range(num_proposed_instances): 23 | instance = instance_idx[instance_offsets[proposed_instance] : instance_offsets[proposed_instance + 1]] 24 | sample_idx = batch[instance[0]] 25 | gt_count_offset = offset_num_gt_instances[sample_idx] 26 | sample_instance_count = num_gt_instances[sample_idx] 27 | for instance_id in numba.prange(1, sample_instance_count + 1): 28 | intersection = 0 29 | for idx in instance: 30 | if gt_instances[idx] == instance_id: 31 | intersection += 1 32 | iou[proposed_instance, gt_count_offset + instance_id - 1] = intersection / float( 33 | len(instance) + gt_instance_sizes[gt_count_offset + instance_id - 1] - intersection 34 | ) 35 | return iou 36 | 37 | 38 | def instance_iou( 39 | instance_idx: List[torch.Tensor], 40 | gt_instances: torch.Tensor, 41 | batch: Optional[torch.Tensor] = None, 42 | ): 43 | """Computes the IoU between each proposed instance in instance_idx and ground truth instances. Returns a 44 | tensor of shape [instance_idx.shape[0], num_instances] that contains the iou between the proposed instances and all gt instances 45 | Instance label 0 is reserved for non instance points 46 | 47 | Parameters 48 | ---------- 49 | instance_idx : List[torch.Tensor] 50 | List of instances. 
Each tensor in this list is a proposed and contains the index of the points 51 | that belong to that particular instance 52 | gt_instances : torch.Tensor 53 | Ground truth instances, contains the index of the instance for each point 54 | 55 | Returns 56 | ------- 57 | ious: torch.Tensor[nb_proposals, nb_groundtruth] 58 | """ 59 | if batch is None: 60 | batch = torch.zeros_like(gt_instances) 61 | 62 | # Gather number of gt instances per batch and size of those instances 63 | gt_instance_sizes = [] 64 | num_gt_instances = [] 65 | batch_size = batch[-1] + 1 66 | for s in range(batch_size): 67 | batch_mask = batch == s 68 | sample_gt_instances = gt_instances[batch_mask] 69 | sample_num_gt_instances = torch.max(sample_gt_instances).item() 70 | num_gt_instances.append(sample_num_gt_instances) 71 | for instance_id in range(1, sample_num_gt_instances + 1): 72 | gt_instance_sizes.append(torch.sum(sample_gt_instances == instance_id)) 73 | gt_instance_sizes = torch.stack(gt_instance_sizes) 74 | num_gt_instances = torch.tensor(num_gt_instances) 75 | 76 | # Instance offset when flatten 77 | instance_offsets = [0] 78 | cum_offset = 0 79 | for instance in instance_idx: 80 | cum_offset += instance.shape[0] 81 | instance_offsets.append(cum_offset) 82 | 83 | # Compute ious 84 | instance_idx = torch.cat(instance_idx) 85 | if gt_instances.is_cuda: 86 | return tpcuda.instance_iou_cuda( 87 | instance_idx.cuda(), 88 | torch.tensor(instance_offsets).cuda(), 89 | gt_instances.cuda(), 90 | gt_instance_sizes.cuda(), 91 | num_gt_instances.cuda(), 92 | batch.cuda(), 93 | ) 94 | else: 95 | res = _instance_iou_cpu( 96 | instance_idx.numpy(), 97 | np.asarray(instance_offsets), 98 | gt_instances.numpy(), 99 | gt_instance_sizes.numpy(), 100 | num_gt_instances.numpy(), 101 | batch.numpy(), 102 | ) 103 | return torch.tensor(res).float() 104 | -------------------------------------------------------------------------------- /cpu/src/bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "ball_query.h" 2 | #include "fps.h" 3 | #include "interpolate.h" 4 | #include "knn.h" 5 | 6 | using namespace pybind11::literals; 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 9 | { 10 | m.def("dense_knn", &dense_knn, "", "support"_a, "querry"_a, "k"_a); 11 | m.def("knn_interpolate", &knn_interpolate, "", "features"_a, "idx"_a, "weights"_a); 12 | m.def("knn_interpolate_grad", &knn_interpolate_grad, "", "grad_out"_a, "idx"_a, "weights"_a, 13 | "m"_a); 14 | m.def("fps", &fps, "", "points"_a, "num_samples"_a, "random"_a); 15 | 16 | m.def("ball_query", &ball_query, 17 | "compute the radius search of a point cloud using nanoflann" 18 | "- support : a pytorch tensor of size N1 x 3, points where the " 19 | "neighboors are accessed from" 20 | "- query : a pytorch tensor of size N2 x 3, centre of the balls" 21 | "- radius : float number, size of the ball for the radius search." 22 | "- max_num : int number, indicate the maximum of neaghbors allowed(if " 23 | "-1 then all the possible neighbors will be computed). 
" 24 | "- mode : int number that indicate which format for the neighborhood" 25 | "mode=0 mean a matrix of neighbors(-1 for shadow neighbors)" 26 | "mode=1 means a matrix of edges of size Num_edge x 2" 27 | "return a tensor of size N2 x M where M is either max_num or the " 28 | "maximum number of neighbors found if mode = 0, if mode=1 return a " 29 | "tensor of size Num_edge x 2 and return a tensor containing the " 30 | "squared distance of the neighbors", 31 | "support"_a, "querry"_a, "radius"_a, "max_num"_a = -1, "mode"_a = 0, "sorted"_a = false); 32 | 33 | m.def("batch_ball_query", &batch_ball_query, 34 | "compute the radius search of a point cloud for each batch using " 35 | "nanoflann" 36 | "- support : a pytorch tensor of size N1 x 3, points where the " 37 | "neighboors are accessed from" 38 | "- query : a pytorch tensor of size N2 x 3, centre of the balls" 39 | "- support_batch: a pytorch tensor(long) contains indices of the batch " 40 | "of the support size N1" 41 | "NB: the batch must be sorted" 42 | "- query_batch : a pytorch tensor(long) contains indices of the batch " 43 | "of the query size N2" 44 | "NB : the batch must be sorted" 45 | "-radius: float number, size of the ball for the radius search." 46 | "- max_num : int number, indicate the maximum of neaghbors allowed(if " 47 | "-1 then all the possible neighbors wrt the radius will be computed)." 48 | "- mode : int number that indicate which format for the neighborhood" 49 | "mode=0 mean a matrix of neighbors(N1 for shadow neighbors)" 50 | "mode=1 means a matrix of edges of size Num_edge x 2" 51 | "return a tensor of size N2 x M where M is either max_num or the " 52 | "maximum number of neighbors found if mode = 0, if mode=1 return a " 53 | "tensor of size Num_edge x 2 and return a tensor containing the " 54 | "squared distance of the neighbors", 55 | "support"_a, "querry"_a, "query_batch"_a, "support_batch"_a, "radius"_a, "max_num"_a = -1, 56 | "mode"_a = 0, "sorted"_a = false); 57 | m.def("dense_ball_query", &dense_ball_query, 58 | "compute the radius search of a batch of point cloud using nanoflann" 59 | "- support : a pytorch tensor of size B x N1 x 3, points where the " 60 | "neighboors are accessed from" 61 | "- query : a pytorch tensor of size B x N2 x 3, centre of the balls" 62 | "- radius : float number, size of the ball for the radius search." 63 | "- max_num : int number, indicate the maximum of neaghbors allowed(if " 64 | "-1 then all the possible neighbors will be computed). 
" 65 | "- mode : int number that indicate which format for the neighborhood" 66 | "mode=0 mean a matrix of neighbors(-1 for shadow neighbors)" 67 | "mode=1 means a matrix of edges of size Num_edge x 2" 68 | "return a tensor of size B x N2 x M where M is either max_num or the " 69 | "maximum number of neighbors found if mode = 0, if mode=1 return a " 70 | "tensor of size Num_edge x 2 and return a tensor containing the " 71 | "squared distance of the neighbors", 72 | "support"_a, "querry"_a, "radius"_a, "max_num"_a = -1, "mode"_a = 0, "sorted"_a = false); 73 | } 74 | -------------------------------------------------------------------------------- /cuda/src/ball_query_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "cuda_utils.h" 6 | 7 | // input: new_xyz(b, m, 3) xyz(b, n, 3) 8 | // output: idx(b, m, nsample) 9 | __global__ void query_ball_point_kernel_dense(int b, int n, int m, float radius, int nsample, 10 | const float* __restrict__ new_xyz, 11 | const float* __restrict__ xyz, 12 | int64_t* __restrict__ idx_out, 13 | float* __restrict__ dist_out) 14 | { 15 | int batch_index = blockIdx.x; 16 | xyz += batch_index * n * 3; 17 | new_xyz += batch_index * m * 3; 18 | idx_out += m * nsample * batch_index; 19 | dist_out += m * nsample * batch_index; 20 | 21 | int index = threadIdx.x; 22 | int stride = blockDim.x; 23 | 24 | float radius2 = radius * radius; 25 | for (int j = index; j < m; j += stride) 26 | { 27 | float new_x = new_xyz[j * 3 + 0]; 28 | float new_y = new_xyz[j * 3 + 1]; 29 | float new_z = new_xyz[j * 3 + 2]; 30 | for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) 31 | { 32 | float x = xyz[k * 3 + 0]; 33 | float y = xyz[k * 3 + 1]; 34 | float z = xyz[k * 3 + 2]; 35 | float d2 = 36 | (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 37 | if (d2 < radius2) 38 | { 39 | if (cnt == 0) 40 | { 41 | for (int l = 0; l < nsample; ++l) 42 | { 43 | idx_out[j * nsample + l] = k; 44 | } 45 | } 46 | idx_out[j * nsample + cnt] = k; 47 | dist_out[j * nsample + cnt] = d2; 48 | ++cnt; 49 | } 50 | } 51 | } 52 | } 53 | 54 | __global__ void query_ball_point_kernel_partial_dense(int size_x, int size_y, float radius, 55 | int nsample, const float* __restrict__ x, 56 | const float* __restrict__ y, 57 | const int64_t* __restrict__ batch_x, 58 | const int64_t* __restrict__ batch_y, 59 | int64_t* __restrict__ idx_out, 60 | float* __restrict__ dist_out) 61 | { 62 | // taken from 63 | // https://github.com/rusty1s/pytorch_cluster/blob/master/cuda/radius_kernel.cu 64 | const ptrdiff_t batch_idx = blockIdx.x; 65 | 66 | const ptrdiff_t start_idx_x = batch_x[batch_idx]; 67 | const ptrdiff_t end_idx_x = batch_x[batch_idx + 1]; 68 | 69 | const ptrdiff_t start_idx_y = batch_y[batch_idx]; 70 | const ptrdiff_t end_idx_y = batch_y[batch_idx + 1]; 71 | float radius2 = radius * radius; 72 | 73 | for (ptrdiff_t n_y = start_idx_y + threadIdx.x; n_y < end_idx_y; n_y += blockDim.x) 74 | { 75 | int64_t count = 0; 76 | for (ptrdiff_t n_x = start_idx_x; n_x < end_idx_x; n_x++) 77 | { 78 | float dist = 0; 79 | for (ptrdiff_t d = 0; d < 3; d++) 80 | { 81 | dist += (x[n_x * 3 + d] - y[n_y * 3 + d]) * (x[n_x * 3 + d] - y[n_y * 3 + d]); 82 | } 83 | if (dist <= radius2) 84 | { 85 | idx_out[n_y * nsample + count] = n_x; 86 | dist_out[n_y * nsample + count] = dist; 87 | count++; 88 | } 89 | if (count >= nsample) 90 | { 91 | break; 92 | } 93 | } 94 | } 95 | } 96 | 97 | void 
query_ball_point_kernel_dense_wrapper(int b, int n, int m, float radius, int nsample, 98 | const float* new_xyz, const float* xyz, int64_t* idx, 99 | float* dist_out) 100 | { 101 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 102 | query_ball_point_kernel_dense<<>>(b, n, m, radius, nsample, 103 | new_xyz, xyz, idx, dist_out); 104 | 105 | CUDA_CHECK_ERRORS(); 106 | } 107 | 108 | void query_ball_point_kernel_partial_wrapper(int64_t batch_size, int size_x, int size_y, 109 | float radius, int nsample, const float* x, 110 | const float* y, const int64_t* batch_x, 111 | const int64_t* batch_y, int64_t* idx_out, 112 | float* dist_out) 113 | { 114 | query_ball_point_kernel_partial_dense<<>>( 115 | size_x, size_y, radius, nsample, x, y, batch_x, batch_y, idx_out, dist_out); 116 | 117 | CUDA_CHECK_ERRORS(); 118 | } 119 | -------------------------------------------------------------------------------- /cuda/src/interpolate_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "cuda_utils.h" 6 | 7 | // input: unknown(b, n, 3) known(b, m, 3) 8 | // output: dist2(b, n, 3), idx(b, n, 3) 9 | __global__ void three_nn_kernel(int b, int n, int m, const float* __restrict__ unknown, 10 | const float* __restrict__ known, float* __restrict__ dist2, 11 | int* __restrict__ idx) 12 | { 13 | int batch_index = blockIdx.x; 14 | unknown += batch_index * n * 3; 15 | known += batch_index * m * 3; 16 | dist2 += batch_index * n * 3; 17 | idx += batch_index * n * 3; 18 | 19 | int index = threadIdx.x; 20 | int stride = blockDim.x; 21 | for (int j = index; j < n; j += stride) 22 | { 23 | float ux = unknown[j * 3 + 0]; 24 | float uy = unknown[j * 3 + 1]; 25 | float uz = unknown[j * 3 + 2]; 26 | 27 | double best1 = 1e40, best2 = 1e40, best3 = 1e40; 28 | int besti1 = 0, besti2 = 0, besti3 = 0; 29 | for (int k = 0; k < m; ++k) 30 | { 31 | float x = known[k * 3 + 0]; 32 | float y = known[k * 3 + 1]; 33 | float z = known[k * 3 + 2]; 34 | float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); 35 | if (d < best1) 36 | { 37 | best3 = best2; 38 | besti3 = besti2; 39 | best2 = best1; 40 | besti2 = besti1; 41 | best1 = d; 42 | besti1 = k; 43 | } 44 | else if (d < best2) 45 | { 46 | best3 = best2; 47 | besti3 = besti2; 48 | best2 = d; 49 | besti2 = k; 50 | } 51 | else if (d < best3) 52 | { 53 | best3 = d; 54 | besti3 = k; 55 | } 56 | } 57 | dist2[j * 3 + 0] = best1; 58 | dist2[j * 3 + 1] = best2; 59 | dist2[j * 3 + 2] = best3; 60 | 61 | idx[j * 3 + 0] = besti1; 62 | idx[j * 3 + 1] = besti2; 63 | idx[j * 3 + 2] = besti3; 64 | } 65 | } 66 | 67 | void three_nn_kernel_wrapper(int b, int n, int m, const float* unknown, const float* known, 68 | float* dist2, int* idx) 69 | { 70 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 71 | three_nn_kernel<<>>(b, n, m, unknown, known, dist2, idx); 72 | 73 | CUDA_CHECK_ERRORS(); 74 | } 75 | 76 | // input: points(b, c, m), idx(b, n, 3), weight(b, n, 3) 77 | // output: out(b, c, n) 78 | __global__ void three_interpolate_kernel(int b, int c, int m, int n, 79 | const float* __restrict__ points, 80 | const int* __restrict__ idx, 81 | const float* __restrict__ weight, float* __restrict__ out) 82 | { 83 | int batch_index = blockIdx.x; 84 | points += batch_index * m * c; 85 | 86 | idx += batch_index * n * 3; 87 | weight += batch_index * n * 3; 88 | 89 | out += batch_index * n * c; 90 | 91 | const int index = threadIdx.y * blockDim.x + threadIdx.x; 92 | const int stride = blockDim.y * 
blockDim.x; 93 | for (int i = index; i < c * n; i += stride) 94 | { 95 | const int l = i / n; 96 | const int j = i % n; 97 | float w1 = weight[j * 3 + 0]; 98 | float w2 = weight[j * 3 + 1]; 99 | float w3 = weight[j * 3 + 2]; 100 | 101 | int i1 = idx[j * 3 + 0]; 102 | int i2 = idx[j * 3 + 1]; 103 | int i3 = idx[j * 3 + 2]; 104 | 105 | out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 + points[l * m + i3] * w3; 106 | } 107 | } 108 | 109 | void three_interpolate_kernel_wrapper(int b, int c, int m, int n, const float* points, 110 | const int* idx, const float* weight, float* out) 111 | { 112 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 113 | three_interpolate_kernel<<>>(b, c, m, n, points, idx, 114 | weight, out); 115 | 116 | CUDA_CHECK_ERRORS(); 117 | } 118 | 119 | // input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3) 120 | // output: grad_points(b, c, m) 121 | 122 | __global__ void three_interpolate_grad_kernel(int b, int c, int n, int m, 123 | const float* __restrict__ grad_out, 124 | const int* __restrict__ idx, 125 | const float* __restrict__ weight, 126 | float* __restrict__ grad_points) 127 | { 128 | int batch_index = blockIdx.x; 129 | grad_out += batch_index * n * c; 130 | idx += batch_index * n * 3; 131 | weight += batch_index * n * 3; 132 | grad_points += batch_index * m * c; 133 | 134 | const int index = threadIdx.y * blockDim.x + threadIdx.x; 135 | const int stride = blockDim.y * blockDim.x; 136 | for (int i = index; i < c * n; i += stride) 137 | { 138 | const int l = i / n; 139 | const int j = i % n; 140 | float w1 = weight[j * 3 + 0]; 141 | float w2 = weight[j * 3 + 1]; 142 | float w3 = weight[j * 3 + 2]; 143 | 144 | int i1 = idx[j * 3 + 0]; 145 | int i2 = idx[j * 3 + 1]; 146 | int i3 = idx[j * 3 + 2]; 147 | 148 | atomicAdd(grad_points + l * m + i1, grad_out[i] * w1); 149 | atomicAdd(grad_points + l * m + i2, grad_out[i] * w2); 150 | atomicAdd(grad_points + l * m + i3, grad_out[i] * w3); 151 | } 152 | } 153 | 154 | void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, const float* grad_out, 155 | const int* idx, const float* weight, float* grad_points) 156 | { 157 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 158 | three_interpolate_grad_kernel<<>>( 159 | b, c, n, m, grad_out, idx, weight, grad_points); 160 | 161 | CUDA_CHECK_ERRORS(); 162 | } 163 | -------------------------------------------------------------------------------- /cuda/src/sampling_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "cuda_utils.h" 5 | 6 | __device__ void __update(float* __restrict__ dists, int* __restrict__ dists_i, int idx1, int idx2) 7 | { 8 | const float v1 = dists[idx1], v2 = dists[idx2]; 9 | const int i1 = dists_i[idx1], i2 = dists_i[idx2]; 10 | dists[idx1] = max(v1, v2); 11 | dists_i[idx1] = v2 > v1 ? 
i2 : i1; 12 | } 13 | 14 | // Input dataset: (b, n, 3), tmp: (b, n) 15 | // Ouput idxs (b, m) 16 | template 17 | __global__ void furthest_point_sampling_kernel(int b, int n, int m, 18 | const float* __restrict__ dataset, 19 | float* __restrict__ temp, int* __restrict__ idxs) 20 | { 21 | if (m <= 0) 22 | return; 23 | __shared__ float dists[block_size]; 24 | __shared__ int dists_i[block_size]; 25 | 26 | int batch_index = blockIdx.x; 27 | dataset += batch_index * n * 3; 28 | temp += batch_index * n; 29 | idxs += batch_index * m; 30 | 31 | int tid = threadIdx.x; 32 | const int stride = block_size; 33 | 34 | int old = 0; 35 | if (threadIdx.x == 0) 36 | idxs[0] = old; 37 | 38 | __syncthreads(); 39 | for (int j = 1; j < m; j++) 40 | { 41 | int besti = 0; 42 | float best = -1; 43 | float x1 = dataset[old * 3 + 0]; 44 | float y1 = dataset[old * 3 + 1]; 45 | float z1 = dataset[old * 3 + 2]; 46 | for (int k = tid; k < n; k += stride) 47 | { 48 | float x2, y2, z2; 49 | x2 = dataset[k * 3 + 0]; 50 | y2 = dataset[k * 3 + 1]; 51 | z2 = dataset[k * 3 + 2]; 52 | float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); 53 | if (mag <= 1e-3) 54 | continue; 55 | 56 | float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); 57 | 58 | float d2 = min(d, temp[k]); 59 | temp[k] = d2; 60 | besti = d2 > best ? k : besti; 61 | best = d2 > best ? d2 : best; 62 | } 63 | dists[tid] = best; 64 | dists_i[tid] = besti; 65 | __syncthreads(); 66 | 67 | if (block_size >= 512) 68 | { 69 | if (tid < 256) 70 | { 71 | __update(dists, dists_i, tid, tid + 256); 72 | } 73 | __syncthreads(); 74 | } 75 | if (block_size >= 256) 76 | { 77 | if (tid < 128) 78 | { 79 | __update(dists, dists_i, tid, tid + 128); 80 | } 81 | __syncthreads(); 82 | } 83 | if (block_size >= 128) 84 | { 85 | if (tid < 64) 86 | { 87 | __update(dists, dists_i, tid, tid + 64); 88 | } 89 | __syncthreads(); 90 | } 91 | if (block_size >= 64) 92 | { 93 | if (tid < 32) 94 | { 95 | __update(dists, dists_i, tid, tid + 32); 96 | } 97 | __syncthreads(); 98 | } 99 | if (block_size >= 32) 100 | { 101 | if (tid < 16) 102 | { 103 | __update(dists, dists_i, tid, tid + 16); 104 | } 105 | __syncthreads(); 106 | } 107 | if (block_size >= 16) 108 | { 109 | if (tid < 8) 110 | { 111 | __update(dists, dists_i, tid, tid + 8); 112 | } 113 | __syncthreads(); 114 | } 115 | if (block_size >= 8) 116 | { 117 | if (tid < 4) 118 | { 119 | __update(dists, dists_i, tid, tid + 4); 120 | } 121 | __syncthreads(); 122 | } 123 | if (block_size >= 4) 124 | { 125 | if (tid < 2) 126 | { 127 | __update(dists, dists_i, tid, tid + 2); 128 | } 129 | __syncthreads(); 130 | } 131 | if (block_size >= 2) 132 | { 133 | if (tid < 1) 134 | { 135 | __update(dists, dists_i, tid, tid + 1); 136 | } 137 | __syncthreads(); 138 | } 139 | 140 | old = dists_i[0]; 141 | if (tid == 0) 142 | idxs[j] = old; 143 | } 144 | } 145 | 146 | void furthest_point_sampling_kernel_wrapper(int b, int n, int m, const float* dataset, float* temp, 147 | int* idxs) 148 | { 149 | unsigned int n_threads = opt_n_threads(n); 150 | 151 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 152 | 153 | switch (n_threads) 154 | { 155 | case 512: 156 | furthest_point_sampling_kernel<512> 157 | <<>>(b, n, m, dataset, temp, idxs); 158 | break; 159 | case 256: 160 | furthest_point_sampling_kernel<256> 161 | <<>>(b, n, m, dataset, temp, idxs); 162 | break; 163 | case 128: 164 | furthest_point_sampling_kernel<128> 165 | <<>>(b, n, m, dataset, temp, idxs); 166 | break; 167 | case 64: 168 | furthest_point_sampling_kernel<64> 169 | <<>>(b, n, 
m, dataset, temp, idxs); 170 | break; 171 | case 32: 172 | furthest_point_sampling_kernel<32> 173 | <<>>(b, n, m, dataset, temp, idxs); 174 | break; 175 | case 16: 176 | furthest_point_sampling_kernel<16> 177 | <<>>(b, n, m, dataset, temp, idxs); 178 | break; 179 | case 8: 180 | furthest_point_sampling_kernel<8> 181 | <<>>(b, n, m, dataset, temp, idxs); 182 | break; 183 | case 4: 184 | furthest_point_sampling_kernel<4> 185 | <<>>(b, n, m, dataset, temp, idxs); 186 | break; 187 | case 2: 188 | furthest_point_sampling_kernel<2> 189 | <<>>(b, n, m, dataset, temp, idxs); 190 | break; 191 | case 1: 192 | furthest_point_sampling_kernel<1> 193 | <<>>(b, n, m, dataset, temp, idxs); 194 | break; 195 | default: 196 | furthest_point_sampling_kernel<512> 197 | <<>>(b, n, m, dataset, temp, idxs); 198 | } 199 | 200 | CUDA_CHECK_ERRORS(); 201 | } 202 | -------------------------------------------------------------------------------- /cpu/src/ball_query.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "ball_query.h" 3 | #include "compat.h" 4 | #include "neighbors.cpp" 5 | #include "neighbors.h" 6 | #include "utils.h" 7 | #include 8 | #include 9 | 10 | std::pair ball_query(at::Tensor support, at::Tensor query, float radius, 11 | int max_num, int mode, bool sorted) 12 | { 13 | CHECK_CONTIGUOUS(support); 14 | CHECK_CONTIGUOUS(query); 15 | 16 | at::Tensor out; 17 | at::Tensor out_dists; 18 | std::vector neighbors_indices(query.size(0), 0); 19 | std::vector neighbors_dists(query.size(0), -1); 20 | 21 | auto options = torch::TensorOptions().dtype(torch::kLong).device(torch::kCPU); 22 | auto options_dist = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU); 23 | int max_count = 0; 24 | 25 | AT_DISPATCH_ALL_TYPES(query.scalar_type(), "radius_search", [&] { 26 | auto data_q = query.DATA_PTR(); 27 | auto data_s = support.DATA_PTR(); 28 | std::vector queries_stl = 29 | std::vector(data_q, data_q + query.size(0) * query.size(1)); 30 | std::vector supports_stl = 31 | std::vector(data_s, data_s + support.size(0) * support.size(1)); 32 | 33 | max_count = nanoflann_neighbors(queries_stl, supports_stl, neighbors_indices, 34 | neighbors_dists, radius, max_num, mode, sorted); 35 | }); 36 | auto neighbors_dists_ptr = neighbors_dists.data(); 37 | int64_t* neighbors_indices_ptr = neighbors_indices.data(); 38 | if (mode == 0) 39 | { 40 | out = 41 | torch::from_blob(neighbors_indices_ptr, {query.size(0), max_count}, options = options); 42 | out_dists = torch::from_blob(neighbors_dists_ptr, {query.size(0), max_count}, 43 | options = options_dist); 44 | } 45 | else if (mode == 1) 46 | { 47 | out = torch::from_blob(neighbors_indices_ptr, {(int)neighbors_indices.size() / 2, 2}, 48 | options = options); 49 | out_dists = torch::from_blob(neighbors_dists_ptr, {(int)neighbors_indices.size() / 2, 1}, 50 | options = options_dist); 51 | } 52 | 53 | return std::make_pair(out.clone(), out_dists.clone()); 54 | } 55 | 56 | at::Tensor degree(at::Tensor row, int64_t num_nodes) 57 | { 58 | auto zero = at::zeros(num_nodes, row.options()); 59 | auto one = at::ones(row.size(0), row.options()); 60 | auto out = zero.scatter_add_(0, row, one); 61 | return out; 62 | } 63 | 64 | std::pair batch_ball_query(at::Tensor support, at::Tensor query, 65 | at::Tensor support_batch, at::Tensor query_batch, 66 | float radius, int max_num, int mode, bool sorted) 67 | { 68 | CHECK_CONTIGUOUS(support); 69 | CHECK_CONTIGUOUS(query); 70 | CHECK_CONTIGUOUS(support_batch); 71 | 
CHECK_CONTIGUOUS(query_batch); 72 | 73 | at::Tensor idx; 74 | 75 | at::Tensor dist; 76 | std::vector neighbors_indices; 77 | std::vector neighbors_dists; 78 | 79 | auto options = torch::TensorOptions().dtype(torch::kLong).device(torch::kCPU); 80 | auto options_dist = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU); 81 | 82 | int max_count = 0; 83 | auto q_batch_access = query_batch.accessor(); 84 | auto s_batch_access = support_batch.accessor(); 85 | 86 | auto batch_size = q_batch_access[query_batch.size(0) - 1] + 1; 87 | TORCH_CHECK(batch_size == (s_batch_access[support_batch.size(0) - 1] + 1), 88 | "Both batches need to have the same number of samples.") 89 | 90 | query_batch = degree(query_batch, batch_size); 91 | query_batch = at::cat({at::zeros(1, query_batch.options()), query_batch.cumsum(0)}, 0); 92 | support_batch = degree(support_batch, batch_size); 93 | support_batch = at::cat({at::zeros(1, support_batch.options()), support_batch.cumsum(0)}, 0); 94 | std::vector query_batch_stl(query_batch.DATA_PTR(), 95 | query_batch.DATA_PTR() + query_batch.numel()); 96 | std::vector support_batch_stl(support_batch.DATA_PTR(), 97 | support_batch.DATA_PTR() + 98 | support_batch.numel()); 99 | 100 | AT_DISPATCH_ALL_TYPES(query.scalar_type(), "batch_radius_search", [&] { 101 | std::vector queries_stl(query.DATA_PTR(), 102 | query.DATA_PTR() + query.numel()); 103 | std::vector supports_stl(support.DATA_PTR(), 104 | support.DATA_PTR() + support.numel()); 105 | 106 | max_count = batch_nanoflann_neighbors( 107 | queries_stl, supports_stl, query_batch_stl, support_batch_stl, neighbors_indices, 108 | neighbors_dists, radius, max_num, mode, sorted); 109 | }); 110 | auto neighbors_dists_ptr = neighbors_dists.data(); 111 | int64_t* neighbors_indices_ptr = neighbors_indices.data(); 112 | 113 | if (mode == 0) 114 | { 115 | idx = 116 | torch::from_blob(neighbors_indices_ptr, {query.size(0), max_count}, options = options); 117 | dist = torch::from_blob(neighbors_dists_ptr, {query.size(0), max_count}, 118 | options = options_dist); 119 | } 120 | else if (mode == 1) 121 | { 122 | idx = torch::from_blob(neighbors_indices_ptr, {(int)neighbors_indices.size() / 2, 2}, 123 | options = options); 124 | dist = torch::from_blob(neighbors_dists_ptr, {(int)neighbors_indices.size() / 2, 1}, 125 | options = options_dist); 126 | } 127 | return std::make_pair(idx.clone(), dist.clone()); 128 | } 129 | 130 | std::pair dense_ball_query(at::Tensor support, at::Tensor query, 131 | float radius, int max_num, int mode, bool sorted) 132 | { 133 | CHECK_CONTIGUOUS(support); 134 | CHECK_CONTIGUOUS(query); 135 | 136 | int b = query.size(0); 137 | vector batch_idx; 138 | vector batch_dist; 139 | for (int i = 0; i < b; i++) 140 | { 141 | auto out_pair = ball_query(query[i], support[i], radius, max_num, mode, sorted); 142 | batch_idx.push_back(out_pair.first); 143 | batch_dist.push_back(out_pair.second); 144 | } 145 | auto out_idx = torch::stack(batch_idx); 146 | auto out_dist = torch::stack(batch_dist); 147 | return std::make_pair(out_idx, out_dist); 148 | } 149 | -------------------------------------------------------------------------------- /torch_points_kernels/torchpoints.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | import torch.nn as nn 4 | import sys 5 | from typing import Optional, Any, Tuple 6 | 7 | import torch_points_kernels.points_cpu as tpcpu 8 | from .knn import knn 9 | 10 | if torch.cuda.is_available(): 11 | 
import torch_points_kernels.points_cuda as tpcuda 12 | 13 | 14 | def furthest_point_sample(xyz, npoint): 15 | # type: (Any, torch.Tensor, int) -> torch.Tensor 16 | r""" 17 | Uses iterative furthest point sampling to select a set of npoint features that have the largest 18 | minimum distance 19 | 20 | Parameters 21 | ---------- 22 | xyz : torch.Tensor 23 | (B, N, 3) tensor where N > npoint 24 | npoint : int32 25 | number of features in the sampled set 26 | 27 | Returns 28 | ------- 29 | torch.Tensor 30 | (B, npoint) tensor containing the set 31 | """ 32 | if npoint > xyz.shape[1]: 33 | raise ValueError("cannot sample %i points from an input set of %i points" % (npoint, xyz.shape[1])) 34 | if xyz.is_cuda: 35 | return tpcuda.furthest_point_sampling(xyz, npoint) 36 | else: 37 | return tpcpu.fps(xyz, npoint, True) 38 | 39 | 40 | def three_nn(unknown, known): 41 | r""" 42 | Find the three nearest neighbors of unknown in known 43 | Parameters 44 | ---------- 45 | unknown : torch.Tensor 46 | (B, n, 3) tensor of unknown features 47 | known : torch.Tensor 48 | (B, m, 3) tensor of known features 49 | 50 | Returns 51 | ------- 52 | dist : torch.Tensor 53 | (B, n, 3) l2 distance to the three nearest neighbors 54 | idx : torch.Tensor 55 | (B, n, 3) index of 3 nearest neighbors 56 | """ 57 | if unknown.shape[1] < 3: 58 | raise ValueError("Not enough points. unknown should have at least 3 points.") 59 | if unknown.is_cuda: 60 | dist2, idx = tpcuda.three_nn(unknown, known) 61 | else: 62 | idx, dist2 = knn(known, unknown, 3) 63 | 64 | return torch.sqrt(dist2), idx 65 | 66 | 67 | class ThreeInterpolate(Function): 68 | @staticmethod 69 | def forward(ctx, features, idx, weight): 70 | # type: (Any, torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor 71 | B, c, m = features.size() 72 | n = idx.size(1) 73 | 74 | ctx.three_interpolate_for_backward = (idx, weight, m) 75 | 76 | if features.is_cuda: 77 | return tpcuda.three_interpolate(features, idx, weight) 78 | else: 79 | return tpcpu.knn_interpolate(features, idx, weight) 80 | 81 | @staticmethod 82 | def backward(ctx, grad_out): 83 | # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor] 84 | r""" 85 | Parameters 86 | ---------- 87 | grad_out : torch.Tensor 88 | (B, c, n) tensor with gradients of outputs 89 | 90 | Returns 91 | ------- 92 | grad_features : torch.Tensor 93 | (B, c, m) tensor with gradients of features 94 | 95 | None 96 | 97 | None 98 | """ 99 | idx, weight, m = ctx.three_interpolate_for_backward 100 | 101 | if grad_out.is_cuda: 102 | grad_features = tpcuda.three_interpolate_grad(grad_out.contiguous(), idx, weight, m) 103 | else: 104 | grad_features = tpcpu.knn_interpolate_grad(grad_out.contiguous(), idx, weight, m) 105 | 106 | return grad_features, None, None 107 | 108 | 109 | def three_interpolate(features, idx, weight): 110 | r""" 111 | Performs weighted linear interpolation on 3 features 112 | Parameters 113 | ---------- 114 | features : torch.Tensor 115 | (B, c, m) Feature descriptors to be interpolated from 116 | idx : torch.Tensor 117 | (B, n, 3) three nearest neighbors of the target features in features 118 | weight : torch.Tensor 119 | (B, n, 3) weights 120 | 121 | Returns 122 | ------- 123 | torch.Tensor 124 | (B, c, n) tensor of the interpolated features 125 | """ 126 | return ThreeInterpolate.apply(features, idx, weight) 127 | 128 | 129 | def grouping_operation(features, idx): 130 | r""" 131 | Parameters 132 | ---------- 133 | features : torch.Tensor 134 | (B, C, N) tensor of features to group 135 | idx : 
torch.Tensor 136 | (B, npoint, nsample) tensor containing the indices of features to group with 137 | 138 | Returns 139 | ------- 140 | torch.Tensor 141 | (B, C, npoint, nsample) tensor 142 | """ 143 | all_idx = idx.reshape(idx.shape[0], -1) 144 | all_idx = all_idx.unsqueeze(1).repeat(1, features.shape[1], 1) 145 | grouped_features = features.gather(2, all_idx) 146 | return grouped_features.reshape(idx.shape[0], features.shape[1], idx.shape[1], idx.shape[2]) 147 | 148 | 149 | def ball_query_dense(radius, nsample, xyz, new_xyz, batch_xyz=None, batch_new_xyz=None, sort=False): 150 | # type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor 151 | if new_xyz.is_cuda: 152 | if sort: 153 | raise NotImplementedError("CUDA version does not sort the neighbors") 154 | ind, dist = tpcuda.ball_query_dense(new_xyz, xyz, radius, nsample) 155 | else: 156 | ind, dist = tpcpu.dense_ball_query(new_xyz, xyz, radius, nsample, mode=0, sorted=sort) 157 | return ind, dist 158 | 159 | 160 | def ball_query_partial_dense(radius, nsample, x, y, batch_x, batch_y, sort=False): 161 | # type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor 162 | if x.is_cuda: 163 | if sort: 164 | raise NotImplementedError("CUDA version does not sort the neighbors") 165 | ind, dist = tpcuda.ball_query_partial_dense(x, y, batch_x, batch_y, radius, nsample) 166 | else: 167 | ind, dist = tpcpu.batch_ball_query(x, y, batch_x, batch_y, radius, nsample, mode=0, sorted=sort) 168 | return ind, dist 169 | 170 | 171 | def ball_query( 172 | radius: float, 173 | nsample: int, 174 | x: torch.Tensor, 175 | y: torch.Tensor, 176 | mode: Optional[str] = "dense", 177 | batch_x: Optional[torch.Tensor] = None, 178 | batch_y: Optional[torch.Tensor] = None, 179 | sort: Optional[bool] = False, 180 | ) -> torch.Tensor: 181 | """ 182 | Arguments: 183 | radius {float} -- radius of the balls 184 | nsample {int} -- maximum number of features in the balls 185 | x {torch.Tensor} -- 186 | (M, 3) [partial_dense] or (B, M, 3) [dense] xyz coordinates of the features 187 | y {torch.Tensor} -- 188 | (npoint, 3) [partial_dense] or (B, npoint, 3) [dense] centers of the ball query 189 | mode {str} -- switch between "dense" or "partial_dense" data layout 190 | 191 | Keyword Arguments: 192 | batch_x -- (M, ) [partial_dense] or (B, M, 3) [dense] Contains indices indicating which batch each point of x belongs to. 193 | batch_y -- (N, ) Contains indices indicating which batch each point of y belongs to 194 | sort -- bool, whether the neighbours are sorted or not (closest first) 195 | 196 | Returns: 197 | idx: (npoint, nsample) or (B, npoint, nsample) [dense] It contains the indices of the elements of x within radius distance of y 198 | dist: (N, nsample) or (B, npoint, nsample) Default value: -1. 
199 | It contains the squared distance of the element within x at radius distance to y 200 | """ 201 | if mode is None: 202 | raise Exception('The mode should be defined within ["partial_dense | dense"]') 203 | 204 | if mode.lower() == "partial_dense": 205 | if (batch_x is None) or (batch_y is None): 206 | raise Exception("batch_x and batch_y should be provided") 207 | assert x.size(0) == batch_x.size(0) 208 | assert y.size(0) == batch_y.size(0) 209 | assert x.dim() == 2 210 | return ball_query_partial_dense(radius, nsample, x, y, batch_x, batch_y, sort=sort) 211 | 212 | elif mode.lower() == "dense": 213 | if (batch_x is not None) or (batch_y is not None): 214 | raise Exception("batch_x and batch_y should not be provided") 215 | assert x.dim() == 3 216 | return ball_query_dense(radius, nsample, x, y, sort=sort) 217 | else: 218 | raise Exception("unrecognized mode {}".format(mode)) 219 | -------------------------------------------------------------------------------- /test/test_ballquerry.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import numpy.testing as npt 4 | import numpy as np 5 | from sklearn.neighbors import KDTree 6 | import os 7 | import sys 8 | 9 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 10 | sys.path.insert(0, ROOT) 11 | 12 | from test import run_if_cuda 13 | from torch_points_kernels import ball_query 14 | 15 | 16 | class TestBall(unittest.TestCase): 17 | @run_if_cuda 18 | def test_simple_gpu(self): 19 | a = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]], [[0, 0, 0], [1, 0, 0], [2, 0, 0]]]).to(torch.float).cuda() 20 | b = torch.tensor([[[0, 0, 0]], [[3, 0, 0]]]).to(torch.float).cuda() 21 | idx, dist = ball_query(1.01, 2, a, b) 22 | torch.testing.assert_allclose(idx.cpu(), torch.tensor([[[0, 1]], [[2, 2]]])) 23 | torch.testing.assert_allclose(dist.cpu(), torch.tensor([[[0, 1]], [[1, -1]]]).float()) 24 | 25 | def test_simple_cpu(self): 26 | a = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]], [[0, 0, 0], [1, 0, 0], [2, 0, 0]]]).to(torch.float) 27 | b = torch.tensor([[[0, 0, 0]], [[3, 0, 0]]]).to(torch.float) 28 | idx, dist = ball_query(1.01, 2, a, b, sort=True) 29 | torch.testing.assert_allclose(idx, torch.tensor([[[0, 1]], [[2, 2]]])) 30 | torch.testing.assert_allclose(dist, torch.tensor([[[0, 1]], [[1, -1]]]).float()) 31 | 32 | a = torch.tensor([[[0, 0, 0], [1, 0, 0], [1, 1, 0]]]).to(torch.float) 33 | idx, dist = ball_query(1.01, 3, a, a, sort=True) 34 | torch.testing.assert_allclose(idx, torch.tensor([[[0, 1, 0], [1, 0, 2], [2, 1, 2]]])) 35 | 36 | @run_if_cuda 37 | def test_larger_gpu(self): 38 | a = torch.randn(32, 4096, 3).to(torch.float).cuda() 39 | idx, dist = ball_query(1, 64, a, a) 40 | self.assertGreaterEqual(idx.min(), 0) 41 | 42 | @run_if_cuda 43 | def test_cpu_gpu_equality(self): 44 | a = torch.randn(5, 1000, 3) 45 | b = torch.randn(5, 500, 3) 46 | res_cpu = ball_query(1, 500, a, b)[0].detach().numpy() 47 | res_cuda = ball_query(1, 500, a.cuda(), b.cuda())[0].cpu().detach().numpy() 48 | for i in range(b.shape[0]): 49 | for j in range(b.shape[1]): 50 | # Because it is not necessary the same order 51 | assert set(res_cpu[i][j]) == set(res_cuda[i][j]) 52 | 53 | res_cpu = ball_query(0.01, 500, a, b)[0].detach().numpy() 54 | res_cuda = ball_query(0.01, 500, a.cuda(), b.cuda())[0].cpu().detach().numpy() 55 | for i in range(b.shape[0]): 56 | for j in range(b.shape[1]): 57 | # Because it is not necessary the same order 58 | assert set(res_cpu[i][j]) == 
set(res_cuda[i][j]) 59 | 60 | 61 | class TestBallPartial(unittest.TestCase): 62 | @run_if_cuda 63 | def test_simple_gpu(self): 64 | x = torch.tensor([[10, 0, 0], [0.1, 0, 0], [0.2, 0, 0], [0.1, 0, 0]]).to(torch.float).cuda() 65 | y = torch.tensor([[0, 0, 0]]).to(torch.float).cuda() 66 | batch_x = torch.from_numpy(np.asarray([0, 0, 0, 1])).long().cuda() 67 | batch_y = torch.from_numpy(np.asarray([0])).long().cuda() 68 | 69 | idx, dist2 = ball_query(0.2, 4, x, y, mode="PARTIAL_DENSE", batch_x=batch_x, batch_y=batch_y) 70 | 71 | idx = idx.detach().cpu().numpy() 72 | dist2 = dist2.detach().cpu().numpy() 73 | 74 | idx_answer = np.asarray([[1, 2, -1, -1]]) 75 | dist2_answer = np.asarray([[0.0100, 0.04, -1, -1]]).astype(np.float32) 76 | 77 | npt.assert_array_almost_equal(idx, idx_answer) 78 | npt.assert_array_almost_equal(dist2, dist2_answer) 79 | 80 | def test_simple_cpu(self): 81 | x = torch.tensor([[10, 0, 0], [0.1, 0, 0], [10, 0, 0], [10.1, 0, 0]]).to(torch.float) 82 | y = torch.tensor([[0, 0, 0]]).to(torch.float) 83 | 84 | batch_x = torch.from_numpy(np.asarray([0, 0, 0, 0])).long() 85 | batch_y = torch.from_numpy(np.asarray([0])).long() 86 | 87 | idx, dist2 = ball_query(1.0, 2, x, y, mode="PARTIAL_DENSE", batch_x=batch_x, batch_y=batch_y) 88 | 89 | idx = idx.detach().cpu().numpy() 90 | dist2 = dist2.detach().cpu().numpy() 91 | 92 | idx_answer = np.asarray([[1, -1]]) 93 | dist2_answer = np.asarray([[0.0100, -1.0000]]).astype(np.float32) 94 | 95 | npt.assert_array_almost_equal(idx, idx_answer) 96 | npt.assert_array_almost_equal(dist2, dist2_answer) 97 | 98 | def test_breaks(self): 99 | x = torch.tensor([[10, 0, 0], [0.1, 0, 0], [10, 0, 0], [10.1, 0, 0]]).to(torch.float) 100 | y = torch.tensor([[0, 0, 0]]).to(torch.float) 101 | 102 | batch_x = torch.from_numpy(np.asarray([0, 0, 1, 1])).long() 103 | batch_y = torch.from_numpy(np.asarray([0])).long() 104 | 105 | with self.assertRaises(RuntimeError): 106 | idx, dist2 = ball_query(1.0, 2, x, y, mode="PARTIAL_DENSE", batch_x=batch_x, batch_y=batch_y) 107 | 108 | def test_random_cpu(self, cuda=False): 109 | a = torch.randn(100, 3).to(torch.float) 110 | b = torch.randn(50, 3).to(torch.float) 111 | batch_a = torch.tensor([0 for i in range(a.shape[0] // 2)] + [1 for i in range(a.shape[0] // 2, a.shape[0])]) 112 | batch_b = torch.tensor([0 for i in range(b.shape[0] // 2)] + [1 for i in range(b.shape[0] // 2, b.shape[0])]) 113 | R = 1 114 | 115 | idx, dist = ball_query( 116 | R, 117 | 15, 118 | a, 119 | b, 120 | mode="PARTIAL_DENSE", 121 | batch_x=batch_a, 122 | batch_y=batch_b, 123 | sort=True, 124 | ) 125 | idx1, dist = ball_query( 126 | R, 127 | 15, 128 | a, 129 | b, 130 | mode="PARTIAL_DENSE", 131 | batch_x=batch_a, 132 | batch_y=batch_b, 133 | sort=True, 134 | ) 135 | torch.testing.assert_allclose(idx1, idx) 136 | with self.assertRaises(AssertionError): 137 | idx, dist = ball_query( 138 | R, 139 | 15, 140 | a, 141 | b, 142 | mode="PARTIAL_DENSE", 143 | batch_x=batch_a, 144 | batch_y=batch_b, 145 | sort=False, 146 | ) 147 | idx1, dist = ball_query( 148 | R, 149 | 15, 150 | a, 151 | b, 152 | mode="PARTIAL_DENSE", 153 | batch_x=batch_a, 154 | batch_y=batch_b, 155 | sort=False, 156 | ) 157 | torch.testing.assert_allclose(idx1, idx) 158 | 159 | self.assertEqual(idx.shape[0], b.shape[0]) 160 | self.assertEqual(dist.shape[0], b.shape[0]) 161 | self.assertLessEqual(idx.max().item(), len(batch_a)) 162 | 163 | # Comparison to see if we have the same result 164 | tree = KDTree(a.detach().numpy()) 165 | idx3_sk = tree.query_radius(b.detach().numpy(), r=R) 166 | 
i = np.random.randint(len(batch_b)) 167 | for p in idx[i].detach().numpy(): 168 | if p >= 0 and p < len(batch_a): 169 | assert p in idx3_sk[i] 170 | 171 | @run_if_cuda 172 | def test_random_gpu(self): 173 | a = torch.randn(100, 3).to(torch.float).cuda() 174 | b = torch.randn(50, 3).to(torch.float).cuda() 175 | batch_a = torch.tensor( 176 | [0 for i in range(a.shape[0] // 2)] + [1 for i in range(a.shape[0] // 2, a.shape[0])] 177 | ).cuda() 178 | batch_b = torch.tensor( 179 | [0 for i in range(b.shape[0] // 2)] + [1 for i in range(b.shape[0] // 2, b.shape[0])] 180 | ).cuda() 181 | R = 1 182 | 183 | idx, dist = ball_query( 184 | R, 185 | 15, 186 | a, 187 | b, 188 | mode="PARTIAL_DENSE", 189 | batch_x=batch_a, 190 | batch_y=batch_b, 191 | sort=False, 192 | ) 193 | 194 | # Comparison to see if we have the same result 195 | tree = KDTree(a.cpu().detach().numpy()) 196 | idx3_sk = tree.query_radius(b.cpu().detach().numpy(), r=R) 197 | i = np.random.randint(len(batch_b)) 198 | for p in idx[i].cpu().detach().numpy(): 199 | if p >= 0 and p < len(batch_a): 200 | assert p in idx3_sk[i] 201 | 202 | 203 | if __name__ == "__main__": 204 | unittest.main() 205 | -------------------------------------------------------------------------------- /cuda/src/cubic_feature_sampling_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "cuda_utils.h" 7 | 8 | #define CUDA_NUM_THREADS 512 9 | 10 | // Computer the number of threads needed in GPU 11 | inline int get_n_threads(int n) 12 | { 13 | const int pow_2 = std::log(static_cast(n)) / std::log(2.0); 14 | return max(min(1 << pow_2, CUDA_NUM_THREADS), 1); 15 | } 16 | 17 | __device__ int compute_index(int offset_x, int offset_y, int offset_z, int scale) 18 | { 19 | return offset_x * scale * scale + offset_y * scale + offset_z; 20 | } 21 | 22 | template 23 | __global__ void cubic_feature_sampling_kernel(int scale, int neighborhood_size, int n_vertices, 24 | int n_pts, int n_cubic_channels, 25 | const scalar_t* __restrict__ ptcloud, 26 | const scalar_t* __restrict__ cubic_features, 27 | scalar_t* __restrict__ point_features, 28 | int* __restrict__ grid_pt_indexes) 29 | { 30 | int batch_index = blockIdx.x; 31 | int index = threadIdx.x; 32 | int stride = blockDim.x; 33 | int cub_scale = scale * scale * scale; 34 | 35 | ptcloud += batch_index * n_pts * 3; 36 | cubic_features += batch_index * n_cubic_channels * cub_scale; 37 | point_features += batch_index * n_pts * n_vertices * n_cubic_channels; 38 | grid_pt_indexes += batch_index * n_pts * n_vertices; 39 | 40 | for (int i = index; i < n_pts; i += stride) 41 | { 42 | scalar_t pt_x = ptcloud[i * 3 + 0]; 43 | scalar_t pt_y = ptcloud[i * 3 + 1]; 44 | scalar_t pt_z = ptcloud[i * 3 + 2]; 45 | 46 | int lower_x = std::floor(pt_x); 47 | int upper_x = std::ceil(pt_x); 48 | if (lower_x == upper_x) 49 | { 50 | upper_x += 1; 51 | } 52 | int lower_y = std::floor(pt_y); 53 | int upper_y = std::ceil(pt_y); 54 | if (lower_y == upper_y) 55 | { 56 | upper_y += 1; 57 | } 58 | int lower_z = std::floor(pt_z); 59 | int upper_z = std::ceil(pt_z); 60 | if (lower_z == upper_z) 61 | { 62 | upper_z += 1; 63 | } 64 | 65 | int ns = neighborhood_size - 1; 66 | int vertex_idx = 0; 67 | for (int j = lower_x - ns; j <= upper_x + ns; ++j) 68 | { 69 | for (int k = lower_y - ns; k <= upper_y + ns; ++k) 70 | { 71 | for (int m = lower_z - ns; m <= upper_z + ns; ++m) 72 | { 73 | if (j < 0 || j >= scale || k < 0 || k >= scale || m < 0 || m >= scale) 74 | { 75 | 
// Ignore points lies out of the grid 76 | grid_pt_indexes[i * n_vertices + vertex_idx++] = -1; 77 | } 78 | else 79 | { 80 | // Calcuating indexes for adjacent vertices 81 | grid_pt_indexes[i * n_vertices + vertex_idx++] = 82 | compute_index(j, k, m, scale); 83 | } 84 | } 85 | } 86 | } 87 | 88 | // Gather Features 89 | for (int j = 0; j < n_vertices; ++j) 90 | { 91 | for (int k = 0; k < n_cubic_channels; ++k) 92 | { 93 | int vertex_idx = grid_pt_indexes[i * n_vertices + j]; 94 | if (vertex_idx == -1) 95 | { 96 | continue; 97 | } 98 | int feature_idx = i * n_vertices * n_cubic_channels + j * n_cubic_channels + k; 99 | scalar_t feature_val = cubic_features[k * cub_scale + vertex_idx]; 100 | point_features[feature_idx] = feature_val; 101 | } 102 | } 103 | } 104 | } 105 | 106 | std::vector cubic_feature_sampling_kernel_wrapper(int scale, int neighborhood_size, 107 | torch::Tensor ptcloud, 108 | torch::Tensor cubic_features, 109 | cudaStream_t stream) 110 | { 111 | int batch_size = ptcloud.size(0); 112 | int n_pts = ptcloud.size(1); 113 | int n_cubic_channels = cubic_features.size(1); 114 | 115 | int n_vertices = std::pow(neighborhood_size * 2, 3); 116 | torch::Tensor point_features = torch::zeros({batch_size, n_pts, n_vertices, n_cubic_channels}, 117 | torch::CUDA(ptcloud.scalar_type())); 118 | torch::Tensor grid_pt_indexes = 119 | torch::zeros({batch_size, n_pts, n_vertices}, torch::CUDA(torch::kInt)); 120 | 121 | AT_DISPATCH_FLOATING_TYPES( 122 | ptcloud.scalar_type(), "cubic_feature_sampling_cuda", ([&] { 123 | cubic_feature_sampling_kernel<<>>( 124 | scale, neighborhood_size, n_vertices, n_pts, n_cubic_channels, 125 | ptcloud.data_ptr(), cubic_features.data_ptr(), 126 | point_features.data_ptr(), grid_pt_indexes.data_ptr()); 127 | })); 128 | 129 | cudaError_t err = cudaGetLastError(); 130 | if (err != cudaSuccess) 131 | { 132 | printf("Error in cubic_feature_sampling_kernel_wrapper: %s\n", cudaGetErrorString(err)); 133 | } 134 | return {point_features, grid_pt_indexes}; 135 | } 136 | 137 | template 138 | __global__ void cubic_feature_sampling_grad_kernel(int scale, int neighborhood_size, int n_vertices, 139 | int n_pts, int n_cubic_channels, 140 | const scalar_t* __restrict__ grad_point_features, 141 | const int* __restrict__ grid_pt_indexes, 142 | scalar_t* __restrict__ grad_ptcloud, 143 | scalar_t* __restrict__ grad_cubic_features) 144 | { 145 | int batch_index = blockIdx.x; 146 | int index = threadIdx.x; 147 | int stride = blockDim.x; 148 | int cub_scale = scale * scale * scale; 149 | 150 | grad_point_features += batch_index * n_pts * n_vertices * n_cubic_channels; 151 | grid_pt_indexes += batch_index * n_pts * n_vertices; 152 | grad_ptcloud += batch_index * n_pts * 3; 153 | grad_cubic_features += batch_index * n_cubic_channels * cub_scale; 154 | 155 | for (int i = index; i < n_pts; i += stride) 156 | { 157 | for (int j = 0; j < n_vertices; ++j) 158 | { 159 | int vertex_idx = grid_pt_indexes[i * n_vertices + j]; 160 | if (vertex_idx == -1) 161 | { 162 | continue; 163 | } 164 | for (int k = 0; k < n_cubic_channels; ++k) 165 | { 166 | int grad_idx = i * n_vertices * n_cubic_channels + j * n_cubic_channels + k; 167 | scalar_t grad_val = grad_point_features[grad_idx]; 168 | // Fix bugs: the gradients of ceil and floor functions are zeros. 
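// As a consequence, only grad_cubic_features is accumulated below; the grad_ptcloud
// updates stay commented out and the returned point-cloud gradient remains zero.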
169 | // Ref: https://github.com/tensorflow/tensorflow/issues/897 170 | // atomicAdd(&(grad_ptcloud[i * 3 + 0]), grad_val); 171 | // atomicAdd(&(grad_ptcloud[i * 3 + 1]), grad_val); 172 | // atomicAdd(&(grad_ptcloud[i * 3 + 2]), grad_val); 173 | atomicAdd(&(grad_cubic_features[k * cub_scale + vertex_idx]), grad_val); 174 | } 175 | } 176 | } 177 | } 178 | 179 | std::vector 180 | cubic_feature_sampling_grad_kernel_wrapper(int scale, int neighborhood_size, 181 | torch::Tensor grad_point_features, 182 | torch::Tensor grid_pt_indexes, cudaStream_t stream) 183 | { 184 | int batch_size = grad_point_features.size(0); 185 | int n_cubic_channels = grad_point_features.size(3); 186 | int n_pts = grid_pt_indexes.size(1); 187 | int n_vertices = std::pow(neighborhood_size * 2, 3); 188 | 189 | torch::Tensor grad_ptcloud = 190 | torch::zeros({batch_size, n_pts, 3}, torch::CUDA(grad_point_features.scalar_type())); 191 | torch::Tensor grad_cubic_features = 192 | torch::zeros({batch_size, n_cubic_channels, scale, scale, scale}, 193 | torch::CUDA(grad_point_features.scalar_type())); 194 | 195 | AT_DISPATCH_FLOATING_TYPES( 196 | grad_point_features.scalar_type(), "cubic_feature_sampling_grad_cuda", ([&] { 197 | cubic_feature_sampling_grad_kernel<<>>( 198 | scale, neighborhood_size, n_vertices, n_pts, n_cubic_channels, 199 | grad_point_features.data_ptr(), grid_pt_indexes.data_ptr(), 200 | grad_ptcloud.data_ptr(), grad_cubic_features.data_ptr()); 201 | })); 202 | 203 | cudaError_t err = cudaGetLastError(); 204 | if (err != cudaSuccess) 205 | { 206 | printf("Error in cubic_feature_sampling_grad_kernel_wrapper: %s\n", 207 | cudaGetErrorString(err)); 208 | } 209 | return {grad_ptcloud, grad_cubic_features}; 210 | } 211 | -------------------------------------------------------------------------------- /cuda/src/chamfer_dist_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "cuda_utils.h" 6 | #include 7 | 8 | template 9 | __global__ void chamfer_dist_kernel(int batch_size, int n, const scalar_t* __restrict__ xyz1, int m, 10 | const scalar_t* __restrict__ xyz2, scalar_t* __restrict__ dist, 11 | int* indexes) 12 | { 13 | const int batch = 512; 14 | __shared__ scalar_t buf[batch * 3]; 15 | for (int i = blockIdx.x; i < batch_size; i += gridDim.x) 16 | { 17 | for (int k2 = 0; k2 < m; k2 += batch) 18 | { 19 | int end_k = min(m, k2 + batch) - k2; 20 | for (int j = threadIdx.x; j < end_k * 3; j += blockDim.x) 21 | { 22 | buf[j] = xyz2[(i * m + k2) * 3 + j]; 23 | } 24 | __syncthreads(); 25 | for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n; j += blockDim.x * gridDim.y) 26 | { 27 | scalar_t x1 = xyz1[(i * n + j) * 3 + 0]; 28 | scalar_t y1 = xyz1[(i * n + j) * 3 + 1]; 29 | scalar_t z1 = xyz1[(i * n + j) * 3 + 2]; 30 | scalar_t best_dist = 0; 31 | int best_dist_index = 0; 32 | int end_ka = end_k - (end_k & 3); 33 | if (end_ka == batch) 34 | { 35 | for (int k = 0; k < batch; k += 4) 36 | { 37 | { 38 | scalar_t x2 = buf[k * 3 + 0] - x1; 39 | scalar_t y2 = buf[k * 3 + 1] - y1; 40 | scalar_t z2 = buf[k * 3 + 2] - z1; 41 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 42 | 43 | if (k == 0 || dist < best_dist) 44 | { 45 | best_dist = dist; 46 | best_dist_index = k + k2; 47 | } 48 | } 49 | { 50 | scalar_t x2 = buf[k * 3 + 3] - x1; 51 | scalar_t y2 = buf[k * 3 + 4] - y1; 52 | scalar_t z2 = buf[k * 3 + 5] - z1; 53 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 54 | if (dist < best_dist) 55 | { 56 | best_dist = dist; 57 | best_dist_index 
= k + k2 + 1; 58 | } 59 | } 60 | { 61 | scalar_t x2 = buf[k * 3 + 6] - x1; 62 | scalar_t y2 = buf[k * 3 + 7] - y1; 63 | scalar_t z2 = buf[k * 3 + 8] - z1; 64 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 65 | if (dist < best_dist) 66 | { 67 | best_dist = dist; 68 | best_dist_index = k + k2 + 2; 69 | } 70 | } 71 | { 72 | scalar_t x2 = buf[k * 3 + 9] - x1; 73 | scalar_t y2 = buf[k * 3 + 10] - y1; 74 | scalar_t z2 = buf[k * 3 + 11] - z1; 75 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 76 | if (dist < best_dist) 77 | { 78 | best_dist = dist; 79 | best_dist_index = k + k2 + 3; 80 | } 81 | } 82 | } 83 | } 84 | else 85 | { 86 | for (int k = 0; k < end_ka; k += 4) 87 | { 88 | { 89 | scalar_t x2 = buf[k * 3 + 0] - x1; 90 | scalar_t y2 = buf[k * 3 + 1] - y1; 91 | scalar_t z2 = buf[k * 3 + 2] - z1; 92 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 93 | if (k == 0 || dist < best_dist) 94 | { 95 | best_dist = dist; 96 | best_dist_index = k + k2; 97 | } 98 | } 99 | { 100 | scalar_t x2 = buf[k * 3 + 3] - x1; 101 | scalar_t y2 = buf[k * 3 + 4] - y1; 102 | scalar_t z2 = buf[k * 3 + 5] - z1; 103 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 104 | if (dist < best_dist) 105 | { 106 | best_dist = dist; 107 | best_dist_index = k + k2 + 1; 108 | } 109 | } 110 | { 111 | scalar_t x2 = buf[k * 3 + 6] - x1; 112 | scalar_t y2 = buf[k * 3 + 7] - y1; 113 | scalar_t z2 = buf[k * 3 + 8] - z1; 114 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 115 | if (dist < best_dist) 116 | { 117 | best_dist = dist; 118 | best_dist_index = k + k2 + 2; 119 | } 120 | } 121 | { 122 | scalar_t x2 = buf[k * 3 + 9] - x1; 123 | scalar_t y2 = buf[k * 3 + 10] - y1; 124 | scalar_t z2 = buf[k * 3 + 11] - z1; 125 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 126 | if (dist < best_dist) 127 | { 128 | best_dist = dist; 129 | best_dist_index = k + k2 + 3; 130 | } 131 | } 132 | } 133 | } 134 | for (int k = end_ka; k < end_k; k++) 135 | { 136 | scalar_t x2 = buf[k * 3 + 0] - x1; 137 | scalar_t y2 = buf[k * 3 + 1] - y1; 138 | scalar_t z2 = buf[k * 3 + 2] - z1; 139 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 140 | if (k == 0 || dist < best_dist) 141 | { 142 | best_dist = dist; 143 | best_dist_index = k + k2; 144 | } 145 | } 146 | if (k2 == 0 || dist[(i * n + j)] > best_dist) 147 | { 148 | dist[(i * n + j)] = best_dist; 149 | indexes[(i * n + j)] = best_dist_index; 150 | } 151 | } 152 | __syncthreads(); 153 | } 154 | } 155 | } 156 | 157 | std::vector chamfer_dist_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2) 158 | { 159 | const int batch_size = xyz1.size(0); 160 | const int n = xyz1.size(1); // num_points point cloud A 161 | const int m = xyz2.size(1); // num_points point cloud B 162 | torch::Tensor dist1 = torch::zeros({batch_size, n}, torch::CUDA(xyz1.scalar_type())); 163 | torch::Tensor dist2 = torch::zeros({batch_size, m}, torch::CUDA(xyz1.scalar_type())); 164 | torch::Tensor idx1 = torch::zeros({batch_size, n}, torch::CUDA(torch::kInt)); 165 | torch::Tensor idx2 = torch::zeros({batch_size, m}, torch::CUDA(torch::kInt)); 166 | 167 | AT_DISPATCH_FLOATING_TYPES( 168 | xyz1.scalar_type(), "chamfer_dist_cuda", ([&] { 169 | chamfer_dist_kernel<<>>( 170 | batch_size, n, xyz1.data_ptr(), m, xyz2.data_ptr(), 171 | dist1.data_ptr(), idx1.data_ptr()); 172 | 173 | chamfer_dist_kernel<<>>( 174 | batch_size, m, xyz2.data_ptr(), n, xyz1.data_ptr(), 175 | dist2.data_ptr(), idx2.data_ptr()); 176 | })); 177 | 178 | cudaError_t err = cudaGetLastError(); 179 | if (err != cudaSuccess) 180 | { 181 | printf("Error in chamfer_dist_kernel_wrapper: %s\n", 
cudaGetErrorString(err)); 182 | } 183 | return {dist1, dist2, idx1, idx2}; 184 | } 185 | 186 | template 187 | __global__ void chamfer_dist_grad_kernel(int b, int n, const scalar_t* __restrict__ xyz1, int m, 188 | const scalar_t* __restrict__ xyz2, 189 | const scalar_t* __restrict__ grad_dist1, const int* idx1, 190 | scalar_t* __restrict__ grad_xyz1, 191 | scalar_t* __restrict__ grad_xyz2) 192 | { 193 | for (int i = blockIdx.x; i < b; i += gridDim.x) 194 | { 195 | for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n; j += blockDim.x * gridDim.y) 196 | { 197 | scalar_t x1 = xyz1[(i * n + j) * 3 + 0]; 198 | scalar_t y1 = xyz1[(i * n + j) * 3 + 1]; 199 | scalar_t z1 = xyz1[(i * n + j) * 3 + 2]; 200 | int j2 = idx1[i * n + j]; 201 | scalar_t x2 = xyz2[(i * m + j2) * 3 + 0]; 202 | scalar_t y2 = xyz2[(i * m + j2) * 3 + 1]; 203 | scalar_t z2 = xyz2[(i * m + j2) * 3 + 2]; 204 | scalar_t g = grad_dist1[i * n + j] * 2; 205 | atomicAdd(&(grad_xyz1[(i * n + j) * 3 + 0]), g * (x1 - x2)); 206 | atomicAdd(&(grad_xyz1[(i * n + j) * 3 + 1]), g * (y1 - y2)); 207 | atomicAdd(&(grad_xyz1[(i * n + j) * 3 + 2]), g * (z1 - z2)); 208 | atomicAdd(&(grad_xyz2[(i * m + j2) * 3 + 0]), -(g * (x1 - x2))); 209 | atomicAdd(&(grad_xyz2[(i * m + j2) * 3 + 1]), -(g * (y1 - y2))); 210 | atomicAdd(&(grad_xyz2[(i * m + j2) * 3 + 2]), -(g * (z1 - z2))); 211 | } 212 | } 213 | } 214 | 215 | std::vector chamfer_dist_grad_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2, 216 | torch::Tensor idx1, torch::Tensor idx2, 217 | torch::Tensor grad_dist1, 218 | torch::Tensor grad_dist2) 219 | { 220 | const int batch_size = xyz1.size(0); 221 | const int n = xyz1.size(1); // num_points point cloud A 222 | const int m = xyz2.size(1); // num_points point cloud B 223 | torch::Tensor grad_xyz1 = torch::zeros_like(xyz1); 224 | torch::Tensor grad_xyz2 = torch::zeros_like(xyz2); 225 | 226 | AT_DISPATCH_FLOATING_TYPES( 227 | xyz1.scalar_type(), "chamfer_dist_grad_cuda", ([&] { 228 | chamfer_dist_grad_kernel<<>>( 229 | batch_size, n, xyz1.data_ptr(), m, xyz2.data_ptr(), 230 | grad_dist1.data_ptr(), idx1.data_ptr(), 231 | grad_xyz1.data_ptr(), grad_xyz2.data_ptr()); 232 | 233 | chamfer_dist_grad_kernel<<>>( 234 | batch_size, m, xyz2.data_ptr(), n, xyz1.data_ptr(), 235 | grad_dist2.data_ptr(), idx2.data_ptr(), 236 | grad_xyz2.data_ptr(), grad_xyz1.data_ptr()); 237 | })); 238 | 239 | cudaError_t err = cudaGetLastError(); 240 | if (err != cudaSuccess) 241 | { 242 | printf("Error in chamfer_dist_grad_kernel_wrapper: %s\n", cudaGetErrorString(err)); 243 | } 244 | return {grad_xyz1, grad_xyz2}; 245 | } 246 | -------------------------------------------------------------------------------- /cpu/src/neighbors.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Taken from https://github.com/HuguesTHOMAS/KPConv 3 | 4 | #include "neighbors.h" 5 | #include 6 | #include 7 | 8 | template 9 | int nanoflann_neighbors(vector& queries, vector& supports, 10 | vector& neighbors_indices, vector& dists, float radius, 11 | int max_num, int mode, bool sorted) 12 | { 13 | // Initiate variables 14 | // ****************** 15 | std::random_device rd; 16 | std::mt19937 g(rd()); 17 | 18 | // square radius 19 | const float search_radius = static_cast(radius * radius); 20 | 21 | // indices 22 | int i0 = 0; 23 | 24 | // Counting vector 25 | size_t max_count = 1; 26 | 27 | // Nanoflann related variables 28 | // *************************** 29 | 30 | // CLoud variable 31 | PointCloud pcd; 32 | pcd.set(supports); 33 | 34 | // Cloud 
35 | PointCloud<scalar_t> pcd_query;
36 | pcd_query.set(queries);
37 | 
38 | // Tree parameters
39 | nanoflann::KDTreeSingleIndexAdaptorParams tree_params(15 /* max leaf */);
40 | 
41 | // KDTree type definition
42 | typedef nanoflann::KDTreeSingleIndexAdaptor<
43 | nanoflann::L2_Simple_Adaptor<scalar_t, PointCloud<scalar_t>>, PointCloud<scalar_t>, 3>
44 | my_kd_tree_t;
45 | 
46 | // Pointer to trees
47 | std::unique_ptr<my_kd_tree_t> index(new my_kd_tree_t(3, pcd, tree_params));
48 | index->buildIndex();
49 | // Search neighbors indices
50 | // ***********************
51 | 
52 | // Search params
53 | nanoflann::SearchParams search_params;
54 | search_params.sorted = sorted;
55 | auto num_query_points = pcd_query.get_point_count();
56 | std::vector<std::vector<std::pair<size_t, scalar_t>>> list_matches(num_query_points);
57 | 
58 | for (size_t i = 0; i < num_query_points; i++)
59 | {
60 | // Find neighbors
61 | list_matches[i0].reserve(max_count);
62 | std::vector<std::pair<size_t, scalar_t>> ret_matches;
63 | 
64 | const size_t nMatches = index->radiusSearch(pcd_query.get_point_ptr(i), search_radius,
65 | ret_matches, search_params);
66 | if (nMatches == 0)
67 | list_matches[i0] = {std::make_pair(0, -1)};
68 | else
69 | {
70 | if (!sorted)
71 | std::shuffle(ret_matches.begin(), ret_matches.end(), g);
72 | list_matches[i0] = ret_matches;
73 | }
74 | max_count = max(max_count, nMatches);
75 | i0++;
76 | }
77 | // Reserve the memory
78 | if (max_num > 0)
79 | {
80 | max_count = max_num;
81 | }
82 | if (mode == 0)
83 | {
84 | neighbors_indices.resize(list_matches.size() * max_count, 0);
85 | dists.resize(list_matches.size() * max_count, -1);
86 | i0 = 0;
87 | int token = 0;
88 | for (auto& inds : list_matches)
89 | {
90 | token = inds[0].first;
91 | for (size_t j = 0; j < max_count; j++)
92 | {
93 | if (j < inds.size())
94 | {
95 | neighbors_indices[i0 * max_count + j] = inds[j].first;
96 | dists[i0 * max_count + j] = (float)inds[j].second;
97 | }
98 | else
99 | {
100 | neighbors_indices[i0 * max_count + j] = token;
101 | dists[i0 * max_count + j] = -1;
102 | }
103 | }
104 | i0++;
105 | }
106 | }
107 | else if (mode == 1)
108 | {
109 | size_t size = 0; // total number of edges
110 | for (auto& inds : list_matches)
111 | {
112 | if (inds.size() <= max_count)
113 | size += inds.size();
114 | else
115 | size += max_count;
116 | }
117 | neighbors_indices.resize(size * 2);
118 | dists.resize(size);
119 | int i0 = 0; // index of the query points
120 | int u = 0; // current index of the neighbors_indices
121 | for (auto& inds : list_matches)
122 | {
123 | for (size_t j = 0; j < max_count; j++)
124 | {
125 | if (j < inds.size())
126 | {
127 | neighbors_indices[u] = inds[j].first;
128 | neighbors_indices[u + 1] = i0;
129 | dists[u / 2] = (float)inds[j].second;
130 | u += 2;
131 | }
132 | }
133 | i0++;
134 | }
135 | }
136 | return max_count;
137 | }
138 | 
139 | template <typename scalar_t>
140 | int batch_nanoflann_neighbors(vector<scalar_t>& queries, vector<scalar_t>& supports,
141 | vector<long>& q_batches, vector<long>& s_batches,
142 | vector<long>& neighbors_indices, vector<float>& dists,
143 | float radius, int max_num, int mode, bool sorted)
144 | {
145 | // Initiate variables
146 | // ******************
147 | std::random_device rd;
148 | std::mt19937 g(rd());
149 | 
150 | // indices
151 | int i0 = 0;
152 | 
153 | // Square radius
154 | float r2 = radius * radius;
155 | 
156 | // Counting vector
157 | int max_count = 0;
158 | 
159 | // batch index
160 | int b = 0;
161 | 
162 | // Nanoflann related variables
163 | // ***************************
164 | 
165 | // Cloud variable
166 | PointCloud<scalar_t> current_cloud;
167 | PointCloud<scalar_t> query_pcd;
168 | query_pcd.set(queries);
169 | auto num_query_points = query_pcd.get_point_count();
170 | vector<vector<pair<size_t, scalar_t>>> all_inds_dists(num_query_points);
171 | 
172 | // Tree parameters
173 | nanoflann::KDTreeSingleIndexAdaptorParams tree_params(15 /* max leaf */);
174 | 
175 | // KDTree type definition
176 | typedef nanoflann::KDTreeSingleIndexAdaptor<
177 | nanoflann::L2_Simple_Adaptor<scalar_t, PointCloud<scalar_t>>, PointCloud<scalar_t>, 3>
178 | my_kd_tree_t;
179 | 
180 | // Build KDTree for the first batch element
181 | current_cloud.set_batch(supports, s_batches[b], s_batches[b + 1]);
182 | std::unique_ptr<my_kd_tree_t> index(new my_kd_tree_t(3, current_cloud, tree_params));
183 | index->buildIndex();
184 | 
185 | // Search neighbors indices
186 | // ***********************
187 | // Search params
188 | nanoflann::SearchParams search_params;
189 | search_params.sorted = sorted;
190 | for (size_t i = 0; i < num_query_points; i++)
191 | {
192 | // Check if we changed batch
193 | if (i0 == q_batches[b + 1] && b < (int)s_batches.size() - 1 &&
194 | b < (int)q_batches.size() - 1)
195 | {
196 | // Change the points
197 | b++;
198 | if (s_batches[b] < s_batches[b + 1])
199 | current_cloud.set_batch(supports, s_batches[b], s_batches[b + 1]);
200 | 
201 | index.reset(new my_kd_tree_t(3, current_cloud, tree_params));
202 | index->buildIndex();
203 | }
204 | 
205 | // Find neighbors
206 | std::vector<std::pair<size_t, scalar_t>> ret_matches;
207 | ret_matches.reserve(max_count);
208 | size_t nMatches =
209 | index->radiusSearch(query_pcd.get_point_ptr(i), r2, ret_matches, search_params);
210 | 
211 | // Shuffle if needed
212 | if (!sorted)
213 | std::shuffle(ret_matches.begin(), ret_matches.end(), g);
214 | all_inds_dists[i0] = ret_matches;
215 | 
216 | // Update max count
217 | if (nMatches > (size_t)max_count)
218 | max_count = nMatches;
219 | // Increment query idx
220 | i0++;
221 | }
222 | // how many neighbors do we keep
223 | if (max_num > 0)
224 | max_count = max_num;
225 | 
226 | const int token = -1;
227 | if (mode == 0)
228 | {
229 | neighbors_indices.resize(query_pcd.get_point_count() * max_count);
230 | dists.resize(query_pcd.get_point_count() * max_count);
231 | i0 = 0;
232 | b = 0;
233 | 
234 | for (auto& inds_dists : all_inds_dists)
235 | { // Check if we changed batch
236 | if (i0 == q_batches[b + 1] && b < (int)s_batches.size() - 1 &&
237 | b < (int)q_batches.size() - 1)
238 | b++;
239 | 
240 | for (int j = 0; j < max_count; j++)
241 | {
242 | if ((size_t)j < inds_dists.size())
243 | {
244 | neighbors_indices[i0 * max_count + j] = inds_dists[j].first + s_batches[b];
245 | dists[i0 * max_count + j] = (float)inds_dists[j].second;
246 | }
247 | else
248 | {
249 | neighbors_indices[i0 * max_count + j] = token;
250 | dists[i0 * max_count + j] = -1;
251 | }
252 | }
253 | i0++;
254 | }
255 | index.reset();
256 | }
257 | else if (mode == 1)
258 | {
259 | int size = 0; // total number of edges
260 | for (auto& inds_dists : all_inds_dists)
261 | {
262 | if ((int)inds_dists.size() <= max_count)
263 | size += inds_dists.size();
264 | else
265 | size += max_count;
266 | }
267 | neighbors_indices.resize(size * 2);
268 | dists.resize(size);
269 | i0 = 0;
270 | b = 0;
271 | int u = 0;
272 | for (auto& inds_dists : all_inds_dists)
273 | {
274 | if (i0 == q_batches[b + 1] && b < (int)s_batches.size() - 1 &&
275 | b < (int)q_batches.size() - 1)
276 | {
277 | b++;
278 | }
279 | for (int j = 0; j < max_count; j++)
280 | {
281 | if ((unsigned int)j < inds_dists.size())
282 | {
283 | neighbors_indices[u] = inds_dists[j].first + s_batches[b];
284 | neighbors_indices[u + 1] = i0;
285 | dists[u / 2] = (float)inds_dists[j].second;
286 | u += 2;
287 | }
288 | }
289 | i0++;
290 | }
291 | }
292 | return max_count;
293 | }
294 | 
295 | template <typename scalar_t>
296 | void nanoflann_knn_neighbors(vector<scalar_t>& queries, vector<scalar_t>& supports,
297 | vector<long>& neighbors_indices, vector<float>& dists, int k)
298 | {
299 | // Nanoflann related variables
300 | // ***************************
301 | // Cloud variable
302 | PointCloud<scalar_t> pcd;
303 | pcd.set(supports);
304 | // Cloud query
305 | PointCloud<scalar_t> pcd_query;
306 | pcd_query.set(queries);
307 | 
308 | // Tree parameters
309 | nanoflann::KDTreeSingleIndexAdaptorParams tree_params(15 /* max leaf */);
310 | 
311 | // KDTree type definition
312 | typedef nanoflann::KDTreeSingleIndexAdaptor<
313 | nanoflann::L2_Simple_Adaptor<scalar_t, PointCloud<scalar_t>>, PointCloud<scalar_t>, 3>
314 | my_kd_tree_t;
315 | 
316 | // Pointer to trees
317 | std::unique_ptr<my_kd_tree_t> index(new my_kd_tree_t(3, pcd, tree_params));
318 | index->buildIndex();
319 | 
320 | // Search neighbors indices
321 | // ***********************
322 | size_t current_pos = 0;
323 | auto num_query_points = pcd_query.get_point_count();
324 | for (size_t i = 0; i < num_query_points; i++)
325 | {
326 | // Find neighbors
327 | std::vector<size_t> ret_index(k);
328 | std::vector<scalar_t> out_dist_sqr(k);
329 | 
330 | const size_t nMatches =
331 | index->knnSearch(pcd_query.get_point_ptr(i), k, &ret_index[0], &out_dist_sqr[0]);
332 | for (size_t i = 0; i < nMatches; i++)
333 | {
334 | neighbors_indices[i + current_pos] = ret_index[i];
335 | dists[i + current_pos] = out_dist_sqr[i];
336 | }
337 | current_pos += k;
338 | }
339 | }
340 | 
--------------------------------------------------------------------------------
/cuda/src/gridding_gpu.cu:
--------------------------------------------------------------------------------
1 | #include <cmath>
2 | #include <cstdio>
3 | #include <cstdlib>
4 | #include <torch/extension.h>
5 | 
6 | #include "cuda_utils.h"
7 | 
8 | #define CUDA_NUM_THREADS 512
9 | 
10 | // Compute the number of threads needed on the GPU
11 | inline int get_n_threads(int n)
12 | {
13 | const int pow_2 = std::log(static_cast<double>(n)) / std::log(2.0);
14 | return max(min(1 << pow_2, CUDA_NUM_THREADS), 1);
15 | }
16 | 
17 | __device__ int compute_index(int offset_x, int offset_y, int offset_z, int len_y, int len_z)
18 | {
19 | return offset_x * len_y * len_z + offset_y * len_z + offset_z;
20 | }
21 | 
22 | template <typename scalar_t>
23 | __device__ scalar_t compute_weight(scalar_t x, scalar_t x0)
24 | {
25 | return 1 - abs(x - x0);
26 | }
27 | 
28 | template <typename scalar_t>
29 | __global__ void
30 | gridding_kernel(int n_grid_vertices, int n_pts, float min_x, float min_y, float min_z, int len_y,
31 | int len_z, const scalar_t* __restrict__ ptcloud,
32 | scalar_t* __restrict__ grid_weights, scalar_t* __restrict__ grid_pt_weights,
33 | int* __restrict__ grid_pt_indexes)
34 | {
35 | int batch_index = blockIdx.x;
36 | int index = threadIdx.x;
37 | int stride = blockDim.x;
38 | 
39 | ptcloud += batch_index * n_pts * 3;
40 | grid_weights += batch_index * n_grid_vertices;
41 | grid_pt_weights += batch_index * n_pts * 24;
42 | grid_pt_indexes += batch_index * n_pts * 8;
43 | 
44 | for (int j = index; j < n_pts; j += stride)
45 | {
46 | scalar_t pt_x = ptcloud[j * 3 + 0];
47 | scalar_t pt_y = ptcloud[j * 3 + 1];
48 | scalar_t pt_z = ptcloud[j * 3 + 2];
49 | 
50 | int lower_x = std::floor(pt_x);
51 | int upper_x = std::ceil(pt_x);
52 | if (lower_x == upper_x)
53 | {
54 | upper_x += 1;
55 | }
56 | int lower_y = std::floor(pt_y);
57 | int upper_y = std::ceil(pt_y);
58 | if (lower_y == upper_y)
59 | {
60 | upper_y += 1;
61 | }
62 | int lower_z = std::floor(pt_z);
63 | int upper_z = std::ceil(pt_z);
64 | if (lower_z == upper_z)
65 | {
66 | upper_z += 1;
67 | }
68 | 
69 | int
lx_offset = lower_x - min_x, ux_offset = upper_x - min_x; 70 | int ly_offset = lower_y - min_y, uy_offset = upper_y - min_y; 71 | int lz_offset = lower_z - min_z, uz_offset = upper_z - min_z; 72 | 73 | // Compute weights and corresponding positions, a loop for 8 points 74 | // LLL -> Lower X, Lower Y, Lower Z 75 | grid_pt_indexes[j * 8 + 0] = compute_index(lx_offset, ly_offset, lz_offset, len_y, len_z); 76 | grid_pt_weights[j * 24 + 0] = compute_weight(pt_x, lower_x); 77 | grid_pt_weights[j * 24 + 1] = compute_weight(pt_y, lower_y); 78 | grid_pt_weights[j * 24 + 2] = compute_weight(pt_z, lower_z); 79 | 80 | // LLU -> Lower X, Lower Y, Upper Z 81 | grid_pt_indexes[j * 8 + 1] = compute_index(lx_offset, ly_offset, uz_offset, len_y, len_z); 82 | grid_pt_weights[j * 24 + 3] = compute_weight(pt_x, lower_x); 83 | grid_pt_weights[j * 24 + 4] = compute_weight(pt_y, lower_y); 84 | grid_pt_weights[j * 24 + 5] = compute_weight(pt_z, upper_z); 85 | 86 | // LUL -> Lower X, Upper Y, Lower Z 87 | grid_pt_indexes[j * 8 + 2] = compute_index(lx_offset, uy_offset, lz_offset, len_y, len_z); 88 | grid_pt_weights[j * 24 + 6] = compute_weight(pt_x, lower_x); 89 | grid_pt_weights[j * 24 + 7] = compute_weight(pt_y, upper_y); 90 | grid_pt_weights[j * 24 + 8] = compute_weight(pt_z, lower_z); 91 | 92 | // LUU -> Lower X, Upper Y, Upper Z 93 | grid_pt_indexes[j * 8 + 3] = compute_index(lx_offset, uy_offset, uz_offset, len_y, len_z); 94 | grid_pt_weights[j * 24 + 9] = compute_weight(pt_x, lower_x); 95 | grid_pt_weights[j * 24 + 10] = compute_weight(pt_y, upper_y); 96 | grid_pt_weights[j * 24 + 11] = compute_weight(pt_z, upper_z); 97 | 98 | // ULL -> Upper X, Lower Y, Lower Z 99 | grid_pt_indexes[j * 8 + 4] = compute_index(ux_offset, ly_offset, lz_offset, len_y, len_z); 100 | grid_pt_weights[j * 24 + 12] = compute_weight(pt_x, upper_x); 101 | grid_pt_weights[j * 24 + 13] = compute_weight(pt_y, lower_y); 102 | grid_pt_weights[j * 24 + 14] = compute_weight(pt_z, lower_z); 103 | 104 | // ULU -> Upper X, Lower Y, Upper Z 105 | grid_pt_indexes[j * 8 + 5] = compute_index(ux_offset, ly_offset, uz_offset, len_y, len_z); 106 | grid_pt_weights[j * 24 + 15] = compute_weight(pt_x, upper_x); 107 | grid_pt_weights[j * 24 + 16] = compute_weight(pt_y, lower_y); 108 | grid_pt_weights[j * 24 + 17] = compute_weight(pt_z, upper_z); 109 | 110 | // UUL -> Upper X, Upper Y, Lower Z 111 | grid_pt_indexes[j * 8 + 6] = compute_index(ux_offset, uy_offset, lz_offset, len_y, len_z); 112 | grid_pt_weights[j * 24 + 18] = compute_weight(pt_x, upper_x); 113 | grid_pt_weights[j * 24 + 19] = compute_weight(pt_y, upper_y); 114 | grid_pt_weights[j * 24 + 20] = compute_weight(pt_z, lower_z); 115 | 116 | // UUU -> Upper X, Upper Y, Upper Z 117 | grid_pt_indexes[j * 8 + 7] = compute_index(ux_offset, uy_offset, uz_offset, len_y, len_z); 118 | grid_pt_weights[j * 24 + 21] = compute_weight(pt_x, upper_x); 119 | grid_pt_weights[j * 24 + 22] = compute_weight(pt_y, upper_y); 120 | grid_pt_weights[j * 24 + 23] = compute_weight(pt_z, upper_z); 121 | } 122 | 123 | __syncthreads(); 124 | 125 | int gvtx_idx = 0; 126 | for (int j = index; j < n_pts; j += stride) 127 | { 128 | // LLL -> Lower X, Lower Y, Lower Z 129 | gvtx_idx = grid_pt_indexes[j * 8 + 0]; 130 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 0] * 131 | grid_pt_weights[j * 24 + 1] * 132 | grid_pt_weights[j * 24 + 2]); 133 | // LLU -> Lower X, Lower Y, Upper Z 134 | gvtx_idx = grid_pt_indexes[j * 8 + 1]; 135 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 3] * 136 | 
grid_pt_weights[j * 24 + 4] *
137 | grid_pt_weights[j * 24 + 5]);
138 | // LUL -> Lower X, Upper Y, Lower Z
139 | gvtx_idx = grid_pt_indexes[j * 8 + 2];
140 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 6] *
141 | grid_pt_weights[j * 24 + 7] *
142 | grid_pt_weights[j * 24 + 8]);
143 | // LUU -> Lower X, Upper Y, Upper Z
144 | gvtx_idx = grid_pt_indexes[j * 8 + 3];
145 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 9] *
146 | grid_pt_weights[j * 24 + 10] *
147 | grid_pt_weights[j * 24 + 11]);
148 | // ULL -> Upper X, Lower Y, Lower Z
149 | gvtx_idx = grid_pt_indexes[j * 8 + 4];
150 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 12] *
151 | grid_pt_weights[j * 24 + 13] *
152 | grid_pt_weights[j * 24 + 14]);
153 | // ULU -> Upper X, Lower Y, Upper Z
154 | gvtx_idx = grid_pt_indexes[j * 8 + 5];
155 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 15] *
156 | grid_pt_weights[j * 24 + 16] *
157 | grid_pt_weights[j * 24 + 17]);
158 | // UUL -> Upper X, Upper Y, Lower Z
159 | gvtx_idx = grid_pt_indexes[j * 8 + 6];
160 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 18] *
161 | grid_pt_weights[j * 24 + 19] *
162 | grid_pt_weights[j * 24 + 20]);
163 | // UUU -> Upper X, Upper Y, Upper Z
164 | gvtx_idx = grid_pt_indexes[j * 8 + 7];
165 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 21] *
166 | grid_pt_weights[j * 24 + 22] *
167 | grid_pt_weights[j * 24 + 23]);
168 | }
169 | }
170 | 
171 | std::vector<torch::Tensor> gridding_kernel_warpper(float min_x, float max_x, float min_y,
172 | float max_y, float min_z, float max_z,
173 | torch::Tensor ptcloud, cudaStream_t stream)
174 | {
175 | int batch_size = ptcloud.size(0);
176 | int n_pts = ptcloud.size(1);
177 | int len_x = max_x - min_x + 1;
178 | int len_y = max_y - min_y + 1;
179 | int len_z = max_z - min_z + 1;
180 | int n_grid_vertices = len_x * len_y * len_z;
181 | 
182 | torch::Tensor grid_weights =
183 | torch::zeros({batch_size, n_grid_vertices}, torch::CUDA(ptcloud.scalar_type()));
184 | torch::Tensor grid_pt_weights =
185 | torch::zeros({batch_size, n_pts, 8, 3}, torch::CUDA(ptcloud.scalar_type()));
186 | torch::Tensor grid_pt_indexes = torch::zeros({batch_size, n_pts, 8}, torch::CUDA(torch::kInt));
187 | 
188 | AT_DISPATCH_FLOATING_TYPES(
189 | ptcloud.scalar_type(), "gridding_cuda", ([&] {
190 | gridding_kernel<<<batch_size, get_n_threads(n_pts), 0, stream>>>(
191 | n_grid_vertices, n_pts, min_x, min_y, min_z, len_y, len_z,
192 | ptcloud.data_ptr<scalar_t>(), grid_weights.data_ptr<scalar_t>(),
193 | grid_pt_weights.data_ptr<scalar_t>(), grid_pt_indexes.data_ptr<int>());
194 | }));
195 | 
196 | cudaError_t err = cudaGetLastError();
197 | if (err != cudaSuccess)
198 | {
199 | printf("Error in gridding_kernel_warpper: %s\n", cudaGetErrorString(err));
200 | }
201 | return {grid_weights, grid_pt_weights, grid_pt_indexes};
202 | }
203 | 
204 | template <typename scalar_t>
205 | __global__ void
206 | gridding_grad_kernel(int n_grid_vertices, int n_pts, const scalar_t* __restrict__ grid_pt_weights,
207 | const int* __restrict__ grid_pt_indexes,
208 | const scalar_t* __restrict__ grad_grid, scalar_t* __restrict__ grad_ptcloud)
209 | {
210 | int batch_index = blockIdx.x;
211 | int index = threadIdx.x;
212 | int stride = blockDim.x;
213 | 
214 | grid_pt_weights += batch_index * n_pts * 24;
215 | grid_pt_indexes += batch_index * n_pts * 8;
216 | grad_grid += batch_index * n_grid_vertices;
217 | grad_ptcloud += batch_index * n_pts * 3;
218 | 
219 | int gvtx_idx = 0;
220 | scalar_t grad_vtx = 0, x_weights = 0, y_weights = 0, z_weights = 0;
221 | for (int j = index; j < n_pts; j
+= stride) 222 | { 223 | // Compute gradient for the corresponding positions, a loop for 8 points 224 | // LLL -> Lower X, Lower Y, Lower Z 225 | gvtx_idx = grid_pt_indexes[j * 8 + 0]; 226 | grad_vtx = grad_grid[gvtx_idx]; 227 | x_weights = grid_pt_weights[j * 24 + 0]; 228 | y_weights = grid_pt_weights[j * 24 + 1]; 229 | z_weights = grid_pt_weights[j * 24 + 2]; 230 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), -grad_vtx * y_weights * z_weights); 231 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), -grad_vtx * x_weights * z_weights); 232 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), -grad_vtx * x_weights * y_weights); 233 | 234 | // LLU -> Lower X, Lower Y, Upper Z 235 | gvtx_idx = grid_pt_indexes[j * 8 + 1]; 236 | grad_vtx = grad_grid[gvtx_idx]; 237 | x_weights = grid_pt_weights[j * 24 + 3]; 238 | y_weights = grid_pt_weights[j * 24 + 4]; 239 | z_weights = grid_pt_weights[j * 24 + 5]; 240 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), -grad_vtx * y_weights * z_weights); 241 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), -grad_vtx * x_weights * z_weights); 242 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), grad_vtx * x_weights * y_weights); 243 | 244 | // LUL -> Lower X, Upper Y, Lower Z 245 | gvtx_idx = grid_pt_indexes[j * 8 + 2]; 246 | grad_vtx = grad_grid[gvtx_idx]; 247 | x_weights = grid_pt_weights[j * 24 + 6]; 248 | y_weights = grid_pt_weights[j * 24 + 7]; 249 | z_weights = grid_pt_weights[j * 24 + 8]; 250 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), -grad_vtx * y_weights * z_weights); 251 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), grad_vtx * x_weights * z_weights); 252 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), -grad_vtx * x_weights * y_weights); 253 | 254 | // LUU -> Lower X, Upper Y, Upper Z 255 | gvtx_idx = grid_pt_indexes[j * 8 + 3]; 256 | grad_vtx = grad_grid[gvtx_idx]; 257 | x_weights = grid_pt_weights[j * 24 + 9]; 258 | y_weights = grid_pt_weights[j * 24 + 10]; 259 | z_weights = grid_pt_weights[j * 24 + 11]; 260 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), -grad_vtx * y_weights * z_weights); 261 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), grad_vtx * x_weights * z_weights); 262 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), grad_vtx * x_weights * y_weights); 263 | 264 | // ULL -> Upper X, Lower Y, Lower Z 265 | gvtx_idx = grid_pt_indexes[j * 8 + 4]; 266 | grad_vtx = grad_grid[gvtx_idx]; 267 | x_weights = grid_pt_weights[j * 24 + 12]; 268 | y_weights = grid_pt_weights[j * 24 + 13]; 269 | z_weights = grid_pt_weights[j * 24 + 14]; 270 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), grad_vtx * y_weights * z_weights); 271 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), -grad_vtx * x_weights * z_weights); 272 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), -grad_vtx * x_weights * y_weights); 273 | 274 | // ULU -> Upper X, Lower Y, Upper Z 275 | gvtx_idx = grid_pt_indexes[j * 8 + 5]; 276 | grad_vtx = grad_grid[gvtx_idx]; 277 | x_weights = grid_pt_weights[j * 24 + 15]; 278 | y_weights = grid_pt_weights[j * 24 + 16]; 279 | z_weights = grid_pt_weights[j * 24 + 17]; 280 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), grad_vtx * y_weights * z_weights); 281 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), -grad_vtx * x_weights * z_weights); 282 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), grad_vtx * x_weights * y_weights); 283 | 284 | // UUL -> Upper X, Upper Y, Lower Z 285 | gvtx_idx = grid_pt_indexes[j * 8 + 6]; 286 | grad_vtx = grad_grid[gvtx_idx]; 287 | x_weights = grid_pt_weights[j * 24 + 18]; 288 | y_weights = grid_pt_weights[j * 24 + 19]; 289 | z_weights = grid_pt_weights[j * 24 + 20]; 290 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), grad_vtx * y_weights * z_weights); 291 | atomicAdd(&(grad_ptcloud[j * 
3 + 1]), grad_vtx * x_weights * z_weights);
292 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), -grad_vtx * x_weights * y_weights);
293 | 
294 | // UUU -> Upper X, Upper Y, Upper Z
295 | gvtx_idx = grid_pt_indexes[j * 8 + 7];
296 | grad_vtx = grad_grid[gvtx_idx];
297 | x_weights = grid_pt_weights[j * 24 + 21];
298 | y_weights = grid_pt_weights[j * 24 + 22];
299 | z_weights = grid_pt_weights[j * 24 + 23];
300 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), grad_vtx * y_weights * z_weights);
301 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), grad_vtx * x_weights * z_weights);
302 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), grad_vtx * x_weights * y_weights);
303 | }
304 | }
305 | 
306 | torch::Tensor gridding_grad_kernel_warpper(torch::Tensor grid_pt_weights,
307 | torch::Tensor grid_pt_indexes, torch::Tensor grad_grid,
308 | cudaStream_t stream)
309 | {
310 | int batch_size = grad_grid.size(0);
311 | int n_grid_vertices = grad_grid.size(1);
312 | int n_pts = grid_pt_indexes.size(1);
313 | 
314 | torch::Tensor grad_ptcloud =
315 | torch::zeros({batch_size, n_pts, 3}, torch::CUDA(grid_pt_weights.scalar_type()));
316 | 
317 | AT_DISPATCH_FLOATING_TYPES(
318 | grid_pt_weights.scalar_type(), "gridding_grad_cuda", ([&] {
319 | gridding_grad_kernel<<<batch_size, get_n_threads(n_pts), 0, stream>>>(
320 | n_grid_vertices, n_pts, grid_pt_weights.data_ptr<scalar_t>(),
321 | grid_pt_indexes.data_ptr<int>(), grad_grid.data_ptr<scalar_t>(),
322 | grad_ptcloud.data_ptr<scalar_t>());
323 | }));
324 | 
325 | cudaError_t err = cudaGetLastError();
326 | if (err != cudaSuccess)
327 | {
328 | printf("Error in gridding_grad_kernel_warpper: %s\n", cudaGetErrorString(err));
329 | }
330 | return grad_ptcloud;
331 | }
332 | 
--------------------------------------------------------------------------------
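Note: the snippet below is an illustrative host-side sketch, not part of the repository; the file name and every identifier in it are made up. It only demonstrates the trilinear weighting that gridding_kernel applies to the eight vertices of the cell enclosing a point: each axis contributes a weight of 1 - |x - x0| (the rule in compute_weight), so the eight per-corner products always sum to one, which is why the scattered atomicAdd accumulation preserves the total point mass.

// trilinear_weight_demo.cpp -- hypothetical standalone sketch, not part of the repo
#include <cmath>
#include <cstdio>

int main()
{
    const float pt[3] = {0.3f, 1.7f, 2.2f}; // an arbitrary point inside the grid
    float total = 0.f;
    for (int corner = 0; corner < 8; ++corner) // the 8 vertices of the enclosing cell
    {
        float w = 1.f;
        for (int d = 0; d < 3; ++d)
        {
            const float lower = std::floor(pt[d]);
            // bit d of `corner` selects the lower or upper vertex along this axis
            const float vtx = ((corner >> d) & 1) ? lower + 1.f : lower;
            w *= 1.f - std::fabs(pt[d] - vtx); // same rule as compute_weight()
        }
        total += w; // this per-corner product is what gridding_kernel atomically adds
    }
    printf("sum of the 8 corner weights = %f\n", total); // prints 1.000000
    return 0;
}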