├── conda
│   └── torch-points-kernels
│       ├── run_test.sh
│       ├── build.sh
│       ├── run_test.bat
│       ├── README.md
│       ├── bld.bat
│       ├── meta.yaml
│       └── build_conda.sh
├── cpu
│   ├── include
│   │   ├── compat.h
│   │   ├── fps.h
│   │   ├── knn.h
│   │   ├── utils.h
│   │   ├── interpolate.h
│   │   ├── ball_query.h
│   │   ├── neighbors.h
│   │   └── cloud.h
│   └── src
│       ├── fps.cpp
│       ├── knn.cpp
│       ├── interpolate.cpp
│       ├── bindings.cpp
│       ├── ball_query.cpp
│       └── neighbors.cpp
├── cuda
│   ├── include
│   │   ├── compat.h
│   │   ├── sampling.h
│   │   ├── metrics.h
│   │   ├── interpolate.h
│   │   ├── ball_query.h
│   │   ├── chamfer_dist.h
│   │   ├── gridding.h
│   │   ├── cubic_feature_sampling.h
│   │   ├── utils.h
│   │   └── cuda_utils.h
│   └── src
│       ├── chamfer_dist.cpp
│       ├── bindings.cpp
│       ├── gridding.cpp
│       ├── sampling.cpp
│       ├── cubic_feature_sampling.cpp
│       ├── metrics.cpp
│       ├── metrics_gpu.cu
│       ├── interpolate.cpp
│       ├── ball_query.cpp
│       ├── ball_query_gpu.cu
│       ├── interpolate_gpu.cu
│       ├── sampling_gpu.cu
│       ├── cubic_feature_sampling_gpu.cu
│       ├── chamfer_dist_gpu.cu
│       └── gridding_gpu.cu
├── MANIFEST.in
├── .github
│   └── workflows
│       ├── cuda
│       │   ├── cu101-Linux-env.sh
│       │   ├── cu102-Linux-env.sh
│       │   ├── cu116-Linux-env.sh
│       │   ├── cu111-Linux-env.sh
│       │   ├── cu113-Linux-env.sh
│       │   ├── cu115-Linux-env.sh
│       │   ├── cu116-Windows-env.sh
│       │   ├── cu115-Windows-env.sh
│       │   ├── cu101-Windows-env.sh
│       │   ├── cu102-Windows-env.sh
│       │   ├── cu111-Windows-env.sh
│       │   ├── cu113-Windows-env.sh
│       │   ├── cu116-Linux.sh
│       │   ├── cu111-Linux.sh
│       │   ├── cu113-Linux.sh
│       │   ├── cu115-Linux.sh
│       │   ├── cu102-Linux.sh
│       │   ├── cu101-Linux.sh
│       │   ├── cu111-Windows.sh
│       │   ├── cu101-Windows.sh
│       │   ├── cu102-Windows.sh
│       │   ├── cu113-Windows.sh
│       │   ├── cu116-Windows.sh
│       │   └── cu115-Windows.sh
│       ├── deploy.yaml
│       ├── tests.yaml
│       └── building-conda.yml
├── test
│   ├── __init__.py
│   ├── test_fps.py
│   ├── test_gridding.py
│   ├── test_chamfer_dist.py
│   ├── test_knn.py
│   ├── test_interpolate.py
│   ├── test_grouping.py
│   ├── test_cluster.py
│   ├── test_cubic_feature_sampling.py
│   ├── speed_radius.py
│   ├── test_metrics.py
│   └── test_ballquerry.py
├── .gitignore
├── torch_points_kernels
│   ├── __init__.py
│   ├── knn.py
│   ├── chamfer_dist.py
│   ├── gridding.py
│   ├── cubic_feature_sampling.py
│   ├── cluster.py
│   ├── metrics.py
│   └── torchpoints.py
├── .black.toml
├── .pre-commit-config.yaml
├── LICENSE
├── benchmark
│   └── region_cluster.py
├── .devcontainer
│   ├── devcontainer.json
│   └── Dockerfile
├── CHANGELOG.md
├── .clang-format
├── setup.py
└── README.md

/conda/torch-points-kernels/run_test.sh:
--------------------------------------------------------------------------------
1 | $PYTHON -m unittest
--------------------------------------------------------------------------------
/conda/torch-points-kernels/build.sh:
--------------------------------------------------------------------------------
1 | $PYTHON -m pip install .
2 | -------------------------------------------------------------------------------- /conda/torch-points-kernels/run_test.bat: -------------------------------------------------------------------------------- 1 | "%PYTHON%" -m unittest 2 | if errorlevel 1 exit 1 -------------------------------------------------------------------------------- /cpu/include/compat.h: -------------------------------------------------------------------------------- 1 | #ifdef VERSION_GE_1_3 2 | #define DATA_PTR data_ptr 3 | #else 4 | #define DATA_PTR data 5 | #endif 6 | -------------------------------------------------------------------------------- /cuda/include/compat.h: -------------------------------------------------------------------------------- 1 | #ifdef VERSION_GE_1_3 2 | #define DATA_PTR data_ptr 3 | #else 4 | #define DATA_PTR data 5 | #endif 6 | -------------------------------------------------------------------------------- /cpu/include/fps.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | at::Tensor fps(at::Tensor points, const int nsamples, bool random = true); 4 | -------------------------------------------------------------------------------- /cuda/include/sampling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples); 5 | -------------------------------------------------------------------------------- /cpu/include/knn.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | std::pair dense_knn(at::Tensor support, at::Tensor query, int k); 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include CHANGELOG.md 4 | include pyproject.toml 5 | 6 | recursive-exclude test * 7 | recursive-include cpu * 8 | recursive-include cuda * 9 | -------------------------------------------------------------------------------- /cpu/include/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #define CHECK_CPU(x) AT_ASSERTM(!x.is_cuda(), #x " must be a CPU tensor") 5 | 6 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be a contiguous tensor") 7 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu101-Linux-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-10.1 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu102-Linux-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-10.2 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu116-Linux-env.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-11.6 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" -------------------------------------------------------------------------------- /.github/workflows/cuda/cu111-Linux-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-11.1 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu113-Linux-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-11.3 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu115-Linux-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/usr/local/cuda-11.5 4 | LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 5 | PATH=${CUDA_HOME}/bin:${PATH} 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 9 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def run_if_cuda(func): 5 | def wrapped_func(*args, **kwargs): 6 | if torch.cuda.is_available(): 7 | return func(*args, **kwargs) 8 | else: 9 | return 10 | 11 | return wrapped_func 12 | -------------------------------------------------------------------------------- /conda/torch-points-kernels/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ./build_conda.sh 3.9 1.9.0 cu111 # python, pytorch and cuda version 3 | ``` 4 | 5 | These conda scripts are based off of [pytorch_sparse](https://github.com/rusty1s/pytorch_sparse/tree/master/conda/pytorch-sparse), track changes there to update. 
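A CPU-only conda package can be produced with the same script by passing `cpu` as the CUDA argument; this is a usage sketch based on the `cpu` branch of `build_conda.sh` (the Python and PyTorch versions here are only an example):

```
./build_conda.sh 3.8 1.12.0 cpu  # python, pytorch version, and "cpu" for a build without CUDA
```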
-------------------------------------------------------------------------------- /cpu/include/interpolate.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor knn_interpolate(at::Tensor features, at::Tensor idx, at::Tensor weight); 5 | 6 | at::Tensor knn_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight, 7 | const int m); 8 | -------------------------------------------------------------------------------- /cuda/include/metrics.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor instance_iou_cuda(at::Tensor instance_idx, at::Tensor instance_offsets, 5 | at::Tensor gt_instances, at::Tensor gt_instance_sizes, 6 | at::Tensor num_gt_instances, at::Tensor batch); 7 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu116-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="6.0+PTX" -------------------------------------------------------------------------------- /.github/workflows/cuda/cu115-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="6.0+PTX" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu101-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v10.1 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu102-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v10.2 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5" 9 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu111-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.1 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 9 | -------------------------------------------------------------------------------- 
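The `*-env.sh` files in this folder only export toolchain variables (`CUDA_HOME`, `PATH`, `FORCE_CUDA`, `TORCH_CUDA_ARCH_LIST`); the workflow steps that consume them are not part of this listing. A plausible sketch, assuming the wheel-building jobs source the file matching their CUDA matrix entry before compiling:

```
source .github/workflows/cuda/cu111-Linux-env.sh
python setup.py bdist_wheel  # the build picks up FORCE_CUDA and TORCH_CUDA_ARCH_LIST from the environment
```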
/.github/workflows/cuda/cu113-Windows-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3 4 | PATH=${CUDA_HOME}/bin:$PATH 5 | PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH 6 | 7 | export FORCE_CUDA=1 8 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 9 | -------------------------------------------------------------------------------- /conda/torch-points-kernels/bld.bat: -------------------------------------------------------------------------------- 1 | copy "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Tools\\MSVC\\14.29.30133\\lib\\x64\\metis.lib" %LIBRARY_LIB% 2 | if errorlevel 1 exit 1 3 | copy "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Tools\\MSVC\\14.29.30133\\include\\metis.h" %LIBRARY_INC% 4 | if errorlevel 1 exit 1 5 | 6 | "%PYTHON%" -m pip install . 7 | if errorlevel 1 exit 1 -------------------------------------------------------------------------------- /cuda/include/interpolate.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | std::vector three_nn(at::Tensor unknowns, at::Tensor knows); 7 | at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, at::Tensor weight); 8 | at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight, 9 | const int m); 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | build 35 | *.pyc 36 | 37 | .vscode/ 38 | dist/ 39 | torch_points_kernels.egg-info/ 40 | -------------------------------------------------------------------------------- /torch_points_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | from .torchpoints import * 2 | from .knn import knn 3 | from .cluster import region_grow 4 | from .metrics import instance_iou 5 | from .cubic_feature_sampling import cubic_feature_sampling 6 | 7 | __all__ = [ 8 | "ball_query", 9 | "furthest_point_sample", 10 | "grouping_operation", 11 | "three_interpolate", 12 | "three_nn", 13 | "knn", 14 | "region_grow", 15 | "instance_iou", 16 | "chamfer_dist", 17 | "cubic_feature_sampling", 18 | "gridding", 19 | ] 20 | -------------------------------------------------------------------------------- /cuda/src/chamfer_dist.cpp: -------------------------------------------------------------------------------- 1 | #include "chamfer_dist.h" 2 | 3 | std::vector chamfer_dist(torch::Tensor xyz1, torch::Tensor xyz2) 4 | { 5 | return chamfer_dist_kernel_wrapper(xyz1, xyz2); 6 | } 7 | 8 | std::vector chamfer_dist_grad(torch::Tensor xyz1, torch::Tensor xyz2, 9 | torch::Tensor idx1, torch::Tensor idx2, 10 | torch::Tensor grad_dist1, torch::Tensor grad_dist2) 11 | { 12 | return chamfer_dist_grad_kernel_wrapper(xyz1, xyz2, idx1, idx2, grad_dist1, grad_dist2); 13 
| } 14 | -------------------------------------------------------------------------------- /cuda/include/ball_query.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | std::pair ball_query_dense(at::Tensor new_xyz, at::Tensor xyz, 5 | const float radius, const int nsample); 6 | 7 | std::pair ball_query_partial_dense(at::Tensor x, at::Tensor y, 8 | at::Tensor batch_x, at::Tensor batch_y, 9 | const float radius, const int nsample); 10 | 11 | at::Tensor degree(at::Tensor row, int64_t num_nodes); 12 | -------------------------------------------------------------------------------- /.black.toml: -------------------------------------------------------------------------------- 1 | # Example configuration for Black. 2 | 3 | # NOTE: you have to use single-quoted strings in TOML for regular expressions. 4 | # It's the equivalent of r-strings in Python. Multiline strings are treated as 5 | # verbose regular expressions by Black. Use [ ] to denote a significant space 6 | # character. 7 | 8 | [tool.black] 9 | line-length = 120 10 | target-version = ['py36', 'py37', 'py38'] 11 | include = '\.pyi?$' 12 | exclude = ''' 13 | /( 14 | \.eggs 15 | | \.git 16 | | \.hg 17 | | \.mypy_cache 18 | | \.tox 19 | | \.venv 20 | | _build 21 | | buck-out 22 | | build 23 | | dist 24 | )/ 25 | ''' 26 | -------------------------------------------------------------------------------- /torch_points_kernels/knn.py: -------------------------------------------------------------------------------- 1 | import torch_points_kernels.points_cpu as tpcpu 2 | 3 | 4 | def knn(pos_support, pos, k): 5 | """Dense knn serach 6 | Arguments: 7 | pos_support - [B,N,3] support points 8 | pos - [B,M,3] centre of queries 9 | k - number of neighboors, needs to be > N 10 | 11 | Returns: 12 | idx - [B,M,k] 13 | dist2 - [B,M,k] squared distances 14 | """ 15 | assert pos_support.dim() == 3 and pos.dim() == 3 16 | if pos_support.is_cuda: 17 | raise ValueError("CUDA version not implemented, use pytorch geometric") 18 | return tpcpu.dense_knn(pos_support, pos, k) 19 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: "build|egg-info|dist" 2 | 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.3.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: check-added-large-files 9 | - id: end-of-file-fixer 10 | 11 | - repo: https://github.com/psf/black 12 | rev: 22.3.0 13 | hooks: 14 | - id: black 15 | language_version: python3.7 16 | args: ["--config", ".black.toml"] 17 | - repo: local 18 | hooks: 19 | - id: clang-format 20 | name: Run clang-format 21 | entry: clang-format -i 22 | types: [text] 23 | files: '.*\.cpp$|.*\.h$|.*\.cu$|.*\.hpp$' 24 | language: system 25 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu116-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb 9 | sudo apt-key add 
/var/cuda-repo-${OS}-11-6-local/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-11-6 cuda-libraries-dev-11-6 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb -------------------------------------------------------------------------------- /.github/workflows/cuda/cu111-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda-repo-${OS}-11-1-local_11.1.1-455.32.00-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-11-1-local_11.1.1-455.32.00-1_amd64.deb 9 | sudo apt-key add /var/cuda-repo-${OS}-11-1-local/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-11-1 cuda-libraries-dev-11-1 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda-repo-${OS}-11-1-local_11.1.1-455.32.00-1_amd64.deb 16 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu113-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/11.3.0/local_installers/cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb 9 | sudo apt-key add /var/cuda-repo-${OS}-11-3-local/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-11-3 cuda-libraries-dev-11-3 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/11.3.0/local_installers/cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb 16 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu115-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb 9 | sudo apt-key add /var/cuda-repo-${OS}-11-5-local/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-11-5 cuda-libraries-dev-11-5 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb 16 | -------------------------------------------------------------------------------- /test/test_fps.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import os 4 | import sys 5 | 6 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 7 | sys.path.insert(0, 
ROOT) 8 | 9 | from torch_points_kernels.points_cpu import fps 10 | 11 | 12 | class TestFps(unittest.TestCase): 13 | def test_simplecpu(self): 14 | points = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]], [[-1, 1, 0], [0, 0, 10], [0, 0, 2]]]).float() 15 | idx = fps(points, 2, False) 16 | torch.testing.assert_allclose(idx, torch.tensor([[0, 2], [0, 1]])) 17 | 18 | def test_random(self): 19 | points = torch.randn(10, 100, 3) 20 | idx = fps(points, 2, True) 21 | self.assertNotEqual(idx[0][0], 0) 22 | 23 | 24 | if __name__ == "__main__": 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu102-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb 9 | sudo apt-key add /var/cuda-repo-10-2-local-10.2.89-440.33.01/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-10-2 cuda-libraries-dev-10-2 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb 16 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu101-Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OS=ubuntu1804 4 | 5 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 6 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 7 | wget -nv https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-${OS}-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb 8 | sudo dpkg -i cuda-repo-${OS}-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb 9 | sudo apt-key add /var/cuda-repo-10-1-local-10.1.243-418.87.00/7fa2af80.pub 10 | 11 | sudo apt-get -qq update 12 | sudo apt install cuda-nvcc-10-1 cuda-libraries-dev-10-1 13 | sudo apt clean 14 | 15 | rm -f https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-${OS}-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb 16 | -------------------------------------------------------------------------------- /cpu/include/ball_query.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | std::pair ball_query(at::Tensor query, at::Tensor support, float radius, 4 | int max_num, int mode, bool sorted); 5 | 6 | std::pair batch_ball_query(at::Tensor query, at::Tensor support, 7 | at::Tensor query_batch, at::Tensor support_batch, 8 | float radius, int max_num, int mode, 9 | bool sorted); 10 | 11 | std::pair dense_ball_query(at::Tensor query, at::Tensor support, 12 | float radius, int max_num, int mode, 13 | bool sorted); 14 | -------------------------------------------------------------------------------- /test/test_gridding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import torch 5 | import unittest 6 | 7 | from torch.autograd import gradcheck 8 
| 9 | from . import run_if_cuda 10 | 11 | 12 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 13 | sys.path.insert(0, ROOT) 14 | 15 | from torch_points_kernels.gridding import GriddingFunction 16 | 17 | 18 | class TestGridding(unittest.TestCase): 19 | @run_if_cuda 20 | def test_gridding_function_32pts(self): 21 | x = torch.rand(1, 32, 3) 22 | x.requires_grad = True 23 | self.assertTrue(gradcheck(GriddingFunction.apply, [x.double().cuda(), 4])) 24 | 25 | @run_if_cuda 26 | def test_gridding_function_64pts(self): 27 | x = torch.rand(1, 64, 3) 28 | x.requires_grad = True 29 | self.assertTrue(gradcheck(GriddingFunction.apply, [x.double().cuda(), 8])) 30 | -------------------------------------------------------------------------------- /cuda/include/chamfer_dist.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | std::vector chamfer_dist(torch::Tensor xyz1, torch::Tensor xyz2); 5 | 6 | std::vector chamfer_dist_grad(torch::Tensor xyz1, torch::Tensor xyz2, 7 | torch::Tensor idx1, torch::Tensor idx2, 8 | torch::Tensor grad_dist1, torch::Tensor grad_dist2); 9 | 10 | std::vector chamfer_dist_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2); 11 | 12 | std::vector chamfer_dist_grad_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2, 13 | torch::Tensor idx1, torch::Tensor idx2, 14 | torch::Tensor grad_dist1, 15 | torch::Tensor grad_dist2); 16 | -------------------------------------------------------------------------------- /cuda/include/gridding.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | std::vector gridding_kernel_warpper(float min_x, float max_x, float min_y, 7 | float max_y, float min_z, float max_z, 8 | torch::Tensor ptcloud, cudaStream_t stream); 9 | 10 | torch::Tensor gridding_grad_kernel_warpper(torch::Tensor grid_pt_weights, 11 | torch::Tensor grid_pt_indexes, torch::Tensor grad_grid, 12 | cudaStream_t stream); 13 | 14 | std::vector gridding(float min_x, float max_x, float min_y, float max_y, float min_z, 15 | float max_z, torch::Tensor ptcloud); 16 | 17 | torch::Tensor gridding_grad(torch::Tensor grid_pt_weights, torch::Tensor grid_pt_indexes, 18 | torch::Tensor grad_grid); 19 | -------------------------------------------------------------------------------- /cpu/include/neighbors.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include "cloud.h" 4 | #include "nanoflann.hpp" 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | template 11 | int nanoflann_neighbors(vector& queries, vector& supports, 12 | vector& neighbors_indices, vector& dists, float radius, 13 | int max_num, int mode, bool sorted); 14 | 15 | template 16 | int batch_nanoflann_neighbors(vector& queries, vector& supports, 17 | vector& q_batches, vector& s_batches, 18 | vector& neighbors_indices, vector& dists, 19 | float radius, int max_num, int mode, bool sorted); 20 | 21 | template 22 | void nanoflann_knn_neighbors(vector& queries, vector& supports, 23 | vector& neighbors_indices, vector& dists, int k); 24 | -------------------------------------------------------------------------------- /cuda/src/bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "ball_query.h" 2 | #include "chamfer_dist.h" 3 | #include "cubic_feature_sampling.h" 4 | #include "gridding.h" 5 | #include "interpolate.h" 6 | #include "metrics.h" 7 | #include 
"sampling.h" 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 10 | { 11 | m.def("furthest_point_sampling", &furthest_point_sampling); 12 | 13 | m.def("three_nn", &three_nn); 14 | m.def("three_interpolate", &three_interpolate); 15 | m.def("three_interpolate_grad", &three_interpolate_grad); 16 | 17 | m.def("ball_query_dense", &ball_query_dense); 18 | m.def("ball_query_partial_dense", &ball_query_partial_dense); 19 | 20 | m.def("instance_iou_cuda", &instance_iou_cuda); 21 | 22 | m.def("chamfer_dist", &chamfer_dist); 23 | m.def("chamfer_dist_grad", &chamfer_dist_grad); 24 | 25 | m.def("cubic_feature_sampling", &cubic_feature_sampling); 26 | m.def("cubic_feature_sampling_grad", &cubic_feature_sampling_grad); 27 | 28 | m.def("gridding", &gridding); 29 | m.def("gridding_grad", &gridding_grad); 30 | } 31 | -------------------------------------------------------------------------------- /cuda/src/gridding.cpp: -------------------------------------------------------------------------------- 1 | #include "gridding.h" 2 | #include "utils.h" 3 | 4 | std::vector gridding(float min_x, float max_x, float min_y, float max_y, float min_z, 5 | float max_z, torch::Tensor ptcloud) 6 | { 7 | CHECK_CUDA(ptcloud); 8 | CHECK_CONTIGUOUS(ptcloud); 9 | 10 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 11 | return gridding_kernel_warpper(min_x, max_x, min_y, max_y, min_z, max_z, ptcloud, stream); 12 | } 13 | 14 | torch::Tensor gridding_grad(torch::Tensor grid_pt_weights, torch::Tensor grid_pt_indexes, 15 | torch::Tensor grad_grid) 16 | { 17 | CHECK_CUDA(grid_pt_weights); 18 | CHECK_CONTIGUOUS(grid_pt_weights); 19 | CHECK_CUDA(grid_pt_indexes); 20 | CHECK_CONTIGUOUS(grid_pt_indexes); 21 | CHECK_CUDA(grad_grid); 22 | CHECK_CONTIGUOUS(grad_grid); 23 | 24 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 25 | return gridding_grad_kernel_warpper(grid_pt_weights, grid_pt_indexes, grad_grid, stream); 26 | } 27 | -------------------------------------------------------------------------------- /cuda/src/sampling.cpp: -------------------------------------------------------------------------------- 1 | #include "sampling.h" 2 | #include "compat.h" 3 | #include "utils.h" 4 | 5 | void furthest_point_sampling_kernel_wrapper(int b, int n, int m, const float* dataset, float* temp, 6 | int* idxs); 7 | 8 | at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) 9 | { 10 | CHECK_CONTIGUOUS(points); 11 | CHECK_IS_FLOAT(points); 12 | CHECK_CUDA(points); 13 | 14 | at::Tensor output = torch::zeros({points.size(0), nsamples}, 15 | at::device(points.device()).dtype(at::ScalarType::Int)); 16 | 17 | at::Tensor tmp = torch::full({points.size(0), points.size(1)}, 1e10, 18 | at::device(points.device()).dtype(at::ScalarType::Float)); 19 | 20 | furthest_point_sampling_kernel_wrapper(points.size(0), points.size(1), nsamples, 21 | points.DATA_PTR(), tmp.DATA_PTR(), 22 | output.DATA_PTR()); 23 | 24 | return output; 25 | } 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Nicolas Chaulet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit 
persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu111-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install NVIDIA drivers, see: 4 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 5 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 6 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 7 | 8 | export CUDA_SHORT=11.1 9 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.1/local_installers 10 | export CUDA_FILE=cuda_${CUDA_SHORT}.1_456.81_win10.exe 11 | 12 | # Install CUDA: 13 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 14 | echo "" 15 | echo "Installing from ${CUDA_FILE}..." 16 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 17 | echo "Done!" 18 | rm -f "${CUDA_FILE}" 19 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu101-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install NVIDIA drivers, see: 4 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 5 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 6 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 7 | 8 | export CUDA_SHORT=10.1 9 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers/ 10 | export CUDA_FILE=cuda_${CUDA_SHORT}.243_426.00_win10.exe 11 | 12 | # Install CUDA: 13 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 14 | echo "" 15 | echo "Installing from ${CUDA_FILE}..." 16 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 17 | echo "Done!" 
18 | rm -f "${CUDA_FILE}" 19 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu102-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install NVIDIA drivers, see: 4 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 5 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 6 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 7 | 8 | export CUDA_SHORT=10.2 9 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers 10 | export CUDA_FILE=cuda_${CUDA_SHORT}.89_441.22_win10.exe 11 | 12 | # Install CUDA: 13 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 14 | echo "" 15 | echo "Installing from ${CUDA_FILE}..." 16 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 17 | echo "Done!" 18 | rm -f "${CUDA_FILE}" 19 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu113-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install NVIDIA drivers, see: 4 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 5 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 6 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 7 | 8 | export CUDA_SHORT=11.3 9 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers 10 | export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe 11 | 12 | # Install CUDA: 13 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 14 | echo "" 15 | echo "Installing from ${CUDA_FILE}..." 16 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 17 | echo "Done!" 
18 | rm -f "${CUDA_FILE}" 19 | -------------------------------------------------------------------------------- /cpu/src/fps.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "compat.h" 4 | #include "utils.h" 5 | 6 | at::Tensor get_dist(at::Tensor x, ptrdiff_t index) 7 | { 8 | return (x - x[index]).norm(2, 1); 9 | } 10 | 11 | at::Tensor fps(at::Tensor points, const int nsamples, bool random) 12 | { 13 | CHECK_CONTIGUOUS(points); 14 | 15 | auto out_options = torch::TensorOptions().dtype(torch::kLong).device(torch::kCPU); 16 | auto batch_size = points.size(0); 17 | auto out = torch::empty({batch_size, nsamples}, out_options); 18 | auto out_a = out.accessor(); 19 | 20 | for (ptrdiff_t b = 0; b < batch_size; b++) 21 | { 22 | auto y = points[b]; 23 | ptrdiff_t start = 0; 24 | if (random) 25 | start = at::randperm(y.size(0), out_options).DATA_PTR()[0]; 26 | 27 | out_a[b][0] = start; 28 | auto dist = get_dist(y, start); 29 | for (ptrdiff_t i = 1; i < nsamples; i++) 30 | { 31 | ptrdiff_t argmax = dist.argmax().DATA_PTR()[0]; 32 | out_a[b][i] = argmax; 33 | dist = at::min(dist, get_dist(y, argmax)); 34 | } 35 | } 36 | return out; 37 | } 38 | -------------------------------------------------------------------------------- /conda/torch-points-kernels/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: torch-points-kernels 3 | version: 0.7.1 4 | 5 | source: 6 | path: ../.. 7 | 8 | requirements: 9 | build: 10 | - {{ compiler('c') }} # [win] 11 | 12 | host: 13 | - pip 14 | - python {{ environ.get('PYTHON_VERSION') }} 15 | - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} 16 | - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }} 17 | 18 | run: 19 | - python {{ environ.get('PYTHON_VERSION') }} 20 | - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} 21 | - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }} 22 | - numpy 23 | - numba 24 | - scikit-learn 25 | 26 | build: 27 | string: py{{ environ.get('PYTHON_VERSION').replace('.', '') }}_torch_{{ environ['TORCH_VERSION'] }}_{{ environ['CUDA_VERSION'] }} 28 | script_env: 29 | - FORCE_CUDA 30 | - TORCH_CUDA_ARCH_LIST 31 | preserve_egg_dir: True 32 | 33 | test: 34 | source_files: 35 | - test 36 | 37 | about: 38 | home: https://github.com/torch-points3d/torch-points-kernels 39 | license: MIT 40 | summary: Pytorch CPU and CUDA kernels for spatial search and interpolation for 3D point clouds. 
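# Worked example (not part of the original recipe): with the invocation shown in
# conda/torch-points-kernels/README.md, ./build_conda.sh 3.9 1.9.0 cu111, the
# build string template above resolves to "py39_torch_1.9.0_cu111".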
41 | -------------------------------------------------------------------------------- /benchmark/region_cluster.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import os 4 | import sys 5 | import time 6 | import random 7 | 8 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 9 | sys.path.insert(0, ROOT) 10 | 11 | from torch_points_kernels.cluster import grow_proximity 12 | 13 | torch.manual_seed(0) 14 | 15 | num_points = 100000 16 | pos1 = torch.rand((num_points, 3)) 17 | pos2 = torch.rand((num_points, 3)) + 2 18 | pos3 = torch.rand((num_points, 3)) + 4 19 | labels1 = torch.ones(num_points).long() 20 | labels2 = torch.ones(num_points).long() 21 | labels3 = torch.ones(num_points).long() 22 | pos = torch.cat([pos1, pos2, pos3], 0) 23 | label = torch.cat([labels1, labels2, labels3], 0) 24 | batch = torch.ones((3 * num_points)).long() 25 | cl = grow_proximity(pos, batch, radius=0.5) 26 | 27 | 28 | import cProfile, pstats, io 29 | from pstats import SortKey 30 | 31 | pr = cProfile.Profile() 32 | pr.enable() 33 | t_start = time.perf_counter() 34 | grow_proximity(pos, batch) 35 | print(time.perf_counter() - t_start) 36 | pr.disable() 37 | s = io.StringIO() 38 | sortby = SortKey.CUMULATIVE 39 | ps = pstats.Stats(pr, stream=s).sort_stats(sortby) 40 | ps.print_stats() 41 | print(s.getvalue()) 42 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.101.1/containers/python-3 3 | { 4 | "name": "Python 3", 5 | "context": "..", 6 | "dockerFile": "Dockerfile", 7 | // Set *default* container specific settings.json values on container create. 8 | "settings": { 9 | "terminal.integrated.shell.linux": "/bin/bash", 10 | "python.pythonPath": "/usr/local/bin/python", 11 | "python.linting.enabled": true, 12 | "python.linting.pylintEnabled": true, 13 | "python.linting.pylintPath": "/usr/local/bin/pylint" 14 | }, 15 | // Add the IDs of extensions you want installed when the container is created. 16 | "extensions": [ 17 | "ms-python.python", 18 | "ms-vscode.cpptools" 19 | ] 20 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 21 | // "forwardPorts": [], 22 | // Use 'postCreateCommand' to run commands after the container is created. 23 | // "postCreateCommand": "pip install -r requirements.txt", 24 | // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root. 
25 | // "remoteUser": "vscode" 26 | } 27 | -------------------------------------------------------------------------------- /.github/workflows/cuda/cu116-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # TODO We currently use CUDA 11.3 to build CUDA 11.5 Windows wheels 4 | 5 | # Install NVIDIA drivers, see: 6 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 7 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 8 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 9 | 10 | export CUDA_SHORT=11.3 11 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers 12 | export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe 13 | 14 | # Install CUDA: 15 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 16 | echo "" 17 | echo "Installing from ${CUDA_FILE}..." 18 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 19 | echo "Done!" 20 | rm -f "${CUDA_FILE}" -------------------------------------------------------------------------------- /.github/workflows/cuda/cu115-Windows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # TODO We currently use CUDA 11.3 to build CUDA 11.5 Windows wheels 4 | 5 | # Install NVIDIA drivers, see: 6 | # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 7 | curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" 8 | 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" 9 | 10 | export CUDA_SHORT=11.3 11 | export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers 12 | export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe 13 | 14 | # Install CUDA: 15 | curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" 16 | echo "" 17 | echo "Installing from ${CUDA_FILE}..." 18 | PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" 19 | echo "Done!" 
20 | rm -f "${CUDA_FILE}" 21 | -------------------------------------------------------------------------------- /cuda/include/cubic_feature_sampling.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | std::vector cubic_feature_sampling(int scale, int neighborhood_size, 7 | torch::Tensor ptcloud, 8 | torch::Tensor cubic_features); 9 | 10 | std::vector cubic_feature_sampling_grad(int scale, int neighborhood_size, 11 | torch::Tensor grad_point_features, 12 | torch::Tensor grid_pt_indexes); 13 | 14 | std::vector cubic_feature_sampling_kernel_wrapper(int scale, int neighborhood_size, 15 | torch::Tensor ptcloud, 16 | torch::Tensor cubic_features, 17 | cudaStream_t stream); 18 | 19 | std::vector 20 | cubic_feature_sampling_grad_kernel_wrapper(int scale, int neighborhood_size, 21 | torch::Tensor grad_point_features, 22 | torch::Tensor grid_pt_indexes, cudaStream_t stream); 23 | -------------------------------------------------------------------------------- /conda/torch-points-kernels/build_conda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHON_VERSION=$1 4 | export TORCH_VERSION=$2 5 | export CUDA_VERSION=$3 6 | 7 | export CONDA_PYTORCH_CONSTRAINT="pytorch==${TORCH_VERSION%.*}.*" 8 | 9 | if [ "${CUDA_VERSION}" = "cpu" ]; then 10 | export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly # [not osx]" 11 | else 12 | case $CUDA_VERSION in 13 | cu116) 14 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.6.*" 15 | ;; 16 | cu115) 17 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.5.*" 18 | ;; 19 | cu113) 20 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.3.*" 21 | ;; 22 | cu111) 23 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.1.*" 24 | ;; 25 | cu102) 26 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.2.*" 27 | ;; 28 | cu101) 29 | export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.1.*" 30 | ;; 31 | *) 32 | echo "Unrecognized CUDA_VERSION=$CUDA_VERSION" 33 | exit 1 34 | ;; 35 | esac 36 | fi 37 | 38 | echo "PyTorch $TORCH_VERSION+$CUDA_VERSION" 39 | echo "- $CONDA_PYTORCH_CONSTRAINT" 40 | echo "- $CONDA_CUDATOOLKIT_CONSTRAINT" 41 | 42 | if [ "${CUDA_VERSION}" = "cu116" ]; then 43 | conda build . -c pytorch -c default -c nvidia -c conda-forge --output-folder "$HOME/conda-bld" 44 | else 45 | conda build . 
-c pytorch -c default -c nvidia --output-folder "$HOME/conda-bld" 46 | fi -------------------------------------------------------------------------------- /cuda/src/cubic_feature_sampling.cpp: -------------------------------------------------------------------------------- 1 | #include "cubic_feature_sampling.h" 2 | #include "utils.h" 3 | 4 | std::vector cubic_feature_sampling(int scale, int neighborhood_size, 5 | torch::Tensor ptcloud, 6 | torch::Tensor cubic_features) 7 | { 8 | CHECK_CUDA(ptcloud); 9 | CHECK_CONTIGUOUS(ptcloud); 10 | CHECK_CUDA(cubic_features); 11 | CHECK_CONTIGUOUS(cubic_features); 12 | 13 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 14 | return cubic_feature_sampling_kernel_wrapper(scale, neighborhood_size, ptcloud, cubic_features, 15 | stream); 16 | } 17 | 18 | std::vector cubic_feature_sampling_grad(int scale, int neighborhood_size, 19 | torch::Tensor grad_point_features, 20 | torch::Tensor grid_pt_indexes) 21 | { 22 | CHECK_CUDA(grad_point_features); 23 | CHECK_CONTIGUOUS(grad_point_features); 24 | CHECK_CUDA(grid_pt_indexes); 25 | CHECK_CONTIGUOUS(grid_pt_indexes); 26 | 27 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 28 | return cubic_feature_sampling_grad_kernel_wrapper(scale, neighborhood_size, grad_point_features, 29 | grid_pt_indexes, stream); 30 | } 31 | -------------------------------------------------------------------------------- /test/test_chamfer_dist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import torch 5 | import unittest 6 | 7 | from torch.autograd import gradcheck 8 | 9 | from . import run_if_cuda 10 | 11 | 12 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 13 | sys.path.insert(0, ROOT) 14 | 15 | from torch_points_kernels.chamfer_dist import ChamferFunction, chamfer_dist 16 | 17 | 18 | class TestChamferDistance(unittest.TestCase): 19 | @run_if_cuda 20 | def test_chamfer_dist_grad(self): 21 | x = torch.rand(4, 64, 3).double() 22 | y = torch.rand(4, 128, 3).double() 23 | x.requires_grad = True 24 | y.requires_grad = True 25 | test = gradcheck(ChamferFunction.apply, [x.cuda(), y.cuda()]) 26 | 27 | @run_if_cuda 28 | def test_chamfer_dist(self): 29 | xyz1 = torch.from_numpy(np.array([[[0, 0, 0], [1, 1, 1], [2, 0, 1]]])).float() 30 | xyz2 = torch.from_numpy(np.array([[[1, 0, 0], [1, 2, 1]]])).float() 31 | dist = chamfer_dist(xyz1.cuda(), xyz2.cuda()) 32 | self.assertAlmostEqual(dist.item(), 2.333333, places=5) 33 | 34 | @run_if_cuda 35 | def test_chamfer_dist_ignore_zeros(self): 36 | xyz1 = torch.from_numpy(np.array([[[0, 0, 0], [1, 1, 1], [2, 0, 1]]])).float() 37 | xyz2 = torch.from_numpy(np.array([[[1, 0, 0], [1, 2, 1]]])).float() 38 | dist = chamfer_dist(xyz1.cuda(), xyz2.cuda(), True) 39 | self.assertAlmostEqual(dist.item(), 3.0, places=5) 40 | 41 | 42 | if __name__ == "__main__": 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /test/test_knn.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import os 4 | import sys 5 | 6 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 7 | sys.path.insert(0, ROOT) 8 | 9 | from torch_points_kernels import three_nn, knn 10 | from . 
import run_if_cuda 11 | 12 | 13 | class TestKnn(unittest.TestCase): 14 | def test_cpu(self): 15 | support = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]]]).float() 16 | query = torch.tensor([[[0, 0, 0]]]).float() 17 | 18 | idx, dist = knn(support, query, 3) 19 | torch.testing.assert_allclose(idx, torch.tensor([[[0, 1, 2]]])) 20 | torch.testing.assert_allclose(dist, torch.tensor([[[0.0, 1.0, 4.0]]])) 21 | 22 | idx, dist = knn(support, query, 2) 23 | torch.testing.assert_allclose(idx, torch.tensor([[[0, 1]]])) 24 | 25 | with self.assertRaises(RuntimeError): 26 | knn(support, query, 5) 27 | 28 | def test_larger_cpu(self): 29 | support = torch.randn((2, 10, 3)) 30 | query = torch.randn((2, 10, 3)) 31 | 32 | idx, dist = knn(support, query, 3) 33 | 34 | 35 | class TestThreeNN(unittest.TestCase): 36 | @run_if_cuda 37 | def test_cpugpu(self): 38 | b = 20 39 | n = 1000 40 | known = torch.randn((b, 2 * n, 3)) 41 | unknown = torch.randn((b, n, 3)) 42 | 43 | dist_cpu, cpu_idx = three_nn(unknown, known) 44 | dist_cuda, cuda_idx = three_nn(unknown.cuda(), known.cuda()) 45 | 46 | torch.testing.assert_allclose(dist_cpu, dist_cuda.cpu()) 47 | 48 | 49 | if __name__ == "__main__": 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | tags: 4 | - 'v*' 5 | 6 | name: Deploy 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 3.6 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: 3.6 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install torch "numpy<=1.21" scikit-learn flake8 setuptools wheel twine numba 21 | - name: Build package 22 | run: | 23 | python setup.py build_ext --inplace 24 | - name: Lint with flake8 25 | run: | 26 | # stop the build if there are Python syntax errors or undefined names 27 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 28 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 29 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 30 | - name: Test with unittest 31 | run: | 32 | python -m unittest -v 33 | - name: Build package 34 | run: | 35 | python setup.py sdist 36 | - name: Publish package 37 | uses: pypa/gh-action-pypi-publish@master 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.PYPI_PASSWORD }} 41 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.7.1 2 | ## Additions 3 | - Add support for anaconda installation 4 | 5 | # 0.7.0 6 | ## Change 7 | - Added some extra compilation flags: FORCE_CUDA=1 for CUDA install and FORCE_ONLY_CPU=1 for CPU only install. 
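  For example (an illustrative sketch, not part of the original entry; assumes the package is installed from PyPI under the name `torch-points-kernels`):
  ```
  FORCE_CUDA=1 pip install torch-points-kernels      # build the CUDA kernels even if no GPU is visible at install time
  FORCE_ONLY_CPU=1 pip install torch-points-kernels  # skip the CUDA kernels even if a CUDA toolkit is present
  ```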
8 | 9 | # 0.6.9 10 | ## Additions 11 | 12 | - Cubic feature sampling kernel as proposed in https://arxiv.org/pdf/2006.03761 13 | 14 | # 0.6.8 15 | ## Bug fix 16 | - Comilation with older GPU architecture 17 | 18 | 19 | # 0.6.7 20 | ## Additions 21 | - Chamfer distance introduced in https://arxiv.org/pdf/1612.00603 for dense batches 22 | 23 | # 0.6.6 24 | ## Additions 25 | - Windows support 26 | 27 | 28 | ## Change 29 | - Develop with python 3.7 30 | 31 | ## Bug fix 32 | - Fixed bug in region growing related to batching 33 | - Ball query for partial dense data on GPU was returning only the first point. Fixed now 34 | 35 | 36 | # 0.6.5 37 | 38 | ## Additions 39 | - Clustering algorithm for [PointGroup](https://arxiv.org/pdf/2004.01658.pdf) 40 | - Instance IoU computation on CPU and GPU 41 | 42 | ## Change 43 | - Force no ninja for the compilation 44 | 45 | # 0.6.4 46 | 47 | ## Bug fix 48 | - CPU version works for MacOS 49 | 50 | # 0.6.2 51 | 52 | ## Bug fix 53 | - Fix install with pip > 19 54 | 55 | # 0.6.1 56 | 57 | ## Bug fix 58 | - Random memory access on cpu radius search in the degree function 59 | 60 | # 0.6.0 61 | 62 | ## Bug fix 63 | - Require pytorch implicitely and log nice message when missing 64 | 65 | # 0.5.3 66 | 67 | ## Update 68 | - ball query returns squared distance instead of distance 69 | - leaner Point Cloud struct that avoids copying data 70 | 71 | ## Bug fix 72 | - Package would not install if pytorch is not already installed 73 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 4 | #------------------------------------------------------------------------------------------------------------- 5 | 6 | FROM ubuntu:bionic 7 | 8 | # Avoid warnings by switching to noninteractive 9 | ENV DEBIAN_FRONTEND=noninteractive 10 | 11 | # This Dockerfile adds a non-root user with sudo access. Use the "remoteUser" 12 | # property in devcontainer.json to use it. On Linux, the container user's GID/UIDs 13 | # will be updated to match your local UID/GID (when using the dockerFile property). 14 | # See https://aka.ms/vscode-remote/containers/non-root-user for details. 15 | ARG USERNAME=vscode 16 | ARG USER_UID=1000 17 | ARG USER_GID=$USER_UID 18 | 19 | # Uncomment the following COPY line and the corresponding lines in the `RUN` command if you wish to 20 | # include your requirements in the image itself. It is suggested that you only do this if your 21 | # requirements rarely (if ever) change. 
22 | 23 | RUN apt-get update \ 24 | && apt-get install -y --fix-missing --no-install-recommends\ 25 | libffi-dev libssl-dev build-essential \ 26 | python3-pip python3-dev python3-venv python3-setuptools\ 27 | git iproute2 procps lsb-release clang-format \ 28 | && apt-get clean \ 29 | && rm -rf /var/lib/apt/lists/* 30 | 31 | RUN pip3 install -U pip 32 | RUN pip3 install torch numpy scikit-learn flake8 setuptools numba 33 | RUN pip3 install torch_cluster torch_sparse torch_scatter torch_geometric 34 | -------------------------------------------------------------------------------- /test/test_interpolate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | from torch.autograd import gradcheck 4 | from torch_points_kernels import three_interpolate, three_nn 5 | 6 | from . import run_if_cuda 7 | 8 | 9 | class TestInterpolate(unittest.TestCase): 10 | @run_if_cuda 11 | def test_gpu(self): 12 | pos = torch.randn([16, 100, 3]).cuda() 13 | pos_skip = torch.randn([16, 500, 3]).cuda() 14 | x = torch.randn([16, 30, 100], requires_grad=True).cuda() 15 | 16 | dist, idx = three_nn(pos_skip, pos) 17 | dist_recip = 1.0 / (dist + 1e-8) 18 | norm = torch.sum(dist_recip, dim=2, keepdim=True) 19 | weight = dist_recip / norm 20 | interpolated_feats = three_interpolate(x, idx, weight) 21 | 22 | dist, idx = three_nn(pos_skip.cpu(), pos.cpu()) 23 | dist_recip = 1.0 / (dist + 1e-8) 24 | norm = torch.sum(dist_recip, dim=2, keepdim=True) 25 | weight = dist_recip / norm 26 | interpolated_feats_cpu = three_interpolate(x.cpu(), idx, weight) 27 | 28 | torch.testing.assert_allclose(interpolated_feats_cpu, interpolated_feats.cpu()) 29 | 30 | def test_grad(self): 31 | b, n, k = (2, 10, 3) 32 | pos = torch.randn([b, n, k]).double() 33 | pos_skip = torch.randn([b, 2 * n, k]).double() 34 | x = torch.randn([b, 30, n], requires_grad=True).double() 35 | dist, idx = three_nn(pos_skip, pos) 36 | dist_recip = 1.0 / (dist + 1e-8) 37 | norm = torch.sum(dist_recip, dim=2, keepdim=True) 38 | weight = dist_recip / norm 39 | input = (x, idx, weight.double()) 40 | test = gradcheck(three_interpolate, input, eps=1e-6, atol=1e-4) 41 | 42 | 43 | if __name__ == "__main__": 44 | unittest.main() 45 | -------------------------------------------------------------------------------- /cuda/include/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | #define CHECK_CUDA(x) \ 6 | do \ 7 | { \ 8 | TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor"); \ 9 | } while (0) 10 | 11 | #define CHECK_CONTIGUOUS(x) \ 12 | do \ 13 | { \ 14 | TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor"); \ 15 | } while (0) 16 | 17 | #define CHECK_IS_INT(x) \ 18 | do \ 19 | { \ 20 | TORCH_CHECK(isIntegralType(x.scalar_type(), false), #x " must be an int tensor"); \ 21 | } while (0) 22 | 23 | #define CHECK_IS_FLOAT(x) \ 24 | do \ 25 | { \ 26 | TORCH_CHECK(isFloatingType(x.scalar_type()), #x " must be a float tensor"); \ 27 | } while (0) 28 | -------------------------------------------------------------------------------- /test/test_grouping.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import numpy as np 4 | import numpy.testing as npt 5 | from torch_points_kernels import grouping_operation 6 | 7 | 8 | class TestGroup(unittest.TestCase): 9 | 10 | # input: points(b, c, n) idx(b, npoints, nsample) 11 | # output: out(b, c, 
npoints, nsample) 12 | def test_simple(self): 13 | features = torch.tensor( 14 | [ 15 | [[0, 10, 0], [1, 11, 0], [2, 12, 0]], 16 | [ 17 | [100, 110, 120], 18 | [101, 111, 121], 19 | [102, 112, 122], 20 | ], # x-coordinates # y-coordinates # z-coordinates 21 | ] 22 | ).type(torch.float) 23 | idx = torch.tensor([[[1, 0], [0, 0]], [[0, 1], [1, 2]]]).type(torch.long) 24 | 25 | expected = np.array( 26 | [ 27 | [[[10, 0], [0, 0]], [[11, 1], [1, 1]], [[12, 2], [2, 2]]], 28 | [ # 2nd batch 29 | [ # x-coordinates 30 | [100, 110], # x-coordinates of samples for point 0 31 | [110, 120], # x-coordinates of samples for point 1 32 | ], 33 | [[101, 111], [111, 121]], # y-coordinates 34 | [[102, 112], [112, 122]], # z-coordinates 35 | ], 36 | ] 37 | ) 38 | 39 | cpu_output = grouping_operation(features, idx).detach().cpu().numpy() 40 | 41 | npt.assert_array_equal(expected, cpu_output) 42 | 43 | if torch.cuda.is_available(): 44 | npt.assert_array_equal( 45 | grouping_operation(features.cuda(), idx.cuda()).detach().cpu().numpy(), 46 | expected, 47 | ) 48 | 49 | 50 | if __name__ == "__main__": 51 | unittest.main() 52 | -------------------------------------------------------------------------------- /test/test_cluster.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import os 4 | import sys 5 | 6 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 7 | sys.path.insert(0, ROOT) 8 | 9 | from torch_points_kernels.cluster import grow_proximity, region_grow 10 | 11 | 12 | class TestGrow(unittest.TestCase): 13 | def setUp(self): 14 | self.pos = torch.tensor( 15 | [ 16 | [0, 0, 0], 17 | [1, 0, 0], 18 | [2, 0, 0], 19 | [10, 0, 0], 20 | [0, 0, 0], 21 | [1, 0, 0], 22 | [2, 0, 0], 23 | [10, 0, 0], 24 | ] 25 | ) 26 | self.batch = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]) 27 | self.labels = torch.tensor([0, 0, 1, 1, 0, 1, 1, 10]) 28 | 29 | def test_simple(self): 30 | clusters = grow_proximity(self.pos, self.batch, radius=2, min_cluster_size=1) 31 | self.assertEqual(clusters, [[0, 1, 2], [3], [4, 5, 6], [7]]) 32 | 33 | clusters = grow_proximity(self.pos, self.batch, radius=2, min_cluster_size=3) 34 | self.assertEqual(clusters, [[0, 1, 2], [4, 5, 6]]) 35 | 36 | def test_region_grow(self): 37 | cluster_idx = region_grow(self.pos, self.labels, self.batch, radius=2, min_cluster_size=1) 38 | self.assertEqual(len(cluster_idx), 6) 39 | torch.testing.assert_allclose(cluster_idx[0], torch.tensor([0, 1])) 40 | torch.testing.assert_allclose(cluster_idx[1], torch.tensor([4])) 41 | torch.testing.assert_allclose(cluster_idx[2], torch.tensor([2])) 42 | torch.testing.assert_allclose(cluster_idx[3], torch.tensor([3])) 43 | torch.testing.assert_allclose(cluster_idx[4], torch.tensor([5, 6])) 44 | torch.testing.assert_allclose(cluster_idx[5], torch.tensor([7])) 45 | 46 | 47 | if __name__ == "__main__": 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Unittests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - master 8 | 9 | jobs: 10 | unittests: 11 | strategy: 12 | matrix: 13 | os: [ubuntu-latest, macos-latest, windows-latest] 14 | python-version: [3.7, 3.8] 15 | torch-version: [1.10.0, 1.11.0, 1.12.0] 16 | runs-on: ${{ matrix.os }} 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: 
actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install "numpy<=1.21" scikit-learn flake8 setuptools numba 27 | 28 | - name: Install torch ${{ matrix.torch-version }} windows + linux 29 | if: ${{matrix.os != 'macos-latest'}} 30 | run: pip install torch==${{ matrix.torch-version }}+cpu -f https://download.pytorch.org/whl/torch_stable.html 31 | - name: Install torch ${{ matrix.torch-version }} macos 32 | if: ${{matrix.os == 'macos-latest'}} 33 | run: pip install torch==${{ matrix.torch-version }} 34 | 35 | - name: Build package 36 | run: | 37 | python setup.py build_ext --inplace 38 | - name: Lint with flake8 39 | run: | 40 | # stop the build if there are Python syntax errors or undefined names 41 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 42 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 43 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 44 | - name: Test with unittest 45 | run: | 46 | python -m unittest -v 47 | -------------------------------------------------------------------------------- /test/test_cubic_feature_sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import torch 5 | import unittest 6 | 7 | from torch.autograd import gradcheck 8 | 9 | from . import run_if_cuda 10 | 11 | 12 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 13 | sys.path.insert(0, ROOT) 14 | 15 | from torch_points_kernels.cubic_feature_sampling import CubicFeatureSamplingFunction, cubic_feature_sampling 16 | 17 | 18 | class TestCubicFeatureSampling(unittest.TestCase): 19 | @run_if_cuda 20 | def test_neighborhood_size_1(self): 21 | ptcloud = torch.rand(2, 64, 3) * 2 - 1 22 | cubic_features = torch.rand(2, 4, 8, 8, 8) 23 | ptcloud.requires_grad = True 24 | cubic_features.requires_grad = True 25 | self.assertTrue( 26 | gradcheck( 27 | CubicFeatureSamplingFunction.apply, 28 | [ptcloud.double().cuda(), cubic_features.double().cuda()], 29 | ) 30 | ) 31 | 32 | @run_if_cuda 33 | def test_neighborhood_size_2(self): 34 | ptcloud = torch.rand(2, 32, 3) * 2 - 1 35 | cubic_features = torch.rand(2, 2, 8, 8, 8) 36 | ptcloud.requires_grad = True 37 | cubic_features.requires_grad = True 38 | self.assertTrue( 39 | gradcheck( 40 | CubicFeatureSamplingFunction.apply, 41 | [ptcloud.double().cuda(), cubic_features.double().cuda(), 2], 42 | ) 43 | ) 44 | 45 | @run_if_cuda 46 | def test_neighborhood_size_3(self): 47 | ptcloud = torch.rand(1, 32, 3) * 2 - 1 48 | cubic_features = torch.rand(1, 2, 16, 16, 16) 49 | ptcloud.requires_grad = True 50 | cubic_features.requires_grad = True 51 | self.assertTrue( 52 | gradcheck( 53 | CubicFeatureSamplingFunction.apply, 54 | [ptcloud.double().cuda(), cubic_features.double().cuda(), 3], 55 | ) 56 | ) 57 | 58 | 59 | if __name__ == "__main__": 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /torch_points_kernels/chamfer_dist.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | if torch.cuda.is_available(): 4 | import torch_points_kernels.points_cuda as tpcuda 5 | 6 | 7 | class ChamferFunction(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, xyz1, xyz2): 10 | if not torch.cuda.is_available(): 11 | raise 
NotImplementedError("CPU version is not available for Chamfer Distance") 12 | 13 | dist1, dist2, idx1, idx2 = tpcuda.chamfer_dist(xyz1, xyz2) 14 | ctx.save_for_backward(xyz1, xyz2, idx1, idx2) 15 | 16 | return dist1, dist2 17 | 18 | @staticmethod 19 | def backward(ctx, grad_dist1, grad_dist2): 20 | xyz1, xyz2, idx1, idx2 = ctx.saved_tensors 21 | grad_xyz1, grad_xyz2 = tpcuda.chamfer_dist_grad(xyz1, xyz2, idx1, idx2, grad_dist1, grad_dist2) 22 | return grad_xyz1, grad_xyz2 23 | 24 | 25 | def chamfer_dist(xyz1, xyz2, ignore_zeros=False): 26 | r""" 27 | Calcuates the distance between B pairs of point clouds 28 | 29 | Parameters 30 | ---------- 31 | xyz1 : torch.Tensor (dtype=torch.float32) 32 | (B, n1, 3) B point clouds containing n1 points 33 | xyz2 : torch.Tensor (dtype=torch.float32) 34 | (B, n2, 3) B point clouds containing n2 points 35 | ignore_zeros : bool 36 | ignore the point whose coordinate is (0, 0, 0) or not 37 | 38 | Returns 39 | ------- 40 | dist: torch.Tensor 41 | (B, ): the distances between B pairs of point clouds 42 | """ 43 | if len(xyz1.shape) != 3 or xyz1.size(2) != 3 or len(xyz2.shape) != 3 or xyz2.size(2) != 3: 44 | raise ValueError("The input point cloud should be of size (B, n_pts, 3)") 45 | 46 | batch_size = xyz1.size(0) 47 | if batch_size == 1 and ignore_zeros: 48 | non_zeros1 = torch.sum(xyz1, dim=2).ne(0) 49 | non_zeros2 = torch.sum(xyz2, dim=2).ne(0) 50 | xyz1 = xyz1[non_zeros1].unsqueeze(dim=0) 51 | xyz2 = xyz2[non_zeros2].unsqueeze(dim=0) 52 | 53 | dist1, dist2 = ChamferFunction.apply(xyz1, xyz2) 54 | return torch.mean(dist1) + torch.mean(dist2) 55 | -------------------------------------------------------------------------------- /test/speed_radius.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import sys 4 | import numpy.testing as npt 5 | import numpy as np 6 | from sklearn.neighbors import KDTree 7 | import unittest 8 | import time 9 | 10 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 11 | sys.path.insert(0, ROOT) 12 | 13 | from torch_points_kernels import ball_query 14 | 15 | 16 | class TestRadiusSpeed(unittest.TestCase): 17 | def test_speed(self): 18 | start = time.time() 19 | a = torch.randn(50000, 3).to(torch.float) 20 | b = torch.randn(10000, 3).to(torch.float) 21 | batch_a = torch.tensor([0 for i in range(a.shape[0] // 2)] + [1 for i in range(a.shape[0] // 2, a.shape[0])]) 22 | batch_b = torch.tensor([0 for i in range(b.shape[0] // 2)] + [1 for i in range(b.shape[0] // 2, b.shape[0])]) 23 | R = 1 24 | samples = 50 25 | 26 | idx, dist = ball_query( 27 | R, 28 | samples, 29 | a, 30 | b, 31 | mode="PARTIAL_DENSE", 32 | batch_x=batch_a, 33 | batch_y=batch_b, 34 | sort=True, 35 | ) 36 | idx1, dist = ball_query( 37 | R, 38 | samples, 39 | a, 40 | b, 41 | mode="PARTIAL_DENSE", 42 | batch_x=batch_a, 43 | batch_y=batch_b, 44 | sort=True, 45 | ) 46 | print(time.time() - start) 47 | torch.testing.assert_allclose(idx1, idx) 48 | 49 | self.assertEqual(idx.shape[0], b.shape[0]) 50 | self.assertEqual(dist.shape[0], b.shape[0]) 51 | self.assertLessEqual(idx.max().item(), len(batch_a)) 52 | 53 | # # Comparison to see if we have the same result 54 | # tree = KDTree(a.detach().numpy()) 55 | # idx3_sk = tree.query_radius(b.detach().numpy(), r=R) 56 | # i = np.random.randint(len(batch_b)) 57 | # for p in idx[i].detach().numpy(): 58 | # if p >= 0 and p < len(batch_a): 59 | # assert p in idx3_sk[i] 60 | 61 | 62 | if __name__ == "__main__": 63 | unittest.main() 64 | 
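# The benchmark above exercises the partial-dense ball query. The helper below is an
# illustrative sketch (never called by the tests) of how its output is typically consumed:
# entries of `idx` equal to -1 mark missing neighbours, and the distances that come back
# are squared (see the 0.5.3 changelog entry).
def _ball_query_usage_sketch():
    x = torch.randn(100, 3)
    y = torch.randn(20, 3)
    batch_x = torch.zeros(100, dtype=torch.long)
    batch_y = torch.zeros(20, dtype=torch.long)
    idx, sq_dist = ball_query(0.5, 8, x, y, mode="PARTIAL_DENSE", batch_x=batch_x, batch_y=batch_y)
    valid = idx != -1  # drop the -1 padding before using the neighbour indices or distances
    return idx[valid], sq_dist[valid]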
-------------------------------------------------------------------------------- /torch_points_kernels/gridding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | if torch.cuda.is_available(): 4 | import torch_points_kernels.points_cuda as tpcuda 5 | 6 | 7 | class GriddingFunction(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, ptcloud, scale): 10 | if not torch.cuda.is_available(): 11 | raise NotImplementedError("CPU version is not available for Chamfer Distance") 12 | 13 | grid, grid_pt_weights, grid_pt_indexes = tpcuda.gridding( 14 | -scale, scale - 1, -scale, scale - 1, -scale, scale - 1, ptcloud 15 | ) 16 | # print(grid.size()) # torch.Size(batch_size, n_grid_vertices) 17 | # print(grid_pt_weights.size()) # torch.Size(batch_size, n_pts, 8, 3) 18 | # print(grid_pt_indexes.size()) # torch.Size(batch_size, n_pts, 8) 19 | ctx.save_for_backward(grid_pt_weights, grid_pt_indexes) 20 | 21 | return grid 22 | 23 | @staticmethod 24 | def backward(ctx, grad_grid): 25 | grid_pt_weights, grid_pt_indexes = ctx.saved_tensors 26 | grad_ptcloud = tpcuda.gridding_grad(grid_pt_weights, grid_pt_indexes, grad_grid) 27 | # print(grad_ptcloud.size()) # torch.Size(batch_size, n_pts, 3) 28 | 29 | return grad_ptcloud, None 30 | 31 | 32 | def gridding(ptcloud, scale): 33 | r""" 34 | Converts the input point clouds into 3D grids by trilinear interpolcation. 35 | Please refer to https://arxiv.org/pdf/2006.03761 for more information 36 | 37 | Parameters 38 | ---------- 39 | ptcloud : torch.Tensor (dtype=torch.float32) 40 | (B, n_pts, 3) B point clouds containing n_pts points 41 | scale : Int 42 | the resolution of the 3D grid 43 | 44 | Returns 45 | ------- 46 | grid: torch.Tensor 47 | (B, scale, scale, scale): the grid of the resolution of scale * scale * scale 48 | """ 49 | if len(ptcloud.shape) != 3 or ptcloud.size(2) != 3: 50 | raise ValueError("The input point cloud should be of size (B, n_pts, 3)") 51 | 52 | ptcloud = ptcloud * scale 53 | _ptcloud = torch.split(ptcloud, 1, dim=0) 54 | grids = [] 55 | for p in _ptcloud: 56 | non_zeros = torch.sum(p, dim=2).ne(0) 57 | p = p[non_zeros].unsqueeze(dim=0) 58 | grids.append(GriddingFunction.apply(p, scale)) 59 | 60 | return torch.cat(grids, dim=0).contiguous() 61 | -------------------------------------------------------------------------------- /cpu/src/knn.cpp: -------------------------------------------------------------------------------- 1 | #include "compat.h" 2 | #include "neighbors.cpp" 3 | #include "neighbors.h" 4 | #include "utils.h" 5 | #include 6 | #include 7 | 8 | std::pair _single_batch_knn(at::Tensor support, at::Tensor query, int k) 9 | { 10 | CHECK_CONTIGUOUS(support); 11 | CHECK_CONTIGUOUS(query); 12 | if (support.size(0) < k) 13 | TORCH_CHECK(false, 14 | "Not enough points in support to find " + std::to_string(k) + " neighboors") 15 | std::vector neighbors_indices(query.size(0) * k, -1); 16 | std::vector neighbors_dists(query.size(0) * k, -1); 17 | 18 | auto options = torch::TensorOptions().dtype(torch::kLong).device(torch::kCPU); 19 | auto options_dist = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU); 20 | AT_DISPATCH_ALL_TYPES(query.scalar_type(), "knn", [&] { 21 | auto data_q = query.DATA_PTR(); 22 | auto data_s = support.DATA_PTR(); 23 | std::vector queries_stl = 24 | std::vector(data_q, data_q + query.size(0) * query.size(1)); 25 | std::vector supports_stl = 26 | std::vector(data_s, data_s + support.size(0) * support.size(1)); 27 | 28 | 
nanoflann_knn_neighbors(queries_stl, supports_stl, neighbors_indices, 29 | neighbors_dists, k); 30 | }); 31 | auto neighbors_dists_ptr = neighbors_dists.data(); 32 | int64_t* neighbors_indices_ptr = neighbors_indices.data(); 33 | auto out = torch::from_blob(neighbors_indices_ptr, {query.size(0), k}, options = options); 34 | auto out_dists = 35 | torch::from_blob(neighbors_dists_ptr, {query.size(0), k}, options = options_dist); 36 | 37 | return std::make_pair(out.clone(), out_dists.clone()); 38 | } 39 | 40 | std::pair dense_knn(at::Tensor support, at::Tensor query, int k) 41 | { 42 | CHECK_CONTIGUOUS(support); 43 | CHECK_CONTIGUOUS(query); 44 | CHECK_CPU(query); 45 | CHECK_CPU(support); 46 | 47 | int b = query.size(0); 48 | vector batch_idx; 49 | vector batch_dist; 50 | for (int i = 0; i < b; i++) 51 | { 52 | auto out_pair = _single_batch_knn(support[i], query[i], k); 53 | batch_idx.push_back(out_pair.first); 54 | batch_dist.push_back(out_pair.second); 55 | } 56 | auto out_idx = torch::stack(batch_idx); 57 | auto out_dist = torch::stack(batch_dist); 58 | return std::make_pair(out_idx, out_dist); 59 | } 60 | -------------------------------------------------------------------------------- /cuda/src/metrics.cpp: -------------------------------------------------------------------------------- 1 | #include "metrics.h" 2 | #include "compat.h" 3 | #include "utils.h" 4 | 5 | void instance_iou_kernel_wrapper(int64_t total_gt_instances, int64_t max_gt_instances, 6 | const int64_t* nInstance, int nProposal, 7 | const int64_t* proposals_idx, const int64_t* proposals_offset, 8 | const int64_t* instance_labels, 9 | const int64_t* offset_num_gt_instances, const int64_t* batch, 10 | const int64_t* instance_pointnum, float* proposals_iou); 11 | 12 | at::Tensor instance_iou_cuda(at::Tensor instance_idx, at::Tensor instance_offsets, 13 | at::Tensor gt_instances, at::Tensor gt_instance_sizes, 14 | at::Tensor num_gt_instances, at::Tensor batch) 15 | { 16 | CHECK_CONTIGUOUS(instance_idx); 17 | CHECK_CONTIGUOUS(instance_offsets); 18 | CHECK_CONTIGUOUS(gt_instances); 19 | CHECK_CONTIGUOUS(gt_instance_sizes); 20 | CHECK_CONTIGUOUS(num_gt_instances); 21 | CHECK_CONTIGUOUS(batch); 22 | 23 | CHECK_CUDA(instance_idx); 24 | CHECK_CUDA(instance_offsets); 25 | CHECK_CUDA(gt_instances); 26 | CHECK_CUDA(gt_instance_sizes); 27 | 28 | cudaSetDevice(instance_idx.get_device()); 29 | int64_t num_proposed_instances = instance_offsets.size(0) - 1; 30 | auto total_gt_instances = (int64_t*)malloc(sizeof(int64_t)); 31 | cudaMemcpy(total_gt_instances, num_gt_instances.sum().DATA_PTR(), sizeof(int64_t), 32 | cudaMemcpyDeviceToHost); 33 | auto max_gt_instances = (int64_t*)malloc(sizeof(int64_t)); 34 | cudaMemcpy(max_gt_instances, num_gt_instances.max().DATA_PTR(), sizeof(int64_t), 35 | cudaMemcpyDeviceToHost); 36 | 37 | at::Tensor output = 38 | torch::zeros({num_proposed_instances, total_gt_instances[0]}, 39 | at::device(gt_instances.device()).dtype(at::ScalarType::Float)); 40 | 41 | at::Tensor offset_num_gt_instances = 42 | at::cat({at::zeros(1, num_gt_instances.options()), num_gt_instances.cumsum(0)}, 0); 43 | instance_iou_kernel_wrapper( 44 | total_gt_instances[0], max_gt_instances[0], num_gt_instances.DATA_PTR(), 45 | num_proposed_instances, instance_idx.DATA_PTR(), 46 | instance_offsets.DATA_PTR(), gt_instances.DATA_PTR(), 47 | offset_num_gt_instances.DATA_PTR(), batch.DATA_PTR(), 48 | gt_instance_sizes.DATA_PTR(), output.DATA_PTR()); 49 | 50 | return output; 51 | } 52 | 
-------------------------------------------------------------------------------- /cuda/include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef _CUDA_UTILS_H 2 | #define _CUDA_UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #define TOTAL_THREADS_DENSE 512 14 | #define TOTAL_THREADS_SPARSE 1024 15 | 16 | inline int opt_n_threads(int work_size) 17 | { 18 | const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); 19 | 20 | return max(min(1 << pow_2, TOTAL_THREADS_DENSE), 1); 21 | } 22 | 23 | inline dim3 opt_block_config(int x, int y) 24 | { 25 | const int x_threads = opt_n_threads(x); 26 | const int y_threads = max(min(opt_n_threads(y), TOTAL_THREADS_DENSE / x_threads), 1); 27 | dim3 block_config(x_threads, y_threads, 1); 28 | 29 | return block_config; 30 | } 31 | 32 | // from https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions 33 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 34 | #else 35 | __device__ double atomicAdd(double* address, double val) 36 | { 37 | unsigned long long int* address_as_ull = (unsigned long long int*)address; 38 | unsigned long long int old = *address_as_ull, assumed; 39 | 40 | do 41 | { 42 | assumed = old; 43 | old = atomicCAS(address_as_ull, assumed, 44 | __double_as_longlong(val + __longlong_as_double(assumed))); 45 | 46 | // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) 47 | } while (assumed != old); 48 | 49 | return __longlong_as_double(old); 50 | } 51 | #endif 52 | 53 | #define CUDA_CHECK_ERRORS() \ 54 | do \ 55 | { \ 56 | cudaError_t err = cudaGetLastError(); \ 57 | if (cudaSuccess != err) \ 58 | { \ 59 | fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \ 60 | cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, __FILE__); \ 61 | exit(-1); \ 62 | } \ 63 | } while (0) 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /torch_points_kernels/cubic_feature_sampling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | if torch.cuda.is_available(): 4 | import torch_points_kernels.points_cuda as tpcuda 5 | 6 | 7 | class CubicFeatureSamplingFunction(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, ptcloud, cubic_features, neighborhood_size=1): 10 | scale = cubic_features.size(2) 11 | if not torch.cuda.is_available(): 12 | raise NotImplementedError("CPU version is not available for Cubic Feature Sampling") 13 | 14 | point_features, grid_pt_indexes = tpcuda.cubic_feature_sampling( 15 | scale, neighborhood_size, ptcloud, cubic_features 16 | ) 17 | ctx.save_for_backward(torch.Tensor([scale]), torch.Tensor([neighborhood_size]), grid_pt_indexes) 18 | return point_features 19 | 20 | @staticmethod 21 | def backward(ctx, grad_point_features): 22 | scale, neighborhood_size, grid_pt_indexes = ctx.saved_tensors 23 | scale = int(scale.item()) 24 | neighborhood_size = int(neighborhood_size.item()) 25 | grad_point_features = grad_point_features.contiguous() 26 | grad_ptcloud, grad_cubic_features = tpcuda.cubic_feature_sampling_grad( 27 | scale, neighborhood_size, grad_point_features, grid_pt_indexes 28 | ) 29 | return grad_ptcloud, grad_cubic_features, None 30 | 31 | 32 | def cubic_feature_sampling(ptcloud, cubic_features, neighborhood_size=1): 33 | r""" 34 | Sample the features of points from 3D feature maps that the point lies in. 
35 | Please refer to https://arxiv.org/pdf/2006.03761 for more information 36 | 37 | Parameters 38 | ---------- 39 | ptcloud : torch.Tensor (dtype=torch.float32) 40 | (B, n_pts, 3) point clouds containing n_pts points 41 | cubic_features : torch.Tensor (dtype=torch.float32) 42 | (B, c, m, m, m) 3D feature maps of sizes m x m x m and c channels 43 | neighborhood_size : int 44 | The neighborhood cubes to sample. 45 | neighborhood_size = 1 means to sample the cube that point lies in. 46 | neighborhood_size = 2 means to sample surrouding cubes (step = 1) of 47 | the cube that point lies in. 48 | 49 | Returns 50 | ------- 51 | dist: torch.Tensor 52 | (B, n_pts, n_vertices, c), where n_vertices = (neighborhood_size * 2)^3 53 | """ 54 | if len(ptcloud.shape) != 3 or ptcloud.shape[2] != 3: 55 | raise ValueError("The input point cloud should be of size (B, n_pts, 3).") 56 | 57 | h_scale = cubic_features.size(2) / 2 58 | ptcloud = ptcloud * h_scale + h_scale 59 | return CubicFeatureSamplingFunction.apply(ptcloud, cubic_features, neighborhood_size) 60 | -------------------------------------------------------------------------------- /cpu/src/interpolate.cpp: -------------------------------------------------------------------------------- 1 | #include "compat.h" 2 | #include "utils.h" 3 | #include 4 | #include 5 | 6 | at::Tensor knn_interpolate(at::Tensor features, at::Tensor idx, at::Tensor weight) 7 | { 8 | CHECK_CONTIGUOUS(features); 9 | CHECK_CONTIGUOUS(idx); 10 | CHECK_CONTIGUOUS(weight); 11 | CHECK_CPU(idx); 12 | CHECK_CPU(features); 13 | CHECK_CPU(weight); 14 | 15 | at::Tensor output = torch::zeros({features.size(0), features.size(1), idx.size(1)}, 16 | at::device(features.device()).dtype(features.scalar_type())); 17 | 18 | AT_DISPATCH_ALL_TYPES(features.scalar_type(), "knn_interpolate", [&] { 19 | auto output_a = output.accessor(); 20 | auto features_a = features.accessor(); 21 | auto weight_a = weight.accessor(); 22 | auto idx_a = idx.accessor(); 23 | 24 | auto batch_size = idx.size(0); 25 | for (auto b = 0; b < batch_size; b++) 26 | { 27 | for (auto p = 0; p < idx.size(1); p++) 28 | { 29 | for (auto c = 0; c < features.size(1); c++) 30 | { 31 | output_a[b][c][p] = 0; 32 | for (int i = 0; i < idx.size(2); i++) 33 | { 34 | auto new_idx = idx_a[b][p][i]; 35 | output_a[b][c][p] += features_a[b][c][new_idx] * weight_a[b][p][i]; 36 | } 37 | } 38 | } 39 | } 40 | }); 41 | return output; 42 | } 43 | 44 | at::Tensor knn_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight, const int m) 45 | { 46 | CHECK_CPU(grad_out); 47 | at::Tensor output = torch::zeros({grad_out.size(0), grad_out.size(1), m}, 48 | at::device(grad_out.device()).dtype(grad_out.scalar_type())); 49 | 50 | AT_DISPATCH_ALL_TYPES(grad_out.scalar_type(), "knn_interpolate_grad", [&] { 51 | auto output_a = output.accessor(); 52 | auto grad_out_a = grad_out.accessor(); 53 | auto weight_a = weight.accessor(); 54 | auto idx_a = idx.accessor(); 55 | 56 | auto batch_size = idx.size(0); 57 | for (auto b = 0; b < batch_size; b++) 58 | { 59 | for (auto p = 0; p < idx.size(1); p++) 60 | { 61 | for (auto c = 0; c < grad_out.size(1); c++) 62 | { 63 | for (int i = 0; i < idx.size(2); i++) 64 | { 65 | auto new_idx = idx_a[b][p][i]; 66 | output_a[b][c][new_idx] += grad_out_a[b][c][p] * weight_a[b][p][i]; 67 | } 68 | } 69 | } 70 | } 71 | }); 72 | return output; 73 | } 74 | -------------------------------------------------------------------------------- /test/test_metrics.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import os 4 | import sys 5 | import numpy as np 6 | import random 7 | 8 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 9 | sys.path.insert(0, ROOT) 10 | 11 | from torch_points_kernels.metrics import instance_iou 12 | from test import run_if_cuda 13 | 14 | 15 | class TestInstanceIou(unittest.TestCase): 16 | def test_simple(self, cuda=False): 17 | gt_instances = torch.tensor([1, 2, 1, 2, 2, 3, 0]) 18 | proposed_instances = [ 19 | torch.tensor([0, 2]), # 100% instance 1 20 | torch.tensor([1, 4]), # 2/3 of instance 2 21 | torch.tensor([3, 5]), # 1/3 of instance 2 and 1/1 of instance 3 22 | ] 23 | if cuda: 24 | proposed_instances = [c.cuda() for c in proposed_instances] 25 | gt_instances = gt_instances.cuda() 26 | ious = instance_iou(proposed_instances, gt_instances) 27 | torch.testing.assert_allclose( 28 | ious.cpu(), 29 | torch.tensor([[1, 0, 0], [0, 2 / 3.0, 0], [0, 1.0 / 4.0, 1.0 / 2.0]]), 30 | ) 31 | 32 | def test_batch(self, cuda=False): 33 | gt_instances = torch.tensor([1, 2, 1, 2, 2, 3, 0]) 34 | batch = torch.tensor([0, 0, 1, 1, 1, 1, 1]) 35 | proposed_instances = [ 36 | torch.tensor([0, 1]), # 50% instance 1, 50% instance 2 of sample 1 37 | torch.tensor([3, 4]), # 100% instance 2 of sample 2 38 | torch.tensor([5]), # 100% of instance 3 of sample 2 39 | ] 40 | if cuda: 41 | proposed_instances = [c.cuda() for c in proposed_instances] 42 | gt_instances = gt_instances.cuda() 43 | batch = batch.cuda() 44 | ious = instance_iou(proposed_instances, gt_instances, batch=batch) 45 | torch.testing.assert_allclose( 46 | ious.cpu(), 47 | torch.tensor( 48 | [ 49 | [0.5, 0.5, 0, 0, 0], 50 | [0, 0, 0, 1, 0], 51 | [0, 0, 0, 0, 1], 52 | ] 53 | ), 54 | ) 55 | 56 | @run_if_cuda 57 | def test_simple_cuda(self): 58 | self.test_simple(cuda=True) 59 | 60 | @run_if_cuda 61 | def test_batch_cuda(self): 62 | self.test_batch(cuda=True) 63 | 64 | @run_if_cuda 65 | def test_same(self): 66 | gt_instances = torch.randint(0, 10, (1000,)) 67 | proposed_instances = [] 68 | for i in range(20): 69 | instance_size = random.randint(5, 50) 70 | proposed_instances.append(torch.randint(0, 1000, (instance_size,))) 71 | 72 | ious = instance_iou(proposed_instances, gt_instances) 73 | proposed_instances_cuda = [i.cuda() for i in proposed_instances] 74 | ious_cuda = instance_iou(proposed_instances_cuda, gt_instances.cuda()) 75 | 76 | torch.testing.assert_allclose(ious, ious_cuda.cpu()) 77 | 78 | 79 | if __name__ == "__main__": 80 | unittest.main() 81 | -------------------------------------------------------------------------------- /cuda/src/metrics_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "cuda_utils.h" 6 | 7 | #define THREADS 512 8 | 9 | __global__ void instance_iou_cuda_kernel( 10 | int64_t total_gt_instances, const int64_t* __restrict__ nInstance, int nProposal, 11 | const int64_t* __restrict__ proposals_idx, const int64_t* __restrict__ proposals_offset, 12 | const int64_t* __restrict__ instance_labels, 13 | const int64_t* __restrict__ offset_num_gt_instances, const int64_t* __restrict__ batch, 14 | const int64_t* __restrict__ instance_pointnum, float* proposals_iou) 15 | { 16 | for (int proposal_id = blockIdx.x; proposal_id < nProposal; proposal_id += gridDim.x) 17 | { 18 | int start = proposals_offset[proposal_id]; 19 | int end = proposals_offset[proposal_id + 1]; 20 | int 
sampleIdx = batch[proposals_idx[start]]; 21 | int sampleNInstances = nInstance[sampleIdx]; 22 | int instanceOffset = offset_num_gt_instances[sampleIdx]; 23 | int proposal_total = end - start; 24 | for (int instance_id = threadIdx.x; instance_id < sampleNInstances; 25 | instance_id += blockDim.x) 26 | { 27 | int instance_total = instance_pointnum[instanceOffset + instance_id]; 28 | int intersection = 0; 29 | for (int i = start; i < end; i++) 30 | { 31 | int idx = proposals_idx[i]; 32 | if ((int)instance_labels[idx] == instance_id + 1) 33 | { // 0 is reserved for "no instance" 34 | intersection += 1; 35 | } 36 | } 37 | 38 | proposals_iou[instanceOffset + instance_id + proposal_id * total_gt_instances] = 39 | (float)intersection / 40 | ((float)(proposal_total + instance_total - intersection) + 1e-5); 41 | } 42 | } 43 | } 44 | 45 | // input: proposals_idx (sumNPoint), int 46 | // input: proposals_offset (nProposal + 1), int 47 | // input: instance_labels (N), int64_t, 0~total_nInst-1, -100 48 | // input: instance_pointnum (total_nInst), int 49 | // output: proposals_iou (nProposal, total_nInst), float 50 | void instance_iou_kernel_wrapper(int64_t total_gt_instances, int64_t max_gt_instances, 51 | const int64_t* nInstance, int nProposal, 52 | const int64_t* proposals_idx, const int64_t* proposals_offset, 53 | const int64_t* instance_labels, 54 | const int64_t* offset_num_gt_instances, const int64_t* batch, 55 | const int64_t* instance_pointnum, float* proposals_iou) 56 | { 57 | auto stream = at::cuda::getCurrentCUDAStream(); 58 | instance_iou_cuda_kernel<<>>( 60 | total_gt_instances, nInstance, nProposal, proposals_idx, proposals_offset, instance_labels, 61 | offset_num_gt_instances, batch, instance_pointnum, proposals_iou); 62 | } 63 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -4 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlinesLeft: false 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: Empty 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: false 20 | AlwaysBreakTemplateDeclarations: false 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | BeforeCatch: false 33 | BeforeElse: false 34 | IndentBraces: false 35 | BreakBeforeBinaryOperators: None 36 | BreakBeforeBraces: Allman 37 | BreakBeforeTernaryOperators: true 38 | BreakConstructorInitializersBeforeComma: false 39 | BreakAfterJavaFieldAnnotations: false 40 | BreakStringLiterals: true 41 | ColumnLimit: 100 42 | CommentPragmas: "^ IWYU pragma:" 43 | BreakBeforeInheritanceComma: false 44 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 45 | ConstructorInitializerIndentWidth: 4 46 | ContinuationIndentWidth: 4 47 | Cpp11BracedListStyle: true 48 | 
DerivePointerAlignment: false 49 | DisableFormat: false 50 | ExperimentalAutoDetectBinPacking: false 51 | FixNamespaceComments: true 52 | ForEachMacros: [foreach, Q_FOREACH, BOOST_FOREACH] 53 | IncludeCategories: 54 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 55 | Priority: 2 56 | - Regex: '^(<|"(gtest|isl|json)/)' 57 | Priority: 3 58 | - Regex: ".*" 59 | Priority: 1 60 | IncludeIsMainRegex: "$" 61 | IndentCaseLabels: false 62 | IndentWidth: 4 63 | IndentWrappedFunctionNames: false 64 | JavaScriptQuotes: Leave 65 | JavaScriptWrapImports: true 66 | KeepEmptyLinesAtTheStartOfBlocks: true 67 | MacroBlockBegin: "" 68 | MacroBlockEnd: "" 69 | MaxEmptyLinesToKeep: 1 70 | NamespaceIndentation: None 71 | ObjCBlockIndentWidth: 2 72 | ObjCSpaceAfterProperty: false 73 | ObjCSpaceBeforeProtocolList: true 74 | PenaltyBreakBeforeFirstCallParameter: 19 75 | PenaltyBreakComment: 300 76 | PenaltyBreakFirstLessLess: 120 77 | PenaltyBreakString: 1000 78 | PenaltyExcessCharacter: 1000000 79 | PenaltyReturnTypeOnItsOwnLine: 60 80 | PointerAlignment: Left 81 | ReflowComments: true 82 | SortIncludes: true 83 | SpaceAfterCStyleCast: false 84 | SpaceAfterTemplateKeyword: true 85 | SpaceBeforeAssignmentOperators: true 86 | SpaceBeforeParens: ControlStatements 87 | SpaceInEmptyParentheses: false 88 | SpacesBeforeTrailingComments: 1 89 | SpacesInAngles: false 90 | SpacesInContainerLiterals: true 91 | SpacesInCStyleCastParentheses: false 92 | SpacesInParentheses: false 93 | SpacesInSquareBrackets: false 94 | Standard: Cpp11 95 | TabWidth: 4 96 | UseTab: Never 97 | -------------------------------------------------------------------------------- /cpu/include/cloud.h: -------------------------------------------------------------------------------- 1 | // 2 | // 3 | // 0==========================0 4 | // | Local feature test | 5 | // 0==========================0 6 | // 7 | // version 1.0 : 8 | // > 9 | // 10 | //--------------------------------------------------- 11 | // 12 | // Cloud header 13 | // 14 | //---------------------------------------------------- 15 | // 16 | // Hugues THOMAS - 10/02/2017 17 | // 18 | 19 | #pragma once 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include 31 | 32 | template struct PointCloud 33 | { 34 | void set(const std::vector& new_pts) 35 | { 36 | pts = new_pts.data(); 37 | length = new_pts.size() / 3; 38 | } 39 | void set_batch(const std::vector& new_pts, int begin, int end) 40 | { 41 | pts = new_pts.data(); 42 | int start = begin * 3; 43 | pts += start; 44 | length = (end - begin); 45 | } 46 | 47 | // Must return the number of data points 48 | inline size_t kdtree_get_point_count() const 49 | { 50 | return get_point_count(); 51 | } 52 | 53 | // Must return the number of data points 54 | inline size_t get_point_count() const 55 | { 56 | return length; 57 | } 58 | 59 | // Returns the dim'th component of the idx'th point in the class: 60 | // Since this is inlined and the "dim" argument is typically an immediate 61 | // value, the 62 | // "if/else's" are actually solved at compile time. 63 | inline scalar_t kdtree_get_pt(const size_t idx, const size_t dim) const 64 | { 65 | if (dim == 0) 66 | return pts[idx * 3]; 67 | else if (dim == 1) 68 | return pts[idx * 3 + 1]; 69 | else 70 | return pts[idx * 3 + 2]; 71 | } 72 | 73 | // Optional bounding-box computation: return false to default to a standard 74 | // bbox computation loop. 
75 | // Return true if the BBOX was already computed by the class and returned in 76 | // "bb" so it can be avoided to redo it again. Look at bb.size() to find out 77 | // the expected dimensionality (e.g. 2 or 3 for point clouds) 78 | template bool kdtree_get_bbox(BBOX& /* bb */) const 79 | { 80 | return false; 81 | } 82 | 83 | const scalar_t* get_point_ptr(const int i) const 84 | { 85 | return pts + i * 3; 86 | } 87 | 88 | std::array operator[](const size_t index) const 89 | { 90 | return {pts[index * 3], pts[index * 3 + 1], pts[index * 3 + 2]}; 91 | } 92 | 93 | private: 94 | const scalar_t* pts; 95 | size_t length; 96 | }; 97 | 98 | template 99 | inline std::ostream& operator<<(std::ostream& os, const PointCloud& P) 100 | { 101 | for (size_t i = 0; i < P.get_point_count(); i++) 102 | { 103 | auto p = P[i]; 104 | os << "[" << p[0] << ", " << p[1] << ", " << p[2] << "];"; 105 | } 106 | return os; 107 | } 108 | -------------------------------------------------------------------------------- /cuda/src/interpolate.cpp: -------------------------------------------------------------------------------- 1 | #include "interpolate.h" 2 | #include "compat.h" 3 | #include "utils.h" 4 | 5 | void three_nn_kernel_wrapper(int b, int n, int m, const float* unknown, const float* known, 6 | float* dist2, int* idx); 7 | void three_interpolate_kernel_wrapper(int b, int c, int m, int n, const float* points, 8 | const int* idx, const float* weight, float* out); 9 | void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, const float* grad_out, 10 | const int* idx, const float* weight, float* grad_points); 11 | 12 | std::vector three_nn(at::Tensor unknowns, at::Tensor knows) 13 | { 14 | CHECK_CONTIGUOUS(unknowns); 15 | CHECK_CONTIGUOUS(knows); 16 | CHECK_IS_FLOAT(unknowns); 17 | CHECK_IS_FLOAT(knows); 18 | 19 | CHECK_CUDA(knows); 20 | CHECK_CUDA(unknowns); 21 | 22 | at::Tensor idx = torch::zeros({unknowns.size(0), unknowns.size(1), 3}, 23 | at::device(unknowns.device()).dtype(at::ScalarType::Int)); 24 | at::Tensor dist2 = torch::zeros({unknowns.size(0), unknowns.size(1), 3}, 25 | at::device(unknowns.device()).dtype(at::ScalarType::Float)); 26 | 27 | three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1), 28 | unknowns.DATA_PTR(), knows.DATA_PTR(), 29 | dist2.DATA_PTR(), idx.DATA_PTR()); 30 | 31 | return {dist2, idx}; 32 | } 33 | 34 | at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, at::Tensor weight) 35 | { 36 | CHECK_CONTIGUOUS(points); 37 | CHECK_CONTIGUOUS(idx); 38 | CHECK_CONTIGUOUS(weight); 39 | CHECK_IS_FLOAT(points); 40 | CHECK_IS_INT(idx); 41 | CHECK_IS_FLOAT(weight); 42 | 43 | CHECK_CUDA(idx); 44 | CHECK_CUDA(weight); 45 | 46 | at::Tensor output = torch::zeros({points.size(0), points.size(1), idx.size(1)}, 47 | at::device(points.device()).dtype(at::ScalarType::Float)); 48 | 49 | three_interpolate_kernel_wrapper(points.size(0), points.size(1), points.size(2), idx.size(1), 50 | points.DATA_PTR(), idx.DATA_PTR(), 51 | weight.DATA_PTR(), output.DATA_PTR()); 52 | 53 | return output; 54 | } 55 | at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight, 56 | const int m) 57 | { 58 | CHECK_CONTIGUOUS(grad_out); 59 | CHECK_CONTIGUOUS(idx); 60 | CHECK_CONTIGUOUS(weight); 61 | CHECK_IS_FLOAT(grad_out); 62 | CHECK_IS_INT(idx); 63 | CHECK_IS_FLOAT(weight); 64 | CHECK_CUDA(idx); 65 | CHECK_CUDA(weight); 66 | CHECK_CUDA(grad_out); 67 | 68 | at::Tensor output = torch::zeros({grad_out.size(0), grad_out.size(1), m}, 69 | 
at::device(grad_out.device()).dtype(at::ScalarType::Float)); 70 | 71 | three_interpolate_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), grad_out.size(2), m, 72 | grad_out.DATA_PTR(), idx.DATA_PTR(), 73 | weight.DATA_PTR(), output.DATA_PTR()); 74 | 75 | return output; 76 | } 77 | -------------------------------------------------------------------------------- /.github/workflows/building-conda.yml: -------------------------------------------------------------------------------- 1 | name: Building Conda 2 | 3 | on: [workflow_dispatch] 4 | 5 | jobs: 6 | 7 | conda-build: 8 | runs-on: ${{ matrix.os }} 9 | 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | os: [ubuntu-18.04, macos-10.15] #, windows-2019] 14 | python-version: ['3.7', '3.8', '3.9'] #, '3.10'] 15 | torch-version: [1.10.0, 1.11.0, 1.12.0] 16 | cuda-version: ['cpu', 'cu102', 'cu113', 'cu115', 'cu116'] 17 | exclude: 18 | - torch-version: 1.10.0 19 | cuda-version: 'cu116' 20 | - torch-version: 1.11.0 21 | cuda-version: 'cu116' 22 | - torch-version: 1.12.0 23 | cuda-version: 'cu115' 24 | - torch-version: 1.10.0 25 | cuda-version: 'cu115' 26 | - torch-version: 1.10.0 27 | python-version: '3.10' 28 | - os: windows-2019 29 | torch-version: 1.11.0 30 | cuda-version: 'cu102' 31 | - os: macos-10.15 32 | cuda-version: 'cu102' 33 | - os: macos-10.15 34 | cuda-version: 'cu113' 35 | - os: macos-10.15 36 | cuda-version: 'cu115' 37 | - os: macos-10.15 38 | cuda-version: 'cu116' 39 | - os: macos-10.15 40 | python-version: '3.10' # this is unhappy 41 | - os: ubuntu-18.04 42 | cuda-version: 'cu115' 43 | - os: windows-2019 44 | cuda-version: 'cu102' 45 | - os: windows-2019 # Complains about CUDA mismatch. 46 | python-version: '3.7' 47 | 48 | steps: 49 | - uses: actions/checkout@v2 50 | - name: Set up Conda for Python ${{ matrix.python-version }} 51 | uses: conda-incubator/setup-miniconda@v2 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | 55 | - name: Free up disk space 56 | if: ${{ runner.os == 'Linux' }} 57 | run: | 58 | sudo rm -rf /usr/share/dotnet 59 | 60 | - name: Free up disk space 61 | if: ${{ runner.os == 'Linux' }} 62 | run: | 63 | sudo rm -rf /usr/share/dotnet 64 | 65 | - name: Install Conda packages 66 | run: | 67 | conda install conda-build conda-verify --yes 68 | shell: 69 | bash -l {0} 70 | 71 | - name: Install CUDA ${{ matrix.cuda-version }} 72 | if: ${{ matrix.cuda-version != 'cpu' }} 73 | run: | 74 | bash .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}.sh 75 | shell: 76 | bash 77 | 78 | - name: Build Conda package for CPU 79 | if: ${{ matrix.cuda-version == 'cpu' }} 80 | run: | 81 | FORCE_CUDA=0 TORCH_CUDA_ARCH_LIST=0 ./conda/torch-points-kernels/build_conda.sh ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }} 82 | shell: 83 | bash -l {0} 84 | 85 | - name: Build Conda package for GPU 86 | if: ${{ matrix.cuda-version != 'cpu' }} 87 | run: | 88 | source .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}-env.sh 89 | ./conda/torch-points-kernels/build_conda.sh ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }} 90 | shell: 91 | bash -l {0} 92 | 93 | - name: Publish Conda package 94 | run: | 95 | conda install anaconda-client --yes 96 | anaconda upload --force --label main $HOME/conda-bld/*/*.tar.bz2 97 | env: 98 | ANACONDA_API_TOKEN: ${{ secrets.CONDA_TOKEN }} 99 | shell: 100 | bash -l {0} 101 | -------------------------------------------------------------------------------- /cuda/src/ball_query.cpp: 
-------------------------------------------------------------------------------- 1 | #include "ball_query.h" 2 | #include "compat.h" 3 | #include "utils.h" 4 | 5 | void query_ball_point_kernel_dense_wrapper(int b, int n, int m, float radius, int nsample, 6 | const float* new_xyz, const float* xyz, int64_t* idx, 7 | float* dist_out); 8 | 9 | void query_ball_point_kernel_partial_wrapper(int64_t batch_size, int size_x, int size_y, 10 | float radius, int nsample, const float* x, 11 | const float* y, const int64_t* batch_x, 12 | const int64_t* batch_y, int64_t* idx_out, 13 | float* dist_out); 14 | 15 | std::pair ball_query_dense(at::Tensor new_xyz, at::Tensor xyz, 16 | const float radius, const int nsample) 17 | { 18 | CHECK_CONTIGUOUS(new_xyz); 19 | CHECK_CONTIGUOUS(xyz); 20 | CHECK_IS_FLOAT(new_xyz); 21 | CHECK_IS_FLOAT(xyz); 22 | 23 | CHECK_CUDA(xyz); 24 | CHECK_CUDA(new_xyz); 25 | 26 | at::Tensor idx = torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample}, 27 | at::device(new_xyz.device()).dtype(at::ScalarType::Long)); 28 | at::Tensor dist = torch::full({new_xyz.size(0), new_xyz.size(1), nsample}, -1, 29 | at::device(new_xyz.device()).dtype(at::ScalarType::Float)); 30 | 31 | query_ball_point_kernel_dense_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), radius, 32 | nsample, new_xyz.DATA_PTR(), xyz.DATA_PTR(), 33 | idx.DATA_PTR(), dist.DATA_PTR()); 34 | 35 | return std::make_pair(idx, dist); 36 | } 37 | 38 | at::Tensor degree(at::Tensor row, int64_t num_nodes) 39 | { 40 | auto zero = at::zeros(num_nodes, row.options()); 41 | auto one = at::ones(row.size(0), row.options()); 42 | return zero.scatter_add_(0, row, one); 43 | } 44 | 45 | std::pair ball_query_partial_dense(at::Tensor x, at::Tensor y, 46 | at::Tensor batch_x, at::Tensor batch_y, 47 | const float radius, const int nsample) 48 | { 49 | CHECK_CONTIGUOUS(x); 50 | CHECK_CONTIGUOUS(y); 51 | CHECK_IS_FLOAT(x); 52 | CHECK_IS_FLOAT(y); 53 | CHECK_CUDA(x); 54 | CHECK_CUDA(y); 55 | CHECK_CUDA(batch_x); 56 | CHECK_CUDA(batch_y); 57 | 58 | at::Tensor idx = 59 | torch::full({y.size(0), nsample}, -1, at::device(y.device()).dtype(at::ScalarType::Long)); 60 | 61 | at::Tensor dist = 62 | torch::full({y.size(0), nsample}, -1, at::device(y.device()).dtype(at::ScalarType::Float)); 63 | 64 | cudaSetDevice(x.get_device()); 65 | auto batch_sizes = (int64_t*)malloc(sizeof(int64_t)); 66 | cudaMemcpy(batch_sizes, batch_x[-1].DATA_PTR(), sizeof(int64_t), 67 | cudaMemcpyDeviceToHost); 68 | auto batch_size = batch_sizes[0] + 1; 69 | 70 | batch_x = degree(batch_x, batch_size); 71 | batch_x = at::cat({at::zeros(1, batch_x.options()), batch_x.cumsum(0)}, 0); 72 | batch_y = degree(batch_y, batch_size); 73 | batch_y = at::cat({at::zeros(1, batch_y.options()), batch_y.cumsum(0)}, 0); 74 | 75 | query_ball_point_kernel_partial_wrapper( 76 | batch_size, x.size(0), y.size(0), radius, nsample, x.DATA_PTR(), y.DATA_PTR(), 77 | batch_x.DATA_PTR(), batch_y.DATA_PTR(), idx.DATA_PTR(), 78 | dist.DATA_PTR()); 79 | 80 | return std::make_pair(idx, dist); 81 | } 82 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | import glob 4 | 5 | try: 6 | import torch 7 | from torch.utils.cpp_extension import ( 8 | BuildExtension, 9 | CUDAExtension, 10 | CUDA_HOME, 11 | CppExtension, 12 | ) 13 | except: 14 | raise ModuleNotFoundError("Please install pytorch >= 1.1 before proceeding.") 15 | 16 | 
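# Build-mode selection: by default the CUDA extension is compiled only when torch reports a
# CUDA device and a CUDA toolkit is found (CUDA_HOME), while the CPU extension is always
# compiled. The environment variables checked below override this behaviour:
#   FORCE_CUDA=1      -> also build the CUDA extension (e.g. for Docker builds without a visible GPU)
#   FORCE_ONLY_CUDA=1 -> build only the CUDA extension and skip the CPU one
#   FORCE_ONLY_CPU=1  -> build only the CPU extension and skip the CUDA one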
WITH_CUDA = torch.cuda.is_available() and CUDA_HOME is not None 17 | WITH_CPU = True 18 | if os.getenv("FORCE_CUDA", "0") == "1": 19 | WITH_CUDA = True 20 | if os.getenv("FORCE_ONLY_CUDA", "0") == "1": 21 | WITH_CUDA = True 22 | WITH_CPU = False 23 | if os.getenv("FORCE_ONLY_CPU", "0") == "1": 24 | WITH_CUDA = False 25 | WITH_CPU = True 26 | 27 | 28 | def get_ext_modules(): 29 | TORCH_MAJOR = int(torch.__version__.split(".")[0]) 30 | TORCH_MINOR = int(torch.__version__.split(".")[1]) 31 | extra_compile_args = {"cxx": ["-O3"]} 32 | if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): 33 | extra_compile_args["cxx"] += ["-DVERSION_GE_1_3"] 34 | 35 | ext_src_root = "cuda" 36 | ext_sources = glob.glob("{}/src/*.cpp".format(ext_src_root)) + glob.glob( 37 | "{}/src/*.cu".format(ext_src_root) 38 | ) 39 | 40 | ext_modules = [] 41 | if WITH_CUDA: 42 | nvcc_flags = os.getenv("NVCC_FLAGS", "") 43 | nvcc_flags = [] if nvcc_flags == "" else nvcc_flags.split(" ") 44 | nvcc_flags += ["-arch=sm_35", "--expt-relaxed-constexpr", "-O2"] 45 | extra_compile_args["nvcc"] = nvcc_flags 46 | 47 | ext_modules.append( 48 | CUDAExtension( 49 | name="torch_points_kernels.points_cuda", 50 | sources=ext_sources, 51 | include_dirs=["{}/include".format(ext_src_root)], 52 | extra_compile_args=extra_compile_args, 53 | ) 54 | ) 55 | 56 | cpu_ext_src_root = "cpu" 57 | cpu_ext_sources = glob.glob("{}/src/*.cpp".format(cpu_ext_src_root)) 58 | 59 | if WITH_CPU: 60 | ext_modules.append( 61 | CppExtension( 62 | name="torch_points_kernels.points_cpu", 63 | sources=cpu_ext_sources, 64 | include_dirs=["{}/include".format(cpu_ext_src_root)], 65 | extra_compile_args=extra_compile_args, 66 | ) 67 | ) 68 | return ext_modules 69 | 70 | 71 | class CustomBuildExtension(BuildExtension): 72 | def __init__(self, *args, **kwargs): 73 | super().__init__(*args, no_python_abi_suffix=True, use_ninja=False, **kwargs) 74 | 75 | 76 | def get_cmdclass(): 77 | return {"build_ext": CustomBuildExtension} 78 | 79 | 80 | this_directory = os.path.abspath(os.path.dirname(__file__)) 81 | with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as f: 82 | long_description = f.read() 83 | 84 | requirements = ["torch>=1.1.0", "numba", "numpy<=1.21", "scikit-learn"] 85 | 86 | url = "https://github.com/nicolas-chaulet/torch-points-kernels" 87 | __version__ = "0.7.1" 88 | setup( 89 | name="torch-points-kernels", 90 | version=__version__, 91 | author="Nicolas Chaulet", 92 | packages=find_packages(), 93 | description="PyTorch kernels for spatial operations on point clouds", 94 | url=url, 95 | download_url="{}/archive/{}.tar.gz".format(url, __version__), 96 | install_requires=requirements, 97 | ext_modules=get_ext_modules(), 98 | cmdclass=get_cmdclass(), 99 | long_description=long_description, 100 | long_description_content_type="text/markdown", 101 | classifiers=[ 102 | "Programming Language :: Python :: 3", 103 | "License :: OSI Approved :: MIT License", 104 | ], 105 | ) 106 | -------------------------------------------------------------------------------- /torch_points_kernels/cluster.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .torchpoints import ball_query_partial_dense 3 | import numpy as np 4 | import numba 5 | from typing import List 6 | 7 | 8 | @numba.jit(nopython=True) 9 | def _grow_proximity_core(neighbours, min_cluster_size): 10 | num_points = int(neighbours.shape[0]) 11 | visited = np.zeros((num_points,), dtype=numba.types.bool_) 12 | clusters = [] 13 | for i in 
range(num_points): 14 | if visited[i]: 15 | continue 16 | 17 | cluster = [] 18 | queue = [] 19 | visited[i] = True 20 | queue.append(i) 21 | cluster.append(i) 22 | while len(queue): 23 | k = queue.pop() 24 | k_neighbours = neighbours[k] 25 | for nei in k_neighbours: 26 | if nei.item() == -1: 27 | break 28 | 29 | if not visited[nei]: 30 | visited[nei] = True 31 | queue.append(nei.item()) 32 | cluster.append(nei.item()) 33 | 34 | if len(cluster) >= min_cluster_size: 35 | clusters.append(cluster) 36 | 37 | return clusters 38 | 39 | 40 | def grow_proximity(pos, batch, nsample=16, radius=0.02, min_cluster_size=32): 41 | """Grow based on proximity only 42 | Neighbour search is done on device while the cluster assignement is done on cpu""" 43 | assert pos.shape[0] == batch.shape[0] 44 | neighbours = ball_query_partial_dense(radius, nsample, pos, pos, batch, batch)[0].cpu().numpy() 45 | return _grow_proximity_core(neighbours, min_cluster_size) 46 | 47 | 48 | def region_grow( 49 | pos, labels, batch, ignore_labels=[], nsample=16, radius=0.02, min_cluster_size=32 50 | ) -> List[torch.Tensor]: 51 | """Region growing clustering algorithm proposed in 52 | PointGroup: Dual-Set Point Grouping for 3D Instance Segmentation 53 | https://arxiv.org/pdf/2004.01658.pdf 54 | for instance segmentation 55 | 56 | Parameters 57 | ---------- 58 | pos: torch.Tensor [N, 3] 59 | Location of the points 60 | labels: torch.Tensor [N,] 61 | labels of each point 62 | ignore_labels: 63 | Labels that should be ignored, no region growing will be performed on those 64 | nsample: 65 | maximum number of neighbours to consider 66 | radius: 67 | radius for the neighbour search 68 | min_cluster_size: 69 | Number of points above which a cluster is considered valid 70 | """ 71 | assert labels.dim() == 1 72 | assert pos.dim() == 2 73 | assert pos.shape[0] == labels.shape[0] 74 | 75 | unique_labels = torch.unique(labels) 76 | clusters = [] 77 | ind = torch.arange(0, pos.shape[0]) 78 | for l in unique_labels: 79 | if l in ignore_labels: 80 | continue 81 | 82 | # Build clusters for a given label (ignore other points) 83 | label_mask = labels == l 84 | local_ind = ind[label_mask] 85 | 86 | # Remap batch to a continuous sequence 87 | label_batch = batch[label_mask] 88 | unique_in_batch = torch.unique(label_batch) 89 | remaped_batch = torch.empty_like(label_batch) 90 | for new, old in enumerate(unique_in_batch): 91 | mask = label_batch == old 92 | remaped_batch[mask] = new 93 | 94 | # Cluster 95 | label_clusters = grow_proximity( 96 | pos[label_mask, :], 97 | remaped_batch, 98 | nsample=nsample, 99 | radius=radius, 100 | min_cluster_size=min_cluster_size, 101 | ) 102 | 103 | # Remap indices to original coordinates 104 | if len(label_clusters): 105 | for cluster in label_clusters: 106 | cluster = torch.tensor(cluster).to(pos.device) 107 | clusters.append(local_ind[cluster]) 108 | 109 | return clusters 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 3D Point Cloud Kernels 2 | Pytorch CPU and CUDA kernels for spatial search and interpolation for 3D point clouds. 
3 | 4 | [![PyPI version](https://badge.fury.io/py/torch-points-kernels.svg)](https://badge.fury.io/py/torch-points-kernels) [![Deploy](https://github.com/torch-points3d/torch-points-kernels/actions/workflows/deploy.yaml/badge.svg)](https://github.com/torch-points3d/torch-points-kernels/actions/workflows/deploy.yaml) [![Unittests](https://github.com/torch-points3d/torch-points-kernels/actions/workflows/tests.yaml/badge.svg)](https://github.com/torch-points3d/torch-points-kernels/actions/workflows/tests.yaml) 5 | 6 | ## Installation 7 | **Update:** we now provide precompiled Conda packages for the latest PyTorch/CUDA combinations (PyTorch >= 1.10.0). To install with conda: 8 | ``` 9 | conda install -c torch-points3d torch-points-kernels 10 | ``` 11 | 12 | Or, you can compile the wheel yourself for any PyTorch/CUDA combination (must have a matching installation of CUDA toolkit): 13 | ``` 14 | pip install torch-points-kernels 15 | ``` 16 | 17 | To force CUDA installation (for example on Docker builds) please use the flag `FORCE_CUDA`: 18 | ``` 19 | FORCE_CUDA=1 pip install torch-points-kernels 20 | ``` 21 | 22 | ## Usage 23 | ``` 24 | import torch 25 | import torch_points_kernels.points_cuda 26 | ``` 27 | 28 | ## Build and test 29 | ``` 30 | python setup.py build_ext --inplace 31 | python -m unittest 32 | ``` 33 | 34 | ## Troubleshooting 35 | 36 | ### Compilation issues 37 | Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, e.g.: 38 | ``` 39 | $ python -c "import torch; print(torch.__version__)" 40 | >>> 1.4.0 41 | 42 | $ echo $PATH 43 | >>> /usr/local/cuda/bin:... 44 | 45 | $ echo $CPATH 46 | >>> /usr/local/cuda/include:... 47 | ``` 48 | 49 | On the compilation, if you have this error: 50 | ```error: cannot call member function 'void std::basic_string<_CharT, _Traits, _Alloc>::_Rep::_M_set_sharable()``` 51 | it means that your nvcc version is too old. The version must be at least 10.1.168. 52 | To check the version: 53 | ``` 54 | nvcc --version 55 | >>> V10.1.168 56 | ``` 57 | 58 | ### Windows compilation 59 | On Windows you may have this error when compiling: 60 | ``` 61 | error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized 62 | error: member "torch::jit::detail::ParameterPolicy::all_slots" may not be initialized 63 | error: member "torch::jit::detail::BufferPolicy::all_slots" may not be initialized 64 | error: member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized 65 | ``` 66 | This requires you to edit some of your pytorch header files, use [this script](https://github.com/rusty1s/pytorch_scatter/blob/master/script/torch.sh) as a guide. 67 | 68 | ### CUDA kernel failed : no kernel image is available for execution on the device 69 | 70 | This can happen when trying to run the code on a different GPU than the one used to compile the `torch-points-kernels` library. Uninstall `torch-points-kernels`, clear cache, and reinstall after setting the `TORCH_CUDA_ARCH_LIST` environment variable. For example, for compiling with a Tesla T4 (Turing 7.5) and running the code on a Tesla V100 (Volta 7.0) use: 71 | ``` 72 | export TORCH_CUDA_ARCH_LIST="7.0;7.5" 73 | ``` 74 | See [this useful chart](http://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for more architecture compatibility. 75 | 76 | 77 | ## Projects using those kernels. 
78 | 79 | [```Pytorch Point Cloud Benchmark```](https://github.com/nicolas-chaulet/deeppointcloud-benchmarks) 80 | 81 | ## Credit 82 | 83 | * [```Pointnet2_Tensorflow```](https://github.com/charlesq34/pointnet2) by [Charles R. Qi](https://github.com/charlesq34) 84 | 85 | * [```Pointnet2_PyTorch```](https://github.com/erikwijmans/Pointnet2_PyTorch) by [Erik Wijmans](https://github.com/erikwijmans) 86 | 87 | * [```GRNet```](https://github.com/hzxie/GRNet) by [Haozhe Xie](https://github.com/hzxie) 88 | -------------------------------------------------------------------------------- /torch_points_kernels/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import List, Optional 3 | import numpy as np 4 | import numba 5 | 6 | if torch.cuda.is_available(): 7 | import torch_points_kernels.points_cuda as tpcuda 8 | 9 | 10 | @numba.jit(nopython=True, parallel=True) 11 | def _instance_iou_cpu( 12 | instance_idx, 13 | instance_offsets, 14 | gt_instances, 15 | gt_instance_sizes, 16 | num_gt_instances: np.array, 17 | batch: np.array, 18 | ): 19 | num_proposed_instances = len(instance_offsets) - 1 20 | iou = np.zeros((num_proposed_instances, num_gt_instances.sum())) 21 | offset_num_gt_instances = np.concatenate((np.array([0]), num_gt_instances.cumsum())) 22 | for proposed_instance in range(num_proposed_instances): 23 | instance = instance_idx[instance_offsets[proposed_instance] : instance_offsets[proposed_instance + 1]] 24 | sample_idx = batch[instance[0]] 25 | gt_count_offset = offset_num_gt_instances[sample_idx] 26 | sample_instance_count = num_gt_instances[sample_idx] 27 | for instance_id in numba.prange(1, sample_instance_count + 1): 28 | intersection = 0 29 | for idx in instance: 30 | if gt_instances[idx] == instance_id: 31 | intersection += 1 32 | iou[proposed_instance, gt_count_offset + instance_id - 1] = intersection / float( 33 | len(instance) + gt_instance_sizes[gt_count_offset + instance_id - 1] - intersection 34 | ) 35 | return iou 36 | 37 | 38 | def instance_iou( 39 | instance_idx: List[torch.Tensor], 40 | gt_instances: torch.Tensor, 41 | batch: Optional[torch.Tensor] = None, 42 | ): 43 | """Computes the IoU between each proposed instance in instance_idx and ground truth instances. Returns a 44 | tensor of shape [instance_idx.shape[0], num_instances] that contains the iou between the proposed instances and all gt instances 45 | Instance label 0 is reserved for non instance points 46 | 47 | Parameters 48 | ---------- 49 | instance_idx : List[torch.Tensor] 50 | List of instances. 
Each tensor in this list is a proposed and contains the index of the points 51 | that belong to that particular instance 52 | gt_instances : torch.Tensor 53 | Ground truth instances, contains the index of the instance for each point 54 | 55 | Returns 56 | ------- 57 | ious: torch.Tensor[nb_proposals, nb_groundtruth] 58 | """ 59 | if batch is None: 60 | batch = torch.zeros_like(gt_instances) 61 | 62 | # Gather number of gt instances per batch and size of those instances 63 | gt_instance_sizes = [] 64 | num_gt_instances = [] 65 | batch_size = batch[-1] + 1 66 | for s in range(batch_size): 67 | batch_mask = batch == s 68 | sample_gt_instances = gt_instances[batch_mask] 69 | sample_num_gt_instances = torch.max(sample_gt_instances).item() 70 | num_gt_instances.append(sample_num_gt_instances) 71 | for instance_id in range(1, sample_num_gt_instances + 1): 72 | gt_instance_sizes.append(torch.sum(sample_gt_instances == instance_id)) 73 | gt_instance_sizes = torch.stack(gt_instance_sizes) 74 | num_gt_instances = torch.tensor(num_gt_instances) 75 | 76 | # Instance offset when flatten 77 | instance_offsets = [0] 78 | cum_offset = 0 79 | for instance in instance_idx: 80 | cum_offset += instance.shape[0] 81 | instance_offsets.append(cum_offset) 82 | 83 | # Compute ious 84 | instance_idx = torch.cat(instance_idx) 85 | if gt_instances.is_cuda: 86 | return tpcuda.instance_iou_cuda( 87 | instance_idx.cuda(), 88 | torch.tensor(instance_offsets).cuda(), 89 | gt_instances.cuda(), 90 | gt_instance_sizes.cuda(), 91 | num_gt_instances.cuda(), 92 | batch.cuda(), 93 | ) 94 | else: 95 | res = _instance_iou_cpu( 96 | instance_idx.numpy(), 97 | np.asarray(instance_offsets), 98 | gt_instances.numpy(), 99 | gt_instance_sizes.numpy(), 100 | num_gt_instances.numpy(), 101 | batch.numpy(), 102 | ) 103 | return torch.tensor(res).float() 104 | -------------------------------------------------------------------------------- /cpu/src/bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "ball_query.h" 2 | #include "fps.h" 3 | #include "interpolate.h" 4 | #include "knn.h" 5 | 6 | using namespace pybind11::literals; 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 9 | { 10 | m.def("dense_knn", &dense_knn, "", "support"_a, "querry"_a, "k"_a); 11 | m.def("knn_interpolate", &knn_interpolate, "", "features"_a, "idx"_a, "weights"_a); 12 | m.def("knn_interpolate_grad", &knn_interpolate_grad, "", "grad_out"_a, "idx"_a, "weights"_a, 13 | "m"_a); 14 | m.def("fps", &fps, "", "points"_a, "num_samples"_a, "random"_a); 15 | 16 | m.def("ball_query", &ball_query, 17 | "compute the radius search of a point cloud using nanoflann" 18 | "- support : a pytorch tensor of size N1 x 3, points where the " 19 | "neighboors are accessed from" 20 | "- query : a pytorch tensor of size N2 x 3, centre of the balls" 21 | "- radius : float number, size of the ball for the radius search." 22 | "- max_num : int number, indicate the maximum of neaghbors allowed(if " 23 | "-1 then all the possible neighbors will be computed). 
" 24 | "- mode : int number that indicate which format for the neighborhood" 25 | "mode=0 mean a matrix of neighbors(-1 for shadow neighbors)" 26 | "mode=1 means a matrix of edges of size Num_edge x 2" 27 | "return a tensor of size N2 x M where M is either max_num or the " 28 | "maximum number of neighbors found if mode = 0, if mode=1 return a " 29 | "tensor of size Num_edge x 2 and return a tensor containing the " 30 | "squared distance of the neighbors", 31 | "support"_a, "querry"_a, "radius"_a, "max_num"_a = -1, "mode"_a = 0, "sorted"_a = false); 32 | 33 | m.def("batch_ball_query", &batch_ball_query, 34 | "compute the radius search of a point cloud for each batch using " 35 | "nanoflann" 36 | "- support : a pytorch tensor of size N1 x 3, points where the " 37 | "neighboors are accessed from" 38 | "- query : a pytorch tensor of size N2 x 3, centre of the balls" 39 | "- support_batch: a pytorch tensor(long) contains indices of the batch " 40 | "of the support size N1" 41 | "NB: the batch must be sorted" 42 | "- query_batch : a pytorch tensor(long) contains indices of the batch " 43 | "of the query size N2" 44 | "NB : the batch must be sorted" 45 | "-radius: float number, size of the ball for the radius search." 46 | "- max_num : int number, indicate the maximum of neaghbors allowed(if " 47 | "-1 then all the possible neighbors wrt the radius will be computed)." 48 | "- mode : int number that indicate which format for the neighborhood" 49 | "mode=0 mean a matrix of neighbors(N1 for shadow neighbors)" 50 | "mode=1 means a matrix of edges of size Num_edge x 2" 51 | "return a tensor of size N2 x M where M is either max_num or the " 52 | "maximum number of neighbors found if mode = 0, if mode=1 return a " 53 | "tensor of size Num_edge x 2 and return a tensor containing the " 54 | "squared distance of the neighbors", 55 | "support"_a, "querry"_a, "query_batch"_a, "support_batch"_a, "radius"_a, "max_num"_a = -1, 56 | "mode"_a = 0, "sorted"_a = false); 57 | m.def("dense_ball_query", &dense_ball_query, 58 | "compute the radius search of a batch of point cloud using nanoflann" 59 | "- support : a pytorch tensor of size B x N1 x 3, points where the " 60 | "neighboors are accessed from" 61 | "- query : a pytorch tensor of size B x N2 x 3, centre of the balls" 62 | "- radius : float number, size of the ball for the radius search." 63 | "- max_num : int number, indicate the maximum of neaghbors allowed(if " 64 | "-1 then all the possible neighbors will be computed). 
" 65 | "- mode : int number that indicate which format for the neighborhood" 66 | "mode=0 mean a matrix of neighbors(-1 for shadow neighbors)" 67 | "mode=1 means a matrix of edges of size Num_edge x 2" 68 | "return a tensor of size B x N2 x M where M is either max_num or the " 69 | "maximum number of neighbors found if mode = 0, if mode=1 return a " 70 | "tensor of size Num_edge x 2 and return a tensor containing the " 71 | "squared distance of the neighbors", 72 | "support"_a, "querry"_a, "radius"_a, "max_num"_a = -1, "mode"_a = 0, "sorted"_a = false); 73 | } 74 | -------------------------------------------------------------------------------- /cuda/src/ball_query_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "cuda_utils.h" 6 | 7 | // input: new_xyz(b, m, 3) xyz(b, n, 3) 8 | // output: idx(b, m, nsample) 9 | __global__ void query_ball_point_kernel_dense(int b, int n, int m, float radius, int nsample, 10 | const float* __restrict__ new_xyz, 11 | const float* __restrict__ xyz, 12 | int64_t* __restrict__ idx_out, 13 | float* __restrict__ dist_out) 14 | { 15 | int batch_index = blockIdx.x; 16 | xyz += batch_index * n * 3; 17 | new_xyz += batch_index * m * 3; 18 | idx_out += m * nsample * batch_index; 19 | dist_out += m * nsample * batch_index; 20 | 21 | int index = threadIdx.x; 22 | int stride = blockDim.x; 23 | 24 | float radius2 = radius * radius; 25 | for (int j = index; j < m; j += stride) 26 | { 27 | float new_x = new_xyz[j * 3 + 0]; 28 | float new_y = new_xyz[j * 3 + 1]; 29 | float new_z = new_xyz[j * 3 + 2]; 30 | for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) 31 | { 32 | float x = xyz[k * 3 + 0]; 33 | float y = xyz[k * 3 + 1]; 34 | float z = xyz[k * 3 + 2]; 35 | float d2 = 36 | (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); 37 | if (d2 < radius2) 38 | { 39 | if (cnt == 0) 40 | { 41 | for (int l = 0; l < nsample; ++l) 42 | { 43 | idx_out[j * nsample + l] = k; 44 | } 45 | } 46 | idx_out[j * nsample + cnt] = k; 47 | dist_out[j * nsample + cnt] = d2; 48 | ++cnt; 49 | } 50 | } 51 | } 52 | } 53 | 54 | __global__ void query_ball_point_kernel_partial_dense(int size_x, int size_y, float radius, 55 | int nsample, const float* __restrict__ x, 56 | const float* __restrict__ y, 57 | const int64_t* __restrict__ batch_x, 58 | const int64_t* __restrict__ batch_y, 59 | int64_t* __restrict__ idx_out, 60 | float* __restrict__ dist_out) 61 | { 62 | // taken from 63 | // https://github.com/rusty1s/pytorch_cluster/blob/master/cuda/radius_kernel.cu 64 | const ptrdiff_t batch_idx = blockIdx.x; 65 | 66 | const ptrdiff_t start_idx_x = batch_x[batch_idx]; 67 | const ptrdiff_t end_idx_x = batch_x[batch_idx + 1]; 68 | 69 | const ptrdiff_t start_idx_y = batch_y[batch_idx]; 70 | const ptrdiff_t end_idx_y = batch_y[batch_idx + 1]; 71 | float radius2 = radius * radius; 72 | 73 | for (ptrdiff_t n_y = start_idx_y + threadIdx.x; n_y < end_idx_y; n_y += blockDim.x) 74 | { 75 | int64_t count = 0; 76 | for (ptrdiff_t n_x = start_idx_x; n_x < end_idx_x; n_x++) 77 | { 78 | float dist = 0; 79 | for (ptrdiff_t d = 0; d < 3; d++) 80 | { 81 | dist += (x[n_x * 3 + d] - y[n_y * 3 + d]) * (x[n_x * 3 + d] - y[n_y * 3 + d]); 82 | } 83 | if (dist <= radius2) 84 | { 85 | idx_out[n_y * nsample + count] = n_x; 86 | dist_out[n_y * nsample + count] = dist; 87 | count++; 88 | } 89 | if (count >= nsample) 90 | { 91 | break; 92 | } 93 | } 94 | } 95 | } 96 | 97 | void 
query_ball_point_kernel_dense_wrapper(int b, int n, int m, float radius, int nsample, 98 | const float* new_xyz, const float* xyz, int64_t* idx, 99 | float* dist_out) 100 | { 101 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 102 | query_ball_point_kernel_dense<<>>(b, n, m, radius, nsample, 103 | new_xyz, xyz, idx, dist_out); 104 | 105 | CUDA_CHECK_ERRORS(); 106 | } 107 | 108 | void query_ball_point_kernel_partial_wrapper(int64_t batch_size, int size_x, int size_y, 109 | float radius, int nsample, const float* x, 110 | const float* y, const int64_t* batch_x, 111 | const int64_t* batch_y, int64_t* idx_out, 112 | float* dist_out) 113 | { 114 | query_ball_point_kernel_partial_dense<<>>( 115 | size_x, size_y, radius, nsample, x, y, batch_x, batch_y, idx_out, dist_out); 116 | 117 | CUDA_CHECK_ERRORS(); 118 | } 119 | -------------------------------------------------------------------------------- /cuda/src/interpolate_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "cuda_utils.h" 6 | 7 | // input: unknown(b, n, 3) known(b, m, 3) 8 | // output: dist2(b, n, 3), idx(b, n, 3) 9 | __global__ void three_nn_kernel(int b, int n, int m, const float* __restrict__ unknown, 10 | const float* __restrict__ known, float* __restrict__ dist2, 11 | int* __restrict__ idx) 12 | { 13 | int batch_index = blockIdx.x; 14 | unknown += batch_index * n * 3; 15 | known += batch_index * m * 3; 16 | dist2 += batch_index * n * 3; 17 | idx += batch_index * n * 3; 18 | 19 | int index = threadIdx.x; 20 | int stride = blockDim.x; 21 | for (int j = index; j < n; j += stride) 22 | { 23 | float ux = unknown[j * 3 + 0]; 24 | float uy = unknown[j * 3 + 1]; 25 | float uz = unknown[j * 3 + 2]; 26 | 27 | double best1 = 1e40, best2 = 1e40, best3 = 1e40; 28 | int besti1 = 0, besti2 = 0, besti3 = 0; 29 | for (int k = 0; k < m; ++k) 30 | { 31 | float x = known[k * 3 + 0]; 32 | float y = known[k * 3 + 1]; 33 | float z = known[k * 3 + 2]; 34 | float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); 35 | if (d < best1) 36 | { 37 | best3 = best2; 38 | besti3 = besti2; 39 | best2 = best1; 40 | besti2 = besti1; 41 | best1 = d; 42 | besti1 = k; 43 | } 44 | else if (d < best2) 45 | { 46 | best3 = best2; 47 | besti3 = besti2; 48 | best2 = d; 49 | besti2 = k; 50 | } 51 | else if (d < best3) 52 | { 53 | best3 = d; 54 | besti3 = k; 55 | } 56 | } 57 | dist2[j * 3 + 0] = best1; 58 | dist2[j * 3 + 1] = best2; 59 | dist2[j * 3 + 2] = best3; 60 | 61 | idx[j * 3 + 0] = besti1; 62 | idx[j * 3 + 1] = besti2; 63 | idx[j * 3 + 2] = besti3; 64 | } 65 | } 66 | 67 | void three_nn_kernel_wrapper(int b, int n, int m, const float* unknown, const float* known, 68 | float* dist2, int* idx) 69 | { 70 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 71 | three_nn_kernel<<>>(b, n, m, unknown, known, dist2, idx); 72 | 73 | CUDA_CHECK_ERRORS(); 74 | } 75 | 76 | // input: points(b, c, m), idx(b, n, 3), weight(b, n, 3) 77 | // output: out(b, c, n) 78 | __global__ void three_interpolate_kernel(int b, int c, int m, int n, 79 | const float* __restrict__ points, 80 | const int* __restrict__ idx, 81 | const float* __restrict__ weight, float* __restrict__ out) 82 | { 83 | int batch_index = blockIdx.x; 84 | points += batch_index * m * c; 85 | 86 | idx += batch_index * n * 3; 87 | weight += batch_index * n * 3; 88 | 89 | out += batch_index * n * c; 90 | 91 | const int index = threadIdx.y * blockDim.x + threadIdx.x; 92 | const int stride = blockDim.y * 
blockDim.x; 93 | for (int i = index; i < c * n; i += stride) 94 | { 95 | const int l = i / n; 96 | const int j = i % n; 97 | float w1 = weight[j * 3 + 0]; 98 | float w2 = weight[j * 3 + 1]; 99 | float w3 = weight[j * 3 + 2]; 100 | 101 | int i1 = idx[j * 3 + 0]; 102 | int i2 = idx[j * 3 + 1]; 103 | int i3 = idx[j * 3 + 2]; 104 | 105 | out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 + points[l * m + i3] * w3; 106 | } 107 | } 108 | 109 | void three_interpolate_kernel_wrapper(int b, int c, int m, int n, const float* points, 110 | const int* idx, const float* weight, float* out) 111 | { 112 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 113 | three_interpolate_kernel<<>>(b, c, m, n, points, idx, 114 | weight, out); 115 | 116 | CUDA_CHECK_ERRORS(); 117 | } 118 | 119 | // input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3) 120 | // output: grad_points(b, c, m) 121 | 122 | __global__ void three_interpolate_grad_kernel(int b, int c, int n, int m, 123 | const float* __restrict__ grad_out, 124 | const int* __restrict__ idx, 125 | const float* __restrict__ weight, 126 | float* __restrict__ grad_points) 127 | { 128 | int batch_index = blockIdx.x; 129 | grad_out += batch_index * n * c; 130 | idx += batch_index * n * 3; 131 | weight += batch_index * n * 3; 132 | grad_points += batch_index * m * c; 133 | 134 | const int index = threadIdx.y * blockDim.x + threadIdx.x; 135 | const int stride = blockDim.y * blockDim.x; 136 | for (int i = index; i < c * n; i += stride) 137 | { 138 | const int l = i / n; 139 | const int j = i % n; 140 | float w1 = weight[j * 3 + 0]; 141 | float w2 = weight[j * 3 + 1]; 142 | float w3 = weight[j * 3 + 2]; 143 | 144 | int i1 = idx[j * 3 + 0]; 145 | int i2 = idx[j * 3 + 1]; 146 | int i3 = idx[j * 3 + 2]; 147 | 148 | atomicAdd(grad_points + l * m + i1, grad_out[i] * w1); 149 | atomicAdd(grad_points + l * m + i2, grad_out[i] * w2); 150 | atomicAdd(grad_points + l * m + i3, grad_out[i] * w3); 151 | } 152 | } 153 | 154 | void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, const float* grad_out, 155 | const int* idx, const float* weight, float* grad_points) 156 | { 157 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 158 | three_interpolate_grad_kernel<<>>( 159 | b, c, n, m, grad_out, idx, weight, grad_points); 160 | 161 | CUDA_CHECK_ERRORS(); 162 | } 163 | -------------------------------------------------------------------------------- /cuda/src/sampling_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "cuda_utils.h" 5 | 6 | __device__ void __update(float* __restrict__ dists, int* __restrict__ dists_i, int idx1, int idx2) 7 | { 8 | const float v1 = dists[idx1], v2 = dists[idx2]; 9 | const int i1 = dists_i[idx1], i2 = dists_i[idx2]; 10 | dists[idx1] = max(v1, v2); 11 | dists_i[idx1] = v2 > v1 ? 
i2 : i1; 12 | } 13 | 14 | // Input dataset: (b, n, 3), tmp: (b, n) 15 | // Ouput idxs (b, m) 16 | template 17 | __global__ void furthest_point_sampling_kernel(int b, int n, int m, 18 | const float* __restrict__ dataset, 19 | float* __restrict__ temp, int* __restrict__ idxs) 20 | { 21 | if (m <= 0) 22 | return; 23 | __shared__ float dists[block_size]; 24 | __shared__ int dists_i[block_size]; 25 | 26 | int batch_index = blockIdx.x; 27 | dataset += batch_index * n * 3; 28 | temp += batch_index * n; 29 | idxs += batch_index * m; 30 | 31 | int tid = threadIdx.x; 32 | const int stride = block_size; 33 | 34 | int old = 0; 35 | if (threadIdx.x == 0) 36 | idxs[0] = old; 37 | 38 | __syncthreads(); 39 | for (int j = 1; j < m; j++) 40 | { 41 | int besti = 0; 42 | float best = -1; 43 | float x1 = dataset[old * 3 + 0]; 44 | float y1 = dataset[old * 3 + 1]; 45 | float z1 = dataset[old * 3 + 2]; 46 | for (int k = tid; k < n; k += stride) 47 | { 48 | float x2, y2, z2; 49 | x2 = dataset[k * 3 + 0]; 50 | y2 = dataset[k * 3 + 1]; 51 | z2 = dataset[k * 3 + 2]; 52 | float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); 53 | if (mag <= 1e-3) 54 | continue; 55 | 56 | float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); 57 | 58 | float d2 = min(d, temp[k]); 59 | temp[k] = d2; 60 | besti = d2 > best ? k : besti; 61 | best = d2 > best ? d2 : best; 62 | } 63 | dists[tid] = best; 64 | dists_i[tid] = besti; 65 | __syncthreads(); 66 | 67 | if (block_size >= 512) 68 | { 69 | if (tid < 256) 70 | { 71 | __update(dists, dists_i, tid, tid + 256); 72 | } 73 | __syncthreads(); 74 | } 75 | if (block_size >= 256) 76 | { 77 | if (tid < 128) 78 | { 79 | __update(dists, dists_i, tid, tid + 128); 80 | } 81 | __syncthreads(); 82 | } 83 | if (block_size >= 128) 84 | { 85 | if (tid < 64) 86 | { 87 | __update(dists, dists_i, tid, tid + 64); 88 | } 89 | __syncthreads(); 90 | } 91 | if (block_size >= 64) 92 | { 93 | if (tid < 32) 94 | { 95 | __update(dists, dists_i, tid, tid + 32); 96 | } 97 | __syncthreads(); 98 | } 99 | if (block_size >= 32) 100 | { 101 | if (tid < 16) 102 | { 103 | __update(dists, dists_i, tid, tid + 16); 104 | } 105 | __syncthreads(); 106 | } 107 | if (block_size >= 16) 108 | { 109 | if (tid < 8) 110 | { 111 | __update(dists, dists_i, tid, tid + 8); 112 | } 113 | __syncthreads(); 114 | } 115 | if (block_size >= 8) 116 | { 117 | if (tid < 4) 118 | { 119 | __update(dists, dists_i, tid, tid + 4); 120 | } 121 | __syncthreads(); 122 | } 123 | if (block_size >= 4) 124 | { 125 | if (tid < 2) 126 | { 127 | __update(dists, dists_i, tid, tid + 2); 128 | } 129 | __syncthreads(); 130 | } 131 | if (block_size >= 2) 132 | { 133 | if (tid < 1) 134 | { 135 | __update(dists, dists_i, tid, tid + 1); 136 | } 137 | __syncthreads(); 138 | } 139 | 140 | old = dists_i[0]; 141 | if (tid == 0) 142 | idxs[j] = old; 143 | } 144 | } 145 | 146 | void furthest_point_sampling_kernel_wrapper(int b, int n, int m, const float* dataset, float* temp, 147 | int* idxs) 148 | { 149 | unsigned int n_threads = opt_n_threads(n); 150 | 151 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 152 | 153 | switch (n_threads) 154 | { 155 | case 512: 156 | furthest_point_sampling_kernel<512> 157 | <<>>(b, n, m, dataset, temp, idxs); 158 | break; 159 | case 256: 160 | furthest_point_sampling_kernel<256> 161 | <<>>(b, n, m, dataset, temp, idxs); 162 | break; 163 | case 128: 164 | furthest_point_sampling_kernel<128> 165 | <<>>(b, n, m, dataset, temp, idxs); 166 | break; 167 | case 64: 168 | furthest_point_sampling_kernel<64> 169 | <<>>(b, n, 
m, dataset, temp, idxs); 170 | break; 171 | case 32: 172 | furthest_point_sampling_kernel<32> 173 | <<>>(b, n, m, dataset, temp, idxs); 174 | break; 175 | case 16: 176 | furthest_point_sampling_kernel<16> 177 | <<>>(b, n, m, dataset, temp, idxs); 178 | break; 179 | case 8: 180 | furthest_point_sampling_kernel<8> 181 | <<>>(b, n, m, dataset, temp, idxs); 182 | break; 183 | case 4: 184 | furthest_point_sampling_kernel<4> 185 | <<>>(b, n, m, dataset, temp, idxs); 186 | break; 187 | case 2: 188 | furthest_point_sampling_kernel<2> 189 | <<>>(b, n, m, dataset, temp, idxs); 190 | break; 191 | case 1: 192 | furthest_point_sampling_kernel<1> 193 | <<>>(b, n, m, dataset, temp, idxs); 194 | break; 195 | default: 196 | furthest_point_sampling_kernel<512> 197 | <<>>(b, n, m, dataset, temp, idxs); 198 | } 199 | 200 | CUDA_CHECK_ERRORS(); 201 | } 202 | -------------------------------------------------------------------------------- /cpu/src/ball_query.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "ball_query.h" 3 | #include "compat.h" 4 | #include "neighbors.cpp" 5 | #include "neighbors.h" 6 | #include "utils.h" 7 | #include 8 | #include 9 | 10 | std::pair ball_query(at::Tensor support, at::Tensor query, float radius, 11 | int max_num, int mode, bool sorted) 12 | { 13 | CHECK_CONTIGUOUS(support); 14 | CHECK_CONTIGUOUS(query); 15 | 16 | at::Tensor out; 17 | at::Tensor out_dists; 18 | std::vector neighbors_indices(query.size(0), 0); 19 | std::vector neighbors_dists(query.size(0), -1); 20 | 21 | auto options = torch::TensorOptions().dtype(torch::kLong).device(torch::kCPU); 22 | auto options_dist = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU); 23 | int max_count = 0; 24 | 25 | AT_DISPATCH_ALL_TYPES(query.scalar_type(), "radius_search", [&] { 26 | auto data_q = query.DATA_PTR(); 27 | auto data_s = support.DATA_PTR(); 28 | std::vector queries_stl = 29 | std::vector(data_q, data_q + query.size(0) * query.size(1)); 30 | std::vector supports_stl = 31 | std::vector(data_s, data_s + support.size(0) * support.size(1)); 32 | 33 | max_count = nanoflann_neighbors(queries_stl, supports_stl, neighbors_indices, 34 | neighbors_dists, radius, max_num, mode, sorted); 35 | }); 36 | auto neighbors_dists_ptr = neighbors_dists.data(); 37 | int64_t* neighbors_indices_ptr = neighbors_indices.data(); 38 | if (mode == 0) 39 | { 40 | out = 41 | torch::from_blob(neighbors_indices_ptr, {query.size(0), max_count}, options = options); 42 | out_dists = torch::from_blob(neighbors_dists_ptr, {query.size(0), max_count}, 43 | options = options_dist); 44 | } 45 | else if (mode == 1) 46 | { 47 | out = torch::from_blob(neighbors_indices_ptr, {(int)neighbors_indices.size() / 2, 2}, 48 | options = options); 49 | out_dists = torch::from_blob(neighbors_dists_ptr, {(int)neighbors_indices.size() / 2, 1}, 50 | options = options_dist); 51 | } 52 | 53 | return std::make_pair(out.clone(), out_dists.clone()); 54 | } 55 | 56 | at::Tensor degree(at::Tensor row, int64_t num_nodes) 57 | { 58 | auto zero = at::zeros(num_nodes, row.options()); 59 | auto one = at::ones(row.size(0), row.options()); 60 | auto out = zero.scatter_add_(0, row, one); 61 | return out; 62 | } 63 | 64 | std::pair batch_ball_query(at::Tensor support, at::Tensor query, 65 | at::Tensor support_batch, at::Tensor query_batch, 66 | float radius, int max_num, int mode, bool sorted) 67 | { 68 | CHECK_CONTIGUOUS(support); 69 | CHECK_CONTIGUOUS(query); 70 | CHECK_CONTIGUOUS(support_batch); 71 | 
CHECK_CONTIGUOUS(query_batch); 72 | 73 | at::Tensor idx; 74 | 75 | at::Tensor dist; 76 | std::vector neighbors_indices; 77 | std::vector neighbors_dists; 78 | 79 | auto options = torch::TensorOptions().dtype(torch::kLong).device(torch::kCPU); 80 | auto options_dist = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU); 81 | 82 | int max_count = 0; 83 | auto q_batch_access = query_batch.accessor(); 84 | auto s_batch_access = support_batch.accessor(); 85 | 86 | auto batch_size = q_batch_access[query_batch.size(0) - 1] + 1; 87 | TORCH_CHECK(batch_size == (s_batch_access[support_batch.size(0) - 1] + 1), 88 | "Both batches need to have the same number of samples.") 89 | 90 | query_batch = degree(query_batch, batch_size); 91 | query_batch = at::cat({at::zeros(1, query_batch.options()), query_batch.cumsum(0)}, 0); 92 | support_batch = degree(support_batch, batch_size); 93 | support_batch = at::cat({at::zeros(1, support_batch.options()), support_batch.cumsum(0)}, 0); 94 | std::vector query_batch_stl(query_batch.DATA_PTR(), 95 | query_batch.DATA_PTR() + query_batch.numel()); 96 | std::vector support_batch_stl(support_batch.DATA_PTR(), 97 | support_batch.DATA_PTR() + 98 | support_batch.numel()); 99 | 100 | AT_DISPATCH_ALL_TYPES(query.scalar_type(), "batch_radius_search", [&] { 101 | std::vector queries_stl(query.DATA_PTR(), 102 | query.DATA_PTR() + query.numel()); 103 | std::vector supports_stl(support.DATA_PTR(), 104 | support.DATA_PTR() + support.numel()); 105 | 106 | max_count = batch_nanoflann_neighbors( 107 | queries_stl, supports_stl, query_batch_stl, support_batch_stl, neighbors_indices, 108 | neighbors_dists, radius, max_num, mode, sorted); 109 | }); 110 | auto neighbors_dists_ptr = neighbors_dists.data(); 111 | int64_t* neighbors_indices_ptr = neighbors_indices.data(); 112 | 113 | if (mode == 0) 114 | { 115 | idx = 116 | torch::from_blob(neighbors_indices_ptr, {query.size(0), max_count}, options = options); 117 | dist = torch::from_blob(neighbors_dists_ptr, {query.size(0), max_count}, 118 | options = options_dist); 119 | } 120 | else if (mode == 1) 121 | { 122 | idx = torch::from_blob(neighbors_indices_ptr, {(int)neighbors_indices.size() / 2, 2}, 123 | options = options); 124 | dist = torch::from_blob(neighbors_dists_ptr, {(int)neighbors_indices.size() / 2, 1}, 125 | options = options_dist); 126 | } 127 | return std::make_pair(idx.clone(), dist.clone()); 128 | } 129 | 130 | std::pair dense_ball_query(at::Tensor support, at::Tensor query, 131 | float radius, int max_num, int mode, bool sorted) 132 | { 133 | CHECK_CONTIGUOUS(support); 134 | CHECK_CONTIGUOUS(query); 135 | 136 | int b = query.size(0); 137 | vector batch_idx; 138 | vector batch_dist; 139 | for (int i = 0; i < b; i++) 140 | { 141 | auto out_pair = ball_query(query[i], support[i], radius, max_num, mode, sorted); 142 | batch_idx.push_back(out_pair.first); 143 | batch_dist.push_back(out_pair.second); 144 | } 145 | auto out_idx = torch::stack(batch_idx); 146 | auto out_dist = torch::stack(batch_dist); 147 | return std::make_pair(out_idx, out_dist); 148 | } 149 | -------------------------------------------------------------------------------- /torch_points_kernels/torchpoints.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | import torch.nn as nn 4 | import sys 5 | from typing import Optional, Any, Tuple 6 | 7 | import torch_points_kernels.points_cpu as tpcpu 8 | from .knn import knn 9 | 10 | if torch.cuda.is_available(): 11 | 
import torch_points_kernels.points_cuda as tpcuda 12 | 13 | 14 | def furthest_point_sample(xyz, npoint): 15 | # type: (Any, torch.Tensor, int) -> torch.Tensor 16 | r""" 17 | Uses iterative furthest point sampling to select a set of npoint features that have the largest 18 | minimum distance 19 | 20 | Parameters 21 | ---------- 22 | xyz : torch.Tensor 23 | (B, N, 3) tensor where N > npoint 24 | npoint : int32 25 | number of features in the sampled set 26 | 27 | Returns 28 | ------- 29 | torch.Tensor 30 | (B, npoint) tensor containing the set 31 | """ 32 | if npoint > xyz.shape[1]: 33 | raise ValueError("cannot sample %i points from an input set of %i points" % (npoint, xyz.shape[1])) 34 | if xyz.is_cuda: 35 | return tpcuda.furthest_point_sampling(xyz, npoint) 36 | else: 37 | return tpcpu.fps(xyz, npoint, True) 38 | 39 | 40 | def three_nn(unknown, known): 41 | r""" 42 | Find the three nearest neighbors of unknown in known 43 | Parameters 44 | ---------- 45 | unknown : torch.Tensor 46 | (B, n, 3) tensor of unknown features 47 | known : torch.Tensor 48 | (B, m, 3) tensor of known features 49 | 50 | Returns 51 | ------- 52 | dist : torch.Tensor 53 | (B, n, 3) l2 distance to the three nearest neighbors 54 | idx : torch.Tensor 55 | (B, n, 3) index of 3 nearest neighbors 56 | """ 57 | if unknown.shape[1] < 3: 58 | raise ValueError("Not enough points. unknown should have at least 3 points.") 59 | if unknown.is_cuda: 60 | dist2, idx = tpcuda.three_nn(unknown, known) 61 | else: 62 | idx, dist2 = knn(known, unknown, 3) 63 | 64 | return torch.sqrt(dist2), idx 65 | 66 | 67 | class ThreeInterpolate(Function): 68 | @staticmethod 69 | def forward(ctx, features, idx, weight): 70 | # type: (Any, torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor 71 | B, c, m = features.size() 72 | n = idx.size(1) 73 | 74 | ctx.three_interpolate_for_backward = (idx, weight, m) 75 | 76 | if features.is_cuda: 77 | return tpcuda.three_interpolate(features, idx, weight) 78 | else: 79 | return tpcpu.knn_interpolate(features, idx, weight) 80 | 81 | @staticmethod 82 | def backward(ctx, grad_out): 83 | # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor] 84 | r""" 85 | Parameters 86 | ---------- 87 | grad_out : torch.Tensor 88 | (B, c, n) tensor with gradients of outputs 89 | 90 | Returns 91 | ------- 92 | grad_features : torch.Tensor 93 | (B, c, m) tensor with gradients of features 94 | 95 | None 96 | 97 | None 98 | """ 99 | idx, weight, m = ctx.three_interpolate_for_backward 100 | 101 | if grad_out.is_cuda: 102 | grad_features = tpcuda.three_interpolate_grad(grad_out.contiguous(), idx, weight, m) 103 | else: 104 | grad_features = tpcpu.knn_interpolate_grad(grad_out.contiguous(), idx, weight, m) 105 | 106 | return grad_features, None, None 107 | 108 | 109 | def three_interpolate(features, idx, weight): 110 | r""" 111 | Performs weighted linear interpolation on 3 features 112 | Parameters 113 | ---------- 114 | features : torch.Tensor 115 | (B, c, m) Feature descriptors to be interpolated from 116 | idx : torch.Tensor 117 | (B, n, 3) three nearest neighbors of the target features in features 118 | weight : torch.Tensor 119 | (B, n, 3) weights 120 | 121 | Returns 122 | ------- 123 | torch.Tensor 124 | (B, c, n) tensor of the interpolated features 125 | """ 126 | return ThreeInterpolate.apply(features, idx, weight) 127 | 128 | 129 | def grouping_operation(features, idx): 130 | r""" 131 | Parameters 132 | ---------- 133 | features : torch.Tensor 134 | (B, C, N) tensor of features to group 135 | idx : 
torch.Tensor 136 | (B, npoint, nsample) tensor containing the indices of features to group with 137 | 138 | Returns 139 | ------- 140 | torch.Tensor 141 | (B, C, npoint, nsample) tensor 142 | """ 143 | all_idx = idx.reshape(idx.shape[0], -1) 144 | all_idx = all_idx.unsqueeze(1).repeat(1, features.shape[1], 1) 145 | grouped_features = features.gather(2, all_idx) 146 | return grouped_features.reshape(idx.shape[0], features.shape[1], idx.shape[1], idx.shape[2]) 147 | 148 | 149 | def ball_query_dense(radius, nsample, xyz, new_xyz, batch_xyz=None, batch_new_xyz=None, sort=False): 150 | # type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor 151 | if new_xyz.is_cuda: 152 | if sort: 153 | raise NotImplementedError("CUDA version does not sort the neighbors") 154 | ind, dist = tpcuda.ball_query_dense(new_xyz, xyz, radius, nsample) 155 | else: 156 | ind, dist = tpcpu.dense_ball_query(new_xyz, xyz, radius, nsample, mode=0, sorted=sort) 157 | return ind, dist 158 | 159 | 160 | def ball_query_partial_dense(radius, nsample, x, y, batch_x, batch_y, sort=False): 161 | # type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor 162 | if x.is_cuda: 163 | if sort: 164 | raise NotImplementedError("CUDA version does not sort the neighbors") 165 | ind, dist = tpcuda.ball_query_partial_dense(x, y, batch_x, batch_y, radius, nsample) 166 | else: 167 | ind, dist = tpcpu.batch_ball_query(x, y, batch_x, batch_y, radius, nsample, mode=0, sorted=sort) 168 | return ind, dist 169 | 170 | 171 | def ball_query( 172 | radius: float, 173 | nsample: int, 174 | x: torch.Tensor, 175 | y: torch.Tensor, 176 | mode: Optional[str] = "dense", 177 | batch_x: Optional[torch.Tensor] = None, 178 | batch_y: Optional[torch.Tensor] = None, 179 | sort: Optional[bool] = False, 180 | ) -> torch.Tensor: 181 | """ 182 | Arguments: 183 | radius {float} -- radius of the balls 184 | nsample {int} -- maximum number of features in the balls 185 | x {torch.Tensor} -- 186 | (M, 3) [partial_dense] or (B, M, 3) [dense] xyz coordinates of the features 187 | y {torch.Tensor} -- 188 | (npoint, 3) [partial_dense] or (B, npoint, 3) [dense] centers of the ball query 189 | mode {str} -- switch between "dense" or "partial_dense" data layout 190 | 191 | Keyword Arguments: 192 | batch_x -- (M, ) [partial_dense] or (B, M, 3) [dense] Contains indices indicating which batch each point of x belongs to. 193 | batch_y -- (N, ) Contains indices indicating which batch each point of y belongs to 194 | sort -- bool, whether the neighbours are sorted or not (closest first) 195 | 196 | Returns: 197 | idx: (npoint, nsample) or (B, npoint, nsample) [dense] It contains the indices of the elements of x within radius distance of y 198 | dist: (N, nsample) or (B, npoint, nsample) Default value: -1. 
199 | It contains the squared distance of the element within x at radius distance to y 200 | """ 201 | if mode is None: 202 | raise Exception('The mode should be defined within ["partial_dense | dense"]') 203 | 204 | if mode.lower() == "partial_dense": 205 | if (batch_x is None) or (batch_y is None): 206 | raise Exception("batch_x and batch_y should be provided") 207 | assert x.size(0) == batch_x.size(0) 208 | assert y.size(0) == batch_y.size(0) 209 | assert x.dim() == 2 210 | return ball_query_partial_dense(radius, nsample, x, y, batch_x, batch_y, sort=sort) 211 | 212 | elif mode.lower() == "dense": 213 | if (batch_x is not None) or (batch_y is not None): 214 | raise Exception("batch_x and batch_y should not be provided") 215 | assert x.dim() == 3 216 | return ball_query_dense(radius, nsample, x, y, sort=sort) 217 | else: 218 | raise Exception("unrecognized mode {}".format(mode)) 219 | -------------------------------------------------------------------------------- /test/test_ballquerry.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import numpy.testing as npt 4 | import numpy as np 5 | from sklearn.neighbors import KDTree 6 | import os 7 | import sys 8 | 9 | ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") 10 | sys.path.insert(0, ROOT) 11 | 12 | from test import run_if_cuda 13 | from torch_points_kernels import ball_query 14 | 15 | 16 | class TestBall(unittest.TestCase): 17 | @run_if_cuda 18 | def test_simple_gpu(self): 19 | a = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]], [[0, 0, 0], [1, 0, 0], [2, 0, 0]]]).to(torch.float).cuda() 20 | b = torch.tensor([[[0, 0, 0]], [[3, 0, 0]]]).to(torch.float).cuda() 21 | idx, dist = ball_query(1.01, 2, a, b) 22 | torch.testing.assert_allclose(idx.cpu(), torch.tensor([[[0, 1]], [[2, 2]]])) 23 | torch.testing.assert_allclose(dist.cpu(), torch.tensor([[[0, 1]], [[1, -1]]]).float()) 24 | 25 | def test_simple_cpu(self): 26 | a = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 0, 0]], [[0, 0, 0], [1, 0, 0], [2, 0, 0]]]).to(torch.float) 27 | b = torch.tensor([[[0, 0, 0]], [[3, 0, 0]]]).to(torch.float) 28 | idx, dist = ball_query(1.01, 2, a, b, sort=True) 29 | torch.testing.assert_allclose(idx, torch.tensor([[[0, 1]], [[2, 2]]])) 30 | torch.testing.assert_allclose(dist, torch.tensor([[[0, 1]], [[1, -1]]]).float()) 31 | 32 | a = torch.tensor([[[0, 0, 0], [1, 0, 0], [1, 1, 0]]]).to(torch.float) 33 | idx, dist = ball_query(1.01, 3, a, a, sort=True) 34 | torch.testing.assert_allclose(idx, torch.tensor([[[0, 1, 0], [1, 0, 2], [2, 1, 2]]])) 35 | 36 | @run_if_cuda 37 | def test_larger_gpu(self): 38 | a = torch.randn(32, 4096, 3).to(torch.float).cuda() 39 | idx, dist = ball_query(1, 64, a, a) 40 | self.assertGreaterEqual(idx.min(), 0) 41 | 42 | @run_if_cuda 43 | def test_cpu_gpu_equality(self): 44 | a = torch.randn(5, 1000, 3) 45 | b = torch.randn(5, 500, 3) 46 | res_cpu = ball_query(1, 500, a, b)[0].detach().numpy() 47 | res_cuda = ball_query(1, 500, a.cuda(), b.cuda())[0].cpu().detach().numpy() 48 | for i in range(b.shape[0]): 49 | for j in range(b.shape[1]): 50 | # Because it is not necessary the same order 51 | assert set(res_cpu[i][j]) == set(res_cuda[i][j]) 52 | 53 | res_cpu = ball_query(0.01, 500, a, b)[0].detach().numpy() 54 | res_cuda = ball_query(0.01, 500, a.cuda(), b.cuda())[0].cpu().detach().numpy() 55 | for i in range(b.shape[0]): 56 | for j in range(b.shape[1]): 57 | # Because it is not necessary the same order 58 | assert set(res_cpu[i][j]) == 
set(res_cuda[i][j]) 59 | 60 | 61 | class TestBallPartial(unittest.TestCase): 62 | @run_if_cuda 63 | def test_simple_gpu(self): 64 | x = torch.tensor([[10, 0, 0], [0.1, 0, 0], [0.2, 0, 0], [0.1, 0, 0]]).to(torch.float).cuda() 65 | y = torch.tensor([[0, 0, 0]]).to(torch.float).cuda() 66 | batch_x = torch.from_numpy(np.asarray([0, 0, 0, 1])).long().cuda() 67 | batch_y = torch.from_numpy(np.asarray([0])).long().cuda() 68 | 69 | idx, dist2 = ball_query(0.2, 4, x, y, mode="PARTIAL_DENSE", batch_x=batch_x, batch_y=batch_y) 70 | 71 | idx = idx.detach().cpu().numpy() 72 | dist2 = dist2.detach().cpu().numpy() 73 | 74 | idx_answer = np.asarray([[1, 2, -1, -1]]) 75 | dist2_answer = np.asarray([[0.0100, 0.04, -1, -1]]).astype(np.float32) 76 | 77 | npt.assert_array_almost_equal(idx, idx_answer) 78 | npt.assert_array_almost_equal(dist2, dist2_answer) 79 | 80 | def test_simple_cpu(self): 81 | x = torch.tensor([[10, 0, 0], [0.1, 0, 0], [10, 0, 0], [10.1, 0, 0]]).to(torch.float) 82 | y = torch.tensor([[0, 0, 0]]).to(torch.float) 83 | 84 | batch_x = torch.from_numpy(np.asarray([0, 0, 0, 0])).long() 85 | batch_y = torch.from_numpy(np.asarray([0])).long() 86 | 87 | idx, dist2 = ball_query(1.0, 2, x, y, mode="PARTIAL_DENSE", batch_x=batch_x, batch_y=batch_y) 88 | 89 | idx = idx.detach().cpu().numpy() 90 | dist2 = dist2.detach().cpu().numpy() 91 | 92 | idx_answer = np.asarray([[1, -1]]) 93 | dist2_answer = np.asarray([[0.0100, -1.0000]]).astype(np.float32) 94 | 95 | npt.assert_array_almost_equal(idx, idx_answer) 96 | npt.assert_array_almost_equal(dist2, dist2_answer) 97 | 98 | def test_breaks(self): 99 | x = torch.tensor([[10, 0, 0], [0.1, 0, 0], [10, 0, 0], [10.1, 0, 0]]).to(torch.float) 100 | y = torch.tensor([[0, 0, 0]]).to(torch.float) 101 | 102 | batch_x = torch.from_numpy(np.asarray([0, 0, 1, 1])).long() 103 | batch_y = torch.from_numpy(np.asarray([0])).long() 104 | 105 | with self.assertRaises(RuntimeError): 106 | idx, dist2 = ball_query(1.0, 2, x, y, mode="PARTIAL_DENSE", batch_x=batch_x, batch_y=batch_y) 107 | 108 | def test_random_cpu(self, cuda=False): 109 | a = torch.randn(100, 3).to(torch.float) 110 | b = torch.randn(50, 3).to(torch.float) 111 | batch_a = torch.tensor([0 for i in range(a.shape[0] // 2)] + [1 for i in range(a.shape[0] // 2, a.shape[0])]) 112 | batch_b = torch.tensor([0 for i in range(b.shape[0] // 2)] + [1 for i in range(b.shape[0] // 2, b.shape[0])]) 113 | R = 1 114 | 115 | idx, dist = ball_query( 116 | R, 117 | 15, 118 | a, 119 | b, 120 | mode="PARTIAL_DENSE", 121 | batch_x=batch_a, 122 | batch_y=batch_b, 123 | sort=True, 124 | ) 125 | idx1, dist = ball_query( 126 | R, 127 | 15, 128 | a, 129 | b, 130 | mode="PARTIAL_DENSE", 131 | batch_x=batch_a, 132 | batch_y=batch_b, 133 | sort=True, 134 | ) 135 | torch.testing.assert_allclose(idx1, idx) 136 | with self.assertRaises(AssertionError): 137 | idx, dist = ball_query( 138 | R, 139 | 15, 140 | a, 141 | b, 142 | mode="PARTIAL_DENSE", 143 | batch_x=batch_a, 144 | batch_y=batch_b, 145 | sort=False, 146 | ) 147 | idx1, dist = ball_query( 148 | R, 149 | 15, 150 | a, 151 | b, 152 | mode="PARTIAL_DENSE", 153 | batch_x=batch_a, 154 | batch_y=batch_b, 155 | sort=False, 156 | ) 157 | torch.testing.assert_allclose(idx1, idx) 158 | 159 | self.assertEqual(idx.shape[0], b.shape[0]) 160 | self.assertEqual(dist.shape[0], b.shape[0]) 161 | self.assertLessEqual(idx.max().item(), len(batch_a)) 162 | 163 | # Comparison to see if we have the same result 164 | tree = KDTree(a.detach().numpy()) 165 | idx3_sk = tree.query_radius(b.detach().numpy(), r=R) 166 | 
i = np.random.randint(len(batch_b)) 167 | for p in idx[i].detach().numpy(): 168 | if p >= 0 and p < len(batch_a): 169 | assert p in idx3_sk[i] 170 | 171 | @run_if_cuda 172 | def test_random_gpu(self): 173 | a = torch.randn(100, 3).to(torch.float).cuda() 174 | b = torch.randn(50, 3).to(torch.float).cuda() 175 | batch_a = torch.tensor( 176 | [0 for i in range(a.shape[0] // 2)] + [1 for i in range(a.shape[0] // 2, a.shape[0])] 177 | ).cuda() 178 | batch_b = torch.tensor( 179 | [0 for i in range(b.shape[0] // 2)] + [1 for i in range(b.shape[0] // 2, b.shape[0])] 180 | ).cuda() 181 | R = 1 182 | 183 | idx, dist = ball_query( 184 | R, 185 | 15, 186 | a, 187 | b, 188 | mode="PARTIAL_DENSE", 189 | batch_x=batch_a, 190 | batch_y=batch_b, 191 | sort=False, 192 | ) 193 | 194 | # Comparison to see if we have the same result 195 | tree = KDTree(a.cpu().detach().numpy()) 196 | idx3_sk = tree.query_radius(b.cpu().detach().numpy(), r=R) 197 | i = np.random.randint(len(batch_b)) 198 | for p in idx[i].cpu().detach().numpy(): 199 | if p >= 0 and p < len(batch_a): 200 | assert p in idx3_sk[i] 201 | 202 | 203 | if __name__ == "__main__": 204 | unittest.main() 205 | -------------------------------------------------------------------------------- /cuda/src/cubic_feature_sampling_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "cuda_utils.h" 7 | 8 | #define CUDA_NUM_THREADS 512 9 | 10 | // Computer the number of threads needed in GPU 11 | inline int get_n_threads(int n) 12 | { 13 | const int pow_2 = std::log(static_cast(n)) / std::log(2.0); 14 | return max(min(1 << pow_2, CUDA_NUM_THREADS), 1); 15 | } 16 | 17 | __device__ int compute_index(int offset_x, int offset_y, int offset_z, int scale) 18 | { 19 | return offset_x * scale * scale + offset_y * scale + offset_z; 20 | } 21 | 22 | template 23 | __global__ void cubic_feature_sampling_kernel(int scale, int neighborhood_size, int n_vertices, 24 | int n_pts, int n_cubic_channels, 25 | const scalar_t* __restrict__ ptcloud, 26 | const scalar_t* __restrict__ cubic_features, 27 | scalar_t* __restrict__ point_features, 28 | int* __restrict__ grid_pt_indexes) 29 | { 30 | int batch_index = blockIdx.x; 31 | int index = threadIdx.x; 32 | int stride = blockDim.x; 33 | int cub_scale = scale * scale * scale; 34 | 35 | ptcloud += batch_index * n_pts * 3; 36 | cubic_features += batch_index * n_cubic_channels * cub_scale; 37 | point_features += batch_index * n_pts * n_vertices * n_cubic_channels; 38 | grid_pt_indexes += batch_index * n_pts * n_vertices; 39 | 40 | for (int i = index; i < n_pts; i += stride) 41 | { 42 | scalar_t pt_x = ptcloud[i * 3 + 0]; 43 | scalar_t pt_y = ptcloud[i * 3 + 1]; 44 | scalar_t pt_z = ptcloud[i * 3 + 2]; 45 | 46 | int lower_x = std::floor(pt_x); 47 | int upper_x = std::ceil(pt_x); 48 | if (lower_x == upper_x) 49 | { 50 | upper_x += 1; 51 | } 52 | int lower_y = std::floor(pt_y); 53 | int upper_y = std::ceil(pt_y); 54 | if (lower_y == upper_y) 55 | { 56 | upper_y += 1; 57 | } 58 | int lower_z = std::floor(pt_z); 59 | int upper_z = std::ceil(pt_z); 60 | if (lower_z == upper_z) 61 | { 62 | upper_z += 1; 63 | } 64 | 65 | int ns = neighborhood_size - 1; 66 | int vertex_idx = 0; 67 | for (int j = lower_x - ns; j <= upper_x + ns; ++j) 68 | { 69 | for (int k = lower_y - ns; k <= upper_y + ns; ++k) 70 | { 71 | for (int m = lower_z - ns; m <= upper_z + ns; ++m) 72 | { 73 | if (j < 0 || j >= scale || k < 0 || k >= scale || m < 0 || m >= scale) 74 | { 75 | 
// Ignore points lies out of the grid 76 | grid_pt_indexes[i * n_vertices + vertex_idx++] = -1; 77 | } 78 | else 79 | { 80 | // Calcuating indexes for adjacent vertices 81 | grid_pt_indexes[i * n_vertices + vertex_idx++] = 82 | compute_index(j, k, m, scale); 83 | } 84 | } 85 | } 86 | } 87 | 88 | // Gather Features 89 | for (int j = 0; j < n_vertices; ++j) 90 | { 91 | for (int k = 0; k < n_cubic_channels; ++k) 92 | { 93 | int vertex_idx = grid_pt_indexes[i * n_vertices + j]; 94 | if (vertex_idx == -1) 95 | { 96 | continue; 97 | } 98 | int feature_idx = i * n_vertices * n_cubic_channels + j * n_cubic_channels + k; 99 | scalar_t feature_val = cubic_features[k * cub_scale + vertex_idx]; 100 | point_features[feature_idx] = feature_val; 101 | } 102 | } 103 | } 104 | } 105 | 106 | std::vector cubic_feature_sampling_kernel_wrapper(int scale, int neighborhood_size, 107 | torch::Tensor ptcloud, 108 | torch::Tensor cubic_features, 109 | cudaStream_t stream) 110 | { 111 | int batch_size = ptcloud.size(0); 112 | int n_pts = ptcloud.size(1); 113 | int n_cubic_channels = cubic_features.size(1); 114 | 115 | int n_vertices = std::pow(neighborhood_size * 2, 3); 116 | torch::Tensor point_features = torch::zeros({batch_size, n_pts, n_vertices, n_cubic_channels}, 117 | torch::CUDA(ptcloud.scalar_type())); 118 | torch::Tensor grid_pt_indexes = 119 | torch::zeros({batch_size, n_pts, n_vertices}, torch::CUDA(torch::kInt)); 120 | 121 | AT_DISPATCH_FLOATING_TYPES( 122 | ptcloud.scalar_type(), "cubic_feature_sampling_cuda", ([&] { 123 | cubic_feature_sampling_kernel<<>>( 124 | scale, neighborhood_size, n_vertices, n_pts, n_cubic_channels, 125 | ptcloud.data_ptr(), cubic_features.data_ptr(), 126 | point_features.data_ptr(), grid_pt_indexes.data_ptr()); 127 | })); 128 | 129 | cudaError_t err = cudaGetLastError(); 130 | if (err != cudaSuccess) 131 | { 132 | printf("Error in cubic_feature_sampling_kernel_wrapper: %s\n", cudaGetErrorString(err)); 133 | } 134 | return {point_features, grid_pt_indexes}; 135 | } 136 | 137 | template 138 | __global__ void cubic_feature_sampling_grad_kernel(int scale, int neighborhood_size, int n_vertices, 139 | int n_pts, int n_cubic_channels, 140 | const scalar_t* __restrict__ grad_point_features, 141 | const int* __restrict__ grid_pt_indexes, 142 | scalar_t* __restrict__ grad_ptcloud, 143 | scalar_t* __restrict__ grad_cubic_features) 144 | { 145 | int batch_index = blockIdx.x; 146 | int index = threadIdx.x; 147 | int stride = blockDim.x; 148 | int cub_scale = scale * scale * scale; 149 | 150 | grad_point_features += batch_index * n_pts * n_vertices * n_cubic_channels; 151 | grid_pt_indexes += batch_index * n_pts * n_vertices; 152 | grad_ptcloud += batch_index * n_pts * 3; 153 | grad_cubic_features += batch_index * n_cubic_channels * cub_scale; 154 | 155 | for (int i = index; i < n_pts; i += stride) 156 | { 157 | for (int j = 0; j < n_vertices; ++j) 158 | { 159 | int vertex_idx = grid_pt_indexes[i * n_vertices + j]; 160 | if (vertex_idx == -1) 161 | { 162 | continue; 163 | } 164 | for (int k = 0; k < n_cubic_channels; ++k) 165 | { 166 | int grad_idx = i * n_vertices * n_cubic_channels + j * n_cubic_channels + k; 167 | scalar_t grad_val = grad_point_features[grad_idx]; 168 | // Fix bugs: the gradients of ceil and floor functions are zeros. 
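// As a consequence, only grad_cubic_features is accumulated below; the grad_ptcloud
// updates stay commented out and the returned point-cloud gradient remains zero.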
169 | // Ref: https://github.com/tensorflow/tensorflow/issues/897 170 | // atomicAdd(&(grad_ptcloud[i * 3 + 0]), grad_val); 171 | // atomicAdd(&(grad_ptcloud[i * 3 + 1]), grad_val); 172 | // atomicAdd(&(grad_ptcloud[i * 3 + 2]), grad_val); 173 | atomicAdd(&(grad_cubic_features[k * cub_scale + vertex_idx]), grad_val); 174 | } 175 | } 176 | } 177 | } 178 | 179 | std::vector 180 | cubic_feature_sampling_grad_kernel_wrapper(int scale, int neighborhood_size, 181 | torch::Tensor grad_point_features, 182 | torch::Tensor grid_pt_indexes, cudaStream_t stream) 183 | { 184 | int batch_size = grad_point_features.size(0); 185 | int n_cubic_channels = grad_point_features.size(3); 186 | int n_pts = grid_pt_indexes.size(1); 187 | int n_vertices = std::pow(neighborhood_size * 2, 3); 188 | 189 | torch::Tensor grad_ptcloud = 190 | torch::zeros({batch_size, n_pts, 3}, torch::CUDA(grad_point_features.scalar_type())); 191 | torch::Tensor grad_cubic_features = 192 | torch::zeros({batch_size, n_cubic_channels, scale, scale, scale}, 193 | torch::CUDA(grad_point_features.scalar_type())); 194 | 195 | AT_DISPATCH_FLOATING_TYPES( 196 | grad_point_features.scalar_type(), "cubic_feature_sampling_grad_cuda", ([&] { 197 | cubic_feature_sampling_grad_kernel<<>>( 198 | scale, neighborhood_size, n_vertices, n_pts, n_cubic_channels, 199 | grad_point_features.data_ptr(), grid_pt_indexes.data_ptr(), 200 | grad_ptcloud.data_ptr(), grad_cubic_features.data_ptr()); 201 | })); 202 | 203 | cudaError_t err = cudaGetLastError(); 204 | if (err != cudaSuccess) 205 | { 206 | printf("Error in cubic_feature_sampling_grad_kernel_wrapper: %s\n", 207 | cudaGetErrorString(err)); 208 | } 209 | return {grad_ptcloud, grad_cubic_features}; 210 | } 211 | -------------------------------------------------------------------------------- /cuda/src/chamfer_dist_gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "cuda_utils.h" 6 | #include 7 | 8 | template 9 | __global__ void chamfer_dist_kernel(int batch_size, int n, const scalar_t* __restrict__ xyz1, int m, 10 | const scalar_t* __restrict__ xyz2, scalar_t* __restrict__ dist, 11 | int* indexes) 12 | { 13 | const int batch = 512; 14 | __shared__ scalar_t buf[batch * 3]; 15 | for (int i = blockIdx.x; i < batch_size; i += gridDim.x) 16 | { 17 | for (int k2 = 0; k2 < m; k2 += batch) 18 | { 19 | int end_k = min(m, k2 + batch) - k2; 20 | for (int j = threadIdx.x; j < end_k * 3; j += blockDim.x) 21 | { 22 | buf[j] = xyz2[(i * m + k2) * 3 + j]; 23 | } 24 | __syncthreads(); 25 | for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n; j += blockDim.x * gridDim.y) 26 | { 27 | scalar_t x1 = xyz1[(i * n + j) * 3 + 0]; 28 | scalar_t y1 = xyz1[(i * n + j) * 3 + 1]; 29 | scalar_t z1 = xyz1[(i * n + j) * 3 + 2]; 30 | scalar_t best_dist = 0; 31 | int best_dist_index = 0; 32 | int end_ka = end_k - (end_k & 3); 33 | if (end_ka == batch) 34 | { 35 | for (int k = 0; k < batch; k += 4) 36 | { 37 | { 38 | scalar_t x2 = buf[k * 3 + 0] - x1; 39 | scalar_t y2 = buf[k * 3 + 1] - y1; 40 | scalar_t z2 = buf[k * 3 + 2] - z1; 41 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 42 | 43 | if (k == 0 || dist < best_dist) 44 | { 45 | best_dist = dist; 46 | best_dist_index = k + k2; 47 | } 48 | } 49 | { 50 | scalar_t x2 = buf[k * 3 + 3] - x1; 51 | scalar_t y2 = buf[k * 3 + 4] - y1; 52 | scalar_t z2 = buf[k * 3 + 5] - z1; 53 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 54 | if (dist < best_dist) 55 | { 56 | best_dist = dist; 57 | best_dist_index 
= k + k2 + 1; 58 | } 59 | } 60 | { 61 | scalar_t x2 = buf[k * 3 + 6] - x1; 62 | scalar_t y2 = buf[k * 3 + 7] - y1; 63 | scalar_t z2 = buf[k * 3 + 8] - z1; 64 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 65 | if (dist < best_dist) 66 | { 67 | best_dist = dist; 68 | best_dist_index = k + k2 + 2; 69 | } 70 | } 71 | { 72 | scalar_t x2 = buf[k * 3 + 9] - x1; 73 | scalar_t y2 = buf[k * 3 + 10] - y1; 74 | scalar_t z2 = buf[k * 3 + 11] - z1; 75 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 76 | if (dist < best_dist) 77 | { 78 | best_dist = dist; 79 | best_dist_index = k + k2 + 3; 80 | } 81 | } 82 | } 83 | } 84 | else 85 | { 86 | for (int k = 0; k < end_ka; k += 4) 87 | { 88 | { 89 | scalar_t x2 = buf[k * 3 + 0] - x1; 90 | scalar_t y2 = buf[k * 3 + 1] - y1; 91 | scalar_t z2 = buf[k * 3 + 2] - z1; 92 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 93 | if (k == 0 || dist < best_dist) 94 | { 95 | best_dist = dist; 96 | best_dist_index = k + k2; 97 | } 98 | } 99 | { 100 | scalar_t x2 = buf[k * 3 + 3] - x1; 101 | scalar_t y2 = buf[k * 3 + 4] - y1; 102 | scalar_t z2 = buf[k * 3 + 5] - z1; 103 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 104 | if (dist < best_dist) 105 | { 106 | best_dist = dist; 107 | best_dist_index = k + k2 + 1; 108 | } 109 | } 110 | { 111 | scalar_t x2 = buf[k * 3 + 6] - x1; 112 | scalar_t y2 = buf[k * 3 + 7] - y1; 113 | scalar_t z2 = buf[k * 3 + 8] - z1; 114 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 115 | if (dist < best_dist) 116 | { 117 | best_dist = dist; 118 | best_dist_index = k + k2 + 2; 119 | } 120 | } 121 | { 122 | scalar_t x2 = buf[k * 3 + 9] - x1; 123 | scalar_t y2 = buf[k * 3 + 10] - y1; 124 | scalar_t z2 = buf[k * 3 + 11] - z1; 125 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 126 | if (dist < best_dist) 127 | { 128 | best_dist = dist; 129 | best_dist_index = k + k2 + 3; 130 | } 131 | } 132 | } 133 | } 134 | for (int k = end_ka; k < end_k; k++) 135 | { 136 | scalar_t x2 = buf[k * 3 + 0] - x1; 137 | scalar_t y2 = buf[k * 3 + 1] - y1; 138 | scalar_t z2 = buf[k * 3 + 2] - z1; 139 | scalar_t dist = x2 * x2 + y2 * y2 + z2 * z2; 140 | if (k == 0 || dist < best_dist) 141 | { 142 | best_dist = dist; 143 | best_dist_index = k + k2; 144 | } 145 | } 146 | if (k2 == 0 || dist[(i * n + j)] > best_dist) 147 | { 148 | dist[(i * n + j)] = best_dist; 149 | indexes[(i * n + j)] = best_dist_index; 150 | } 151 | } 152 | __syncthreads(); 153 | } 154 | } 155 | } 156 | 157 | std::vector chamfer_dist_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2) 158 | { 159 | const int batch_size = xyz1.size(0); 160 | const int n = xyz1.size(1); // num_points point cloud A 161 | const int m = xyz2.size(1); // num_points point cloud B 162 | torch::Tensor dist1 = torch::zeros({batch_size, n}, torch::CUDA(xyz1.scalar_type())); 163 | torch::Tensor dist2 = torch::zeros({batch_size, m}, torch::CUDA(xyz1.scalar_type())); 164 | torch::Tensor idx1 = torch::zeros({batch_size, n}, torch::CUDA(torch::kInt)); 165 | torch::Tensor idx2 = torch::zeros({batch_size, m}, torch::CUDA(torch::kInt)); 166 | 167 | AT_DISPATCH_FLOATING_TYPES( 168 | xyz1.scalar_type(), "chamfer_dist_cuda", ([&] { 169 | chamfer_dist_kernel<<>>( 170 | batch_size, n, xyz1.data_ptr(), m, xyz2.data_ptr(), 171 | dist1.data_ptr(), idx1.data_ptr()); 172 | 173 | chamfer_dist_kernel<<>>( 174 | batch_size, m, xyz2.data_ptr(), n, xyz1.data_ptr(), 175 | dist2.data_ptr(), idx2.data_ptr()); 176 | })); 177 | 178 | cudaError_t err = cudaGetLastError(); 179 | if (err != cudaSuccess) 180 | { 181 | printf("Error in chamfer_dist_kernel_wrapper: %s\n", 
cudaGetErrorString(err)); 182 | } 183 | return {dist1, dist2, idx1, idx2}; 184 | } 185 | 186 | template 187 | __global__ void chamfer_dist_grad_kernel(int b, int n, const scalar_t* __restrict__ xyz1, int m, 188 | const scalar_t* __restrict__ xyz2, 189 | const scalar_t* __restrict__ grad_dist1, const int* idx1, 190 | scalar_t* __restrict__ grad_xyz1, 191 | scalar_t* __restrict__ grad_xyz2) 192 | { 193 | for (int i = blockIdx.x; i < b; i += gridDim.x) 194 | { 195 | for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n; j += blockDim.x * gridDim.y) 196 | { 197 | scalar_t x1 = xyz1[(i * n + j) * 3 + 0]; 198 | scalar_t y1 = xyz1[(i * n + j) * 3 + 1]; 199 | scalar_t z1 = xyz1[(i * n + j) * 3 + 2]; 200 | int j2 = idx1[i * n + j]; 201 | scalar_t x2 = xyz2[(i * m + j2) * 3 + 0]; 202 | scalar_t y2 = xyz2[(i * m + j2) * 3 + 1]; 203 | scalar_t z2 = xyz2[(i * m + j2) * 3 + 2]; 204 | scalar_t g = grad_dist1[i * n + j] * 2; 205 | atomicAdd(&(grad_xyz1[(i * n + j) * 3 + 0]), g * (x1 - x2)); 206 | atomicAdd(&(grad_xyz1[(i * n + j) * 3 + 1]), g * (y1 - y2)); 207 | atomicAdd(&(grad_xyz1[(i * n + j) * 3 + 2]), g * (z1 - z2)); 208 | atomicAdd(&(grad_xyz2[(i * m + j2) * 3 + 0]), -(g * (x1 - x2))); 209 | atomicAdd(&(grad_xyz2[(i * m + j2) * 3 + 1]), -(g * (y1 - y2))); 210 | atomicAdd(&(grad_xyz2[(i * m + j2) * 3 + 2]), -(g * (z1 - z2))); 211 | } 212 | } 213 | } 214 | 215 | std::vector chamfer_dist_grad_kernel_wrapper(torch::Tensor xyz1, torch::Tensor xyz2, 216 | torch::Tensor idx1, torch::Tensor idx2, 217 | torch::Tensor grad_dist1, 218 | torch::Tensor grad_dist2) 219 | { 220 | const int batch_size = xyz1.size(0); 221 | const int n = xyz1.size(1); // num_points point cloud A 222 | const int m = xyz2.size(1); // num_points point cloud B 223 | torch::Tensor grad_xyz1 = torch::zeros_like(xyz1); 224 | torch::Tensor grad_xyz2 = torch::zeros_like(xyz2); 225 | 226 | AT_DISPATCH_FLOATING_TYPES( 227 | xyz1.scalar_type(), "chamfer_dist_grad_cuda", ([&] { 228 | chamfer_dist_grad_kernel<<>>( 229 | batch_size, n, xyz1.data_ptr(), m, xyz2.data_ptr(), 230 | grad_dist1.data_ptr(), idx1.data_ptr(), 231 | grad_xyz1.data_ptr(), grad_xyz2.data_ptr()); 232 | 233 | chamfer_dist_grad_kernel<<>>( 234 | batch_size, m, xyz2.data_ptr(), n, xyz1.data_ptr(), 235 | grad_dist2.data_ptr(), idx2.data_ptr(), 236 | grad_xyz2.data_ptr(), grad_xyz1.data_ptr()); 237 | })); 238 | 239 | cudaError_t err = cudaGetLastError(); 240 | if (err != cudaSuccess) 241 | { 242 | printf("Error in chamfer_dist_grad_kernel_wrapper: %s\n", cudaGetErrorString(err)); 243 | } 244 | return {grad_xyz1, grad_xyz2}; 245 | } 246 | -------------------------------------------------------------------------------- /cpu/src/neighbors.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Taken from https://github.com/HuguesTHOMAS/KPConv 3 | 4 | #include "neighbors.h" 5 | #include 6 | #include 7 | 8 | template 9 | int nanoflann_neighbors(vector& queries, vector& supports, 10 | vector& neighbors_indices, vector& dists, float radius, 11 | int max_num, int mode, bool sorted) 12 | { 13 | // Initiate variables 14 | // ****************** 15 | std::random_device rd; 16 | std::mt19937 g(rd()); 17 | 18 | // square radius 19 | const float search_radius = static_cast(radius * radius); 20 | 21 | // indices 22 | int i0 = 0; 23 | 24 | // Counting vector 25 | size_t max_count = 1; 26 | 27 | // Nanoflann related variables 28 | // *************************** 29 | 30 | // CLoud variable 31 | PointCloud pcd; 32 | pcd.set(supports); 33 | 34 | // Cloud 
35 | PointCloud<scalar_t> pcd_query;
36 | pcd_query.set(queries);
37 | 
38 | // Tree parameters
39 | nanoflann::KDTreeSingleIndexAdaptorParams tree_params(15 /* max leaf */);
40 | 
41 | // KDTree type definition
42 | typedef nanoflann::KDTreeSingleIndexAdaptor<
43 | nanoflann::L2_Simple_Adaptor<scalar_t, PointCloud<scalar_t>>, PointCloud<scalar_t>, 3>
44 | my_kd_tree_t;
45 | 
46 | // Pointer to trees
47 | std::unique_ptr<my_kd_tree_t> index(new my_kd_tree_t(3, pcd, tree_params));
48 | index->buildIndex();
49 | // Search neighbors indices
50 | // ***********************
51 | 
52 | // Search params
53 | nanoflann::SearchParams search_params;
54 | search_params.sorted = sorted;
55 | auto num_query_points = pcd_query.get_point_count();
56 | std::vector<std::vector<std::pair<size_t, scalar_t>>> list_matches(num_query_points);
57 | 
58 | for (size_t i = 0; i < num_query_points; i++)
59 | {
60 | // Find neighbors
61 | list_matches[i0].reserve(max_count);
62 | std::vector<std::pair<size_t, scalar_t>> ret_matches;
63 | 
64 | const size_t nMatches = index->radiusSearch(pcd_query.get_point_ptr(i), search_radius,
65 | ret_matches, search_params);
66 | if (nMatches == 0)
67 | list_matches[i0] = {std::make_pair(0, -1)};
68 | else
69 | {
70 | if (!sorted)
71 | std::shuffle(ret_matches.begin(), ret_matches.end(), g);
72 | list_matches[i0] = ret_matches;
73 | }
74 | max_count = max(max_count, nMatches);
75 | i0++;
76 | }
77 | // Reserve the memory
78 | if (max_num > 0)
79 | {
80 | max_count = max_num;
81 | }
82 | if (mode == 0)
83 | {
84 | neighbors_indices.resize(list_matches.size() * max_count, 0);
85 | dists.resize(list_matches.size() * max_count, -1);
86 | i0 = 0;
87 | int token = 0;
88 | for (auto& inds : list_matches)
89 | {
90 | token = inds[0].first;
91 | for (size_t j = 0; j < max_count; j++)
92 | {
93 | if (j < inds.size())
94 | {
95 | neighbors_indices[i0 * max_count + j] = inds[j].first;
96 | dists[i0 * max_count + j] = (float)inds[j].second;
97 | }
98 | else
99 | {
100 | neighbors_indices[i0 * max_count + j] = token;
101 | dists[i0 * max_count + j] = -1;
102 | }
103 | }
104 | i0++;
105 | }
106 | }
107 | else if (mode == 1)
108 | {
109 | size_t size = 0; // total number of edges
110 | for (auto& inds : list_matches)
111 | {
112 | if (inds.size() <= max_count)
113 | size += inds.size();
114 | else
115 | size += max_count;
116 | }
117 | neighbors_indices.resize(size * 2);
118 | dists.resize(size);
119 | int i0 = 0; // index of the query points
120 | int u = 0; // current index of the neighbors_indices
121 | for (auto& inds : list_matches)
122 | {
123 | for (size_t j = 0; j < max_count; j++)
124 | {
125 | if (j < inds.size())
126 | {
127 | neighbors_indices[u] = inds[j].first;
128 | neighbors_indices[u + 1] = i0;
129 | dists[u / 2] = (float)inds[j].second;
130 | u += 2;
131 | }
132 | }
133 | i0++;
134 | }
135 | }
136 | return max_count;
137 | }
138 | 
139 | template <typename scalar_t>
140 | int batch_nanoflann_neighbors(vector<scalar_t>& queries, vector<scalar_t>& supports,
141 | vector<long>& q_batches, vector<long>& s_batches,
142 | vector<long>& neighbors_indices, vector<float>& dists,
143 | float radius, int max_num, int mode, bool sorted)
144 | {
145 | // Initiate variables
146 | // ******************
147 | std::random_device rd;
148 | std::mt19937 g(rd());
149 | 
150 | // indices
151 | int i0 = 0;
152 | 
153 | // Square radius
154 | float r2 = radius * radius;
155 | 
156 | // Counting vector
157 | int max_count = 0;
158 | 
159 | // batch index
160 | int b = 0;
161 | 
162 | // Nanoflann related variables
163 | // ***************************
164 | 
165 | // Cloud variable
166 | PointCloud<scalar_t> current_cloud;
167 | PointCloud<scalar_t> query_pcd;
168 | query_pcd.set(queries);
169 | auto num_query_points = query_pcd.get_point_count();
170 | vector<vector<pair<size_t, scalar_t>>> all_inds_dists(num_query_points);
171 | 
172 | // Tree parameters
173 | nanoflann::KDTreeSingleIndexAdaptorParams tree_params(15 /* max leaf */);
174 | 
175 | // KDTree type definition
176 | typedef nanoflann::KDTreeSingleIndexAdaptor<
177 | nanoflann::L2_Simple_Adaptor<scalar_t, PointCloud<scalar_t>>, PointCloud<scalar_t>, 3>
178 | my_kd_tree_t;
179 | 
180 | // Build KDTree for the first batch element
181 | current_cloud.set_batch(supports, s_batches[b], s_batches[b + 1]);
182 | std::unique_ptr<my_kd_tree_t> index(new my_kd_tree_t(3, current_cloud, tree_params));
183 | index->buildIndex();
184 | 
185 | // Search neighbors indices
186 | // ***********************
187 | // Search params
188 | nanoflann::SearchParams search_params;
189 | search_params.sorted = sorted;
190 | for (size_t i = 0; i < num_query_points; i++)
191 | {
192 | // Check if we changed batch
193 | if (i0 == q_batches[b + 1] && b < (int)s_batches.size() - 1 &&
194 | b < (int)q_batches.size() - 1)
195 | {
196 | // Change the points
197 | b++;
198 | if (s_batches[b] < s_batches[b + 1])
199 | current_cloud.set_batch(supports, s_batches[b], s_batches[b + 1]);
200 | 
201 | index.reset(new my_kd_tree_t(3, current_cloud, tree_params));
202 | index->buildIndex();
203 | }
204 | 
205 | // Find neighbors
206 | std::vector<std::pair<size_t, scalar_t>> ret_matches;
207 | ret_matches.reserve(max_count);
208 | size_t nMatches =
209 | index->radiusSearch(query_pcd.get_point_ptr(i), r2, ret_matches, search_params);
210 | 
211 | // Shuffle if needed
212 | if (!sorted)
213 | std::shuffle(ret_matches.begin(), ret_matches.end(), g);
214 | all_inds_dists[i0] = ret_matches;
215 | 
216 | // Update max count
217 | if (nMatches > (size_t)max_count)
218 | max_count = nMatches;
219 | // Increment query idx
220 | i0++;
221 | }
222 | // how many neighbors do we keep
223 | if (max_num > 0)
224 | max_count = max_num;
225 | 
226 | const int token = -1;
227 | if (mode == 0)
228 | {
229 | neighbors_indices.resize(query_pcd.get_point_count() * max_count);
230 | dists.resize(query_pcd.get_point_count() * max_count);
231 | i0 = 0;
232 | b = 0;
233 | 
234 | for (auto& inds_dists : all_inds_dists)
235 | { // Check if we changed batch
236 | if (i0 == q_batches[b + 1] && b < (int)s_batches.size() - 1 &&
237 | b < (int)q_batches.size() - 1)
238 | b++;
239 | 
240 | for (int j = 0; j < max_count; j++)
241 | {
242 | if ((size_t)j < inds_dists.size())
243 | {
244 | neighbors_indices[i0 * max_count + j] = inds_dists[j].first + s_batches[b];
245 | dists[i0 * max_count + j] = (float)inds_dists[j].second;
246 | }
247 | else
248 | {
249 | neighbors_indices[i0 * max_count + j] = token;
250 | dists[i0 * max_count + j] = -1;
251 | }
252 | }
253 | i0++;
254 | }
255 | index.reset();
256 | }
257 | else if (mode == 1)
258 | {
259 | int size = 0; // total number of edges
260 | for (auto& inds_dists : all_inds_dists)
261 | {
262 | if ((int)inds_dists.size() <= max_count)
263 | size += inds_dists.size();
264 | else
265 | size += max_count;
266 | }
267 | neighbors_indices.resize(size * 2);
268 | dists.resize(size);
269 | i0 = 0;
270 | b = 0;
271 | int u = 0;
272 | for (auto& inds_dists : all_inds_dists)
273 | {
274 | if (i0 == q_batches[b + 1] && b < (int)s_batches.size() - 1 &&
275 | b < (int)q_batches.size() - 1)
276 | {
277 | b++;
278 | }
279 | for (int j = 0; j < max_count; j++)
280 | {
281 | if ((unsigned int)j < inds_dists.size())
282 | {
283 | neighbors_indices[u] = inds_dists[j].first + s_batches[b];
284 | neighbors_indices[u + 1] = i0;
285 | dists[u / 2] = (float)inds_dists[j].second;
286 | u += 2;
287 | }
288 | }
289 | i0++;
290 | }
291 | }
292 | return max_count;
293 | }
294 | 
295 | template <typename scalar_t>
296 | void nanoflann_knn_neighbors(vector<scalar_t>& queries, vector<scalar_t>& supports,
297 | vector<long>& neighbors_indices, vector<float>& dists, int k)
298 | {
299 | // Nanoflann related variables
300 | // ***************************
301 | // Cloud variable
302 | PointCloud<scalar_t> pcd;
303 | pcd.set(supports);
304 | // Cloud query
305 | PointCloud<scalar_t> pcd_query;
306 | pcd_query.set(queries);
307 | 
308 | // Tree parameters
309 | nanoflann::KDTreeSingleIndexAdaptorParams tree_params(15 /* max leaf */);
310 | 
311 | // KDTree type definition
312 | typedef nanoflann::KDTreeSingleIndexAdaptor<
313 | nanoflann::L2_Simple_Adaptor<scalar_t, PointCloud<scalar_t>>, PointCloud<scalar_t>, 3>
314 | my_kd_tree_t;
315 | 
316 | // Pointer to trees
317 | std::unique_ptr<my_kd_tree_t> index(new my_kd_tree_t(3, pcd, tree_params));
318 | index->buildIndex();
319 | 
320 | // Search neighbors indices
321 | // ***********************
322 | size_t current_pos = 0;
323 | auto num_query_points = pcd_query.get_point_count();
324 | for (size_t i = 0; i < num_query_points; i++)
325 | {
326 | // Find neighbors
327 | std::vector<size_t> ret_index(k);
328 | std::vector<scalar_t> out_dist_sqr(k);
329 | 
330 | const size_t nMatches =
331 | index->knnSearch(pcd_query.get_point_ptr(i), k, &ret_index[0], &out_dist_sqr[0]);
332 | for (size_t i = 0; i < nMatches; i++)
333 | {
334 | neighbors_indices[i + current_pos] = ret_index[i];
335 | dists[i + current_pos] = out_dist_sqr[i];
336 | }
337 | current_pos += k;
338 | }
339 | }
340 | 
--------------------------------------------------------------------------------
/cuda/src/gridding_gpu.cu:
--------------------------------------------------------------------------------
1 | #include <cmath>
2 | #include <cstdio>
3 | #include <cstdlib>
4 | #include <torch/extension.h>
5 | 
6 | #include "cuda_utils.h"
7 | 
8 | #define CUDA_NUM_THREADS 512
9 | 
10 | // Compute the number of threads needed on the GPU
11 | inline int get_n_threads(int n)
12 | {
13 | const int pow_2 = std::log(static_cast<double>(n)) / std::log(2.0);
14 | return max(min(1 << pow_2, CUDA_NUM_THREADS), 1);
15 | }
16 | 
17 | __device__ int compute_index(int offset_x, int offset_y, int offset_z, int len_y, int len_z)
18 | {
19 | return offset_x * len_y * len_z + offset_y * len_z + offset_z;
20 | }
21 | 
22 | template <typename scalar_t>
23 | __device__ scalar_t compute_weight(scalar_t x, scalar_t x0)
24 | {
25 | return 1 - abs(x - x0);
26 | }
27 | 
28 | template <typename scalar_t>
29 | __global__ void
30 | gridding_kernel(int n_grid_vertices, int n_pts, float min_x, float min_y, float min_z, int len_y,
31 | int len_z, const scalar_t* __restrict__ ptcloud,
32 | scalar_t* __restrict__ grid_weights, scalar_t* __restrict__ grid_pt_weights,
33 | int* __restrict__ grid_pt_indexes)
34 | {
35 | int batch_index = blockIdx.x;
36 | int index = threadIdx.x;
37 | int stride = blockDim.x;
38 | 
39 | ptcloud += batch_index * n_pts * 3;
40 | grid_weights += batch_index * n_grid_vertices;
41 | grid_pt_weights += batch_index * n_pts * 24;
42 | grid_pt_indexes += batch_index * n_pts * 8;
43 | 
44 | for (int j = index; j < n_pts; j += stride)
45 | {
46 | scalar_t pt_x = ptcloud[j * 3 + 0];
47 | scalar_t pt_y = ptcloud[j * 3 + 1];
48 | scalar_t pt_z = ptcloud[j * 3 + 2];
49 | 
50 | int lower_x = std::floor(pt_x);
51 | int upper_x = std::ceil(pt_x);
52 | if (lower_x == upper_x)
53 | {
54 | upper_x += 1;
55 | }
56 | int lower_y = std::floor(pt_y);
57 | int upper_y = std::ceil(pt_y);
58 | if (lower_y == upper_y)
59 | {
60 | upper_y += 1;
61 | }
62 | int lower_z = std::floor(pt_z);
63 | int upper_z = std::ceil(pt_z);
64 | if (lower_z == upper_z)
65 | {
66 | upper_z += 1;
67 | }
68 | 
69 | int
lx_offset = lower_x - min_x, ux_offset = upper_x - min_x; 70 | int ly_offset = lower_y - min_y, uy_offset = upper_y - min_y; 71 | int lz_offset = lower_z - min_z, uz_offset = upper_z - min_z; 72 | 73 | // Compute weights and corresponding positions, a loop for 8 points 74 | // LLL -> Lower X, Lower Y, Lower Z 75 | grid_pt_indexes[j * 8 + 0] = compute_index(lx_offset, ly_offset, lz_offset, len_y, len_z); 76 | grid_pt_weights[j * 24 + 0] = compute_weight(pt_x, lower_x); 77 | grid_pt_weights[j * 24 + 1] = compute_weight(pt_y, lower_y); 78 | grid_pt_weights[j * 24 + 2] = compute_weight(pt_z, lower_z); 79 | 80 | // LLU -> Lower X, Lower Y, Upper Z 81 | grid_pt_indexes[j * 8 + 1] = compute_index(lx_offset, ly_offset, uz_offset, len_y, len_z); 82 | grid_pt_weights[j * 24 + 3] = compute_weight(pt_x, lower_x); 83 | grid_pt_weights[j * 24 + 4] = compute_weight(pt_y, lower_y); 84 | grid_pt_weights[j * 24 + 5] = compute_weight(pt_z, upper_z); 85 | 86 | // LUL -> Lower X, Upper Y, Lower Z 87 | grid_pt_indexes[j * 8 + 2] = compute_index(lx_offset, uy_offset, lz_offset, len_y, len_z); 88 | grid_pt_weights[j * 24 + 6] = compute_weight(pt_x, lower_x); 89 | grid_pt_weights[j * 24 + 7] = compute_weight(pt_y, upper_y); 90 | grid_pt_weights[j * 24 + 8] = compute_weight(pt_z, lower_z); 91 | 92 | // LUU -> Lower X, Upper Y, Upper Z 93 | grid_pt_indexes[j * 8 + 3] = compute_index(lx_offset, uy_offset, uz_offset, len_y, len_z); 94 | grid_pt_weights[j * 24 + 9] = compute_weight(pt_x, lower_x); 95 | grid_pt_weights[j * 24 + 10] = compute_weight(pt_y, upper_y); 96 | grid_pt_weights[j * 24 + 11] = compute_weight(pt_z, upper_z); 97 | 98 | // ULL -> Upper X, Lower Y, Lower Z 99 | grid_pt_indexes[j * 8 + 4] = compute_index(ux_offset, ly_offset, lz_offset, len_y, len_z); 100 | grid_pt_weights[j * 24 + 12] = compute_weight(pt_x, upper_x); 101 | grid_pt_weights[j * 24 + 13] = compute_weight(pt_y, lower_y); 102 | grid_pt_weights[j * 24 + 14] = compute_weight(pt_z, lower_z); 103 | 104 | // ULU -> Upper X, Lower Y, Upper Z 105 | grid_pt_indexes[j * 8 + 5] = compute_index(ux_offset, ly_offset, uz_offset, len_y, len_z); 106 | grid_pt_weights[j * 24 + 15] = compute_weight(pt_x, upper_x); 107 | grid_pt_weights[j * 24 + 16] = compute_weight(pt_y, lower_y); 108 | grid_pt_weights[j * 24 + 17] = compute_weight(pt_z, upper_z); 109 | 110 | // UUL -> Upper X, Upper Y, Lower Z 111 | grid_pt_indexes[j * 8 + 6] = compute_index(ux_offset, uy_offset, lz_offset, len_y, len_z); 112 | grid_pt_weights[j * 24 + 18] = compute_weight(pt_x, upper_x); 113 | grid_pt_weights[j * 24 + 19] = compute_weight(pt_y, upper_y); 114 | grid_pt_weights[j * 24 + 20] = compute_weight(pt_z, lower_z); 115 | 116 | // UUU -> Upper X, Upper Y, Upper Z 117 | grid_pt_indexes[j * 8 + 7] = compute_index(ux_offset, uy_offset, uz_offset, len_y, len_z); 118 | grid_pt_weights[j * 24 + 21] = compute_weight(pt_x, upper_x); 119 | grid_pt_weights[j * 24 + 22] = compute_weight(pt_y, upper_y); 120 | grid_pt_weights[j * 24 + 23] = compute_weight(pt_z, upper_z); 121 | } 122 | 123 | __syncthreads(); 124 | 125 | int gvtx_idx = 0; 126 | for (int j = index; j < n_pts; j += stride) 127 | { 128 | // LLL -> Lower X, Lower Y, Lower Z 129 | gvtx_idx = grid_pt_indexes[j * 8 + 0]; 130 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 0] * 131 | grid_pt_weights[j * 24 + 1] * 132 | grid_pt_weights[j * 24 + 2]); 133 | // LLU -> Lower X, Lower Y, Upper Z 134 | gvtx_idx = grid_pt_indexes[j * 8 + 1]; 135 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 3] * 136 | 
grid_pt_weights[j * 24 + 4] *
137 | grid_pt_weights[j * 24 + 5]);
138 | // LUL -> Lower X, Upper Y, Lower Z
139 | gvtx_idx = grid_pt_indexes[j * 8 + 2];
140 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 6] *
141 | grid_pt_weights[j * 24 + 7] *
142 | grid_pt_weights[j * 24 + 8]);
143 | // LUU -> Lower X, Upper Y, Upper Z
144 | gvtx_idx = grid_pt_indexes[j * 8 + 3];
145 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 9] *
146 | grid_pt_weights[j * 24 + 10] *
147 | grid_pt_weights[j * 24 + 11]);
148 | // ULL -> Upper X, Lower Y, Lower Z
149 | gvtx_idx = grid_pt_indexes[j * 8 + 4];
150 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 12] *
151 | grid_pt_weights[j * 24 + 13] *
152 | grid_pt_weights[j * 24 + 14]);
153 | // ULU -> Upper X, Lower Y, Upper Z
154 | gvtx_idx = grid_pt_indexes[j * 8 + 5];
155 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 15] *
156 | grid_pt_weights[j * 24 + 16] *
157 | grid_pt_weights[j * 24 + 17]);
158 | // UUL -> Upper X, Upper Y, Lower Z
159 | gvtx_idx = grid_pt_indexes[j * 8 + 6];
160 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 18] *
161 | grid_pt_weights[j * 24 + 19] *
162 | grid_pt_weights[j * 24 + 20]);
163 | // UUU -> Upper X, Upper Y, Upper Z
164 | gvtx_idx = grid_pt_indexes[j * 8 + 7];
165 | atomicAdd(&(grid_weights[gvtx_idx]), grid_pt_weights[j * 24 + 21] *
166 | grid_pt_weights[j * 24 + 22] *
167 | grid_pt_weights[j * 24 + 23]);
168 | }
169 | }
170 | 
171 | std::vector<torch::Tensor> gridding_kernel_warpper(float min_x, float max_x, float min_y,
172 | float max_y, float min_z, float max_z,
173 | torch::Tensor ptcloud, cudaStream_t stream)
174 | {
175 | int batch_size = ptcloud.size(0);
176 | int n_pts = ptcloud.size(1);
177 | int len_x = max_x - min_x + 1;
178 | int len_y = max_y - min_y + 1;
179 | int len_z = max_z - min_z + 1;
180 | int n_grid_vertices = len_x * len_y * len_z;
181 | 
182 | torch::Tensor grid_weights =
183 | torch::zeros({batch_size, n_grid_vertices}, torch::CUDA(ptcloud.scalar_type()));
184 | torch::Tensor grid_pt_weights =
185 | torch::zeros({batch_size, n_pts, 8, 3}, torch::CUDA(ptcloud.scalar_type()));
186 | torch::Tensor grid_pt_indexes = torch::zeros({batch_size, n_pts, 8}, torch::CUDA(torch::kInt));
187 | 
188 | AT_DISPATCH_FLOATING_TYPES(
189 | ptcloud.scalar_type(), "gridding_cuda", ([&] {
190 | gridding_kernel<<<batch_size, get_n_threads(n_pts), 0, stream>>>(
191 | n_grid_vertices, n_pts, min_x, min_y, min_z, len_y, len_z,
192 | ptcloud.data_ptr<scalar_t>(), grid_weights.data_ptr<scalar_t>(),
193 | grid_pt_weights.data_ptr<scalar_t>(), grid_pt_indexes.data_ptr<int>());
194 | }));
195 | 
196 | cudaError_t err = cudaGetLastError();
197 | if (err != cudaSuccess)
198 | {
199 | printf("Error in gridding_kernel_warpper: %s\n", cudaGetErrorString(err));
200 | }
201 | return {grid_weights, grid_pt_weights, grid_pt_indexes};
202 | }
203 | 
204 | template <typename scalar_t>
205 | __global__ void
206 | gridding_grad_kernel(int n_grid_vertices, int n_pts, const scalar_t* __restrict__ grid_pt_weights,
207 | const int* __restrict__ grid_pt_indexes,
208 | const scalar_t* __restrict__ grad_grid, scalar_t* __restrict__ grad_ptcloud)
209 | {
210 | int batch_index = blockIdx.x;
211 | int index = threadIdx.x;
212 | int stride = blockDim.x;
213 | 
214 | grid_pt_weights += batch_index * n_pts * 24;
215 | grid_pt_indexes += batch_index * n_pts * 8;
216 | grad_grid += batch_index * n_grid_vertices;
217 | grad_ptcloud += batch_index * n_pts * 3;
218 | 
219 | int gvtx_idx = 0;
220 | scalar_t grad_vtx = 0, x_weights = 0, y_weights = 0, z_weights = 0;
221 | for (int j = index; j < n_pts; j
+= stride) 222 | { 223 | // Compute gradient for the corresponding positions, a loop for 8 points 224 | // LLL -> Lower X, Lower Y, Lower Z 225 | gvtx_idx = grid_pt_indexes[j * 8 + 0]; 226 | grad_vtx = grad_grid[gvtx_idx]; 227 | x_weights = grid_pt_weights[j * 24 + 0]; 228 | y_weights = grid_pt_weights[j * 24 + 1]; 229 | z_weights = grid_pt_weights[j * 24 + 2]; 230 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), -grad_vtx * y_weights * z_weights); 231 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), -grad_vtx * x_weights * z_weights); 232 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), -grad_vtx * x_weights * y_weights); 233 | 234 | // LLU -> Lower X, Lower Y, Upper Z 235 | gvtx_idx = grid_pt_indexes[j * 8 + 1]; 236 | grad_vtx = grad_grid[gvtx_idx]; 237 | x_weights = grid_pt_weights[j * 24 + 3]; 238 | y_weights = grid_pt_weights[j * 24 + 4]; 239 | z_weights = grid_pt_weights[j * 24 + 5]; 240 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), -grad_vtx * y_weights * z_weights); 241 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), -grad_vtx * x_weights * z_weights); 242 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), grad_vtx * x_weights * y_weights); 243 | 244 | // LUL -> Lower X, Upper Y, Lower Z 245 | gvtx_idx = grid_pt_indexes[j * 8 + 2]; 246 | grad_vtx = grad_grid[gvtx_idx]; 247 | x_weights = grid_pt_weights[j * 24 + 6]; 248 | y_weights = grid_pt_weights[j * 24 + 7]; 249 | z_weights = grid_pt_weights[j * 24 + 8]; 250 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), -grad_vtx * y_weights * z_weights); 251 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), grad_vtx * x_weights * z_weights); 252 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), -grad_vtx * x_weights * y_weights); 253 | 254 | // LUU -> Lower X, Upper Y, Upper Z 255 | gvtx_idx = grid_pt_indexes[j * 8 + 3]; 256 | grad_vtx = grad_grid[gvtx_idx]; 257 | x_weights = grid_pt_weights[j * 24 + 9]; 258 | y_weights = grid_pt_weights[j * 24 + 10]; 259 | z_weights = grid_pt_weights[j * 24 + 11]; 260 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), -grad_vtx * y_weights * z_weights); 261 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), grad_vtx * x_weights * z_weights); 262 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), grad_vtx * x_weights * y_weights); 263 | 264 | // ULL -> Upper X, Lower Y, Lower Z 265 | gvtx_idx = grid_pt_indexes[j * 8 + 4]; 266 | grad_vtx = grad_grid[gvtx_idx]; 267 | x_weights = grid_pt_weights[j * 24 + 12]; 268 | y_weights = grid_pt_weights[j * 24 + 13]; 269 | z_weights = grid_pt_weights[j * 24 + 14]; 270 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), grad_vtx * y_weights * z_weights); 271 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), -grad_vtx * x_weights * z_weights); 272 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), -grad_vtx * x_weights * y_weights); 273 | 274 | // ULU -> Upper X, Lower Y, Upper Z 275 | gvtx_idx = grid_pt_indexes[j * 8 + 5]; 276 | grad_vtx = grad_grid[gvtx_idx]; 277 | x_weights = grid_pt_weights[j * 24 + 15]; 278 | y_weights = grid_pt_weights[j * 24 + 16]; 279 | z_weights = grid_pt_weights[j * 24 + 17]; 280 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), grad_vtx * y_weights * z_weights); 281 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), -grad_vtx * x_weights * z_weights); 282 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), grad_vtx * x_weights * y_weights); 283 | 284 | // UUL -> Upper X, Upper Y, Lower Z 285 | gvtx_idx = grid_pt_indexes[j * 8 + 6]; 286 | grad_vtx = grad_grid[gvtx_idx]; 287 | x_weights = grid_pt_weights[j * 24 + 18]; 288 | y_weights = grid_pt_weights[j * 24 + 19]; 289 | z_weights = grid_pt_weights[j * 24 + 20]; 290 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), grad_vtx * y_weights * z_weights); 291 | atomicAdd(&(grad_ptcloud[j * 
3 + 1]), grad_vtx * x_weights * z_weights);
292 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), -grad_vtx * x_weights * y_weights);
293 | 
294 | // UUU -> Upper X, Upper Y, Upper Z
295 | gvtx_idx = grid_pt_indexes[j * 8 + 7];
296 | grad_vtx = grad_grid[gvtx_idx];
297 | x_weights = grid_pt_weights[j * 24 + 21];
298 | y_weights = grid_pt_weights[j * 24 + 22];
299 | z_weights = grid_pt_weights[j * 24 + 23];
300 | atomicAdd(&(grad_ptcloud[j * 3 + 0]), grad_vtx * y_weights * z_weights);
301 | atomicAdd(&(grad_ptcloud[j * 3 + 1]), grad_vtx * x_weights * z_weights);
302 | atomicAdd(&(grad_ptcloud[j * 3 + 2]), grad_vtx * x_weights * y_weights);
303 | }
304 | }
305 | 
306 | torch::Tensor gridding_grad_kernel_warpper(torch::Tensor grid_pt_weights,
307 | torch::Tensor grid_pt_indexes, torch::Tensor grad_grid,
308 | cudaStream_t stream)
309 | {
310 | int batch_size = grad_grid.size(0);
311 | int n_grid_vertices = grad_grid.size(1);
312 | int n_pts = grid_pt_indexes.size(1);
313 | 
314 | torch::Tensor grad_ptcloud =
315 | torch::zeros({batch_size, n_pts, 3}, torch::CUDA(grid_pt_weights.scalar_type()));
316 | 
317 | AT_DISPATCH_FLOATING_TYPES(
318 | grid_pt_weights.scalar_type(), "gridding_grad_cuda", ([&] {
319 | gridding_grad_kernel<<<batch_size, get_n_threads(n_pts), 0, stream>>>(
320 | n_grid_vertices, n_pts, grid_pt_weights.data_ptr<scalar_t>(),
321 | grid_pt_indexes.data_ptr<int>(), grad_grid.data_ptr<scalar_t>(),
322 | grad_ptcloud.data_ptr<scalar_t>());
323 | }));
324 | 
325 | cudaError_t err = cudaGetLastError();
326 | if (err != cudaSuccess)
327 | {
328 | printf("Error in gridding_grad_kernel_warpper: %s\n", cudaGetErrorString(err));
329 | }
330 | return grad_ptcloud;
331 | }
332 | 
--------------------------------------------------------------------------------
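Note: the snippet below is an illustrative host-side sketch, not part of the repository; the file name and every identifier in it are made up. It only demonstrates the trilinear weighting that gridding_kernel applies to the eight vertices of the cell enclosing a point: each axis contributes a weight of 1 - |x - x0| (the rule in compute_weight), so the eight per-corner products always sum to one, which is why the scattered atomicAdd accumulation preserves the total point mass.

// trilinear_weight_demo.cpp -- hypothetical standalone sketch, not part of the repo
#include <cmath>
#include <cstdio>

int main()
{
    const float pt[3] = {0.3f, 1.7f, 2.2f}; // an arbitrary point inside the grid
    float total = 0.f;
    for (int corner = 0; corner < 8; ++corner) // the 8 vertices of the enclosing cell
    {
        float w = 1.f;
        for (int d = 0; d < 3; ++d)
        {
            const float lower = std::floor(pt[d]);
            // bit d of `corner` selects the lower or upper vertex along this axis
            const float vtx = ((corner >> d) & 1) ? lower + 1.f : lower;
            w *= 1.f - std::fabs(pt[d] - vtx); // same rule as compute_weight()
        }
        total += w; // this per-corner product is what gridding_kernel atomically adds
    }
    printf("sum of the 8 corner weights = %f\n", total); // prints 1.000000
    return 0;
}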