├── Makefile
├── README.md
├── __init__.py
├── build_ffi.py
└── src
    ├── knn_cuda_kernel.cu
    ├── knn_cuda_kernel.h
    ├── knn_pytorch.c
    └── knn_pytorch.h

/Makefile:
--------------------------------------------------------------------------------

# Unix commands.
PYTHON := python
NVCC_COMPILE := nvcc -c -o
RM_RF := rm -rf

# Library compilation rules.
NVCC_FLAGS := -x cu -Xcompiler -fPIC -shared

# File structure.
BUILD_DIR := build
INCLUDE_DIRS := src
TORCH_FFI_BUILD := build_ffi.py
KNN_KERNEL := $(BUILD_DIR)/knn_cuda_kernel.so
TORCH_FFI_TARGET := $(BUILD_DIR)/knn_pytorch/_knn_pytorch.so

INCLUDE_FLAGS := $(foreach d, $(INCLUDE_DIRS), -I$d)

DEBUG := 0

# Debugging
ifeq ($(DEBUG), 1)
	COMMON_FLAGS += -DDEBUG -g -O0
	NVCC_FLAGS += -G
else
	COMMON_FLAGS += -DNDEBUG -O2
endif

all: $(TORCH_FFI_TARGET)

$(TORCH_FFI_TARGET): $(KNN_KERNEL) $(TORCH_FFI_BUILD)
	$(PYTHON) $(TORCH_FFI_BUILD)

$(BUILD_DIR)/%.so: src/%.cu
	@ mkdir -p $(BUILD_DIR)
	# Separate shared library that is loaded through the extern "C" FFI
	$(NVCC_COMPILE) $@ $? $(NVCC_FLAGS) $(INCLUDE_FLAGS)

clean:
	$(RM_RF) $(BUILD_DIR) $(KNN_KERNEL)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# PyTorch KNN CUDA

- 2019/11/02 This repository is no longer maintained, as PyTorch now supports `sort()` and `kthvalue()` on tensors natively.

```shell
git clone https://github.com/chrischoy/pytorch_knn_cuda
cd pytorch_knn_cuda
make
python __init__.py
```
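For new code, stock PyTorch covers the same use case. A minimal sketch (the `knn` helper below is hypothetical; it keeps this repo's column-major `D x N` / `D x M` layout and uses `torch.cdist` plus `topk` in place of `sort()`/`kthvalue()`):

```python
import torch

def knn(ref, query, k):
    # ref: (D, N) reference points, query: (D, M) query points, one point per column.
    d = torch.cdist(query.t(), ref.t())            # (M, N) pairwise Euclidean distances
    dists, inds = d.topk(k, dim=1, largest=False)  # k smallest distances per query point
    return inds.t(), dists.t()                     # both (k, M); note these indices are
                                                   # 0-based, the CUDA kernel's are 1-based
```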
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------

import unittest

import torch
from torch.autograd import Variable, Function
import knn_pytorch


class KNearestNeighbor(Function):
    """Compute the k nearest neighbors of each query point."""

    def __init__(self, k):
        self.k = k

    def forward(self, ref, query):
        ref = ref.float().cuda()
        query = query.float().cuda()

        inds = torch.empty(self.k, query.shape[1]).long().cuda()
        dists = torch.empty(self.k, query.shape[1]).float().cuda()

        knn_pytorch.knn(ref, query, inds, dists)

        return inds, dists


class TestKNearestNeighbor(unittest.TestCase):

    def test_forward(self):
        D, N, M = 128, 100, 1000
        ref = Variable(torch.rand(D, N))
        query = Variable(torch.rand(D, M))

        inds, dists = KNearestNeighbor(2)(ref, query)
        print(inds, dists)


if __name__ == '__main__':
    unittest.main()

--------------------------------------------------------------------------------
/build_ffi.py:
--------------------------------------------------------------------------------

# https://gist.github.com/tonyseek/7821993
import glob
import torch
from os import path as osp
from torch.utils.ffi import create_extension

abs_path = osp.dirname(osp.realpath(__file__))
extra_objects = [osp.join(abs_path, 'build/knn_cuda_kernel.so')]
extra_objects += glob.glob('/usr/local/cuda/lib64/*.a')

ffi = create_extension(
    'knn_pytorch',
    headers=['src/knn_pytorch.h'],
    sources=['src/knn_pytorch.c'],
    define_macros=[('WITH_CUDA', None)],
    relative_to=__file__,
    with_cuda=True,
    extra_objects=extra_objects,
    include_dirs=[osp.join(abs_path, 'include')]
)


if __name__ == '__main__':
    assert torch.cuda.is_available(), 'Please install CUDA for GPU support.'
    ffi.build()
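Once `make` has produced the extension, the binding is driven as in `__init__.py`: allocate the two `k x M` output tensors up front and let `knn_pytorch.knn` fill them in place. A minimal sketch (shapes mirror the test in `__init__.py`):

```python
import torch
import knn_pytorch  # the extension built by build_ffi.py

k, D, N, M = 2, 128, 100, 1000
ref = torch.rand(D, N).cuda()    # reference points, one per column
query = torch.rand(D, M).cuda()  # query points, one per column

inds = torch.empty(k, M).long().cuda()    # output: indices of the k nearest reference points
dists = torch.empty(k, M).float().cuda()  # output: distances to those reference points
knn_pytorch.knn(ref, query, inds, dists)  # fills both output tensors in place
```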
--------------------------------------------------------------------------------
/src/knn_cuda_kernel.cu:
--------------------------------------------------------------------------------

/** Modified version of knn-CUDA from https://github.com/vincentfpgarcia/kNN-CUDA
 * The modifications are
 *   removed texture memory usage
 *   removed split query KNN computation
 *   added feature extraction with bilinear interpolation
 *
 * Last modified by Christopher B. Choy 12/23/2016
 */

// Includes
#include <stdio.h>
#include "cuda.h"

#include "knn_cuda_kernel.h"

// Constants used by the program
#define BLOCK_DIM 16
#define DEBUG 0

/**
 * Computes the distances between the two matrices A (reference points) and
 * B (query points), containing respectively wA and wB points.
 *
 * @param A pointer to the matrix A
 * @param wA width of the matrix A = number of points in A
 * @param B pointer to the matrix B
 * @param wB width of the matrix B = number of points in B
 * @param dim dimension of points = height of matrices A and B
 * @param AB pointer to the matrix receiving the wA*wB computed distances
 */
__global__ void cuComputeDistanceGlobal( float* A, int wA,
    float* B, int wB, int dim, float* AB){

  // Declaration of the shared memory arrays used to store the sub-matrices of A and B
  __shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
  __shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];

  // Sub-matrix of A (begin, step, end) and sub-matrix of B (begin, step)
  __shared__ int begin_A;
  __shared__ int begin_B;
  __shared__ int step_A;
  __shared__ int step_B;
  __shared__ int end_A;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Other variables
  float tmp;
  float ssd = 0;

  // Loop parameters
  begin_A = BLOCK_DIM * blockIdx.y;
  begin_B = BLOCK_DIM * blockIdx.x;
  step_A = BLOCK_DIM * wA;
  step_B = BLOCK_DIM * wB;
  end_A = begin_A + (dim-1) * wA;

  // Conditions
  int cond0 = (begin_A + tx < wA); // used to write in shared memory
  int cond1 = (begin_B + tx < wB); // used to write in shared memory, to compute, and to write the output matrix
  int cond2 = (begin_A + ty < wA); // used to compute and to write the output matrix

  // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
  for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
    // Load the matrices from device memory to shared memory; each thread loads one element of each matrix
    if (a/wA + ty < dim){
      shared_A[ty][tx] = (cond0)? A[a + wA * ty + tx] : 0;
      shared_B[ty][tx] = (cond1)? B[b + wB * ty + tx] : 0;
    }
    else{
      shared_A[ty][tx] = 0;
      shared_B[ty][tx] = 0;
    }

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
    if (cond2 && cond1){
      for (int k = 0; k < BLOCK_DIM; ++k){
        tmp = shared_A[k][ty] - shared_B[k][tx];
        ssd += tmp*tmp;
      }
    }

    // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory; each thread writes one element
  if (cond2 && cond1)
    AB[(begin_A + ty) * wB + begin_B + tx] = ssd;
}

/**
 * Gathers the k smallest distances of each column of the distance matrix
 * at the top of that column.
 *
 * @param dist distance matrix
 * @param ind index matrix
 * @param width width of the distance matrix and of the index matrix
 * @param height height of the distance matrix and of the index matrix
 * @param k number of neighbors to consider
 */
__global__ void cuInsertionSort(float *dist, long *ind, int width, int height, int k){

  // Variables
  int l, i, j;
  float *p_dist;
  long *p_ind;
  float curr_dist, max_dist;
  long curr_row, max_row;
  unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;

  if (xIndex < width){
    // Pointer shift, initialization, and max value
    p_dist = dist + xIndex;
    p_ind = ind + xIndex;
    max_dist = p_dist[0];
    p_ind[0] = 1;

    // Part 1 : sort the k first elements
    for (l=1; l<k; l++){
      curr_row = l * width;
      curr_dist = p_dist[curr_row];
      if (curr_dist < max_dist){
        i = l-1;
        for (int a=0; a<l-1; a++){
          if (p_dist[a*width] > curr_dist){
            i=a;
            break;
          }
        }
        for (j=l; j>i; j--){
          p_dist[j*width] = p_dist[(j-1)*width];
          p_ind[j*width] = p_ind[(j-1)*width];
        }
        p_dist[i*width] = curr_dist;
        p_ind[i*width] = l+1;
      } else {
        p_ind[l*width] = l+1;
      }
      max_dist = p_dist[curr_row];
    }

    // Part 2 : insert the remaining elements into the k first lines
    max_row = (k-1)*width;
    for (l=k; l<height; l++){
      curr_dist = p_dist[l*width];
      if (curr_dist < max_dist){
        i = k-1;
        for (int a=0; a<k-1; a++){
          if (p_dist[a*width] > curr_dist){
            i=a;
            break;
          }
        }
        for (j=k-1; j>i; j--){
          p_dist[j*width] = p_dist[(j-1)*width];
          p_ind[j*width] = p_ind[(j-1)*width];
        }
        p_dist[i*width] = curr_dist;
        p_ind[i*width] = l+1;
        max_dist = p_dist[max_row];
      }
    }
  }
}


/**
 * Computes the square root of the k first lines (k * width first elements)
 * of the distance matrix.
 *
 * @param dist distance matrix
 * @param width width of the distance matrix
 * @param k number of neighbors to consider
 */
__global__ void cuParallelSqrt(float *dist, int width, int k){
  unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
  if (xIndex < width && yIndex < k)
    dist[yIndex*width + xIndex] = sqrt(dist[yIndex*width + xIndex]);
}


/**
 * Computes the k nearest neighbors of each query point on the device:
 * Kernel 1 fills the full ref_nb x query_nb distance matrix, Kernel 2
 * moves the k smallest distances (and their 1-based indexes) of each
 * column to the top, and Kernel 3 takes the square root of those k lines.
 *
 * @param ref_dev reference points on the device; one point per column
 * @param ref_nb number of reference points
 * @param query_dev query points on the device; one point per column
 * @param query_nb number of query points
 * @param dim dimension of the points
 * @param k number of neighbors to consider
 * @param dist_dev device buffer receiving the k x query_nb distances
 * @param ind_dev device buffer receiving the k x query_nb indexes
 * @param stream CUDA stream on which the kernels are launched
 */
void knn_device(float* ref_dev, int ref_nb, float* query_dev, int query_nb,
    int dim, int k, float* dist_dev, long* ind_dev, cudaStream_t stream){

  // Grids and threads
  dim3 g_16x16(query_nb/16, ref_nb/16, 1);
  dim3 t_16x16(16, 16, 1);
  if (query_nb%16 != 0) g_16x16.x += 1;
  if (ref_nb%16 != 0) g_16x16.y += 1;

  dim3 g_256x1(query_nb/256, 1, 1);
  dim3 t_256x1(256, 1, 1);
  if (query_nb%256 != 0) g_256x1.x += 1;

  dim3 g_k_16x16(query_nb/16, k/16, 1);
  dim3 t_k_16x16(16, 16, 1);
  if (query_nb%16 != 0) g_k_16x16.x += 1;
  if (k%16 != 0) g_k_16x16.y += 1;

  // Kernel 1: Compute all the distances
  cuComputeDistanceGlobal<<<g_16x16, t_16x16, 0, stream>>>(ref_dev, ref_nb,
      query_dev, query_nb, dim, dist_dev);

  // Kernel 2: Sort each column
  cuInsertionSort<<<g_256x1, t_256x1, 0, stream>>>(dist_dev, ind_dev,
      query_nb, ref_nb, k);

  // Kernel 3: Compute square root of k first elements
  cuParallelSqrt<<<g_k_16x16, t_k_16x16, 0, stream>>>(dist_dev, query_nb, k);

#if DEBUG
  unsigned int size_of_float = sizeof(float);
  unsigned long size_of_long = sizeof(long);

  float* dist_host = new float[query_nb * k];
  long* idx_host = new long[query_nb * k];

  // Memory copy of output from device to host
  cudaMemcpy(&dist_host[0], dist_dev,
      query_nb * k * size_of_float, cudaMemcpyDeviceToHost);

  cudaMemcpy(&idx_host[0], ind_dev,
      query_nb * k * size_of_long, cudaMemcpyDeviceToHost);

  int i = 0;
  for(i = 0; i < 100; i++){
    printf("IDX[%d]: %d\n", i, (int)idx_host[i]);
  }
#endif
}

--------------------------------------------------------------------------------
/src/knn_cuda_kernel.h:
--------------------------------------------------------------------------------

#ifndef _MATHUTIL_CUDA_KERNEL
#define _MATHUTIL_CUDA_KERNEL

#define IDX2D(i, j, dj) (dj * i + j)
#define IDX3D(i, j, k, dj, dk) (IDX2D(IDX2D(i, j, dj), k, dk))

#define BLOCK 512
#define MAX_STREAMS 512

#ifdef __cplusplus
extern "C" {
#endif

void knn_device(float* ref_dev, int ref_width,
    float* query_dev, int query_width,
    int height, int k, float* dist_dev, long* ind_dev, cudaStream_t stream);

#ifdef __cplusplus
}
#endif

#endif
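For reference, the three kernels amount to the following dense computation, sketched here in PyTorch purely for illustration (the helper name is made up; `ref` and `query` are `dim x n` matrices as in `knn_device`):

```python
import torch

def knn_device_reference(ref, query, k):
    # Kernel 1: squared Euclidean distance between every reference/query pair,
    # one row per reference point and one column per query point.
    sq_dist = ((ref.t().unsqueeze(1) - query.t().unsqueeze(0)) ** 2).sum(-1)
    # Kernel 2: keep the k smallest entries of each column at the top;
    # cuInsertionSort stores 1-based indexes, hence the +1 below.
    dists, inds = sq_dist.topk(k, dim=0, largest=False)
    # Kernel 3: square roots are taken only for the k surviving lines.
    return dists.sqrt(), inds + 1
```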
--------------------------------------------------------------------------------
/src/knn_pytorch.c:
--------------------------------------------------------------------------------

#include <THC/THC.h>
#include "knn_cuda_kernel.h"

extern THCState *state;

int knn(THCudaTensor *ref_tensor, THCudaTensor *query_tensor,
    THCudaLongTensor *idx_tensor, THCudaTensor *dist_tensor) {

  THCAssertSameGPU(THCudaTensor_checkGPU(state, 4, idx_tensor, dist_tensor, ref_tensor, query_tensor));
  long ref_nb, query_nb, dim, k;
  THArgCheck(THCudaTensor_nDimension(state, ref_tensor) == 2, 0, "ref_tensor: 2D Tensor expected");
  THArgCheck(THCudaTensor_nDimension(state, query_tensor) == 2, 1, "query_tensor: 2D Tensor expected");
  THArgCheck(THCudaLongTensor_nDimension(state, idx_tensor) == 2, 3, "idx_tensor: 2D Tensor expected");
  THArgCheck(THCudaTensor_nDimension(state, dist_tensor) == 2, 4, "dist_tensor: 2D Tensor expected");
  THArgCheck(THCudaTensor_size(state, ref_tensor, 0) == THCudaTensor_size(state, query_tensor, 0), 0, "input sizes must match");
  THArgCheck(THCudaLongTensor_size(state, idx_tensor, 0) == THCudaTensor_size(state, dist_tensor, 0), 0, "output sizes must match");

  ref_tensor = THCudaTensor_newContiguous(state, ref_tensor);
  query_tensor = THCudaTensor_newContiguous(state, query_tensor);

  dim = THCudaTensor_size(state, ref_tensor, 0);
  k = THCudaLongTensor_size(state, idx_tensor, 0);
  ref_nb = THCudaTensor_size(state, ref_tensor, 1);
  query_nb = THCudaTensor_size(state, query_tensor, 1);

  float *ref_dev = THCudaTensor_data(state, ref_tensor);
  float *query_dev = THCudaTensor_data(state, query_tensor);
  long *idx_dev = THCudaLongTensor_data(state, idx_tensor);
  float *dist_dev = THCudaTensor_data(state, dist_tensor);

  knn_device(ref_dev, ref_nb, query_dev, query_nb, dim, k, dist_dev, idx_dev,
      THCState_getCurrentStream(state));

  // check for errors
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("error in knn: %s\n", cudaGetErrorString(err));
    THError("aborting");
  }

  return 1;
}

--------------------------------------------------------------------------------
/src/knn_pytorch.h:
--------------------------------------------------------------------------------

int knn(THCudaTensor *ref_tensor, THCudaTensor *query_tensor,
    THCudaLongTensor *idx_tensor, THCudaTensor *dist_tensor);

--------------------------------------------------------------------------------
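To summarize the shape contract enforced by `knn()` above, here is a Python-level restatement of its `THArgCheck` calls (the helper is illustrative, not part of the repo):

```python
def check_knn_args(ref, query, idx, dist):
    # All four tensors must be 2-D (and, per THCudaTensor_checkGPU, on the same GPU).
    assert ref.dim() == query.dim() == idx.dim() == dist.dim() == 2
    assert ref.shape[0] == query.shape[0]  # inputs share the point dimension
    assert idx.shape[0] == dist.shape[0]   # outputs share k; k is read from idx
```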