├── Makefile
├── README.md
├── __init__.py
├── build_ffi.py
└── src
    ├── knn_cuda_kernel.cu
    ├── knn_cuda_kernel.h
    ├── knn_pytorch.c
    └── knn_pytorch.h

/Makefile:
--------------------------------------------------------------------------------

# Unix commands.
PYTHON := python
NVCC_COMPILE := nvcc -c -o
RM_RF := rm -rf

# Library compilation rules.
NVCC_FLAGS := -x cu -Xcompiler -fPIC -shared

# File structure.
BUILD_DIR := build
INCLUDE_DIRS := src
TORCH_FFI_BUILD := build_ffi.py
KNN_KERNEL := $(BUILD_DIR)/knn_cuda_kernel.so
TORCH_FFI_TARGET := $(BUILD_DIR)/knn_pytorch/_knn_pytorch.so

INCLUDE_FLAGS := $(foreach d, $(INCLUDE_DIRS), -I$d)

DEBUG := 0

# Debugging
ifeq ($(DEBUG), 1)
	COMMON_FLAGS += -DDEBUG -g -O0
	NVCC_FLAGS += -G
else
	COMMON_FLAGS += -DNDEBUG -O2
endif

all: $(TORCH_FFI_TARGET)

$(TORCH_FFI_TARGET): $(KNN_KERNEL) $(TORCH_FFI_BUILD)
	$(PYTHON) $(TORCH_FFI_BUILD)

$(BUILD_DIR)/%.so: src/%.cu
	@ mkdir -p $(BUILD_DIR)
	# Separate shared library that is loaded through the extern "C" FFI
	$(NVCC_COMPILE) $@ $? $(NVCC_FLAGS) $(INCLUDE_FLAGS)

clean:
	$(RM_RF) $(BUILD_DIR) $(KNN_KERNEL)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# PyTorch KNN CUDA

- 2019/11/02 This repository is no longer maintained, as PyTorch now supports `sort()` and `kthvalue()` on tensors natively.

```shell
git clone https://github.com/chrischoy/pytorch_knn_cuda
cd pytorch_knn_cuda
make
python __init__.py
```
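For new code, stock PyTorch covers the same use case. A minimal sketch (the `knn` helper below is hypothetical; it keeps this repo's column-major `D x N` / `D x M` layout and uses `torch.cdist` plus `topk` in place of `sort()`/`kthvalue()`):

```python
import torch

def knn(ref, query, k):
    # ref: (D, N) reference points, query: (D, M) query points, one point per column.
    d = torch.cdist(query.t(), ref.t())            # (M, N) pairwise Euclidean distances
    dists, inds = d.topk(k, dim=1, largest=False)  # k smallest distances per query point
    return inds.t(), dists.t()                     # both (k, M); note these indices are
                                                   # 0-based, the CUDA kernel's are 1-based
```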
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------

import unittest

import torch
from torch.autograd import Variable, Function
import knn_pytorch


class KNearestNeighbor(Function):
    """Compute the k nearest neighbors of each query point."""

    def __init__(self, k):
        self.k = k

    def forward(self, ref, query):
        ref = ref.float().cuda()
        query = query.float().cuda()

        inds = torch.empty(self.k, query.shape[1]).long().cuda()
        dists = torch.empty(self.k, query.shape[1]).float().cuda()

        knn_pytorch.knn(ref, query, inds, dists)

        return inds, dists


class TestKNearestNeighbor(unittest.TestCase):

    def test_forward(self):
        D, N, M = 128, 100, 1000
        ref = Variable(torch.rand(D, N))
        query = Variable(torch.rand(D, M))

        inds, dists = KNearestNeighbor(2)(ref, query)
        print(inds, dists)


if __name__ == '__main__':
    unittest.main()

--------------------------------------------------------------------------------
/build_ffi.py:
--------------------------------------------------------------------------------

# https://gist.github.com/tonyseek/7821993
import glob
import torch
from os import path as osp
from torch.utils.ffi import create_extension

abs_path = osp.dirname(osp.realpath(__file__))
extra_objects = [osp.join(abs_path, 'build/knn_cuda_kernel.so')]
extra_objects += glob.glob('/usr/local/cuda/lib64/*.a')

ffi = create_extension(
    'knn_pytorch',
    headers=['src/knn_pytorch.h'],
    sources=['src/knn_pytorch.c'],
    define_macros=[('WITH_CUDA', None)],
    relative_to=__file__,
    with_cuda=True,
    extra_objects=extra_objects,
    include_dirs=[osp.join(abs_path, 'include')]
)


if __name__ == '__main__':
    assert torch.cuda.is_available(), 'Please install CUDA for GPU support.'
    ffi.build()
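Once `make` has produced the extension, the binding is driven as in `__init__.py`: allocate the two `k x M` output tensors up front and let `knn_pytorch.knn` fill them in place. A minimal sketch (shapes mirror the test in `__init__.py`):

```python
import torch
import knn_pytorch  # the extension built by build_ffi.py

k, D, N, M = 2, 128, 100, 1000
ref = torch.rand(D, N).cuda()    # reference points, one per column
query = torch.rand(D, M).cuda()  # query points, one per column

inds = torch.empty(k, M).long().cuda()    # output: indices of the k nearest reference points
dists = torch.empty(k, M).float().cuda()  # output: distances to those reference points
knn_pytorch.knn(ref, query, inds, dists)  # fills both output tensors in place
```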
--------------------------------------------------------------------------------
/src/knn_cuda_kernel.cu:
--------------------------------------------------------------------------------

/** Modified version of knn-CUDA from https://github.com/vincentfpgarcia/kNN-CUDA
 * The modifications are
 *   removed texture memory usage
 *   removed split query KNN computation
 *   added feature extraction with bilinear interpolation
 *
 * Last modified by Christopher B. Choy 12/23/2016
 */

// Includes
#include <stdio.h>
#include "cuda.h"

#include "knn_cuda_kernel.h"

// Constants used by the program
#define BLOCK_DIM 16
#define DEBUG 0

/**
 * Computes the distances between the two matrices A (reference points) and
 * B (query points), containing respectively wA and wB points.
 *
 * @param A pointer to the matrix A
 * @param wA width of the matrix A = number of points in A
 * @param B pointer to the matrix B
 * @param wB width of the matrix B = number of points in B
 * @param dim dimension of points = height of matrices A and B
 * @param AB pointer to the matrix receiving the wA*wB computed distances
 */
__global__ void cuComputeDistanceGlobal( float* A, int wA,
    float* B, int wB, int dim, float* AB){

  // Declaration of the shared memory arrays used to store the sub-matrices of A and B
  __shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
  __shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];

  // Sub-matrix of A (begin, step, end) and sub-matrix of B (begin, step)
  __shared__ int begin_A;
  __shared__ int begin_B;
  __shared__ int step_A;
  __shared__ int step_B;
  __shared__ int end_A;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Other variables
  float tmp;
  float ssd = 0;

  // Loop parameters
  begin_A = BLOCK_DIM * blockIdx.y;
  begin_B = BLOCK_DIM * blockIdx.x;
  step_A = BLOCK_DIM * wA;
  step_B = BLOCK_DIM * wB;
  end_A = begin_A + (dim-1) * wA;

  // Conditions
  int cond0 = (begin_A + tx < wA); // used to write in shared memory
  int cond1 = (begin_B + tx < wB); // used to write in shared memory, to compute, and to write the output matrix
  int cond2 = (begin_A + ty < wA); // used to compute and to write the output matrix

  // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
  for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
    // Load the matrices from device memory to shared memory; each thread loads one element of each matrix
    if (a/wA + ty < dim){
      shared_A[ty][tx] = (cond0)? A[a + wA * ty + tx] : 0;
      shared_B[ty][tx] = (cond1)? B[b + wB * ty + tx] : 0;
    }
    else{
      shared_A[ty][tx] = 0;
      shared_B[ty][tx] = 0;
    }

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
    if (cond2 && cond1){
      for (int k = 0; k < BLOCK_DIM; ++k){
        tmp = shared_A[k][ty] - shared_B[k][tx];
        ssd += tmp*tmp;
      }
    }

    // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory; each thread writes one element
  if (cond2 && cond1)
    AB[(begin_A + ty) * wB + begin_B + tx] = ssd;
}

/**
 * Gathers the k smallest distances of each column of the distance matrix
 * at the top of that column.
 *
 * @param dist distance matrix
 * @param ind index matrix
 * @param width width of the distance matrix and of the index matrix
 * @param height height of the distance matrix and of the index matrix
 * @param k number of neighbors to consider
 */
__global__ void cuInsertionSort(float *dist, long *ind, int width, int height, int k){

  // Variables
  int l, i, j;
  float *p_dist;
  long *p_ind;
  float curr_dist, max_dist;
  long curr_row, max_row;
  unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;

  if (xIndex < width){
    // Pointer shift, initialization, and max value
    p_dist = dist + xIndex;
    p_ind = ind + xIndex;
    max_dist = p_dist[0];
    p_ind[0] = 1;

    // Part 1 : sort the k first elements
    for (l=1; l<k; l++){
      curr_row = l * width;
      curr_dist = p_dist[curr_row];
      if (curr_dist < max_dist){
        i = l-1;
        for (int a=0; a<l-1; a++){
          if (p_dist[a*width] > curr_dist){
            i=a;
            break;
          }
        }
        for (j=l; j>i; j--){
          p_dist[j*width] = p_dist[(j-1)*width];
          p_ind[j*width] = p_ind[(j-1)*width];
        }
        p_dist[i*width] = curr_dist;
        p_ind[i*width] = l+1;
      } else {
        p_ind[l*width] = l+1;
      }
      max_dist = p_dist[curr_row];
    }

    // Part 2 : insert the remaining elements into the k first lines
    max_row = (k-1)*width;
    for (l=k; l<height; l++){
      curr_dist = p_dist[l*width];
      if (curr_dist < max_dist){
        i = k-1;
        for (int a=0; a<k-1; a++){
          if (p_dist[a*width] > curr_dist){
            i=a;
            break;
          }
        }
        for (j=k-1; j>i; j--){
          p_dist[j*width] = p_dist[(j-1)*width];
          p_ind[j*width] = p_ind[(j-1)*width];
        }
        p_dist[i*width] = curr_dist;
        p_ind[i*width] = l+1;
        max_dist = p_dist[max_row];
      }
    }
  }
}


/**
 * Computes the square root of the k first lines (k * width first elements)
 * of the distance matrix.
 *
 * @param dist distance matrix
 * @param width width of the distance matrix
 * @param k number of neighbors to consider
 */
__global__ void cuParallelSqrt(float *dist, int width, int k){
  unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
  if (xIndex < width && yIndex < k)
    dist[yIndex*width + xIndex] = sqrt(dist[yIndex*width + xIndex]);
}


/**
 * Computes the k nearest neighbors of each query point on the device:
 * Kernel 1 fills the full ref_nb x query_nb distance matrix, Kernel 2
 * moves the k smallest distances (and their 1-based indexes) of each
 * column to the top, and Kernel 3 takes the square root of those k lines.
 *
 * @param ref_dev reference points on the device; one point per column
 * @param ref_nb number of reference points
 * @param query_dev query points on the device; one point per column
 * @param query_nb number of query points
 * @param dim dimension of the points
 * @param k number of neighbors to consider
 * @param dist_dev device buffer receiving the k x query_nb distances
 * @param ind_dev device buffer receiving the k x query_nb indexes
 * @param stream CUDA stream on which the kernels are launched
 */
void knn_device(float* ref_dev, int ref_nb, float* query_dev, int query_nb,
    int dim, int k, float* dist_dev, long* ind_dev, cudaStream_t stream){

  // Grids and threads
  dim3 g_16x16(query_nb/16, ref_nb/16, 1);
  dim3 t_16x16(16, 16, 1);
  if (query_nb%16 != 0) g_16x16.x += 1;
  if (ref_nb%16 != 0) g_16x16.y += 1;

  dim3 g_256x1(query_nb/256, 1, 1);
  dim3 t_256x1(256, 1, 1);
  if (query_nb%256 != 0) g_256x1.x += 1;

  dim3 g_k_16x16(query_nb/16, k/16, 1);
  dim3 t_k_16x16(16, 16, 1);
  if (query_nb%16 != 0) g_k_16x16.x += 1;
  if (k%16 != 0) g_k_16x16.y += 1;

  // Kernel 1: Compute all the distances
  cuComputeDistanceGlobal<<<g_16x16, t_16x16, 0, stream>>>(ref_dev, ref_nb,
      query_dev, query_nb, dim, dist_dev);

  // Kernel 2: Sort each column
  cuInsertionSort<<<g_256x1, t_256x1, 0, stream>>>(dist_dev, ind_dev,
      query_nb, ref_nb, k);

  // Kernel 3: Compute square root of k first elements
  cuParallelSqrt<<<g_k_16x16, t_k_16x16, 0, stream>>>(dist_dev, query_nb, k);

#if DEBUG
  unsigned int size_of_float = sizeof(float);
  unsigned long size_of_long = sizeof(long);

  float* dist_host = new float[query_nb * k];
  long* idx_host = new long[query_nb * k];

  // Memory copy of output from device to host
  cudaMemcpy(&dist_host[0], dist_dev,
      query_nb * k * size_of_float, cudaMemcpyDeviceToHost);

  cudaMemcpy(&idx_host[0], ind_dev,
      query_nb * k * size_of_long, cudaMemcpyDeviceToHost);

  int i = 0;
  for(i = 0; i < 100; i++){
    printf("IDX[%d]: %d\n", i, (int)idx_host[i]);
  }
#endif
}

--------------------------------------------------------------------------------
/src/knn_cuda_kernel.h:
--------------------------------------------------------------------------------

#ifndef _MATHUTIL_CUDA_KERNEL
#define _MATHUTIL_CUDA_KERNEL

#define IDX2D(i, j, dj) (dj * i + j)
#define IDX3D(i, j, k, dj, dk) (IDX2D(IDX2D(i, j, dj), k, dk))

#define BLOCK 512
#define MAX_STREAMS 512

#ifdef __cplusplus
extern "C" {
#endif

void knn_device(float* ref_dev, int ref_width,
    float* query_dev, int query_width,
    int height, int k, float* dist_dev, long* ind_dev, cudaStream_t stream);

#ifdef __cplusplus
}
#endif

#endif
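For reference, the three kernels amount to the following dense computation, sketched here in PyTorch purely for illustration (the helper name is made up; `ref` and `query` are `dim x n` matrices as in `knn_device`):

```python
import torch

def knn_device_reference(ref, query, k):
    # Kernel 1: squared Euclidean distance between every reference/query pair,
    # one row per reference point and one column per query point.
    sq_dist = ((ref.t().unsqueeze(1) - query.t().unsqueeze(0)) ** 2).sum(-1)
    # Kernel 2: keep the k smallest entries of each column at the top;
    # cuInsertionSort stores 1-based indexes, hence the +1 below.
    dists, inds = sq_dist.topk(k, dim=0, largest=False)
    # Kernel 3: square roots are taken only for the k surviving lines.
    return dists.sqrt(), inds + 1
```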
--------------------------------------------------------------------------------
/src/knn_pytorch.c:
--------------------------------------------------------------------------------

#include <THC/THC.h>
#include "knn_cuda_kernel.h"

extern THCState *state;

int knn(THCudaTensor *ref_tensor, THCudaTensor *query_tensor,
    THCudaLongTensor *idx_tensor, THCudaTensor *dist_tensor) {

  THCAssertSameGPU(THCudaTensor_checkGPU(state, 4, idx_tensor, dist_tensor, ref_tensor, query_tensor));
  long ref_nb, query_nb, dim, k;
  THArgCheck(THCudaTensor_nDimension(state, ref_tensor) == 2, 0, "ref_tensor: 2D Tensor expected");
  THArgCheck(THCudaTensor_nDimension(state, query_tensor) == 2, 1, "query_tensor: 2D Tensor expected");
  THArgCheck(THCudaLongTensor_nDimension(state, idx_tensor) == 2, 3, "idx_tensor: 2D Tensor expected");
  THArgCheck(THCudaTensor_nDimension(state, dist_tensor) == 2, 4, "dist_tensor: 2D Tensor expected");
  THArgCheck(THCudaTensor_size(state, ref_tensor, 0) == THCudaTensor_size(state, query_tensor, 0), 0, "input sizes must match");
  THArgCheck(THCudaLongTensor_size(state, idx_tensor, 0) == THCudaTensor_size(state, dist_tensor, 0), 0, "output sizes must match");

  ref_tensor = THCudaTensor_newContiguous(state, ref_tensor);
  query_tensor = THCudaTensor_newContiguous(state, query_tensor);

  dim = THCudaTensor_size(state, ref_tensor, 0);
  k = THCudaLongTensor_size(state, idx_tensor, 0);
  ref_nb = THCudaTensor_size(state, ref_tensor, 1);
  query_nb = THCudaTensor_size(state, query_tensor, 1);

  float *ref_dev = THCudaTensor_data(state, ref_tensor);
  float *query_dev = THCudaTensor_data(state, query_tensor);
  long *idx_dev = THCudaLongTensor_data(state, idx_tensor);
  float *dist_dev = THCudaTensor_data(state, dist_tensor);

  knn_device(ref_dev, ref_nb, query_dev, query_nb, dim, k, dist_dev, idx_dev,
      THCState_getCurrentStream(state));

  // check for errors
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("error in knn: %s\n", cudaGetErrorString(err));
    THError("aborting");
  }

  return 1;
}

--------------------------------------------------------------------------------
/src/knn_pytorch.h:
--------------------------------------------------------------------------------

int knn(THCudaTensor *ref_tensor, THCudaTensor *query_tensor,
    THCudaLongTensor *idx_tensor, THCudaTensor *dist_tensor);

--------------------------------------------------------------------------------
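To summarize the shape contract enforced by `knn()` above, here is a Python-level restatement of its `THArgCheck` calls (the helper is illustrative, not part of the repo):

```python
def check_knn_args(ref, query, idx, dist):
    # All four tensors must be 2-D (and, per THCudaTensor_checkGPU, on the same GPU).
    assert ref.dim() == query.dim() == idx.dim() == dist.dim() == 2
    assert ref.shape[0] == query.shape[0]  # inputs share the point dimension
    assert idx.shape[0] == dist.shape[0]   # outputs share k; k is read from idx
```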