├── Makefile
├── Makefile.config
├── README.md
├── __init__.py
├── example.py
└── src
    ├── knn.cpp
    ├── knn.h
    └── knn_cuda.cu

/Makefile:
--------------------------------------------------------------------------------
CONFIG_FILE := Makefile.config

# Explicitly check for the config file, otherwise make -k will proceed anyway.
ifeq ($(wildcard $(CONFIG_FILE)),)
$(error $(CONFIG_FILE) not found.)
endif
include $(CONFIG_FILE)

LIBRARIES := cuda cudart cudadevrt boost_python python2.7

# CUDA compilation rules
NVCC := $(CUDA_DIR)/bin/nvcc
NVCCFLAGS := -m64
CUDA_TARGET = knn_cuda

# Library compilation rules
CXXFLAGS := -fPIC
LDFLAGS := -shared
TARGET := knn
COMMON_FLAGS :=

##############################
# Derive include and lib directories
##############################
CUDA_INCLUDE_DIR := $(CUDA_DIR)/include

CUDA_LIB_DIR :=
# add /lib64 only if it exists
ifneq ("$(wildcard $(CUDA_DIR)/lib64)","")
	CUDA_LIB_DIR += $(CUDA_DIR)/lib64
endif
CUDA_LIB_DIR += $(CUDA_DIR)/lib

INCLUDE_DIRS += $(PYTHON_INCLUDE) ./src
ifneq ($(CPU_ONLY), 1)
	INCLUDE_DIRS += $(CUDA_INCLUDE_DIR)
	LIBRARY_DIRS += $(CUDA_LIB_DIR)
endif


# Linux
ifeq ($(LINUX), 1)
	CXX ?= /usr/bin/g++
	GCCVERSION := $(shell $(CXX) -dumpversion | cut -f1,2 -d.)
	# older versions of gcc are too dumb to build boost with -Wuninitialized
	ifeq ($(shell echo | awk '{exit $(GCCVERSION) < 4.6;}'), 1)
		WARNINGS += -Wno-uninitialized
	endif
	# boost::thread is reasonably called boost_thread (compare OS X)
	# We will also explicitly add stdc++ to the link target.
	LIBRARIES += stdc++
	VERSIONFLAGS += -Wl,-soname,$(DYNAMIC_VERSIONED_NAME_SHORT) -Wl,-rpath,$(ORIGIN)/../lib
endif

# OS X:
# clang++ instead of g++
# libstdc++ for NVCC compatibility on OS X >= 10.9 with CUDA < 7.0
ifeq ($(OSX), 1)
	CXX := /usr/bin/clang++
	ifneq ($(CPU_ONLY), 1)
		CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release [0-9.]*' | tr -d '[a-z ]')
		ifeq ($(shell echo | awk '{exit $(CUDA_VERSION) < 7.0;}'), 1)
			CXXFLAGS += -stdlib=libstdc++
			LINKFLAGS += -stdlib=libstdc++
		endif
		# clang throws this warning for cuda headers
		WARNINGS += -Wno-unneeded-internal-declaration
		# 10.11 strips DYLD_* env vars so link CUDA explicitly (rpath is available on 10.5+)
		OSX_10_OR_LATER   := $(shell [ $(OSX_MAJOR_VERSION) -ge 10 ] && echo true)
		OSX_10_5_OR_LATER := $(shell [ $(OSX_MINOR_VERSION) -ge 5 ] && echo true)
		ifeq ($(OSX_10_OR_LATER),true)
			ifeq ($(OSX_10_5_OR_LATER),true)
				LDFLAGS += -Wl,-rpath,$(CUDA_LIB_DIR)
			endif
		endif
	endif
	# gtest needs to use its own tuple to not conflict with clang
	COMMON_FLAGS += -DGTEST_USE_OWN_TR1_TUPLE=1
	# we need to explicitly ask for the rpath to be obeyed
	ORIGIN := @loader_path
	VERSIONFLAGS += -Wl,-install_name,@rpath/$(DYNAMIC_VERSIONED_NAME_SHORT) -Wl,-rpath,$(ORIGIN)/../../build/lib
else
	ORIGIN := \$$ORIGIN
endif

# Debugging
ifeq ($(DEBUG), 1)
	COMMON_FLAGS += -DDEBUG -g -O0
	NVCCFLAGS += -G
else
	COMMON_FLAGS += -DNDEBUG -O2
endif

# Complete build flags.
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
CXXFLAGS += $(COMMON_FLAGS) $(WARNINGS)
NVCCFLAGS += -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)

LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \
	$(foreach library,$(LIBRARIES),-l$(library))

ALL_OBJS := $(addprefix $(BUILD_DIR)/, $(TARGET).o $(CUDA_TARGET).o $(CUDA_TARGET)_link.o)

all: clean $(TARGET).so

$(TARGET).so: $(ALL_OBJS)
	$(CXX) $^ $(LDFLAGS) -o $@

$(BUILD_DIR)/$(TARGET).o: src/$(TARGET).cpp
	@ mkdir -p $(BUILD_DIR)
	$(CXX) $(CXXFLAGS) -c $< -o $@

$(BUILD_DIR)/$(CUDA_TARGET).o: src/$(CUDA_TARGET).cu
	@ mkdir -p $(BUILD_DIR)
	$(NVCC) $(NVCCFLAGS) -dc $< -o $@

$(BUILD_DIR)/$(CUDA_TARGET)_link.o: $(BUILD_DIR)/$(CUDA_TARGET).o
	@ mkdir -p $(BUILD_DIR)
	$(NVCC) $(NVCCFLAGS) -dlink -o $@ $<

clean:
	rm -f $(BUILD_DIR)/*.o
	rm -f $(TARGET).so
--------------------------------------------------------------------------------
/Makefile.config:
--------------------------------------------------------------------------------
# Extracted from caffe Makefile.config.example
#
# We need to be able to find Python.h and numpy/arrayobject.h.
PYTHON_INCLUDE := /usr/include/python2.7 \
		/usr/lib/python2.7/dist-packages/numpy/core/include
# Anaconda Python distribution is quite popular. Include path:
# Verify anaconda location, sometimes it's in root.
# ANACONDA_HOME := $(HOME)/anaconda
# PYTHON_INCLUDE := $(ANACONDA_HOME)/include \
#		$(ANACONDA_HOME)/include/python2.7 \
#		$(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include


# We need to be able to find libpythonX.X.so or .dylib.
PYTHON_LIB := /usr/lib
# PYTHON_LIB := $(ANACONDA_HOME)/lib

# Whatever else you find you need goes here.
INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include
LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib

# Homebrew installs numpy in a non-standard path (keg only)
# PYTHON_INCLUDE += $(dir $(shell python -c 'import numpy.core; print(numpy.core.__file__)'))/include
# PYTHON_LIB += $(shell brew --prefix numpy)/lib

# CUDA directory contains bin/ and lib/ directories that we need.
CUDA_DIR := /usr/local/cuda

# Build directory
BUILD_DIR := build

# Uncomment for debugging.
# DEBUG := 1
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# K-Nearest Neighbor GPU

This repository contains a GPU version of K-Nearest Neighbor search. It also provides a Python wrapper for ease of use. The main CUDA code is modified from the [K Nearest Neighbor CUDA library](https://github.com/vincentfpgarcia/kNN-CUDA). Along with the K-NN search, the code provides feature extraction from a feature map using bilinear interpolation.

# Installation

Please modify `Makefile.config` to make sure all the dependencies are set correctly.

```
git clone https://github.com/chrischoy/knn_cuda.git
cd knn_cuda
```

Modify the `Makefile.config` file to set `PYTHON_INCLUDE`, `PYTHON_LIB`, and `CUDA_DIR` correctly. By default, the variables are set to the default Python and CUDA installation directories.
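For reference, a plain system-Python setup looks like the snippet below (a sketch of the shipped defaults — point the three variables at your own Python headers, Python library, and CUDA root if they live elsewhere):

```
PYTHON_INCLUDE := /usr/include/python2.7 \
		/usr/lib/python2.7/dist-packages/numpy/core/include
PYTHON_LIB := /usr/lib
CUDA_DIR := /usr/local/cuda
```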
Then

```
make
```

# Example

Once you build the wrapper, run

```
python example.py
[[3367 2785 1523 ..., 1526  569 3616]
 [1929 3353  339 ...,  690  463 2972]]
[[3413 3085 1528 ...,  608 2258  733]
 [1493 3849 1616 ...,  743 2012 1786]]
[[2446 3320 2379 ..., 2718  598 1854]
 [1348 3857 1393 ..., 3258 1642 3436]]
[[3044 2604 3972 ..., 3968 1710 2916]
 [ 812 1090  355 ...,  699 3231 2302]]
```

# Usage

In Python, after you `import knn`, you can access the following functions.

## distances, indices = knn.knn(query_points, reference_points, K)

Both `query_points` and `reference_points` must be numpy arrays in float32 format.
For both query and reference, the first dimension is the dimension of the vectors and the second dimension is the number of vectors.
`K` is the number of nearest neighbors.

For each vector in `query_points`, the function returns the distances to its K nearest neighbors and the 1-based indices of those neighbors.
`distances` and `indices` have the same shape: the first dimension has size `K` and the second dimension is equal to the number of vectors in `query_points`.

## extracted_features = knn.extract(activations, coordinates)

Extracts features from the activation maps using bilinear interpolation.
`activations` is a 4D (N, C, H, W) tensor from which we extract features: N is the number of feature maps, C is the number of channels, and H and W are the height and width, respectively.
`coordinates` is a 3D (N, M, 2) tensor which contains the coordinates at which we extract features: N is the number of feature maps, M is the number of coordinates, and the last dimension holds the x and y coordinates.

# Warning

The returned indices are 1-based.
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
import numpy as np
import knn

c = 128

for n in range(4):
    query = np.random.rand(c, 1000).astype(np.float32)

    reference = np.random.rand(c, 4000).astype(np.float32)

    # Index is 1-based
    dist, ind = knn.knn(query.reshape(c, -1),
                        reference.reshape(c, -1), 2)

    print ind
--------------------------------------------------------------------------------
/src/knn.cpp:
--------------------------------------------------------------------------------
// Python
#include <boost/python.hpp>
#include <numpy/arrayobject.h>
#include "knn.h"

using namespace boost::python;

// For extracting features from a 4-D blob feature map.
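// Expected inputs (as described in the README): activation_ is a float32
// ndarray of shape (N, C, H, W) and coords_ is a float32 ndarray of shape
// (N, M, 2) holding (x, y) sample locations; the result is an (N, C, M) array.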
object extract_feature(PyObject* activation_, PyObject* coords_)
{
  PyArrayObject* activation_py = (PyArrayObject*) activation_;
  PyArrayObject* coords_py = (PyArrayObject*) coords_;
  int n_batch = activation_py->dimensions[0];
  int n_channel = activation_py->dimensions[1];
  int height = activation_py->dimensions[2];
  int width = activation_py->dimensions[3];

  int n_max_coord = coords_py->dimensions[1];
  int dim_coord = coords_py->dimensions[2];

  float* activation = new float[n_batch * n_channel * height * width];
  float* coords = new float[n_batch * n_max_coord * dim_coord];
  float* extracted_activation = new float[n_batch * n_channel * n_max_coord];

  // Copy python objects
  for(int n = 0; n < n_batch; n++){
    for (int c = 0; c < n_channel; c++){
      for(int i = 0; i < height; i++) {
        for(int j = 0; j < width; j++) {
          activation[((n * n_channel + c) * height + i) * width + j] =
              *(float*)PyArray_GETPTR4(activation_py, n, c, i, j);
        }
      }
    }
  }

  for(int n = 0; n < n_batch; n++){
    for(int i = 0; i < n_max_coord; i++) {
      for(int j = 0; j < dim_coord; j++) {
        coords[(n * n_max_coord + i) * dim_coord + j] =
            *(float*)PyArray_GETPTR3(coords_py, n, i, j);
      }
    }
  }

  extract_cuda(activation, n_batch, n_channel, height,
      width, coords, n_max_coord, dim_coord, extracted_activation);

  npy_intp dims[3] = {n_batch, n_channel, n_max_coord};
  PyObject* py_obj = PyArray_SimpleNewFromData(3, dims, NPY_FLOAT,
      extracted_activation);
  handle<> handle(py_obj);

  numeric::array arr(handle);

  // The numpy array above merely borrows extracted_activation, so take a copy
  // first, then release all the buffers allocated with new[].
  object arr_copy = arr.copy();

  delete[] activation;
  delete[] coords;
  delete[] extracted_activation;

  return arr_copy;
}

// CUDA K-NN wrapper
// Takes features and returns the distances and indices of the k-nearest
// neighboring features.
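// Both query_points_ and ref_points_ are float32 ndarrays of shape
// (dim, n_points), one point per column. The returned tuple holds a
// (k, n_query) float32 distance array and a (k, n_query) int32 index array;
// indices into the reference set are 1-based.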
object knn(PyObject* query_points_, PyObject* ref_points_, int k)
{
  PyArrayObject* query_points = (PyArrayObject*) query_points_;
  PyArrayObject* ref_points = (PyArrayObject*) ref_points_;
  int n_query = query_points->dimensions[1];
  int n_ref = ref_points->dimensions[1];
  int dim = query_points->dimensions[0];
  float* query_points_c = new float[n_query * dim];
  float* ref_points_c = new float[n_ref * dim];
  float* dist = new float[n_query * k];
  int* ind = new int[n_query * k];

  // Copy python objects
  for(int i = 0; i < n_query; i++) {
    for(int j = 0; j < dim; j++) {
      query_points_c[n_query * j + i] =
          *(float*)PyArray_GETPTR2(query_points, j, i);
    }
  }

  for(int i = 0; i < n_ref; i++) {
    for(int j = 0; j < dim; j++) {
      ref_points_c[n_ref * j + i] = *(float*)PyArray_GETPTR2(ref_points, j, i);
    }
  }

  knn_cuda(ref_points_c, n_ref, query_points_c, n_query, dim, k, dist, ind);

  npy_intp dims[2] = {k, n_query};
  PyObject* py_obj_dist = PyArray_SimpleNewFromData(2, dims, NPY_FLOAT, dist);
  PyObject* py_obj_ind = PyArray_SimpleNewFromData(2, dims, NPY_INT, ind);
  handle<> handle_dist(py_obj_dist);
  handle<> handle_ind(py_obj_ind);

  numeric::array arr_dist(handle_dist);
  numeric::array arr_ind(handle_ind);

  // Copy the results into numpy-owned arrays, then release the new[] buffers.
  object ret = make_tuple(arr_dist.copy(), arr_ind.copy());

  delete[] query_points_c;
  delete[] ref_points_c;
  delete[] dist;
  delete[] ind;

  return ret;
}

BOOST_PYTHON_MODULE(knn)
{
  import_array();
  numeric::array::set_module_and_type("numpy", "ndarray");
  def("knn", knn);
  def("extract", extract_feature);
}
--------------------------------------------------------------------------------
/src/knn.h:
--------------------------------------------------------------------------------
void knn_cuda(float* ref_host, int ref_width, float* query_host, int query_width, int height, int k, float* dist_host, int* ind_host);
void extract_cuda(float* activation, int n_batch, int n_channel, int height,
    int width, float* coords, int n_max_coord, int dim_coord, float* extracted_activation);
--------------------------------------------------------------------------------
/src/knn_cuda.cu:
--------------------------------------------------------------------------------
/** Modified version of knn-CUDA from https://github.com/vincentfpgarcia/kNN-CUDA
 * The modifications are
 *   removed texture memory usage
 *   removed split query KNN computation
 *   added feature extraction with bilinear interpolation
 *
 * Last modified by Christopher B. Choy 12/23/2016
 */

// Includes
#include <stdio.h>
#include <time.h>
#include "cuda.h"

// Constants used by the program
#define BLOCK_DIM 16


//-----------------------------------------------------------------------------------------------//
//                                            KERNELS                                             //
//-----------------------------------------------------------------------------------------------//
__global__ void extract_with_interpolation(
    int nthreads,
    float *data, float *n_xy_coords, float *extracted_data,
    int n_max_coord, int channels, int height, int width) {

  int x0, x1, y0, y1, nc;
  float wx0, wx1, wy0, wy1;
  int n, nd;
  float x, y;

  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < (nthreads);
       index += blockDim.x * gridDim.x) {
    n = (index / n_max_coord);
    nd = n * n_max_coord * channels;
    x = n_xy_coords[index * 2];
    y = n_xy_coords[index * 2 + 1];

    x0 = static_cast<int>(floor(x));
    x1 = x0 + 1;
    y0 = static_cast<int>(floor(y));
    y1 = y0 + 1;

    // Clamp the four sample locations to the feature map boundary.
    x0 = x0 <= 0 ? 0 : (x0 >= (width - 1) ? (width - 1) : x0);
    y0 = y0 <= 0 ? 0 : (y0 >= (height - 1) ? (height - 1) : y0);
    x1 = x1 <= 0 ? 0 : (x1 >= (width - 1) ? (width - 1) : x1);
    y1 = y1 <= 0 ? 0 : (y1 >= (height - 1) ? (height - 1) : y1);

    wx0 = static_cast<float>(x1) - x;
    wx1 = x - x0;
    wy0 = static_cast<float>(y1) - y;
    wy1 = y - y0;

    if(x0 == x1){ wx0 = 1; wx1 = 0; }
    if(y0 == y1){ wy0 = 1; wy1 = 0; }
    for(int c = 0; c < channels; c++) {
      nc = (n * channels + c) * height;
      extracted_data[nd + index % n_max_coord + n_max_coord * c] = wy0 * wx0 * data[(nc + y0) * width + x0]
          + wy1 * wx0 * data[(nc + y1) * width + x0]
          + wy0 * wx1 * data[(nc + y0) * width + x1]
          + wy1 * wx1 * data[(nc + y1) * width + x1];
    }
  }
}

/**
 * Computes the distance between two matrices A (reference points) and
 * B (query points) containing respectively wA and wB points.
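 *
 * Points are stored one per column: coordinate d of point i in A lives at
 * A[d * wA + i], so dim (the matrix height) is the point dimension.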
 *
 * @param A     pointer on the matrix A
 * @param wA    width of the matrix A = number of points in A
 * @param B     pointer on the matrix B
 * @param wB    width of the matrix B = number of points in B
 * @param dim   dimension of points = height of matrices A and B
 * @param AB    pointer on the matrix containing the wA*wB distances computed
 */
__global__ void cuComputeDistanceGlobal( float* A, int wA,
    float* B, int wB, int dim, float* AB){

  // Declaration of the shared memory arrays As and Bs used to store the sub-matrices of A and B
  __shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
  __shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];

  // Sub-matrix of A (begin, step, end) and sub-matrix of B (begin, step)
  __shared__ int begin_A;
  __shared__ int begin_B;
  __shared__ int step_A;
  __shared__ int step_B;
  __shared__ int end_A;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Other variables
  float tmp;
  float ssd = 0;

  // Loop parameters
  begin_A = BLOCK_DIM * blockIdx.y;
  begin_B = BLOCK_DIM * blockIdx.x;
  step_A  = BLOCK_DIM * wA;
  step_B  = BLOCK_DIM * wB;
  end_A   = begin_A + (dim-1) * wA;

  // Conditions
  int cond0 = (begin_A + tx < wA); // used to write in shared memory
  int cond1 = (begin_B + tx < wB); // used to write in shared memory, for computations, and to write in the output matrix
  int cond2 = (begin_A + ty < wA); // used for computations and to write in the output matrix

  // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
  for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
    // Load the matrices from device memory to shared memory; each thread loads one element of each matrix
    if (a/wA + ty < dim){
      shared_A[ty][tx] = (cond0)? A[a + wA * ty + tx] : 0;
      shared_B[ty][tx] = (cond1)? B[b + wB * ty + tx] : 0;
    }
    else{
      shared_A[ty][tx] = 0;
      shared_B[ty][tx] = 0;
    }

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
    if (cond2 && cond1){
      for (int k = 0; k < BLOCK_DIM; ++k){
        tmp = shared_A[k][ty] - shared_B[k][tx];
        ssd += tmp*tmp;
      }
    }

    // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory; each thread writes one element
  if (cond2 && cond1)
    AB[(begin_A + ty) * wB + begin_B + tx] = ssd;
}


/**
 * Gathers the k smallest distances for each column of the distance matrix
 * into the top k rows.
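 *
 * Each thread owns one column (one query point): it keeps the k smallest
 * distances seen so far in the first k rows of that column, maintained by
 * insertion sort, and records the matching 1-based reference indices.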
 *
 * @param dist    distance matrix
 * @param ind     index matrix
 * @param width   width of the distance matrix and of the index matrix
 * @param height  height of the distance matrix and of the index matrix
 * @param k       number of neighbors to consider
 */
__global__ void cuInsertionSort(float *dist, int *ind, int width, int height, int k){

  // Variables
  int l, i, j;
  float *p_dist;
  int   *p_ind;
  float curr_dist, max_dist;
  int   curr_row,  max_row;
  unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;

  if (xIndex < width){
    // Pointer shift, initialization, and max value
    p_dist   = dist + xIndex;
    p_ind    = ind  + xIndex;
    max_dist = p_dist[0];
    p_ind[0] = 1;

    // Part 1 : sort the k first elements
    for (l=1; l<k; l++){
      curr_row  = l * width;
      curr_dist = p_dist[curr_row];
      if (curr_dist < max_dist){
        i = l-1;
        for (int a=0; a<l-1; a++){
          if (p_dist[a*width] > curr_dist){
            i=a;
            break;
          }
        }
        for (j=l; j>i; j--){
          p_dist[j*width] = p_dist[(j-1)*width];
          p_ind[j*width]  = p_ind[(j-1)*width];
        }
        p_dist[i*width] = curr_dist;
        p_ind[i*width]  = l+1;
      } else {
        p_ind[l*width] = l+1;
      }
      max_dist = p_dist[curr_row];
    }

    // Part 2 : insert element in the k-th first lines
    max_row = (k-1)*width;
    for (l=k; l<height; l++){
      curr_dist = p_dist[l*width];
      if (curr_dist < max_dist){
        i = k-1;
        for (int a=0; a<k-1; a++){
          if (p_dist[a*width] > curr_dist){
            i=a;
            break;
          }
        }
        for (j=k-1; j>i; j--){
          p_dist[j*width] = p_dist[(j-1)*width];
          p_ind[j*width]  = p_ind[(j-1)*width];
        }
        p_dist[i*width] = curr_dist;
        p_ind[i*width]  = l+1;
        max_dist = p_dist[max_row];
      }
    }
  }
}


/**
 * Computes the square root of the first k lines (k*width first elements)
 * of the distance matrix.
 *
 * @param dist    distance matrix
 * @param width   width of the distance matrix
 * @param k       number of neighbors to consider
 */
__global__ void cuParallelSqrt(float *dist, int width, int k){
  unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
  if (xIndex < width && yIndex < k)
    dist[yIndex * width + xIndex] = sqrt(dist[yIndex * width + xIndex]);
}


/**
 * Extracts features at the given coordinates from the activation maps
 * using bilinear interpolation.
 *
 * @param activation            activation maps on the host ; (n_batch, n_channel, height, width)
 * @param coords                (x, y) coordinates on the host ; (n_batch, n_max_coord, dim_coord)
 * @param extracted_activation  extracted features ; (n_batch, n_channel, n_max_coord)
 */
void extract_cuda(float* activation, int n_batch, int n_channel, int height,
    int width, float* coords, int n_max_coord, int dim_coord, float* extracted_activation){

  unsigned int size_of_float = sizeof(float);

  // Variables
  float* activation_device;
  float* coord_device;
  float* extracted_activation_device;

  // CUDA Initialisation
  cuInit(0);

  // Allocation of global memory
  cudaMalloc((void **) &activation_device,
      n_batch * n_channel * height * width * size_of_float);
  cudaMalloc((void **) &coord_device,
      n_batch * n_max_coord * dim_coord * size_of_float);
  cudaMalloc((void **) &extracted_activation_device,
      n_batch * n_channel * n_max_coord * size_of_float);

  // Memory copy of inputs from host to device
  cudaMemcpy(activation_device, &activation[0],
      n_batch * n_channel * height * width * size_of_float,
      cudaMemcpyHostToDevice);
  cudaMemcpy(coord_device, &coords[0],
      n_batch * n_max_coord * dim_coord * size_of_float,
      cudaMemcpyHostToDevice);

  // One thread per coordinate; the kernel loops with a grid stride
  dim3 t_256x1(256, 1, 1);
  dim3 g_256x1((n_batch * n_max_coord) / 256 + 1, 1, 1);

  extract_with_interpolation<<<g_256x1, t_256x1>>>(n_batch * n_max_coord,
      activation_device, coord_device, extracted_activation_device,
      n_max_coord, n_channel, height, width);

  // Memory copy of output from device to host
  cudaMemcpy(extracted_activation, &extracted_activation_device[0],
      n_batch * n_channel * n_max_coord * size_of_float,
      cudaMemcpyDeviceToHost);

  // Free memory
  cudaFree(coord_device);
  cudaFree(activation_device);
  cudaFree(extracted_activation_device);
}

/**
 * K nearest neighbor algorithm
 * - Initialize CUDA
 * - Allocate device memory
 * - Copy point sets (reference and query points) from host to device memory
 * - Compute the distances + indexes to the k nearest neighbors for each query point
 * - Copy distances from device to host memory
 *
 * @param ref_host     reference points ; pointer to linear matrix
 * @param ref_width    number of reference points ; width of the matrix
 * @param query_host   query points ; pointer to linear matrix
 * @param query_width  number of query points ; width of the matrix
 * @param height       dimension of points ; height of the matrices
 * @param k            number of neighbors to consider
 * @param dist_host    distances to the k nearest neighbors ; pointer to linear matrix
 * @param ind_host     indexes of the k nearest neighbors ; pointer to linear matrix
 */
void knn_cuda(float* ref_host, int ref_width, float* query_host,
    int query_width, int height, int k, float* dist_host, int* ind_host){

  unsigned int size_of_float = sizeof(float);
  unsigned int size_of_int = sizeof(int);

  // Variables
  float *query_dev;
  float *ref_dev;
  float *dist_dev;
  int   *ind_dev;

  // CUDA Initialisation
  cuInit(0);

  // Allocation of global memory for query points and for distances
  cudaMalloc((void **) &query_dev, query_width * height * size_of_float);
  cudaMalloc((void **) &dist_dev, query_width * ref_width * size_of_float);

  // Allocation of global memory for indexes
  cudaMalloc((void **) &ind_dev, query_width * k * size_of_int);

  // Allocation of global memory for reference points
  cudaMalloc((void **) &ref_dev, ref_width * height * size_of_float);

  cudaMemcpy(ref_dev, &ref_host[0], ref_width * height * size_of_float,
      cudaMemcpyHostToDevice);

  // Copy of part of query actually being treated
  cudaMemcpy(query_dev, &query_host[0],
      query_width * height * size_of_float, cudaMemcpyHostToDevice);

  // Grids and threads
  dim3 g_16x16(query_width/16, ref_width/16, 1);
  dim3 t_16x16(16, 16, 1);
  if (query_width%16 != 0) g_16x16.x += 1;
  if (ref_width %16 != 0) g_16x16.y += 1;
  //
  dim3 g_256x1(query_width/256, 1, 1);
  dim3 t_256x1(256, 1, 1);
  if (query_width%256 != 0) g_256x1.x += 1;

  dim3 g_k_16x16(query_width/16, k/16, 1);
  dim3 t_k_16x16(16, 16, 1);
  if (query_width%16 != 0) g_k_16x16.x += 1;
  if (k %16 != 0) g_k_16x16.y += 1;

  // Kernel 1: Compute all the distances
  cuComputeDistanceGlobal<<<g_16x16, t_16x16>>>(ref_dev, ref_width,
      query_dev, query_width, height, dist_dev);

  // Kernel 2: Sort each column
  cuInsertionSort<<<g_256x1, t_256x1>>>(dist_dev, ind_dev,
      query_width, ref_width, k);

  // Kernel 3: Compute square root of k first elements
  cuParallelSqrt<<<g_k_16x16, t_k_16x16>>>(dist_dev, query_width, k);

  // Memory copy of output from device to host
  cudaMemcpy(&dist_host[0], dist_dev,
      query_width * k * size_of_float, cudaMemcpyDeviceToHost);

  cudaMemcpy(&ind_host[0], ind_dev,
      query_width * k * size_of_int, cudaMemcpyDeviceToHost);

  // Free memory
  cudaFree(ref_dev);
  cudaFree(ind_dev);
  cudaFree(query_dev);
  cudaFree(dist_dev);
}


/**
 * Example of use of kNN search CUDA.
 */
int main(void){
  // Variables and parameters
  float* ref;                 // Pointer to reference point array
  float* query;               // Pointer to query point array
  float* dist;                // Pointer to distance array
  int*   ind;                 // Pointer to index array
  int    ref_nb     = 4096;   // Reference point number, max=65535
  int    query_nb   = 4096;   // Query point number, max=65535
  int    dim        = 32;     // Dimension of points
  int    k          = 20;     // Nearest neighbors to consider
  int    iterations = 100;
  int    i;

  // Memory allocation
  ref   = (float *) malloc(ref_nb   * dim * sizeof(float));
  query = (float *) malloc(query_nb * dim * sizeof(float));
  dist  = (float *) malloc(query_nb * k * sizeof(float));
  ind   = (int *)   malloc(query_nb * k * sizeof(int));

  // Init
  srand(time(NULL));
  for (i=0 ; i<ref_nb   * dim ; i++) ref[i]   = (float)rand() / (float)RAND_MAX;
  for (i=0 ; i<query_nb * dim ; i++) query[i] = (float)rand() / (float)RAND_MAX;

  // Call kNN search CUDA
  for (i=0 ; i<iterations ; i++)
    knn_cuda(ref, ref_nb, query, query_nb, dim, k, dist, ind);

  // Free memory
  free(ind);
  free(dist);
  free(query);
  free(ref);

  return 0;
}
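--------------------------------------------------------------------------------

A minimal end-to-end sketch of the Python API described in the README, combining `knn.knn` and `knn.extract`; the shapes follow the README and the arrays here are made-up random test data:

```
import numpy as np
import knn

dim, n_query, n_ref, K = 128, 1000, 4000, 5

query = np.random.rand(dim, n_query).astype(np.float32)
reference = np.random.rand(dim, n_ref).astype(np.float32)

# dist and ind both have shape (K, n_query); ind is 1-based,
# so subtract 1 before indexing back into `reference`.
dist, ind = knn.knn(query, reference, K)
nearest = reference[:, ind[0] - 1]

# Bilinear feature extraction: (N, C, H, W) activations sampled at
# (N, M, 2) xy-coordinates give an (N, C, M) feature array.
activations = np.random.rand(1, 64, 32, 32).astype(np.float32)
coordinates = (np.random.rand(1, 10, 2) * 31).astype(np.float32)
features = knn.extract(activations, coordinates)

print dist.shape, ind.shape, features.shape
```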