├── README.md
├── knn-0-1.rockspec
├── LICENSE.md
├── CMakeLists.txt
├── generic
│   └── knn.cpp
├── init.lua
├── test.lua
├── knn.cpp
└── knn_cuda.cu

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

lua-knn
=======

torch7 wrapper for knn CUDA code
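A minimal usage sketch (assuming the rock has been built and installed, e.g. with `luarocks make knn-0-1.rockspec`, and a CUDA device is available). Note that the returned distances are squared Euclidean distances, since the square-root kernel in `knn_cuda.cu` is commented out:

```lua
local knn = require 'knn'

-- 10000 reference points and 50 query points, 128 features each
local ref   = torch.FloatTensor(10000, 128):uniform()
local query = torch.FloatTensor(50, 128):uniform()

-- distances : 50 x 4 FloatTensor of squared L2 distances, smallest first
-- indices   : 50 x 4 IntTensor of 1-based row indices into ref
local distances, indices = knn.knn(ref, query, 4)
```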
--------------------------------------------------------------------------------
/knn-0-1.rockspec:
--------------------------------------------------------------------------------

package = "knn"
version = "0-1"

source = {
   url = "git://github.com/torch/nn.git",
}

description = {
   summary = "KNN cuda bindings for Torch",
   detailed = [[
   ]],
   homepage = "https://github.com/torch/nn",
   license = "Attribution-Noncommercial-Share Alike 3.0 Unported"
}

dependencies = {
   "torch >= 7.0",
}

build = {
   type = "command",
   build_command = [[
cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
]],
   install_command = "cd build && $(MAKE) install"
}

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------

CREATIVE COMMONS LICENSE
========================

Attribution-Noncommercial-Share Alike 3.0 Unported

You are free:

* to Share : to copy, distribute and transmit the work
* to Remix : to adapt the work

Under the following conditions:

* Attribution : You must attribute the work in the manner specified by the author or licensor (but not in any way that suggests that they endorse you or your use of the work).
* Noncommercial : You may not use this work for commercial purposes.
* Share Alike : If you alter, transform, or build upon this work, you may distribute the resulting work only under the same or similar license to this one.

For more information, please consult the page http://creativecommons.org/licenses/by-nc-sa/3.0/

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------

CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
CMAKE_POLICY(VERSION 2.6)

SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")

FIND_PACKAGE(Torch REQUIRED)
FIND_PACKAGE(CUDA 4.0 REQUIRED)

# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")

INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}")
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
LINK_DIRECTORIES("${Torch_INSTALL_LIB}")

# SET(src knn.cpp)
SET(src-cuda knn.cpp knn_cuda.cu generic/knn.cpp)
SET(luasrc init.lua)

CUDA_ADD_LIBRARY(knn MODULE ${src-cuda})

CUDA_ADD_CUBLAS_TO_TARGET(knn)

# ADD_TORCH_PACKAGE(knn "${src}" "${luasrc}" "Machine Learning")
TARGET_LINK_LIBRARIES(knn luaT TH cuda)


SET_TARGET_PROPERTIES(knn PROPERTIES
  PREFIX "lib"
  IMPORT_PREFIX "lib")

INSTALL(TARGETS knn
  RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}"
  LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}")

INSTALL(
  FILES
  ${luasrc}

  DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/knn")

--------------------------------------------------------------------------------
/generic/knn.cpp:
--------------------------------------------------------------------------------

#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/knn.cpp"
#else


#include <exception>
#include <stdexcept>


inline void libknn_(push)(lua_State *L, THTensor *tensor) {
  THTensor_(retain)(tensor);
  luaT_pushudata(L, tensor, torch_Tensor);
}


inline THTensor *libknn_(checkTensor)(lua_State* L, int arg) {
  return (THTensor*)luaT_checkudata(L, arg, torch_Tensor);
}

template<> struct TensorType<real> { typedef THTensor Tensor; };
inline real *data(THTensor *tensor) { return THTensor_(data)(tensor); }

inline THLongStorage *newSizeOf(THTensor *tensor) { return THTensor_(newSizeOf)(tensor); }


template<>
inline THTensor *check<real>(lua_State *L, int i) { return libknn_(checkTensor)(L, i); }


template<typename T>
inline int libknn_(lookup) (lua_State *L) {
  try {

    THTensor *table = libknn_(checkTensor)(L, 1);
    typename TensorType<T>::Tensor *index = check<T>(L, 2);

    THTensor *dest = THTensor_(newWithSize)(newSizeOf(index), NULL);
    real *tableData = (real*)data(table);

    TH_TENSOR_APPLY2(real, dest, T, index, *dest_data = tableData[*index_data - 1]; );

    libknn_(push)(L, dest);

    return 1;

  } catch (std::exception const &e) {
    luaL_error(L, e.what());
  }
}
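
// How the generic lookup behaves: it allocates a destination tensor with the same
// shape as `index` and fills it element-wise with dest[i] = table[index[i]], where
// indices are 1-based to match the Lua/Torch convention (hence the `- 1` above).
// For example, table = {10, 20, 30} with index = {3, 1} produces {30, 10}.
// The wrappers below instantiate it for each integer index type.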
static int libknn_(lookupByte) (lua_State *L) { return libknn_(lookup)<unsigned char>(L); }
static int libknn_(lookupShort) (lua_State *L) { return libknn_(lookup)<short>(L); }
static int libknn_(lookupInt) (lua_State *L) { return libknn_(lookup)<int>(L); }
static int libknn_(lookupLong) (lua_State *L) { return libknn_(lookup)<long>(L); }


static const luaL_reg libknn_(Main__) [] =
{
  {"lookup_byte",  libknn_(lookupByte) },
  {"lookup_short", libknn_(lookupShort) },
  {"lookup_int",   libknn_(lookupInt) },
  {"lookup_long",  libknn_(lookupLong) },
  {NULL, NULL}  /* sentinel */
};


extern "C" {

DLL_EXPORT int libknn_(Main_init) (lua_State *L) {
  luaT_pushmetatable(L, torch_Tensor);
  luaT_registeratname(L, libknn_(Main__), "libknn");
  lua_pop(L,1);
  return 1;
}

}

#endif

--------------------------------------------------------------------------------
/init.lua:
--------------------------------------------------------------------------------


require 'torch'
require 'sys'
require 'paths'
require 'dok'


-- load C lib
require 'libknn'


local knn = {}

function knn.knn(...)

   local _, ref, query, k = dok.unpack(
      {...},
      'knn.knn',
      [[K-Nearest Neighbours]],

      {arg='ref', type='torch.FloatTensor',
       help='reference points (m x h) 2d tensor', req=true},

      {arg='query', type='torch.FloatTensor',
       help='query point(s) (n x h) 2d tensor or (h) 1d tensor', req=true},

      {arg='k', type='number',
       help='number of results returned per query point', default=1}
   )


   if(query:dim() == 1) then
     query = query:resize(1, query:size(1))
   end

   assert(query:dim() == 2 and ref:dim() == 2, "query must be a 1d or 2d tensor (h or n x h), ref must be a 2d (m x h) tensor")
   assert(query:size(2) == ref:size(2), "query and ref must have equal size features")
   -- assert(query:size(1) <= 65535 and ref:size(1) <= 65535, "maximum size permitted is 65535")

   k = math.min(k, ref:size(1))

   local distances, indices = libknn.knn(k, ref:t():contiguous(), query:t():contiguous())
   return distances:t(), indices:t()
end
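
-- An illustrative call (shapes only, not part of the module):
--   local ref   = torch.FloatTensor(1000, 32):uniform()  -- 1000 reference points, 32 features
--   local query = torch.FloatTensor(10, 32):uniform()    -- 10 query points
--   local dists, inds = knn.knn(ref, query, 5)           -- dists: 10 x 5, inds: 10 x 5 (1-based)
-- The distances come back squared (the square-root kernel in knn_cuda.cu is
-- commented out), so call dists:sqrt() if true Euclidean distances are needed.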

function knn.lookup(...)

   local _, table, indexes = dok.unpack(
      {...},
      'knn.lookup',
      [[Look up values in a 1d table tensor at the given indexes]],

      {arg='table', type='torch.*Tensor',
       help='table to index, 1d tensor', req=true},

      {arg='indexes', type='torch.IntTensor | torch.LongTensor | torch.ShortTensor | torch.ByteTensor',
       help='tensor of indexes into the table', req=true}
   )


   assert(table:dim() == 1)

   if(torch.typename(indexes) == "torch.ByteTensor") then
     return table.libknn.lookup_byte(table, indexes)
   elseif (torch.typename(indexes) == "torch.ShortTensor") then
     return table.libknn.lookup_short(table, indexes)
   elseif (torch.typename(indexes) == "torch.IntTensor") then
     return table.libknn.lookup_int(table, indexes)
   elseif (torch.typename(indexes) == "torch.LongTensor") then
     return table.libknn.lookup_long(table, indexes)
   else
     assert(false, "indexes must have integer type")
   end

end


return knn

--------------------------------------------------------------------------------
/test.lua:
--------------------------------------------------------------------------------


local knn = require 'knn'

local function time(name, f)
  local t = torch.Timer()
  local r = f()
  print(string.format("%s = %s: %.2f", name, tostring(r), t:time().real))

  collectgarbage()
end

local test = {}


local makeData = function(n, size)
  return torch.FloatTensor():range(1, size * n):reshape(n, size)
end

test.benchmark = function(k, size, q, n)
  local data = makeData(n, size)
  local query = makeData(q, size)

  time(string.format("knn (k: %d) (features: %d) (query: %d) (data: %d)", k, query:size(2), query:size(1), data:size(1)), function ()
    knn.knn(data, query, k)
  end)

end


test.knn = function(k, size, q, n)
  local data = torch.FloatTensor():rand(n, size)
  local query = torch.FloatTensor(q, size):zero()


  local inds = {}
  for i = 1, q do
    local index = torch.random(n)
    inds[i] = index
    query[i] = data[index]
  end

  local dists, indices = knn.knn(data, query, k)

  -- print(data, query, dists, indices)

  for i = 1, q do
    assert(dists[i][1] == 0, "distance should be zero, was: "..tostring(dists[i][1]))
    assert(indices[i][1] == inds[i], "indices aren't correct, was: "..tostring(indices[i][1]).." should be: "..inds[i])
  end

  print(string.format("test passed, knn (k: %d) (features: %d) (query: %d) (data: %d)", k, query:size(2), query:size(1), data:size(1)))
end



test.lookup = function(n)

  for i = 1, n do

    local n1 = torch.random(100)
    local n2 = torch.random(10)

    local l = torch.random(20)

    local table = torch.LongTensor():range(1, l)
    local indices = torch.IntTensor(n1, n2):random(1, l)

    local r = knn.lookup(table, indices):int()

    assert(r:eq(indices):min() == 1, string.format("lookup failed table = %d indices = (%d, %d)", l, n1, n2))
  end

  print(string.format("lookup passed %d tests", n))

end

test.lookup(10000)

test.knn(2, 5, 10, 10)
test.knn(4, 1280, 100, 10000)
test.knn(4, 100, 200, 10000)

test.knn(16, 1024, 2000, 70000)

test.benchmark(2, 128, 10000, 10000)
test.benchmark(4, 128, 10000, 50000)
test.benchmark(8, 128, 50000, 50000)
test.benchmark(24, 128, 10000, 100000)
test.benchmark(16, 1024, 10000, 50000)

return test
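
-- A sketch of how the two functions combine into a 1-nearest-neighbour classifier
-- (illustrative only; trainData, trainLabels and testData are hypothetical tensors,
-- not defined in this repository):
--
--   -- trainData: m x h FloatTensor, trainLabels: m LongTensor, testData: n x h FloatTensor
--   local _, indices = knn.knn(trainData, testData, 1)
--   local predicted = knn.lookup(trainLabels, indices)  -- n x 1 LongTensor of predicted labels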

--------------------------------------------------------------------------------
/knn.cpp:
--------------------------------------------------------------------------------

#include <TH.h>
#include <luaT.h>

#include <stdexcept>

template<typename T> struct TensorType;


template<typename T> typename TensorType<T>::Tensor *check(lua_State *L, int i);



extern void knn(float* ref_host, int ref_width, float* query_host, int query_width, int height, int k, float* dist_host, int* ind_host);

static int knn(lua_State* L) {
  THFloatTensor *distances = NULL;
  THIntTensor *indices = NULL;

  try {

    int k = lua_tonumber(L, 1);
    THFloatTensor *ref = (THFloatTensor*)luaT_checkudata(L, 2, "torch.FloatTensor");
    THFloatTensor *query = (THFloatTensor*)luaT_checkudata(L, 3, "torch.FloatTensor");

    if(ref->nDimension != 2 || query->nDimension != 2)
      throw std::invalid_argument("knn: expected 2d tensor of reference and query points");

    size_t features = ref->size[0];
    if(features != query->size[0])
      throw std::invalid_argument("knn: query and reference points must have the same size");

    distances = THFloatTensor_newWithSize2d(k, query->size[1]);
    indices = THIntTensor_newWithSize2d(k, query->size[1]);


    knn(THFloatTensor_data(ref), ref->size[1], THFloatTensor_data(query), query->size[1], features, k, THFloatTensor_data(distances), THIntTensor_data(indices));

    THFloatTensor_retain(distances);
    THIntTensor_retain(indices);

    luaT_pushudata(L, distances, "torch.FloatTensor");
    luaT_pushudata(L, indices, "torch.IntTensor");

    return 2;

  } catch (std::exception const &e) {
    luaL_error(L, e.what());
  }


  return 1;
}

#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
#define torch_Tensor TH_CONCAT_STRING_3(torch.,Real,Tensor)
#define libknn_(NAME) TH_CONCAT_3(libknn_, Real, NAME)

#include "generic/knn.cpp"
#include "THGenerateAllTypes.h"




//============================================================
// Register functions in LUA
//
static const luaL_reg libknn_init [] =
{
  {"knn", knn},
  {NULL,NULL}
};


extern "C" {

DLL_EXPORT int luaopen_libknn(lua_State *L)
{

  luaL_register(L, "libknn", libknn_init);

  libknn_ByteMain_init(L);
  libknn_CharMain_init(L);
  libknn_ShortMain_init(L);
  libknn_IntMain_init(L);
  libknn_LongMain_init(L);
  libknn_FloatMain_init(L);
  libknn_DoubleMain_init(L);

  return 1;
}

}
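
// Note on layout: libknn.knn expects ref and query already transposed to h x m and
// h x n (points stored as columns), which is why init.lua passes ref:t():contiguous()
// and query:t():contiguous(). The outputs are k x n (one column per query point)
// and are transposed back to n x k on the Lua side. Indices are 1-based.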

--------------------------------------------------------------------------------
/knn_cuda.cu:
--------------------------------------------------------------------------------

/**
  *
  * Date         03/07/2009
  * ====
  *
  * Authors      Vincent  Garcia
  * =======      Eric     Debreuve
  *              Michel   Barlaud
  *
  * Description  Given a reference point set and a query point set, the program returns
  * ===========  first the distance between each query point and its k nearest neighbors in
  *              the reference point set, and second the indexes of these k nearest neighbors.
  *              The computation is performed using the NVIDIA CUDA API.
  *
  * Paper        Fast k nearest neighbor search using GPU
  * =====
  *
  * BibTeX       @INPROCEEDINGS{2008_garcia_cvgpu,
  * ======         author = {V. Garcia and E. Debreuve and M. Barlaud},
  *                title = {Fast k nearest neighbor search using GPU},
  *                booktitle = {CVPR Workshop on Computer Vision on GPU},
  *                year = {2008},
  *                address = {Anchorage, Alaska, USA},
  *                month = {June}
  *              }
  *
  */



// Includes
#include <stdio.h>
#include <math.h>
#include "cuda.h"
#include <time.h>

#include <sstream>
#include <stdexcept>

// Constants used by the program
#define MAX_PART_OF_FREE_MEMORY_USED 0.9

// Code breaks with different values of this constant
#define BLOCK_DIM 32



//-----------------------------------------------------------------------------------------------//
//                                            KERNELS                                             //
//-----------------------------------------------------------------------------------------------//



/**
  * Computes the distance between two matrices A (reference points) and
  * B (query points) containing respectively wA and wB points.
  *
  * @param A     pointer on the matrix A
  * @param wA    width of the matrix A = number of points in A
  * @param pA    pitch of matrix A given in number of columns
  * @param B     pointer on the matrix B
  * @param wB    width of the matrix B = number of points in B
  * @param pB    pitch of matrix B given in number of columns
  * @param dim   dimension of points = height of matrices A and B
  * @param AB    pointer on the matrix containing the wA*wB distances computed
  */
__global__ void cuComputeDistanceGlobal( float* A, int wA, int pA, float* B, int wB, int pB, int dim, float* AB){

  // Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B
  __shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
  __shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];

  // Sub-matrix of A (begin, step, end) and sub-matrix of B (begin, step)
  __shared__ int begin_A;
  __shared__ int begin_B;
  __shared__ int step_A;
  __shared__ int step_B;
  __shared__ int end_A;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Other variables
  float tmp;
  float ssd = 0;

  // Loop parameters
  begin_A = BLOCK_DIM * blockIdx.y;
  begin_B = BLOCK_DIM * blockIdx.x;
  step_A  = BLOCK_DIM * pA;
  step_B  = BLOCK_DIM * pB;
  end_A   = begin_A + (dim-1) * pA;

  // Conditions
  int cond0 = (begin_A + tx < wA); // used to write in shared memory
  int cond1 = (begin_B + tx < wB); // used to write in shared memory, for computations and to write in the output matrix
  int cond2 = (begin_A + ty < wA); // used for computations and to write in the output matrix

  // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
  for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {

    // Load the matrices from device memory to shared memory; each thread loads one element of each matrix
    if (a/pA + ty < dim){
      shared_A[ty][tx] = (cond0)? A[a + pA * ty + tx] : 0;
      shared_B[ty][tx] = (cond1)? B[b + pB * ty + tx] : 0;
    }
    else{
      shared_A[ty][tx] = 0;
      shared_B[ty][tx] = 0;
    }

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
    if (cond2 && cond1){
      for (int k = 0; k < BLOCK_DIM; ++k){
        tmp = shared_A[k][ty] - shared_B[k][tx];
        ssd += tmp*tmp;
      }
    }

    // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory; each thread writes one element
  if (cond2 && cond1)
    AB[ (begin_A + ty) * pB + begin_B + tx ] = ssd;
}


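
// The resulting distance matrix AB has one column per query point and one row per
// reference point, and holds squared L2 distances (the square-root kernel further
// down is commented out). cuInsertionSort below then works column by column: after
// it runs, the first k rows of each column hold the k smallest distances in
// increasing order, with the matching 1-based reference indices in the index
// matrix, so the host only needs to copy k rows back.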
/**
  * Gathers the k smallest distances of each column of the distance matrix into the top of that column.
  *
  * @param dist        distance matrix
  * @param dist_pitch  pitch of the distance matrix given in number of columns
  * @param ind         index matrix
  * @param ind_pitch   pitch of the index matrix given in number of columns
  * @param width       width of the distance matrix and of the index matrix
  * @param height      height of the distance matrix and of the index matrix
  * @param k           number of neighbors to consider
  */
__global__ void cuInsertionSort(float *dist, int dist_pitch, int *ind, int ind_pitch, int width, int height, int k){

  // Variables
  int l, i, j;
  float *p_dist;
  int   *p_ind;
  float curr_dist, max_dist;
  int   curr_row,  max_row;
  unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;

  if (xIndex < width){

    // Pointer shift, initialization, and max value
    p_dist   = dist + xIndex;
    p_ind    = ind  + xIndex;
    max_dist = p_dist[0];
    p_ind[0] = 1;

    // Part 1 : sort the k first elements
    for (l=1; l<k; l++){
      curr_row  = l * dist_pitch;
      curr_dist = p_dist[curr_row];
      if (curr_dist < max_dist){
        i = l-1;
        for (int a=0; a<l-1; a++){
          if (p_dist[a*dist_pitch] > curr_dist){
            i=a;
            break;
          }
        }
        for (j=l; j>i; j--){
          p_dist[j*dist_pitch] = p_dist[(j-1)*dist_pitch];
          p_ind[j*ind_pitch]   = p_ind[(j-1)*ind_pitch];
        }
        p_dist[i*dist_pitch] = curr_dist;
        p_ind[i*ind_pitch]   = l+1;
      }
      else
        p_ind[l*ind_pitch] = l+1;
      max_dist = p_dist[curr_row];
    }

    // Part 2 : insert each remaining element into the k first lines
    max_row = (k-1)*dist_pitch;
    for (l=k; l<height; l++){
      curr_dist = p_dist[l*dist_pitch];
      if (curr_dist < max_dist){
        i = k-1;
        for (int a=0; a<k-1; a++){
          if (p_dist[a*dist_pitch] > curr_dist){
            i=a;
            break;
          }
        }
        for (j=k-1; j>i; j--){
          p_dist[j*dist_pitch] = p_dist[(j-1)*dist_pitch];
          p_ind[j*ind_pitch]   = p_ind[(j-1)*ind_pitch];
        }
        p_dist[i*dist_pitch] = curr_dist;
        p_ind[i*ind_pitch]   = l+1;
        max_dist = p_dist[max_row];
      }
    }
  }
}




//-----------------------------------------------------------------------------------------------//
//                                   K-th NEAREST NEIGHBORS                                       //
//-----------------------------------------------------------------------------------------------//


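
/*
 * Device memory layout used by knn() below. One pitched allocation of
 * (height + ref_width) rows holds the query matrix with the distance matrix
 * directly underneath it, so dist_dev is simply an offset into query_dev;
 * indexes and reference points get their own pitched allocations.
 * With "chunk" standing for the max_nb_query_traited query points handled per pass:
 *
 *   query_dev : height    x chunk      (float)  query points, one column per point
 *   dist_dev  : ref_width x chunk      (float)  squared distances, = query_dev + height * query_pitch
 *   ind_dev   : k         x chunk      (int)    1-based indices of the k nearest references
 *   ref_dev   : height    x ref_width  (float)  reference points, copied once
 */
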
/**
  * Throws an error when a memory allocation has failed.
  *
  * @param error       error value returned by the memory allocation function
  * @param memorySize  size of the memory that was being allocated
  */
void checkAlloc(cudaError_t error, int memorySize) {
  std::ostringstream out;
  out << "allocation failure (allocating " << memorySize << " bytes): " << cudaGetErrorString(error);
  if(error) {
    throw std::logic_error(out.str());
  }
}



/**
  * K nearest neighbor algorithm
  * - Initialize CUDA
  * - Allocate device memory
  * - Copy point sets (reference and query points) from host to device memory
  * - Compute the distances + indexes to the k nearest neighbors for each query point
  * - Copy distances and indexes back from device to host memory
  *
  * @param ref_host     reference points ; pointer to linear matrix
  * @param ref_width    number of reference points ; width of the matrix
  * @param query_host   query points ; pointer to linear matrix
  * @param query_width  number of query points ; width of the matrix
  * @param height       dimension of points ; height of the matrices
  * @param k            number of neighbors to consider
  * @param dist_host    distances to the k nearest neighbors ; pointer to linear matrix
  * @param ind_host     indexes of the k nearest neighbors ; pointer to linear matrix
  *
  */
void knn(float* ref_host, int ref_width, float* query_host, int query_width, int height, int k, float* dist_host, int* ind_host){


  // Variables
  float *query_dev;
  float *ref_dev;
  float *dist_dev;
  int   *ind_dev;

  cudaError_t result;
  size_t query_pitch;
  size_t query_pitch_in_bytes;
  size_t ref_pitch;
  size_t ref_pitch_in_bytes;
  size_t ind_pitch;
  size_t ind_pitch_in_bytes;
  size_t max_nb_query_traited;
  size_t actual_nb_query_width;
  size_t memory_total;
  size_t memory_free;

  try {
    // CUDA Initialisation
    cuInit(0);

    // Check free memory using driver API ; only (MAX_PART_OF_FREE_MEMORY_USED*100)% of memory will be used
    CUcontext cuContext;
    CUdevice cuDevice=0;
    cuCtxCreate(&cuContext, 0, cuDevice);
    cuMemGetInfo(&memory_free, &memory_total);
    cuCtxDetach (cuContext);

    // Determine the maximum number of query points that can be treated at once
    max_nb_query_traited = ( memory_free * MAX_PART_OF_FREE_MEMORY_USED - sizeof(float) * ref_width*height ) / ( sizeof(float) * (height + ref_width) + sizeof(int) * k);
    max_nb_query_traited = min( query_width, int((max_nb_query_traited / BLOCK_DIM) * BLOCK_DIM) );

    // Allocation of global memory for query points and for distances
    result = cudaMallocPitch( (void **) &query_dev, &query_pitch_in_bytes, max_nb_query_traited * sizeof(float), height + ref_width);
    checkAlloc(result, max_nb_query_traited*sizeof(float)*(height+ref_width));

    query_pitch = query_pitch_in_bytes/sizeof(float);
    dist_dev    = query_dev + height * query_pitch;

    // Allocation of global memory for indexes
    result = cudaMallocPitch( (void **) &ind_dev, &ind_pitch_in_bytes, max_nb_query_traited * sizeof(int), k);
    checkAlloc(result, max_nb_query_traited*sizeof(int)*k);

    ind_pitch = ind_pitch_in_bytes/sizeof(int);

    // Allocation of global memory for reference points
    result = cudaMallocPitch( (void **) &ref_dev, &ref_pitch_in_bytes, ref_width * sizeof(float), height);
    checkAlloc(result, ref_width*sizeof(float)*height);

    ref_pitch = ref_pitch_in_bytes/sizeof(float);
    cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref_host, ref_width*sizeof(float), ref_width*sizeof(float), height, cudaMemcpyHostToDevice);
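
    // Rough illustration of the sizing above (example numbers only): with
    // ref_width = 50000, height = 128 and k = 16, the resident reference matrix
    // needs about 26 MB, and each query point costs 4*(128 + 50000) + 4*16 bytes,
    // roughly 200 KB, so with ~4 GB of free memory a little over 19000 query
    // points fit in a single pass (rounded down to a multiple of BLOCK_DIM).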

    // Split queries to fit in GPU memory
    for (int i=0; i<query_width; i+=max_nb_query_traited){

      // Number of query points treated in this pass
      actual_nb_query_width = min( max_nb_query_traited, (size_t)(query_width - i) );

      // Copy the part of the query set actually being treated
      cudaMemcpy2D(query_dev, query_pitch_in_bytes, &query_host[i], query_width*sizeof(float), actual_nb_query_width*sizeof(float), height, cudaMemcpyHostToDevice);

      // Grids and threads
      dim3 g_block(actual_nb_query_width/BLOCK_DIM, ref_width/BLOCK_DIM, 1);
      dim3 t_block(BLOCK_DIM, BLOCK_DIM, 1);
      if (actual_nb_query_width % BLOCK_DIM != 0) g_block.x += 1;
      if (ref_width % BLOCK_DIM != 0) g_block.y += 1;

      dim3 g_sort(actual_nb_query_width/256, 1, 1);
      dim3 t_sort(256, 1, 1);
      if (actual_nb_query_width % 256 != 0) g_sort.x += 1;

      // Kernel 1: Compute all the distances
      cuComputeDistanceGlobal<<<g_block, t_block>>>(ref_dev, ref_width, ref_pitch, query_dev, actual_nb_query_width, query_pitch, height, dist_dev);

      // Kernel 2: Sort each column
      cuInsertionSort<<<g_sort, t_sort>>>(dist_dev, query_pitch, ind_dev, ind_pitch, actual_nb_query_width, ref_width, k);

      // Kernel 3: Compute square root of k first elements
      // cuParallelSqrt<<<g_block, t_block>>>(dist_dev, query_width, query_pitch, k);

      // Memory copy of output from device to host
      cudaMemcpy2D(&dist_host[i], query_width*sizeof(float), dist_dev, query_pitch_in_bytes, actual_nb_query_width*sizeof(float), k, cudaMemcpyDeviceToHost);
      cudaMemcpy2D(&ind_host[i], query_width*sizeof(int), ind_dev, ind_pitch_in_bytes, actual_nb_query_width*sizeof(int), k, cudaMemcpyDeviceToHost);
    }
  } catch(...) {
    cudaFree(ref_dev);
    cudaFree(ind_dev);
    cudaFree(query_dev);

    throw;
  }

  cudaFree(ref_dev);
  cudaFree(ind_dev);
  cudaFree(query_dev);
}



/*
/**
 * Example of use of kNN search CUDA.
 */
int main(void){

  // Variables and parameters
  float* ref;                 // Pointer to reference point array
  float* query;               // Pointer to query point array
  float* dist;                // Pointer to distance array
  int*   ind;                 // Pointer to index array
  int    ref_nb     = 100000; // Reference point number, max=65535
  int    query_nb   = 100000; // Query point number, max=65535
  int    dim        = 128;    // Dimension of points
  int    k          = 20;     // Nearest neighbors to consider
  int    iterations = 1;
  int    i;

  // Memory allocation
  ref    = (float *) malloc(ref_nb   * dim * sizeof(float));
  query  = (float *) malloc(query_nb * dim * sizeof(float));
  dist   = (float *) malloc(query_nb * k * sizeof(float));
  ind    = (int *)   malloc(query_nb * k * sizeof(int));

  // Init
  srand(time(NULL));
  for (i=0 ; i