├── run.sh ├── README.md ├── nn.cpp └── nn.cu /run.sh: -------------------------------------------------------------------------------- 1 | g++ -std=c++11 -o nn_cpu nn.cpp; 2 | nvcc -std=c++11 -o nn_gpu nn.cu; 3 | echo CPU implementation:; 4 | ./nn_cpu; 5 | echo GPU implementation:; 6 | ./nn_gpu; 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 3D nearest neighbor search in a k-d tree with CUDA 2 | 3 | The goal of this project is to compare the performance of CPU ([nn.cpp](nn.cpp)) and CUDA GPU ([nn.cu](nn.cu)) implementations of the same problem. 4 | Both solutions find the nearest neighbor for each of one million query points in a k-d tree built from 10,000 three-dimensional points. 5 | 6 | To compile and run both implementations, use the [run.sh](run.sh) script. 7 | -------------------------------------------------------------------------------- /nn.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | const int N_POINTS = 1e4, N_QUERIES = 1e6, INF = 1e9, RANGE_MAX = 100, N_PRINT = 10; 8 | 9 | struct int3 10 | { 11 | int x; 12 | int y; 13 | int z; 14 | }; 15 | 16 | void print(int3 *points, int n); 17 | void generatePoints(int3 *points, int n); 18 | void buildKDTree(int3 *points, int3 *tree, int n, int m); 19 | int3 findNearestNeighbor(int3 *tree, int treeSize, int treeNode, int depth, int3 query); 20 | void printResults(int3 *queries, int3 *results, int n); 21 | 22 | int main() 23 | { 24 | srand(17); 25 | 26 | int TREE_SIZE = 1; 27 | while (TREE_SIZE < N_POINTS) 28 | TREE_SIZE <<= 1; 29 | 30 | int3 *points = new int3[N_POINTS]; 31 | int3 *tree = new int3[TREE_SIZE]; 32 | int3 *queries = new int3[N_QUERIES]; 33 | 34 | generatePoints(points, N_POINTS); 35 | buildKDTree(points, tree, N_POINTS, TREE_SIZE); 36 | generatePoints(queries, 
N_QUERIES); 37 | 38 | 39 | auto start = std::chrono::system_clock::now(); 40 | 41 | int3 *results = new int3[N_QUERIES]; 42 | for (int i = 0; i < N_QUERIES; i++) 43 | { 44 | results[i] = findNearestNeighbor(tree, TREE_SIZE, 1, 0, queries[i]); 45 | } 46 | 47 | auto end = std::chrono::system_clock::now(); 48 | float duration = 1000.0 * std::chrono::duration(end - start).count(); 49 | 50 | printResults(queries, results, N_PRINT); 51 | std::cout << "Elapsed time in milliseconds : " << duration << "ms\n\n"; 52 | } 53 | /* Writes the first n points to stdout as "[x, y, z] " and finishes with a newline. */ 54 | void print(int3 *points, int n) 55 | { 56 | for (int i = 0; i < n; i++) 57 | { 58 | std::cout << "[" << points[i].x << ", " << points[i].y << ", " << points[i].z << "] "; 59 | } 60 | std::cout << std::endl; 61 | } 62 | /* Fills points[0..n) with rand()-generated coordinates in the range [1, RANGE_MAX]. Positional aggregate initialization is used because designated initializers are not part of C++11, the standard run.sh compiles with; a braced list still evaluates x, y, z left to right. */ 63 | void generatePoints(int3 *points, int n) 64 | { 65 | for (int i = 0; i < n; i++) 66 | { 67 | points[i] = {rand() % RANGE_MAX+1, rand() % RANGE_MAX+1, rand() % RANGE_MAX+1}; 68 | } 69 | } 70 | /* Returns p2 when its Manhattan distance to p is strictly smaller than p3's; otherwise (including ties) returns p3. */ 71 | int3 closer(int3 p, int3 p2, int3 p3) 72 | { 73 | if ((abs(p.x - p2.x) + abs(p.y - p2.y) + abs(p.z - p2.z)) < (abs(p.x - p3.x) + abs(p.y - p3.y) + abs(p.z - p3.z))) 74 | { 75 | return p2; 76 | } 77 | return p3; 78 | } 79 | /* Recursively builds the subtree rooted at tree[node] from points[start, end): sorts the range on the axis selected by depth % 3 (x, y, z), stores the middle element in tree[node], then places the elements before it under child node * 2 and the elements after it under child node * 2 + 1. */ 80 | void buildSubTree(int3 *points, int3 *tree, int start, int end, int depth, int node) 81 | { 82 | if (start >= end) 83 | { 84 | return; 85 | } 86 | 87 | std::sort(points + start, points + end, [depth](int3 p1, int3 p2) -> bool { 88 | if (depth % 3 == 0) 89 | return p1.x < p2.x; 90 | if (depth % 3 == 1) 91 | return p1.y < p2.y; 92 | return p1.z < p2.z; 93 | }); 94 | 95 | int split = (start + end - 1) / 2; 96 | 97 | tree[node] = points[split]; 98 | 99 | buildSubTree(points, tree, start, split, depth + 1, node * 2); 100 | buildSubTree(points, tree, split + 1, end, depth + 1, node * 2 + 1); 101 | } 102 | /* Initializes all treeSize slots to the "empty" sentinel and then builds the tree from the n input points (root at index 1). The sentinel is (-INF, -INF, -INF): findNearestNeighbor tests every coordinate against -INF, and the CUDA version uses the same value, so x must be -INF as well rather than +INF. */ 103 | void buildKDTree(int3 *points, int3 *tree, int n, int treeSize) 104 | { 105 | for (int i = 0; i < treeSize; i++) 106 | { 107 | tree[i] = {-INF, -INF, -INF}; 108 | } 109 | 110 | 
buildSubTree(points, tree, 0, n, 0, 1); 111 | } 112 | /* Greedy k-d tree descent for one query. Starting at tree[treeNode] (callers pass node 1, depth 0), it follows a single path towards the side of each splitting plane that contains the query and returns the visited node that closer() ranks nearest (Manhattan distance). The opposite subtree is never examined, so the result is not guaranteed to be the exact nearest neighbor. */ 113 | int3 findNearestNeighbor(int3 *tree, int treeSize, int treeNode, int depth, int3 query) 114 | { 115 | int3 node = tree[treeNode]; 116 | /* val1: the current node's coordinate on the splitting axis (depth % 3 -> x, y, z); val2: the query's coordinate on that axis. */ 117 | int val1, val2; 118 | if (depth % 3 == 0) 119 | { 120 | val1 = node.x; 121 | val2 = query.x; 122 | } 123 | else if (depth % 3 == 1) 124 | { 125 | val1 = node.y; 126 | val2 = query.y; 127 | } 128 | else 129 | { 130 | val1 = node.z; 131 | val2 = query.z; 132 | } 133 | /* buildSubTree stores the points that sort before the median under child 2 * node and the points after it under child 2 * node + 1. A query whose coordinate is smaller than the node's therefore belongs to the left child and a larger one to the right child; the original comparisons were inverted and walked away from the query. */ 134 | if ((val2 < val1) && (treeNode * 2 < treeSize)) 135 | { 136 | int3 leftChild = tree[treeNode * 2]; /* a child counts as present only when none of its coordinates is the -INF sentinel */ 137 | if (leftChild.x != -INF && leftChild.y != -INF && leftChild.z != -INF) 138 | { 139 | return closer(query, node, findNearestNeighbor(tree, treeSize, treeNode * 2, depth + 1, query)); 140 | } 141 | } 142 | else if ((val2 > val1) && (treeNode * 2 + 1 < treeSize)) 143 | { 144 | int3 rightChild = tree[treeNode * 2 + 1]; 145 | if (rightChild.x != -INF && rightChild.y != -INF && rightChild.z != -INF) 146 | { 147 | return closer(query, node, findNearestNeighbor(tree, treeSize, treeNode * 2 + 1, depth + 1, query)); 148 | } 149 | } 150 | /* Equal coordinates, a child index outside the array, or an empty child: the descent stops at this node. */ return node; 151 | } 152 | 153 | void printResults(int3 *queries, int3 *results, int n) { 154 | for(int i = 0; i < n; i++) { 155 | std::cout<<"query: ["< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define eChk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 8 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { 9 | if (code != cudaSuccess) { 10 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 11 | if (abort) exit(code); 12 | } 13 | } 14 | 15 | const int N_POINTS = 1e4, N_QUERIES = 1e6, INF = 1e9, RANGE_MAX = 100, N_PRINT = 10; 16 | 17 | __host__ void print(int3 *points, int n); 18 | __host__ void generatePoints(int3 *points, int n); 19 | __host__ void buildKDTree(int3 *points, int3 *tree, int n, int m); 20 | __global__ void nearestNeighborGPU(int3 *tree, int treeSize, int3 *queries, int3 
*results, int nQueries); 21 | __host__ void printResults(int3 *queries, int3 *results, int start, int end); 22 | /* Host driver: allocates managed buffers, generates the points and queries, builds the tree on the host, runs and times the query kernel, prints the last N_PRINT results and the elapsed time, then frees the buffers. */ 23 | int main() { 24 | srand(16); 25 | 26 | int TREE_SIZE = 1; 27 | while(TREE_SIZE < N_POINTS) TREE_SIZE <<= 1; 28 | 29 | int3 *points; 30 | int3 *tree; 31 | int3 *queries; 32 | 33 | eChk(cudaMallocManaged(&points, N_POINTS * sizeof(int3))); 34 | eChk(cudaMallocManaged(&tree, TREE_SIZE * sizeof(int3))); 35 | eChk(cudaMallocManaged(&queries, N_QUERIES * sizeof(int3))); 36 | 37 | generatePoints(points, N_POINTS); 38 | buildKDTree(points, tree, N_POINTS, TREE_SIZE); 39 | generatePoints(queries, N_QUERIES); 40 | 41 | auto start = std::chrono::system_clock::now(); 42 | 43 | int3 *results; 44 | eChk(cudaMallocManaged(&results, N_QUERIES * sizeof(int3))); 45 | /* One thread per query: derive the block count from N_QUERIES (rounded up) instead of hard-coding it, so every query stays covered if the constant changes. The kernel itself guards indices >= nQueries. */ const int threadsPerBlock = 32; const int numBlocks = (N_QUERIES + threadsPerBlock - 1) / threadsPerBlock; 46 | nearestNeighborGPU<<<numBlocks, threadsPerBlock>>>(tree, TREE_SIZE, queries, results, N_QUERIES); /* a kernel launch returns no status; launch errors have to be fetched with cudaGetLastError() */ eChk(cudaGetLastError()); 47 | eChk(cudaDeviceSynchronize()); 48 | 49 | auto end = std::chrono::system_clock::now(); 50 | float duration = 1000.0 * std::chrono::duration(end - start).count(); 51 | /* printResults prints the half-open range [start, end); starting at N_QUERIES-N_PRINT yields exactly N_PRINT entries (the previous start of N_QUERIES-N_PRINT-1 printed one too many). */ 52 | printResults(queries, results, N_QUERIES-N_PRINT, N_QUERIES); 53 | 54 | std::cout << "Elapsed time in milliseconds : " << duration << "ms\n\n"; 55 | 56 | eChk(cudaFree(results)); 57 | eChk(cudaFree(points)); 58 | eChk(cudaFree(tree)); 59 | eChk(cudaFree(queries)); 60 | } 61 | /* Fills points[0..n) with rand()-generated coordinates in the range [1, RANGE_MAX]. */ 62 | __host__ void generatePoints(int3 *points, int n) { 63 | for(int i = 0; i < n; i++) { 64 | points[i] = make_int3(rand()%RANGE_MAX+1, rand()%RANGE_MAX+1, rand()%RANGE_MAX+1); 65 | } 66 | } 67 | /* Recursively builds the subtree rooted at tree[node] from points[start, end): sorts the range on the axis selected by depth % 3, stores the middle element in tree[node], and recurses into child node*2 with the elements before it and child node*2 + 1 with the elements after it. */ 68 | __host__ void buildSubTree(int3 *points, int3 *tree, int start, int end, int depth, int node) { 69 | if(start >= end) return; 70 | 71 | std::sort(points + start, points + end, [depth](int3 p1, int3 p2) -> bool { 72 | if(depth % 3 == 0) return p1.x < p2.x; 73 | if(depth % 3 == 1) return p1.y < p2.y; 74 | return p1.z < p2.z; 75 | }); 76 | 77 | int split = (start + end - 1)/2; 78 | tree[node] = points[split]; 79 | 80 | buildSubTree(points, tree, start, split, depth+1, node*2); 
81 | buildSubTree(points, tree, split + 1, end, depth+1, node*2 + 1); 82 | } 83 | /* Marks every one of the treeSize slots as empty with the (-INF, -INF, -INF) sentinel, then builds the tree from the n points starting at root index 1, depth 0. */ 84 | __host__ void buildKDTree(int3 *points, int3 *tree, int n, int treeSize) { 85 | for(int i = 0; i < treeSize; i++) { 86 | tree[i] = make_int3(-INF, -INF, -INF); 87 | } 88 | 89 | buildSubTree(points, tree, 0, n, 0, 1); 90 | } 91 | 92 | void print(int3 *points, int n) { 93 | for(int i = 0; i < n; i++) { 94 | std::cout<<"["< val2) && (treeNode * 2 + 1 < treeSize)) 138 | { 139 | int3 rightChild = tree[treeNode * 2 + 1]; 140 | if (rightChild.x != -INF && rightChild.y != -INF && rightChild.z != -INF) 141 | { 142 | return getCloser(query, node, findNearestNeighbor(tree, treeSize, treeNode * 2 + 1, depth + 1, query)); 143 | } 144 | } 145 | return node; 146 | } 147 | /* Kernel: each thread derives one global index from its block and thread ids and, when that index is below nQueries, stores in results[index] the value findNearestNeighbor returns for queries[index], starting at tree node 1 and depth 0. Threads whose index is nQueries or larger do nothing. */ 148 | __global__ void nearestNeighborGPU(int3 *tree, int treeSize, int3 *queries, int3 *results, int nQueries) { 149 | int index = blockIdx.x * blockDim.x + threadIdx.x; 150 | 151 | if(index < nQueries) { 152 | results[index] = findNearestNeighbor(tree, treeSize, 1, 0, queries[index]); 153 | } 154 | } 155 | 156 | __host__ void printResults(int3 *queries, int3 *results, int start, int end) { 157 | for(int i = start; i < end; i++) { 158 | std::cout<<"query: ["<