├── .gitignore ├── DeviceIndex.cu ├── README.md ├── floatKNearestNeighbors.cu ├── kNearestNeighbors.cu ├── kNearestNeighbors.h ├── radixSelect.cu ├── radixSelect.h ├── server.cu ├── testDeviceIndex.cu ├── testFloatKNN.cu ├── testKNN.cu ├── testRadixSelect.cu └── testScripts ├── insert.py └── query.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.out
2 | .vscode/
3 | *.txt
4 | *.index
--------------------------------------------------------------------------------
/DeviceIndex.cu:
--------------------------------------------------------------------------------
1 | #include <assert.h>
2 | #include <bitset>
3 | #include <cstdlib>
4 | #include <cstring>
5 | #include <fstream>
6 | #include <iostream>
7 | #include <unordered_map>
8 |
9 | #include <boost/uuid/uuid.hpp>
10 | #include <boost/uuid/uuid_generators.hpp>
11 | #include <boost/uuid/uuid_io.hpp>
12 | #include <thrust/sequence.h>
13 |
14 | #include "kNearestNeighbors.h"
15 |
16 | // Requires keys to be sequential, representing array indexes
17 | __global__ void retrieveVectorsFromKeys(uint64_cu *vectors, unsigned *keys,
18 | int numKeys, uint64_cu *retrieved) {
19 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
20 | int stride = blockDim.x * gridDim.x;
21 |
22 | for (int i = idx; i < numKeys; i += stride)
23 | retrieved[i] = vectors[keys[i]];
24 | }
25 |
26 | void printBits(uint64_cu &x) {
27 | std::bitset<64> b(x);
28 | std::cout << b << std::endl;
29 | }
30 |
31 | using boost::uuids::random_generator;
32 | using boost::uuids::to_string;
33 | using boost::uuids::uuid;
34 |
35 | class DeviceIndex {
36 | private:
37 | // Pointers to device memory
38 | unsigned *workingMem1;
39 | unsigned *workingMem2;
40 | unsigned *workingMem3;
41 | uint64_cu *vectors;
42 | uint64_cu *deviceQueryVector;
43 | unsigned *deviceKeys; // sequential keys, e.g., a range.
44 |
45 | // Use an in-memory hash map to keep track of deviceKey-to-vectorKey mappings.
46 | // In practice, this means the id map must fit in host memory. An alternate
47 | // implementation could retrieve ids from disk instead, which would be much
48 | // slower for large k when querying.
49 | std::unordered_map<unsigned, uuid> idMap;
50 |
51 | public:
52 | int numVectors = 0;
53 | const char *name;
54 |
55 | // Capacity must be passed as a maximum vector count; this enables
56 | // insertion and querying without allocating memory on every call.
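// For rough sizing (an illustration, not part of the original code): each
// capacity slot costs 3 * 4 bytes of working memory, 4 bytes of key, and
// 8 bytes of vector, i.e., 24 bytes, so the 950M-vector capacity used in
// server.cu needs roughly 23 GB of device memory -- nearly all of an
// A10G's 24 GB, hence the name `A10GCapacity` there.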
57 | DeviceIndex(const char *nameParam, int capacity) { 58 | name = nameParam; 59 | 60 | // Allocate deviceKeys and initialize (initialization requires memory) 61 | cudaMalloc(&deviceKeys, capacity * sizeof(unsigned)); 62 | thrust::sequence(thrust::device, deviceKeys, deviceKeys + capacity); 63 | 64 | // Allocate rest of on-device memory 65 | cudaMalloc(&workingMem1, capacity * sizeof(unsigned)); 66 | cudaMalloc(&workingMem2, capacity * sizeof(unsigned)); 67 | cudaMalloc(&workingMem3, capacity * sizeof(unsigned)); 68 | cudaMalloc(&vectors, capacity * sizeof(uint64_cu)); 69 | cudaMalloc(&deviceQueryVector, sizeof(uint64_cu)); 70 | 71 | // Read vectors from file to device and idMap using a buffer 72 | int bufferSize = 4 << 20; 73 | uint64_cu *buffer = (uint64_cu *)malloc(bufferSize * sizeof(uint64_cu)); 74 | int bufferCount = 0; 75 | auto flushBuffer = [&]() { 76 | cudaMemcpy(vectors + numVectors, buffer, bufferCount * sizeof(uint64_cu), 77 | cudaMemcpyHostToDevice); 78 | numVectors += bufferCount; 79 | bufferCount = 0; 80 | }; 81 | 82 | std::ifstream f(name); 83 | int lineSize = sizeof(uuid) + sizeof(uint64_cu); 84 | assert(lineSize == 24); 85 | char *lineBuf = (char *)malloc(lineSize); 86 | 87 | int lineCount = 0; 88 | while (f.read(lineBuf, lineSize)) { 89 | lineCount++; 90 | // TODO: implement upsert and not just insert here. Have defined behavior 91 | // if id already exists 92 | 93 | // Get id and record in idMap 94 | uuid id; 95 | memcpy(&id, lineBuf, sizeof(uuid)); 96 | idMap[numVectors + bufferCount] = id; 97 | 98 | // Copy vector to buffer 99 | memcpy(buffer + bufferCount, lineBuf + sizeof(uuid), sizeof(uint64_cu)); 100 | bufferCount++; 101 | 102 | // Flush buffer to device if full 103 | if (bufferCount == bufferSize) 104 | flushBuffer(); 105 | } 106 | flushBuffer(); 107 | 108 | free(buffer); 109 | } 110 | 111 | ~DeviceIndex() { 112 | cudaFree(workingMem1); 113 | cudaFree(workingMem2); 114 | cudaFree(workingMem3); 115 | cudaFree(vectors); 116 | cudaFree(deviceQueryVector); 117 | cudaFree(deviceKeys); 118 | } 119 | 120 | /* 121 | Inserts keys. 
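Each call appends numToAdd fixed-width 24-byte records to the index file,
each record being a 16-byte UUID followed by an 8-byte vector (see the
lineSize assertion in the constructor), and then mirrors the vectors into
device memory.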
Behaviour is undefined if ids already exist.
122 | */
123 | void insert(int numToAdd, uuid ids[], uint64_cu vectorsToAdd[]) {
124 | // write ids and vectors to disk
125 | std::ofstream f;
126 | f.open(name, std::ios_base::app);
127 | int lineSize = sizeof(uuid) + sizeof(uint64_cu);
128 |
129 | char *buffer = (char *)malloc(numToAdd * lineSize);
130 | for (int i = 0; i < numToAdd; ++i) {
131 | memcpy(buffer + i * lineSize, &ids[i], sizeof(uuid));
132 | memcpy(buffer + i * lineSize + sizeof(uuid), &vectorsToAdd[i],
133 | sizeof(uint64_cu));
134 | }
135 | f.write(buffer, numToAdd * lineSize);
136 | f.close();
137 | free(buffer);
138 |
139 | // insert ids into keymap
140 | for (int i = 0; i < numToAdd; ++i) {
141 | // Store id in memory
142 | idMap[numVectors + i] = ids[i];
143 | }
144 |
145 | // copy vectors to device
146 | cudaMemcpy(vectors + numVectors, vectorsToAdd, numToAdd * sizeof(uint64_cu),
147 | cudaMemcpyHostToDevice);
148 |
149 | // update numVectors
150 | numVectors += numToAdd;
151 | }
152 |
153 | /*
154 | Queries the index for the k nearest vectors to queryVector, writing the
distances, vectors, and ids into the caller-provided host arrays.
155 | */
156 | void query(uint64_cu &queryVector, int k, float kNearestDistances[],
157 | uint64_cu kNearestVectors[], uuid kNearestIds[]) {
158 | float *deviceKNearestDistances;
159 | unsigned *deviceKNearestKeys;
160 | uint64_cu *deviceKNearestVectors;
161 | cudaMalloc(&deviceKNearestDistances, k * sizeof(float));
162 | cudaMallocManaged(&deviceKNearestKeys, k * sizeof(unsigned));
163 | cudaMalloc(&deviceKNearestVectors, k * sizeof(uint64_cu));
164 |
165 | // copy query vector to device
166 | cudaMemcpy(deviceQueryVector, &queryVector, sizeof(uint64_cu),
167 | cudaMemcpyHostToDevice);
168 |
169 | kNearestNeighbors(vectors, deviceKeys, deviceQueryVector, numVectors, k,
170 | deviceKNearestDistances, deviceKNearestKeys, workingMem1,
171 | workingMem2, workingMem3);
172 |
173 | // retrieve vectors from relevant keys
174 | retrieveVectorsFromKeys<<<1, 1024>>>(vectors, deviceKNearestKeys, k,
175 | deviceKNearestVectors);
176 | cudaDeviceSynchronize();
177 |
178 | // copy solution from device to host memory specified by caller
179 | cudaMemcpy(kNearestDistances, deviceKNearestDistances, k * sizeof(float),
180 | cudaMemcpyDeviceToHost);
181 | cudaMemcpy(kNearestVectors, deviceKNearestVectors, k * sizeof(uint64_cu),
182 | cudaMemcpyDeviceToHost);
183 | for (int i = 0; i < k; ++i)
184 | kNearestIds[i] = idMap[deviceKNearestKeys[i]];
185 |
186 | cudaFree(deviceKNearestDistances);
187 | cudaFree(deviceKNearestKeys);
188 | cudaFree(deviceKNearestVectors);
189 | }
190 | };
191 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Binary GPU Vector Index
2 |
3 | A proof-of-concept vector index that supports insertion and k-nearest-neighbors querying. The index is implemented as DeviceIndex in `DeviceIndex.cu` and can be used as a library within a C++ program. A simple web server that allows insertions and queries over a network is implemented in `server.cu`.
4 |
5 | While this implementation only supports 64-bit binary vectors, it can be extended to support any vector type, including vectors of non-binary elements such as floating point values. Such an extension would only change the way in which distances are computed between vectors, not the way in which those distances are ranked (e.g., radix select).
6 |
7 | Vectors must be inserted along with a UUID. This index can be extended to support other types of vector keys, including arbitrarily-sized strings.
UUIDs were chosen because they are widely used and because, being of constant length, they are technically easy to store.
8 |
9 | ## Dependencies
10 |
11 | - Cuda Toolkit 11.0+
12 | - Crow 1.0+
13 |
14 | ## Usage
15 |
16 | The server is compiled and run in the following manner, in a shell:
17 |
18 | nvcc server.cu
19 | ./a.out [index filename]
20 |
21 | [index filename] must be replaced by the name of the file where the index is
22 | to be stored.
23 |
24 | The server will then print the port on which it is running, which should be opened appropriately if hosting on a cloud service (e.g., AWS EC2).
25 |
26 | The server has two endpoints, explained below:
27 |
28 | - POST /insert
29 | - POST /query
30 |
31 | ## Inserting
32 |
33 | Requests to /insert must contain, in the body, the vectors to be inserted, along with their ids, in JSON format. The format is the following:
34 |
35 | ```json
36 | {
37 |   "vectors": [
38 |     {
39 |       "id": "[valid UUID]",
40 |       "values": "[valid binary string of length 64]"
41 |     }
42 |   ]
43 | }
44 | ```
45 |
46 | The following is an example of a valid request to /insert:
47 |
48 | ```json
49 | {
50 |   "vectors": [
51 |     {
52 |       "id": "4d1027ec-80b7-4df3-b950-ae824fadbd61",
53 |       "values": "1000001011111100100011010001100010011000110010011110110111110110"
54 |     },
55 |     {
56 |       "id": "e78241cc-5bc6-4532-8b7c-76809c2704bd",
57 |       "values": "110010110000101101011000101101110101000110110010001100001110010"
58 |     },
59 |     {
60 |       "id": "87e298cc-8e46-4a6a-922c-127026f99dea",
61 |       "values": "100010101010101100000011101101000000011011010100000000001110001"
62 |     }
63 |   ]
64 | }
65 | ```
66 |
67 | If successful, the response returns the number of vectors inserted. The response to the example request above would be
68 |
69 | ```json
70 | {
71 |   "insertedCount": "3"
72 | }
73 | ```
74 |
75 | ## Querying
76 |
77 | Requests to /query must contain the vector to be queried and the number topK of vectors to be retrieved. The format is the following:
78 |
79 | ```json
80 | {
81 |   "topK": "[valid integer]",
82 |   "vector": "[valid binary string of length 64]"
83 | }
84 | ```
85 |
86 | The topK amount must be less than or equal to the number of vectors in the index for the query to succeed.
87 |
88 | The following is an example of a valid /query POST body:
89 |
90 | ```json
91 | {
92 |   "topK": "1000",
93 |   "vector": "1100111111101100111100110010111011000101000001011101010010010100"
94 | }
95 | ```
96 |
97 | The response is a list of the retrieved vectors (matches) along with their ids and their corresponding cosine distances.
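Concretely, for 64-bit binary vectors the cosine distance is computed from bit population counts. The following host-side sketch mirrors the device computation in `kNearestNeighbors.cu`, substituting GCC/Clang's `__builtin_popcountll` for CUDA's `__popcll`:

```cpp
#include <cmath>
#include <cstdint>

// Cosine distance between two 64-bit binary vectors, as on the device:
// 1 - (a . b) / (|a| * |b|), where each dot product is a popcount.
float cosineDistance(uint64_t a, uint64_t b) {
  float a_dot_b = (float)__builtin_popcountll(a & b); // overlapping 1-bits
  float a_dot_a = (float)__builtin_popcountll(a);     // squared norm of a
  float b_dot_b = (float)__builtin_popcountll(b);     // squared norm of b
  return 1.0f - a_dot_b / (std::sqrt(a_dot_a) * std::sqrt(b_dot_b));
}
```

Because only bit-count ratios matter, distinct vectors can tie exactly, as in the example response below where two different vectors both score 0.125980.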
The format is the following:
98 |
99 | ```json
100 | {
101 |   "matches": [
102 |     {
103 |       "values": "[binary string representing vector]",
104 |       "distance": "[floating point value]",
105 |       "id": "[vector UUID]"
106 |     }
107 |   ]
108 | }
109 | ```
110 |
111 | The following is an example response to a query with topK equal to 3:
112 |
113 | ```json
114 | {
115 |   "matches": [
116 |     {
117 |       "values": "1100111111101100111100110010111011000101000001011101010010010100",
118 |       "distance": "0.125980",
119 |       "id": "8ea44221-707e-4b26-815a-90bb60339401"
120 |     },
121 |     {
122 |       "id": "770d9f87-7a81-484d-95f9-5ca3321a6028",
123 |       "distance": "0.125980",
124 |       "values": "1100110101101010111100110010111011000111000101011111110011010100"
125 |     },
126 |     {
127 |       "values": "1100111111001100111000010010111111000001000001111110010000010100",
128 |       "distance": "0.137542",
129 |       "id": "2c7a0a4f-de42-482a-9bee-a9f4c3c3102b"
130 |     }
131 |   ]
132 | }
133 | ```
134 |
135 | ## Benchmark
136 |
137 | The results of a simple benchmark on an index containing half a billion vectors are
138 | shown below.
139 |
140 | Opening index...
141 | Done. Execution time: 195572 ms.
142 | Server is running on port 80.
143 |
144 | Opening the index containing 500 million vectors took a little over 3 minutes. This is a one-time cost for the lifetime of the server process, but it must be paid again every time the server is killed and restarted.
145 |
146 | A benchmark of 1000 queries shows an average latency of 193 milliseconds:
147 |
148 | Making 1000 queries with topK=1000...
149 | Total time: 193152 ms.
150 | Per query average: 193 ms.
151 |
152 | As for inserts, the following is a benchmark of inserts of 1000 vectors at a time:
153 |
154 | Making 1000 inserts with 1000 vectors per insert...
155 | Total time: 114483 ms.
156 | Per insert average: 114 ms.
157 |
158 |
159 | ## Future directions
160 |
161 | - Implementation of insert as upsert, i.e., defined behavior for insertion of keys that already exist. In the current implementation, duplicate ids are allowed.
162 |
--------------------------------------------------------------------------------
/floatKNearestNeighbors.cu:
--------------------------------------------------------------------------------
1 | #include <climits>
2 | #include <cstdio>
3 | #include <cstdlib>
4 | #include <functional>
5 |
6 | #include <thrust/execution_policy.h>
7 | #include <thrust/functional.h>
8 | #include <thrust/iterator/counting_iterator.h>
9 | #include <thrust/iterator/discard_iterator.h>
10 | #include <thrust/reduce.h>
11 | #include <thrust/transform.h>
12 |
13 | #include "radixSelect.h"
14 |
15 | using namespace thrust::placeholders;
16 |
17 | /*
18 | Multiplies a consecutive array of vectors by a single vector. All vectors
19 | must have size D. This method is used as an alternative to complicated
20 | usage of thrust::transform with strided iterators, etc.
21 |
22 | V is an array of N vectors represented as N * D floats (T type)
23 | Q is a single vector represented as D floats (T type)
24 | */
25 | template <typename T>
26 | __global__ void multiplyManyBySingle(T *V, T *Q, size_t N, size_t D,
27 | T *results) {
28 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
29 | int stride = blockDim.x * gridDim.x;
30 |
31 | for (int i = idx; i < N; i += stride)
32 | results[i] = V[i] * Q[i % D];
33 | }
34 |
35 | /*
36 | Conversions between floats and unsigned integers for ranking with
37 | radix select.
38 | */
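// Because the distances lie in [0, 1], scaling by UINT_MAX and truncating is
// a monotonic map: d1 <= d2 implies (unsigned)(d1 * UINT_MAX) <=
// (unsigned)(d2 * UINT_MAX), so radix-selecting the unsigned images ranks the
// original floats (truncation can introduce ties but never reorders values).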
39 | __global__ void floatToUnsigned(float *fValues, unsigned *uintValues,
40 | int numValues) {
41 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
42 | int stride = blockDim.x * gridDim.x;
43 |
44 | for (int i = idx; i < numValues; i += stride)
45 | uintValues[i] = (unsigned)(fValues[i] * UINT_MAX);
46 | }
47 | __global__ void unsignedToFloat(unsigned *uintValues, float *fValues,
48 | int numValues) {
49 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
50 | int stride = blockDim.x * gridDim.x;
51 |
52 | for (int i = idx; i < numValues; i += stride)
53 | fValues[i] = (float)uintValues[i] / (float)UINT_MAX;
54 | }
55 |
56 | // unary operator used in thrust::transform in norms function
57 | struct squareRoot : std::unary_function<float, float> {
58 | __device__ float operator()(float x) const { return sqrtf(x); }
59 | };
60 |
61 | /*
62 | Computes norms of sequentially stored vectors of equal length.
63 |
64 | wMem must have size N * D * sizeof(T)
65 | results must have size N * sizeof(float)
66 | */
67 | template <typename T>
68 | void norms(T *V, size_t N, size_t D, T *wMem, float *results) {
69 | // square elements
70 | thrust::transform(thrust::device, V, V + N * D, wMem,
71 | thrust::square<T>());
72 | // sum the squared elements, one sum per vector
73 | thrust::reduce_by_key(thrust::device,
74 | thrust::make_transform_iterator(
75 | thrust::counting_iterator<int>(0), _1 / D),
76 | thrust::make_transform_iterator(
77 | thrust::counting_iterator<int>(N * D), _1 / D),
78 | wMem, thrust::make_discard_iterator(), results);
79 | // square root results
80 | thrust::transform(thrust::device, results, results + N, results,
81 | squareRoot());
82 | }
83 |
84 | // unary operator functor used in thrust::transform
85 | struct divisionFunctor {
86 | float divisor;
87 |
88 | divisionFunctor(float _divisor) : divisor(_divisor){};
89 |
90 | __device__ float operator()(float &x) const { return x / divisor; }
91 | };
92 |
93 | /*
94 | Compute cosine distances.
95 |
96 | wMem must have size N * D * sizeof(T)
97 | distances must have size N * sizeof(float)
98 | */
99 | template <typename T>
100 | void cosineDistances(T *vectors, size_t D, T *query, int N, T *wMem,
101 | float *distances) {
102 | int numElts = N * D;
103 | int blockSize = 1024;
104 | int numBlocks = (numElts + blockSize - 1) / blockSize;
105 |
106 | float *vectorNorms;
107 | cudaMalloc(&vectorNorms, N * sizeof(float));
108 |
109 | float *queryNorm;
110 | cudaMallocManaged(&queryNorm, sizeof(float));
111 |
112 | // compute vector norms
113 | // IMPORTANT NOTE: This computation can and should be cached. Vector
114 | // norms need not be computed every time a query is made, only once after
115 | // the vector(s) are inserted.
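// A minimal sketch of that caching (hypothetical cachedVectorNorms buffer,
// not present in this code): call norms(vectors, N, D, wMem,
// cachedVectorNorms) once after each insert and pass the cached buffer here
// instead of recomputing vectorNorms on every query.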
116 | norms(vectors, N, D, wMem, vectorNorms);
117 |
118 | // compute query norm
119 | norms(query, 1, D, wMem, queryNorm);
120 |
121 | // compute inner product of vectors by the query into distances
122 | multiplyManyBySingle<<<numBlocks, blockSize>>>(vectors, query, N, D, wMem);
123 | thrust::reduce_by_key(thrust::device,
124 | thrust::make_transform_iterator(
125 | thrust::counting_iterator<int>(0), _1 / D),
126 | thrust::make_transform_iterator(
127 | thrust::counting_iterator<int>(N * D), _1 / D),
128 | wMem, thrust::make_discard_iterator(), distances);
129 |
130 | // divide results by vector norms
131 | thrust::transform(thrust::device, distances, distances + N, vectorNorms,
132 | distances, thrust::divides<float>());
133 |
134 | // divide results by query norm
135 | thrust::transform(thrust::device, distances, distances + N, distances,
136 | divisionFunctor(*queryNorm));
137 |
138 | cudaFree(vectorNorms);
139 | cudaFree(queryNorm);
140 | }
141 |
142 | /*
143 | Select the K nearest neighbors of a query vector from a set of vectors of
144 | equal length.
145 |
146 | NOTE: Only full-precision floats are currently supported. This may be extended
147 | but will involve working closely with the capabilities of the thrust library
148 | and writing custom code for its shortcomings regarding half-precision floats.
149 | */
150 | template <typename T>
151 | void kNearestNeighbors(T *vectors, size_t D, unsigned *keys, T *query, int N,
152 | int K, float *kNearestDistances, unsigned *kNearestKeys,
153 | unsigned *workingMem1, unsigned *workingMem2,
154 | unsigned *workingMem3) {
155 | int numElts = N * D;
156 | int blockSize = 1024;
157 | int numBlocks = (numElts + blockSize - 1) / blockSize;
158 |
159 | // use working memory to compute distances
160 | float *distances = (float *)workingMem1;
161 | unsigned *uintDistances = workingMem2;
162 |
163 | // collect the best distances in their unsigned integer versions
164 | unsigned *uintKNearestDistances;
165 | cudaMalloc(&uintKNearestDistances, K * sizeof(unsigned));
166 |
167 | // allocate huge chunk of working memory on device
168 | // NOTE: this can and should be passed as pre-allocated memory such that it
169 | // is not allocated every time it is needed.
170 | T *wMem;
171 | cudaMalloc(&wMem, N * D * sizeof(T));
172 |
173 | // compute distances
174 | cosineDistances(vectors, D, query, N, wMem, distances);
175 |
176 | // convert distances to unsigned integers
177 | floatToUnsigned<<<numBlocks, blockSize>>>(distances, uintDistances, N);
178 | cudaDeviceSynchronize();
179 |
180 | // select smallest K distances
181 | radixSelect(uintDistances, keys, N, K, uintKNearestDistances, kNearestKeys,
182 | workingMem1, workingMem3);
183 |
184 | // convert unsigned integer distances back to floating point distances
185 | unsignedToFloat<<<1, blockSize>>>(uintKNearestDistances, kNearestDistances,
186 | K);
187 | cudaDeviceSynchronize();
188 |
189 | cudaFree(uintKNearestDistances);
190 | cudaFree(wMem);
191 | }
192 |
--------------------------------------------------------------------------------
/kNearestNeighbors.cu:
--------------------------------------------------------------------------------
1 | #include <climits>
2 | #include <cmath>
3 | #include <cstdio>
4 | #include <cstdlib>
5 |
6 | #include "radixSelect.h"
7 |
8 | typedef unsigned long long int uint64_cu;
9 |
10 | /*
11 | We could shave ~20 ms off each call by performing the float-to-unsigned
12 | conversion directly in this function. We don't, in order to show that the
13 | algorithm is completely agnostic to the distance metric used, whether
14 | it returns a floating point value or an integer.
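For example, plain Hamming distance (__popcll(*a ^ *b)) already yields an
unsigned integer and could be fed to radixSelect directly, skipping the
conversion kernels entirely.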
15 | */ 16 | __device__ void cosineDistance(uint64_cu *a, uint64_cu *b, float *dest) { 17 | // __popcll computes the Hamming Weight of an integer (e.g., number of bits 18 | // that are 1) 19 | float a_dot_b = (float)__popcll(*a & *b); 20 | float a_dot_a = (float)__popcll(*a); 21 | float b_dot_b = (float)__popcll(*b); 22 | 23 | *dest = (1 - (a_dot_b / (sqrt(a_dot_a) * sqrt(b_dot_b)))); 24 | } 25 | 26 | __global__ void computeDistances(int numIndexes, uint64_cu *query, 27 | uint64_cu *indexes, float *distances) { 28 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 29 | int stride = blockDim.x * gridDim.x; 30 | 31 | for (int i = idx; i < numIndexes; i += stride) 32 | cosineDistance(query, &indexes[i], &distances[i]); 33 | } 34 | 35 | __global__ void floatToUnsigned(float *fValues, unsigned *uintValues, 36 | int numValues) { 37 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 38 | int stride = blockDim.x * gridDim.x; 39 | 40 | for (int i = idx; i < numValues; i += stride) 41 | uintValues[i] = (unsigned)(fValues[i] * UINT_MAX); 42 | } 43 | 44 | __global__ void unsignedToFloat(unsigned *uintValues, float *fValues, 45 | int numValues) { 46 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 47 | int stride = blockDim.x * gridDim.x; 48 | 49 | for (int i = idx; i < numValues; i += stride) 50 | fValues[i] = (float)uintValues[i] / (float)UINT_MAX; 51 | } 52 | 53 | void kNearestNeighbors(uint64_cu *vectors, unsigned *keys, uint64_cu *query, 54 | int numVectors, int k, float *kNearestDistances, 55 | unsigned *kNearestKeys, unsigned *workingMem1, 56 | unsigned *workingMem2, unsigned *workingMem3) { 57 | int blockSize = 1024; 58 | int numBlocks = (numVectors + blockSize - 1) / blockSize; 59 | 60 | // use working memory to compute distances 61 | float *distances = (float *)workingMem1; 62 | unsigned *uintDistances = workingMem2; 63 | 64 | // collect the best distances in their unsigned integer versions 65 | unsigned *uintKNearestDistances; 66 | cudaMalloc(&uintKNearestDistances, k * sizeof(unsigned)); 67 | 68 | // compute distances 69 | computeDistances<<>>(numVectors, query, vectors, 70 | distances); 71 | cudaDeviceSynchronize(); 72 | 73 | // convert distances to unsigned integers 74 | floatToUnsigned<<>>(distances, uintDistances, 75 | numVectors); 76 | cudaDeviceSynchronize(); 77 | 78 | // select smallest `k` distances 79 | radixSelect(uintDistances, keys, numVectors, k, uintKNearestDistances, 80 | kNearestKeys, workingMem1, workingMem3); 81 | 82 | // convert unsigned integer distances back to floating point distances 83 | unsignedToFloat<<<1, blockSize>>>(uintKNearestDistances, kNearestDistances, 84 | k); 85 | cudaDeviceSynchronize(); 86 | 87 | cudaFree(uintKNearestDistances); 88 | } -------------------------------------------------------------------------------- /kNearestNeighbors.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | Used to represent 64-bit binary vector. 5 | */ 6 | typedef unsigned long long int uint64_cu; 7 | 8 | /* 9 | keys MUST be a sequence of integers representing array indexes!! 10 | e.g., [0, 1, 2, ..., numVectors]. 11 | 12 | All memory that is passed must be on device. 
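A typical call sequence (a sketch, mirroring testKNN.cu): allocate vectors
and query on device, initialize keys with thrust::sequence, allocate three
unsigned working buffers of numVectors elements each, and then call
kNearestNeighbors(vectors, keys, query, numVectors, k, kNearestDistances,
kNearestKeys, workingMem1, workingMem2, workingMem3).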
13 | */
14 | void kNearestNeighbors(uint64_cu *vectors, unsigned *keys, uint64_cu *query,
15 | int numVectors, int k, float *kNearestDistances,
16 | unsigned *kNearestKeys, unsigned *workingMem1,
17 | unsigned *workingMem2, unsigned *workingMem3);
18 |
19 | #include "kNearestNeighbors.cu"
--------------------------------------------------------------------------------
/radixSelect.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cstdlib>
3 | #include <cstring>
4 |
5 | #include <thrust/binary_search.h>
6 | #include <thrust/copy.h>
7 | #include <thrust/execution_policy.h>
8 | #include <thrust/fill.h>
9 | #include <thrust/scan.h>
10 |
11 | /*
12 | E.g., if value is 10000000 10111111 00000001 11110111 then
13 | positionBits(value, 1) == 10000000, positionBits(value, 2) == 10111111,
14 | positionBits(value, 3) == 00000001, and positionBits(value, 4) == 11110111.
15 | */
16 | __device__ unsigned positionBits(unsigned value, int position) {
17 | return (value >> ((sizeof(unsigned) - position) * 8)) & 0xff;
18 | }
19 |
20 | /*
21 | Collect a 256-bin histogram of the byte at `position`.
22 | */
23 | __global__ void collectHistogram(int numValues, unsigned *values,
24 | unsigned *histogram, int position) {
25 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
26 | int stride = blockDim.x * gridDim.x;
27 | int id = threadIdx.x;
28 |
29 | // first collect in per-block shared memory
30 | __shared__ unsigned sharedHistogram[256];
31 |
32 | if (id < 256) {
33 | sharedHistogram[id] = 0; // need to zero out first
34 | }
35 |
36 | __syncthreads();
37 |
38 | for (int i = idx; i < numValues; i += stride) {
39 | unsigned bin = positionBits(values[i], position);
40 | atomicAdd(&sharedHistogram[bin], 1);
41 | }
42 |
43 | __syncthreads();
44 |
45 | // now, use 256 threads per block to add shared histogram to global histogram
46 | if (id < 256) {
47 | atomicAdd(&histogram[id], sharedHistogram[id]);
48 | }
49 | }
50 |
51 | // predicates for `thrust::copy_if`
52 | struct belongsToPivotBin {
53 | int position;
54 | unsigned pivot;
55 |
56 | belongsToPivotBin(int position, unsigned pivot)
57 | : position(position), pivot(pivot) {}
58 |
59 | __device__ bool operator()(unsigned value) {
60 | return positionBits(value, position) == pivot;
61 | }
62 | };
63 | struct valueBelowThreshold {
64 | unsigned *values;
65 | unsigned threshold;
66 |
67 | valueBelowThreshold(unsigned *values, unsigned threshold)
68 | : values(values), threshold(threshold) {}
69 |
70 | __device__ bool operator()(unsigned value) { return value < threshold; }
71 | };
72 | struct valueEqualToThreshold {
73 | unsigned *values;
74 | unsigned threshold;
75 |
76 | valueEqualToThreshold(unsigned *values, unsigned threshold)
77 | : values(values), threshold(threshold) {}
78 |
79 | __device__ bool operator()(unsigned value) { return value == threshold; }
80 | };
81 |
82 | void radixSelect(unsigned *values, unsigned *keys, int numValues, int k,
83 | unsigned *kSmallestValues, unsigned *kSmallestKeys,
84 | unsigned *workingMem1, unsigned *workingMem2) {
85 | // allocate histogram, prefix sum, and temporary arrays
86 | unsigned *histogram, *prefixSums;
87 |
88 | cudaMalloc(&histogram, 256 * sizeof(unsigned));
89 | cudaMalloc(&prefixSums, 256 * sizeof(unsigned));
90 |
91 | // allocate a host variable that is used to alter `k` after each iteration
92 | unsigned *toSubtract;
93 | toSubtract = (unsigned *)malloc(sizeof(unsigned));
94 |
95 | // declare values that are altered over the iterations
96 | unsigned kthSmallestValue = 0;
97 | int currNumValues = numValues;
98 | int currK = k;
99 | unsigned *currValues = values;
100 | unsigned *tempValues = workingMem1;
101
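// Worked example: with k = 3 and values whose current-position bytes are
// [7, 7, 2, 7, 9], the histogram has counts {2: 1, 7: 3, 9: 1} and prefix
// sums {2: 1, ..., 7: 4, 9: 5}. The first prefix sum >= k belongs to bin 7,
// so the kth smallest value has byte 7 at this position; the three bin-7
// values are kept, currK becomes 3 - 1 = 2, and the search recurses on the
// next byte.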
| 102 | // iterate over four 8-bit chunks in a 32-bit integer to find kth smallest 103 | // value 104 | for (int position = 1; position <= 4; ++position) { 105 | int blockSize = 1024; 106 | int numBlocks = (currNumValues + blockSize - 1) / blockSize; 107 | 108 | // Collect histogram 109 | cudaMemset(histogram, 0, 256 * sizeof(unsigned)); 110 | 111 | collectHistogram<<>>(currNumValues, currValues, 112 | histogram, position); 113 | cudaDeviceSynchronize(); 114 | 115 | // compute prefix sums 116 | cudaMemset(prefixSums, 0, 256 * sizeof(unsigned)); 117 | thrust::inclusive_scan(thrust::device, histogram, histogram + 256, 118 | prefixSums); 119 | // find pivot bin 120 | unsigned *pivotPtr = thrust::lower_bound(thrust::device, prefixSums, 121 | prefixSums + 256, currK); 122 | unsigned pivot = (unsigned)(pivotPtr - prefixSums); 123 | 124 | // record pivot bits in their corresponding position in `kthSmallestValue` 125 | kthSmallestValue = 126 | kthSmallestValue | (pivot << ((sizeof(unsigned) - position) * 8)); 127 | 128 | if (position <= 3) { 129 | // copy integers from their corresponding pivot from `currValues` into 130 | // `temp` and record the count 131 | unsigned *copy_ifResult = thrust::copy_if( 132 | thrust::device, currValues, currValues + currNumValues, tempValues, 133 | belongsToPivotBin(position, pivot)); 134 | unsigned binCount = copy_ifResult - tempValues; 135 | 136 | // in next iteration make `currNumValues` the number of elements in the 137 | // pivot bin and subtract from `currK` the number of elements in lesser 138 | // bins. 139 | currNumValues = binCount; 140 | if (pivot > 0) { 141 | cudaMemcpy(toSubtract, &prefixSums[pivot - 1], sizeof(unsigned), 142 | cudaMemcpyDeviceToHost); 143 | currK -= *toSubtract; 144 | } 145 | 146 | // update `currValues` pointer and cycle between temporary arrays 147 | if (currValues == values || currValues == workingMem2) { 148 | currValues = workingMem1; 149 | tempValues = workingMem2; 150 | } else if (currValues == workingMem1) { 151 | currValues = workingMem2; 152 | tempValues = workingMem1; 153 | } 154 | } 155 | } 156 | 157 | // reuse `workingMem1` to copy keys whose values are strictly less than 158 | // `kthSmallestValue` 159 | unsigned *copy_ifResult = thrust::copy_if( 160 | thrust::device, keys, keys + numValues, values, workingMem1, 161 | valueBelowThreshold(values, kthSmallestValue)); 162 | unsigned countLessThan = copy_ifResult - workingMem1; 163 | 164 | // copy keys whose values are equal to `kthSmallestValue` into the remaining 165 | // space in `workingMem1`. 
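// Together with the strictly-smaller keys, this accounts for at least k
// keys; only the first k are copied into the output below.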
166 | thrust::copy_if(thrust::device, keys, keys + numValues, values, 167 | workingMem1 + countLessThan, 168 | valueEqualToThreshold(values, kthSmallestValue)); 169 | 170 | // reuse `workingMem2` to copy all values strictly less than 171 | // `kthSmallestValue` 172 | thrust::copy_if(thrust::device, values, values + numValues, workingMem2, 173 | valueBelowThreshold(values, kthSmallestValue)); 174 | 175 | // append onto values just copied into `workingMem1` values equal to 176 | // `kthSmallestValue` such that we have accounted for `k` total values 177 | thrust::fill(thrust::device, workingMem2 + countLessThan, workingMem2 + k, 178 | kthSmallestValue); 179 | 180 | cudaMemcpy(kSmallestKeys, workingMem1, k * sizeof(unsigned), 181 | cudaMemcpyDeviceToDevice); 182 | cudaMemcpy(kSmallestValues, workingMem2, k * sizeof(unsigned), 183 | cudaMemcpyDeviceToDevice); 184 | 185 | cudaFree(histogram); 186 | cudaFree(prefixSums); 187 | 188 | free(toSubtract); 189 | } 190 | -------------------------------------------------------------------------------- /radixSelect.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | Copy the smallest k values and their corresponding keys into kSmallestValues 5 | and kSmallestKeys. Two pointers to allocated memory must be passed as 6 | working memory of type unsigned integer and of size 7 | (numValues * sizeof(unsigned)). 8 | 9 | All memory that is passed must be located on device. 10 | 11 | Note: It is possible to make this method retrieve the keys' corresponding 12 | vectors too. A new argument would be introduced with a name like 13 | "passengerValues" of any type. This would be possible by the addition of two 14 | more calls to thrust::copy_if. It is also worth mentioning here that if 15 | the keys are made to be sequential and representative of indexes in an array, 16 | retrieval of vectors can be made easily much quicker. 17 | */ 18 | void radixSelect(unsigned *values, unsigned *keys, int numValues, int k, 19 | unsigned *kSmallestValues, unsigned *kSmallestKeys, 20 | unsigned *workingMem1, unsigned *workingMem2); 21 | 22 | #include "radixSelect.cu" -------------------------------------------------------------------------------- /server.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "DeviceIndex.cu" 15 | #include "crow.h" 16 | 17 | using namespace boost::uuids; 18 | 19 | using std::chrono::duration; 20 | using std::chrono::duration_cast; 21 | using std::chrono::high_resolution_clock; 22 | using std::chrono::milliseconds; 23 | 24 | std::string vectorToString(uint64_cu vector) { 25 | return std::bitset<64>(vector).to_string(); 26 | } 27 | 28 | int main(int argc, char** argv) { 29 | int A10GCapacity = 950000000; 30 | int port = 80; 31 | 32 | assert(argc == 2); 33 | const char *indexName = argv[1]; 34 | 35 | crow::SimpleApp app; 36 | app.loglevel(crow::LogLevel::Warning); 37 | 38 | // Open index 39 | printf("Opening index...\n"); 40 | auto t1 = high_resolution_clock::now(); 41 | DeviceIndex *index = new DeviceIndex(indexName, A10GCapacity); 42 | auto t2 = high_resolution_clock::now(); 43 | auto ms_int = duration_cast(t2 - t1); 44 | printf("Done. 
Execution time: %ld ms.\n", ms_int.count()); 45 | 46 | // Insert route 47 | CROW_ROUTE(app, "/insert") 48 | .methods("POST"_method)([&](const crow::request &req) { 49 | auto jsonBody = crow::json::load(req.body); 50 | 51 | if (!jsonBody["vectors"]) 52 | return crow::response(crow::status::BAD_REQUEST); 53 | 54 | int numToInsert = jsonBody["vectors"].size(); 55 | 56 | uuid *ids = (uuid *)malloc(numToInsert * sizeof(uuid)); 57 | uint64_cu *vectors = 58 | (uint64_cu *)malloc(numToInsert * sizeof(uint64_cu)); 59 | 60 | // Retrieve ids and vectors 61 | for (int i = 0; i < numToInsert; ++i) { 62 | std::string idString = jsonBody["vectors"][i]["id"].s(); 63 | std::string vectorString = jsonBody["vectors"][i]["values"].s(); 64 | 65 | uuid id = boost::lexical_cast(idString); 66 | uint64_cu vector = strtoull(vectorString.c_str(), NULL, 2); 67 | 68 | ids[i] = id; 69 | vectors[i] = vector; 70 | } 71 | 72 | // insert ids and vectors into index 73 | index->insert(numToInsert, ids, vectors); 74 | 75 | free(ids); 76 | free(vectors); 77 | 78 | crow::json::wvalue response( 79 | {{"insertedCount", std::to_string(numToInsert)}}); 80 | 81 | return crow::response{response}; 82 | }); 83 | 84 | // Query route 85 | CROW_ROUTE(app, "/query") 86 | .methods("POST"_method)([&](const crow::request &req) { 87 | auto jsonBody = crow::json::load(req.body); 88 | 89 | if (!jsonBody["topK"] || !jsonBody["vector"]) 90 | return crow::response(crow::status::BAD_REQUEST); 91 | 92 | // Retrieve topK and vector 93 | int topK = jsonBody["topK"].i(); 94 | std::string vectorString = jsonBody["vector"].s(); 95 | uint64_cu vector = strtoull(vectorString.c_str(), NULL, 2); 96 | 97 | // Make sure topK is within bounds 98 | if (topK > index->numVectors) 99 | return crow::response(crow::status::BAD_REQUEST); 100 | 101 | // Set up memory to collect query results 102 | uint64_cu *kNearestVectors = 103 | (uint64_cu *)malloc(topK * sizeof(uint64_cu)); 104 | float *kNearestDistances = (float *)malloc(topK * sizeof(float)); 105 | uuid *kNearestIds = (uuid *)malloc(topK * sizeof(uuid)); 106 | 107 | // Query index 108 | index->query(vector, topK, kNearestDistances, kNearestVectors, 109 | kNearestIds); 110 | 111 | crow::json::wvalue response({}); 112 | 113 | for (int i = 0; i < topK; ++i) { 114 | response["matches"][i]["id"] = to_string(kNearestIds[i]); 115 | response["matches"][i]["distance"] = 116 | std::to_string(kNearestDistances[i]); 117 | response["matches"][i]["values"] = vectorToString(kNearestVectors[i]); 118 | } 119 | 120 | free(kNearestVectors); 121 | free(kNearestDistances); 122 | free(kNearestIds); 123 | 124 | return crow::response{response}; 125 | }); 126 | 127 | // Run server 128 | printf("Server is running on port %d.\n", port); 129 | app.port(port).run(); 130 | 131 | // Close index 132 | delete index; 133 | } 134 | -------------------------------------------------------------------------------- /testDeviceIndex.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "DeviceIndex.cu" 5 | 6 | // murmur64 hash function 7 | uint64_cu hash(uint64_cu h) { 8 | h ^= h >> 33; 9 | h *= 0xff51afd7ed558ccdL; 10 | h ^= h >> 33; 11 | h *= 0xc4ceb9fe1a85ec53L; 12 | h ^= h >> 33; 13 | return h; 14 | } 15 | 16 | /* 17 | This test opens an index, inserts vectors into it, makes a query, and closes it. 18 | It then re-opens the same index, and makes the same query as before, printing 19 | the output to insure that the results are the same. 
20 | 21 | Different steps in this test can be commented out to accomplish different 22 | things, such as generating a random index 23 | */ 24 | 25 | int main(void) { 26 | using std::chrono::duration; 27 | using std::chrono::duration_cast; 28 | using std::chrono::high_resolution_clock; 29 | using std::chrono::milliseconds; 30 | 31 | using boost::uuids::random_generator; 32 | using boost::uuids::to_string; 33 | using boost::uuids::uuid; 34 | 35 | const char *vdbName = "big.index"; 36 | int vdbCapacity = 950000000; 37 | int numToInsert = 500000000; 38 | 39 | // open vector db 40 | printf("Opening...\n"); 41 | auto t1 = high_resolution_clock::now(); 42 | DeviceIndex *vdb = new DeviceIndex(vdbName, vdbCapacity); 43 | auto t2 = high_resolution_clock::now(); 44 | auto ms_int = duration_cast(t2 - t1); 45 | printf("Done. Execution time: %ldms.\n", ms_int.count()); 46 | 47 | int batchSize = 4 << 20; 48 | int numBatches = (numToInsert + batchSize - 1) / batchSize; 49 | 50 | // use heap since these arrays are huge 51 | uuid *ids = (uuid *)malloc(batchSize * sizeof(uuid)); 52 | uint64_cu *vectorsToAdd = (uint64_cu *)malloc(batchSize * sizeof(uint64_cu)); 53 | 54 | for (int batch = 0; batch < numBatches; ++batch) { 55 | // Adjust batch size if last batch 56 | if (batch == numBatches - 1) 57 | batchSize = numToInsert % batchSize; 58 | 59 | printf("Generating and inserting (%d\\%d)...\n", batch, numBatches); 60 | t1 = high_resolution_clock::now(); 61 | 62 | for (int i = 0; i < batchSize; ++i) { 63 | ids[i] = random_generator()(); 64 | vectorsToAdd[i] = hash((batch + 1) * (i + 1)); 65 | } 66 | 67 | vdb->insert(batchSize, ids, vectorsToAdd); 68 | 69 | t2 = high_resolution_clock::now(); 70 | ms_int = duration_cast(t2 - t1); 71 | printf("Done. Execution time: %ldms.\n", ms_int.count()); 72 | } 73 | 74 | free(ids); 75 | free(vectorsToAdd); 76 | 77 | // query 78 | const int k = 2; 79 | uint64_cu queryVector = hash(~1); 80 | uint64_cu kNearestVectors[k]; 81 | float kNearestDistances[k]; 82 | uuid kNearestIds[k]; 83 | 84 | printf("Querying...\n"); 85 | t1 = high_resolution_clock::now(); 86 | vdb->query(queryVector, k, kNearestDistances, kNearestVectors, kNearestIds); 87 | t2 = high_resolution_clock::now(); 88 | ms_int = duration_cast(t2 - t1); 89 | printf("Done. Execution time: %ldms.\n", ms_int.count()); 90 | 91 | // // print results 92 | // printf("Query: "); 93 | // printBits(queryVector); 94 | // for (int i = 0; i < k; ++i) { 95 | // printf("%d: %8.8f %s ", i, kNearestDistances[i], 96 | // to_string(kNearestIds[i]).c_str()); 97 | // printBits(kNearestVectors[i]); 98 | // } 99 | 100 | // // close db 101 | // delete vdb; 102 | 103 | // // reopen 104 | // printf("Reopening...\n"); 105 | // t1 = high_resolution_clock::now(); 106 | // DeviceIndex *vdb2 = new DeviceIndex(vdbName, vdbCapacity); 107 | // t2 = high_resolution_clock::now(); 108 | // ms_int = duration_cast(t2 - t1); 109 | // printf("Done. Execution time: %ldms.\n", ms_int.count()); 110 | 111 | // // query again 112 | // printf("Querying again...\n"); 113 | // t1 = high_resolution_clock::now(); 114 | // vdb2->query(queryVector, k, kNearestDistances, kNearestVectors, kNearestIds); 115 | // t2 = high_resolution_clock::now(); 116 | // ms_int = duration_cast(t2 - t1); 117 | // printf("Done. 
Execution time: %ldms.\n", ms_int.count()); 118 | 119 | // // print results 120 | // printf("Query: "); 121 | // printBits(queryVector); 122 | // for (int i = 0; i < k; ++i) { 123 | // printf("%d: %8.8f %s ", i, kNearestDistances[i], 124 | // to_string(kNearestIds[i]).c_str()); 125 | // printBits(kNearestVectors[i]); 126 | // } 127 | 128 | // // close second db 129 | // delete vdb2; 130 | 131 | // // delete file 132 | // std::remove(vdbName); 133 | 134 | return 0; 135 | } -------------------------------------------------------------------------------- /testFloatKNN.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "kNearestNeighbors.h" 15 | 16 | // murmur32 hash function 17 | __device__ unsigned hash(unsigned a) { 18 | a = (a ^ 61) ^ (a >> 16); 19 | a = a + (a << 3); 20 | a = a ^ (a >> 4); 21 | a = a * 0x27d4eb2d; 22 | a = a ^ (a >> 15); 23 | return a; 24 | } 25 | 26 | template __global__ void rand(T *vectors, int n, size_t D) { 27 | int index = blockIdx.x * blockDim.x + threadIdx.x; 28 | int stride = blockDim.x * gridDim.x; 29 | 30 | for (int i = index; i < n; i += stride) { 31 | for (int d = 0; d < D; ++d) { 32 | vectors[i * D + d] = (float)hash(i * D + d) / (float)UINT_MAX; 33 | } 34 | } 35 | } 36 | 37 | __host__ void printBits(uint64_cu *x) { 38 | std::bitset b(*x); 39 | std::cout << b << std::endl; 40 | } 41 | 42 | template 43 | void retrieveVectorsFromKeys(T *vectors, size_t D, unsigned *keys, int numKeys, 44 | T *retrieved) { 45 | for (int i = 0; i < numKeys; ++i) 46 | cudaMemcpy(retrieved + i * D, vectors + keys[i] * D, D * sizeof(T), 47 | cudaMemcpyDeviceToHost); 48 | } 49 | 50 | int main(void) { 51 | typedef float T; 52 | 53 | int numIndexes = 10000000; 54 | const size_t D = 2; 55 | int K = 5; 56 | 57 | int blockSize = 1024; 58 | int numBlocks = (numIndexes + blockSize - 1) / blockSize; 59 | 60 | // allocate K nearest distances, keys, and vectors 61 | float *kNearestDistances; 62 | unsigned *kNearestKeys; 63 | T *kNearestVectors; 64 | cudaMallocManaged(&kNearestDistances, K * sizeof(unsigned)); 65 | cudaMallocManaged(&kNearestKeys, K * sizeof(unsigned)); 66 | cudaMallocManaged(&kNearestVectors, K * D * sizeof(T)); 67 | 68 | // allocate space on device for query and vectors 69 | T *query; 70 | T *vectors; 71 | cudaMallocManaged(&query, D * sizeof(T)); 72 | cudaMalloc(&vectors, numIndexes * D * sizeof(T)); 73 | 74 | // allocate and initalize keys on device 75 | unsigned *keys; 76 | cudaMalloc(&keys, numIndexes * sizeof(unsigned)); 77 | thrust::sequence(thrust::device, keys, keys + numIndexes); 78 | 79 | // allocate working memory on device 80 | unsigned *workingMem1, *workingMem2, *workingMem3; 81 | cudaMalloc(&workingMem1, numIndexes * sizeof(unsigned)); 82 | cudaMalloc(&workingMem2, numIndexes * sizeof(unsigned)); 83 | cudaMalloc(&workingMem3, numIndexes * sizeof(unsigned)); 84 | 85 | // generate random vectors on device 86 | rand<<>>(vectors, numIndexes, D); 87 | cudaDeviceSynchronize(); 88 | 89 | // generate random query on device and transfer to host 90 | rand<<<1, 1>>>(query, 1, D); 91 | cudaDeviceSynchronize(); 92 | 93 | // run and time kNearestNeighbors call 94 | float time; 95 | cudaEvent_t start, stop; 96 | 97 | cudaEventCreate(&start); 98 | cudaEventCreate(&stop); 99 | cudaEventRecord(start, 0); 100 | 101 | kNearestNeighbors(vectors, D, keys, query, numIndexes, K, 102 | 
kNearestDistances, kNearestKeys, workingMem1, 103 | workingMem2, workingMem3); 104 | 105 | cudaEventRecord(stop, 0); 106 | cudaEventSynchronize(stop); 107 | cudaEventElapsedTime(&time, start, stop); 108 | 109 | printf("Execution time: %.3f ms \n\n", time); 110 | 111 | // retrieve vectors from relevant keys 112 | retrieveVectorsFromKeys(vectors, D, kNearestKeys, K, kNearestVectors); 113 | 114 | // print results 115 | printf("Query:\n"); 116 | for (int d = 0; d < D; ++d) { 117 | printf("%f ", query[d]); 118 | } 119 | printf("\n\n"); 120 | 121 | for (int i = 0; i < K; ++i) { 122 | printf("%d: %f \n", i, kNearestDistances[i]); 123 | for (int d = 0; d < D; ++d) { 124 | printf("%f ", kNearestVectors[i * D + d]); 125 | } 126 | printf("\n\n"); 127 | } 128 | 129 | // free device memory 130 | cudaFree(query); 131 | cudaFree(vectors); 132 | cudaFree(keys); 133 | cudaFree(kNearestDistances); 134 | cudaFree(kNearestKeys); 135 | cudaFree(kNearestVectors); 136 | cudaFree(workingMem1); 137 | cudaFree(workingMem2); 138 | cudaFree(workingMem3); 139 | 140 | return 0; 141 | } 142 | -------------------------------------------------------------------------------- /testKNN.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "kNearestNeighbors.h" 15 | 16 | // murmur64 hash function 17 | __device__ uint64_cu hash(uint64_cu h) { 18 | h ^= h >> 33; 19 | h *= 0xff51afd7ed558ccdL; 20 | h ^= h >> 33; 21 | h *= 0xc4ceb9fe1a85ec53L; 22 | h ^= h >> 33; 23 | return h; 24 | } 25 | 26 | __global__ void randf(uint64_cu *p, int n) { 27 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 28 | while (idx < n) { 29 | // hash address 30 | p[idx] = hash((uint64_cu)~idx); 31 | idx += blockDim.x * gridDim.x; 32 | } 33 | } 34 | 35 | __host__ void printBits(uint64_cu *x) { 36 | std::bitset b(*x); 37 | std::cout << b << std::endl; 38 | } 39 | 40 | __global__ void retrieveVectorsFromKeys(uint64_cu *vectors, unsigned *keys, 41 | int numKeys, uint64_cu *retrieved) { 42 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 43 | int stride = blockDim.x * gridDim.x; 44 | 45 | for (int i = idx; i < numKeys; i += stride) 46 | retrieved[i] = vectors[keys[i]]; 47 | } 48 | 49 | int main(void) { 50 | int numIndexes = 970000000; 51 | int k = 10; 52 | 53 | int blockSize = 1024; 54 | int numBlocks = (numIndexes + blockSize - 1) / blockSize; 55 | 56 | // allocate space on host for query, and k nearest indexes 57 | uint64_cu *hostQuery; 58 | hostQuery = (uint64_cu *)malloc(sizeof(uint64_cu)); 59 | 60 | // allocate k nearest distances, keys, and indexes on device 61 | float *kNearestDistances; 62 | unsigned *kNearestKeys; 63 | uint64_cu *kNearestIndexes; 64 | cudaMalloc(&kNearestDistances, k * sizeof(unsigned)); 65 | cudaMalloc(&kNearestKeys, k * sizeof(unsigned)); 66 | cudaMalloc(&kNearestIndexes, k * sizeof(uint64_cu)); 67 | 68 | // allocate host versions of kNearestDistances, kNearestKeys, and 69 | // kNearestIndexes 70 | float *hostKNearestDistances; 71 | unsigned *hostKNearestKeys; 72 | uint64_cu *hostKNearestIndexes; 73 | hostKNearestDistances = (float *)malloc(k * sizeof(float)); 74 | hostKNearestKeys = (unsigned *)malloc(k * sizeof(unsigned)); 75 | hostKNearestIndexes = (uint64_cu *)malloc(k * sizeof(uint64_cu)); 76 | 77 | // allocate space on device for query and indexes 78 | uint64_cu *query, *indexes; 79 | cudaMalloc(&query, sizeof(uint64_cu)); 80 | 
cudaMalloc(&indexes, numIndexes * sizeof(uint64_cu));
81 |
82 | // allocate and initialize keys on device
83 | // TODO: it would make the code cleaner, but would add ~6ms on ~1B vectors,
84 | // to move initialization of keys into kNearestNeighbors (thrust::sequence)
85 | // such that the function does not use "keys" terminology and instead
86 | // is defined as returning the array indexes of the closest k vectors.
87 | unsigned *keys;
88 | cudaMalloc(&keys, numIndexes * sizeof(unsigned));
89 | thrust::sequence(thrust::device, keys, keys + numIndexes);
90 |
91 | // allocate working memory on device
92 | unsigned *workingMem1, *workingMem2, *workingMem3;
93 | cudaMalloc(&workingMem1, numIndexes * sizeof(unsigned));
94 | cudaMalloc(&workingMem2, numIndexes * sizeof(unsigned));
95 | cudaMalloc(&workingMem3, numIndexes * sizeof(unsigned));
96 |
97 | // generate random indexes on device
98 | randf<<<numBlocks, blockSize>>>(indexes, numIndexes);
99 | cudaDeviceSynchronize();
100 |
101 | // generate random query on device and transfer to host
102 | randf<<<1, 1>>>(query, 1);
103 | cudaDeviceSynchronize();
104 | cudaMemcpy(hostQuery, query, sizeof(uint64_cu), cudaMemcpyDeviceToHost);
105 |
106 | // run and time kNearestNeighbors call
107 | float time;
108 | cudaEvent_t start, stop;
109 |
110 | cudaEventCreate(&start);
111 | cudaEventCreate(&stop);
112 | cudaEventRecord(start, 0);
113 |
114 | kNearestNeighbors(indexes, keys, query, numIndexes, k, kNearestDistances,
115 | kNearestKeys, workingMem1, workingMem2, workingMem3);
116 |
117 | cudaEventRecord(stop, 0);
118 | cudaEventSynchronize(stop);
119 | cudaEventElapsedTime(&time, start, stop);
120 |
121 | printf("Execution time: %.3f ms \n", time);
122 |
123 | // retrieve vectors from relevant keys
124 | retrieveVectorsFromKeys<<<1, blockSize>>>(indexes, kNearestKeys, k,
125 | kNearestIndexes);
126 | cudaDeviceSynchronize();
127 |
128 | // copy results from device to host
129 | cudaMemcpy(hostKNearestDistances, kNearestDistances, k * sizeof(float),
130 | cudaMemcpyDeviceToHost);
131 | cudaMemcpy(hostKNearestKeys, kNearestKeys, k * sizeof(unsigned),
132 | cudaMemcpyDeviceToHost);
133 | cudaMemcpy(hostKNearestIndexes, kNearestIndexes, k * sizeof(uint64_cu),
134 | cudaMemcpyDeviceToHost);
135 |
136 | // print results
137 | printf("Query: ");
138 | printBits(hostQuery);
139 | for (int i = 0; i < k; ++i) {
140 | printf("%d: %f ", i, hostKNearestDistances[i]);
141 | printBits(&hostKNearestIndexes[i]);
142 | }
143 |
144 | // free device memory
145 | cudaFree(query);
146 | cudaFree(indexes);
147 | cudaFree(keys);
148 | cudaFree(kNearestDistances);
149 | cudaFree(kNearestKeys);
150 | cudaFree(kNearestIndexes);
151 | cudaFree(workingMem1);
152 | cudaFree(workingMem2);
153 | cudaFree(workingMem3);
154 |
155 | // free host memory
156 | free(hostQuery);
157 | free(hostKNearestDistances);
158 | free(hostKNearestKeys);
159 | free(hostKNearestIndexes);
160 |
161 | return 0;
162 | }
--------------------------------------------------------------------------------
/testRadixSelect.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cstdlib>
3 | #include <thrust/sequence.h>
4 |
5 | #include "radixSelect.h"
6 |
7 | __device__ unsigned hash(unsigned a) {
8 | a = (a ^ 61) ^ (a >> 16);
9 | a = a + (a << 3);
10 | a = a ^ (a >> 4);
11 | a = a * 0x27d4eb2d;
12 | a = a ^ (a >> 15);
13 | return a;
14 | }
15 |
16 | __global__ void rand(int n, unsigned *xs) {
17 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
18 | int stride = blockDim.x * gridDim.x;
19 |
20 | for (int i = idx; i < n; i += stride)
21 | xs[i] = hash(~i);
22 | }
23 |
24 | int main() {
25 | int n = 1 << 20;
26 | int k = 10;
27 |
28 | int blockSize = 512;
29 | int numBlocks = (n + blockSize - 1) / blockSize;
30 |
31 | // generate random numbers
32 | unsigned *xs;
33 | cudaMalloc(&xs, n * sizeof(unsigned));
34 | rand<<<numBlocks, blockSize>>>(n, xs);
35 | cudaDeviceSynchronize();
36 |
37 | // allocate and initialize keys on device
38 | unsigned *keys;
39 | cudaMalloc(&keys, n * sizeof(unsigned));
40 | thrust::sequence(thrust::device, keys, keys + n);
41 |
42 | // allocate kSmallestKeys and kSmallestValues on device
43 | unsigned *kSmallestKeys;
44 | unsigned *kSmallestValues;
45 | cudaMalloc(&kSmallestKeys, k * sizeof(unsigned));
46 | cudaMalloc(&kSmallestValues, k * sizeof(unsigned));
47 |
48 | unsigned *tempValues1, *tempValues2;
49 | cudaMalloc(&tempValues1, n * sizeof(unsigned));
50 | cudaMalloc(&tempValues2, n * sizeof(unsigned));
51 |
52 | // allocate hostKSmallestKeys and hostKSmallestValues on host
53 | unsigned *hostKSmallestKeys;
54 | unsigned *hostKSmallestValues;
55 | hostKSmallestKeys = (unsigned *)malloc(k * sizeof(unsigned));
56 | hostKSmallestValues = (unsigned *)malloc(k * sizeof(unsigned));
57 |
58 | // run radix select
59 | float time;
60 | cudaEvent_t start, stop;
61 |
62 | cudaEventCreate(&start);
63 | cudaEventCreate(&stop);
64 | cudaEventRecord(start, 0);
65 |
66 | radixSelect(xs, keys, n, k, kSmallestValues, kSmallestKeys, tempValues1, tempValues2);
67 |
68 | cudaEventRecord(stop, 0);
69 | cudaEventSynchronize(stop);
70 | cudaEventElapsedTime(&time, start, stop);
71 |
72 | printf("Execution time: %.3f ms \n", time);
73 |
74 | // copy solution from device to host
75 | cudaMemcpy(hostKSmallestKeys, kSmallestKeys, k * sizeof(unsigned), cudaMemcpyDeviceToHost);
76 | cudaMemcpy(hostKSmallestValues, kSmallestValues, k * sizeof(unsigned), cudaMemcpyDeviceToHost);
77 |
78 | for (int i = 0; i < k; ++i) {
79 | printf("kSmallestKeys: %d: %u\n", i, hostKSmallestKeys[i]);
80 | }
81 | for (int i = 0; i < k; ++i) {
82 | printf("kSmallestValues: %d: %u\n", i, hostKSmallestValues[i]);
83 | }
84 |
85 | cudaFree(xs);
86 | cudaFree(keys);
87 | cudaFree(tempValues1);
88 | cudaFree(tempValues2);
89 | cudaFree(kSmallestKeys);
90 | cudaFree(kSmallestValues);
91 |
92 | free(hostKSmallestKeys);
93 | free(hostKSmallestValues);
94 |
95 | return 0;
96 | }
--------------------------------------------------------------------------------
/testScripts/insert.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | import random
3 | import time
4 | import requests
5 |
6 | serverURL: str = "http://ec2-54-221-48-223.compute-1.amazonaws.com"
7 | numRequests: int = 1000
8 | numToInsertPerRequest: int = 1000
9 |
10 | def generateInsertBody(numToInsert: int) -> dict:
11 |     bodyDict = {}
12 |     bodyDict["vectors"] = []
13 |
14 |     for i in range(numToInsert):
15 |         id = str(uuid.uuid4())
16 |         vector = bin(random.randint(0, 2**64 - 1))[2:].zfill(64)
17 |         vectorsDict = {}
18 |         vectorsDict["id"] = id
19 |         vectorsDict["values"] = vector
20 |         bodyDict["vectors"].append(vectorsDict)
21 |
22 |     return bodyDict
23 |
24 | # import json
25 | # print(json.dumps(generateInsertBody(3), indent=4))
26 |
27 | print(f"Making {numRequests} inserts with {numToInsertPerRequest} vectors per insert...")
28 |
29 | total: float = 0.0
30 | for _ in range(numRequests):
31 |     requestBody: dict = generateInsertBody(numToInsert=numToInsertPerRequest)
32 |
33 |     start = time.time()
34 |     r: requests.Response = requests.post(
35 |         serverURL + "/insert", json=requestBody
36 |     )
37 |     end = time.time()
38 |     total += end - start
39 |
40 |     assert(r.status_code == 200)
41 |
42 | print("Total time: {:.0f} ms.".format(1000 * total))
43 | print("Per insert average: {:.0f} ms.".format(1000 * total / numRequests))
44 |
--------------------------------------------------------------------------------
/testScripts/query.py:
--------------------------------------------------------------------------------
1 | import random
2 | import time
3 | import requests
4 |
5 | serverURL: str = "http://ec2-54-221-48-223.compute-1.amazonaws.com"
6 | numRequests: int = 1000
7 | topKPerRequest: int = 1000
8 |
9 | def generateQueryBody(topK: int) -> dict:
10 |     bodyDict = {}
11 |
12 |     bodyDict["topK"] = topK
13 |     bodyDict["vector"] = bin(random.randint(0, 2**64 - 1))[2:].zfill(64)
14 |
15 |     return bodyDict
16 |
17 | # import json
18 | # print(json.dumps(generateQueryBody(topK=topKPerRequest), indent=4))
19 |
20 | print(f"Making {numRequests} queries with topK={topKPerRequest}...")
21 |
22 | total: float = 0.0
23 | for _ in range(numRequests):
24 |     requestBody = generateQueryBody(topK=topKPerRequest)
25 |
26 |     start = time.time()
27 |     r: requests.Response = requests.post(
28 |         serverURL + "/query", json=requestBody
29 |     )
30 |     end = time.time()
31 |     total += end - start
32 |
33 |     assert(r.status_code == 200)
34 |
35 | print("Total time: {:.0f} ms.".format(1000 * total))
36 | print("Per query average: {:.0f} ms.".format(1000 * total / numRequests))
37 |
38 |
--------------------------------------------------------------------------------