├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── dciknn_cuda ├── __init__.py └── core.py ├── example.py ├── include ├── dci.h └── util.h ├── setup.py └── src ├── dci_cuda.cpp ├── dci_cuda_kernel.cu └── util_kernel.cu /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dciknn_cuda.egg-info/ 3 | dciknn_cuda/__pycache__/ 4 | dist/ 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. 
For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. 
Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. 
Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. 
* 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. 
Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 374 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft include/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DCI CUDA 2 | 3 | This is the CUDA GPU implementation + Python interface (using PyTorch) of Dynamic Continuous Indexing (DCI). The paper can be found [here](https://arxiv.org/abs/1512.00442). 4 | 5 | ## Prerequisites 6 | * NVCC version >= 9.2 (Note: this should match the CUDA version that PyTorch is built with) 7 | * PyTorch >= 1.4.0 8 | 9 | ## Setup 10 | 11 | The library can be compiled and installed using Python setuptools. 12 | 13 | **Note:** If your Python interpreter is named differently, e.g. `python3`, replace all occurrences of `python` with `python3` in the commands below. 14 | 15 | If your Python installation is local (e.g. part of Anaconda), run the following command from the root directory of the code base to compile and install as a Python package: 16 | ```bash 17 | python setup.py install 18 | ``` 19 | 20 | Otherwise, if you have sudo access, run the following command instead: 21 | ```bash 22 | sudo python setup.py install 23 | ``` 24 | 25 | If you do not have sudo access, run the following command instead: 26 | ```bash 27 | python setup.py install --user 28 | ``` 29 | 30 | 31 | ## Experimental PyPI install 32 | Simply run: 33 | ```bash 34 | pip install -i https://test.pypi.org/simple/ dciknn-cuda==0.1.11 35 | ``` 36 | If you don't have internet access (e.g. inside a submitted cluster job), download the package before submitting the job: 37 | ```bash 38 | pip download -i https://test.pypi.org/simple/ dciknn-cuda==0.1.11 39 | ``` 40 | Then run the following inside the job to install offline: 41 | ```bash 42 | pip install dciknn_cuda-0.1.11.tar.gz 43 | ``` 44 | 45 | 46 | ## Getting Started 47 | 48 | Example code using the PyTorch interface is provided. In the root directory of the code base, execute the following command: 49 | 50 | ```bash 51 | python example.py 52 | ```
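For reference, a minimal end-to-end run looks like this (a sketch distilled from `example.py`; the hyperparameter values are illustrative):
```python
import torch
from dciknn_cuda import DCI

data = torch.randn(3000, 100).cuda()   # pool of 3000 points in 100 dimensions
query = torch.randn(500, 100).cuda()   # 500 query points

# dim, num_comp_indices, num_simp_indices, block_size, thread_size, device
dci_db = DCI(100, 2, 10, 100, 10, device=0)
dci_db.add(data)
indices, dists = dci_db.query(query, num_neighbours=10, num_outer_iterations=5000)
dci_db.free()
```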
53 | 54 | ### Multi-GPU example 55 | The multi-GPU version of DCI exposes the same API. The following is a simple example that uses four GPUs to compute nearest neighbours: 56 | ```python 57 | # Multi-GPU version of DCI 58 | dci_db = MDCI(dim, num_comp_indices, num_simp_indices, block_size, thread_size, devices=[0, 1, 2, 3]) # We specify the GPUs to be used by the DCI instance with `devices`. Set to list(range(torch.cuda.device_count())) to use all available GPUs 59 | 60 | dci_db.add(data) # We add the pool of data 61 | indices, dists = dci_db.query(query, num_neighbours, num_outer_iterations) # We run our desired query 62 | ``` 63 | 64 | 65 | ## Directory Layout 66 | * `src`, all of the `*.cpp` and `*.cu` files 67 | * `include`, the header files 68 | * `dciknn_cuda`, the Python interface 69 | 70 | ## Important Files 71 | * `src/dci_cuda.cpp`: defines the PyTorch extension functions 72 | * `src/util_kernel.cu`: matrix multiplication and random distribution generation functions 73 | * `src/dci_cuda_kernel.cu`: main components of prioritized DCI 74 | * `dciknn_cuda/core.py`: defines the Python interface 75 | 76 | ## Reference 77 | 78 | Please cite the following paper if you found this library useful in your research: 79 | 80 | ### [Fast _k_-Nearest Neighbour Search via Dynamic Continuous Indexing](https://arxiv.org/abs/1512.00442) 81 | [Ke Li](https://people.eecs.berkeley.edu/~ke.li/), [Jitendra Malik](https://people.eecs.berkeley.edu/~malik/)\ 82 | *International Conference on Machine Learning (ICML)*, 2016 83 | 84 | ``` 85 | @inproceedings{li2016fast, 86 | title={Fast k-nearest neighbour search via {Dynamic Continuous Indexing}}, 87 | author={Li, Ke and Malik, Jitendra}, 88 | booktitle={International Conference on Machine Learning}, 89 | pages={671--679}, 90 | year={2016} 91 | } 92 | ``` 93 | -------------------------------------------------------------------------------- /dciknn_cuda/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | 4 | This code implements the method described in the Prioritized DCI paper, 5 | which can be found at https://arxiv.org/abs/1703.00440 6 | 7 | This file is a part of the Dynamic Continuous Indexing reference 8 | implementation. 9 | 10 | 11 | This Source Code Form is subject to the terms of the Mozilla Public 12 | License, v. 2.0. If a copy of the MPL was not distributed with this 13 | file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | 15 | Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | ''' 17 | 18 | __version__ = "0.1.11" 19 | __author__ = 'Ke Li, Shichong Peng, Mehran Aghabozorgi' 20 | __credits__ = 'APEX Lab' 21 | 22 | from .core import DCI, MDCI 23 | --------------------------------------------------------------------------------
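Before reading `core.py` below, note the return convention: `query` on both `DCI` and `MDCI` returns two tensors of shape `num_queries x num_neighbours`, neighbour indices first and the corresponding distances second. The indices come back as a float tensor (they are concatenated with the distances on the C++ side), so cast before indexing; a small illustrative sketch:
```python
indices, dists = dci_db.query(query, num_neighbours=10)
neighbours_of_first_query = data[indices[0].long()]  # the 10 retrieved points
```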
/dciknn_cuda/core.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | 4 | This code implements the method described in the Prioritized DCI paper, 5 | which can be found at https://arxiv.org/abs/1703.00440 6 | 7 | This file is a part of the Dynamic Continuous Indexing reference 8 | implementation. 9 | 10 | 11 | This Source Code Form is subject to the terms of the Mozilla Public 12 | License, v. 2.0. If a copy of the MPL was not distributed with this 13 | file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | 15 | Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | ''' 17 | 18 | import torch 19 | from _dci_cuda import _dci_new, _dci_add, _dci_query, _dci_clear, _dci_reset, _dci_free, _dci_multi_query 20 | 21 | 22 | from math import sqrt 23 | 24 | 25 | class DCI(object): 26 | 27 | def __init__(self, dim, num_comp_indices=2, num_simp_indices=7, bs=100, ts=10, device=0): 28 | 29 | if not torch.cuda.is_available(): 30 | raise RuntimeError("DCI CUDA version requires GPU access, please check CUDA driver.") 31 | 32 | self._dim = dim 33 | self._num_comp_indices = num_comp_indices 34 | self._num_simp_indices = num_simp_indices 35 | self._dci_inst = _dci_new(dim, num_comp_indices, num_simp_indices, device) 36 | self._array = None 37 | self._block_size = bs 38 | self._thread_size = ts 39 | self.num_points = 0 40 | 41 | @property 42 | def dim(self): 43 | return self._dim 44 | 45 | @property 46 | def num_comp_indices(self): 47 | return self._num_comp_indices 48 | 49 | @property 50 | def num_simp_indices(self): 51 | return self._num_simp_indices 52 | 53 | def _ensure_positive_integer(self, x): 54 | if not isinstance(x, int): 55 | raise TypeError("number must be an integer") 56 | elif x <= 0: 57 | raise ValueError("number must be positive") 58 | 59 | def _check_data(self, arr): 60 | if arr.shape[1] != self.dim: 61 | raise ValueError("mismatch between tensor dimension (%d) and the declared dimension of this DCI instance (%d)" % (arr.shape[1], self.dim)) 62 | if arr.dtype != torch.float: 63 | raise TypeError("tensor must consist of single-precision floats (torch.float32)") 64 | if not arr.is_contiguous(): 65 | raise ValueError("the memory layout of tensor must be in row-major (C-order)") 66 | if not arr.is_cuda: 67 | raise TypeError("tensor must be a cuda tensor") 68 | 69 | def add(self, data): 70 | if self.num_points > 0: 71 | raise RuntimeError("DCI class does not support insertion of more than one tensor.
Must combine all tensors into one tensor before inserting") 72 | self._check_data(data) 73 | self.num_points = data.shape[0] 74 | _dci_add(self._dci_inst, self._dim, self.num_points, data.flatten(), self._block_size, self._thread_size) 75 | self._array = data 76 | 77 | # query is num_queries x dim, returns num_queries x num_neighbours 78 | def query(self, query, num_neighbours=-1, num_outer_iterations=5000, blind=False): 79 | if len(query.shape) < 2: 80 | _query = query.unsqueeze(0) 81 | else: 82 | _query = query 83 | self._check_data(_query) 84 | if num_neighbours < 0: 85 | num_neighbours = self.num_points 86 | self._ensure_positive_integer(num_neighbours) 87 | max_num_candidates = 10 * num_neighbours 88 | # num_queries x num_neighbours 89 | 90 | _query_result = _dci_query(self._dci_inst, self._dim, _query.shape[0], _query.flatten(), num_neighbours, blind, num_outer_iterations, max_num_candidates, self._block_size, self._thread_size) 91 | half = _query_result.shape[0] // 2 92 | return _query_result[:half].reshape(_query.shape[0], -1), _query_result[half:].reshape(_query.shape[0], -1) 93 | 94 | def clear(self): 95 | _dci_clear(self._dci_inst) 96 | self.num_points = 0 97 | self._array = None 98 | 99 | def reset(self): 100 | _dci_reset(self._dci_inst) 101 | self.num_points = 0 102 | self._array = None 103 | 104 | def free(self): 105 | _dci_free(self._dci_inst) 106 | self.num_points = 0 107 | self._array = None 108 | 109 | 110 | class MDCI(object): 111 | def __init__(self, dim, num_comp_indices=2, num_simp_indices=7, bs=100, ts=10, devices=[0]): 112 | 113 | 114 | self.devices = devices 115 | self.num_devices = len(devices) 116 | self.dcis = [DCI(dim, num_comp_indices, num_simp_indices, bs, ts, dev) for dev in devices] 117 | self.data_per_device = 0 118 | 119 | 120 | def add(self, data): 121 | 122 | self.data_per_device = (data.shape[0] + self.num_devices - 1) // self.num_devices # ceil division so every point lands on some device 123 | for dev_ind in range(self.num_devices): 124 | device = self.devices[dev_ind] 125 | cur_data = data[dev_ind * self.data_per_device: dev_ind * self.data_per_device + self.data_per_device].to(device) 126 | self.dcis[dev_ind].add(cur_data) 127 | 128 | def query(self, query, num_neighbours=-1, num_outer_iterations=5000, blind=False): 129 | dists = [] 130 | nns = [] 131 | if num_neighbours <= 0: 132 | raise RuntimeError('num_neighbours must be positive') 133 | 134 | if len(query.shape) < 2: 135 | _query = query.unsqueeze(0) 136 | else: 137 | _query = query 138 | _query = _query.detach().clone() 139 | 140 | 141 | max_num_candidates = 10 * num_neighbours 142 | 143 | queries = [_query.to(self.devices[dev_ind]).flatten() for dev_ind in range(self.num_devices)] 144 | res = _dci_multi_query([dc._dci_inst for dc in self.dcis], self.dcis[0]._dim, _query.shape[0], queries, num_neighbours, blind, num_outer_iterations, max_num_candidates, self.dcis[0]._block_size, self.dcis[0]._thread_size) 145 | 146 | for ind, cur_res in enumerate(res): 147 | half = cur_res.shape[0] // 2 148 | cur_nns, cur_dist = cur_res[:half].reshape(_query.shape[0], -1), cur_res[half:].reshape(_query.shape[0], -1) 149 | cur_nns = cur_nns + self.data_per_device * ind # shift shard-local indices into the global numbering 150 | dists.append(cur_dist.detach().clone().to(self.devices[0])) 151 | nns.append(cur_nns.detach().clone().to(self.devices[0])) 152 | 153 | merged_dists = torch.cat(dists, dim=1) 154 | merged_nns = torch.cat(nns, dim=1) 155 | _, sort_indices = torch.sort(merged_dists, dim=1) 156 | sort_indices = sort_indices[:,
:num_neighbours] 157 | return torch.gather(merged_nns, 1, sort_indices), torch.gather(merged_dists, 1, sort_indices) 158 | 159 | def clear(self): 160 | for dci in self.dcis: 161 | dci.clear() 162 | 163 | def reset(self): 164 | for dci in self.dcis: 165 | dci.reset() 166 | 167 | def free(self): 168 | for dci in self.dcis: 169 | dci.free() --------------------------------------------------------------------------------
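As a worked illustration of the merge step in `MDCI.query` above: each shard returns neighbour indices numbered within its own slice of the data, so they are shifted by `data_per_device * ind` into the global numbering before the per-shard results are concatenated and re-sorted by distance. The same logic on the CPU, with illustrative values:
```python
import torch

data_per_device = 5  # two shards of 5 points each; 2 neighbours per shard
local_nns = [torch.tensor([[1, 3]]), torch.tensor([[0, 4]])]
local_dists = [torch.tensor([[0.2, 0.9]]), torch.tensor([[0.5, 0.7]])]

# shift shard-local indices into the global numbering: shard 1's point 0 is global point 5
global_nns = [nns + data_per_device * ind for ind, nns in enumerate(local_nns)]

merged_nns = torch.cat(global_nns, dim=1)     # tensor([[1, 3, 5, 9]])
merged_dists = torch.cat(local_dists, dim=1)  # tensor([[0.2, 0.9, 0.5, 0.7]])
_, order = torch.sort(merged_dists, dim=1)
order = order[:, :2]                          # keep the 2 nearest overall
print(torch.gather(merged_nns, 1, order))     # tensor([[1, 5]])
print(torch.gather(merged_dists, 1, order))   # tensor([[0.2000, 0.5000]])
```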
/example.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | 4 | This code implements the method described in the Prioritized DCI paper, 5 | which can be found at https://arxiv.org/abs/1703.00440 6 | 7 | This file is a part of the Dynamic Continuous Indexing reference 8 | implementation. 9 | 10 | 11 | This Source Code Form is subject to the terms of the Mozilla Public 12 | License, v. 2.0. If a copy of the MPL was not distributed with this 13 | file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | 15 | Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | ''' 17 | from dciknn_cuda import DCI, MDCI 18 | import torch 19 | import random 20 | import datetime 21 | 22 | random.seed(10) 23 | torch.manual_seed(0) 24 | 25 | def gen_data(ambient_dim, intrinsic_dim, num_points): 26 | latent_data = torch.randn((num_points, intrinsic_dim)) 27 | transformation = torch.randn((intrinsic_dim, ambient_dim)) 28 | data = torch.matmul(latent_data, transformation) 29 | return data # num_points x ambient_dim 30 | 31 | 32 | def main(): 33 | assert torch.cuda.is_available() 34 | device = torch.device('cuda:1') 35 | 36 | ############################################################################################################################################# 37 | # # 38 | # Data Generation Hyperparameters # 39 | # # 40 | ############################################################################################################################################# 41 | dim = 100 42 | num_pts = 3000 43 | num_queries = 500 44 | # dim = 80 45 | # num_pts = 1000 46 | # num_queries = 100 47 | 48 | intrinsic_dim = 400 49 | data_and_queries = gen_data(dim, intrinsic_dim, num_pts + num_queries) 50 | 51 | data = data_and_queries[:num_pts, :].detach().clone().to(device) 52 | query = data_and_queries[num_pts:, :].detach().clone().to(device) 53 | 54 | ############################################################################################################################################# 55 | # # 56 | # Problem Hyperparameter # 57 | # # 58 | ############################################################################################################################################# 59 | num_neighbours = 10 # The k in k-NN 60 | 61 | ############################################################################################################################################# 62 | # # 63 | # DCI Hyperparameters # 64 | # # 65 | ############################################################################################################################################# 66 | block_size = 100 67 | thread_size = 10 68 | num_comp_indices = 2 69 | num_simp_indices = 10 70 | num_outer_iterations = 5000 71 | 72 | # initialize and time the multi-GPU DCI instance (two runs) 73 | for i in range(2): 74 | a = datetime.datetime.now() 75 | dci_db = MDCI(dim, num_comp_indices, num_simp_indices, block_size, thread_size, devices=[0, 1]) 76 | 77 | dci_db.add(data) 78 | # Query 79 | indices, dists = dci_db.query(query, num_neighbours, num_outer_iterations) 80 | print("Nearest Indices:", indices) 81 | print("Indices Distances:", dists) 82 | dci_db.clear() 83 | b = datetime.datetime.now() 84 | print(b-a) 85 | 86 | data = data_and_queries[:num_pts, :].detach().clone().to(0) 87 | query = data_and_queries[num_pts:, :].detach().clone().to(0) 88 | a = datetime.datetime.now() 89 | dci_db = DCI(dim, num_comp_indices, num_simp_indices, block_size, thread_size, device=0) 90 | 91 | dci_db.add(data) 92 | # Query 93 | indices, dists = dci_db.query(query, num_neighbours, num_outer_iterations) 94 | print("Nearest Indices:", indices) 95 | print("Indices Distances:", dists) 96 | dci_db.clear() 97 | b = datetime.datetime.now() 98 | print(b-a) 99 | 100 | if __name__ == '__main__': 101 | main() 102 | --------------------------------------------------------------------------------
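To sanity-check the approximate results that `example.py` prints, the exact neighbours can be computed by brute force and compared; a minimal sketch (not part of the library):
```python
import torch

def exact_knn(data, query, k):
    d = torch.cdist(query, data)  # num_queries x num_points pairwise Euclidean distances
    dists, indices = torch.topk(d, k, dim=1, largest=False)
    return indices, dists

# DCI returns indices as a float tensor, so cast with .long() before comparing;
# comparing the index sets per query is more robust than elementwise equality
```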
/include/dci.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | * 4 | * This code implements the method described in the Prioritized DCI paper, 5 | * which can be found at https://arxiv.org/abs/1703.00440 6 | * 7 | * This file is a part of the Dynamic Continuous Indexing reference 8 | * implementation. 9 | * 10 | * 11 | * This Source Code Form is subject to the terms of the Mozilla Public 12 | * License, v. 2.0. If a copy of the MPL was not distributed with this 13 | * file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | * 15 | * Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | */ 17 | 18 | #ifndef DCI_H 19 | #define DCI_H 20 | 21 | #include <stdbool.h> 22 | #include <stdio.h> 23 | 24 | #include <cuda_runtime.h> 25 | 26 | 27 | typedef struct idx_elem { 28 | float key; // value of the projection of point onto vector 29 | int value; // index of the point 30 | } idx_elem; 31 | 32 | // sorting alg we are using 33 | __device__ 34 | void mix_sort(idx_elem arr[], int n); 35 | 36 | float compute_dist(const float* const vec1, const float* const vec2, 37 | const int dim); 38 | 39 | typedef struct dci { 40 | int dim; // (Ambient) dimensionality of data 41 | int num_comp_indices; // Number of composite indices 42 | int num_simp_indices; // Number of simple indices in each composite index 43 | int num_points; 44 | idx_elem* indices; // Assuming row-major layout, matrix of size required_num_points x (num_comp_indices*num_simp_indices) 45 | float* proj_vec; // Assuming row-major layout, matrix of size dim x (num_comp_indices*num_simp_indices) 46 | float* data_proj; // Device copy of data_proj 47 | float* data; 48 | float* d_data; 49 | int devID; // To initialize CUDA's matmul, set to 0 50 | } dci; 51 | 52 | typedef struct dci_query_config { 53 | bool blind; 54 | int num_outer_iterations; 55 | int max_num_candidates; 56 | } dci_query_config; 57 | 58 | void dci_gen_proj_vec(float* proj_vec, const int dim, 59 | const int num_indices); 60 | 61 | void dci_init(dci* const dci_inst, const int dim, const int num_comp_indices, 62 | const int num_simp_indices, const int devId); 63 | 64 | __device__ 65 | void insertion_sort(idx_elem arr[], int n); 66 | 67 | // Note: the data itself is not kept in the index and must be kept in-place 68 | void dci_add(dci* const dci_inst, const int dim, const int num_points, 69 | float* const data, const int block_size, const int thread_size); 70 | 71 | void dci_query(dci* const dci_inst, const int dim, const int num_queries, 72 | const float* const query, const int num_neighbours, 73 | const dci_query_config query_config, int* const nearest_neighbours, 74 | float* const nearest_neighbour_dists, const int block_size, 75 | const int thread_size); 76 | 77 | void dci_clear(dci* const dci_inst); 78 | 79 | // Clear indices and reset the projection directions 80 | void dci_reset(dci* const dci_inst); 81 | 82 | void dci_free(const dci* const dci_inst); 83 | 84 | void dci_dump(const dci* const dci_inst); 85 | 86 | #endif // DCI_H 87 | -------------------------------------------------------------------------------- /include/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | * 4 | * This code implements the method described in the Prioritized DCI paper, 5 | * which can be found at https://arxiv.org/abs/1703.00440 6 | * 7 | * This file is a part of the Dynamic Continuous Indexing reference 8 | * implementation. 9 | * 10 | * 11 | * This Source Code Form is subject to the terms of the Mozilla Public 12 | * License, v. 2.0. If a copy of the MPL was not distributed with this 13 | * file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | * 15 | * Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | */ 17 | 18 | // CUDA runtime 19 | #include <cuda_runtime.h> 20 | #include <cublas_v2.h> 21 | 22 | // CUDA random 23 | #include <curand.h> 24 | #include <curand_kernel.h> 25 | #include <math.h> 26 | 27 | #ifndef UTIL_H 28 | #define UTIL_H 29 | 30 | #define GAUSS_RAND 0 31 | #define UNIFORM_RAND 1 32 | 33 | typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes 34 | { 35 | unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC; 36 | } sMatrixSize; 37 | 38 | // put in device pointers. Saves on memcpy operations 39 | void matmul_device(const cublasOperation_t op_A, const cublasOperation_t op_B, 40 | const int M, const int N, const int K, const float* const A, const float* const B, float* const C, int &devID); 41 | 42 | // put in device pointers. Saves on memcpy operations 43 | void rng_parallel_device(float* const vec, const int n, const int rng_type); 44 | 45 | __global__ void init_curand_state(unsigned int seed, curandState_t* states); 46 | 47 | __global__ void gauss_parallel_rng(curandState_t* states, float *vec, const int n); 48 | 49 | __global__ void uniform_parallel_rng(curandState_t* states, float *vec, const int n); 50 | 51 | #endif // UTIL_H 52 | --------------------------------------------------------------------------------
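`matmul_device` declared above is used by `dci_add` (in `src/dci_cuda_kernel.cu`) to project every data point onto every index direction in a single GEMM. Conceptually it computes the following, sketched here with PyTorch (the shapes mirror the `matmul_device(CUBLAS_OP_N, CUBLAS_OP_T, num_indices, num_points, dim, ...)` call; names are illustrative):
```python
import torch

num_comp_indices, num_simp_indices = 2, 7
num_indices = num_comp_indices * num_simp_indices
dim, num_points = 100, 3000

proj_vec = torch.randn(num_indices, dim)  # one row per index direction
data = torch.randn(num_points, dim)       # row-major data matrix

# one projection value per (index, point) pair, i.e. proj_vec @ data^T
data_proj = proj_vec @ data.T             # num_indices x num_points
```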
/setup.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | 4 | This code implements the method described in the Prioritized DCI paper, 5 | which can be found at https://arxiv.org/abs/1703.00440 6 | 7 | This file is a part of the Dynamic Continuous Indexing reference 8 | implementation. 9 | 10 | 11 | This Source Code Form is subject to the terms of the Mozilla Public 12 | License, v. 2.0. If a copy of the MPL was not distributed with this 13 | file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | 15 | Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | ''' 17 | from setuptools import setup 18 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension, include_paths 19 | import os 20 | import sys 21 | 22 | if sys.version_info[0] < 3: 23 | with open('README.md') as f: 24 | long_description = f.read() 25 | else: 26 | with open('README.md', encoding='utf-8') as f: 27 | long_description = f.read() 28 | 29 | setup( 30 | name='dciknn_cuda', 31 | packages=['dciknn_cuda'], 32 | version='0.1.11', 33 | long_description=long_description, 34 | long_description_content_type='text/markdown', 35 | description='DCI CUDA for fast k nearest neighbour finding', 36 | url='https://github.com/niopeng/dciknn_cuda', 37 | author='Ke Li, Shichong Peng, Mehran Aghabozorgi', 38 | author_email='keli@sfu.ca', 39 | license='Mozilla Public License Version 2.0', 40 | install_requires=['torch>=1.4.0'], 41 | include_dirs=include_paths(), 42 | ext_modules=[ 43 | CUDAExtension('_dci_cuda', [ 44 | './src/dci_cuda.cpp', 45 | './src/dci_cuda_kernel.cu', 46 | './src/util_kernel.cu', 47 | ], include_dirs=[ 48 | os.path.abspath(os.path.join(os.path.dirname(__file__), 'include')), 49 | ] 50 | ) 51 | ], 52 | cmdclass={ 53 | 'build_ext': BuildExtension 54 | }) 55 | --------------------------------------------------------------------------------
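Once `python setup.py install` has run, the built extension can be smoke-tested before using the high-level API; a quick check (assuming the build succeeded):
```python
import torch        # import torch first so the extension's libtorch symbols resolve
import _dci_cuda    # the CUDAExtension module built above
from dciknn_cuda import DCI, MDCI

print(torch.version.cuda)  # should match the CUDA version of the nvcc used to build
```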
/src/dci_cuda.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | * 4 | * This code implements the method described in the Prioritized DCI paper, 5 | * which can be found at https://arxiv.org/abs/1703.00440 6 | * 7 | * This file is a part of the Dynamic Continuous Indexing reference 8 | * implementation. 9 | * 10 | * 11 | * This Source Code Form is subject to the terms of the Mozilla Public 12 | * License, v. 2.0. If a copy of the MPL was not distributed with this 13 | * file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | * 15 | * Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | */ 17 | 18 | #include <torch/extension.h> 19 | #include <Python.h> 20 | #include "dci.h" 21 | #include <pybind11/pybind11.h> 22 | #include <pybind11/stl.h> 23 | #include <c10/cuda/CUDAGuard.h> 24 | #include <future> 25 | #include <vector> 26 | 27 | 28 | typedef struct py_dci { 29 | dci dci_inst; 30 | PyObject *py_array; 31 | } py_dci; 32 | 33 | namespace py = pybind11; 34 | 35 | static void py_dci_free_wrap(PyObject *py_dci_inst_wrapper) { 36 | 37 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_dci_inst_wrapper, "py_dci_inst"); 38 | const at::cuda::OptionalCUDAGuard device_guard(py_dci_inst->dci_inst.devID); 39 | 40 | if (py_dci_inst->py_array) { 41 | Py_DECREF(py_dci_inst->py_array); 42 | } 43 | 44 | dci_free(&(py_dci_inst->dci_inst)); 45 | cudaFree(py_dci_inst); 46 | } 47 | 48 | static void py_tensor_free(PyObject *py_tensor_wrapper) { 49 | torch::Tensor *py_tensor = (torch::Tensor *)PyCapsule_GetPointer(py_tensor_wrapper, "py_tensor"); 50 | const at::cuda::OptionalCUDAGuard device_guard(device_of(*py_tensor)); 51 | cudaFree(py_tensor); 52 | } 53 | 54 | py::handle py_dci_new(const int dim, const int num_comp_indices, 55 | const int num_simp_indices, const int deviceId) { 56 | const at::cuda::OptionalCUDAGuard device_guard(deviceId); 57 | py_dci *py_dci_inst; 58 | cudaMallocManaged((void **) &py_dci_inst, sizeof(py_dci)); 59 | 60 | // initialize DCI instance 61 | dci_init(&(py_dci_inst->dci_inst), dim, num_comp_indices, num_simp_indices, deviceId); 62 | 63 | // Returns new reference 64 | PyObject *py_dci_inst_wrapper = PyCapsule_New(py_dci_inst, "py_dci_inst", py_dci_free_wrap); 65 | return py_dci_inst_wrapper; 66 | } 67 | 68 | void py_dci_add(py::handle py_dci_inst_wrapper, const int dim, const int num_points, 69 | torch::Tensor py_data, const int block_size, const int thread_size) { 70 | const at::cuda::OptionalCUDAGuard device_guard(device_of(py_data)); 71 | 72 | PyObject *py_obj = py_dci_inst_wrapper.ptr(); 73 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_obj, "py_dci_inst"); 74 | float* data = (float *)py_data.data_ptr(); 75 | 76 | // add data to DCI instance 77 | dci_add(&(py_dci_inst->dci_inst), dim, num_points, data, block_size, thread_size); 78 | 79 | PyObject *py_tensor_wrapper = PyCapsule_New(&py_data, "py_tensor", py_tensor_free); 80 | py_dci_inst->py_array = py_tensor_wrapper; 81 | Py_INCREF(py_tensor_wrapper); 82 | }
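/* Layout note for py_dci_query below: the result is a single flattened tensor of length 2 * num_queries * num_neighbours. The first half holds the neighbour indices (cast to float so they can be concatenated), the second half holds the corresponding distances; dciknn_cuda/core.py splits the two halves apart. */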
83 | 84 | torch::Tensor py_dci_query(py::handle py_dci_inst_wrapper, const int dim, const int num_queries, 85 | torch::Tensor py_query, const int num_neighbours, const bool blind, const int num_outer_iterations, 86 | const int max_num_candidates, const int block_size, 87 | const int thread_size) { 88 | const at::cuda::OptionalCUDAGuard device_guard(device_of(py_query)); 89 | 90 | PyObject *py_obj = py_dci_inst_wrapper.ptr(); 91 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_obj, "py_dci_inst"); 92 | 93 | // Assuming row-major layout, py_query->data is N x D, where N is the number of queries and D is the dimensionality 94 | float* query = (float *)py_query.data_ptr(); 95 | 96 | dci_query_config query_config = {blind, num_outer_iterations, max_num_candidates}; 97 | int* final_outputs; 98 | float* final_distances; 99 | const int output_size = num_neighbours * num_queries; 100 | cudaMalloc((void **) &(final_outputs), sizeof(int) * output_size); 101 | cudaMalloc((void **) &(final_distances), sizeof(float) * output_size); 102 | 103 | // query using DCI 104 | dci_query(&(py_dci_inst->dci_inst), dim, num_queries, query, num_neighbours, 105 | query_config, final_outputs, final_distances, block_size, thread_size); 106 | 107 | auto options = torch::TensorOptions().device(torch::kCUDA); 108 | auto new_options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA); 109 | torch::Tensor final_outputs_array = torch::from_blob(final_outputs, {output_size}, new_options); 110 | // convert to float tensor to concatenate with the computed distances 111 | torch::Tensor final = final_outputs_array.to(torch::kFloat32); 112 | 113 | torch::Tensor final_distances_array = torch::from_blob(final_distances, {output_size}, options); 114 | 115 | torch::Tensor final_result = torch::cat({ final, final_distances_array }, 0); 116 | 117 | return final_result; 118 | } 119 | 120 | std::vector<torch::Tensor> py_dci_multi_query(std::vector<py::handle> py_dci_inst_wrapper, const int dim, const int num_queries, 121 | std::vector<torch::Tensor> py_query, const int num_neighbours, const bool blind, const int num_outer_iterations, 122 | const int max_num_candidates, const int block_size, 123 | const int thread_size) { 124 | std::vector<torch::Tensor> results; 125 | std::vector<std::future<torch::Tensor>> calcs; 126 | for (unsigned int i = 0; i < py_query.size(); i++) { 127 | calcs.push_back(std::async(py_dci_query, py_dci_inst_wrapper[i], dim, num_queries, 128 | py_query[i], num_neighbours, blind, num_outer_iterations, max_num_candidates, block_size, thread_size)); 129 | } 130 | for (unsigned int i = 0; i < py_query.size(); i++) { 131 | results.push_back(calcs[i].get()); 132 | } 133 | return results; 134 | } 135 | 136 | void py_dci_clear(py::handle py_dci_inst_wrapper) { 137 | PyObject *py_obj = py_dci_inst_wrapper.ptr(); 138 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_obj, "py_dci_inst"); 139 | const at::cuda::OptionalCUDAGuard device_guard(py_dci_inst->dci_inst.devID); 140 | 141 | if (py_dci_inst->py_array) { 142 | Py_DECREF(py_dci_inst->py_array); 143 | } 144 | 145 | dci_clear(&(py_dci_inst->dci_inst)); 146 | py_dci_inst->py_array = NULL; 147 | } 148 | 149 | void py_dci_reset(py::handle py_dci_inst_wrapper) { 150 | PyObject *py_obj = py_dci_inst_wrapper.ptr(); 151 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_obj, "py_dci_inst"); 152 | const at::cuda::OptionalCUDAGuard device_guard(py_dci_inst->dci_inst.devID); 153 | 154 | if (py_dci_inst->py_array) { 155 | Py_DECREF(py_dci_inst->py_array); 156 | } 157 | 158 | dci_reset(&(py_dci_inst->dci_inst)); 159 | py_dci_inst->py_array = NULL; 160 | } 161 | 162 | void py_dci_free(py::handle py_dci_inst_wrapper) { 163 | PyObject *py_obj = py_dci_inst_wrapper.ptr(); 164 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_obj, "py_dci_inst"); 165 | const at::cuda::OptionalCUDAGuard device_guard(py_dci_inst->dci_inst.devID); 166 | 167 | if (py_dci_inst->py_array) { 168 | Py_DECREF(py_dci_inst->py_array); 169 | } 170 | 171 | dci_free(&(py_dci_inst->dci_inst)); 172 | cudaFree(py_dci_inst); 173 | } 174 | 175 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 176 | m.def("_dci_new", &py_dci_new, "Create new DCI instance. (CUDA)"); 177 | m.def("_dci_add", &py_dci_add, "Add data. (CUDA)"); 178 | m.def("_dci_query", &py_dci_query, "Search for nearest neighbours. (CUDA)"); 179 | m.def("_dci_clear", &py_dci_clear, "Clear DCI. (CUDA)"); 180 | m.def("_dci_reset", &py_dci_reset, "Reset DCI. (CUDA)"); 181 | m.def("_dci_free", &py_dci_free, "Free DCI. (CUDA)"); 182 | m.def("_dci_multi_query", &py_dci_multi_query, "Search for nearest neighbours with multiple GPUs. (CUDA)"); 183 | } 184 | --------------------------------------------------------------------------------
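The functions registered in `PYBIND11_MODULE` above are consumed by `dciknn_cuda/core.py`; driving them directly looks roughly like this (a low-level sketch mirroring the calls in `core.py`, normally not used directly):
```python
import torch
import _dci_cuda

dim, n = 100, 1000
data = torch.randn(n, dim).cuda()

inst = _dci_cuda._dci_new(dim, 2, 7, 0)                    # dim, comp indices, simp indices, device
_dci_cuda._dci_add(inst, dim, n, data.flatten(), 100, 10)  # block_size=100, thread_size=10
# ... _dci_query(...) as wrapped by DCI.query in core.py ...
_dci_cuda._dci_free(inst)
```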
(CUDA)"); 183 | } 184 | -------------------------------------------------------------------------------- /src/dci_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | * 4 | * This code implements the method described in the Prioritized DCI paper, 5 | * which can be found at https://arxiv.org/abs/1703.00440 6 | * 7 | * This file is a part of the Dynamic Continuous Indexing reference 8 | * implementation. 9 | * 10 | * 11 | * This Source Code Form is subject to the terms of the Mozilla Public 12 | * License, v. 2.0. If a copy of the MPL was not distributed with this 13 | * file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | * 15 | * Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "dci.h" 25 | #include "util.h" 26 | 27 | /* Sorting functions */ 28 | #include 29 | #include 30 | 31 | /* CUDA runtime */ 32 | #include 33 | #include 34 | 35 | __device__ 36 | float compute_dist_device(const float* const vec1, const float* const vec2, 37 | const int dim) { 38 | int i; 39 | float sq_dist = 0.0; 40 | for (i = 0; i < dim; i++) { 41 | sq_dist += (vec1[i] - vec2[i]) * (vec1[i] - vec2[i]); 42 | } 43 | return sqrt(sq_dist); 44 | } 45 | 46 | __device__ 47 | static inline float abs_d(float x) { 48 | return x > 0 ? x : -x; 49 | } 50 | 51 | /* Normalize the input projection vectors. Vectors are normalized along each row. */ 52 | __global__ void normalize_proj_vecs(float* const proj_vec, const int dim, 53 | const int num_indices) { 54 | int i = blockDim.x * blockIdx.x + threadIdx.x; 55 | /* Note: Assumes num_blocks = num_threads */ 56 | int chunk_size = (num_indices + blockDim.x * blockDim.x - 1) 57 | / (blockDim.x * blockDim.x); 58 | int vec_index; 59 | for (int j = 0; j < chunk_size; ++j) { 60 | vec_index = i * chunk_size + j; 61 | if (vec_index < num_indices) { 62 | float sq_norm = 0.0; 63 | for (int k = 0; k < dim; ++k) { 64 | sq_norm += proj_vec[vec_index * dim + k] 65 | * proj_vec[vec_index * dim + k]; 66 | } 67 | float norm = sqrtf(sq_norm); 68 | for (int k = 0; k < dim; ++k) { 69 | proj_vec[vec_index * dim + k] /= norm; 70 | } 71 | } 72 | } 73 | } 74 | 75 | /* Create matrix with proj_vec dim-dimensional normalized gaussian vectors. 76 | vectors are normalized along each row */ 77 | void dci_gen_proj_vec(float* const proj_vec, const int dim, 78 | const int num_indices) { 79 | /* Generate the random indices */ 80 | rng_parallel_device(proj_vec, dim * num_indices, GAUSS_RAND); 81 | 82 | /* Normalize */ 83 | int block_size = 32; 84 | int thread_size = 32; 85 | normalize_proj_vecs<<>>(proj_vec, dim, 86 | num_indices); 87 | 88 | /* Synchronize the threads */ 89 | cudaDeviceSynchronize(); 90 | } 91 | 92 | /* Initializes the master DCI data structure. 
92 | /* Initializes the master DCI data structure. */ 93 | void dci_init(dci* const dci_inst, const int dim, const int num_comp_indices, 94 | const int num_simp_indices, const int devId) { 95 | int num_indices = num_comp_indices * num_simp_indices; 96 | 97 | dci_inst->dim = dim; 98 | dci_inst->num_comp_indices = num_comp_indices; 99 | dci_inst->num_simp_indices = num_simp_indices; 100 | 101 | cudaMallocManaged((void **) &dci_inst->proj_vec, 102 | sizeof(float) * dim * num_indices); 103 | dci_gen_proj_vec(dci_inst->proj_vec, dim, num_indices); 104 | 105 | /* Variables that initialize to default values */ 106 | dci_inst->num_points = 0; 107 | dci_inst->indices = NULL; 108 | dci_inst->data = NULL; 109 | dci_inst->devID = devId; 110 | } 111 | 112 | /* Sort indices */ 113 | __global__ void sort_indices(dci* const dci_inst, const int num_indices, 114 | const int num_points, const int points_per_block) { 115 | int chunk_size = (num_indices + blockDim.x - 1) / blockDim.x; 116 | int idx; 117 | int num_points_in_block = min( 118 | (int) (dci_inst->num_points - blockIdx.x * points_per_block), 119 | points_per_block); 120 | for (int j = 0; j < chunk_size; j++) { 121 | idx = threadIdx.x * chunk_size + j; 122 | if (idx < num_indices) { 123 | mix_sort( 124 | &(dci_inst->indices[idx * dci_inst->num_points 125 | + points_per_block * blockIdx.x]), 126 | num_points_in_block); 127 | } 128 | } 129 | } 130 | 131 | /* Copy the projections in data_proj into the index entries */ 132 | __global__ void copy_to_indices(dci* const dci_inst, float* const data_proj, 133 | const int num_indices, const int num_points) { 134 | int i = blockDim.x * blockIdx.x + threadIdx.x; 135 | int n = num_indices * num_points; 136 | int chunk_size = (n + blockDim.x * gridDim.x - 1) 137 | / (blockDim.x * gridDim.x); 138 | int idx; 139 | for (int j = 0; j < chunk_size; j++) { 140 | idx = i * chunk_size + j; 141 | if (idx < n) { 142 | dci_inst->indices[idx].key = data_proj[idx]; 143 | dci_inst->indices[idx].value = idx % num_points; 144 | } 145 | } 146 | } 147 |
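/* Overview of the add path below (sketch): dci_add projects all points onto all index directions with one GEMM (matmul_device), copy_to_indices above scatters the projections into (key = projection, value = point id) pairs, and sort_indices then sorts each index's slice block by block, yielding the sorted indices that queries walk outward from. */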
148 | /* Add data to the master DCI data structure. Note: the data itself is not copied; the index keeps a pointer to it. */ 149 | void dci_add(dci* const dci_inst, const int dim, const int num_points, 150 | float* const data, const int block_size, const int thread_size) { 151 | int num_indices = dci_inst->num_comp_indices * dci_inst->num_simp_indices; 152 | float *data_proj; 153 | cudaMallocManaged((void **) &data_proj, 154 | sizeof(float) * num_points * num_indices); 155 | 156 | assert(dim == dci_inst->dim); 157 | assert(dci_inst->num_points == 0); 158 | 159 | /* the data tensor already lives in device memory, so keep a pointer to it 160 | rather than allocating a separate managed copy */ 161 | dci_inst->data = data; 162 | cudaMallocManaged((void **) &dci_inst->indices, 163 | sizeof(idx_elem) * num_points * num_indices); 164 | 165 | dci_inst->num_points = num_points; 166 | 167 | matmul_device(CUBLAS_OP_N, CUBLAS_OP_T, num_indices, num_points, 168 | dci_inst->dim, dci_inst->proj_vec, dci_inst->data, data_proj, 169 | dci_inst->devID); 170 | cudaDeviceSynchronize(); 171 | 172 | /* Add to indices */ 173 | copy_to_indices<<<block_size, thread_size>>>(dci_inst, data_proj, num_indices, num_points); 174 | 175 | /* Synchronize the threads */ 176 | cudaDeviceSynchronize(); 177 | 178 | int points_per_block = (dci_inst->num_points + block_size - 1) / block_size; 179 | /* Sort the indices */ 180 | sort_indices<<<block_size, thread_size>>>(dci_inst, num_indices, num_points, 181 | points_per_block); 182 | 183 | /* Synchronize the threads */ 184 | cudaDeviceSynchronize(); 185 | 186 | cudaFree(data_proj); 187 | } 188 | 189 | __device__ 190 | void insertion_sort(idx_elem arr[], int n) { 191 | int i, j; 192 | idx_elem key; 193 | for (i = 1; i < n; i++) { 194 | key = arr[i]; 195 | j = i - 1; 196 | while (j >= 0 && arr[j].key > key.key) { 197 | arr[j + 1] = arr[j]; 198 | j = j - 1; 199 | } 200 | arr[j + 1] = key; 201 | } 202 | } 203 | 204 | /* Modified quick_sort to use "mix_sort" below. */ 205 | __device__ 206 | void quick_sort(idx_elem arr[], int n) { 207 | // arbitrary pivot 208 | float pivot_key = arr[n / 2].key; 209 | idx_elem swp; 210 | int low = 0; 211 | int high = n - 1; 212 | while (low < n || high > 0) { 213 | while (arr[low].key < pivot_key && low < n) { 214 | low++; 215 | } 216 | while (arr[high].key > pivot_key && high > 0) { 217 | high--; 218 | } 219 | if (low <= high) { 220 | swp = arr[low]; 221 | arr[low] = arr[high]; 222 | arr[high] = swp; 223 | low++; 224 | high--; 225 | } else { 226 | if (high > 0) { 227 | mix_sort(arr, high + 1); 228 | } 229 | if (low < n - 1) { 230 | mix_sort(&arr[low], n - low); 231 | } 232 | return; 233 | } 234 | } 235 | } 236 | 237 | /* Sorting algorithm. If the number of data points is fewer than 64, then it does 238 | Insertion Sort. Otherwise, it uses Quick Sort. The reasoning is that if there are 239 | too few data points, then Quick Sort's overhead may be too large.
*/ 240 | __device__ 241 | void mix_sort(idx_elem arr[], int n) { 242 | if (n > 64) { 243 | quick_sort(arr, n); 244 | } else { 245 | insertion_sort(arr, n); 246 | } 247 | } 248 | 249 | __device__ 250 | static inline int dci_next_closest_proj(const idx_elem* const idx, 251 | int* const left_pos, int* const right_pos, const float query_proj, 252 | const int num_elems) { 253 | int cur_pos; 254 | int lower_bound = -blockDim.x; 255 | int upper_bound = num_elems + blockDim.x - 1; 256 | if ((*left_pos <= lower_bound) && (*right_pos >= upper_bound)) { 257 | cur_pos = lower_bound; 258 | } else if (*left_pos <= lower_bound) { 259 | cur_pos = *right_pos; 260 | (*right_pos) += blockDim.x; 261 | } else if (*right_pos >= upper_bound) { 262 | cur_pos = *left_pos; 263 | (*left_pos) -= blockDim.x; 264 | } else if (idx[min(*right_pos, num_elems - 1)].key - query_proj 265 | < query_proj - idx[max(*left_pos, 0)].key) { 266 | cur_pos = *right_pos; 267 | (*right_pos) += blockDim.x; 268 | } else { 269 | cur_pos = *left_pos; 270 | (*left_pos) -= blockDim.x; 271 | } 272 | return cur_pos; 273 | } 274 | 275 | // Returns the index of the element whose key is the largest that is less than the key 276 | // Returns an integer from -1 to num_elems - 1 inclusive 277 | // Could return -1 if all elements are greater or equal to key 278 | __device__ 279 | static inline int dci_search_index(const idx_elem* const idx, const float key, 280 | const int num_elems) { 281 | int start_pos, end_pos, cur_pos; 282 | 283 | start_pos = -1; 284 | end_pos = num_elems - 1; 285 | cur_pos = (start_pos + end_pos + 2) / 2; 286 | 287 | while (start_pos < end_pos) { 288 | if (idx[cur_pos].key < key) { 289 | start_pos = cur_pos; 290 | } else { 291 | end_pos = cur_pos - 1; 292 | } 293 | cur_pos = (start_pos + end_pos + 2) / 2; 294 | } 295 | 296 | return start_pos; 297 | } 298 | 299 | /* Search indices */ 300 | __device__ void search_index(const dci* const dci_inst, 301 | const float* const query_proj, const int num_indices, 302 | int* const left_pos, int* const right_pos, const int points_per_block) { 303 | int total = num_indices; 304 | int chunk_size = (total + blockDim.x - 1) / blockDim.x; 305 | int idx; 306 | for (int j = 0; j < chunk_size; j++) { 307 | idx = threadIdx.x * chunk_size + j; 308 | if (idx < total) { 309 | left_pos[idx] = dci_search_index( 310 | &(dci_inst->indices[idx * (dci_inst->num_points) 311 | + blockIdx.x * points_per_block]), 312 | query_proj[idx], 313 | min(dci_inst->num_points - blockIdx.x * points_per_block, 314 | points_per_block)) - blockDim.x + 1; 315 | right_pos[idx] = left_pos[idx] + blockDim.x; 316 | } 317 | } 318 | } 319 | 320 | __device__ void init_index_priority(const dci* const dci_inst, 321 | const float* const query_proj, const int num_indices, 322 | int* const left_pos, int* const right_pos, float* const index_priority, 323 | int* const cur_pos, const int points_per_block) { 324 | int total = num_indices; 325 | int chunk_size = (total + blockDim.x - 1) / blockDim.x; 326 | int idx; 327 | int num_points_in_block = min( 328 | (int) (dci_inst->num_points - blockIdx.x * points_per_block), 329 | points_per_block); 330 | for (int j = 0; j < chunk_size; j++) { 331 | idx = threadIdx.x * chunk_size + j; 332 | if (idx < total && num_points_in_block > 0) { 333 | cur_pos[idx] = dci_next_closest_proj( 334 | &(dci_inst->indices[idx * (dci_inst->num_points) 335 | + blockIdx.x * points_per_block]), 336 | &(left_pos[idx]), &(right_pos[idx]), query_proj[idx], 337 | num_points_in_block); 338 | int position; 339 | if 
((cur_pos[idx] < 0) && (cur_pos[idx] > -blockDim.x)) { 340 | position = 0; 341 | } else if ((cur_pos[idx] < (num_points_in_block + blockDim.x - 1)) 342 | && (cur_pos[idx] >= num_points_in_block)) { 343 | position = num_points_in_block - 1; 344 | } else { 345 | position = cur_pos[idx]; 346 | } 347 | assert(position >= 0); // There should be at least one point in the index 348 | assert(position < num_points_in_block); 349 | index_priority[idx] = abs_d( 350 | dci_inst->indices[position + idx * (dci_inst->num_points) 351 | + blockIdx.x * points_per_block].key 352 | - query_proj[idx]); 353 | } 354 | } 355 | } 356 | 357 | __global__ void init_counts(const dci* const dci_inst, int* counts) { 358 | int i = blockDim.x * blockIdx.x + threadIdx.x; 359 | int total = dci_inst->num_comp_indices * dci_inst->num_points; 360 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 361 | / (blockDim.x * gridDim.x); 362 | for (int j = 0; j < chunk_size; j++) { 363 | int l = i * chunk_size + j; 364 | if (l < total) { 365 | counts[l] = 0; 366 | } 367 | } 368 | } 369 | 370 | __global__ void init_candidate_dists(const dci* const dci_inst, 371 | float* candidate_dists) { 372 | int i = blockDim.x * blockIdx.x + threadIdx.x; 373 | int total = dci_inst->num_points; 374 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 375 | / (blockDim.x * gridDim.x); 376 | for (int j = 0; j < chunk_size; j++) { 377 | int l = i * chunk_size + j; 378 | if (l < total) { 379 | candidate_dists[l] = -2.0; 380 | } 381 | } 382 | } 383 | 384 | __global__ void init_candidate_indices(const dci* const dci_inst, 385 | int* candidate_indices) { 386 | int i = blockDim.x * blockIdx.x + threadIdx.x; 387 | int total = dci_inst->num_points; 388 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 389 | / (blockDim.x * gridDim.x); 390 | for (int j = 0; j < chunk_size; j++) { 391 | int l = i * chunk_size + j; 392 | if (l < total) { 393 | candidate_indices[l] = -1; 394 | } 395 | } 396 | } 397 | 398 | // Blind querying does not compute distances or look at the values of indexed vectors 399 | // For blind querying, top_candidates is not used; all_candidates is used to store candidates in the order of retrieval 400 | __global__ 401 | static void dci_query_single_point_by_block(const dci* const dci_inst, 402 | const int num_neighbours, const float* const query, 403 | const float* const query_proj, const dci_query_config query_config, 404 | float* const d_top_candidates_dist, int* const d_top_candidates_index, 405 | int* const all_candidates, int* counts, float* candidate_dists) { 406 | int j, h; 407 | float cur_dist; 408 | int num_indices = dci_inst->num_comp_indices * dci_inst->num_simp_indices; 409 | __shared__ float top_index_priority; 410 | __shared__ int k, top_h, position, m, i; 411 | __shared__ bool could_break; // Bug fix: resolve infinite loop if thread 0 exits first 412 | float last_top_candidate_dist = -1.0; // The distance of the k^th closest candidate found so far 413 | int num_candidates = 0, last_top_candidate = -1; 414 | 415 | // init variables 416 | if (threadIdx.x == 0) { 417 | k = 0; 418 | could_break = false; 419 | } 420 | 421 | int max_possible_num_candidates = min(query_config.max_num_candidates, 422 | query_config.num_outer_iterations); 423 | 424 | int points_per_block = (dci_inst->num_points + gridDim.x - 1) / gridDim.x; 425 | int num_points_in_block = min( 426 | (int) (dci_inst->num_points - blockIdx.x * points_per_block), 427 | points_per_block); 428 | 429 | if (num_points_in_block > 0) { 430 | 431 | __shared__ int* 
left_pos; 432 | __shared__ int* right_pos; 433 | __shared__ int* cur_pos; 434 | __shared__ float* index_priority; 435 | // init variables 436 | if (threadIdx.x == 0) { 437 | left_pos = new int[num_indices]; 438 | right_pos = new int[num_indices]; 439 | cur_pos = new int[num_indices]; 440 | index_priority = new float[num_indices]; 441 | } 442 | __syncthreads(); 443 | 444 | /* Search index */ 445 | search_index(dci_inst, query_proj, num_indices, left_pos, right_pos, 446 | points_per_block); 447 | 448 | /* Synchronize the threads */ 449 | __syncthreads(); 450 | 451 | /* Populate the closest indices */ 452 | init_index_priority(dci_inst, query_proj, num_indices, left_pos, right_pos, 453 | index_priority, cur_pos, points_per_block); 454 | 455 | /* Synchronize the threads */ 456 | __syncthreads(); 457 | 458 | while (k < num_points_in_block * dci_inst->num_simp_indices * blockDim.x) { 459 | 460 | if (threadIdx.x == 0) { 461 | m = 0; 462 | } 463 | __syncthreads(); 464 | while (m < dci_inst->num_comp_indices) { 465 | // only one thread to get the top 466 | if (threadIdx.x == 0) { 467 | /* Get the top priority and data index in priority queue */ 468 | top_index_priority = DBL_MAX; 469 | top_h = -1; 470 | for (h = 0; h < dci_inst->num_simp_indices; h++) { 471 | if (index_priority[h + m * dci_inst->num_simp_indices] 472 | < top_index_priority) { 473 | top_index_priority = index_priority[h 474 | + m * dci_inst->num_simp_indices]; 475 | top_h = h; 476 | } 477 | } 478 | } 479 | /* Synchronize the threads */ 480 | __syncthreads(); 481 | if (top_h >= 0) { 482 | if (threadIdx.x == 0) { 483 | i = top_h + m * dci_inst->num_simp_indices; 484 | position = cur_pos[i]; 485 | } 486 | __syncthreads(); 487 | int cur_index = position + threadIdx.x; 488 | // check whether the current thread pointing index is within range 489 | if (cur_index >= 0 && cur_index < num_points_in_block) { 490 | int cur_point = dci_inst->indices[cur_index 491 | + i * (dci_inst->num_points) 492 | + blockIdx.x * points_per_block].value; 493 | counts[cur_point + m * (dci_inst->num_points)]++; 494 | if (counts[cur_point + m * (dci_inst->num_points)] 495 | == dci_inst->num_simp_indices) { 496 | // add offset to candidate_dists 497 | if (candidate_dists[cur_point] == -2.0) { 498 | if (query_config.blind) { 499 | candidate_dists[cur_point] = -1.0; 500 | // lock 501 | all_candidates[num_candidates 502 | + blockIdx.x 503 | * max_possible_num_candidates] = 504 | cur_point; 505 | num_candidates++; 506 | } else { 507 | // Compute distance 508 | cur_dist = compute_dist_device( 509 | &(dci_inst->data[cur_point 510 | * dci_inst->dim]), query, 511 | dci_inst->dim); 512 | candidate_dists[cur_point] = cur_dist; 513 | if (num_candidates < num_neighbours) { 514 | d_top_candidates_dist[blockIdx.x 515 | * num_neighbours 516 | + threadIdx.x * num_neighbours 517 | + num_candidates] = cur_dist; 518 | d_top_candidates_index[blockIdx.x 519 | * num_neighbours 520 | + threadIdx.x * num_neighbours 521 | + num_candidates] = cur_point; 522 | if (cur_dist > last_top_candidate_dist) { 523 | last_top_candidate_dist = cur_dist; 524 | last_top_candidate = num_candidates; 525 | } 526 | } else if (cur_dist < last_top_candidate_dist) { 527 | d_top_candidates_dist[blockIdx.x 528 | * num_neighbours 529 | + threadIdx.x * num_neighbours 530 | + last_top_candidate] = cur_dist; 531 | d_top_candidates_index[blockIdx.x 532 | * num_neighbours 533 | + threadIdx.x * num_neighbours 534 | + last_top_candidate] = cur_point; 535 | last_top_candidate_dist = -1.0; 536 | // Assuming num_neighbours 
less than the min(blockDim) = 32 537 | // no need to run on gpu 538 | for (j = 0; j < num_neighbours; j++) { 539 | if (d_top_candidates_dist[blockIdx.x 540 | * num_neighbours 541 | + threadIdx.x * num_neighbours 542 | + j] 543 | > last_top_candidate_dist) { 544 | last_top_candidate_dist = 545 | d_top_candidates_dist[blockIdx.x 546 | * num_neighbours 547 | + threadIdx.x 548 | * num_neighbours 549 | + j]; 550 | last_top_candidate = j; 551 | } 552 | } 553 | } 554 | num_candidates++; 555 | } 556 | } else { 557 | if (!query_config.blind) { 558 | cur_dist = candidate_dists[cur_point]; 559 | } 560 | } 561 | } 562 | } 563 | /* Synchronize the threads */ 564 | __syncthreads(); 565 | // use the first thread to update 566 | if (threadIdx.x == 0) { 567 | cur_pos[i] = dci_next_closest_proj( 568 | &(dci_inst->indices[i * (dci_inst->num_points) 569 | + blockIdx.x * points_per_block]), 570 | &(left_pos[i]), &(right_pos[i]), query_proj[i], 571 | num_points_in_block); 572 | if ((cur_pos[i] < 0) && (cur_pos[i] > -blockDim.x)) { 573 | position = 0; 574 | } else if ((cur_pos[i] 575 | < (num_points_in_block + blockDim.x - 1)) 576 | && (cur_pos[i] >= num_points_in_block)) { 577 | position = num_points_in_block - 1; 578 | } else { 579 | position = cur_pos[i]; 580 | } 581 | if (position >= 0 && position < num_points_in_block) { 582 | index_priority[i] = abs_d( 583 | dci_inst->indices[position 584 | + i * (dci_inst->num_points) 585 | + blockIdx.x * points_per_block].key 586 | - query_proj[i]); 587 | } else { 588 | index_priority[i] = DBL_MAX; 589 | cur_pos[i] = -blockDim.x; 590 | } 591 | } 592 | } 593 | if (threadIdx.x == 0) { 594 | m++; 595 | } 596 | __syncthreads(); 597 | } 598 | if (threadIdx.x == 0) { 599 | if (num_candidates >= num_neighbours) { 600 | if (k + 1 601 | >= query_config.num_outer_iterations 602 | * dci_inst->num_simp_indices 603 | || num_candidates >= query_config.max_num_candidates) { 604 | could_break = true; 605 | break; 606 | } 607 | } 608 | k++; 609 | } 610 | /* Synchronize the threads */ 611 | __syncthreads(); 612 | if (could_break) { 613 | break; 614 | } 615 | } 616 | // free variables 617 | if (threadIdx.x == 0) { 618 | free(left_pos); 619 | free(right_pos); 620 | free(cur_pos); 621 | free(index_priority); 622 | } 623 | } 624 | 625 | 626 | } 627 | 628 | __global__ void mix_sort_kernel(idx_elem* const d_top_candidates, 629 | const int total) { 630 | if (threadIdx.x == 0 && blockIdx.x == 0) { 631 | mix_sort(d_top_candidates, total); 632 | } 633 | } 634 | 635 | __global__ void update_top(const dci* const dci_inst, 636 | double* const index_priority, int const comp_index, int* top_h, 637 | int *mutex) { 638 | double top_h_priority = DBL_MAX; 639 | // Shared top priority array 640 | extern __shared__ double top_priority[]; 641 | // Shared top priority index in data array 642 | extern __shared__ double top_index[]; 643 | 644 | unsigned int tid = threadIdx.x; 645 | unsigned int idx = blockIdx.x * blockDim.x + tid; 646 | top_priority[tid] = DBL_MAX; 647 | top_index[tid] = idx % dci_inst->num_simp_indices; 648 | 649 | while (idx < dci_inst->num_simp_indices) { 650 | double cur_priority = index_priority[comp_index 651 | * dci_inst->num_simp_indices + idx]; 652 | if (top_priority[tid] > cur_priority) { 653 | top_priority[tid] = cur_priority; 654 | top_index[tid] = idx % dci_inst->num_simp_indices; 655 | } 656 | idx += gridDim.x * blockDim.x; 657 | } 658 | __syncthreads(); 659 | idx = blockIdx.x * blockDim.x + tid; 660 | // block-wide reduction 661 | for (unsigned int offset = blockDim.x >> 1; offset 
> 0; offset >>= 1) { 662 | if (tid < offset && idx < dci_inst->num_simp_indices) { 663 | double cur_priority = index_priority[comp_index 664 | * dci_inst->num_simp_indices + tid]; 665 | double compare_priority = index_priority[comp_index 666 | * dci_inst->num_simp_indices + tid + offset]; 667 | if (cur_priority > compare_priority) { 668 | top_priority[tid] = compare_priority; 669 | top_index[tid] = (blockIdx.x * blockDim.x + tid + offset) 670 | % dci_inst->num_simp_indices; 671 | } 672 | } 673 | __syncthreads(); 674 | } 675 | 676 | // finally, thread 0 writes the result 677 | if (threadIdx.x == 0) { 678 | while (atomicCAS(mutex, 0, 1) != 0) 679 | ; //lock 680 | if (top_priority[0] < top_h_priority) { 681 | top_h_priority = top_priority[0]; 682 | *top_h = top_index[0]; 683 | } 684 | atomicExch(mutex, 0); //unlock 685 | } 686 | } 687 | 688 | /* 689 | * Update the top nearest neighbors with distance from the partial results 690 | */ 691 | void get_top_candidates(int* const nearest_neighbours, 692 | float* const nearest_neighbour_dists, 693 | float* const d_top_candidates_dist, int* const d_top_candidates_index, 694 | const int num_neighbours, const int total) { 695 | thrust::sort_by_key(thrust::device, d_top_candidates_dist, 696 | d_top_candidates_dist + total, d_top_candidates_index); 697 | cudaMemcpy(nearest_neighbour_dists, d_top_candidates_dist, 698 | sizeof(float) * num_neighbours, cudaMemcpyDeviceToDevice); 699 | cudaMemcpy(nearest_neighbours, d_top_candidates_index, 700 | sizeof(int) * num_neighbours, cudaMemcpyDeviceToDevice); 701 | } 702 | 703 | __global__ void init_dist(float* const candidate_map, const int total, 704 | const float value) { 705 | int idx, i = blockDim.x * blockIdx.x + threadIdx.x; 706 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 707 | / (blockDim.x * gridDim.x); 708 | int j; 709 | // initialize the counters 710 | for (j = 0; j < chunk_size; j++) { 711 | idx = i * chunk_size + j; 712 | if (idx < total) { 713 | candidate_map[idx] = value; 714 | } 715 | } 716 | } 717 | 718 | __global__ void init_candidates(idx_elem* const candidate_map, const int total, 719 | const float value) { 720 | int idx, i = blockDim.x * blockIdx.x + threadIdx.x; 721 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 722 | / (blockDim.x * gridDim.x); 723 | int j; 724 | // initialize the counters 725 | for (j = 0; j < chunk_size; j++) { 726 | idx = i * chunk_size + j; 727 | if (idx < total) { 728 | candidate_map[idx].key = value; 729 | candidate_map[idx].value = -1; 730 | } 731 | } 732 | } 733 | 734 | __global__ void get_blind_candidate_count(idx_elem* const candidate_map, 735 | int* const d_all_candidates, const int total) { 736 | int idx, i = blockDim.x * blockIdx.x + threadIdx.x; 737 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 738 | / (blockDim.x * gridDim.x); 739 | int j; 740 | // maintain counts as negative numbers for candidate_map.key in order to reuse mix_sort (ascending) 741 | for (j = 0; j < chunk_size; j++) { 742 | idx = i * chunk_size + j; 743 | if (idx < total) { 744 | candidate_map[d_all_candidates[idx]].key--; 745 | candidate_map[d_all_candidates[idx]].value = 746 | d_all_candidates[idx]; 747 | } 748 | } 749 | } 750 | 751 | /* 752 | * Update the top nearest neighbors from the partial results 753 | */ 754 | void get_top_blind_candidates(int* const nearest_neighbours, 755 | int* const d_all_candidates, const int max_possible_num_candidates, 756 | const int total) { 757 | int i; 758 | idx_elem* candidate_map; 759 | cudaMallocManaged((void **) 
(&candidate_map),
760 |             sizeof(idx_elem) * total);
761 |     int block_size = 1024;
762 |     int thread_size = 32;
763 |     init_candidates<<<block_size, thread_size>>>(candidate_map, total, 0);
764 |     // synch all blocks
765 |     cudaDeviceSynchronize();
766 |     get_blind_candidate_count<<<block_size, thread_size>>>(candidate_map, d_all_candidates, total);
767 |     // synch all blocks
768 |     cudaDeviceSynchronize();
769 |     mix_sort_kernel<<<1, 1>>>(candidate_map, total);
770 |     cudaDeviceSynchronize(); // the sort must finish before the host reads candidate_map
771 |     for (i = 0; i < max_possible_num_candidates; i++)
772 |         nearest_neighbours[i] = candidate_map[i].value;
773 |     cudaFree(candidate_map);
774 | }
775 | // If blind querying is used, nearest_neighbours must be of size num_queries * max_possible_num_candidates; otherwise, it must be of size num_queries * num_neighbours
776 | // nearest_neighbour_dists can be NULL when blind querying is used
777 | void dci_query(dci* const dci_inst, const int dim, const int num_queries,
778 |         const float* const query, const int num_neighbours,
779 |         const dci_query_config query_config, int* const nearest_neighbours,
780 |         float* const nearest_neighbour_dists, const int block_size,
781 |         const int thread_size) {
782 | 
783 |     int num_indices = dci_inst->num_comp_indices * dci_inst->num_simp_indices;
784 |     int max_possible_num_candidates = min(query_config.max_num_candidates,
785 |             query_config.num_outer_iterations);
786 | 
787 |     assert(dim == dci_inst->dim);
788 |     assert(num_neighbours > 0);
789 |     assert(num_neighbours <= dci_inst->num_points);
790 | 
791 |     // dummy allocation, as a workaround for a query timeout
792 |     void* dummy;
793 |     cudaMalloc(&dummy, 1);
794 | 
795 |     // calculate query_proj
796 |     int devId = 0;
797 |     float* query_proj;
798 | 
799 |     cudaMallocManaged((void **) (&query_proj),
800 |             sizeof(float) * num_indices * num_queries);
801 | 
802 |     matmul_device(CUBLAS_OP_N, CUBLAS_OP_T, num_queries, num_indices,
803 |             dci_inst->dim, query, dci_inst->proj_vec, query_proj, devId);
804 | 
805 |     // copy query config to device pointer
806 |     dci_query_config* d_query_config;
807 |     cudaMallocManaged((void **) (&d_query_config),
808 |             sizeof(dci_query_config));
809 |     cudaMemcpy(d_query_config, &query_config, sizeof(dci_query_config),
810 |             cudaMemcpyHostToDevice);
811 | 
812 |     // allocate buffers for the raw nearest-neighbour candidates
813 |     int* d_all_candidates;
814 |     cudaMallocManaged((void **) (&d_all_candidates),
815 |             sizeof(int) * max_possible_num_candidates * block_size);
816 | 
817 |     float* d_top_candidates_dist;
818 |     cudaMalloc((void **) (&d_top_candidates_dist),
819 |             sizeof(float) * num_neighbours * block_size * thread_size);
820 |     int* d_top_candidates_index;
821 |     cudaMalloc((void **) (&d_top_candidates_index),
822 |             sizeof(int) * num_neighbours * block_size * thread_size);
823 | 
824 |     int* counts;
825 |     cudaMallocManaged((void **) (&counts),
826 |             sizeof(int) * dci_inst->num_points
827 |                     * dci_inst->num_comp_indices);
828 | 
829 |     float* candidate_dists;
830 |     cudaMallocManaged((void **) (&candidate_dists),
831 |             sizeof(float) * dci_inst->num_points);
832 | 
833 |     for (int j = 0; j < num_queries; j++) {
834 |         // refresh the result holders so no results carry over between queries
835 |         init_dist<<<block_size, thread_size>>>(d_top_candidates_dist,
836 |                 num_neighbours * block_size * thread_size, FLT_MAX);
837 | 
838 |         cudaDeviceSynchronize();
839 |         init_counts<<<block_size, thread_size>>>(dci_inst, counts);
840 |         init_candidate_dists<<<block_size, thread_size>>>(dci_inst,
841 |                 candidate_dists);
842 | 
843 |         cudaDeviceSynchronize();
844 | 
845 |         dci_query_single_point_by_block<<<block_size, thread_size>>>(dci_inst,
846 |                 num_neighbours, &(query[j * dim]),
847 |                 &(query_proj[j * num_indices]), *d_query_config,
848 |                 d_top_candidates_dist, d_top_candidates_index, d_all_candidates,
849 |                 counts, candidate_dists);
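/* Each launched block scans its own contiguous slice of the data set
   (roughly num_points / gridDim.x points), and every (block, thread) pair
   keeps its own num_neighbours best candidates in d_top_candidates_dist /
   d_top_candidates_index; get_top_candidates below then merges these
   partial results with a single device-wide sort. */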
850 | 
851 |         cudaDeviceSynchronize();
852 | 
853 |         // get the final output
854 |         if (!query_config.blind) {
855 |             get_top_candidates(&(nearest_neighbours[j * num_neighbours]),
856 |                     &(nearest_neighbour_dists[j * num_neighbours]),
857 |                     d_top_candidates_dist, d_top_candidates_index,
858 |                     num_neighbours, block_size * num_neighbours * thread_size);
859 |         } else {
860 |             get_top_blind_candidates(
861 |                     &(nearest_neighbours[j * max_possible_num_candidates]),
862 |                     d_all_candidates, max_possible_num_candidates,
863 |                     block_size * max_possible_num_candidates);
864 |         }
865 |     }
866 | 
867 |     // free the allocated memory
868 |     cudaFree(query_proj);
869 |     cudaFree(d_query_config);
870 |     cudaFree(d_all_candidates);
871 |     cudaFree(d_top_candidates_dist);
872 |     cudaFree(d_top_candidates_index);
873 |     cudaFree(counts);
874 |     cudaFree(candidate_dists);
875 | }
876 | 
877 | 
878 | void dci_clear(dci* const dci_inst) {
879 |     if (dci_inst->indices) {
880 |         cudaFree(dci_inst->indices);
881 |         dci_inst->indices = NULL;
882 |     }
883 |     dci_inst->data = NULL;
884 |     dci_inst->num_points = 0;
885 | }
886 | 
887 | void dci_reset(dci* const dci_inst) {
888 |     dci_clear(dci_inst);
889 |     dci_gen_proj_vec(dci_inst->proj_vec, dci_inst->dim,
890 |             dci_inst->num_comp_indices * dci_inst->num_simp_indices);
891 | }
892 | 
893 | void dci_free(const dci* const dci_inst) {
894 |     if (dci_inst->indices) {
895 |         cudaFree(dci_inst->indices);
896 |     }
897 |     cudaFree(dci_inst->proj_vec);
898 | 
899 | }
900 | 
901 | void dci_dump(const dci* const dci_inst) {
902 |     int i, j;
903 |     int num_indices = dci_inst->num_comp_indices * dci_inst->num_simp_indices;
904 |     for (j = 0; j < num_indices; j++) {
905 |         for (i = 0; i < dci_inst->num_points; i++) {
906 |             printf("%f[%d],",
907 |                     dci_inst->indices[i + j * (dci_inst->num_points)].key,
908 |                     dci_inst->indices[i + j * (dci_inst->num_points)].value);
909 |         }
910 |         printf("\n");
911 |     }
912 | }
913 | 
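914 | /*
915 |  * Hedged usage sketch (added for illustration; not part of the original
916 |  * source). It shows the expected host-side calling sequence:
917 |  * dci_gen_proj_vec, then dci_add, then dci_query. The direct struct setup
918 |  * below is an assumption -- in the released package this is normally done
919 |  * through the Python wrapper -- and it assumes rng_parallel_device and
920 |  * GAUSS_RAND are declared in util.h.
921 |  */
922 | #ifdef DCI_USAGE_EXAMPLE
923 | int main(void) {
924 |     const int dim = 64, num_points = 4096, num_queries = 1, num_neighbours = 10;
925 |     const int block_size = 16, thread_size = 32;
926 | 
927 |     dci dci_inst;
928 |     dci_inst.dim = dim;
929 |     dci_inst.num_points = 0;
930 |     dci_inst.num_comp_indices = 2;
931 |     dci_inst.num_simp_indices = 10;
932 |     dci_inst.devID = 0;
933 |     dci_inst.data = NULL;
934 |     dci_inst.indices = NULL;
935 |     int num_indices = dci_inst.num_comp_indices * dci_inst.num_simp_indices;
936 |     cudaMallocManaged((void **) &dci_inst.proj_vec,
937 |             sizeof(float) * dim * num_indices);
938 |     dci_gen_proj_vec(dci_inst.proj_vec, dim, num_indices);
939 | 
940 |     // random Gaussian data and queries in unified memory
941 |     float *data, *query;
942 |     cudaMallocManaged((void **) &data, sizeof(float) * num_points * dim);
943 |     cudaMallocManaged((void **) &query, sizeof(float) * num_queries * dim);
944 |     rng_parallel_device(data, num_points * dim, GAUSS_RAND);
945 |     rng_parallel_device(query, num_queries * dim, GAUSS_RAND);
946 |     cudaDeviceSynchronize();
947 | 
948 |     dci_add(&dci_inst, dim, num_points, data, block_size, thread_size);
949 | 
950 |     dci_query_config config;
951 |     config.blind = false;
952 |     config.num_outer_iterations = 5000;
953 |     config.max_num_candidates = 1000;
954 | 
955 |     int* nearest_neighbours;
956 |     float* nearest_neighbour_dists;
957 |     cudaMallocManaged((void **) &nearest_neighbours,
958 |             sizeof(int) * num_queries * num_neighbours);
959 |     cudaMallocManaged((void **) &nearest_neighbour_dists,
960 |             sizeof(float) * num_queries * num_neighbours);
961 |     dci_query(&dci_inst, dim, num_queries, query, num_neighbours, config,
962 |             nearest_neighbours, nearest_neighbour_dists, block_size,
963 |             thread_size);
964 |     cudaDeviceSynchronize();
965 | 
966 |     for (int i = 0; i < num_neighbours; i++)
967 |         printf("neighbour %d: point %d, dist %f\n", i, nearest_neighbours[i],
968 |                 nearest_neighbour_dists[i]);
969 | 
970 |     dci_free(&dci_inst);
971 |     cudaFree(data); cudaFree(query);
972 |     cudaFree(nearest_neighbours); cudaFree(nearest_neighbour_dists);
973 |     return 0;
974 | }
975 | #endif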
--------------------------------------------------------------------------------
/src/util_kernel.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Code for Fast k-Nearest Neighbour Search via Prioritized DCI
3 |  *
4 |  * This code implements the method described in the Prioritized DCI paper,
5 |  * which can be found at https://arxiv.org/abs/1703.00440
6 |  *
7 |  * This file is a part of the Dynamic Continuous Indexing reference
8 |  * implementation.
9 |  *
10 |  *
11 |  * This Source Code Form is subject to the terms of the Mozilla Public
12 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
13 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
14 |  *
15 |  * Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi
16 |  */
17 | 
18 | #include "util.h"
19 | // Utilities and system includes
20 | #include <assert.h>
21 | #include <stdio.h>
22 | #include <stdlib.h>
23 | 
24 | // generate the random seed
25 | #include <time.h>
26 | 
27 | // CUDA runtime
28 | #include <cuda_runtime.h>
29 | #include <cublas_v2.h>
30 | 
31 | // CUDA random
32 | #include <curand.h>
33 | #include <curand_kernel.h>
34 | #include <curand_normal.h>
35 | 
36 | 
37 | #ifndef min
38 | #define min(a,b) ((a < b) ? a : b)
39 | #endif
40 | #ifndef max
41 | #define max(a,b) ((a > b) ? a : b)
42 | #endif
43 | 
44 | // uses device pointers to save on malloc ops
45 | void matmul_device(const cublasOperation_t op_A, const cublasOperation_t op_B,
46 |         const int M, const int N, const int K, const float* const A, const float* const B, float* const C, int &devID) {
47 |     // initialize the CUDA variables
48 |     cudaDeviceProp deviceProp;
49 | 
50 |     cudaGetDeviceProperties(&deviceProp, devID);
51 |     int block_size = 32; // size 16 has also been used; 32 appears to be faster
52 | 
53 |     // setup execution parameters
54 |     dim3 threads(block_size, block_size);
55 |     dim3 grid(N / threads.x, M / threads.y);
56 | 
57 |     // CUBLAS version 2.0
58 |     const float alpha = 1.0f;
59 |     const float beta = 0.0f;
60 |     cublasHandle_t handle;
61 | 
62 |     cublasCreate(&handle);
63 | 
64 |     int lda, ldb;
65 |     if(op_A == CUBLAS_OP_N) {
66 |         lda = K;
67 |     } else {
68 |         lda = M;
69 |     }
70 |     if(op_B == CUBLAS_OP_N) {
71 |         ldb = N;
72 |     } else {
73 |         ldb = K;
74 |     }
75 | 
76 |     cublasSgemm(handle, op_B, op_A, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N);
77 | 
78 |     // Destroy the handle
79 |     cublasDestroy(handle);
80 | }
81 | 
82 | __global__ void init_curand_state(unsigned int seed, curandState_t* states) {
83 |     int id = blockDim.x * blockIdx.x + threadIdx.x;
84 |     curand_init(seed, id, 0, &states[id]);
85 | }
86 | 
87 | 
88 | // gauss random variables in parallel
89 | __global__ void
90 | gauss_parallel_rng(curandState_t* states, float* vec, const int n) {
91 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
92 |     // Note: assumes num_blocks = num_threads
93 |     int chunk_size = (n + blockDim.x * blockDim.x - 1) / (blockDim.x * blockDim.x);
94 |     int index;
95 |     for(int j = 0; j < chunk_size; ++j) {
96 |         index = i*chunk_size+j;
97 |         if(index < n) {
98 |             vec[i*chunk_size+j] = curand_normal(&states[i]);
99 |         }
100 |     }
101 | }
102 | 
103 | // uniform distribution in [-1, 1] in parallel
104 | __global__ void
105 | uniform_parallel_rng(curandState_t* states, float *vec, const int n) {
106 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
107 |     // Note: assumes num_blocks = num_threads
108 |     int chunk_size = (n + blockDim.x * blockDim.x - 1) / (blockDim.x * blockDim.x);
109 |     int index;
110 |     for(int j = 0; j < chunk_size; ++j) {
111 |         index = i*chunk_size+j;
112 |         if(index < n) {
113 |             vec[i*chunk_size+j] = (curand_uniform(&states[i]) * 2.0) - 1.0;
114 |         }
115 |     }
116 | }
117 | 
118 | // helper function; assumes vec is a device pointer
119 | void rng_parallel_device(float* const vec, const int n, const int rng_type) {
120 |     int num_blocks = 64; // for now using num_blocks blocks, num_blocks threads per block
121 | 
122 |     // curand initialization
123 |     curandState_t* states;
124 |     long long seed = 0;
125 |     for(int i = 0; i < 4; ++i) {
126 |         seed = (seed << 32) | rand();
127 |     }
128 |     cudaMalloc((void**) &states, num_blocks * num_blocks * sizeof(curandState_t));
129 |     init_curand_state<<<num_blocks, num_blocks>>>(seed, states);
130 | 
131 |     // generate random numbers
132 |     if(rng_type == GAUSS_RAND) {
133 |         gauss_parallel_rng<<<num_blocks, num_blocks>>>(states, vec, n);
134 |     } else {
135 |         uniform_parallel_rng<<<num_blocks, num_blocks>>>(states, vec, n);
136 |     }
137 | }
138 | 
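139 | /*
140 |  * Hedged self-test (added for illustration; not part of the original source).
141 |  * matmul_device computes the row-major product C (M x N) = op_A(A) * op_B(B):
142 |  * cuBLAS is column-major, so the call above swaps the operand order in
143 |  * cublasSgemm and the column-major result C^T reads back as the row-major C.
144 |  * With op_A = CUBLAS_OP_N and op_B = CUBLAS_OP_T -- the combination used by
145 |  * dci_add and dci_query -- A is M x K and B is N x K, both row-major.
146 |  */
147 | #ifdef UTIL_SELF_TEST
148 | int main(void) {
149 |     // C (2 x 2) = A (2 x 3) * B^T (3 x 2)
150 |     int devID = 0;
151 |     float *A, *B, *C;
152 |     cudaMallocManaged((void **) &A, sizeof(float) * 2 * 3);
153 |     cudaMallocManaged((void **) &B, sizeof(float) * 2 * 3);
154 |     cudaMallocManaged((void **) &C, sizeof(float) * 2 * 2);
155 |     for (int i = 0; i < 6; i++) {
156 |         A[i] = (float) i; // A = [[0, 1, 2], [3, 4, 5]]
157 |         B[i] = 1.0f;      // B = [[1, 1, 1], [1, 1, 1]]
158 |     }
159 |     matmul_device(CUBLAS_OP_N, CUBLAS_OP_T, 2, 2, 3, A, B, C, devID);
160 |     cudaDeviceSynchronize();
161 |     printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[1], C[2], C[3]); // expect: 3 3 / 12 12
162 |     cudaFree(A); cudaFree(B); cudaFree(C);
163 |     return 0;
164 | }
165 | #endif
166 | 
--------------------------------------------------------------------------------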