├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── dciknn_cuda ├── __init__.py └── core.py ├── example.py ├── include ├── dci.h └── util.h ├── setup.py └── src ├── dci_cuda.cpp ├── dci_cuda_kernel.cu └── util_kernel.cu /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dciknn_cuda.egg-info/ 3 | dciknn_cuda/__pycache__/ 4 | dist/ 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. 
For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. 
Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. 
Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. 
* 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. 
Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 374 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft include/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DCI CUDA 2 | 3 | This is the CUDA GPU implementation + Python interface (using PyTorch) of Dynamic Continuous Indexing (DCI). The paper can be found [here](https://arxiv.org/abs/1512.00442). 4 | 5 | ## Prerequisites 6 | * NVCC version >= 9.2 (Note: this should match the CUDA version that PyTorch is built with) 7 | * PyTorch >= 1.4.0 8 | 9 | ## Setup 10 | 11 | The library can be compiled and installed using Python setuptools. 12 | 13 | **Note:** If your Python interpreter is named differently, e.g. `python3`, replace all occurrences of `python` with `python3` in the commands below. 14 | 15 | If your Python installation is local (e.g. part of Anaconda), run the following command from the root directory of the code base to compile and install as a Python package: 16 | ```bash 17 | python setup.py install 18 | ``` 19 | 20 | Otherwise, if you have sudo access, run the following command instead: 21 | ```bash 22 | sudo python setup.py install 23 | ``` 24 | 25 | If you do not have sudo access, run the following command instead: 26 | ```bash 27 | python setup.py install --user 28 | ``` 29 | 30 | 31 | ## Experimental PyPI install 32 | Simply run: 33 | ```bash 34 | pip install -i https://test.pypi.org/simple/ dciknn-cuda==0.1.11 35 | ``` 36 | If you don't have internet access (e.g. inside a submitted cluster job), download the package before submitting the job: 37 | ```bash 38 | pip download -i https://test.pypi.org/simple/ dciknn-cuda==0.1.11 39 | ``` 40 | Then run the following inside the job to install offline: 41 | ```bash 42 | pip install dciknn_cuda-0.1.11.tar.gz 43 | ``` 44 | 45 | 46 | ## Getting Started 47 | 48 | Example code using the PyTorch interface is provided. In the root directory of the code base, execute the following command: 49 | 50 | ```bash 51 | python example.py 52 | ```
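For reference, a minimal end-to-end run looks like this (a sketch distilled from `example.py`; the hyperparameter values are illustrative):
```python
import torch
from dciknn_cuda import DCI

data = torch.randn(3000, 100).cuda()   # pool of 3000 points in 100 dimensions
query = torch.randn(500, 100).cuda()   # 500 query points

# dim, num_comp_indices, num_simp_indices, block_size, thread_size, device
dci_db = DCI(100, 2, 10, 100, 10, device=0)
dci_db.add(data)
indices, dists = dci_db.query(query, num_neighbours=10, num_outer_iterations=5000)
dci_db.free()
```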
53 | 54 | ### Multi-GPU example 55 | The multi-GPU version of DCI exposes the same API. The following is a simple example that uses four GPUs to compute nearest neighbours: 56 | ```python 57 | # Multi-GPU version of DCI 58 | dci_db = MDCI(dim, num_comp_indices, num_simp_indices, block_size, thread_size, devices=[0, 1, 2, 3]) # We specify the GPUs to be used by the DCI instance with `devices`. Set to list(range(torch.cuda.device_count())) to use all available GPUs 59 | 60 | dci_db.add(data) # We add the pool of data 61 | indices, dists = dci_db.query(query, num_neighbours, num_outer_iterations) # We run our desired query 62 | ``` 63 | 64 | 65 | ## Directory Layout 66 | * `src`, all of the `*.cpp` and `*.cu` files 67 | * `include`, the header files 68 | * `dciknn_cuda`, the Python interface 69 | 70 | ## Important Files 71 | * `src/dci_cuda.cpp`: defines the PyTorch extension functions 72 | * `src/util_kernel.cu`: matrix multiplication and random distribution generation functions 73 | * `src/dci_cuda_kernel.cu`: main components of prioritized DCI 74 | * `dciknn_cuda/core.py`: defines the Python interface 75 | 76 | ## Reference 77 | 78 | Please cite the following paper if you found this library useful in your research: 79 | 80 | ### [Fast _k_-Nearest Neighbour Search via Dynamic Continuous Indexing](https://arxiv.org/abs/1512.00442) 81 | [Ke Li](https://people.eecs.berkeley.edu/~ke.li/), [Jitendra Malik](https://people.eecs.berkeley.edu/~malik/)\ 82 | *International Conference on Machine Learning (ICML)*, 2016 83 | 84 | ``` 85 | @inproceedings{li2016fast, 86 | title={Fast k-nearest neighbour search via {Dynamic Continuous Indexing}}, 87 | author={Li, Ke and Malik, Jitendra}, 88 | booktitle={International Conference on Machine Learning}, 89 | pages={671--679}, 90 | year={2016} 91 | } 92 | ``` 93 | -------------------------------------------------------------------------------- /dciknn_cuda/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | 4 | This code implements the method described in the Prioritized DCI paper, 5 | which can be found at https://arxiv.org/abs/1703.00440 6 | 7 | This file is a part of the Dynamic Continuous Indexing reference 8 | implementation. 9 | 10 | 11 | This Source Code Form is subject to the terms of the Mozilla Public 12 | License, v. 2.0. If a copy of the MPL was not distributed with this 13 | file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | 15 | Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | ''' 17 | 18 | __version__ = "0.1.11" 19 | __author__ = 'Ke Li, Shichong Peng, Mehran Aghabozorgi' 20 | __credits__ = 'APEX Lab' 21 | 22 | from .core import DCI, MDCI 23 | --------------------------------------------------------------------------------
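Before reading `core.py` below, note the return convention: `query` on both `DCI` and `MDCI` returns two tensors of shape `num_queries x num_neighbours`, neighbour indices first and the corresponding distances second. The indices come back as a float tensor (they are concatenated with the distances on the C++ side), so cast before indexing; a small illustrative sketch:
```python
indices, dists = dci_db.query(query, num_neighbours=10)
neighbours_of_first_query = data[indices[0].long()]  # the 10 retrieved points
```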
/dciknn_cuda/core.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | 4 | This code implements the method described in the Prioritized DCI paper, 5 | which can be found at https://arxiv.org/abs/1703.00440 6 | 7 | This file is a part of the Dynamic Continuous Indexing reference 8 | implementation. 9 | 10 | 11 | This Source Code Form is subject to the terms of the Mozilla Public 12 | License, v. 2.0. If a copy of the MPL was not distributed with this 13 | file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | 15 | Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | ''' 17 | 18 | import torch 19 | from _dci_cuda import _dci_new, _dci_add, _dci_query, _dci_clear, _dci_reset, _dci_free, _dci_multi_query 20 | 21 | 22 | from math import sqrt 23 | 24 | 25 | class DCI(object): 26 | 27 | def __init__(self, dim, num_comp_indices=2, num_simp_indices=7, bs=100, ts=10, device=0): 28 | 29 | if not torch.cuda.is_available(): 30 | raise RuntimeError("DCI CUDA version requires GPU access, please check CUDA driver.") 31 | 32 | self._dim = dim 33 | self._num_comp_indices = num_comp_indices 34 | self._num_simp_indices = num_simp_indices 35 | self._dci_inst = _dci_new(dim, num_comp_indices, num_simp_indices, device) 36 | self._array = None 37 | self._block_size = bs 38 | self._thread_size = ts 39 | self.num_points = 0 40 | 41 | @property 42 | def dim(self): 43 | return self._dim 44 | 45 | @property 46 | def num_comp_indices(self): 47 | return self._num_comp_indices 48 | 49 | @property 50 | def num_simp_indices(self): 51 | return self._num_simp_indices 52 | 53 | def _ensure_positive_integer(self, x): 54 | if not isinstance(x, int): 55 | raise TypeError("number must be an integer") 56 | elif x <= 0: 57 | raise ValueError("number must be positive") 58 | 59 | def _check_data(self, arr): 60 | if arr.shape[1] != self.dim: 61 | raise ValueError("mismatch between tensor dimension (%d) and the declared dimension of this DCI instance (%d)" % (arr.shape[1], self.dim)) 62 | if arr.dtype != torch.float: 63 | raise TypeError("tensor must consist of single-precision floats (torch.float32)") 64 | if not arr.is_contiguous(): 65 | raise ValueError("the memory layout of tensor must be in row-major (C-order)") 66 | if not arr.is_cuda: 67 | raise TypeError("tensor must be a cuda tensor") 68 | 69 | def add(self, data): 70 | if self.num_points > 0: 71 | raise RuntimeError("DCI class does not support insertion of more than one tensor.
Must combine all tensors into one tensor before inserting") 72 | self._check_data(data) 73 | self.num_points = data.shape[0] 74 | _dci_add(self._dci_inst, self._dim, self.num_points, data.flatten(), self._block_size, self._thread_size) 75 | self._array = data 76 | 77 | # query is num_queries x dim, returns num_queries x num_neighbours 78 | def query(self, query, num_neighbours=-1, num_outer_iterations=5000, blind=False): 79 | if len(query.shape) < 2: 80 | _query = query.unsqueeze(0) 81 | else: 82 | _query = query 83 | self._check_data(_query) 84 | if num_neighbours < 0: 85 | num_neighbours = self.num_points 86 | self._ensure_positive_integer(num_neighbours) 87 | max_num_candidates = 10 * num_neighbours 88 | # num_queries x num_neighbours 89 | 90 | _query_result = _dci_query(self._dci_inst, self._dim, _query.shape[0], _query.flatten(), num_neighbours, blind, num_outer_iterations, max_num_candidates, self._block_size, self._thread_size) 91 | half = _query_result.shape[0] // 2 92 | return _query_result[:half].reshape(_query.shape[0], -1), _query_result[half:].reshape(_query.shape[0], -1) 93 | 94 | def clear(self): 95 | _dci_clear(self._dci_inst) 96 | self.num_points = 0 97 | self._array = None 98 | 99 | def reset(self): 100 | _dci_reset(self._dci_inst) 101 | self.num_points = 0 102 | self._array = None 103 | 104 | def free(self): 105 | _dci_free(self._dci_inst) 106 | self.num_points = 0 107 | self._array = None 108 | 109 | 110 | class MDCI(object): 111 | def __init__(self, dim, num_comp_indices=2, num_simp_indices=7, bs=100, ts=10, devices=[0]): 112 | 113 | 114 | self.devices = devices 115 | self.num_devices = len(devices) 116 | self.dcis = [DCI(dim, num_comp_indices, num_simp_indices, bs, ts, dev) for dev in devices] 117 | self.data_per_device = 0 118 | 119 | 120 | def add(self, data): 121 | 122 | self.data_per_device = (data.shape[0] + self.num_devices - 1) // self.num_devices # ceil division so every point lands on some device 123 | for dev_ind in range(self.num_devices): 124 | device = self.devices[dev_ind] 125 | cur_data = data[dev_ind * self.data_per_device: dev_ind * self.data_per_device + self.data_per_device].to(device) 126 | self.dcis[dev_ind].add(cur_data) 127 | 128 | def query(self, query, num_neighbours=-1, num_outer_iterations=5000, blind=False): 129 | dists = [] 130 | nns = [] 131 | if num_neighbours <= 0: 132 | raise RuntimeError('num_neighbours must be positive') 133 | 134 | if len(query.shape) < 2: 135 | _query = query.unsqueeze(0) 136 | else: 137 | _query = query 138 | _query = _query.detach().clone() 139 | 140 | 141 | max_num_candidates = 10 * num_neighbours 142 | 143 | queries = [_query.to(self.devices[dev_ind]).flatten() for dev_ind in range(self.num_devices)] 144 | res = _dci_multi_query([dc._dci_inst for dc in self.dcis], self.dcis[0]._dim, _query.shape[0], queries, num_neighbours, blind, num_outer_iterations, max_num_candidates, self.dcis[0]._block_size, self.dcis[0]._thread_size) 145 | 146 | for ind, cur_res in enumerate(res): 147 | half = cur_res.shape[0] // 2 148 | cur_nns, cur_dist = cur_res[:half].reshape(_query.shape[0], -1), cur_res[half:].reshape(_query.shape[0], -1) 149 | cur_nns = cur_nns + self.data_per_device * ind # shift shard-local indices into the global numbering 150 | dists.append(cur_dist.detach().clone().to(self.devices[0])) 151 | nns.append(cur_nns.detach().clone().to(self.devices[0])) 152 | 153 | merged_dists = torch.cat(dists, dim=1) 154 | merged_nns = torch.cat(nns, dim=1) 155 | _, sort_indices = torch.sort(merged_dists, dim=1) 156 | sort_indices = sort_indices[:,
:num_neighbours] 157 | return torch.gather(merged_nns, 1, sort_indices), torch.gather(merged_dists, 1, sort_indices) 158 | 159 | def clear(self): 160 | for dci in self.dcis: 161 | dci.clear() 162 | 163 | def reset(self): 164 | for dci in self.dcis: 165 | dci.reset() 166 | 167 | def free(self): 168 | for dci in self.dcis: 169 | dci.free() --------------------------------------------------------------------------------
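As a worked illustration of the merge step in `MDCI.query` above: each shard returns neighbour indices numbered within its own slice of the data, so they are shifted by `data_per_device * ind` into the global numbering before the per-shard results are concatenated and re-sorted by distance. The same logic on the CPU, with illustrative values:
```python
import torch

data_per_device = 5  # two shards of 5 points each; 2 neighbours per shard
local_nns = [torch.tensor([[1, 3]]), torch.tensor([[0, 4]])]
local_dists = [torch.tensor([[0.2, 0.9]]), torch.tensor([[0.5, 0.7]])]

# shift shard-local indices into the global numbering: shard 1's point 0 is global point 5
global_nns = [nns + data_per_device * ind for ind, nns in enumerate(local_nns)]

merged_nns = torch.cat(global_nns, dim=1)     # tensor([[1, 3, 5, 9]])
merged_dists = torch.cat(local_dists, dim=1)  # tensor([[0.2, 0.9, 0.5, 0.7]])
_, order = torch.sort(merged_dists, dim=1)
order = order[:, :2]                          # keep the 2 nearest overall
print(torch.gather(merged_nns, 1, order))     # tensor([[1, 5]])
print(torch.gather(merged_dists, 1, order))   # tensor([[0.2000, 0.5000]])
```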
/example.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | 4 | This code implements the method described in the Prioritized DCI paper, 5 | which can be found at https://arxiv.org/abs/1703.00440 6 | 7 | This file is a part of the Dynamic Continuous Indexing reference 8 | implementation. 9 | 10 | 11 | This Source Code Form is subject to the terms of the Mozilla Public 12 | License, v. 2.0. If a copy of the MPL was not distributed with this 13 | file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | 15 | Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | ''' 17 | from dciknn_cuda import DCI, MDCI 18 | import torch 19 | import random 20 | import datetime 21 | 22 | random.seed(10) 23 | torch.manual_seed(0) 24 | 25 | def gen_data(ambient_dim, intrinsic_dim, num_points): 26 | latent_data = torch.randn((num_points, intrinsic_dim)) 27 | transformation = torch.randn((intrinsic_dim, ambient_dim)) 28 | data = torch.matmul(latent_data, transformation) 29 | return data # num_points x ambient_dim 30 | 31 | 32 | def main(): 33 | assert torch.cuda.is_available() 34 | device = torch.device('cuda:1') 35 | 36 | ############################################################################################################################################# 37 | # # 38 | # Data Generation Hyperparameters # 39 | # # 40 | ############################################################################################################################################# 41 | dim = 100 42 | num_pts = 3000 43 | num_queries = 500 44 | # dim = 80 45 | # num_pts = 1000 46 | # num_queries = 100 47 | 48 | intrinsic_dim = 400 49 | data_and_queries = gen_data(dim, intrinsic_dim, num_pts + num_queries) 50 | 51 | data = data_and_queries[:num_pts, :].detach().clone().to(device) 52 | query = data_and_queries[num_pts:, :].detach().clone().to(device) 53 | 54 | ############################################################################################################################################# 55 | # # 56 | # Problem Hyperparameter # 57 | # # 58 | ############################################################################################################################################# 59 | num_neighbours = 10 # The k in k-NN 60 | 61 | ############################################################################################################################################# 62 | # # 63 | # DCI Hyperparameters # 64 | # # 65 | ############################################################################################################################################# 66 | block_size = 100 67 | thread_size = 10 68 | num_comp_indices = 2 69 | num_simp_indices = 10 70 | num_outer_iterations = 5000 71 | 72 | # initialize and time the multi-GPU DCI instance (two runs) 73 | for i in range(2): 74 | a = datetime.datetime.now() 75 | dci_db = MDCI(dim, num_comp_indices, num_simp_indices, block_size, thread_size, devices=[0, 1]) 76 | 77 | dci_db.add(data) 78 | # Query 79 | indices, dists = dci_db.query(query, num_neighbours, num_outer_iterations) 80 | print("Nearest Indices:", indices) 81 | print("Indices Distances:", dists) 82 | dci_db.clear() 83 | b = datetime.datetime.now() 84 | print(b-a) 85 | 86 | data = data_and_queries[:num_pts, :].detach().clone().to(0) 87 | query = data_and_queries[num_pts:, :].detach().clone().to(0) 88 | a = datetime.datetime.now() 89 | dci_db = DCI(dim, num_comp_indices, num_simp_indices, block_size, thread_size, device=0) 90 | 91 | dci_db.add(data) 92 | # Query 93 | indices, dists = dci_db.query(query, num_neighbours, num_outer_iterations) 94 | print("Nearest Indices:", indices) 95 | print("Indices Distances:", dists) 96 | dci_db.clear() 97 | b = datetime.datetime.now() 98 | print(b-a) 99 | 100 | if __name__ == '__main__': 101 | main() 102 | --------------------------------------------------------------------------------
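To sanity-check the approximate results that `example.py` prints, the exact neighbours can be computed by brute force and compared; a minimal sketch (not part of the library):
```python
import torch

def exact_knn(data, query, k):
    d = torch.cdist(query, data)  # num_queries x num_points pairwise Euclidean distances
    dists, indices = torch.topk(d, k, dim=1, largest=False)
    return indices, dists

# DCI returns indices as a float tensor, so cast with .long() before comparing;
# comparing the index sets per query is more robust than elementwise equality
```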
/include/dci.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | * 4 | * This code implements the method described in the Prioritized DCI paper, 5 | * which can be found at https://arxiv.org/abs/1703.00440 6 | * 7 | * This file is a part of the Dynamic Continuous Indexing reference 8 | * implementation. 9 | * 10 | * 11 | * This Source Code Form is subject to the terms of the Mozilla Public 12 | * License, v. 2.0. If a copy of the MPL was not distributed with this 13 | * file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | * 15 | * Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | */ 17 | 18 | #ifndef DCI_H 19 | #define DCI_H 20 | 21 | #include <stdbool.h> 22 | #include <stdio.h> 23 | 24 | #include <cuda_runtime.h> 25 | 26 | 27 | typedef struct idx_elem { 28 | float key; // value of the projection of point onto vector 29 | int value; // index of the point 30 | } idx_elem; 31 | 32 | // sorting alg we are using 33 | __device__ 34 | void mix_sort(idx_elem arr[], int n); 35 | 36 | float compute_dist(const float* const vec1, const float* const vec2, 37 | const int dim); 38 | 39 | typedef struct dci { 40 | int dim; // (Ambient) dimensionality of data 41 | int num_comp_indices; // Number of composite indices 42 | int num_simp_indices; // Number of simple indices in each composite index 43 | int num_points; 44 | idx_elem* indices; // Assuming row-major layout, matrix of size required_num_points x (num_comp_indices*num_simp_indices) 45 | float* proj_vec; // Assuming row-major layout, matrix of size dim x (num_comp_indices*num_simp_indices) 46 | float* data_proj; // Device copy of data_proj 47 | float* data; 48 | float* d_data; 49 | int devID; // To initialize CUDA's matmul, set to 0 50 | } dci; 51 | 52 | typedef struct dci_query_config { 53 | bool blind; 54 | int num_outer_iterations; 55 | int max_num_candidates; 56 | } dci_query_config; 57 | 58 | void dci_gen_proj_vec(float* proj_vec, const int dim, 59 | const int num_indices); 60 | 61 | void dci_init(dci* const dci_inst, const int dim, const int num_comp_indices, 62 | const int num_simp_indices, const int devId); 63 | 64 | __device__ 65 | void insertion_sort(idx_elem arr[], int n); 66 | 67 | // Note: the data itself is not kept in the index and must be kept in-place 68 | void dci_add(dci* const dci_inst, const int dim, const int num_points, 69 | float* const data, const int block_size, const int thread_size); 70 | 71 | void dci_query(dci* const dci_inst, const int dim, const int num_queries, 72 | const float* const query, const int num_neighbours, 73 | const dci_query_config query_config, int* const nearest_neighbours, 74 | float* const nearest_neighbour_dists, const int block_size, 75 | const int thread_size); 76 | 77 | void dci_clear(dci* const dci_inst); 78 | 79 | // Clear indices and reset the projection directions 80 | void dci_reset(dci* const dci_inst); 81 | 82 | void dci_free(const dci* const dci_inst); 83 | 84 | void dci_dump(const dci* const dci_inst); 85 | 86 | #endif // DCI_H 87 | -------------------------------------------------------------------------------- /include/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | * 4 | * This code implements the method described in the Prioritized DCI paper, 5 | * which can be found at https://arxiv.org/abs/1703.00440 6 | * 7 | * This file is a part of the Dynamic Continuous Indexing reference 8 | * implementation. 9 | * 10 | * 11 | * This Source Code Form is subject to the terms of the Mozilla Public 12 | * License, v. 2.0. If a copy of the MPL was not distributed with this 13 | * file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | * 15 | * Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | */ 17 | 18 | // CUDA runtime 19 | #include <cuda_runtime.h> 20 | #include <cublas_v2.h> 21 | 22 | // CUDA random 23 | #include <curand.h> 24 | #include <curand_kernel.h> 25 | #include <math.h> 26 | 27 | #ifndef UTIL_H 28 | #define UTIL_H 29 | 30 | #define GAUSS_RAND 0 31 | #define UNIFORM_RAND 1 32 | 33 | typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes 34 | { 35 | unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC; 36 | } sMatrixSize; 37 | 38 | // put in device pointers. Saves on memcpy operations 39 | void matmul_device(const cublasOperation_t op_A, const cublasOperation_t op_B, 40 | const int M, const int N, const int K, const float* const A, const float* const B, float* const C, int &devID); 41 | 42 | // put in device pointers. Saves on memcpy operations 43 | void rng_parallel_device(float* const vec, const int n, const int rng_type); 44 | 45 | __global__ void init_curand_state(unsigned int seed, curandState_t* states); 46 | 47 | __global__ void gauss_parallel_rng(curandState_t* states, float *vec, const int n); 48 | 49 | __global__ void uniform_parallel_rng(curandState_t* states, float *vec, const int n); 50 | 51 | #endif // UTIL_H 52 | --------------------------------------------------------------------------------
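`matmul_device` declared above is used by `dci_add` (in `src/dci_cuda_kernel.cu`) to project every data point onto every index direction in a single GEMM. Conceptually it computes the following, sketched here with PyTorch (the shapes mirror the `matmul_device(CUBLAS_OP_N, CUBLAS_OP_T, num_indices, num_points, dim, ...)` call; names are illustrative):
```python
import torch

num_comp_indices, num_simp_indices = 2, 7
num_indices = num_comp_indices * num_simp_indices
dim, num_points = 100, 3000

proj_vec = torch.randn(num_indices, dim)  # one row per index direction
data = torch.randn(num_points, dim)       # row-major data matrix

# one projection value per (index, point) pair, i.e. proj_vec @ data^T
data_proj = proj_vec @ data.T             # num_indices x num_points
```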
/setup.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | 4 | This code implements the method described in the Prioritized DCI paper, 5 | which can be found at https://arxiv.org/abs/1703.00440 6 | 7 | This file is a part of the Dynamic Continuous Indexing reference 8 | implementation. 9 | 10 | 11 | This Source Code Form is subject to the terms of the Mozilla Public 12 | License, v. 2.0. If a copy of the MPL was not distributed with this 13 | file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | 15 | Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | ''' 17 | from setuptools import setup 18 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension, include_paths 19 | import os 20 | import sys 21 | 22 | if sys.version_info[0] < 3: 23 | with open('README.md') as f: 24 | long_description = f.read() 25 | else: 26 | with open('README.md', encoding='utf-8') as f: 27 | long_description = f.read() 28 | 29 | setup( 30 | name='dciknn_cuda', 31 | packages=['dciknn_cuda'], 32 | version='0.1.11', 33 | long_description=long_description, 34 | long_description_content_type='text/markdown', 35 | description='DCI CUDA for fast k nearest neighbour finding', 36 | url='https://github.com/niopeng/dciknn_cuda', 37 | author='Ke Li, Shichong Peng, Mehran Aghabozorgi', 38 | author_email='keli@sfu.ca', 39 | license='Mozilla Public License Version 2.0', 40 | install_requires=['torch>=1.4.0'], 41 | include_dirs=include_paths(), 42 | ext_modules=[ 43 | CUDAExtension('_dci_cuda', [ 44 | './src/dci_cuda.cpp', 45 | './src/dci_cuda_kernel.cu', 46 | './src/util_kernel.cu', 47 | ], include_dirs=[ 48 | os.path.abspath(os.path.join(os.path.dirname(__file__), 'include')), 49 | ] 50 | ) 51 | ], 52 | cmdclass={ 53 | 'build_ext': BuildExtension 54 | }) 55 | --------------------------------------------------------------------------------
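Once `python setup.py install` has run, the built extension can be smoke-tested before using the high-level API; a quick check (assuming the build succeeded):
```python
import torch        # import torch first so the extension's libtorch symbols resolve
import _dci_cuda    # the CUDAExtension module built above
from dciknn_cuda import DCI, MDCI

print(torch.version.cuda)  # should match the CUDA version of the nvcc used to build
```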
/src/dci_cuda.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | * 4 | * This code implements the method described in the Prioritized DCI paper, 5 | * which can be found at https://arxiv.org/abs/1703.00440 6 | * 7 | * This file is a part of the Dynamic Continuous Indexing reference 8 | * implementation. 9 | * 10 | * 11 | * This Source Code Form is subject to the terms of the Mozilla Public 12 | * License, v. 2.0. If a copy of the MPL was not distributed with this 13 | * file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | * 15 | * Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | */ 17 | 18 | #include <torch/extension.h> 19 | #include <Python.h> 20 | #include "dci.h" 21 | #include <pybind11/pybind11.h> 22 | #include <pybind11/stl.h> 23 | #include <c10/cuda/CUDAGuard.h> 24 | #include <future> 25 | #include <vector> 26 | 27 | 28 | typedef struct py_dci { 29 | dci dci_inst; 30 | PyObject *py_array; 31 | } py_dci; 32 | 33 | namespace py = pybind11; 34 | 35 | static void py_dci_free_wrap(PyObject *py_dci_inst_wrapper) { 36 | 37 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_dci_inst_wrapper, "py_dci_inst"); 38 | const at::cuda::OptionalCUDAGuard device_guard(py_dci_inst->dci_inst.devID); 39 | 40 | if (py_dci_inst->py_array) { 41 | Py_DECREF(py_dci_inst->py_array); 42 | } 43 | 44 | dci_free(&(py_dci_inst->dci_inst)); 45 | cudaFree(py_dci_inst); 46 | } 47 | 48 | static void py_tensor_free(PyObject *py_tensor_wrapper) { 49 | torch::Tensor *py_tensor = (torch::Tensor *)PyCapsule_GetPointer(py_tensor_wrapper, "py_tensor"); 50 | const at::cuda::OptionalCUDAGuard device_guard(device_of(*py_tensor)); 51 | cudaFree(py_tensor); 52 | } 53 | 54 | py::handle py_dci_new(const int dim, const int num_comp_indices, 55 | const int num_simp_indices, const int deviceId) { 56 | const at::cuda::OptionalCUDAGuard device_guard(deviceId); 57 | py_dci *py_dci_inst; 58 | cudaMallocManaged((void **) &py_dci_inst, sizeof(py_dci)); 59 | 60 | // initialize DCI instance 61 | dci_init(&(py_dci_inst->dci_inst), dim, num_comp_indices, num_simp_indices, deviceId); 62 | 63 | // Returns new reference 64 | PyObject *py_dci_inst_wrapper = PyCapsule_New(py_dci_inst, "py_dci_inst", py_dci_free_wrap); 65 | return py_dci_inst_wrapper; 66 | } 67 | 68 | void py_dci_add(py::handle py_dci_inst_wrapper, const int dim, const int num_points, 69 | torch::Tensor py_data, const int block_size, const int thread_size) { 70 | const at::cuda::OptionalCUDAGuard device_guard(device_of(py_data)); 71 | 72 | PyObject *py_obj = py_dci_inst_wrapper.ptr(); 73 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_obj, "py_dci_inst"); 74 | float* data = (float *)py_data.data_ptr(); 75 | 76 | // add data to DCI instance 77 | dci_add(&(py_dci_inst->dci_inst), dim, num_points, data, block_size, thread_size); 78 | 79 | PyObject *py_tensor_wrapper = PyCapsule_New(&py_data, "py_tensor", py_tensor_free); 80 | py_dci_inst->py_array = py_tensor_wrapper; 81 | Py_INCREF(py_tensor_wrapper); 82 | }
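/* Layout note for py_dci_query below: the result is a single flattened tensor of length 2 * num_queries * num_neighbours. The first half holds the neighbour indices (cast to float so they can be concatenated), the second half holds the corresponding distances; dciknn_cuda/core.py splits the two halves apart. */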
83 | 84 | torch::Tensor py_dci_query(py::handle py_dci_inst_wrapper, const int dim, const int num_queries, 85 | torch::Tensor py_query, const int num_neighbours, const bool blind, const int num_outer_iterations, 86 | const int max_num_candidates, const int block_size, 87 | const int thread_size) { 88 | const at::cuda::OptionalCUDAGuard device_guard(device_of(py_query)); 89 | 90 | PyObject *py_obj = py_dci_inst_wrapper.ptr(); 91 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_obj, "py_dci_inst"); 92 | 93 | // Assuming row-major layout, py_query->data is N x D, where N is the number of queries and D is the dimensionality 94 | float* query = (float *)py_query.data_ptr(); 95 | 96 | dci_query_config query_config = {blind, num_outer_iterations, max_num_candidates}; 97 | int* final_outputs; 98 | float* final_distances; 99 | const int output_size = num_neighbours * num_queries; 100 | cudaMalloc((void **) &(final_outputs), sizeof(int) * output_size); 101 | cudaMalloc((void **) &(final_distances), sizeof(float) * output_size); 102 | 103 | // query using DCI 104 | dci_query(&(py_dci_inst->dci_inst), dim, num_queries, query, num_neighbours, 105 | query_config, final_outputs, final_distances, block_size, thread_size); 106 | 107 | auto options = torch::TensorOptions().device(torch::kCUDA); 108 | auto new_options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA); 109 | torch::Tensor final_outputs_array = torch::from_blob(final_outputs, {output_size}, new_options); 110 | // convert to float tensor to concatenate with the computed distances 111 | torch::Tensor final = final_outputs_array.to(torch::kFloat32); 112 | 113 | torch::Tensor final_distances_array = torch::from_blob(final_distances, {output_size}, options); 114 | 115 | torch::Tensor final_result = torch::cat({ final, final_distances_array }, 0); 116 | 117 | return final_result; 118 | } 119 | 120 | std::vector<torch::Tensor> py_dci_multi_query(std::vector<py::handle> py_dci_inst_wrapper, const int dim, const int num_queries, 121 | std::vector<torch::Tensor> py_query, const int num_neighbours, const bool blind, const int num_outer_iterations, 122 | const int max_num_candidates, const int block_size, 123 | const int thread_size) { 124 | std::vector<torch::Tensor> results; 125 | std::vector<std::future<torch::Tensor>> calcs; 126 | for (unsigned int i = 0; i < py_query.size(); i++) { 127 | calcs.push_back(std::async(py_dci_query, py_dci_inst_wrapper[i], dim, num_queries, 128 | py_query[i], num_neighbours, blind, num_outer_iterations, max_num_candidates, block_size, thread_size)); 129 | } 130 | for (unsigned int i = 0; i < py_query.size(); i++) { 131 | results.push_back(calcs[i].get()); 132 | } 133 | return results; 134 | } 135 | 136 | void py_dci_clear(py::handle py_dci_inst_wrapper) { 137 | PyObject *py_obj = py_dci_inst_wrapper.ptr(); 138 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_obj, "py_dci_inst"); 139 | const at::cuda::OptionalCUDAGuard device_guard(py_dci_inst->dci_inst.devID); 140 | 141 | if (py_dci_inst->py_array) { 142 | Py_DECREF(py_dci_inst->py_array); 143 | } 144 | 145 | dci_clear(&(py_dci_inst->dci_inst)); 146 | py_dci_inst->py_array = NULL; 147 | } 148 | 149 | void py_dci_reset(py::handle py_dci_inst_wrapper) { 150 | PyObject *py_obj = py_dci_inst_wrapper.ptr(); 151 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_obj, "py_dci_inst"); 152 | const at::cuda::OptionalCUDAGuard device_guard(py_dci_inst->dci_inst.devID); 153 | 154 | if (py_dci_inst->py_array) { 155 | Py_DECREF(py_dci_inst->py_array); 156 | } 157 | 158 | dci_reset(&(py_dci_inst->dci_inst)); 159 | py_dci_inst->py_array = NULL; 160 | } 161 | 162 | void py_dci_free(py::handle py_dci_inst_wrapper) { 163 | PyObject *py_obj = py_dci_inst_wrapper.ptr(); 164 | py_dci *py_dci_inst = (py_dci *)PyCapsule_GetPointer(py_obj, "py_dci_inst"); 165 | const at::cuda::OptionalCUDAGuard device_guard(py_dci_inst->dci_inst.devID); 166 | 167 | if (py_dci_inst->py_array) { 168 | Py_DECREF(py_dci_inst->py_array); 169 | } 170 | 171 | dci_free(&(py_dci_inst->dci_inst)); 172 | cudaFree(py_dci_inst); 173 | } 174 | 175 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 176 | m.def("_dci_new", &py_dci_new, "Create new DCI instance. (CUDA)"); 177 | m.def("_dci_add", &py_dci_add, "Add data. (CUDA)"); 178 | m.def("_dci_query", &py_dci_query, "Search for nearest neighbours. (CUDA)"); 179 | m.def("_dci_clear", &py_dci_clear, "Clear DCI. (CUDA)"); 180 | m.def("_dci_reset", &py_dci_reset, "Reset DCI. (CUDA)"); 181 | m.def("_dci_free", &py_dci_free, "Free DCI. (CUDA)"); 182 | m.def("_dci_multi_query", &py_dci_multi_query, "Search for nearest neighbours with multiple GPUs. (CUDA)"); 183 | } 184 | --------------------------------------------------------------------------------
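The functions registered in `PYBIND11_MODULE` above are consumed by `dciknn_cuda/core.py`; driving them directly looks roughly like this (a low-level sketch mirroring the calls in `core.py`, normally not used directly):
```python
import torch
import _dci_cuda

dim, n = 100, 1000
data = torch.randn(n, dim).cuda()

inst = _dci_cuda._dci_new(dim, 2, 7, 0)                    # dim, comp indices, simp indices, device
_dci_cuda._dci_add(inst, dim, n, data.flatten(), 100, 10)  # block_size=100, thread_size=10
# ... _dci_query(...) as wrapped by DCI.query in core.py ...
_dci_cuda._dci_free(inst)
```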
(CUDA)"); 183 | } 184 | -------------------------------------------------------------------------------- /src/dci_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Code for Fast k-Nearest Neighbour Search via Prioritized DCI 3 | * 4 | * This code implements the method described in the Prioritized DCI paper, 5 | * which can be found at https://arxiv.org/abs/1703.00440 6 | * 7 | * This file is a part of the Dynamic Continuous Indexing reference 8 | * implementation. 9 | * 10 | * 11 | * This Source Code Form is subject to the terms of the Mozilla Public 12 | * License, v. 2.0. If a copy of the MPL was not distributed with this 13 | * file, You can obtain one at https://mozilla.org/MPL/2.0/. 14 | * 15 | * Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "dci.h" 25 | #include "util.h" 26 | 27 | /* Sorting functions */ 28 | #include 29 | #include 30 | 31 | /* CUDA runtime */ 32 | #include 33 | #include 34 | 35 | __device__ 36 | float compute_dist_device(const float* const vec1, const float* const vec2, 37 | const int dim) { 38 | int i; 39 | float sq_dist = 0.0; 40 | for (i = 0; i < dim; i++) { 41 | sq_dist += (vec1[i] - vec2[i]) * (vec1[i] - vec2[i]); 42 | } 43 | return sqrt(sq_dist); 44 | } 45 | 46 | __device__ 47 | static inline float abs_d(float x) { 48 | return x > 0 ? x : -x; 49 | } 50 | 51 | /* Normalize the input projection vectors. Vectors are normalized along each row. */ 52 | __global__ void normalize_proj_vecs(float* const proj_vec, const int dim, 53 | const int num_indices) { 54 | int i = blockDim.x * blockIdx.x + threadIdx.x; 55 | /* Note: Assumes num_blocks = num_threads */ 56 | int chunk_size = (num_indices + blockDim.x * blockDim.x - 1) 57 | / (blockDim.x * blockDim.x); 58 | int vec_index; 59 | for (int j = 0; j < chunk_size; ++j) { 60 | vec_index = i * chunk_size + j; 61 | if (vec_index < num_indices) { 62 | float sq_norm = 0.0; 63 | for (int k = 0; k < dim; ++k) { 64 | sq_norm += proj_vec[vec_index * dim + k] 65 | * proj_vec[vec_index * dim + k]; 66 | } 67 | float norm = sqrtf(sq_norm); 68 | for (int k = 0; k < dim; ++k) { 69 | proj_vec[vec_index * dim + k] /= norm; 70 | } 71 | } 72 | } 73 | } 74 | 75 | /* Create matrix with proj_vec dim-dimensional normalized gaussian vectors. 76 | vectors are normalized along each row */ 77 | void dci_gen_proj_vec(float* const proj_vec, const int dim, 78 | const int num_indices) { 79 | /* Generate the random indices */ 80 | rng_parallel_device(proj_vec, dim * num_indices, GAUSS_RAND); 81 | 82 | /* Normalize */ 83 | int block_size = 32; 84 | int thread_size = 32; 85 | normalize_proj_vecs<<>>(proj_vec, dim, 86 | num_indices); 87 | 88 | /* Synchronize the threads */ 89 | cudaDeviceSynchronize(); 90 | } 91 | 92 | /* Initializes the master DCI data structure. 
92 | /* Initializes the master DCI data structure. */ 93 | void dci_init(dci* const dci_inst, const int dim, const int num_comp_indices, 94 | const int num_simp_indices, const int devId) { 95 | int num_indices = num_comp_indices * num_simp_indices; 96 | 97 | dci_inst->dim = dim; 98 | dci_inst->num_comp_indices = num_comp_indices; 99 | dci_inst->num_simp_indices = num_simp_indices; 100 | 101 | cudaMallocManaged((void **) &dci_inst->proj_vec, 102 | sizeof(float) * dim * num_indices); 103 | dci_gen_proj_vec(dci_inst->proj_vec, dim, num_indices); 104 | 105 | /* Variables that initialize to default values */ 106 | dci_inst->num_points = 0; 107 | dci_inst->indices = NULL; 108 | dci_inst->data = NULL; 109 | dci_inst->devID = devId; 110 | } 111 | 112 | /* Sort indices */ 113 | __global__ void sort_indices(dci* const dci_inst, const int num_indices, 114 | const int num_points, const int points_per_block) { 115 | int chunk_size = (num_indices + blockDim.x - 1) / blockDim.x; 116 | int idx; 117 | int num_points_in_block = min( 118 | (int) (dci_inst->num_points - blockIdx.x * points_per_block), 119 | points_per_block); 120 | for (int j = 0; j < chunk_size; j++) { 121 | idx = threadIdx.x * chunk_size + j; 122 | if (idx < num_indices) { 123 | mix_sort( 124 | &(dci_inst->indices[idx * dci_inst->num_points 125 | + points_per_block * blockIdx.x]), 126 | num_points_in_block); 127 | } 128 | } 129 | } 130 | 131 | /* Copy the projections in data_proj into the index entries */ 132 | __global__ void copy_to_indices(dci* const dci_inst, float* const data_proj, 133 | const int num_indices, const int num_points) { 134 | int i = blockDim.x * blockIdx.x + threadIdx.x; 135 | int n = num_indices * num_points; 136 | int chunk_size = (n + blockDim.x * gridDim.x - 1) 137 | / (blockDim.x * gridDim.x); 138 | int idx; 139 | for (int j = 0; j < chunk_size; j++) { 140 | idx = i * chunk_size + j; 141 | if (idx < n) { 142 | dci_inst->indices[idx].key = data_proj[idx]; 143 | dci_inst->indices[idx].value = idx % num_points; 144 | } 145 | } 146 | } 147 |
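/* Overview of the add path below (sketch): dci_add projects all points onto all index directions with one GEMM (matmul_device), copy_to_indices above scatters the projections into (key = projection, value = point id) pairs, and sort_indices then sorts each index's slice block by block, yielding the sorted indices that queries walk outward from. */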
148 | /* Add data to the master DCI data structure. Note: the data itself is not copied; the index keeps a pointer to it. */ 149 | void dci_add(dci* const dci_inst, const int dim, const int num_points, 150 | float* const data, const int block_size, const int thread_size) { 151 | int num_indices = dci_inst->num_comp_indices * dci_inst->num_simp_indices; 152 | float *data_proj; 153 | cudaMallocManaged((void **) &data_proj, 154 | sizeof(float) * num_points * num_indices); 155 | 156 | assert(dim == dci_inst->dim); 157 | assert(dci_inst->num_points == 0); 158 | 159 | /* the data tensor already lives in device memory, so keep a pointer to it 160 | rather than allocating a separate managed copy */ 161 | dci_inst->data = data; 162 | cudaMallocManaged((void **) &dci_inst->indices, 163 | sizeof(idx_elem) * num_points * num_indices); 164 | 165 | dci_inst->num_points = num_points; 166 | 167 | matmul_device(CUBLAS_OP_N, CUBLAS_OP_T, num_indices, num_points, 168 | dci_inst->dim, dci_inst->proj_vec, dci_inst->data, data_proj, 169 | dci_inst->devID); 170 | cudaDeviceSynchronize(); 171 | 172 | /* Add to indices */ 173 | copy_to_indices<<<block_size, thread_size>>>(dci_inst, data_proj, num_indices, num_points); 174 | 175 | /* Synchronize the threads */ 176 | cudaDeviceSynchronize(); 177 | 178 | int points_per_block = (dci_inst->num_points + block_size - 1) / block_size; 179 | /* Sort the indices */ 180 | sort_indices<<<block_size, thread_size>>>(dci_inst, num_indices, num_points, 181 | points_per_block); 182 | 183 | /* Synchronize the threads */ 184 | cudaDeviceSynchronize(); 185 | 186 | cudaFree(data_proj); 187 | } 188 | 189 | __device__ 190 | void insertion_sort(idx_elem arr[], int n) { 191 | int i, j; 192 | idx_elem key; 193 | for (i = 1; i < n; i++) { 194 | key = arr[i]; 195 | j = i - 1; 196 | while (j >= 0 && arr[j].key > key.key) { 197 | arr[j + 1] = arr[j]; 198 | j = j - 1; 199 | } 200 | arr[j + 1] = key; 201 | } 202 | } 203 | 204 | /* Modified quick_sort to use "mix_sort" below. */ 205 | __device__ 206 | void quick_sort(idx_elem arr[], int n) { 207 | // arbitrary pivot 208 | float pivot_key = arr[n / 2].key; 209 | idx_elem swp; 210 | int low = 0; 211 | int high = n - 1; 212 | while (low < n || high > 0) { 213 | while (arr[low].key < pivot_key && low < n) { 214 | low++; 215 | } 216 | while (arr[high].key > pivot_key && high > 0) { 217 | high--; 218 | } 219 | if (low <= high) { 220 | swp = arr[low]; 221 | arr[low] = arr[high]; 222 | arr[high] = swp; 223 | low++; 224 | high--; 225 | } else { 226 | if (high > 0) { 227 | mix_sort(arr, high + 1); 228 | } 229 | if (low < n - 1) { 230 | mix_sort(&arr[low], n - low); 231 | } 232 | return; 233 | } 234 | } 235 | } 236 | 237 | /* Sorting algorithm. If the number of data points is fewer than 64, then it does 238 | Insertion Sort. Otherwise, it uses Quick Sort. The reasoning is that if there are 239 | too few data points, then Quick Sort's overhead may be too large.
*/ 240 | __device__ 241 | void mix_sort(idx_elem arr[], int n) { 242 | if (n > 64) { 243 | quick_sort(arr, n); 244 | } else { 245 | insertion_sort(arr, n); 246 | } 247 | } 248 | 249 | __device__ 250 | static inline int dci_next_closest_proj(const idx_elem* const idx, 251 | int* const left_pos, int* const right_pos, const float query_proj, 252 | const int num_elems) { 253 | int cur_pos; 254 | int lower_bound = -blockDim.x; 255 | int upper_bound = num_elems + blockDim.x - 1; 256 | if ((*left_pos <= lower_bound) && (*right_pos >= upper_bound)) { 257 | cur_pos = lower_bound; 258 | } else if (*left_pos <= lower_bound) { 259 | cur_pos = *right_pos; 260 | (*right_pos) += blockDim.x; 261 | } else if (*right_pos >= upper_bound) { 262 | cur_pos = *left_pos; 263 | (*left_pos) -= blockDim.x; 264 | } else if (idx[min(*right_pos, num_elems - 1)].key - query_proj 265 | < query_proj - idx[max(*left_pos, 0)].key) { 266 | cur_pos = *right_pos; 267 | (*right_pos) += blockDim.x; 268 | } else { 269 | cur_pos = *left_pos; 270 | (*left_pos) -= blockDim.x; 271 | } 272 | return cur_pos; 273 | } 274 | 275 | // Returns the index of the element whose key is the largest that is less than the key 276 | // Returns an integer from -1 to num_elems - 1 inclusive 277 | // Could return -1 if all elements are greater or equal to key 278 | __device__ 279 | static inline int dci_search_index(const idx_elem* const idx, const float key, 280 | const int num_elems) { 281 | int start_pos, end_pos, cur_pos; 282 | 283 | start_pos = -1; 284 | end_pos = num_elems - 1; 285 | cur_pos = (start_pos + end_pos + 2) / 2; 286 | 287 | while (start_pos < end_pos) { 288 | if (idx[cur_pos].key < key) { 289 | start_pos = cur_pos; 290 | } else { 291 | end_pos = cur_pos - 1; 292 | } 293 | cur_pos = (start_pos + end_pos + 2) / 2; 294 | } 295 | 296 | return start_pos; 297 | } 298 | 299 | /* Search indices */ 300 | __device__ void search_index(const dci* const dci_inst, 301 | const float* const query_proj, const int num_indices, 302 | int* const left_pos, int* const right_pos, const int points_per_block) { 303 | int total = num_indices; 304 | int chunk_size = (total + blockDim.x - 1) / blockDim.x; 305 | int idx; 306 | for (int j = 0; j < chunk_size; j++) { 307 | idx = threadIdx.x * chunk_size + j; 308 | if (idx < total) { 309 | left_pos[idx] = dci_search_index( 310 | &(dci_inst->indices[idx * (dci_inst->num_points) 311 | + blockIdx.x * points_per_block]), 312 | query_proj[idx], 313 | min(dci_inst->num_points - blockIdx.x * points_per_block, 314 | points_per_block)) - blockDim.x + 1; 315 | right_pos[idx] = left_pos[idx] + blockDim.x; 316 | } 317 | } 318 | } 319 | 320 | __device__ void init_index_priority(const dci* const dci_inst, 321 | const float* const query_proj, const int num_indices, 322 | int* const left_pos, int* const right_pos, float* const index_priority, 323 | int* const cur_pos, const int points_per_block) { 324 | int total = num_indices; 325 | int chunk_size = (total + blockDim.x - 1) / blockDim.x; 326 | int idx; 327 | int num_points_in_block = min( 328 | (int) (dci_inst->num_points - blockIdx.x * points_per_block), 329 | points_per_block); 330 | for (int j = 0; j < chunk_size; j++) { 331 | idx = threadIdx.x * chunk_size + j; 332 | if (idx < total && num_points_in_block > 0) { 333 | cur_pos[idx] = dci_next_closest_proj( 334 | &(dci_inst->indices[idx * (dci_inst->num_points) 335 | + blockIdx.x * points_per_block]), 336 | &(left_pos[idx]), &(right_pos[idx]), query_proj[idx], 337 | num_points_in_block); 338 | int position; 339 | if 
((cur_pos[idx] < 0) && (cur_pos[idx] > -blockDim.x)) { 340 | position = 0; 341 | } else if ((cur_pos[idx] < (num_points_in_block + blockDim.x - 1)) 342 | && (cur_pos[idx] >= num_points_in_block)) { 343 | position = num_points_in_block - 1; 344 | } else { 345 | position = cur_pos[idx]; 346 | } 347 | assert(position >= 0); // There should be at least one point in the index 348 | assert(position < num_points_in_block); 349 | index_priority[idx] = abs_d( 350 | dci_inst->indices[position + idx * (dci_inst->num_points) 351 | + blockIdx.x * points_per_block].key 352 | - query_proj[idx]); 353 | } 354 | } 355 | } 356 | 357 | __global__ void init_counts(const dci* const dci_inst, int* counts) { 358 | int i = blockDim.x * blockIdx.x + threadIdx.x; 359 | int total = dci_inst->num_comp_indices * dci_inst->num_points; 360 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 361 | / (blockDim.x * gridDim.x); 362 | for (int j = 0; j < chunk_size; j++) { 363 | int l = i * chunk_size + j; 364 | if (l < total) { 365 | counts[l] = 0; 366 | } 367 | } 368 | } 369 | 370 | __global__ void init_candidate_dists(const dci* const dci_inst, 371 | float* candidate_dists) { 372 | int i = blockDim.x * blockIdx.x + threadIdx.x; 373 | int total = dci_inst->num_points; 374 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 375 | / (blockDim.x * gridDim.x); 376 | for (int j = 0; j < chunk_size; j++) { 377 | int l = i * chunk_size + j; 378 | if (l < total) { 379 | candidate_dists[l] = -2.0; 380 | } 381 | } 382 | } 383 | 384 | __global__ void init_candidate_indices(const dci* const dci_inst, 385 | int* candidate_indices) { 386 | int i = blockDim.x * blockIdx.x + threadIdx.x; 387 | int total = dci_inst->num_points; 388 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 389 | / (blockDim.x * gridDim.x); 390 | for (int j = 0; j < chunk_size; j++) { 391 | int l = i * chunk_size + j; 392 | if (l < total) { 393 | candidate_indices[l] = -1; 394 | } 395 | } 396 | } 397 | 398 | // Blind querying does not compute distances or look at the values of indexed vectors 399 | // For blind querying, top_candidates is not used; all_candidates is used to store candidates in the order of retrieval 400 | __global__ 401 | static void dci_query_single_point_by_block(const dci* const dci_inst, 402 | const int num_neighbours, const float* const query, 403 | const float* const query_proj, const dci_query_config query_config, 404 | float* const d_top_candidates_dist, int* const d_top_candidates_index, 405 | int* const all_candidates, int* counts, float* candidate_dists) { 406 | int j, h; 407 | float cur_dist; 408 | int num_indices = dci_inst->num_comp_indices * dci_inst->num_simp_indices; 409 | __shared__ float top_index_priority; 410 | __shared__ int k, top_h, position, m, i; 411 | __shared__ bool could_break; // Bug fix: resolve infinite loop if thread 0 exits first 412 | float last_top_candidate_dist = -1.0; // The distance of the k^th closest candidate found so far 413 | int num_candidates = 0, last_top_candidate = -1; 414 | 415 | // init variables 416 | if (threadIdx.x == 0) { 417 | k = 0; 418 | could_break = false; 419 | } 420 | 421 | int max_possible_num_candidates = min(query_config.max_num_candidates, 422 | query_config.num_outer_iterations); 423 | 424 | int points_per_block = (dci_inst->num_points + gridDim.x - 1) / gridDim.x; 425 | int num_points_in_block = min( 426 | (int) (dci_inst->num_points - blockIdx.x * points_per_block), 427 | points_per_block); 428 | 429 | if (num_points_in_block > 0) { 430 | 431 | __shared__ int* 
left_pos; 432 | __shared__ int* right_pos; 433 | __shared__ int* cur_pos; 434 | __shared__ float* index_priority; 435 | // init variables 436 | if (threadIdx.x == 0) { 437 | left_pos = new int[num_indices]; 438 | right_pos = new int[num_indices]; 439 | cur_pos = new int[num_indices]; 440 | index_priority = new float[num_indices]; 441 | } 442 | __syncthreads(); 443 | 444 | /* Search index */ 445 | search_index(dci_inst, query_proj, num_indices, left_pos, right_pos, 446 | points_per_block); 447 | 448 | /* Synchronize the threads */ 449 | __syncthreads(); 450 | 451 | /* Populate the closest indices */ 452 | init_index_priority(dci_inst, query_proj, num_indices, left_pos, right_pos, 453 | index_priority, cur_pos, points_per_block); 454 | 455 | /* Synchronize the threads */ 456 | __syncthreads(); 457 | 458 | while (k < num_points_in_block * dci_inst->num_simp_indices * blockDim.x) { 459 | 460 | if (threadIdx.x == 0) { 461 | m = 0; 462 | } 463 | __syncthreads(); 464 | while (m < dci_inst->num_comp_indices) { 465 | // only one thread to get the top 466 | if (threadIdx.x == 0) { 467 | /* Get the top priority and data index in priority queue */ 468 | top_index_priority = DBL_MAX; 469 | top_h = -1; 470 | for (h = 0; h < dci_inst->num_simp_indices; h++) { 471 | if (index_priority[h + m * dci_inst->num_simp_indices] 472 | < top_index_priority) { 473 | top_index_priority = index_priority[h 474 | + m * dci_inst->num_simp_indices]; 475 | top_h = h; 476 | } 477 | } 478 | } 479 | /* Synchronize the threads */ 480 | __syncthreads(); 481 | if (top_h >= 0) { 482 | if (threadIdx.x == 0) { 483 | i = top_h + m * dci_inst->num_simp_indices; 484 | position = cur_pos[i]; 485 | } 486 | __syncthreads(); 487 | int cur_index = position + threadIdx.x; 488 | // check whether the current thread pointing index is within range 489 | if (cur_index >= 0 && cur_index < num_points_in_block) { 490 | int cur_point = dci_inst->indices[cur_index 491 | + i * (dci_inst->num_points) 492 | + blockIdx.x * points_per_block].value; 493 | counts[cur_point + m * (dci_inst->num_points)]++; 494 | if (counts[cur_point + m * (dci_inst->num_points)] 495 | == dci_inst->num_simp_indices) { 496 | // add offset to candidate_dists 497 | if (candidate_dists[cur_point] == -2.0) { 498 | if (query_config.blind) { 499 | candidate_dists[cur_point] = -1.0; 500 | // lock 501 | all_candidates[num_candidates 502 | + blockIdx.x 503 | * max_possible_num_candidates] = 504 | cur_point; 505 | num_candidates++; 506 | } else { 507 | // Compute distance 508 | cur_dist = compute_dist_device( 509 | &(dci_inst->data[cur_point 510 | * dci_inst->dim]), query, 511 | dci_inst->dim); 512 | candidate_dists[cur_point] = cur_dist; 513 | if (num_candidates < num_neighbours) { 514 | d_top_candidates_dist[blockIdx.x 515 | * num_neighbours 516 | + threadIdx.x * num_neighbours 517 | + num_candidates] = cur_dist; 518 | d_top_candidates_index[blockIdx.x 519 | * num_neighbours 520 | + threadIdx.x * num_neighbours 521 | + num_candidates] = cur_point; 522 | if (cur_dist > last_top_candidate_dist) { 523 | last_top_candidate_dist = cur_dist; 524 | last_top_candidate = num_candidates; 525 | } 526 | } else if (cur_dist < last_top_candidate_dist) { 527 | d_top_candidates_dist[blockIdx.x 528 | * num_neighbours 529 | + threadIdx.x * num_neighbours 530 | + last_top_candidate] = cur_dist; 531 | d_top_candidates_index[blockIdx.x 532 | * num_neighbours 533 | + threadIdx.x * num_neighbours 534 | + last_top_candidate] = cur_point; 535 | last_top_candidate_dist = -1.0; 536 | // Assuming num_neighbours 
less than the min(blockDim) = 32 537 | // no need to run on gpu 538 | for (j = 0; j < num_neighbours; j++) { 539 | if (d_top_candidates_dist[blockIdx.x 540 | * num_neighbours 541 | + threadIdx.x * num_neighbours 542 | + j] 543 | > last_top_candidate_dist) { 544 | last_top_candidate_dist = 545 | d_top_candidates_dist[blockIdx.x 546 | * num_neighbours 547 | + threadIdx.x 548 | * num_neighbours 549 | + j]; 550 | last_top_candidate = j; 551 | } 552 | } 553 | } 554 | num_candidates++; 555 | } 556 | } else { 557 | if (!query_config.blind) { 558 | cur_dist = candidate_dists[cur_point]; 559 | } 560 | } 561 | } 562 | } 563 | /* Synchronize the threads */ 564 | __syncthreads(); 565 | // use the first thread to update 566 | if (threadIdx.x == 0) { 567 | cur_pos[i] = dci_next_closest_proj( 568 | &(dci_inst->indices[i * (dci_inst->num_points) 569 | + blockIdx.x * points_per_block]), 570 | &(left_pos[i]), &(right_pos[i]), query_proj[i], 571 | num_points_in_block); 572 | if ((cur_pos[i] < 0) && (cur_pos[i] > -blockDim.x)) { 573 | position = 0; 574 | } else if ((cur_pos[i] 575 | < (num_points_in_block + blockDim.x - 1)) 576 | && (cur_pos[i] >= num_points_in_block)) { 577 | position = num_points_in_block - 1; 578 | } else { 579 | position = cur_pos[i]; 580 | } 581 | if (position >= 0 && position < num_points_in_block) { 582 | index_priority[i] = abs_d( 583 | dci_inst->indices[position 584 | + i * (dci_inst->num_points) 585 | + blockIdx.x * points_per_block].key 586 | - query_proj[i]); 587 | } else { 588 | index_priority[i] = DBL_MAX; 589 | cur_pos[i] = -blockDim.x; 590 | } 591 | } 592 | } 593 | if (threadIdx.x == 0) { 594 | m++; 595 | } 596 | __syncthreads(); 597 | } 598 | if (threadIdx.x == 0) { 599 | if (num_candidates >= num_neighbours) { 600 | if (k + 1 601 | >= query_config.num_outer_iterations 602 | * dci_inst->num_simp_indices 603 | || num_candidates >= query_config.max_num_candidates) { 604 | could_break = true; 605 | break; 606 | } 607 | } 608 | k++; 609 | } 610 | /* Synchronize the threads */ 611 | __syncthreads(); 612 | if (could_break) { 613 | break; 614 | } 615 | } 616 | // free variables 617 | if (threadIdx.x == 0) { 618 | free(left_pos); 619 | free(right_pos); 620 | free(cur_pos); 621 | free(index_priority); 622 | } 623 | } 624 | 625 | 626 | } 627 | 628 | __global__ void mix_sort_kernel(idx_elem* const d_top_candidates, 629 | const int total) { 630 | if (threadIdx.x == 0 && blockIdx.x == 0) { 631 | mix_sort(d_top_candidates, total); 632 | } 633 | } 634 | 635 | __global__ void update_top(const dci* const dci_inst, 636 | double* const index_priority, int const comp_index, int* top_h, 637 | int *mutex) { 638 | double top_h_priority = DBL_MAX; 639 | // Shared top priority array 640 | extern __shared__ double top_priority[]; 641 | // Shared top priority index in data array 642 | extern __shared__ double top_index[]; 643 | 644 | unsigned int tid = threadIdx.x; 645 | unsigned int idx = blockIdx.x * blockDim.x + tid; 646 | top_priority[tid] = DBL_MAX; 647 | top_index[tid] = idx % dci_inst->num_simp_indices; 648 | 649 | while (idx < dci_inst->num_simp_indices) { 650 | double cur_priority = index_priority[comp_index 651 | * dci_inst->num_simp_indices + idx]; 652 | if (top_priority[tid] > cur_priority) { 653 | top_priority[tid] = cur_priority; 654 | top_index[tid] = idx % dci_inst->num_simp_indices; 655 | } 656 | idx += gridDim.x * blockDim.x; 657 | } 658 | __syncthreads(); 659 | idx = blockIdx.x * blockDim.x + tid; 660 | // block-wide reduction 661 | for (unsigned int offset = blockDim.x >> 1; offset 
> 0; offset >>= 1) { 662 | if (tid < offset && idx < dci_inst->num_simp_indices) { 663 | double cur_priority = index_priority[comp_index 664 | * dci_inst->num_simp_indices + tid]; 665 | double compare_priority = index_priority[comp_index 666 | * dci_inst->num_simp_indices + tid + offset]; 667 | if (cur_priority > compare_priority) { 668 | top_priority[tid] = compare_priority; 669 | top_index[tid] = (blockIdx.x * blockDim.x + tid + offset) 670 | % dci_inst->num_simp_indices; 671 | } 672 | } 673 | __syncthreads(); 674 | } 675 | 676 | // finally, thread 0 writes the result 677 | if (threadIdx.x == 0) { 678 | while (atomicCAS(mutex, 0, 1) != 0) 679 | ; //lock 680 | if (top_priority[0] < top_h_priority) { 681 | top_h_priority = top_priority[0]; 682 | *top_h = top_index[0]; 683 | } 684 | atomicExch(mutex, 0); //unlock 685 | } 686 | } 687 | 688 | /* 689 | * Update the top nearest neighbors with distance from the partial results 690 | */ 691 | void get_top_candidates(int* const nearest_neighbours, 692 | float* const nearest_neighbour_dists, 693 | float* const d_top_candidates_dist, int* const d_top_candidates_index, 694 | const int num_neighbours, const int total) { 695 | thrust::sort_by_key(thrust::device, d_top_candidates_dist, 696 | d_top_candidates_dist + total, d_top_candidates_index); 697 | cudaMemcpy(nearest_neighbour_dists, d_top_candidates_dist, 698 | sizeof(float) * num_neighbours, cudaMemcpyDeviceToDevice); 699 | cudaMemcpy(nearest_neighbours, d_top_candidates_index, 700 | sizeof(int) * num_neighbours, cudaMemcpyDeviceToDevice); 701 | } 702 | 703 | __global__ void init_dist(float* const candidate_map, const int total, 704 | const float value) { 705 | int idx, i = blockDim.x * blockIdx.x + threadIdx.x; 706 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 707 | / (blockDim.x * gridDim.x); 708 | int j; 709 | // initialize the counters 710 | for (j = 0; j < chunk_size; j++) { 711 | idx = i * chunk_size + j; 712 | if (idx < total) { 713 | candidate_map[idx] = value; 714 | } 715 | } 716 | } 717 | 718 | __global__ void init_candidates(idx_elem* const candidate_map, const int total, 719 | const float value) { 720 | int idx, i = blockDim.x * blockIdx.x + threadIdx.x; 721 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 722 | / (blockDim.x * gridDim.x); 723 | int j; 724 | // initialize the counters 725 | for (j = 0; j < chunk_size; j++) { 726 | idx = i * chunk_size + j; 727 | if (idx < total) { 728 | candidate_map[idx].key = value; 729 | candidate_map[idx].value = -1; 730 | } 731 | } 732 | } 733 | 734 | __global__ void get_blind_candidate_count(idx_elem* const candidate_map, 735 | int* const d_all_candidates, const int total) { 736 | int idx, i = blockDim.x * blockIdx.x + threadIdx.x; 737 | int chunk_size = (total + blockDim.x * gridDim.x - 1) 738 | / (blockDim.x * gridDim.x); 739 | int j; 740 | // maintain counts as negative numbers for candidate_map.key in order to reuse mix_sort (ascending) 741 | for (j = 0; j < chunk_size; j++) { 742 | idx = i * chunk_size + j; 743 | if (idx < total) { 744 | candidate_map[d_all_candidates[idx]].key--; 745 | candidate_map[d_all_candidates[idx]].value = 746 | d_all_candidates[idx]; 747 | } 748 | } 749 | } 750 | 751 | /* 752 | * Update the top nearest neighbors from the partial results 753 | */ 754 | void get_top_blind_candidates(int* const nearest_neighbours, 755 | int* const d_all_candidates, const int max_possible_num_candidates, 756 | const int total) { 757 | int i; 758 | idx_elem* candidate_map; 759 | cudaMallocManaged((void **) 
(&candidate_map),
760 |             sizeof(idx_elem) * total);
761 |     int block_size = 1024;
762 |     int thread_size = 32;
763 |     init_candidates<<<block_size, thread_size>>>(candidate_map, total, 0);
764 |     // synch all blocks
765 |     cudaDeviceSynchronize();
766 |     get_blind_candidate_count<<<block_size, thread_size>>>(candidate_map, d_all_candidates, total);
767 |     // synch all blocks
768 |     cudaDeviceSynchronize();
769 |     mix_sort_kernel<<<1, 1>>>(candidate_map, total);
770 |     cudaDeviceSynchronize(); // the sort must finish before the host reads candidate_map
771 |     for (i = 0; i < max_possible_num_candidates; i++)
772 |         nearest_neighbours[i] = candidate_map[i].value;
773 |     cudaFree(candidate_map);
774 | }
775 | // If blind querying is used, nearest_neighbours must be of size num_queries * max_possible_num_candidates; otherwise, it must be of size num_queries * num_neighbours
776 | // nearest_neighbour_dists can be NULL when blind querying is used
777 | void dci_query(dci* const dci_inst, const int dim, const int num_queries,
778 |         const float* const query, const int num_neighbours,
779 |         const dci_query_config query_config, int* const nearest_neighbours,
780 |         float* const nearest_neighbour_dists, const int block_size,
781 |         const int thread_size) {
782 | 
783 |     int num_indices = dci_inst->num_comp_indices * dci_inst->num_simp_indices;
784 |     int max_possible_num_candidates = min(query_config.max_num_candidates,
785 |             query_config.num_outer_iterations);
786 | 
787 |     assert(dim == dci_inst->dim);
788 |     assert(num_neighbours > 0);
789 |     assert(num_neighbours <= dci_inst->num_points);
790 | 
791 |     // dummy allocation, as a workaround for a query timeout
792 |     void* dummy;
793 |     cudaMalloc(&dummy, 1);
794 | 
795 |     // calculate query_proj
796 |     int devId = 0;
797 |     float* query_proj;
798 | 
799 |     cudaMallocManaged((void **) (&query_proj),
800 |             sizeof(float) * num_indices * num_queries);
801 | 
802 |     matmul_device(CUBLAS_OP_N, CUBLAS_OP_T, num_queries, num_indices,
803 |             dci_inst->dim, query, dci_inst->proj_vec, query_proj, devId);
804 | 
805 |     // copy query config to device pointer
806 |     dci_query_config* d_query_config;
807 |     cudaMallocManaged((void **) (&d_query_config),
808 |             sizeof(dci_query_config));
809 |     cudaMemcpy(d_query_config, &query_config, sizeof(dci_query_config),
810 |             cudaMemcpyHostToDevice);
811 | 
812 |     // allocate buffers for the raw nearest-neighbour candidates
813 |     int* d_all_candidates;
814 |     cudaMallocManaged((void **) (&d_all_candidates),
815 |             sizeof(int) * max_possible_num_candidates * block_size);
816 | 
817 |     float* d_top_candidates_dist;
818 |     cudaMalloc((void **) (&d_top_candidates_dist),
819 |             sizeof(float) * num_neighbours * block_size * thread_size);
820 |     int* d_top_candidates_index;
821 |     cudaMalloc((void **) (&d_top_candidates_index),
822 |             sizeof(int) * num_neighbours * block_size * thread_size);
823 | 
824 |     int* counts;
825 |     cudaMallocManaged((void **) (&counts),
826 |             sizeof(int) * dci_inst->num_points
827 |                     * dci_inst->num_comp_indices);
828 | 
829 |     float* candidate_dists;
830 |     cudaMallocManaged((void **) (&candidate_dists),
831 |             sizeof(float) * dci_inst->num_points);
832 | 
833 |     for (int j = 0; j < num_queries; j++) {
834 |         // refresh the result holders so no results carry over between queries
835 |         init_dist<<<block_size, thread_size>>>(d_top_candidates_dist,
836 |                 num_neighbours * block_size * thread_size, FLT_MAX);
837 | 
838 |         cudaDeviceSynchronize();
839 |         init_counts<<<block_size, thread_size>>>(dci_inst, counts);
840 |         init_candidate_dists<<<block_size, thread_size>>>(dci_inst,
841 |                 candidate_dists);
842 | 
843 |         cudaDeviceSynchronize();
844 | 
845 |         dci_query_single_point_by_block<<<block_size, thread_size>>>(dci_inst,
846 |                 num_neighbours, &(query[j * dim]),
847 |                 &(query_proj[j * num_indices]), *d_query_config,
848 |                 d_top_candidates_dist, d_top_candidates_index, d_all_candidates,
849 |                 counts, candidate_dists);
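/* Each launched block scans its own contiguous slice of the data set
   (roughly num_points / gridDim.x points), and every (block, thread) pair
   keeps its own num_neighbours best candidates in d_top_candidates_dist /
   d_top_candidates_index; get_top_candidates below then merges these
   partial results with a single device-wide sort. */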
850 | 
851 |         cudaDeviceSynchronize();
852 | 
853 |         // get the final output
854 |         if (!query_config.blind) {
855 |             get_top_candidates(&(nearest_neighbours[j * num_neighbours]),
856 |                     &(nearest_neighbour_dists[j * num_neighbours]),
857 |                     d_top_candidates_dist, d_top_candidates_index,
858 |                     num_neighbours, block_size * num_neighbours * thread_size);
859 |         } else {
860 |             get_top_blind_candidates(
861 |                     &(nearest_neighbours[j * max_possible_num_candidates]),
862 |                     d_all_candidates, max_possible_num_candidates,
863 |                     block_size * max_possible_num_candidates);
864 |         }
865 |     }
866 | 
867 |     // free the allocated memory
868 |     cudaFree(query_proj);
869 |     cudaFree(d_query_config);
870 |     cudaFree(d_all_candidates);
871 |     cudaFree(d_top_candidates_dist);
872 |     cudaFree(d_top_candidates_index);
873 |     cudaFree(counts);
874 |     cudaFree(candidate_dists);
875 | }
876 | 
877 | 
878 | void dci_clear(dci* const dci_inst) {
879 |     if (dci_inst->indices) {
880 |         cudaFree(dci_inst->indices);
881 |         dci_inst->indices = NULL;
882 |     }
883 |     dci_inst->data = NULL;
884 |     dci_inst->num_points = 0;
885 | }
886 | 
887 | void dci_reset(dci* const dci_inst) {
888 |     dci_clear(dci_inst);
889 |     dci_gen_proj_vec(dci_inst->proj_vec, dci_inst->dim,
890 |             dci_inst->num_comp_indices * dci_inst->num_simp_indices);
891 | }
892 | 
893 | void dci_free(const dci* const dci_inst) {
894 |     if (dci_inst->indices) {
895 |         cudaFree(dci_inst->indices);
896 |     }
897 |     cudaFree(dci_inst->proj_vec);
898 | 
899 | }
900 | 
901 | void dci_dump(const dci* const dci_inst) {
902 |     int i, j;
903 |     int num_indices = dci_inst->num_comp_indices * dci_inst->num_simp_indices;
904 |     for (j = 0; j < num_indices; j++) {
905 |         for (i = 0; i < dci_inst->num_points; i++) {
906 |             printf("%f[%d],",
907 |                     dci_inst->indices[i + j * (dci_inst->num_points)].key,
908 |                     dci_inst->indices[i + j * (dci_inst->num_points)].value);
909 |         }
910 |         printf("\n");
911 |     }
912 | }
913 | 
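914 | /*
915 |  * Hedged usage sketch (added for illustration; not part of the original
916 |  * source). It shows the expected host-side calling sequence:
917 |  * dci_gen_proj_vec, then dci_add, then dci_query. The direct struct setup
918 |  * below is an assumption -- in the released package this is normally done
919 |  * through the Python wrapper -- and it assumes rng_parallel_device and
920 |  * GAUSS_RAND are declared in util.h.
921 |  */
922 | #ifdef DCI_USAGE_EXAMPLE
923 | int main(void) {
924 |     const int dim = 64, num_points = 4096, num_queries = 1, num_neighbours = 10;
925 |     const int block_size = 16, thread_size = 32;
926 | 
927 |     dci dci_inst;
928 |     dci_inst.dim = dim;
929 |     dci_inst.num_points = 0;
930 |     dci_inst.num_comp_indices = 2;
931 |     dci_inst.num_simp_indices = 10;
932 |     dci_inst.devID = 0;
933 |     dci_inst.data = NULL;
934 |     dci_inst.indices = NULL;
935 |     int num_indices = dci_inst.num_comp_indices * dci_inst.num_simp_indices;
936 |     cudaMallocManaged((void **) &dci_inst.proj_vec,
937 |             sizeof(float) * dim * num_indices);
938 |     dci_gen_proj_vec(dci_inst.proj_vec, dim, num_indices);
939 | 
940 |     // random Gaussian data and queries in unified memory
941 |     float *data, *query;
942 |     cudaMallocManaged((void **) &data, sizeof(float) * num_points * dim);
943 |     cudaMallocManaged((void **) &query, sizeof(float) * num_queries * dim);
944 |     rng_parallel_device(data, num_points * dim, GAUSS_RAND);
945 |     rng_parallel_device(query, num_queries * dim, GAUSS_RAND);
946 |     cudaDeviceSynchronize();
947 | 
948 |     dci_add(&dci_inst, dim, num_points, data, block_size, thread_size);
949 | 
950 |     dci_query_config config;
951 |     config.blind = false;
952 |     config.num_outer_iterations = 5000;
953 |     config.max_num_candidates = 1000;
954 | 
955 |     int* nearest_neighbours;
956 |     float* nearest_neighbour_dists;
957 |     cudaMallocManaged((void **) &nearest_neighbours,
958 |             sizeof(int) * num_queries * num_neighbours);
959 |     cudaMallocManaged((void **) &nearest_neighbour_dists,
960 |             sizeof(float) * num_queries * num_neighbours);
961 |     dci_query(&dci_inst, dim, num_queries, query, num_neighbours, config,
962 |             nearest_neighbours, nearest_neighbour_dists, block_size,
963 |             thread_size);
964 |     cudaDeviceSynchronize();
965 | 
966 |     for (int i = 0; i < num_neighbours; i++)
967 |         printf("neighbour %d: point %d, dist %f\n", i, nearest_neighbours[i],
968 |                 nearest_neighbour_dists[i]);
969 | 
970 |     dci_free(&dci_inst);
971 |     cudaFree(data); cudaFree(query);
972 |     cudaFree(nearest_neighbours); cudaFree(nearest_neighbour_dists);
973 |     return 0;
974 | }
975 | #endif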
--------------------------------------------------------------------------------
/src/util_kernel.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Code for Fast k-Nearest Neighbour Search via Prioritized DCI
3 |  *
4 |  * This code implements the method described in the Prioritized DCI paper,
5 |  * which can be found at https://arxiv.org/abs/1703.00440
6 |  *
7 |  * This file is a part of the Dynamic Continuous Indexing reference
8 |  * implementation.
9 |  *
10 |  *
11 |  * This Source Code Form is subject to the terms of the Mozilla Public
12 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
13 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
14 |  *
15 |  * Copyright (C) 2020 Ke Li, Shichong Peng, Mehran Aghabozorgi
16 |  */
17 | 
18 | #include "util.h"
19 | // Utilities and system includes
20 | #include <assert.h>
21 | #include <stdio.h>
22 | #include <stdlib.h>
23 | 
24 | // generate the random seed
25 | #include <time.h>
26 | 
27 | // CUDA runtime
28 | #include <cuda_runtime.h>
29 | #include <cublas_v2.h>
30 | 
31 | // CUDA random
32 | #include <curand.h>
33 | #include <curand_kernel.h>
34 | #include <curand_normal.h>
35 | 
36 | 
37 | #ifndef min
38 | #define min(a,b) ((a < b) ? a : b)
39 | #endif
40 | #ifndef max
41 | #define max(a,b) ((a > b) ? a : b)
42 | #endif
43 | 
44 | // uses device pointers to save on malloc ops
45 | void matmul_device(const cublasOperation_t op_A, const cublasOperation_t op_B,
46 |         const int M, const int N, const int K, const float* const A, const float* const B, float* const C, int &devID) {
47 |     // initialize the CUDA variables
48 |     cudaDeviceProp deviceProp;
49 | 
50 |     cudaGetDeviceProperties(&deviceProp, devID);
51 |     int block_size = 32; // size 16 has also been used; 32 appears to be faster
52 | 
53 |     // setup execution parameters
54 |     dim3 threads(block_size, block_size);
55 |     dim3 grid(N / threads.x, M / threads.y);
56 | 
57 |     // CUBLAS version 2.0
58 |     const float alpha = 1.0f;
59 |     const float beta = 0.0f;
60 |     cublasHandle_t handle;
61 | 
62 |     cublasCreate(&handle);
63 | 
64 |     int lda, ldb;
65 |     if(op_A == CUBLAS_OP_N) {
66 |         lda = K;
67 |     } else {
68 |         lda = M;
69 |     }
70 |     if(op_B == CUBLAS_OP_N) {
71 |         ldb = N;
72 |     } else {
73 |         ldb = K;
74 |     }
75 | 
76 |     cublasSgemm(handle, op_B, op_A, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N);
77 | 
78 |     // Destroy the handle
79 |     cublasDestroy(handle);
80 | }
81 | 
82 | __global__ void init_curand_state(unsigned int seed, curandState_t* states) {
83 |     int id = blockDim.x * blockIdx.x + threadIdx.x;
84 |     curand_init(seed, id, 0, &states[id]);
85 | }
86 | 
87 | 
88 | // gauss random variables in parallel
89 | __global__ void
90 | gauss_parallel_rng(curandState_t* states, float* vec, const int n) {
91 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
92 |     // Note: assumes num_blocks = num_threads
93 |     int chunk_size = (n + blockDim.x * blockDim.x - 1) / (blockDim.x * blockDim.x);
94 |     int index;
95 |     for(int j = 0; j < chunk_size; ++j) {
96 |         index = i*chunk_size+j;
97 |         if(index < n) {
98 |             vec[i*chunk_size+j] = curand_normal(&states[i]);
99 |         }
100 |     }
101 | }
102 | 
103 | // uniform distribution in [-1, 1] in parallel
104 | __global__ void
105 | uniform_parallel_rng(curandState_t* states, float *vec, const int n) {
106 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
107 |     // Note: assumes num_blocks = num_threads
108 |     int chunk_size = (n + blockDim.x * blockDim.x - 1) / (blockDim.x * blockDim.x);
109 |     int index;
110 |     for(int j = 0; j < chunk_size; ++j) {
111 |         index = i*chunk_size+j;
112 |         if(index < n) {
113 |             vec[i*chunk_size+j] = (curand_uniform(&states[i]) * 2.0) - 1.0;
114 |         }
115 |     }
116 | }
117 | 
118 | // helper function; assumes vec is a device pointer
119 | void rng_parallel_device(float* const vec, const int n, const int rng_type) {
120 |     int num_blocks = 64; // for now using num_blocks blocks, num_blocks threads per block
121 | 
122 |     // curand initialization
123 |     curandState_t* states;
124 |     long long seed = 0;
125 |     for(int i = 0; i < 4; ++i) {
126 |         seed = (seed << 32) | rand();
127 |     }
128 |     cudaMalloc((void**) &states, num_blocks * num_blocks * sizeof(curandState_t));
129 |     init_curand_state<<<num_blocks, num_blocks>>>(seed, states);
130 | 
131 |     // generate random numbers
132 |     if(rng_type == GAUSS_RAND) {
133 |         gauss_parallel_rng<<<num_blocks, num_blocks>>>(states, vec, n);
134 |     } else {
135 |         uniform_parallel_rng<<<num_blocks, num_blocks>>>(states, vec, n);
136 |     }
137 | }
138 | 
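139 | /*
140 |  * Hedged self-test (added for illustration; not part of the original source).
141 |  * matmul_device computes the row-major product C (M x N) = op_A(A) * op_B(B):
142 |  * cuBLAS is column-major, so the call above swaps the operand order in
143 |  * cublasSgemm and the column-major result C^T reads back as the row-major C.
144 |  * With op_A = CUBLAS_OP_N and op_B = CUBLAS_OP_T -- the combination used by
145 |  * dci_add and dci_query -- A is M x K and B is N x K, both row-major.
146 |  */
147 | #ifdef UTIL_SELF_TEST
148 | int main(void) {
149 |     // C (2 x 2) = A (2 x 3) * B^T (3 x 2)
150 |     int devID = 0;
151 |     float *A, *B, *C;
152 |     cudaMallocManaged((void **) &A, sizeof(float) * 2 * 3);
153 |     cudaMallocManaged((void **) &B, sizeof(float) * 2 * 3);
154 |     cudaMallocManaged((void **) &C, sizeof(float) * 2 * 2);
155 |     for (int i = 0; i < 6; i++) {
156 |         A[i] = (float) i; // A = [[0, 1, 2], [3, 4, 5]]
157 |         B[i] = 1.0f;      // B = [[1, 1, 1], [1, 1, 1]]
158 |     }
159 |     matmul_device(CUBLAS_OP_N, CUBLAS_OP_T, 2, 2, 3, A, B, C, devID);
160 |     cudaDeviceSynchronize();
161 |     printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[1], C[2], C[3]); // expect: 3 3 / 12 12
162 |     cudaFree(A); cudaFree(B); cudaFree(C);
163 |     return 0;
164 | }
165 | #endif
166 | 
--------------------------------------------------------------------------------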