├── examples
    ├── .gitignore
    ├── requirements.txt
    ├── README.md
    └── example1.py
├── cuhnsw
    ├── .gitignore
    ├── __init__.py
    ├── proto
    │   └── config.proto
    ├── bindings.cc
    ├── pyhnsw.py
    └── aux.py
├── requirements.txt
├── pyproject.toml
├── .gitmodules
├── MANIFEST.in
├── cpp
    ├── include
    │   ├── stop_watch.hpp
    │   ├── cuda_base_kernels.cuh
    │   ├── cuda_heap_kernels.cuh
    │   ├── types.hpp
    │   ├── log.hpp
    │   ├── level_graph.hpp
    │   ├── cuhnsw.hpp
    │   ├── cuda_dist_kernels.cuh
    │   ├── cuda_search_kernels.cuh
    │   ├── cuda_utils_kernels.cuh
    │   └── cuda_build_kernels.cuh
    └── src
    │   ├── log.cc
    │   ├── cuhnsw_build.cu
    │   └── cuhnsw_base.cu
├── .travis.yml
├── setup.py
├── README.md
├── cuda_setup.py
└── LICENSE


/examples/.gitignore:
--------------------------------------------------------------------------------
1 | *.hdf5
2 | *.index
3 | 


--------------------------------------------------------------------------------
/cuhnsw/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/*
2 | version.py
3 | config_pb2.py
4 | 


--------------------------------------------------------------------------------
/examples/requirements.txt:
--------------------------------------------------------------------------------
1 | h5py
2 | fire
3 | hnswlib
4 | pandas
5 | tabulate
6 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools>=1.3.2
2 | jsmin
3 | numpy
4 | pybind11
5 | protobuf==3.10.0
6 | grpcio-tools==1.27.1
7 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=1.3.2",
4 |     "numpy",
5 |     "pybind11"
6 | ]
7 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "3rd/json11"]
2 | 	path = 3rd/json11
3 | 	url = https://github.com/dropbox/json11
4 | [submodule "3rd/spdlog"]
5 | 	path = 3rd/spdlog
6 | 	url = https://github.com/gabime/spdlog
7 | 


--------------------------------------------------------------------------------
/cuhnsw/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 Jisang Yoon
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the Apache 2.0 license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from cuhnsw.pyhnsw import CuHNSW
7 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include cuda_setup.py
 2 | include requirements.txt
 3 | include pyproject.toml
 4 | recursive-include cpp/src/ *.cu
 5 | recursive-include cpp/src/ *.cc
 6 | recursive-include cpp/include/ *.cuh
 7 | recursive-include cpp/include/ *.hpp
 8 | recursive-include 3rd/json11/ *
 9 | recursive-include 3rd/spdlog/ *
10 | recursive-include 3rd/pybind11/ *
11 | 


--------------------------------------------------------------------------------
/cpp/include/stop_watch.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | #pragma once
 7 | 
 8 | #include <time.h>
 9 | #include <utility>
10 | 
11 | class StopWatch {
12 |  public:
13 |   StopWatch() {
14 |     clock_gettime(CLOCK_MONOTONIC, &beg_);
15 |   }
16 |   ~StopWatch() {}
17 |   inline double CheckPoint() {
18 |     clock_gettime(CLOCK_MONOTONIC, &end_);
19 |     double ret = (end_.tv_sec - beg_.tv_sec) + (end_.tv_nsec - beg_.tv_nsec) / 1e9;
20 |     std::swap(beg_, end_);
21 |     return ret;
22 |   }
23 |  private:
24 |   timespec beg_, end_;
25 | };
26 | 


--------------------------------------------------------------------------------
/cuhnsw/proto/config.proto:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | 
 7 | syntax = "proto2";
 8 | 
 9 | message ConfigProto {
10 |   optional int32 seed = 1 [default = 777];
11 |   optional int32 c_log_level = 3 [default = 2];
12 |   optional int32 py_log_level = 4 [default = 2];
13 |   optional int32 max_m = 5 [default = 12];
14 |   optional int32 max_m0 = 6 [default = 24];
15 |   optional int32 ef_construction = 7 [default = 150];
16 |   // optional int32 ef_search = 8 [default = 50];
17 |   optional double level_mult = 9;
18 |   optional bool save_remains = 10;
19 |   optional double hyper_threads = 11 [default = 10];
20 |   optional int32 block_dim = 12 [default = 32];
21 |   optional string dist_type = 13 [default = "dot"];
22 |   optional int32 visited_table_size = 17;
23 |   optional int32 visited_list_size = 14 [default = 8192];
24 |   optional bool nrz = 15;
25 |   optional bool reverse_cand = 16;
26 |   optional double heuristic_coef = 18 [default = 0.25];
27 | }
28 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | ### How to run example code
 2 | 
 3 | 0. install requirements
 4 | 
 5 | ```shell
 6 | # install python requirements
 7 | pip install -r requirements.txt
 8 | 
 9 | # install wget to download data (in ubuntu)
10 | apt install wget
11 | ```
12 | 
13 | 1. first, it is good to know about python-fire in https://github.com/google/python-fire, if you haven't heard yet.
14 | 
15 | 2. download data
16 | 
17 | ```shell
18 | python example1.py download
19 | ```
20 | 
21 | 3. run gpu training
22 | 
23 | 
24 | ```shell
25 | python example1.py run_gpu_training
26 | ```
27 | 
28 | 4. check the saved index (filename: `cuhnsw.index`)
29 | 
30 | 
31 | 
32 | 5. search the nearest neighbors loading the file in cuhnsw (GPU)
33 | 
34 | ```shell
35 | python example1.py run_gpu_inference --index-file=cuhnsw.index --topk=10
36 | ```
37 | 
38 | 6. you can also search the nearest neighbor by hnswlib (CPU)
39 | 
40 | ```shell
41 | python example1.py run_cpu_inference --index-file=cuhnsw.index --topk=10
42 | ```
43 | 
44 | 7. reproduce the experimental results shown in README.md on the root directory
45 | 
46 | ```shell
47 | python example1.py run_experiments
48 | ```
49 | 


--------------------------------------------------------------------------------
/cpp/src/log.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | 
 7 | // reference: https://github.com/kakao/buffalo/blob/5f571c2c7d8227e6625c6e538da929e4db11b66d/lib/misc/log.cc
 8 | #include "log.hpp"
 9 | 
10 | int CuHNSWLogger::global_logging_level_ = 2;
11 | 
12 | CuHNSWLogger::CuHNSWLogger() {
13 |   spdlog::set_pattern("[%^%-8l%$] %Y-%m-%d %H:%M:%S %v");
14 |   logger_ = spdlog::default_logger();
15 | }
16 | 
17 | std::shared_ptr<spdlog::logger>& CuHNSWLogger::get_logger() {
18 |   return logger_;
19 | }
20 | 
21 | void CuHNSWLogger::set_log_level(int level) {
22 |   global_logging_level_ = level;
23 |   switch (level) {
24 |     case 0: spdlog::set_level(spdlog::level::off); break;
25 |     case 1: spdlog::set_level(spdlog::level::warn); break;
26 |     case 2: spdlog::set_level(spdlog::level::info); break;
27 |     case 3: spdlog::set_level(spdlog::level::debug); break;
28 |     default: spdlog::set_level(spdlog::level::trace); break;
29 |   }
30 | }
31 | 
32 | int CuHNSWLogger::get_log_level() {
33 |   return global_logging_level_;
34 | }
35 | 


--------------------------------------------------------------------------------
/cpp/include/cuda_base_kernels.cuh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | #pragma once
 7 | #include <unistd.h>
 8 | #include <cuda_runtime.h>
 9 | 
10 | #include <thrust/copy.h>
11 | #include <thrust/fill.h>
12 | #include <thrust/random.h>
13 | #include <thrust/host_vector.h>
14 | #include <thrust/device_vector.h>
15 | #include <thrust/binary_search.h>
16 | #include <thrust/execution_policy.h>
17 | #include <cooperative_groups.h>
18 | 
19 | #include <stdexcept>
20 | #include <sstream>
21 | #include <ctime>
22 | #include <utility>
23 | 
24 | #include "types.hpp"
25 | 
26 | namespace cuhnsw {
27 | 
28 | // Error Checking utilities, checks status codes from cuda calls
29 | // and throws exceptions on failure (which cython can proxy back to python)
30 | #define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); }
31 | inline void checkCuda(cudaError_t code, const char *file, int line) {
32 |   if (code != cudaSuccess) {
33 |     std::stringstream err;
34 |     err << "Cuda Error: " << cudaGetErrorString(code) << " (" << file << ":" << line << ")";
35 |     throw std::runtime_error(err.str());
36 |   }
37 | }
38 | 
39 | } // namespace cuhnsw
40 | 


--------------------------------------------------------------------------------
/cpp/include/cuda_heap_kernels.cuh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | #pragma once
 7 | #include "cuda_base_kernels.cuh"
 8 | 
 9 | namespace cuhnsw {
10 | 
11 | // pop and push for heap
12 | // reference: https://github.com/NVlabs/nvbio/blob/master/nvbio/basic/priority_queue_inline.h
13 | __inline__ __device__
14 | void PqPop(Neighbor* pq, int* size) {
15 |   if (threadIdx.x != 0) return;
16 |   if (*size == 0) return;
17 |   (*size)--;
18 |   if (*size == 0) return;
19 |   cuda_scalar tail_dist = pq[*size].distance;
20 |   int p = 0, r = 1;
21 |   while (r < *size) {
22 |     if (r < (*size) - 1 and gt(pq[r + 1].distance, pq[r].distance))
23 |       r++;
24 |     if (ge(tail_dist, pq[r].distance))
25 |       break;
26 |     pq[p] = pq[r];
27 |     p = r;
28 |     r = 2 * p + 1;
29 |   }
30 |   pq[p] = pq[*size];
31 | }
32 | 
33 | __inline__ __device__
34 | void PqPush(Neighbor* pq, int* size,
35 |     float dist, int nodeid, bool check) {
36 |   if (threadIdx.x != 0) return;
37 |   int idx = *size;
38 |   while (idx > 0) {
39 |     int nidx = (idx + 1) / 2 - 1;
40 |     if (ge(pq[nidx].distance, dist))
41 |       break;
42 |     pq[idx] = pq[nidx];
43 |     idx = nidx;
44 |   }
45 |   pq[idx].distance = dist;
46 |   pq[idx].nodeid = nodeid;
47 |   pq[idx].checked = check;
48 |   (*size)++;
49 | }
50 | 
51 | } // namespace cuhnsw
52 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # reference: https://github.com/jeremad/cuda-travis/blob/master/.travis.yml
 2 | language: cpp
 3 | 
 4 | sudo: enabled
 5 | 
 6 | compiler:
 7 |   - gcc
 8 | 
 9 | matrix:
10 |   include:
11 |     - name: CUDA 10
12 |       env:
13 |       - CUDA=10.1.105-1
14 |       - CUDA_SHORT=10.1
15 |       - UBUNTU_VERSION=ubuntu1804
16 |       dist: bionic
17 | 
18 | before_install:
19 |   - sudo apt update
20 |   - sudo apt install -y software-properties-common
21 |   - sudo add-apt-repository -y ppa:deadsnakes/ppa
22 |   - sudo apt update
23 |   - sudo apt install -y python3-pip python3.6 g++
24 |   - pip3 install -U pip
25 |   - pip3 install setuptools
26 |   - pip3 install -r requirements.txt
27 |   - INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb
28 |   - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER}
29 |   - sudo dpkg -i ${INSTALLER}
30 |   - wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub
31 |   - sudo apt-key add 7fa2af80.pub
32 |   - sudo apt update -qq
33 |   - sudo apt install -y cuda-core-${CUDA_SHORT/./-} cuda-cudart-dev-${CUDA_SHORT/./-} cuda-curand-dev-${CUDA_SHORT/./-} cuda-cufft-dev-${CUDA_SHORT/./-} 
34 |   - sudo apt clean
35 |   - export CUDA_HOME=/usr/local/cuda-${CUDA_SHORT}
36 |   - export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
37 |   - export PATH=${CUDA_HOME}/bin:${PATH}
38 |   - python3.6 -m grpc_tools.protoc --python_out cuhnsw/ --proto_path cuhnsw/proto/ config.proto
39 | 
40 | script:
41 |   - sudo python3.6 setup.py install
42 | 


--------------------------------------------------------------------------------
/cpp/include/types.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | #pragma once
 7 | #include <cuda_fp16.h>
 8 | 
 9 | // experimental codes to use half precision
10 | // not properly working yet..
11 | // #define HALF_PRECISION 1
12 | 
13 | // #if __CUDA_ARCH__ < 530
14 | //   #undef HALF_PRECISION
15 | // #endif
16 | 
17 | #ifdef HALF_PRECISION
18 |   typedef half cuda_scalar;
19 |   #define mul(x, y) ( __hmul(x, y) )
20 |   #define add(x, y) ( __hadd(x, y) )
21 |   #define sub(x, y) ( __hsub(x, y) )
22 |   #define gt(x, y) ( __hgt(x, y) )  // x > y
23 |   #define ge(x, y) ( __hge(x, y) )  // x >= y
24 |   #define lt(x, y) ( __hlt(x, y) )  // x < y
25 |   #define le(x, y) ( __hle(x, y) )  // x <= y
26 |   #define out_scalar(x) ( __half2float(x) )
27 |   #define conversion(x) ( __float2half(x) )
28 | #else
29 |   typedef float cuda_scalar;
30 |   #define mul(x, y) ( x * y )
31 |   #define add(x, y) ( x + y )
32 |   #define sub(x, y) ( x - y )
33 |   #define gt(x, y) ( x > y )
34 |   #define ge(x, y) ( x >= y )
35 |   #define lt(x, y) ( x < y )
36 |   #define le(x, y) ( x <= y )
37 |   #define out_scalar(x) ( x )
38 |   #define conversion(x) ( x )
39 | #endif
40 | 
41 | #define WARP_SIZE 32
42 | 
43 | struct Neighbor {
44 |   cuda_scalar distance;
45 |   int nodeid;
46 |   bool checked;
47 | };
48 | 
49 | // to manage the compatibility with hnswlib
50 | typedef unsigned int tableint;
51 | typedef unsigned int sizeint;
52 | typedef float scalar;
53 | typedef size_t labeltype;
54 | 
55 | enum DIST_TYPE {
56 |   DOT,
57 |   L2,
58 | };
59 | 


--------------------------------------------------------------------------------
/cpp/include/log.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | 
 7 | // reference: https://github.com/kakao/buffalo/blob/5f571c2c7d8227e6625c6e538da929e4db11b66d/lib/misc/log.cc
 8 | #pragma once
 9 | #include <memory>
10 | 
11 | #define SPDLOG_EOL ""
12 | #define SPDLOG_TRACE_ON
13 | #include "spdlog/spdlog.h"
14 | #include "spdlog/sinks/stdout_color_sinks.h"
15 | 
16 | #define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
17 | 
18 | #define INFO(x, ...) logger_->info("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
19 | #define DEBUG(x, ...) logger_->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
20 | #define WARN(x, ...) logger_->warn("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
21 | #define TRACE(x, ...) logger_->trace("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
22 | #define CRITICAL(x, ...) logger_->critical("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
23 | 
24 | #define INFO0(x) logger_->info("[{}:{}] " x "\n", __FILENAME__, __LINE__);
25 | #define DEBUG0(x) logger_->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__);
26 | #define WARN0(x) logger_->warn("[{}:{}] " x "\n", __FILENAME__, __LINE__);
27 | #define TRACE0(x) logger_->trace("[{}:{}] " x "\n", __FILENAME__, __LINE__);
28 | #define CRITICAL0(x) logger_->critical("[{}:{}] " x "\n", __FILENAME__, __LINE__);
29 | 
30 | class CuHNSWLogger {
31 |  public:
32 |   CuHNSWLogger();
33 |   std::shared_ptr<spdlog::logger>& get_logger();
34 |   void set_log_level(int level);
35 |   int get_log_level();
36 | 
37 |  private:
38 |   static int global_logging_level_;
39 |   std::shared_ptr<spdlog::logger> logger_;
40 | };  // class CuHNSWLogger
41 | 


--------------------------------------------------------------------------------
/cpp/include/level_graph.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | #pragma once
 7 | #include <set>
 8 | #include <unordered_set>
 9 | #include <random>
10 | #include <memory>
11 | #include <string>
12 | #include <fstream>
13 | #include <utility>
14 | #include <queue>
15 | #include <functional>
16 | #include <vector>
17 | #include <unordered_map>
18 | 
19 | #include "log.hpp"
20 | 
21 | namespace cuhnsw {
22 | 
23 | class LevelGraph {
24 |  public:
25 |   LevelGraph() {
26 |     logger_ = CuHNSWLogger().get_logger();
27 |   }
28 | 
29 |   ~LevelGraph() {}
30 | 
31 |   void SetNodes(std::vector<int>& nodes, int num_data, int ef_construction) {
32 |     nodes_ = nodes;
33 |     num_nodes_ = nodes_.size();
34 |     neighbors_.clear();
35 |     neighbors_.resize(num_nodes_);
36 |     nodes_idmap_.resize(num_data);
37 |     std::fill(nodes_idmap_.begin(), nodes_idmap_.end(), -1);
38 |     for (int i = 0; i < num_nodes_; ++i)
39 |       nodes_idmap_[nodes[i]] = i;
40 |   }
41 | 
42 |   const std::vector<std::pair<float, int>>& GetNeighbors(int node) const  {
43 |     int nodeid = GetNodeId(node);
44 |     return neighbors_[nodeid];
45 |   }
46 | 
47 |   const std::vector<int>& GetNodes() const {
48 |     return nodes_;
49 |   }
50 | 
51 |   void ClearEdges(int node) {
52 |     neighbors_[GetNodeId(node)].clear();
53 |   }
54 | 
55 |   void AddEdge(int src, int dst, float dist) {
56 |     if (src == dst) return;
57 |     int srcid = GetNodeId(src);
58 |     neighbors_[srcid].emplace_back(dist, dst);
59 |   }
60 | 
61 |   inline int GetNodeId(int node) const {
62 |     int nodeid = nodes_idmap_.at(node);
63 |     if (not(nodeid >= 0 and nodeid < num_nodes_)) {
64 |       throw std::runtime_error(
65 |           fmt::format("[{}:{}] invalid nodeid: {}, node: {}, num_nodes: {}",
66 |             __FILE__, __LINE__, nodeid, node, num_nodes_));
67 |     }
68 |     return nodeid;
69 |   }
70 | 
71 |   void ShowGraph() {
72 |     for (int i = 0; i < num_nodes_; ++i) {
73 |       std::cout << std::string(50, '=') << std::endl;
74 |       printf("nodeid %d: %d\n", i, nodes_[i]);
75 |       for (auto& nb: GetNeighbors(nodes_[i])) {
76 |         printf("neighbor id: %d, dist: %f\n",
77 |             nb.second, nb.first);
78 |       }
79 |       std::cout << std::string(50, '=') << std::endl;
80 |     }
81 |   }
82 | 
83 |  private:
84 |   std::shared_ptr<spdlog::logger> logger_;
85 |   std::vector<int> nodes_;
86 |   std::vector<std::vector<std::pair<float, int>>> neighbors_;
87 |   int num_nodes_ = 0;
88 |   std::vector<int> nodes_idmap_;
89 | };  // class LevelGraph
90 | 
91 | } // namespace cuhnsw
92 | 


--------------------------------------------------------------------------------
/cuhnsw/bindings.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020 Jisang Yoon
 2 | //  All rights reserved.
 3 | //
 4 | //  This source code is licensed under the Apache 2.0 license found in the
 5 | //  LICENSE file in the root directory of this source tree.
 6 | #include <pybind11/pybind11.h>
 7 | #include <pybind11/numpy.h>
 8 | #include <pybind11/stl.h>
 9 | 
10 | #include <iostream>
11 | #include "cuhnsw.hpp"
12 | 
13 | namespace py = pybind11;
14 | 
15 | typedef py::array_t<float, py::array::c_style | py::array::forcecast> float_array;
16 | typedef py::array_t<int, py::array::c_style | py::array::forcecast> int_array;
17 | 
18 | class CuHNSWBind {
19 |  public:
20 |   CuHNSWBind() {}
21 | 
22 |   bool Init(std::string opt_path) {
23 |     return obj_.Init(opt_path);
24 |   }
25 | 
26 |   void SetData(py::object& input) {
27 |     float_array array(input);
28 |     auto buffer = array.request();
29 |     if (buffer.ndim != 2) throw std::runtime_error("data must be 2d array");
30 |     int num_data = buffer.shape[0];
31 |     int num_dims = buffer.shape[1];
32 |     obj_.SetData(array.data(0), num_data, num_dims);
33 |   }
34 | 
35 |   void BuildGraph() {
36 |     obj_.BuildGraph();
37 |   }
38 | 
39 |   void SetRandomLevels(py::object& input) {
40 |     int_array array(input);
41 |     auto buffer = array.request();
42 |     if (buffer.ndim != 1) throw std::runtime_error("levels must be 1d array");
43 |     obj_.SetRandomLevels(array.data(0));
44 |   }
45 | 
46 |   void SaveIndex(std::string fpath) {
47 |     obj_.SaveIndex(fpath);
48 |   }
49 | 
50 |   void LoadIndex(std::string fpath) {
51 |     obj_.LoadIndex(fpath);
52 |   }
53 | 
54 |   void SearchGraph(py::object& qdata, int topk, int ef_search,
55 |       py::object& nns, py::object& distances, py::object& found_cnt) {
56 |     float_array _qdata(qdata);
57 |     int_array _nns(nns);
58 |     float_array _distances(distances);
59 |     int_array _found_cnt(found_cnt);
60 |     auto buffer = _qdata.request();
61 | 
62 |     if (buffer.ndim != 1 and buffer.ndim != 2)
63 |       throw std::runtime_error("data array must be 1d / 2d shape");
64 | 
65 |     int num_queries = buffer.ndim == 1? 1: buffer.shape[0];
66 |     obj_.SearchGraph(_qdata.data(0), num_queries, topk, ef_search,
67 |         _nns.mutable_data(0), _distances.mutable_data(0), _found_cnt.mutable_data(0));
68 |   }
69 | 
70 |  private:
71 |   cuhnsw::CuHNSW obj_;
72 | };
73 | 
74 | PYBIND11_PLUGIN(cuhnsw_bind) {
75 |   py::module m("CuHNSWBind");
76 | 
77 |   py::class_<CuHNSWBind>(m, "CuHNSWBind")
78 |   .def(py::init())
79 |   .def("init", &CuHNSWBind::Init, py::arg("opt_path"))
80 |   .def("set_data", &CuHNSWBind::SetData, py::arg("data"))
81 |   .def("build_graph", &CuHNSWBind::BuildGraph)
82 |   .def("set_random_levels", &CuHNSWBind::SetRandomLevels, py::arg("levels"))
83 |   .def("save_index", &CuHNSWBind::SaveIndex, py::arg("fpath"))
84 |   .def("load_index", &CuHNSWBind::LoadIndex, py::arg("fpath"))
85 |   .def("search_knn", &CuHNSWBind::SearchGraph,
86 |       py::arg("qdata"), py::arg("topk"), py::arg("ef_search"),
87 |       py::arg("nns"), py::arg("distances"), py::arg("found"))
88 |   .def("__repr__",
89 |   [](const CuHNSWBind &a) {
90 |     return "<CuHNSWBind>";
91 |   }
92 |   );
93 |   return m.ptr();
94 | }
95 | 


--------------------------------------------------------------------------------
/cuhnsw/pyhnsw.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 Jisang Yoon
 2 | # All rights reserved.
 3 | #
 4 | # This source code is licensed under the Apache 2.0 license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | 
 7 | # pylint: disable=no-name-in-module,too-few-public-methods,no-member
 8 | import os
 9 | import json
10 | import tempfile
11 | 
12 | import numpy as np
13 | 
14 | from cuhnsw import aux
15 | from cuhnsw.cuhnsw_bind import CuHNSWBind
16 | 
17 | EPS = 1e-10
18 | WARP_SIZE = 32
19 | DIST_ALIAS = {"ip": "dot", "euclidean": "l2", "cosine": "dot"}
20 | 
21 | 
22 | class CuHNSW:
23 |   def __init__(self, opt=None):
24 |     self.opt = aux.get_opt_as_proto(opt or {})
25 | 
26 |     self.opt.level_mult = self.opt.level_mult or 1 / np.log(self.opt.max_m)
27 | 
28 |     # handle aliases of dist_type
29 |     assert self.opt.dist_type in ["l2", "euclidean", "dot", "ip", "cosine"], \
30 |       self.opt.dist_type
31 |     self.opt.dist_type = DIST_ALIAS.get(self.opt.dist_type, self.opt.dist_type)
32 |     if self.opt.dist_type == "cosine":
33 |       self.opt.nrz = True
34 | 
35 |     self.logger = aux.get_logger("cuhnsw", self.opt.py_log_level)
36 |     tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
37 |     opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
38 |     tmp.write(opt_content)
39 |     tmp.close()
40 |     self.logger.info("opt: %s", opt_content)
41 |     self.data = None
42 |     self.obj = CuHNSWBind()
43 |     assert self.opt.block_dim <= WARP_SIZE ** 2 and \
44 |       self.opt.block_dim % WARP_SIZE == 0, \
45 |       f"invalid block dim ({self.opt.block_dim}, warp size: {WARP_SIZE})"
46 |     assert self.obj.init(bytes(tmp.name, "utf8")), \
47 |       f"failed to load {tmp.name}"
48 |     os.remove(tmp.name)
49 | 
50 |   def set_data(self, data):
51 |     self.data = data.copy()
52 |     if self.opt.nrz and self.opt.dist_type == "l2":
53 |       self.logger.warning( \
54 |         "it is not common to set nrz = True and dist_type = l2")
55 |     if self.opt.nrz:
56 |       self.data /= np.linalg.norm(self.data, axis=1)[:, None]
57 |     num_data, num_dims = self.data.shape
58 |     self.logger.info("data shape: %d x %d", num_data, num_dims)
59 |     self.obj.set_data(self.data)
60 | 
61 |   def build(self):
62 |     self.set_random_levels()
63 |     self.obj.build_graph()
64 | 
65 |   def set_random_levels(self):
66 |     np.random.seed(self.opt.seed)
67 |     num_data = self.data.shape[0]
68 |     levels = np.random.uniform(size=num_data)
69 |     levels = np.maximum(levels, EPS)
70 |     levels = (-np.log(levels) * self.opt.level_mult).astype(np.int32)
71 |     self.obj.set_random_levels(levels)
72 | 
73 |   def save_index(self, fpath):
74 |     self.obj.save_index(fpath.encode("utf-8"))
75 | 
76 |   def load_index(self, fpath):
77 |     self.obj.load_index(fpath.encode("utf-8"))
78 | 
79 |   def search_knn(self, qdata, topk, ef_search):
80 |     ef_search = max(topk, ef_search)
81 |     qdata = qdata.astype(np.float32)
82 |     num_queries = qdata.shape[0]
83 |     nns = np.empty(shape=(num_queries, topk), dtype=np.int32)
84 |     distances = np.empty(shape=(num_queries, topk), dtype=np.float32)
85 |     found_cnt = np.empty(shape=(num_queries,), dtype=np.int32)
86 |     self.obj.search_knn(qdata, topk, ef_search,
87 |                         nns, distances, found_cnt)
88 |     return nns, distances, found_cnt
89 | 


--------------------------------------------------------------------------------
/cpp/include/cuhnsw.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #pragma once
  7 | #include <thrust/copy.h>
  8 | #include <thrust/fill.h>
  9 | #include <thrust/random.h>
 10 | #include <thrust/host_vector.h>
 11 | #include <thrust/device_vector.h>
 12 | #include <thrust/binary_search.h>
 13 | #include <thrust/execution_policy.h>
 14 | 
 15 | #include <omp.h>
 16 | #include <set>
 17 | #include <random>
 18 | #include <memory>
 19 | #include <string>
 20 | #include <fstream>
 21 | #include <utility>
 22 | #include <queue>
 23 | #include <deque>
 24 | #include <functional>
 25 | #include <vector>
 26 | #include <cmath>
 27 | #include <chrono> // NOLINT
 28 | 
 29 | #include "json11.hpp"
 30 | #include "log.hpp"
 31 | #include "level_graph.hpp"
 32 | // #include "stop_watch.hpp"
 33 | #include "types.hpp"
 34 | 
 35 | namespace cuhnsw {
 36 | 
 37 | // for the compatibility with hnswlib
 38 | // following two functions refer to
 39 | // https://github.com/nmslib/hnswlib/blob/
 40 | // 2571bdb6ef3f91d6f4c2e59178fde49055d2f980/hnswlib/hnswlib.h
 41 | template<typename T>
 42 | static void writeBinaryPOD(std::ostream &out, const T &podRef) {
 43 |     out.write(reinterpret_cast<const char*>(&podRef), sizeof(T));
 44 | }
 45 | template<typename T>
 46 | static void readBinaryPOD(std::istream &in, T &podRef) {
 47 |     in.read(reinterpret_cast<char*>(&podRef), sizeof(T));
 48 | }
 49 | 
 50 | class CuHNSW {
 51 |  public:
 52 |   // enum ProfileColumns {
 53 |   //   GPU,
 54 |   //   PROFILE_SIZE,
 55 |   // };
 56 | 
 57 |   // std::vector<std::string> PROFILE_KEYS = {
 58 |   //   "gpu",
 59 |   // };
 60 | 
 61 |   CuHNSW();
 62 |   ~CuHNSW();
 63 | 
 64 |   bool Init(std::string opt_path);
 65 |   void SetData(const float* data, int num_data, int num_dims);
 66 |   void SetRandomLevels(const int* levels);
 67 |   void BuildGraph();
 68 |   void SaveIndex(std::string fpath);
 69 |   void LoadIndex(std::string fpath);
 70 |   void SearchGraph(const float* qdata, const int num_queries, const int topk, const int ef_search,
 71 |     int* nns, float* distances, int* found_cnt);
 72 | 
 73 |  private:
 74 |   void GetDeviceInfo();
 75 |   void GetEntryPoints(
 76 |       const std::vector<int>& nodes,
 77 |       std::vector<int>& entries,
 78 |       int level, bool search);
 79 |   void SearchAtLayer(
 80 |       const std::vector<int>& queries,
 81 |       std::vector<std::deque<std::pair<float, int>>>& entries,
 82 |       int level, int max_m);
 83 |   void SearchHeuristicAtLayer(
 84 |       const std::vector<int>& queries,
 85 |       int level, int max_m, bool postprocess);
 86 |   void BuildLevelGraph(int level);
 87 |   std::vector<LevelGraph> level_graphs_;
 88 |   std::vector<int> levels_;
 89 | 
 90 |   json11::Json opt_;
 91 |   std::shared_ptr<spdlog::logger> logger_;
 92 | 
 93 |   int num_data_, num_dims_, batch_size_;
 94 |   thrust::device_vector<cuda_scalar> device_data_, device_qdata_;
 95 |   const float* data_;
 96 |   std::vector<int> labels_;
 97 |   bool labelled_ = false;
 98 |   bool reverse_cand_ = false;
 99 | 
100 |   int major_, minor_, cores_, devId_, mp_cnt_;
101 |   int block_cnt_, block_dim_;
102 |   int visited_table_size_, visited_list_size_;
103 |   int max_level_, max_m_, max_m0_;
104 |   int enter_point_, ef_construction_;
105 |   float level_mult_;
106 |   int dist_type_;
107 |   bool save_remains_;
108 |   double heuristic_coef_;
109 |   // std::vector<StopWatch> sw_;
110 |   // std::vector<double> el_;
111 | 
112 |   bool* visited_;
113 | };  // class CuHNSW
114 | 
115 | } // namespace cuhnsw
116 | 


--------------------------------------------------------------------------------
/cuhnsw/aux.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | import re
  7 | import json
  8 | import logging
  9 | import logging.handlers
 10 | 
 11 | import jsmin
 12 | from google.protobuf.json_format import Parse, MessageToDict
 13 | from cuhnsw.config_pb2 import ConfigProto
 14 | 
 15 | # get_logger and Option refer to
 16 | # https://github.com/kakao/buffalo/blob/
 17 | # 5f571c2c7d8227e6625c6e538da929e4db11b66d/buffalo/misc/aux.py
 18 | def get_logger(name=__file__, level=2):
 19 |   if level == 1:
 20 |     level = logging.WARNING
 21 |   elif level == 2:
 22 |     level = logging.INFO
 23 |   elif level == 3:
 24 |     level = logging.DEBUG
 25 |   logger = logging.getLogger(name)
 26 |   if logger.handlers:
 27 |     return logger
 28 |   logger.setLevel(level)
 29 |   sh0 = logging.StreamHandler()
 30 |   sh0.setLevel(level)
 31 |   formatter = logging.Formatter('[%(levelname)-8s] %(asctime)s '
 32 |                                 '[%(filename)s] [%(funcName)s:%(lineno)d]'
 33 |                                 '%(message)s', '%Y-%m-%d %H:%M:%S')
 34 |   sh0.setFormatter(formatter)
 35 |   logger.addHandler(sh0)
 36 |   return logger
 37 | 
 38 | # This function helps you to read non-standard json strings.
 39 | # - Handles json string with c++ style inline comments
 40 | # - Handles json string with trailing commas.
 41 | def load_json_string(cont):
 42 |   # (1) Removes comment.
 43 |   #     Refer to https://plus.google.com/+DouglasCrockfordEsq/posts/RK8qyGVaGSr
 44 |   cont = jsmin.jsmin(cont)
 45 | 
 46 |   # (2) Removes trailing comma.
 47 |   cont = re.sub(",[ \t\r\n]*}", "}", cont)
 48 |   cont = re.sub(",[ \t\r\n]*" + r"\]", "]", cont)
 49 | 
 50 |   return json.loads(cont)
 51 | 
 52 | 
 53 | # function read json file from filename
 54 | def load_json_file(fname):
 55 |   with open(fname, "r") as fin:
 56 |     ret = load_json_string(fin.read())
 57 |   return ret
 58 | 
 59 | # use protobuf to restrict field and types
 60 | def get_opt_as_proto(raw, proto_type=ConfigProto):
 61 |   proto = proto_type()
 62 |   # convert raw to proto
 63 |   Parse(json.dumps(Option(raw)), proto)
 64 |   err = []
 65 |   assert proto.IsInitialized(err), \
 66 |     f"some required fields are missing in proto {err}\n {proto}"
 67 |   return proto
 68 | 
 69 | def proto_to_dict(proto):
 70 |   return MessageToDict(proto, \
 71 |     including_default_value_fields=True, \
 72 |     preserving_proto_field_name=True)
 73 | 
 74 | def copy_proto(proto):
 75 |   newproto = type(proto)()
 76 |   Parse(json.dumps(proto_to_dict(proto)), newproto)
 77 |   return newproto
 78 | 
 79 | class Option(dict):
 80 |   def __init__(self, *args, **kwargs):
 81 |     args = [arg if isinstance(arg, dict)
 82 |             else load_json_file(arg) for arg in args]
 83 |     super().__init__(*args, **kwargs)
 84 |     for arg in args:
 85 |       if isinstance(arg, dict):
 86 |         for k, val in arg.items():
 87 |           if isinstance(val, dict):
 88 |             self[k] = Option(val)
 89 |           else:
 90 |             self[k] = val
 91 |     if kwargs:
 92 |       for k, val in kwargs.items():
 93 |         if isinstance(val, dict):
 94 |           self[k] = Option(val)
 95 |         else:
 96 |           self[k] = val
 97 | 
 98 |   def __getattr__(self, attr):
 99 |     return self.get(attr)
100 | 
101 |   def __setattr__(self, key, value):
102 |     self.__setitem__(key, value)
103 | 
104 |   def __setitem__(self, key, value):
105 |     super().__setitem__(key, value)
106 |     self.__dict__.update({key: value})
107 | 
108 |   def __delattr__(self, item):
109 |     self.__delitem__(item)
110 | 
111 |   def __delitem__(self, key):
112 |     super().__delitem__(key)
113 |     del self.__dict__[key]
114 | 
115 |   def __getstate__(self):
116 |     return vars(self)
117 | 
118 |   def __setstate__(self, state):
119 |     vars(self).update(state)
120 | 


--------------------------------------------------------------------------------
/cpp/include/cuda_dist_kernels.cuh:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #pragma once
  7 | #include "cuda_base_kernels.cuh"
  8 | 
  9 | 
 10 | namespace cuhnsw {
 11 | 
 12 | // https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
 13 | __inline__ __device__
 14 | cuda_scalar warp_reduce_sum(cuda_scalar val) {
 15 |   #if __CUDACC_VER_MAJOR__ >= 9
 16 |   // __shfl_down is deprecated with cuda 9+. use newer variants
 17 |   unsigned int active = __activemask();
 18 |   #pragma unroll
 19 |   for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
 20 |       val = add(val, __shfl_down_sync(active, val, offset));
 21 |   }
 22 |   #else
 23 |   #pragma unroll
 24 |   for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
 25 |       val = add(val, __shfl_down(val, offset));
 26 |   }
 27 |   #endif
 28 |   return val;
 29 | }
 30 | 
 31 | __inline__ __device__
 32 | cuda_scalar dot(const cuda_scalar * a, const cuda_scalar * b, const int num_dims) {
 33 |   __syncthreads();
 34 |   static __shared__ cuda_scalar shared[32];
 35 | 
 36 |   // figure out the warp/ position inside the warp
 37 |   int warp =  threadIdx.x / WARP_SIZE;
 38 |   int lane = threadIdx.x % WARP_SIZE;
 39 | 
 40 |   // partially reduce the dot product inside each warp using a shuffle
 41 |   cuda_scalar val = 0;
 42 |   for (int i = threadIdx.x; i < num_dims; i += blockDim.x)
 43 |     val = add(val, mul(a[i], b[i]));
 44 |   val = warp_reduce_sum(val);
 45 |   
 46 |   // write out the partial reduction to shared memory if appropiate
 47 |   if (lane == 0) {
 48 |     shared[warp] = val;
 49 |   }
 50 |   __syncthreads();
 51 | 
 52 |   // if we we don't have multiple warps, we're done
 53 |   if (blockDim.x <= WARP_SIZE) {
 54 |     return shared[0];
 55 |   }
 56 | 
 57 |   // otherwise reduce again in the first warp
 58 |   val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: conversion(0.0f);
 59 |   if (warp == 0) {
 60 |     val = warp_reduce_sum(val);
 61 |     // broadcast back to shared memory
 62 |     if (threadIdx.x == 0) {
 63 |         shared[0] = val;
 64 |     }
 65 |   }
 66 |   __syncthreads();
 67 |   return shared[0];
 68 | }
 69 | 
 70 | __inline__ __device__
 71 | cuda_scalar squaresum(const cuda_scalar * a, const cuda_scalar * b, const int num_dims) {
 72 |   __syncthreads();
 73 |   static __shared__ cuda_scalar shared[32];
 74 | 
 75 |   // figure out the warp/ position inside the warp
 76 |   int warp =  threadIdx.x / WARP_SIZE;
 77 |   int lane = threadIdx.x % WARP_SIZE;
 78 | 
 79 |   // partially reduce the dot product inside each warp using a shuffle
 80 |   cuda_scalar val = 0;
 81 |   for (int i = threadIdx.x; i < num_dims; i += blockDim.x) {
 82 |     cuda_scalar _val = sub(a[i], b[i]);
 83 |     val = add(val, mul(_val, _val));
 84 |   }
 85 |   __syncthreads();
 86 |   val = warp_reduce_sum(val);
 87 | 
 88 |   // write out the partial reduction to shared memory if appropiate
 89 |   if (lane == 0) {
 90 |     shared[warp] = val;
 91 |   }
 92 |   __syncthreads();
 93 | 
 94 |   // if we we don't have multiple warps, we're done
 95 |   if (blockDim.x <= WARP_SIZE) {
 96 |     return shared[0];
 97 |   }
 98 | 
 99 |   // otherwise reduce again in the first warp
100 |   val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: conversion(0.0f);
101 |   if (warp == 0) {
102 |     val = warp_reduce_sum(val);
103 |     // broadcast back to shared memory
104 |     if (threadIdx.x == 0) {
105 |         shared[0] = val;
106 |     }
107 |   }
108 |   __syncthreads();
109 |   return shared[0];
110 | }
111 | 
112 | __inline__ __device__
113 | cuda_scalar GetDistanceByVec(const cuda_scalar* src_vec, const cuda_scalar* dst_vec, const int num_dims, const int dist_type) {
114 |   cuda_scalar dist = 0;
115 |   switch (dist_type) {
116 |     case DOT:
117 |       dist = -dot(src_vec, dst_vec, num_dims); break;
118 |     case L2:
119 |       dist = squaresum(src_vec, dst_vec, num_dims); break;
120 |     default:
121 |       break;
122 |   }
123 |   return dist;
124 | }
125 | 
126 | __inline__ __device__
127 | cuda_scalar GetDistance(const int srcid, const int dstid, const int num_dims,
128 |     const int dist_type, const int* nodes, const cuda_scalar* data) {
129 |   const cuda_scalar* src_vec = data + num_dims * nodes[srcid];
130 |   const cuda_scalar* dst_vec = data + num_dims * nodes[dstid];
131 |   return GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type);
132 | }
133 | 
134 | __inline__ __device__
135 | cuda_scalar GetDistance2(const int src, const int dst, const int num_dims,
136 |     const int dist_type, const cuda_scalar* data) {
137 |   const cuda_scalar* src_vec = data + num_dims * src;
138 |   const cuda_scalar* dst_vec = data + num_dims * dst;
139 |   return GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type);
140 | }
141 | 
142 | 
143 | __global__ void BatchDistanceKernel(
144 |     const cuda_scalar* data, const int* src, const int* dst,
145 |     const int size, const int num_dims, const int dist_type,
146 |     float* distances) {
147 |   for (int idx = blockIdx.x; idx < size; idx += gridDim.x) {
148 |     const int _src = src[idx], _dst = dst[idx];
149 |     cuda_scalar dist = GetDistance2(_src, _dst, num_dims, dist_type, data);
150 |     #ifdef HALF_PRECISION
151 |       if (threadIdx.x == 0) distances[idx] = __half2float(dist);
152 |     #else
153 |       if (threadIdx.x == 0) distances[idx] = dist;
154 |     #endif
155 |   }
156 | }
157 | 
158 | 
159 | } // namespace cuhnsw
160 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | # pylint: disable=fixme,too-few-public-methods
  8 | # reference: https://github.com/kakao/buffalo/blob/
  9 | # 5f571c2c7d8227e6625c6e538da929e4db11b66d/setup.py
 10 | """cuhnsw
 11 | """
 12 | import os
 13 | import sys
 14 | import glob
 15 | import pathlib
 16 | import platform
 17 | import sysconfig
 18 | import subprocess
 19 | from setuptools import setup, Extension
 20 | 
 21 | import pybind11
 22 | import numpy as np
 23 | from cuda_setup import CUDA, BUILDEXT
 24 | 
 25 | 
 26 | DOCLINES = __doc__.split("\n")
 27 | 
 28 | # TODO: Python3 Support
 29 | if sys.version_info[:3] < (3, 6):
 30 |   raise RuntimeError("Python version 3.6 or later required.")
 31 | 
 32 | assert platform.system() == 'Linux'  # TODO: MacOS
 33 | 
 34 | 
 35 | MAJOR = 0
 36 | MINOR = 0
 37 | MICRO = 8
 38 | RELEASE = True
 39 | STAGE = {True: '', False: 'b'}.get(RELEASE)
 40 | VERSION = f'{MAJOR}.{MINOR}.{MICRO}{STAGE}'
 41 | STATUS = {False: 'Development Status :: 4 - Beta',
 42 |           True: 'Development Status :: 5 - Production/Stable'}
 43 | 
 44 | CLASSIFIERS = """{status}
 45 | Programming Language :: C++
 46 | Programming Language :: Python :: 3.6
 47 | Operating System :: POSIX :: Linux
 48 | Operating System :: Unix
 49 | Operating System :: MacOS
 50 | License :: OSI Approved :: Apache Software License""".format( \
 51 |   status=STATUS.get(RELEASE))
 52 | CLIB_DIR = os.path.join(sysconfig.get_path('purelib'), 'cuhnsw')
 53 | LIBRARY_DIRS = [CLIB_DIR]
 54 | 
 55 | with open("requirements.txt", "r") as fin:
 56 |   INSTALL_REQUIRES = [line.strip() for line in fin]
 57 | 
 58 | def get_extend_compile_flags():
 59 |   flags = ['-march=native']
 60 |   return flags
 61 | 
 62 | 
 63 | class CMakeExtension(Extension):
 64 |   extension_type = 'cmake'
 65 | 
 66 |   def __init__(self, name):
 67 |     super().__init__(name, sources=[])
 68 | 
 69 | 
 70 | extend_compile_flags = get_extend_compile_flags()
 71 | extra_compile_args = ['-fopenmp', '-std=c++14', '-ggdb', '-O3'] + \
 72 |   extend_compile_flags
 73 | csrcs = glob.glob("cpp/src/*.cu") + glob.glob("cpp/src/*.cc")
 74 | extensions = [
 75 |   # CMakeExtension(name="cuhnsw"),
 76 |   Extension("cuhnsw.cuhnsw_bind",
 77 |             sources= csrcs + [ \
 78 |               "cuhnsw/bindings.cc",
 79 |               "3rd/json11/json11.cpp"],
 80 |             language="c++",
 81 |             extra_compile_args=extra_compile_args,
 82 |             extra_link_args=["-fopenmp"],
 83 |             library_dirs=[CUDA['lib64']],
 84 |             libraries=['cudart', 'curand'],
 85 |             extra_objects=[],
 86 |             include_dirs=[ \
 87 |               "cpp/include/", np.get_include(),
 88 |               pybind11.get_include(), pybind11.get_include(True),
 89 |               CUDA['include'], "3rd/json11", "3rd/spdlog/include"])
 90 | ]
 91 | 
 92 | 
 93 | # Return the git revision as a string
 94 | def git_version():
 95 |   def _minimal_ext_cmd(cmd):
 96 |     # construct minimal environment
 97 |     env = {}
 98 |     for k in ['SYSTEMROOT', 'PATH']:
 99 |       val = os.environ.get(k)
100 |       if val is not None:
101 |         env[k] = val
102 |     out = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env). \
103 |       communicate()[0]
104 |     return out
105 | 
106 |   try:
107 |     out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
108 |     git_revision = out.strip().decode('ascii')
109 |   except OSError:
110 |     git_revision = "Unknown"
111 | 
112 |   return git_revision
113 | 
114 | 
115 | def write_version_py(filename='cuhnsw/version.py'):
116 |   cnt = """
117 | short_version = '%(version)s'
118 | git_revision = '%(git_revision)s'
119 | """
120 |   git_revision = git_version()
121 |   with open(filename, 'w') as fout:
122 |     fout.write(cnt % {'version': VERSION,
123 |               'git_revision': git_revision})
124 | 
125 | 
126 | class BuildExtension(BUILDEXT):
127 |   def run(self):
128 |     for ext in self.extensions:
129 |       print(ext.name)
130 |       if hasattr(ext, 'extension_type') and ext.extension_type == 'cmake':
131 |         self.cmake()
132 |     super().run()
133 | 
134 |   def cmake(self):
135 |     cwd = pathlib.Path().absolute()
136 | 
137 |     build_temp = pathlib.Path(self.build_temp)
138 |     build_temp.mkdir(parents=True, exist_ok=True)
139 | 
140 |     build_type = 'Debug' if self.debug else 'Release'
141 | 
142 |     cmake_args = [
143 |       '-DCMAKE_BUILD_TYPE=' + build_type,
144 |       '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + CLIB_DIR,
145 |     ]
146 | 
147 |     build_args = []
148 | 
149 |     os.chdir(str(build_temp))
150 |     self.spawn(['cmake', str(cwd)] + cmake_args)
151 |     if not self.dry_run:
152 |       self.spawn(['cmake', '--build', '.'] + build_args)
153 |     os.chdir(str(cwd))
154 | 
155 | 
156 | def setup_package():
157 |   write_version_py()
158 |   cmdclass = {
159 |     'build_ext': BuildExtension
160 |   }
161 | 
162 |   metadata = dict(
163 |     name='cuhnsw',
164 |     maintainer="Jisang Yoon",
165 |     maintainer_email="vjs10101v@gmail.com",
166 |     author="Jisang Yoon",
167 |     author_email="vjs10101v@gmail.com",
168 |     description=DOCLINES[0],
169 |     long_description="\n".join(DOCLINES[2:]),
170 |     url="https://github.com/js1010/cuhnsw",
171 |     download_url="https://github.com/js1010/cuhnsw/releases",
172 |     include_package_data=False,
173 |     license='Apache2',
174 |     packages=['cuhnsw/'],
175 |     cmdclass=cmdclass,
176 |     classifiers=[_f for _f in CLASSIFIERS.split('\n') if _f],
177 |     platforms=['Linux', 'Mac OSX', 'Unix'],
178 |     ext_modules=extensions,
179 |     install_requires=INSTALL_REQUIRES,
180 |     entry_points={
181 |       'console_scripts': [
182 |       ]
183 |     },
184 |     python_requires='>=3.6',
185 |   )
186 | 
187 |   metadata['version'] = VERSION
188 |   setup(**metadata)
189 | 
190 | 
191 | if __name__ == '__main__':
192 |   setup_package()
193 | 


--------------------------------------------------------------------------------
/cpp/include/cuda_search_kernels.cuh:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #pragma once
  7 | #include "cuda_utils_kernels.cuh"
  8 | 
  9 | namespace cuhnsw {
 10 | 
 11 | __global__ void GetEntryPointsKernel(
 12 |   const cuda_scalar* qdata, const int* qnodes, const cuda_scalar* target_data, const int* target_nodes,
 13 |   const int num_dims, const int num_qnodes, const int num_target_nodes, const int max_m, const int dist_type,
 14 |   const int* graph, const int* deg,
 15 |   bool* visited, int* visited_list, const int visited_list_size, int* entries, int64_t* acc_visited_cnt
 16 |   ) {
 17 | 
 18 |   static __shared__ int visited_cnt;
 19 |   bool* _visited = visited + num_target_nodes * blockIdx.x;
 20 |   int* _visited_list = visited_list + visited_list_size * blockIdx.x;
 21 | 
 22 |   for (int i = blockIdx.x; i < num_qnodes; i += gridDim.x) {
 23 |     if (threadIdx.x == 0) {
 24 |       visited_cnt = 0;
 25 |     }
 26 |     __syncthreads();
 27 |     cuda_scalar entry_dist = 0;
 28 |     int entryid = entries[i];
 29 |     const cuda_scalar* src_vec = qdata + num_dims * qnodes[i];
 30 |     {
 31 |       const cuda_scalar* dst_vec = target_data + num_dims * target_nodes[entryid];
 32 |       entry_dist = GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type);
 33 |       // if (threadIdx.x == 0 and blockIdx.x == 0) { 
 34 |       //   printf("srcid: %d, dstid: %d, dist: %f\n", 
 35 |       //       qnodes[i], target_nodes[entryid], entry_dist);
 36 |       // }
 37 |     }
 38 |     __syncthreads();
 39 |     bool updated = true;
 40 |     while (updated) {
 41 |       // initialize entries as neighbors
 42 |       int beg = max_m * entryid;
 43 |       int end = beg + deg[entryid];
 44 |       updated = false;
 45 |       for (int j = beg; j < end; ++j) {
 46 |         int candid = graph[j];
 47 | 
 48 |         if (_visited[candid]) continue;
 49 |         __syncthreads();
 50 |         if (threadIdx.x == 0 and visited_cnt < visited_list_size) {
 51 |           _visited[candid] = true;
 52 |           _visited_list[visited_cnt++] = candid;
 53 |         }
 54 |         __syncthreads();
 55 |         const cuda_scalar* dst_vec = target_data + num_dims * target_nodes[candid];
 56 |         cuda_scalar dist = GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type);
 57 |         if (dist < entry_dist) {
 58 |           entry_dist = dist;
 59 |           entryid = candid;
 60 |           updated = true;
 61 |         }
 62 |         __syncthreads();
 63 |       }
 64 |       if (threadIdx.x == 0) entries[i] = entryid;
 65 |       __syncthreads();
 66 |     }
 67 | 
 68 |     __syncthreads();
 69 |     if (threadIdx.x == 0) {
 70 |       acc_visited_cnt[blockIdx.x] += visited_cnt;
 71 |     }
 72 |     for (int j = threadIdx.x; j < visited_cnt; j += blockDim.x) {
 73 |       _visited[_visited_list[j]] = false;
 74 |     }
 75 |     __syncthreads();
 76 |   }
 77 | }
 78 | 
 79 | __global__ void SearchGraphKernel(
 80 |   const cuda_scalar* qdata, const int num_qnodes, const cuda_scalar* data, const int num_nodes,
 81 |   const int num_dims, const int max_m, const int dist_type,
 82 |   const int ef_search, const int* entries, const int* graph, const int* deg, const int topk,
 83 |   int* nns, float* distances, int* found_cnt,
 84 |   int* visited_table, int* visited_list, 
 85 |   const int visited_table_size, const int visited_list_size, int64_t* acc_visited_cnt,
 86 |   const bool reverse_cand, Neighbor* neighbors, int* global_cand_nodes, cuda_scalar* global_cand_distances
 87 |   ) {
 88 | 
 89 |   static __shared__ int size;
 90 |   
 91 |   Neighbor* ef_search_pq = neighbors + ef_search * blockIdx.x;
 92 |   int* cand_nodes = global_cand_nodes + ef_search * blockIdx.x;
 93 |   cuda_scalar* cand_distances = global_cand_distances + ef_search * blockIdx.x;
 94 | 
 95 |   static __shared__ int visited_cnt;
 96 |   int* _visited_table = visited_table + visited_table_size * blockIdx.x;
 97 |   int* _visited_list = visited_list + visited_list_size * blockIdx.x;
 98 | 
 99 |   for (int i = blockIdx.x; i < num_qnodes; i += gridDim.x) {
100 |     if (threadIdx.x == 0) {
101 |       size = 0;
102 |       visited_cnt = 0;
103 |     }
104 |     __syncthreads();
105 | 
106 |     // initialize entries
107 |     const cuda_scalar* src_vec = qdata + i * num_dims;
108 |     PushNodeToSearchPq(ef_search_pq, &size, ef_search, data, 
109 |         num_dims, dist_type, src_vec, entries[i]);
110 |     if (CheckVisited(_visited_table, _visited_list, visited_cnt, entries[i], 
111 |           visited_table_size, visited_list_size)) 
112 |       continue;
113 |     __syncthreads();
114 |     
115 |     // iterate until converge
116 |     int idx = GetCand(ef_search_pq, size, reverse_cand);
117 |     while (idx >= 0) {
118 |       __syncthreads();
119 |       if (threadIdx.x == 0) ef_search_pq[idx].checked = true;
120 |       int entry = ef_search_pq[idx].nodeid;
121 |       __syncthreads();
122 | 
123 |       for (int j = max_m * entry; j < max_m * entry + deg[entry]; ++j) {
124 |         int dstid = graph[j];
125 | 
126 |         if (CheckVisited(_visited_table, _visited_list, visited_cnt, dstid, 
127 |               visited_table_size, visited_list_size)) 
128 |           continue;
129 |         __syncthreads();
130 | 
131 |         const cuda_scalar* dst_vec = data + num_dims * dstid;
132 |         cuda_scalar dist = GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type);
133 | 
134 |         PushNodeToSearchPq(ef_search_pq, &size, ef_search,
135 |             data, num_dims, dist_type, src_vec, dstid);
136 |       }
137 |       __syncthreads();
138 |       idx = GetCand(ef_search_pq, size, reverse_cand);
139 |     }
140 |     __syncthreads();
141 |     if (threadIdx.x == 0) {
142 |       acc_visited_cnt[blockIdx.x] += visited_cnt;
143 |     }
144 | 
145 |     for (int j = threadIdx.x; j < visited_cnt; j += blockDim.x) {
146 |       _visited_table[_visited_list[j]] = -1;
147 |     }
148 |     __syncthreads();
149 |     // get sorted neighbors
150 |     if (threadIdx.x == 0) {
151 |       int size2 = size;
152 |       while (size > 0) {
153 |         cand_nodes[size - 1] = ef_search_pq[0].nodeid;
154 |         cand_distances[size - 1] = ef_search_pq[0].distance;
155 |         PqPop(ef_search_pq, &size);
156 |       }
157 |       found_cnt[i] = size2 < topk? size2: topk;
158 |       for (int j = 0; j < found_cnt[i]; ++j) {
159 |         nns[j + i * topk] = cand_nodes[j];
160 |         distances[j + i * topk] = out_scalar(cand_distances[j]);
161 |       }
162 |     }
163 |     __syncthreads();
164 |   }
165 | }
166 | 
167 | 
168 | 
169 | } // namespace cuhnsw
170 | 


--------------------------------------------------------------------------------
/cpp/include/cuda_utils_kernels.cuh:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #pragma once
  7 | #include "cuda_base_kernels.cuh"
  8 | #include "cuda_dist_kernels.cuh"
  9 | #include "cuda_heap_kernels.cuh"
 10 | 
 11 | 
 12 | namespace cuhnsw {
 13 | 
 14 | __inline__ __device__
 15 | int warp_reduce_cand(const Neighbor* pq, int cand, const bool reverse) {
 16 |   #if __CUDACC_VER_MAJOR__ >= 9
 17 |   unsigned int active = __activemask();
 18 |   #pragma unroll
 19 |   for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
 20 |     int _cand = __shfl_down_sync(active, cand, offset);
 21 |     if (_cand >= 0) {
 22 |       if (cand == -1) {
 23 |         cand = _cand;
 24 |       } else {
 25 |         bool update = reverse? 
 26 |           lt(pq[cand].distance, pq[_cand].distance): 
 27 |           gt(pq[cand].distance, pq[_cand].distance);
 28 |         if (update) cand = _cand;
 29 |       }
 30 |     }
 31 |   }
 32 |   #else
 33 |   #pragma unroll
 34 |   for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
 35 |     int _cand = __shfl_down(cand, offset);
 36 |     if (_cand >= 0) {
 37 |       if (cand == -1) {
 38 |         cand = _cand;
 39 |       } else {
 40 |         bool update = reverse? 
 41 |           lt(pq[cand].distance, pq[_cand.distance]): 
 42 |           gt(pq[cand].distance, pq[_cand.distance]);
 43 |         if (update) cand = _cand;
 44 |       }
 45 |     }
 46 |   }
 47 |   #endif
 48 |   return cand;
 49 | }
 50 | __inline__ __device__
 51 | bool CheckAlreadyExists(const Neighbor* pq, const int size, const int nodeid) {
 52 |   __syncthreads();
 53 |   // figure out the warp/ position inside the warp
 54 |   int warp =  threadIdx.x / WARP_SIZE;
 55 |   int lane = threadIdx.x % WARP_SIZE;
 56 |   
 57 |   static __shared__ bool shared[WARP_SIZE];
 58 |   bool exists = false;
 59 |   for (int i = threadIdx.x; i < size; i += blockDim.x) {
 60 |     if (pq[i].nodeid == nodeid) {
 61 |       exists = true;
 62 |     }
 63 |   }
 64 |   
 65 |   #if __CUDACC_VER_MAJOR__ >= 9
 66 |   unsigned int active = __activemask();
 67 |   exists = __any_sync(active, exists);
 68 |   #else
 69 |   exists = __any(exists);
 70 |   #endif
 71 |   // write out the partial reduction to shared memory if appropiate
 72 |   if (lane == 0) {
 73 |     shared[warp] = exists;
 74 |   }
 75 |   
 76 |   __syncthreads();
 77 |   
 78 |   // if we we don't have multiple warps, we're done
 79 |   if (blockDim.x <= WARP_SIZE) {
 80 |     return shared[0];
 81 |   } 
 82 | 
 83 | 
 84 |   // otherwise reduce again in the first warp
 85 |   exists = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane] : false;
 86 |   if (warp == 0) {
 87 |     #if __CUDACC_VER_MAJOR__ >= 9
 88 |     active = __activemask();
 89 |     exists = __any_sync(active, exists);
 90 |     #else
 91 |     exists = __any(exists);
 92 |     #endif
 93 |     // broadcast back to shared memory
 94 |     if (threadIdx.x == 0) {
 95 |         shared[0] = exists;
 96 |     }
 97 |   }
 98 |   __syncthreads();
 99 |   return shared[0];
100 | 
101 | 
102 | 
103 | }
104 | __inline__ __device__
105 | int GetCand(const Neighbor* pq, const int size, const bool reverse) {
106 |   __syncthreads();
107 |   
108 |   // figure out the warp/ position inside the warp
109 |   int warp =  threadIdx.x / WARP_SIZE;
110 |   int lane = threadIdx.x % WARP_SIZE;
111 | 
112 |   static __shared__ int shared[WARP_SIZE];
113 |   // pick the closest neighbor with checked = false if reverse = false and vice versa 
114 |   cuda_scalar dist = reverse? -INFINITY: INFINITY;
115 |   int cand = -1;
116 |   for (int i = threadIdx.x; i < size; i += blockDim.x) {
117 |     if (not pq[i].checked) {
118 |       bool update = reverse? lt(dist, pq[i].distance): gt(dist, pq[i].distance);
119 |       if (update) {
120 |         cand = i;
121 |         dist = pq[i].distance;
122 |       }
123 |     }
124 |   }
125 |   cand = warp_reduce_cand(pq, cand, reverse);
126 |   
127 | 
128 |   // write out the partial reduction to shared memory if appropiate
129 |   if (lane == 0) {
130 |     shared[warp] = cand;
131 |   }
132 |   __syncthreads();
133 |   
134 |   // if we we don't have multiple warps, we're done
135 |   if (blockDim.x <= WARP_SIZE) {
136 |     return shared[0];
137 |   } 
138 | 
139 | 
140 |   // otherwise reduce again in the first warp
141 |   cand = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane] : -1;
142 |   if (warp == 0) {
143 |     cand = warp_reduce_cand(pq, cand, reverse);
144 |     // broadcast back to shared memory
145 |     if (threadIdx.x == 0) {
146 |         shared[0] = cand;
147 |     }
148 |   }
149 |   __syncthreads();
150 |   return shared[0];
151 | }
152 | 
153 | __inline__ __device__
154 | void PushNodeToPq(Neighbor* pq, int* size, const int max_size,
155 |     const cuda_scalar* data, const int num_dims, const int dist_type,
156 |     const int srcid, const int dstid, const int* nodes) {
157 |   if (srcid == dstid) return;
158 |   if (CheckAlreadyExists(pq, *size, dstid)) return;
159 |   cuda_scalar dist = GetDistance(srcid, dstid, num_dims, dist_type, nodes, data);
160 |   __syncthreads();
161 |   if (*size < max_size) {
162 |     PqPush(pq, size, dist, dstid, false);
163 |   } else if (gt(pq[0].distance, dist)) {
164 |     PqPop(pq, size);
165 |     PqPush(pq, size, dist, dstid, false);
166 |   }
167 |   __syncthreads();
168 | }
169 | 
170 | __inline__ __device__
171 | void PushNodeToPq2(Neighbor* pq, int* size, const int max_size,
172 |     const cuda_scalar dist, const int srcid, const int dstid, const int* nodes) {
173 |   if (srcid == dstid) return;
174 |   if (CheckAlreadyExists(pq, *size, dstid)) return;
175 |   __syncthreads();
176 |   if (*size < max_size) {
177 |     PqPush(pq, size, dist, dstid, false);
178 |   } else if (gt(pq[0].distance, dist)) {
179 |     PqPop(pq, size);
180 |     PqPush(pq, size, dist, dstid, false);
181 |   }
182 |   __syncthreads();
183 | }
184 | 
185 | // similar to bloom filter
186 | // while bloom filter prevents false negative, this visited table prevents false positive
187 | // if it says the node is visited, it is actually visited
188 | // if it says the node is not visited, it can be possibly visited
189 | __inline__ __device__
190 | bool CheckVisited(int* visited_table, int* visited_list, int& visited_cnt, int target, 
191 |     const int visited_table_size, const int visited_list_size) {
192 |   __syncthreads();
193 |   bool ret = false;
194 |   if (visited_cnt < visited_list_size ){
195 |     int idx = target % visited_table_size;
196 |     if (visited_table[idx] != target) {
197 |       __syncthreads(); 
198 |       if (threadIdx.x == 0) {
199 |         if (visited_table[idx] == -1) {
200 |           visited_table[idx] = target;
201 |           visited_list[visited_cnt++] = idx;
202 |         }
203 |       }
204 |     } else {
205 |       ret = true;
206 |     }
207 |   }
208 |   __syncthreads();
209 |   return ret;
210 | }
211 | 
212 | __inline__ __device__
213 | void PushNodeToSearchPq(Neighbor* pq, int* size, const int max_size,
214 |     const cuda_scalar* data, const int num_dims, const int dist_type,
215 |     const cuda_scalar* src_vec, const int dstid) {
216 |   if (CheckAlreadyExists(pq, *size, dstid)) return;
217 |   const cuda_scalar* dst_vec = data + num_dims * dstid;
218 |   cuda_scalar dist = GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type);
219 |   __syncthreads();
220 |   if (*size < max_size) {
221 |     PqPush(pq, size, dist, dstid, false);
222 |   } else if (gt(pq[0].distance, dist)) {
223 |     PqPop(pq, size);
224 |     PqPush(pq, size, dist, dstid, false);
225 |   }
226 |   __syncthreads();
227 | }
228 | 
229 | 
230 | } // namespace cuhnsw
231 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # CUHNSW
  2 | 
  3 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Build Status](https://travis-ci.org/js1010/cuhnsw.svg?branch=main)](https://travis-ci.org/js1010/cuhnsw) [![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/dwyl/learn-travis/issues)
  4 | 
  5 | Efficient CUDA implementation of Hierarchical Navigable Small World (HNSW) graph algorithm for Approximate Nearest Neighbor (ANN)
  6 | 
  7 | ### Introduction
  8 | 
  9 | This project is to speed up HNSW algorithm by CUDA. I expect that anyone who will be interested in this project might be already familiar with the following paper and the open source project. If not, I strongly recommend that you check them first.
 10 | 
 11 | - hnsw paper: https://arxiv.org/pdf/1603.09320.pdf (2016)
 12 | - hnsw implementation (cpu only) by the author of hnsw (Yury Markov): https://github.com/nmslib/hnswlib
 13 | - Approximate Nearest Neighbor (ANN) Benchmark Site: http://ann-benchmarks.com/
 14 | 
 15 | I also adapted some ideas from the following project.
 16 | 
 17 | - n2 (alternative hnsw cpu implementation project): https://github.com/kakao/n2
 18 | 
 19 | By brief survey, I found there are several papers and projects to suggest to speed up ANN algorithms by GPU.
 20 | 
 21 | - papers or projects related to using GPU for ANN
 22 |   - paper (2020): http://research.baidu.com/Public/uploads/5f5c37aa9c37c.pdf
 23 |   - paper (2017): https://arxiv.org/pdf/1702.05911.pdf
 24 |   - slides (2020): https://wangzwhu.github.io/home/file/acmmm-t-part3-ann.pdf
 25 |   - project (2017): https://github.com/facebookresearch/faiss
 26 |   - paper (2019): https://arxiv.org/pdf/1912.01059.pdf (source repo: https://github.com/cgtuebingen/ggnn) [**UPDATED**: recently found..]
 27 |   
 28 | I started this project because I was originally interested in both CUDA programming and ANN algorithms. I release this project because it achieved meaningful performance and hope to develop further by community participation. 
 29 | 
 30 | Literally, this package is implemented to build HNSW graphs using GPU, and to approximate nearest neighbor search through the built graphs, and the format of the model file is compatible with hnswlib. In other words, you can build a HNSW graph from this package, then save it and load it from hnswlib for search, and vice versa.
 31 | 
 32 | 
 33 | ### How to install
 34 | 
 35 | 1. pip install
 36 | 
 37 | ```shell
 38 | pip install cuhnsw
 39 | ```
 40 | 
 41 | 2. build from source 
 42 | 
 43 | ```shell
 44 | # clone repo and submodules
 45 | git clone git@github.com:js1010/cuhnsw.git && cd cuhnsw && git submodule update --init
 46 | 
 47 | # install requirements
 48 | pip install -r requirements.txt
 49 | 
 50 | # generate proto
 51 | python -m grpc_tools.protoc --python_out cuhnsw/ --proto_path cuhnsw/proto/ config.proto
 52 | 
 53 | # install
 54 | python setup.py install
 55 | ```
 56 | 
 57 | ### How to use
 58 | 
 59 | - `examples/example1.py` and `examples/README.md` will be very helpful to understand the usage.
 60 | - build and save model
 61 | 
 62 | ```python
 63 | import h5py
 64 | from cuhnsw import CuHNSW
 65 | 
 66 | 
 67 | h5f = h5py.File("glove-50-angular.hdf5", "r")
 68 | data = h5f["train"][:, :].astype(np.float32)
 69 | h5f.close()
 70 | ch0 = CuHNSW(opt={})
 71 | ch0.set_data(data)
 72 | ch0.build()
 73 | ch0.save_index("cuhnsw.index")
 74 | ```
 75 | 
 76 | - load model and search
 77 | 
 78 | ```python
 79 | import h5py
 80 | from cuhnsw import CuHNSW
 81 | 
 82 | h5f = h5py.File("glove-50-angular.hdf5", "r")
 83 | data = h5f["test"][:, :].astype(np.float32)
 84 | h5f.close()
 85 | ch0 = CuHNSW(opt={})
 86 | ch0.load_index("cuhnsw.index")
 87 | nns, distances, found_cnt = ch0.search_knn(data, topk=10, ef_search=300)
 88 | ```
 89 | 
 90 | - Option parameters (see `cuhnsw/proto/config.proto`)
 91 |   - `seed`: numpy random seed (used in random levels)
 92 |   - `c_log_level`: log level in cpp logging (spdlog)
 93 |   - `py_log_level`: log level in python logging
 94 |   - `max_m`: maximum number of links in layers higher than ground layer
 95 |   - `max_m0`: maximum number of links in the ground layer
 96 |   - `level_mult`: multiplier to draw levels of each element (defualt: 0 => setted as `1 / log(max_m0)` in initialization as recommended in hnsw paper)
 97 |   - `save_remains`: link to remained candidates in SearchHeuristic (adapted from n2)
 98 |   - `heuristic_coff`: select some closest candidates by default (also adapted from n2)
 99 |   - `hyper_threads`: set the number of gpu blocks as the total number of concurrent cores exceeds the physical number of cores
100 |   - `block_dim`: block dimension (should be smaller than 32^2=1024 and should be the multiple of 32)
101 |   - `nrz`: normalize data vector if True
102 |   - `visited_table_size`: size of table to store the visited nodes in each search
103 |   - `visited_list_size`: size of list to store the visited nodes in each search (useful to reset table after each search)
104 |   - `reverse_cand`: select the candidate with the furthest distance if True (it makes the build slower but achieves better quality)
105 |   - `dist_type`: euclidean distance if "l2" and inner product distaance if "dot"
106 | 
107 | ### Performance
108 | 
109 | - tl;dr
110 |   - cuhnsw achieved the same build quality by 8-9 times faster build time than hnswlib with 8 vcpus on certain data and parameter setup
111 |   - cuhnsw achieved the same search quality by 3-4 times faster search time than hnswlib with 8 vcpus instance on certain data and parameter setup
112 | - Note1: HNSW search algorithm can be verified by exact match since it is deterministic. 
113 |   - I verified it with hnswlib, in other words, cuhnsw search and hnswlib search returns exactly same results by loading the same model file and the same queries and the same ef search.
114 | - Note2: GPU search has the advantage over CPU search only when it comes to the `Batch` search (i.e. processing large number of queries at once.) 
115 | - [AWS P3 2xlarge instance](https://aws.amazon.com/ec2/instance-types/p3/) is used to the experiment. (One Tesla V100 GPU with 8 vcpus, 3.06 USD / hr)
116 | - results can be reproduced by running `example/example1.py`.
117 | - build time / quality results on glove-50-angular
118 |   - used `ef_construction`=150 for hnswlib and `ef_construction=110` for cuhnsw to achieve the same build quality
119 |   - build quality is measured by the accuracy by the same search parameter (`ef_search`=300)
120 |   - build time is in seconds
121 | 
122 | | attr          |     1 vcpu |     2 vcpu |    4 vcpu |    8 vcpu |       gpu |
123 | |:--------------|-----------:|-----------:|----------:|----------:|----------:|
124 | | build time    | 343.909    | 179.836    | 89.7936   | 70.5476   | 8.2847    |
125 | | build quality |   0.863193 |   0.863301 |  0.863238 |  0.863165 |  0.865471 |
126 | 
127 | - update: measured build time / accuracy for cpu-only instance ([c5.24xlarge](https://aws.amazon.com/ec2/instance-types/c5/), 96 vcpu, 4.08 USD / hr): 9.6275 sec / 0.8628
128 | - search time comparison on glove-50-angular
129 |   - search time on 1M random queries (seconds)
130 |   - search `quality` is guaranteed to the same (exact match)
131 | 
132 | | attr        |  1 vcpu |  2 vcpu |  4 vcpu |  8 vcpu |     gpu |
133 | |:------------|--------:|--------:|--------:|--------:|--------:|
134 | | search time | 556.605 | 287.967 | 146.331 | 115.431 | 29.7008 |
135 | 
136 | - update: measured 1M queries search time for cpu-only instance ([c5.24xlarge](https://aws.amazon.com/ec2/instance-types/c5/), 96 vcpu, 4.08 USD / hr): 22.4642 sec
137 | 
138 | - the reason why the parallel efficiency significantly drops from 4 vcpu to 8 vcpu might be hyper threading (there might be only 4 "physical" cores in this instance).
139 | 
140 | ### Thoughts on Future Task
141 | 
142 | - The word in the parentheses shows the expected level of difficulty for each task
143 | 
144 | 1. **implement parallel compilation using bazel or cmake (easy-medium)**: bazel is more preferable. compilation time is a little bit painful so far.
145 | 2. **achieve significant speed-up by using half-precision operation (medium)**: I experimented it, but only got around 10 % improvement. I am not sure if I have used the half-precision feature appropriately.
146 | 3. **support multi-device (very hard)**: it only supports single-device (gpu) yet since the graph should be shared across all the building threads.
147 | 
148 | - contribution is always welcome
149 | 


--------------------------------------------------------------------------------
/cuda_setup.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | # Adapted from https://github.com/rmcgibbo/npcuda-example and
  8 | # https://github.com/cupy/cupy/blob/master/cupy_setup_build.py
  9 | # pylint: disable=fixme,access-member-before-definition
 10 | # pylint: disable=attribute-defined-outside-init,arguments-differ
 11 | import logging
 12 | import os
 13 | import sys
 14 | 
 15 | from distutils import ccompiler, errors, msvccompiler, unixccompiler
 16 | from setuptools.command.build_ext import build_ext as setuptools_build_ext
 17 | 
 18 | 
 19 | HALF_PRECISION = False
 20 | 
 21 | def find_in_path(name, path):
 22 |   "Find a file in a search path"
 23 |   # adapted fom http://code.activestate.com/
 24 |   # recipes/52224-find-a-file-given-a-search-path/
 25 |   for _dir in path.split(os.pathsep):
 26 |     binpath = os.path.join(_dir, name)
 27 |     if os.path.exists(binpath):
 28 |       return os.path.abspath(binpath)
 29 |   return None
 30 | 
 31 | # reference: https://arnon.dk/
 32 | # matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
 33 | def get_cuda_sm_list(cuda_ver):
 34 |   if "CUDA_SM_LIST" in os.environ:
 35 |     sm_list = os.environ["CUDA_SM_LIST"].split(",")
 36 |   else:
 37 |     sm_list = ["30", "52", "60", "61", "70", "75", "80", "86"]
 38 |     if cuda_ver >= 110:
 39 |       filter_list = ["30"]
 40 |       if cuda_ver == 110:
 41 |         filter_list += ["86"]
 42 |     else:
 43 |       filter_list = ["80", "86"]
 44 |       if cuda_ver < 100:
 45 |         filter_list += ["75"]
 46 |       if cuda_ver < 90:
 47 |         filter_list += ["70"]
 48 |       if cuda_ver < 80:
 49 |         filter_list += ["60", "61"]
 50 |     sm_list = [sm for sm in sm_list if sm not in filter_list]
 51 |   return sm_list
 52 | 
 53 | 
 54 | def get_cuda_compute(cuda_ver):
 55 |   if "CUDA_COMPUTE" in os.environ:
 56 |     compute = os.environ["CUDA_COMPUTE"]
 57 |   else:
 58 |     if 70 <= cuda_ver < 80:
 59 |       compute = "52"
 60 |     if 80 <= cuda_ver < 90:
 61 |       compute = "61"
 62 |     if 90 <= cuda_ver < 100:
 63 |       compute = "70"
 64 |     if 100 <= cuda_ver < 110:
 65 |       compute = "75"
 66 |     if cuda_ver == 110:
 67 |       compute = "80"
 68 |     if cuda_ver == 111:
 69 |       compute = "86"
 70 |   return compute
 71 | 
 72 | 
 73 | def get_cuda_arch(cuda_ver):
 74 |   if "CUDA_ARCH" in os.environ:
 75 |     arch = os.environ["CUDA_ARCH"]
 76 |   else:
 77 |     if 70 <= cuda_ver < 92:
 78 |       arch = "30"
 79 |     if 92 <= cuda_ver < 110:
 80 |       arch = "50"
 81 |     if cuda_ver == 110:
 82 |       arch = "52"
 83 |     if cuda_ver == 111:
 84 |       arch = "80"
 85 |   return arch
 86 | 
 87 | def locate_cuda():
 88 |   """Locate the CUDA environment on the system
 89 |   If a valid cuda installation is found
 90 |   this returns a dict with keys 'home', 'nvcc', 'include',
 91 |   and 'lib64' and values giving the absolute path to each directory.
 92 |   Starts by looking for the CUDAHOME env variable.
 93 |   If not found, everything is based on finding
 94 |   'nvcc' in the PATH.
 95 |   If nvcc can't be found, this returns None
 96 |   """
 97 |   nvcc_bin = 'nvcc'
 98 |   if sys.platform.startswith("win"):
 99 |     nvcc_bin = 'nvcc.exe'
100 | 
101 |   # check env variables CUDA_HOME, CUDAHOME, CUDA_PATH.
102 |   found = False
103 |   for env_name in ['CUDA_PATH', 'CUDAHOME', 'CUDA_HOME']:
104 |     if env_name not in os.environ:
105 |       continue
106 |     found = True
107 |     home = os.environ[env_name]
108 |     nvcc = os.path.join(home, 'bin', nvcc_bin)
109 |     break
110 |   if not found:
111 |     # otherwise, search the PATH for NVCC
112 |     nvcc = find_in_path(nvcc_bin, os.environ['PATH'])
113 |     if nvcc is None:
114 |       logging.warning('The nvcc binary could not be located in your '
115 |               '$PATH. Either add it to '
116 |               'your path, or set $CUDA_HOME to enable CUDA extensions')
117 |       return None
118 |     home = os.path.dirname(os.path.dirname(nvcc))
119 | 
120 |   cudaconfig = {'home': home,
121 |                 'nvcc': nvcc,
122 |                 'include': os.path.join(home, 'include'),
123 |                 'lib64':   os.path.join(home, 'lib64')}
124 |   cuda_ver = os.path.basename(os.path.realpath(home)).split("-")[1].split(".")
125 |   major, minor = int(cuda_ver[0]), int(cuda_ver[1])
126 |   cuda_ver = 10 * major + minor
127 |   assert cuda_ver >= 70, f"too low cuda ver {major}.{minor}"
128 |   print(f"cuda_ver: {major}.{minor}")
129 |   arch = get_cuda_arch(cuda_ver)
130 |   sm_list = get_cuda_sm_list(cuda_ver)
131 |   compute = get_cuda_compute(cuda_ver)
132 |   post_args = [f"-arch=sm_{arch}"] + \
133 |     [f"-gencode=arch=compute_{sm},code=sm_{sm}" for sm in sm_list] + \
134 |     [f"-gencode=arch=compute_{compute},code=compute_{compute}",
135 |      "--ptxas-options=-v", "-O2"]
136 |   print(f"nvcc post args: {post_args}")
137 |   if HALF_PRECISION:
138 |     post_args = [flag for flag in post_args if "52" not in flag]
139 | 
140 |   if sys.platform == "win32":
141 |     cudaconfig['lib64'] = os.path.join(home, 'lib', 'x64')
142 |     post_args += ['-Xcompiler', '/MD', '-std=c++14',  "-Xcompiler", "/openmp"]
143 |     if HALF_PRECISION:
144 |       post_args += ["-Xcompiler", "/D HALF_PRECISION"]
145 |   else:
146 |     post_args += ['-c', '--compiler-options', "'-fPIC'",
147 |                   "--compiler-options", "'-std=c++14'"]
148 |     if HALF_PRECISION:
149 |       post_args += ["--compiler-options", "'-D HALF_PRECISION'"]
150 |   for k, val in cudaconfig.items():
151 |     if not os.path.exists(val):
152 |       logging.warning('The CUDA %s path could not be located in %s', k, val)
153 |       return None
154 | 
155 |   cudaconfig['post_args'] = post_args
156 |   return cudaconfig
157 | 
158 | 
159 | # This code to build .cu extensions with nvcc is taken from cupy:
160 | # https://github.com/cupy/cupy/blob/master/cupy_setup_build.py
161 | class _UnixCCompiler(unixccompiler.UnixCCompiler):
162 |   src_extensions = list(unixccompiler.UnixCCompiler.src_extensions)
163 |   src_extensions.append('.cu')
164 | 
165 |   def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts):
166 |     # For sources other than CUDA C ones, just call the super class method.
167 |     if os.path.splitext(src)[1] != '.cu':
168 |       return unixccompiler.UnixCCompiler._compile(
169 |         self, obj, src, ext, cc_args, extra_postargs, pp_opts)
170 | 
171 |     # For CUDA C source files, compile them with NVCC.
172 |     _compiler_so = self.compiler_so
173 |     try:
174 |       nvcc_path = CUDA['nvcc']
175 |       post_args = CUDA['post_args']
176 |       # TODO? base_opts = build.get_compiler_base_options()
177 |       self.set_executable('compiler_so', nvcc_path)
178 | 
179 |       return unixccompiler.UnixCCompiler._compile(
180 |         self, obj, src, ext, cc_args, post_args, pp_opts)
181 |     finally:
182 |       self.compiler_so = _compiler_so
183 | 
184 | 
185 | class _MSVCCompiler(msvccompiler.MSVCCompiler):
186 |   _cu_extensions = ['.cu']
187 | 
188 |   src_extensions = list(unixccompiler.UnixCCompiler.src_extensions)
189 |   src_extensions.extend(_cu_extensions)
190 | 
191 |   def _compile_cu(self, sources, output_dir=None, macros=None,
192 |           include_dirs=None, debug=0, extra_preargs=None,
193 |           extra_postargs=None, depends=None):
194 |     # Compile CUDA C files, mainly derived from UnixCCompiler._compile().
195 |     macros, objects, extra_postargs, pp_opts, _build = \
196 |       self._setup_compile(output_dir, macros, include_dirs, sources,
197 |                 depends, extra_postargs)
198 | 
199 |     compiler_so = CUDA['nvcc']
200 |     cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
201 |     post_args = CUDA['post_args']
202 | 
203 |     for obj in objects:
204 |       try:
205 |         src, _ = _build[obj]
206 |       except KeyError:
207 |         continue
208 |       try:
209 |         self.spawn([compiler_so] + cc_args + [src, '-o', obj] + post_args)
210 |       except errors.DistutilsExecError as e:
211 |         raise errors.CompileError(str(e))
212 | 
213 |     return objects
214 | 
215 |   def compile(self, sources, **kwargs):
216 |     # Split CUDA C sources and others.
217 |     cu_sources = []
218 |     other_sources = []
219 |     for source in sources:
220 |       if os.path.splitext(source)[1] == '.cu':
221 |         cu_sources.append(source)
222 |       else:
223 |         other_sources.append(source)
224 | 
225 |     # Compile source files other than CUDA C ones.
226 |     other_objects = msvccompiler.MSVCCompiler.compile(
227 |       self, other_sources, **kwargs)
228 | 
229 |     # Compile CUDA C sources.
230 |     cu_objects = self._compile_cu(cu_sources, **kwargs)
231 | 
232 |     # Return compiled object filenames.
233 |     return other_objects + cu_objects
234 | 
235 | 
236 | class CudaBuildExt(setuptools_build_ext):
237 |   """Custom `build_ext` command to include CUDA C source files."""
238 | 
239 |   def run(self):
240 |     if CUDA is not None:
241 |       def wrap_new_compiler(func):
242 |         def _wrap_new_compiler(*args, **kwargs):
243 |           try:
244 |             return func(*args, **kwargs)
245 |           except errors.DistutilsPlatformError:
246 |             if sys.platform != 'win32':
247 |               CCompiler = _UnixCCompiler
248 |             else:
249 |               CCompiler = _MSVCCompiler
250 |             return CCompiler(
251 |               None, kwargs['dry_run'], kwargs['force'])
252 |         return _wrap_new_compiler
253 |       ccompiler.new_compiler = wrap_new_compiler(ccompiler.new_compiler)
254 |       # Intentionally causes DistutilsPlatformError in
255 |       # ccompiler.new_compiler() function to hook.
256 |       self.compiler = 'nvidia'
257 | 
258 |     setuptools_build_ext.run(self)
259 | 
260 | 
261 | CUDA = locate_cuda()
262 | assert CUDA is not None
263 | BUILDEXT = CudaBuildExt if CUDA else setuptools_build_ext
264 | 


--------------------------------------------------------------------------------
/examples/example1.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | # pylint: disable=no-name-in-module,logging-format-truncated
  8 | import os
  9 | from os.path import join as pjoin
 10 | import time
 11 | import subprocess
 12 | 
 13 | import h5py
 14 | import fire
 15 | # import tqdm
 16 | import numpy as np
 17 | import pandas as pd
 18 | 
 19 | import hnswlib
 20 | from cuhnsw import aux, CuHNSW
 21 | 
 22 | LOGGER = aux.get_logger()
 23 | 
 24 | NUM_DATA = 1183514
 25 | DATA_FILE = "glove-50-angular.hdf5"
 26 | DIST_TYPE = "cosine"
 27 | 
 28 | # NUM_DATA = 1000000
 29 | # DATA_FILE = "sift-128-euclidean.hdf5"
 30 | # DIST_TYPE = "l2"
 31 | 
 32 | BARRIER_SIZE = 100
 33 | RES_DIR = "res"
 34 | INDEX_FILE = "hnswlib.index"
 35 | CUHNSW_INDEX_FILE = "cuhnsw.index"
 36 | HNSWLIB_INDEX_FILE = "hnswlib.index"
 37 | DATA_URL = f"http://ann-benchmarks.com/{DATA_FILE}"
 38 | NRZ = DIST_TYPE == "cosine"
 39 | OPT = { \
 40 |   "c_log_level": 2,
 41 |   "ef_construction": 150,
 42 |   "hyper_threads": 100,
 43 |   "block_dim": 32,
 44 |   "nrz": NRZ,
 45 |   "reverse_cand": False,
 46 |   "heuristic_coef": 0.0,
 47 |   "dist_type": DIST_TYPE, \
 48 | }
 49 | 
 50 | 
 51 | def download():
 52 |   if not os.path.exists(RES_DIR):
 53 |     os.makedirs(RES_DIR)
 54 |   data_path = pjoin(RES_DIR, DATA_FILE)
 55 |   if os.path.exists(data_path):
 56 |     return
 57 |   cmds = ["wget", DATA_URL, "-O", data_path + ".tmp"]
 58 |   cmds = " ".join(cmds)
 59 |   LOGGER.info("download data: %s", cmds)
 60 |   subprocess.call(cmds, shell=True)
 61 |   os.rename(data_path + ".tmp", data_path)
 62 | 
 63 | 
 64 | def run_cpu_inference(topk=100, ef_search=300, index_file=INDEX_FILE,
 65 |                       evaluate=True, num_threads=-1):
 66 |   print("=" * BARRIER_SIZE)
 67 |   data_path = pjoin(RES_DIR, DATA_FILE)
 68 |   index_path = pjoin(RES_DIR, index_file)
 69 |   LOGGER.info("cpu inference on %s with index %s", data_path, index_path)
 70 |   h5f = h5py.File(data_path, "r")
 71 |   num_data = h5f["train"].shape[0]
 72 |   queries = h5f["test"][:, :].astype(np.float32)
 73 |   neighbors = h5f["neighbors"][:, :topk].astype(np.int32)
 74 |   h5f.close()
 75 |   hl0 = hnswlib.Index(space=DIST_TYPE, dim=queries.shape[1])
 76 |   LOGGER.info("load %s by hnswlib", index_path)
 77 |   num_queries = queries.shape[0]
 78 |   hl0.load_index(index_path, max_elements=num_data)
 79 |   hl0.set_ef(ef_search)
 80 |   if NRZ:
 81 |     queries /= np.linalg.norm(queries, axis=1)[:, None]
 82 | 
 83 |   start = time.time()
 84 |   labels, _ = hl0.knn_query(queries, k=topk, num_threads=num_threads)
 85 |   el0 = time.time() - start
 86 |   LOGGER.info("elapsed for processing %d queries computing top@%d: %.4e sec",
 87 |               num_queries, topk, el0)
 88 |   if evaluate:
 89 |     accs = []
 90 |     for _pred_nn, _gt_nn in zip(labels, neighbors):
 91 |       intersection = set(_pred_nn) & set(_gt_nn)
 92 |       acc = len(intersection) / float(topk)
 93 |       accs.append(acc)
 94 |     LOGGER.info("accuracy mean: %.4e, std: %.4e", np.mean(accs), np.std(accs))
 95 |     return el0, np.mean(accs)
 96 |   return el0
 97 | 
 98 | def run_cpu_inference_large(topk=100, index_file=INDEX_FILE, ef_search=300,
 99 |                             num_queries=1000000, num_dims=50, num_threads=-1):
100 |   print("=" * BARRIER_SIZE)
101 |   index_path = pjoin(RES_DIR, index_file)
102 |   data_path = pjoin(RES_DIR, DATA_FILE)
103 |   LOGGER.info("cpu inference on %s with index %s", data_path, index_path)
104 | 
105 |   queries = np.random.normal(size=(num_queries, num_dims)).astype(np.float32)
106 |   queries /= np.linalg.norm(queries, axis=1)[:, None]
107 | 
108 |   hl0 = hnswlib.Index(space=DIST_TYPE, dim=queries.shape[1])
109 |   LOGGER.info("load %s by hnswlib", index_path)
110 |   hl0.load_index(index_path, max_elements=NUM_DATA)
111 |   hl0.set_ef(ef_search)
112 |   queries /= np.linalg.norm(queries, axis=1)[:, None]
113 | 
114 |   start = time.time()
115 |   _, _ = hl0.knn_query(queries, k=topk, num_threads=num_threads)
116 |   el0 = time.time() - start
117 |   LOGGER.info("elapsed for inferencing %d queries of top@%d (ef_search: %d): "
118 |               "%.4e sec", num_queries, topk, ef_search, el0)
119 |   return el0
120 | 
121 | def run_cpu_training(ef_const=150, num_threads=-1):
122 |   print("=" * BARRIER_SIZE)
123 |   data_path = pjoin(RES_DIR, DATA_FILE)
124 |   LOGGER.info("cpu training on %s with ef const %d, num_threads: %d",
125 |               data_path, ef_const, num_threads)
126 |   h5f = h5py.File(data_path, "r")
127 |   data = h5f["train"][:, :].astype(np.float32)
128 |   h5f.close()
129 |   hl0 = hnswlib.Index(space=DIST_TYPE, dim=data.shape[1])
130 |   num_data = data.shape[0]
131 |   data /= np.linalg.norm(data, axis=1)[:, None]
132 |   hl0.init_index(max_elements=num_data, ef_construction=ef_const, M=12)
133 |   LOGGER.info("add data to hnswlib")
134 |   start = time.time()
135 |   hl0.add_items(data, np.arange(num_data, dtype=np.int32),
136 |                 num_threads=num_threads)
137 |   el0 = time.time() - start
138 |   LOGGER.info("elapsed for adding %d items: %.4e sec", num_data, el0)
139 |   index_path = pjoin(RES_DIR, HNSWLIB_INDEX_FILE)
140 |   hl0.save_index(index_path)
141 |   LOGGER.info("index saved to %s", index_path)
142 |   return el0
143 | 
144 | def run_gpu_inference(topk=100, index_file=INDEX_FILE, ef_search=300):
145 |   print("=" * BARRIER_SIZE)
146 |   data_path = pjoin(RES_DIR, DATA_FILE)
147 |   index_path = pjoin(RES_DIR, index_file)
148 |   LOGGER.info("gpu inference on %s with index %s", data_path, index_path)
149 |   ch0 = CuHNSW(OPT)
150 |   LOGGER.info("load model from %s by cuhnsw", index_path)
151 |   ch0.load_index(index_path)
152 | 
153 |   h5f = h5py.File(data_path, "r")
154 |   queries = h5f["test"][:, :].astype(np.float32)
155 |   neighbors = h5f["neighbors"][:, :topk].astype(np.int32)
156 |   h5f.close()
157 |   num_queries = queries.shape[0]
158 |   if NRZ:
159 |     queries /= np.linalg.norm(queries, axis=1)[:, None]
160 | 
161 |   start = time.time()
162 |   pred_nn, _, _ = ch0.search_knn(queries, topk, ef_search)
163 |   el0 = time.time() - start
164 |   LOGGER.info("elapsed for inferencing %d queries of top@%d (ef_search: %d): "
165 |               "%.4e sec", num_queries, topk, ef_search, el0)
166 |   accs = []
167 |   for _pred_nn, _gt_nn in zip(pred_nn, neighbors):
168 |     intersection = set(_pred_nn) & set(_gt_nn)
169 |     acc = len(intersection) / float(topk)
170 |     accs.append(acc)
171 |   LOGGER.info("accuracy mean: %.4e, std: %.4e", np.mean(accs), np.std(accs))
172 |   return el0, np.mean(accs)
173 | 
174 | def run_gpu_inference2(topk=5, index_file="cuhnsw.index", ef_search=300):
175 |   print("=" * BARRIER_SIZE)
176 |   data_path = pjoin(RES_DIR, DATA_FILE)
177 |   index_path = pjoin(RES_DIR, index_file)
178 |   LOGGER.info("gpu inference on %s with index %s", data_path, index_path)
179 |   ch0 = CuHNSW(OPT)
180 |   LOGGER.info("load model from %s by cuhnsw", index_path)
181 |   ch0.load_index(index_path)
182 | 
183 |   h5f = h5py.File(data_path, "r")
184 |   data = h5f["train"][:, :].astype(np.float32)
185 |   queries = h5f["test"][:5, :].astype(np.float32)
186 |   h5f.close()
187 |   if NRZ:
188 |     data /= np.linalg.norm(data, axis=1)[:, None]
189 | 
190 |   nns, distances, found_cnt = ch0.search_knn(queries[:5], topk, ef_search)
191 |   for idx, (nn0, distance, cnt) in \
192 |       enumerate(zip(nns, distances, found_cnt)):
193 |     print("=" * BARRIER_SIZE)
194 |     print(f"query {idx + 1}")
195 |     print("-" * BARRIER_SIZE)
196 |     for _idx, (_nn, _dist) in enumerate(zip(nn0[:cnt], distance[:cnt])):
197 |       if DIST_TYPE == "l2":
198 |         real_dist = np.linalg.norm(data[_nn] - queries[idx])
199 |         _dist = np.sqrt(_dist)
200 |       elif DIST_TYPE == "dot":
201 |         real_dist = data[_nn].dot(queries[idx])
202 |       print(f"rank {_idx + 1}. neighbor: {_nn}, dist by lib: {_dist}, "
203 |             f"actual dist: {real_dist}")
204 | 
205 | 
206 | def run_gpu_inference_large(topk=100, index_file=INDEX_FILE, ef_search=300,
207 |                             num_queries=1000000, num_dims=50):
208 |   print("=" * BARRIER_SIZE)
209 |   index_path = pjoin(RES_DIR, index_file)
210 |   data_path = pjoin(RES_DIR, DATA_FILE)
211 |   LOGGER.info("gpu inference on %s with index %s", data_path, index_path)
212 |   ch0 = CuHNSW(OPT)
213 |   LOGGER.info("load model from %s by cuhnsw", index_path)
214 |   ch0.load_index(index_path)
215 | 
216 |   queries = np.random.normal(size=(num_queries, num_dims)).astype(np.float32)
217 |   num_queries = queries.shape[0]
218 |   if NRZ:
219 |     queries /= np.linalg.norm(queries, axis=1)[:, None]
220 | 
221 |   start = time.time()
222 |   _, _, _ = ch0.search_knn(queries, topk, ef_search)
223 |   el0 = time.time() - start
224 |   LOGGER.info("elapsed for inferencing %d queries of top@%d (ef_search: %d): "
225 |               "%.4e sec", num_queries, topk, ef_search, el0)
226 |   return el0
227 | 
228 | def run_gpu_training(ef_const=150):
229 |   print("=" * BARRIER_SIZE)
230 |   data_path = pjoin(RES_DIR, DATA_FILE)
231 |   LOGGER.info("gpu training on %s with ef const %d", data_path, ef_const)
232 |   OPT["ef_construction"] = ef_const
233 |   ch0 = CuHNSW(OPT)
234 |   h5f = h5py.File(data_path, "r")
235 |   data = h5f["train"][:, :].astype(np.float32)
236 |   h5f.close()
237 |   ch0.set_data(data)
238 |   start = time.time()
239 |   ch0.build()
240 |   el0 = time.time() - start
241 |   LOGGER.info("elpased time to build by cuhnsw: %.4e sec", el0)
242 |   index_path = pjoin(RES_DIR, CUHNSW_INDEX_FILE)
243 |   ch0.save_index(index_path)
244 |   return el0
245 | 
246 | def measure_build_performance():
247 |   build_time = {"attr": "build time"}
248 |   build_quality = {"attr": "build quality"}
249 |   build_time["gpu"] = run_gpu_training(ef_const=110)
250 |   _, build_quality["gpu"] = run_gpu_inference(index_file="cuhnsw.index")
251 |   for i in [1, 2, 4, 8]:
252 |     build_time[f"{i} cpu"] = run_cpu_training(ef_const=150, num_threads=i)
253 |     _, build_quality[f"{i} cpu"] = run_cpu_inference(index_file="hnswlib.index")
254 |   columns = [f"{i} cpu" for i in [1, 2, 4, 8]] + ["gpu"]
255 |   df0 = pd.DataFrame([build_time, build_quality])
256 |   df0.set_index("attr", inplace=True)
257 |   print(df0[columns].to_markdown())
258 | 
259 | def measure_search_performance():
260 |   search_time = {"attr": "search time"}
261 |   search_time["gpu"] = run_gpu_inference_large(index_file="cuhnsw.index")
262 |   for i in [1, 2, 4, 8]:
263 |     search_time[f"{i} cpu"] = run_cpu_inference_large(
264 |       index_file="cuhnsw.index", num_threads=i)
265 |   columns = [f"{i} cpu" for i in [1, 2, 4, 8]] + ["gpu"]
266 |   df0 = pd.DataFrame([search_time])
267 |   df0.set_index("attr", inplace=True)
268 |   print(df0[columns].to_markdown())
269 | 
270 | 
271 | def run_experiments():
272 |   measure_build_performance()
273 |   measure_search_performance()
274 | 
275 | 
276 | if __name__ == "__main__":
277 |   fire.Fire()
278 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/cpp/include/cuda_build_kernels.cuh:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #pragma once
  7 | #include "cuda_utils_kernels.cuh"
  8 | 
  9 | namespace cuhnsw {
 10 | __inline__ __device__
 11 | bool IsNeighbor(const int* graph, const int deg, const int dstid) {
 12 |   
 13 |   __syncthreads();
 14 |   // figure out the warp/ position inside the warp
 15 |   int warp =  threadIdx.x / WARP_SIZE;
 16 |   int lane = threadIdx.x % WARP_SIZE;
 17 |   
 18 |   static __shared__ bool shared[WARP_SIZE];
 19 |   
 20 |   __syncthreads();
 21 |   bool is_neighbor = false;
 22 |   for (int i = threadIdx.x; i < deg; i += blockDim.x) {
 23 |     if (graph[i] == dstid) {
 24 |       is_neighbor = true;
 25 |       break;
 26 |     }
 27 |   }
 28 |   __syncthreads();
 29 |   
 30 |   #if __CUDACC_VER_MAJOR__ >= 9
 31 |   unsigned int active = __activemask();
 32 |   is_neighbor = __any_sync(active, is_neighbor);
 33 |   #else
 34 |   is_neighbor = __any(is_neighbor);
 35 |   #endif
 36 |   
 37 |   // write out the partial reduction to shared memory if appropiate
 38 |   if (lane == 0) {
 39 |     shared[warp] = is_neighbor;
 40 |   }
 41 |   
 42 |   __syncthreads();
 43 |   
 44 |   // if we we don't have multiple warps, we're done
 45 |   if (blockDim.x <= WARP_SIZE) {
 46 |     return shared[0];
 47 |   } 
 48 | 
 49 | 
 50 |   // otherwise reduce again in the first warp
 51 |   is_neighbor = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane] : false;
 52 |   if (warp == 0) {
 53 |     #if __CUDACC_VER_MAJOR__ >= 9
 54 |     active = __activemask();
 55 |     is_neighbor = __any_sync(active, is_neighbor);
 56 |     #else
 57 |     is_neighbor = __any(is_neighbor);
 58 |     #endif
 59 |     // broadcast back to shared memory
 60 |     if (threadIdx.x == 0) {
 61 |         shared[0] = is_neighbor;
 62 |     }
 63 |   }
 64 |   __syncthreads();
 65 |   return shared[0];
 66 | }
 67 | 
 68 | __inline__ __device__
 69 | void SearchHeuristic(
 70 |     Neighbor* ef_const_pq, int* size,
 71 |     const int srcid, const int* nodes,
 72 |     const cuda_scalar* data, const int dist_type, const int num_dims,
 73 |     const int ef_construction, const int max_m,
 74 |     const bool save_remains,
 75 |     int* cand_nodes, cuda_scalar* cand_distances, 
 76 |     int* graph, float* distances, int* deg, 
 77 |     const float heuristic_coef, const int new_comer = -1) {
 78 |   int size2 = *size;
 79 |   __syncthreads();
 80 | 
 81 |   // get sorted neighbors
 82 |   if (threadIdx.x == 0) {
 83 |     while (*size > 0) {
 84 |       cand_nodes[(*size) - 1] = ef_const_pq[0].nodeid;
 85 |       cand_distances[(*size) - 1] = ef_const_pq[0].distance;
 86 |       PqPop(ef_const_pq, size);
 87 |     }
 88 |   }
 89 |   __syncthreads();
 90 | 
 91 |   // set variables for search heuristic
 92 |   int head = 0;
 93 |   int tail = max_m - 1;
 94 |   if (tail > size2 - 1)
 95 |     tail = size2 - 1;
 96 |   const int max_head = tail + 1;
 97 |   
 98 |   // take some proportion of closest nodes by default
 99 |   // this mechanism does not exist in hnswlib
100 |   // it refers to https://github.com/kakao/n2/blob/36888c3869ac478d896d0921ac64f21930d85659/src/heuristic.cc#L42
101 |   const int nn_num = max_m * heuristic_coef;
102 |   
103 |   int* _graph = graph + srcid * max_m;
104 |   float* _distances = distances + srcid * max_m;
105 |   bool new_comer_inserted = false;
106 |   // search heuristic
107 |   for (int j = 0; j < size2; ++j) {
108 |     if (head >= max_m) break;
109 |     bool freepass = head < nn_num or 
110 |       (new_comer >= 0 and not new_comer_inserted and cand_nodes[j] != new_comer);
111 |     if (freepass) {
112 |       if (threadIdx.x == 0) {
113 |         _graph[head] = cand_nodes[j];
114 |         _distances[head] = out_scalar(cand_distances[j]);
115 |       }
116 |       head++;
117 |       __syncthreads();
118 |       continue;
119 |     }
120 |     const cuda_scalar dist_to_src = cand_distances[j];
121 |     bool skip = false;
122 |     if (new_comer >= 0 and new_comer_inserted) {
123 |       cuda_scalar dist = GetDistance(cand_nodes[j], new_comer,
124 |           num_dims, dist_type, nodes, data);
125 |       skip = gt(dist_to_src, dist);
126 |     } else {
127 |       for (int k = 0; k < head; ++k) {
128 |         cuda_scalar dist = GetDistance(cand_nodes[j], _graph[k],
129 |             num_dims, dist_type, nodes, data);
130 |         if (gt(dist_to_src, dist)) {
131 |           skip = true;
132 |           __syncthreads();
133 |           break;
134 |         }
135 |       }
136 |     }
137 |     if (cand_nodes[j] == new_comer and not skip) 
138 |       new_comer_inserted = true;
139 | 
140 |     if (skip and tail >= head) {
141 |       if (threadIdx.x == 0) {
142 |         _graph[tail] = cand_nodes[j];
143 |         _distances[tail] = out_scalar(cand_distances[j]);
144 |       }
145 |       tail--;
146 |     } else if (not skip){
147 |       if (threadIdx.x == 0) {
148 |         _graph[head] = cand_nodes[j];
149 |         _distances[head] = out_scalar(cand_distances[j]);
150 |       }
151 |       head++;
152 |     }
153 |   }
154 |   __syncthreads();
155 | 
156 |   // copy to graph
157 |   // take remaining nodes as new neighbors
158 |   // it also refers to https://github.com/kakao/n2/blob/36888c3869ac478d896d0921ac64f21930d85659/src/heuristic.cc#L85
159 |   // it does not exist in hnswlib as well
160 |   if (threadIdx.x == 0) deg[srcid] = save_remains? max_head: head;
161 |   __syncthreads();
162 | }
163 | 
164 | __global__ void BuildLevelGraphKernel(
165 |   const cuda_scalar* data, const int* nodes,
166 |   const int num_dims, const int num_nodes, const int max_m, const int dist_type,
167 |   const bool save_remains, const int ef_construction, int* graph, float* distances, int* deg,
168 |   int* visited_table, int* visited_list, const int visited_table_size, const int visited_list_size, 
169 |   int* mutex, int64_t* acc_visited_cnt,
170 |   const bool reverse_cand, Neighbor* neighbors, int* global_cand_nodes, cuda_scalar* global_cand_distances,
171 |   const float heuristic_coef, int* backup_neighbors, cuda_scalar* backup_distances, bool* went_through_heuristic
172 |   ) {
173 | 
174 |   static __shared__ int size;
175 |   static __shared__ int visited_cnt;
176 |   
177 |   // storage to store neighbors and distnces temporarily
178 |   static __shared__ int backup_deg;
179 |   int* _backup_neighbors = backup_neighbors + max_m * blockIdx.x;
180 |   cuda_scalar* _backup_distances = backup_distances + max_m * blockIdx.x;
181 | 
182 |   Neighbor* ef_const_pq = neighbors + ef_construction * blockIdx.x;
183 |   int* cand_nodes = global_cand_nodes + ef_construction * blockIdx.x;
184 |   cuda_scalar* cand_distances = global_cand_distances + ef_construction * blockIdx.x;
185 |   int* _visited_table = visited_table + visited_table_size * blockIdx.x;
186 |   int* _visited_list = visited_list + visited_list_size * blockIdx.x;
187 | 
188 |   for (int i = blockIdx.x; i < num_nodes; i += gridDim.x) {
189 |     if (threadIdx.x == 0) {
190 |       size = 0;
191 |       visited_cnt = 0;
192 |     }
193 |     __syncthreads();
194 |     int srcid = i;
195 |     // read access of srcid
196 |     if (threadIdx.x == 0) {
197 |       while (atomicCAS(&mutex[srcid], 0, 1)) {}
198 |     }
199 |     __syncthreads();
200 | 
201 |     // initialize entries as neighbors
202 |     for (int j = max_m * i; j < max_m * i + deg[i]; ++j) {
203 |       int dstid = graph[j];
204 |       if (CheckVisited(_visited_table, _visited_list, visited_cnt, dstid, 
205 |             visited_table_size, visited_list_size)) 
206 |         continue;
207 |       __syncthreads();
208 | 
209 |       PushNodeToPq(ef_const_pq, &size, ef_construction,
210 |           data, num_dims, dist_type, srcid, dstid, nodes);
211 |     }
212 |     __syncthreads();
213 | 
214 |     // release lock
215 |     if (threadIdx.x == 0) mutex[srcid] = 0;
216 |     __syncthreads();
217 | 
218 |     // iterate until converge
219 |     int idx = GetCand(ef_const_pq, size, reverse_cand);
220 |     while (idx >= 0) {
221 |       __syncthreads();
222 |       if (threadIdx.x == 0) ef_const_pq[idx].checked = true;
223 |       int entry = ef_const_pq[idx].nodeid;
224 | 
225 |       // read access of entry
226 |       if (threadIdx.x == 0) {
227 |         while (atomicCAS(&mutex[entry], 0, 1)) {}
228 |       }
229 |       __syncthreads();
230 | 
231 |       for (int j = max_m * entry; j < max_m * entry + deg[entry]; ++j) {
232 |         int dstid = graph[j];
233 | 
234 |         if (CheckVisited(_visited_table, _visited_list, visited_cnt, dstid, 
235 |               visited_table_size, visited_list_size)) 
236 |           continue;
237 |         __syncthreads();
238 |         
239 |         PushNodeToPq(ef_const_pq, &size, ef_construction,
240 |             data, num_dims, dist_type, srcid, dstid, nodes);
241 |       }
242 |       __syncthreads();
243 | 
244 |       // release lock
245 |       if (threadIdx.x == 0) mutex[entry] = 0;
246 |       __syncthreads();
247 |       idx = GetCand(ef_const_pq, size, reverse_cand);
248 |     }
249 | 
250 |     __syncthreads();
251 |     if (threadIdx.x == 0) {
252 |       acc_visited_cnt[blockIdx.x] += visited_cnt;
253 |     }
254 |     for (int j = threadIdx.x; j < visited_cnt; j += blockDim.x) {
255 |       _visited_table[_visited_list[j]] = -1;
256 |     }
257 |     __syncthreads();
258 | 
259 |     // write access of dstid
260 |     if (threadIdx.x == 0) {
261 |       while (atomicCAS(&mutex[srcid], 0, 1)) {}
262 |     }
263 |     __syncthreads();
264 | 
265 |     for (int j = 0; j < deg[srcid]; ++j) {
266 |       int dstid = graph[srcid * max_m + j];
267 |       PushNodeToPq(ef_const_pq, &size, ef_construction,
268 |           data, num_dims, dist_type, srcid, dstid, nodes);
269 |     }
270 | 
271 |     // run search heuristic for myself
272 |     SearchHeuristic(ef_const_pq, &size, srcid, nodes,
273 |         data, dist_type, num_dims,
274 |         ef_construction, max_m, save_remains,
275 |         cand_nodes, cand_distances, 
276 |         graph, distances, deg, heuristic_coef);
277 |     
278 |     if (threadIdx.x == 0) went_through_heuristic[srcid] = true;
279 | 
280 |     __syncthreads();
281 |     
282 |     // backup neighbors to handle overwrite
283 |     if (threadIdx.x == 0) backup_deg = deg[srcid];
284 |     __syncthreads();
285 |     for (int j = threadIdx.x; j < backup_deg; j += blockDim.x) {
286 |       _backup_neighbors[j] = graph[srcid * max_m + j];
287 |       _backup_distances[j] = conversion(distances[srcid * max_m + j]);
288 |     }
289 |     __syncthreads();
290 |     // release lock
291 |     if (threadIdx.x == 0) mutex[srcid] = 0;
292 |     __syncthreads();
293 | 
294 |     // run search heuristic for neighbors
295 |     for (int j = 0; j < backup_deg; ++j) {
296 |       int dstid = _backup_neighbors[j];
297 |       cuda_scalar dist = _backup_distances[j];
298 |       __syncthreads();
299 |       // write access of dstid
300 |       if (threadIdx.x == 0) {
301 |         while (atomicCAS(&mutex[dstid], 0, 1)) {}
302 |       }
303 |       __syncthreads();
304 |       
305 |       const int* _graph = graph + max_m * dstid;
306 |       const int _deg = deg[dstid];
307 |       bool is_neighbor = IsNeighbor(_graph, _deg, srcid);
308 |       if (not is_neighbor){
309 |         PushNodeToPq2(ef_const_pq, &size, ef_construction,
310 |             dist, dstid, srcid, nodes);
311 |         for (int k = 0; k < _deg; ++k) {
312 |           int dstid2 = _graph[k];
313 |           dist = conversion(distances[dstid * max_m + k]);
314 |           PushNodeToPq2(ef_const_pq, &size, ef_construction,
315 |               dist, dstid, dstid2, nodes);
316 |         }
317 | 
318 |         __syncthreads();
319 |         const int new_comer = not save_remains and went_through_heuristic[dstid]? srcid: -1;
320 |         __syncthreads();
321 |         SearchHeuristic(ef_const_pq, &size, dstid, nodes,
322 |             data, dist_type, num_dims,
323 |             ef_construction, max_m, save_remains,
324 |             cand_nodes, cand_distances, 
325 |             graph, distances, deg, heuristic_coef, new_comer);
326 |         if (threadIdx.x == 0) went_through_heuristic[dstid] = true;
327 |         __syncthreads();
328 |       }
329 |       // release lock
330 |       if (threadIdx.x == 0) mutex[dstid] = 0;
331 |       __syncthreads();
332 |     }
333 |     __syncthreads();
334 |   }
335 | 
336 |   // cooperative_groups::grid_group g = cooperative_groups::this_grid();
337 |   // g.sync();
338 | }
339 | 
340 | } // namespace cuhnsw
341 | 


--------------------------------------------------------------------------------
/cpp/src/cuhnsw_build.cu:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #include <iostream>
  7 | #include <algorithm>
  8 | 
  9 | #include "cuhnsw.hpp"
 10 | #include "cuda_search_kernels.cuh"
 11 | #include "cuda_build_kernels.cuh"
 12 | 
 13 | namespace cuhnsw {
 14 | 
 15 | void CuHNSW::GetDeviceInfo() {
 16 |   CHECK_CUDA(cudaGetDevice(&devId_));
 17 |   cudaDeviceProp prop;
 18 |   CHECK_CUDA(cudaGetDeviceProperties(&prop, devId_));
 19 |   mp_cnt_ = prop.multiProcessorCount;
 20 |   major_ = prop.major;
 21 |   minor_ = prop.minor;
 22 |   cores_ = -1;
 23 | }
 24 | 
 25 | void CuHNSW::GetEntryPoints(
 26 |     const std::vector<int>& nodes,
 27 |     std::vector<int>& entries,
 28 |     int level, bool search) {
 29 |   int size = nodes.size();
 30 |   
 31 |   // process input data for kernel
 32 |   LevelGraph& graph = level_graphs_[level];
 33 |   const std::vector<int>& upper_nodes = graph.GetNodes();
 34 |   int upper_size = upper_nodes.size();
 35 |   std::vector<int> deg(upper_size);
 36 |   std::vector<int> neighbors(upper_size * max_m_);
 37 |   for (int i = 0; i < upper_size; ++i) {
 38 |     const std::vector<std::pair<float, int>>& _neighbors = graph.GetNeighbors(upper_nodes[i]);
 39 |     deg[i] = _neighbors.size();
 40 |     int offset = max_m_ * i;
 41 |     for (int j = 0; j < deg[i]; ++j) {
 42 |       neighbors[offset + j] = graph.GetNodeId(_neighbors[j].second);
 43 |     }
 44 |   }
 45 |   for (int i = 0; i < size; ++i) {
 46 |     int entryid = graph.GetNodeId(entries[i]); 
 47 |     entries[i] = entryid;
 48 |   }
 49 |   
 50 |   // copy to gpu mem
 51 |   thrust::device_vector<int> dev_nodes(size), dev_entries(size);
 52 |   thrust::device_vector<int> dev_upper_nodes(upper_size), dev_deg(upper_size);
 53 |   thrust::device_vector<int> dev_neighbors(upper_size * max_m_);
 54 |   thrust::copy(nodes.begin(), nodes.end(), dev_nodes.begin());
 55 |   thrust::copy(entries.begin(), entries.end(), dev_entries.begin());
 56 |   thrust::copy(upper_nodes.begin(), upper_nodes.end(), dev_upper_nodes.begin());
 57 |   thrust::copy(deg.begin(), deg.end(), dev_deg.begin());
 58 |   thrust::copy(neighbors.begin(), neighbors.end(), dev_neighbors.begin());
 59 |   
 60 |   thrust::device_vector<bool> dev_visited(upper_size * block_cnt_, false);
 61 |   thrust::device_vector<int> dev_visited_list(visited_list_size_ * block_cnt_);
 62 |   thrust::device_vector<int64_t> dev_acc_visited_cnt(block_cnt_, 0);
 63 |   thrust::device_vector<cuda_scalar>& qdata = search? device_qdata_: device_data_;
 64 | 
 65 |   // run kernel
 66 |   GetEntryPointsKernel<<<block_cnt_, block_dim_>>>(
 67 |     thrust::raw_pointer_cast(qdata.data()),
 68 |     thrust::raw_pointer_cast(dev_nodes.data()),
 69 |     thrust::raw_pointer_cast(device_data_.data()),
 70 |     thrust::raw_pointer_cast(dev_upper_nodes.data()),
 71 |     num_dims_, size, upper_size, max_m_, dist_type_,
 72 |     thrust::raw_pointer_cast(dev_neighbors.data()),
 73 |     thrust::raw_pointer_cast(dev_deg.data()),
 74 |     thrust::raw_pointer_cast(dev_visited.data()),
 75 |     thrust::raw_pointer_cast(dev_visited_list.data()),
 76 |     visited_list_size_,
 77 |     thrust::raw_pointer_cast(dev_entries.data()),
 78 |     thrust::raw_pointer_cast(dev_acc_visited_cnt.data())
 79 |     );
 80 |   CHECK_CUDA(cudaDeviceSynchronize());
 81 |   // el_[GPU] += sw_[GPU].CheckPoint();
 82 |   thrust::copy(dev_entries.begin(), dev_entries.end(), entries.begin());
 83 |   std::vector<int64_t> acc_visited_cnt(block_cnt_);
 84 |   thrust::copy(dev_acc_visited_cnt.begin(), dev_acc_visited_cnt.end(), acc_visited_cnt.begin());
 85 |   CHECK_CUDA(cudaDeviceSynchronize());
 86 |   int64_t full_visited_cnt = std::accumulate(acc_visited_cnt.begin(), acc_visited_cnt.end(), 0);
 87 |   DEBUG("full visited cnt: {}", full_visited_cnt);
 88 |   
 89 |   // set output
 90 |   for (int i = 0; i < size; ++i) {
 91 |     entries[i] = upper_nodes[entries[i]];
 92 |   }
 93 | 
 94 | }
 95 | 
 96 | void CuHNSW::BuildGraph() {
 97 |   visited_ = new bool[batch_size_ * num_data_];
 98 |   for (int level = max_level_; level >= 0; --level) {
 99 |     DEBUG("build graph of level: {}", level);
100 |     BuildLevelGraph(level);
101 |   }
102 | }
103 | 
104 | void CuHNSW::BuildLevelGraph(int level) {
105 |   std::set<int> upper_nodes;
106 |   std::vector<int> new_nodes;
107 |   LevelGraph& graph = level_graphs_[level];
108 |   const std::vector<int>& nodes = graph.GetNodes();
109 |   int size = nodes.size();
110 |   int max_m = level > 0? max_m_: max_m0_;
111 |   thrust::host_vector<int> graph_vec(size * max_m, 0);
112 |   thrust::host_vector<int> deg(size, 0);
113 |   if (level < max_level_) {
114 |     LevelGraph& upper_graph = level_graphs_[level + 1];
115 |     for (auto& node: upper_graph.GetNodes()) {
116 |       upper_nodes.insert(node);
117 |       int srcid = graph.GetNodeId(node);
118 |       int idx = 0;
119 |       for (auto& nb: upper_graph.GetNeighbors(node)) {
120 |         int dstid = graph.GetNodeId(nb.second);
121 |         graph_vec[max_m * srcid + (idx++)] = dstid;
122 |       }
123 |       deg[srcid] = idx;
124 |     }
125 |   }
126 | 
127 |   for (auto& node: graph.GetNodes()) {
128 |     if (upper_nodes.count(node)) continue;
129 |     new_nodes.push_back(node);
130 |   }
131 |   
132 |   // initialize entries
133 |   std::vector<int> entries(new_nodes.size(), enter_point_);
134 | 
135 |   for (int l = max_level_; l > level; --l)
136 |     GetEntryPoints(new_nodes, entries, l, false);
137 |   for (int i = 0; i < new_nodes.size(); ++i) {
138 |     int srcid = graph.GetNodeId(new_nodes[i]);
139 |     int dstid = graph.GetNodeId(entries[i]);
140 |     graph_vec[max_m * srcid] = dstid;
141 |     deg[srcid] = 1;
142 |   }
143 | 
144 |   thrust::device_vector<int> device_graph(max_m * size);
145 |   thrust::device_vector<float> device_distances(max_m * size);
146 |   thrust::device_vector<int> device_deg(size);
147 |   thrust::device_vector<int> device_nodes(size);
148 |   thrust::device_vector<int> device_visited_table(visited_table_size_ * block_cnt_, -1);
149 |   thrust::device_vector<int> device_visited_list(visited_list_size_ * block_cnt_);
150 |   thrust::device_vector<int> device_mutex(size, 0);
151 |   thrust::device_vector<int64_t> device_acc_visited_cnt(block_cnt_, 0);
152 |   thrust::device_vector<Neighbor> device_neighbors(ef_construction_ * block_cnt_);
153 |   thrust::device_vector<int> device_cand_nodes(ef_construction_ * block_cnt_);
154 |   thrust::device_vector<cuda_scalar> device_cand_distances(ef_construction_ * block_cnt_);
155 |   thrust::device_vector<int> device_backup_neighbors(max_m * block_cnt_);
156 |   thrust::device_vector<cuda_scalar> device_backup_distances(max_m * block_cnt_);
157 |   thrust::device_vector<bool> device_went_through_heuristic(size, false);
158 | 
159 |   thrust::copy(graph_vec.begin(), graph_vec.end(), device_graph.begin());
160 |   thrust::copy(deg.begin(), deg.end(), device_deg.begin());
161 |   thrust::copy(nodes.begin(), nodes.end(), device_nodes.begin());
162 | 
163 |   BuildLevelGraphKernel<<<block_cnt_, block_dim_>>>(
164 |     thrust::raw_pointer_cast(device_data_.data()),
165 |     thrust::raw_pointer_cast(device_nodes.data()),
166 |     num_dims_, size, max_m, dist_type_, save_remains_,
167 |     ef_construction_,
168 |     thrust::raw_pointer_cast(device_graph.data()),
169 |     thrust::raw_pointer_cast(device_distances.data()),
170 |     thrust::raw_pointer_cast(device_deg.data()),
171 |     thrust::raw_pointer_cast(device_visited_table.data()),
172 |     thrust::raw_pointer_cast(device_visited_list.data()),
173 |     visited_table_size_, visited_list_size_,
174 |     thrust::raw_pointer_cast(device_mutex.data()),
175 |     thrust::raw_pointer_cast(device_acc_visited_cnt.data()),
176 |     reverse_cand_,
177 |     thrust::raw_pointer_cast(device_neighbors.data()),
178 |     thrust::raw_pointer_cast(device_cand_nodes.data()),
179 |     thrust::raw_pointer_cast(device_cand_distances.data()),
180 |     heuristic_coef_,
181 |     thrust::raw_pointer_cast(device_backup_neighbors.data()),
182 |     thrust::raw_pointer_cast(device_backup_distances.data()),
183 |     thrust::raw_pointer_cast(device_went_through_heuristic.data())
184 |     );
185 |   CHECK_CUDA(cudaDeviceSynchronize());
186 |   thrust::copy(device_deg.begin(), device_deg.end(), deg.begin());
187 |   thrust::copy(device_graph.begin(), device_graph.end(), graph_vec.begin());
188 |   std::vector<float> distances(max_m * size);
189 |   thrust::copy(device_distances.begin(), device_distances.end(), distances.begin());
190 | 
191 |   std::vector<int64_t> acc_visited_cnt(block_cnt_);
192 |   thrust::copy(device_acc_visited_cnt.begin(), device_acc_visited_cnt.end(), acc_visited_cnt.begin());
193 |   CHECK_CUDA(cudaDeviceSynchronize());
194 |   int64_t full_visited_cnt = std::accumulate(acc_visited_cnt.begin(), acc_visited_cnt.end(), 0LL);
195 |   DEBUG("full number of visited nodes: {}", full_visited_cnt);
196 | 
197 |   for (auto& node: graph.GetNodes()) {
198 |     graph.ClearEdges(node);
199 |   }
200 |   for (int i = 0; i < size; ++i) {
201 |     int src = nodes[i];
202 |     for (int j = 0; j < deg[i]; ++j) {
203 |       int dst = nodes[graph_vec[i * max_m + j]];
204 |       float dist = distances[i * max_m + j];
205 |       graph.AddEdge(src, dst, dist);
206 |     }
207 |   }
208 | }
209 | 
210 | void CuHNSW::SearchGraph(const float* qdata, const int num_queries, const int topk, const int ef_search, 
211 |     int* nns, float* distances, int* found_cnt) {
212 |   device_qdata_.resize(num_queries * num_dims_);
213 |   #ifdef HALF_PRECISION
214 |     std::vector<cuda_scalar> hdata(num_queries * num_dims_);
215 |     for (int i = 0; i < num_queries * num_dims_; ++i)
216 |       hdata[i] = conversion(qdata[i]);
217 |     thrust::copy(hdata.begin(), hdata.end(), device_qdata_.begin());
218 |   #else
219 |     thrust::copy(qdata, qdata + num_queries * num_dims_, device_qdata_.begin());
220 |   #endif
221 |   std::vector<int> qnodes(num_queries);
222 |   std::iota(qnodes.begin(), qnodes.end(), 0);
223 |   std::vector<int> entries(num_queries, enter_point_);
224 |   for (int l = max_level_; l > 0; --l)
225 |     GetEntryPoints(qnodes, entries, l, true);
226 |   std::vector<int> graph_vec(max_m0_ * num_data_);
227 |   std::vector<int> deg(num_data_);
228 |   LevelGraph graph = level_graphs_[0];
229 |   for (int i = 0; i < num_data_; ++i) {
230 |     const std::vector<std::pair<float, int>>& neighbors = graph.GetNeighbors(i);
231 |     int nbsize = neighbors.size();
232 |     int offset = i * max_m0_;
233 |     for (int j = 0; j < nbsize; ++j)
234 |       graph_vec[offset + j] = neighbors[j].second;
235 |     deg[i] = nbsize;
236 |   }
237 |   
238 |   thrust::device_vector<int> device_graph(max_m0_ * num_data_);
239 |   thrust::device_vector<int> device_deg(num_data_);
240 |   thrust::device_vector<int> device_entries(num_queries);
241 |   thrust::device_vector<int> device_nns(num_queries * topk);
242 |   thrust::device_vector<float> device_distances(num_queries * topk);
243 |   thrust::device_vector<int> device_found_cnt(num_queries);
244 |   thrust::device_vector<int> device_visited_table(visited_table_size_ * block_cnt_, -1);
245 |   thrust::device_vector<int> device_visited_list(visited_list_size_ * block_cnt_);
246 |   thrust::device_vector<int64_t> device_acc_visited_cnt(block_cnt_, 0);
247 |   thrust::device_vector<Neighbor> device_neighbors(ef_search * block_cnt_);
248 |   thrust::device_vector<int> device_cand_nodes(ef_search * block_cnt_);
249 |   thrust::device_vector<cuda_scalar> device_cand_distances(ef_search * block_cnt_);
250 | 
251 |   thrust::copy(graph_vec.begin(), graph_vec.end(), device_graph.begin());
252 |   thrust::copy(deg.begin(), deg.end(), device_deg.begin());
253 |   thrust::copy(entries.begin(), entries.end(), device_entries.begin());
254 |   SearchGraphKernel<<<block_cnt_, block_dim_>>>(
255 |     thrust::raw_pointer_cast(device_qdata_.data()),
256 |     num_queries,
257 |     thrust::raw_pointer_cast(device_data_.data()),
258 |     num_data_, num_dims_, max_m0_, dist_type_, ef_search, 
259 |     thrust::raw_pointer_cast(device_entries.data()),
260 |     thrust::raw_pointer_cast(device_graph.data()),
261 |     thrust::raw_pointer_cast(device_deg.data()), 
262 |     topk,
263 |     thrust::raw_pointer_cast(device_nns.data()), 
264 |     thrust::raw_pointer_cast(device_distances.data()), 
265 |     thrust::raw_pointer_cast(device_found_cnt.data()), 
266 |     thrust::raw_pointer_cast(device_visited_table.data()),
267 |     thrust::raw_pointer_cast(device_visited_list.data()),
268 |     visited_table_size_, visited_list_size_,
269 |     thrust::raw_pointer_cast(device_acc_visited_cnt.data()),
270 |     reverse_cand_,
271 |     thrust::raw_pointer_cast(device_neighbors.data()),
272 |     thrust::raw_pointer_cast(device_cand_nodes.data()),
273 |     thrust::raw_pointer_cast(device_cand_distances.data())
274 |     );
275 |   CHECK_CUDA(cudaDeviceSynchronize());
276 |   std::vector<int64_t> acc_visited_cnt(block_cnt_);
277 |   thrust::copy(device_acc_visited_cnt.begin(), device_acc_visited_cnt.end(), acc_visited_cnt.begin());
278 |   thrust::copy(device_nns.begin(), device_nns.end(), nns);
279 |   thrust::copy(device_distances.begin(), device_distances.end(), distances);
280 |   thrust::copy(device_found_cnt.begin(), device_found_cnt.end(), found_cnt);
281 |   CHECK_CUDA(cudaDeviceSynchronize());
282 |   int64_t full_visited_cnt = std::accumulate(acc_visited_cnt.begin(), acc_visited_cnt.end(), 0LL);
283 |   DEBUG("full number of visited nodes: {}", full_visited_cnt);
284 |   if (labelled_)
285 |     for (int i = 0; i < num_queries * topk; ++i)
286 |       nns[i] = labels_[nns[i]];
287 |   
288 |   device_qdata_.clear();
289 |   device_qdata_.shrink_to_fit();
290 | }
291 | 
292 | } // namespace cuhnsw
293 | 


--------------------------------------------------------------------------------
/cpp/src/cuhnsw_base.cu:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #include <iostream>
  7 | #include <algorithm>
  8 | 
  9 | #include "cuhnsw.hpp"
 10 | 
 11 | namespace cuhnsw {
 12 | 
 13 | CuHNSW::CuHNSW() {
 14 |   logger_ = CuHNSWLogger().get_logger();
 15 | 
 16 |   GetDeviceInfo();
 17 |   // reference: https://stackoverflow.com/a/32531982
 18 |   switch (major_){
 19 |     case 2: // Fermi
 20 |       if (minor_ == 1) 
 21 |         cores_ = mp_cnt_ * 48;
 22 |       else 
 23 |         cores_ = mp_cnt_ * 32;
 24 |       break;
 25 |     case 3: // Kepler
 26 |       cores_ = mp_cnt_ * 192;
 27 |       break;
 28 |     case 5: // Maxwell
 29 |       cores_ = mp_cnt_ * 128;
 30 |       break;
 31 |     case 6: // Pascal
 32 |       if (minor_ == 1 or minor_ == 2) 
 33 |         cores_ = mp_cnt_ * 128;
 34 |       else if (minor_ == 0) 
 35 |         cores_ = mp_cnt_ * 64;
 36 |       else 
 37 |         DEBUG0("Unknown device type");
 38 |       break;
 39 |     case 7: // Volta and Turing
 40 |       if (minor_ == 0 or minor_ == 5) 
 41 |         cores_ = mp_cnt_ * 64;
 42 |       else 
 43 |         DEBUG0("Unknown device type");
 44 |       break;
 45 |     case 8: // Ampere
 46 |       if (minor_ == 0) 
 47 |         cores_ = mp_cnt_ * 64;
 48 |       else if (minor_ == 6) 
 49 |         cores_ = mp_cnt_ * 128;
 50 |       else 
 51 |         DEBUG0("Unknown device type");
 52 |       break;
 53 |     default:
 54 |       DEBUG0("Unknown device type"); 
 55 |       break;
 56 |   }
 57 |   if (cores_ == -1) cores_ = mp_cnt_ * 128;
 58 |   INFO("cuda device info, major: {}, minor: {}, multi processors: {}, cores: {}",
 59 |        major_, minor_, mp_cnt_, cores_);
 60 |   // sw_.resize(PROFILE_SIZE);
 61 |   // el_.resize(PROFILE_SIZE);
 62 | }
 63 | 
 64 | 
 65 | CuHNSW::~CuHNSW() {}
 66 | 
 67 | bool CuHNSW::Init(std::string opt_path) {
 68 |   std::ifstream in(opt_path.c_str());
 69 |   if (not in.is_open()) return false;
 70 | 
 71 |   std::string str((std::istreambuf_iterator<char>(in)),
 72 |       std::istreambuf_iterator<char>());
 73 |   std::string err_cmt;
 74 |   auto _opt = json11::Json::parse(str, err_cmt);
 75 |   if (not err_cmt.empty()) return false;
 76 |   opt_ = _opt;
 77 |   max_m_ = opt_["max_m"].int_value();
 78 |   max_m0_ = opt_["max_m0"].int_value();
 79 |   save_remains_ = opt_["save_remains"].bool_value();
 80 |   ef_construction_ = opt_["ef_construction"].int_value();
 81 |   level_mult_ = opt_["level_mult"].number_value();
 82 |   batch_size_ = opt_["batch_size"].int_value();
 83 |   block_dim_ = opt_["block_dim"].int_value();
 84 |   visited_table_size_ = opt_["visited_table_size"].int_value();
 85 |   visited_list_size_ = opt_["visited_list_size"].int_value();
 86 |   if (not visited_table_size_)
 87 |     visited_table_size_ = visited_list_size_ * 2;
 88 |   heuristic_coef_ = opt_["heuristic_coef"].number_value();
 89 |   std::string dist_type = opt_["dist_type"].string_value();
 90 |   reverse_cand_ = opt_["reverse_cand"].bool_value();
 91 |   if (dist_type == "dot") {
 92 |     dist_type_ = DOT;
 93 |   } else if (dist_type == "l2") {
 94 |     dist_type_ = L2;
 95 |   } else {
 96 |     char buf[4096];
 97 |     snprintf(buf, sizeof(buf), "invalid dist type %s",
 98 |         dist_type.c_str());
 99 |     std::string msg(buf);
100 |     throw std::runtime_error(msg);
101 |   }
102 |   CuHNSWLogger().set_log_level(opt_["c_log_level"].int_value());
103 |   DEBUG("max_m: {}, max_m0: {}, save_remains: {}, ef_construction: {}, level_mult: {}, dist_type: {}",
104 |       max_m_, max_m0_, save_remains_, ef_construction_, level_mult_, dist_type);
105 |   return true;
106 | }
107 | 
108 | void CuHNSW::SetData(const float* data, int num_data, int num_dims) {
109 |   num_data_ = num_data;
110 |   num_dims_ = num_dims;
111 |   block_cnt_ = opt_["hyper_threads"].number_value() * (cores_ / block_dim_);
112 |   DEBUG("copy data ({} x {}), block_cnt: {}, block_dim: {}",
113 |       num_data, num_dims, block_cnt_, block_dim_);
114 |   device_data_.resize(num_data * num_dims);
115 |   #ifdef HALF_PRECISION
116 |     // DEBUG0("fp16")
117 |     std::vector<cuda_scalar> hdata(num_data * num_dims);
118 |     for (int i = 0; i < num_data * num_dims; ++i) {
119 |       hdata[i] = conversion(data[i]);
120 |       // DEBUG("hdata i: {}, scalar: {}", i, out_scalar(hdata[i]));
121 |     }
122 |     thrust::copy(hdata.begin(), hdata.end(), device_data_.begin());
123 |   #else
124 |     // DEBUG0("fp32")
125 |     thrust::copy(data, data + num_data * num_dims, device_data_.begin());
126 |   #endif
127 |   data_ = data;
128 | }
129 | 
130 | void CuHNSW::SetRandomLevels(const int* levels) {
131 |   levels_.resize(num_data_);
132 |   DEBUG("set levels of data (length: {})", num_data_)
133 |   max_level_ = 0;
134 |   std::vector<std::vector<int>> level_nodes(1);
135 |   for (int i = 0; i < num_data_; ++i) {
136 |     levels_[i] = levels[i];
137 |     if (levels[i] > max_level_) {
138 |       max_level_ = levels[i];
139 |       level_nodes.resize(max_level_ + 1);
140 |       enter_point_ = i;
141 |     }
142 |     for (int l = 0; l <= levels[i]; ++l)
143 |       level_nodes[l].push_back(i);
144 |   }
145 |   DEBUG("max level: {}", max_level_)
146 |   for (int i = 0; i <= max_level_; ++i)
147 |     DEBUG("number of data in level {}: {}",
148 |         i, level_nodes[i].size());
149 |   level_graphs_.clear();
150 |   for (int i = 0; i <= max_level_; ++i) {
151 |     LevelGraph graph = LevelGraph();
152 |     graph.SetNodes(level_nodes[i],
153 |         num_data_, ef_construction_);
154 |     level_graphs_.push_back(graph);
155 |   }
156 | }
157 | 
158 | // save graph compatible with hnswlib (https://github.com/nmslib/hnswlib)
159 | void CuHNSW::SaveIndex(std::string fpath) {
160 |   std::ofstream output(fpath);
161 |   DEBUG("save index to {}", fpath);
162 |   
163 |   // write meta values
164 |   DEBUG0("write meta values"); 
165 |   size_t data_size = num_dims_ * sizeof(scalar);
166 |   size_t max_elements = num_data_;
167 |   size_t cur_element_count = num_data_;
168 |   size_t M = max_m_;
169 |   size_t maxM = max_m_;
170 |   size_t maxM0 = max_m0_;
171 |   int maxlevel = max_level_;
172 |   size_t size_links_level0 = maxM0 * sizeof(tableint) + sizeof(sizeint);
173 |   size_t size_links_per_element = maxM * sizeof(tableint) + sizeof(sizeint);
174 |   size_t size_data_per_element = size_links_level0 + data_size + sizeof(labeltype);
175 |   size_t ef_construction = ef_construction_; 
176 |   double mult = level_mult_; 
177 |   size_t offsetData = size_links_level0;
178 |   size_t label_offset = size_links_level0 + data_size;
179 |   size_t offsetLevel0 = 0;
180 |   tableint enterpoint_node = enter_point_;
181 |   
182 |   writeBinaryPOD(output, offsetLevel0);
183 |   writeBinaryPOD(output, max_elements);
184 |   writeBinaryPOD(output, cur_element_count);
185 |   writeBinaryPOD(output, size_data_per_element);
186 |   writeBinaryPOD(output, label_offset);
187 |   writeBinaryPOD(output, offsetData);
188 |   writeBinaryPOD(output, maxlevel);
189 |   writeBinaryPOD(output, enterpoint_node);
190 |   writeBinaryPOD(output, maxM);
191 |   writeBinaryPOD(output, maxM0);
192 |   writeBinaryPOD(output, M);
193 |   writeBinaryPOD(output, mult);
194 |   writeBinaryPOD(output, ef_construction);
195 | 
196 |   // write level0 links and data
197 |   DEBUG0("write level0 links and data"); 
198 |   char* data_level0_memory = (char*) malloc(cur_element_count * size_data_per_element);
199 |   LevelGraph& graph = level_graphs_[0];
200 |   std::vector<tableint> links;
201 |   links.reserve(max_m0_);
202 |   size_t offset = 0;
203 |   for (int i = 0; i < cur_element_count; ++i) {
204 |     links.clear();
205 |     for (const auto& pr: graph.GetNeighbors(i))
206 |       links.push_back(static_cast<tableint>(pr.second));
207 |     
208 |     sizeint size = links.size();
209 |     memcpy(data_level0_memory + offset, &size, sizeof(sizeint));
210 |     offset += sizeof(sizeint);
211 |     if (size > 0)
212 |       memcpy(data_level0_memory + offset, &links[0], sizeof(tableint) * size);
213 |     offset += maxM0 * sizeof(tableint); 
214 |     memcpy(data_level0_memory + offset, &data_[i * num_dims_], data_size);
215 |     offset += data_size;
216 |     labeltype label = i;
217 |     memcpy(data_level0_memory + offset, &label, sizeof(labeltype));
218 |     offset += sizeof(labeltype);
219 |   }
220 |   output.write(data_level0_memory, cur_element_count * size_data_per_element);
221 |   
222 |   // write upper layer links
223 |   DEBUG0("write upper layer links");
224 |   for (int i = 0; i < num_data_; ++i) {
225 |     unsigned int size = size_links_per_element * levels_[i];
226 |     writeBinaryPOD(output, size);
227 |     char* mem = (char*) malloc(size);
228 |     offset = 0;
229 |     if (size) {
230 |       for (int j = 1; j <= levels_[i]; ++j) {
231 |         links.clear();
232 |         LevelGraph& upper_graph = level_graphs_[j];
233 |         for (const auto& pr: upper_graph.GetNeighbors(i))
234 |           links.push_back(static_cast<tableint>(pr.second));
235 |         sizeint link_size = links.size();
236 |         memcpy(mem + offset, &link_size, sizeof(sizeint));
237 |         offset += sizeof(sizeint);
238 |         if (link_size > 0)
239 |           memcpy(mem + offset, &links[0], sizeof(tableint) * link_size);
240 |         offset += sizeof(tableint) * maxM;
241 |       }
242 |       output.write(mem, size);
243 |     }
244 |   }
245 | 
246 |   output.close();
247 | }
248 | 
249 | // load graph compatible with hnswlib (https://github.com/nmslib/hnswlib)
250 | void CuHNSW::LoadIndex(std::string fpath) {
251 |   std::ifstream input(fpath, std::ios::binary);
252 |   DEBUG("load index from {}", fpath);
253 |   
254 |   // reqd meta values
255 |   DEBUG0("read meta values"); 
256 |   size_t offsetLevel0, max_elements, cur_element_count;
257 |   size_t size_data_per_element, label_offset, offsetData;
258 |   int maxlevel; 
259 |   tableint enterpoint_node = enter_point_;
260 |   size_t maxM, maxM0, M;
261 |   double mult;
262 |   size_t ef_construction;
263 | 
264 |   readBinaryPOD(input, offsetLevel0);
265 |   readBinaryPOD(input, max_elements);
266 |   readBinaryPOD(input, cur_element_count);
267 |   readBinaryPOD(input, size_data_per_element);
268 |   readBinaryPOD(input, label_offset);
269 |   readBinaryPOD(input, offsetData);
270 |   readBinaryPOD(input, maxlevel);
271 |   readBinaryPOD(input, enterpoint_node);
272 |   readBinaryPOD(input, maxM);
273 |   readBinaryPOD(input, maxM0);
274 |   readBinaryPOD(input, M);
275 |   readBinaryPOD(input, mult);
276 |   readBinaryPOD(input, ef_construction);
277 |   size_t size_per_link = maxM * sizeof(tableint) + sizeof(sizeint);
278 |   num_data_ = cur_element_count;
279 |   max_m_ = maxM;
280 |   max_m0_ = maxM0;
281 |   enter_point_ = enterpoint_node;
282 |   ef_construction_ = ef_construction;
283 |   max_level_ = maxlevel;
284 |   level_mult_ = mult;
285 |   num_dims_ = (label_offset - offsetData) / sizeof(scalar);
286 |   DEBUG("meta values loaded, num_data: {}, num_dims: {}, max_m: {}, max_m0: {}, enter_point: {}, max_level: {}",
287 |       num_data_, num_dims_, max_m_, max_m0_, enter_point_, max_level_);
288 | 
289 |   char* data_level0_memory = (char*) malloc(max_elements * size_data_per_element);
290 |   input.read(data_level0_memory, cur_element_count * size_data_per_element);
291 |   
292 |   // reset level graphs
293 |   level_graphs_.clear();
294 |   level_graphs_.shrink_to_fit();
295 |   level_graphs_.resize(max_level_ + 1);
296 |   
297 |   // load data and level0 links
298 |   DEBUG0("load level0 links and data");
299 |   DEBUG("level0 count: {}", cur_element_count);
300 |   std::vector<float> data(num_data_ * num_dims_);
301 |   size_t offset = 0;
302 |   std::vector<tableint> links(max_m0_);
303 |   std::vector<scalar> vec_data(num_dims_);
304 |   LevelGraph& graph0 = level_graphs_[0];
305 |   std::vector<std::vector<int>> nodes(max_level_ + 1);
306 |   nodes[0].resize(cur_element_count);
307 |   std::iota(nodes[0].begin(), nodes[0].end(), 0);
308 |   graph0.SetNodes(nodes[0], num_data_, ef_construction_);
309 |   labels_.clear(); labelled_ = true;
310 |   for (int i = 0; i < cur_element_count; ++i) {
311 |     sizeint deg;
312 |     memcpy(&deg, data_level0_memory + offset, sizeof(sizeint));
313 |     offset += sizeof(sizeint);
314 |     memcpy(&links[0], data_level0_memory + offset, sizeof(tableint) * max_m0_);
315 |     for (int j = 0; j < deg; ++j)
316 |       graph0.AddEdge(i, links[j], 0);
317 |     offset += sizeof(tableint) * max_m0_;
318 |     memcpy(&vec_data[0], data_level0_memory + offset, sizeof(scalar) * num_dims_);
319 |     for (int j = 0; j < num_dims_; ++j)
320 |       data[num_dims_ * i + j] = vec_data[j];
321 |     offset += sizeof(scalar) * num_dims_;
322 |     labeltype label;
323 |     memcpy(&label, data_level0_memory + offset, sizeof(labeltype));
324 |     labels_.push_back(static_cast<int>(label));
325 |     offset += sizeof(labeltype);
326 |   }
327 |   SetData(&data[0], num_data_, num_dims_);
328 |   
329 |   // load upper layer links
330 |   DEBUG0("load upper layer links");
331 |   std::vector<std::vector<std::pair<int, int>>> links_data(max_level_ + 1);
332 |   links.resize(max_m_);
333 |   levels_.resize(cur_element_count);
334 |   for (int i = 0; i < cur_element_count; ++i) {
335 |     unsigned int linksize;
336 |     readBinaryPOD(input, linksize);
337 |     if (not linksize) continue;
338 |     char* buffer = (char*) malloc(linksize);
339 |     input.read(buffer, linksize);
340 |     size_t levels = linksize / size_per_link;
341 |     size_t offset = 0;
342 |     levels_[i] = levels + 1;
343 |     for (int j = 1; j <= levels; ++j) {
344 |       nodes[j].push_back(i);
345 |       sizeint deg;
346 |       memcpy(&deg, buffer + offset, sizeof(sizeint));
347 |       offset += sizeof(sizeint);
348 |       memcpy(&links[0], buffer + offset, sizeof(tableint) * deg);
349 |       offset += sizeof(tableint) * max_m_;
350 |       for (int k = 0; k < deg; ++k)
351 |         links_data[j].emplace_back(i, links[k]); 
352 |     }
353 |   }
354 | 
355 |   for (int i = 1; i <= max_level_; ++i) {
356 |     LevelGraph& graph = level_graphs_[i];
357 |     DEBUG("level {} count: {}", i, nodes[i].size());
358 |     graph.SetNodes(nodes[i], num_data_, ef_construction_);
359 |     for (const auto& pr: links_data[i]) {
360 |       graph.AddEdge(pr.first, pr.second, 0);
361 |     }
362 |   }
363 | 
364 |   input.close();
365 | }
366 | 
367 | } // namespace cuhnsw
368 | 


--------------------------------------------------------------------------------