├── examples
│   ├── .gitignore
│   ├── requirements.txt
│   ├── README.md
│   └── example1.py
├── cuhnsw
│   ├── .gitignore
│   ├── __init__.py
│   ├── proto
│   │   └── config.proto
│   ├── bindings.cc
│   ├── pyhnsw.py
│   └── aux.py
├── requirements.txt
├── pyproject.toml
├── .gitmodules
├── MANIFEST.in
├── cpp
│   ├── include
│   │   ├── stop_watch.hpp
│   │   ├── cuda_base_kernels.cuh
│   │   ├── cuda_heap_kernels.cuh
│   │   ├── types.hpp
│   │   ├── log.hpp
│   │   ├── level_graph.hpp
│   │   ├── cuhnsw.hpp
│   │   ├── cuda_dist_kernels.cuh
│   │   ├── cuda_search_kernels.cuh
│   │   ├── cuda_utils_kernels.cuh
│   │   └── cuda_build_kernels.cuh
│   └── src
│       ├── log.cc
│       ├── cuhnsw_build.cu
│       └── cuhnsw_base.cu
├── .travis.yml
├── setup.py
├── README.md
├── cuda_setup.py
└── LICENSE
/examples/.gitignore:
--------------------------------------------------------------------------------
1 | *.hdf5
2 | *.index
--------------------------------------------------------------------------------
/cuhnsw/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/*
2 | version.py
3 | config_pb2.py
--------------------------------------------------------------------------------
/examples/requirements.txt:
--------------------------------------------------------------------------------
1 | h5py
2 | fire
3 | hnswlib
4 | pandas
5 | tabulate
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools>=1.3.2
2 | jsmin
3 | numpy
4 | pybind11
5 | protobuf==3.10.0
6 | grpcio-tools==1.27.1
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=1.3.2",
4 |     "numpy",
5 |     "pybind11"
6 | ]
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "3rd/json11"]
2 | 	path = 3rd/json11
3 | 	url = https://github.com/dropbox/json11
4 | [submodule "3rd/spdlog"]
5 | 	path = 3rd/spdlog
6 | 	url = https://github.com/gabime/spdlog
--------------------------------------------------------------------------------
/cuhnsw/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 Jisang Yoon
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the Apache 2.0 license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from cuhnsw.pyhnsw import CuHNSW
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include cuda_setup.py
2 | include requirements.txt
3 | include pyproject.toml
4 | recursive-include cpp/src/ *.cu
5 | recursive-include cpp/src/ *.cc
6 | recursive-include cpp/include/ *.cuh
7 | recursive-include cpp/include/ *.hpp
8 | recursive-include 3rd/json11/ *
9 | recursive-include 3rd/spdlog/ *
10 | recursive-include 3rd/pybind11/ *
--------------------------------------------------------------------------------
/cpp/include/stop_watch.hpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 Jisang Yoon
2 | // All rights reserved.
3 | //
4 | // This source code is licensed under the Apache 2.0 license found in the
5 | // LICENSE file in the root directory of this source tree.
6 | #pragma once
7 | 
8 | #include <ctime>
9 | #include <utility>
10 | 
11 | class StopWatch {
12 |  public:
13 |   StopWatch() {
14 |     clock_gettime(CLOCK_MONOTONIC, &beg_);
15 |   }
16 |   ~StopWatch() {}
17 |   inline double CheckPoint() {
18 |     clock_gettime(CLOCK_MONOTONIC, &end_);
19 |     double ret = (end_.tv_sec - beg_.tv_sec) + (end_.tv_nsec - beg_.tv_nsec) / 1e9;
20 |     std::swap(beg_, end_);
21 |     return ret;
22 |   }
23 |  private:
24 |   timespec beg_, end_;
25 | };
26 | 
--------------------------------------------------------------------------------
/cuhnsw/proto/config.proto:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 Jisang Yoon
2 | // All rights reserved.
3 | //
4 | // This source code is licensed under the Apache 2.0 license found in the
5 | // LICENSE file in the root directory of this source tree.
6 | 
7 | syntax = "proto2";
8 | 
9 | message ConfigProto {
10 |   optional int32 seed = 1 [default = 777];
11 |   optional int32 c_log_level = 3 [default = 2];
12 |   optional int32 py_log_level = 4 [default = 2];
13 |   optional int32 max_m = 5 [default = 12];
14 |   optional int32 max_m0 = 6 [default = 24];
15 |   optional int32 ef_construction = 7 [default = 150];
16 |   // optional int32 ef_search = 8 [default = 50];
17 |   optional double level_mult = 9;
18 |   optional bool save_remains = 10;
19 |   optional double hyper_threads = 11 [default = 10];
20 |   optional int32 block_dim = 12 [default = 32];
21 |   optional string dist_type = 13 [default = "dot"];
22 |   optional int32 visited_table_size = 17;
23 |   optional int32 visited_list_size = 14 [default = 8192];
24 |   optional bool nrz = 15;
25 |   optional bool reverse_cand = 16;
26 |   optional double heuristic_coef = 18 [default = 0.25];
27 | }
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | ### How to run example code
2 | 
3 | 0. install requirements
4 | 
5 | ```shell
6 | # install python requirements
7 | pip install -r requirements.txt
8 | 
9 | # install wget to download the data (on ubuntu)
10 | apt install wget
11 | ```
12 | 
13 | 1. if you are not familiar with python-fire (https://github.com/google/python-fire), it is worth checking it out first, since the example script exposes its commands through it.
14 | 
15 | 2. download data
16 | 
17 | ```shell
18 | python example1.py download
19 | ```
20 | 
21 | 3. run gpu training
22 | 
23 | 
24 | ```shell
25 | python example1.py run_gpu_training
26 | ```
27 | 
28 | 4. check the saved index (filename: `cuhnsw.index`)
29 | 
30 | 
31 | 
32 | 5. search the nearest neighbors by loading the index file in cuhnsw (GPU)
33 | 
34 | ```shell
35 | python example1.py run_gpu_inference --index-file=cuhnsw.index --topk=10
36 | ```
37 | 
38 | 6. you can also search the nearest neighbors with hnswlib (CPU)
39 | 
40 | ```shell
41 | python example1.py run_cpu_inference --index-file=cuhnsw.index --topk=10
42 | ```
43 | 
44 | 7. reproduce the experimental results shown in the README.md in the root directory
45 | 
46 | ```shell
47 | python example1.py run_experiments
48 | ```
--------------------------------------------------------------------------------
/cpp/src/log.cc:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 Jisang Yoon
2 | // All rights reserved.
3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 6 | 7 | // reference: https://github.com/kakao/buffalo/blob/5f571c2c7d8227e6625c6e538da929e4db11b66d/lib/misc/log.cc 8 | #include "log.hpp" 9 | 10 | int CuHNSWLogger::global_logging_level_ = 2; 11 | 12 | CuHNSWLogger::CuHNSWLogger() { 13 | spdlog::set_pattern("[%^%-8l%$] %Y-%m-%d %H:%M:%S %v"); 14 | logger_ = spdlog::default_logger(); 15 | } 16 | 17 | std::shared_ptr& CuHNSWLogger::get_logger() { 18 | return logger_; 19 | } 20 | 21 | void CuHNSWLogger::set_log_level(int level) { 22 | global_logging_level_ = level; 23 | switch (level) { 24 | case 0: spdlog::set_level(spdlog::level::off); break; 25 | case 1: spdlog::set_level(spdlog::level::warn); break; 26 | case 2: spdlog::set_level(spdlog::level::info); break; 27 | case 3: spdlog::set_level(spdlog::level::debug); break; 28 | default: spdlog::set_level(spdlog::level::trace); break; 29 | } 30 | } 31 | 32 | int CuHNSWLogger::get_log_level() { 33 | return global_logging_level_; 34 | } 35 | -------------------------------------------------------------------------------- /cpp/include/cuda_base_kernels.cuh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 6 | #pragma once 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "types.hpp" 25 | 26 | namespace cuhnsw { 27 | 28 | // Error Checking utilities, checks status codes from cuda calls 29 | // and throws exceptions on failure (which cython can proxy back to python) 30 | #define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); } 31 | inline void checkCuda(cudaError_t code, const char *file, int line) { 32 | if (code != cudaSuccess) { 33 | std::stringstream err; 34 | err << "Cuda Error: " << cudaGetErrorString(code) << " (" << file << ":" << line << ")"; 35 | throw std::runtime_error(err.str()); 36 | } 37 | } 38 | 39 | } // namespace cuhnsw 40 | -------------------------------------------------------------------------------- /cpp/include/cuda_heap_kernels.cuh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 
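//
// Note on the data structure: the helpers below maintain a binary max-heap
// over a plain Neighbor array, i.e. pq[0] always holds the largest distance
// and the children of pq[i] live at indexes 2*i+1 and 2*i+2. Only thread 0
// of each block mutates the heap. An illustrative trace (an assumption for
// documentation purposes, not part of the original sources):
//
//   int size = 0;
//   PqPush(pq, &size, 3.0f, /*nodeid=*/7, false);  // pq[0].distance == 3.0
//   PqPush(pq, &size, 5.0f, /*nodeid=*/2, false);  // pq[0].distance == 5.0
//   PqPop(pq, &size);                              // evicts nodeid 2, the worst
//
// Keeping the worst candidate at the root lets callers cap the candidate set
// at a fixed size by comparing each new distance against pq[0] (see
// PushNodeToPq in cuda_utils_kernels.cuh).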
6 | #pragma once 7 | #include "cuda_base_kernels.cuh" 8 | 9 | namespace cuhnsw { 10 | 11 | // pop and push for heap 12 | // reference: https://github.com/NVlabs/nvbio/blob/master/nvbio/basic/priority_queue_inline.h 13 | __inline__ __device__ 14 | void PqPop(Neighbor* pq, int* size) { 15 | if (threadIdx.x != 0) return; 16 | if (*size == 0) return; 17 | (*size)--; 18 | if (*size == 0) return; 19 | cuda_scalar tail_dist = pq[*size].distance; 20 | int p = 0, r = 1; 21 | while (r < *size) { 22 | if (r < (*size) - 1 and gt(pq[r + 1].distance, pq[r].distance)) 23 | r++; 24 | if (ge(tail_dist, pq[r].distance)) 25 | break; 26 | pq[p] = pq[r]; 27 | p = r; 28 | r = 2 * p + 1; 29 | } 30 | pq[p] = pq[*size]; 31 | } 32 | 33 | __inline__ __device__ 34 | void PqPush(Neighbor* pq, int* size, 35 | float dist, int nodeid, bool check) { 36 | if (threadIdx.x != 0) return; 37 | int idx = *size; 38 | while (idx > 0) { 39 | int nidx = (idx + 1) / 2 - 1; 40 | if (ge(pq[nidx].distance, dist)) 41 | break; 42 | pq[idx] = pq[nidx]; 43 | idx = nidx; 44 | } 45 | pq[idx].distance = dist; 46 | pq[idx].nodeid = nodeid; 47 | pq[idx].checked = check; 48 | (*size)++; 49 | } 50 | 51 | } // namespace cuhnsw 52 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/jeremad/cuda-travis/blob/master/.travis.yml 2 | language: cpp 3 | 4 | sudo: enabled 5 | 6 | compiler: 7 | - gcc 8 | 9 | matrix: 10 | include: 11 | - name: CUDA 10 12 | env: 13 | - CUDA=10.1.105-1 14 | - CUDA_SHORT=10.1 15 | - UBUNTU_VERSION=ubuntu1804 16 | dist: bionic 17 | 18 | before_install: 19 | - sudo apt update 20 | - sudo apt install -y software-properties-common 21 | - sudo add-apt-repository -y ppa:deadsnakes/ppa 22 | - sudo apt update 23 | - sudo apt install -y python3-pip python3.6 g++ 24 | - pip3 install -U pip 25 | - pip3 install setuptools 26 | - pip3 install -r requirements.txt 27 | - INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb 28 | - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER} 29 | - sudo dpkg -i ${INSTALLER} 30 | - wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub 31 | - sudo apt-key add 7fa2af80.pub 32 | - sudo apt update -qq 33 | - sudo apt install -y cuda-core-${CUDA_SHORT/./-} cuda-cudart-dev-${CUDA_SHORT/./-} cuda-curand-dev-${CUDA_SHORT/./-} cuda-cufft-dev-${CUDA_SHORT/./-} 34 | - sudo apt clean 35 | - export CUDA_HOME=/usr/local/cuda-${CUDA_SHORT} 36 | - export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 37 | - export PATH=${CUDA_HOME}/bin:${PATH} 38 | - python3.6 -m grpc_tools.protoc --python_out cuhnsw/ --proto_path cuhnsw/proto/ config.proto 39 | 40 | script: 41 | - sudo python3.6 setup.py install 42 | -------------------------------------------------------------------------------- /cpp/include/types.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 6 | #pragma once 7 | #include 8 | 9 | // experimental codes to use half precision 10 | // not properly working yet.. 
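// The scalar macros below exist so the same kernel source compiles for either
// precision. For example, a squared-L2 accumulation is written once as
// (a sketch mirroring squaresum() in cuda_dist_kernels.cuh):
//
//   cuda_scalar diff = sub(a[i], b[i]);
//   acc = add(acc, mul(diff, diff));
//
// and expands to plain float arithmetic by default, or to the __hsub/__hadd/
// __hmul half intrinsics when HALF_PRECISION is defined (half arithmetic
// requires compute capability 5.3+, hence the commented-out __CUDA_ARCH__
// guard below).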
11 | // #define HALF_PRECISION 1 12 | 13 | // #if __CUDA_ARCH__ < 530 14 | // #undef HALF_PRECISION 15 | // #endif 16 | 17 | #ifdef HALF_PRECISION 18 | typedef half cuda_scalar; 19 | #define mul(x, y) ( __hmul(x, y) ) 20 | #define add(x, y) ( __hadd(x, y) ) 21 | #define sub(x, y) ( __hsub(x, y) ) 22 | #define gt(x, y) ( __hgt(x, y) ) // x > y 23 | #define ge(x, y) ( __hge(x, y) ) // x >= y 24 | #define lt(x, y) ( __hlt(x, y) ) // x < y 25 | #define le(x, y) ( __hle(x, y) ) // x <= y 26 | #define out_scalar(x) ( __half2float(x) ) 27 | #define conversion(x) ( __float2half(x) ) 28 | #else 29 | typedef float cuda_scalar; 30 | #define mul(x, y) ( x * y ) 31 | #define add(x, y) ( x + y ) 32 | #define sub(x, y) ( x - y ) 33 | #define gt(x, y) ( x > y ) 34 | #define ge(x, y) ( x >= y ) 35 | #define lt(x, y) ( x < y ) 36 | #define le(x, y) ( x <= y ) 37 | #define out_scalar(x) ( x ) 38 | #define conversion(x) ( x ) 39 | #endif 40 | 41 | #define WARP_SIZE 32 42 | 43 | struct Neighbor { 44 | cuda_scalar distance; 45 | int nodeid; 46 | bool checked; 47 | }; 48 | 49 | // to manage the compatibility with hnswlib 50 | typedef unsigned int tableint; 51 | typedef unsigned int sizeint; 52 | typedef float scalar; 53 | typedef size_t labeltype; 54 | 55 | enum DIST_TYPE { 56 | DOT, 57 | L2, 58 | }; 59 | -------------------------------------------------------------------------------- /cpp/include/log.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 6 | 7 | // reference: https://github.com/kakao/buffalo/blob/5f571c2c7d8227e6625c6e538da929e4db11b66d/lib/misc/log.cc 8 | #pragma once 9 | #include 10 | 11 | #define SPDLOG_EOL "" 12 | #define SPDLOG_TRACE_ON 13 | #include "spdlog/spdlog.h" 14 | #include "spdlog/sinks/stdout_color_sinks.h" 15 | 16 | #define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) 17 | 18 | #define INFO(x, ...) logger_->info("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__); 19 | #define DEBUG(x, ...) logger_->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__); 20 | #define WARN(x, ...) logger_->warn("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__); 21 | #define TRACE(x, ...) logger_->trace("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__); 22 | #define CRITICAL(x, ...) logger_->critical("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__); 23 | 24 | #define INFO0(x) logger_->info("[{}:{}] " x "\n", __FILENAME__, __LINE__); 25 | #define DEBUG0(x) logger_->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__); 26 | #define WARN0(x) logger_->warn("[{}:{}] " x "\n", __FILENAME__, __LINE__); 27 | #define TRACE0(x) logger_->trace("[{}:{}] " x "\n", __FILENAME__, __LINE__); 28 | #define CRITICAL0(x) logger_->critical("[{}:{}] " x "\n", __FILENAME__, __LINE__); 29 | 30 | class CuHNSWLogger { 31 | public: 32 | CuHNSWLogger(); 33 | std::shared_ptr& get_logger(); 34 | void set_log_level(int level); 35 | int get_log_level(); 36 | 37 | private: 38 | static int global_logging_level_; 39 | std::shared_ptr logger_; 40 | }; // class CuHNSWLogger 41 | -------------------------------------------------------------------------------- /cpp/include/level_graph.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Jisang Yoon 2 | // All rights reserved. 
3 | //
4 | // This source code is licensed under the Apache 2.0 license found in the
5 | // LICENSE file in the root directory of this source tree.
6 | #pragma once
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | 
19 | #include "log.hpp"
20 | 
21 | namespace cuhnsw {
22 | 
23 | class LevelGraph {
24 |  public:
25 |   LevelGraph() {
26 |     logger_ = CuHNSWLogger().get_logger();
27 |   }
28 | 
29 |   ~LevelGraph() {}
30 | 
31 |   void SetNodes(std::vector<int>& nodes, int num_data, int ef_construction) {
32 |     nodes_ = nodes;
33 |     num_nodes_ = nodes_.size();
34 |     neighbors_.clear();
35 |     neighbors_.resize(num_nodes_);
36 |     nodes_idmap_.resize(num_data);
37 |     std::fill(nodes_idmap_.begin(), nodes_idmap_.end(), -1);
38 |     for (int i = 0; i < num_nodes_; ++i)
39 |       nodes_idmap_[nodes[i]] = i;
40 |   }
41 | 
42 |   const std::vector<std::pair<float, int>>& GetNeighbors(int node) const {
43 |     int nodeid = GetNodeId(node);
44 |     return neighbors_[nodeid];
45 |   }
46 | 
47 |   const std::vector<int>& GetNodes() const {
48 |     return nodes_;
49 |   }
50 | 
51 |   void ClearEdges(int node) {
52 |     neighbors_[GetNodeId(node)].clear();
53 |   }
54 | 
55 |   void AddEdge(int src, int dst, float dist) {
56 |     if (src == dst) return;
57 |     int srcid = GetNodeId(src);
58 |     neighbors_[srcid].emplace_back(dist, dst);
59 |   }
60 | 
61 |   inline int GetNodeId(int node) const {
62 |     int nodeid = nodes_idmap_.at(node);
63 |     if (not(nodeid >= 0 and nodeid < num_nodes_)) {
64 |       throw std::runtime_error(
65 |         fmt::format("[{}:{}] invalid nodeid: {}, node: {}, num_nodes: {}",
66 |                     __FILE__, __LINE__, nodeid, node, num_nodes_));
67 |     }
68 |     return nodeid;
69 |   }
70 | 
71 |   void ShowGraph() {
72 |     for (int i = 0; i < num_nodes_; ++i) {
73 |       std::cout << std::string(50, '=') << std::endl;
74 |       printf("nodeid %d: %d\n", i, nodes_[i]);
75 |       for (auto& nb: GetNeighbors(nodes_[i])) {
76 |         printf("neighbor id: %d, dist: %f\n",
77 |                nb.second, nb.first);
78 |       }
79 |       std::cout << std::string(50, '=') << std::endl;
80 |     }
81 |   }
82 | 
83 |  private:
84 |   std::shared_ptr<spdlog::logger> logger_;
85 |   std::vector<int> nodes_;
86 |   std::vector<std::vector<std::pair<float, int>>> neighbors_;
87 |   int num_nodes_ = 0;
88 |   std::vector<int> nodes_idmap_;
89 | }; // class LevelGraph
90 | 
91 | } // namespace cuhnsw
92 | 
--------------------------------------------------------------------------------
/cuhnsw/bindings.cc:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 Jisang Yoon
2 | // All rights reserved.
3 | //
4 | // This source code is licensed under the Apache 2.0 license found in the
5 | // LICENSE file in the root directory of this source tree.
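//
// Note: this binding uses the PYBIND11_PLUGIN entry point (see the bottom of
// this file). On pybind11 >= 2.2 the preferred, equivalent form is
// PYBIND11_MODULE, roughly (a sketch, not part of the original sources):
//
//   PYBIND11_MODULE(cuhnsw_bind, m) {
//     py::class_<CuHNSWBind>(m, "CuHNSWBind")
//       .def(py::init<>());
//     // ... same .def chain as below, without the explicit `return m.ptr();`
//   }
//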
6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include "cuhnsw.hpp" 12 | 13 | namespace py = pybind11; 14 | 15 | typedef py::array_t float_array; 16 | typedef py::array_t int_array; 17 | 18 | class CuHNSWBind { 19 | public: 20 | CuHNSWBind() {} 21 | 22 | bool Init(std::string opt_path) { 23 | return obj_.Init(opt_path); 24 | } 25 | 26 | void SetData(py::object& input) { 27 | float_array array(input); 28 | auto buffer = array.request(); 29 | if (buffer.ndim != 2) throw std::runtime_error("data must be 2d array"); 30 | int num_data = buffer.shape[0]; 31 | int num_dims = buffer.shape[1]; 32 | obj_.SetData(array.data(0), num_data, num_dims); 33 | } 34 | 35 | void BuildGraph() { 36 | obj_.BuildGraph(); 37 | } 38 | 39 | void SetRandomLevels(py::object& input) { 40 | int_array array(input); 41 | auto buffer = array.request(); 42 | if (buffer.ndim != 1) throw std::runtime_error("levels must be 1d array"); 43 | obj_.SetRandomLevels(array.data(0)); 44 | } 45 | 46 | void SaveIndex(std::string fpath) { 47 | obj_.SaveIndex(fpath); 48 | } 49 | 50 | void LoadIndex(std::string fpath) { 51 | obj_.LoadIndex(fpath); 52 | } 53 | 54 | void SearchGraph(py::object& qdata, int topk, int ef_search, 55 | py::object& nns, py::object& distances, py::object& found_cnt) { 56 | float_array _qdata(qdata); 57 | int_array _nns(nns); 58 | float_array _distances(distances); 59 | int_array _found_cnt(found_cnt); 60 | auto buffer = _qdata.request(); 61 | 62 | if (buffer.ndim != 1 and buffer.ndim != 2) 63 | throw std::runtime_error("data array must be 1d / 2d shape"); 64 | 65 | int num_queries = buffer.ndim == 1? 1: buffer.shape[0]; 66 | obj_.SearchGraph(_qdata.data(0), num_queries, topk, ef_search, 67 | _nns.mutable_data(0), _distances.mutable_data(0), _found_cnt.mutable_data(0)); 68 | } 69 | 70 | private: 71 | cuhnsw::CuHNSW obj_; 72 | }; 73 | 74 | PYBIND11_PLUGIN(cuhnsw_bind) { 75 | py::module m("CuHNSWBind"); 76 | 77 | py::class_(m, "CuHNSWBind") 78 | .def(py::init()) 79 | .def("init", &CuHNSWBind::Init, py::arg("opt_path")) 80 | .def("set_data", &CuHNSWBind::SetData, py::arg("data")) 81 | .def("build_graph", &CuHNSWBind::BuildGraph) 82 | .def("set_random_levels", &CuHNSWBind::SetRandomLevels, py::arg("levels")) 83 | .def("save_index", &CuHNSWBind::SaveIndex, py::arg("fpath")) 84 | .def("load_index", &CuHNSWBind::LoadIndex, py::arg("fpath")) 85 | .def("search_knn", &CuHNSWBind::SearchGraph, 86 | py::arg("qdata"), py::arg("topk"), py::arg("ef_search"), 87 | py::arg("nns"), py::arg("distances"), py::arg("found")) 88 | .def("__repr__", 89 | [](const CuHNSWBind &a) { 90 | return ""; 91 | } 92 | ); 93 | return m.ptr(); 94 | } 95 | -------------------------------------------------------------------------------- /cuhnsw/pyhnsw.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Jisang Yoon 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 
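# Example usage (a minimal sketch; see examples/example1.py for a full run,
# and note that the data here is random, purely for illustration):
#
#   import numpy as np
#   from cuhnsw import CuHNSW
#
#   ch = CuHNSW({"dist_type": "l2"})
#   ch.set_data(np.random.rand(1000, 32).astype(np.float32))
#   ch.build()
#   nns, distances, found_cnt = ch.search_knn(ch.data, topk=5, ef_search=50)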
6 | 7 | # pylint: disable=no-name-in-module,too-few-public-methods,no-member 8 | import os 9 | import json 10 | import tempfile 11 | 12 | import numpy as np 13 | 14 | from cuhnsw import aux 15 | from cuhnsw.cuhnsw_bind import CuHNSWBind 16 | 17 | EPS = 1e-10 18 | WARP_SIZE = 32 19 | DIST_ALIAS = {"ip": "dot", "euclidean": "l2", "cosine": "dot"} 20 | 21 | 22 | class CuHNSW: 23 | def __init__(self, opt=None): 24 | self.opt = aux.get_opt_as_proto(opt or {}) 25 | 26 | self.opt.level_mult = self.opt.level_mult or 1 / np.log(self.opt.max_m) 27 | 28 | # handle aliases of dist_type 29 | assert self.opt.dist_type in ["l2", "euclidean", "dot", "ip", "cosine"], \ 30 | self.opt.dist_type 31 | self.opt.dist_type = DIST_ALIAS.get(self.opt.dist_type, self.opt.dist_type) 32 | if self.opt.dist_type == "cosine": 33 | self.opt.nrz = True 34 | 35 | self.logger = aux.get_logger("cuhnsw", self.opt.py_log_level) 36 | tmp = tempfile.NamedTemporaryFile(mode='w', delete=False) 37 | opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2) 38 | tmp.write(opt_content) 39 | tmp.close() 40 | self.logger.info("opt: %s", opt_content) 41 | self.data = None 42 | self.obj = CuHNSWBind() 43 | assert self.opt.block_dim <= WARP_SIZE ** 2 and \ 44 | self.opt.block_dim % WARP_SIZE == 0, \ 45 | f"invalid block dim ({self.opt.block_dim}, warp size: {WARP_SIZE})" 46 | assert self.obj.init(bytes(tmp.name, "utf8")), \ 47 | f"failed to load {tmp.name}" 48 | os.remove(tmp.name) 49 | 50 | def set_data(self, data): 51 | self.data = data.copy() 52 | if self.opt.nrz and self.opt.dist_type == "l2": 53 | self.logger.warning( \ 54 | "it is not common to set nrz = True and dist_type = l2") 55 | if self.opt.nrz: 56 | self.data /= np.linalg.norm(self.data, axis=1)[:, None] 57 | num_data, num_dims = self.data.shape 58 | self.logger.info("data shape: %d x %d", num_data, num_dims) 59 | self.obj.set_data(self.data) 60 | 61 | def build(self): 62 | self.set_random_levels() 63 | self.obj.build_graph() 64 | 65 | def set_random_levels(self): 66 | np.random.seed(self.opt.seed) 67 | num_data = self.data.shape[0] 68 | levels = np.random.uniform(size=num_data) 69 | levels = np.maximum(levels, EPS) 70 | levels = (-np.log(levels) * self.opt.level_mult).astype(np.int32) 71 | self.obj.set_random_levels(levels) 72 | 73 | def save_index(self, fpath): 74 | self.obj.save_index(fpath.encode("utf-8")) 75 | 76 | def load_index(self, fpath): 77 | self.obj.load_index(fpath.encode("utf-8")) 78 | 79 | def search_knn(self, qdata, topk, ef_search): 80 | ef_search = max(topk, ef_search) 81 | qdata = qdata.astype(np.float32) 82 | num_queries = qdata.shape[0] 83 | nns = np.empty(shape=(num_queries, topk), dtype=np.int32) 84 | distances = np.empty(shape=(num_queries, topk), dtype=np.float32) 85 | found_cnt = np.empty(shape=(num_queries,), dtype=np.int32) 86 | self.obj.search_knn(qdata, topk, ef_search, 87 | nns, distances, found_cnt) 88 | return nns, distances, found_cnt 89 | -------------------------------------------------------------------------------- /cpp/include/cuhnsw.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 
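//
// Compatibility note: SaveIndex()/LoadIndex() follow the hnswlib binary
// layout, which is why index files are interchangeable with hnswlib. The
// writeBinaryPOD/readBinaryPOD helpers below serialize plain-old-data fields
// as raw bytes; an illustrative fragment (an assumption for documentation,
// not copied from the implementation):
//
//   std::ofstream out(fpath, std::ios::binary);
//   writeBinaryPOD(out, max_m0_);  // one fixed-width field, byte-for-byte
//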
6 | #pragma once 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include // NOLINT 28 | 29 | #include "json11.hpp" 30 | #include "log.hpp" 31 | #include "level_graph.hpp" 32 | // #include "stop_watch.hpp" 33 | #include "types.hpp" 34 | 35 | namespace cuhnsw { 36 | 37 | // for the compatibility with hnswlib 38 | // following two functions refer to 39 | // https://github.com/nmslib/hnswlib/blob/ 40 | // 2571bdb6ef3f91d6f4c2e59178fde49055d2f980/hnswlib/hnswlib.h 41 | template 42 | static void writeBinaryPOD(std::ostream &out, const T &podRef) { 43 | out.write(reinterpret_cast(&podRef), sizeof(T)); 44 | } 45 | template 46 | static void readBinaryPOD(std::istream &in, T &podRef) { 47 | in.read(reinterpret_cast(&podRef), sizeof(T)); 48 | } 49 | 50 | class CuHNSW { 51 | public: 52 | // enum ProfileColumns { 53 | // GPU, 54 | // PROFILE_SIZE, 55 | // }; 56 | 57 | // std::vector PROFILE_KEYS = { 58 | // "gpu", 59 | // }; 60 | 61 | CuHNSW(); 62 | ~CuHNSW(); 63 | 64 | bool Init(std::string opt_path); 65 | void SetData(const float* data, int num_data, int num_dims); 66 | void SetRandomLevels(const int* levels); 67 | void BuildGraph(); 68 | void SaveIndex(std::string fpath); 69 | void LoadIndex(std::string fpath); 70 | void SearchGraph(const float* qdata, const int num_queries, const int topk, const int ef_search, 71 | int* nns, float* distances, int* found_cnt); 72 | 73 | private: 74 | void GetDeviceInfo(); 75 | void GetEntryPoints( 76 | const std::vector& nodes, 77 | std::vector& entries, 78 | int level, bool search); 79 | void SearchAtLayer( 80 | const std::vector& queries, 81 | std::vector>>& entries, 82 | int level, int max_m); 83 | void SearchHeuristicAtLayer( 84 | const std::vector& queries, 85 | int level, int max_m, bool postprocess); 86 | void BuildLevelGraph(int level); 87 | std::vector level_graphs_; 88 | std::vector levels_; 89 | 90 | json11::Json opt_; 91 | std::shared_ptr logger_; 92 | 93 | int num_data_, num_dims_, batch_size_; 94 | thrust::device_vector device_data_, device_qdata_; 95 | const float* data_; 96 | std::vector labels_; 97 | bool labelled_ = false; 98 | bool reverse_cand_ = false; 99 | 100 | int major_, minor_, cores_, devId_, mp_cnt_; 101 | int block_cnt_, block_dim_; 102 | int visited_table_size_, visited_list_size_; 103 | int max_level_, max_m_, max_m0_; 104 | int enter_point_, ef_construction_; 105 | float level_mult_; 106 | int dist_type_; 107 | bool save_remains_; 108 | double heuristic_coef_; 109 | // std::vector sw_; 110 | // std::vector el_; 111 | 112 | bool* visited_; 113 | }; // class CuHNSW 114 | 115 | } // namespace cuhnsw 116 | -------------------------------------------------------------------------------- /cuhnsw/aux.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Jisang Yoon 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 
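# For example, load_json_string() below accepts a config snippet such as
# (illustrative):
#
#   {
#     "max_m": 12,        // inline comments are stripped by jsmin
#     "dist_type": "l2",  // the regexes then drop this trailing comma
#   }
#
# which plain json.loads() would reject.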
6 | import re 7 | import json 8 | import logging 9 | import logging.handlers 10 | 11 | import jsmin 12 | from google.protobuf.json_format import Parse, MessageToDict 13 | from cuhnsw.config_pb2 import ConfigProto 14 | 15 | # get_logger and Option refer to 16 | # https://github.com/kakao/buffalo/blob/ 17 | # 5f571c2c7d8227e6625c6e538da929e4db11b66d/buffalo/misc/aux.py 18 | def get_logger(name=__file__, level=2): 19 | if level == 1: 20 | level = logging.WARNING 21 | elif level == 2: 22 | level = logging.INFO 23 | elif level == 3: 24 | level = logging.DEBUG 25 | logger = logging.getLogger(name) 26 | if logger.handlers: 27 | return logger 28 | logger.setLevel(level) 29 | sh0 = logging.StreamHandler() 30 | sh0.setLevel(level) 31 | formatter = logging.Formatter('[%(levelname)-8s] %(asctime)s ' 32 | '[%(filename)s] [%(funcName)s:%(lineno)d]' 33 | '%(message)s', '%Y-%m-%d %H:%M:%S') 34 | sh0.setFormatter(formatter) 35 | logger.addHandler(sh0) 36 | return logger 37 | 38 | # This function helps you to read non-standard json strings. 39 | # - Handles json string with c++ style inline comments 40 | # - Handles json string with trailing commas. 41 | def load_json_string(cont): 42 | # (1) Removes comment. 43 | # Refer to https://plus.google.com/+DouglasCrockfordEsq/posts/RK8qyGVaGSr 44 | cont = jsmin.jsmin(cont) 45 | 46 | # (2) Removes trailing comma. 47 | cont = re.sub(",[ \t\r\n]*}", "}", cont) 48 | cont = re.sub(",[ \t\r\n]*" + r"\]", "]", cont) 49 | 50 | return json.loads(cont) 51 | 52 | 53 | # function read json file from filename 54 | def load_json_file(fname): 55 | with open(fname, "r") as fin: 56 | ret = load_json_string(fin.read()) 57 | return ret 58 | 59 | # use protobuf to restrict field and types 60 | def get_opt_as_proto(raw, proto_type=ConfigProto): 61 | proto = proto_type() 62 | # convert raw to proto 63 | Parse(json.dumps(Option(raw)), proto) 64 | err = [] 65 | assert proto.IsInitialized(err), \ 66 | f"some required fields are missing in proto {err}\n {proto}" 67 | return proto 68 | 69 | def proto_to_dict(proto): 70 | return MessageToDict(proto, \ 71 | including_default_value_fields=True, \ 72 | preserving_proto_field_name=True) 73 | 74 | def copy_proto(proto): 75 | newproto = type(proto)() 76 | Parse(json.dumps(proto_to_dict(proto)), newproto) 77 | return newproto 78 | 79 | class Option(dict): 80 | def __init__(self, *args, **kwargs): 81 | args = [arg if isinstance(arg, dict) 82 | else load_json_file(arg) for arg in args] 83 | super().__init__(*args, **kwargs) 84 | for arg in args: 85 | if isinstance(arg, dict): 86 | for k, val in arg.items(): 87 | if isinstance(val, dict): 88 | self[k] = Option(val) 89 | else: 90 | self[k] = val 91 | if kwargs: 92 | for k, val in kwargs.items(): 93 | if isinstance(val, dict): 94 | self[k] = Option(val) 95 | else: 96 | self[k] = val 97 | 98 | def __getattr__(self, attr): 99 | return self.get(attr) 100 | 101 | def __setattr__(self, key, value): 102 | self.__setitem__(key, value) 103 | 104 | def __setitem__(self, key, value): 105 | super().__setitem__(key, value) 106 | self.__dict__.update({key: value}) 107 | 108 | def __delattr__(self, item): 109 | self.__delitem__(item) 110 | 111 | def __delitem__(self, key): 112 | super().__delitem__(key) 113 | del self.__dict__[key] 114 | 115 | def __getstate__(self): 116 | return vars(self) 117 | 118 | def __setstate__(self, state): 119 | vars(self).update(state) 120 | -------------------------------------------------------------------------------- /cpp/include/cuda_dist_kernels.cuh: 
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 Jisang Yoon
2 | // All rights reserved.
3 | //
4 | // This source code is licensed under the Apache 2.0 license found in the
5 | // LICENSE file in the root directory of this source tree.
6 | #pragma once
7 | #include "cuda_base_kernels.cuh"
8 | 
9 | 
10 | namespace cuhnsw {
11 | 
12 | // https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
13 | __inline__ __device__
14 | cuda_scalar warp_reduce_sum(cuda_scalar val) {
15 | #if __CUDACC_VER_MAJOR__ >= 9
16 |   // __shfl_down is deprecated with cuda 9+. use newer variants
17 |   unsigned int active = __activemask();
18 |   #pragma unroll
19 |   for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
20 |     val = add(val, __shfl_down_sync(active, val, offset));
21 |   }
22 | #else
23 |   #pragma unroll
24 |   for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
25 |     val = add(val, __shfl_down(val, offset));
26 |   }
27 | #endif
28 |   return val;
29 | }
30 | 
31 | __inline__ __device__
32 | cuda_scalar dot(const cuda_scalar * a, const cuda_scalar * b, const int num_dims) {
33 |   __syncthreads();
34 |   static __shared__ cuda_scalar shared[32];
35 | 
36 |   // figure out the warp / position inside the warp
37 |   int warp = threadIdx.x / WARP_SIZE;
38 |   int lane = threadIdx.x % WARP_SIZE;
39 | 
40 |   // partially reduce the dot product inside each warp using a shuffle
41 |   cuda_scalar val = 0;
42 |   for (int i = threadIdx.x; i < num_dims; i += blockDim.x)
43 |     val = add(val, mul(a[i], b[i]));
44 |   val = warp_reduce_sum(val);
45 | 
46 |   // write out the partial reduction to shared memory if appropriate
47 |   if (lane == 0) {
48 |     shared[warp] = val;
49 |   }
50 |   __syncthreads();
51 | 
52 |   // if we don't have multiple warps, we're done
53 |   if (blockDim.x <= WARP_SIZE) {
54 |     return shared[0];
55 |   }
56 | 
57 |   // otherwise reduce again in the first warp
58 |   val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: conversion(0.0f);
59 |   if (warp == 0) {
60 |     val = warp_reduce_sum(val);
61 |     // broadcast back to shared memory
62 |     if (threadIdx.x == 0) {
63 |       shared[0] = val;
64 |     }
65 |   }
66 |   __syncthreads();
67 |   return shared[0];
68 | }
69 | 
70 | __inline__ __device__
71 | cuda_scalar squaresum(const cuda_scalar * a, const cuda_scalar * b, const int num_dims) {
72 |   __syncthreads();
73 |   static __shared__ cuda_scalar shared[32];
74 | 
75 |   // figure out the warp / position inside the warp
76 |   int warp = threadIdx.x / WARP_SIZE;
77 |   int lane = threadIdx.x % WARP_SIZE;
78 | 
79 |   // partially reduce the squared difference inside each warp using a shuffle
80 |   cuda_scalar val = 0;
81 |   for (int i = threadIdx.x; i < num_dims; i += blockDim.x) {
82 |     cuda_scalar _val = sub(a[i], b[i]);
83 |     val = add(val, mul(_val, _val));
84 |   }
85 |   __syncthreads();
86 |   val = warp_reduce_sum(val);
87 | 
88 |   // write out the partial reduction to shared memory if appropriate
89 |   if (lane == 0) {
90 |     shared[warp] = val;
91 |   }
92 |   __syncthreads();
93 | 
94 |   // if we don't have multiple warps, we're done
95 |   if (blockDim.x <= WARP_SIZE) {
96 |     return shared[0];
97 |   }
98 | 
99 |   // otherwise reduce again in the first warp
100 |   val = (threadIdx.x < blockDim.x / WARP_SIZE) ?
shared[lane]: conversion(0.0f); 101 | if (warp == 0) { 102 | val = warp_reduce_sum(val); 103 | // broadcast back to shared memory 104 | if (threadIdx.x == 0) { 105 | shared[0] = val; 106 | } 107 | } 108 | __syncthreads(); 109 | return shared[0]; 110 | } 111 | 112 | __inline__ __device__ 113 | cuda_scalar GetDistanceByVec(const cuda_scalar* src_vec, const cuda_scalar* dst_vec, const int num_dims, const int dist_type) { 114 | cuda_scalar dist = 0; 115 | switch (dist_type) { 116 | case DOT: 117 | dist = -dot(src_vec, dst_vec, num_dims); break; 118 | case L2: 119 | dist = squaresum(src_vec, dst_vec, num_dims); break; 120 | default: 121 | break; 122 | } 123 | return dist; 124 | } 125 | 126 | __inline__ __device__ 127 | cuda_scalar GetDistance(const int srcid, const int dstid, const int num_dims, 128 | const int dist_type, const int* nodes, const cuda_scalar* data) { 129 | const cuda_scalar* src_vec = data + num_dims * nodes[srcid]; 130 | const cuda_scalar* dst_vec = data + num_dims * nodes[dstid]; 131 | return GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type); 132 | } 133 | 134 | __inline__ __device__ 135 | cuda_scalar GetDistance2(const int src, const int dst, const int num_dims, 136 | const int dist_type, const cuda_scalar* data) { 137 | const cuda_scalar* src_vec = data + num_dims * src; 138 | const cuda_scalar* dst_vec = data + num_dims * dst; 139 | return GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type); 140 | } 141 | 142 | 143 | __global__ void BatchDistanceKernel( 144 | const cuda_scalar* data, const int* src, const int* dst, 145 | const int size, const int num_dims, const int dist_type, 146 | float* distances) { 147 | for (int idx = blockIdx.x; idx < size; idx += gridDim.x) { 148 | const int _src = src[idx], _dst = dst[idx]; 149 | cuda_scalar dist = GetDistance2(_src, _dst, num_dims, dist_type, data); 150 | #ifdef HALF_PRECISION 151 | if (threadIdx.x == 0) distances[idx] = __half2float(dist); 152 | #else 153 | if (threadIdx.x == 0) distances[idx] = dist; 154 | #endif 155 | } 156 | } 157 | 158 | 159 | } // namespace cuhnsw 160 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Jisang Yoon 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 
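# Note: building from source assumes the protobuf bindings have been
# generated beforehand (see .travis.yml and the root README), e.g.:
#
#   python -m grpc_tools.protoc --python_out cuhnsw/ \
#       --proto_path cuhnsw/proto/ config.proto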
6 | 7 | # pylint: disable=fixme,too-few-public-methods 8 | # reference: https://github.com/kakao/buffalo/blob/ 9 | # 5f571c2c7d8227e6625c6e538da929e4db11b66d/setup.py 10 | """cuhnsw 11 | """ 12 | import os 13 | import sys 14 | import glob 15 | import pathlib 16 | import platform 17 | import sysconfig 18 | import subprocess 19 | from setuptools import setup, Extension 20 | 21 | import pybind11 22 | import numpy as np 23 | from cuda_setup import CUDA, BUILDEXT 24 | 25 | 26 | DOCLINES = __doc__.split("\n") 27 | 28 | # TODO: Python3 Support 29 | if sys.version_info[:3] < (3, 6): 30 | raise RuntimeError("Python version 3.6 or later required.") 31 | 32 | assert platform.system() == 'Linux' # TODO: MacOS 33 | 34 | 35 | MAJOR = 0 36 | MINOR = 0 37 | MICRO = 8 38 | RELEASE = True 39 | STAGE = {True: '', False: 'b'}.get(RELEASE) 40 | VERSION = f'{MAJOR}.{MINOR}.{MICRO}{STAGE}' 41 | STATUS = {False: 'Development Status :: 4 - Beta', 42 | True: 'Development Status :: 5 - Production/Stable'} 43 | 44 | CLASSIFIERS = """{status} 45 | Programming Language :: C++ 46 | Programming Language :: Python :: 3.6 47 | Operating System :: POSIX :: Linux 48 | Operating System :: Unix 49 | Operating System :: MacOS 50 | License :: OSI Approved :: Apache Software License""".format( \ 51 | status=STATUS.get(RELEASE)) 52 | CLIB_DIR = os.path.join(sysconfig.get_path('purelib'), 'cuhnsw') 53 | LIBRARY_DIRS = [CLIB_DIR] 54 | 55 | with open("requirements.txt", "r") as fin: 56 | INSTALL_REQUIRES = [line.strip() for line in fin] 57 | 58 | def get_extend_compile_flags(): 59 | flags = ['-march=native'] 60 | return flags 61 | 62 | 63 | class CMakeExtension(Extension): 64 | extension_type = 'cmake' 65 | 66 | def __init__(self, name): 67 | super().__init__(name, sources=[]) 68 | 69 | 70 | extend_compile_flags = get_extend_compile_flags() 71 | extra_compile_args = ['-fopenmp', '-std=c++14', '-ggdb', '-O3'] + \ 72 | extend_compile_flags 73 | csrcs = glob.glob("cpp/src/*.cu") + glob.glob("cpp/src/*.cc") 74 | extensions = [ 75 | # CMakeExtension(name="cuhnsw"), 76 | Extension("cuhnsw.cuhnsw_bind", 77 | sources= csrcs + [ \ 78 | "cuhnsw/bindings.cc", 79 | "3rd/json11/json11.cpp"], 80 | language="c++", 81 | extra_compile_args=extra_compile_args, 82 | extra_link_args=["-fopenmp"], 83 | library_dirs=[CUDA['lib64']], 84 | libraries=['cudart', 'curand'], 85 | extra_objects=[], 86 | include_dirs=[ \ 87 | "cpp/include/", np.get_include(), 88 | pybind11.get_include(), pybind11.get_include(True), 89 | CUDA['include'], "3rd/json11", "3rd/spdlog/include"]) 90 | ] 91 | 92 | 93 | # Return the git revision as a string 94 | def git_version(): 95 | def _minimal_ext_cmd(cmd): 96 | # construct minimal environment 97 | env = {} 98 | for k in ['SYSTEMROOT', 'PATH']: 99 | val = os.environ.get(k) 100 | if val is not None: 101 | env[k] = val 102 | out = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env). 
\ 103 | communicate()[0] 104 | return out 105 | 106 | try: 107 | out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) 108 | git_revision = out.strip().decode('ascii') 109 | except OSError: 110 | git_revision = "Unknown" 111 | 112 | return git_revision 113 | 114 | 115 | def write_version_py(filename='cuhnsw/version.py'): 116 | cnt = """ 117 | short_version = '%(version)s' 118 | git_revision = '%(git_revision)s' 119 | """ 120 | git_revision = git_version() 121 | with open(filename, 'w') as fout: 122 | fout.write(cnt % {'version': VERSION, 123 | 'git_revision': git_revision}) 124 | 125 | 126 | class BuildExtension(BUILDEXT): 127 | def run(self): 128 | for ext in self.extensions: 129 | print(ext.name) 130 | if hasattr(ext, 'extension_type') and ext.extension_type == 'cmake': 131 | self.cmake() 132 | super().run() 133 | 134 | def cmake(self): 135 | cwd = pathlib.Path().absolute() 136 | 137 | build_temp = pathlib.Path(self.build_temp) 138 | build_temp.mkdir(parents=True, exist_ok=True) 139 | 140 | build_type = 'Debug' if self.debug else 'Release' 141 | 142 | cmake_args = [ 143 | '-DCMAKE_BUILD_TYPE=' + build_type, 144 | '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + CLIB_DIR, 145 | ] 146 | 147 | build_args = [] 148 | 149 | os.chdir(str(build_temp)) 150 | self.spawn(['cmake', str(cwd)] + cmake_args) 151 | if not self.dry_run: 152 | self.spawn(['cmake', '--build', '.'] + build_args) 153 | os.chdir(str(cwd)) 154 | 155 | 156 | def setup_package(): 157 | write_version_py() 158 | cmdclass = { 159 | 'build_ext': BuildExtension 160 | } 161 | 162 | metadata = dict( 163 | name='cuhnsw', 164 | maintainer="Jisang Yoon", 165 | maintainer_email="vjs10101v@gmail.com", 166 | author="Jisang Yoon", 167 | author_email="vjs10101v@gmail.com", 168 | description=DOCLINES[0], 169 | long_description="\n".join(DOCLINES[2:]), 170 | url="https://github.com/js1010/cuhnsw", 171 | download_url="https://github.com/js1010/cuhnsw/releases", 172 | include_package_data=False, 173 | license='Apache2', 174 | packages=['cuhnsw/'], 175 | cmdclass=cmdclass, 176 | classifiers=[_f for _f in CLASSIFIERS.split('\n') if _f], 177 | platforms=['Linux', 'Mac OSX', 'Unix'], 178 | ext_modules=extensions, 179 | install_requires=INSTALL_REQUIRES, 180 | entry_points={ 181 | 'console_scripts': [ 182 | ] 183 | }, 184 | python_requires='>=3.6', 185 | ) 186 | 187 | metadata['version'] = VERSION 188 | setup(**metadata) 189 | 190 | 191 | if __name__ == '__main__': 192 | setup_package() 193 | -------------------------------------------------------------------------------- /cpp/include/cuda_search_kernels.cuh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 
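//
// Parallelization sketch for the kernels in this header (a summary of the
// code below, not additional behavior): each thread block owns one query at
// a time (grid-stride loop over blockIdx.x), all threads of a block
// cooperate on each distance computation (GetDistanceByVec), and the
// ef_search priority queue is mutated by thread 0 only. Per-block scratch
// buffers (visited table/list, candidate arrays) are carved out of global
// arrays indexed by blockIdx.x.
//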
6 | #pragma once 7 | #include "cuda_utils_kernels.cuh" 8 | 9 | namespace cuhnsw { 10 | 11 | __global__ void GetEntryPointsKernel( 12 | const cuda_scalar* qdata, const int* qnodes, const cuda_scalar* target_data, const int* target_nodes, 13 | const int num_dims, const int num_qnodes, const int num_target_nodes, const int max_m, const int dist_type, 14 | const int* graph, const int* deg, 15 | bool* visited, int* visited_list, const int visited_list_size, int* entries, int64_t* acc_visited_cnt 16 | ) { 17 | 18 | static __shared__ int visited_cnt; 19 | bool* _visited = visited + num_target_nodes * blockIdx.x; 20 | int* _visited_list = visited_list + visited_list_size * blockIdx.x; 21 | 22 | for (int i = blockIdx.x; i < num_qnodes; i += gridDim.x) { 23 | if (threadIdx.x == 0) { 24 | visited_cnt = 0; 25 | } 26 | __syncthreads(); 27 | cuda_scalar entry_dist = 0; 28 | int entryid = entries[i]; 29 | const cuda_scalar* src_vec = qdata + num_dims * qnodes[i]; 30 | { 31 | const cuda_scalar* dst_vec = target_data + num_dims * target_nodes[entryid]; 32 | entry_dist = GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type); 33 | // if (threadIdx.x == 0 and blockIdx.x == 0) { 34 | // printf("srcid: %d, dstid: %d, dist: %f\n", 35 | // qnodes[i], target_nodes[entryid], entry_dist); 36 | // } 37 | } 38 | __syncthreads(); 39 | bool updated = true; 40 | while (updated) { 41 | // initialize entries as neighbors 42 | int beg = max_m * entryid; 43 | int end = beg + deg[entryid]; 44 | updated = false; 45 | for (int j = beg; j < end; ++j) { 46 | int candid = graph[j]; 47 | 48 | if (_visited[candid]) continue; 49 | __syncthreads(); 50 | if (threadIdx.x == 0 and visited_cnt < visited_list_size) { 51 | _visited[candid] = true; 52 | _visited_list[visited_cnt++] = candid; 53 | } 54 | __syncthreads(); 55 | const cuda_scalar* dst_vec = target_data + num_dims * target_nodes[candid]; 56 | cuda_scalar dist = GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type); 57 | if (dist < entry_dist) { 58 | entry_dist = dist; 59 | entryid = candid; 60 | updated = true; 61 | } 62 | __syncthreads(); 63 | } 64 | if (threadIdx.x == 0) entries[i] = entryid; 65 | __syncthreads(); 66 | } 67 | 68 | __syncthreads(); 69 | if (threadIdx.x == 0) { 70 | acc_visited_cnt[blockIdx.x] += visited_cnt; 71 | } 72 | for (int j = threadIdx.x; j < visited_cnt; j += blockDim.x) { 73 | _visited[_visited_list[j]] = false; 74 | } 75 | __syncthreads(); 76 | } 77 | } 78 | 79 | __global__ void SearchGraphKernel( 80 | const cuda_scalar* qdata, const int num_qnodes, const cuda_scalar* data, const int num_nodes, 81 | const int num_dims, const int max_m, const int dist_type, 82 | const int ef_search, const int* entries, const int* graph, const int* deg, const int topk, 83 | int* nns, float* distances, int* found_cnt, 84 | int* visited_table, int* visited_list, 85 | const int visited_table_size, const int visited_list_size, int64_t* acc_visited_cnt, 86 | const bool reverse_cand, Neighbor* neighbors, int* global_cand_nodes, cuda_scalar* global_cand_distances 87 | ) { 88 | 89 | static __shared__ int size; 90 | 91 | Neighbor* ef_search_pq = neighbors + ef_search * blockIdx.x; 92 | int* cand_nodes = global_cand_nodes + ef_search * blockIdx.x; 93 | cuda_scalar* cand_distances = global_cand_distances + ef_search * blockIdx.x; 94 | 95 | static __shared__ int visited_cnt; 96 | int* _visited_table = visited_table + visited_table_size * blockIdx.x; 97 | int* _visited_list = visited_list + visited_list_size * blockIdx.x; 98 | 99 | for (int i = blockIdx.x; i < num_qnodes; i += 
gridDim.x) { 100 | if (threadIdx.x == 0) { 101 | size = 0; 102 | visited_cnt = 0; 103 | } 104 | __syncthreads(); 105 | 106 | // initialize entries 107 | const cuda_scalar* src_vec = qdata + i * num_dims; 108 | PushNodeToSearchPq(ef_search_pq, &size, ef_search, data, 109 | num_dims, dist_type, src_vec, entries[i]); 110 | if (CheckVisited(_visited_table, _visited_list, visited_cnt, entries[i], 111 | visited_table_size, visited_list_size)) 112 | continue; 113 | __syncthreads(); 114 | 115 | // iterate until converge 116 | int idx = GetCand(ef_search_pq, size, reverse_cand); 117 | while (idx >= 0) { 118 | __syncthreads(); 119 | if (threadIdx.x == 0) ef_search_pq[idx].checked = true; 120 | int entry = ef_search_pq[idx].nodeid; 121 | __syncthreads(); 122 | 123 | for (int j = max_m * entry; j < max_m * entry + deg[entry]; ++j) { 124 | int dstid = graph[j]; 125 | 126 | if (CheckVisited(_visited_table, _visited_list, visited_cnt, dstid, 127 | visited_table_size, visited_list_size)) 128 | continue; 129 | __syncthreads(); 130 | 131 | const cuda_scalar* dst_vec = data + num_dims * dstid; 132 | cuda_scalar dist = GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type); 133 | 134 | PushNodeToSearchPq(ef_search_pq, &size, ef_search, 135 | data, num_dims, dist_type, src_vec, dstid); 136 | } 137 | __syncthreads(); 138 | idx = GetCand(ef_search_pq, size, reverse_cand); 139 | } 140 | __syncthreads(); 141 | if (threadIdx.x == 0) { 142 | acc_visited_cnt[blockIdx.x] += visited_cnt; 143 | } 144 | 145 | for (int j = threadIdx.x; j < visited_cnt; j += blockDim.x) { 146 | _visited_table[_visited_list[j]] = -1; 147 | } 148 | __syncthreads(); 149 | // get sorted neighbors 150 | if (threadIdx.x == 0) { 151 | int size2 = size; 152 | while (size > 0) { 153 | cand_nodes[size - 1] = ef_search_pq[0].nodeid; 154 | cand_distances[size - 1] = ef_search_pq[0].distance; 155 | PqPop(ef_search_pq, &size); 156 | } 157 | found_cnt[i] = size2 < topk? size2: topk; 158 | for (int j = 0; j < found_cnt[i]; ++j) { 159 | nns[j + i * topk] = cand_nodes[j]; 160 | distances[j + i * topk] = out_scalar(cand_distances[j]); 161 | } 162 | } 163 | __syncthreads(); 164 | } 165 | } 166 | 167 | 168 | 169 | } // namespace cuhnsw 170 | -------------------------------------------------------------------------------- /cpp/include/cuda_utils_kernels.cuh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 6 | #pragma once 7 | #include "cuda_base_kernels.cuh" 8 | #include "cuda_dist_kernels.cuh" 9 | #include "cuda_heap_kernels.cuh" 10 | 11 | 12 | namespace cuhnsw { 13 | 14 | __inline__ __device__ 15 | int warp_reduce_cand(const Neighbor* pq, int cand, const bool reverse) { 16 | #if __CUDACC_VER_MAJOR__ >= 9 17 | unsigned int active = __activemask(); 18 | #pragma unroll 19 | for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { 20 | int _cand = __shfl_down_sync(active, cand, offset); 21 | if (_cand >= 0) { 22 | if (cand == -1) { 23 | cand = _cand; 24 | } else { 25 | bool update = reverse? 
26 |             lt(pq[cand].distance, pq[_cand].distance):
27 |             gt(pq[cand].distance, pq[_cand].distance);
28 |         if (update) cand = _cand;
29 |       }
30 |     }
31 |   }
32 | #else
33 |   #pragma unroll
34 |   for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
35 |     int _cand = __shfl_down(cand, offset);
36 |     if (_cand >= 0) {
37 |       if (cand == -1) {
38 |         cand = _cand;
39 |       } else {
40 |         bool update = reverse?
41 |             lt(pq[cand].distance, pq[_cand].distance):
42 |             gt(pq[cand].distance, pq[_cand].distance);
43 |         if (update) cand = _cand;
44 |       }
45 |     }
46 |   }
47 | #endif
48 |   return cand;
49 | }
50 | __inline__ __device__
51 | bool CheckAlreadyExists(const Neighbor* pq, const int size, const int nodeid) {
52 |   __syncthreads();
53 |   // figure out the warp / position inside the warp
54 |   int warp = threadIdx.x / WARP_SIZE;
55 |   int lane = threadIdx.x % WARP_SIZE;
56 | 
57 |   static __shared__ bool shared[WARP_SIZE];
58 |   bool exists = false;
59 |   for (int i = threadIdx.x; i < size; i += blockDim.x) {
60 |     if (pq[i].nodeid == nodeid) {
61 |       exists = true;
62 |     }
63 |   }
64 | 
65 | #if __CUDACC_VER_MAJOR__ >= 9
66 |   unsigned int active = __activemask();
67 |   exists = __any_sync(active, exists);
68 | #else
69 |   exists = __any(exists);
70 | #endif
71 |   // write out the partial reduction to shared memory if appropriate
72 |   if (lane == 0) {
73 |     shared[warp] = exists;
74 |   }
75 | 
76 |   __syncthreads();
77 | 
78 |   // if we don't have multiple warps, we're done
79 |   if (blockDim.x <= WARP_SIZE) {
80 |     return shared[0];
81 |   }
82 | 
83 | 
84 |   // otherwise reduce again in the first warp
85 |   exists = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane] : false;
86 |   if (warp == 0) {
87 | #if __CUDACC_VER_MAJOR__ >= 9
88 |     active = __activemask();
89 |     exists = __any_sync(active, exists);
90 | #else
91 |     exists = __any(exists);
92 | #endif
93 |     // broadcast back to shared memory
94 |     if (threadIdx.x == 0) {
95 |       shared[0] = exists;
96 |     }
97 |   }
98 |   __syncthreads();
99 |   return shared[0];
100 | 
101 | 
102 | 
103 | }
104 | __inline__ __device__
105 | int GetCand(const Neighbor* pq, const int size, const bool reverse) {
106 |   __syncthreads();
107 | 
108 |   // figure out the warp / position inside the warp
109 |   int warp = threadIdx.x / WARP_SIZE;
110 |   int lane = threadIdx.x % WARP_SIZE;
111 | 
112 |   static __shared__ int shared[WARP_SIZE];
113 |   // pick the closest neighbor with checked = false if reverse = false and vice versa
114 |   cuda_scalar dist = reverse? -INFINITY: INFINITY;
115 |   int cand = -1;
116 |   for (int i = threadIdx.x; i < size; i += blockDim.x) {
117 |     if (not pq[i].checked) {
118 |       bool update = reverse? lt(dist, pq[i].distance): gt(dist, pq[i].distance);
119 |       if (update) {
120 |         cand = i;
121 |         dist = pq[i].distance;
122 |       }
123 |     }
124 |   }
125 |   cand = warp_reduce_cand(pq, cand, reverse);
126 | 
127 | 
128 |   // write out the partial reduction to shared memory if appropriate
129 |   if (lane == 0) {
130 |     shared[warp] = cand;
131 |   }
132 |   __syncthreads();
133 | 
134 |   // if we don't have multiple warps, we're done
135 |   if (blockDim.x <= WARP_SIZE) {
136 |     return shared[0];
137 |   }
138 | 
139 | 
140 |   // otherwise reduce again in the first warp
141 |   cand = (threadIdx.x < blockDim.x / WARP_SIZE) ?
shared[lane] : -1; 142 | if (warp == 0) { 143 | cand = warp_reduce_cand(pq, cand, reverse); 144 | // broadcast back to shared memory 145 | if (threadIdx.x == 0) { 146 | shared[0] = cand; 147 | } 148 | } 149 | __syncthreads(); 150 | return shared[0]; 151 | } 152 | 153 | __inline__ __device__ 154 | void PushNodeToPq(Neighbor* pq, int* size, const int max_size, 155 | const cuda_scalar* data, const int num_dims, const int dist_type, 156 | const int srcid, const int dstid, const int* nodes) { 157 | if (srcid == dstid) return; 158 | if (CheckAlreadyExists(pq, *size, dstid)) return; 159 | cuda_scalar dist = GetDistance(srcid, dstid, num_dims, dist_type, nodes, data); 160 | __syncthreads(); 161 | if (*size < max_size) { 162 | PqPush(pq, size, dist, dstid, false); 163 | } else if (gt(pq[0].distance, dist)) { 164 | PqPop(pq, size); 165 | PqPush(pq, size, dist, dstid, false); 166 | } 167 | __syncthreads(); 168 | } 169 | 170 | __inline__ __device__ 171 | void PushNodeToPq2(Neighbor* pq, int* size, const int max_size, 172 | const cuda_scalar dist, const int srcid, const int dstid, const int* nodes) { 173 | if (srcid == dstid) return; 174 | if (CheckAlreadyExists(pq, *size, dstid)) return; 175 | __syncthreads(); 176 | if (*size < max_size) { 177 | PqPush(pq, size, dist, dstid, false); 178 | } else if (gt(pq[0].distance, dist)) { 179 | PqPop(pq, size); 180 | PqPush(pq, size, dist, dstid, false); 181 | } 182 | __syncthreads(); 183 | } 184 | 185 | // similar to bloom filter 186 | // while bloom filter prevents false negative, this visited table prevents false positive 187 | // if it says the node is visited, it is actually visited 188 | // if it says the node is not visited, it can be possibly visited 189 | __inline__ __device__ 190 | bool CheckVisited(int* visited_table, int* visited_list, int& visited_cnt, int target, 191 | const int visited_table_size, const int visited_list_size) { 192 | __syncthreads(); 193 | bool ret = false; 194 | if (visited_cnt < visited_list_size ){ 195 | int idx = target % visited_table_size; 196 | if (visited_table[idx] != target) { 197 | __syncthreads(); 198 | if (threadIdx.x == 0) { 199 | if (visited_table[idx] == -1) { 200 | visited_table[idx] = target; 201 | visited_list[visited_cnt++] = idx; 202 | } 203 | } 204 | } else { 205 | ret = true; 206 | } 207 | } 208 | __syncthreads(); 209 | return ret; 210 | } 211 | 212 | __inline__ __device__ 213 | void PushNodeToSearchPq(Neighbor* pq, int* size, const int max_size, 214 | const cuda_scalar* data, const int num_dims, const int dist_type, 215 | const cuda_scalar* src_vec, const int dstid) { 216 | if (CheckAlreadyExists(pq, *size, dstid)) return; 217 | const cuda_scalar* dst_vec = data + num_dims * dstid; 218 | cuda_scalar dist = GetDistanceByVec(src_vec, dst_vec, num_dims, dist_type); 219 | __syncthreads(); 220 | if (*size < max_size) { 221 | PqPush(pq, size, dist, dstid, false); 222 | } else if (gt(pq[0].distance, dist)) { 223 | PqPop(pq, size); 224 | PqPush(pq, size, dist, dstid, false); 225 | } 226 | __syncthreads(); 227 | } 228 | 229 | 230 | } // namespace cuhnsw 231 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CUHNSW 2 | 3 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Build Status](https://travis-ci.org/js1010/cuhnsw.svg?branch=main)](https://travis-ci.org/js1010/cuhnsw) [![contributions 
welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/dwyl/learn-travis/issues)
4 | 
5 | Efficient CUDA implementation of the Hierarchical Navigable Small World (HNSW) graph algorithm for Approximate Nearest Neighbor (ANN) search
6 | 
7 | ### Introduction
8 | 
9 | This project speeds up the HNSW algorithm with CUDA. I expect that anyone interested in this project is already familiar with the following paper and open source project. If not, I strongly recommend checking them first.
10 | 
11 | - hnsw paper: https://arxiv.org/pdf/1603.09320.pdf (2016)
12 | - hnsw implementation (cpu only) by the author of hnsw (Yury Malkov): https://github.com/nmslib/hnswlib
13 | - Approximate Nearest Neighbor (ANN) Benchmark Site: http://ann-benchmarks.com/
14 | 
15 | I also adapted some ideas from the following project.
16 | 
17 | - n2 (alternative hnsw cpu implementation project): https://github.com/kakao/n2
18 | 
19 | A brief survey turned up several papers and projects that propose speeding up ANN algorithms with GPUs.
20 | 
21 | - papers or projects related to using GPU for ANN
22 | - paper (2020): http://research.baidu.com/Public/uploads/5f5c37aa9c37c.pdf
23 | - paper (2017): https://arxiv.org/pdf/1702.05911.pdf
24 | - slides (2020): https://wangzwhu.github.io/home/file/acmmm-t-part3-ann.pdf
25 | - project (2017): https://github.com/facebookresearch/faiss
26 | - paper (2019): https://arxiv.org/pdf/1912.01059.pdf (source repo: https://github.com/cgtuebingen/ggnn) [**UPDATED**: recently found..]
27 | 
28 | I started this project because I was originally interested in both CUDA programming and ANN algorithms. I am releasing it because it achieved meaningful performance, and I hope to develop it further through community participation.
29 | 
30 | This package builds HNSW graphs on the GPU and runs approximate nearest neighbor search through the built graphs, and its model file format is compatible with hnswlib. In other words, you can build an HNSW graph with this package, save it, then load it from hnswlib for search, and vice versa.
31 | 
32 | 
33 | ### How to install
34 | 
35 | 1. pip install
36 | 
37 | ```shell
38 | pip install cuhnsw
39 | ```
40 | 
41 | 2. build from source
42 | 
43 | ```shell
44 | # clone repo and submodules
45 | git clone git@github.com:js1010/cuhnsw.git && cd cuhnsw && git submodule update --init
46 | 
47 | # install requirements
48 | pip install -r requirements.txt
49 | 
50 | # generate proto
51 | python -m grpc_tools.protoc --python_out cuhnsw/ --proto_path cuhnsw/proto/ config.proto
52 | 
53 | # install
54 | python setup.py install
55 | ```
56 | 
57 | ### How to use
58 | 
59 | - `examples/example1.py` and `examples/README.md` are very helpful for understanding the usage.
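- all of the option parameters documented further below can be passed as a plain Python dict; for reference, a populated dict mirroring the one in `examples/example1.py` (any omitted key falls back to its default in `cuhnsw/proto/config.proto`):

```python
from cuhnsw import CuHNSW

# values taken from examples/example1.py; omitted keys fall back to
# the defaults defined in cuhnsw/proto/config.proto
opt = {
  "c_log_level": 2,
  "ef_construction": 150,
  "hyper_threads": 100,
  "block_dim": 32,
  "nrz": True,            # normalize data vectors (angular/cosine data)
  "reverse_cand": False,
  "heuristic_coef": 0.0,
  "dist_type": "dot",
}
ch0 = CuHNSW(opt)
```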
60 | - build and save model
61 | 
62 | ```python
63 | import h5py
64 | import numpy as np
65 | from cuhnsw import CuHNSW
66 | 
67 | h5f = h5py.File("glove-50-angular.hdf5", "r")
68 | data = h5f["train"][:, :].astype(np.float32)
69 | h5f.close()
70 | ch0 = CuHNSW(opt={})
71 | ch0.set_data(data)
72 | ch0.build()
73 | ch0.save_index("cuhnsw.index")
74 | ```
75 | 
76 | - load model and search
77 | 
78 | ```python
79 | import h5py
80 | import numpy as np
81 | from cuhnsw import CuHNSW
82 | 
83 | h5f = h5py.File("glove-50-angular.hdf5", "r")
84 | data = h5f["test"][:, :].astype(np.float32)
85 | h5f.close()
86 | ch0 = CuHNSW(opt={})
87 | ch0.load_index("cuhnsw.index")
88 | nns, distances, found_cnt = ch0.search_knn(data, topk=10, ef_search=300)
89 | ```
90 | 
91 | - Option parameters (see `cuhnsw/proto/config.proto`)
92 | - `seed`: numpy random seed (used to draw random levels)
93 | - `c_log_level`: log level in cpp logging (spdlog)
94 | - `py_log_level`: log level in python logging
95 | - `max_m`: maximum number of links in layers higher than the ground layer
96 | - `max_m0`: maximum number of links in the ground layer
97 | - `level_mult`: multiplier used to draw the level of each element (default: 0 => set to `1 / log(max_m0)` during initialization, as recommended in the hnsw paper)
98 | - `save_remains`: link to the remaining candidates in SearchHeuristic (adapted from n2)
99 | - `heuristic_coef`: fraction of the closest candidates selected unconditionally (also adapted from n2)
100 | - `hyper_threads`: sets the number of gpu blocks so that the total number of concurrent threads exceeds the number of physical cores
101 | - `block_dim`: block dimension (should be at most 32^2=1024 and a multiple of 32)
102 | - `nrz`: normalize data vectors if True
103 | - `visited_table_size`: size of the table storing the visited nodes in each search
104 | - `visited_list_size`: size of the list storing the visited nodes in each search (used to reset the table after each search)
105 | - `reverse_cand`: select the candidate with the furthest distance if True (it makes the build slower but achieves better quality)
106 | - `dist_type`: euclidean distance if "l2" and inner product distance if "dot"
107 | 
108 | ### Performance
109 | 
110 | - tl;dr
111 | - cuhnsw achieved the same build quality with an 8-9 times faster build time than hnswlib running on 8 vcpus, on certain data and parameter setups
112 | - cuhnsw achieved the same search quality with a 3-4 times faster search time than hnswlib running on an 8-vcpu instance, on certain data and parameter setups
113 | - Note1: the HNSW search algorithm can be verified by exact match since it is deterministic.
114 | - I verified it against hnswlib; in other words, cuhnsw search and hnswlib search return exactly the same results when given the same model file, the same queries, and the same ef search.
115 | - Note2: GPU search has an advantage over CPU search only when it comes to `Batch` search (i.e. processing a large number of queries at once).
116 | - [AWS P3 2xlarge instance](https://aws.amazon.com/ec2/instance-types/p3/) is used for the experiment. (One Tesla V100 GPU with 8 vcpus, 3.06 USD / hr)
117 | - results can be reproduced by running `examples/example1.py`.
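- for clarity, `build quality` and `accuracy` below mean average recall against the ground-truth neighbors shipped with the dataset; a minimal sketch of the computation, matching the evaluation loop in `examples/example1.py`:

```python
import numpy as np

def mean_recall(pred_nn, gt_nn, topk):
    # fraction of ground-truth top-k neighbors recovered per query,
    # averaged over all queries (as in examples/example1.py)
    accs = [len(set(p) & set(g)) / float(topk) for p, g in zip(pred_nn, gt_nn)]
    return float(np.mean(accs)), float(np.std(accs))
```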
117 | - build time / quality results on glove-50-angular
118 | - used `ef_construction`=150 for hnswlib and `ef_construction`=110 for cuhnsw to achieve the same build quality
119 | - build quality is measured by the search accuracy under the same search parameter (`ef_search`=300)
120 | - build time is in seconds
121 | 
122 | | attr | 1 vcpu | 2 vcpu | 4 vcpu | 8 vcpu | gpu |
123 | |:--------------|-----------:|-----------:|----------:|----------:|----------:|
124 | | build time | 343.909 | 179.836 | 89.7936 | 70.5476 | 8.2847 |
125 | | build quality | 0.863193 | 0.863301 | 0.863238 | 0.863165 | 0.865471 |
126 | 
127 | - update: measured build time / accuracy for a cpu-only instance ([c5.24xlarge](https://aws.amazon.com/ec2/instance-types/c5/), 96 vcpu, 4.08 USD / hr): 9.6275 sec / 0.8628
128 | - search time comparison on glove-50-angular
129 | - search time on 1M random queries (seconds)
130 | - search `quality` is guaranteed to be the same (exact match)
131 | 
132 | | attr | 1 vcpu | 2 vcpu | 4 vcpu | 8 vcpu | gpu |
133 | |:------------|--------:|--------:|--------:|--------:|--------:|
134 | | search time | 556.605 | 287.967 | 146.331 | 115.431 | 29.7008 |
135 | 
136 | - update: measured 1M-query search time for a cpu-only instance ([c5.24xlarge](https://aws.amazon.com/ec2/instance-types/c5/), 96 vcpu, 4.08 USD / hr): 22.4642 sec
137 | 
138 | - the parallel efficiency drops significantly from 4 vcpus to 8 vcpus, likely because of hyper-threading (there might be only 4 "physical" cores on this instance).
139 | 
140 | ### Thoughts on Future Tasks
141 | 
142 | - The word in the parentheses shows the expected level of difficulty for each task
143 | 
144 | 1. **implement parallel compilation using bazel or cmake (easy-medium)**: bazel is preferable. compilation time is a little bit painful so far.
145 | 2. **achieve significant speed-up by using half-precision operations (medium)**: I experimented with it, but only got around a 10% improvement. I am not sure if I have used the half-precision feature appropriately.
146 | 3. **support multi-device (very hard)**: it supports only a single device (gpu) so far, since the graph must be shared across all the building threads.
147 | 
148 | - contribution is always welcome
149 | 
--------------------------------------------------------------------------------
/cuda_setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 Jisang Yoon
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the Apache 2.0 license found in the
5 | # LICENSE file in the root directory of this source tree.
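# Editorial note: this module does three things: (1) locates the CUDA
# toolkit via the CUDA_PATH / CUDAHOME / CUDA_HOME environment variables or
# by searching PATH for nvcc, (2) derives the -arch / -gencode flags for the
# detected toolkit version, and (3) hooks custom compiler classes into
# distutils so that .cu sources are compiled with nvcc.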
6 | 
7 | # Adapted from https://github.com/rmcgibbo/npcuda-example and
8 | # https://github.com/cupy/cupy/blob/master/cupy_setup_build.py
9 | # pylint: disable=fixme,access-member-before-definition
10 | # pylint: disable=attribute-defined-outside-init,arguments-differ
11 | import logging
12 | import os
13 | import sys
14 | 
15 | from distutils import ccompiler, errors, msvccompiler, unixccompiler
16 | from setuptools.command.build_ext import build_ext as setuptools_build_ext
17 | 
18 | 
19 | HALF_PRECISION = False
20 | 
21 | def find_in_path(name, path):
22 | "Find a file in a search path"
23 | # adapted from http://code.activestate.com/
24 | # recipes/52224-find-a-file-given-a-search-path/
25 | for _dir in path.split(os.pathsep):
26 | binpath = os.path.join(_dir, name)
27 | if os.path.exists(binpath):
28 | return os.path.abspath(binpath)
29 | return None
30 | 
31 | # reference: https://arnon.dk/
32 | # matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
33 | def get_cuda_sm_list(cuda_ver):
34 | if "CUDA_SM_LIST" in os.environ:
35 | sm_list = os.environ["CUDA_SM_LIST"].split(",")
36 | else:
37 | sm_list = ["30", "52", "60", "61", "70", "75", "80", "86"]
38 | if cuda_ver >= 110:
39 | filter_list = ["30"]
40 | if cuda_ver == 110:
41 | filter_list += ["86"]
42 | else:
43 | filter_list = ["80", "86"]
44 | if cuda_ver < 100:
45 | filter_list += ["75"]
46 | if cuda_ver < 90:
47 | filter_list += ["70"]
48 | if cuda_ver < 80:
49 | filter_list += ["60", "61"]
50 | sm_list = [sm for sm in sm_list if sm not in filter_list]
51 | return sm_list
52 | 
53 | 
54 | def get_cuda_compute(cuda_ver):
55 | if "CUDA_COMPUTE" in os.environ:
56 | compute = os.environ["CUDA_COMPUTE"]
57 | else:
58 | if 70 <= cuda_ver < 80:
59 | compute = "52"
60 | if 80 <= cuda_ver < 90:
61 | compute = "61"
62 | if 90 <= cuda_ver < 100:
63 | compute = "70"
64 | if 100 <= cuda_ver < 110:
65 | compute = "75"
66 | if cuda_ver == 110:
67 | compute = "80"
68 | if cuda_ver >= 111:  # fall back to the newest known target for 11.1+ toolkits
69 | compute = "86"
70 | return compute
71 | 
72 | 
73 | def get_cuda_arch(cuda_ver):
74 | if "CUDA_ARCH" in os.environ:
75 | arch = os.environ["CUDA_ARCH"]
76 | else:
77 | if 70 <= cuda_ver < 92:
78 | arch = "30"
79 | if 92 <= cuda_ver < 110:
80 | arch = "50"
81 | if cuda_ver == 110:
82 | arch = "52"
83 | if cuda_ver >= 111:  # fall back to the newest known target for 11.1+ toolkits
84 | arch = "80"
85 | return arch
86 | 
87 | def locate_cuda():
88 | """Locate the CUDA environment on the system
89 | If a valid cuda installation is found
90 | this returns a dict with keys 'home', 'nvcc', 'include',
91 | and 'lib64' and values giving the absolute path to each directory.
92 | Starts by looking for the CUDAHOME env variable.
93 | If not found, everything is based on finding
94 | 'nvcc' in the PATH.
95 | If nvcc can't be found, this returns None
96 | """
97 | nvcc_bin = 'nvcc'
98 | if sys.platform.startswith("win"):
99 | nvcc_bin = 'nvcc.exe'
100 | 
101 | # check env variables CUDA_HOME, CUDAHOME, CUDA_PATH.
102 | found = False
103 | for env_name in ['CUDA_PATH', 'CUDAHOME', 'CUDA_HOME']:
104 | if env_name not in os.environ:
105 | continue
106 | found = True
107 | home = os.environ[env_name]
108 | nvcc = os.path.join(home, 'bin', nvcc_bin)
109 | break
110 | if not found:
111 | # otherwise, search the PATH for NVCC
112 | nvcc = find_in_path(nvcc_bin, os.environ['PATH'])
113 | if nvcc is None:
114 | logging.warning('The nvcc binary could not be located in your '
115 | '$PATH. 
Either add it to ' 116 | 'your path, or set $CUDA_HOME to enable CUDA extensions') 117 | return None 118 | home = os.path.dirname(os.path.dirname(nvcc)) 119 | 120 | cudaconfig = {'home': home, 121 | 'nvcc': nvcc, 122 | 'include': os.path.join(home, 'include'), 123 | 'lib64': os.path.join(home, 'lib64')} 124 | cuda_ver = os.path.basename(os.path.realpath(home)).split("-")[1].split(".") 125 | major, minor = int(cuda_ver[0]), int(cuda_ver[1]) 126 | cuda_ver = 10 * major + minor 127 | assert cuda_ver >= 70, f"too low cuda ver {major}.{minor}" 128 | print(f"cuda_ver: {major}.{minor}") 129 | arch = get_cuda_arch(cuda_ver) 130 | sm_list = get_cuda_sm_list(cuda_ver) 131 | compute = get_cuda_compute(cuda_ver) 132 | post_args = [f"-arch=sm_{arch}"] + \ 133 | [f"-gencode=arch=compute_{sm},code=sm_{sm}" for sm in sm_list] + \ 134 | [f"-gencode=arch=compute_{compute},code=compute_{compute}", 135 | "--ptxas-options=-v", "-O2"] 136 | print(f"nvcc post args: {post_args}") 137 | if HALF_PRECISION: 138 | post_args = [flag for flag in post_args if "52" not in flag] 139 | 140 | if sys.platform == "win32": 141 | cudaconfig['lib64'] = os.path.join(home, 'lib', 'x64') 142 | post_args += ['-Xcompiler', '/MD', '-std=c++14', "-Xcompiler", "/openmp"] 143 | if HALF_PRECISION: 144 | post_args += ["-Xcompiler", "/D HALF_PRECISION"] 145 | else: 146 | post_args += ['-c', '--compiler-options', "'-fPIC'", 147 | "--compiler-options", "'-std=c++14'"] 148 | if HALF_PRECISION: 149 | post_args += ["--compiler-options", "'-D HALF_PRECISION'"] 150 | for k, val in cudaconfig.items(): 151 | if not os.path.exists(val): 152 | logging.warning('The CUDA %s path could not be located in %s', k, val) 153 | return None 154 | 155 | cudaconfig['post_args'] = post_args 156 | return cudaconfig 157 | 158 | 159 | # This code to build .cu extensions with nvcc is taken from cupy: 160 | # https://github.com/cupy/cupy/blob/master/cupy_setup_build.py 161 | class _UnixCCompiler(unixccompiler.UnixCCompiler): 162 | src_extensions = list(unixccompiler.UnixCCompiler.src_extensions) 163 | src_extensions.append('.cu') 164 | 165 | def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): 166 | # For sources other than CUDA C ones, just call the super class method. 167 | if os.path.splitext(src)[1] != '.cu': 168 | return unixccompiler.UnixCCompiler._compile( 169 | self, obj, src, ext, cc_args, extra_postargs, pp_opts) 170 | 171 | # For CUDA C source files, compile them with NVCC. 172 | _compiler_so = self.compiler_so 173 | try: 174 | nvcc_path = CUDA['nvcc'] 175 | post_args = CUDA['post_args'] 176 | # TODO? base_opts = build.get_compiler_base_options() 177 | self.set_executable('compiler_so', nvcc_path) 178 | 179 | return unixccompiler.UnixCCompiler._compile( 180 | self, obj, src, ext, cc_args, post_args, pp_opts) 181 | finally: 182 | self.compiler_so = _compiler_so 183 | 184 | 185 | class _MSVCCompiler(msvccompiler.MSVCCompiler): 186 | _cu_extensions = ['.cu'] 187 | 188 | src_extensions = list(unixccompiler.UnixCCompiler.src_extensions) 189 | src_extensions.extend(_cu_extensions) 190 | 191 | def _compile_cu(self, sources, output_dir=None, macros=None, 192 | include_dirs=None, debug=0, extra_preargs=None, 193 | extra_postargs=None, depends=None): 194 | # Compile CUDA C files, mainly derived from UnixCCompiler._compile(). 
195 | macros, objects, extra_postargs, pp_opts, _build = \ 196 | self._setup_compile(output_dir, macros, include_dirs, sources, 197 | depends, extra_postargs) 198 | 199 | compiler_so = CUDA['nvcc'] 200 | cc_args = self._get_cc_args(pp_opts, debug, extra_preargs) 201 | post_args = CUDA['post_args'] 202 | 203 | for obj in objects: 204 | try: 205 | src, _ = _build[obj] 206 | except KeyError: 207 | continue 208 | try: 209 | self.spawn([compiler_so] + cc_args + [src, '-o', obj] + post_args) 210 | except errors.DistutilsExecError as e: 211 | raise errors.CompileError(str(e)) 212 | 213 | return objects 214 | 215 | def compile(self, sources, **kwargs): 216 | # Split CUDA C sources and others. 217 | cu_sources = [] 218 | other_sources = [] 219 | for source in sources: 220 | if os.path.splitext(source)[1] == '.cu': 221 | cu_sources.append(source) 222 | else: 223 | other_sources.append(source) 224 | 225 | # Compile source files other than CUDA C ones. 226 | other_objects = msvccompiler.MSVCCompiler.compile( 227 | self, other_sources, **kwargs) 228 | 229 | # Compile CUDA C sources. 230 | cu_objects = self._compile_cu(cu_sources, **kwargs) 231 | 232 | # Return compiled object filenames. 233 | return other_objects + cu_objects 234 | 235 | 236 | class CudaBuildExt(setuptools_build_ext): 237 | """Custom `build_ext` command to include CUDA C source files.""" 238 | 239 | def run(self): 240 | if CUDA is not None: 241 | def wrap_new_compiler(func): 242 | def _wrap_new_compiler(*args, **kwargs): 243 | try: 244 | return func(*args, **kwargs) 245 | except errors.DistutilsPlatformError: 246 | if sys.platform != 'win32': 247 | CCompiler = _UnixCCompiler 248 | else: 249 | CCompiler = _MSVCCompiler 250 | return CCompiler( 251 | None, kwargs['dry_run'], kwargs['force']) 252 | return _wrap_new_compiler 253 | ccompiler.new_compiler = wrap_new_compiler(ccompiler.new_compiler) 254 | # Intentionally causes DistutilsPlatformError in 255 | # ccompiler.new_compiler() function to hook. 256 | self.compiler = 'nvidia' 257 | 258 | setuptools_build_ext.run(self) 259 | 260 | 261 | CUDA = locate_cuda() 262 | assert CUDA is not None 263 | BUILDEXT = CudaBuildExt if CUDA else setuptools_build_ext 264 | -------------------------------------------------------------------------------- /examples/example1.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Jisang Yoon 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 
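# Editorial note: this script is driven by python-fire (fire.Fire() at the
# bottom), so every top-level function below is a CLI subcommand, e.g.
# `python example1.py download` or `python example1.py run_experiments`;
# see examples/README.md for the full walkthrough.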
6 | 7 | # pylint: disable=no-name-in-module,logging-format-truncated 8 | import os 9 | from os.path import join as pjoin 10 | import time 11 | import subprocess 12 | 13 | import h5py 14 | import fire 15 | # import tqdm 16 | import numpy as np 17 | import pandas as pd 18 | 19 | import hnswlib 20 | from cuhnsw import aux, CuHNSW 21 | 22 | LOGGER = aux.get_logger() 23 | 24 | NUM_DATA = 1183514 25 | DATA_FILE = "glove-50-angular.hdf5" 26 | DIST_TYPE = "cosine" 27 | 28 | # NUM_DATA = 1000000 29 | # DATA_FILE = "sift-128-euclidean.hdf5" 30 | # DIST_TYPE = "l2" 31 | 32 | BARRIER_SIZE = 100 33 | RES_DIR = "res" 34 | INDEX_FILE = "hnswlib.index" 35 | CUHNSW_INDEX_FILE = "cuhnsw.index" 36 | HNSWLIB_INDEX_FILE = "hnswlib.index" 37 | DATA_URL = f"http://ann-benchmarks.com/{DATA_FILE}" 38 | NRZ = DIST_TYPE == "cosine" 39 | OPT = { \ 40 | "c_log_level": 2, 41 | "ef_construction": 150, 42 | "hyper_threads": 100, 43 | "block_dim": 32, 44 | "nrz": NRZ, 45 | "reverse_cand": False, 46 | "heuristic_coef": 0.0, 47 | "dist_type": DIST_TYPE, \ 48 | } 49 | 50 | 51 | def download(): 52 | if not os.path.exists(RES_DIR): 53 | os.makedirs(RES_DIR) 54 | data_path = pjoin(RES_DIR, DATA_FILE) 55 | if os.path.exists(data_path): 56 | return 57 | cmds = ["wget", DATA_URL, "-O", data_path + ".tmp"] 58 | cmds = " ".join(cmds) 59 | LOGGER.info("download data: %s", cmds) 60 | subprocess.call(cmds, shell=True) 61 | os.rename(data_path + ".tmp", data_path) 62 | 63 | 64 | def run_cpu_inference(topk=100, ef_search=300, index_file=INDEX_FILE, 65 | evaluate=True, num_threads=-1): 66 | print("=" * BARRIER_SIZE) 67 | data_path = pjoin(RES_DIR, DATA_FILE) 68 | index_path = pjoin(RES_DIR, index_file) 69 | LOGGER.info("cpu inference on %s with index %s", data_path, index_path) 70 | h5f = h5py.File(data_path, "r") 71 | num_data = h5f["train"].shape[0] 72 | queries = h5f["test"][:, :].astype(np.float32) 73 | neighbors = h5f["neighbors"][:, :topk].astype(np.int32) 74 | h5f.close() 75 | hl0 = hnswlib.Index(space=DIST_TYPE, dim=queries.shape[1]) 76 | LOGGER.info("load %s by hnswlib", index_path) 77 | num_queries = queries.shape[0] 78 | hl0.load_index(index_path, max_elements=num_data) 79 | hl0.set_ef(ef_search) 80 | if NRZ: 81 | queries /= np.linalg.norm(queries, axis=1)[:, None] 82 | 83 | start = time.time() 84 | labels, _ = hl0.knn_query(queries, k=topk, num_threads=num_threads) 85 | el0 = time.time() - start 86 | LOGGER.info("elapsed for processing %d queries computing top@%d: %.4e sec", 87 | num_queries, topk, el0) 88 | if evaluate: 89 | accs = [] 90 | for _pred_nn, _gt_nn in zip(labels, neighbors): 91 | intersection = set(_pred_nn) & set(_gt_nn) 92 | acc = len(intersection) / float(topk) 93 | accs.append(acc) 94 | LOGGER.info("accuracy mean: %.4e, std: %.4e", np.mean(accs), np.std(accs)) 95 | return el0, np.mean(accs) 96 | return el0 97 | 98 | def run_cpu_inference_large(topk=100, index_file=INDEX_FILE, ef_search=300, 99 | num_queries=1000000, num_dims=50, num_threads=-1): 100 | print("=" * BARRIER_SIZE) 101 | index_path = pjoin(RES_DIR, index_file) 102 | data_path = pjoin(RES_DIR, DATA_FILE) 103 | LOGGER.info("cpu inference on %s with index %s", data_path, index_path) 104 | 105 | queries = np.random.normal(size=(num_queries, num_dims)).astype(np.float32) 106 | queries /= np.linalg.norm(queries, axis=1)[:, None] 107 | 108 | hl0 = hnswlib.Index(space=DIST_TYPE, dim=queries.shape[1]) 109 | LOGGER.info("load %s by hnswlib", index_path) 110 | hl0.load_index(index_path, max_elements=NUM_DATA) 111 | hl0.set_ef(ef_search) 112 | queries /= 
np.linalg.norm(queries, axis=1)[:, None] 113 | 114 | start = time.time() 115 | _, _ = hl0.knn_query(queries, k=topk, num_threads=num_threads) 116 | el0 = time.time() - start 117 | LOGGER.info("elapsed for inferencing %d queries of top@%d (ef_search: %d): " 118 | "%.4e sec", num_queries, topk, ef_search, el0) 119 | return el0 120 | 121 | def run_cpu_training(ef_const=150, num_threads=-1): 122 | print("=" * BARRIER_SIZE) 123 | data_path = pjoin(RES_DIR, DATA_FILE) 124 | LOGGER.info("cpu training on %s with ef const %d, num_threads: %d", 125 | data_path, ef_const, num_threads) 126 | h5f = h5py.File(data_path, "r") 127 | data = h5f["train"][:, :].astype(np.float32) 128 | h5f.close() 129 | hl0 = hnswlib.Index(space=DIST_TYPE, dim=data.shape[1]) 130 | num_data = data.shape[0] 131 | data /= np.linalg.norm(data, axis=1)[:, None] 132 | hl0.init_index(max_elements=num_data, ef_construction=ef_const, M=12) 133 | LOGGER.info("add data to hnswlib") 134 | start = time.time() 135 | hl0.add_items(data, np.arange(num_data, dtype=np.int32), 136 | num_threads=num_threads) 137 | el0 = time.time() - start 138 | LOGGER.info("elapsed for adding %d items: %.4e sec", num_data, el0) 139 | index_path = pjoin(RES_DIR, HNSWLIB_INDEX_FILE) 140 | hl0.save_index(index_path) 141 | LOGGER.info("index saved to %s", index_path) 142 | return el0 143 | 144 | def run_gpu_inference(topk=100, index_file=INDEX_FILE, ef_search=300): 145 | print("=" * BARRIER_SIZE) 146 | data_path = pjoin(RES_DIR, DATA_FILE) 147 | index_path = pjoin(RES_DIR, index_file) 148 | LOGGER.info("gpu inference on %s with index %s", data_path, index_path) 149 | ch0 = CuHNSW(OPT) 150 | LOGGER.info("load model from %s by cuhnsw", index_path) 151 | ch0.load_index(index_path) 152 | 153 | h5f = h5py.File(data_path, "r") 154 | queries = h5f["test"][:, :].astype(np.float32) 155 | neighbors = h5f["neighbors"][:, :topk].astype(np.int32) 156 | h5f.close() 157 | num_queries = queries.shape[0] 158 | if NRZ: 159 | queries /= np.linalg.norm(queries, axis=1)[:, None] 160 | 161 | start = time.time() 162 | pred_nn, _, _ = ch0.search_knn(queries, topk, ef_search) 163 | el0 = time.time() - start 164 | LOGGER.info("elapsed for inferencing %d queries of top@%d (ef_search: %d): " 165 | "%.4e sec", num_queries, topk, ef_search, el0) 166 | accs = [] 167 | for _pred_nn, _gt_nn in zip(pred_nn, neighbors): 168 | intersection = set(_pred_nn) & set(_gt_nn) 169 | acc = len(intersection) / float(topk) 170 | accs.append(acc) 171 | LOGGER.info("accuracy mean: %.4e, std: %.4e", np.mean(accs), np.std(accs)) 172 | return el0, np.mean(accs) 173 | 174 | def run_gpu_inference2(topk=5, index_file="cuhnsw.index", ef_search=300): 175 | print("=" * BARRIER_SIZE) 176 | data_path = pjoin(RES_DIR, DATA_FILE) 177 | index_path = pjoin(RES_DIR, index_file) 178 | LOGGER.info("gpu inference on %s with index %s", data_path, index_path) 179 | ch0 = CuHNSW(OPT) 180 | LOGGER.info("load model from %s by cuhnsw", index_path) 181 | ch0.load_index(index_path) 182 | 183 | h5f = h5py.File(data_path, "r") 184 | data = h5f["train"][:, :].astype(np.float32) 185 | queries = h5f["test"][:5, :].astype(np.float32) 186 | h5f.close() 187 | if NRZ: 188 | data /= np.linalg.norm(data, axis=1)[:, None] 189 | 190 | nns, distances, found_cnt = ch0.search_knn(queries[:5], topk, ef_search) 191 | for idx, (nn0, distance, cnt) in \ 192 | enumerate(zip(nns, distances, found_cnt)): 193 | print("=" * BARRIER_SIZE) 194 | print(f"query {idx + 1}") 195 | print("-" * BARRIER_SIZE) 196 | for _idx, (_nn, _dist) in enumerate(zip(nn0[:cnt], 
distance[:cnt])):
197 | if DIST_TYPE == "l2":
198 | real_dist = np.linalg.norm(data[_nn] - queries[idx])
199 | _dist = np.sqrt(_dist)
200 | elif DIST_TYPE == "dot":
201 | real_dist = data[_nn].dot(queries[idx])
202 | print(f"rank {_idx + 1}. neighbor: {_nn}, dist by lib: {_dist}, "
203 | f"actual dist: {real_dist}")
204 | 
205 | 
206 | def run_gpu_inference_large(topk=100, index_file=INDEX_FILE, ef_search=300,
207 | num_queries=1000000, num_dims=50):
208 | print("=" * BARRIER_SIZE)
209 | index_path = pjoin(RES_DIR, index_file)
210 | data_path = pjoin(RES_DIR, DATA_FILE)
211 | LOGGER.info("gpu inference on %s with index %s", data_path, index_path)
212 | ch0 = CuHNSW(OPT)
213 | LOGGER.info("load model from %s by cuhnsw", index_path)
214 | ch0.load_index(index_path)
215 | 
216 | queries = np.random.normal(size=(num_queries, num_dims)).astype(np.float32)
217 | num_queries = queries.shape[0]
218 | if NRZ:
219 | queries /= np.linalg.norm(queries, axis=1)[:, None]
220 | 
221 | start = time.time()
222 | _, _, _ = ch0.search_knn(queries, topk, ef_search)
223 | el0 = time.time() - start
224 | LOGGER.info("elapsed for inferencing %d queries of top@%d (ef_search: %d): "
225 | "%.4e sec", num_queries, topk, ef_search, el0)
226 | return el0
227 | 
228 | def run_gpu_training(ef_const=150):
229 | print("=" * BARRIER_SIZE)
230 | data_path = pjoin(RES_DIR, DATA_FILE)
231 | LOGGER.info("gpu training on %s with ef const %d", data_path, ef_const)
232 | OPT["ef_construction"] = ef_const
233 | ch0 = CuHNSW(OPT)
234 | h5f = h5py.File(data_path, "r")
235 | data = h5f["train"][:, :].astype(np.float32)
236 | h5f.close()
237 | ch0.set_data(data)
238 | start = time.time()
239 | ch0.build()
240 | el0 = time.time() - start
241 | LOGGER.info("elapsed time to build by cuhnsw: %.4e sec", el0)
242 | index_path = pjoin(RES_DIR, CUHNSW_INDEX_FILE)
243 | ch0.save_index(index_path)
244 | return el0
245 | 
246 | def measure_build_performance():
247 | build_time = {"attr": "build time"}
248 | build_quality = {"attr": "build quality"}
249 | build_time["gpu"] = run_gpu_training(ef_const=110)
250 | _, build_quality["gpu"] = run_gpu_inference(index_file="cuhnsw.index")
251 | for i in [1, 2, 4, 8]:
252 | build_time[f"{i} cpu"] = run_cpu_training(ef_const=150, num_threads=i)
253 | _, build_quality[f"{i} cpu"] = run_cpu_inference(index_file="hnswlib.index")
254 | columns = [f"{i} cpu" for i in [1, 2, 4, 8]] + ["gpu"]
255 | df0 = pd.DataFrame([build_time, build_quality])
256 | df0.set_index("attr", inplace=True)
257 | print(df0[columns].to_markdown())
258 | 
259 | def measure_search_performance():
260 | search_time = {"attr": "search time"}
261 | search_time["gpu"] = run_gpu_inference_large(index_file="cuhnsw.index")
262 | for i in [1, 2, 4, 8]:
263 | search_time[f"{i} cpu"] = run_cpu_inference_large(
264 | index_file="cuhnsw.index", num_threads=i)
265 | columns = [f"{i} cpu" for i in [1, 2, 4, 8]] + ["gpu"]
266 | df0 = pd.DataFrame([search_time])
267 | df0.set_index("attr", inplace=True)
268 | print(df0[columns].to_markdown())
269 | 
270 | 
271 | def run_experiments():
272 | measure_build_performance()
273 | measure_search_performance()
274 | 
275 | 
276 | if __name__ == "__main__":
277 | fire.Fire()
278 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. 
Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright [yyyy] [name of copyright owner]
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/cpp/include/cuda_build_kernels.cuh:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 Jisang Yoon
2 | // All rights reserved.
3 | //
4 | // This source code is licensed under the Apache 2.0 license found in the
5 | // LICENSE file in the root directory of this source tree.
6 | #pragma once
7 | #include "cuda_utils_kernels.cuh"
8 | 
9 | namespace cuhnsw {
10 | __inline__ __device__
11 | bool IsNeighbor(const int* graph, const int deg, const int dstid) {
12 | 
13 | __syncthreads();
14 | // figure out the warp/ position inside the warp
15 | int warp = threadIdx.x / WARP_SIZE;
16 | int lane = threadIdx.x % WARP_SIZE;
17 | 
18 | static __shared__ bool shared[WARP_SIZE];
19 | 
20 | __syncthreads();
21 | bool is_neighbor = false;
22 | for (int i = threadIdx.x; i < deg; i += blockDim.x) {
23 | if (graph[i] == dstid) {
24 | is_neighbor = true;
25 | break;
26 | }
27 | }
28 | __syncthreads();
29 | 
30 | #if __CUDACC_VER_MAJOR__ >= 9
31 | unsigned int active = __activemask();
32 | is_neighbor = __any_sync(active, is_neighbor);
33 | #else
34 | is_neighbor = __any(is_neighbor);
35 | #endif
36 | 
37 | // write out the partial reduction to shared memory if appropriate
38 | if (lane == 0) {
39 | shared[warp] = is_neighbor;
40 | }
41 | 
42 | __syncthreads();
43 | 
44 | // if we don't have multiple warps, we're done
45 | if (blockDim.x <= WARP_SIZE) {
46 | return shared[0];
47 | }
48 | 
49 | 
50 | // otherwise reduce again in the first warp
51 | is_neighbor = (threadIdx.x < blockDim.x / WARP_SIZE) ?
shared[lane] : false; 52 | if (warp == 0) { 53 | #if __CUDACC_VER_MAJOR__ >= 9 54 | active = __activemask(); 55 | is_neighbor = __any_sync(active, is_neighbor); 56 | #else 57 | is_neighbor = __any(is_neighbor); 58 | #endif 59 | // broadcast back to shared memory 60 | if (threadIdx.x == 0) { 61 | shared[0] = is_neighbor; 62 | } 63 | } 64 | __syncthreads(); 65 | return shared[0]; 66 | } 67 | 68 | __inline__ __device__ 69 | void SearchHeuristic( 70 | Neighbor* ef_const_pq, int* size, 71 | const int srcid, const int* nodes, 72 | const cuda_scalar* data, const int dist_type, const int num_dims, 73 | const int ef_construction, const int max_m, 74 | const bool save_remains, 75 | int* cand_nodes, cuda_scalar* cand_distances, 76 | int* graph, float* distances, int* deg, 77 | const float heuristic_coef, const int new_comer = -1) { 78 | int size2 = *size; 79 | __syncthreads(); 80 | 81 | // get sorted neighbors 82 | if (threadIdx.x == 0) { 83 | while (*size > 0) { 84 | cand_nodes[(*size) - 1] = ef_const_pq[0].nodeid; 85 | cand_distances[(*size) - 1] = ef_const_pq[0].distance; 86 | PqPop(ef_const_pq, size); 87 | } 88 | } 89 | __syncthreads(); 90 | 91 | // set variables for search heuristic 92 | int head = 0; 93 | int tail = max_m - 1; 94 | if (tail > size2 - 1) 95 | tail = size2 - 1; 96 | const int max_head = tail + 1; 97 | 98 | // take some proportion of closest nodes by default 99 | // this mechanism does not exist in hnswlib 100 | // it refers to https://github.com/kakao/n2/blob/36888c3869ac478d896d0921ac64f21930d85659/src/heuristic.cc#L42 101 | const int nn_num = max_m * heuristic_coef; 102 | 103 | int* _graph = graph + srcid * max_m; 104 | float* _distances = distances + srcid * max_m; 105 | bool new_comer_inserted = false; 106 | // search heuristic 107 | for (int j = 0; j < size2; ++j) { 108 | if (head >= max_m) break; 109 | bool freepass = head < nn_num or 110 | (new_comer >= 0 and not new_comer_inserted and cand_nodes[j] != new_comer); 111 | if (freepass) { 112 | if (threadIdx.x == 0) { 113 | _graph[head] = cand_nodes[j]; 114 | _distances[head] = out_scalar(cand_distances[j]); 115 | } 116 | head++; 117 | __syncthreads(); 118 | continue; 119 | } 120 | const cuda_scalar dist_to_src = cand_distances[j]; 121 | bool skip = false; 122 | if (new_comer >= 0 and new_comer_inserted) { 123 | cuda_scalar dist = GetDistance(cand_nodes[j], new_comer, 124 | num_dims, dist_type, nodes, data); 125 | skip = gt(dist_to_src, dist); 126 | } else { 127 | for (int k = 0; k < head; ++k) { 128 | cuda_scalar dist = GetDistance(cand_nodes[j], _graph[k], 129 | num_dims, dist_type, nodes, data); 130 | if (gt(dist_to_src, dist)) { 131 | skip = true; 132 | __syncthreads(); 133 | break; 134 | } 135 | } 136 | } 137 | if (cand_nodes[j] == new_comer and not skip) 138 | new_comer_inserted = true; 139 | 140 | if (skip and tail >= head) { 141 | if (threadIdx.x == 0) { 142 | _graph[tail] = cand_nodes[j]; 143 | _distances[tail] = out_scalar(cand_distances[j]); 144 | } 145 | tail--; 146 | } else if (not skip){ 147 | if (threadIdx.x == 0) { 148 | _graph[head] = cand_nodes[j]; 149 | _distances[head] = out_scalar(cand_distances[j]); 150 | } 151 | head++; 152 | } 153 | } 154 | __syncthreads(); 155 | 156 | // copy to graph 157 | // take remaining nodes as new neighbors 158 | // it also refers to https://github.com/kakao/n2/blob/36888c3869ac478d896d0921ac64f21930d85659/src/heuristic.cc#L85 159 | // it does not exist in hnswlib as well 160 | if (threadIdx.x == 0) deg[srcid] = save_remains? 
max_head: head;
161 | __syncthreads();
162 | }
163 | 
164 | __global__ void BuildLevelGraphKernel(
165 | const cuda_scalar* data, const int* nodes,
166 | const int num_dims, const int num_nodes, const int max_m, const int dist_type,
167 | const bool save_remains, const int ef_construction, int* graph, float* distances, int* deg,
168 | int* visited_table, int* visited_list, const int visited_table_size, const int visited_list_size,
169 | int* mutex, int64_t* acc_visited_cnt,
170 | const bool reverse_cand, Neighbor* neighbors, int* global_cand_nodes, cuda_scalar* global_cand_distances,
171 | const float heuristic_coef, int* backup_neighbors, cuda_scalar* backup_distances, bool* went_through_heuristic
172 | ) {
173 | 
174 | static __shared__ int size;
175 | static __shared__ int visited_cnt;
176 | 
177 | // storage to store neighbors and distances temporarily
178 | static __shared__ int backup_deg;
179 | int* _backup_neighbors = backup_neighbors + max_m * blockIdx.x;
180 | cuda_scalar* _backup_distances = backup_distances + max_m * blockIdx.x;
181 | 
182 | Neighbor* ef_const_pq = neighbors + ef_construction * blockIdx.x;
183 | int* cand_nodes = global_cand_nodes + ef_construction * blockIdx.x;
184 | cuda_scalar* cand_distances = global_cand_distances + ef_construction * blockIdx.x;
185 | int* _visited_table = visited_table + visited_table_size * blockIdx.x;
186 | int* _visited_list = visited_list + visited_list_size * blockIdx.x;
187 | 
188 | for (int i = blockIdx.x; i < num_nodes; i += gridDim.x) {
189 | if (threadIdx.x == 0) {
190 | size = 0;
191 | visited_cnt = 0;
192 | }
193 | __syncthreads();
194 | int srcid = i;
195 | // read access of srcid
196 | if (threadIdx.x == 0) {
197 | while (atomicCAS(&mutex[srcid], 0, 1)) {}
198 | }
199 | __syncthreads();
200 | 
201 | // initialize entries as neighbors
202 | for (int j = max_m * i; j < max_m * i + deg[i]; ++j) {
203 | int dstid = graph[j];
204 | if (CheckVisited(_visited_table, _visited_list, visited_cnt, dstid,
205 | visited_table_size, visited_list_size))
206 | continue;
207 | __syncthreads();
208 | 
209 | PushNodeToPq(ef_const_pq, &size, ef_construction,
210 | data, num_dims, dist_type, srcid, dstid, nodes);
211 | }
212 | __syncthreads();
213 | 
214 | // release lock
215 | if (threadIdx.x == 0) mutex[srcid] = 0;
216 | __syncthreads();
217 | 
218 | // iterate until convergence
219 | int idx = GetCand(ef_const_pq, size, reverse_cand);
220 | while (idx >= 0) {
221 | __syncthreads();
222 | if (threadIdx.x == 0) ef_const_pq[idx].checked = true;
223 | int entry = ef_const_pq[idx].nodeid;
224 | 
225 | // read access of entry
226 | if (threadIdx.x == 0) {
227 | while (atomicCAS(&mutex[entry], 0, 1)) {}
228 | }
229 | __syncthreads();
230 | 
231 | for (int j = max_m * entry; j < max_m * entry + deg[entry]; ++j) {
232 | int dstid = graph[j];
233 | 
234 | if (CheckVisited(_visited_table, _visited_list, visited_cnt, dstid,
235 | visited_table_size, visited_list_size))
236 | continue;
237 | __syncthreads();
238 | 
239 | PushNodeToPq(ef_const_pq, &size, ef_construction,
240 | data, num_dims, dist_type, srcid, dstid, nodes);
241 | }
242 | __syncthreads();
243 | 
244 | // release lock
245 | if (threadIdx.x == 0) mutex[entry] = 0;
246 | __syncthreads();
247 | idx = GetCand(ef_const_pq, size, reverse_cand);
248 | }
249 | 
250 | __syncthreads();
251 | if (threadIdx.x == 0) {
252 | acc_visited_cnt[blockIdx.x] += visited_cnt;
253 | }
254 | for (int j = threadIdx.x; j < visited_cnt; j += blockDim.x) {
255 | _visited_table[_visited_list[j]] = -1;
256 | }
257 | __syncthreads();
258 | 
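// Editorial note: the remainder of the loop body relinks the graph using the
// candidates gathered above. The block (1) re-acquires the lock on srcid and
// re-pushes its current neighbors into the pq, (2) runs SearchHeuristic to
// select srcid's final neighbor list, (3) backs that list up to a scratch
// buffer so later updates cannot overwrite it, and (4) for each backed-up
// neighbor that does not already link back to srcid, pushes srcid (and the
// neighbor's current links) into the pq and re-runs SearchHeuristic for that
// neighbor, all guarded by the per-node mutexes.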
259 | // write access of srcid
260 | if (threadIdx.x == 0) {
261 | while (atomicCAS(&mutex[srcid], 0, 1)) {}
262 | }
263 | __syncthreads();
264 | 
265 | for (int j = 0; j < deg[srcid]; ++j) {
266 | int dstid = graph[srcid * max_m + j];
267 | PushNodeToPq(ef_const_pq, &size, ef_construction,
268 | data, num_dims, dist_type, srcid, dstid, nodes);
269 | }
270 | 
271 | // run search heuristic for myself
272 | SearchHeuristic(ef_const_pq, &size, srcid, nodes,
273 | data, dist_type, num_dims,
274 | ef_construction, max_m, save_remains,
275 | cand_nodes, cand_distances,
276 | graph, distances, deg, heuristic_coef);
277 | 
278 | if (threadIdx.x == 0) went_through_heuristic[srcid] = true;
279 | 
280 | __syncthreads();
281 | 
282 | // backup neighbors to handle overwrite
283 | if (threadIdx.x == 0) backup_deg = deg[srcid];
284 | __syncthreads();
285 | for (int j = threadIdx.x; j < backup_deg; j += blockDim.x) {
286 | _backup_neighbors[j] = graph[srcid * max_m + j];
287 | _backup_distances[j] = conversion(distances[srcid * max_m + j]);
288 | }
289 | __syncthreads();
290 | // release lock
291 | if (threadIdx.x == 0) mutex[srcid] = 0;
292 | __syncthreads();
293 | 
294 | // run search heuristic for neighbors
295 | for (int j = 0; j < backup_deg; ++j) {
296 | int dstid = _backup_neighbors[j];
297 | cuda_scalar dist = _backup_distances[j];
298 | __syncthreads();
299 | // write access of dstid
300 | if (threadIdx.x == 0) {
301 | while (atomicCAS(&mutex[dstid], 0, 1)) {}
302 | }
303 | __syncthreads();
304 | 
305 | const int* _graph = graph + max_m * dstid;
306 | const int _deg = deg[dstid];
307 | bool is_neighbor = IsNeighbor(_graph, _deg, srcid);
308 | if (not is_neighbor) {
309 | PushNodeToPq2(ef_const_pq, &size, ef_construction,
310 | dist, dstid, srcid, nodes);
311 | for (int k = 0; k < _deg; ++k) {
312 | int dstid2 = _graph[k];
313 | dist = conversion(distances[dstid * max_m + k]);
314 | PushNodeToPq2(ef_const_pq, &size, ef_construction,
315 | dist, dstid, dstid2, nodes);
316 | }
317 | 
318 | __syncthreads();
319 | const int new_comer = not save_remains and went_through_heuristic[dstid]? srcid: -1;
320 | __syncthreads();
321 | SearchHeuristic(ef_const_pq, &size, dstid, nodes,
322 | data, dist_type, num_dims,
323 | ef_construction, max_m, save_remains,
324 | cand_nodes, cand_distances,
325 | graph, distances, deg, heuristic_coef, new_comer);
326 | if (threadIdx.x == 0) went_through_heuristic[dstid] = true;
327 | __syncthreads();
328 | }
329 | // release lock
330 | if (threadIdx.x == 0) mutex[dstid] = 0;
331 | __syncthreads();
332 | }
333 | __syncthreads();
334 | }
335 | 
336 | // cooperative_groups::grid_group g = cooperative_groups::this_grid();
337 | // g.sync();
338 | }
339 | 
340 | } // namespace cuhnsw
341 | 
--------------------------------------------------------------------------------
/cpp/src/cuhnsw_build.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 Jisang Yoon
2 | // All rights reserved.
3 | //
4 | // This source code is licensed under the Apache 2.0 license found in the
5 | // LICENSE file in the root directory of this source tree.
6 | #include <set>
7 | #include <numeric>
8 | 
9 | #include "cuhnsw.hpp"
10 | #include "cuda_search_kernels.cuh"
11 | #include "cuda_build_kernels.cuh"
12 | 
13 | namespace cuhnsw {
14 | 
15 | void CuHNSW::GetDeviceInfo() {
16 | CHECK_CUDA(cudaGetDevice(&devId_));
17 | cudaDeviceProp prop;
18 | CHECK_CUDA(cudaGetDeviceProperties(&prop, devId_));
19 | mp_cnt_ = prop.multiProcessorCount;
20 | major_ = prop.major;
21 | minor_ = prop.minor;
22 | cores_ = -1;
23 | }
24 | 
25 | void CuHNSW::GetEntryPoints(
26 | const std::vector<int>& nodes,
27 | std::vector<int>& entries,
28 | int level, bool search) {
29 | int size = nodes.size();
30 | 
31 | // process input data for kernel
32 | LevelGraph& graph = level_graphs_[level];
33 | const std::vector<int>& upper_nodes = graph.GetNodes();
34 | int upper_size = upper_nodes.size();
35 | std::vector<int> deg(upper_size);
36 | std::vector<int> neighbors(upper_size * max_m_);
37 | for (int i = 0; i < upper_size; ++i) {
38 | const std::vector<std::pair<float, int>>& _neighbors = graph.GetNeighbors(upper_nodes[i]);
39 | deg[i] = _neighbors.size();
40 | int offset = max_m_ * i;
41 | for (int j = 0; j < deg[i]; ++j) {
42 | neighbors[offset + j] = graph.GetNodeId(_neighbors[j].second);
43 | }
44 | }
45 | for (int i = 0; i < size; ++i) {
46 | int entryid = graph.GetNodeId(entries[i]);
47 | entries[i] = entryid;
48 | }
49 | 
50 | // copy to gpu mem
51 | thrust::device_vector<int> dev_nodes(size), dev_entries(size);
52 | thrust::device_vector<int> dev_upper_nodes(upper_size), dev_deg(upper_size);
53 | thrust::device_vector<int> dev_neighbors(upper_size * max_m_);
54 | thrust::copy(nodes.begin(), nodes.end(), dev_nodes.begin());
55 | thrust::copy(entries.begin(), entries.end(), dev_entries.begin());
56 | thrust::copy(upper_nodes.begin(), upper_nodes.end(), dev_upper_nodes.begin());
57 | thrust::copy(deg.begin(), deg.end(), dev_deg.begin());
58 | thrust::copy(neighbors.begin(), neighbors.end(), dev_neighbors.begin());
59 | 
60 | thrust::device_vector<bool> dev_visited(upper_size * block_cnt_, false);
61 | thrust::device_vector<int> dev_visited_list(visited_list_size_ * block_cnt_);
62 | thrust::device_vector<int64_t> dev_acc_visited_cnt(block_cnt_, 0);
63 | thrust::device_vector<cuda_scalar>& qdata = search? 
device_qdata_: device_data_;
64 | 
65 | // run kernel
66 | GetEntryPointsKernel<<<block_cnt_, block_dim_>>>(
67 | thrust::raw_pointer_cast(qdata.data()),
68 | thrust::raw_pointer_cast(dev_nodes.data()),
69 | thrust::raw_pointer_cast(device_data_.data()),
70 | thrust::raw_pointer_cast(dev_upper_nodes.data()),
71 | num_dims_, size, upper_size, max_m_, dist_type_,
72 | thrust::raw_pointer_cast(dev_neighbors.data()),
73 | thrust::raw_pointer_cast(dev_deg.data()),
74 | thrust::raw_pointer_cast(dev_visited.data()),
75 | thrust::raw_pointer_cast(dev_visited_list.data()),
76 | visited_list_size_,
77 | thrust::raw_pointer_cast(dev_entries.data()),
78 | thrust::raw_pointer_cast(dev_acc_visited_cnt.data())
79 | );
80 | CHECK_CUDA(cudaDeviceSynchronize());
81 | // el_[GPU] += sw_[GPU].CheckPoint();
82 | thrust::copy(dev_entries.begin(), dev_entries.end(), entries.begin());
83 | std::vector<int64_t> acc_visited_cnt(block_cnt_);
84 | thrust::copy(dev_acc_visited_cnt.begin(), dev_acc_visited_cnt.end(), acc_visited_cnt.begin());
85 | CHECK_CUDA(cudaDeviceSynchronize());
86 | int64_t full_visited_cnt = std::accumulate(acc_visited_cnt.begin(), acc_visited_cnt.end(), 0LL);
87 | DEBUG("full visited cnt: {}", full_visited_cnt);
88 | 
89 | // set output
90 | for (int i = 0; i < size; ++i) {
91 | entries[i] = upper_nodes[entries[i]];
92 | }
93 | 
94 | }
95 | 
96 | void CuHNSW::BuildGraph() {
97 | visited_ = new bool[batch_size_ * num_data_];
98 | for (int level = max_level_; level >= 0; --level) {
99 | DEBUG("build graph of level: {}", level);
100 | BuildLevelGraph(level);
101 | }
102 | }
103 | 
104 | void CuHNSW::BuildLevelGraph(int level) {
105 | std::set<int> upper_nodes;
106 | std::vector<int> new_nodes;
107 | LevelGraph& graph = level_graphs_[level];
108 | const std::vector<int>& nodes = graph.GetNodes();
109 | int size = nodes.size();
110 | int max_m = level > 0? 
max_m_: max_m0_;
111 | thrust::host_vector<int> graph_vec(size * max_m, 0);
112 | thrust::host_vector<int> deg(size, 0);
113 | if (level < max_level_) {
114 | LevelGraph& upper_graph = level_graphs_[level + 1];
115 | for (auto& node: upper_graph.GetNodes()) {
116 | upper_nodes.insert(node);
117 | int srcid = graph.GetNodeId(node);
118 | int idx = 0;
119 | for (auto& nb: upper_graph.GetNeighbors(node)) {
120 | int dstid = graph.GetNodeId(nb.second);
121 | graph_vec[max_m * srcid + (idx++)] = dstid;
122 | }
123 | deg[srcid] = idx;
124 | }
125 | }
126 | 
127 | for (auto& node: graph.GetNodes()) {
128 | if (upper_nodes.count(node)) continue;
129 | new_nodes.push_back(node);
130 | }
131 | 
132 | // initialize entries
133 | std::vector<int> entries(new_nodes.size(), enter_point_);
134 | 
135 | for (int l = max_level_; l > level; --l)
136 | GetEntryPoints(new_nodes, entries, l, false);
137 | for (int i = 0; i < new_nodes.size(); ++i) {
138 | int srcid = graph.GetNodeId(new_nodes[i]);
139 | int dstid = graph.GetNodeId(entries[i]);
140 | graph_vec[max_m * srcid] = dstid;
141 | deg[srcid] = 1;
142 | }
143 | 
144 | thrust::device_vector<int> device_graph(max_m * size);
145 | thrust::device_vector<float> device_distances(max_m * size);
146 | thrust::device_vector<int> device_deg(size);
147 | thrust::device_vector<int> device_nodes(size);
148 | thrust::device_vector<int> device_visited_table(visited_table_size_ * block_cnt_, -1);
149 | thrust::device_vector<int> device_visited_list(visited_list_size_ * block_cnt_);
150 | thrust::device_vector<int> device_mutex(size, 0);
151 | thrust::device_vector<int64_t> device_acc_visited_cnt(block_cnt_, 0);
152 | thrust::device_vector<Neighbor> device_neighbors(ef_construction_ * block_cnt_);
153 | thrust::device_vector<int> device_cand_nodes(ef_construction_ * block_cnt_);
154 | thrust::device_vector<cuda_scalar> device_cand_distances(ef_construction_ * block_cnt_);
155 | thrust::device_vector<int> device_backup_neighbors(max_m * block_cnt_);
156 | thrust::device_vector<cuda_scalar> device_backup_distances(max_m * block_cnt_);
157 | thrust::device_vector<bool> device_went_through_heuristic(size, false);
158 | 
159 | thrust::copy(graph_vec.begin(), graph_vec.end(), device_graph.begin());
160 | thrust::copy(deg.begin(), deg.end(), device_deg.begin());
161 | thrust::copy(nodes.begin(), nodes.end(), device_nodes.begin());
162 | 
163 | BuildLevelGraphKernel<<<block_cnt_, block_dim_>>>(
164 | thrust::raw_pointer_cast(device_data_.data()),
165 | thrust::raw_pointer_cast(device_nodes.data()),
166 | num_dims_, size, max_m, dist_type_, save_remains_,
167 | ef_construction_,
168 | thrust::raw_pointer_cast(device_graph.data()),
169 | thrust::raw_pointer_cast(device_distances.data()),
170 | thrust::raw_pointer_cast(device_deg.data()),
171 | thrust::raw_pointer_cast(device_visited_table.data()),
172 | thrust::raw_pointer_cast(device_visited_list.data()),
173 | visited_table_size_, visited_list_size_,
174 | thrust::raw_pointer_cast(device_mutex.data()),
175 | thrust::raw_pointer_cast(device_acc_visited_cnt.data()),
176 | reverse_cand_,
177 | thrust::raw_pointer_cast(device_neighbors.data()),
178 | thrust::raw_pointer_cast(device_cand_nodes.data()),
179 | thrust::raw_pointer_cast(device_cand_distances.data()),
180 | heuristic_coef_,
181 | thrust::raw_pointer_cast(device_backup_neighbors.data()),
182 | thrust::raw_pointer_cast(device_backup_distances.data()),
183 | thrust::raw_pointer_cast(device_went_through_heuristic.data())
184 | );
185 | CHECK_CUDA(cudaDeviceSynchronize());
186 | thrust::copy(device_deg.begin(), device_deg.end(), deg.begin());
187 | thrust::copy(device_graph.begin(), device_graph.end(), 
  thrust::copy(device_deg.begin(), device_deg.end(), deg.begin());
  thrust::copy(device_graph.begin(), device_graph.end(), graph_vec.begin());
  std::vector<float> distances(max_m * size);
  thrust::copy(device_distances.begin(), device_distances.end(), distances.begin());

  std::vector<int64_t> acc_visited_cnt(block_cnt_);
  thrust::copy(device_acc_visited_cnt.begin(), device_acc_visited_cnt.end(), acc_visited_cnt.begin());
  CHECK_CUDA(cudaDeviceSynchronize());
  int64_t full_visited_cnt = std::accumulate(acc_visited_cnt.begin(), acc_visited_cnt.end(), 0LL);
  DEBUG("full number of visited nodes: {}", full_visited_cnt);

  for (auto& node: graph.GetNodes()) {
    graph.ClearEdges(node);
  }
  for (int i = 0; i < size; ++i) {
    int src = nodes[i];
    for (int j = 0; j < deg[i]; ++j) {
      int dst = nodes[graph_vec[i * max_m + j]];
      float dist = distances[i * max_m + j];
      graph.AddEdge(src, dst, dist);
    }
  }
}

void CuHNSW::SearchGraph(const float* qdata, const int num_queries, const int topk, const int ef_search,
    int* nns, float* distances, int* found_cnt) {
  device_qdata_.resize(num_queries * num_dims_);
#ifdef HALF_PRECISION
  std::vector<cuda_scalar> hdata(num_queries * num_dims_);
  for (int i = 0; i < num_queries * num_dims_; ++i)
    hdata[i] = conversion(qdata[i]);
  thrust::copy(hdata.begin(), hdata.end(), device_qdata_.begin());
#else
  thrust::copy(qdata, qdata + num_queries * num_dims_, device_qdata_.begin());
#endif
  std::vector<int> qnodes(num_queries);
  std::iota(qnodes.begin(), qnodes.end(), 0);
  std::vector<int> entries(num_queries, enter_point_);
  for (int l = max_level_; l > 0; --l)
    GetEntryPoints(qnodes, entries, l, true);
  std::vector<int> graph_vec(max_m0_ * num_data_);
  std::vector<int> deg(num_data_);
  LevelGraph& graph = level_graphs_[0];
  for (int i = 0; i < num_data_; ++i) {
    const std::vector<std::pair<float, int>>& neighbors = graph.GetNeighbors(i);
    int nbsize = neighbors.size();
    int offset = i * max_m0_;
    for (int j = 0; j < nbsize; ++j)
      graph_vec[offset + j] = neighbors[j].second;
    deg[i] = nbsize;
  }

  thrust::device_vector<int> device_graph(max_m0_ * num_data_);
  thrust::device_vector<int> device_deg(num_data_);
  thrust::device_vector<int> device_entries(num_queries);
  thrust::device_vector<int> device_nns(num_queries * topk);
  thrust::device_vector<float> device_distances(num_queries * topk);
  thrust::device_vector<int> device_found_cnt(num_queries);
  thrust::device_vector<int> device_visited_table(visited_table_size_ * block_cnt_, -1);
  thrust::device_vector<int> device_visited_list(visited_list_size_ * block_cnt_);
  thrust::device_vector<int64_t> device_acc_visited_cnt(block_cnt_, 0);
  thrust::device_vector<int> device_neighbors(ef_search * block_cnt_);
  thrust::device_vector<int> device_cand_nodes(ef_search * block_cnt_);
  thrust::device_vector<float> device_cand_distances(ef_search * block_cnt_);

  thrust::copy(graph_vec.begin(), graph_vec.end(), device_graph.begin());
  thrust::copy(deg.begin(), deg.end(), device_deg.begin());
  thrust::copy(entries.begin(), entries.end(), device_entries.begin());
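  // each thread block walks the base layer from its query's precomputed
  // entry point, using the per-block ef_search-sized scratch buffers above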
  SearchGraphKernel<<<block_cnt_, block_dim_>>>(
    thrust::raw_pointer_cast(device_qdata_.data()),
    num_queries,
    thrust::raw_pointer_cast(device_data_.data()),
    num_data_, num_dims_, max_m0_, dist_type_, ef_search,
    thrust::raw_pointer_cast(device_entries.data()),
    thrust::raw_pointer_cast(device_graph.data()),
    thrust::raw_pointer_cast(device_deg.data()),
    topk,
    thrust::raw_pointer_cast(device_nns.data()),
    thrust::raw_pointer_cast(device_distances.data()),
    thrust::raw_pointer_cast(device_found_cnt.data()),
    thrust::raw_pointer_cast(device_visited_table.data()),
    thrust::raw_pointer_cast(device_visited_list.data()),
    visited_table_size_, visited_list_size_,
    thrust::raw_pointer_cast(device_acc_visited_cnt.data()),
    reverse_cand_,
    thrust::raw_pointer_cast(device_neighbors.data()),
    thrust::raw_pointer_cast(device_cand_nodes.data()),
    thrust::raw_pointer_cast(device_cand_distances.data())
  );
  CHECK_CUDA(cudaDeviceSynchronize());
  std::vector<int64_t> acc_visited_cnt(block_cnt_);
  thrust::copy(device_acc_visited_cnt.begin(), device_acc_visited_cnt.end(), acc_visited_cnt.begin());
  thrust::copy(device_nns.begin(), device_nns.end(), nns);
  thrust::copy(device_distances.begin(), device_distances.end(), distances);
  thrust::copy(device_found_cnt.begin(), device_found_cnt.end(), found_cnt);
  CHECK_CUDA(cudaDeviceSynchronize());
  int64_t full_visited_cnt = std::accumulate(acc_visited_cnt.begin(), acc_visited_cnt.end(), 0LL);
  DEBUG("full number of visited nodes: {}", full_visited_cnt);
  if (labelled_)
    for (int i = 0; i < num_queries * topk; ++i)
      nns[i] = labels_[nns[i]];

  device_qdata_.clear();
  device_qdata_.shrink_to_fit();
}

} // namespace cuhnsw
--------------------------------------------------------------------------------
/cpp/src/cuhnsw_base.cu:
--------------------------------------------------------------------------------
// Copyright (c) 2020 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
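// host-side plumbing for CuHNSW: device capability detection, json config
// parsing, data upload, level assignment, and hnswlib-compatible index IO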

#include <fstream>
#include <numeric>

#include "cuhnsw.hpp"

namespace cuhnsw {

CuHNSW::CuHNSW() {
  logger_ = CuHNSWLogger().get_logger();

  GetDeviceInfo();
  // reference: https://stackoverflow.com/a/32531982
  switch (major_) {
    case 2: // Fermi
      if (minor_ == 1)
        cores_ = mp_cnt_ * 48;
      else
        cores_ = mp_cnt_ * 32;
      break;
    case 3: // Kepler
      cores_ = mp_cnt_ * 192;
      break;
    case 5: // Maxwell
      cores_ = mp_cnt_ * 128;
      break;
    case 6: // Pascal
      if (minor_ == 1 or minor_ == 2)
        cores_ = mp_cnt_ * 128;
      else if (minor_ == 0)
        cores_ = mp_cnt_ * 64;
      else
        DEBUG0("Unknown device type");
      break;
    case 7: // Volta and Turing
      if (minor_ == 0 or minor_ == 5)
        cores_ = mp_cnt_ * 64;
      else
        DEBUG0("Unknown device type");
      break;
    case 8: // Ampere
      if (minor_ == 0)
        cores_ = mp_cnt_ * 64;
      else if (minor_ == 6)
        cores_ = mp_cnt_ * 128;
      else
        DEBUG0("Unknown device type");
      break;
    default:
      DEBUG0("Unknown device type");
      break;
  }
  if (cores_ == -1) cores_ = mp_cnt_ * 128;
  INFO("cuda device info, major: {}, minor: {}, multi processors: {}, cores: {}",
       major_, minor_, mp_cnt_, cores_);
  // sw_.resize(PROFILE_SIZE);
  // el_.resize(PROFILE_SIZE);
}


CuHNSW::~CuHNSW() {}

bool CuHNSW::Init(std::string opt_path) {
  std::ifstream in(opt_path.c_str());
  if (not in.is_open()) return false;

  std::string str((std::istreambuf_iterator<char>(in)),
                  std::istreambuf_iterator<char>());
  std::string err_cmt;
  auto _opt = json11::Json::parse(str, err_cmt);
  if (not err_cmt.empty()) return false;
  opt_ = _opt;
  max_m_ = opt_["max_m"].int_value();
  max_m0_ = opt_["max_m0"].int_value();
  save_remains_ = opt_["save_remains"].bool_value();
  ef_construction_ = opt_["ef_construction"].int_value();
  level_mult_ = opt_["level_mult"].number_value();
  batch_size_ = opt_["batch_size"].int_value();
  block_dim_ = opt_["block_dim"].int_value();
  visited_table_size_ = opt_["visited_table_size"].int_value();
  visited_list_size_ = opt_["visited_list_size"].int_value();
  if (not visited_table_size_)
    visited_table_size_ = visited_list_size_ * 2;
  heuristic_coef_ = opt_["heuristic_coef"].number_value();
  std::string dist_type = opt_["dist_type"].string_value();
  reverse_cand_ = opt_["reverse_cand"].bool_value();
  if (dist_type == "dot") {
    dist_type_ = DOT;
  } else if (dist_type == "l2") {
    dist_type_ = L2;
  } else {
    char buf[4096];
    snprintf(buf, sizeof(buf), "invalid dist type %s",
             dist_type.c_str());
    std::string msg(buf);
    throw std::runtime_error(msg);
  }
  CuHNSWLogger().set_log_level(opt_["c_log_level"].int_value());
  DEBUG("max_m: {}, max_m0: {}, save_remains: {}, ef_construction: {}, level_mult: {}, dist_type: {}",
        max_m_, max_m0_, save_remains_, ef_construction_, level_mult_, dist_type);
  return true;
}

void CuHNSW::SetData(const float* data, int num_data, int num_dims) {
  num_data_ = num_data;
  num_dims_ = num_dims;
  block_cnt_ = opt_["hyper_threads"].number_value() * (cores_ / block_dim_);
  DEBUG("copy data ({} x {}), block_cnt: {}, block_dim: {}",
        num_data, num_dims, block_cnt_, block_dim_);
  device_data_.resize(num_data * num_dims);
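  // under HALF_PRECISION the input floats are converted element-wise to
  // cuda_scalar (half) on the host before upload; otherwise they are
  // copied to the device as-is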
#ifdef HALF_PRECISION
  // DEBUG0("fp16")
  std::vector<cuda_scalar> hdata(num_data * num_dims);
  for (int i = 0; i < num_data * num_dims; ++i) {
    hdata[i] = conversion(data[i]);
    // DEBUG("hdata i: {}, scalar: {}", i, out_scalar(hdata[i]));
  }
  thrust::copy(hdata.begin(), hdata.end(), device_data_.begin());
#else
  // DEBUG0("fp32")
  thrust::copy(data, data + num_data * num_dims, device_data_.begin());
#endif
  data_ = data;
}

void CuHNSW::SetRandomLevels(const int* levels) {
  levels_.resize(num_data_);
  DEBUG("set levels of data (length: {})", num_data_);
  max_level_ = 0;
  std::vector<std::vector<int>> level_nodes(1);
  for (int i = 0; i < num_data_; ++i) {
    levels_[i] = levels[i];
    if (levels[i] > max_level_) {
      max_level_ = levels[i];
      level_nodes.resize(max_level_ + 1);
      enter_point_ = i;
    }
    for (int l = 0; l <= levels[i]; ++l)
      level_nodes[l].push_back(i);
  }
  DEBUG("max level: {}", max_level_);
  for (int i = 0; i <= max_level_; ++i)
    DEBUG("number of data in level {}: {}",
          i, level_nodes[i].size());
  level_graphs_.clear();
  for (int i = 0; i <= max_level_; ++i) {
    LevelGraph graph = LevelGraph();
    graph.SetNodes(level_nodes[i],
                   num_data_, ef_construction_);
    level_graphs_.push_back(graph);
  }
}

// save graph compatible with hnswlib (https://github.com/nmslib/hnswlib)
void CuHNSW::SaveIndex(std::string fpath) {
  std::ofstream output(fpath, std::ios::binary);
  DEBUG("save index to {}", fpath);

  // write meta values
  DEBUG0("write meta values");
  size_t data_size = num_dims_ * sizeof(scalar);
  size_t max_elements = num_data_;
  size_t cur_element_count = num_data_;
  size_t M = max_m_;
  size_t maxM = max_m_;
  size_t maxM0 = max_m0_;
  int maxlevel = max_level_;
  size_t size_links_level0 = maxM0 * sizeof(tableint) + sizeof(sizeint);
  size_t size_links_per_element = maxM * sizeof(tableint) + sizeof(sizeint);
  size_t size_data_per_element = size_links_level0 + data_size + sizeof(labeltype);
  size_t ef_construction = ef_construction_;
  double mult = level_mult_;
  size_t offsetData = size_links_level0;
  size_t label_offset = size_links_level0 + data_size;
  size_t offsetLevel0 = 0;
  tableint enterpoint_node = enter_point_;

  writeBinaryPOD(output, offsetLevel0);
  writeBinaryPOD(output, max_elements);
  writeBinaryPOD(output, cur_element_count);
  writeBinaryPOD(output, size_data_per_element);
  writeBinaryPOD(output, label_offset);
  writeBinaryPOD(output, offsetData);
  writeBinaryPOD(output, maxlevel);
  writeBinaryPOD(output, enterpoint_node);
  writeBinaryPOD(output, maxM);
  writeBinaryPOD(output, maxM0);
  writeBinaryPOD(output, M);
  writeBinaryPOD(output, mult);
  writeBinaryPOD(output, ef_construction);
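  // hnswlib packs each level0 element as
  // [sizeint link_count][maxM0 x tableint links][raw vector][labeltype label],
  // so the buffer below is filled slot by slot at fixed strides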

  // write level0 links and data
  DEBUG0("write level0 links and data");
  std::vector<char> data_level0_memory(cur_element_count * size_data_per_element);
  LevelGraph& graph = level_graphs_[0];
  std::vector<tableint> links;
  links.reserve(max_m0_);
  size_t offset = 0;
  for (int i = 0; i < cur_element_count; ++i) {
    links.clear();
    for (const auto& pr: graph.GetNeighbors(i))
      links.push_back(static_cast<tableint>(pr.second));

    sizeint size = links.size();
    memcpy(data_level0_memory.data() + offset, &size, sizeof(sizeint));
    offset += sizeof(sizeint);
    if (size > 0)
      memcpy(data_level0_memory.data() + offset, &links[0], sizeof(tableint) * size);
    offset += maxM0 * sizeof(tableint);
    memcpy(data_level0_memory.data() + offset, &data_[i * num_dims_], data_size);
    offset += data_size;
    labeltype label = i;
    memcpy(data_level0_memory.data() + offset, &label, sizeof(labeltype));
    offset += sizeof(labeltype);
  }
  output.write(data_level0_memory.data(), cur_element_count * size_data_per_element);

  // write upper layer links
  DEBUG0("write upper layer links");
  for (int i = 0; i < num_data_; ++i) {
    unsigned int size = size_links_per_element * levels_[i];
    writeBinaryPOD(output, size);
    if (size) {
      std::vector<char> mem(size);
      offset = 0;
      for (int j = 1; j <= levels_[i]; ++j) {
        links.clear();
        LevelGraph& upper_graph = level_graphs_[j];
        for (const auto& pr: upper_graph.GetNeighbors(i))
          links.push_back(static_cast<tableint>(pr.second));
        sizeint link_size = links.size();
        memcpy(mem.data() + offset, &link_size, sizeof(sizeint));
        offset += sizeof(sizeint);
        if (link_size > 0)
          memcpy(mem.data() + offset, &links[0], sizeof(tableint) * link_size);
        offset += sizeof(tableint) * maxM;
      }
      output.write(mem.data(), size);
    }
  }

  output.close();
}

// load graph compatible with hnswlib (https://github.com/nmslib/hnswlib)
void CuHNSW::LoadIndex(std::string fpath) {
  std::ifstream input(fpath, std::ios::binary);
  DEBUG("load index from {}", fpath);

  // read meta values
  DEBUG0("read meta values");
  size_t offsetLevel0, max_elements, cur_element_count;
  size_t size_data_per_element, label_offset, offsetData;
  int maxlevel;
  tableint enterpoint_node = enter_point_;
  size_t maxM, maxM0, M;
  double mult;
  size_t ef_construction;

  readBinaryPOD(input, offsetLevel0);
  readBinaryPOD(input, max_elements);
  readBinaryPOD(input, cur_element_count);
  readBinaryPOD(input, size_data_per_element);
  readBinaryPOD(input, label_offset);
  readBinaryPOD(input, offsetData);
  readBinaryPOD(input, maxlevel);
  readBinaryPOD(input, enterpoint_node);
  readBinaryPOD(input, maxM);
  readBinaryPOD(input, maxM0);
  readBinaryPOD(input, M);
  readBinaryPOD(input, mult);
  readBinaryPOD(input, ef_construction);
  size_t size_per_link = maxM * sizeof(tableint) + sizeof(sizeint);
  num_data_ = cur_element_count;
  max_m_ = maxM;
  max_m0_ = maxM0;
  enter_point_ = enterpoint_node;
  ef_construction_ = ef_construction;
  max_level_ = maxlevel;
  level_mult_ = mult;
  num_dims_ = (label_offset - offsetData) / sizeof(scalar);
  DEBUG("meta values loaded, num_data: {}, num_dims: {}, max_m: {}, max_m0: {}, enter_point: {}, max_level: {}",
        num_data_, num_dims_, max_m_, max_m0_, enter_point_, max_level_);

  std::vector<char> data_level0_memory(max_elements * size_data_per_element);
  input.read(data_level0_memory.data(), cur_element_count * size_data_per_element);

  // reset level graphs
  level_graphs_.clear();
  level_graphs_.shrink_to_fit();
  level_graphs_.resize(max_level_ + 1);

  // load data and level0 links
  DEBUG0("load level0 links and data");
  DEBUG("level0 count: {}", cur_element_count);
  std::vector<float> data(num_data_ * num_dims_);
  size_t offset = 0;
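  // walk the level0 blob with the same fixed strides SaveIndex writes:
  // [link_count][links][vector data][label] per element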
  std::vector<tableint> links(max_m0_);
  std::vector<scalar> vec_data(num_dims_);
  LevelGraph& graph0 = level_graphs_[0];
  std::vector<std::vector<int>> nodes(max_level_ + 1);
  nodes[0].resize(cur_element_count);
  std::iota(nodes[0].begin(), nodes[0].end(), 0);
  graph0.SetNodes(nodes[0], num_data_, ef_construction_);
  labels_.clear(); labelled_ = true;
  for (int i = 0; i < cur_element_count; ++i) {
    sizeint deg;
    memcpy(&deg, data_level0_memory.data() + offset, sizeof(sizeint));
    offset += sizeof(sizeint);
    memcpy(&links[0], data_level0_memory.data() + offset, sizeof(tableint) * max_m0_);
    for (int j = 0; j < deg; ++j)
      graph0.AddEdge(i, links[j], 0);
    offset += sizeof(tableint) * max_m0_;
    memcpy(&vec_data[0], data_level0_memory.data() + offset, sizeof(scalar) * num_dims_);
    for (int j = 0; j < num_dims_; ++j)
      data[num_dims_ * i + j] = vec_data[j];
    offset += sizeof(scalar) * num_dims_;
    labeltype label;
    memcpy(&label, data_level0_memory.data() + offset, sizeof(labeltype));
    labels_.push_back(static_cast<int>(label));
    offset += sizeof(labeltype);
  }
  SetData(&data[0], num_data_, num_dims_);

  // load upper layer links
  DEBUG0("load upper layer links");
  std::vector<std::vector<std::pair<int, int>>> links_data(max_level_ + 1);
  links.resize(max_m_);
  levels_.resize(cur_element_count);
  for (int i = 0; i < cur_element_count; ++i) {
    unsigned int linksize;
    readBinaryPOD(input, linksize);
    if (not linksize) continue;
    std::vector<char> buffer(linksize);
    input.read(buffer.data(), linksize);
    size_t levels = linksize / size_per_link;
    size_t offset = 0;
    levels_[i] = levels;  // top level of element i, as in hnswlib's element_levels_
    for (int j = 1; j <= levels; ++j) {
      nodes[j].push_back(i);
      sizeint deg;
      memcpy(&deg, buffer.data() + offset, sizeof(sizeint));
      offset += sizeof(sizeint);
      memcpy(&links[0], buffer.data() + offset, sizeof(tableint) * deg);
      offset += sizeof(tableint) * max_m_;
      for (int k = 0; k < deg; ++k)
        links_data[j].emplace_back(i, links[k]);
    }
  }

  for (int i = 1; i <= max_level_; ++i) {
    LevelGraph& graph = level_graphs_[i];
    DEBUG("level {} count: {}", i, nodes[i].size());
    graph.SetNodes(nodes[i], num_data_, ef_construction_);
    for (const auto& pr: links_data[i]) {
      graph.AddEdge(pr.first, pr.second, 0);
    }
  }

  input.close();
}

} // namespace cuhnsw
--------------------------------------------------------------------------------