├── AutoTune.cpp ├── AutoTune.h ├── AuxIndexStructures.cpp ├── AuxIndexStructures.h ├── CMakeLists.txt ├── Clustering.cpp ├── Clustering.h ├── FaissAssert.h ├── FaissException.cpp ├── FaissException.h ├── Heap.cpp ├── Heap.h ├── INSTALL ├── Index.cpp ├── Index.h ├── IndexFlat.cpp ├── IndexFlat.h ├── IndexIVF.cpp ├── IndexIVF.h ├── IndexIVFPQ.cpp ├── IndexIVFPQ.h ├── IndexLSH.cpp ├── IndexLSH.h ├── IndexPQ.cpp ├── IndexPQ.h ├── IndexScalarQuantizer.cpp ├── IndexScalarQuantizer.h ├── LICENSE ├── Makefile ├── MetaIndexes.cpp ├── MetaIndexes.h ├── PolysemousTraining.cpp ├── PolysemousTraining.h ├── ProductQuantizer.cpp ├── ProductQuantizer.h ├── README.md ├── VectorTransform.cpp ├── VectorTransform.h ├── cmake ├── Cuda.cmake ├── Cuda.cmake.bak └── Modules │ ├── FindMKL.cmake │ ├── FindOpenBLAS.cmake │ └── FindOpenBLAS.cmake.bak ├── example_makefiles ├── makefile.inc.Linux ├── makefile.inc.Mac.brew └── makefile.inc.Mac.port ├── faiss.cbp ├── faiss.cscope_file_list ├── faiss.h ├── faiss.layout ├── filehelper.cpp ├── filehelper.h ├── gpu ├── CMakeLists.txt ├── GpuAutoTune.cpp ├── GpuAutoTune.h ├── GpuClonerOptions.cpp ├── GpuClonerOptions.h ├── GpuIndex.cu ├── GpuIndex.h ├── GpuIndexFlat.cu ├── GpuIndexFlat.h ├── GpuIndexIVF.cu ├── GpuIndexIVF.h ├── GpuIndexIVFFlat.cu ├── GpuIndexIVFFlat.h ├── GpuIndexIVFPQ.cu ├── GpuIndexIVFPQ.h ├── GpuIndicesOptions.h ├── GpuResources.cpp ├── GpuResources.h ├── IndexProxy.cpp ├── IndexProxy.h ├── Makefile ├── StandardGpuResources.cpp ├── StandardGpuResources.h ├── impl │ ├── BroadcastSum.cu │ ├── BroadcastSum.cuh │ ├── Distance.cu │ ├── Distance.cuh │ ├── FlatIndex.cu │ ├── FlatIndex.cuh │ ├── IVFBase.cu │ ├── IVFBase.cuh │ ├── IVFFlat.cu │ ├── IVFFlat.cuh │ ├── IVFFlatScan.cu │ ├── IVFFlatScan.cuh │ ├── IVFPQ.cu │ ├── IVFPQ.cuh │ ├── IVFUtils.cu │ ├── IVFUtils.cuh │ ├── IVFUtilsSelect1.cu │ ├── IVFUtilsSelect2.cu │ ├── InvertedListAppend.cu │ ├── InvertedListAppend.cuh │ ├── L2Norm.cu │ ├── L2Norm.cuh │ ├── L2Select.cu │ ├── 
L2Select.cuh │ ├── PQCodeDistances.cu │ ├── PQCodeDistances.cuh │ ├── PQCodeLoad.cuh │ ├── PQScanMultiPassNoPrecomputed.cu │ ├── PQScanMultiPassNoPrecomputed.cuh │ ├── PQScanMultiPassPrecomputed.cu │ ├── PQScanMultiPassPrecomputed.cuh │ ├── RemapIndices.cpp │ ├── RemapIndices.h │ ├── VectorResidual.cu │ └── VectorResidual.cuh ├── perf │ ├── CompareFlat.cu │ ├── CompareIVFFlat.cu │ ├── CompareIVFPQ.cu │ ├── CompareIVFPQGrid.cu │ ├── IndexWrapper-inl.h │ ├── IndexWrapper.h │ ├── PerfClustering.cpp │ ├── PerfIVFPQAdd.cpp │ ├── PerfSelect.cu │ └── WriteIndex.cpp ├── test │ ├── CMakeLists.txt │ ├── CMakeLists.txt.bak │ ├── TestGpuIndexFlat.cpp │ ├── TestGpuIndexIVFFlat.cpp │ ├── TestGpuIndexIVFPQ.cpp │ ├── TestGpuSelect.cu │ ├── TestUtils.cpp │ ├── TestUtils.h │ ├── deep1b16_createdb.cpp │ ├── deep1b16_query.cpp │ ├── deep1b_createdb.cpp │ ├── deep1b_createdb_hnsw.cpp │ ├── deep1b_creategt.cpp │ ├── deep1b_query.cpp │ ├── deep1b_query.cpp.bak │ ├── deep1b_query1.cpp │ ├── deep1b_query2.cpp │ ├── deep1b_queryd.cpp │ ├── demo_ivfpq_indexing_gpu.cpp │ ├── demo_ivfpq_line_indexing_gpu.cpp │ ├── sift1b16_createdb.cpp │ ├── sift1b16_query - 副本.cpp │ ├── sift1b16_query.cpp │ ├── sift1b_createdb.cpp │ ├── sift1b_createdb_hnsw.cpp │ ├── sift1b_creategt.cpp │ ├── sift1b_query.cpp │ ├── sift1b_query1.cpp │ ├── sift1b_query2.cpp │ ├── sift1b_queryd.cpp │ ├── test_gpu_index.py │ ├── tool_createdb.cpp │ ├── tool_query.cpp │ ├── tool_query1.cpp │ ├── transform_deep1b.cpp │ └── transform_sift1b.cpp └── utils │ ├── BlockSelectFloat.cu │ ├── BlockSelectHalf.cu │ ├── BlockSelectKernel.cuh │ ├── Comparators.cuh │ ├── ConversionOperators.cuh │ ├── CopyUtils.cuh │ ├── DeviceDefs.cuh │ ├── DeviceMemory.cpp │ ├── DeviceMemory.h │ ├── DeviceTensor-inl.cuh │ ├── DeviceTensor.cuh │ ├── DeviceUtils.cpp │ ├── DeviceUtils.h │ ├── DeviceVector.cuh │ ├── Float16.cu │ ├── Float16.cuh │ ├── HostTensor-inl.cuh │ ├── HostTensor.cuh │ ├── Limits.cuh │ ├── LoadStoreOperators.cuh │ ├── MathOperators.cuh │ 
├── MatrixMult.cu │ ├── MatrixMult.cuh │ ├── MemorySpace.cpp │ ├── MemorySpace.h │ ├── MergeNetworkBlock.cuh │ ├── MergeNetworkUtils.cuh │ ├── MergeNetworkWarp.cuh │ ├── NoTypeTensor.cuh │ ├── Pair.cuh │ ├── PtxUtils.cuh │ ├── ReductionOperators.cuh │ ├── Reductions.cuh │ ├── Select.cuh │ ├── StackDeviceMemory.cpp │ ├── StackDeviceMemory.h │ ├── StaticUtils.h │ ├── Tensor-inl.cuh │ ├── Tensor.cuh │ ├── ThrustAllocator.cuh │ ├── Timer.cpp │ ├── Timer.h │ ├── Transpose.cuh │ ├── WarpSelectFloat.cu │ ├── WarpSelectHalf.cu │ ├── WarpSelectKernel.cuh │ ├── WarpShuffles.cuh │ ├── WorkerThread.cpp │ ├── WorkerThread.h │ ├── bitonicSort.cuh │ ├── blockselect │ ├── BlockSelectFloat1.cu │ ├── BlockSelectFloat128.cu │ ├── BlockSelectFloat256.cu │ ├── BlockSelectFloat32.cu │ ├── BlockSelectFloat64.cu │ ├── BlockSelectFloatF1024.cu │ ├── BlockSelectFloatF512.cu │ ├── BlockSelectFloatT1024.cu │ ├── BlockSelectFloatT512.cu │ ├── BlockSelectHalf1.cu │ ├── BlockSelectHalf128.cu │ ├── BlockSelectHalf256.cu │ ├── BlockSelectHalf32.cu │ ├── BlockSelectHalf64.cu │ ├── BlockSelectHalfF1024.cu │ ├── BlockSelectHalfF512.cu │ ├── BlockSelectHalfT1024.cu │ ├── BlockSelectHalfT512.cu │ └── BlockSelectImpl.cuh │ ├── helper.cu │ ├── helper.cuh │ ├── nvidia │ ├── fp16_emu.cu │ └── fp16_emu.cuh │ ├── triangle.cuh │ └── warpselect │ ├── WarpSelectFloat1.cu │ ├── WarpSelectFloat128.cu │ ├── WarpSelectFloat256.cu │ ├── WarpSelectFloat32.cu │ ├── WarpSelectFloat64.cu │ ├── WarpSelectFloatF1024.cu │ ├── WarpSelectFloatF512.cu │ ├── WarpSelectFloatT1024.cu │ ├── WarpSelectFloatT512.cu │ ├── WarpSelectHalf1.cu │ ├── WarpSelectHalf128.cu │ ├── WarpSelectHalf256.cu │ ├── WarpSelectHalf32.cu │ ├── WarpSelectHalf64.cu │ ├── WarpSelectHalfF1024.cu │ ├── WarpSelectHalfF512.cu │ ├── WarpSelectHalfT1024.cu │ ├── WarpSelectHalfT512.cu │ └── WarpSelectImpl.cuh ├── hamming.cpp ├── hamming.h ├── index_io.cpp ├── index_io.h ├── makefile.inc ├── tests ├── CMakeLists.txt ├── deep1b16_imi_pq.cpp ├── deep1b_imi_pq.cpp 
├── deep1b_imi_pq1.cpp ├── deep1b_imi_pq2.cpp ├── demo_imi_flat.cpp ├── demo_imi_pq.cpp ├── demo_ivfpq_indexing.cpp ├── demo_sift1M.cpp ├── sift1b16_imi_pq.cpp ├── sift1b_imi_pq.cpp ├── sift1b_imi_pq1.cpp ├── sift1b_imi_pq2.cpp ├── test_blas ├── test_blas.cpp ├── test_ivfpq_codec.cpp └── test_ivfpq_indexing.cpp ├── utils.cpp └── utils.h /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.7) 2 | 3 | # faiss project 4 | project(faiss C CXX) 5 | 6 | option(BUILD_TUTORIAL "Build tutorials" ON) 7 | option(BUILD_TEST "Build tests" ON) 8 | option(BUILD_WITH_GPU "Build faiss with gpu (cuda) support" ON) 9 | option(WITH_MKL "Build with MKL if ON (OpenBLAS if OFF)" OFF) 10 | 11 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) 12 | 13 | # OpenMP 14 | find_package(OpenMP REQUIRED) 15 | 16 | set(MPICH_INCLUDE_PATH "/usr/local/mpich/include") 17 | set(MPICH_CXX_LIBRARIES "/usr/local/mpich/lib/libmpi.so") 18 | 19 | include_directories(${MPICH_INCLUDE_PATH}) 20 | 21 | # BLAS (MKL os OpenBLAS) 22 | if(WITH_MKL) 23 | find_package(MKL REQUIRED) 24 | include_directories(${MKL_INCLUDE_DIRS}) 25 | set(BLAS_LIB ${MKL_LIBRARIES}) 26 | else() 27 | find_package(OpenBLAS REQUIRED) 28 | include_directories(${OpenBLAS_INCLUDE_DIR}) 29 | set(BLAS_LIB ${OpenBLAS_LIB}) 30 | endif() 31 | 32 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -m64 -Wall -g -O3 -msse4 -mpopcnt -fopenmp -Wno-sign-compare") 33 | add_definitions(-DFINTEGER=int) 34 | 35 | # specify output bin_path and lib_path 36 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 37 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 38 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 39 | 40 | # specify header and cpp files 41 | file(GLOB faiss_cpu_headers ${CMAKE_CURRENT_SOURCE_DIR}/*.h) 42 | file(GLOB faiss_cpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) 43 | 44 | set(faiss_lib faiss) 45 | 
add_library(${faiss_lib} STATIC ${faiss_cpu_headers} ${faiss_cpu_cpp}) 46 | target_link_libraries(${faiss_lib} ${OpenMP_CXX_FLAGS} ${BLAS_LIB}) 47 | 48 | # build gpu lib 49 | if(BUILD_WITH_GPU) 50 | include(cmake/Cuda.cmake) 51 | add_subdirectory(gpu) 52 | endif(BUILD_WITH_GPU) 53 | 54 | # build tutorial examples 55 | if(BUILD_TUTORIAL) 56 | add_subdirectory(tutorial) 57 | endif(BUILD_TUTORIAL) 58 | 59 | # build tests 60 | if(BUILD_TEST) 61 | add_subdirectory(tests) 62 | endif(BUILD_TEST) 63 | -------------------------------------------------------------------------------- /Clustering.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved 10 | // -*- c++ -*- 11 | 12 | #ifndef FAISS_CLUSTERING_H 13 | #define FAISS_CLUSTERING_H 14 | #include "Index.h" 15 | 16 | #include 17 | 18 | namespace faiss { 19 | 20 | 21 | /** Class for the clustering parameters. Can be passed to the 22 | * constructor of the Clustering object. 23 | */ 24 | struct ClusteringParameters { 25 | int niter; ///< clustering iterations 26 | int nredo; ///< redo clustering this many times and keep best 27 | 28 | bool verbose; 29 | bool spherical; ///< do we want normalized centroids? 30 | bool update_index; ///< update index after each iteration? 
31 | 32 | int min_points_per_centroid; ///< otherwise you get a warning 33 | int max_points_per_centroid; ///< to limit size of dataset 34 | 35 | int seed; ///< seed for the random number generator 36 | 37 | /// sets reasonable defaults 38 | ClusteringParameters (); 39 | }; 40 | 41 | 42 | /** clustering based on assignment - centroid update iterations 43 | * 44 | * The clustering is based on an Index object that assigns training 45 | * points to the centroids. Therefore, at each iteration the centroids 46 | * are added to the index. 47 | * 48 | * On output, the centoids table is set to the latest version 49 | * of the centroids and they are also added to the index. If the 50 | * centroids table it is not empty on input, it is also used for 51 | * initialization. 52 | * 53 | * To do several clusterings, just call train() several times on 54 | * different training sets, clearing the centroid table in between. 55 | */ 56 | struct Clustering: ClusteringParameters { 57 | typedef Index::idx_t idx_t; 58 | size_t d; ///< dimension of the vectors 59 | size_t k; ///< nb of centroids 60 | 61 | /// centroids (k * d) 62 | std::vector centroids; 63 | 64 | /// objective values (sum of distances reported by index) over 65 | /// iterations 66 | std::vector obj; 67 | 68 | /// the only mandatory parameters are k and d 69 | Clustering (int d, int k); 70 | Clustering (int d, int k, const ClusteringParameters &cp); 71 | 72 | /// Index is used during the assignment stage 73 | virtual void train (idx_t n, const float * x, faiss::Index & index); 74 | 75 | virtual ~Clustering() {} 76 | }; 77 | 78 | 79 | /** simplified interface 80 | * 81 | * @param d dimension of the data 82 | * @param n nb of training vectors 83 | * @param k nb of output centroids 84 | * @param x training set (size n * d) 85 | * @param centroids output centroids (size k * d) 86 | * @return final quantization error 87 | */ 88 | float kmeans_clustering (size_t d, size_t n, size_t k, 89 | const float *x, 90 | float 
*centroids); 91 | 92 | 93 | 94 | } 95 | 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /FaissException.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #include "FaissException.h" 12 | 13 | namespace faiss { 14 | 15 | FaissException::FaissException(const std::string& m) 16 | : msg(m) { 17 | } 18 | 19 | FaissException::FaissException(const std::string& m, 20 | const char* funcName, 21 | const char* file, 22 | int line) { 23 | int size = snprintf(nullptr, 0, "Error in %s at %s:%d: %s", 24 | funcName, file, line, m.c_str()); 25 | msg.resize(size + 1); 26 | snprintf(&msg[0], msg.size(), "Error in %s at %s:%d: %s", 27 | funcName, file, line, m.c_str()); 28 | } 29 | 30 | const char* 31 | FaissException::what() const noexcept { 32 | return msg.c_str(); 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /FaissException.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #ifndef FAISS_EXCEPTION_INCLUDED 12 | #define FAISS_EXCEPTION_INCLUDED 13 | 14 | #include 15 | #include 16 | 17 | namespace faiss { 18 | 19 | /// Base class for Faiss exceptions 20 | class FaissException : public std::exception { 21 | public: 22 | explicit FaissException(const std::string& msg); 23 | 24 | FaissException(const std::string& msg, 25 | const char* funcName, 26 | const char* file, 27 | int line); 28 | 29 | /// from std::exception 30 | const char* what() const noexcept override; 31 | 32 | std::string msg; 33 | }; 34 | 35 | 36 | /** bare-bones unique_ptr 37 | * this one deletes with delete [] */ 38 | template 39 | struct ScopeDeleter { 40 | const T * ptr; 41 | explicit ScopeDeleter (const T* ptr = nullptr): ptr (ptr) {} 42 | void release () {ptr = nullptr; } 43 | void set (const T * ptr_in) { ptr = ptr_in; } 44 | void swap (ScopeDeleter &other) {std::swap (ptr, other.ptr); } 45 | ~ScopeDeleter () { 46 | delete [] ptr; 47 | } 48 | }; 49 | 50 | /** same but deletes with the simple delete (least common case) */ 51 | template 52 | struct ScopeDeleter1 { 53 | const T * ptr; 54 | explicit ScopeDeleter1 (const T* ptr = nullptr): ptr (ptr) {} 55 | void release () {ptr = nullptr; } 56 | void set (const T * ptr_in) { ptr = ptr_in; } 57 | void swap (ScopeDeleter1 &other) {std::swap (ptr, other.ptr); } 58 | ~ScopeDeleter1 () { 59 | delete ptr; 60 | } 61 | }; 62 | 63 | 64 | 65 | } 66 | 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /Heap.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | /* Copyright 2004-present Facebook. All Rights Reserved. 
*/ 10 | /* Function for soft heap */ 11 | 12 | #include "Heap.h" 13 | 14 | 15 | namespace faiss { 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | template 26 | void HeapArray::heapify () 27 | { 28 | #pragma omp parallel for 29 | for (size_t j = 0; j < nh; j++) 30 | heap_heapify (k, val + j * k, ids + j * k); 31 | } 32 | 33 | template 34 | void HeapArray::reorder () 35 | { 36 | #pragma omp parallel for 37 | for (size_t j = 0; j < nh; j++) 38 | heap_reorder (k, val + j * k, ids + j * k); 39 | } 40 | 41 | template 42 | void HeapArray::addn (size_t nj, const T *vin, TI j0, 43 | size_t i0, long ni) 44 | { 45 | if (ni == -1) ni = nh; 46 | assert (i0 >= 0 && i0 + ni <= nh); 47 | #pragma omp parallel for 48 | for (size_t i = i0; i < i0 + ni; i++) { 49 | T * __restrict simi = get_val(i); 50 | TI * __restrict idxi = get_ids (i); 51 | const T *ip_line = vin + (i - i0) * nj; 52 | 53 | for (size_t j = 0; j < nj; j++) { 54 | T ip = ip_line [j]; 55 | if (C::cmp(simi[0], ip)) { 56 | heap_pop (k, simi, idxi); 57 | heap_push (k, simi, idxi, ip, j + j0); 58 | } 59 | } 60 | } 61 | } 62 | 63 | template 64 | void HeapArray::addn_with_ids ( 65 | size_t nj, const T *vin, const TI *id_in, 66 | long id_stride, size_t i0, long ni) 67 | { 68 | if (id_in == nullptr) { 69 | addn (nj, vin, 0, i0, ni); 70 | return; 71 | } 72 | if (ni == -1) ni = nh; 73 | assert (i0 >= 0 && i0 + ni <= nh); 74 | #pragma omp parallel for 75 | for (size_t i = i0; i < i0 + ni; i++) { 76 | T * __restrict simi = get_val(i); 77 | TI * __restrict idxi = get_ids (i); 78 | const T *ip_line = vin + (i - i0) * nj; 79 | const TI *id_line = id_in + (i - i0) * id_stride; 80 | 81 | for (size_t j = 0; j < nj; j++) { 82 | T ip = ip_line [j]; 83 | if (C::cmp(simi[0], ip)) { 84 | heap_pop (k, simi, idxi); 85 | heap_push (k, simi, idxi, ip, id_line [j]); 86 | } 87 | } 88 | } 89 | } 90 | 91 | template 92 | void HeapArray::per_line_extrema ( 93 | T * out_val, 94 | TI * out_ids) const 95 | { 96 | #pragma omp parallel for 97 | for 
(size_t j = 0; j < nh; j++) { 98 | long imin = -1; 99 | typename C::T xval = C::Crev::neutral (); 100 | const typename C::T * x_ = val + j * k; 101 | for (size_t i = 0; i < k; i++) 102 | if (C::cmp (x_[i], xval)) { 103 | xval = x_[i]; 104 | imin = i; 105 | } 106 | if (out_val) 107 | out_val[j] = xval; 108 | 109 | if (out_ids) { 110 | if (ids && imin != -1) 111 | out_ids[j] = ids [j * k + imin]; 112 | else 113 | out_ids[j] = imin; 114 | } 115 | } 116 | } 117 | 118 | 119 | 120 | 121 | // explicit instanciations 122 | 123 | template class HeapArray >; 124 | template class HeapArray >; 125 | template class HeapArray >; 126 | template class HeapArray >; 127 | 128 | 129 | 130 | 131 | 132 | } // END namespace fasis 133 | -------------------------------------------------------------------------------- /Index.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. 
All Rights Reserved 10 | 11 | #include "IndexFlat.h" 12 | #include "FaissAssert.h" 13 | #include 14 | namespace faiss { 15 | 16 | 17 | void Index::range_search (idx_t , const float *, float, 18 | RangeSearchResult *) const 19 | { 20 | FAISS_THROW_MSG ("range search not implemented"); 21 | } 22 | 23 | void Index::assign (idx_t n, const float * x, idx_t * labels, idx_t k) 24 | { 25 | printf("Index::assign,k: %d\n",k); 26 | float * distances = new float[n * k]; 27 | ScopeDeleter del(distances); 28 | search (n, x, k, distances, labels); 29 | } 30 | 31 | 32 | void Index::add_with_ids (idx_t n, const float * x, const long *xids) 33 | { 34 | FAISS_THROW_MSG ("add_with_ids not implemented for this type of index"); 35 | } 36 | 37 | 38 | long Index::remove_ids (const IDSelector & sel) 39 | { 40 | FAISS_THROW_MSG ("remove_ids not implemented for this type of index"); 41 | return -1; 42 | } 43 | 44 | 45 | void Index::reconstruct (idx_t, float * ) const { 46 | FAISS_THROW_MSG ("Can not compute reconstruct without " 47 | "knowing how to do so"); 48 | } 49 | 50 | 51 | void Index::reconstruct_n (idx_t i0, idx_t ni, float *recons) const { 52 | for (idx_t i = 0; i < ni; i++) { 53 | reconstruct (i0 + i, recons + i * d); 54 | } 55 | } 56 | 57 | void Index::search_and_reconstruct (idx_t n, const float *x, idx_t k, 58 | float *distances, idx_t *labels, 59 | float *recons) const { 60 | search (n, x, k, distances, labels); 61 | for (idx_t i = 0; i < n; ++i) { 62 | for (idx_t j = 0; j < k; ++j) { 63 | idx_t ij = i * k + j; 64 | idx_t key = labels[ij]; 65 | float* reconstructed = recons + ij * d; 66 | if (key < 0) { 67 | // Fill with NaNs 68 | memset(reconstructed, -1, sizeof(*reconstructed) * d); 69 | } else { 70 | reconstruct (key, reconstructed); 71 | } 72 | } 73 | } 74 | } 75 | 76 | void Index::compute_residual (const float * x, 77 | float * residual, idx_t key) const { 78 | reconstruct (key, residual); 79 | for (size_t i = 0; i < d; i++) 80 | residual[i] = x[i] - residual[i]; 81 | } 82 
| 83 | 84 | void Index::display () const { 85 | printf ("Index: %s -> %ld elements\n", typeid (*this).name(), ntotal); 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /IndexIVFPQ.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/IndexIVFPQ.cpp -------------------------------------------------------------------------------- /IndexLSH.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | // -*- c++ -*- 11 | 12 | #ifndef INDEX_LSH_H 13 | #define INDEX_LSH_H 14 | 15 | #include 16 | 17 | #include "Index.h" 18 | #include "VectorTransform.h" 19 | 20 | namespace faiss { 21 | 22 | 23 | /** The sign of each vector component is put in a binary signature */ 24 | struct IndexLSH:Index { 25 | typedef unsigned char uint8_t; 26 | 27 | int nbits; ///< nb of bits per vector 28 | int bytes_per_vec; ///< nb of 8-bits per encoded vector 29 | bool rotate_data; ///< whether to apply a random rotation to input 30 | bool train_thresholds; ///< whether we train thresholds or use 0 31 | 32 | RandomRotationMatrix rrot; ///< optional random rotation 33 | 34 | std::vector thresholds; ///< thresholds to compare with 35 | 36 | /// encoded dataset 37 | std::vector codes; 38 | 39 | IndexLSH ( 40 | idx_t d, int nbits, 41 | bool rotate_data = true, 42 | bool train_thresholds = false); 43 | 44 | /** Preprocesses and resizes the input to the size required to 45 | * binarize the data 46 | * 47 | * @param x input vectors, size n * d 48 | * @return output vectors, size n * 
bits. May be the same pointer 49 | * as x, otherwise it should be deleted by the caller 50 | */ 51 | const float *apply_preprocess (idx_t n, const float *x) const; 52 | 53 | void train(idx_t n, const float* x) override; 54 | 55 | void add(idx_t n, const float* x) override; 56 | 57 | void search( 58 | idx_t n, 59 | const float* x, 60 | idx_t k, 61 | float* distances, 62 | idx_t* labels) const override; 63 | 64 | void reset() override; 65 | 66 | /// transfer the thresholds to a pre-processing stage (and unset 67 | /// train_thresholds) 68 | void transfer_thresholds (LinearTransform * vt); 69 | 70 | ~IndexLSH() override {} 71 | 72 | IndexLSH (); 73 | }; 74 | 75 | 76 | 77 | } 78 | 79 | 80 | 81 | 82 | 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | "# vector-line-quantization" 2 | -------------------------------------------------------------------------------- /cmake/Cuda.cmake: -------------------------------------------------------------------------------- 1 | # configure cuda 2 | 3 | find_package(CUDA QUIET REQUIRED) 4 | if(CUDA_FOUND) 5 | include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) 6 | 7 | list(APPEND CUDA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) 8 | else(CUDA_FOUND) 9 | message(STATUS "Could not locate cuda, disabling cuda support.") 10 | set(BUILD_WITH_GPU OFF) 11 | return() 12 | endif(CUDA_FOUND) 13 | if(NOT DEFINED CUDA_TOOLKIT_SAMPLE_DIR) 14 | set(CUDA_TOOLKIT_SAMPLE_DIR ${CUDA_TOOLKIT_ROOT_DIR}/samples) 15 | endif() 16 | set(FAISS_CUDA_ADDITIONAL_INC_PATH ${CUDA_INCLUDE_DIRS} ${CUDA_TOOLKIT_SAMPLE_DIR}/common/inc/) 17 | include_directories(${FAISS_CUDA_ADDITIONAL_INC_PATH}) 18 | # set cuda flags 19 | if (CMAKE_BUILD_TYPE STREQUAL "Debug") 20 | list(APPEND CUDA_NVCC_FLAGS 
"-arch=sm_52;-D_FORCE_INLINES;-D_MWAITXINTRIN_H_INCLUDED;-D__STRICT_ANSI__;-std=c++11;-DVERBOSE;-g;-lineinfo;-Xcompiler;-ggdb") 21 | else() 22 | list(APPEND CUDA_NVCC_FLAGS "-arch=sm_52;-D_FORCE_INLINES;-D_MWAITXINTRIN_H_INCLUDED;-D__STRICT_ANSI__;-std=c++11;-lm;-DVERBOSE;-O3;-DNDEBUG;-Xcompiler;-DNDEBU") 23 | endif() 24 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 25 | -------------------------------------------------------------------------------- /cmake/Cuda.cmake.bak: -------------------------------------------------------------------------------- 1 | # configure cuda 2 | 3 | find_package(CUDA QUIET REQUIRED) 4 | if(CUDA_FOUND) 5 | include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) 6 | 7 | list(APPEND CUDA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) 8 | else(CUDA_FOUND) 9 | message(STATUS "Could not locate cuda, disabling cuda support.") 10 | set(BUILD_WITH_GPU OFF) 11 | return() 12 | endif(CUDA_FOUND) 13 | if(NOT DEFINED CUDA_TOOLKIT_SAMPLE_DIR) 14 | set(CUDA_TOOLKIT_SAMPLE_DIR ${CUDA_TOOLKIT_ROOT_DIR}/samples) 15 | endif() 16 | set(PQT_CUDA_ADDITIONAL_INC_PATH ${CUDA_INCLUDE_DIRS} ${CUDA_TOOLKIT_SAMPLE_DIR}/common/inc/) 17 | include_directories(${PQT_CUDA_ADDITIONAL_INC_PATH}) 18 | # set cuda flags 19 | if (CMAKE_BUILD_TYPE STREQUAL "Debug") 20 | list(APPEND CUDA_NVCC_FLAGS "-arch=sm_52;-D_FORCE_INLINES;-D_MWAITXINTRIN_H_INCLUDED;-D__STRICT_ANSI__;-std=c++11;-DVERBOSE;-g;-lineinfo;-Xcompiler;-ggdb") 21 | else() 22 | list(APPEND CUDA_NVCC_FLAGS "-arch=sm_52;-D_FORCE_INLINES;-D_MWAITXINTRIN_H_INCLUDED;-D__STRICT_ANSI__;-std=c++11;-lm;-DVERBOSE;-O3;-DNDEBUG;-Xcompiler;-DNDEBU") 23 | endif() 24 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 25 | -------------------------------------------------------------------------------- /cmake/Modules/FindMKL.cmake: -------------------------------------------------------------------------------- 1 | # defines: 2 | # MKL_INCLUDE_DIRS 3 | # MKL_LIBRARIES 4 | # MKL_COMPILER_LIBRARIES - a list of compiler 
libraries (file names) required for MKL 5 | 6 | #unset(MKL_LIB_DIR CACHE) 7 | #unset(MKL_COMPILER_LIB_DIR CACHE) 8 | #unset(MKL_COMPILER_REDIST_PATH CACHE) 9 | 10 | if(NOT HAVE_MKL) 11 | find_path(MKL_INCLUDE_DIRS "mkl.h" PATHS ${MKL_INCLUDE_DIR} DOC "The path to MKL headers") 12 | 13 | if(MKL_INCLUDE_DIRS) 14 | 15 | get_filename_component(_MKL_LIB_PATH "${MKL_INCLUDE_DIRS}/../lib" ABSOLUTE) 16 | 17 | if(APPLE) 18 | # MKL 2017 for mac has only 64 bit libraries without directory prefix 19 | set(_MKL_COMPILER_LIB_PATH ${MKL_INCLUDE_DIRS}/../../compiler/lib) 20 | else() 21 | if(CMAKE_SIZEOF_VOID_P EQUAL 8) 22 | set(_MKL_LIB_PATH "${_MKL_LIB_PATH}/intel64") 23 | set(_MKL_COMPILER_LIB_PATH ${MKL_INCLUDE_DIRS}/../../compiler/lib/intel64) 24 | if(WIN32) 25 | set(_MKL_COMPILER_REDIST_PATH ${MKL_INCLUDE_DIRS}/../../redist/intel64/compiler) 26 | endif() 27 | else() 28 | set(_MKL_LIB_PATH "${_MKL_LIB_PATH}/ia32") 29 | set(_MKL_COMPILER_LIB_PATH ${MKL_INCLUDE_DIRS}/../../compiler/lib/ia32) 30 | if(WIN32) 31 | set(_MKL_COMPILER_REDIST_PATH ${MKL_INCLUDE_DIRS}/../../redist/ia32/compiler) 32 | endif() 33 | endif() 34 | endif() 35 | 36 | # On Linux and Apple take libraries for redistribution from the same location that is used for linking 37 | if(UNIX) 38 | set(_MKL_COMPILER_REDIST_PATH ${_MKL_COMPILER_LIB_PATH}) 39 | endif() 40 | 41 | if(WIN32) 42 | set(MKL_COMPILER_LIBRARIES libiomp5md.dll) 43 | set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_lp64 mkl_core mkl_intel_thread libiomp5md) 44 | elseif(APPLE) 45 | set(MKL_COMPILER_LIBRARIES libiomp5.dylib) 46 | # generated by https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor 47 | # with the following options: OSX; Clang; Intel64; static; 32 bit integer; OpenMP; Intel OpenMP 48 | set(MKL_LIBRARIES ${MKL_LIBRARIES} libmkl_intel_lp64.a libmkl_intel_thread.a libmkl_core.a iomp5 pthread m dl) 49 | else() 50 | set(MKL_COMPILER_LIBRARIES libiomp5.so) 51 | # a --start-group / --end-group pair is required when linking with 
static MKL on GNU. 52 | # see https://software.intel.com/en-us/forums/topic/280974#comment-1478780 53 | # and https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor 54 | set(MKL_LIBRARIES ${MKL_LIBRARIES} 55 | "-Wl,--start-group" 56 | libmkl_intel_lp64.a libmkl_core.a libmkl_intel_thread.a 57 | "-Wl,--end-group" 58 | "-Wl,--exclude-libs,libmkl_intel_lp64.a,--exclude-libs,libmkl_core.a,--exclude-libs,libmkl_intel_thread.a,--exclude-libs,iomp5" 59 | iomp5 dl pthread m) 60 | endif() 61 | 62 | set(MKL_LIB_DIR "${_MKL_LIB_PATH}" 63 | CACHE PATH "Full path of MKL library directory") 64 | set(MKL_COMPILER_LIB_DIR "${_MKL_COMPILER_LIB_PATH}" 65 | CACHE PATH "Full path of MKL compiler library directory") 66 | set(MKL_COMPILER_REDIST_PATH "${_MKL_COMPILER_REDIST_PATH}" 67 | CACHE PATH "Full path of MKL compiler redistributable library directory") 68 | 69 | link_directories(${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR}) 70 | 71 | set(HAVE_MKL 1) 72 | 73 | endif(MKL_INCLUDE_DIRS) 74 | endif(NOT HAVE_MKL) 75 | -------------------------------------------------------------------------------- /cmake/Modules/FindOpenBLAS.cmake: -------------------------------------------------------------------------------- 1 | 2 | 3 | SET(Open_BLAS_INCLUDE_SEARCH_PATHS 4 | /usr/include 5 | /usr/include/openblas 6 | /usr/include/openblas-base 7 | /usr/local/include 8 | /usr/local/include/openblas 9 | /usr/local/include/openblas-base 10 | /opt/OpenBLAS/include 11 | /opt/local/include 12 | $ENV{OpenBLAS_HOME} 13 | $ENV{OpenBLAS_HOME}/include 14 | ) 15 | 16 | SET(Open_BLAS_LIB_SEARCH_PATHS 17 | /lib/ 18 | /lib/openblas-base 19 | /lib64/ 20 | /usr/lib 21 | /usr/lib/openblas-base 22 | /usr/lib64 23 | /usr/local/lib 24 | /usr/local/lib64 25 | /opt/OpenBLAS/lib 26 | /opt/local/lib 27 | $ENV{OpenBLAS}cd 28 | $ENV{OpenBLAS}/lib 29 | $ENV{OpenBLAS_HOME} 30 | $ENV{OpenBLAS_HOME}/lib 31 | ) 32 | 33 | FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES openblas_config.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS}) 34 | 
FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS}) 35 | 36 | SET(OpenBLAS_FOUND ON) 37 | 38 | # Check include files 39 | IF(NOT OpenBLAS_INCLUDE_DIR) 40 | SET(OpenBLAS_FOUND OFF) 41 | MESSAGE(STATUS "Could not find OpenBLAS include. Turning OpenBLAS_FOUND off") 42 | ENDIF() 43 | 44 | # Check libraries 45 | IF(NOT OpenBLAS_LIB) 46 | SET(OpenBLAS_FOUND OFF) 47 | MESSAGE(STATUS "Could not find OpenBLAS lib. Turning OpenBLAS_FOUND off") 48 | ENDIF() 49 | 50 | IF (OpenBLAS_FOUND) 51 | IF (NOT OpenBLAS_FIND_QUIETLY) 52 | MESSAGE(STATUS "Found OpenBLAS libraries: ${OpenBLAS_LIB}") 53 | MESSAGE(STATUS "Found OpenBLAS include: ${OpenBLAS_INCLUDE_DIR}") 54 | ENDIF (NOT OpenBLAS_FIND_QUIETLY) 55 | ELSE (OpenBLAS_FOUND) 56 | IF (OpenBLAS_FIND_REQUIRED) 57 | MESSAGE(FATAL_ERROR "Could not find OpenBLAS") 58 | ENDIF (OpenBLAS_FIND_REQUIRED) 59 | ENDIF (OpenBLAS_FOUND) 60 | 61 | MARK_AS_ADVANCED( 62 | OpenBLAS_INCLUDE_DIR 63 | OpenBLAS_LIB 64 | OpenBLAS 65 | ) 66 | 67 | -------------------------------------------------------------------------------- /cmake/Modules/FindOpenBLAS.cmake.bak: -------------------------------------------------------------------------------- 1 | 2 | 3 | SET(Open_BLAS_INCLUDE_SEARCH_PATHS 4 | /usr/include 5 | /usr/include/openblas 6 | /usr/include/openblas-base 7 | /usr/local/include 8 | /usr/local/include/openblas 9 | /usr/local/include/openblas-base 10 | /opt/OpenBLAS/include 11 | /opt/local/include 12 | /home/dl/OpenBLAS 13 | $ENV{OpenBLAS_HOME} 14 | $ENV{OpenBLAS_HOME}/include 15 | ) 16 | 17 | SET(Open_BLAS_LIB_SEARCH_PATHS 18 | /lib/ 19 | /lib/openblas-base 20 | /lib64/ 21 | /usr/lib 22 | /usr/lib/openblas-base 23 | /usr/lib64 24 | /usr/local/lib 25 | /usr/local/lib64 26 | /opt/OpenBLAS/lib 27 | /opt/local/lib 28 | /home/dl/OpenBLAS 29 | $ENV{OpenBLAS}cd 30 | $ENV{OpenBLAS}/lib 31 | $ENV{OpenBLAS_HOME} 32 | $ENV{OpenBLAS_HOME}/lib 33 | ) 34 | 35 | FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES openblas_config.h PATHS 
${Open_BLAS_INCLUDE_SEARCH_PATHS}) 36 | FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS}) 37 | 38 | SET(OpenBLAS_FOUND ON) 39 | 40 | # Check include files 41 | IF(NOT OpenBLAS_INCLUDE_DIR) 42 | SET(OpenBLAS_FOUND OFF) 43 | MESSAGE(STATUS "Could not find OpenBLAS include. Turning OpenBLAS_FOUND off") 44 | ENDIF() 45 | 46 | # Check libraries 47 | IF(NOT OpenBLAS_LIB) 48 | SET(OpenBLAS_FOUND OFF) 49 | MESSAGE(STATUS "Could not find OpenBLAS lib. Turning OpenBLAS_FOUND off") 50 | ENDIF() 51 | 52 | IF (OpenBLAS_FOUND) 53 | IF (NOT OpenBLAS_FIND_QUIETLY) 54 | MESSAGE(STATUS "Found OpenBLAS libraries: ${OpenBLAS_LIB}") 55 | MESSAGE(STATUS "Found OpenBLAS include: ${OpenBLAS_INCLUDE_DIR}") 56 | ENDIF (NOT OpenBLAS_FIND_QUIETLY) 57 | ELSE (OpenBLAS_FOUND) 58 | IF (OpenBLAS_FIND_REQUIRED) 59 | MESSAGE(FATAL_ERROR "Could not find OpenBLAS") 60 | ENDIF (OpenBLAS_FIND_REQUIRED) 61 | ENDIF (OpenBLAS_FOUND) 62 | 63 | MARK_AS_ADVANCED( 64 | OpenBLAS_INCLUDE_DIR 65 | OpenBLAS_LIB 66 | OpenBLAS 67 | ) 68 | 69 | -------------------------------------------------------------------------------- /example_makefiles/makefile.inc.Mac.brew: -------------------------------------------------------------------------------- 1 | 2 | # -*- makefile -*- 3 | 4 | # Tested on macOS Sierra (10.12.2) with llvm installed using Homebrew (https://brew.sh) 5 | # brew install llvm 6 | CC=/usr/local/opt/llvm/bin/clang++ 7 | CFLAGS=-fPIC -m64 -Wall -g -O3 -msse4 -mpopcnt -fopenmp -Wno-sign-compare -Dnullptr=NULL -I/usr/local/opt/llvm/include -std=c++11 8 | LDFLAGS=-g -fPIC -fopenmp -L/usr/local/opt/llvm/lib 9 | 10 | # common mac flags 11 | SHAREDEXT=dylib 12 | SHAREDFLAGS=-Wl,-F. 
-bundle -undefined dynamic_lookup 13 | FAISSSHAREDFLAGS=-dynamiclib 14 | 15 | # wrapldflags="" 16 | # sharedext=dylib 17 | # sharedflags="-dynamiclib" 18 | # yaelsharedflags="$sharedflags -install_name $yaelprefix/yael/libyael.dylib" 19 | 20 | ########################################################################## 21 | # Uncomment one of the 4 BLAS/Lapack implementation options 22 | # below. They are sorted # from fastest to slowest (in our 23 | # experiments). 24 | ########################################################################## 25 | 26 | # 27 | # 1. Intel MKL 28 | # 29 | # This is the fastest BLAS implementation we tested. Unfortunately it 30 | # is not open-source and determining the correct linking flags is a 31 | # nightmare. See 32 | # 33 | # https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor 34 | # 35 | # for a start on setting the link flags. On version IntelComposerXE 36 | # 2015.0.090, the following flags work 37 | # 38 | # MKLROOT=$(HOME)/fbsource/fbcode/third-party2//IntelComposerXE/2015.0.090/gcc-4.8.1-glibc-2.17/c3f970a/mkl 39 | # 40 | # BLASLDFLAGS=-Wl,--no-as-needed -L$(MKLROOT)/lib/intel64 -lmkl_intel_ilp64 \ 41 | # -lmkl_core -lmkl_gnu_thread -ldl -lpthread 42 | # 43 | # the ilp64 means that the integers are 64-bit. 44 | # 45 | # BLASLDFLAGS=-DFINTEGER=long 46 | # 47 | # you may have to set the LD_LIBRARY_PATH=$MKLROOT/lib/intel64 at runtime 48 | # 49 | 50 | # 51 | # 2. Openblas 52 | # 53 | # The library contains both BLAS and Lapack. Install with port install OpenBLAS 54 | # 55 | # BLASCFLAGS=-DFINTEGER=int 56 | # BLASLDFLAGS=/opt/local/lib/libopenblas.dylib 57 | # 58 | 59 | # 60 | # 3. Apple's framework accelerate 61 | # 62 | # This has the advantage that it does not require to install anything, 63 | # as it is provided by default on the mac. It is not very fast, though. 
64 | # 65 | 66 | BLASCFLAGS=-DFINTEGER=int 67 | BLASLDFLAGS=-framework Accelerate 68 | 69 | 70 | 71 | ########################################################################## 72 | # SWIG and Python flags 73 | ########################################################################## 74 | 75 | # SWIG executable. This should be at least version 3.x 76 | # brew install swig 77 | 78 | SWIGEXEC=/usr/local/bin/swig 79 | 80 | # The Python include directories for the current python executable can 81 | # typically be found with 82 | # 83 | # python -c "import distutils.sysconfig; print distutils.sysconfig.get_python_inc()" 84 | # python -c "import numpy ; print numpy.get_include()" 85 | # 86 | # the paths below are for the system python (not the macports one) 87 | 88 | PYTHONCFLAGS=-I/System/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 \ 89 | -I/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/include 90 | 91 | 92 | ########################################################################## 93 | # Faiss GPU 94 | ########################################################################## 95 | 96 | # As we don't have access to a Mac with nvidia GPUs installed, we 97 | # could not validate the GPU compile of Faiss. 
98 | -------------------------------------------------------------------------------- /example_makefiles/makefile.inc.Mac.port: -------------------------------------------------------------------------------- 1 | 2 | # -*- makefile -*- 3 | # tested on Mac OS X 10.12.2 Sierra with additional software installed via macports 4 | 5 | 6 | 7 | # The system default clang does not support openmp 8 | # You can install an openmp compatible g++ with macports: 9 | # port install g++-mp-6 10 | CC=/opt/local/bin/g++-mp-6 11 | 12 | CFLAGS=-fPIC -m64 -Wall -g -O3 -msse4 -mpopcnt -fopenmp -Wno-sign-compare -std=c++11 13 | LDFLAGS=-g -fPIC -fopenmp 14 | 15 | 16 | # common linux flags 17 | SHAREDEXT=dylib 18 | SHAREDFLAGS=-Wl,-F. -bundle -undefined dynamic_lookup 19 | FAISSSHAREDFLAGS=-dynamiclib 20 | 21 | # wrapldflags="" 22 | # sharedext=dylib 23 | # sharedflags="-dynamiclib" 24 | # yaelsharedflags="$sharedflags -install_name $yaelprefix/yael/libyael.dylib" 25 | 26 | ########################################################################## 27 | # Uncomment one of the 4 BLAS/Lapack implementation options 28 | # below. They are sorted # from fastest to slowest (in our 29 | # experiments). 30 | ########################################################################## 31 | 32 | # 33 | # 1. Intel MKL 34 | # 35 | # This is the fastest BLAS implementation we tested. Unfortunately it 36 | # is not open-source and determining the correct linking flags is a 37 | # nightmare. See 38 | # 39 | # https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor 40 | # 41 | # for a start on setting the link flags. 
On version IntelComposerXE 42 | # 2015.0.090, the following flags work 43 | # 44 | # MKLROOT=$(HOME)/fbsource/fbcode/third-party2//IntelComposerXE/2015.0.090/gcc-4.8.1-glibc-2.17/c3f970a/mkl 45 | # 46 | # BLASLDFLAGS=-Wl,--no-as-needed -L$(MKLROOT)/lib/intel64 -lmkl_intel_ilp64 \ 47 | # -lmkl_core -lmkl_gnu_thread -ldl -lpthread 48 | # 49 | # the ilp64 means that the integers are 64-bit. 50 | # 51 | # BLASLDFLAGS=-DFINTEGER=long 52 | # 53 | # you may have to set the LD_LIBRARY_PATH=$MKLROOT/lib/intel64 at runtime 54 | # 55 | 56 | # 57 | # 2. Openblas 58 | # 59 | # The library contains both BLAS and Lapack. Install with port install OpenBLAS 60 | # 61 | # BLASCFLAGS=-DFINTEGER=int 62 | # BLASLDFLAGS=/opt/local/lib/libopenblas.dylib 63 | # 64 | 65 | # 66 | # 3. Apple's framework accelerate 67 | # 68 | # This has the advantage that it does not require to install anything, 69 | # as it is provided by default on the mac. It is not very fast, though. 70 | # 71 | 72 | BLASCFLAGS=-DFINTEGER=int 73 | BLASLDFLAGS=-framework Accelerate 74 | 75 | 76 | 77 | ########################################################################## 78 | # SWIG and Python flags 79 | ########################################################################## 80 | 81 | # SWIG executable. 
This should be at least version 3.x 82 | # port install swig swig-python 83 | 84 | SWIGEXEC=/opt/local/bin/swig 85 | 86 | # The Python include directories for the current python executable can 87 | # typically be found with 88 | # 89 | # python -c "import distutils.sysconfig; print distutils.sysconfig.get_python_inc()" 90 | # python -c "import numpy ; print numpy.get_include()" 91 | # 92 | # the paths below are for the system python (not the macports one) 93 | 94 | PYTHONCFLAGS=-I/System/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 \ 95 | -I/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/include 96 | 97 | 98 | ########################################################################## 99 | # Faiss GPU 100 | ########################################################################## 101 | 102 | # As we don't have access to a Mac with nvidia GPUs installed, we 103 | # could not validate the GPU compile of Faiss. 104 | -------------------------------------------------------------------------------- /faiss.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * Copyright (c) 2015-present, Facebook, Inc. 4 | * All rights reserved. 5 | * 6 | * This source code is licensed under the CC-by-NC license found in the 7 | * LICENSE file in the root directory of this source tree. An additional grant 8 | * of patent rights can be found in the PATENTS file in the same directory. 9 | */ 10 | 11 | // Copyright 2004-present Facebook. All Rights Reserved 12 | // -*- c++ -*- 13 | 14 | // This is the main internal include file for Faiss. 
It defines 15 | // macros and some machine-specific functions shared across .cpp files 16 | 17 | #ifndef FAISS_h 18 | #define FAISS_h 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #ifndef __SSE2__ 25 | #error "SSE optimized distance computations not set" 26 | #endif 27 | 28 | 29 | 30 | 31 | #ifdef _OPENMP 32 | #include 33 | #define SET_NT(ntlim) \ 34 | size_t nt = omp_get_max_threads(); \ 35 | if (nt > ntlim) nt = ntlim; 36 | #else 37 | #warning "OpenMP is NOT activated" 38 | #define SET_NT(ntlim) size_t nt = 0; nt++; 39 | #endif 40 | 41 | /* This is to prevent warning by the linter (FINTEGER is defined externally) */ 42 | #ifndef FINTEGER 43 | #define FINTEGER long 44 | #endif 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /filehelper.h: -------------------------------------------------------------------------------- 1 | #ifndef FILEHELPER_HPP 2 | #define FILEHELPER_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace faiss { 9 | /* read Header from fvecs */ 10 | template 11 | T* readJegou(const char *path, uint &n, uint &d) ; 12 | 13 | template 14 | void readJegouHeader(const char *path, uint &n, uint &d); 15 | 16 | template<> 17 | uint8_t* readJegou(const char *path, uint &n, uint &d); 18 | 19 | 20 | uint8_t* readBatchJegou(const char *path, uint start_pos, uint num); 21 | 22 | template<> 23 | void readJegouHeader(const char *path, uint &n, uint &d); 24 | 25 | 26 | template 27 | void write(std::string fs, size_t num, uint dim, T *ptr, size_t len, size_t offset = 0) ; 28 | 29 | 30 | 31 | 32 | template 33 | void read(std::string fs, size_t &num, uint &dim, T *ptr, size_t len, size_t offset = 0); 34 | 35 | void header(std::string fs, uint &num, uint &dim) ; 36 | 37 | void writeFloat(std::string _fn, size_t _dim, size_t _num, float* _x, size_t _offset); 38 | void writeInt(std::string _fn, size_t _dim, size_t _num, int* _x, size_t _offset); 39 | 40 | float* readFloat(const char* _fn, 
size_t _dim, size_t _num, size_t _offset) ; 41 | 42 | float* readUint8(const char* _fn, size_t _dim, size_t _num, size_t _offset) ; 43 | 44 | int* readInt(const char* _fn, size_t _dim, size_t _num, size_t _offset); 45 | #endif 46 | } 47 | -------------------------------------------------------------------------------- /gpu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # specify header and cpp files 2 | file(GLOB_RECURSE faiss_gpu_headers ${CMAKE_CURRENT_SOURCE_DIR}/*.h) 3 | file(GLOB_RECURSE faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) 4 | file(GLOB_RECURSE faiss_gpu_cuh ${CMAKE_CURRENT_SOURCE_DIR}/*.cuh) 5 | file(GLOB_RECURSE faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/*.cu) 6 | 7 | set(faiss_lib_gpu gpufaiss) 8 | 9 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/demo_ivfpq_indexing_gpu.cpp) 10 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/TestGpuIndexFlat.cpp) 11 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/TestGpuIndexIVFFlat.cpp) 12 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/TestGpuIndexIVFPQ.cpp) 13 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/TestUtils.cpp) 14 | list(REMOVE_ITEM faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/test/TestGpuSelect.cu) 15 | list(REMOVE_ITEM faiss_gpu_headers ${CMAKE_CURRENT_SOURCE_DIR}/test/TestUtils.h) 16 | list(REMOVE_ITEM faiss_gpu_headers ${CMAKE_CURRENT_SOURCE_DIR}/perf/IndexWrapper.h) 17 | list(REMOVE_ITEM faiss_gpu_headers ${CMAKE_CURRENT_SOURCE_DIR}/perf/IndexWrapper-inl.h) 18 | list(REMOVE_ITEM faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/perf/CompareFlat.cu) 19 | list(REMOVE_ITEM faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/perf/CompareIVFFlat.cu) 20 | list(REMOVE_ITEM faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/perf/CompareIVFPQ.cu) 21 | list(REMOVE_ITEM faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/perf/CompareIVFPQGrid.cu) 22 | list(REMOVE_ITEM faiss_gpu_cu 
${CMAKE_CURRENT_SOURCE_DIR}/perf/PerfSelect.cu) 23 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/perf/PerfClustering.cpp) 24 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/perf/PerfIVFPQAdd.cpp) 25 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/perf/WriteIndex.cpp) 26 | 27 | cuda_add_library(${faiss_lib_gpu} STATIC ${faiss_gpu_headers} ${faiss_gpu_cpp} ${faiss_gpu_cuh} ${faiss_gpu_cu}) 28 | add_subdirectory(test) 29 | -------------------------------------------------------------------------------- /gpu/GpuAutoTune.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #pragma once 11 | 12 | #include "../Index.h" 13 | #include "../AutoTune.h" 14 | #include "GpuClonerOptions.h" 15 | #include "GpuIndex.h" 16 | #include "GpuIndicesOptions.h" 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | class GpuResources; 21 | 22 | // to support auto-tuning we need cloning to/from CPU 23 | 24 | /// converts any GPU index inside gpu_index to a CPU index 25 | faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index); 26 | 27 | /// converts any CPU index that can be converted to GPU 28 | faiss::Index * index_cpu_to_gpu( 29 | GpuResources* resources, int device, 30 | const faiss::Index *index, 31 | const GpuClonerOptions *options = nullptr); 32 | 33 | faiss::Index * index_cpu_to_gpu_multiple( 34 | std::vector & resources, 35 | std::vector &devices, 36 | const faiss::Index *index, 37 | const GpuMultipleClonerOptions *options = nullptr); 38 | 39 | /// parameter space and setters for GPU indexes 40 | struct GpuParameterSpace: faiss::ParameterSpace { 41 | /// initialize with reasonable parameters for the index 
42 | void initialize (const faiss::Index * index) override; 43 | 44 | /// set a combination of parameters on an index 45 | void set_index_parameter ( 46 | faiss::Index * index, const std::string & name, 47 | double val) const override; 48 | }; 49 | 50 | } } // namespace 51 | -------------------------------------------------------------------------------- /gpu/GpuClonerOptions.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "GpuClonerOptions.h" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | GpuClonerOptions::GpuClonerOptions() 15 | : indicesOptions(INDICES_64_BIT), 16 | useFloat16CoarseQuantizer(false), 17 | useFloat16(false), 18 | usePrecomputed(true), 19 | reserveVecs(0), 20 | storeTransposed(false), 21 | verbose(false) { 22 | } 23 | 24 | GpuMultipleClonerOptions::GpuMultipleClonerOptions() 25 | : shard(false) { 26 | } 27 | 28 | } } // namespace 29 | -------------------------------------------------------------------------------- /gpu/GpuClonerOptions.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #pragma once 11 | 12 | #include "GpuIndicesOptions.h" 13 | 14 | namespace faiss { namespace gpu { 15 | 16 | /// set some options on how to copy to GPU 17 | struct GpuClonerOptions { 18 | GpuClonerOptions(); 19 | 20 | /// how should indices be stored on index types that support indices 21 | /// (anything but GpuIndexFlat*)? 22 | IndicesOptions indicesOptions; 23 | 24 | /// is the coarse quantizer in float16? 25 | bool useFloat16CoarseQuantizer; 26 | 27 | /// for GpuIndexIVFFlat, is storage in float16? 28 | /// for GpuIndexIVFPQ, are intermediate calculations in float16? 29 | bool useFloat16; 30 | 31 | /// use precomputed tables? 32 | bool usePrecomputed; 33 | 34 | /// reserve vectors in the invfiles? 35 | long reserveVecs; 36 | 37 | /// For GpuIndexFlat, store data in transposed layout? 38 | bool storeTransposed; 39 | 40 | /// Set verbose options on the index 41 | bool verbose; 42 | }; 43 | 44 | struct GpuMultipleClonerOptions : public GpuClonerOptions { 45 | GpuMultipleClonerOptions (); 46 | 47 | /// Whether to shard the index across GPUs, versus replication 48 | /// across GPUs 49 | bool shard; 50 | }; 51 | 52 | } } // namespace 53 | -------------------------------------------------------------------------------- /gpu/GpuIndex.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../Index.h" 14 | #include "utils/MemorySpace.h" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | class GpuResources; 19 | 20 | struct GpuIndexConfig { 21 | inline GpuIndexConfig() 22 | : device(0), 23 | memorySpace(MemorySpace::Device) { 24 | } 25 | 26 | /// GPU device on which the index is resident 27 | int device; 28 | 29 | /// What memory space to use for primary storae. 30 | /// On Pascal and above (CC 6+) architectures, allows GPUs to use 31 | /// more memory than is available on the GPU. 32 | MemorySpace memorySpace; 33 | }; 34 | 35 | class GpuIndex : public faiss::Index { 36 | public: 37 | GpuIndex(GpuResources* resources, 38 | int dims, 39 | faiss::MetricType metric, 40 | GpuIndexConfig config); 41 | 42 | int getDevice() const { 43 | return device_; 44 | } 45 | 46 | GpuResources* getResources() { 47 | return resources_; 48 | } 49 | 50 | /// `x` can be resident on the CPU or any GPU; copies are performed 51 | /// as needed 52 | /// Handles paged adds if the add set is too large; calls addInternal_ 53 | void add(faiss::Index::idx_t, const float* x) override; 54 | 55 | /// `x` and `ids` can be resident on the CPU or any GPU; copies are 56 | /// performed as needed 57 | /// Handles paged adds if the add set is too large; calls addInternal_ 58 | void add_with_ids(Index::idx_t n, const float* x, const Index::idx_t* ids) 59 | override; 60 | 61 | /// `x`, `distances` and `labels` can be resident on the CPU or any 62 | /// GPU; copies are performed as needed 63 | void search( 64 | faiss::Index::idx_t n, 65 | const float* x, 66 | faiss::Index::idx_t k, 67 | float* distances, 68 | faiss::Index::idx_t* labels) const override; 69 | 70 | 71 | 72 | protected: 73 | /// Handles paged adds if the add set is too large, passes to 74 | /// addImpl_ to actually perform the add for the current page 75 | void addInternal_(Index::idx_t n, 76 | const float* x, 77 | const Index::idx_t* ids); 78 | 79 | /// Overridden to actually perform 
the add 80 | virtual void addImpl_(Index::idx_t n, 81 | const float* x, 82 | const Index::idx_t* ids) = 0; 83 | 84 | /// Overridden to actually perform the search 85 | virtual void searchImpl_(faiss::Index::idx_t n, 86 | const float* x, 87 | faiss::Index::idx_t k, 88 | float* distances, 89 | faiss::Index::idx_t* labels) const = 0; 90 | 91 | protected: 92 | /// Manages streans, cuBLAS handles and scratch memory for devices 93 | GpuResources* resources_; 94 | 95 | /// The GPU device we are resident on 96 | const int device_; 97 | 98 | /// The memory space of our primary storage on the GPU 99 | const MemorySpace memorySpace_; 100 | }; 101 | 102 | } } // namespace 103 | -------------------------------------------------------------------------------- /gpu/GpuIndexIVF.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "GpuIndex.h" 14 | #include "GpuIndexFlat.h" 15 | #include "GpuIndicesOptions.h" 16 | #include "../Clustering.h" 17 | 18 | namespace faiss { struct IndexIVF; } 19 | 20 | namespace faiss { namespace gpu { 21 | 22 | class GpuIndexFlat; 23 | class GpuResources; 24 | 25 | struct GpuIndexIVFConfig : public GpuIndexConfig { 26 | inline GpuIndexIVFConfig() 27 | : indicesOptions(INDICES_64_BIT) { 28 | } 29 | 30 | /// Index storage options for the GPU 31 | IndicesOptions indicesOptions; 32 | 33 | /// Configuration for the coarse quantizer object 34 | GpuIndexFlatConfig flatConfig; 35 | }; 36 | 37 | class GpuIndexIVF : public GpuIndex { 38 | public: 39 | GpuIndexIVF(GpuResources* resources, 40 | int dims, 41 | faiss::MetricType metric, 42 | int nlist, 43 | GpuIndexIVFConfig config = GpuIndexIVFConfig()); 44 | 45 | ~GpuIndexIVF() override; 46 | 47 | private: 48 | /// Shared initialization functions 49 | void init_(); 50 | 51 | public: 52 | /// Copy what we need from the CPU equivalent 53 | void copyFrom(const faiss::IndexIVF* index); 54 | 55 | /// Copy what we have to the CPU equivalent 56 | void copyTo(faiss::IndexIVF* index) const; 57 | 58 | /// Returns the number of inverted lists we're managing 59 | int getNumLists() const; 60 | 61 | /// Return the quantizer we're using 62 | GpuIndexFlat* getQuantizer(); 63 | 64 | /// Sets the number of list probes per query 65 | void setNumProbes(int nprobe); 66 | 67 | /// Returns our current number of list probes per query 68 | int getNumProbes() const; 69 | 70 | /// `x` can be resident on the CPU or any GPU; the proper copies are 71 | /// performed 72 | /// Forwards to add_with_ids; assigns IDs as needed 73 | /// FIXME: remove override for C++03 compatibility 74 | void add(Index::idx_t n, const float* x) override; 75 | 76 | protected: 77 | void trainQuantizer_(faiss::Index::idx_t n, const float* x); 78 | 79 | protected: 80 | GpuIndexIVFConfig ivfConfig_; 81 | 82 | /// Number of inverted lists 
that we manage 83 | int nlist_; 84 | 85 | /// Number of inverted list probes per query 86 | int nprobe_; 87 | 88 | /// Ability to override default clustering parameters 89 | ClusteringParameters cp_; 90 | 91 | /// Quantizer for inverted lists 92 | GpuIndexFlat* quantizer_; 93 | }; 94 | 95 | } } // namespace 96 | -------------------------------------------------------------------------------- /gpu/GpuIndexIVFFlat.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "GpuIndexIVF.h" 14 | 15 | namespace faiss { struct IndexIVFFlat; } 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | class IVFFlat; 20 | class GpuIndexFlat; 21 | 22 | struct GpuIndexIVFFlatConfig : public GpuIndexIVFConfig { 23 | inline GpuIndexIVFFlatConfig() 24 | : useFloat16IVFStorage(false) { 25 | } 26 | 27 | /// Whether or not IVFFlat inverted list storage is in float16; 28 | /// supported on all architectures 29 | bool useFloat16IVFStorage; 30 | }; 31 | 32 | /// Wrapper around the GPU implementation that looks like 33 | /// faiss::IndexIVFFlat 34 | class GpuIndexIVFFlat : public GpuIndexIVF { 35 | public: 36 | /// Construct from a pre-existing faiss::IndexIVFFlat instance, copying 37 | /// data over to the given GPU, if the input index is trained. 38 | GpuIndexIVFFlat(GpuResources* resources, 39 | const faiss::IndexIVFFlat* index, 40 | GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); 41 | 42 | /// Constructs a new instance with an empty flat quantizer; the user 43 | /// provides the number of lists desired. 
44 | GpuIndexIVFFlat(GpuResources* resources, 45 | int dims, 46 | int nlist, 47 | faiss::MetricType metric, 48 | GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); 49 | 50 | ~GpuIndexIVFFlat() override; 51 | 52 | /// Reserve GPU memory in our inverted lists for this number of vectors 53 | void reserveMemory(size_t numVecs); 54 | 55 | /// Initialize ourselves from the given CPU index; will overwrite 56 | /// all data in ourselves 57 | void copyFrom(const faiss::IndexIVFFlat* index); 58 | 59 | /// Copy ourselves to the given CPU index; will overwrite all data 60 | /// in the index instance 61 | void copyTo(faiss::IndexIVFFlat* index) const; 62 | 63 | /// After adding vectors, one can call this to reclaim device memory 64 | /// to exactly the amount needed. Returns space reclaimed in bytes 65 | size_t reclaimMemory(); 66 | 67 | void reset() override; 68 | 69 | void train(Index::idx_t n, const float* x) override; 70 | 71 | protected: 72 | /// Called from GpuIndex for add/add_with_ids 73 | void addImpl_( 74 | faiss::Index::idx_t n, 75 | const float* x, 76 | const faiss::Index::idx_t* ids) override; 77 | 78 | /// Called from GpuIndex for search 79 | void searchImpl_( 80 | faiss::Index::idx_t n, 81 | const float* x, 82 | faiss::Index::idx_t k, 83 | float* distances, 84 | faiss::Index::idx_t* labels) const override; 85 | private: 86 | GpuIndexIVFFlatConfig ivfFlatConfig_; 87 | 88 | /// Desired inverted list memory reservation 89 | size_t reserveMemoryVecs_; 90 | 91 | /// Instance that we own; contains the inverted list 92 | IVFFlat* index_; 93 | }; 94 | 95 | } } // namespace 96 | -------------------------------------------------------------------------------- /gpu/GpuIndicesOptions.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | namespace faiss { namespace gpu { 14 | 15 | /// How user vector index data is stored on the GPU 16 | enum IndicesOptions { 17 | /// The user indices are only stored on the CPU; the GPU returns 18 | /// (inverted list, offset) to the CPU which is then translated to 19 | /// the real user index. 20 | INDICES_CPU = 0, 21 | /// The indices are not stored at all, on either the CPU or 22 | /// GPU. Only (inverted list, offset) is returned to the user as the 23 | /// index. 24 | INDICES_IVF = 1, 25 | /// Indices are stored as 32 bit integers on the GPU, but returned 26 | /// as 64 bit integers 27 | INDICES_32_BIT = 2, 28 | /// Indices are stored as 64 bit integers on the GPU 29 | INDICES_64_BIT = 3, 30 | }; 31 | 32 | } } // namespace 33 | -------------------------------------------------------------------------------- /gpu/GpuResources.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "GpuResources.h" 12 | #include "utils/DeviceUtils.h" 13 | 14 | namespace faiss { namespace gpu { 15 | 16 | GpuResources::~GpuResources() { 17 | } 18 | 19 | cublasHandle_t 20 | GpuResources::getBlasHandleCurrentDevice() { 21 | return getBlasHandle(getCurrentDevice()); 22 | } 23 | 24 | cudaStream_t 25 | GpuResources::getDefaultStreamCurrentDevice() { 26 | return getDefaultStream(getCurrentDevice()); 27 | } 28 | 29 | std::vector 30 | GpuResources::getAlternateStreamsCurrentDevice() { 31 | return getAlternateStreams(getCurrentDevice()); 32 | } 33 | 34 | DeviceMemory& 35 | GpuResources::getMemoryManagerCurrentDevice() { 36 | return getMemoryManager(getCurrentDevice()); 37 | } 38 | 39 | cudaStream_t 40 | GpuResources::getAsyncCopyStreamCurrentDevice() { 41 | return getAsyncCopyStream(getCurrentDevice()); 42 | } 43 | 44 | } } // namespace 45 | -------------------------------------------------------------------------------- /gpu/GpuResources.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "utils/DeviceMemory.h" 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | namespace faiss { namespace gpu { 20 | 21 | /// Base class of GPU-side resource provider; hides provision of 22 | /// cuBLAS handles, CUDA streams and a temporary memory manager 23 | class GpuResources { 24 | public: 25 | virtual ~GpuResources(); 26 | 27 | /// Call to pre-allocate resources for a particular device. 
If this is 28 | /// not called, then resources will be allocated at the first time 29 | /// of demand 30 | virtual void initializeForDevice(int device) = 0; 31 | 32 | virtual cublasHandle_t getBlasHandle(int device) = 0; 33 | 34 | virtual cudaStream_t getDefaultStream(int device) = 0; 35 | 36 | virtual std::vector getAlternateStreams(int device) = 0; 37 | 38 | virtual DeviceMemory& getMemoryManager(int device) = 0; 39 | 40 | virtual std::pair getPinnedMemory() = 0; 41 | 42 | virtual cudaStream_t getAsyncCopyStream(int device) = 0; 43 | 44 | cublasHandle_t getBlasHandleCurrentDevice(); 45 | 46 | cudaStream_t getDefaultStreamCurrentDevice(); 47 | 48 | std::vector getAlternateStreamsCurrentDevice(); 49 | 50 | DeviceMemory& getMemoryManagerCurrentDevice(); 51 | 52 | cudaStream_t getAsyncCopyStreamCurrentDevice(); 53 | }; 54 | 55 | } } // namespace 56 | -------------------------------------------------------------------------------- /gpu/IndexProxy.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../Index.h" 14 | #include "utils/WorkerThread.h" 15 | #include 16 | #include 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | /// Takes individual faiss::Index instances, and splits queries for 21 | /// sending to each Index instance, and joins the results together 22 | /// when done. 23 | /// Each index is managed by a separate CPU thread. 24 | class IndexProxy : public faiss::Index { 25 | public: 26 | IndexProxy(); 27 | ~IndexProxy() override; 28 | 29 | /// Adds an index that is managed by ourselves. 
30 | /// WARNING: once an index is added to this proxy, it becomes unsafe 31 | /// to touch it from any other thread than that on which is managing 32 | /// it, until we are shut down. Use runOnIndex to perform work on it 33 | /// instead. 34 | void addIndex(faiss::Index* index); 35 | 36 | /// Remove an index that is managed by ourselves. 37 | /// This will flush all pending work on that index, and then shut 38 | /// down its managing thread, and will remove the index. 39 | void removeIndex(faiss::Index* index); 40 | 41 | /// Run a function on all indices, in the thread that the index is 42 | /// managed in. 43 | void runOnIndex(std::function f); 44 | 45 | /// faiss::Index API 46 | /// All indices receive the same call 47 | void reset() override; 48 | 49 | /// faiss::Index API 50 | /// All indices receive the same call 51 | void train(Index::idx_t n, const float* x) override; 52 | 53 | /// faiss::Index API 54 | /// All indices receive the same call 55 | void add(Index::idx_t n, const float* x) override; 56 | 57 | /// faiss::Index API 58 | /// Query is partitioned into a slice for each sub-index 59 | /// split by ceil(n / #indices) for our sub-indices 60 | void search(faiss::Index::idx_t n, 61 | const float* x, 62 | faiss::Index::idx_t k, 63 | float* distances, 64 | faiss::Index::idx_t* labels) const override; 65 | 66 | /// reconstructs from the first index 67 | void reconstruct(idx_t, float *v) const override; 68 | 69 | bool own_fields; 70 | 71 | int count() const {return indices_.size(); } 72 | 73 | faiss::Index* at(int i) {return indices_[i].first; } 74 | const faiss::Index* at(int i) const {return indices_[i].first; } 75 | 76 | 77 | private: 78 | /// Collection of Index instances, with their managing worker thread 79 | mutable std::vector > > indices_; 81 | }; 82 | 83 | 84 | 85 | /** Clustering on GPU (is here because uses Proxy with ngpu > 1 86 | * 87 | * @param ngpu nb of GPUs to use 88 | * @param d dimension of the data 89 | * @param n nb of training vectors 
90 | * @param k nb of output centroids 91 | * @param x training set (size n * d) 92 | * @param centroids output centroids (size k * d) 93 | * @return final quantization error 94 | */ 95 | float kmeans_clustering_gpu (int ngpu, size_t d, size_t n, size_t k, 96 | const float *x, 97 | float *centroids, 98 | bool useFloat16, 99 | bool storeTransposed); 100 | 101 | 102 | 103 | } } // namespace 104 | -------------------------------------------------------------------------------- /gpu/StandardGpuResources.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "GpuResources.h" 14 | #include "utils/StackDeviceMemory.h" 15 | #include "utils/DeviceUtils.h" 16 | #include 17 | #include 18 | 19 | namespace faiss { namespace gpu { 20 | 21 | /// Default implementation of GpuResources that allocates a cuBLAS 22 | /// stream and 2 streams for use, as well as temporary memory 23 | class StandardGpuResources : public GpuResources { 24 | public: 25 | StandardGpuResources(); 26 | 27 | ~StandardGpuResources() override; 28 | 29 | /// Disable allocation of temporary memory; all temporary memory 30 | /// requests will call cudaMalloc / cudaFree at the point of use 31 | void noTempMemory(); 32 | 33 | /// Specify that we wish to use a certain fixed size of memory on 34 | /// all devices as temporary memory 35 | void setTempMemory(size_t size); 36 | 37 | /// Specify that we wish to use a certain fraction of memory on 38 | /// all devices as temporary memory 39 | void setTempMemoryFraction(float fraction); 40 | 41 | /// Set amount of pinned memory to allocate, for async GPU <-> CPU 42 | /// transfers 43 | void 
setPinnedMemory(size_t size); 44 | 45 | public: 46 | /// Internal system calls 47 | void initializeForDevice(int device) override; 48 | 49 | cublasHandle_t getBlasHandle(int device) override; 50 | 51 | cudaStream_t getDefaultStream(int device) override; 52 | 53 | std::vector getAlternateStreams(int device) override; 54 | 55 | DeviceMemory& getMemoryManager(int device) override; 56 | 57 | std::pair getPinnedMemory() override; 58 | 59 | cudaStream_t getAsyncCopyStream(int device) override; 60 | 61 | private: 62 | /// Our default stream that work is ordered on, one per each device 63 | std::unordered_map defaultStreams_; 64 | 65 | /// Other streams we can use, per each device 66 | std::unordered_map > alternateStreams_; 67 | 68 | /// Async copy stream to use for GPU <-> CPU pinned memory copies 69 | std::unordered_map asyncCopyStreams_; 70 | 71 | /// cuBLAS handle for each device 72 | std::unordered_map blasHandles_; 73 | 74 | /// Temporary memory provider, per each device 75 | std::unordered_map > memory_; 76 | 77 | /// Pinned memory allocation for use with this GPU 78 | void* pinnedMemAlloc_; 79 | size_t pinnedMemAllocSize_; 80 | 81 | /// By default, we reserve this fraction of memory on all devices 82 | float tempMemFraction_; 83 | 84 | /// Another option is to use a specified amount of memory on all 85 | /// devices 86 | size_t tempMemSize_; 87 | 88 | /// Whether we look at tempMemFraction_ or tempMemSize_ 89 | bool useFraction_; 90 | 91 | /// Amount of pinned memory we should allocate 92 | size_t pinnedMemSize_; 93 | }; 94 | 95 | } } // namespace 96 | -------------------------------------------------------------------------------- /gpu/impl/BroadcastSum.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../utils/Float16.cuh" 14 | #include "../utils/Tensor.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | // output[x][i] += input[i] for all x 19 | void runSumAlongColumns(Tensor& input, 20 | Tensor& output, 21 | cudaStream_t stream); 22 | 23 | #ifdef FAISS_USE_FLOAT16 24 | void runSumAlongColumns(Tensor& input, 25 | Tensor& output, 26 | cudaStream_t stream); 27 | #endif 28 | 29 | // output[x][i] = input[i] for all x 30 | void runAssignAlongColumns(Tensor& input, 31 | Tensor& output, 32 | cudaStream_t stream); 33 | 34 | #ifdef FAISS_USE_FLOAT16 35 | void runAssignAlongColumns(Tensor& input, 36 | Tensor& output, 37 | cudaStream_t stream); 38 | #endif 39 | 40 | // output[i][x] += input[i] for all x 41 | void runSumAlongRows(Tensor& input, 42 | Tensor& output, 43 | cudaStream_t stream); 44 | void runSumAlongRowsWithGraph(Tensor& outIndexView, 45 | Tensor& graphIndices, 46 | Tensor& productDistances, 47 | Tensor& outGraphDistances, 48 | cudaStream_t stream); 49 | void runSumAlongColumnsGraph1(Tensor& input, 50 | Tensor& output, 51 | cudaStream_t stream); 52 | void runL2SelectMinGraph(Tensor& graphDistancesBuf, Tensor& outIndexView, 53 | Tensor& graphIndices, 54 | Tensor& graphDists, 55 | Tensor& productDistances, 56 | Tensor& outGraphDistances, 57 | Tensor& outDistances2nd, 58 | Tensor& outIndices2nd,int k,int begin ,int end, 59 | cudaStream_t stream); 60 | #ifdef FAISS_USE_FLOAT16 61 | void runSumAlongRows(Tensor& input, 62 | Tensor& output, 63 | cudaStream_t stream); 64 | void runSumAlongRowsWithGraph(Tensor& outIndexView, 65 | Tensor& graphIndices, 66 | Tensor& productDistances, 67 | Tensor& outGraphDistances, 68 | cudaStream_t stream); 69 | #endif 70 | 71 | } } // namespace 72 | -------------------------------------------------------------------------------- /gpu/impl/IVFFlat.cuh: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "IVFBase.cuh" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | class IVFFlat : public IVFBase { 18 | public: 19 | /// Construct from a quantizer that has elemen 20 | IVFFlat(GpuResources* resources, 21 | /// We do not own this reference 22 | FlatIndex* quantizer, 23 | bool l2Distance, 24 | bool useFloat16, 25 | IndicesOptions indicesOptions, 26 | MemorySpace space); 27 | 28 | ~IVFFlat() override; 29 | 30 | /// Add vectors to a specific list; the input data can be on the 31 | /// host or on our current device 32 | void addCodeVectorsFromCpu(int listId, 33 | const float* vecs, 34 | const long* indices, 35 | size_t numVecs); 36 | 37 | /// Adds the given vectors to this index. 38 | /// The input data must be on our current device. 39 | /// Returns the number of vectors successfully added. Vectors may 40 | /// not be able to be added because they contain NaNs. 41 | int classifyAndAddVectors(Tensor& vecs, 42 | Tensor& indices); 43 | 44 | /// Find the approximate k nearest neigbors for `queries` against 45 | /// our database 46 | void query(Tensor& queries, 47 | int nprobe, 48 | int k, 49 | Tensor& outDistances, 50 | Tensor& outIndices); 51 | 52 | /// Return the vectors of a particular list back to the CPU 53 | std::vector getListVectors(int listId) const; 54 | 55 | private: 56 | /// Returns the size of our stored vectors, in bytes 57 | size_t getVectorMemorySize() const; 58 | 59 | private: 60 | /// Calculating L2 distance or inner product? 61 | const bool l2Distance_; 62 | 63 | /// Do we store data internally as float16 (versus float32)? 
64 | const bool useFloat16_; 65 | }; 66 | 67 | } } // namespace 68 | -------------------------------------------------------------------------------- /gpu/impl/IVFFlatScan.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../GpuIndicesOptions.h" 14 | #include "../utils/Tensor.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | class GpuResources; 20 | 21 | void runIVFFlatScan(Tensor& queries, 22 | Tensor& listIds, 23 | thrust::device_vector& listData, 24 | thrust::device_vector& listIndices, 25 | IndicesOptions indicesOptions, 26 | thrust::device_vector& listLengths, 27 | int maxListLength, 28 | int k, 29 | bool l2Distance, 30 | bool useFloat16, 31 | // output 32 | Tensor& outDistances, 33 | // output 34 | Tensor& outIndices, 35 | GpuResources* res); 36 | 37 | } } // namespace 38 | -------------------------------------------------------------------------------- /gpu/impl/InvertedListAppend.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../GpuIndicesOptions.h" 14 | #include "../utils/Tensor.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | /// Update device-side list pointers in a batch 20 | void runUpdateListPointers(Tensor& listIds, 21 | Tensor& newListLength, 22 | Tensor& newCodePointers, 23 | Tensor& newIndexPointers, 24 | Tensor& newLambdaPointers, 25 | Tensor& newConstPointers, 26 | thrust::device_vector& listLengths, 27 | thrust::device_vector& listCodes, 28 | thrust::device_vector& listIndices, 29 | thrust::device_vector& listLambdas, 30 | thrust::device_vector& listConsts, 31 | cudaStream_t stream); 32 | 33 | /// Actually append the new codes / vector indices to the individual lists 34 | 35 | /// IVFPQ 36 | void runIVFPQInvertedListAppend(Tensor& listIds, 37 | Tensor& listOffset, 38 | Tensor& encodings, 39 | Tensor& indices, 40 | thrust::device_vector& listCodes, 41 | thrust::device_vector& listIndices, 42 | IndicesOptions indicesOptions, 43 | cudaStream_t stream); 44 | 45 | 46 | /// IVFPQ 47 | void runIVFPQInvertedListAppend(Tensor& listIds, 48 | Tensor& listOffset, 49 | Tensor& encodings, 50 | Tensor& indices, 51 | Tensor& lambdas, 52 | Tensor& consts, 53 | thrust::device_vector& listCodes, 54 | thrust::device_vector& listIndices, 55 | thrust::device_vector& listLambdas, 56 | thrust::device_vector& listConsts, 57 | IndicesOptions indicesOptions, 58 | cudaStream_t stream); 59 | 60 | /// IVF flat storage 61 | void runIVFFlatInvertedListAppend(Tensor& listIds, 62 | Tensor& listOffset, 63 | Tensor& vecs, 64 | Tensor& indices, 65 | bool useFloat16, 66 | thrust::device_vector& listData, 67 | thrust::device_vector& listIndices, 68 | IndicesOptions indicesOptions, 69 | cudaStream_t stream); 70 | 71 | } } // namespace 72 | -------------------------------------------------------------------------------- /gpu/impl/L2Norm.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 
2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../utils/Float16.cuh" 14 | #include "../utils/Tensor.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | void runL2Norm(Tensor& input, 19 | Tensor& output, 20 | bool normSquared, 21 | cudaStream_t stream); 22 | 23 | #ifdef FAISS_USE_FLOAT16 24 | void runL2Norm(Tensor& input, 25 | Tensor& output, 26 | bool normSquared, 27 | cudaStream_t stream); 28 | #endif 29 | 30 | } } // namespace 31 | -------------------------------------------------------------------------------- /gpu/impl/L2Select.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../utils/Float16.cuh" 14 | #include "../utils/Tensor.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | void runL2SelectMin(Tensor& productDistances, 19 | Tensor& centroidDistances, 20 | Tensor& outDistances, 21 | Tensor& outIndices, 22 | int k, 23 | cudaStream_t stream); 24 | 25 | #ifdef FAISS_USE_FLOAT16 26 | void runL2SelectMin(Tensor& productDistances, 27 | Tensor& centroidDistances, 28 | Tensor& outDistances, 29 | Tensor& outIndices, 30 | int k, 31 | cudaStream_t stream); 32 | #endif 33 | 34 | } } // namespace 35 | -------------------------------------------------------------------------------- /gpu/impl/PQCodeDistances.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../utils/Tensor.cuh" 14 | #include "../utils/NoTypeTensor.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | class DeviceMemory; 20 | 21 | /// pqCentroids is of the form (sub q)(sub dim)(code id) 22 | /// Calculates the distance from the (query - centroid) residual to 23 | /// each sub-code vector, for the given list of query results in 24 | /// topQueryToCentroid 25 | void runPQCodeDistances(Tensor& pqCentroids, 26 | Tensor& queries, 27 | Tensor& coarseCentroids, 28 | Tensor& topQueryToCentroid, 29 | NoTypeTensor<4, true>& outCodeDistances, 30 | bool useFloat16Lookup, 31 | cudaStream_t stream); 32 | 33 | void runPQCodeDistancesMM(Tensor& pqCentroids, 34 | Tensor& queries, 35 | Tensor& coarseCentroids, 36 | Tensor& topQueryToCentroid, 37 | NoTypeTensor<4, true>& outCodeDistances, 38 | bool useFloat16Lookup, 39 | DeviceMemory& mem, 40 | cublasHandle_t handle, 41 | cudaStream_t stream); 42 | 43 | } } // namespace 44 | -------------------------------------------------------------------------------- /gpu/impl/PQScanMultiPassNoPrecomputed.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../GpuIndicesOptions.h" 14 | #include "../utils/Tensor.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | class GpuResources; 20 | 21 | /// For no precomputed codes, is this a supported number of dimensions 22 | /// per subquantizer? 
23 | bool isSupportedNoPrecomputedSubDimSize(int dims); 24 | 25 | void runPQScanMultiPassNoPrecomputed(Tensor& queries, 26 | Tensor& centroids, 27 | Tensor& pqCentroidsInnermostCode, 28 | Tensor& topQueryToCentroid, 29 | bool useFloat16Lookup, 30 | int bytesPerCode, 31 | int numSubQuantizers, 32 | int numSubQuantizerCodes, 33 | thrust::device_vector& listCodes, 34 | thrust::device_vector& listIndices, 35 | IndicesOptions indicesOptions, 36 | thrust::device_vector& listLengths, 37 | int maxListLength, 38 | int k, 39 | // output 40 | Tensor& outDistances, 41 | // output 42 | Tensor& outIndices, 43 | GpuResources* res); 44 | 45 | } } // namespace 46 | -------------------------------------------------------------------------------- /gpu/impl/RemapIndices.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #include "RemapIndices.h" 12 | #include "../../FaissAssert.h" 13 | #include 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | // Utility function to translate (list id, offset) to a user index on 18 | // the CPU. 
In a cpp in order to use OpenMP 19 | void ivfOffsetToUserIndex( 20 | long* indices, 21 | int numLists, 22 | int queries, 23 | int k, 24 | const std::vector>& listOffsetToUserIndex) { 25 | FAISS_ASSERT(numLists == listOffsetToUserIndex.size()); 26 | 27 | #pragma omp parallel for 28 | for (int q = 0; q < queries; ++q) { 29 | for (int r = 0; r < k; ++r) { 30 | long offsetIndex = indices[q * k + r]; 31 | 32 | if (offsetIndex < 0) continue; 33 | 34 | int listId = (int) (offsetIndex >> 32); 35 | int listOffset = (int) (offsetIndex & 0xffffffff); 36 | 37 | FAISS_ASSERT(listId < numLists); 38 | auto& listIndices = listOffsetToUserIndex[listId]; 39 | //if(listOffset >= listIndices.size()) 40 | // std::cout << "-----listId: " << listId << " listOffset: " << listOffset<< " length: " << listIndices.size()< 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | /// Utility function to translate (list id, offset) to a user index on 18 | /// the CPU. In a cpp in order to use OpenMP. 19 | void ivfOffsetToUserIndex( 20 | long* indices, 21 | int numLists, 22 | int queries, 23 | int k, 24 | const std::vector>& listOffsetToUserIndex); 25 | 26 | } } // namespace 27 | -------------------------------------------------------------------------------- /gpu/impl/VectorResidual.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "VectorResidual.cuh" 11 | #include "../../FaissAssert.h" 12 | #include "../utils/ConversionOperators.cuh" 13 | #include "../utils/DeviceUtils.h" 14 | #include "../utils/Tensor.cuh" 15 | #include "../utils/StaticUtils.h" 16 | #include // in CUDA SDK, for CUDART_NAN_F 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | template 21 | __global__ void calcResidual(Tensor vecs, 22 | Tensor centroids, 23 | Tensor vecToCentroid, 24 | Tensor residuals) { 25 | auto vec = vecs[blockIdx.x]; 26 | auto residual = residuals[blockIdx.x]; 27 | 28 | int centroidId = vecToCentroid[blockIdx.x]; 29 | // Vector could be invalid (containing NaNs), so -1 was the 30 | // classified centroid 31 | if (centroidId == -1) { 32 | if (LargeDim) { 33 | for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { 34 | residual[i] = CUDART_NAN_F; 35 | } 36 | } else { 37 | residual[threadIdx.x] = CUDART_NAN_F; 38 | } 39 | 40 | return; 41 | } 42 | 43 | auto centroid = centroids[centroidId]; 44 | 45 | if (LargeDim) { 46 | for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { 47 | residual[i] = vec[i] - ConvertTo::to(centroid[i]); 48 | } 49 | } else { 50 | residual[threadIdx.x] = vec[threadIdx.x] - 51 | ConvertTo::to(centroid[threadIdx.x]); 52 | } 53 | } 54 | 55 | template 56 | void calcResidual(Tensor& vecs, 57 | Tensor& centroids, 58 | Tensor& vecToCentroid, 59 | Tensor& residuals, 60 | cudaStream_t stream) { 61 | FAISS_ASSERT(vecs.getSize(1) == centroids.getSize(1)); 62 | FAISS_ASSERT(vecs.getSize(1) == residuals.getSize(1)); 63 | FAISS_ASSERT(vecs.getSize(0) == vecToCentroid.getSize(0)); 64 | FAISS_ASSERT(vecs.getSize(0) == residuals.getSize(0)); 65 | 66 | dim3 grid(vecs.getSize(0)); 67 | 68 | int maxThreads = getMaxThreadsCurrentDevice(); 69 | bool largeDim = vecs.getSize(1) > maxThreads; 70 | dim3 block(std::min(vecs.getSize(1), maxThreads)); 71 | 72 | if (largeDim) { 73 | calcResidual<<>>( 74 | vecs, centroids, vecToCentroid, residuals); 75 | } else { 76 | 
calcResidual<<>>( 77 | vecs, centroids, vecToCentroid, residuals); 78 | } 79 | 80 | CUDA_TEST_ERROR(); 81 | } 82 | 83 | void runCalcResidual(Tensor& vecs, 84 | Tensor& centroids, 85 | Tensor& vecToCentroid, 86 | Tensor& residuals, 87 | cudaStream_t stream) { 88 | calcResidual(vecs, centroids, vecToCentroid, residuals, stream); 89 | } 90 | 91 | #ifdef FAISS_USE_FLOAT16 92 | void runCalcResidual(Tensor& vecs, 93 | Tensor& centroids, 94 | Tensor& vecToCentroid, 95 | Tensor& residuals, 96 | cudaStream_t stream) { 97 | calcResidual(vecs, centroids, vecToCentroid, residuals, stream); 98 | } 99 | #endif 100 | 101 | } } // namespace 102 | -------------------------------------------------------------------------------- /gpu/impl/VectorResidual.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../utils/Tensor.cuh" 14 | #include "../utils/Float16.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | // Calculates residual v_i - c_j for all v_i in vecs where j = vecToCentroid[i] 19 | void runCalcResidual(Tensor& vecs, 20 | Tensor& centroids, 21 | Tensor& vecToCentroid, 22 | Tensor& residuals, 23 | cudaStream_t stream); 24 | 25 | #ifdef FAISS_USE_FLOAT16 26 | void runCalcResidual(Tensor& vecs, 27 | Tensor& centroids, 28 | Tensor& vecToCentroid, 29 | Tensor& residuals, 30 | cudaStream_t stream); 31 | #endif 32 | 33 | } } // namespace 34 | -------------------------------------------------------------------------------- /gpu/perf/IndexWrapper-inl.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 
3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #include "../../FaissAssert.h" 12 | 13 | namespace faiss { namespace gpu { 14 | 15 | template 16 | IndexWrapper::IndexWrapper( 17 | int numGpus, 18 | std::function(GpuResources*, int)> init) { 19 | FAISS_ASSERT(numGpus <= faiss::gpu::getNumDevices()); 20 | for (int i = 0; i < numGpus; ++i) { 21 | auto res = std::unique_ptr( 22 | new StandardGpuResources); 23 | 24 | subIndex.emplace_back(init(res.get(), i)); 25 | resources.emplace_back(std::move(res)); 26 | } 27 | 28 | if (numGpus > 1) { 29 | // create proxy 30 | proxyIndex = 31 | std::unique_ptr(new faiss::gpu::IndexProxy); 32 | 33 | for (auto& index : subIndex) { 34 | proxyIndex->addIndex(index.get()); 35 | } 36 | } 37 | } 38 | 39 | template 40 | faiss::Index* 41 | IndexWrapper::getIndex() { 42 | if ((bool) proxyIndex) { 43 | return proxyIndex.get(); 44 | } else { 45 | FAISS_ASSERT(!subIndex.empty()); 46 | return subIndex.front().get(); 47 | } 48 | } 49 | 50 | template 51 | void 52 | IndexWrapper::runOnIndices(std::function f) { 53 | 54 | if ((bool) proxyIndex) { 55 | proxyIndex->runOnIndex( 56 | [f](faiss::Index* index) { 57 | f(dynamic_cast(index)); 58 | }); 59 | } else { 60 | FAISS_ASSERT(!subIndex.empty()); 61 | f(subIndex.front().get()); 62 | } 63 | } 64 | 65 | template 66 | void 67 | IndexWrapper::setNumProbes(int nprobe) { 68 | runOnIndices([nprobe](GpuIndex* index) { 69 | index->setNumProbes(nprobe); 70 | }); 71 | } 72 | 73 | } } 74 | -------------------------------------------------------------------------------- /gpu/perf/IndexWrapper.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../IndexProxy.h" 14 | #include "../StandardGpuResources.h" 15 | #include 16 | #include 17 | #include 18 | 19 | namespace faiss { namespace gpu { 20 | 21 | // If we want to run multi-GPU, create a proxy to wrap the indices. 22 | // If we don't want multi-GPU, don't involve the proxy, so it doesn't 23 | // affect the timings. 24 | template 25 | struct IndexWrapper { 26 | std::vector> resources; 27 | std::vector> subIndex; 28 | std::unique_ptr proxyIndex; 29 | 30 | IndexWrapper( 31 | int numGpus, 32 | std::function(GpuResources*, int)> init); 33 | faiss::Index* getIndex(); 34 | 35 | void runOnIndices(std::function f); 36 | void setNumProbes(int nprobe); 37 | }; 38 | 39 | } } 40 | 41 | #include "IndexWrapper-inl.h" 42 | -------------------------------------------------------------------------------- /gpu/perf/PerfSelect.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "../utils/DeviceUtils.h" 12 | #include "../utils/BlockSelectKernel.cuh" 13 | #include "../utils/WarpSelectKernel.cuh" 14 | #include "../utils/HostTensor.cuh" 15 | #include "../utils/DeviceTensor.cuh" 16 | #include "../test/TestUtils.h" 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | DEFINE_int32(rows, 10000, "rows in matrix"); 25 | DEFINE_int32(cols, 40000, "cols in matrix"); 26 | DEFINE_int32(k, 100, "k"); 27 | DEFINE_bool(dir, false, "direction of sort"); 28 | DEFINE_bool(warp, false, "warp select"); 29 | DEFINE_int32(iter, 5, "iterations to run"); 30 | DEFINE_bool(k_powers, false, "test k powers of 2 from 1 -> 1024"); 31 | 32 | int main(int argc, char** argv) { 33 | gflags::ParseCommandLineFlags(&argc, &argv, true); 34 | 35 | std::vector v = faiss::gpu::randVecs(FLAGS_rows, FLAGS_cols); 36 | faiss::gpu::HostTensor hostVal({FLAGS_rows, FLAGS_cols}); 37 | 38 | for (int r = 0; r < FLAGS_rows; ++r) { 39 | for (int c = 0; c < FLAGS_cols; ++c) { 40 | hostVal[r][c] = v[r * FLAGS_cols + c]; 41 | } 42 | } 43 | 44 | // Select top-k on GPU 45 | faiss::gpu::DeviceTensor gpuVal(hostVal, 0); 46 | 47 | // enough space for any k 48 | faiss::gpu::DeviceTensor gpuOutVal({FLAGS_rows, 1024}); 49 | faiss::gpu::DeviceTensor gpuOutInd({FLAGS_rows, 1024}); 50 | 51 | int startK = FLAGS_k; 52 | int limitK = FLAGS_k; 53 | 54 | if (FLAGS_k_powers) { 55 | startK = 1; 56 | limitK = 1024; 57 | } 58 | 59 | for (int k = startK; k <= limitK; k *= 2) { 60 | for (int i = 0; i < FLAGS_iter; ++i) { 61 | if (FLAGS_warp) { 62 | faiss::gpu::runWarpSelect(gpuVal, gpuOutVal, gpuOutInd, 63 | FLAGS_dir, k, 0); 64 | } else { 65 | faiss::gpu::runBlockSelect(gpuVal, gpuOutVal, gpuOutInd, 66 | FLAGS_dir, k, 0); 67 | } 68 | } 69 | } 70 | 71 | cudaDeviceSynchronize(); 72 | } 73 | -------------------------------------------------------------------------------- /gpu/perf/WriteIndex.cpp: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #include "../../IndexIVF.h" 12 | #include "../../IndexIVFPQ.h" 13 | #include "../../IndexFlat.h" 14 | #include "../../index_io.h" 15 | #include "../test/TestUtils.h" 16 | #include 17 | #include 18 | 19 | // For IVFPQ: 20 | DEFINE_bool(ivfpq, false, "use IVFPQ encoding"); 21 | DEFINE_int32(codes, 4, "number of PQ codes per vector"); 22 | DEFINE_int32(bits_per_code, 8, "number of bits per PQ code"); 23 | 24 | // For IVFFlat: 25 | DEFINE_bool(l2, true, "use L2 metric (versus IP metric)"); 26 | DEFINE_bool(ivfflat, false, "use IVF flat encoding"); 27 | 28 | // For both: 29 | DEFINE_string(out, "/home/jhj/local/index.out", "index file for output"); 30 | DEFINE_int32(dim, 128, "vector dimension"); 31 | DEFINE_int32(num_coarse, 100, "number of coarse centroids"); 32 | DEFINE_int32(num, 100000, "total database size"); 33 | DEFINE_int32(num_train, -1, "number of database vecs to train on"); 34 | 35 | template 36 | void fillAndSave(T& index, int numTrain, int num, int dim) { 37 | auto trainVecs = faiss::gpu::randVecs(numTrain, dim); 38 | index.train(numTrain, trainVecs.data()); 39 | 40 | constexpr int kAddChunk = 1000000; 41 | 42 | for (int i = 0; i < num; i += kAddChunk) { 43 | int numRemaining = (num - i) < kAddChunk ? 
/// Builds a random IVFPQ or IVFFlat index on the CPU and writes it to disk.
/// Exactly one of --ivfpq / --ivfflat must be given; dataset contents are
/// random vectors from faiss::gpu::randVecs.
int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  // Either ivfpq or ivfflat must be set, but not both (XOR of the two flags).
  if ((FLAGS_ivfpq && FLAGS_ivfflat) ||
      (!FLAGS_ivfpq && !FLAGS_ivfflat)) {
    printf("must specify either ivfpq or ivfflat\n");
    return 1;
  }

  auto dim = FLAGS_dim;
  auto numCentroids = FLAGS_num_coarse;
  auto num = FLAGS_num;
  auto numTrain = FLAGS_num_train;
  // Default training size: a quarter of the database (at least 1 vector),
  // and never more vectors than the database itself contains.
  numTrain = numTrain == -1 ? std::max((num / 4), 1) : numTrain;
  numTrain = std::min(num, numTrain);

  if (FLAGS_ivfpq) {
    // IVFPQ: coarse L2 quantizer + product-quantized residual codes.
    // NOTE(review): quantizer is stack-allocated and must outlive `index`,
    // which holds a pointer to it — true here since both die at scope exit.
    faiss::IndexFlatL2 quantizer(dim);
    faiss::IndexIVFPQ index(&quantizer, dim, numCentroids,
                            FLAGS_codes, FLAGS_bits_per_code);
    index.verbose = true;

    printf("IVFPQ: codes %d bits per code %d\n",
           FLAGS_codes, FLAGS_bits_per_code);
    printf("Lists: %d\n", numCentroids);
    printf("Database: dim %d num vecs %d trained on %d\n", dim, num, numTrain);
    printf("output file: %s\n", FLAGS_out.c_str());

    fillAndSave(index, numTrain, num, dim);
  } else if (FLAGS_ivfflat) {
    // IVFFlat: pick the coarse quantizer matching the requested metric.
    faiss::IndexFlatL2 quantizerL2(dim);
    faiss::IndexFlatIP quantizerIP(dim);

    faiss::IndexFlat* quantizer = FLAGS_l2 ?
      (faiss::IndexFlat*) &quantizerL2 :
      (faiss::IndexFlat*) &quantizerIP;

    faiss::IndexIVFFlat index(quantizer, dim, numCentroids,
                              FLAGS_l2 ? faiss::METRIC_L2 :
                              faiss::METRIC_INNER_PRODUCT);

    printf("IVFFlat: metric %s\n", FLAGS_l2 ? "L2" : "IP");
    printf("Lists: %d\n", numCentroids);
    printf("Database: dim %d num vecs %d trained on %d\n", dim, num, numTrain);
    printf("output file: %s\n", FLAGS_out.c_str());

    fillAndSave(index, numTrain, num, dim);
  }

  return 0;
}
53 | ${CMAKE_CURRENT_SOURCE_DIR}/transform_deep1b.cpp) 54 | 55 | # gtest 56 | find_package(GTest REQUIRED) 57 | include_directories(${GTEST_INCLUDE_DIRS}) 58 | foreach(source ${srcs}) 59 | get_filename_component(name ${source} NAME_WE) 60 | add_executable(${name} ${source}) 61 | target_link_libraries(${name} ${faiss_lib_gpu} ${faiss_lib} ${CUDA_LINKER_LIBS} ${MPICH_CXX_LIBRARIES}) 62 | endforeach(source) 63 | 64 | # CUDA_ADD_EXECUTABLE(TestGpuSelect ${CMAKE_CURRENT_SOURCE_DIR}/TestGpuSelect.cu ${CMAKE_CURRENT_SOURCE_DIR}/TestUtils.cpp) 65 | # target_link_libraries(TestGpuSelect ${faiss_lib_gpu} ${faiss_lib} ${CUDA_LINKER_LIBS} ${GTEST_BOTH_LIBRARIES}) 66 | 67 | 68 | -------------------------------------------------------------------------------- /gpu/test/CMakeLists.txt.bak: -------------------------------------------------------------------------------- 1 | #list(APPEND srcs 2 | # ${CMAKE_CURRENT_SOURCE_DIR}/demo_ivfpq_indexing_gpu.cpp) 3 | 4 | list(APPEND srcs 5 | ${CMAKE_CURRENT_SOURCE_DIR}/tool_createdb.cpp) 6 | 7 | list(APPEND srcs 8 | ${CMAKE_CURRENT_SOURCE_DIR}/tool_query.cpp) 9 | 10 | list(APPEND srcs 11 | ${CMAKE_CURRENT_SOURCE_DIR}/tool_query1.cpp) 12 | 13 | 14 | list(APPEND srcs 15 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b_createdb.cpp) 16 | list(APPEND srcs 17 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b16_createdb.cpp) 18 | 19 | 20 | list(APPEND srcs 21 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b_query.cpp) 22 | list(APPEND srcs 23 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b16_query.cpp) 24 | list(APPEND srcs 25 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b_query1.cpp) 26 | list(APPEND srcs 27 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b_createdb_hnsw.cpp) 28 | list(APPEND srcs 29 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b_createdbgt.cpp) 30 | 31 | 32 | list(APPEND srcs 33 | ${CMAKE_CURRENT_SOURCE_DIR}/deep1b_createdb.cpp) 34 | list(APPEND srcs 35 | ${CMAKE_CURRENT_SOURCE_DIR}/deep1b16_createdb.cpp) 36 | list(APPEND srcs 37 | ${CMAKE_CURRENT_SOURCE_DIR}/deep1b_query.cpp) 38 | list(APPEND srcs 39 | 
${CMAKE_CURRENT_SOURCE_DIR}/deep1b16_query.cpp) 40 | list(APPEND srcs 41 | ${CMAKE_CURRENT_SOURCE_DIR}/deep1b_query1.cpp) 42 | 43 | list(APPEND srcs 44 | ${CMAKE_CURRENT_SOURCE_DIR}/deep1b_createdb_hnsw.cpp) 45 | 46 | 47 | 48 | list(APPEND srcs 49 | ${CMAKE_CURRENT_SOURCE_DIR}/transform_sift1b.cpp) 50 | 51 | list(APPEND srcs 52 | ${CMAKE_CURRENT_SOURCE_DIR}/transform_deep1b.cpp) 53 | 54 | # gtest 55 | find_package(GTest REQUIRED) 56 | include_directories(${GTEST_INCLUDE_DIRS}) 57 | foreach(source ${srcs}) 58 | get_filename_component(name ${source} NAME_WE) 59 | add_executable(${name} ${source}) 60 | target_link_libraries(${name} ${faiss_lib_gpu} ${faiss_lib} ${CUDA_LINKER_LIBS} ${MPICH_CXX_LIBRARIES}) 61 | endforeach(source) 62 | 63 | # CUDA_ADD_EXECUTABLE(TestGpuSelect ${CMAKE_CURRENT_SOURCE_DIR}/TestGpuSelect.cu ${CMAKE_CURRENT_SOURCE_DIR}/TestUtils.cpp) 64 | # target_link_libraries(TestGpuSelect ${faiss_lib_gpu} ${faiss_lib} ${CUDA_LINKER_LIBS} ${GTEST_BOTH_LIBRARIES}) 65 | 66 | 67 | -------------------------------------------------------------------------------- /gpu/test/TestUtils.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../../FaissAssert.h" 14 | #include "../../Index.h" 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace faiss { namespace gpu { 21 | 22 | /// Generates and displays a new seed for the test 23 | void newTestSeed(); 24 | 25 | /// Uses an explicit seed for the test 26 | void setTestSeed(long seed); 27 | 28 | /// Returns the relative error in difference between a and b 29 | /// (|a - b| / (0.5 * (|a| + |b|)) 30 | float relativeError(float a, float b); 31 | 32 | /// Generates a random integer in the range [a, b] 33 | int randVal(int a, int b); 34 | 35 | /// Generates a random bool 36 | bool randBool(); 37 | 38 | /// Select a random value from the given list of values provided as an 39 | /// initializer_list 40 | template 41 | T randSelect(std::initializer_list vals) { 42 | FAISS_ASSERT(vals.size() > 0); 43 | int sel = randVal(0, vals.size()); 44 | 45 | int i = 0; 46 | for (auto v : vals) { 47 | if (i++ == sel) { 48 | return v; 49 | } 50 | } 51 | 52 | // should not get here 53 | return *vals.begin(); 54 | } 55 | 56 | /// Generates a collection of random vectors in the range [0, 1] 57 | std::vector randVecs(size_t num, size_t dim); 58 | 59 | /// Compare two indices via query for similarity 60 | void compareIndices(faiss::Index& refIndex, 61 | faiss::Index& testIndex, 62 | int numQuery, int dim, int k, 63 | const std::string& configMsg, 64 | float maxRelativeError = 6e-5f, 65 | float pctMaxDiff1 = 0.1f, 66 | float pctMaxDiffN = 0.005f); 67 | 68 | /// Display specific differences in the two (distance, index) lists 69 | void compareLists(const float* refDist, 70 | const faiss::Index::idx_t* refInd, 71 | const float* testDist, 72 | const faiss::Index::idx_t* testInd, 73 | int dim1, int dim2, 74 | const std::string& configMsg, 75 | bool printBasicStats, bool printDiffs, bool assertOnErr, 76 | float maxRelativeError = 6e-5f, 77 | float pctMaxDiff1 = 0.1f, 78 | float pctMaxDiffN = 0.005f); 79 | 80 | } } 81 | 
-------------------------------------------------------------------------------- /gpu/test/deep1b_query.cpp.bak: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/deep1b_query.cpp.bak -------------------------------------------------------------------------------- /gpu/test/deep1b_query1.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/deep1b_query1.cpp -------------------------------------------------------------------------------- /gpu/test/deep1b_queryd.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/deep1b_queryd.cpp -------------------------------------------------------------------------------- /gpu/test/sift1b_query.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/sift1b_query.cpp -------------------------------------------------------------------------------- /gpu/test/sift1b_query1.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/sift1b_query1.cpp -------------------------------------------------------------------------------- /gpu/test/sift1b_queryd.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/sift1b_queryd.cpp 
-------------------------------------------------------------------------------- /gpu/test/test_gpu_index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the CC-by-NC license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | #! /usr/bin/env python2 8 | 9 | import libfb.py.mkl # noqa 10 | 11 | import numpy as np 12 | 13 | from libfb import testutil 14 | 15 | import faiss 16 | 17 | 18 | class EvalIVFPQAccuracy(testutil.BaseFacebookTestCase): 19 | 20 | def get_dataset(self): 21 | d = 128 22 | nb = 100000 23 | nt = 15000 24 | nq = 2000 25 | np.random.seed(123) 26 | 27 | # generate points in a low-dim subspace to make the resutls 28 | # look better :-) 29 | d1 = 16 30 | q, r = np.linalg.qr(np.random.randn(d, d)) 31 | qc = q[:d1, :] 32 | def make_mat(n): 33 | return np.dot( 34 | np.random.random(size=(nb, d1)), qc).astype('float32') 35 | 36 | return (make_mat(nt), make_mat(nb), make_mat(nq)) 37 | 38 | def test_IndexIVFPQ(self): 39 | (xt, xb, xq) = self.get_dataset() 40 | d = xt.shape[1] 41 | 42 | dev_no = 0 43 | usePrecomputed = True 44 | 45 | res = faiss.StandardGpuResources() 46 | 47 | flat_config = faiss.GpuIndexFlatConfig() 48 | flat_config.device = dev_no 49 | 50 | gt_index = faiss.GpuIndexFlatL2(res, d, flat_config) 51 | gt_index.add(xb) 52 | D, gt_nns = gt_index.search(xq, 1) 53 | 54 | coarse_quantizer = faiss.IndexFlatL2(d) 55 | ncentroids = int(np.sqrt(xb.shape[0])) * 4 56 | 57 | index = faiss.IndexIVFPQ(coarse_quantizer, d, ncentroids, 32, 8) 58 | # add implemented on GPU but not train 59 | index.train(xt) 60 | 61 | ivfpq_config = faiss.GpuIndexIVFPQConfig() 62 | ivfpq_config.device = dev_no 63 | ivfpq_config.usePrecomputedTables = usePrecomputed 64 | 65 | gpuIndex = faiss.GpuIndexIVFPQ(res, index, ivfpq_config) 66 | gpuIndex.setNumProbes(64) 67 | index.add(xb) 68 | 69 | 
D, nns = index.search(xq, 10) 70 | n_ok = (nns == gt_nns).sum() 71 | nq = xq.shape[0] 72 | print ncentroids, n_ok, nq 73 | 74 | self.assertGreater(n_ok, nq * 0.2) 75 | 76 | def test_mm(self): 77 | # trouble with MKL+fbmake that appears only at runtime. Check it here 78 | x = np.random.random(size=(100, 20)).astype('float32') 79 | mat = faiss.PCAMatrix(20, 10) 80 | mat.train(x) 81 | mat.apply_py(x) 82 | -------------------------------------------------------------------------------- /gpu/utils/BlockSelectFloat.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "blockselect/BlockSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | // warp Q to thread Q: 15 | // 1, 1 16 | // 32, 2 17 | // 64, 3 18 | // 128, 3 19 | // 256, 4 20 | // 512, 8 21 | // 1024, 8 22 | 23 | BLOCK_SELECT_DECL(float, true, 1); 24 | BLOCK_SELECT_DECL(float, true, 32); 25 | BLOCK_SELECT_DECL(float, true, 64); 26 | BLOCK_SELECT_DECL(float, true, 128); 27 | BLOCK_SELECT_DECL(float, true, 256); 28 | BLOCK_SELECT_DECL(float, true, 512); 29 | BLOCK_SELECT_DECL(float, true, 1024); 30 | 31 | BLOCK_SELECT_DECL(float, false, 1); 32 | BLOCK_SELECT_DECL(float, false, 32); 33 | BLOCK_SELECT_DECL(float, false, 64); 34 | BLOCK_SELECT_DECL(float, false, 128); 35 | BLOCK_SELECT_DECL(float, false, 256); 36 | BLOCK_SELECT_DECL(float, false, 512); 37 | BLOCK_SELECT_DECL(float, false, 1024); 38 | 39 | void runBlockSelect(Tensor& in, 40 | Tensor& outK, 41 | Tensor& outV, 42 | bool dir, int k, cudaStream_t stream) { 43 | FAISS_ASSERT(k <= 1024); 44 | 45 | if (dir) { 46 | if (k == 1) { 47 | BLOCK_SELECT_CALL(float, true, 1); 48 | } else if (k <= 32) { 49 | 
BLOCK_SELECT_CALL(float, true, 32); 50 | } else if (k <= 64) { 51 | BLOCK_SELECT_CALL(float, true, 64); 52 | } else if (k <= 128) { 53 | BLOCK_SELECT_CALL(float, true, 128); 54 | } else if (k <= 256) { 55 | BLOCK_SELECT_CALL(float, true, 256); 56 | } else if (k <= 512) { 57 | BLOCK_SELECT_CALL(float, true, 512); 58 | } else if (k <= 1024) { 59 | BLOCK_SELECT_CALL(float, true, 1024); 60 | } 61 | } else { 62 | if (k == 1) { 63 | BLOCK_SELECT_CALL(float, false, 1); 64 | } else if (k <= 32) { 65 | BLOCK_SELECT_CALL(float, false, 32); 66 | } else if (k <= 64) { 67 | BLOCK_SELECT_CALL(float, false, 64); 68 | } else if (k <= 128) { 69 | BLOCK_SELECT_CALL(float, false, 128); 70 | } else if (k <= 256) { 71 | BLOCK_SELECT_CALL(float, false, 256); 72 | } else if (k <= 512) { 73 | BLOCK_SELECT_CALL(float, false, 512); 74 | } else if (k <= 1024) { 75 | BLOCK_SELECT_CALL(float, false, 1024); 76 | } 77 | } 78 | } 79 | 80 | } } // namespace 81 | -------------------------------------------------------------------------------- /gpu/utils/BlockSelectHalf.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "blockselect/BlockSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | 16 | // warp Q to thread Q: 17 | // 1, 1 18 | // 32, 2 19 | // 64, 3 20 | // 128, 3 21 | // 256, 4 22 | // 512, 8 23 | // 1024, 8 24 | 25 | BLOCK_SELECT_DECL(half, true, 1); 26 | BLOCK_SELECT_DECL(half, true, 32); 27 | BLOCK_SELECT_DECL(half, true, 64); 28 | BLOCK_SELECT_DECL(half, true, 128); 29 | BLOCK_SELECT_DECL(half, true, 256); 30 | BLOCK_SELECT_DECL(half, true, 512); 31 | BLOCK_SELECT_DECL(half, true, 1024); 32 | 33 | BLOCK_SELECT_DECL(half, false, 1); 34 | BLOCK_SELECT_DECL(half, false, 32); 35 | BLOCK_SELECT_DECL(half, false, 64); 36 | BLOCK_SELECT_DECL(half, false, 128); 37 | BLOCK_SELECT_DECL(half, false, 256); 38 | BLOCK_SELECT_DECL(half, false, 512); 39 | BLOCK_SELECT_DECL(half, false, 1024); 40 | 41 | void runBlockSelect(Tensor& in, 42 | Tensor& outK, 43 | Tensor& outV, 44 | bool dir, int k, cudaStream_t stream) { 45 | FAISS_ASSERT(k <= 1024); 46 | 47 | if (dir) { 48 | if (k == 1) { 49 | BLOCK_SELECT_CALL(half, true, 1); 50 | } else if (k <= 32) { 51 | BLOCK_SELECT_CALL(half, true, 32); 52 | } else if (k <= 64) { 53 | BLOCK_SELECT_CALL(half, true, 64); 54 | } else if (k <= 128) { 55 | BLOCK_SELECT_CALL(half, true, 128); 56 | } else if (k <= 256) { 57 | BLOCK_SELECT_CALL(half, true, 256); 58 | } else if (k <= 512) { 59 | BLOCK_SELECT_CALL(half, true, 512); 60 | } else if (k <= 1024) { 61 | BLOCK_SELECT_CALL(half, true, 1024); 62 | } 63 | } else { 64 | if (k == 1) { 65 | BLOCK_SELECT_CALL(half, false, 1); 66 | } else if (k <= 32) { 67 | BLOCK_SELECT_CALL(half, false, 32); 68 | } else if (k <= 64) { 69 | BLOCK_SELECT_CALL(half, false, 64); 70 | } else if (k <= 128) { 71 | BLOCK_SELECT_CALL(half, false, 128); 72 | } else if (k <= 256) { 73 | BLOCK_SELECT_CALL(half, false, 256); 74 | } else if (k <= 512) { 75 | BLOCK_SELECT_CALL(half, false, 512); 76 | } else if (k <= 1024) { 77 | BLOCK_SELECT_CALL(half, false, 1024); 78 | } 
79 | } 80 | } 81 | 82 | #endif 83 | 84 | } } // namespace 85 | -------------------------------------------------------------------------------- /gpu/utils/BlockSelectKernel.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #pragma once 11 | 12 | #include "Float16.cuh" 13 | #include "Select.cuh" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | template 23 | __global__ void blockSelect(Tensor in, 24 | Tensor outK, 25 | Tensor outV, 26 | K initK, 27 | IndexType initV, 28 | int k) { 29 | constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; 30 | 31 | __shared__ K smemK[kNumWarps * NumWarpQ]; 32 | __shared__ IndexType smemV[kNumWarps * NumWarpQ]; 33 | 34 | BlockSelect, 35 | NumWarpQ, NumThreadQ, ThreadsPerBlock> 36 | heap(initK, initV, smemK, smemV, k); 37 | 38 | // Grid is exactly sized to rows available 39 | int row = blockIdx.x; 40 | 41 | int i = threadIdx.x; 42 | K* inStart = in[row][i].data(); 43 | 44 | // Whole warps must participate in the selection 45 | int limit = utils::roundDown(in.getSize(1), kWarpSize); 46 | 47 | for (; i < limit; i += ThreadsPerBlock) { 48 | heap.add(*inStart, (IndexType) i); 49 | inStart += ThreadsPerBlock; 50 | } 51 | 52 | // Handle last remainder fraction of a warp of elements 53 | if (i < in.getSize(1)) { 54 | heap.addThreadQ(*inStart, (IndexType) i); 55 | } 56 | 57 | heap.reduce(); 58 | 59 | for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) { 60 | outK[row][i] = smemK[i]; 61 | outV[row][i] = smemV[i]; 62 | } 63 | } 64 | 65 | void runBlockSelect(Tensor& in, 66 | Tensor& outKeys, 67 | Tensor& outIndices, 68 | bool dir, int k, cudaStream_t stream); 69 | 70 | #ifdef FAISS_USE_FLOAT16 71 | void 
runBlockSelect(Tensor& in, 72 | Tensor& outKeys, 73 | Tensor& outIndices, 74 | bool dir, int k, cudaStream_t stream); 75 | #endif 76 | 77 | } } // namespace 78 | -------------------------------------------------------------------------------- /gpu/utils/Comparators.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include "Float16.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | template 19 | struct Comparator { 20 | __device__ static inline bool lt(T a, T b) { 21 | return a < b; 22 | } 23 | 24 | __device__ static inline bool gt(T a, T b) { 25 | return a > b; 26 | } 27 | }; 28 | 29 | #ifdef FAISS_USE_FLOAT16 30 | 31 | template <> 32 | struct Comparator { 33 | __device__ static inline bool lt(half a, half b) { 34 | #if FAISS_USE_FULL_FLOAT16 35 | return __hlt(a, b); 36 | #else 37 | return __half2float(a) < __half2float(b); 38 | #endif // FAISS_USE_FULL_FLOAT16 39 | } 40 | 41 | __device__ static inline bool gt(half a, half b) { 42 | #if FAISS_USE_FULL_FLOAT16 43 | return __hgt(a, b); 44 | #else 45 | return __half2float(a) > __half2float(b); 46 | #endif // FAISS_USE_FULL_FLOAT16 47 | } 48 | }; 49 | 50 | #endif // FAISS_USE_FLOAT16 51 | 52 | } } // namespace 53 | -------------------------------------------------------------------------------- /gpu/utils/ConversionOperators.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include "Float16.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | // 19 | // Conversion utilities 20 | // 21 | 22 | template 23 | struct ConvertTo { 24 | }; 25 | 26 | template <> 27 | struct ConvertTo { 28 | static inline __device__ float to(float v) { return v; } 29 | #ifdef FAISS_USE_FLOAT16 30 | static inline __device__ float to(half v) { return __half2float(v); } 31 | #endif 32 | }; 33 | 34 | template <> 35 | struct ConvertTo { 36 | static inline __device__ float2 to(float2 v) { return v; } 37 | #ifdef FAISS_USE_FLOAT16 38 | static inline __device__ float2 to(half2 v) { return __half22float2(v); } 39 | #endif 40 | }; 41 | 42 | template <> 43 | struct ConvertTo { 44 | static inline __device__ float4 to(float4 v) { return v; } 45 | #ifdef FAISS_USE_FLOAT16 46 | static inline __device__ float4 to(Half4 v) { return half4ToFloat4(v); } 47 | #endif 48 | }; 49 | 50 | #ifdef FAISS_USE_FLOAT16 51 | template <> 52 | struct ConvertTo { 53 | static inline __device__ half to(float v) { return __float2half(v); } 54 | static inline __device__ half to(half v) { return v; } 55 | }; 56 | 57 | template <> 58 | struct ConvertTo { 59 | static inline __device__ half2 to(float2 v) { return __float22half2_rn(v); } 60 | static inline __device__ half2 to(half2 v) { return v; } 61 | }; 62 | 63 | template <> 64 | struct ConvertTo { 65 | static inline __device__ Half4 to(float4 v) { return float4ToHalf4(v); } 66 | static inline __device__ Half4 to(Half4 v) { return v; } 67 | }; 68 | #endif 69 | 70 | 71 | } } // namespace 72 | -------------------------------------------------------------------------------- /gpu/utils/CopyUtils.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "DeviceTensor.cuh" 14 | #include "HostTensor.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | /// Ensure the memory at `p` is either on the given device, or copy it 19 | /// to the device in a new allocation. 20 | /// If `resources` is provided, then we will perform a temporary 21 | /// memory allocation if needed. Otherwise, we will call cudaMalloc if 22 | /// needed. 23 | template 24 | DeviceTensor toDevice(GpuResources* resources, 25 | int dstDevice, 26 | T* src, 27 | cudaStream_t stream, 28 | std::initializer_list sizes) { 29 | int dev = getDeviceForAddress(src); 30 | 31 | if (dev == dstDevice) { 32 | // On device we expect 33 | return DeviceTensor(src, sizes); 34 | } else { 35 | // On different device or on host 36 | DeviceScope scope(dstDevice); 37 | 38 | Tensor oldT(src, sizes); 39 | 40 | if (resources) { 41 | DeviceTensor newT(resources->getMemoryManager(dstDevice), 42 | sizes, 43 | stream); 44 | 45 | newT.copyFrom(oldT, stream); 46 | return newT; 47 | } else { 48 | DeviceTensor newT(sizes); 49 | 50 | newT.copyFrom(oldT, stream); 51 | return newT; 52 | } 53 | } 54 | } 55 | 56 | /// Copies a device array's allocation to an address, if necessary 57 | template 58 | inline void fromDevice(T* src, T* dst, size_t num, cudaStream_t stream) { 59 | // It is possible that the array already represents memory at `p`, 60 | // in which case no copy is needed 61 | if (src == dst) { 62 | return; 63 | } 64 | 65 | int dev = getDeviceForAddress(dst); 66 | 67 | if (dev == -1) { 68 | CUDA_VERIFY(cudaMemcpyAsync(dst, 69 | src, 70 | num * sizeof(T), 71 | cudaMemcpyDeviceToHost, 72 | stream)); 73 | } else { 74 | CUDA_VERIFY(cudaMemcpyAsync(dst, 75 | src, 76 | num * sizeof(T), 77 | cudaMemcpyDeviceToDevice, 78 | 
stream)); 79 | } 80 | } 81 | 82 | /// Copies a device array's allocation to an address, if necessary 83 | template 84 | void fromDevice(Tensor& src, T* dst, cudaStream_t stream) { 85 | FAISS_ASSERT(src.isContiguous()); 86 | fromDevice(src.data(), dst, src.numElements(), stream); 87 | } 88 | 89 | } } // namespace 90 | -------------------------------------------------------------------------------- /gpu/utils/DeviceDefs.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | namespace faiss { namespace gpu { 14 | 15 | #ifdef __CUDA_ARCH__ 16 | #if __CUDA_ARCH__ <= 620 17 | constexpr int kWarpSize = 32; 18 | #else 19 | #error Unknown __CUDA_ARCH__; please define parameters for compute capability 20 | #endif // __CUDA_ARCH__ types 21 | #endif // __CUDA_ARCH__ 22 | 23 | #ifndef __CUDA_ARCH__ 24 | // dummy value for host compiler 25 | constexpr int kWarpSize = 32; 26 | #endif // !__CUDA_ARCH__ 27 | 28 | __forceinline__ __device__ void warpFence() { 29 | // Technically, memory barriers are required via the CUDA 30 | // programming model, since warp synchronous programming no longer 31 | // is guaranteed. 32 | // 33 | // There are two components to it: 34 | // -a barrier known to the compiler such that the compiler will not 35 | // schedule loads and stores across the barrier; 36 | // -a HW-level barrier that guarantees that writes are seen in the 37 | // proper order 38 | // 39 | // However, __threadfence_block() is a stronger constraint than what 40 | // we really want out of the hardware: a warp-wide barrier. 
41 | // 42 | // In current hardware, it appears that warp synchronous programming 43 | // is a reality; by all tests it appears safe and race-free. 44 | // 45 | // However, understandably it may not be in the future (based on 46 | // what Nvidia says in the Kepler guide, it may change depending 47 | // upon compiler/toolchain issues or future hardware). 48 | // 49 | // Removing the fence results in 10%+ faster performance. 50 | // However, we are judicious as to where we insert the fence, so if 51 | // this reality ever changes, uncommenting this will result in CUDA 52 | // programming model-safe ordering again. 53 | // 54 | // FIXME: we should probably qualify as volatile as well, since the 55 | // compiler could technically preserve values across loops? This 56 | // seems very impractical for the compiler to do, however. 57 | 58 | // __threadfence_block(); 59 | } 60 | 61 | } } // namespace 62 | -------------------------------------------------------------------------------- /gpu/utils/DeviceMemory.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "DeviceMemory.h" 12 | #include "DeviceUtils.h" 13 | #include "../../FaissAssert.h" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | DeviceMemoryReservation::DeviceMemoryReservation() 18 | : state_(NULL), 19 | device_(0), 20 | data_(NULL), 21 | size_(0), 22 | stream_(0) { 23 | } 24 | 25 | DeviceMemoryReservation::DeviceMemoryReservation(DeviceMemory* state, 26 | int device, 27 | void* p, 28 | size_t size, 29 | cudaStream_t stream) 30 | : state_(state), 31 | device_(device), 32 | data_(p), 33 | size_(size), 34 | stream_(stream) { 35 | } 36 | 37 | DeviceMemoryReservation::DeviceMemoryReservation( 38 | DeviceMemoryReservation&& m) noexcept { 39 | if (data_) { 40 | FAISS_ASSERT(state_); 41 | state_->returnAllocation(*this); 42 | } 43 | 44 | state_ = m.state_; 45 | device_ = m.device_; 46 | data_ = m.data_; 47 | size_ = m.size_; 48 | stream_ = m.stream_; 49 | 50 | m.data_ = NULL; 51 | } 52 | 53 | DeviceMemoryReservation::~DeviceMemoryReservation() { 54 | if (data_) { 55 | FAISS_ASSERT(state_); 56 | state_->returnAllocation(*this); 57 | } 58 | 59 | data_ = NULL; 60 | } 61 | 62 | DeviceMemoryReservation& 63 | DeviceMemoryReservation::operator=(DeviceMemoryReservation&& m) { 64 | if (data_) { 65 | FAISS_ASSERT(state_); 66 | state_->returnAllocation(*this); 67 | } 68 | 69 | state_ = m.state_; 70 | device_ = m.device_; 71 | data_ = m.data_; 72 | size_ = m.size_; 73 | stream_ = m.stream_; 74 | 75 | m.data_ = NULL; 76 | 77 | return *this; 78 | } 79 | 80 | DeviceMemory::~DeviceMemory() { 81 | } 82 | 83 | } } // namespace 84 | -------------------------------------------------------------------------------- /gpu/utils/DeviceMemory.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | class DeviceMemory; 19 | 20 | class DeviceMemoryReservation { 21 | public: 22 | DeviceMemoryReservation(); 23 | DeviceMemoryReservation(DeviceMemory* state, 24 | int device, void* p, size_t size, 25 | cudaStream_t stream); 26 | DeviceMemoryReservation(DeviceMemoryReservation&& m) noexcept; 27 | ~DeviceMemoryReservation(); 28 | 29 | DeviceMemoryReservation& operator=(DeviceMemoryReservation&& m); 30 | 31 | int device() { return device_; } 32 | void* get() { return data_; } 33 | size_t size() { return size_; } 34 | cudaStream_t stream() { return stream_; } 35 | 36 | private: 37 | DeviceMemory* state_; 38 | 39 | int device_; 40 | void* data_; 41 | size_t size_; 42 | cudaStream_t stream_; 43 | }; 44 | 45 | /// Manages temporary memory allocations on a GPU device 46 | class DeviceMemory { 47 | public: 48 | virtual ~DeviceMemory(); 49 | 50 | /// Returns the device we are managing memory for 51 | virtual int getDevice() const = 0; 52 | 53 | /// Obtains a temporary memory allocation for our device, 54 | /// whose usage is ordered with respect to the given stream. 
55 | virtual DeviceMemoryReservation getMemory(cudaStream_t stream, 56 | size_t size) = 0; 57 | 58 | /// Returns the current size available without calling cudaMalloc 59 | virtual size_t getSizeAvailable() const = 0; 60 | 61 | /// Returns a string containing our current memory manager state 62 | virtual std::string toString() const = 0; 63 | 64 | /// Returns the high-water mark of cudaMalloc allocations for our 65 | /// device 66 | virtual size_t getHighWaterCudaMalloc() const = 0; 67 | 68 | protected: 69 | friend class DeviceMemoryReservation; 70 | virtual void returnAllocation(DeviceMemoryReservation& m) = 0; 71 | }; 72 | 73 | } } // namespace 74 | -------------------------------------------------------------------------------- /gpu/utils/Float16.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "Float16.cuh" 12 | #include "nvidia/fp16_emu.cuh" 13 | #include 14 | #include 15 | 16 | #ifdef FAISS_USE_FLOAT16 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | bool getDeviceSupportsFloat16Math(int device) { 21 | const auto& prop = getDeviceProperties(device); 22 | 23 | return (prop.major >= 6 || 24 | (prop.major == 5 && prop.minor >= 3)); 25 | } 26 | 27 | struct FloatToHalf { 28 | __device__ half operator()(float v) const { return __float2half(v); } 29 | }; 30 | 31 | struct HalfToFloat { 32 | __device__ float operator()(half v) const { return __half2float(v); } 33 | }; 34 | 35 | void runConvertToFloat16(half* out, 36 | const float* in, 37 | size_t num, 38 | cudaStream_t stream) { 39 | thrust::transform(thrust::cuda::par.on(stream), 40 | in, in + num, out, FloatToHalf()); 41 | } 42 | 43 | void runConvertToFloat32(float* out, 44 | const half* in, 45 | size_t num, 46 | cudaStream_t stream) { 47 | thrust::transform(thrust::cuda::par.on(stream), 48 | in, in + num, out, HalfToFloat()); 49 | } 50 | 51 | half hostFloat2Half(float a) { 52 | half h; 53 | h.x = cpu_float2half_rn(a).x; 54 | return h; 55 | } 56 | 57 | } } // namespace 58 | 59 | #endif // FAISS_USE_FLOAT16 60 | -------------------------------------------------------------------------------- /gpu/utils/HostTensor.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "Tensor.cuh" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | template class PtrTraits = traits::DefaultPtrTraits> 22 | class HostTensor : public Tensor { 23 | public: 24 | typedef IndexT IndexType; 25 | typedef typename PtrTraits::PtrType DataPtrType; 26 | 27 | /// Default constructor 28 | __host__ HostTensor(); 29 | 30 | /// Destructor 31 | __host__ ~HostTensor(); 32 | 33 | /// Constructs a tensor of the given size, allocating memory for it 34 | /// locally 35 | __host__ HostTensor(const IndexT sizes[Dim]); 36 | __host__ HostTensor(std::initializer_list sizes); 37 | 38 | /// Constructs a tensor of the given size and stride, referencing a 39 | /// memory region we do not own 40 | __host__ HostTensor(DataPtrType data, 41 | const IndexT sizes[Dim]); 42 | __host__ HostTensor(DataPtrType data, 43 | std::initializer_list sizes); 44 | 45 | /// Constructs a tensor of the given size and stride, referencing a 46 | /// memory region we do not own 47 | __host__ HostTensor(DataPtrType data, 48 | const IndexT sizes[Dim], 49 | const IndexT strides[Dim]); 50 | 51 | /// Copies a tensor into ourselves, allocating memory for it 52 | /// locally. If the tensor is on the GPU, then we will copy it to 53 | /// ourselves wrt the given stream. 54 | __host__ HostTensor(Tensor& t, 55 | cudaStream_t stream); 56 | 57 | /// Call to zero out memory 58 | __host__ HostTensor& zero(); 59 | 60 | /// Returns the maximum difference seen between two tensors 61 | __host__ T 62 | maxDiff(const HostTensor& t) const; 63 | 64 | /// Are the two tensors exactly equal? 
65 | __host__ bool 66 | equal(const HostTensor& t) const { 67 | return (maxDiff(t) == (T) 0); 68 | } 69 | 70 | private: 71 | enum AllocState { 72 | /// This tensor itself owns the memory, which must be freed via 73 | /// cudaFree 74 | Owner, 75 | 76 | /// This tensor itself is not an owner of the memory; there is 77 | /// nothing to free 78 | NotOwner, 79 | }; 80 | 81 | AllocState state_; 82 | }; 83 | 84 | } } // namespace 85 | 86 | #include "HostTensor-inl.cuh" 87 | -------------------------------------------------------------------------------- /gpu/utils/Limits.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "Float16.cuh" 14 | #include "Pair.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | template 20 | struct Limits { 21 | }; 22 | 23 | // Unfortunately we can't use constexpr because there is no 24 | // constexpr constructor for half 25 | // FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity 26 | constexpr float kFloatMax = std::numeric_limits::max(); 27 | 28 | template <> 29 | struct Limits { 30 | static __device__ __host__ inline float getMin() { 31 | return -kFloatMax; 32 | } 33 | static __device__ __host__ inline float getMax() { 34 | return kFloatMax; 35 | } 36 | }; 37 | 38 | #ifdef FAISS_USE_FLOAT16 39 | 40 | inline __device__ __host__ half kGetHalf(unsigned short v) { 41 | half h; 42 | h.x = v; 43 | return h; 44 | } 45 | 46 | template <> 47 | struct Limits { 48 | static __device__ __host__ inline half getMin() { 49 | return kGetHalf(0xfbffU); 50 | } 51 | static __device__ __host__ inline half getMax() { 52 | return kGetHalf(0x7bffU); 53 | } 54 | }; 55 | 
56 | #endif // FAISS_USE_FLOAT16 57 | 58 | constexpr int kIntMin = std::numeric_limits::min(); 59 | constexpr int kIntMax = std::numeric_limits::max(); 60 | 61 | template <> 62 | struct Limits { 63 | static __device__ __host__ inline int getMin() { 64 | return kIntMin; 65 | } 66 | static __device__ __host__ inline int getMax() { 67 | return kIntMax; 68 | } 69 | }; 70 | 71 | template 72 | struct Limits> { 73 | static __device__ __host__ inline Pair getMin() { 74 | return Pair(Limits::getMin(), Limits::getMin()); 75 | } 76 | 77 | static __device__ __host__ inline Pair getMax() { 78 | return Pair(Limits::getMax(), Limits::getMax()); 79 | } 80 | }; 81 | 82 | } } // namespace 83 | -------------------------------------------------------------------------------- /gpu/utils/LoadStoreOperators.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "Float16.cuh" 14 | 15 | // 16 | // Templated wrappers to express load/store for different scalar and vector 17 | // types, so kernels can have the same written form but can operate 18 | // over half and float, and on vector types transparently 19 | // 20 | 21 | namespace faiss { namespace gpu { 22 | 23 | template 24 | struct LoadStore { 25 | static inline __device__ T load(void* p) { 26 | return *((T*) p); 27 | } 28 | 29 | static inline __device__ void store(void* p, const T& v) { 30 | *((T*) p) = v; 31 | } 32 | }; 33 | 34 | #ifdef FAISS_USE_FLOAT16 35 | 36 | template <> 37 | struct LoadStore { 38 | static inline __device__ Half4 load(void* p) { 39 | Half4 out; 40 | asm("ld.global.v2.u32 {%0, %1}, [%2];" : 41 | "=r"(out.a.x), "=r"(out.b.x) : "l"(p)); 42 | return out; 43 | } 44 | 45 | static inline __device__ void store(void* p, const Half4& v) { 46 | asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(v.a.x), "r"(v.b.x)); 47 | } 48 | }; 49 | 50 | template <> 51 | struct LoadStore { 52 | static inline __device__ Half8 load(void* p) { 53 | Half8 out; 54 | asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" : 55 | "=r"(out.a.a.x), "=r"(out.a.b.x), 56 | "=r"(out.b.a.x), "=r"(out.b.b.x) : "l"(p)); 57 | return out; 58 | } 59 | 60 | static inline __device__ void store(void* p, const Half8& v) { 61 | asm("st.v4.u32 [%0], {%1, %2, %3, %4};" 62 | : : "l"(p), "r"(v.a.a.x), "r"(v.a.b.x), "r"(v.b.a.x), "r"(v.b.b.x)); 63 | } 64 | }; 65 | 66 | #endif // FAISS_USE_FLOAT16 67 | 68 | } } // namespace 69 | -------------------------------------------------------------------------------- /gpu/utils/MatrixMult.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include "Float16.cuh" 15 | #include "Tensor.cuh" 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | class DeviceMemory; 20 | 21 | /// C = alpha * A * B + beta * C 22 | /// Expects row major layout, not fortran/blas column major! 23 | void runMatrixMult(Tensor& c, bool transC, 24 | Tensor& a, bool transA, 25 | Tensor& b, bool transB, 26 | float alpha, 27 | float beta, 28 | bool useHgemm, // ignored for float32 29 | cublasHandle_t handle, 30 | cudaStream_t stream); 31 | 32 | #ifdef FAISS_USE_FLOAT16 33 | /// C = alpha * A * B + beta * C 34 | /// Expects row major layout, not fortran/blas column major! 35 | void runMatrixMult(Tensor& c, bool transC, 36 | Tensor& a, bool transA, 37 | Tensor& b, bool transB, 38 | float alpha, 39 | float beta, 40 | bool useHgemm, 41 | cublasHandle_t handle, 42 | cudaStream_t stream); 43 | #endif 44 | 45 | /// C_i = alpha * A_i * B_i + beta * C_i 46 | /// where `i` is the outermost dimension, via iterated gemm 47 | /// Expects row major layout, not fortran/blas column major! 48 | void runIteratedMatrixMult(Tensor& c, bool transC, 49 | Tensor& a, bool transA, 50 | Tensor& b, bool transB, 51 | float alpha, 52 | float beta, 53 | cublasHandle_t handle, 54 | cudaStream_t stream); 55 | 56 | /// C_i = alpha * A_i * B_i + beta * C_i 57 | /// where `i` is the outermost dimension, via batched gemm 58 | /// Expects row major layout, not fortran/blas column major! 
59 | void runBatchMatrixMult(Tensor& c, bool transC, 60 | Tensor& a, bool transA, 61 | Tensor& b, bool transB, 62 | float alpha, 63 | float beta, 64 | DeviceMemory& mem, 65 | cublasHandle_t handle, 66 | cudaStream_t stream); 67 | 68 | } } // namespace 69 | -------------------------------------------------------------------------------- /gpu/utils/MemorySpace.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #include "MemorySpace.h" 12 | #include 13 | 14 | namespace faiss { namespace gpu { 15 | 16 | /// Allocates CUDA memory for a given memory space 17 | void allocMemorySpace(MemorySpace space, void** p, size_t size) { 18 | if (space == MemorySpace::Device) { 19 | FAISS_ASSERT_FMT(cudaMalloc(p, size) == cudaSuccess, 20 | "Failed to cudaMalloc %zu bytes", size); 21 | } 22 | #ifdef FAISS_UNIFIED_MEM 23 | else if (space == MemorySpace::Unified) { 24 | FAISS_ASSERT_FMT(cudaMallocManaged(p, size) == cudaSuccess, 25 | "Failed to cudaMallocManaged %zu bytes", size); 26 | } 27 | #endif 28 | else { 29 | FAISS_ASSERT_FMT(false, "Unknown MemorySpace %d", (int) space); 30 | } 31 | } 32 | 33 | } } 34 | -------------------------------------------------------------------------------- /gpu/utils/MemorySpace.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../../FaissAssert.h" 14 | #include 15 | 16 | #if CUDA_VERSION >= 8000 17 | // Whether or not we enable usage of CUDA Unified Memory 18 | #define FAISS_UNIFIED_MEM 1 19 | #endif 20 | 21 | namespace faiss { namespace gpu { 22 | 23 | enum MemorySpace { 24 | /// Managed using cudaMalloc/cudaFree 25 | Device = 1, 26 | /// Managed using cudaMallocManaged/cudaFree 27 | Unified = 2, 28 | }; 29 | 30 | /// Allocates CUDA memory for a given memory space 31 | void allocMemorySpace(MemorySpace space, void** p, size_t size); 32 | 33 | } } 34 | -------------------------------------------------------------------------------- /gpu/utils/MergeNetworkUtils.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #pragma once 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | template 15 | inline __device__ void swap(bool swap, T& x, T& y) { 16 | T tmp = x; 17 | x = swap ? y : x; 18 | y = swap ? tmp : y; 19 | } 20 | 21 | template 22 | inline __device__ void assign(bool assign, T& x, T y) { 23 | x = assign ? y : x; 24 | } 25 | 26 | } } // namespace 27 | -------------------------------------------------------------------------------- /gpu/utils/NoTypeTensor.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../../FaissAssert.h" 14 | #include "Tensor.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | template 20 | class NoTypeTensor { 21 | public: 22 | NoTypeTensor() 23 | : mem_(nullptr), 24 | typeSize_(0) { 25 | } 26 | 27 | template 28 | NoTypeTensor(Tensor& t) 29 | : mem_(t.data()), 30 | typeSize_(sizeof(T)) { 31 | for (int i = 0; i < Dim; ++i) { 32 | size_[i] = t.getSize(i); 33 | stride_[i] = t.getStride(i); 34 | } 35 | } 36 | 37 | NoTypeTensor(void* mem, int typeSize, std::initializer_list sizes) 38 | : mem_(mem), 39 | typeSize_(typeSize) { 40 | 41 | int i = 0; 42 | for (auto s : sizes) { 43 | size_[i++] = s; 44 | } 45 | 46 | stride_[Dim - 1] = (IndexT) 1; 47 | for (int j = Dim - 2; j >= 0; --j) { 48 | stride_[j] = stride_[j + 1] * size_[j + 1]; 49 | } 50 | } 51 | 52 | NoTypeTensor(void* mem, int typeSize, int sizes[Dim]) 53 | : mem_(mem), 54 | typeSize_(typeSize) { 55 | for (int i = 0; i < Dim; ++i) { 56 | size_[i] = sizes[i]; 57 | } 58 | 59 | stride_[Dim - 1] = (IndexT) 1; 60 | for (int i = Dim - 2; i >= 0; --i) { 61 | stride_[i] = stride_[i + 1] * sizes[i + 1]; 62 | } 63 | } 64 | 65 | NoTypeTensor(void* mem, int typeSize, 66 | IndexT sizes[Dim], IndexT strides[Dim]) 67 | : mem_(mem), 68 | typeSize_(typeSize) { 69 | for (int i = 0; i < Dim; ++i) { 70 | size_[i] = sizes[i]; 71 | stride_[i] = strides[i]; 72 | } 73 | } 74 | 75 | int getTypeSize() const { 76 | return typeSize_; 77 | } 78 | 79 | IndexT getSize(int dim) const { 80 | FAISS_ASSERT(dim < Dim); 81 | return size_[dim]; 82 | } 83 | 84 | IndexT getStride(int dim) const { 85 | FAISS_ASSERT(dim < Dim); 86 | return stride_[dim]; 87 | } 88 | 89 | template 90 | Tensor toTensor() { 91 | FAISS_ASSERT(sizeof(T) == typeSize_); 92 | 93 | return Tensor((T*) mem_, size_, stride_); 94 | } 95 | 96 | NoTypeTensor narrowOutermost(IndexT start, IndexT size) { 97 | char* newPtr = (char*) mem_; 98 | 99 | if (start > 0) { 100 | newPtr += typeSize_ * start * 
stride_[0]; 101 | } 102 | 103 | IndexT newSize[Dim]; 104 | for (int i = 0; i < Dim; ++i) { 105 | if (i == 0) { 106 | assert(start + size <= size_[0]); 107 | newSize[i] = size; 108 | } else { 109 | newSize[i] = size_[i]; 110 | } 111 | } 112 | 113 | return NoTypeTensor( 114 | newPtr, typeSize_, newSize, stride_); 115 | } 116 | 117 | private: 118 | void* mem_; 119 | int typeSize_; 120 | IndexT size_[Dim]; 121 | IndexT stride_[Dim]; 122 | }; 123 | 124 | } } // namespace 125 | -------------------------------------------------------------------------------- /gpu/utils/Pair.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include "MathOperators.cuh" 15 | #include "WarpShuffles.cuh" 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | /// A simple pair type for CUDA device usage 20 | template 21 | struct Pair { 22 | constexpr __device__ inline Pair() { 23 | } 24 | 25 | constexpr __device__ inline Pair(K key, V value) 26 | : k(key), v(value) { 27 | } 28 | 29 | __device__ inline bool 30 | operator==(const Pair& rhs) const { 31 | return Math::eq(k, rhs.k) && Math::eq(v, rhs.v); 32 | } 33 | 34 | __device__ inline bool 35 | operator!=(const Pair& rhs) const { 36 | return !operator==(rhs); 37 | } 38 | 39 | __device__ inline bool 40 | operator<(const Pair& rhs) const { 41 | return Math::lt(k, rhs.k) || 42 | (Math::eq(k, rhs.k) && Math::lt(v, rhs.v)); 43 | } 44 | 45 | __device__ inline bool 46 | operator>(const Pair& rhs) const { 47 | return Math::gt(k, rhs.k) || 48 | (Math::eq(k, rhs.k) && Math::gt(v, rhs.v)); 49 | } 50 | 51 | K k; 52 | V v; 53 | }; 54 | 55 | template 56 | inline __device__ Pair 
shfl_up(const Pair& pair, 57 | unsigned int delta, 58 | int width = kWarpSize) { 59 | return Pair(shfl_up(pair.k, delta, width), 60 | shfl_up(pair.v, delta, width)); 61 | } 62 | 63 | template 64 | inline __device__ Pair shfl_xor(const Pair& pair, 65 | int laneMask, 66 | int width = kWarpSize) { 67 | return Pair(shfl_xor(pair.k, laneMask, width), 68 | shfl_xor(pair.v, laneMask, width)); 69 | } 70 | 71 | } } // namespace 72 | -------------------------------------------------------------------------------- /gpu/utils/PtxUtils.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | __device__ __forceinline__ 18 | unsigned int getBitfield(unsigned int val, int pos, int len) { 19 | unsigned int ret; 20 | asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len)); 21 | return ret; 22 | } 23 | 24 | __device__ __forceinline__ 25 | unsigned long getBitfield(unsigned long val, int pos, int len) { 26 | unsigned long ret; 27 | asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); 28 | return ret; 29 | } 30 | 31 | __device__ __forceinline__ 32 | unsigned int setBitfield(unsigned int val, 33 | unsigned int toInsert, int pos, int len) { 34 | unsigned int ret; 35 | asm("bfi.b32 %0, %1, %2, %3, %4;" : 36 | "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len)); 37 | return ret; 38 | } 39 | 40 | __device__ __forceinline__ int getLaneId() { 41 | int laneId; 42 | asm("mov.s32 %0, %laneid;" : "=r"(laneId) ); 43 | return laneId; 44 | } 45 | 46 | __device__ __forceinline__ unsigned getLaneMaskLt() { 47 | unsigned mask; 48 | 
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); 49 | return mask; 50 | } 51 | 52 | __device__ __forceinline__ unsigned getLaneMaskLe() { 53 | unsigned mask; 54 | asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); 55 | return mask; 56 | } 57 | 58 | __device__ __forceinline__ unsigned getLaneMaskGt() { 59 | unsigned mask; 60 | asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask)); 61 | return mask; 62 | } 63 | 64 | __device__ __forceinline__ unsigned getLaneMaskGe() { 65 | unsigned mask; 66 | asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask)); 67 | return mask; 68 | } 69 | 70 | __device__ __forceinline__ void namedBarrierWait(int name, int numThreads) { 71 | asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(numThreads) : "memory"); 72 | } 73 | 74 | __device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) { 75 | asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(numThreads) : "memory"); 76 | } 77 | 78 | // FIXME: prefetch does nothing (in SASS) on Maxwell 79 | __device__ __forceinline__ void prefetchL2(const void *p) { 80 | asm volatile("prefetch.global.L2 [%0];" : : "l"(p)); 81 | } 82 | 83 | __device__ __forceinline__ void prefetchL1(const void *p) { 84 | asm volatile("prefetch.global.L1 [%0];" : : "l"(p)); 85 | } 86 | 87 | } } // namespace 88 | -------------------------------------------------------------------------------- /gpu/utils/ReductionOperators.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include 14 | #include "Limits.cuh" 15 | #include "MathOperators.cuh" 16 | #include "Pair.cuh" 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | template 21 | struct Sum { 22 | __device__ inline T operator()(T a, T b) const { 23 | return Math::add(a, b); 24 | } 25 | 26 | inline __device__ T identity() const { 27 | return Math::zero(); 28 | } 29 | }; 30 | 31 | template 32 | struct Min { 33 | __device__ inline T operator()(T a, T b) const { 34 | return Math::lt(a, b) ? a : b; 35 | } 36 | 37 | inline __device__ T identity() const { 38 | return Limits::getMax(); 39 | } 40 | }; 41 | 42 | template 43 | struct Max { 44 | __device__ inline T operator()(T a, T b) const { 45 | return Math::gt(a, b) ? a : b; 46 | } 47 | 48 | inline __device__ T identity() const { 49 | return Limits::getMin(); 50 | } 51 | }; 52 | 53 | /// Used for producing segmented prefix scans; the value of the Pair 54 | /// denotes the start of a new segment for the scan 55 | template 56 | struct SegmentedReduce { 57 | inline __device__ SegmentedReduce(const ReduceOp& o) 58 | : op(o) { 59 | } 60 | 61 | __device__ 62 | inline Pair 63 | operator()(const Pair& a, const Pair& b) const { 64 | return Pair(b.v ? b.k : op(a.k, b.k), 65 | a.v || b.v); 66 | } 67 | 68 | inline __device__ Pair identity() const { 69 | return Pair(op.identity(), false); 70 | } 71 | 72 | ReduceOp op; 73 | }; 74 | 75 | } } // namespace 76 | -------------------------------------------------------------------------------- /gpu/utils/StackDeviceMemory.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "DeviceMemory.h" 14 | #include 15 | #include 16 | #include 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | /// Device memory manager that provides temporary memory allocations 21 | /// out of a region of memory 22 | class StackDeviceMemory : public DeviceMemory { 23 | public: 24 | /// Allocate a new region of memory that we manage 25 | explicit StackDeviceMemory(int device, size_t allocPerDevice); 26 | 27 | /// Manage a region of memory for a particular device, with or 28 | /// without ownership 29 | StackDeviceMemory(int device, void* p, size_t size, bool isOwner); 30 | 31 | ~StackDeviceMemory() override; 32 | 33 | int getDevice() const override; 34 | 35 | DeviceMemoryReservation getMemory(cudaStream_t stream, 36 | size_t size) override; 37 | 38 | size_t getSizeAvailable() const override; 39 | std::string toString() const override; 40 | size_t getHighWaterCudaMalloc() const override; 41 | 42 | protected: 43 | void returnAllocation(DeviceMemoryReservation& m) override; 44 | 45 | protected: 46 | /// Previous allocation ranges and the streams for which 47 | /// synchronization is required 48 | struct Range { 49 | inline Range(char* s, char* e, cudaStream_t str) : 50 | start_(s), end_(e), stream_(str) { 51 | } 52 | 53 | // References a memory range [start, end) 54 | char* start_; 55 | char* end_; 56 | cudaStream_t stream_; 57 | }; 58 | 59 | struct Stack { 60 | /// Constructor that allocates memory via cudaMalloc 61 | Stack(int device, size_t size); 62 | 63 | /// Constructor that references a pre-allocated region of memory 64 | Stack(int device, void* p, size_t size, bool isOwner); 65 | ~Stack(); 66 | 67 | /// Returns how much size is available for an allocation without 68 | /// calling cudaMalloc 69 | size_t getSizeAvailable() const; 70 | 71 | /// Obtains an allocation; all allocations are guaranteed to be 16 72 | /// byte aligned 73 | char* getAlloc(size_t size, cudaStream_t stream); 74 | 75 | /// Returns an allocation 
76 | void returnAlloc(char* p, size_t size, cudaStream_t stream); 77 | 78 | /// Returns the stack state 79 | std::string toString() const; 80 | 81 | /// Returns the high-water mark of cudaMalloc activity 82 | size_t getHighWaterCudaMalloc() const; 83 | 84 | /// Device this allocation is on 85 | int device_; 86 | 87 | /// Do we own our region of memory? 88 | bool isOwner_; 89 | 90 | /// Where our allocation begins and ends 91 | /// [start_, end_) is valid 92 | char* start_; 93 | char* end_; 94 | 95 | /// Total size end_ - start_ 96 | size_t size_; 97 | 98 | /// Stack head within [start, end) 99 | char* head_; 100 | 101 | /// List of previous last users of allocations on our stack, for 102 | /// possible synchronization purposes 103 | std::list lastUsers_; 104 | 105 | /// How much cudaMalloc memory is currently outstanding? 106 | size_t mallocCurrent_; 107 | 108 | /// What's the high water mark in terms of memory used from the 109 | /// temporary buffer? 110 | size_t highWaterMemoryUsed_; 111 | 112 | /// What's the high water mark in terms of memory allocated via 113 | /// cudaMalloc? 114 | size_t highWaterMalloc_; 115 | }; 116 | 117 | /// Our device 118 | int device_; 119 | 120 | /// Memory stack 121 | Stack stack_; 122 | }; 123 | 124 | } } // namespace 125 | -------------------------------------------------------------------------------- /gpu/utils/StaticUtils.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include 14 | 15 | namespace faiss { namespace gpu { namespace utils { 16 | 17 | template 18 | constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { 19 | return (a + b - 1) / b; 20 | } 21 | 22 | template 23 | constexpr __host__ __device__ auto roundDown(U a, V b) -> decltype(a + b) { 24 | return (a / b) * b; 25 | } 26 | 27 | template 28 | constexpr __host__ __device__ auto roundUp(U a, V b) -> decltype(a + b) { 29 | return divUp(a, b) * b; 30 | } 31 | 32 | template 33 | constexpr __host__ __device__ T pow(T n, T power) { 34 | return (power > 0 ? n * pow(n, power - 1) : 1); 35 | } 36 | 37 | template 38 | constexpr __host__ __device__ T pow2(T n) { 39 | return pow(2, (T) n); 40 | } 41 | 42 | static_assert(pow2(8) == 256, "pow2"); 43 | 44 | template 45 | constexpr __host__ __device__ int log2(T n, int p = 0) { 46 | return (n <= 1) ? p : log2(n / 2, p + 1); 47 | } 48 | 49 | static_assert(log2(2) == 1, "log2"); 50 | static_assert(log2(3) == 1, "log2"); 51 | static_assert(log2(4) == 2, "log2"); 52 | 53 | template 54 | constexpr __host__ __device__ bool isPowerOf2(T v) { 55 | return (v && !(v & (v - 1))); 56 | } 57 | 58 | static_assert(isPowerOf2(2048), "isPowerOf2"); 59 | static_assert(!isPowerOf2(3333), "isPowerOf2"); 60 | 61 | template 62 | constexpr __host__ __device__ T nextHighestPowerOf2(T v) { 63 | return (isPowerOf2(v) ? 
(T) 2 * v : ((T) 1 << (log2(v) + 1))); 64 | } 65 | 66 | static_assert(nextHighestPowerOf2(1) == 2, "nextHighestPowerOf2"); 67 | static_assert(nextHighestPowerOf2(2) == 4, "nextHighestPowerOf2"); 68 | static_assert(nextHighestPowerOf2(3) == 4, "nextHighestPowerOf2"); 69 | static_assert(nextHighestPowerOf2(4) == 8, "nextHighestPowerOf2"); 70 | 71 | static_assert(nextHighestPowerOf2(15) == 16, "nextHighestPowerOf2"); 72 | static_assert(nextHighestPowerOf2(16) == 32, "nextHighestPowerOf2"); 73 | static_assert(nextHighestPowerOf2(17) == 32, "nextHighestPowerOf2"); 74 | 75 | static_assert(nextHighestPowerOf2(1536000000u) == 2147483648u, 76 | "nextHighestPowerOf2"); 77 | static_assert(nextHighestPowerOf2((size_t) 2147483648ULL) == 78 | (size_t) 4294967296ULL, "nextHighestPowerOf2"); 79 | 80 | } } } // namespace 81 | -------------------------------------------------------------------------------- /gpu/utils/ThrustAllocator.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include 14 | #include 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | /// Allocator for Thrust that comes out of a specified memory space 19 | class GpuResourcesThrustAllocator { 20 | public: 21 | typedef char value_type; 22 | 23 | GpuResourcesThrustAllocator(void* mem, size_t size) 24 | : start_((char*) mem), 25 | cur_((char*) mem), 26 | end_((char*) mem + size) { 27 | } 28 | 29 | ~GpuResourcesThrustAllocator() { 30 | } 31 | 32 | char* allocate(std::ptrdiff_t size) { 33 | if (size <= (end_ - cur_)) { 34 | char* p = cur_; 35 | cur_ += size; 36 | FAISS_ASSERT(cur_ <= end_); 37 | 38 | return p; 39 | } else { 40 | char* p = nullptr; 41 | CUDA_VERIFY(cudaMalloc(&p, size)); 42 | mallocAllocs_.insert(p); 43 | return p; 44 | } 45 | } 46 | 47 | void deallocate(char* p, size_t size) { 48 | // Allocations could be returned out-of-order; ignore those we 49 | // didn't cudaMalloc 50 | auto it = mallocAllocs_.find(p); 51 | if (it != mallocAllocs_.end()) { 52 | CUDA_VERIFY(cudaFree(p)); 53 | mallocAllocs_.erase(it); 54 | } 55 | } 56 | 57 | private: 58 | char* start_; 59 | char* cur_; 60 | char* end_; 61 | std::unordered_set mallocAllocs_; 62 | }; 63 | 64 | 65 | } } // namespace 66 | -------------------------------------------------------------------------------- /gpu/utils/Timer.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "Timer.h" 12 | #include "DeviceUtils.h" 13 | #include "../../FaissAssert.h" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | KernelTimer::KernelTimer(cudaStream_t stream) 18 | : startEvent_(0), 19 | stopEvent_(0), 20 | stream_(stream), 21 | valid_(true) { 22 | CUDA_VERIFY(cudaEventCreate(&startEvent_)); 23 | CUDA_VERIFY(cudaEventCreate(&stopEvent_)); 24 | 25 | CUDA_VERIFY(cudaEventRecord(startEvent_, stream_)); 26 | } 27 | 28 | KernelTimer::~KernelTimer() { 29 | CUDA_VERIFY(cudaEventDestroy(startEvent_)); 30 | CUDA_VERIFY(cudaEventDestroy(stopEvent_)); 31 | } 32 | 33 | float 34 | KernelTimer::elapsedMilliseconds() { 35 | FAISS_ASSERT(valid_); 36 | 37 | CUDA_VERIFY(cudaEventRecord(stopEvent_, stream_)); 38 | CUDA_VERIFY(cudaEventSynchronize(stopEvent_)); 39 | 40 | auto time = 0.0f; 41 | CUDA_VERIFY(cudaEventElapsedTime(&time, startEvent_, stopEvent_)); 42 | valid_ = false; 43 | 44 | return time; 45 | } 46 | 47 | CpuTimer::CpuTimer() { 48 | clock_gettime(CLOCK_REALTIME, &start_); 49 | } 50 | 51 | float 52 | CpuTimer::elapsedMilliseconds() { 53 | struct timespec end; 54 | clock_gettime(CLOCK_REALTIME, &end); 55 | 56 | auto diffS = end.tv_sec - start_.tv_sec; 57 | auto diffNs = end.tv_nsec - start_.tv_nsec; 58 | 59 | return 1000.0f * (float) diffS + ((float) diffNs) / 1000000.0f; 60 | } 61 | 62 | } } // namespace 63 | -------------------------------------------------------------------------------- /gpu/utils/Timer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include 14 | #include 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | /// Utility class for timing execution of a kernel 19 | class KernelTimer { 20 | public: 21 | /// Constructor starts the timer and adds an event into the current 22 | /// device stream 23 | KernelTimer(cudaStream_t stream = 0); 24 | 25 | /// Destructor releases event resources 26 | ~KernelTimer(); 27 | 28 | /// Adds a stop event then synchronizes on the stop event to get the 29 | /// actual GPU-side kernel timings for any kernels launched in the 30 | /// current stream. Returns the number of milliseconds elapsed. 31 | /// Can only be called once. 32 | float elapsedMilliseconds(); 33 | 34 | private: 35 | cudaEvent_t startEvent_; 36 | cudaEvent_t stopEvent_; 37 | cudaStream_t stream_; 38 | bool valid_; 39 | }; 40 | 41 | /// CPU wallclock elapsed timer 42 | class CpuTimer { 43 | public: 44 | /// Creates and starts a new timer 45 | CpuTimer(); 46 | 47 | /// Returns elapsed time in milliseconds 48 | float elapsedMilliseconds(); 49 | 50 | private: 51 | struct timespec start_; 52 | }; 53 | 54 | } } // namespace 55 | -------------------------------------------------------------------------------- /gpu/utils/WarpSelectFloat.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "warpselect/WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | // warp Q to thread Q: 15 | // 1, 1 16 | // 32, 2 17 | // 64, 3 18 | // 128, 3 19 | // 256, 4 20 | // 512, 8 21 | // 1024, 8 22 | 23 | WARP_SELECT_DECL(float, true, 1); 24 | WARP_SELECT_DECL(float, true, 32); 25 | WARP_SELECT_DECL(float, true, 64); 26 | WARP_SELECT_DECL(float, true, 128); 27 | WARP_SELECT_DECL(float, true, 256); 28 | WARP_SELECT_DECL(float, true, 512); 29 | WARP_SELECT_DECL(float, true, 1024); 30 | 31 | WARP_SELECT_DECL(float, false, 1); 32 | WARP_SELECT_DECL(float, false, 32); 33 | WARP_SELECT_DECL(float, false, 64); 34 | WARP_SELECT_DECL(float, false, 128); 35 | WARP_SELECT_DECL(float, false, 256); 36 | WARP_SELECT_DECL(float, false, 512); 37 | WARP_SELECT_DECL(float, false, 1024); 38 | 39 | void runWarpSelect(Tensor& in, 40 | Tensor& outK, 41 | Tensor& outV, 42 | bool dir, int k, cudaStream_t stream) { 43 | FAISS_ASSERT(k <= 1024); 44 | 45 | if (dir) { 46 | if (k == 1) { 47 | WARP_SELECT_CALL(float, true, 1); 48 | } else if (k <= 32) { 49 | WARP_SELECT_CALL(float, true, 32); 50 | } else if (k <= 64) { 51 | WARP_SELECT_CALL(float, true, 64); 52 | } else if (k <= 128) { 53 | WARP_SELECT_CALL(float, true, 128); 54 | } else if (k <= 256) { 55 | WARP_SELECT_CALL(float, true, 256); 56 | } else if (k <= 512) { 57 | WARP_SELECT_CALL(float, true, 512); 58 | } else if (k <= 1024) { 59 | WARP_SELECT_CALL(float, true, 1024); 60 | } 61 | } else { 62 | if (k == 1) { 63 | WARP_SELECT_CALL(float, false, 1); 64 | } else if (k <= 32) { 65 | WARP_SELECT_CALL(float, false, 32); 66 | } else if (k <= 64) { 67 | WARP_SELECT_CALL(float, false, 64); 68 | } else if (k <= 128) { 69 | WARP_SELECT_CALL(float, false, 128); 70 | } else if (k <= 256) { 71 | WARP_SELECT_CALL(float, false, 256); 72 | } else if (k <= 512) { 73 | WARP_SELECT_CALL(float, false, 512); 74 | } else if (k <= 1024) { 75 | WARP_SELECT_CALL(float, false, 1024); 76 | } 77 | } 78 | } 79 | 80 | } } // 
namespace 81 | -------------------------------------------------------------------------------- /gpu/utils/WarpSelectHalf.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "warpselect/WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | 16 | // warp Q to thread Q: 17 | // 1, 1 18 | // 32, 2 19 | // 64, 3 20 | // 128, 3 21 | // 256, 4 22 | // 512, 8 23 | // 1024, 8 24 | 25 | WARP_SELECT_DECL(half, true, 1); 26 | WARP_SELECT_DECL(half, true, 32); 27 | WARP_SELECT_DECL(half, true, 64); 28 | WARP_SELECT_DECL(half, true, 128); 29 | WARP_SELECT_DECL(half, true, 256); 30 | WARP_SELECT_DECL(half, true, 512); 31 | WARP_SELECT_DECL(half, true, 1024); 32 | 33 | WARP_SELECT_DECL(half, false, 1); 34 | WARP_SELECT_DECL(half, false, 32); 35 | WARP_SELECT_DECL(half, false, 64); 36 | WARP_SELECT_DECL(half, false, 128); 37 | WARP_SELECT_DECL(half, false, 256); 38 | WARP_SELECT_DECL(half, false, 512); 39 | WARP_SELECT_DECL(half, false, 1024); 40 | 41 | void runWarpSelect(Tensor& in, 42 | Tensor& outK, 43 | Tensor& outV, 44 | bool dir, int k, cudaStream_t stream) { 45 | FAISS_ASSERT(k <= 1024); 46 | 47 | if (dir) { 48 | if (k == 1) { 49 | WARP_SELECT_CALL(half, true, 1); 50 | } else if (k <= 32) { 51 | WARP_SELECT_CALL(half, true, 32); 52 | } else if (k <= 64) { 53 | WARP_SELECT_CALL(half, true, 64); 54 | } else if (k <= 128) { 55 | WARP_SELECT_CALL(half, true, 128); 56 | } else if (k <= 256) { 57 | WARP_SELECT_CALL(half, true, 256); 58 | } else if (k <= 512) { 59 | WARP_SELECT_CALL(half, true, 512); 60 | } else if (k <= 1024) { 61 | WARP_SELECT_CALL(half, true, 1024); 62 | } 63 | } else { 64 
| if (k == 1) { 65 | WARP_SELECT_CALL(half, false, 1); 66 | } else if (k <= 32) { 67 | WARP_SELECT_CALL(half, false, 32); 68 | } else if (k <= 64) { 69 | WARP_SELECT_CALL(half, false, 64); 70 | } else if (k <= 128) { 71 | WARP_SELECT_CALL(half, false, 128); 72 | } else if (k <= 256) { 73 | WARP_SELECT_CALL(half, false, 256); 74 | } else if (k <= 512) { 75 | WARP_SELECT_CALL(half, false, 512); 76 | } else if (k <= 1024) { 77 | WARP_SELECT_CALL(half, false, 1024); 78 | } 79 | } 80 | } 81 | 82 | #endif 83 | 84 | } } // namespace 85 | -------------------------------------------------------------------------------- /gpu/utils/WarpSelectKernel.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #pragma once 11 | 12 | #include "Float16.cuh" 13 | #include "Select.cuh" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | template 23 | __global__ void warpSelect(Tensor in, 24 | Tensor outK, 25 | Tensor outV, 26 | K initK, 27 | IndexType initV, 28 | int k) { 29 | constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; 30 | 31 | WarpSelect, 32 | NumWarpQ, NumThreadQ, ThreadsPerBlock> 33 | heap(initK, initV, k); 34 | 35 | int warpId = threadIdx.x / kWarpSize; 36 | int row = blockIdx.x * kNumWarps + warpId; 37 | 38 | if (row >= in.getSize(0)) { 39 | return; 40 | } 41 | 42 | int i = getLaneId(); 43 | K* inStart = in[row][i].data(); 44 | 45 | // Whole warps must participate in the selection 46 | int limit = utils::roundDown(in.getSize(1), kWarpSize); 47 | 48 | for (; i < limit; i += kWarpSize) { 49 | heap.add(*inStart, (IndexType) i); 50 | inStart += kWarpSize; 51 | } 52 | 53 | // Handle non-warp multiple remainder 54 | if (i < in.getSize(1)) { 55 | heap.addThreadQ(*inStart, (IndexType) i); 56 | } 57 | 58 | heap.reduce(); 59 | heap.writeOut(outK[row].data(), 60 | outV[row].data(), k); 61 | } 62 | 63 | void runWarpSelect(Tensor& in, 64 | Tensor& outKeys, 65 | Tensor& outIndices, 66 | bool dir, int k, cudaStream_t stream); 67 | 68 | #ifdef FAISS_USE_FLOAT16 69 | void runWarpSelect(Tensor& in, 70 | Tensor& outKeys, 71 | Tensor& outIndices, 72 | bool dir, int k, cudaStream_t stream); 73 | #endif 74 | 75 | } } // namespace 76 | -------------------------------------------------------------------------------- /gpu/utils/WarpShuffles.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include 14 | #include "DeviceDefs.cuh" 15 | #include "Float16.cuh" 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | template 20 | inline __device__ T shfl(const T val, 21 | int srcLane, int width = kWarpSize) { 22 | return __shfl(val, srcLane, width); 23 | } 24 | 25 | // CUDA SDK does not provide specializations for T* 26 | template 27 | inline __device__ T* shfl(T* const val, 28 | int srcLane, int width = kWarpSize) { 29 | static_assert(sizeof(T*) == sizeof(long long), "pointer size"); 30 | long long v = (long long) val; 31 | return (T*) __shfl(v, srcLane, width); 32 | } 33 | 34 | template 35 | inline __device__ T shfl_up(const T val, 36 | unsigned int delta, int width = kWarpSize) { 37 | return __shfl_up(val, delta, width); 38 | } 39 | 40 | // CUDA SDK does not provide specializations for T* 41 | template 42 | inline __device__ T* shfl_up(T* const val, 43 | unsigned int delta, int width = kWarpSize) { 44 | static_assert(sizeof(T*) == sizeof(long long), "pointer size"); 45 | long long v = (long long) val; 46 | return (T*) __shfl_up(v, delta, width); 47 | } 48 | 49 | template 50 | inline __device__ T shfl_down(const T val, 51 | unsigned int delta, int width = kWarpSize) { 52 | return __shfl_down(val, delta, width); 53 | } 54 | 55 | // CUDA SDK does not provide specializations for T* 56 | template 57 | inline __device__ T* shfl_down(T* const val, 58 | unsigned int delta, int width = kWarpSize) { 59 | static_assert(sizeof(T*) == sizeof(long long), "pointer size"); 60 | long long v = (long long) val; 61 | return (T*) __shfl_down(v, delta, width); 62 | } 63 | 64 | template 65 | inline __device__ T shfl_xor(const T val, 66 | int laneMask, int width = kWarpSize) { 67 | return __shfl_xor(val, laneMask, width); 68 | } 69 | 70 | // CUDA SDK does not provide specializations for T* 71 | template 72 | inline __device__ T* shfl_xor(T* const val, 73 | int laneMask, int width = kWarpSize) { 74 | static_assert(sizeof(T*) == sizeof(long long), 
"pointer size"); 75 | long long v = (long long) val; 76 | return (T*) __shfl_xor(v, laneMask, width); 77 | } 78 | 79 | #ifdef FAISS_USE_FLOAT16 80 | inline __device__ half shfl(half v, 81 | int srcLane, int width = kWarpSize) { 82 | unsigned int vu = v.x; 83 | vu = __shfl(vu, srcLane, width); 84 | 85 | half h; 86 | h.x = (unsigned short) vu; 87 | return h; 88 | } 89 | 90 | inline __device__ half shfl_xor(half v, 91 | int laneMask, int width = kWarpSize) { 92 | unsigned int vu = v.x; 93 | vu = __shfl_xor(vu, laneMask, width); 94 | 95 | half h; 96 | h.x = (unsigned short) vu; 97 | return h; 98 | } 99 | #endif 100 | 101 | } } // namespace 102 | -------------------------------------------------------------------------------- /gpu/utils/WorkerThread.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "WorkerThread.h" 12 | #include "../../FaissAssert.h" 13 | 14 | namespace faiss { namespace gpu { 15 | 16 | WorkerThread::WorkerThread() : 17 | wantStop_(false) { 18 | startThread(); 19 | 20 | // Make sure that the thread has started before continuing 21 | add([](){}).get(); 22 | } 23 | 24 | WorkerThread::~WorkerThread() { 25 | stop(); 26 | waitForThreadExit(); 27 | } 28 | 29 | void 30 | WorkerThread::startThread() { 31 | thread_ = std::thread([this](){ threadMain(); }); 32 | } 33 | 34 | void 35 | WorkerThread::stop() { 36 | std::lock_guard guard(mutex_); 37 | 38 | wantStop_ = true; 39 | monitor_.notify_one(); 40 | } 41 | 42 | std::future 43 | WorkerThread::add(std::function f) { 44 | std::lock_guard guard(mutex_); 45 | 46 | if (wantStop_) { 47 | // The timer thread has been stopped, or we want to stop; we can't 48 | // schedule anything else 49 | std::promise p; 50 | auto fut = p.get_future(); 51 | 52 | // did not execute 53 | p.set_value(false); 54 | return fut; 55 | } 56 | 57 | auto pr = std::promise(); 58 | auto fut = pr.get_future(); 59 | 60 | queue_.emplace_back(std::make_pair(std::move(f), std::move(pr))); 61 | 62 | // Wake up our thread 63 | monitor_.notify_one(); 64 | return fut; 65 | } 66 | 67 | void 68 | WorkerThread::threadMain() { 69 | threadLoop(); 70 | 71 | // Call all pending tasks 72 | FAISS_ASSERT(wantStop_); 73 | 74 | for (auto& f : queue_) { 75 | f.first(); 76 | f.second.set_value(true); 77 | } 78 | } 79 | 80 | void 81 | WorkerThread::threadLoop() { 82 | while (true) { 83 | std::pair, std::promise> data; 84 | 85 | { 86 | std::unique_lock lock(mutex_); 87 | 88 | while (!wantStop_ && queue_.empty()) { 89 | monitor_.wait(lock); 90 | } 91 | 92 | if (wantStop_) { 93 | return; 94 | } 95 | 96 | data = std::move(queue_.front()); 97 | queue_.pop_front(); 98 | } 99 | 100 | data.first(); 101 | data.second.set_value(true); 102 | } 103 | } 104 | 105 | void 106 | WorkerThread::waitForThreadExit() { 107 | try { 108 | thread_.join(); 109 | } 
catch (...) { 110 | } 111 | } 112 | 113 | } } // namespace 114 | -------------------------------------------------------------------------------- /gpu/utils/WorkerThread.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | class WorkerThread { 21 | public: 22 | WorkerThread(); 23 | 24 | /// Stops and waits for the worker thread to exit, flushing all 25 | /// pending lambdas 26 | ~WorkerThread(); 27 | 28 | /// Request that the worker thread stop itself 29 | void stop(); 30 | 31 | /// Blocking waits in the current thread for the worker thread to 32 | /// stop 33 | void waitForThreadExit(); 34 | 35 | /// Adds a lambda to run on the worker thread; returns a future that 36 | /// can be used to block on its completion. 37 | /// Future status is `true` if the lambda was run in the worker 38 | /// thread; `false` if it was not run, because the worker thread is 39 | /// exiting or has exited. 
40 | std::future add(std::function f); 41 | 42 | private: 43 | void startThread(); 44 | void threadMain(); 45 | void threadLoop(); 46 | 47 | /// Thread that all queued lambdas are run on 48 | std::thread thread_; 49 | 50 | /// Mutex for the queue and exit status 51 | std::mutex mutex_; 52 | 53 | /// Monitor for the exit status and the queue 54 | std::condition_variable monitor_; 55 | 56 | /// Whether or not we want the thread to exit 57 | bool wantStop_; 58 | 59 | /// Queue of pending lambdas to call 60 | std::deque, std::promise>> queue_; 61 | }; 62 | 63 | } } // namespace 64 | -------------------------------------------------------------------------------- /gpu/utils/bitonicSort.cuh: -------------------------------------------------------------------------------- 1 | #ifndef BITONIC_SORT_CUH 2 | #define BITONIC_SORT_CUH 3 | 4 | #include 5 | #include 6 | 7 | namespace faiss { namespace gpu { 8 | template 9 | __device__ void swap1(T& _a, T&_b) { 10 | T h = _a; 11 | _a = _b; 12 | _b = h; 13 | } 14 | 15 | // parallel bitonic sort 16 | template 17 | __device__ void bitonic3(volatile T _val[], volatile uint _idx[], uint _N) { 18 | 19 | for (int k = 2; k <= _N; k <<= 1) { 20 | 21 | // bitonic merge 22 | for (int j = k / 2; j > 0; j /= 2) { 23 | int ixj = threadIdx.x ^ j; // XOR 24 | if ((ixj > threadIdx.x) && (ixj < _N)) { 25 | if ((threadIdx.x & k) == 0) // ascending - descending 26 | { 27 | if (_val[threadIdx.x] > _val[ixj]) { 28 | 29 | swap1(_val[threadIdx.x], _val[ixj]); 30 | swap1(_idx[threadIdx.x], _idx[ixj]); 31 | } 32 | } else { 33 | if (_val[threadIdx.x] < _val[ixj]) { 34 | 35 | swap1(_val[threadIdx.x], _val[ixj]); 36 | swap1(_idx[threadIdx.x], _idx[ixj]); 37 | } 38 | 39 | } 40 | } 41 | __syncthreads(); 42 | } 43 | } 44 | } 45 | 46 | 47 | // parallel bitonic sort 48 | template 49 | __device__ void bitonicLarge(volatile T _val[], volatile uint _idx[], uint _N) { 50 | 51 | for (int k = 2; k <= _N; k <<= 1) { 52 | 53 | // bitonic merge 54 | for (int j = k / 2; j > 
0; j /= 2) { 55 | 56 | for (int tid = threadIdx.x; tid < _N; tid += blockDim.x) { 57 | int ixj = tid ^ j; // XOR 58 | if ((ixj > tid) && (ixj < _N)) { 59 | if ((tid & k) == 0) // ascending - descending 60 | { 61 | if (_val[tid] > _val[ixj]) { 62 | 63 | swap1(_val[tid], _val[ixj]); 64 | swap1(_idx[tid], _idx[ixj]); 65 | } 66 | } else { 67 | if (_val[tid] < _val[ixj]) { 68 | 69 | swap1(_val[tid], _val[ixj]); 70 | swap1(_idx[tid], _idx[ixj]); 71 | } 72 | 73 | } 74 | } 75 | } 76 | __syncthreads(); 77 | } 78 | } 79 | } 80 | 81 | // parallel bitonic sort (descending) 82 | template 83 | __device__ void bitonic3Descending(volatile T _val[], volatile uint _idx[], 84 | uint _N) { 85 | 86 | for (int k = 2; k <= _N; k <<= 1) { 87 | 88 | // bitonic merge 89 | for (int j = k / 2; j > 0; j /= 2) { 90 | int ixj = threadIdx.x ^ j; // XOR 91 | if ((ixj > threadIdx.x) && (ixj < _N)) { 92 | if ((threadIdx.x & k) != 0) // ascending - descending 93 | { 94 | if (_val[threadIdx.x] > _val[ixj]) { 95 | 96 | swap1(_val[threadIdx.x], _val[ixj]); 97 | swap1(_idx[threadIdx.x], _idx[ixj]); 98 | } 99 | } else { 100 | if (_val[threadIdx.x] < _val[ixj]) { 101 | 102 | swap1(_val[threadIdx.x], _val[ixj]); 103 | swap1(_idx[threadIdx.x], _idx[ixj]); 104 | } 105 | 106 | } 107 | } 108 | __syncthreads(); 109 | } 110 | } 111 | } 112 | 113 | }}; 114 | #endif 115 | -------------------------------------------------------------------------------- /gpu/utils/blockselect/BlockSelectFloat1.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, k == 1 (warp queue 1, thread queue 1),
// both selection directions.
BLOCK_SELECT_IMPL(float, true, 1, 1);
BLOCK_SELECT_IMPL(float, false, 1, 1);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloat128.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, k <= 128 (warp queue 128, thread queue 3).
BLOCK_SELECT_IMPL(float, true, 128, 3);
BLOCK_SELECT_IMPL(float, false, 128, 3);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloat256.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, k <= 256 (warp queue 256, thread queue 4).
BLOCK_SELECT_IMPL(float, true, 256, 4);
BLOCK_SELECT_IMPL(float, false, 256, 4);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloat32.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, k <= 32 (warp queue 32, thread queue 2).
BLOCK_SELECT_IMPL(float, true, 32, 2);
BLOCK_SELECT_IMPL(float, false, 32, 2);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloat64.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, k <= 64 (warp queue 64, thread queue 3).
BLOCK_SELECT_IMPL(float, true, 64, 3);
BLOCK_SELECT_IMPL(float, false, 64, 3);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloatF1024.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, direction=false only, k <= 1024
// (split per direction at this size to bound compile time/binary size).
BLOCK_SELECT_IMPL(float, false, 1024, 8);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloatF512.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, direction=false only, k <= 512.
BLOCK_SELECT_IMPL(float, false, 512, 8);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloatT1024.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, direction=true only, k <= 1024.
BLOCK_SELECT_IMPL(float, true, 1024, 8);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloatT512.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, direction=true only, k <= 512.
BLOCK_SELECT_IMPL(float, true, 512, 8);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalf1.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, k == 1; only built when float16 support is on.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 1, 1);
BLOCK_SELECT_IMPL(half, false, 1, 1);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalf128.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, k <= 128.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 128, 3);
BLOCK_SELECT_IMPL(half, false, 128, 3);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalf256.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, k <= 256.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 256, 4);
BLOCK_SELECT_IMPL(half, false, 256, 4);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalf32.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, k <= 32.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 32, 2);
BLOCK_SELECT_IMPL(half, false, 32, 2);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalf64.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, k <= 64.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 64, 3);
BLOCK_SELECT_IMPL(half, false, 64, 3);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalfF1024.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, direction=false only, k <= 1024.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, false, 1024, 8);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalfF512.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, direction=false only, k <= 512.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, false, 512, 8);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalfT1024.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, direction=true only, k <= 1024.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 1024, 8);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalfT512.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, direction=true only, k <= 512.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 512, 8);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectImpl.cuh:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
10 | #include "../BlockSelectKernel.cuh" 11 | #include "../Limits.cuh" 12 | 13 | #define BLOCK_SELECT_DECL(TYPE, DIR, WARP_Q) \ 14 | extern void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 15 | Tensor& in, \ 16 | Tensor& outK, \ 17 | Tensor& outV, \ 18 | bool dir, \ 19 | int k, \ 20 | cudaStream_t stream) 21 | 22 | #define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ 23 | void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 24 | Tensor& in, \ 25 | Tensor& outK, \ 26 | Tensor& outV, \ 27 | bool dir, \ 28 | int k, \ 29 | cudaStream_t stream) { \ 30 | auto grid = dim3(in.getSize(0)); \ 31 | \ 32 | constexpr int kBlockSelectNumThreads = 128; \ 33 | auto block = dim3(kBlockSelectNumThreads); \ 34 | \ 35 | FAISS_ASSERT(k <= WARP_Q); \ 36 | FAISS_ASSERT(dir == DIR); \ 37 | \ 38 | auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ 39 | auto vInit = -1; \ 40 | \ 41 | blockSelect \ 42 | <<>>(in, outK, outV, kInit, vInit, k); \ 43 | CUDA_TEST_ERROR(); \ 44 | } 45 | 46 | #define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \ 47 | runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 48 | in, outK, outV, dir, k, stream) 49 | -------------------------------------------------------------------------------- /gpu/utils/helper.cuh: -------------------------------------------------------------------------------- 1 | #ifndef NEARESTNEIGHBOR_HELPER_H 2 | #define NEARESTNEIGHBOR_HELPER_H 3 | 4 | /*! 
\file helper.hh 5 | \brief a collection of helper classes 6 | */ 7 | //#define OUTPUT 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | #define MAX_THREADS 512 16 | #define MAX_BLOCKS 65535 17 | #define WARP_SIZE 32 18 | namespace faiss { namespace gpu { 19 | 20 | 21 | void outputMat(const std::string& _S, const float* _A, 22 | uint _rows, uint _cols,cudaStream_t stream); 23 | 24 | void outputVec(const std::string& _S, const float* _v, 25 | uint _n,cudaStream_t stream); 26 | 27 | void outputVecChar(const std::string& _S, const char* _v, 28 | uint _n,cudaStream_t stream); 29 | void outputVecUint8(const std::string& _S, const uint8_t* _v, 30 | uint _n,cudaStream_t stream); 31 | void outputVecUint(const std::string& _S, const uint* _v, 32 | uint _n,cudaStream_t stream); 33 | void outputVecUShort(const std::string& _S, const ushort* _v, 34 | uint _n,cudaStream_t stream); 35 | 36 | void outputVecInt(const std::string& _S, const int* _v,uint _n,cudaStream_t stream); 37 | 38 | void outputVecLong(const std::string& _S, const long* _v,uint _n,cudaStream_t stream); 39 | 40 | void checkPrefixSumOffsets(const int* _v,uint _n,cudaStream_t stream); 41 | 42 | 43 | } 44 | 45 | } /* namespace */ 46 | 47 | 48 | 49 | #endif /* NEARESTNEIGHBOR_HELPER_H */ 50 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloat1.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 1, 1); 15 | WARP_SELECT_IMPL(float, false, 1, 1); 16 | 17 | } } // namespace 18 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloat128.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 128, 3); 15 | WARP_SELECT_IMPL(float, false, 128, 3); 16 | 17 | } } // namespace 18 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloat256.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 256, 4); 15 | WARP_SELECT_IMPL(float, false, 256, 4); 16 | 17 | } } // namespace 18 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloat32.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 32, 2); 15 | WARP_SELECT_IMPL(float, false, 32, 2); 16 | 17 | } } // namespace 18 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloat64.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 64, 3); 15 | WARP_SELECT_IMPL(float, false, 64, 3); 16 | 17 | } } // namespace 18 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloatF1024.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, false, 1024, 8); 15 | 16 | } } // namespace 17 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloatF512.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, false, 512, 8); 15 | 16 | } } // namespace 17 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloatT1024.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 1024, 8); 15 | 16 | } } // namespace 17 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloatT512.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. 
All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 512, 8); 15 | 16 | } } // namespace 17 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalf1.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 1, 1); 16 | WARP_SELECT_IMPL(half, false, 1, 1); 17 | #endif 18 | 19 | } } // namespace 20 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalf128.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 128, 3); 16 | WARP_SELECT_IMPL(half, false, 128, 3); 17 | #endif 18 | 19 | } } // namespace 20 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalf256.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 256, 4); 16 | WARP_SELECT_IMPL(half, false, 256, 4); 17 | #endif 18 | 19 | } } // namespace 20 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalf32.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 32, 2); 16 | WARP_SELECT_IMPL(half, false, 32, 2); 17 | #endif 18 | 19 | } } // namespace 20 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalf64.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 64, 3); 16 | WARP_SELECT_IMPL(half, false, 64, 3); 17 | #endif 18 | 19 | } } // namespace 20 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalfF1024.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, false, 1024, 8); 16 | #endif 17 | 18 | } } // namespace 19 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalfF512.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, false, 512, 8); 16 | #endif 17 | 18 | } } // namespace 19 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalfT1024.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 1024, 8); 16 | #endif 17 | 18 | } } // namespace 19 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalfT512.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 512, 8); 16 | #endif 17 | 18 | } } // namespace 19 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectImpl.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "../WarpSelectKernel.cuh" 11 | #include "../Limits.cuh" 12 | 13 | #define WARP_SELECT_DECL(TYPE, DIR, WARP_Q) \ 14 | extern void runWarpSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 15 | Tensor& in, \ 16 | Tensor& outK, \ 17 | Tensor& outV, \ 18 | bool dir, \ 19 | int k, \ 20 | cudaStream_t stream) 21 | 22 | #define WARP_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ 23 | void runWarpSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 24 | Tensor& in, \ 25 | Tensor& outK, \ 26 | Tensor& outV, \ 27 | bool dir, \ 28 | int k, \ 29 | cudaStream_t stream) { \ 30 | \ 31 | constexpr int kWarpSelectNumThreads = 128; \ 32 | auto grid = dim3(utils::divUp(in.getSize(0), \ 33 | (kWarpSelectNumThreads / kWarpSize))); \ 34 | auto block = dim3(kWarpSelectNumThreads); \ 35 | \ 36 | FAISS_ASSERT(k <= WARP_Q); \ 37 | FAISS_ASSERT(dir == DIR); \ 38 | \ 39 | auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ 40 | auto vInit = -1; \ 41 | \ 42 | warpSelect \ 43 | <<>>(in, outK, outV, kInit, vInit, k); \ 44 | CUDA_TEST_ERROR(); \ 45 | } 46 | 47 | #define WARP_SELECT_CALL(TYPE, DIR, WARP_Q) \ 48 | runWarpSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 49 | in, outK, outV, dir, k, stream) 50 | -------------------------------------------------------------------------------- /index_io.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. 
All Rights Reserved 10 | // -*- c++ -*- 11 | // I/O code for indexes 12 | 13 | #ifndef FAISS_INDEX_IO_H 14 | #define FAISS_INDEX_IO_H 15 | 16 | #include 17 | 18 | namespace faiss { 19 | 20 | struct Index; 21 | struct VectorTransform; 22 | struct IndexIVF; 23 | struct ProductQuantizer; 24 | 25 | void write_index (const Index *idx, FILE *f); 26 | void write_index (const Index *idx, const char *fname); 27 | 28 | /** 29 | * mmap'ing currently works only for IndexIVFPQCompact, the 30 | * IndexIVFPQCompact destructor will unmap the file. 31 | */ 32 | Index *read_index (FILE * f, bool try_mmap = false); 33 | Index *read_index (const char *fname, bool try_mmap = false); 34 | 35 | 36 | 37 | void write_VectorTransform (const VectorTransform *vt, const char *fname); 38 | VectorTransform *read_VectorTransform (const char *fname); 39 | 40 | ProductQuantizer * read_ProductQuantizer (const char*fname); 41 | void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname); 42 | 43 | 44 | 45 | /* cloning functions */ 46 | Index *clone_index (const Index *); 47 | 48 | /** Cloner class, useful to override classes with other cloning 49 | * functions. The cloning function above just calls 50 | * Cloner::clone_Index. 
*/ 51 | struct Cloner { 52 | virtual VectorTransform *clone_VectorTransform (const VectorTransform *); 53 | virtual Index *clone_Index (const Index *); 54 | virtual IndexIVF *clone_IndexIVF (const IndexIVF *); 55 | virtual ~Cloner() {} 56 | }; 57 | 58 | } 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) 2 | 3 | # Build each source file independently 4 | include_directories(../../) # faiss root directory 5 | 6 | # gtest 7 | 8 | find_package(GTest REQUIRED) 9 | 10 | include_directories(${GTEST_INCLUDE_DIRS}) 11 | set(GTEST_ROOT /usr/include) 12 | foreach(source ${srcs}) 13 | get_filename_component(name ${source} NAME_WE) 14 | 15 | # target 16 | add_executable(${name} ${source}) 17 | target_link_libraries(${name} ${faiss_lib} ${BLAS_LIB} ${GTEST_BOTH_LIBRARIES}) 18 | 19 | # Install 20 | install(TARGETS ${name} DESTINATION test) 21 | endforeach(source) 22 | -------------------------------------------------------------------------------- /tests/test_blas: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/tests/test_blas -------------------------------------------------------------------------------- /tests/test_blas.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include 10 | #include 11 | 12 | #undef FINTEGER 13 | #define FINTEGER long 14 | 15 | 16 | extern "C" { 17 | 18 | /* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ 19 | 20 | int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER * 21 | n, FINTEGER *k, const float *alpha, const float *a, 22 | FINTEGER *lda, const float *b, FINTEGER * 23 | ldb, float *beta, float *c, FINTEGER *ldc); 24 | 25 | /* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */ 26 | 27 | int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda, 28 | float *tau, float *work, FINTEGER *lwork, FINTEGER *info); 29 | 30 | } 31 | 32 | float *new_random_vec(int size) 33 | { 34 | float *x = new float[size]; 35 | for (int i = 0; i < size; i++) 36 | x[i] = drand48(); 37 | return x; 38 | } 39 | 40 | 41 | int main() { 42 | 43 | FINTEGER m = 10, n = 20, k = 30; 44 | float *a = new_random_vec(m * k), *b = new_random_vec(n * k), *c = new float[n * m]; 45 | float one = 1.0, zero = 0.0; 46 | 47 | printf("BLAS test\n"); 48 | 49 | sgemm_("Not transposed", "Not transposed", 50 | &m, &n, &k, &one, a, &m, b, &k, &zero, c, &m); 51 | 52 | printf("errors=\n"); 53 | 54 | for (int i = 0; i < m; i++) { 55 | for (int j = 0; j < n; j++) { 56 | float accu = 0; 57 | for (int l = 0; l < k; l++) 58 | accu += a[i + l * m] * b[l + j * k]; 59 | printf ("%6.3f ", accu - c[i + j * m]); 60 | } 61 | printf("\n"); 62 | } 63 | 64 | long info = 0x64bL << 32; 65 | long mi = 0x64bL << 32 | m; 66 | float *tau = new float[m]; 67 | FINTEGER lwork = -1; 68 | 69 | float work1; 70 | 71 | printf("Intentional Lapack error (appears only for 64-bit INTEGER):\n"); 72 | sgeqrf_ (&mi, &n, c, &m, tau, &work1, &lwork, (FINTEGER*)&info); 73 | 74 | // sgeqrf_ (&m, &n, c, &zeroi, tau, &work1, &lwork, (FINTEGER*)&info); 75 | printf("info=%016lx\n", info); 76 | 77 | if(info >> 32 == 0x64b) { 78 | printf("Lapack uses 32-bit integers\n"); 79 | } else { 80 | printf("Lapack uses 
64-bit integers\n"); 81 | } 82 | 83 | 84 | return 0; 85 | } 86 | -------------------------------------------------------------------------------- /tests/test_ivfpq_codec.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | 15 | #include "../IndexIVFPQ.h" 16 | #include "../IndexFlat.h" 17 | #include "../utils.h" 18 | 19 | 20 | // dimension of the vectors to index 21 | int d = 64; 22 | 23 | // size of the database we plan to index 24 | size_t nb = 8000; 25 | 26 | 27 | double eval_codec_error (long ncentroids, long m, const std::vector &v) 28 | { 29 | faiss::IndexFlatL2 coarse_quantizer (d); 30 | faiss::IndexIVFPQ index (&coarse_quantizer, d, 31 | ncentroids, m, 8); 32 | index.pq.cp.niter = 10; // speed up train 33 | index.train (nb, v.data()); 34 | 35 | // encode and decode to compute reconstruction error 36 | 37 | std::vector keys (nb); 38 | std::vector codes (nb * m); 39 | index.encode_multiple (nb, keys.data(), v.data(), codes.data(), true); 40 | 41 | std::vector v2 (nb * d); 42 | index.decode_multiple (nb, keys.data(), codes.data(), v2.data()); 43 | 44 | return faiss::fvec_L2sqr (v.data(), v2.data(), nb * d); 45 | } 46 | 47 | 48 | 49 | TEST(IVFPQ, codec) { 50 | 51 | std::vector database (nb * d); 52 | for (size_t i = 0; i < nb * d; i++) { 53 | database[i] = drand48(); 54 | } 55 | 56 | double err0 = eval_codec_error(16, 8, database); 57 | 58 | // should be more accurate as there are more coarse centroids 59 | double err1 = eval_codec_error(128, 8, database); 60 | EXPECT_GT(err0, err1); 61 | 62 | // should be more accurate as there are more PQ codes 63 | double err2 = eval_codec_error(16, 16, database); 64 | EXPECT_GT(err0, err2); 65 | } 66 | 
-------------------------------------------------------------------------------- /tests/test_ivfpq_indexing.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | #include "../IndexIVFPQ.h" 17 | #include "../IndexFlat.h" 18 | #include "../index_io.h" 19 | 20 | TEST(IVFPQ, accuracy) { 21 | 22 | // dimension of the vectors to index 23 | int d = 64; 24 | 25 | // size of the database we plan to index 26 | size_t nb = 1000; 27 | 28 | // make a set of nt training vectors in the unit cube 29 | // (could be the database) 30 | size_t nt = 1500; 31 | 32 | // make the index object and train it 33 | faiss::IndexFlatL2 coarse_quantizer (d); 34 | 35 | // a reasonable number of cetroids to index nb vectors 36 | int ncentroids = 25; 37 | 38 | faiss::IndexIVFPQ index (&coarse_quantizer, d, 39 | ncentroids, 16, 8); 40 | 41 | // index that gives the ground-truth 42 | faiss::IndexFlatL2 index_gt (d); 43 | 44 | srand48 (35); 45 | 46 | { // training 47 | 48 | std::vector trainvecs (nt * d); 49 | for (size_t i = 0; i < nt * d; i++) { 50 | trainvecs[i] = drand48(); 51 | } 52 | index.verbose = true; 53 | index.train (nt, trainvecs.data()); 54 | } 55 | 56 | { // populating the database 57 | 58 | std::vector database (nb * d); 59 | for (size_t i = 0; i < nb * d; i++) { 60 | database[i] = drand48(); 61 | } 62 | 63 | index.add (nb, database.data()); 64 | index_gt.add (nb, database.data()); 65 | } 66 | 67 | int nq = 200; 68 | int n_ok; 69 | 70 | { // searching the database 71 | 72 | std::vector queries (nq * d); 73 | for (size_t i = 0; i < nq * d; i++) { 74 | queries[i] = drand48(); 75 | } 76 | 77 | std::vector 
gt_nns (nq); 78 | std::vector gt_dis (nq); 79 | 80 | index_gt.search (nq, queries.data(), 1, 81 | gt_dis.data(), gt_nns.data()); 82 | 83 | index.nprobe = 5; 84 | int k = 5; 85 | std::vector nns (k * nq); 86 | std::vector dis (k * nq); 87 | 88 | index.search (nq, queries.data(), k, dis.data(), nns.data()); 89 | 90 | n_ok = 0; 91 | for (int q = 0; q < nq; q++) { 92 | 93 | for (int i = 0; i < k; i++) 94 | if (nns[q * k + i] == gt_nns[q]) 95 | n_ok++; 96 | } 97 | EXPECT_GT(n_ok, nq * 0.4); 98 | } 99 | 100 | } 101 | --------------------------------------------------------------------------------