├── AutoTune.cpp ├── AutoTune.h ├── AuxIndexStructures.cpp ├── AuxIndexStructures.h ├── CMakeLists.txt ├── Clustering.cpp ├── Clustering.h ├── FaissAssert.h ├── FaissException.cpp ├── FaissException.h ├── Heap.cpp ├── Heap.h ├── INSTALL ├── Index.cpp ├── Index.h ├── IndexFlat.cpp ├── IndexFlat.h ├── IndexIVF.cpp ├── IndexIVF.h ├── IndexIVFPQ.cpp ├── IndexIVFPQ.h ├── IndexLSH.cpp ├── IndexLSH.h ├── IndexPQ.cpp ├── IndexPQ.h ├── IndexScalarQuantizer.cpp ├── IndexScalarQuantizer.h ├── LICENSE ├── Makefile ├── MetaIndexes.cpp ├── MetaIndexes.h ├── PolysemousTraining.cpp ├── PolysemousTraining.h ├── ProductQuantizer.cpp ├── ProductQuantizer.h ├── README.md ├── VectorTransform.cpp ├── VectorTransform.h ├── cmake ├── Cuda.cmake ├── Cuda.cmake.bak └── Modules │ ├── FindMKL.cmake │ ├── FindOpenBLAS.cmake │ └── FindOpenBLAS.cmake.bak ├── example_makefiles ├── makefile.inc.Linux ├── makefile.inc.Mac.brew └── makefile.inc.Mac.port ├── faiss.cbp ├── faiss.cscope_file_list ├── faiss.h ├── faiss.layout ├── filehelper.cpp ├── filehelper.h ├── gpu ├── CMakeLists.txt ├── GpuAutoTune.cpp ├── GpuAutoTune.h ├── GpuClonerOptions.cpp ├── GpuClonerOptions.h ├── GpuIndex.cu ├── GpuIndex.h ├── GpuIndexFlat.cu ├── GpuIndexFlat.h ├── GpuIndexIVF.cu ├── GpuIndexIVF.h ├── GpuIndexIVFFlat.cu ├── GpuIndexIVFFlat.h ├── GpuIndexIVFPQ.cu ├── GpuIndexIVFPQ.h ├── GpuIndicesOptions.h ├── GpuResources.cpp ├── GpuResources.h ├── IndexProxy.cpp ├── IndexProxy.h ├── Makefile ├── StandardGpuResources.cpp ├── StandardGpuResources.h ├── impl │ ├── BroadcastSum.cu │ ├── BroadcastSum.cuh │ ├── Distance.cu │ ├── Distance.cuh │ ├── FlatIndex.cu │ ├── FlatIndex.cuh │ ├── IVFBase.cu │ ├── IVFBase.cuh │ ├── IVFFlat.cu │ ├── IVFFlat.cuh │ ├── IVFFlatScan.cu │ ├── IVFFlatScan.cuh │ ├── IVFPQ.cu │ ├── IVFPQ.cuh │ ├── IVFUtils.cu │ ├── IVFUtils.cuh │ ├── IVFUtilsSelect1.cu │ ├── IVFUtilsSelect2.cu │ ├── InvertedListAppend.cu │ ├── InvertedListAppend.cuh │ ├── L2Norm.cu │ ├── L2Norm.cuh │ ├── L2Select.cu │ ├── 
L2Select.cuh │ ├── PQCodeDistances.cu │ ├── PQCodeDistances.cuh │ ├── PQCodeLoad.cuh │ ├── PQScanMultiPassNoPrecomputed.cu │ ├── PQScanMultiPassNoPrecomputed.cuh │ ├── PQScanMultiPassPrecomputed.cu │ ├── PQScanMultiPassPrecomputed.cuh │ ├── RemapIndices.cpp │ ├── RemapIndices.h │ ├── VectorResidual.cu │ └── VectorResidual.cuh ├── perf │ ├── CompareFlat.cu │ ├── CompareIVFFlat.cu │ ├── CompareIVFPQ.cu │ ├── CompareIVFPQGrid.cu │ ├── IndexWrapper-inl.h │ ├── IndexWrapper.h │ ├── PerfClustering.cpp │ ├── PerfIVFPQAdd.cpp │ ├── PerfSelect.cu │ └── WriteIndex.cpp ├── test │ ├── CMakeLists.txt │ ├── CMakeLists.txt.bak │ ├── TestGpuIndexFlat.cpp │ ├── TestGpuIndexIVFFlat.cpp │ ├── TestGpuIndexIVFPQ.cpp │ ├── TestGpuSelect.cu │ ├── TestUtils.cpp │ ├── TestUtils.h │ ├── deep1b16_createdb.cpp │ ├── deep1b16_query.cpp │ ├── deep1b_createdb.cpp │ ├── deep1b_createdb_hnsw.cpp │ ├── deep1b_creategt.cpp │ ├── deep1b_query.cpp │ ├── deep1b_query.cpp.bak │ ├── deep1b_query1.cpp │ ├── deep1b_query2.cpp │ ├── deep1b_queryd.cpp │ ├── demo_ivfpq_indexing_gpu.cpp │ ├── demo_ivfpq_line_indexing_gpu.cpp │ ├── sift1b16_createdb.cpp │ ├── sift1b16_query - 副本.cpp │ ├── sift1b16_query.cpp │ ├── sift1b_createdb.cpp │ ├── sift1b_createdb_hnsw.cpp │ ├── sift1b_creategt.cpp │ ├── sift1b_query.cpp │ ├── sift1b_query1.cpp │ ├── sift1b_query2.cpp │ ├── sift1b_queryd.cpp │ ├── test_gpu_index.py │ ├── tool_createdb.cpp │ ├── tool_query.cpp │ ├── tool_query1.cpp │ ├── transform_deep1b.cpp │ └── transform_sift1b.cpp └── utils │ ├── BlockSelectFloat.cu │ ├── BlockSelectHalf.cu │ ├── BlockSelectKernel.cuh │ ├── Comparators.cuh │ ├── ConversionOperators.cuh │ ├── CopyUtils.cuh │ ├── DeviceDefs.cuh │ ├── DeviceMemory.cpp │ ├── DeviceMemory.h │ ├── DeviceTensor-inl.cuh │ ├── DeviceTensor.cuh │ ├── DeviceUtils.cpp │ ├── DeviceUtils.h │ ├── DeviceVector.cuh │ ├── Float16.cu │ ├── Float16.cuh │ ├── HostTensor-inl.cuh │ ├── HostTensor.cuh │ ├── Limits.cuh │ ├── LoadStoreOperators.cuh │ ├── MathOperators.cuh │ 
├── MatrixMult.cu │ ├── MatrixMult.cuh │ ├── MemorySpace.cpp │ ├── MemorySpace.h │ ├── MergeNetworkBlock.cuh │ ├── MergeNetworkUtils.cuh │ ├── MergeNetworkWarp.cuh │ ├── NoTypeTensor.cuh │ ├── Pair.cuh │ ├── PtxUtils.cuh │ ├── ReductionOperators.cuh │ ├── Reductions.cuh │ ├── Select.cuh │ ├── StackDeviceMemory.cpp │ ├── StackDeviceMemory.h │ ├── StaticUtils.h │ ├── Tensor-inl.cuh │ ├── Tensor.cuh │ ├── ThrustAllocator.cuh │ ├── Timer.cpp │ ├── Timer.h │ ├── Transpose.cuh │ ├── WarpSelectFloat.cu │ ├── WarpSelectHalf.cu │ ├── WarpSelectKernel.cuh │ ├── WarpShuffles.cuh │ ├── WorkerThread.cpp │ ├── WorkerThread.h │ ├── bitonicSort.cuh │ ├── blockselect │ ├── BlockSelectFloat1.cu │ ├── BlockSelectFloat128.cu │ ├── BlockSelectFloat256.cu │ ├── BlockSelectFloat32.cu │ ├── BlockSelectFloat64.cu │ ├── BlockSelectFloatF1024.cu │ ├── BlockSelectFloatF512.cu │ ├── BlockSelectFloatT1024.cu │ ├── BlockSelectFloatT512.cu │ ├── BlockSelectHalf1.cu │ ├── BlockSelectHalf128.cu │ ├── BlockSelectHalf256.cu │ ├── BlockSelectHalf32.cu │ ├── BlockSelectHalf64.cu │ ├── BlockSelectHalfF1024.cu │ ├── BlockSelectHalfF512.cu │ ├── BlockSelectHalfT1024.cu │ ├── BlockSelectHalfT512.cu │ └── BlockSelectImpl.cuh │ ├── helper.cu │ ├── helper.cuh │ ├── nvidia │ ├── fp16_emu.cu │ └── fp16_emu.cuh │ ├── triangle.cuh │ └── warpselect │ ├── WarpSelectFloat1.cu │ ├── WarpSelectFloat128.cu │ ├── WarpSelectFloat256.cu │ ├── WarpSelectFloat32.cu │ ├── WarpSelectFloat64.cu │ ├── WarpSelectFloatF1024.cu │ ├── WarpSelectFloatF512.cu │ ├── WarpSelectFloatT1024.cu │ ├── WarpSelectFloatT512.cu │ ├── WarpSelectHalf1.cu │ ├── WarpSelectHalf128.cu │ ├── WarpSelectHalf256.cu │ ├── WarpSelectHalf32.cu │ ├── WarpSelectHalf64.cu │ ├── WarpSelectHalfF1024.cu │ ├── WarpSelectHalfF512.cu │ ├── WarpSelectHalfT1024.cu │ ├── WarpSelectHalfT512.cu │ └── WarpSelectImpl.cuh ├── hamming.cpp ├── hamming.h ├── index_io.cpp ├── index_io.h ├── makefile.inc ├── tests ├── CMakeLists.txt ├── deep1b16_imi_pq.cpp ├── deep1b_imi_pq.cpp 
├── deep1b_imi_pq1.cpp ├── deep1b_imi_pq2.cpp ├── demo_imi_flat.cpp ├── demo_imi_pq.cpp ├── demo_ivfpq_indexing.cpp ├── demo_sift1M.cpp ├── sift1b16_imi_pq.cpp ├── sift1b_imi_pq.cpp ├── sift1b_imi_pq1.cpp ├── sift1b_imi_pq2.cpp ├── test_blas ├── test_blas.cpp ├── test_ivfpq_codec.cpp └── test_ivfpq_indexing.cpp ├── utils.cpp └── utils.h /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.7) 2 | 3 | # faiss project 4 | project(faiss C CXX) 5 | 6 | option(BUILD_TUTORIAL "Build tutorials" ON) 7 | option(BUILD_TEST "Build tests" ON) 8 | option(BUILD_WITH_GPU "Build faiss with gpu (cuda) support" ON) 9 | option(WITH_MKL "Build with MKL if ON (OpenBLAS if OFF)" OFF) 10 | 11 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) 12 | 13 | # OpenMP 14 | find_package(OpenMP REQUIRED) 15 | 16 | set(MPICH_INCLUDE_PATH "/usr/local/mpich/include") 17 | set(MPICH_CXX_LIBRARIES "/usr/local/mpich/lib/libmpi.so") 18 | 19 | include_directories(${MPICH_INCLUDE_PATH}) 20 | 21 | # BLAS (MKL os OpenBLAS) 22 | if(WITH_MKL) 23 | find_package(MKL REQUIRED) 24 | include_directories(${MKL_INCLUDE_DIRS}) 25 | set(BLAS_LIB ${MKL_LIBRARIES}) 26 | else() 27 | find_package(OpenBLAS REQUIRED) 28 | include_directories(${OpenBLAS_INCLUDE_DIR}) 29 | set(BLAS_LIB ${OpenBLAS_LIB}) 30 | endif() 31 | 32 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -m64 -Wall -g -O3 -msse4 -mpopcnt -fopenmp -Wno-sign-compare") 33 | add_definitions(-DFINTEGER=int) 34 | 35 | # specify output bin_path and lib_path 36 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 37 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 38 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 39 | 40 | # specify header and cpp files 41 | file(GLOB faiss_cpu_headers ${CMAKE_CURRENT_SOURCE_DIR}/*.h) 42 | file(GLOB faiss_cpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) 43 | 44 | set(faiss_lib faiss) 45 | 
add_library(${faiss_lib} STATIC ${faiss_cpu_headers} ${faiss_cpu_cpp}) 46 | target_link_libraries(${faiss_lib} ${OpenMP_CXX_FLAGS} ${BLAS_LIB}) 47 | 48 | # build gpu lib 49 | if(BUILD_WITH_GPU) 50 | include(cmake/Cuda.cmake) 51 | add_subdirectory(gpu) 52 | endif(BUILD_WITH_GPU) 53 | 54 | # build tutorial examples 55 | if(BUILD_TUTORIAL) 56 | add_subdirectory(tutorial) 57 | endif(BUILD_TUTORIAL) 58 | 59 | # build tests 60 | if(BUILD_TEST) 61 | add_subdirectory(tests) 62 | endif(BUILD_TEST) 63 | -------------------------------------------------------------------------------- /Clustering.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved 10 | // -*- c++ -*- 11 | 12 | #ifndef FAISS_CLUSTERING_H 13 | #define FAISS_CLUSTERING_H 14 | #include "Index.h" 15 | 16 | #include 17 | 18 | namespace faiss { 19 | 20 | 21 | /** Class for the clustering parameters. Can be passed to the 22 | * constructor of the Clustering object. 23 | */ 24 | struct ClusteringParameters { 25 | int niter; ///< clustering iterations 26 | int nredo; ///< redo clustering this many times and keep best 27 | 28 | bool verbose; 29 | bool spherical; ///< do we want normalized centroids? 30 | bool update_index; ///< update index after each iteration? 
31 | 32 | int min_points_per_centroid; ///< otherwise you get a warning 33 | int max_points_per_centroid; ///< to limit size of dataset 34 | 35 | int seed; ///< seed for the random number generator 36 | 37 | /// sets reasonable defaults 38 | ClusteringParameters (); 39 | }; 40 | 41 | 42 | /** clustering based on assignment - centroid update iterations 43 | * 44 | * The clustering is based on an Index object that assigns training 45 | * points to the centroids. Therefore, at each iteration the centroids 46 | * are added to the index. 47 | * 48 | * On output, the centoids table is set to the latest version 49 | * of the centroids and they are also added to the index. If the 50 | * centroids table it is not empty on input, it is also used for 51 | * initialization. 52 | * 53 | * To do several clusterings, just call train() several times on 54 | * different training sets, clearing the centroid table in between. 55 | */ 56 | struct Clustering: ClusteringParameters { 57 | typedef Index::idx_t idx_t; 58 | size_t d; ///< dimension of the vectors 59 | size_t k; ///< nb of centroids 60 | 61 | /// centroids (k * d) 62 | std::vector centroids; 63 | 64 | /// objective values (sum of distances reported by index) over 65 | /// iterations 66 | std::vector obj; 67 | 68 | /// the only mandatory parameters are k and d 69 | Clustering (int d, int k); 70 | Clustering (int d, int k, const ClusteringParameters &cp); 71 | 72 | /// Index is used during the assignment stage 73 | virtual void train (idx_t n, const float * x, faiss::Index & index); 74 | 75 | virtual ~Clustering() {} 76 | }; 77 | 78 | 79 | /** simplified interface 80 | * 81 | * @param d dimension of the data 82 | * @param n nb of training vectors 83 | * @param k nb of output centroids 84 | * @param x training set (size n * d) 85 | * @param centroids output centroids (size k * d) 86 | * @return final quantization error 87 | */ 88 | float kmeans_clustering (size_t d, size_t n, size_t k, 89 | const float *x, 90 | float 
*centroids); 91 | 92 | 93 | 94 | } 95 | 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /FaissException.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #include "FaissException.h" 12 | 13 | namespace faiss { 14 | 15 | FaissException::FaissException(const std::string& m) 16 | : msg(m) { 17 | } 18 | 19 | FaissException::FaissException(const std::string& m, 20 | const char* funcName, 21 | const char* file, 22 | int line) { 23 | int size = snprintf(nullptr, 0, "Error in %s at %s:%d: %s", 24 | funcName, file, line, m.c_str()); 25 | msg.resize(size + 1); 26 | snprintf(&msg[0], msg.size(), "Error in %s at %s:%d: %s", 27 | funcName, file, line, m.c_str()); 28 | } 29 | 30 | const char* 31 | FaissException::what() const noexcept { 32 | return msg.c_str(); 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /FaissException.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #ifndef FAISS_EXCEPTION_INCLUDED 12 | #define FAISS_EXCEPTION_INCLUDED 13 | 14 | #include 15 | #include 16 | 17 | namespace faiss { 18 | 19 | /// Base class for Faiss exceptions 20 | class FaissException : public std::exception { 21 | public: 22 | explicit FaissException(const std::string& msg); 23 | 24 | FaissException(const std::string& msg, 25 | const char* funcName, 26 | const char* file, 27 | int line); 28 | 29 | /// from std::exception 30 | const char* what() const noexcept override; 31 | 32 | std::string msg; 33 | }; 34 | 35 | 36 | /** bare-bones unique_ptr 37 | * this one deletes with delete [] */ 38 | template 39 | struct ScopeDeleter { 40 | const T * ptr; 41 | explicit ScopeDeleter (const T* ptr = nullptr): ptr (ptr) {} 42 | void release () {ptr = nullptr; } 43 | void set (const T * ptr_in) { ptr = ptr_in; } 44 | void swap (ScopeDeleter &other) {std::swap (ptr, other.ptr); } 45 | ~ScopeDeleter () { 46 | delete [] ptr; 47 | } 48 | }; 49 | 50 | /** same but deletes with the simple delete (least common case) */ 51 | template 52 | struct ScopeDeleter1 { 53 | const T * ptr; 54 | explicit ScopeDeleter1 (const T* ptr = nullptr): ptr (ptr) {} 55 | void release () {ptr = nullptr; } 56 | void set (const T * ptr_in) { ptr = ptr_in; } 57 | void swap (ScopeDeleter1 &other) {std::swap (ptr, other.ptr); } 58 | ~ScopeDeleter1 () { 59 | delete ptr; 60 | } 61 | }; 62 | 63 | 64 | 65 | } 66 | 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /Heap.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | /* Copyright 2004-present Facebook. All Rights Reserved. 
*/ 10 | /* Function for soft heap */ 11 | 12 | #include "Heap.h" 13 | 14 | 15 | namespace faiss { 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | template 26 | void HeapArray::heapify () 27 | { 28 | #pragma omp parallel for 29 | for (size_t j = 0; j < nh; j++) 30 | heap_heapify (k, val + j * k, ids + j * k); 31 | } 32 | 33 | template 34 | void HeapArray::reorder () 35 | { 36 | #pragma omp parallel for 37 | for (size_t j = 0; j < nh; j++) 38 | heap_reorder (k, val + j * k, ids + j * k); 39 | } 40 | 41 | template 42 | void HeapArray::addn (size_t nj, const T *vin, TI j0, 43 | size_t i0, long ni) 44 | { 45 | if (ni == -1) ni = nh; 46 | assert (i0 >= 0 && i0 + ni <= nh); 47 | #pragma omp parallel for 48 | for (size_t i = i0; i < i0 + ni; i++) { 49 | T * __restrict simi = get_val(i); 50 | TI * __restrict idxi = get_ids (i); 51 | const T *ip_line = vin + (i - i0) * nj; 52 | 53 | for (size_t j = 0; j < nj; j++) { 54 | T ip = ip_line [j]; 55 | if (C::cmp(simi[0], ip)) { 56 | heap_pop (k, simi, idxi); 57 | heap_push (k, simi, idxi, ip, j + j0); 58 | } 59 | } 60 | } 61 | } 62 | 63 | template 64 | void HeapArray::addn_with_ids ( 65 | size_t nj, const T *vin, const TI *id_in, 66 | long id_stride, size_t i0, long ni) 67 | { 68 | if (id_in == nullptr) { 69 | addn (nj, vin, 0, i0, ni); 70 | return; 71 | } 72 | if (ni == -1) ni = nh; 73 | assert (i0 >= 0 && i0 + ni <= nh); 74 | #pragma omp parallel for 75 | for (size_t i = i0; i < i0 + ni; i++) { 76 | T * __restrict simi = get_val(i); 77 | TI * __restrict idxi = get_ids (i); 78 | const T *ip_line = vin + (i - i0) * nj; 79 | const TI *id_line = id_in + (i - i0) * id_stride; 80 | 81 | for (size_t j = 0; j < nj; j++) { 82 | T ip = ip_line [j]; 83 | if (C::cmp(simi[0], ip)) { 84 | heap_pop (k, simi, idxi); 85 | heap_push (k, simi, idxi, ip, id_line [j]); 86 | } 87 | } 88 | } 89 | } 90 | 91 | template 92 | void HeapArray::per_line_extrema ( 93 | T * out_val, 94 | TI * out_ids) const 95 | { 96 | #pragma omp parallel for 97 | for 
(size_t j = 0; j < nh; j++) { 98 | long imin = -1; 99 | typename C::T xval = C::Crev::neutral (); 100 | const typename C::T * x_ = val + j * k; 101 | for (size_t i = 0; i < k; i++) 102 | if (C::cmp (x_[i], xval)) { 103 | xval = x_[i]; 104 | imin = i; 105 | } 106 | if (out_val) 107 | out_val[j] = xval; 108 | 109 | if (out_ids) { 110 | if (ids && imin != -1) 111 | out_ids[j] = ids [j * k + imin]; 112 | else 113 | out_ids[j] = imin; 114 | } 115 | } 116 | } 117 | 118 | 119 | 120 | 121 | // explicit instanciations 122 | 123 | template class HeapArray >; 124 | template class HeapArray >; 125 | template class HeapArray >; 126 | template class HeapArray >; 127 | 128 | 129 | 130 | 131 | 132 | } // END namespace fasis 133 | -------------------------------------------------------------------------------- /Index.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. 
All Rights Reserved 10 | 11 | #include "IndexFlat.h" 12 | #include "FaissAssert.h" 13 | #include 14 | namespace faiss { 15 | 16 | 17 | void Index::range_search (idx_t , const float *, float, 18 | RangeSearchResult *) const 19 | { 20 | FAISS_THROW_MSG ("range search not implemented"); 21 | } 22 | 23 | void Index::assign (idx_t n, const float * x, idx_t * labels, idx_t k) 24 | { 25 | printf("Index::assign,k: %d\n",k); 26 | float * distances = new float[n * k]; 27 | ScopeDeleter del(distances); 28 | search (n, x, k, distances, labels); 29 | } 30 | 31 | 32 | void Index::add_with_ids (idx_t n, const float * x, const long *xids) 33 | { 34 | FAISS_THROW_MSG ("add_with_ids not implemented for this type of index"); 35 | } 36 | 37 | 38 | long Index::remove_ids (const IDSelector & sel) 39 | { 40 | FAISS_THROW_MSG ("remove_ids not implemented for this type of index"); 41 | return -1; 42 | } 43 | 44 | 45 | void Index::reconstruct (idx_t, float * ) const { 46 | FAISS_THROW_MSG ("Can not compute reconstruct without " 47 | "knowing how to do so"); 48 | } 49 | 50 | 51 | void Index::reconstruct_n (idx_t i0, idx_t ni, float *recons) const { 52 | for (idx_t i = 0; i < ni; i++) { 53 | reconstruct (i0 + i, recons + i * d); 54 | } 55 | } 56 | 57 | void Index::search_and_reconstruct (idx_t n, const float *x, idx_t k, 58 | float *distances, idx_t *labels, 59 | float *recons) const { 60 | search (n, x, k, distances, labels); 61 | for (idx_t i = 0; i < n; ++i) { 62 | for (idx_t j = 0; j < k; ++j) { 63 | idx_t ij = i * k + j; 64 | idx_t key = labels[ij]; 65 | float* reconstructed = recons + ij * d; 66 | if (key < 0) { 67 | // Fill with NaNs 68 | memset(reconstructed, -1, sizeof(*reconstructed) * d); 69 | } else { 70 | reconstruct (key, reconstructed); 71 | } 72 | } 73 | } 74 | } 75 | 76 | void Index::compute_residual (const float * x, 77 | float * residual, idx_t key) const { 78 | reconstruct (key, residual); 79 | for (size_t i = 0; i < d; i++) 80 | residual[i] = x[i] - residual[i]; 81 | } 82 
| 83 | 84 | void Index::display () const { 85 | printf ("Index: %s -> %ld elements\n", typeid (*this).name(), ntotal); 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /IndexIVFPQ.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/IndexIVFPQ.cpp -------------------------------------------------------------------------------- /IndexLSH.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | // -*- c++ -*- 11 | 12 | #ifndef INDEX_LSH_H 13 | #define INDEX_LSH_H 14 | 15 | #include 16 | 17 | #include "Index.h" 18 | #include "VectorTransform.h" 19 | 20 | namespace faiss { 21 | 22 | 23 | /** The sign of each vector component is put in a binary signature */ 24 | struct IndexLSH:Index { 25 | typedef unsigned char uint8_t; 26 | 27 | int nbits; ///< nb of bits per vector 28 | int bytes_per_vec; ///< nb of 8-bits per encoded vector 29 | bool rotate_data; ///< whether to apply a random rotation to input 30 | bool train_thresholds; ///< whether we train thresholds or use 0 31 | 32 | RandomRotationMatrix rrot; ///< optional random rotation 33 | 34 | std::vector thresholds; ///< thresholds to compare with 35 | 36 | /// encoded dataset 37 | std::vector codes; 38 | 39 | IndexLSH ( 40 | idx_t d, int nbits, 41 | bool rotate_data = true, 42 | bool train_thresholds = false); 43 | 44 | /** Preprocesses and resizes the input to the size required to 45 | * binarize the data 46 | * 47 | * @param x input vectors, size n * d 48 | * @return output vectors, size n * 
bits. May be the same pointer 49 | * as x, otherwise it should be deleted by the caller 50 | */ 51 | const float *apply_preprocess (idx_t n, const float *x) const; 52 | 53 | void train(idx_t n, const float* x) override; 54 | 55 | void add(idx_t n, const float* x) override; 56 | 57 | void search( 58 | idx_t n, 59 | const float* x, 60 | idx_t k, 61 | float* distances, 62 | idx_t* labels) const override; 63 | 64 | void reset() override; 65 | 66 | /// transfer the thresholds to a pre-processing stage (and unset 67 | /// train_thresholds) 68 | void transfer_thresholds (LinearTransform * vt); 69 | 70 | ~IndexLSH() override {} 71 | 72 | IndexLSH (); 73 | }; 74 | 75 | 76 | 77 | } 78 | 79 | 80 | 81 | 82 | 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | "# vector-line-quantization" 2 | -------------------------------------------------------------------------------- /cmake/Cuda.cmake: -------------------------------------------------------------------------------- 1 | # configure cuda 2 | 3 | find_package(CUDA QUIET REQUIRED) 4 | if(CUDA_FOUND) 5 | include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) 6 | 7 | list(APPEND CUDA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) 8 | else(CUDA_FOUND) 9 | message(STATUS "Could not locate cuda, disabling cuda support.") 10 | set(BUILD_WITH_GPU OFF) 11 | return() 12 | endif(CUDA_FOUND) 13 | if(NOT DEFINED CUDA_TOOLKIT_SAMPLE_DIR) 14 | set(CUDA_TOOLKIT_SAMPLE_DIR ${CUDA_TOOLKIT_ROOT_DIR}/samples) 15 | endif() 16 | set(FAISS_CUDA_ADDITIONAL_INC_PATH ${CUDA_INCLUDE_DIRS} ${CUDA_TOOLKIT_SAMPLE_DIR}/common/inc/) 17 | include_directories(${FAISS_CUDA_ADDITIONAL_INC_PATH}) 18 | # set cuda flags 19 | if (CMAKE_BUILD_TYPE STREQUAL "Debug") 20 | list(APPEND CUDA_NVCC_FLAGS 
"-arch=sm_52;-D_FORCE_INLINES;-D_MWAITXINTRIN_H_INCLUDED;-D__STRICT_ANSI__;-std=c++11;-DVERBOSE;-g;-lineinfo;-Xcompiler;-ggdb") 21 | else() 22 | list(APPEND CUDA_NVCC_FLAGS "-arch=sm_52;-D_FORCE_INLINES;-D_MWAITXINTRIN_H_INCLUDED;-D__STRICT_ANSI__;-std=c++11;-lm;-DVERBOSE;-O3;-DNDEBUG;-Xcompiler;-DNDEBU") 23 | endif() 24 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 25 | -------------------------------------------------------------------------------- /cmake/Cuda.cmake.bak: -------------------------------------------------------------------------------- 1 | # configure cuda 2 | 3 | find_package(CUDA QUIET REQUIRED) 4 | if(CUDA_FOUND) 5 | include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) 6 | 7 | list(APPEND CUDA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) 8 | else(CUDA_FOUND) 9 | message(STATUS "Could not locate cuda, disabling cuda support.") 10 | set(BUILD_WITH_GPU OFF) 11 | return() 12 | endif(CUDA_FOUND) 13 | if(NOT DEFINED CUDA_TOOLKIT_SAMPLE_DIR) 14 | set(CUDA_TOOLKIT_SAMPLE_DIR ${CUDA_TOOLKIT_ROOT_DIR}/samples) 15 | endif() 16 | set(PQT_CUDA_ADDITIONAL_INC_PATH ${CUDA_INCLUDE_DIRS} ${CUDA_TOOLKIT_SAMPLE_DIR}/common/inc/) 17 | include_directories(${PQT_CUDA_ADDITIONAL_INC_PATH}) 18 | # set cuda flags 19 | if (CMAKE_BUILD_TYPE STREQUAL "Debug") 20 | list(APPEND CUDA_NVCC_FLAGS "-arch=sm_52;-D_FORCE_INLINES;-D_MWAITXINTRIN_H_INCLUDED;-D__STRICT_ANSI__;-std=c++11;-DVERBOSE;-g;-lineinfo;-Xcompiler;-ggdb") 21 | else() 22 | list(APPEND CUDA_NVCC_FLAGS "-arch=sm_52;-D_FORCE_INLINES;-D_MWAITXINTRIN_H_INCLUDED;-D__STRICT_ANSI__;-std=c++11;-lm;-DVERBOSE;-O3;-DNDEBUG;-Xcompiler;-DNDEBU") 23 | endif() 24 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 25 | -------------------------------------------------------------------------------- /cmake/Modules/FindMKL.cmake: -------------------------------------------------------------------------------- 1 | # defines: 2 | # MKL_INCLUDE_DIRS 3 | # MKL_LIBRARIES 4 | # MKL_COMPILER_LIBRARIES - a list of compiler 
libraries (file names) required for MKL 5 | 6 | #unset(MKL_LIB_DIR CACHE) 7 | #unset(MKL_COMPILER_LIB_DIR CACHE) 8 | #unset(MKL_COMPILER_REDIST_PATH CACHE) 9 | 10 | if(NOT HAVE_MKL) 11 | find_path(MKL_INCLUDE_DIRS "mkl.h" PATHS ${MKL_INCLUDE_DIR} DOC "The path to MKL headers") 12 | 13 | if(MKL_INCLUDE_DIRS) 14 | 15 | get_filename_component(_MKL_LIB_PATH "${MKL_INCLUDE_DIRS}/../lib" ABSOLUTE) 16 | 17 | if(APPLE) 18 | # MKL 2017 for mac has only 64 bit libraries without directory prefix 19 | set(_MKL_COMPILER_LIB_PATH ${MKL_INCLUDE_DIRS}/../../compiler/lib) 20 | else() 21 | if(CMAKE_SIZEOF_VOID_P EQUAL 8) 22 | set(_MKL_LIB_PATH "${_MKL_LIB_PATH}/intel64") 23 | set(_MKL_COMPILER_LIB_PATH ${MKL_INCLUDE_DIRS}/../../compiler/lib/intel64) 24 | if(WIN32) 25 | set(_MKL_COMPILER_REDIST_PATH ${MKL_INCLUDE_DIRS}/../../redist/intel64/compiler) 26 | endif() 27 | else() 28 | set(_MKL_LIB_PATH "${_MKL_LIB_PATH}/ia32") 29 | set(_MKL_COMPILER_LIB_PATH ${MKL_INCLUDE_DIRS}/../../compiler/lib/ia32) 30 | if(WIN32) 31 | set(_MKL_COMPILER_REDIST_PATH ${MKL_INCLUDE_DIRS}/../../redist/ia32/compiler) 32 | endif() 33 | endif() 34 | endif() 35 | 36 | # On Linux and Apple take libraries for redistribution from the same location that is used for linking 37 | if(UNIX) 38 | set(_MKL_COMPILER_REDIST_PATH ${_MKL_COMPILER_LIB_PATH}) 39 | endif() 40 | 41 | if(WIN32) 42 | set(MKL_COMPILER_LIBRARIES libiomp5md.dll) 43 | set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_lp64 mkl_core mkl_intel_thread libiomp5md) 44 | elseif(APPLE) 45 | set(MKL_COMPILER_LIBRARIES libiomp5.dylib) 46 | # generated by https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor 47 | # with the following options: OSX; Clang; Intel64; static; 32 bit integer; OpenMP; Intel OpenMP 48 | set(MKL_LIBRARIES ${MKL_LIBRARIES} libmkl_intel_lp64.a libmkl_intel_thread.a libmkl_core.a iomp5 pthread m dl) 49 | else() 50 | set(MKL_COMPILER_LIBRARIES libiomp5.so) 51 | # a --start-group / --end-group pair is required when linking with 
static MKL on GNU. 52 | # see https://software.intel.com/en-us/forums/topic/280974#comment-1478780 53 | # and https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor 54 | set(MKL_LIBRARIES ${MKL_LIBRARIES} 55 | "-Wl,--start-group" 56 | libmkl_intel_lp64.a libmkl_core.a libmkl_intel_thread.a 57 | "-Wl,--end-group" 58 | "-Wl,--exclude-libs,libmkl_intel_lp64.a,--exclude-libs,libmkl_core.a,--exclude-libs,libmkl_intel_thread.a,--exclude-libs,iomp5" 59 | iomp5 dl pthread m) 60 | endif() 61 | 62 | set(MKL_LIB_DIR "${_MKL_LIB_PATH}" 63 | CACHE PATH "Full path of MKL library directory") 64 | set(MKL_COMPILER_LIB_DIR "${_MKL_COMPILER_LIB_PATH}" 65 | CACHE PATH "Full path of MKL compiler library directory") 66 | set(MKL_COMPILER_REDIST_PATH "${_MKL_COMPILER_REDIST_PATH}" 67 | CACHE PATH "Full path of MKL compiler redistributable library directory") 68 | 69 | link_directories(${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR}) 70 | 71 | set(HAVE_MKL 1) 72 | 73 | endif(MKL_INCLUDE_DIRS) 74 | endif(NOT HAVE_MKL) 75 | -------------------------------------------------------------------------------- /cmake/Modules/FindOpenBLAS.cmake: -------------------------------------------------------------------------------- 1 | 2 | 3 | SET(Open_BLAS_INCLUDE_SEARCH_PATHS 4 | /usr/include 5 | /usr/include/openblas 6 | /usr/include/openblas-base 7 | /usr/local/include 8 | /usr/local/include/openblas 9 | /usr/local/include/openblas-base 10 | /opt/OpenBLAS/include 11 | /opt/local/include 12 | $ENV{OpenBLAS_HOME} 13 | $ENV{OpenBLAS_HOME}/include 14 | ) 15 | 16 | SET(Open_BLAS_LIB_SEARCH_PATHS 17 | /lib/ 18 | /lib/openblas-base 19 | /lib64/ 20 | /usr/lib 21 | /usr/lib/openblas-base 22 | /usr/lib64 23 | /usr/local/lib 24 | /usr/local/lib64 25 | /opt/OpenBLAS/lib 26 | /opt/local/lib 27 | $ENV{OpenBLAS}cd 28 | $ENV{OpenBLAS}/lib 29 | $ENV{OpenBLAS_HOME} 30 | $ENV{OpenBLAS_HOME}/lib 31 | ) 32 | 33 | FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES openblas_config.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS}) 34 | 
FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS}) 35 | 36 | SET(OpenBLAS_FOUND ON) 37 | 38 | # Check include files 39 | IF(NOT OpenBLAS_INCLUDE_DIR) 40 | SET(OpenBLAS_FOUND OFF) 41 | MESSAGE(STATUS "Could not find OpenBLAS include. Turning OpenBLAS_FOUND off") 42 | ENDIF() 43 | 44 | # Check libraries 45 | IF(NOT OpenBLAS_LIB) 46 | SET(OpenBLAS_FOUND OFF) 47 | MESSAGE(STATUS "Could not find OpenBLAS lib. Turning OpenBLAS_FOUND off") 48 | ENDIF() 49 | 50 | IF (OpenBLAS_FOUND) 51 | IF (NOT OpenBLAS_FIND_QUIETLY) 52 | MESSAGE(STATUS "Found OpenBLAS libraries: ${OpenBLAS_LIB}") 53 | MESSAGE(STATUS "Found OpenBLAS include: ${OpenBLAS_INCLUDE_DIR}") 54 | ENDIF (NOT OpenBLAS_FIND_QUIETLY) 55 | ELSE (OpenBLAS_FOUND) 56 | IF (OpenBLAS_FIND_REQUIRED) 57 | MESSAGE(FATAL_ERROR "Could not find OpenBLAS") 58 | ENDIF (OpenBLAS_FIND_REQUIRED) 59 | ENDIF (OpenBLAS_FOUND) 60 | 61 | MARK_AS_ADVANCED( 62 | OpenBLAS_INCLUDE_DIR 63 | OpenBLAS_LIB 64 | OpenBLAS 65 | ) 66 | 67 | -------------------------------------------------------------------------------- /cmake/Modules/FindOpenBLAS.cmake.bak: -------------------------------------------------------------------------------- 1 | 2 | 3 | SET(Open_BLAS_INCLUDE_SEARCH_PATHS 4 | /usr/include 5 | /usr/include/openblas 6 | /usr/include/openblas-base 7 | /usr/local/include 8 | /usr/local/include/openblas 9 | /usr/local/include/openblas-base 10 | /opt/OpenBLAS/include 11 | /opt/local/include 12 | /home/dl/OpenBLAS 13 | $ENV{OpenBLAS_HOME} 14 | $ENV{OpenBLAS_HOME}/include 15 | ) 16 | 17 | SET(Open_BLAS_LIB_SEARCH_PATHS 18 | /lib/ 19 | /lib/openblas-base 20 | /lib64/ 21 | /usr/lib 22 | /usr/lib/openblas-base 23 | /usr/lib64 24 | /usr/local/lib 25 | /usr/local/lib64 26 | /opt/OpenBLAS/lib 27 | /opt/local/lib 28 | /home/dl/OpenBLAS 29 | $ENV{OpenBLAS}cd 30 | $ENV{OpenBLAS}/lib 31 | $ENV{OpenBLAS_HOME} 32 | $ENV{OpenBLAS_HOME}/lib 33 | ) 34 | 35 | FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES openblas_config.h PATHS 
${Open_BLAS_INCLUDE_SEARCH_PATHS}) 36 | FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS}) 37 | 38 | SET(OpenBLAS_FOUND ON) 39 | 40 | # Check include files 41 | IF(NOT OpenBLAS_INCLUDE_DIR) 42 | SET(OpenBLAS_FOUND OFF) 43 | MESSAGE(STATUS "Could not find OpenBLAS include. Turning OpenBLAS_FOUND off") 44 | ENDIF() 45 | 46 | # Check libraries 47 | IF(NOT OpenBLAS_LIB) 48 | SET(OpenBLAS_FOUND OFF) 49 | MESSAGE(STATUS "Could not find OpenBLAS lib. Turning OpenBLAS_FOUND off") 50 | ENDIF() 51 | 52 | IF (OpenBLAS_FOUND) 53 | IF (NOT OpenBLAS_FIND_QUIETLY) 54 | MESSAGE(STATUS "Found OpenBLAS libraries: ${OpenBLAS_LIB}") 55 | MESSAGE(STATUS "Found OpenBLAS include: ${OpenBLAS_INCLUDE_DIR}") 56 | ENDIF (NOT OpenBLAS_FIND_QUIETLY) 57 | ELSE (OpenBLAS_FOUND) 58 | IF (OpenBLAS_FIND_REQUIRED) 59 | MESSAGE(FATAL_ERROR "Could not find OpenBLAS") 60 | ENDIF (OpenBLAS_FIND_REQUIRED) 61 | ENDIF (OpenBLAS_FOUND) 62 | 63 | MARK_AS_ADVANCED( 64 | OpenBLAS_INCLUDE_DIR 65 | OpenBLAS_LIB 66 | OpenBLAS 67 | ) 68 | 69 | -------------------------------------------------------------------------------- /example_makefiles/makefile.inc.Mac.brew: -------------------------------------------------------------------------------- 1 | 2 | # -*- makefile -*- 3 | 4 | # Tested on macOS Sierra (10.12.2) with llvm installed using Homebrew (https://brew.sh) 5 | # brew install llvm 6 | CC=/usr/local/opt/llvm/bin/clang++ 7 | CFLAGS=-fPIC -m64 -Wall -g -O3 -msse4 -mpopcnt -fopenmp -Wno-sign-compare -Dnullptr=NULL -I/usr/local/opt/llvm/include -std=c++11 8 | LDFLAGS=-g -fPIC -fopenmp -L/usr/local/opt/llvm/lib 9 | 10 | # common mac flags 11 | SHAREDEXT=dylib 12 | SHAREDFLAGS=-Wl,-F. 
-bundle -undefined dynamic_lookup 13 | FAISSSHAREDFLAGS=-dynamiclib 14 | 15 | # wrapldflags="" 16 | # sharedext=dylib 17 | # sharedflags="-dynamiclib" 18 | # yaelsharedflags="$sharedflags -install_name $yaelprefix/yael/libyael.dylib" 19 | 20 | ########################################################################## 21 | # Uncomment one of the 4 BLAS/Lapack implementation options 22 | # below. They are sorted # from fastest to slowest (in our 23 | # experiments). 24 | ########################################################################## 25 | 26 | # 27 | # 1. Intel MKL 28 | # 29 | # This is the fastest BLAS implementation we tested. Unfortunately it 30 | # is not open-source and determining the correct linking flags is a 31 | # nightmare. See 32 | # 33 | # https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor 34 | # 35 | # for a start on setting the link flags. On version IntelComposerXE 36 | # 2015.0.090, the following flags work 37 | # 38 | # MKLROOT=$(HOME)/fbsource/fbcode/third-party2//IntelComposerXE/2015.0.090/gcc-4.8.1-glibc-2.17/c3f970a/mkl 39 | # 40 | # BLASLDFLAGS=-Wl,--no-as-needed -L$(MKLROOT)/lib/intel64 -lmkl_intel_ilp64 \ 41 | # -lmkl_core -lmkl_gnu_thread -ldl -lpthread 42 | # 43 | # the ilp64 means that the integers are 64-bit. 44 | # 45 | # BLASLDFLAGS=-DFINTEGER=long 46 | # 47 | # you may have to set the LD_LIBRARY_PATH=$MKLROOT/lib/intel64 at runtime 48 | # 49 | 50 | # 51 | # 2. Openblas 52 | # 53 | # The library contains both BLAS and Lapack. Install with port install OpenBLAS 54 | # 55 | # BLASCFLAGS=-DFINTEGER=int 56 | # BLASLDFLAGS=/opt/local/lib/libopenblas.dylib 57 | # 58 | 59 | # 60 | # 3. Apple's framework accelerate 61 | # 62 | # This has the advantage that it does not require to install anything, 63 | # as it is provided by default on the mac. It is not very fast, though. 
64 | # 65 | 66 | BLASCFLAGS=-DFINTEGER=int 67 | BLASLDFLAGS=-framework Accelerate 68 | 69 | 70 | 71 | ########################################################################## 72 | # SWIG and Python flags 73 | ########################################################################## 74 | 75 | # SWIG executable. This should be at least version 3.x 76 | # brew install swig 77 | 78 | SWIGEXEC=/usr/local/bin/swig 79 | 80 | # The Python include directories for the current python executable can 81 | # typically be found with 82 | # 83 | # python -c "import distutils.sysconfig; print distutils.sysconfig.get_python_inc()" 84 | # python -c "import numpy ; print numpy.get_include()" 85 | # 86 | # the paths below are for the system python (not the macports one) 87 | 88 | PYTHONCFLAGS=-I/System/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 \ 89 | -I/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/include 90 | 91 | 92 | ########################################################################## 93 | # Faiss GPU 94 | ########################################################################## 95 | 96 | # As we don't have access to a Mac with nvidia GPUs installed, we 97 | # could not validate the GPU compile of Faiss. 
98 | -------------------------------------------------------------------------------- /example_makefiles/makefile.inc.Mac.port: -------------------------------------------------------------------------------- 1 | 2 | # -*- makefile -*- 3 | # tested on Mac OS X 10.12.2 Sierra with additional software installed via macports 4 | 5 | 6 | 7 | # The system default clang does not support openmp 8 | # You can install an openmp compatible g++ with macports: 9 | # port install g++-mp-6 10 | CC=/opt/local/bin/g++-mp-6 11 | 12 | CFLAGS=-fPIC -m64 -Wall -g -O3 -msse4 -mpopcnt -fopenmp -Wno-sign-compare -std=c++11 13 | LDFLAGS=-g -fPIC -fopenmp 14 | 15 | 16 | # common linux flags 17 | SHAREDEXT=dylib 18 | SHAREDFLAGS=-Wl,-F. -bundle -undefined dynamic_lookup 19 | FAISSSHAREDFLAGS=-dynamiclib 20 | 21 | # wrapldflags="" 22 | # sharedext=dylib 23 | # sharedflags="-dynamiclib" 24 | # yaelsharedflags="$sharedflags -install_name $yaelprefix/yael/libyael.dylib" 25 | 26 | ########################################################################## 27 | # Uncomment one of the 4 BLAS/Lapack implementation options 28 | # below. They are sorted # from fastest to slowest (in our 29 | # experiments). 30 | ########################################################################## 31 | 32 | # 33 | # 1. Intel MKL 34 | # 35 | # This is the fastest BLAS implementation we tested. Unfortunately it 36 | # is not open-source and determining the correct linking flags is a 37 | # nightmare. See 38 | # 39 | # https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor 40 | # 41 | # for a start on setting the link flags. 
On version IntelComposerXE 42 | # 2015.0.090, the following flags work 43 | # 44 | # MKLROOT=$(HOME)/fbsource/fbcode/third-party2//IntelComposerXE/2015.0.090/gcc-4.8.1-glibc-2.17/c3f970a/mkl 45 | # 46 | # BLASLDFLAGS=-Wl,--no-as-needed -L$(MKLROOT)/lib/intel64 -lmkl_intel_ilp64 \ 47 | # -lmkl_core -lmkl_gnu_thread -ldl -lpthread 48 | # 49 | # the ilp64 means that the integers are 64-bit. 50 | # 51 | # BLASLDFLAGS=-DFINTEGER=long 52 | # 53 | # you may have to set the LD_LIBRARY_PATH=$MKLROOT/lib/intel64 at runtime 54 | # 55 | 56 | # 57 | # 2. Openblas 58 | # 59 | # The library contains both BLAS and Lapack. Install with port install OpenBLAS 60 | # 61 | # BLASCFLAGS=-DFINTEGER=int 62 | # BLASLDFLAGS=/opt/local/lib/libopenblas.dylib 63 | # 64 | 65 | # 66 | # 3. Apple's framework accelerate 67 | # 68 | # This has the advantage that it does not require to install anything, 69 | # as it is provided by default on the mac. It is not very fast, though. 70 | # 71 | 72 | BLASCFLAGS=-DFINTEGER=int 73 | BLASLDFLAGS=-framework Accelerate 74 | 75 | 76 | 77 | ########################################################################## 78 | # SWIG and Python flags 79 | ########################################################################## 80 | 81 | # SWIG executable. 
This should be at least version 3.x 82 | # port install swig swig-python 83 | 84 | SWIGEXEC=/opt/local/bin/swig 85 | 86 | # The Python include directories for the current python executable can 87 | # typically be found with 88 | # 89 | # python -c "import distutils.sysconfig; print distutils.sysconfig.get_python_inc()" 90 | # python -c "import numpy ; print numpy.get_include()" 91 | # 92 | # the paths below are for the system python (not the macports one) 93 | 94 | PYTHONCFLAGS=-I/System/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 \ 95 | -I/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/include 96 | 97 | 98 | ########################################################################## 99 | # Faiss GPU 100 | ########################################################################## 101 | 102 | # As we don't have access to a Mac with nvidia GPUs installed, we 103 | # could not validate the GPU compile of Faiss. 104 | -------------------------------------------------------------------------------- /faiss.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * Copyright (c) 2015-present, Facebook, Inc. 4 | * All rights reserved. 5 | * 6 | * This source code is licensed under the CC-by-NC license found in the 7 | * LICENSE file in the root directory of this source tree. An additional grant 8 | * of patent rights can be found in the PATENTS file in the same directory. 9 | */ 10 | 11 | // Copyright 2004-present Facebook. All Rights Reserved 12 | // -*- c++ -*- 13 | 14 | // This is the main internal include file for Faiss. 
It defines 15 | // macros and some machine-specific functions shared across .cpp files 16 | 17 | #ifndef FAISS_h 18 | #define FAISS_h 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #ifndef __SSE2__ 25 | #error "SSE optimized distance computations not set" 26 | #endif 27 | 28 | 29 | 30 | 31 | #ifdef _OPENMP 32 | #include 33 | #define SET_NT(ntlim) \ 34 | size_t nt = omp_get_max_threads(); \ 35 | if (nt > ntlim) nt = ntlim; 36 | #else 37 | #warning "OpenMP is NOT activated" 38 | #define SET_NT(ntlim) size_t nt = 0; nt++; 39 | #endif 40 | 41 | /* This is to prevent warning by the linter (FINTEGER is defined externally) */ 42 | #ifndef FINTEGER 43 | #define FINTEGER long 44 | #endif 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /filehelper.h: -------------------------------------------------------------------------------- 1 | #ifndef FILEHELPER_HPP 2 | #define FILEHELPER_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace faiss { 9 | /* read Header from fvecs */ 10 | template 11 | T* readJegou(const char *path, uint &n, uint &d) ; 12 | 13 | template 14 | void readJegouHeader(const char *path, uint &n, uint &d); 15 | 16 | template<> 17 | uint8_t* readJegou(const char *path, uint &n, uint &d); 18 | 19 | 20 | uint8_t* readBatchJegou(const char *path, uint start_pos, uint num); 21 | 22 | template<> 23 | void readJegouHeader(const char *path, uint &n, uint &d); 24 | 25 | 26 | template 27 | void write(std::string fs, size_t num, uint dim, T *ptr, size_t len, size_t offset = 0) ; 28 | 29 | 30 | 31 | 32 | template 33 | void read(std::string fs, size_t &num, uint &dim, T *ptr, size_t len, size_t offset = 0); 34 | 35 | void header(std::string fs, uint &num, uint &dim) ; 36 | 37 | void writeFloat(std::string _fn, size_t _dim, size_t _num, float* _x, size_t _offset); 38 | void writeInt(std::string _fn, size_t _dim, size_t _num, int* _x, size_t _offset); 39 | 40 | float* readFloat(const char* _fn, 
size_t _dim, size_t _num, size_t _offset) ; 41 | 42 | float* readUint8(const char* _fn, size_t _dim, size_t _num, size_t _offset) ; 43 | 44 | int* readInt(const char* _fn, size_t _dim, size_t _num, size_t _offset); 45 | #endif 46 | } 47 | -------------------------------------------------------------------------------- /gpu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # specify header and cpp files 2 | file(GLOB_RECURSE faiss_gpu_headers ${CMAKE_CURRENT_SOURCE_DIR}/*.h) 3 | file(GLOB_RECURSE faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) 4 | file(GLOB_RECURSE faiss_gpu_cuh ${CMAKE_CURRENT_SOURCE_DIR}/*.cuh) 5 | file(GLOB_RECURSE faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/*.cu) 6 | 7 | set(faiss_lib_gpu gpufaiss) 8 | 9 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/demo_ivfpq_indexing_gpu.cpp) 10 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/TestGpuIndexFlat.cpp) 11 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/TestGpuIndexIVFFlat.cpp) 12 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/TestGpuIndexIVFPQ.cpp) 13 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/TestUtils.cpp) 14 | list(REMOVE_ITEM faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/test/TestGpuSelect.cu) 15 | list(REMOVE_ITEM faiss_gpu_headers ${CMAKE_CURRENT_SOURCE_DIR}/test/TestUtils.h) 16 | list(REMOVE_ITEM faiss_gpu_headers ${CMAKE_CURRENT_SOURCE_DIR}/perf/IndexWrapper.h) 17 | list(REMOVE_ITEM faiss_gpu_headers ${CMAKE_CURRENT_SOURCE_DIR}/perf/IndexWrapper-inl.h) 18 | list(REMOVE_ITEM faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/perf/CompareFlat.cu) 19 | list(REMOVE_ITEM faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/perf/CompareIVFFlat.cu) 20 | list(REMOVE_ITEM faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/perf/CompareIVFPQ.cu) 21 | list(REMOVE_ITEM faiss_gpu_cu ${CMAKE_CURRENT_SOURCE_DIR}/perf/CompareIVFPQGrid.cu) 22 | list(REMOVE_ITEM faiss_gpu_cu 
${CMAKE_CURRENT_SOURCE_DIR}/perf/PerfSelect.cu) 23 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/perf/PerfClustering.cpp) 24 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/perf/PerfIVFPQAdd.cpp) 25 | list(REMOVE_ITEM faiss_gpu_cpp ${CMAKE_CURRENT_SOURCE_DIR}/perf/WriteIndex.cpp) 26 | 27 | cuda_add_library(${faiss_lib_gpu} STATIC ${faiss_gpu_headers} ${faiss_gpu_cpp} ${faiss_gpu_cuh} ${faiss_gpu_cu}) 28 | add_subdirectory(test) 29 | -------------------------------------------------------------------------------- /gpu/GpuAutoTune.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #pragma once 11 | 12 | #include "../Index.h" 13 | #include "../AutoTune.h" 14 | #include "GpuClonerOptions.h" 15 | #include "GpuIndex.h" 16 | #include "GpuIndicesOptions.h" 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | class GpuResources; 21 | 22 | // to support auto-tuning we need cloning to/from CPU 23 | 24 | /// converts any GPU index inside gpu_index to a CPU index 25 | faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index); 26 | 27 | /// converts any CPU index that can be converted to GPU 28 | faiss::Index * index_cpu_to_gpu( 29 | GpuResources* resources, int device, 30 | const faiss::Index *index, 31 | const GpuClonerOptions *options = nullptr); 32 | 33 | faiss::Index * index_cpu_to_gpu_multiple( 34 | std::vector & resources, 35 | std::vector &devices, 36 | const faiss::Index *index, 37 | const GpuMultipleClonerOptions *options = nullptr); 38 | 39 | /// parameter space and setters for GPU indexes 40 | struct GpuParameterSpace: faiss::ParameterSpace { 41 | /// initialize with reasonable parameters for the index 
42 | void initialize (const faiss::Index * index) override; 43 | 44 | /// set a combination of parameters on an index 45 | void set_index_parameter ( 46 | faiss::Index * index, const std::string & name, 47 | double val) const override; 48 | }; 49 | 50 | } } // namespace 51 | -------------------------------------------------------------------------------- /gpu/GpuClonerOptions.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "GpuClonerOptions.h" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | GpuClonerOptions::GpuClonerOptions() 15 | : indicesOptions(INDICES_64_BIT), 16 | useFloat16CoarseQuantizer(false), 17 | useFloat16(false), 18 | usePrecomputed(true), 19 | reserveVecs(0), 20 | storeTransposed(false), 21 | verbose(false) { 22 | } 23 | 24 | GpuMultipleClonerOptions::GpuMultipleClonerOptions() 25 | : shard(false) { 26 | } 27 | 28 | } } // namespace 29 | -------------------------------------------------------------------------------- /gpu/GpuClonerOptions.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #pragma once 11 | 12 | #include "GpuIndicesOptions.h" 13 | 14 | namespace faiss { namespace gpu { 15 | 16 | /// set some options on how to copy to GPU 17 | struct GpuClonerOptions { 18 | GpuClonerOptions(); 19 | 20 | /// how should indices be stored on index types that support indices 21 | /// (anything but GpuIndexFlat*)? 22 | IndicesOptions indicesOptions; 23 | 24 | /// is the coarse quantizer in float16? 25 | bool useFloat16CoarseQuantizer; 26 | 27 | /// for GpuIndexIVFFlat, is storage in float16? 28 | /// for GpuIndexIVFPQ, are intermediate calculations in float16? 29 | bool useFloat16; 30 | 31 | /// use precomputed tables? 32 | bool usePrecomputed; 33 | 34 | /// reserve vectors in the invfiles? 35 | long reserveVecs; 36 | 37 | /// For GpuIndexFlat, store data in transposed layout? 38 | bool storeTransposed; 39 | 40 | /// Set verbose options on the index 41 | bool verbose; 42 | }; 43 | 44 | struct GpuMultipleClonerOptions : public GpuClonerOptions { 45 | GpuMultipleClonerOptions (); 46 | 47 | /// Whether to shard the index across GPUs, versus replication 48 | /// across GPUs 49 | bool shard; 50 | }; 51 | 52 | } } // namespace 53 | -------------------------------------------------------------------------------- /gpu/GpuIndex.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../Index.h" 14 | #include "utils/MemorySpace.h" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | class GpuResources; 19 | 20 | struct GpuIndexConfig { 21 | inline GpuIndexConfig() 22 | : device(0), 23 | memorySpace(MemorySpace::Device) { 24 | } 25 | 26 | /// GPU device on which the index is resident 27 | int device; 28 | 29 | /// What memory space to use for primary storae. 30 | /// On Pascal and above (CC 6+) architectures, allows GPUs to use 31 | /// more memory than is available on the GPU. 32 | MemorySpace memorySpace; 33 | }; 34 | 35 | class GpuIndex : public faiss::Index { 36 | public: 37 | GpuIndex(GpuResources* resources, 38 | int dims, 39 | faiss::MetricType metric, 40 | GpuIndexConfig config); 41 | 42 | int getDevice() const { 43 | return device_; 44 | } 45 | 46 | GpuResources* getResources() { 47 | return resources_; 48 | } 49 | 50 | /// `x` can be resident on the CPU or any GPU; copies are performed 51 | /// as needed 52 | /// Handles paged adds if the add set is too large; calls addInternal_ 53 | void add(faiss::Index::idx_t, const float* x) override; 54 | 55 | /// `x` and `ids` can be resident on the CPU or any GPU; copies are 56 | /// performed as needed 57 | /// Handles paged adds if the add set is too large; calls addInternal_ 58 | void add_with_ids(Index::idx_t n, const float* x, const Index::idx_t* ids) 59 | override; 60 | 61 | /// `x`, `distances` and `labels` can be resident on the CPU or any 62 | /// GPU; copies are performed as needed 63 | void search( 64 | faiss::Index::idx_t n, 65 | const float* x, 66 | faiss::Index::idx_t k, 67 | float* distances, 68 | faiss::Index::idx_t* labels) const override; 69 | 70 | 71 | 72 | protected: 73 | /// Handles paged adds if the add set is too large, passes to 74 | /// addImpl_ to actually perform the add for the current page 75 | void addInternal_(Index::idx_t n, 76 | const float* x, 77 | const Index::idx_t* ids); 78 | 79 | /// Overridden to actually perform 
the add 80 | virtual void addImpl_(Index::idx_t n, 81 | const float* x, 82 | const Index::idx_t* ids) = 0; 83 | 84 | /// Overridden to actually perform the search 85 | virtual void searchImpl_(faiss::Index::idx_t n, 86 | const float* x, 87 | faiss::Index::idx_t k, 88 | float* distances, 89 | faiss::Index::idx_t* labels) const = 0; 90 | 91 | protected: 92 | /// Manages streans, cuBLAS handles and scratch memory for devices 93 | GpuResources* resources_; 94 | 95 | /// The GPU device we are resident on 96 | const int device_; 97 | 98 | /// The memory space of our primary storage on the GPU 99 | const MemorySpace memorySpace_; 100 | }; 101 | 102 | } } // namespace 103 | -------------------------------------------------------------------------------- /gpu/GpuIndexIVF.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "GpuIndex.h" 14 | #include "GpuIndexFlat.h" 15 | #include "GpuIndicesOptions.h" 16 | #include "../Clustering.h" 17 | 18 | namespace faiss { struct IndexIVF; } 19 | 20 | namespace faiss { namespace gpu { 21 | 22 | class GpuIndexFlat; 23 | class GpuResources; 24 | 25 | struct GpuIndexIVFConfig : public GpuIndexConfig { 26 | inline GpuIndexIVFConfig() 27 | : indicesOptions(INDICES_64_BIT) { 28 | } 29 | 30 | /// Index storage options for the GPU 31 | IndicesOptions indicesOptions; 32 | 33 | /// Configuration for the coarse quantizer object 34 | GpuIndexFlatConfig flatConfig; 35 | }; 36 | 37 | class GpuIndexIVF : public GpuIndex { 38 | public: 39 | GpuIndexIVF(GpuResources* resources, 40 | int dims, 41 | faiss::MetricType metric, 42 | int nlist, 43 | GpuIndexIVFConfig config = GpuIndexIVFConfig()); 44 | 45 | ~GpuIndexIVF() override; 46 | 47 | private: 48 | /// Shared initialization functions 49 | void init_(); 50 | 51 | public: 52 | /// Copy what we need from the CPU equivalent 53 | void copyFrom(const faiss::IndexIVF* index); 54 | 55 | /// Copy what we have to the CPU equivalent 56 | void copyTo(faiss::IndexIVF* index) const; 57 | 58 | /// Returns the number of inverted lists we're managing 59 | int getNumLists() const; 60 | 61 | /// Return the quantizer we're using 62 | GpuIndexFlat* getQuantizer(); 63 | 64 | /// Sets the number of list probes per query 65 | void setNumProbes(int nprobe); 66 | 67 | /// Returns our current number of list probes per query 68 | int getNumProbes() const; 69 | 70 | /// `x` can be resident on the CPU or any GPU; the proper copies are 71 | /// performed 72 | /// Forwards to add_with_ids; assigns IDs as needed 73 | /// FIXME: remove override for C++03 compatibility 74 | void add(Index::idx_t n, const float* x) override; 75 | 76 | protected: 77 | void trainQuantizer_(faiss::Index::idx_t n, const float* x); 78 | 79 | protected: 80 | GpuIndexIVFConfig ivfConfig_; 81 | 82 | /// Number of inverted lists 
that we manage 83 | int nlist_; 84 | 85 | /// Number of inverted list probes per query 86 | int nprobe_; 87 | 88 | /// Ability to override default clustering parameters 89 | ClusteringParameters cp_; 90 | 91 | /// Quantizer for inverted lists 92 | GpuIndexFlat* quantizer_; 93 | }; 94 | 95 | } } // namespace 96 | -------------------------------------------------------------------------------- /gpu/GpuIndexIVFFlat.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "GpuIndexIVF.h" 14 | 15 | namespace faiss { struct IndexIVFFlat; } 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | class IVFFlat; 20 | class GpuIndexFlat; 21 | 22 | struct GpuIndexIVFFlatConfig : public GpuIndexIVFConfig { 23 | inline GpuIndexIVFFlatConfig() 24 | : useFloat16IVFStorage(false) { 25 | } 26 | 27 | /// Whether or not IVFFlat inverted list storage is in float16; 28 | /// supported on all architectures 29 | bool useFloat16IVFStorage; 30 | }; 31 | 32 | /// Wrapper around the GPU implementation that looks like 33 | /// faiss::IndexIVFFlat 34 | class GpuIndexIVFFlat : public GpuIndexIVF { 35 | public: 36 | /// Construct from a pre-existing faiss::IndexIVFFlat instance, copying 37 | /// data over to the given GPU, if the input index is trained. 38 | GpuIndexIVFFlat(GpuResources* resources, 39 | const faiss::IndexIVFFlat* index, 40 | GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); 41 | 42 | /// Constructs a new instance with an empty flat quantizer; the user 43 | /// provides the number of lists desired. 
44 | GpuIndexIVFFlat(GpuResources* resources, 45 | int dims, 46 | int nlist, 47 | faiss::MetricType metric, 48 | GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); 49 | 50 | ~GpuIndexIVFFlat() override; 51 | 52 | /// Reserve GPU memory in our inverted lists for this number of vectors 53 | void reserveMemory(size_t numVecs); 54 | 55 | /// Initialize ourselves from the given CPU index; will overwrite 56 | /// all data in ourselves 57 | void copyFrom(const faiss::IndexIVFFlat* index); 58 | 59 | /// Copy ourselves to the given CPU index; will overwrite all data 60 | /// in the index instance 61 | void copyTo(faiss::IndexIVFFlat* index) const; 62 | 63 | /// After adding vectors, one can call this to reclaim device memory 64 | /// to exactly the amount needed. Returns space reclaimed in bytes 65 | size_t reclaimMemory(); 66 | 67 | void reset() override; 68 | 69 | void train(Index::idx_t n, const float* x) override; 70 | 71 | protected: 72 | /// Called from GpuIndex for add/add_with_ids 73 | void addImpl_( 74 | faiss::Index::idx_t n, 75 | const float* x, 76 | const faiss::Index::idx_t* ids) override; 77 | 78 | /// Called from GpuIndex for search 79 | void searchImpl_( 80 | faiss::Index::idx_t n, 81 | const float* x, 82 | faiss::Index::idx_t k, 83 | float* distances, 84 | faiss::Index::idx_t* labels) const override; 85 | private: 86 | GpuIndexIVFFlatConfig ivfFlatConfig_; 87 | 88 | /// Desired inverted list memory reservation 89 | size_t reserveMemoryVecs_; 90 | 91 | /// Instance that we own; contains the inverted list 92 | IVFFlat* index_; 93 | }; 94 | 95 | } } // namespace 96 | -------------------------------------------------------------------------------- /gpu/GpuIndicesOptions.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | namespace faiss { namespace gpu { 14 | 15 | /// How user vector index data is stored on the GPU 16 | enum IndicesOptions { 17 | /// The user indices are only stored on the CPU; the GPU returns 18 | /// (inverted list, offset) to the CPU which is then translated to 19 | /// the real user index. 20 | INDICES_CPU = 0, 21 | /// The indices are not stored at all, on either the CPU or 22 | /// GPU. Only (inverted list, offset) is returned to the user as the 23 | /// index. 24 | INDICES_IVF = 1, 25 | /// Indices are stored as 32 bit integers on the GPU, but returned 26 | /// as 64 bit integers 27 | INDICES_32_BIT = 2, 28 | /// Indices are stored as 64 bit integers on the GPU 29 | INDICES_64_BIT = 3, 30 | }; 31 | 32 | } } // namespace 33 | -------------------------------------------------------------------------------- /gpu/GpuResources.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "GpuResources.h" 12 | #include "utils/DeviceUtils.h" 13 | 14 | namespace faiss { namespace gpu { 15 | 16 | GpuResources::~GpuResources() { 17 | } 18 | 19 | cublasHandle_t 20 | GpuResources::getBlasHandleCurrentDevice() { 21 | return getBlasHandle(getCurrentDevice()); 22 | } 23 | 24 | cudaStream_t 25 | GpuResources::getDefaultStreamCurrentDevice() { 26 | return getDefaultStream(getCurrentDevice()); 27 | } 28 | 29 | std::vector 30 | GpuResources::getAlternateStreamsCurrentDevice() { 31 | return getAlternateStreams(getCurrentDevice()); 32 | } 33 | 34 | DeviceMemory& 35 | GpuResources::getMemoryManagerCurrentDevice() { 36 | return getMemoryManager(getCurrentDevice()); 37 | } 38 | 39 | cudaStream_t 40 | GpuResources::getAsyncCopyStreamCurrentDevice() { 41 | return getAsyncCopyStream(getCurrentDevice()); 42 | } 43 | 44 | } } // namespace 45 | -------------------------------------------------------------------------------- /gpu/GpuResources.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "utils/DeviceMemory.h" 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | namespace faiss { namespace gpu { 20 | 21 | /// Base class of GPU-side resource provider; hides provision of 22 | /// cuBLAS handles, CUDA streams and a temporary memory manager 23 | class GpuResources { 24 | public: 25 | virtual ~GpuResources(); 26 | 27 | /// Call to pre-allocate resources for a particular device. 
If this is 28 | /// not called, then resources will be allocated at the first time 29 | /// of demand 30 | virtual void initializeForDevice(int device) = 0; 31 | 32 | virtual cublasHandle_t getBlasHandle(int device) = 0; 33 | 34 | virtual cudaStream_t getDefaultStream(int device) = 0; 35 | 36 | virtual std::vector getAlternateStreams(int device) = 0; 37 | 38 | virtual DeviceMemory& getMemoryManager(int device) = 0; 39 | 40 | virtual std::pair getPinnedMemory() = 0; 41 | 42 | virtual cudaStream_t getAsyncCopyStream(int device) = 0; 43 | 44 | cublasHandle_t getBlasHandleCurrentDevice(); 45 | 46 | cudaStream_t getDefaultStreamCurrentDevice(); 47 | 48 | std::vector getAlternateStreamsCurrentDevice(); 49 | 50 | DeviceMemory& getMemoryManagerCurrentDevice(); 51 | 52 | cudaStream_t getAsyncCopyStreamCurrentDevice(); 53 | }; 54 | 55 | } } // namespace 56 | -------------------------------------------------------------------------------- /gpu/IndexProxy.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../Index.h" 14 | #include "utils/WorkerThread.h" 15 | #include 16 | #include 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | /// Takes individual faiss::Index instances, and splits queries for 21 | /// sending to each Index instance, and joins the results together 22 | /// when done. 23 | /// Each index is managed by a separate CPU thread. 24 | class IndexProxy : public faiss::Index { 25 | public: 26 | IndexProxy(); 27 | ~IndexProxy() override; 28 | 29 | /// Adds an index that is managed by ourselves. 
30 | /// WARNING: once an index is added to this proxy, it becomes unsafe 31 | /// to touch it from any other thread than that on which is managing 32 | /// it, until we are shut down. Use runOnIndex to perform work on it 33 | /// instead. 34 | void addIndex(faiss::Index* index); 35 | 36 | /// Remove an index that is managed by ourselves. 37 | /// This will flush all pending work on that index, and then shut 38 | /// down its managing thread, and will remove the index. 39 | void removeIndex(faiss::Index* index); 40 | 41 | /// Run a function on all indices, in the thread that the index is 42 | /// managed in. 43 | void runOnIndex(std::function f); 44 | 45 | /// faiss::Index API 46 | /// All indices receive the same call 47 | void reset() override; 48 | 49 | /// faiss::Index API 50 | /// All indices receive the same call 51 | void train(Index::idx_t n, const float* x) override; 52 | 53 | /// faiss::Index API 54 | /// All indices receive the same call 55 | void add(Index::idx_t n, const float* x) override; 56 | 57 | /// faiss::Index API 58 | /// Query is partitioned into a slice for each sub-index 59 | /// split by ceil(n / #indices) for our sub-indices 60 | void search(faiss::Index::idx_t n, 61 | const float* x, 62 | faiss::Index::idx_t k, 63 | float* distances, 64 | faiss::Index::idx_t* labels) const override; 65 | 66 | /// reconstructs from the first index 67 | void reconstruct(idx_t, float *v) const override; 68 | 69 | bool own_fields; 70 | 71 | int count() const {return indices_.size(); } 72 | 73 | faiss::Index* at(int i) {return indices_[i].first; } 74 | const faiss::Index* at(int i) const {return indices_[i].first; } 75 | 76 | 77 | private: 78 | /// Collection of Index instances, with their managing worker thread 79 | mutable std::vector > > indices_; 81 | }; 82 | 83 | 84 | 85 | /** Clustering on GPU (is here because uses Proxy with ngpu > 1 86 | * 87 | * @param ngpu nb of GPUs to use 88 | * @param d dimension of the data 89 | * @param n nb of training vectors 
90 | * @param k nb of output centroids 91 | * @param x training set (size n * d) 92 | * @param centroids output centroids (size k * d) 93 | * @return final quantization error 94 | */ 95 | float kmeans_clustering_gpu (int ngpu, size_t d, size_t n, size_t k, 96 | const float *x, 97 | float *centroids, 98 | bool useFloat16, 99 | bool storeTransposed); 100 | 101 | 102 | 103 | } } // namespace 104 | -------------------------------------------------------------------------------- /gpu/StandardGpuResources.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "GpuResources.h" 14 | #include "utils/StackDeviceMemory.h" 15 | #include "utils/DeviceUtils.h" 16 | #include 17 | #include 18 | 19 | namespace faiss { namespace gpu { 20 | 21 | /// Default implementation of GpuResources that allocates a cuBLAS 22 | /// stream and 2 streams for use, as well as temporary memory 23 | class StandardGpuResources : public GpuResources { 24 | public: 25 | StandardGpuResources(); 26 | 27 | ~StandardGpuResources() override; 28 | 29 | /// Disable allocation of temporary memory; all temporary memory 30 | /// requests will call cudaMalloc / cudaFree at the point of use 31 | void noTempMemory(); 32 | 33 | /// Specify that we wish to use a certain fixed size of memory on 34 | /// all devices as temporary memory 35 | void setTempMemory(size_t size); 36 | 37 | /// Specify that we wish to use a certain fraction of memory on 38 | /// all devices as temporary memory 39 | void setTempMemoryFraction(float fraction); 40 | 41 | /// Set amount of pinned memory to allocate, for async GPU <-> CPU 42 | /// transfers 43 | void 
setPinnedMemory(size_t size); 44 | 45 | public: 46 | /// Internal system calls 47 | void initializeForDevice(int device) override; 48 | 49 | cublasHandle_t getBlasHandle(int device) override; 50 | 51 | cudaStream_t getDefaultStream(int device) override; 52 | 53 | std::vector getAlternateStreams(int device) override; 54 | 55 | DeviceMemory& getMemoryManager(int device) override; 56 | 57 | std::pair getPinnedMemory() override; 58 | 59 | cudaStream_t getAsyncCopyStream(int device) override; 60 | 61 | private: 62 | /// Our default stream that work is ordered on, one per each device 63 | std::unordered_map defaultStreams_; 64 | 65 | /// Other streams we can use, per each device 66 | std::unordered_map > alternateStreams_; 67 | 68 | /// Async copy stream to use for GPU <-> CPU pinned memory copies 69 | std::unordered_map asyncCopyStreams_; 70 | 71 | /// cuBLAS handle for each device 72 | std::unordered_map blasHandles_; 73 | 74 | /// Temporary memory provider, per each device 75 | std::unordered_map > memory_; 76 | 77 | /// Pinned memory allocation for use with this GPU 78 | void* pinnedMemAlloc_; 79 | size_t pinnedMemAllocSize_; 80 | 81 | /// By default, we reserve this fraction of memory on all devices 82 | float tempMemFraction_; 83 | 84 | /// Another option is to use a specified amount of memory on all 85 | /// devices 86 | size_t tempMemSize_; 87 | 88 | /// Whether we look at tempMemFraction_ or tempMemSize_ 89 | bool useFraction_; 90 | 91 | /// Amount of pinned memory we should allocate 92 | size_t pinnedMemSize_; 93 | }; 94 | 95 | } } // namespace 96 | -------------------------------------------------------------------------------- /gpu/impl/BroadcastSum.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../utils/Float16.cuh" 14 | #include "../utils/Tensor.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | // output[x][i] += input[i] for all x 19 | void runSumAlongColumns(Tensor& input, 20 | Tensor& output, 21 | cudaStream_t stream); 22 | 23 | #ifdef FAISS_USE_FLOAT16 24 | void runSumAlongColumns(Tensor& input, 25 | Tensor& output, 26 | cudaStream_t stream); 27 | #endif 28 | 29 | // output[x][i] = input[i] for all x 30 | void runAssignAlongColumns(Tensor& input, 31 | Tensor& output, 32 | cudaStream_t stream); 33 | 34 | #ifdef FAISS_USE_FLOAT16 35 | void runAssignAlongColumns(Tensor& input, 36 | Tensor& output, 37 | cudaStream_t stream); 38 | #endif 39 | 40 | // output[i][x] += input[i] for all x 41 | void runSumAlongRows(Tensor& input, 42 | Tensor& output, 43 | cudaStream_t stream); 44 | void runSumAlongRowsWithGraph(Tensor& outIndexView, 45 | Tensor& graphIndices, 46 | Tensor& productDistances, 47 | Tensor& outGraphDistances, 48 | cudaStream_t stream); 49 | void runSumAlongColumnsGraph1(Tensor& input, 50 | Tensor& output, 51 | cudaStream_t stream); 52 | void runL2SelectMinGraph(Tensor& graphDistancesBuf, Tensor& outIndexView, 53 | Tensor& graphIndices, 54 | Tensor& graphDists, 55 | Tensor& productDistances, 56 | Tensor& outGraphDistances, 57 | Tensor& outDistances2nd, 58 | Tensor& outIndices2nd,int k,int begin ,int end, 59 | cudaStream_t stream); 60 | #ifdef FAISS_USE_FLOAT16 61 | void runSumAlongRows(Tensor& input, 62 | Tensor& output, 63 | cudaStream_t stream); 64 | void runSumAlongRowsWithGraph(Tensor& outIndexView, 65 | Tensor& graphIndices, 66 | Tensor& productDistances, 67 | Tensor& outGraphDistances, 68 | cudaStream_t stream); 69 | #endif 70 | 71 | } } // namespace 72 | -------------------------------------------------------------------------------- /gpu/impl/IVFFlat.cuh: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "IVFBase.cuh" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | class IVFFlat : public IVFBase { 18 | public: 19 | /// Construct from a quantizer that has elemen 20 | IVFFlat(GpuResources* resources, 21 | /// We do not own this reference 22 | FlatIndex* quantizer, 23 | bool l2Distance, 24 | bool useFloat16, 25 | IndicesOptions indicesOptions, 26 | MemorySpace space); 27 | 28 | ~IVFFlat() override; 29 | 30 | /// Add vectors to a specific list; the input data can be on the 31 | /// host or on our current device 32 | void addCodeVectorsFromCpu(int listId, 33 | const float* vecs, 34 | const long* indices, 35 | size_t numVecs); 36 | 37 | /// Adds the given vectors to this index. 38 | /// The input data must be on our current device. 39 | /// Returns the number of vectors successfully added. Vectors may 40 | /// not be able to be added because they contain NaNs. 41 | int classifyAndAddVectors(Tensor& vecs, 42 | Tensor& indices); 43 | 44 | /// Find the approximate k nearest neigbors for `queries` against 45 | /// our database 46 | void query(Tensor& queries, 47 | int nprobe, 48 | int k, 49 | Tensor& outDistances, 50 | Tensor& outIndices); 51 | 52 | /// Return the vectors of a particular list back to the CPU 53 | std::vector getListVectors(int listId) const; 54 | 55 | private: 56 | /// Returns the size of our stored vectors, in bytes 57 | size_t getVectorMemorySize() const; 58 | 59 | private: 60 | /// Calculating L2 distance or inner product? 61 | const bool l2Distance_; 62 | 63 | /// Do we store data internally as float16 (versus float32)? 
64 | const bool useFloat16_; 65 | }; 66 | 67 | } } // namespace 68 | -------------------------------------------------------------------------------- /gpu/impl/IVFFlatScan.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../GpuIndicesOptions.h" 14 | #include "../utils/Tensor.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | class GpuResources; 20 | 21 | void runIVFFlatScan(Tensor& queries, 22 | Tensor& listIds, 23 | thrust::device_vector& listData, 24 | thrust::device_vector& listIndices, 25 | IndicesOptions indicesOptions, 26 | thrust::device_vector& listLengths, 27 | int maxListLength, 28 | int k, 29 | bool l2Distance, 30 | bool useFloat16, 31 | // output 32 | Tensor& outDistances, 33 | // output 34 | Tensor& outIndices, 35 | GpuResources* res); 36 | 37 | } } // namespace 38 | -------------------------------------------------------------------------------- /gpu/impl/InvertedListAppend.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../GpuIndicesOptions.h" 14 | #include "../utils/Tensor.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | /// Update device-side list pointers in a batch 20 | void runUpdateListPointers(Tensor& listIds, 21 | Tensor& newListLength, 22 | Tensor& newCodePointers, 23 | Tensor& newIndexPointers, 24 | Tensor& newLambdaPointers, 25 | Tensor& newConstPointers, 26 | thrust::device_vector& listLengths, 27 | thrust::device_vector& listCodes, 28 | thrust::device_vector& listIndices, 29 | thrust::device_vector& listLambdas, 30 | thrust::device_vector& listConsts, 31 | cudaStream_t stream); 32 | 33 | /// Actually append the new codes / vector indices to the individual lists 34 | 35 | /// IVFPQ 36 | void runIVFPQInvertedListAppend(Tensor& listIds, 37 | Tensor& listOffset, 38 | Tensor& encodings, 39 | Tensor& indices, 40 | thrust::device_vector& listCodes, 41 | thrust::device_vector& listIndices, 42 | IndicesOptions indicesOptions, 43 | cudaStream_t stream); 44 | 45 | 46 | /// IVFPQ 47 | void runIVFPQInvertedListAppend(Tensor& listIds, 48 | Tensor& listOffset, 49 | Tensor& encodings, 50 | Tensor& indices, 51 | Tensor& lambdas, 52 | Tensor& consts, 53 | thrust::device_vector& listCodes, 54 | thrust::device_vector& listIndices, 55 | thrust::device_vector& listLambdas, 56 | thrust::device_vector& listConsts, 57 | IndicesOptions indicesOptions, 58 | cudaStream_t stream); 59 | 60 | /// IVF flat storage 61 | void runIVFFlatInvertedListAppend(Tensor& listIds, 62 | Tensor& listOffset, 63 | Tensor& vecs, 64 | Tensor& indices, 65 | bool useFloat16, 66 | thrust::device_vector& listData, 67 | thrust::device_vector& listIndices, 68 | IndicesOptions indicesOptions, 69 | cudaStream_t stream); 70 | 71 | } } // namespace 72 | -------------------------------------------------------------------------------- /gpu/impl/L2Norm.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 
2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../utils/Float16.cuh" 14 | #include "../utils/Tensor.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | void runL2Norm(Tensor& input, 19 | Tensor& output, 20 | bool normSquared, 21 | cudaStream_t stream); 22 | 23 | #ifdef FAISS_USE_FLOAT16 24 | void runL2Norm(Tensor& input, 25 | Tensor& output, 26 | bool normSquared, 27 | cudaStream_t stream); 28 | #endif 29 | 30 | } } // namespace 31 | -------------------------------------------------------------------------------- /gpu/impl/L2Select.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../utils/Float16.cuh" 14 | #include "../utils/Tensor.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | void runL2SelectMin(Tensor& productDistances, 19 | Tensor& centroidDistances, 20 | Tensor& outDistances, 21 | Tensor& outIndices, 22 | int k, 23 | cudaStream_t stream); 24 | 25 | #ifdef FAISS_USE_FLOAT16 26 | void runL2SelectMin(Tensor& productDistances, 27 | Tensor& centroidDistances, 28 | Tensor& outDistances, 29 | Tensor& outIndices, 30 | int k, 31 | cudaStream_t stream); 32 | #endif 33 | 34 | } } // namespace 35 | -------------------------------------------------------------------------------- /gpu/impl/PQCodeDistances.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../utils/Tensor.cuh" 14 | #include "../utils/NoTypeTensor.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | class DeviceMemory; 20 | 21 | /// pqCentroids is of the form (sub q)(sub dim)(code id) 22 | /// Calculates the distance from the (query - centroid) residual to 23 | /// each sub-code vector, for the given list of query results in 24 | /// topQueryToCentroid 25 | void runPQCodeDistances(Tensor& pqCentroids, 26 | Tensor& queries, 27 | Tensor& coarseCentroids, 28 | Tensor& topQueryToCentroid, 29 | NoTypeTensor<4, true>& outCodeDistances, 30 | bool useFloat16Lookup, 31 | cudaStream_t stream); 32 | 33 | void runPQCodeDistancesMM(Tensor& pqCentroids, 34 | Tensor& queries, 35 | Tensor& coarseCentroids, 36 | Tensor& topQueryToCentroid, 37 | NoTypeTensor<4, true>& outCodeDistances, 38 | bool useFloat16Lookup, 39 | DeviceMemory& mem, 40 | cublasHandle_t handle, 41 | cudaStream_t stream); 42 | 43 | } } // namespace 44 | -------------------------------------------------------------------------------- /gpu/impl/PQScanMultiPassNoPrecomputed.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../GpuIndicesOptions.h" 14 | #include "../utils/Tensor.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | class GpuResources; 20 | 21 | /// For no precomputed codes, is this a supported number of dimensions 22 | /// per subquantizer? 
23 | bool isSupportedNoPrecomputedSubDimSize(int dims); 24 | 25 | void runPQScanMultiPassNoPrecomputed(Tensor& queries, 26 | Tensor& centroids, 27 | Tensor& pqCentroidsInnermostCode, 28 | Tensor& topQueryToCentroid, 29 | bool useFloat16Lookup, 30 | int bytesPerCode, 31 | int numSubQuantizers, 32 | int numSubQuantizerCodes, 33 | thrust::device_vector& listCodes, 34 | thrust::device_vector& listIndices, 35 | IndicesOptions indicesOptions, 36 | thrust::device_vector& listLengths, 37 | int maxListLength, 38 | int k, 39 | // output 40 | Tensor& outDistances, 41 | // output 42 | Tensor& outIndices, 43 | GpuResources* res); 44 | 45 | } } // namespace 46 | -------------------------------------------------------------------------------- /gpu/impl/RemapIndices.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #include "RemapIndices.h" 12 | #include "../../FaissAssert.h" 13 | #include 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | // Utility function to translate (list id, offset) to a user index on 18 | // the CPU. 
In a cpp in order to use OpenMP 19 | void ivfOffsetToUserIndex( 20 | long* indices, 21 | int numLists, 22 | int queries, 23 | int k, 24 | const std::vector>& listOffsetToUserIndex) { 25 | FAISS_ASSERT(numLists == listOffsetToUserIndex.size()); 26 | 27 | #pragma omp parallel for 28 | for (int q = 0; q < queries; ++q) { 29 | for (int r = 0; r < k; ++r) { 30 | long offsetIndex = indices[q * k + r]; 31 | 32 | if (offsetIndex < 0) continue; 33 | 34 | int listId = (int) (offsetIndex >> 32); 35 | int listOffset = (int) (offsetIndex & 0xffffffff); 36 | 37 | FAISS_ASSERT(listId < numLists); 38 | auto& listIndices = listOffsetToUserIndex[listId]; 39 | //if(listOffset >= listIndices.size()) 40 | // std::cout << "-----listId: " << listId << " listOffset: " << listOffset<< " length: " << listIndices.size()< 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | /// Utility function to translate (list id, offset) to a user index on 18 | /// the CPU. In a cpp in order to use OpenMP. 19 | void ivfOffsetToUserIndex( 20 | long* indices, 21 | int numLists, 22 | int queries, 23 | int k, 24 | const std::vector>& listOffsetToUserIndex); 25 | 26 | } } // namespace 27 | -------------------------------------------------------------------------------- /gpu/impl/VectorResidual.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "VectorResidual.cuh" 11 | #include "../../FaissAssert.h" 12 | #include "../utils/ConversionOperators.cuh" 13 | #include "../utils/DeviceUtils.h" 14 | #include "../utils/Tensor.cuh" 15 | #include "../utils/StaticUtils.h" 16 | #include // in CUDA SDK, for CUDART_NAN_F 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | template 21 | __global__ void calcResidual(Tensor vecs, 22 | Tensor centroids, 23 | Tensor vecToCentroid, 24 | Tensor residuals) { 25 | auto vec = vecs[blockIdx.x]; 26 | auto residual = residuals[blockIdx.x]; 27 | 28 | int centroidId = vecToCentroid[blockIdx.x]; 29 | // Vector could be invalid (containing NaNs), so -1 was the 30 | // classified centroid 31 | if (centroidId == -1) { 32 | if (LargeDim) { 33 | for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { 34 | residual[i] = CUDART_NAN_F; 35 | } 36 | } else { 37 | residual[threadIdx.x] = CUDART_NAN_F; 38 | } 39 | 40 | return; 41 | } 42 | 43 | auto centroid = centroids[centroidId]; 44 | 45 | if (LargeDim) { 46 | for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { 47 | residual[i] = vec[i] - ConvertTo::to(centroid[i]); 48 | } 49 | } else { 50 | residual[threadIdx.x] = vec[threadIdx.x] - 51 | ConvertTo::to(centroid[threadIdx.x]); 52 | } 53 | } 54 | 55 | template 56 | void calcResidual(Tensor& vecs, 57 | Tensor& centroids, 58 | Tensor& vecToCentroid, 59 | Tensor& residuals, 60 | cudaStream_t stream) { 61 | FAISS_ASSERT(vecs.getSize(1) == centroids.getSize(1)); 62 | FAISS_ASSERT(vecs.getSize(1) == residuals.getSize(1)); 63 | FAISS_ASSERT(vecs.getSize(0) == vecToCentroid.getSize(0)); 64 | FAISS_ASSERT(vecs.getSize(0) == residuals.getSize(0)); 65 | 66 | dim3 grid(vecs.getSize(0)); 67 | 68 | int maxThreads = getMaxThreadsCurrentDevice(); 69 | bool largeDim = vecs.getSize(1) > maxThreads; 70 | dim3 block(std::min(vecs.getSize(1), maxThreads)); 71 | 72 | if (largeDim) { 73 | calcResidual<<>>( 74 | vecs, centroids, vecToCentroid, residuals); 75 | } else { 76 | 
calcResidual<<>>( 77 | vecs, centroids, vecToCentroid, residuals); 78 | } 79 | 80 | CUDA_TEST_ERROR(); 81 | } 82 | 83 | void runCalcResidual(Tensor& vecs, 84 | Tensor& centroids, 85 | Tensor& vecToCentroid, 86 | Tensor& residuals, 87 | cudaStream_t stream) { 88 | calcResidual(vecs, centroids, vecToCentroid, residuals, stream); 89 | } 90 | 91 | #ifdef FAISS_USE_FLOAT16 92 | void runCalcResidual(Tensor& vecs, 93 | Tensor& centroids, 94 | Tensor& vecToCentroid, 95 | Tensor& residuals, 96 | cudaStream_t stream) { 97 | calcResidual(vecs, centroids, vecToCentroid, residuals, stream); 98 | } 99 | #endif 100 | 101 | } } // namespace 102 | -------------------------------------------------------------------------------- /gpu/impl/VectorResidual.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../utils/Tensor.cuh" 14 | #include "../utils/Float16.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | // Calculates residual v_i - c_j for all v_i in vecs where j = vecToCentroid[i] 19 | void runCalcResidual(Tensor& vecs, 20 | Tensor& centroids, 21 | Tensor& vecToCentroid, 22 | Tensor& residuals, 23 | cudaStream_t stream); 24 | 25 | #ifdef FAISS_USE_FLOAT16 26 | void runCalcResidual(Tensor& vecs, 27 | Tensor& centroids, 28 | Tensor& vecToCentroid, 29 | Tensor& residuals, 30 | cudaStream_t stream); 31 | #endif 32 | 33 | } } // namespace 34 | -------------------------------------------------------------------------------- /gpu/perf/IndexWrapper-inl.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 
3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #include "../../FaissAssert.h" 12 | 13 | namespace faiss { namespace gpu { 14 | 15 | template 16 | IndexWrapper::IndexWrapper( 17 | int numGpus, 18 | std::function(GpuResources*, int)> init) { 19 | FAISS_ASSERT(numGpus <= faiss::gpu::getNumDevices()); 20 | for (int i = 0; i < numGpus; ++i) { 21 | auto res = std::unique_ptr( 22 | new StandardGpuResources); 23 | 24 | subIndex.emplace_back(init(res.get(), i)); 25 | resources.emplace_back(std::move(res)); 26 | } 27 | 28 | if (numGpus > 1) { 29 | // create proxy 30 | proxyIndex = 31 | std::unique_ptr(new faiss::gpu::IndexProxy); 32 | 33 | for (auto& index : subIndex) { 34 | proxyIndex->addIndex(index.get()); 35 | } 36 | } 37 | } 38 | 39 | template 40 | faiss::Index* 41 | IndexWrapper::getIndex() { 42 | if ((bool) proxyIndex) { 43 | return proxyIndex.get(); 44 | } else { 45 | FAISS_ASSERT(!subIndex.empty()); 46 | return subIndex.front().get(); 47 | } 48 | } 49 | 50 | template 51 | void 52 | IndexWrapper::runOnIndices(std::function f) { 53 | 54 | if ((bool) proxyIndex) { 55 | proxyIndex->runOnIndex( 56 | [f](faiss::Index* index) { 57 | f(dynamic_cast(index)); 58 | }); 59 | } else { 60 | FAISS_ASSERT(!subIndex.empty()); 61 | f(subIndex.front().get()); 62 | } 63 | } 64 | 65 | template 66 | void 67 | IndexWrapper::setNumProbes(int nprobe) { 68 | runOnIndices([nprobe](GpuIndex* index) { 69 | index->setNumProbes(nprobe); 70 | }); 71 | } 72 | 73 | } } 74 | -------------------------------------------------------------------------------- /gpu/perf/IndexWrapper.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "../IndexProxy.h" 14 | #include "../StandardGpuResources.h" 15 | #include 16 | #include 17 | #include 18 | 19 | namespace faiss { namespace gpu { 20 | 21 | // If we want to run multi-GPU, create a proxy to wrap the indices. 22 | // If we don't want multi-GPU, don't involve the proxy, so it doesn't 23 | // affect the timings. 24 | template 25 | struct IndexWrapper { 26 | std::vector> resources; 27 | std::vector> subIndex; 28 | std::unique_ptr proxyIndex; 29 | 30 | IndexWrapper( 31 | int numGpus, 32 | std::function(GpuResources*, int)> init); 33 | faiss::Index* getIndex(); 34 | 35 | void runOnIndices(std::function f); 36 | void setNumProbes(int nprobe); 37 | }; 38 | 39 | } } 40 | 41 | #include "IndexWrapper-inl.h" 42 | -------------------------------------------------------------------------------- /gpu/perf/PerfSelect.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "../utils/DeviceUtils.h" 12 | #include "../utils/BlockSelectKernel.cuh" 13 | #include "../utils/WarpSelectKernel.cuh" 14 | #include "../utils/HostTensor.cuh" 15 | #include "../utils/DeviceTensor.cuh" 16 | #include "../test/TestUtils.h" 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | DEFINE_int32(rows, 10000, "rows in matrix"); 25 | DEFINE_int32(cols, 40000, "cols in matrix"); 26 | DEFINE_int32(k, 100, "k"); 27 | DEFINE_bool(dir, false, "direction of sort"); 28 | DEFINE_bool(warp, false, "warp select"); 29 | DEFINE_int32(iter, 5, "iterations to run"); 30 | DEFINE_bool(k_powers, false, "test k powers of 2 from 1 -> 1024"); 31 | 32 | int main(int argc, char** argv) { 33 | gflags::ParseCommandLineFlags(&argc, &argv, true); 34 | 35 | std::vector v = faiss::gpu::randVecs(FLAGS_rows, FLAGS_cols); 36 | faiss::gpu::HostTensor hostVal({FLAGS_rows, FLAGS_cols}); 37 | 38 | for (int r = 0; r < FLAGS_rows; ++r) { 39 | for (int c = 0; c < FLAGS_cols; ++c) { 40 | hostVal[r][c] = v[r * FLAGS_cols + c]; 41 | } 42 | } 43 | 44 | // Select top-k on GPU 45 | faiss::gpu::DeviceTensor gpuVal(hostVal, 0); 46 | 47 | // enough space for any k 48 | faiss::gpu::DeviceTensor gpuOutVal({FLAGS_rows, 1024}); 49 | faiss::gpu::DeviceTensor gpuOutInd({FLAGS_rows, 1024}); 50 | 51 | int startK = FLAGS_k; 52 | int limitK = FLAGS_k; 53 | 54 | if (FLAGS_k_powers) { 55 | startK = 1; 56 | limitK = 1024; 57 | } 58 | 59 | for (int k = startK; k <= limitK; k *= 2) { 60 | for (int i = 0; i < FLAGS_iter; ++i) { 61 | if (FLAGS_warp) { 62 | faiss::gpu::runWarpSelect(gpuVal, gpuOutVal, gpuOutInd, 63 | FLAGS_dir, k, 0); 64 | } else { 65 | faiss::gpu::runBlockSelect(gpuVal, gpuOutVal, gpuOutInd, 66 | FLAGS_dir, k, 0); 67 | } 68 | } 69 | } 70 | 71 | cudaDeviceSynchronize(); 72 | } 73 | -------------------------------------------------------------------------------- /gpu/perf/WriteIndex.cpp: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #include "../../IndexIVF.h" 12 | #include "../../IndexIVFPQ.h" 13 | #include "../../IndexFlat.h" 14 | #include "../../index_io.h" 15 | #include "../test/TestUtils.h" 16 | #include 17 | #include 18 | 19 | // For IVFPQ: 20 | DEFINE_bool(ivfpq, false, "use IVFPQ encoding"); 21 | DEFINE_int32(codes, 4, "number of PQ codes per vector"); 22 | DEFINE_int32(bits_per_code, 8, "number of bits per PQ code"); 23 | 24 | // For IVFFlat: 25 | DEFINE_bool(l2, true, "use L2 metric (versus IP metric)"); 26 | DEFINE_bool(ivfflat, false, "use IVF flat encoding"); 27 | 28 | // For both: 29 | DEFINE_string(out, "/home/jhj/local/index.out", "index file for output"); 30 | DEFINE_int32(dim, 128, "vector dimension"); 31 | DEFINE_int32(num_coarse, 100, "number of coarse centroids"); 32 | DEFINE_int32(num, 100000, "total database size"); 33 | DEFINE_int32(num_train, -1, "number of database vecs to train on"); 34 | 35 | template 36 | void fillAndSave(T& index, int numTrain, int num, int dim) { 37 | auto trainVecs = faiss::gpu::randVecs(numTrain, dim); 38 | index.train(numTrain, trainVecs.data()); 39 | 40 | constexpr int kAddChunk = 1000000; 41 | 42 | for (int i = 0; i < num; i += kAddChunk) { 43 | int numRemaining = (num - i) < kAddChunk ? 
/// Builds a random IVFPQ or IVFFlat index on the CPU and writes it to disk.
/// Exactly one of --ivfpq / --ivfflat must be given; dataset contents are
/// random vectors from faiss::gpu::randVecs.
int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  // Either ivfpq or ivfflat must be set, but not both (XOR of the two flags).
  if ((FLAGS_ivfpq && FLAGS_ivfflat) ||
      (!FLAGS_ivfpq && !FLAGS_ivfflat)) {
    printf("must specify either ivfpq or ivfflat\n");
    return 1;
  }

  auto dim = FLAGS_dim;
  auto numCentroids = FLAGS_num_coarse;
  auto num = FLAGS_num;
  auto numTrain = FLAGS_num_train;
  // Default training size: a quarter of the database (at least 1 vector),
  // and never more vectors than the database itself contains.
  numTrain = numTrain == -1 ? std::max((num / 4), 1) : numTrain;
  numTrain = std::min(num, numTrain);

  if (FLAGS_ivfpq) {
    // IVFPQ: coarse L2 quantizer + product-quantized residual codes.
    // NOTE(review): quantizer is stack-allocated and must outlive `index`,
    // which holds a pointer to it — true here since both die at scope exit.
    faiss::IndexFlatL2 quantizer(dim);
    faiss::IndexIVFPQ index(&quantizer, dim, numCentroids,
                            FLAGS_codes, FLAGS_bits_per_code);
    index.verbose = true;

    printf("IVFPQ: codes %d bits per code %d\n",
           FLAGS_codes, FLAGS_bits_per_code);
    printf("Lists: %d\n", numCentroids);
    printf("Database: dim %d num vecs %d trained on %d\n", dim, num, numTrain);
    printf("output file: %s\n", FLAGS_out.c_str());

    fillAndSave(index, numTrain, num, dim);
  } else if (FLAGS_ivfflat) {
    // IVFFlat: pick the coarse quantizer matching the requested metric.
    faiss::IndexFlatL2 quantizerL2(dim);
    faiss::IndexFlatIP quantizerIP(dim);

    faiss::IndexFlat* quantizer = FLAGS_l2 ?
      (faiss::IndexFlat*) &quantizerL2 :
      (faiss::IndexFlat*) &quantizerIP;

    faiss::IndexIVFFlat index(quantizer, dim, numCentroids,
                              FLAGS_l2 ? faiss::METRIC_L2 :
                              faiss::METRIC_INNER_PRODUCT);

    printf("IVFFlat: metric %s\n", FLAGS_l2 ? "L2" : "IP");
    printf("Lists: %d\n", numCentroids);
    printf("Database: dim %d num vecs %d trained on %d\n", dim, num, numTrain);
    printf("output file: %s\n", FLAGS_out.c_str());

    fillAndSave(index, numTrain, num, dim);
  }

  return 0;
}
53 | ${CMAKE_CURRENT_SOURCE_DIR}/transform_deep1b.cpp) 54 | 55 | # gtest 56 | find_package(GTest REQUIRED) 57 | include_directories(${GTEST_INCLUDE_DIRS}) 58 | foreach(source ${srcs}) 59 | get_filename_component(name ${source} NAME_WE) 60 | add_executable(${name} ${source}) 61 | target_link_libraries(${name} ${faiss_lib_gpu} ${faiss_lib} ${CUDA_LINKER_LIBS} ${MPICH_CXX_LIBRARIES}) 62 | endforeach(source) 63 | 64 | # CUDA_ADD_EXECUTABLE(TestGpuSelect ${CMAKE_CURRENT_SOURCE_DIR}/TestGpuSelect.cu ${CMAKE_CURRENT_SOURCE_DIR}/TestUtils.cpp) 65 | # target_link_libraries(TestGpuSelect ${faiss_lib_gpu} ${faiss_lib} ${CUDA_LINKER_LIBS} ${GTEST_BOTH_LIBRARIES}) 66 | 67 | 68 | -------------------------------------------------------------------------------- /gpu/test/CMakeLists.txt.bak: -------------------------------------------------------------------------------- 1 | #list(APPEND srcs 2 | # ${CMAKE_CURRENT_SOURCE_DIR}/demo_ivfpq_indexing_gpu.cpp) 3 | 4 | list(APPEND srcs 5 | ${CMAKE_CURRENT_SOURCE_DIR}/tool_createdb.cpp) 6 | 7 | list(APPEND srcs 8 | ${CMAKE_CURRENT_SOURCE_DIR}/tool_query.cpp) 9 | 10 | list(APPEND srcs 11 | ${CMAKE_CURRENT_SOURCE_DIR}/tool_query1.cpp) 12 | 13 | 14 | list(APPEND srcs 15 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b_createdb.cpp) 16 | list(APPEND srcs 17 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b16_createdb.cpp) 18 | 19 | 20 | list(APPEND srcs 21 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b_query.cpp) 22 | list(APPEND srcs 23 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b16_query.cpp) 24 | list(APPEND srcs 25 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b_query1.cpp) 26 | list(APPEND srcs 27 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b_createdb_hnsw.cpp) 28 | list(APPEND srcs 29 | ${CMAKE_CURRENT_SOURCE_DIR}/sift1b_createdbgt.cpp) 30 | 31 | 32 | list(APPEND srcs 33 | ${CMAKE_CURRENT_SOURCE_DIR}/deep1b_createdb.cpp) 34 | list(APPEND srcs 35 | ${CMAKE_CURRENT_SOURCE_DIR}/deep1b16_createdb.cpp) 36 | list(APPEND srcs 37 | ${CMAKE_CURRENT_SOURCE_DIR}/deep1b_query.cpp) 38 | list(APPEND srcs 39 | 
${CMAKE_CURRENT_SOURCE_DIR}/deep1b16_query.cpp) 40 | list(APPEND srcs 41 | ${CMAKE_CURRENT_SOURCE_DIR}/deep1b_query1.cpp) 42 | 43 | list(APPEND srcs 44 | ${CMAKE_CURRENT_SOURCE_DIR}/deep1b_createdb_hnsw.cpp) 45 | 46 | 47 | 48 | list(APPEND srcs 49 | ${CMAKE_CURRENT_SOURCE_DIR}/transform_sift1b.cpp) 50 | 51 | list(APPEND srcs 52 | ${CMAKE_CURRENT_SOURCE_DIR}/transform_deep1b.cpp) 53 | 54 | # gtest 55 | find_package(GTest REQUIRED) 56 | include_directories(${GTEST_INCLUDE_DIRS}) 57 | foreach(source ${srcs}) 58 | get_filename_component(name ${source} NAME_WE) 59 | add_executable(${name} ${source}) 60 | target_link_libraries(${name} ${faiss_lib_gpu} ${faiss_lib} ${CUDA_LINKER_LIBS} ${MPICH_CXX_LIBRARIES}) 61 | endforeach(source) 62 | 63 | # CUDA_ADD_EXECUTABLE(TestGpuSelect ${CMAKE_CURRENT_SOURCE_DIR}/TestGpuSelect.cu ${CMAKE_CURRENT_SOURCE_DIR}/TestUtils.cpp) 64 | # target_link_libraries(TestGpuSelect ${faiss_lib_gpu} ${faiss_lib} ${CUDA_LINKER_LIBS} ${GTEST_BOTH_LIBRARIES}) 65 | 66 | 67 | -------------------------------------------------------------------------------- /gpu/test/TestUtils.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../../FaissAssert.h" 14 | #include "../../Index.h" 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace faiss { namespace gpu { 21 | 22 | /// Generates and displays a new seed for the test 23 | void newTestSeed(); 24 | 25 | /// Uses an explicit seed for the test 26 | void setTestSeed(long seed); 27 | 28 | /// Returns the relative error in difference between a and b 29 | /// (|a - b| / (0.5 * (|a| + |b|)) 30 | float relativeError(float a, float b); 31 | 32 | /// Generates a random integer in the range [a, b] 33 | int randVal(int a, int b); 34 | 35 | /// Generates a random bool 36 | bool randBool(); 37 | 38 | /// Select a random value from the given list of values provided as an 39 | /// initializer_list 40 | template 41 | T randSelect(std::initializer_list vals) { 42 | FAISS_ASSERT(vals.size() > 0); 43 | int sel = randVal(0, vals.size()); 44 | 45 | int i = 0; 46 | for (auto v : vals) { 47 | if (i++ == sel) { 48 | return v; 49 | } 50 | } 51 | 52 | // should not get here 53 | return *vals.begin(); 54 | } 55 | 56 | /// Generates a collection of random vectors in the range [0, 1] 57 | std::vector randVecs(size_t num, size_t dim); 58 | 59 | /// Compare two indices via query for similarity 60 | void compareIndices(faiss::Index& refIndex, 61 | faiss::Index& testIndex, 62 | int numQuery, int dim, int k, 63 | const std::string& configMsg, 64 | float maxRelativeError = 6e-5f, 65 | float pctMaxDiff1 = 0.1f, 66 | float pctMaxDiffN = 0.005f); 67 | 68 | /// Display specific differences in the two (distance, index) lists 69 | void compareLists(const float* refDist, 70 | const faiss::Index::idx_t* refInd, 71 | const float* testDist, 72 | const faiss::Index::idx_t* testInd, 73 | int dim1, int dim2, 74 | const std::string& configMsg, 75 | bool printBasicStats, bool printDiffs, bool assertOnErr, 76 | float maxRelativeError = 6e-5f, 77 | float pctMaxDiff1 = 0.1f, 78 | float pctMaxDiffN = 0.005f); 79 | 80 | } } 81 | 
-------------------------------------------------------------------------------- /gpu/test/deep1b_query.cpp.bak: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/deep1b_query.cpp.bak -------------------------------------------------------------------------------- /gpu/test/deep1b_query1.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/deep1b_query1.cpp -------------------------------------------------------------------------------- /gpu/test/deep1b_queryd.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/deep1b_queryd.cpp -------------------------------------------------------------------------------- /gpu/test/sift1b_query.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/sift1b_query.cpp -------------------------------------------------------------------------------- /gpu/test/sift1b_query1.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/sift1b_query1.cpp -------------------------------------------------------------------------------- /gpu/test/sift1b_queryd.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/gpu/test/sift1b_queryd.cpp 
-------------------------------------------------------------------------------- /gpu/test/test_gpu_index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the CC-by-NC license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | #! /usr/bin/env python2 8 | 9 | import libfb.py.mkl # noqa 10 | 11 | import numpy as np 12 | 13 | from libfb import testutil 14 | 15 | import faiss 16 | 17 | 18 | class EvalIVFPQAccuracy(testutil.BaseFacebookTestCase): 19 | 20 | def get_dataset(self): 21 | d = 128 22 | nb = 100000 23 | nt = 15000 24 | nq = 2000 25 | np.random.seed(123) 26 | 27 | # generate points in a low-dim subspace to make the resutls 28 | # look better :-) 29 | d1 = 16 30 | q, r = np.linalg.qr(np.random.randn(d, d)) 31 | qc = q[:d1, :] 32 | def make_mat(n): 33 | return np.dot( 34 | np.random.random(size=(nb, d1)), qc).astype('float32') 35 | 36 | return (make_mat(nt), make_mat(nb), make_mat(nq)) 37 | 38 | def test_IndexIVFPQ(self): 39 | (xt, xb, xq) = self.get_dataset() 40 | d = xt.shape[1] 41 | 42 | dev_no = 0 43 | usePrecomputed = True 44 | 45 | res = faiss.StandardGpuResources() 46 | 47 | flat_config = faiss.GpuIndexFlatConfig() 48 | flat_config.device = dev_no 49 | 50 | gt_index = faiss.GpuIndexFlatL2(res, d, flat_config) 51 | gt_index.add(xb) 52 | D, gt_nns = gt_index.search(xq, 1) 53 | 54 | coarse_quantizer = faiss.IndexFlatL2(d) 55 | ncentroids = int(np.sqrt(xb.shape[0])) * 4 56 | 57 | index = faiss.IndexIVFPQ(coarse_quantizer, d, ncentroids, 32, 8) 58 | # add implemented on GPU but not train 59 | index.train(xt) 60 | 61 | ivfpq_config = faiss.GpuIndexIVFPQConfig() 62 | ivfpq_config.device = dev_no 63 | ivfpq_config.usePrecomputedTables = usePrecomputed 64 | 65 | gpuIndex = faiss.GpuIndexIVFPQ(res, index, ivfpq_config) 66 | gpuIndex.setNumProbes(64) 67 | index.add(xb) 68 | 69 | 
D, nns = index.search(xq, 10) 70 | n_ok = (nns == gt_nns).sum() 71 | nq = xq.shape[0] 72 | print ncentroids, n_ok, nq 73 | 74 | self.assertGreater(n_ok, nq * 0.2) 75 | 76 | def test_mm(self): 77 | # trouble with MKL+fbmake that appears only at runtime. Check it here 78 | x = np.random.random(size=(100, 20)).astype('float32') 79 | mat = faiss.PCAMatrix(20, 10) 80 | mat.train(x) 81 | mat.apply_py(x) 82 | -------------------------------------------------------------------------------- /gpu/utils/BlockSelectFloat.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "blockselect/BlockSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | // warp Q to thread Q: 15 | // 1, 1 16 | // 32, 2 17 | // 64, 3 18 | // 128, 3 19 | // 256, 4 20 | // 512, 8 21 | // 1024, 8 22 | 23 | BLOCK_SELECT_DECL(float, true, 1); 24 | BLOCK_SELECT_DECL(float, true, 32); 25 | BLOCK_SELECT_DECL(float, true, 64); 26 | BLOCK_SELECT_DECL(float, true, 128); 27 | BLOCK_SELECT_DECL(float, true, 256); 28 | BLOCK_SELECT_DECL(float, true, 512); 29 | BLOCK_SELECT_DECL(float, true, 1024); 30 | 31 | BLOCK_SELECT_DECL(float, false, 1); 32 | BLOCK_SELECT_DECL(float, false, 32); 33 | BLOCK_SELECT_DECL(float, false, 64); 34 | BLOCK_SELECT_DECL(float, false, 128); 35 | BLOCK_SELECT_DECL(float, false, 256); 36 | BLOCK_SELECT_DECL(float, false, 512); 37 | BLOCK_SELECT_DECL(float, false, 1024); 38 | 39 | void runBlockSelect(Tensor& in, 40 | Tensor& outK, 41 | Tensor& outV, 42 | bool dir, int k, cudaStream_t stream) { 43 | FAISS_ASSERT(k <= 1024); 44 | 45 | if (dir) { 46 | if (k == 1) { 47 | BLOCK_SELECT_CALL(float, true, 1); 48 | } else if (k <= 32) { 49 | 
BLOCK_SELECT_CALL(float, true, 32); 50 | } else if (k <= 64) { 51 | BLOCK_SELECT_CALL(float, true, 64); 52 | } else if (k <= 128) { 53 | BLOCK_SELECT_CALL(float, true, 128); 54 | } else if (k <= 256) { 55 | BLOCK_SELECT_CALL(float, true, 256); 56 | } else if (k <= 512) { 57 | BLOCK_SELECT_CALL(float, true, 512); 58 | } else if (k <= 1024) { 59 | BLOCK_SELECT_CALL(float, true, 1024); 60 | } 61 | } else { 62 | if (k == 1) { 63 | BLOCK_SELECT_CALL(float, false, 1); 64 | } else if (k <= 32) { 65 | BLOCK_SELECT_CALL(float, false, 32); 66 | } else if (k <= 64) { 67 | BLOCK_SELECT_CALL(float, false, 64); 68 | } else if (k <= 128) { 69 | BLOCK_SELECT_CALL(float, false, 128); 70 | } else if (k <= 256) { 71 | BLOCK_SELECT_CALL(float, false, 256); 72 | } else if (k <= 512) { 73 | BLOCK_SELECT_CALL(float, false, 512); 74 | } else if (k <= 1024) { 75 | BLOCK_SELECT_CALL(float, false, 1024); 76 | } 77 | } 78 | } 79 | 80 | } } // namespace 81 | -------------------------------------------------------------------------------- /gpu/utils/BlockSelectHalf.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "blockselect/BlockSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | 16 | // warp Q to thread Q: 17 | // 1, 1 18 | // 32, 2 19 | // 64, 3 20 | // 128, 3 21 | // 256, 4 22 | // 512, 8 23 | // 1024, 8 24 | 25 | BLOCK_SELECT_DECL(half, true, 1); 26 | BLOCK_SELECT_DECL(half, true, 32); 27 | BLOCK_SELECT_DECL(half, true, 64); 28 | BLOCK_SELECT_DECL(half, true, 128); 29 | BLOCK_SELECT_DECL(half, true, 256); 30 | BLOCK_SELECT_DECL(half, true, 512); 31 | BLOCK_SELECT_DECL(half, true, 1024); 32 | 33 | BLOCK_SELECT_DECL(half, false, 1); 34 | BLOCK_SELECT_DECL(half, false, 32); 35 | BLOCK_SELECT_DECL(half, false, 64); 36 | BLOCK_SELECT_DECL(half, false, 128); 37 | BLOCK_SELECT_DECL(half, false, 256); 38 | BLOCK_SELECT_DECL(half, false, 512); 39 | BLOCK_SELECT_DECL(half, false, 1024); 40 | 41 | void runBlockSelect(Tensor& in, 42 | Tensor& outK, 43 | Tensor& outV, 44 | bool dir, int k, cudaStream_t stream) { 45 | FAISS_ASSERT(k <= 1024); 46 | 47 | if (dir) { 48 | if (k == 1) { 49 | BLOCK_SELECT_CALL(half, true, 1); 50 | } else if (k <= 32) { 51 | BLOCK_SELECT_CALL(half, true, 32); 52 | } else if (k <= 64) { 53 | BLOCK_SELECT_CALL(half, true, 64); 54 | } else if (k <= 128) { 55 | BLOCK_SELECT_CALL(half, true, 128); 56 | } else if (k <= 256) { 57 | BLOCK_SELECT_CALL(half, true, 256); 58 | } else if (k <= 512) { 59 | BLOCK_SELECT_CALL(half, true, 512); 60 | } else if (k <= 1024) { 61 | BLOCK_SELECT_CALL(half, true, 1024); 62 | } 63 | } else { 64 | if (k == 1) { 65 | BLOCK_SELECT_CALL(half, false, 1); 66 | } else if (k <= 32) { 67 | BLOCK_SELECT_CALL(half, false, 32); 68 | } else if (k <= 64) { 69 | BLOCK_SELECT_CALL(half, false, 64); 70 | } else if (k <= 128) { 71 | BLOCK_SELECT_CALL(half, false, 128); 72 | } else if (k <= 256) { 73 | BLOCK_SELECT_CALL(half, false, 256); 74 | } else if (k <= 512) { 75 | BLOCK_SELECT_CALL(half, false, 512); 76 | } else if (k <= 1024) { 77 | BLOCK_SELECT_CALL(half, false, 1024); 78 | } 
79 | } 80 | } 81 | 82 | #endif 83 | 84 | } } // namespace 85 | -------------------------------------------------------------------------------- /gpu/utils/BlockSelectKernel.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #pragma once 11 | 12 | #include "Float16.cuh" 13 | #include "Select.cuh" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | template 23 | __global__ void blockSelect(Tensor in, 24 | Tensor outK, 25 | Tensor outV, 26 | K initK, 27 | IndexType initV, 28 | int k) { 29 | constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; 30 | 31 | __shared__ K smemK[kNumWarps * NumWarpQ]; 32 | __shared__ IndexType smemV[kNumWarps * NumWarpQ]; 33 | 34 | BlockSelect, 35 | NumWarpQ, NumThreadQ, ThreadsPerBlock> 36 | heap(initK, initV, smemK, smemV, k); 37 | 38 | // Grid is exactly sized to rows available 39 | int row = blockIdx.x; 40 | 41 | int i = threadIdx.x; 42 | K* inStart = in[row][i].data(); 43 | 44 | // Whole warps must participate in the selection 45 | int limit = utils::roundDown(in.getSize(1), kWarpSize); 46 | 47 | for (; i < limit; i += ThreadsPerBlock) { 48 | heap.add(*inStart, (IndexType) i); 49 | inStart += ThreadsPerBlock; 50 | } 51 | 52 | // Handle last remainder fraction of a warp of elements 53 | if (i < in.getSize(1)) { 54 | heap.addThreadQ(*inStart, (IndexType) i); 55 | } 56 | 57 | heap.reduce(); 58 | 59 | for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) { 60 | outK[row][i] = smemK[i]; 61 | outV[row][i] = smemV[i]; 62 | } 63 | } 64 | 65 | void runBlockSelect(Tensor& in, 66 | Tensor& outKeys, 67 | Tensor& outIndices, 68 | bool dir, int k, cudaStream_t stream); 69 | 70 | #ifdef FAISS_USE_FLOAT16 71 | void 
runBlockSelect(Tensor& in, 72 | Tensor& outKeys, 73 | Tensor& outIndices, 74 | bool dir, int k, cudaStream_t stream); 75 | #endif 76 | 77 | } } // namespace 78 | -------------------------------------------------------------------------------- /gpu/utils/Comparators.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include "Float16.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | template 19 | struct Comparator { 20 | __device__ static inline bool lt(T a, T b) { 21 | return a < b; 22 | } 23 | 24 | __device__ static inline bool gt(T a, T b) { 25 | return a > b; 26 | } 27 | }; 28 | 29 | #ifdef FAISS_USE_FLOAT16 30 | 31 | template <> 32 | struct Comparator { 33 | __device__ static inline bool lt(half a, half b) { 34 | #if FAISS_USE_FULL_FLOAT16 35 | return __hlt(a, b); 36 | #else 37 | return __half2float(a) < __half2float(b); 38 | #endif // FAISS_USE_FULL_FLOAT16 39 | } 40 | 41 | __device__ static inline bool gt(half a, half b) { 42 | #if FAISS_USE_FULL_FLOAT16 43 | return __hgt(a, b); 44 | #else 45 | return __half2float(a) > __half2float(b); 46 | #endif // FAISS_USE_FULL_FLOAT16 47 | } 48 | }; 49 | 50 | #endif // FAISS_USE_FLOAT16 51 | 52 | } } // namespace 53 | -------------------------------------------------------------------------------- /gpu/utils/ConversionOperators.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include "Float16.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | // 19 | // Conversion utilities 20 | // 21 | 22 | template 23 | struct ConvertTo { 24 | }; 25 | 26 | template <> 27 | struct ConvertTo { 28 | static inline __device__ float to(float v) { return v; } 29 | #ifdef FAISS_USE_FLOAT16 30 | static inline __device__ float to(half v) { return __half2float(v); } 31 | #endif 32 | }; 33 | 34 | template <> 35 | struct ConvertTo { 36 | static inline __device__ float2 to(float2 v) { return v; } 37 | #ifdef FAISS_USE_FLOAT16 38 | static inline __device__ float2 to(half2 v) { return __half22float2(v); } 39 | #endif 40 | }; 41 | 42 | template <> 43 | struct ConvertTo { 44 | static inline __device__ float4 to(float4 v) { return v; } 45 | #ifdef FAISS_USE_FLOAT16 46 | static inline __device__ float4 to(Half4 v) { return half4ToFloat4(v); } 47 | #endif 48 | }; 49 | 50 | #ifdef FAISS_USE_FLOAT16 51 | template <> 52 | struct ConvertTo { 53 | static inline __device__ half to(float v) { return __float2half(v); } 54 | static inline __device__ half to(half v) { return v; } 55 | }; 56 | 57 | template <> 58 | struct ConvertTo { 59 | static inline __device__ half2 to(float2 v) { return __float22half2_rn(v); } 60 | static inline __device__ half2 to(half2 v) { return v; } 61 | }; 62 | 63 | template <> 64 | struct ConvertTo { 65 | static inline __device__ Half4 to(float4 v) { return float4ToHalf4(v); } 66 | static inline __device__ Half4 to(Half4 v) { return v; } 67 | }; 68 | #endif 69 | 70 | 71 | } } // namespace 72 | -------------------------------------------------------------------------------- /gpu/utils/CopyUtils.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "DeviceTensor.cuh" 14 | #include "HostTensor.cuh" 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | /// Ensure the memory at `p` is either on the given device, or copy it 19 | /// to the device in a new allocation. 20 | /// If `resources` is provided, then we will perform a temporary 21 | /// memory allocation if needed. Otherwise, we will call cudaMalloc if 22 | /// needed. 23 | template 24 | DeviceTensor toDevice(GpuResources* resources, 25 | int dstDevice, 26 | T* src, 27 | cudaStream_t stream, 28 | std::initializer_list sizes) { 29 | int dev = getDeviceForAddress(src); 30 | 31 | if (dev == dstDevice) { 32 | // On device we expect 33 | return DeviceTensor(src, sizes); 34 | } else { 35 | // On different device or on host 36 | DeviceScope scope(dstDevice); 37 | 38 | Tensor oldT(src, sizes); 39 | 40 | if (resources) { 41 | DeviceTensor newT(resources->getMemoryManager(dstDevice), 42 | sizes, 43 | stream); 44 | 45 | newT.copyFrom(oldT, stream); 46 | return newT; 47 | } else { 48 | DeviceTensor newT(sizes); 49 | 50 | newT.copyFrom(oldT, stream); 51 | return newT; 52 | } 53 | } 54 | } 55 | 56 | /// Copies a device array's allocation to an address, if necessary 57 | template 58 | inline void fromDevice(T* src, T* dst, size_t num, cudaStream_t stream) { 59 | // It is possible that the array already represents memory at `p`, 60 | // in which case no copy is needed 61 | if (src == dst) { 62 | return; 63 | } 64 | 65 | int dev = getDeviceForAddress(dst); 66 | 67 | if (dev == -1) { 68 | CUDA_VERIFY(cudaMemcpyAsync(dst, 69 | src, 70 | num * sizeof(T), 71 | cudaMemcpyDeviceToHost, 72 | stream)); 73 | } else { 74 | CUDA_VERIFY(cudaMemcpyAsync(dst, 75 | src, 76 | num * sizeof(T), 77 | cudaMemcpyDeviceToDevice, 78 | 
stream)); 79 | } 80 | } 81 | 82 | /// Copies a device array's allocation to an address, if necessary 83 | template 84 | void fromDevice(Tensor& src, T* dst, cudaStream_t stream) { 85 | FAISS_ASSERT(src.isContiguous()); 86 | fromDevice(src.data(), dst, src.numElements(), stream); 87 | } 88 | 89 | } } // namespace 90 | -------------------------------------------------------------------------------- /gpu/utils/DeviceDefs.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | namespace faiss { namespace gpu { 14 | 15 | #ifdef __CUDA_ARCH__ 16 | #if __CUDA_ARCH__ <= 620 17 | constexpr int kWarpSize = 32; 18 | #else 19 | #error Unknown __CUDA_ARCH__; please define parameters for compute capability 20 | #endif // __CUDA_ARCH__ types 21 | #endif // __CUDA_ARCH__ 22 | 23 | #ifndef __CUDA_ARCH__ 24 | // dummy value for host compiler 25 | constexpr int kWarpSize = 32; 26 | #endif // !__CUDA_ARCH__ 27 | 28 | __forceinline__ __device__ void warpFence() { 29 | // Technically, memory barriers are required via the CUDA 30 | // programming model, since warp synchronous programming no longer 31 | // is guaranteed. 32 | // 33 | // There are two components to it: 34 | // -a barrier known to the compiler such that the compiler will not 35 | // schedule loads and stores across the barrier; 36 | // -a HW-level barrier that guarantees that writes are seen in the 37 | // proper order 38 | // 39 | // However, __threadfence_block() is a stronger constraint than what 40 | // we really want out of the hardware: a warp-wide barrier. 
41 | // 42 | // In current hardware, it appears that warp synchronous programming 43 | // is a reality; by all tests it appears safe and race-free. 44 | // 45 | // However, understandably it may not be in the future (based on 46 | // what Nvidia says in the Kepler guide, it may change depending 47 | // upon compiler/toolchain issues or future hardware). 48 | // 49 | // Removing the fence results in 10%+ faster performance. 50 | // However, we are judicious as to where we insert the fence, so if 51 | // this reality ever changes, uncommenting this will result in CUDA 52 | // programming model-safe ordering again. 53 | // 54 | // FIXME: we should probably qualify as volatile as well, since the 55 | // compiler could technically preserve values across loops? This 56 | // seems very impractical for the compiler to do, however. 57 | 58 | // __threadfence_block(); 59 | } 60 | 61 | } } // namespace 62 | -------------------------------------------------------------------------------- /gpu/utils/DeviceMemory.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "DeviceMemory.h" 12 | #include "DeviceUtils.h" 13 | #include "../../FaissAssert.h" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | DeviceMemoryReservation::DeviceMemoryReservation() 18 | : state_(NULL), 19 | device_(0), 20 | data_(NULL), 21 | size_(0), 22 | stream_(0) { 23 | } 24 | 25 | DeviceMemoryReservation::DeviceMemoryReservation(DeviceMemory* state, 26 | int device, 27 | void* p, 28 | size_t size, 29 | cudaStream_t stream) 30 | : state_(state), 31 | device_(device), 32 | data_(p), 33 | size_(size), 34 | stream_(stream) { 35 | } 36 | 37 | DeviceMemoryReservation::DeviceMemoryReservation( 38 | DeviceMemoryReservation&& m) noexcept { 39 | if (data_) { 40 | FAISS_ASSERT(state_); 41 | state_->returnAllocation(*this); 42 | } 43 | 44 | state_ = m.state_; 45 | device_ = m.device_; 46 | data_ = m.data_; 47 | size_ = m.size_; 48 | stream_ = m.stream_; 49 | 50 | m.data_ = NULL; 51 | } 52 | 53 | DeviceMemoryReservation::~DeviceMemoryReservation() { 54 | if (data_) { 55 | FAISS_ASSERT(state_); 56 | state_->returnAllocation(*this); 57 | } 58 | 59 | data_ = NULL; 60 | } 61 | 62 | DeviceMemoryReservation& 63 | DeviceMemoryReservation::operator=(DeviceMemoryReservation&& m) { 64 | if (data_) { 65 | FAISS_ASSERT(state_); 66 | state_->returnAllocation(*this); 67 | } 68 | 69 | state_ = m.state_; 70 | device_ = m.device_; 71 | data_ = m.data_; 72 | size_ = m.size_; 73 | stream_ = m.stream_; 74 | 75 | m.data_ = NULL; 76 | 77 | return *this; 78 | } 79 | 80 | DeviceMemory::~DeviceMemory() { 81 | } 82 | 83 | } } // namespace 84 | -------------------------------------------------------------------------------- /gpu/utils/DeviceMemory.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | class DeviceMemory; 19 | 20 | class DeviceMemoryReservation { 21 | public: 22 | DeviceMemoryReservation(); 23 | DeviceMemoryReservation(DeviceMemory* state, 24 | int device, void* p, size_t size, 25 | cudaStream_t stream); 26 | DeviceMemoryReservation(DeviceMemoryReservation&& m) noexcept; 27 | ~DeviceMemoryReservation(); 28 | 29 | DeviceMemoryReservation& operator=(DeviceMemoryReservation&& m); 30 | 31 | int device() { return device_; } 32 | void* get() { return data_; } 33 | size_t size() { return size_; } 34 | cudaStream_t stream() { return stream_; } 35 | 36 | private: 37 | DeviceMemory* state_; 38 | 39 | int device_; 40 | void* data_; 41 | size_t size_; 42 | cudaStream_t stream_; 43 | }; 44 | 45 | /// Manages temporary memory allocations on a GPU device 46 | class DeviceMemory { 47 | public: 48 | virtual ~DeviceMemory(); 49 | 50 | /// Returns the device we are managing memory for 51 | virtual int getDevice() const = 0; 52 | 53 | /// Obtains a temporary memory allocation for our device, 54 | /// whose usage is ordered with respect to the given stream. 
55 | virtual DeviceMemoryReservation getMemory(cudaStream_t stream, 56 | size_t size) = 0; 57 | 58 | /// Returns the current size available without calling cudaMalloc 59 | virtual size_t getSizeAvailable() const = 0; 60 | 61 | /// Returns a string containing our current memory manager state 62 | virtual std::string toString() const = 0; 63 | 64 | /// Returns the high-water mark of cudaMalloc allocations for our 65 | /// device 66 | virtual size_t getHighWaterCudaMalloc() const = 0; 67 | 68 | protected: 69 | friend class DeviceMemoryReservation; 70 | virtual void returnAllocation(DeviceMemoryReservation& m) = 0; 71 | }; 72 | 73 | } } // namespace 74 | -------------------------------------------------------------------------------- /gpu/utils/Float16.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "Float16.cuh" 12 | #include "nvidia/fp16_emu.cuh" 13 | #include 14 | #include 15 | 16 | #ifdef FAISS_USE_FLOAT16 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | bool getDeviceSupportsFloat16Math(int device) { 21 | const auto& prop = getDeviceProperties(device); 22 | 23 | return (prop.major >= 6 || 24 | (prop.major == 5 && prop.minor >= 3)); 25 | } 26 | 27 | struct FloatToHalf { 28 | __device__ half operator()(float v) const { return __float2half(v); } 29 | }; 30 | 31 | struct HalfToFloat { 32 | __device__ float operator()(half v) const { return __half2float(v); } 33 | }; 34 | 35 | void runConvertToFloat16(half* out, 36 | const float* in, 37 | size_t num, 38 | cudaStream_t stream) { 39 | thrust::transform(thrust::cuda::par.on(stream), 40 | in, in + num, out, FloatToHalf()); 41 | } 42 | 43 | void runConvertToFloat32(float* out, 44 | const half* in, 45 | size_t num, 46 | cudaStream_t stream) { 47 | thrust::transform(thrust::cuda::par.on(stream), 48 | in, in + num, out, HalfToFloat()); 49 | } 50 | 51 | half hostFloat2Half(float a) { 52 | half h; 53 | h.x = cpu_float2half_rn(a).x; 54 | return h; 55 | } 56 | 57 | } } // namespace 58 | 59 | #endif // FAISS_USE_FLOAT16 60 | -------------------------------------------------------------------------------- /gpu/utils/HostTensor.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "Tensor.cuh" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | template class PtrTraits = traits::DefaultPtrTraits> 22 | class HostTensor : public Tensor { 23 | public: 24 | typedef IndexT IndexType; 25 | typedef typename PtrTraits::PtrType DataPtrType; 26 | 27 | /// Default constructor 28 | __host__ HostTensor(); 29 | 30 | /// Destructor 31 | __host__ ~HostTensor(); 32 | 33 | /// Constructs a tensor of the given size, allocating memory for it 34 | /// locally 35 | __host__ HostTensor(const IndexT sizes[Dim]); 36 | __host__ HostTensor(std::initializer_list sizes); 37 | 38 | /// Constructs a tensor of the given size and stride, referencing a 39 | /// memory region we do not own 40 | __host__ HostTensor(DataPtrType data, 41 | const IndexT sizes[Dim]); 42 | __host__ HostTensor(DataPtrType data, 43 | std::initializer_list sizes); 44 | 45 | /// Constructs a tensor of the given size and stride, referencing a 46 | /// memory region we do not own 47 | __host__ HostTensor(DataPtrType data, 48 | const IndexT sizes[Dim], 49 | const IndexT strides[Dim]); 50 | 51 | /// Copies a tensor into ourselves, allocating memory for it 52 | /// locally. If the tensor is on the GPU, then we will copy it to 53 | /// ourselves wrt the given stream. 54 | __host__ HostTensor(Tensor& t, 55 | cudaStream_t stream); 56 | 57 | /// Call to zero out memory 58 | __host__ HostTensor& zero(); 59 | 60 | /// Returns the maximum difference seen between two tensors 61 | __host__ T 62 | maxDiff(const HostTensor& t) const; 63 | 64 | /// Are the two tensors exactly equal? 
65 | __host__ bool 66 | equal(const HostTensor& t) const { 67 | return (maxDiff(t) == (T) 0); 68 | } 69 | 70 | private: 71 | enum AllocState { 72 | /// This tensor itself owns the memory, which must be freed via 73 | /// cudaFree 74 | Owner, 75 | 76 | /// This tensor itself is not an owner of the memory; there is 77 | /// nothing to free 78 | NotOwner, 79 | }; 80 | 81 | AllocState state_; 82 | }; 83 | 84 | } } // namespace 85 | 86 | #include "HostTensor-inl.cuh" 87 | -------------------------------------------------------------------------------- /gpu/utils/Limits.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include "Float16.cuh" 14 | #include "Pair.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | template 20 | struct Limits { 21 | }; 22 | 23 | // Unfortunately we can't use constexpr because there is no 24 | // constexpr constructor for half 25 | // FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity 26 | constexpr float kFloatMax = std::numeric_limits::max(); 27 | 28 | template <> 29 | struct Limits { 30 | static __device__ __host__ inline float getMin() { 31 | return -kFloatMax; 32 | } 33 | static __device__ __host__ inline float getMax() { 34 | return kFloatMax; 35 | } 36 | }; 37 | 38 | #ifdef FAISS_USE_FLOAT16 39 | 40 | inline __device__ __host__ half kGetHalf(unsigned short v) { 41 | half h; 42 | h.x = v; 43 | return h; 44 | } 45 | 46 | template <> 47 | struct Limits { 48 | static __device__ __host__ inline half getMin() { 49 | return kGetHalf(0xfbffU); 50 | } 51 | static __device__ __host__ inline half getMax() { 52 | return kGetHalf(0x7bffU); 53 | } 54 | }; 55 | 
56 | #endif // FAISS_USE_FLOAT16 57 | 58 | constexpr int kIntMin = std::numeric_limits::min(); 59 | constexpr int kIntMax = std::numeric_limits::max(); 60 | 61 | template <> 62 | struct Limits { 63 | static __device__ __host__ inline int getMin() { 64 | return kIntMin; 65 | } 66 | static __device__ __host__ inline int getMax() { 67 | return kIntMax; 68 | } 69 | }; 70 | 71 | template 72 | struct Limits> { 73 | static __device__ __host__ inline Pair getMin() { 74 | return Pair(Limits::getMin(), Limits::getMin()); 75 | } 76 | 77 | static __device__ __host__ inline Pair getMax() { 78 | return Pair(Limits::getMax(), Limits::getMax()); 79 | } 80 | }; 81 | 82 | } } // namespace 83 | -------------------------------------------------------------------------------- /gpu/utils/LoadStoreOperators.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "Float16.cuh" 14 | 15 | // 16 | // Templated wrappers to express load/store for different scalar and vector 17 | // types, so kernels can have the same written form but can operate 18 | // over half and float, and on vector types transparently 19 | // 20 | 21 | namespace faiss { namespace gpu { 22 | 23 | template 24 | struct LoadStore { 25 | static inline __device__ T load(void* p) { 26 | return *((T*) p); 27 | } 28 | 29 | static inline __device__ void store(void* p, const T& v) { 30 | *((T*) p) = v; 31 | } 32 | }; 33 | 34 | #ifdef FAISS_USE_FLOAT16 35 | 36 | template <> 37 | struct LoadStore { 38 | static inline __device__ Half4 load(void* p) { 39 | Half4 out; 40 | asm("ld.global.v2.u32 {%0, %1}, [%2];" : 41 | "=r"(out.a.x), "=r"(out.b.x) : "l"(p)); 42 | return out; 43 | } 44 | 45 | static inline __device__ void store(void* p, const Half4& v) { 46 | asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(v.a.x), "r"(v.b.x)); 47 | } 48 | }; 49 | 50 | template <> 51 | struct LoadStore { 52 | static inline __device__ Half8 load(void* p) { 53 | Half8 out; 54 | asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" : 55 | "=r"(out.a.a.x), "=r"(out.a.b.x), 56 | "=r"(out.b.a.x), "=r"(out.b.b.x) : "l"(p)); 57 | return out; 58 | } 59 | 60 | static inline __device__ void store(void* p, const Half8& v) { 61 | asm("st.v4.u32 [%0], {%1, %2, %3, %4};" 62 | : : "l"(p), "r"(v.a.a.x), "r"(v.a.b.x), "r"(v.b.a.x), "r"(v.b.b.x)); 63 | } 64 | }; 65 | 66 | #endif // FAISS_USE_FLOAT16 67 | 68 | } } // namespace 69 | -------------------------------------------------------------------------------- /gpu/utils/MatrixMult.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include "Float16.cuh" 15 | #include "Tensor.cuh" 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | class DeviceMemory; 20 | 21 | /// C = alpha * A * B + beta * C 22 | /// Expects row major layout, not fortran/blas column major! 23 | void runMatrixMult(Tensor& c, bool transC, 24 | Tensor& a, bool transA, 25 | Tensor& b, bool transB, 26 | float alpha, 27 | float beta, 28 | bool useHgemm, // ignored for float32 29 | cublasHandle_t handle, 30 | cudaStream_t stream); 31 | 32 | #ifdef FAISS_USE_FLOAT16 33 | /// C = alpha * A * B + beta * C 34 | /// Expects row major layout, not fortran/blas column major! 35 | void runMatrixMult(Tensor& c, bool transC, 36 | Tensor& a, bool transA, 37 | Tensor& b, bool transB, 38 | float alpha, 39 | float beta, 40 | bool useHgemm, 41 | cublasHandle_t handle, 42 | cudaStream_t stream); 43 | #endif 44 | 45 | /// C_i = alpha * A_i * B_i + beta * C_i 46 | /// where `i` is the outermost dimension, via iterated gemm 47 | /// Expects row major layout, not fortran/blas column major! 48 | void runIteratedMatrixMult(Tensor& c, bool transC, 49 | Tensor& a, bool transA, 50 | Tensor& b, bool transB, 51 | float alpha, 52 | float beta, 53 | cublasHandle_t handle, 54 | cudaStream_t stream); 55 | 56 | /// C_i = alpha * A_i * B_i + beta * C_i 57 | /// where `i` is the outermost dimension, via batched gemm 58 | /// Expects row major layout, not fortran/blas column major! 
59 | void runBatchMatrixMult(Tensor& c, bool transC, 60 | Tensor& a, bool transA, 61 | Tensor& b, bool transB, 62 | float alpha, 63 | float beta, 64 | DeviceMemory& mem, 65 | cublasHandle_t handle, 66 | cudaStream_t stream); 67 | 68 | } } // namespace 69 | -------------------------------------------------------------------------------- /gpu/utils/MemorySpace.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #include "MemorySpace.h" 12 | #include 13 | 14 | namespace faiss { namespace gpu { 15 | 16 | /// Allocates CUDA memory for a given memory space 17 | void allocMemorySpace(MemorySpace space, void** p, size_t size) { 18 | if (space == MemorySpace::Device) { 19 | FAISS_ASSERT_FMT(cudaMalloc(p, size) == cudaSuccess, 20 | "Failed to cudaMalloc %zu bytes", size); 21 | } 22 | #ifdef FAISS_UNIFIED_MEM 23 | else if (space == MemorySpace::Unified) { 24 | FAISS_ASSERT_FMT(cudaMallocManaged(p, size) == cudaSuccess, 25 | "Failed to cudaMallocManaged %zu bytes", size); 26 | } 27 | #endif 28 | else { 29 | FAISS_ASSERT_FMT(false, "Unknown MemorySpace %d", (int) space); 30 | } 31 | } 32 | 33 | } } 34 | -------------------------------------------------------------------------------- /gpu/utils/MemorySpace.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../../FaissAssert.h" 14 | #include 15 | 16 | #if CUDA_VERSION >= 8000 17 | // Whether or not we enable usage of CUDA Unified Memory 18 | #define FAISS_UNIFIED_MEM 1 19 | #endif 20 | 21 | namespace faiss { namespace gpu { 22 | 23 | enum MemorySpace { 24 | /// Managed using cudaMalloc/cudaFree 25 | Device = 1, 26 | /// Managed using cudaMallocManaged/cudaFree 27 | Unified = 2, 28 | }; 29 | 30 | /// Allocates CUDA memory for a given memory space 31 | void allocMemorySpace(MemorySpace space, void** p, size_t size); 32 | 33 | } } 34 | -------------------------------------------------------------------------------- /gpu/utils/MergeNetworkUtils.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #pragma once 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | template 15 | inline __device__ void swap(bool swap, T& x, T& y) { 16 | T tmp = x; 17 | x = swap ? y : x; 18 | y = swap ? tmp : y; 19 | } 20 | 21 | template 22 | inline __device__ void assign(bool assign, T& x, T y) { 23 | x = assign ? y : x; 24 | } 25 | 26 | } } // namespace 27 | -------------------------------------------------------------------------------- /gpu/utils/NoTypeTensor.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "../../FaissAssert.h" 14 | #include "Tensor.cuh" 15 | #include 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | template 20 | class NoTypeTensor { 21 | public: 22 | NoTypeTensor() 23 | : mem_(nullptr), 24 | typeSize_(0) { 25 | } 26 | 27 | template 28 | NoTypeTensor(Tensor& t) 29 | : mem_(t.data()), 30 | typeSize_(sizeof(T)) { 31 | for (int i = 0; i < Dim; ++i) { 32 | size_[i] = t.getSize(i); 33 | stride_[i] = t.getStride(i); 34 | } 35 | } 36 | 37 | NoTypeTensor(void* mem, int typeSize, std::initializer_list sizes) 38 | : mem_(mem), 39 | typeSize_(typeSize) { 40 | 41 | int i = 0; 42 | for (auto s : sizes) { 43 | size_[i++] = s; 44 | } 45 | 46 | stride_[Dim - 1] = (IndexT) 1; 47 | for (int j = Dim - 2; j >= 0; --j) { 48 | stride_[j] = stride_[j + 1] * size_[j + 1]; 49 | } 50 | } 51 | 52 | NoTypeTensor(void* mem, int typeSize, int sizes[Dim]) 53 | : mem_(mem), 54 | typeSize_(typeSize) { 55 | for (int i = 0; i < Dim; ++i) { 56 | size_[i] = sizes[i]; 57 | } 58 | 59 | stride_[Dim - 1] = (IndexT) 1; 60 | for (int i = Dim - 2; i >= 0; --i) { 61 | stride_[i] = stride_[i + 1] * sizes[i + 1]; 62 | } 63 | } 64 | 65 | NoTypeTensor(void* mem, int typeSize, 66 | IndexT sizes[Dim], IndexT strides[Dim]) 67 | : mem_(mem), 68 | typeSize_(typeSize) { 69 | for (int i = 0; i < Dim; ++i) { 70 | size_[i] = sizes[i]; 71 | stride_[i] = strides[i]; 72 | } 73 | } 74 | 75 | int getTypeSize() const { 76 | return typeSize_; 77 | } 78 | 79 | IndexT getSize(int dim) const { 80 | FAISS_ASSERT(dim < Dim); 81 | return size_[dim]; 82 | } 83 | 84 | IndexT getStride(int dim) const { 85 | FAISS_ASSERT(dim < Dim); 86 | return stride_[dim]; 87 | } 88 | 89 | template 90 | Tensor toTensor() { 91 | FAISS_ASSERT(sizeof(T) == typeSize_); 92 | 93 | return Tensor((T*) mem_, size_, stride_); 94 | } 95 | 96 | NoTypeTensor narrowOutermost(IndexT start, IndexT size) { 97 | char* newPtr = (char*) mem_; 98 | 99 | if (start > 0) { 100 | newPtr += typeSize_ * start * 
stride_[0]; 101 | } 102 | 103 | IndexT newSize[Dim]; 104 | for (int i = 0; i < Dim; ++i) { 105 | if (i == 0) { 106 | assert(start + size <= size_[0]); 107 | newSize[i] = size; 108 | } else { 109 | newSize[i] = size_[i]; 110 | } 111 | } 112 | 113 | return NoTypeTensor( 114 | newPtr, typeSize_, newSize, stride_); 115 | } 116 | 117 | private: 118 | void* mem_; 119 | int typeSize_; 120 | IndexT size_[Dim]; 121 | IndexT stride_[Dim]; 122 | }; 123 | 124 | } } // namespace 125 | -------------------------------------------------------------------------------- /gpu/utils/Pair.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include "MathOperators.cuh" 15 | #include "WarpShuffles.cuh" 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | /// A simple pair type for CUDA device usage 20 | template 21 | struct Pair { 22 | constexpr __device__ inline Pair() { 23 | } 24 | 25 | constexpr __device__ inline Pair(K key, V value) 26 | : k(key), v(value) { 27 | } 28 | 29 | __device__ inline bool 30 | operator==(const Pair& rhs) const { 31 | return Math::eq(k, rhs.k) && Math::eq(v, rhs.v); 32 | } 33 | 34 | __device__ inline bool 35 | operator!=(const Pair& rhs) const { 36 | return !operator==(rhs); 37 | } 38 | 39 | __device__ inline bool 40 | operator<(const Pair& rhs) const { 41 | return Math::lt(k, rhs.k) || 42 | (Math::eq(k, rhs.k) && Math::lt(v, rhs.v)); 43 | } 44 | 45 | __device__ inline bool 46 | operator>(const Pair& rhs) const { 47 | return Math::gt(k, rhs.k) || 48 | (Math::eq(k, rhs.k) && Math::gt(v, rhs.v)); 49 | } 50 | 51 | K k; 52 | V v; 53 | }; 54 | 55 | template 56 | inline __device__ Pair 
shfl_up(const Pair& pair, 57 | unsigned int delta, 58 | int width = kWarpSize) { 59 | return Pair(shfl_up(pair.k, delta, width), 60 | shfl_up(pair.v, delta, width)); 61 | } 62 | 63 | template 64 | inline __device__ Pair shfl_xor(const Pair& pair, 65 | int laneMask, 66 | int width = kWarpSize) { 67 | return Pair(shfl_xor(pair.k, laneMask, width), 68 | shfl_xor(pair.v, laneMask, width)); 69 | } 70 | 71 | } } // namespace 72 | -------------------------------------------------------------------------------- /gpu/utils/PtxUtils.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | __device__ __forceinline__ 18 | unsigned int getBitfield(unsigned int val, int pos, int len) { 19 | unsigned int ret; 20 | asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len)); 21 | return ret; 22 | } 23 | 24 | __device__ __forceinline__ 25 | unsigned long getBitfield(unsigned long val, int pos, int len) { 26 | unsigned long ret; 27 | asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); 28 | return ret; 29 | } 30 | 31 | __device__ __forceinline__ 32 | unsigned int setBitfield(unsigned int val, 33 | unsigned int toInsert, int pos, int len) { 34 | unsigned int ret; 35 | asm("bfi.b32 %0, %1, %2, %3, %4;" : 36 | "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len)); 37 | return ret; 38 | } 39 | 40 | __device__ __forceinline__ int getLaneId() { 41 | int laneId; 42 | asm("mov.s32 %0, %laneid;" : "=r"(laneId) ); 43 | return laneId; 44 | } 45 | 46 | __device__ __forceinline__ unsigned getLaneMaskLt() { 47 | unsigned mask; 48 | 
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); 49 | return mask; 50 | } 51 | 52 | __device__ __forceinline__ unsigned getLaneMaskLe() { 53 | unsigned mask; 54 | asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); 55 | return mask; 56 | } 57 | 58 | __device__ __forceinline__ unsigned getLaneMaskGt() { 59 | unsigned mask; 60 | asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask)); 61 | return mask; 62 | } 63 | 64 | __device__ __forceinline__ unsigned getLaneMaskGe() { 65 | unsigned mask; 66 | asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask)); 67 | return mask; 68 | } 69 | 70 | __device__ __forceinline__ void namedBarrierWait(int name, int numThreads) { 71 | asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(numThreads) : "memory"); 72 | } 73 | 74 | __device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) { 75 | asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(numThreads) : "memory"); 76 | } 77 | 78 | // FIXME: prefetch does nothing (in SASS) on Maxwell 79 | __device__ __forceinline__ void prefetchL2(const void *p) { 80 | asm volatile("prefetch.global.L2 [%0];" : : "l"(p)); 81 | } 82 | 83 | __device__ __forceinline__ void prefetchL1(const void *p) { 84 | asm volatile("prefetch.global.L1 [%0];" : : "l"(p)); 85 | } 86 | 87 | } } // namespace 88 | -------------------------------------------------------------------------------- /gpu/utils/ReductionOperators.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include 14 | #include "Limits.cuh" 15 | #include "MathOperators.cuh" 16 | #include "Pair.cuh" 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | template 21 | struct Sum { 22 | __device__ inline T operator()(T a, T b) const { 23 | return Math::add(a, b); 24 | } 25 | 26 | inline __device__ T identity() const { 27 | return Math::zero(); 28 | } 29 | }; 30 | 31 | template 32 | struct Min { 33 | __device__ inline T operator()(T a, T b) const { 34 | return Math::lt(a, b) ? a : b; 35 | } 36 | 37 | inline __device__ T identity() const { 38 | return Limits::getMax(); 39 | } 40 | }; 41 | 42 | template 43 | struct Max { 44 | __device__ inline T operator()(T a, T b) const { 45 | return Math::gt(a, b) ? a : b; 46 | } 47 | 48 | inline __device__ T identity() const { 49 | return Limits::getMin(); 50 | } 51 | }; 52 | 53 | /// Used for producing segmented prefix scans; the value of the Pair 54 | /// denotes the start of a new segment for the scan 55 | template 56 | struct SegmentedReduce { 57 | inline __device__ SegmentedReduce(const ReduceOp& o) 58 | : op(o) { 59 | } 60 | 61 | __device__ 62 | inline Pair 63 | operator()(const Pair& a, const Pair& b) const { 64 | return Pair(b.v ? b.k : op(a.k, b.k), 65 | a.v || b.v); 66 | } 67 | 68 | inline __device__ Pair identity() const { 69 | return Pair(op.identity(), false); 70 | } 71 | 72 | ReduceOp op; 73 | }; 74 | 75 | } } // namespace 76 | -------------------------------------------------------------------------------- /gpu/utils/StackDeviceMemory.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include "DeviceMemory.h" 14 | #include 15 | #include 16 | #include 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | /// Device memory manager that provides temporary memory allocations 21 | /// out of a region of memory 22 | class StackDeviceMemory : public DeviceMemory { 23 | public: 24 | /// Allocate a new region of memory that we manage 25 | explicit StackDeviceMemory(int device, size_t allocPerDevice); 26 | 27 | /// Manage a region of memory for a particular device, with or 28 | /// without ownership 29 | StackDeviceMemory(int device, void* p, size_t size, bool isOwner); 30 | 31 | ~StackDeviceMemory() override; 32 | 33 | int getDevice() const override; 34 | 35 | DeviceMemoryReservation getMemory(cudaStream_t stream, 36 | size_t size) override; 37 | 38 | size_t getSizeAvailable() const override; 39 | std::string toString() const override; 40 | size_t getHighWaterCudaMalloc() const override; 41 | 42 | protected: 43 | void returnAllocation(DeviceMemoryReservation& m) override; 44 | 45 | protected: 46 | /// Previous allocation ranges and the streams for which 47 | /// synchronization is required 48 | struct Range { 49 | inline Range(char* s, char* e, cudaStream_t str) : 50 | start_(s), end_(e), stream_(str) { 51 | } 52 | 53 | // References a memory range [start, end) 54 | char* start_; 55 | char* end_; 56 | cudaStream_t stream_; 57 | }; 58 | 59 | struct Stack { 60 | /// Constructor that allocates memory via cudaMalloc 61 | Stack(int device, size_t size); 62 | 63 | /// Constructor that references a pre-allocated region of memory 64 | Stack(int device, void* p, size_t size, bool isOwner); 65 | ~Stack(); 66 | 67 | /// Returns how much size is available for an allocation without 68 | /// calling cudaMalloc 69 | size_t getSizeAvailable() const; 70 | 71 | /// Obtains an allocation; all allocations are guaranteed to be 16 72 | /// byte aligned 73 | char* getAlloc(size_t size, cudaStream_t stream); 74 | 75 | /// Returns an allocation 
76 | void returnAlloc(char* p, size_t size, cudaStream_t stream); 77 | 78 | /// Returns the stack state 79 | std::string toString() const; 80 | 81 | /// Returns the high-water mark of cudaMalloc activity 82 | size_t getHighWaterCudaMalloc() const; 83 | 84 | /// Device this allocation is on 85 | int device_; 86 | 87 | /// Do we own our region of memory? 88 | bool isOwner_; 89 | 90 | /// Where our allocation begins and ends 91 | /// [start_, end_) is valid 92 | char* start_; 93 | char* end_; 94 | 95 | /// Total size end_ - start_ 96 | size_t size_; 97 | 98 | /// Stack head within [start, end) 99 | char* head_; 100 | 101 | /// List of previous last users of allocations on our stack, for 102 | /// possible synchronization purposes 103 | std::list lastUsers_; 104 | 105 | /// How much cudaMalloc memory is currently outstanding? 106 | size_t mallocCurrent_; 107 | 108 | /// What's the high water mark in terms of memory used from the 109 | /// temporary buffer? 110 | size_t highWaterMemoryUsed_; 111 | 112 | /// What's the high water mark in terms of memory allocated via 113 | /// cudaMalloc? 114 | size_t highWaterMalloc_; 115 | }; 116 | 117 | /// Our device 118 | int device_; 119 | 120 | /// Memory stack 121 | Stack stack_; 122 | }; 123 | 124 | } } // namespace 125 | -------------------------------------------------------------------------------- /gpu/utils/StaticUtils.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include 14 | 15 | namespace faiss { namespace gpu { namespace utils { 16 | 17 | template 18 | constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { 19 | return (a + b - 1) / b; 20 | } 21 | 22 | template 23 | constexpr __host__ __device__ auto roundDown(U a, V b) -> decltype(a + b) { 24 | return (a / b) * b; 25 | } 26 | 27 | template 28 | constexpr __host__ __device__ auto roundUp(U a, V b) -> decltype(a + b) { 29 | return divUp(a, b) * b; 30 | } 31 | 32 | template 33 | constexpr __host__ __device__ T pow(T n, T power) { 34 | return (power > 0 ? n * pow(n, power - 1) : 1); 35 | } 36 | 37 | template 38 | constexpr __host__ __device__ T pow2(T n) { 39 | return pow(2, (T) n); 40 | } 41 | 42 | static_assert(pow2(8) == 256, "pow2"); 43 | 44 | template 45 | constexpr __host__ __device__ int log2(T n, int p = 0) { 46 | return (n <= 1) ? p : log2(n / 2, p + 1); 47 | } 48 | 49 | static_assert(log2(2) == 1, "log2"); 50 | static_assert(log2(3) == 1, "log2"); 51 | static_assert(log2(4) == 2, "log2"); 52 | 53 | template 54 | constexpr __host__ __device__ bool isPowerOf2(T v) { 55 | return (v && !(v & (v - 1))); 56 | } 57 | 58 | static_assert(isPowerOf2(2048), "isPowerOf2"); 59 | static_assert(!isPowerOf2(3333), "isPowerOf2"); 60 | 61 | template 62 | constexpr __host__ __device__ T nextHighestPowerOf2(T v) { 63 | return (isPowerOf2(v) ? 
(T) 2 * v : ((T) 1 << (log2(v) + 1))); 64 | } 65 | 66 | static_assert(nextHighestPowerOf2(1) == 2, "nextHighestPowerOf2"); 67 | static_assert(nextHighestPowerOf2(2) == 4, "nextHighestPowerOf2"); 68 | static_assert(nextHighestPowerOf2(3) == 4, "nextHighestPowerOf2"); 69 | static_assert(nextHighestPowerOf2(4) == 8, "nextHighestPowerOf2"); 70 | 71 | static_assert(nextHighestPowerOf2(15) == 16, "nextHighestPowerOf2"); 72 | static_assert(nextHighestPowerOf2(16) == 32, "nextHighestPowerOf2"); 73 | static_assert(nextHighestPowerOf2(17) == 32, "nextHighestPowerOf2"); 74 | 75 | static_assert(nextHighestPowerOf2(1536000000u) == 2147483648u, 76 | "nextHighestPowerOf2"); 77 | static_assert(nextHighestPowerOf2((size_t) 2147483648ULL) == 78 | (size_t) 4294967296ULL, "nextHighestPowerOf2"); 79 | 80 | } } } // namespace 81 | -------------------------------------------------------------------------------- /gpu/utils/ThrustAllocator.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include 14 | #include 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | /// Allocator for Thrust that comes out of a specified memory space 19 | class GpuResourcesThrustAllocator { 20 | public: 21 | typedef char value_type; 22 | 23 | GpuResourcesThrustAllocator(void* mem, size_t size) 24 | : start_((char*) mem), 25 | cur_((char*) mem), 26 | end_((char*) mem + size) { 27 | } 28 | 29 | ~GpuResourcesThrustAllocator() { 30 | } 31 | 32 | char* allocate(std::ptrdiff_t size) { 33 | if (size <= (end_ - cur_)) { 34 | char* p = cur_; 35 | cur_ += size; 36 | FAISS_ASSERT(cur_ <= end_); 37 | 38 | return p; 39 | } else { 40 | char* p = nullptr; 41 | CUDA_VERIFY(cudaMalloc(&p, size)); 42 | mallocAllocs_.insert(p); 43 | return p; 44 | } 45 | } 46 | 47 | void deallocate(char* p, size_t size) { 48 | // Allocations could be returned out-of-order; ignore those we 49 | // didn't cudaMalloc 50 | auto it = mallocAllocs_.find(p); 51 | if (it != mallocAllocs_.end()) { 52 | CUDA_VERIFY(cudaFree(p)); 53 | mallocAllocs_.erase(it); 54 | } 55 | } 56 | 57 | private: 58 | char* start_; 59 | char* cur_; 60 | char* end_; 61 | std::unordered_set mallocAllocs_; 62 | }; 63 | 64 | 65 | } } // namespace 66 | -------------------------------------------------------------------------------- /gpu/utils/Timer.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "Timer.h" 12 | #include "DeviceUtils.h" 13 | #include "../../FaissAssert.h" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | KernelTimer::KernelTimer(cudaStream_t stream) 18 | : startEvent_(0), 19 | stopEvent_(0), 20 | stream_(stream), 21 | valid_(true) { 22 | CUDA_VERIFY(cudaEventCreate(&startEvent_)); 23 | CUDA_VERIFY(cudaEventCreate(&stopEvent_)); 24 | 25 | CUDA_VERIFY(cudaEventRecord(startEvent_, stream_)); 26 | } 27 | 28 | KernelTimer::~KernelTimer() { 29 | CUDA_VERIFY(cudaEventDestroy(startEvent_)); 30 | CUDA_VERIFY(cudaEventDestroy(stopEvent_)); 31 | } 32 | 33 | float 34 | KernelTimer::elapsedMilliseconds() { 35 | FAISS_ASSERT(valid_); 36 | 37 | CUDA_VERIFY(cudaEventRecord(stopEvent_, stream_)); 38 | CUDA_VERIFY(cudaEventSynchronize(stopEvent_)); 39 | 40 | auto time = 0.0f; 41 | CUDA_VERIFY(cudaEventElapsedTime(&time, startEvent_, stopEvent_)); 42 | valid_ = false; 43 | 44 | return time; 45 | } 46 | 47 | CpuTimer::CpuTimer() { 48 | clock_gettime(CLOCK_REALTIME, &start_); 49 | } 50 | 51 | float 52 | CpuTimer::elapsedMilliseconds() { 53 | struct timespec end; 54 | clock_gettime(CLOCK_REALTIME, &end); 55 | 56 | auto diffS = end.tv_sec - start_.tv_sec; 57 | auto diffNs = end.tv_nsec - start_.tv_nsec; 58 | 59 | return 1000.0f * (float) diffS + ((float) diffNs) / 1000000.0f; 60 | } 61 | 62 | } } // namespace 63 | -------------------------------------------------------------------------------- /gpu/utils/Timer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include 14 | #include 15 | 16 | namespace faiss { namespace gpu { 17 | 18 | /// Utility class for timing execution of a kernel 19 | class KernelTimer { 20 | public: 21 | /// Constructor starts the timer and adds an event into the current 22 | /// device stream 23 | KernelTimer(cudaStream_t stream = 0); 24 | 25 | /// Destructor releases event resources 26 | ~KernelTimer(); 27 | 28 | /// Adds a stop event then synchronizes on the stop event to get the 29 | /// actual GPU-side kernel timings for any kernels launched in the 30 | /// current stream. Returns the number of milliseconds elapsed. 31 | /// Can only be called once. 32 | float elapsedMilliseconds(); 33 | 34 | private: 35 | cudaEvent_t startEvent_; 36 | cudaEvent_t stopEvent_; 37 | cudaStream_t stream_; 38 | bool valid_; 39 | }; 40 | 41 | /// CPU wallclock elapsed timer 42 | class CpuTimer { 43 | public: 44 | /// Creates and starts a new timer 45 | CpuTimer(); 46 | 47 | /// Returns elapsed time in milliseconds 48 | float elapsedMilliseconds(); 49 | 50 | private: 51 | struct timespec start_; 52 | }; 53 | 54 | } } // namespace 55 | -------------------------------------------------------------------------------- /gpu/utils/WarpSelectFloat.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "warpselect/WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | // warp Q to thread Q: 15 | // 1, 1 16 | // 32, 2 17 | // 64, 3 18 | // 128, 3 19 | // 256, 4 20 | // 512, 8 21 | // 1024, 8 22 | 23 | WARP_SELECT_DECL(float, true, 1); 24 | WARP_SELECT_DECL(float, true, 32); 25 | WARP_SELECT_DECL(float, true, 64); 26 | WARP_SELECT_DECL(float, true, 128); 27 | WARP_SELECT_DECL(float, true, 256); 28 | WARP_SELECT_DECL(float, true, 512); 29 | WARP_SELECT_DECL(float, true, 1024); 30 | 31 | WARP_SELECT_DECL(float, false, 1); 32 | WARP_SELECT_DECL(float, false, 32); 33 | WARP_SELECT_DECL(float, false, 64); 34 | WARP_SELECT_DECL(float, false, 128); 35 | WARP_SELECT_DECL(float, false, 256); 36 | WARP_SELECT_DECL(float, false, 512); 37 | WARP_SELECT_DECL(float, false, 1024); 38 | 39 | void runWarpSelect(Tensor& in, 40 | Tensor& outK, 41 | Tensor& outV, 42 | bool dir, int k, cudaStream_t stream) { 43 | FAISS_ASSERT(k <= 1024); 44 | 45 | if (dir) { 46 | if (k == 1) { 47 | WARP_SELECT_CALL(float, true, 1); 48 | } else if (k <= 32) { 49 | WARP_SELECT_CALL(float, true, 32); 50 | } else if (k <= 64) { 51 | WARP_SELECT_CALL(float, true, 64); 52 | } else if (k <= 128) { 53 | WARP_SELECT_CALL(float, true, 128); 54 | } else if (k <= 256) { 55 | WARP_SELECT_CALL(float, true, 256); 56 | } else if (k <= 512) { 57 | WARP_SELECT_CALL(float, true, 512); 58 | } else if (k <= 1024) { 59 | WARP_SELECT_CALL(float, true, 1024); 60 | } 61 | } else { 62 | if (k == 1) { 63 | WARP_SELECT_CALL(float, false, 1); 64 | } else if (k <= 32) { 65 | WARP_SELECT_CALL(float, false, 32); 66 | } else if (k <= 64) { 67 | WARP_SELECT_CALL(float, false, 64); 68 | } else if (k <= 128) { 69 | WARP_SELECT_CALL(float, false, 128); 70 | } else if (k <= 256) { 71 | WARP_SELECT_CALL(float, false, 256); 72 | } else if (k <= 512) { 73 | WARP_SELECT_CALL(float, false, 512); 74 | } else if (k <= 1024) { 75 | WARP_SELECT_CALL(float, false, 1024); 76 | } 77 | } 78 | } 79 | 80 | } } // 
namespace 81 | -------------------------------------------------------------------------------- /gpu/utils/WarpSelectHalf.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "warpselect/WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | 16 | // warp Q to thread Q: 17 | // 1, 1 18 | // 32, 2 19 | // 64, 3 20 | // 128, 3 21 | // 256, 4 22 | // 512, 8 23 | // 1024, 8 24 | 25 | WARP_SELECT_DECL(half, true, 1); 26 | WARP_SELECT_DECL(half, true, 32); 27 | WARP_SELECT_DECL(half, true, 64); 28 | WARP_SELECT_DECL(half, true, 128); 29 | WARP_SELECT_DECL(half, true, 256); 30 | WARP_SELECT_DECL(half, true, 512); 31 | WARP_SELECT_DECL(half, true, 1024); 32 | 33 | WARP_SELECT_DECL(half, false, 1); 34 | WARP_SELECT_DECL(half, false, 32); 35 | WARP_SELECT_DECL(half, false, 64); 36 | WARP_SELECT_DECL(half, false, 128); 37 | WARP_SELECT_DECL(half, false, 256); 38 | WARP_SELECT_DECL(half, false, 512); 39 | WARP_SELECT_DECL(half, false, 1024); 40 | 41 | void runWarpSelect(Tensor& in, 42 | Tensor& outK, 43 | Tensor& outV, 44 | bool dir, int k, cudaStream_t stream) { 45 | FAISS_ASSERT(k <= 1024); 46 | 47 | if (dir) { 48 | if (k == 1) { 49 | WARP_SELECT_CALL(half, true, 1); 50 | } else if (k <= 32) { 51 | WARP_SELECT_CALL(half, true, 32); 52 | } else if (k <= 64) { 53 | WARP_SELECT_CALL(half, true, 64); 54 | } else if (k <= 128) { 55 | WARP_SELECT_CALL(half, true, 128); 56 | } else if (k <= 256) { 57 | WARP_SELECT_CALL(half, true, 256); 58 | } else if (k <= 512) { 59 | WARP_SELECT_CALL(half, true, 512); 60 | } else if (k <= 1024) { 61 | WARP_SELECT_CALL(half, true, 1024); 62 | } 63 | } else { 64 
| if (k == 1) { 65 | WARP_SELECT_CALL(half, false, 1); 66 | } else if (k <= 32) { 67 | WARP_SELECT_CALL(half, false, 32); 68 | } else if (k <= 64) { 69 | WARP_SELECT_CALL(half, false, 64); 70 | } else if (k <= 128) { 71 | WARP_SELECT_CALL(half, false, 128); 72 | } else if (k <= 256) { 73 | WARP_SELECT_CALL(half, false, 256); 74 | } else if (k <= 512) { 75 | WARP_SELECT_CALL(half, false, 512); 76 | } else if (k <= 1024) { 77 | WARP_SELECT_CALL(half, false, 1024); 78 | } 79 | } 80 | } 81 | 82 | #endif 83 | 84 | } } // namespace 85 | -------------------------------------------------------------------------------- /gpu/utils/WarpSelectKernel.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #pragma once 11 | 12 | #include "Float16.cuh" 13 | #include "Select.cuh" 14 | 15 | namespace faiss { namespace gpu { 16 | 17 | template 23 | __global__ void warpSelect(Tensor in, 24 | Tensor outK, 25 | Tensor outV, 26 | K initK, 27 | IndexType initV, 28 | int k) { 29 | constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; 30 | 31 | WarpSelect, 32 | NumWarpQ, NumThreadQ, ThreadsPerBlock> 33 | heap(initK, initV, k); 34 | 35 | int warpId = threadIdx.x / kWarpSize; 36 | int row = blockIdx.x * kNumWarps + warpId; 37 | 38 | if (row >= in.getSize(0)) { 39 | return; 40 | } 41 | 42 | int i = getLaneId(); 43 | K* inStart = in[row][i].data(); 44 | 45 | // Whole warps must participate in the selection 46 | int limit = utils::roundDown(in.getSize(1), kWarpSize); 47 | 48 | for (; i < limit; i += kWarpSize) { 49 | heap.add(*inStart, (IndexType) i); 50 | inStart += kWarpSize; 51 | } 52 | 53 | // Handle non-warp multiple remainder 54 | if (i < in.getSize(1)) { 55 | heap.addThreadQ(*inStart, (IndexType) i); 56 | } 57 | 58 | heap.reduce(); 59 | heap.writeOut(outK[row].data(), 60 | outV[row].data(), k); 61 | } 62 | 63 | void runWarpSelect(Tensor& in, 64 | Tensor& outKeys, 65 | Tensor& outIndices, 66 | bool dir, int k, cudaStream_t stream); 67 | 68 | #ifdef FAISS_USE_FLOAT16 69 | void runWarpSelect(Tensor& in, 70 | Tensor& outKeys, 71 | Tensor& outIndices, 72 | bool dir, int k, cudaStream_t stream); 73 | #endif 74 | 75 | } } // namespace 76 | -------------------------------------------------------------------------------- /gpu/utils/WarpShuffles.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #pragma once 12 | 13 | #include 14 | #include "DeviceDefs.cuh" 15 | #include "Float16.cuh" 16 | 17 | namespace faiss { namespace gpu { 18 | 19 | template 20 | inline __device__ T shfl(const T val, 21 | int srcLane, int width = kWarpSize) { 22 | return __shfl(val, srcLane, width); 23 | } 24 | 25 | // CUDA SDK does not provide specializations for T* 26 | template 27 | inline __device__ T* shfl(T* const val, 28 | int srcLane, int width = kWarpSize) { 29 | static_assert(sizeof(T*) == sizeof(long long), "pointer size"); 30 | long long v = (long long) val; 31 | return (T*) __shfl(v, srcLane, width); 32 | } 33 | 34 | template 35 | inline __device__ T shfl_up(const T val, 36 | unsigned int delta, int width = kWarpSize) { 37 | return __shfl_up(val, delta, width); 38 | } 39 | 40 | // CUDA SDK does not provide specializations for T* 41 | template 42 | inline __device__ T* shfl_up(T* const val, 43 | unsigned int delta, int width = kWarpSize) { 44 | static_assert(sizeof(T*) == sizeof(long long), "pointer size"); 45 | long long v = (long long) val; 46 | return (T*) __shfl_up(v, delta, width); 47 | } 48 | 49 | template 50 | inline __device__ T shfl_down(const T val, 51 | unsigned int delta, int width = kWarpSize) { 52 | return __shfl_down(val, delta, width); 53 | } 54 | 55 | // CUDA SDK does not provide specializations for T* 56 | template 57 | inline __device__ T* shfl_down(T* const val, 58 | unsigned int delta, int width = kWarpSize) { 59 | static_assert(sizeof(T*) == sizeof(long long), "pointer size"); 60 | long long v = (long long) val; 61 | return (T*) __shfl_down(v, delta, width); 62 | } 63 | 64 | template 65 | inline __device__ T shfl_xor(const T val, 66 | int laneMask, int width = kWarpSize) { 67 | return __shfl_xor(val, laneMask, width); 68 | } 69 | 70 | // CUDA SDK does not provide specializations for T* 71 | template 72 | inline __device__ T* shfl_xor(T* const val, 73 | int laneMask, int width = kWarpSize) { 74 | static_assert(sizeof(T*) == sizeof(long long), 
"pointer size"); 75 | long long v = (long long) val; 76 | return (T*) __shfl_xor(v, laneMask, width); 77 | } 78 | 79 | #ifdef FAISS_USE_FLOAT16 80 | inline __device__ half shfl(half v, 81 | int srcLane, int width = kWarpSize) { 82 | unsigned int vu = v.x; 83 | vu = __shfl(vu, srcLane, width); 84 | 85 | half h; 86 | h.x = (unsigned short) vu; 87 | return h; 88 | } 89 | 90 | inline __device__ half shfl_xor(half v, 91 | int laneMask, int width = kWarpSize) { 92 | unsigned int vu = v.x; 93 | vu = __shfl_xor(vu, laneMask, width); 94 | 95 | half h; 96 | h.x = (unsigned short) vu; 97 | return h; 98 | } 99 | #endif 100 | 101 | } } // namespace 102 | -------------------------------------------------------------------------------- /gpu/utils/WorkerThread.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | 11 | #include "WorkerThread.h" 12 | #include "../../FaissAssert.h" 13 | 14 | namespace faiss { namespace gpu { 15 | 16 | WorkerThread::WorkerThread() : 17 | wantStop_(false) { 18 | startThread(); 19 | 20 | // Make sure that the thread has started before continuing 21 | add([](){}).get(); 22 | } 23 | 24 | WorkerThread::~WorkerThread() { 25 | stop(); 26 | waitForThreadExit(); 27 | } 28 | 29 | void 30 | WorkerThread::startThread() { 31 | thread_ = std::thread([this](){ threadMain(); }); 32 | } 33 | 34 | void 35 | WorkerThread::stop() { 36 | std::lock_guard guard(mutex_); 37 | 38 | wantStop_ = true; 39 | monitor_.notify_one(); 40 | } 41 | 42 | std::future 43 | WorkerThread::add(std::function f) { 44 | std::lock_guard guard(mutex_); 45 | 46 | if (wantStop_) { 47 | // The timer thread has been stopped, or we want to stop; we can't 48 | // schedule anything else 49 | std::promise p; 50 | auto fut = p.get_future(); 51 | 52 | // did not execute 53 | p.set_value(false); 54 | return fut; 55 | } 56 | 57 | auto pr = std::promise(); 58 | auto fut = pr.get_future(); 59 | 60 | queue_.emplace_back(std::make_pair(std::move(f), std::move(pr))); 61 | 62 | // Wake up our thread 63 | monitor_.notify_one(); 64 | return fut; 65 | } 66 | 67 | void 68 | WorkerThread::threadMain() { 69 | threadLoop(); 70 | 71 | // Call all pending tasks 72 | FAISS_ASSERT(wantStop_); 73 | 74 | for (auto& f : queue_) { 75 | f.first(); 76 | f.second.set_value(true); 77 | } 78 | } 79 | 80 | void 81 | WorkerThread::threadLoop() { 82 | while (true) { 83 | std::pair, std::promise> data; 84 | 85 | { 86 | std::unique_lock lock(mutex_); 87 | 88 | while (!wantStop_ && queue_.empty()) { 89 | monitor_.wait(lock); 90 | } 91 | 92 | if (wantStop_) { 93 | return; 94 | } 95 | 96 | data = std::move(queue_.front()); 97 | queue_.pop_front(); 98 | } 99 | 100 | data.first(); 101 | data.second.set_value(true); 102 | } 103 | } 104 | 105 | void 106 | WorkerThread::waitForThreadExit() { 107 | try { 108 | thread_.join(); 109 | } 
catch (...) { 110 | } 111 | } 112 | 113 | } } // namespace 114 | -------------------------------------------------------------------------------- /gpu/utils/WorkerThread.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | 11 | #pragma once 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace faiss { namespace gpu { 19 | 20 | class WorkerThread { 21 | public: 22 | WorkerThread(); 23 | 24 | /// Stops and waits for the worker thread to exit, flushing all 25 | /// pending lambdas 26 | ~WorkerThread(); 27 | 28 | /// Request that the worker thread stop itself 29 | void stop(); 30 | 31 | /// Blocking waits in the current thread for the worker thread to 32 | /// stop 33 | void waitForThreadExit(); 34 | 35 | /// Adds a lambda to run on the worker thread; returns a future that 36 | /// can be used to block on its completion. 37 | /// Future status is `true` if the lambda was run in the worker 38 | /// thread; `false` if it was not run, because the worker thread is 39 | /// exiting or has exited. 
40 | std::future add(std::function f); 41 | 42 | private: 43 | void startThread(); 44 | void threadMain(); 45 | void threadLoop(); 46 | 47 | /// Thread that all queued lambdas are run on 48 | std::thread thread_; 49 | 50 | /// Mutex for the queue and exit status 51 | std::mutex mutex_; 52 | 53 | /// Monitor for the exit status and the queue 54 | std::condition_variable monitor_; 55 | 56 | /// Whether or not we want the thread to exit 57 | bool wantStop_; 58 | 59 | /// Queue of pending lambdas to call 60 | std::deque, std::promise>> queue_; 61 | }; 62 | 63 | } } // namespace 64 | -------------------------------------------------------------------------------- /gpu/utils/bitonicSort.cuh: -------------------------------------------------------------------------------- 1 | #ifndef BITONIC_SORT_CUH 2 | #define BITONIC_SORT_CUH 3 | 4 | #include 5 | #include 6 | 7 | namespace faiss { namespace gpu { 8 | template 9 | __device__ void swap1(T& _a, T&_b) { 10 | T h = _a; 11 | _a = _b; 12 | _b = h; 13 | } 14 | 15 | // parallel bitonic sort 16 | template 17 | __device__ void bitonic3(volatile T _val[], volatile uint _idx[], uint _N) { 18 | 19 | for (int k = 2; k <= _N; k <<= 1) { 20 | 21 | // bitonic merge 22 | for (int j = k / 2; j > 0; j /= 2) { 23 | int ixj = threadIdx.x ^ j; // XOR 24 | if ((ixj > threadIdx.x) && (ixj < _N)) { 25 | if ((threadIdx.x & k) == 0) // ascending - descending 26 | { 27 | if (_val[threadIdx.x] > _val[ixj]) { 28 | 29 | swap1(_val[threadIdx.x], _val[ixj]); 30 | swap1(_idx[threadIdx.x], _idx[ixj]); 31 | } 32 | } else { 33 | if (_val[threadIdx.x] < _val[ixj]) { 34 | 35 | swap1(_val[threadIdx.x], _val[ixj]); 36 | swap1(_idx[threadIdx.x], _idx[ixj]); 37 | } 38 | 39 | } 40 | } 41 | __syncthreads(); 42 | } 43 | } 44 | } 45 | 46 | 47 | // parallel bitonic sort 48 | template 49 | __device__ void bitonicLarge(volatile T _val[], volatile uint _idx[], uint _N) { 50 | 51 | for (int k = 2; k <= _N; k <<= 1) { 52 | 53 | // bitonic merge 54 | for (int j = k / 2; j > 
0; j /= 2) { 55 | 56 | for (int tid = threadIdx.x; tid < _N; tid += blockDim.x) { 57 | int ixj = tid ^ j; // XOR 58 | if ((ixj > tid) && (ixj < _N)) { 59 | if ((tid & k) == 0) // ascending - descending 60 | { 61 | if (_val[tid] > _val[ixj]) { 62 | 63 | swap1(_val[tid], _val[ixj]); 64 | swap1(_idx[tid], _idx[ixj]); 65 | } 66 | } else { 67 | if (_val[tid] < _val[ixj]) { 68 | 69 | swap1(_val[tid], _val[ixj]); 70 | swap1(_idx[tid], _idx[ixj]); 71 | } 72 | 73 | } 74 | } 75 | } 76 | __syncthreads(); 77 | } 78 | } 79 | } 80 | 81 | // parallel bitonic sort (descending) 82 | template 83 | __device__ void bitonic3Descending(volatile T _val[], volatile uint _idx[], 84 | uint _N) { 85 | 86 | for (int k = 2; k <= _N; k <<= 1) { 87 | 88 | // bitonic merge 89 | for (int j = k / 2; j > 0; j /= 2) { 90 | int ixj = threadIdx.x ^ j; // XOR 91 | if ((ixj > threadIdx.x) && (ixj < _N)) { 92 | if ((threadIdx.x & k) != 0) // ascending - descending 93 | { 94 | if (_val[threadIdx.x] > _val[ixj]) { 95 | 96 | swap1(_val[threadIdx.x], _val[ixj]); 97 | swap1(_idx[threadIdx.x], _idx[ixj]); 98 | } 99 | } else { 100 | if (_val[threadIdx.x] < _val[ixj]) { 101 | 102 | swap1(_val[threadIdx.x], _val[ixj]); 103 | swap1(_idx[threadIdx.x], _idx[ixj]); 104 | } 105 | 106 | } 107 | } 108 | __syncthreads(); 109 | } 110 | } 111 | } 112 | 113 | }}; 114 | #endif 115 | -------------------------------------------------------------------------------- /gpu/utils/blockselect/BlockSelectFloat1.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, k == 1 (warp queue 1, thread queue 1),
// both selection directions.
BLOCK_SELECT_IMPL(float, true, 1, 1);
BLOCK_SELECT_IMPL(float, false, 1, 1);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloat128.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, k <= 128 (warp queue 128, thread queue 3).
BLOCK_SELECT_IMPL(float, true, 128, 3);
BLOCK_SELECT_IMPL(float, false, 128, 3);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloat256.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, k <= 256 (warp queue 256, thread queue 4).
BLOCK_SELECT_IMPL(float, true, 256, 4);
BLOCK_SELECT_IMPL(float, false, 256, 4);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloat32.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, k <= 32 (warp queue 32, thread queue 2).
BLOCK_SELECT_IMPL(float, true, 32, 2);
BLOCK_SELECT_IMPL(float, false, 32, 2);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloat64.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, k <= 64 (warp queue 64, thread queue 3).
BLOCK_SELECT_IMPL(float, true, 64, 3);
BLOCK_SELECT_IMPL(float, false, 64, 3);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloatF1024.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, direction=false only, k <= 1024
// (split per direction at this size to bound compile time/binary size).
BLOCK_SELECT_IMPL(float, false, 1024, 8);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloatF512.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, direction=false only, k <= 512.
BLOCK_SELECT_IMPL(float, false, 512, 8);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloatT1024.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, direction=true only, k <= 1024.
BLOCK_SELECT_IMPL(float, true, 1024, 8);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectFloatT512.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: float keys, direction=true only, k <= 512.
BLOCK_SELECT_IMPL(float, true, 512, 8);

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalf1.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, k == 1; only built when float16 support is on.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 1, 1);
BLOCK_SELECT_IMPL(half, false, 1, 1);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalf128.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, k <= 128.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 128, 3);
BLOCK_SELECT_IMPL(half, false, 128, 3);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalf256.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, k <= 256.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 256, 4);
BLOCK_SELECT_IMPL(half, false, 256, 4);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalf32.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, k <= 32.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 32, 2);
BLOCK_SELECT_IMPL(half, false, 32, 2);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalf64.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, k <= 64.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 64, 3);
BLOCK_SELECT_IMPL(half, false, 64, 3);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalfF1024.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, direction=false only, k <= 1024.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, false, 1024, 8);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalfF512.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, direction=false only, k <= 512.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, false, 512, 8);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalfT1024.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, direction=true only, k <= 1024.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 1024, 8);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectHalfT512.cu:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "BlockSelectImpl.cuh"

namespace faiss { namespace gpu {

// Instantiation: half keys, direction=true only, k <= 512.
#ifdef FAISS_USE_FLOAT16
BLOCK_SELECT_IMPL(half, true, 512, 8);
#endif

} } // namespace
--------------------------------------------------------------------------------
/gpu/utils/blockselect/BlockSelectImpl.cuh:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
10 | #include "../BlockSelectKernel.cuh" 11 | #include "../Limits.cuh" 12 | 13 | #define BLOCK_SELECT_DECL(TYPE, DIR, WARP_Q) \ 14 | extern void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 15 | Tensor& in, \ 16 | Tensor& outK, \ 17 | Tensor& outV, \ 18 | bool dir, \ 19 | int k, \ 20 | cudaStream_t stream) 21 | 22 | #define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ 23 | void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 24 | Tensor& in, \ 25 | Tensor& outK, \ 26 | Tensor& outV, \ 27 | bool dir, \ 28 | int k, \ 29 | cudaStream_t stream) { \ 30 | auto grid = dim3(in.getSize(0)); \ 31 | \ 32 | constexpr int kBlockSelectNumThreads = 128; \ 33 | auto block = dim3(kBlockSelectNumThreads); \ 34 | \ 35 | FAISS_ASSERT(k <= WARP_Q); \ 36 | FAISS_ASSERT(dir == DIR); \ 37 | \ 38 | auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ 39 | auto vInit = -1; \ 40 | \ 41 | blockSelect \ 42 | <<>>(in, outK, outV, kInit, vInit, k); \ 43 | CUDA_TEST_ERROR(); \ 44 | } 45 | 46 | #define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \ 47 | runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 48 | in, outK, outV, dir, k, stream) 49 | -------------------------------------------------------------------------------- /gpu/utils/helper.cuh: -------------------------------------------------------------------------------- 1 | #ifndef NEARESTNEIGHBOR_HELPER_H 2 | #define NEARESTNEIGHBOR_HELPER_H 3 | 4 | /*! 
\file helper.hh 5 | \brief a collection of helper classes 6 | */ 7 | //#define OUTPUT 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | #define MAX_THREADS 512 16 | #define MAX_BLOCKS 65535 17 | #define WARP_SIZE 32 18 | namespace faiss { namespace gpu { 19 | 20 | 21 | void outputMat(const std::string& _S, const float* _A, 22 | uint _rows, uint _cols,cudaStream_t stream); 23 | 24 | void outputVec(const std::string& _S, const float* _v, 25 | uint _n,cudaStream_t stream); 26 | 27 | void outputVecChar(const std::string& _S, const char* _v, 28 | uint _n,cudaStream_t stream); 29 | void outputVecUint8(const std::string& _S, const uint8_t* _v, 30 | uint _n,cudaStream_t stream); 31 | void outputVecUint(const std::string& _S, const uint* _v, 32 | uint _n,cudaStream_t stream); 33 | void outputVecUShort(const std::string& _S, const ushort* _v, 34 | uint _n,cudaStream_t stream); 35 | 36 | void outputVecInt(const std::string& _S, const int* _v,uint _n,cudaStream_t stream); 37 | 38 | void outputVecLong(const std::string& _S, const long* _v,uint _n,cudaStream_t stream); 39 | 40 | void checkPrefixSumOffsets(const int* _v,uint _n,cudaStream_t stream); 41 | 42 | 43 | } 44 | 45 | } /* namespace */ 46 | 47 | 48 | 49 | #endif /* NEARESTNEIGHBOR_HELPER_H */ 50 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloat1.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 1, 1); 15 | WARP_SELECT_IMPL(float, false, 1, 1); 16 | 17 | } } // namespace 18 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloat128.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 128, 3); 15 | WARP_SELECT_IMPL(float, false, 128, 3); 16 | 17 | } } // namespace 18 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloat256.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 256, 4); 15 | WARP_SELECT_IMPL(float, false, 256, 4); 16 | 17 | } } // namespace 18 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloat32.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 32, 2); 15 | WARP_SELECT_IMPL(float, false, 32, 2); 16 | 17 | } } // namespace 18 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloat64.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 64, 3); 15 | WARP_SELECT_IMPL(float, false, 64, 3); 16 | 17 | } } // namespace 18 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloatF1024.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, false, 1024, 8); 15 | 16 | } } // namespace 17 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloatF512.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, false, 512, 8); 15 | 16 | } } // namespace 17 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloatT1024.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 1024, 8); 15 | 16 | } } // namespace 17 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectFloatT512.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. 
All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | WARP_SELECT_IMPL(float, true, 512, 8); 15 | 16 | } } // namespace 17 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalf1.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 1, 1); 16 | WARP_SELECT_IMPL(half, false, 1, 1); 17 | #endif 18 | 19 | } } // namespace 20 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalf128.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 128, 3); 16 | WARP_SELECT_IMPL(half, false, 128, 3); 17 | #endif 18 | 19 | } } // namespace 20 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalf256.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 256, 4); 16 | WARP_SELECT_IMPL(half, false, 256, 4); 17 | #endif 18 | 19 | } } // namespace 20 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalf32.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 32, 2); 16 | WARP_SELECT_IMPL(half, false, 32, 2); 17 | #endif 18 | 19 | } } // namespace 20 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalf64.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 64, 3); 16 | WARP_SELECT_IMPL(half, false, 64, 3); 17 | #endif 18 | 19 | } } // namespace 20 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalfF1024.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, false, 1024, 8); 16 | #endif 17 | 18 | } } // namespace 19 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalfF512.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, false, 512, 8); 16 | #endif 17 | 18 | } } // namespace 19 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalfT1024.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 1024, 8); 16 | #endif 17 | 18 | } } // namespace 19 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectHalfT512.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 10 | #include "WarpSelectImpl.cuh" 11 | 12 | namespace faiss { namespace gpu { 13 | 14 | #ifdef FAISS_USE_FLOAT16 15 | WARP_SELECT_IMPL(half, true, 512, 8); 16 | #endif 17 | 18 | } } // namespace 19 | -------------------------------------------------------------------------------- /gpu/utils/warpselect/WarpSelectImpl.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved. 
10 | #include "../WarpSelectKernel.cuh" 11 | #include "../Limits.cuh" 12 | 13 | #define WARP_SELECT_DECL(TYPE, DIR, WARP_Q) \ 14 | extern void runWarpSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 15 | Tensor& in, \ 16 | Tensor& outK, \ 17 | Tensor& outV, \ 18 | bool dir, \ 19 | int k, \ 20 | cudaStream_t stream) 21 | 22 | #define WARP_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ 23 | void runWarpSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 24 | Tensor& in, \ 25 | Tensor& outK, \ 26 | Tensor& outV, \ 27 | bool dir, \ 28 | int k, \ 29 | cudaStream_t stream) { \ 30 | \ 31 | constexpr int kWarpSelectNumThreads = 128; \ 32 | auto grid = dim3(utils::divUp(in.getSize(0), \ 33 | (kWarpSelectNumThreads / kWarpSize))); \ 34 | auto block = dim3(kWarpSelectNumThreads); \ 35 | \ 36 | FAISS_ASSERT(k <= WARP_Q); \ 37 | FAISS_ASSERT(dir == DIR); \ 38 | \ 39 | auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ 40 | auto vInit = -1; \ 41 | \ 42 | warpSelect \ 43 | <<>>(in, outK, outV, kInit, vInit, k); \ 44 | CUDA_TEST_ERROR(); \ 45 | } 46 | 47 | #define WARP_SELECT_CALL(TYPE, DIR, WARP_Q) \ 48 | runWarpSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ 49 | in, outK, outV, dir, k, stream) 50 | -------------------------------------------------------------------------------- /index_io.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. 
All Rights Reserved 10 | // -*- c++ -*- 11 | // I/O code for indexes 12 | 13 | #ifndef FAISS_INDEX_IO_H 14 | #define FAISS_INDEX_IO_H 15 | 16 | #include 17 | 18 | namespace faiss { 19 | 20 | struct Index; 21 | struct VectorTransform; 22 | struct IndexIVF; 23 | struct ProductQuantizer; 24 | 25 | void write_index (const Index *idx, FILE *f); 26 | void write_index (const Index *idx, const char *fname); 27 | 28 | /** 29 | * mmap'ing currently works only for IndexIVFPQCompact, the 30 | * IndexIVFPQCompact destructor will unmap the file. 31 | */ 32 | Index *read_index (FILE * f, bool try_mmap = false); 33 | Index *read_index (const char *fname, bool try_mmap = false); 34 | 35 | 36 | 37 | void write_VectorTransform (const VectorTransform *vt, const char *fname); 38 | VectorTransform *read_VectorTransform (const char *fname); 39 | 40 | ProductQuantizer * read_ProductQuantizer (const char*fname); 41 | void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname); 42 | 43 | 44 | 45 | /* cloning functions */ 46 | Index *clone_index (const Index *); 47 | 48 | /** Cloner class, useful to override classes with other cloning 49 | * functions. The cloning function above just calls 50 | * Cloner::clone_Index. 
*/ 51 | struct Cloner { 52 | virtual VectorTransform *clone_VectorTransform (const VectorTransform *); 53 | virtual Index *clone_Index (const Index *); 54 | virtual IndexIVF *clone_IndexIVF (const IndexIVF *); 55 | virtual ~Cloner() {} 56 | }; 57 | 58 | } 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) 2 | 3 | # Build each source file independently 4 | include_directories(../../) # faiss root directory 5 | 6 | # gtest 7 | 8 | find_package(GTest REQUIRED) 9 | 10 | include_directories(${GTEST_INCLUDE_DIRS}) 11 | set(GTEST_ROOT /usr/include) 12 | foreach(source ${srcs}) 13 | get_filename_component(name ${source} NAME_WE) 14 | 15 | # target 16 | add_executable(${name} ${source}) 17 | target_link_libraries(${name} ${faiss_lib} ${BLAS_LIB} ${GTEST_BOTH_LIBRARIES}) 18 | 19 | # Install 20 | install(TARGETS ${name} DESTINATION test) 21 | endforeach(source) 22 | -------------------------------------------------------------------------------- /tests/test_blas: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjuchenwei/vector-line-quantization/af6abd833c3c1fd18184a72153fd3331fe6b5291/tests/test_blas -------------------------------------------------------------------------------- /tests/test_blas.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include 10 | #include 11 | 12 | #undef FINTEGER 13 | #define FINTEGER long 14 | 15 | 16 | extern "C" { 17 | 18 | /* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ 19 | 20 | int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER * 21 | n, FINTEGER *k, const float *alpha, const float *a, 22 | FINTEGER *lda, const float *b, FINTEGER * 23 | ldb, float *beta, float *c, FINTEGER *ldc); 24 | 25 | /* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */ 26 | 27 | int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda, 28 | float *tau, float *work, FINTEGER *lwork, FINTEGER *info); 29 | 30 | } 31 | 32 | float *new_random_vec(int size) 33 | { 34 | float *x = new float[size]; 35 | for (int i = 0; i < size; i++) 36 | x[i] = drand48(); 37 | return x; 38 | } 39 | 40 | 41 | int main() { 42 | 43 | FINTEGER m = 10, n = 20, k = 30; 44 | float *a = new_random_vec(m * k), *b = new_random_vec(n * k), *c = new float[n * m]; 45 | float one = 1.0, zero = 0.0; 46 | 47 | printf("BLAS test\n"); 48 | 49 | sgemm_("Not transposed", "Not transposed", 50 | &m, &n, &k, &one, a, &m, b, &k, &zero, c, &m); 51 | 52 | printf("errors=\n"); 53 | 54 | for (int i = 0; i < m; i++) { 55 | for (int j = 0; j < n; j++) { 56 | float accu = 0; 57 | for (int l = 0; l < k; l++) 58 | accu += a[i + l * m] * b[l + j * k]; 59 | printf ("%6.3f ", accu - c[i + j * m]); 60 | } 61 | printf("\n"); 62 | } 63 | 64 | long info = 0x64bL << 32; 65 | long mi = 0x64bL << 32 | m; 66 | float *tau = new float[m]; 67 | FINTEGER lwork = -1; 68 | 69 | float work1; 70 | 71 | printf("Intentional Lapack error (appears only for 64-bit INTEGER):\n"); 72 | sgeqrf_ (&mi, &n, c, &m, tau, &work1, &lwork, (FINTEGER*)&info); 73 | 74 | // sgeqrf_ (&m, &n, c, &zeroi, tau, &work1, &lwork, (FINTEGER*)&info); 75 | printf("info=%016lx\n", info); 76 | 77 | if(info >> 32 == 0x64b) { 78 | printf("Lapack uses 32-bit integers\n"); 79 | } else { 80 | printf("Lapack uses 
64-bit integers\n"); 81 | } 82 | 83 | 84 | return 0; 85 | } 86 | -------------------------------------------------------------------------------- /tests/test_ivfpq_codec.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | 15 | #include "../IndexIVFPQ.h" 16 | #include "../IndexFlat.h" 17 | #include "../utils.h" 18 | 19 | 20 | // dimension of the vectors to index 21 | int d = 64; 22 | 23 | // size of the database we plan to index 24 | size_t nb = 8000; 25 | 26 | 27 | double eval_codec_error (long ncentroids, long m, const std::vector &v) 28 | { 29 | faiss::IndexFlatL2 coarse_quantizer (d); 30 | faiss::IndexIVFPQ index (&coarse_quantizer, d, 31 | ncentroids, m, 8); 32 | index.pq.cp.niter = 10; // speed up train 33 | index.train (nb, v.data()); 34 | 35 | // encode and decode to compute reconstruction error 36 | 37 | std::vector keys (nb); 38 | std::vector codes (nb * m); 39 | index.encode_multiple (nb, keys.data(), v.data(), codes.data(), true); 40 | 41 | std::vector v2 (nb * d); 42 | index.decode_multiple (nb, keys.data(), codes.data(), v2.data()); 43 | 44 | return faiss::fvec_L2sqr (v.data(), v2.data(), nb * d); 45 | } 46 | 47 | 48 | 49 | TEST(IVFPQ, codec) { 50 | 51 | std::vector database (nb * d); 52 | for (size_t i = 0; i < nb * d; i++) { 53 | database[i] = drand48(); 54 | } 55 | 56 | double err0 = eval_codec_error(16, 8, database); 57 | 58 | // should be more accurate as there are more coarse centroids 59 | double err1 = eval_codec_error(128, 8, database); 60 | EXPECT_GT(err0, err1); 61 | 62 | // should be more accurate as there are more PQ codes 63 | double err2 = eval_codec_error(16, 16, database); 64 | EXPECT_GT(err0, err2); 65 | } 66 | 
-------------------------------------------------------------------------------- /tests/test_ivfpq_indexing.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the CC-by-NC license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Copyright 2004-present Facebook. All Rights Reserved 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | #include "../IndexIVFPQ.h" 17 | #include "../IndexFlat.h" 18 | #include "../index_io.h" 19 | 20 | TEST(IVFPQ, accuracy) { 21 | 22 | // dimension of the vectors to index 23 | int d = 64; 24 | 25 | // size of the database we plan to index 26 | size_t nb = 1000; 27 | 28 | // make a set of nt training vectors in the unit cube 29 | // (could be the database) 30 | size_t nt = 1500; 31 | 32 | // make the index object and train it 33 | faiss::IndexFlatL2 coarse_quantizer (d); 34 | 35 | // a reasonable number of cetroids to index nb vectors 36 | int ncentroids = 25; 37 | 38 | faiss::IndexIVFPQ index (&coarse_quantizer, d, 39 | ncentroids, 16, 8); 40 | 41 | // index that gives the ground-truth 42 | faiss::IndexFlatL2 index_gt (d); 43 | 44 | srand48 (35); 45 | 46 | { // training 47 | 48 | std::vector trainvecs (nt * d); 49 | for (size_t i = 0; i < nt * d; i++) { 50 | trainvecs[i] = drand48(); 51 | } 52 | index.verbose = true; 53 | index.train (nt, trainvecs.data()); 54 | } 55 | 56 | { // populating the database 57 | 58 | std::vector database (nb * d); 59 | for (size_t i = 0; i < nb * d; i++) { 60 | database[i] = drand48(); 61 | } 62 | 63 | index.add (nb, database.data()); 64 | index_gt.add (nb, database.data()); 65 | } 66 | 67 | int nq = 200; 68 | int n_ok; 69 | 70 | { // searching the database 71 | 72 | std::vector queries (nq * d); 73 | for (size_t i = 0; i < nq * d; i++) { 74 | queries[i] = drand48(); 75 | } 76 | 77 | std::vector 
gt_nns (nq); 78 | std::vector gt_dis (nq); 79 | 80 | index_gt.search (nq, queries.data(), 1, 81 | gt_dis.data(), gt_nns.data()); 82 | 83 | index.nprobe = 5; 84 | int k = 5; 85 | std::vector nns (k * nq); 86 | std::vector dis (k * nq); 87 | 88 | index.search (nq, queries.data(), k, dis.data(), nns.data()); 89 | 90 | n_ok = 0; 91 | for (int q = 0; q < nq; q++) { 92 | 93 | for (int i = 0; i < k; i++) 94 | if (nns[q * k + i] == gt_nns[q]) 95 | n_ok++; 96 | } 97 | EXPECT_GT(n_ok, nq * 0.4); 98 | } 99 | 100 | } 101 | --------------------------------------------------------------------------------