├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── pybind
    └── pybind.cpp
├── src
    ├── Flinng.h
    └── LshFunctions.h
└── test
    ├── promethion-gt.npy
    ├── promethion.py
    └── test-dense.py


/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | test/promethion-data*


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Joshua Engels
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Base file courtesy of https://spin.atomicobject.com/2016/08/26/makefile-c-projects/
 2 | 
 3 | # Build and source directories, target executable name
 4 | TARGET_PYBIND ?= flinng.so
 5 | BUILD_DIR ?= ./build
 6 | SRC_DIR ?= src
 7 | 
 8 | # Find cpp source files
 9 | SRCS := $(shell find $(SRC_DIR) -name '*.cpp')
10 | OBJS := $(SRCS:%=$(BUILD_DIR)/%.o)
11 | DEPS := $(OBJS:.o=.d)
12 | 
13 | # Define flags
14 | CPP_DEBUG_FLAGS := -g -fno-omit-frame-pointer 
15 | CPP_OPT_FLAGS := -O3 -ffast-math
16 | CPP_WARN_FLAGS := -Wall -Werror
17 | INC_FLAGS := $(shell python3 -m pybind11 --includes)
18 | CPPFLAGS ?= -std=c++11 $(INC_FLAGS) $(CPP_WARN_FLAGS) $(CPP_OPT_FLAGS) $(CPP_DEBUG_FLAGS) -MMD -MP -fopenmp
19 | 
20 | all: $(BUILD_DIR)/$(TARGET_PYBIND)
21 | 
22 | # Make target pybind
23 | $(BUILD_DIR)/$(TARGET_PYBIND): ./pybind/pybind.cpp $(OBJS)
24 | 	$(MKDIR_P) $(dir $@)
25 | 	g++ -shared -o $(BUILD_DIR)/$(TARGET_PYBIND) $(CPPFLAGS) -undefined dynamic_lookup -fPIC ./pybind/pybind.cpp $(SRCS)
26 | 
27 | # Make c++ source into object files
28 | $(BUILD_DIR)/%.cpp.o: %.cpp
29 | 	$(MKDIR_P) $(dir $@)
30 | 	g++ $(CPPFLAGS) -c $< -o $@
31 | 
32 | # Clean command
33 | .PHONY: clean
34 | clean:
35 | 	$(RM) -r $(BUILD_DIR)
36 | 
37 | -include $(DEPS)
38 | -include $(TESTDEPS)
39 | 
40 | MKDIR_P ?= mkdir -p
41 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # FLINNG
  3 | 
  4 | Filters to Identify Near-Neighbor Groups (FLINNG) is a near neighbor search algorithm outlined in the paper 
  5 | [Practical Near Neighbor Search via Group Testing](https://arxiv.org/pdf/2106.11565.pdf). 
  6 | 
  7 | This branch (the main branch) contains a moderately cleaned up version of FLINNG. To 
  8 | access the original research code, use the research branch. Only this branch will 
  9 | be actively updated.
 10 | ## Features
 11 | 
 12 | - If using C++, header-only 
 13 | - If using Python, clean and simple bindings
 14 | - Incremental/streaming index construction
 15 | - Parallel index construction and querying
 16 | 
 17 | Note that some features of the research branch have yet to be ported over, and there are a few improvements
 18 | this branch might soon receive:
 19 | - Signed random projection and densified minhash performance optimization (sparsify SRP, improve DOPH)
 20 | - Index dumping to and from disk
 21 | - Addition of random seeds for reproducible experiments
 22 | 
 23 | ## Installation
 24 | 
 25 | To install Python bindings, run
 26 | ```
 27 | git clone --depth 1 https://github.com/JoshEngels/FLINNG
 28 | cd FLINNG
 29 | make
 30 | export PYTHONPATH=$(pwd)/build:$PYTHONPATH
 31 | ```
 32 | 
 33 | You will need to have pybind11 installed with conda or pip. This has been tested
 34 | on an M1 Mac and on Windows 10 WSL with Ubuntu.
 35 | 
 36 | If you want to be able to use FLINNG without running the export command every time you
 37 | start a new terminal, add the export command to your .bashrc or another file that 
 38 | gets run on terminal startup.
 39 | 
 40 | To use the C++ headers, you just need to clone the repo and copy src/Flinng.h to your project. You 
 41 | can also copy src/LshFunctions.h to hash your data before passing into Flinng.h; see
 42 | pybind/pybind.cpp for a direct example of how this works.
 43 | 
 44 | 
 45 | ## Usage/Examples
 46 | 
 47 | ### Python
 48 | To use FLINNG we must first create a new index, either a dense or a sparse index. 
 49 | A dense index will use the cosine similarity for the similarity search and accept
 50 | points as dense vectors in R^n (2D numpy array), while a sparse index will use the Jaccard similarity
 51 | for the similarity search and accept points as sets of positive integers 
 52 | (2D numpy array if all sets are the same length, otherwise python list of lists ). 
 53 | 
 54 | Here are the steps to use the FLINNG Python API:
 55 | 
 56 | Create a dense or sparse index:
 57 | ```python
 58 | dense_index = flinng.dense_32_bit(
 59 |                             num_rows, 
 60 |                             cells_per_row, 
 61 |                             data_dimension, 
 62 |                             num_hash_tables, 
 63 |                             hashes_per_table)
 64 | sparse_index = flinng.sparse_32_bit(
 65 |                             num_rows, 
 66 |                             cells_per_row, 
 67 |                             num_hash_tables, 
 68 |                             hashes_per_table,
 69 |                             hash_range_pow)
 70 | ```
 71 | 
 72 | Add points to the index:
 73 | ```python
 74 | index.add_points(dataset)
 75 | ```
 76 | 
 77 | Prepare for querying:
 78 | ```python
 79 | index.prepare_for_queries()
 80 | ```
 81 | 
 82 | Query:
 83 | ```python
 84 | results = index.query(queries, top_k)
 85 | ```
 86 | 
 87 | test/dense_data.py contains a complete example for running FLINNG on a synthetic dense data,
 88 | expected result ~100% R1@1. 
 89 | test/promethion.py contains a complete example for running FLINNG on real sparse DNA 
 90 | data, expected result ~98% R10@100. Make sure to read the comment at the beginning of promethion.py to see
 91 | how to download the dataset.
 92 | 
 93 | 
 94 | ### C++
 95 | Similar to the above, but example usage might look like:
 96 | ```C++
 97 | auto flinng = Flinng(num_rows, cells_per_row, num_hashes, hash_range);
 98 | std::vector<uint64_t> dataHashes = getHashes(data); // You need to implement this yourself or use LshFunctions.h
 99 | flinng.addPoints(hashes);
100 | flinng.prepareForQueries();
101 | std::vector<uint64_t> queryHashes = getHashes(queries); // You need to implement this yourself or use LshFunctions.h
102 | auto results = flinng.query(queryHashes, topK);
103 | ```
104 | 
105 | 
106 | ## Authors
107 | 
108 | Implementation by [Josh Engels](https://www.github.com/joshengels). 
109 | FLINNG created in collaboration with [Ben Coleman](https://randorithms.com/about.html)
110 | and [Anshumali Shrivastava](https://www.cs.rice.edu/~as143/).
111 | 
112 | Please feel free to contact josh.adam.engels@gmail.com with any questions.
113 | 
114 | ## Contributing
115 | 
116 | Currently, contributions are limited to bug fixes and suggestions. 
117 | For a bug fix, feel free to submit a PR or send an email. 
118 | 
119 | ## Citations
120 | 
121 | If you found our work useful, please cite our work as follows:
122 | 
123 | ```
124 | @inproceedings{NEURIPS2021_5248e511,
125 |  author = {Engels, Joshua and Coleman, Benjamin and Shrivastava, Anshumali},
126 |  booktitle = {Advances in Neural Information Processing Systems},
127 |  editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan},
128 |  pages = {9950--9962},
129 |  publisher = {Curran Associates, Inc.},
130 |  title = { Practical Near Neighbor Search via Group Testing},
131 |  url = {https://proceedings.neurips.cc/paper_files/paper/2021/file/5248e5118c84beea359b6ea385393661-Paper.pdf},
132 |  volume = {34},
133 |  year = {2021}
134 | }
135 | ```
136 | 


--------------------------------------------------------------------------------
/pybind/pybind.cpp:
--------------------------------------------------------------------------------
  1 | #include "../src/Flinng.h"
  2 | #include "../src/LshFunctions.h"
  3 | #include <pybind11/numpy.h>
  4 | #include <pybind11/pybind11.h>
  5 | #include <pybind11/stl.h>
  6 | 
  7 | template <typename T>
  8 | uint64_t checkValidAndGetNumPoints(pybind11::array_t<T> points,
  9 |                                    uint64_t data_dimension) {
 10 |   auto points_buf = points.request();
 11 | 
 12 |   if (points_buf.ndim != 2) {
 13 |     throw std::invalid_argument(
 14 |         "The input points must be a 2 dimensional Numpy array where each "
 15 |         "row is a single point.");
 16 |   }
 17 |   uint64_t num_points = (uint64_t)points_buf.shape[0];
 18 |   uint64_t point_dimension = (uint64_t)points_buf.shape[1];
 19 |   if ((data_dimension != 0 && point_dimension != data_dimension) ||
 20 |       num_points == 0) {
 21 |     throw std::invalid_argument("The rows (each point) must be of dimension " +
 22 |                                 std::to_string(data_dimension) +
 23 |                                 ", and there must be at least 1 row.");
 24 |   }
 25 | 
 26 |   return num_points;
 27 | }
 28 | 
 29 | class DenseFlinng32 {
 30 | 
 31 | public:
 32 |   DenseFlinng32(uint64_t num_rows, uint64_t cells_per_row,
 33 |                 uint64_t data_dimension, uint64_t num_hash_tables,
 34 |                 uint64_t hashes_per_table)
 35 |       : internal_flinng(num_rows, cells_per_row, num_hash_tables,
 36 |                         1 << hashes_per_table),
 37 |         num_hash_tables(num_hash_tables), hashes_per_table(hashes_per_table),
 38 |         data_dimension(data_dimension),
 39 |         rand_bits(num_hash_tables * hashes_per_table * data_dimension) {
 40 | 
 41 |     for (uint64_t i = 0; i < rand_bits.size(); i++) {
 42 |       rand_bits[i] = (rand() % 2) * 2 - 1; // 50% chance either 1 or -1
 43 |     }
 44 |   }
 45 | 
 46 |   // See
 47 |   // https://pybind11.readthedocs.io/en/stable/advanced/pycpp/numpy.html?highlight=numpy#arrays
 48 |   // for explanation of why we do py::array::c_style and py::array::forcecase
 49 |   // Basically ensures array is in dense row major order
 50 |   void addPoints(pybind11::array_t<float, pybind11::array::c_style |
 51 |                                               pybind11::array::forcecast>
 52 |                      points) {
 53 | 
 54 |     checkValidAndGetNumPoints<float>(points, data_dimension);
 55 |     std::vector<uint64_t> hashes = getHashes(points);
 56 |     internal_flinng.addPoints(hashes);
 57 |   }
 58 | 
 59 |   void prepareForQueries() { internal_flinng.prepareForQueries(); }
 60 | 
 61 |   pybind11::array_t<uint64_t>
 62 |   query(pybind11::array_t<float,
 63 |                           pybind11::array::c_style | pybind11::array::forcecast>
 64 |             queries,
 65 |         uint32_t top_k) {
 66 | 
 67 |     uint64_t num_queries =
 68 |         checkValidAndGetNumPoints<float>(queries, data_dimension);
 69 |     std::vector<uint64_t> hashes = getHashes(queries);
 70 |     std::vector<uint64_t> results = internal_flinng.query(hashes, top_k);
 71 | 
 72 |     return pybind11::array_t<uint64_t>(
 73 |         std::vector<ptrdiff_t>{(int64_t)num_queries, top_k}, &results[0]);
 74 |   }
 75 | 
 76 | private:
 77 |   Flinng internal_flinng;
 78 |   const uint64_t num_hash_tables, hashes_per_table, data_dimension;
 79 |   std::vector<int8_t> rand_bits;
 80 | 
 81 |   std::vector<uint64_t>
 82 |   getHashes(pybind11::array_t<float, pybind11::array::c_style |
 83 |                                          pybind11::array::forcecast>
 84 |                 points) {
 85 |     auto points_buf = points.request();
 86 |     uint64_t num_points = (uint64_t)points_buf.shape[0];
 87 |     float *points_ptr = (float *)points_buf.ptr;
 88 |     return parallel_srp(points_ptr, num_points, data_dimension,
 89 |                         rand_bits.data(), num_hash_tables, hashes_per_table);
 90 |   }
 91 | };
 92 | 
 93 | class SparseFlinng32 {
 94 | 
 95 | public:
 96 |   SparseFlinng32(uint64_t num_rows, uint64_t cells_per_row,
 97 |                  uint64_t num_hash_tables, uint64_t hashes_per_table,
 98 |                  uint64_t hash_range_pow)
 99 |       : internal_flinng(num_rows, cells_per_row, num_hash_tables,
100 |                         1 << hash_range_pow),
101 |         num_hash_tables(num_hash_tables), hashes_per_table(hashes_per_table),
102 |         hash_range_pow(hash_range_pow), seed(rand()) {}
103 | 
104 |   // See
105 |   // https://pybind11.readthedocs.io/en/stable/advanced/pycpp/numpy.html?highlight=numpy#arrays
106 |   // for explanation of why we do py::array::c_style and py::array::forcecase
107 |   // Basically ensures array is in dense row major order
108 |   void
109 |   addPointsSameDim(pybind11::array_t<uint64_t, pybind11::array::c_style |
110 |                                                    pybind11::array::forcecast>
111 |                        points) {
112 |     checkValidAndGetNumPoints<uint64_t>(points, 0);
113 |     std::vector<uint64_t> hashes = getHashes(points);
114 |     internal_flinng.addPoints(hashes);
115 |   }
116 | 
117 |   void addPoints(std::vector<std::vector<uint64_t>> data) {
118 |     std::vector<uint64_t> hashes = getHashes(data);
119 |     internal_flinng.addPoints(hashes);
120 |   }
121 | 
122 |   std::vector<uint64_t> hashPoints(std::vector<std::vector<uint64_t>> data) {
123 |     return getHashes(data);
124 |     // internal_flinng.addPoints(hashes);
125 |   }
126 | 
127 |   void prepareForQueries() { internal_flinng.prepareForQueries(); }
128 | 
129 |   pybind11::array_t<uint64_t> query(std::vector<std::vector<uint64_t>> queries,
130 |                                     uint64_t top_k) {
131 |     std::vector<uint64_t> hashes = getHashes(queries);
132 |     std::vector<uint64_t> results = internal_flinng.query(hashes, top_k);
133 | 
134 |     return pybind11::array_t<uint64_t>(
135 |         std::vector<ptrdiff_t>{(int64_t)queries.size(), (int64_t)top_k},
136 |         &results[0]);
137 |   }
138 | 
139 |   pybind11::array_t<uint64_t>
140 |   querySameDim(pybind11::array_t<uint64_t, pybind11::array::c_style |
141 |                                                pybind11::array::forcecast>
142 |                    queries,
143 |                uint32_t top_k) {
144 | 
145 |     uint64_t num_queries = checkValidAndGetNumPoints<uint64_t>(queries, 0);
146 |     std::vector<uint64_t> hashes = getHashes(queries);
147 |     std::vector<uint64_t> results = internal_flinng.query(hashes, top_k);
148 | 
149 |     return pybind11::array_t<uint64_t>(
150 |         std::vector<ptrdiff_t>{(int64_t)num_queries, top_k}, &results[0]);
151 |   }
152 | 
153 | private:
154 |   Flinng internal_flinng;
155 |   const uint64_t num_hash_tables, hashes_per_table, hash_range_pow;
156 |   const uint32_t seed;
157 | 
158 |   std::vector<uint64_t>
159 |   getHashes(pybind11::array_t<uint64_t, pybind11::array::c_style |
160 |                                             pybind11::array::forcecast>
161 |                 data) {
162 |     auto points_buf = data.request();
163 |     uint64_t num_points = (uint64_t)points_buf.shape[0];
164 |     uint64_t *points_ptr = (uint64_t *)points_buf.ptr;
165 |     uint64_t point_dimension = (uint64_t)points_buf.shape[1];
166 |     return parallel_densified_minhash(points_ptr, num_points, point_dimension,
167 |                                       num_hash_tables, hashes_per_table,
168 |                                       hash_range_pow, seed);
169 |   }
170 | 
171 |   std::vector<uint64_t> getHashes(std::vector<std::vector<uint64_t>> data) {
172 |     return parallel_densified_minhash(data, num_hash_tables, hashes_per_table,
173 |                                       hash_range_pow, seed);
174 |   }
175 | };
176 | 
177 | PYBIND11_MODULE(flinng, m) {
178 |   pybind11::class_<DenseFlinng32>(m, "dense_32_bit")
179 |       .def(pybind11::init<uint64_t, uint64_t, uint64_t, uint64_t, uint64_t>(),
180 |            pybind11::arg("num_rows"), pybind11::arg("cells_per_row"),
181 |            pybind11::arg("data_dimension"), pybind11::arg("num_hash_tables"),
182 |            pybind11::arg("hashes_per_table"))
183 |       .def("add_points", &DenseFlinng32::addPoints,
184 |            pybind11::arg("data_points"))
185 |       .def("prepare_for_queries", &DenseFlinng32::prepareForQueries)
186 |       .def("query", &DenseFlinng32::query, pybind11::arg("query_points"),
187 |            pybind11::arg("top_k"));
188 | 
189 |   pybind11::class_<SparseFlinng32>(m, "sparse_32_bit")
190 |       .def(pybind11::init<uint64_t, uint64_t, uint64_t, uint64_t, uint64_t>(),
191 |            pybind11::arg("num_rows"), pybind11::arg("cells_per_row"),
192 |            pybind11::arg("num_hash_tables"), pybind11::arg("hashes_per_table"),
193 |            pybind11::arg("hash_range_pow"))
194 |       .def("add_points", &SparseFlinng32::addPoints,
195 |            pybind11::arg("data_points"))
196 |       .def("add_points", &SparseFlinng32::addPointsSameDim,
197 |            pybind11::arg("data_points"))
198 |       .def("prepare_for_queries", &SparseFlinng32::prepareForQueries)
199 |       .def("query", &SparseFlinng32::query, pybind11::arg("query_points"),
200 |            pybind11::arg("top_k"))
201 |       .def("query", &SparseFlinng32::querySameDim,
202 |            pybind11::arg("query_points"), pybind11::arg("top_k"));
203 | }


--------------------------------------------------------------------------------
/src/Flinng.h:
--------------------------------------------------------------------------------
  1 | #ifndef _FLING
  2 | #define _FLING
  3 | 
  4 | #include "LshFunctions.h"
  5 | #include <cstdint>
  6 | #include <iostream>
  7 | #include <pybind11/numpy.h>
  8 | #include <pybind11/pybind11.h>
  9 | #include <stdexcept>
 10 | #include <vector>
 11 | 
 12 | // TODO: Add back 16 bit FLINNG, check input
 13 | // TODO: Reproduce experiments
 14 | // TODO: Add percent of srp used
 15 | class Flinng {
 16 | 
 17 | public:
 18 |   Flinng(uint64_t num_rows, uint64_t cells_per_row, uint64_t num_hashes,
 19 |          uint64_t hash_range)
 20 |       : num_rows(num_rows), cells_per_row(cells_per_row),
 21 |         num_hash_tables(num_hashes), hash_range(hash_range),
 22 |         inverted_flinng_index(hash_range * num_hashes),
 23 |         cell_membership(num_rows * cells_per_row) {}
 24 | 
 25 |   // All the hashes for point 1 come first, etc.
 26 |   // Size of hashes should be multiple of num_hash_tables
 27 |   void addPoints(std::vector<uint64_t> hashes) {
 28 | 
 29 |     uint64_t num_points = hashes.size() / num_hash_tables;
 30 |     std::vector<uint64_t> random_buckets(num_rows * num_points);
 31 |     for (uint64_t i = 0; i < num_rows * num_points; i++) {
 32 |       random_buckets[i] =
 33 |           (rand() % cells_per_row + cells_per_row) % cells_per_row +
 34 |           (i % num_rows) * cells_per_row;
 35 |     }
 36 | 
 37 | #pragma omp parallel for
 38 |     for (uint64_t table = 0; table < num_hash_tables; table++) {
 39 |       for (uint64_t point = 0; point < num_points; point++) {
 40 |         uint64_t hash = hashes[point * num_hash_tables + table];
 41 |         uint64_t hash_id = table * hash_range + hash;
 42 |         for (uint64_t row = 0; row < num_rows; row++) {
 43 |           inverted_flinng_index[hash_id].push_back(
 44 |               random_buckets[point * num_rows + row]);
 45 |         }
 46 |       }
 47 |     }
 48 | 
 49 |     for (uint64_t point = 0; point < num_points; point++) {
 50 |       for (uint64_t row = 0; row < num_rows; row++) {
 51 |         cell_membership[random_buckets[point * num_rows + row]].push_back(
 52 |             total_points_added + point);
 53 |       }
 54 |     }
 55 | 
 56 |     total_points_added += num_points;
 57 | 
 58 |     prepareForQueries();
 59 |   }
 60 | 
 61 |   void prepareForQueries() {
 62 |     for (uint64_t i = 0; i < inverted_flinng_index.size(); i++) {
 63 |       std::sort(inverted_flinng_index[i].begin(),
 64 |                 inverted_flinng_index[i].end());
 65 |       inverted_flinng_index[i].erase(
 66 |           std::unique(inverted_flinng_index[i].begin(),
 67 |                       inverted_flinng_index[i].end()),
 68 |           inverted_flinng_index[i].end());
 69 |     }
 70 |   }
 71 | 
 72 |   // Again all the hashes for point 1 come first, etc.
 73 |   // Size of hashes should be multiple of num_hash_tables
 74 |   // Results are similarly ordered
 75 |   std::vector<uint64_t> query(std::vector<uint64_t> hashes, uint32_t top_k) {
 76 | 
 77 |     uint64_t num_queries = hashes.size() / num_hash_tables;
 78 |     std::vector<uint64_t> results(top_k * num_queries);
 79 | 
 80 | #pragma omp parallel for
 81 |     for (uint32_t query_id = 0; query_id < num_queries; query_id++) {
 82 | 
 83 |       std::vector<uint32_t> counts(num_rows * cells_per_row, 0);
 84 |       for (uint32_t rep = 0; rep < num_hash_tables; rep++) {
 85 |         const uint32_t index =
 86 |             hash_range * rep + hashes[num_hash_tables * query_id + rep];
 87 |         const uint32_t size = inverted_flinng_index[index].size();
 88 |         for (uint32_t small_index = 0; small_index < size; small_index++) {
 89 |           // This single line takes 80% of the time, around half for the move
 90 |           // and half for the add
 91 |           ++counts[inverted_flinng_index[index][small_index]];
 92 |         }
 93 |       }
 94 | 
 95 |       std::vector<uint32_t> sorted[num_hash_tables + 1];
 96 |       uint32_t size_guess = num_rows * cells_per_row / (num_hash_tables + 1);
 97 |       for (std::vector<uint32_t> &v : sorted) {
 98 |         v.reserve(size_guess);
 99 |       }
100 | 
101 |       for (uint32_t i = 0; i < num_rows * cells_per_row; ++i) {
102 |         sorted[counts[i]].push_back(i);
103 |       }
104 | 
105 |       if (num_rows > 2) {
106 |         std::vector<uint8_t> num_counts(total_points_added, 0);
107 |         uint32_t num_found = 0;
108 |         for (int32_t rep = num_hash_tables; rep >= 0; --rep) {
109 |           for (uint32_t bin : sorted[rep]) {
110 |             for (uint32_t point : cell_membership[bin]) {
111 |               if (++num_counts[point] == num_rows) {
112 |                 results[top_k * query_id + num_found] = point;
113 |                 if (++num_found == top_k) {
114 |                   goto end_of_query;
115 |                 }
116 |               }
117 |             }
118 |           }
119 |         }
120 |       } else {
121 |         char *num_counts =
122 |             (char *)calloc(total_points_added / 8 + 1, sizeof(char));
123 |         uint32_t num_found = 0;
124 |         for (int32_t rep = num_hash_tables; rep >= 0; --rep) {
125 |           for (uint32_t bin : sorted[rep]) {
126 |             for (uint32_t point : cell_membership[bin]) {
127 |               if (num_counts[(point / 8)] & (1 << (point % 8))) {
128 |                 results[top_k * query_id + num_found] = point;
129 |                 if (++num_found == top_k) {
130 |                   free(num_counts);
131 |                   goto end_of_query;
132 |                 }
133 |               } else {
134 |                 num_counts[(point / 8)] |= (1 << (point % 8));
135 |               }
136 |             }
137 |           }
138 |         }
139 |       }
140 |     end_of_query:;
141 |     }
142 | 
143 |     return results;
144 |   }
145 | 
146 | private:
147 |   const uint64_t num_rows, cells_per_row, num_hash_tables, hash_range;
148 |   uint64_t total_points_added = 0;
149 |   std::vector<std::vector<uint32_t>> inverted_flinng_index;
150 |   std::vector<std::vector<uint64_t>> cell_membership;
151 | };
152 | 
153 | #endif


--------------------------------------------------------------------------------
/src/LshFunctions.h:
--------------------------------------------------------------------------------
  1 | #ifndef _LSH_FUNCTIONS
  2 | #define _LSH_FUNCTIONS
  3 | 
  4 | #include <climits>
  5 | #include <cmath>
  6 | #include <cstdint>
  7 | #include <iostream>
  8 | #include <vector>
  9 | 
 10 | uint64_t combine(uint64_t item1, uint64_t item2) {
 11 |   return item1 * 0xC4DD05BF + item2 * 0x6C8702C9;
 12 | }
 13 | 
 14 | void single_densified_minhash(uint64_t *result, uint64_t *point,
 15 |                               uint64_t point_len, uint64_t num_tables,
 16 |                               uint64_t hashes_per_table, uint8_t hash_range_pow,
 17 |                               uint32_t random_seed) {
 18 | 
 19 |   uint64_t num_hashes_to_generate = num_tables * hashes_per_table;
 20 |   std::vector<uint64_t> prelim_result(num_hashes_to_generate);
 21 |   uint64_t binsize = std::ceil(UINT64_MAX / prelim_result.size());
 22 | 
 23 |   for (uint64_t i = 0; i < num_hashes_to_generate; i++) {
 24 |     prelim_result[i] = UINT64_MAX;
 25 |   }
 26 | 
 27 |   for (uint64_t i = 0; i < point_len; i++) {
 28 |     uint64_t val = point[i];
 29 |     val *= random_seed;
 30 |     val ^= val >> 13;
 31 |     val *= 0x192AF017AAFFF017;
 32 |     val *= val;
 33 |     uint64_t hash = val;
 34 |     uint64_t binid =
 35 |         std::min((uint64_t)floor(val / binsize), num_hashes_to_generate - 1);
 36 |     if (prelim_result[binid] > hash) {
 37 |       prelim_result[binid] = hash;
 38 |     }
 39 |   }
 40 | 
 41 |   // Densify
 42 |   for (size_t i = 0; i < num_hashes_to_generate; i++) {
 43 |     uint64_t next = prelim_result[i];
 44 |     if (next != UINT64_MAX) {
 45 |       continue;
 46 |     }
 47 |     uint64_t count = 0;
 48 |     while (next == UINT64_MAX) {
 49 |       count++;
 50 |       uint64_t index = combine(i, count) % num_hashes_to_generate;
 51 |       next = prelim_result[index]; // Kills GPU.
 52 |       if (count > 100) {           // Densification failure.
 53 |         next = 0;
 54 |         break;
 55 |       }
 56 |     }
 57 |     prelim_result[i] = next;
 58 |   }
 59 | 
 60 |   // Combine each K
 61 |   for (uint64_t table = 0; table < num_tables; table++) {
 62 |     result[table] = prelim_result[hashes_per_table * table];
 63 |     for (uint64_t hash = 1; hash < hashes_per_table; hash++) {
 64 |       result[table] =
 65 |           combine(prelim_result[hashes_per_table * table], result[table]);
 66 |     }
 67 |     result[table] >>= (64 - hash_range_pow);
 68 |   }
 69 | }
 70 | 
 71 | std::vector<uint64_t>
 72 | parallel_densified_minhash(uint64_t *points, uint64_t num_points,
 73 |                            uint64_t point_dimension, uint64_t num_tables,
 74 |                            uint64_t hashes_per_table, uint8_t hash_range_pow,
 75 |                            uint32_t random_seed) {
 76 | 
 77 |   std::vector<uint64_t> result(num_tables * num_points);
 78 | 
 79 | #pragma omp parallel for
 80 |   for (uint64_t point_id = 0; point_id < num_points; point_id += 1) {
 81 |     single_densified_minhash((&result[0]) + point_id * num_tables,
 82 |                              points + point_id * point_dimension,
 83 |                              point_dimension, num_tables, hashes_per_table,
 84 |                              hash_range_pow, random_seed);
 85 |   }
 86 | 
 87 |   return result;
 88 | }
 89 | 
 90 | std::vector<uint64_t>
 91 | parallel_densified_minhash(std::vector<std::vector<uint64_t>> points,
 92 |                            uint64_t num_tables, uint64_t hashes_per_table,
 93 |                            uint8_t hash_range_pow, uint32_t random_seed) {
 94 | 
 95 |   std::vector<uint64_t> result(num_tables * points.size());
 96 | 
 97 | #pragma omp parallel for
 98 |   for (uint64_t point_id = 0; point_id < points.size(); point_id += 1) {
 99 |     single_densified_minhash((&result[0]) + point_id * num_tables,
100 |                              (&points[point_id][0]), points[point_id].size(),
101 |                              num_tables, hashes_per_table, hash_range_pow,
102 |                              random_seed);
103 |   }
104 | 
105 |   return result;
106 | }
107 | 
108 | std::vector<uint64_t> parallel_srp(float *dense_data, uint64_t num_points,
109 |                                    uint64_t data_dimension, int8_t *random_bits,
110 |                                    uint64_t num_tables,
111 |                                    uint64_t hashes_per_table) {
112 |   std::vector<uint64_t> result(num_tables * num_points);
113 | 
114 | #pragma omp parallel for
115 |   for (uint64_t data_id = 0; data_id < num_points; data_id++) {
116 |     for (uint64_t rep = 0; rep < num_tables; rep++) {
117 |       uint64_t hash = 0;
118 |       for (uint64_t bit = 0; bit < hashes_per_table; bit++) {
119 |         double sum = 0;
120 |         for (uint64_t j = 0; j < data_dimension; j++) {
121 |           double val = dense_data[data_dimension * data_id + j];
122 |           if (random_bits[rep * hashes_per_table * data_dimension +
123 |                           bit * data_dimension + j] > 0) {
124 |             sum += val;
125 |           } else {
126 |             sum -= val;
127 |           }
128 |         }
129 |         hash += (sum > 0) << bit;
130 |       }
131 |       result[data_id * num_tables + rep] = hash;
132 |     }
133 |   }
134 | 
135 |   return result;
136 | }
137 | 
138 | #endif


--------------------------------------------------------------------------------
/test/promethion-gt.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JoshEngels/FLINNG/d216121c8dd4db53631aac4d81feceb3859b50aa/test/promethion-gt.npy


--------------------------------------------------------------------------------
/test/promethion.py:
--------------------------------------------------------------------------------
 1 | # Real world example on a dataset FLINNG is good at.
 2 | # To use this script, download the promethion dataset form
 3 | # https://drive.google.com/file/d/1EIN8uUuy98oIqYfHadtc2KzOzRH_E1Cs/view?usp=sharing
 4 | # Then copy it to this folder and run this script. When running it again, you
 5 | # can changed saved_as_npy to true to use the created .npy file. 
 6 | 
 7 | import numpy as np
 8 | import flinng
 9 | import time
10 | 
11 | saved_as_npy = False
12 | if not saved_as_npy:
13 |   promethion = np.loadtxt("promethion-data")
14 |   np.save("promethion-data", promethion)
15 | else:
16 |   promethion = np.load("promethion-data.npy")
17 | gts = np.load("promethion-gt.npy")
18 | 
19 | index = flinng.sparse_32_bit(num_rows=2, cells_per_row=50000, num_hash_tables=100, hashes_per_table=2, hash_range_pow=18)
20 | index.add_points(promethion[10001:]) # This 100001 isn't a bug, for some reason the generated gt skips point 10000
21 | index.prepare_for_queries()
22 | start = time.time()
23 | results = index.query(promethion[:10000], 100)
24 | r10at100 = sum([sum([gt[i] in res for i in range(10)]) for gt, res in zip(gts, results)]) / 10 / 10000
25 | print(f"R10@100 = {r10at100}")


--------------------------------------------------------------------------------
/test/test-dense.py:
--------------------------------------------------------------------------------
 1 | # Synthetic sanity check for FLINNG that ensures near duplicate detection works well
 2 | 
 3 | import numpy as np
 4 | import random 
 5 | import flinng
 6 | 
 7 | data_dim = 100
 8 | dataset_size = 1000000
 9 | queries_size = 10000
10 | dataset_std = 1
11 | queries_std = 0.1
12 | 
13 | flinng_num_rows = 3
14 | flinngs_cells_per_row = dataset_size // 100
15 | flinng_hashes_per_table = 16
16 | flinng_num_hash_tables = 20
17 | 
18 | # Generate n points using gaussian
19 | np.random.seed(42)
20 | random.seed(42)
21 | dataset = np.random.normal(size=(dataset_size, data_dim), scale=dataset_std)
22 | 
23 | # Generate queries from random points
24 | queries = []
25 | gts = []
26 | for i in range(queries_size):
27 |   gt = random.randrange(dataset_size)
28 |   query = dataset[gt] + np.random.normal(size=(data_dim), scale=queries_std)
29 |   queries.append(query)
30 |   gts.append(gt)
31 | queries = np.array(queries)
32 |  
33 | index = flinng.dense_32_bit(num_rows=flinng_num_rows, 
34 |                             cells_per_row=flinngs_cells_per_row, 
35 |                             data_dimension=data_dim, 
36 |                             num_hash_tables=flinng_num_hash_tables, 
37 |                             hashes_per_table=flinng_hashes_per_table)
38 | index.add_points(dataset)
39 | index.prepare_for_queries()
40 | results = index.query(queries, 1)
41 | recall = sum([gt == result[0] for gt, result in zip(gts, results)]) / queries_size
42 | print(f"R1@1 = {recall}")


--------------------------------------------------------------------------------