├── .envrc
├── .cargo
│   └── config.toml
├── src
│   ├── codecs.rs
│   ├── utils.rs
│   ├── codecs
│   │   ├── native_u32.rs
│   │   └── zero_copy.rs
│   ├── decreasing_window_iter.rs
│   ├── allocator.rs
│   ├── error.rs
│   ├── lib.rs
│   ├── roaringish
│   │   ├── intersect
│   │   │   ├── gallop_first.rs
│   │   │   ├── gallop_second.rs
│   │   │   ├── naive.rs
│   │   │   └── simd.rs
│   │   └── intersect.rs
│   ├── searcher.rs
│   ├── stats.rs
│   ├── indexer.rs
│   ├── roaringish.rs
│   └── db.rs
├── .gitignore
├── LICENSE
├── Cargo.toml
├── flake.lock
├── flake.nix
├── README.md
└── Cargo.lock

/.envrc:
--------------------------------------------------------------------------------
use flake

--------------------------------------------------------------------------------
/.cargo/config.toml:
--------------------------------------------------------------------------------
[build]
rustdocflags = ["-C", "target-cpu=native"]

--------------------------------------------------------------------------------
/src/codecs.rs:
--------------------------------------------------------------------------------
mod native_u32;
mod zero_copy;
pub use native_u32::*;
pub use zero_copy::*;

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/target
/data*
/db*
**.mdb
**.tsv
.direnv
/benchmark
*.txt
/msmarco*
*.jsonl
roaringish_packed
**.asm
**.svg
**.data
/temp

--------------------------------------------------------------------------------
/src/utils.rs:
--------------------------------------------------------------------------------
use unicode_segmentation::UnicodeSegmentation;

/// Normalizes the input string by trimming leading and trailing
/// whitespace and converting it to lowercase.
pub fn normalize(s: &str) -> String {
    s.trim_start().trim_end().to_lowercase()
}

/// Tokenizes the input string by splitting it at word bounds,
/// also removing all tokens that Unicode considers whitespace.
pub fn tokenize(s: &str) -> impl Iterator<Item = &str> {
    s.split_word_bounds().filter(|t| {
        if !t.is_empty() {
            // This is safe because we know that `t` is not empty.
            return !t.chars().next().unwrap().is_whitespace();
        }
        false
    })
}

--------------------------------------------------------------------------------
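A quick illustration of what these two helpers produce (a hypothetical snippet, not part of the crate; both functions are crate-private):

```rust
// `normalize` trims and lowercases; `tokenize` splits at Unicode word
// boundaries and drops whitespace-only tokens, keeping punctuation.
let normalized = normalize("  Look at MY cat!  ");
assert_eq!(normalized, "look at my cat!");

let tokens: Vec<&str> = tokenize(&normalized).collect();
assert_eq!(tokens, ["look", "at", "my", "cat", "!"]);
```
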
/src/codecs/native_u32.rs:
--------------------------------------------------------------------------------
use std::borrow::Cow;

use heed::BoxedError;

pub struct NativeU32;

impl<'a> heed::BytesDecode<'a> for NativeU32 {
    type DItem = u32;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        unsafe { Ok(u32::from_ne_bytes(bytes.try_into().unwrap_unchecked())) }
    }
}

impl<'a> heed::BytesEncode<'a> for NativeU32 {
    type EItem = u32;

    fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let p = item as *const u32 as *const u8;
        let bytes = unsafe { std::slice::from_raw_parts(p, std::mem::size_of::<u32>()) };
        Ok(Cow::Borrowed(bytes))
    }
}

--------------------------------------------------------------------------------
/src/decreasing_window_iter.rs:
--------------------------------------------------------------------------------
use std::{iter::FusedIterator, num::NonZero};

pub struct DecreasingWindows<'a, T: 'a> {
    v: &'a [T],
    size: NonZero<usize>,
}
impl<'a, T: 'a> DecreasingWindows<'a, T> {
    #[inline]
    pub fn new(slice: &'a [T], size: NonZero<usize>) -> Self {
        Self { v: slice, size }
    }
}
impl<'a, T> Iterator for DecreasingWindows<'a, T> {
    type Item = &'a [T];

    #[inline]
    fn next(&mut self) -> Option<&'a [T]> {
        if self.size.get() > self.v.len() {
            self.size = NonZero::new(self.v.len())?;
        }

        let ret = Some(&self.v[..self.size.get()]);
        self.v = &self.v[1..];
        ret
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let size = self.v.len();
        (size, Some(size))
    }

    #[inline]
    fn count(self) -> usize {
        self.len()
    }
}
impl<T> ExactSizeIterator for DecreasingWindows<'_, T> {}
impl<T> FusedIterator for DecreasingWindows<'_, T> {}

--------------------------------------------------------------------------------
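Unlike `slice::windows`, this iterator keeps yielding progressively shorter windows once it reaches the tail of the slice, which is what lets the indexer merge shorter token runs near the end of a document. A small illustration (hypothetical usage; the type is crate-private):

```rust
use std::num::NonZero;

let data = [1, 2, 3, 4, 5];
let it = DecreasingWindows::new(&data, NonZero::new(3).unwrap());
let windows: Vec<&[i32]> = it.collect();
// Full-size windows first, then the tail shrinks down to a single element.
assert_eq!(
    windows,
    [&[1, 2, 3][..], &[2, 3, 4], &[3, 4, 5], &[4, 5], &[5]]
);
```
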
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2025 Gabriel Jorge Menezes

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/src/codecs/zero_copy.rs:
--------------------------------------------------------------------------------
use std::{borrow::Cow, marker::PhantomData};

use rkyv::{
    Archive, Archived, Serialize, api::high::HighSerializer, ser::allocator::ArenaHandle,
    util::AlignedVec,
};

pub struct ZeroCopyCodec<T>(PhantomData<T>)
where
    T: for<'a> Serialize<HighSerializer<AlignedVec, ArenaHandle<'a>, rkyv::rancor::Error>>
        + Archive;

impl<'a, T> heed::BytesEncode<'a> for ZeroCopyCodec<T>
where
    T: for<'b> Serialize<HighSerializer<AlignedVec, ArenaHandle<'b>, rkyv::rancor::Error>>
        + Archive
        + 'a,
{
    type EItem = T;

    fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, heed::BoxedError> {
        let bytes = rkyv::to_bytes(item).map(|bytes| Cow::Owned(bytes.to_vec()));

        Ok(bytes?)
    }
}

impl<'a, T> heed::BytesDecode<'a> for ZeroCopyCodec<T>
where
    T: for<'b> Serialize<HighSerializer<AlignedVec, ArenaHandle<'b>, rkyv::rancor::Error>>
        + Archive
        + 'a,
{
    type DItem = &'a T::Archived;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, heed::BoxedError> {
        unsafe { Ok(rkyv::access_unchecked::<T::Archived>(bytes)) }
    }
}

--------------------------------------------------------------------------------
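A sketch of the codec in use outside of a database, with a made-up `Doc` type. The `to_native()` call assumes rkyv 0.8's endian-wrapped integer types; the crate's `unaligned` feature (see Cargo.toml below) is what makes the unchecked access to a plain `Vec<u8>` buffer tolerable here:

```rust
use heed::{BytesDecode, BytesEncode};
use rkyv::{Archive, Serialize};

// Hypothetical document type; anything deriving rkyv's traits works.
#[derive(Archive, Serialize)]
struct Doc {
    id: u32,
    title: String,
}

fn roundtrip() -> Result<(), heed::BoxedError> {
    let doc = Doc { id: 7, title: "cats".into() };

    // Serialize with rkyv...
    let bytes = ZeroCopyCodec::<Doc>::bytes_encode(&doc)?;

    // ...and read the archived form straight out of the buffer,
    // without deserializing.
    let archived = ZeroCopyCodec::<Doc>::bytes_decode(&bytes)?;
    assert_eq!(archived.id.to_native(), 7);
    assert_eq!(archived.title.as_str(), "cats");
    Ok(())
}
```
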
"oxalica", 35 | "repo": "rust-overlay", 36 | "rev": "aefb7017d710f150970299685e8d8b549d653649", 37 | "type": "github" 38 | }, 39 | "original": { 40 | "owner": "oxalica", 41 | "repo": "rust-overlay", 42 | "type": "github" 43 | } 44 | } 45 | }, 46 | "root": "root", 47 | "version": 7 48 | } 49 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | inputs = { 3 | nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; 4 | rust-overlay = { 5 | url = "github:oxalica/rust-overlay"; 6 | inputs.nixpkgs.follows = "nixpkgs"; 7 | }; 8 | }; 9 | 10 | outputs = { self, nixpkgs, rust-overlay, ... }: 11 | let 12 | system = "x86_64-linux"; 13 | pkgs = import nixpkgs { 14 | inherit system; 15 | overlays = [ rust-overlay.overlays.default ]; 16 | }; 17 | rustbin = pkgs.rust-bin.selectLatestNightlyWith (toolchain: toolchain.default.override { 18 | extensions = [ "rust-src" "rust-analyzer" "miri" ]; 19 | }); 20 | 21 | clangVersion = "19"; 22 | in 23 | { 24 | devShells.${system}.default = pkgs.mkShell { 25 | packages = [ 26 | rustbin 27 | pkgs.cargo-show-asm 28 | pkgs.cargo-expand 29 | pkgs.cargo-flamegraph 30 | pkgs.cargo-valgrind 31 | pkgs.cargo-fuzz 32 | pkgs.cargo-pgo 33 | 34 | pkgs.openssl 35 | pkgs.pkg-config 36 | 37 | pkgs."clang_${clangVersion}" 38 | pkgs."llvmPackages_${clangVersion}".bintools 39 | pkgs."bolt_${clangVersion}" 40 | pkgs.cmake 41 | ]; 42 | 43 | LIBCLANG_PATH = pkgs.lib.makeLibraryPath [ pkgs."llvmPackages_${clangVersion}".libclang.lib ]; 44 | }; 45 | }; 46 | } 47 | -------------------------------------------------------------------------------- /src/allocator.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | alloc::{Allocator, alloc, dealloc}, 3 | ptr::NonNull, 4 | }; 5 | 6 | use rkyv::{Archive, Serialize}; 7 | 8 | #[derive(Default, Archive, Serialize)] 9 | pub struct AlignedAllocator; 10 | unsafe impl Allocator for AlignedAllocator { 11 | fn allocate( 12 | &self, 13 | layout: std::alloc::Layout, 14 | ) -> Result, std::alloc::AllocError> { 15 | const { assert!(N.is_power_of_two()) }; 16 | const { assert!(N != 0) }; 17 | unsafe { 18 | // This probably will never fail, if it does 19 | // what can I do ? Let it crash, something 20 | // went wrong. 21 | let p = alloc(layout.align_to(N).unwrap()); 22 | let s = std::ptr::slice_from_raw_parts_mut(p, layout.size()); 23 | #[cfg(debug_assertions)] 24 | return NonNull::new(s).ok_or(std::alloc::AllocError); 25 | 26 | #[cfg(not(debug_assertions))] 27 | return Ok(NonNull::new_unchecked(s)); 28 | } 29 | } 30 | 31 | unsafe fn deallocate(&self, ptr: std::ptr::NonNull, layout: std::alloc::Layout) { 32 | unsafe { 33 | // This probably will never fail, if it does 34 | // what can I do ? Let it crash, something 35 | // went wrong. 36 | dealloc(ptr.as_ptr(), layout.align_to(N).unwrap()); 37 | } 38 | } 39 | } 40 | 41 | pub type Aligned64 = AlignedAllocator<64>; 42 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | /// Possible errors that can occur while interacting with the database. 
/src/error.rs:
--------------------------------------------------------------------------------
use thiserror::Error;

/// Possible errors that can occur while interacting with the database.
#[derive(Error, Debug)]
pub enum DbError {
    #[error("Io error: {0}")]
    IoError(#[from] std::io::Error),

    #[error("Lmdb error: {0}")]
    LmdbError(#[from] heed::Error),

    #[error("Serialize error: {0}")]
    EncodingError(#[from] rkyv::rancor::Error),

    #[error("Database error: {0}")]
    DatabaseError(String),

    #[error("Key `{0}` not found in database `{1}`")]
    KeyNotFound(String, String),
}

/// Possible errors that can occur while searching.
#[derive(Error, Debug)]
pub enum SearchError {
    #[error("Db error: {0}")]
    DbError(#[from] DbError),

    #[error("Searched query is empty")]
    EmptyQuery,

    #[error("No combination found while trying to merge and minimize")]
    MergeAndMinimizeNotPossible,

    #[error("Token `{0}` not found in the database")]
    TokenNotFound(String),

    #[error("Empty Intersection")]
    EmptyIntersection,

    #[error("Catastrophic error has occurred")]
    InternalError,
}

/// Possible errors when trying to retrieve documents by their internal ID.
#[derive(Error, Debug)]
pub enum GetDocumentError {
    #[error("Db error: {0}")]
    DbError(#[from] DbError),

    #[error("Document with id `{0}` not found")]
    DocumentNotFound(u32),
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Simd Phrase Search

Extremely fast phrase search implementation.

## Overview

This implementation follows some of the ideas proposed in this
[blog post](https://softwaredoug.com/blog/2024/01/21/search-array-phrase-algorithm)
by [Doug Turnbull](https://softwaredoug.com/). The full explanation of how the internals
work can be found [here](https://gab-menezes.github.io/2025/01/13/using-the-most-unhinged-avx-512-instruction-to-make-the-fastest-phrase-search-algo.html).

This crate uses the [log] crate for logging during indexing.

It's highly recommended to compile this crate with `-C llvm-args=-align-all-functions=6`.

## Usage

```rust
use simdphrase::{CommonTokens, Indexer, SimdIntersect};

// Creates a new indexer that can be reused. It will index 300_000 documents
// in each batch and will use the top 50 most common tokens to speed up the
// search, by merging them.
let indexer = Indexer::new(Some(300_000), Some(CommonTokens::FixedNum(50)));

let docs = vec![
    ("look at my beautiful cat", 0),
    ("this is a document", 50),
    ("look at my dog", 25),
    ("look at my beautiful hamster", 35),
];
let index_name = "./index";
let db_size = 1024 * 1024;

// Indexes the documents in `docs`.
// The index will be created at `index_name` with the given `db_size`.
let (searcher, num_indexed_documents) = indexer.index(docs, index_name, db_size)?;

// Search for the phrase "at my beautiful".
let result = searcher.search::<SimdIntersect>("at my beautiful");

// This should return `[0, 35]`.
let documents = result.get_documents()?;
```

--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
#![feature(hash_raw_entry)]
#![feature(array_windows)]
#![feature(iter_intersperse)]
#![feature(debug_closure_helpers)]
#![feature(vec_push_within_capacity)]
#![feature(trivial_bounds)]
#![feature(portable_simd)]
#![feature(stdarch_x86_avx512)]
#![feature(avx512_target_feature)]
#![feature(allocator_api)]
#![feature(pointer_is_aligned_to)]

//! Extremely fast phrase search implementation.
//!
//! ## Overview
//!
//! This implementation follows some of the ideas proposed in this
//! [blog post](https://softwaredoug.com/blog/2024/01/21/search-array-phrase-algorithm)
//! by [Doug Turnbull](https://softwaredoug.com/). The full explanation of how the internals
//! work can be found [here](https://gab-menezes.github.io/2025/01/13/using-the-most-unhinged-avx-512-instruction-to-make-the-fastest-phrase-search-algo.html).
//!
//! This crate uses the [log] crate for logging during indexing.
//!
//! It's highly recommended to compile this crate with `-C llvm-args=-align-all-functions=6`.
//!
//! ## Usage
//!
//! ```rust
//! use simdphrase::{CommonTokens, Indexer, SimdIntersect};
//!
//! // Creates a new indexer that can be reused. It will index 300_000 documents
//! // in each batch and will use the top 50 most common tokens to speed up the
//! // search, by merging them.
//! let indexer = Indexer::new(Some(300_000), Some(CommonTokens::FixedNum(50)));
//!
//! let docs = vec![
//!     ("look at my beautiful cat", 0),
//!     ("this is a document", 50),
//!     ("look at my dog", 25),
//!     ("look at my beautiful hamster", 35),
//! ];
//! let index_name = "./index";
//! let db_size = 1024 * 1024;
//!
//! // Indexes the documents in `docs`.
//! // The index will be created at `index_name` with the given `db_size`.
//! let (searcher, num_indexed_documents) = indexer.index(docs, index_name, db_size)?;
//!
//! // Search for the phrase "at my beautiful".
//! let result = searcher.search::<SimdIntersect>("at my beautiful");
//!
//! // This should return `[0, 35]`.
//! let documents = result.get_documents()?;
//! ```

mod allocator;
mod codecs;
mod db;
mod decreasing_window_iter;
mod error;
mod indexer;
mod roaringish;
mod searcher;
mod stats;
mod utils;

use allocator::Aligned64;
use db::DB;
use roaringish::BorrowRoaringishPacked;
use roaringish::RoaringishPacked;
use utils::{normalize, tokenize};

pub use db::Document;
pub use error::{DbError, GetDocumentError, SearchError};
pub use indexer::CommonTokens;
pub use indexer::Indexer;
pub use stats::Stats;

pub use roaringish::intersect::naive::NaiveIntersect;

pub use roaringish::intersect::Intersection;
#[cfg(target_feature = "avx512f")]
pub use roaringish::intersect::simd::SimdIntersect;
pub use searcher::{SearchResult, Searcher};

--------------------------------------------------------------------------------
/src/roaringish/intersect/gallop_first.rs:
--------------------------------------------------------------------------------
use std::{mem::MaybeUninit, sync::atomic::Ordering::Relaxed};

use crate::{
    Aligned64, BorrowRoaringishPacked, Stats,
    roaringish::{ADD_ONE_GROUP, Aligned, clear_values, unpack_values},
};

use super::{Intersect, Intersection, private::IntersectSeal};

pub struct GallopIntersectFirst;
impl IntersectSeal for GallopIntersectFirst {}
impl Intersection for GallopIntersectFirst {}

impl Intersect for GallopIntersectFirst {
    fn inner_intersect<const FIRST: bool>(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,

        lhs_i: &mut usize,
        rhs_i: &mut usize,

        packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        i: &mut usize,

        _msb_packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        _j: &mut usize,

        add_to_group: u64,
        lhs_len: u16,
        _msb_mask: u16,
        lsb_mask: u16,

        stats: &Stats,
    ) {
        let b = std::time::Instant::now();

        while *lhs_i < lhs.len() && *rhs_i < rhs.len() {
            let mut lhs_delta = 1;
            let mut rhs_delta = 1;

            while *lhs_i < lhs.len()
                && clear_values(lhs[*lhs_i]) + add_to_group + if FIRST { 0 } else { ADD_ONE_GROUP }
                    < clear_values(rhs[*rhs_i])
            {
                *lhs_i += lhs_delta;
                lhs_delta *= 2;
            }
            *lhs_i -= lhs_delta / 2;

            while *rhs_i < rhs.len()
                && clear_values(rhs[*rhs_i])
                    < unsafe { clear_values(*lhs.get_unchecked(*lhs_i)) }
                        + add_to_group
                        + if FIRST { 0 } else { ADD_ONE_GROUP }
            {
                *rhs_i += rhs_delta;
                rhs_delta *= 2;
            }
            *rhs_i -= rhs_delta / 2;

            let lhs_packed = unsafe { *lhs.get_unchecked(*lhs_i) }
                + add_to_group
                + if FIRST { 0 } else { ADD_ONE_GROUP };
            let rhs_packed = unsafe { *rhs.get_unchecked(*rhs_i) };

            let lhs_doc_id_group = clear_values(lhs_packed);
            let rhs_doc_id_group = clear_values(rhs_packed);

            let lhs_values = unpack_values(lhs_packed);
            let rhs_values = unpack_values(rhs_packed);

            match lhs_doc_id_group.cmp(&rhs_doc_id_group) {
                std::cmp::Ordering::Less => *lhs_i += 1,
                std::cmp::Ordering::Greater => *rhs_i += 1,
                std::cmp::Ordering::Equal => {
                    let intersection = if FIRST {
                        (lhs_values << lhs_len) & rhs_values
                    } else {
                        lhs_values.rotate_left(lhs_len as u32) & lsb_mask & rhs_values
                    };
                    unsafe {
                        packed_result
                            .get_unchecked_mut(*i)
                            .write(lhs_doc_id_group | intersection as u64);
                    }
                    *i += (intersection > 0) as usize;

                    *lhs_i += 1;
                    *rhs_i += 1;
                }
            }
        }

        stats
            .first_intersect_gallop
            .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
    }

    fn intersection_buffer_size(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
    ) -> usize {
        lhs.0.len().min(rhs.0.len())
    }
}

--------------------------------------------------------------------------------
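The exponential probe-then-back-off pattern above, reduced to a standalone sketch (illustrative only; the real kernels gallop on the packed values with the position bits cleared):

```rust
/// Advance `i` past every element smaller than `target`, probing at
/// exponentially growing strides (1, 2, 4, ...), then back off half of the
/// final stride -- the same `delta`/`delta * 2` dance as the loops above.
fn gallop(haystack: &[u64], target: u64, mut i: usize) -> usize {
    let mut delta = 1;
    while i < haystack.len() && haystack[i] < target {
        i += delta;
        delta *= 2;
    }
    // `i - delta / 2` is the last probe known to be below `target`
    // (or the starting index), so the caller can safely resume from there.
    i - delta / 2
}
```
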
/src/roaringish/intersect.rs:
--------------------------------------------------------------------------------
use std::mem::MaybeUninit;

use crate::{Stats, allocator::Aligned64};

use super::{ADD_ONE_GROUP, Aligned, BorrowRoaringishPacked};

pub mod gallop_first;
pub mod gallop_second;
pub mod naive;
pub mod simd;

mod private {
    pub trait IntersectSeal {}
}

/// Allows a type to be used as an intersection algorithm when searching.
pub trait Intersection: Intersect {}

/// Necessary functions for an intersection algorithm.
///
/// The intersection is done in two phases, which is why
/// the functions have a `FIRST` const generic.
pub trait Intersect: private::IntersectSeal {
    /// Responsible for allocating the result buffers
    /// and computing the necessary values before starting
    /// the intersection.
    #[inline(never)]
    fn intersect<const FIRST: bool>(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
        lhs_len: u32,

        stats: &Stats,
    ) -> (Vec<u64, Aligned64>, Vec<u64, Aligned64>) {
        let mut lhs_i = 0;
        let mut rhs_i = 0;

        let buffer_size = Self::intersection_buffer_size(lhs, rhs);

        let mut i = 0;
        let mut packed_result: Box<[MaybeUninit<u64>], Aligned64> =
            Box::new_uninit_slice_in(buffer_size, Aligned64::default());

        let mut j = 0;
        let mut msb_packed_result: Box<[MaybeUninit<u64>], Aligned64> = if FIRST {
            Box::new_uninit_slice_in(lhs.0.len() + 1, Aligned64::default())
        } else {
            Box::new_uninit_slice_in(0, Aligned64::default())
        };

        let add_to_group = (lhs_len / 16) as u64 * ADD_ONE_GROUP;
        let lhs_len = (lhs_len % 16) as u16;

        let msb_mask = !(u16::MAX >> lhs_len);
        let lsb_mask = !(u16::MAX << lhs_len);

        Self::inner_intersect::<FIRST>(
            lhs,
            rhs,
            &mut lhs_i,
            &mut rhs_i,
            &mut packed_result,
            &mut i,
            &mut msb_packed_result,
            &mut j,
            add_to_group,
            lhs_len,
            msb_mask,
            lsb_mask,
            stats,
        );

        let (packed_result_ptr, a0) = Box::into_raw_with_allocator(packed_result);
        let (msb_packed_result_ptr, a1) = Box::into_raw_with_allocator(msb_packed_result);
        unsafe {
            (
                Vec::from_raw_parts_in(packed_result_ptr as *mut _, i, buffer_size, a0),
                if FIRST {
                    Vec::from_raw_parts_in(msb_packed_result_ptr as *mut _, j, lhs.0.len() + 1, a1)
                } else {
                    Vec::from_raw_parts_in(msb_packed_result_ptr as *mut _, 0, 0, a1)
                },
            )
        }
    }

    /// Performs the intersection.
    ///
    /// `msb_packed_result` has 0 capacity if `FIRST` is false.
    #[allow(clippy::too_many_arguments)]
    fn inner_intersect<const FIRST: bool>(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,

        lhs_i: &mut usize,
        rhs_i: &mut usize,

        packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        i: &mut usize,

        msb_packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        j: &mut usize,

        add_to_group: u64,
        lhs_len: u16,
        msb_mask: u16,
        lsb_mask: u16,

        stats: &Stats,
    );

    /// Size of the buffer needed to store the intersection.
    fn intersection_buffer_size(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
    ) -> usize;
}

--------------------------------------------------------------------------------
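Why there are two phases at all: each packed `u64` keys a (document, group) pair and carries a 16-bit position bitmask in its low bits. Phase one matches positions inside the same group; lhs positions that would shift past bit 15 are re-queued one group ahead (`ADD_ONE_GROUP`, `msb_packed_result`) and caught by phase two's rotate-and-mask. A worked example on bare `u16` masks, mirroring the expressions in the kernels (my reading of the code above, not an exact excerpt):

```rust
fn demo() {
    let lhs_len: u16 = 1; // the lhs phrase prefix is 1 position long
    let msb_mask = !(u16::MAX >> lhs_len); // 0x8000
    let lsb_mask = !(u16::MAX << lhs_len); // 0x0001

    // lhs occurs at position 3 of a group, rhs at position 4 of the same
    // group: phase one's plain shift finds it.
    let (lhs_values, rhs_values): (u16, u16) = (1 << 3, 1 << 4);
    assert_ne!((lhs_values << lhs_len) & rhs_values, 0);

    // lhs occurs at position 15 (the MSB), rhs at position 0 of the *next*
    // group: the shift would overflow, so the entry is queued one group
    // ahead and phase two's rotate + lsb_mask catches it.
    let (lhs_values, rhs_values): (u16, u16) = (1 << 15, 1 << 0);
    assert_ne!(lhs_values & msb_mask, 0); // goes into msb_packed_result
    assert_ne!(
        lhs_values.rotate_left(lhs_len as u32) & lsb_mask & rhs_values,
        0
    );
}
```
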
/src/roaringish/intersect/gallop_second.rs:
--------------------------------------------------------------------------------
use std::{mem::MaybeUninit, sync::atomic::Ordering::Relaxed};

use crate::{
    Aligned64, BorrowRoaringishPacked, Stats,
    roaringish::{Aligned, clear_values, unpack_values},
};

use super::{Intersect, Intersection, private::IntersectSeal};

pub struct GallopIntersectSecond;
impl IntersectSeal for GallopIntersectSecond {}
impl Intersection for GallopIntersectSecond {}

impl Intersect for GallopIntersectSecond {
    fn inner_intersect<const FIRST: bool>(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,

        lhs_i: &mut usize,
        rhs_i: &mut usize,

        packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        i: &mut usize,

        _msb_packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        _j: &mut usize,

        _add_to_group: u64,
        lhs_len: u16,
        _msb_mask: u16,
        lsb_mask: u16,

        stats: &Stats,
    ) {
        let b = std::time::Instant::now();

        while *lhs_i < lhs.len() && *rhs_i < rhs.len() {
            let mut lhs_delta = 1;
            let mut rhs_delta = 1;

            while *lhs_i < lhs.len() && clear_values(lhs[*lhs_i]) < clear_values(rhs[*rhs_i]) {
                *lhs_i += lhs_delta;
                lhs_delta *= 2;
            }
            *lhs_i -= lhs_delta / 2;

            while *rhs_i < rhs.len()
                && clear_values(rhs[*rhs_i]) < unsafe { clear_values(*lhs.get_unchecked(*lhs_i)) }
            {
                *rhs_i += rhs_delta;
                rhs_delta *= 2;
            }
            *rhs_i -= rhs_delta / 2;

            let lhs_packed = unsafe { *lhs.get_unchecked(*lhs_i) };
            let rhs_packed = unsafe { *rhs.get_unchecked(*rhs_i) };

            let lhs_doc_id_group = clear_values(lhs_packed);
            let rhs_doc_id_group = clear_values(rhs_packed);

            let lhs_values = unpack_values(lhs_packed);
            let rhs_values = unpack_values(rhs_packed);

            match lhs_doc_id_group.cmp(&rhs_doc_id_group) {
                std::cmp::Ordering::Less => *lhs_i += 1,
                std::cmp::Ordering::Greater => *rhs_i += 1,
                std::cmp::Ordering::Equal => {
                    let intersection =
                        lhs_values.rotate_left(lhs_len as u32) & lsb_mask & rhs_values;
                    unsafe {
                        packed_result
                            .get_unchecked_mut(*i)
                            .write(lhs_doc_id_group | intersection as u64);
                    }
                    *i += (intersection > 0) as usize;

                    *lhs_i += 1;
                    *rhs_i += 1;
                }
            }

            // // In micro benchmarking this version seems faster, but in the
            // // real use case it's slower

            // let intersection = lhs_values.rotate_left(lhs_len as u32) & lsb_mask & rhs_values;
            // if lhs_doc_id_group == rhs_doc_id_group && intersection > 0 {
            //     unsafe {
            //         packed_result
            //             .get_unchecked_mut(*i)
            //             .write(lhs_doc_id_group | intersection as u64);
            //     }
            //     *i += 1;
            // }

            // *lhs_i += (lhs_doc_id_group <= rhs_doc_id_group) as usize;
            // *rhs_i += (lhs_doc_id_group >= rhs_doc_id_group) as usize;
        }

        stats
            .second_intersect_gallop
            .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
    }

    fn intersection_buffer_size(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
    ) -> usize {
        lhs.0.len().min(rhs.0.len())
    }
}

--------------------------------------------------------------------------------
/src/roaringish/intersect/naive.rs:
--------------------------------------------------------------------------------
use std::{mem::MaybeUninit, sync::atomic::Ordering::Relaxed};

use crate::{
    Stats,
    allocator::Aligned64,
    roaringish::{ADD_ONE_GROUP, Aligned, BorrowRoaringishPacked, clear_values, unpack_values},
};

use super::{Intersect, Intersection, private::IntersectSeal};

/// Naive intersection algorithm.
pub struct NaiveIntersect;
impl IntersectSeal for NaiveIntersect {}
impl Intersection for NaiveIntersect {}

impl Intersect for NaiveIntersect {
    #[inline(always)]
    fn inner_intersect<const FIRST: bool>(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,

        lhs_i: &mut usize,
        rhs_i: &mut usize,

        packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        i: &mut usize,

        msb_packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        j: &mut usize,

        add_to_group: u64,
        lhs_len: u16,
        msb_mask: u16,
        lsb_mask: u16,

        stats: &Stats,
    ) {
        let b = std::time::Instant::now();

        while *lhs_i < lhs.0.len() && *rhs_i < rhs.0.len() {
            let lhs_packed =
                unsafe { *lhs.0.get_unchecked(*lhs_i) } + if FIRST { add_to_group } else { 0 };
            let lhs_doc_id_group = clear_values(lhs_packed);
            let lhs_values = unpack_values(lhs_packed);

            let rhs_packed = unsafe { *rhs.0.get_unchecked(*rhs_i) };
            let rhs_doc_id_group = clear_values(rhs_packed);
            let rhs_values = unpack_values(rhs_packed);

            match lhs_doc_id_group.cmp(&rhs_doc_id_group) {
                std::cmp::Ordering::Equal => {
                    unsafe {
                        if FIRST {
                            let intersection = (lhs_values << lhs_len) & rhs_values;
                            packed_result
                                .get_unchecked_mut(*i)
                                .write(lhs_doc_id_group | intersection as u64);

                            msb_packed_result
                                .get_unchecked_mut(*j)
                                .write(lhs_packed + ADD_ONE_GROUP);

                            *j += (lhs_values & msb_mask > 0) as usize;
                        } else {
                            let intersection =
                                lhs_values.rotate_left(lhs_len as u32) & lsb_mask & rhs_values;
                            packed_result
                                .get_unchecked_mut(*i)
                                .write(lhs_doc_id_group | intersection as u64);
                        }
                    }
                    *i += 1;
                    *lhs_i += 1;
                    *rhs_i += 1;
                }
                std::cmp::Ordering::Greater => *rhs_i += 1,
                std::cmp::Ordering::Less => {
                    if FIRST {
                        unsafe {
                            msb_packed_result
                                .get_unchecked_mut(*j)
                                .write(lhs_packed + ADD_ONE_GROUP);
                            *j += (lhs_values & msb_mask > 0) as usize;
                        }
                    }
                    *lhs_i += 1;
                }
            }
        }

        if FIRST {
            stats
                .first_intersect_naive
                .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
        } else {
            stats
                .second_intersect_naive
                .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
        }
    }

    fn intersection_buffer_size(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
    ) -> usize {
        lhs.0.len().min(rhs.0.len())
    }
}

--------------------------------------------------------------------------------
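Stripped of the bitmask work and the unsafe buffer writes, the naive kernel is a classic two-pointer merge over two sorted key streams; a sketch:

```rust
/// Two-pointer intersection over sorted keys (here standing in for the
/// packed doc-id+group values with the position bits cleared).
fn two_pointer_intersect(lhs: &[u64], rhs: &[u64]) -> Vec<u64> {
    let (mut i, mut j) = (0, 0);
    let mut out = Vec::new();
    while i < lhs.len() && j < rhs.len() {
        match lhs[i].cmp(&rhs[j]) {
            std::cmp::Ordering::Less => i += 1,
            std::cmp::Ordering::Greater => j += 1,
            std::cmp::Ordering::Equal => {
                out.push(lhs[i]);
                i += 1;
                j += 1;
            }
        }
    }
    out
}
```
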
/src/searcher.rs:
--------------------------------------------------------------------------------
use std::{collections::HashSet, path::Path};

use crate::{DB, DbError, Intersection, SearchError, Stats, db::Document, error::GetDocumentError};
use memmap2::Mmap;
use rkyv::{Archive, Deserialize, de::Pool, rancor::Strategy};

/// Final result of a search operation.
pub struct SearchResult<'a, D: Document>(pub Result<Vec<u32>, SearchError>, &'a Searcher<D>);
impl<D: Document> SearchResult<'_, D> {
    /// Number of documents that matched the search query.
    pub fn len(&self) -> Option<usize> {
        self.0.as_ref().map(|p| p.len()).ok()
    }

    /// Returns the internal document IDs that matched the search query.
    pub fn get_internal_document_ids(&self) -> Option<&[u32]> {
        self.0.as_ref().map(|p| p.as_slice()).ok()
    }

    /// Gets the archived version of the documents that matched the search query.
    ///
    /// This avoids having to deserialize, but it's necessary to use a callback
    /// due to the lifetime of the transaction.
    ///
    /// If you want the documents deserialized, use [Self::get_documents] instead.
    pub fn get_archived_documents(
        &self,
        cb: impl FnOnce(Vec<&D::Archived>),
    ) -> Result<(), GetDocumentError> {
        let Some(doc_ids) = self.get_internal_document_ids() else {
            return Ok(());
        };

        self.1.get_archived_documents(doc_ids, cb)
    }

    /// Gets the deserialized version of the documents that matched the search query.
    pub fn get_documents(&self) -> Result<Vec<D>, GetDocumentError>
    where
        <D as Archive>::Archived: Deserialize<D, Strategy<Pool, rkyv::rancor::Error>>,
    {
        let Some(doc_ids) = self.get_internal_document_ids() else {
            return Ok(Vec::new());
        };

        self.1.get_documents(doc_ids)
    }
}

/// Object responsible for searching the database.
pub struct Searcher<D: Document> {
    db: DB<D>,
    common_tokens: HashSet<Box<str>>,
    mmap: Mmap,
}

impl<D: Document> Searcher<D> {
    /// Create a new searcher object.
    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, DbError> {
        let (db, common_tokens, mmap) = DB::open(path)?;
        Ok(Self {
            db,
            common_tokens,
            mmap,
        })
    }

    /// Searches by the query `q`.
    pub fn search<I: Intersection>(&self, q: &str) -> SearchResult<D> {
        let stats = Stats::default();
        self.search_with_stats::<I>(q, &stats)
    }

    /// Searches by the query `q`, allowing the user to pass a [Stats] object.
    pub fn search_with_stats<I: Intersection>(&self, q: &str, stats: &Stats) -> SearchResult<D> {
        SearchResult(
            self.db
                .search::<I>(q, stats, &self.common_tokens, &self.mmap),
            self,
        )
    }

    /// Gets the archived version of the documents.
    ///
    /// This avoids having to deserialize, but it's necessary to use a callback
    /// due to the lifetime of the transaction.
    ///
    /// If you want the documents deserialized, use [Self::get_documents] instead.
    pub fn get_archived_documents(
        &self,
        doc_ids: &[u32],
        cb: impl FnOnce(Vec<&D::Archived>),
    ) -> Result<(), GetDocumentError> {
        self.db.get_archived_documents(doc_ids, cb)
    }

    /// Gets the archived version of a document.
    ///
    /// This avoids having to deserialize, but it's necessary to use a callback
    /// due to the lifetime of the transaction.
    ///
    /// If you want the document deserialized, use [Self::get_document] instead.
    pub fn get_archived_document(
        &self,
        doc_id: u32,
        cb: impl FnOnce(&D::Archived),
    ) -> Result<(), GetDocumentError> {
        self.db.get_archived_document(doc_id, cb)
    }

    /// Gets the deserialized version of the documents.
    pub fn get_documents(&self, doc_ids: &[u32]) -> Result<Vec<D>, GetDocumentError>
    where
        <D as Archive>::Archived: Deserialize<D, Strategy<Pool, rkyv::rancor::Error>>,
    {
        self.db.get_documents(doc_ids)
    }

    /// Gets the deserialized version of a document.
    pub fn get_document(&self, doc_id: u32) -> Result<D, GetDocumentError>
    where
        <D as Archive>::Archived: Deserialize<D, Strategy<Pool, rkyv::rancor::Error>>,
    {
        self.db.get_document(doc_id)
    }
}

--------------------------------------------------------------------------------
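A minimal sketch of driving this API, assuming a hypothetical `MyDoc` type that implements `Document` and an index that already exists at the given path:

```rust
use simdphrase::{NaiveIntersect, Searcher};

fn run() -> Result<(), Box<dyn std::error::Error>> {
    let searcher: Searcher<MyDoc> = Searcher::new("./index")?;

    let result = searcher.search::<NaiveIntersect>("look at my");
    if let Some(n) = result.len() {
        println!("{n} documents matched");
    }

    // Zero-copy access to the matched documents via their archived form;
    // the callback exists because of the transaction lifetime.
    result.get_archived_documents(|docs| {
        for _doc in docs {
            // inspect each `&MyDoc::Archived` here
        }
    })?;
    Ok(())
}
```
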
/src/stats.rs:
--------------------------------------------------------------------------------
use std::{
    fmt::Debug,
    sync::atomic::{AtomicU64, Ordering::Relaxed},
};

/// Time stats collected during search.
#[derive(Default)]
pub struct Stats {
    /// Time spent during normalization and tokenization.
    pub normalize_tokenize: AtomicU64,
    /// Time spent during merging and minimizing.
    pub merge_minimize: AtomicU64,
    /// Time spent during the first binary search.
    pub first_binary_search: AtomicU64,
    /// Time spent during the first intersect.
    pub first_intersect: AtomicU64,
    /// Time spent during the first intersect using SIMD.
    pub first_intersect_simd: AtomicU64,
    /// Time spent during the first intersect using the naive method.
    pub first_intersect_naive: AtomicU64,
    /// Time spent during the first intersect using the gallop method.
    pub first_intersect_gallop: AtomicU64,

    /// Time spent during the second binary search.
    pub second_binary_search: AtomicU64,
    /// Time spent during the second intersect.
    pub second_intersect: AtomicU64,
    /// Time spent during the second intersect using SIMD.
    pub second_intersect_simd: AtomicU64,
    /// Time spent during the second intersect using the naive method.
    pub second_intersect_naive: AtomicU64,
    /// Time spent during the second intersect using the gallop method.
    pub second_intersect_gallop: AtomicU64,

    /// Time spent during the first merge phase.
    pub merge_phases_first_pass: AtomicU64,
    /// Time spent during the second merge phase.
    pub merge_phases_second_pass: AtomicU64,

    /// Time spent getting document ids.
    pub get_doc_ids: AtomicU64,

    /// Number of calls to the search function.
    pub iters: AtomicU64,
}

impl Debug for Stats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let sum = self.normalize_tokenize.load(Relaxed)
            + self.merge_minimize.load(Relaxed)
            + self.first_binary_search.load(Relaxed)
            + self.first_intersect.load(Relaxed)
            + self.second_binary_search.load(Relaxed)
            + self.second_intersect.load(Relaxed)
            + self.merge_phases_first_pass.load(Relaxed)
            + self.merge_phases_second_pass.load(Relaxed)
            + self.get_doc_ids.load(Relaxed);
        let sum = sum as f64;

        let normalize_tokenize = self.normalize_tokenize.load(Relaxed) as f64;
        let merge = self.merge_minimize.load(Relaxed) as f64;
        let first_binary_search = self.first_binary_search.load(Relaxed) as f64;
        let first_intersect = self.first_intersect.load(Relaxed) as f64;
        let first_intersect_simd = self.first_intersect_simd.load(Relaxed) as f64;
        let first_intersect_naive = self.first_intersect_naive.load(Relaxed) as f64;
        let first_intersect_gallop = self.first_intersect_gallop.load(Relaxed) as f64;
        let second_binary_search = self.second_binary_search.load(Relaxed) as f64;
        let second_intersect = self.second_intersect.load(Relaxed) as f64;
        let second_intersect_simd = self.second_intersect_simd.load(Relaxed) as f64;
        let second_intersect_naive = self.second_intersect_naive.load(Relaxed) as f64;
        let second_intersect_gallop = self.second_intersect_gallop.load(Relaxed) as f64;
        let merge_phases_first_pass = self.merge_phases_first_pass.load(Relaxed) as f64;
        let merge_phases_second_pass = self.merge_phases_second_pass.load(Relaxed) as f64;
        let get_doc_ids = self.get_doc_ids.load(Relaxed) as f64;
        let iters = self.iters.load(Relaxed) as f64;

        let per_normalize_tokenize = normalize_tokenize / sum * 100f64;
        let per_merge = merge / sum * 100f64;
        let per_first_binary_search = first_binary_search / sum * 100f64;
        let per_first_intersect = first_intersect / sum * 100f64;
        let per_second_binary_search = second_binary_search / sum * 100f64;
        let per_second_intersect = second_intersect / sum * 100f64;
        let per_merge_phases_first_pass = merge_phases_first_pass / sum * 100f64;
        let per_merge_phases_second_pass = merge_phases_second_pass / sum * 100f64;
        let per_get_doc_ids = get_doc_ids / sum * 100f64;

        f.debug_struct("Stats")
            .field(
                "normalize_tokenize",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_normalize_tokenize:06.3}%)",
                    normalize_tokenize / 1000f64,
                    normalize_tokenize / iters,
                ),
            )
            .field(
                "merge_minimize",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_merge:06.3}%)",
                    merge / 1000f64,
                    merge / iters,
                ),
            )
            .field(
                "first_binary_search",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_first_binary_search:06.3}%)",
                    first_binary_search / 1000f64,
                    first_binary_search / iters,
                ),
            )
            .field(
                "first_intersect",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_first_intersect:06.3}%)",
                    first_intersect / 1000f64,
                    first_intersect / iters,
                ),
            )
            .field(
                "    first_intersect_simd",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    first_intersect_simd / 1000f64,
                    first_intersect_simd / iters,
                ),
            )
            .field(
                "    first_intersect_naive",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    first_intersect_naive / 1000f64,
                    first_intersect_naive / iters,
                ),
            )
            .field(
                "    first_intersect_gallop",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    first_intersect_gallop / 1000f64,
                    first_intersect_gallop / iters,
                ),
            )
            .field(
                "second_binary_search",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_second_binary_search:06.3}%)",
                    second_binary_search / 1000f64,
                    second_binary_search / iters,
                ),
            )
            .field(
                "second_intersect",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_second_intersect:06.3}%)",
                    second_intersect / 1000f64,
                    second_intersect / iters,
                ),
            )
            .field(
                "    second_intersect_simd",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    second_intersect_simd / 1000f64,
                    second_intersect_simd / iters,
                ),
            )
            .field(
                "    second_intersect_naive",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    second_intersect_naive / 1000f64,
                    second_intersect_naive / iters,
                ),
            )
            .field(
                "    second_intersect_gallop",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    second_intersect_gallop / 1000f64,
                    second_intersect_gallop / iters,
                ),
            )
            .field(
                "merge_phases_first_pass",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_merge_phases_first_pass:06.3}%)",
                    merge_phases_first_pass / 1000f64,
                    merge_phases_first_pass / iters,
                ),
            )
            .field(
                "merge_phases_second_pass",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_merge_phases_second_pass:06.3}%)",
                    merge_phases_second_pass / 1000f64,
                    merge_phases_second_pass / iters,
                ),
            )
            .field(
                "get_doc_ids",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_get_doc_ids:06.3}%)",
                    get_doc_ids / 1000f64,
                    get_doc_ids / iters,
                ),
            )
            .finish()
    }
}

--------------------------------------------------------------------------------
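A sketch of collecting these stats across many queries and printing the accumulated breakdown (the `Debug` impl above reports ms totals, us/iter averages, and each phase's share of the total); `MyDoc` is again a hypothetical `Document` type:

```rust
use simdphrase::{SimdIntersect, Stats};

fn profile(searcher: &simdphrase::Searcher<MyDoc>, queries: &[&str]) {
    // One `Stats` shared across all queries; the counters are atomics,
    // so the same object could also be shared across threads.
    let stats = Stats::default();
    for q in queries {
        let _ = searcher.search_with_stats::<SimdIntersect>(q, &stats);
    }
    println!("{stats:?}");
}
```
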
/src/roaringish/intersect/simd.rs:
--------------------------------------------------------------------------------
#[allow(unused_imports)]
use std::{
    arch::x86_64::__m512i,
    mem::MaybeUninit,
    simd::{Simd, cmp::SimdPartialOrd},
};
use std::{
    arch::{
        asm,
        x86_64::{_mm512_load_epi64, _mm512_maskz_compress_epi64, _mm512_storeu_epi64},
    },
    sync::atomic::Ordering::Relaxed,
};

use crate::{
    Stats,
    roaringish::{
        ADD_ONE_GROUP, Aligned64, BorrowRoaringishPacked, clear_values, clear_values_simd,
        unpack_values_simd,
    },
};

use super::{Intersect, private::IntersectSeal};
use super::{Intersection, naive::NaiveIntersect};
use crate::roaringish::Aligned;

const N: usize = 8;

#[cfg(target_feature = "avx512vp2intersect")]
#[inline(always)]
unsafe fn vp2intersectq(a: __m512i, b: __m512i) -> (u8, u8) {
    unsafe {
        use std::arch::x86_64::__mmask8;

        let mut mask0: __mmask8;
        let mut mask1: __mmask8;
        asm!(
            "vp2intersectq k2, {0}, {1}",
            in(zmm_reg) a,
            in(zmm_reg) b,
            out("k2") mask0,
            out("k3") mask1,
            options(pure, nomem, nostack),
        );

        (mask0, mask1)
    }
}

#[cfg(not(target_feature = "avx512vp2intersect"))]
#[inline(always)]
unsafe fn vp2intersectq(a: __m512i, b: __m512i) -> (u8, u8) {
    use std::arch::x86_64::{
        _MM_PERM_BADC, _mm512_alignr_epi32, _mm512_cmpeq_epi64_mask, _mm512_shuffle_epi32,
    };

    unsafe {
        let a1 = _mm512_alignr_epi32::<4>(a, a);
        let a2 = _mm512_alignr_epi32::<8>(a, a);
        let a3 = _mm512_alignr_epi32::<12>(a, a);

        let b1 = _mm512_shuffle_epi32::<_MM_PERM_BADC>(b);

        let m00 = _mm512_cmpeq_epi64_mask(a, b);
        let m01 = _mm512_cmpeq_epi64_mask(a, b1);
        let m10 = _mm512_cmpeq_epi64_mask(a1, b);
        let m11 = _mm512_cmpeq_epi64_mask(a1, b1);
        let m20 = _mm512_cmpeq_epi64_mask(a2, b);
        let m21 = _mm512_cmpeq_epi64_mask(a2, b1);
        let m30 = _mm512_cmpeq_epi64_mask(a3, b);
        let m31 = _mm512_cmpeq_epi64_mask(a3, b1);

        let mask0 = m00
            | m01
            | (m10 | m11).rotate_left(2)
            | (m20 | m21).rotate_left(4)
            | (m30 | m31).rotate_left(6);

        let m0 = m00 | m10 | m20 | m30;
        let m1 = m01 | m11 | m21 | m31;
        let mask1 = m0 | ((0x55 & m1) << 1) | ((m1 >> 1) & 0x55);

        (mask0, mask1)
    }
}
{:08.3}us/iter)", 132 | first_intersect_naive / 1000f64, 133 | first_intersect_naive / iters, 134 | ), 135 | ) 136 | .field( 137 | " first_intersect_gallop", 138 | &format_args!( 139 | " ({:08.3}ms, {:08.3}us/iter)", 140 | first_intersect_gallop / 1000f64, 141 | first_intersect_gallop / iters, 142 | ), 143 | ) 144 | .field( 145 | "second_binary_search", 146 | &format_args!( 147 | " ({:08.3}ms, {:08.3}us/iter, {per_second_binary_search:06.3}%)", 148 | second_binary_search / 1000f64, 149 | second_binary_search / iters, 150 | ), 151 | ) 152 | .field( 153 | "second_intersect", 154 | &format_args!( 155 | " ({:08.3}ms, {:08.3}us/iter, {per_second_intersect:06.3}%)", 156 | second_intersect / 1000f64, 157 | second_intersect / iters, 158 | ), 159 | ) 160 | .field( 161 | " second_intersect_simd", 162 | &format_args!( 163 | " ({:08.3}ms, {:08.3}us/iter)", 164 | second_intersect_simd / 1000f64, 165 | second_intersect_simd / iters, 166 | ), 167 | ) 168 | .field( 169 | " second_intersect_naive", 170 | &format_args!( 171 | " ({:08.3}ms, {:08.3}us/iter)", 172 | second_intersect_naive / 1000f64, 173 | second_intersect_naive / iters, 174 | ), 175 | ) 176 | .field( 177 | " second_intersect_gallop", 178 | &format_args!( 179 | " ({:08.3}ms, {:08.3}us/iter)", 180 | second_intersect_gallop / 1000f64, 181 | second_intersect_gallop / iters, 182 | ), 183 | ) 184 | .field( 185 | "merge_phases_first_pass", 186 | &format_args!( 187 | " ({:08.3}ms, {:08.3}us/iter, {per_merge_phases_first_pass:06.3}%)", 188 | merge_phases_first_pass / 1000f64, 189 | merge_phases_first_pass / iters, 190 | ), 191 | ) 192 | .field( 193 | "merge_phases_second_pass", 194 | &format_args!( 195 | " ({:08.3}ms, {:08.3}us/iter, {per_merge_phases_second_pass:06.3}%)", 196 | merge_phases_second_pass / 1000f64, 197 | merge_phases_second_pass / iters, 198 | ), 199 | ) 200 | .field( 201 | "get_doc_ids", 202 | &format_args!( 203 | " ({:08.3}ms, {:08.3}us/iter, {per_get_doc_ids:06.3}%)", 204 | get_doc_ids / 1000f64, 205 | get_doc_ids / iters, 206 | ), 207 | ) 208 | .finish() 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /src/roaringish/intersect/simd.rs: -------------------------------------------------------------------------------- 1 | #[allow(unused_imports)] 2 | use std::{ 3 | arch::x86_64::__m512i, 4 | mem::MaybeUninit, 5 | simd::{Simd, cmp::SimdPartialOrd}, 6 | }; 7 | use std::{ 8 | arch::{ 9 | asm, 10 | x86_64::{_mm512_load_epi64, _mm512_maskz_compress_epi64, _mm512_storeu_epi64}, 11 | }, 12 | sync::atomic::Ordering::Relaxed, 13 | }; 14 | 15 | use crate::{ 16 | Stats, 17 | roaringish::{ 18 | ADD_ONE_GROUP, Aligned64, BorrowRoaringishPacked, clear_values, clear_values_simd, 19 | unpack_values_simd, 20 | }, 21 | }; 22 | 23 | use super::{Intersect, private::IntersectSeal}; 24 | use super::{Intersection, naive::NaiveIntersect}; 25 | use crate::roaringish::Aligned; 26 | 27 | const N: usize = 8; 28 | 29 | #[cfg(target_feature = "avx512vp2intersect")] 30 | #[inline(always)] 31 | unsafe fn vp2intersectq(a: __m512i, b: __m512i) -> (u8, u8) { 32 | unsafe { 33 | use std::arch::x86_64::__mmask8; 34 | 35 | let mut mask0: __mmask8; 36 | let mut mask1: __mmask8; 37 | asm!( 38 | "vp2intersectq k2, {0}, {1}", 39 | in(zmm_reg) a, 40 | in(zmm_reg) b, 41 | out("k2") mask0, 42 | out("k3") mask1, 43 | options(pure, nomem, nostack), 44 | ); 45 | 46 | (mask0, mask1) 47 | } 48 | } 49 | 50 | #[cfg(not(target_feature = "avx512vp2intersect"))] 51 | #[inline(always)] 52 | unsafe fn vp2intersectq(a: __m512i, b: __m512i) 

        // The first intersection will always fit into 4 pages, so there is no
        // need to manually align the loop, since its size is 197 bytes > 64*3 =
        // 192 bytes. If in the future we can reduce the size of the loop by at
        // least 5 bytes, we can fit it in 3 pages, the same way we fit the
        // second intersection.

        // Forces the alignment of the loop to be at the beginning of a 64-byte
        // page, making it fit in only 3 pages instead of 4 (up to 50% faster
        // execution). Since this function is inlined, the alignment of the loop
        // is based on the parent function's alignment, so this value will
        // change in the future; but assuming that functions will be 64-byte
        // aligned, it's fairly easy to find the new value once the code of the
        // parent function changes.
        if FIRST {
            for _ in 0..26 {
                unsafe {
                    asm!("nop");
                }
            }
        } else {
            for _ in 0..48 {
                unsafe {
                    asm!("nop");
                }
            }
        }

        while *lhs_i < lhs_packed.len() && *rhs_i < rhs_packed.len() {
            // Don't move this code around:
            // it leads to shit optimization by LLVM,
            // where it tries to create SIMD code but fucks up perf.
            //
            // Me and my homies hate LLVM
            let lhs_last = unsafe {
                clear_values(*lhs_packed.get_unchecked(*lhs_i + N - 1))
                    + if FIRST { add_to_group } else { 0 }
            };
            let rhs_last = unsafe { clear_values(*rhs_packed.get_unchecked(*rhs_i + N - 1)) };

            let (lhs_pack, rhs_pack): (Simd<u64, N>, Simd<u64, N>) = unsafe {
                let lhs_pack = _mm512_load_epi64(lhs_packed.as_ptr().add(*lhs_i) as *const _);
                let rhs_pack = _mm512_load_epi64(rhs_packed.as_ptr().add(*rhs_i) as *const _);
                (lhs_pack.into(), rhs_pack.into())
            };
            let lhs_pack = if FIRST {
                lhs_pack + simd_add_to_group
            } else {
                lhs_pack
            };

            let lhs_doc_id_group = clear_values_simd(lhs_pack);

            let rhs_doc_id_group = clear_values_simd(rhs_pack);
            let rhs_values = unpack_values_simd(rhs_pack);

            let (lhs_mask, rhs_mask) =
                unsafe { vp2intersectq(lhs_doc_id_group.into(), rhs_doc_id_group.into()) };

            if FIRST || lhs_mask > 0 {
                unsafe {
                    let lhs_pack_compress: Simd<u64, N> =
                        _mm512_maskz_compress_epi64(lhs_mask, lhs_pack.into()).into();
                    let doc_id_group_compress = clear_values_simd(lhs_pack_compress);
                    let lhs_values_compress = unpack_values_simd(lhs_pack_compress);

                    let rhs_values_compress: Simd<u64, N> =
                        _mm512_maskz_compress_epi64(rhs_mask, rhs_values.into()).into();

                    let intersection = if FIRST {
                        (lhs_values_compress << (lhs_len as u64)) & rhs_values_compress
                    } else {
                        rotl_u16(lhs_values_compress, lhs_len as u64)
                            & simd_lsb_mask
                            & rhs_values_compress
                    };

                    _mm512_storeu_epi64(
                        packed_result.as_mut_ptr().add(*i) as *mut _,
                        (doc_id_group_compress | intersection).into(),
                    );

                    *i += lhs_mask.count_ones() as usize;
                }
            }

            if FIRST {
                if lhs_last <= rhs_last {
                    unsafe {
                        analyze_msb(lhs_pack, msb_packed_result, j, simd_msb_mask);
                    }
                    *lhs_i += N;
                }
            } else {
                *lhs_i += N * (lhs_last <= rhs_last) as usize;
            }
            *rhs_i += N * (rhs_last <= lhs_last) as usize;
            need_to_analyze_msb = rhs_last < lhs_last;
        }

        if FIRST && need_to_analyze_msb && !(*lhs_i < lhs.0.len() && *rhs_i < rhs.0.len()) {
            unsafe {
                let lhs_pack: Simd<u64, N> =
                    _mm512_load_epi64(lhs_packed.as_ptr().add(*lhs_i) as *const _).into();
                analyze_msb(
                    lhs_pack + simd_add_to_group,
                    msb_packed_result,
                    j,
                    simd_msb_mask,
                );
            };
        }

        if FIRST {
            stats
                .first_intersect_simd
                .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
        } else {
            stats
                .second_intersect_simd
                .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
        }

        NaiveIntersect::inner_intersect::<FIRST>(
            lhs,
            rhs,
            lhs_i,
            rhs_i,
            packed_result,
            i,
            msb_packed_result,
            j,
            add_to_group,
            lhs_len,
            msb_mask,
            lsb_mask,
            stats,
        );
    }

    fn intersection_buffer_size(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
    ) -> usize {
        lhs.0.len().min(rhs.0.len()) + 1 + N
    }
}

--------------------------------------------------------------------------------
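For reference, a scalar model of what `vp2intersectq` computes: given two vectors of 8 `u64` lanes, it produces two bitmasks marking, in each input, the lanes that match any lane of the other input. The non-`vp2intersect` fallback above computes exactly this with rotations and compares:

```rust
fn vp2intersectq_scalar(a: [u64; 8], b: [u64; 8]) -> (u8, u8) {
    let (mut mask_a, mut mask_b) = (0u8, 0u8);
    for i in 0..8 {
        for j in 0..8 {
            if a[i] == b[j] {
                // Lane i of `a` intersects, and so does lane j of `b`.
                mask_a |= 1 << i;
                mask_b |= 1 << j;
            }
        }
    }
    (mask_a, mask_b)
}
```
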
/src/indexer.rs:
--------------------------------------------------------------------------------
use std::{cmp::Reverse, collections::HashSet, path::Path};

use crate::{
    RoaringishPacked, Searcher,
    db::{DB, Document, MAX_WINDOW_LEN},
    decreasing_window_iter::DecreasingWindows,
    error::DbError,
    roaringish::MAX_VALUE,
    utils::{normalize, tokenize},
};
use fxhash::FxHashMap;
use gxhash::{HashMap as GxHashMap, HashMapExt};
use heed::RwTxn;
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};

/// Specifies how the common tokens are treated during indexing.
#[derive(Debug)]
pub enum CommonTokens {
    /// Fixed list specified by the user.
    List(HashSet<String>),
    /// Top `n` most frequent tokens.
    FixedNum(u32),
    /// Percentage of the top `n` most frequent tokens.
    Percentage(f64),
}

/// Batch of documents to be indexed.
#[derive(Debug)]
struct Batch<D: Document> {
    /// Monotonically increasing batch id.
    batch_id: u32,

    /// Used to estimate the number of distinct tokens.
    hllp_tokens: HyperLogLogPlus<Box<str>, gxhash::GxBuildHasher>,

    /// Monotonically increasing token id (cleared after each batch).
    next_token_id: u32,

    /// Maps tokens to token ids (cleared after each batch).
    token_to_token_id: GxHashMap<Box<str>, u32>,

    /// Maps token ids to their roaringish packed data (cleared after each batch).
    ///
    /// This should be in sync with `token_id_to_token`.
    token_id_to_roaringish_packed: Vec<RoaringishPacked>,
    /// Maps token ids to tokens (cleared after each batch).
    ///
    /// This should be in sync with `token_id_to_roaringish_packed`.
    token_id_to_token: Vec<Box<str>>,

    // these 3 containers are in sync
    /// Document ids in the batch (cleared after each batch).
    ///
    /// This should be in sync with `documents` and `tokenized_docs`.
    doc_ids: Vec<u32>,
    /// Documents in the batch (cleared after each batch).
    ///
    /// This should be in sync with `doc_ids` and `tokenized_docs`.
    documents: Vec<D>,

    /// Tokenized representation of the documents in the batch (cleared after
    /// each batch). This representation is done by storing the token id.
    ///
    /// This should be in sync with `doc_ids` and `documents`.
    tokenized_docs: Vec<Vec<u32>>,
}

impl<D: Document> Batch<D> {
    /// Constructs a new batch.
    fn new() -> Self {
        Self {
            batch_id: 0,
            // This can't fail
            hllp_tokens: HyperLogLogPlus::new(18, gxhash::GxBuildHasher::default()).unwrap(),
            next_token_id: 0,
            token_to_token_id: GxHashMap::new(),
            token_id_to_roaringish_packed: Vec::new(),
            token_id_to_token: Vec::new(),
            doc_ids: Vec::new(),
            documents: Vec::new(),
            tokenized_docs: Vec::new(),
        }
    }

    /// Gets an overestimated number of distinct tokens.
    fn estimate_number_of_distinct_tokens(&mut self) -> u64 {
        (self.hllp_tokens.count() * 1.015f64) as u64
    }

    /// Clears the batch.
    ///
    /// This should be called after each flush and before the start of a new batch.
    fn clear(&mut self) {
        self.next_token_id = 0;
        self.token_to_token_id.clear();
        self.token_id_to_roaringish_packed.clear();
        self.token_id_to_token.clear();

        self.doc_ids.clear();
        self.documents.clear();
        self.tokenized_docs.clear();
    }

    /// Adds a document to the batch and starts the indexing process.
    ///
    /// `count_freq` is used to count the frequency of each token. This should
    /// only be used in the first batch, allowing us to generate the common tokens.
    fn push(&mut self, doc_id: u32, content: &str, doc: D, count_freq: impl FnMut(&str)) {
        let tokenized_doc = self.index_doc(content, doc_id, count_freq);
        self.doc_ids.push(doc_id);
        self.documents.push(doc);
        self.tokenized_docs.push(tokenized_doc);
    }

    /// Gets the token id for the input `token`. If the token is not present
    /// in the batch, it's added and a new token id is generated by
    /// incrementing the current value.
    fn get_token_id(
        token: &str,
        hllp_tokens: &mut HyperLogLogPlus<Box<str>, gxhash::GxBuildHasher>,
        token_to_token_id: &mut GxHashMap<Box<str>, u32>,
        token_id_to_token: &mut Vec<Box<str>>,
        token_id_to_roaringish_packed: &mut Vec<RoaringishPacked>,
        next_token_id: &mut u32,
    ) -> u32 {
        hllp_tokens.insert(token);

        let (_, token_id) = token_to_token_id
            .raw_entry_mut()
            .from_key(token)
            .or_insert_with(|| {
                let current_token_id = *next_token_id;
                *next_token_id += 1;
                (token.to_string().into_boxed_str(), current_token_id)
            });

        if *token_id as usize >= token_id_to_token.len() {
            token_id_to_token.push(token.to_string().into_boxed_str());
            token_id_to_roaringish_packed.push(RoaringishPacked::default());
        }

        *token_id
    }

    /// Indexes `content` for this `doc_id`.
    ///
    /// `count_freq` is used to count the frequency of each token. This should
    /// only be used in the first batch, allowing us to generate the common tokens.
149 | fn index_doc( 150 | &mut self, 151 | content: &str, 152 | doc_id: u32, 153 | mut count_freq: impl FnMut(&str), 154 | ) -> Vec<u32> { 155 | let mut tokenized_doc = Vec::new(); 156 | let mut token_id_to_positions: FxHashMap<u32, Vec<u32>> = FxHashMap::new(); 157 | let content = normalize(content); 158 | for (pos, token) in tokenize(&content).enumerate().take(MAX_VALUE as usize) { 159 | let token_id = Self::get_token_id( 160 | token, 161 | &mut self.hllp_tokens, 162 | &mut self.token_to_token_id, 163 | &mut self.token_id_to_token, 164 | &mut self.token_id_to_roaringish_packed, 165 | &mut self.next_token_id, 166 | ); 167 | 168 | count_freq(token); 169 | 170 | token_id_to_positions 171 | .entry(token_id) 172 | .or_default() 173 | .push(pos as u32); 174 | tokenized_doc.push(token_id); 175 | } 176 | 177 | for (token_id, positions) in token_id_to_positions.iter() { 178 | self.token_id_to_roaringish_packed[*token_id as usize].push(doc_id, positions); 179 | } 180 | tokenized_doc 181 | } 182 | 183 | /// Flushes the batch. 184 | fn flush( 185 | &mut self, 186 | db: &DB<D>, 187 | rwtxn: &mut RwTxn, 188 | common_tokens: &HashSet<Box<str>>, 189 | mmap_size: &mut usize, 190 | ) -> Result<(), DbError> { 191 | log::info!("Flushing batch"); 192 | let b = std::time::Instant::now(); 193 | 194 | // Nothing to do if the batch is empty. 195 | if self.doc_ids.is_empty() { 196 | log::debug!("Empty batch, nothing to flush"); 197 | return Ok(()); 198 | } 199 | 200 | self.merge_common_tokens(common_tokens); 201 | 202 | db.write_token_to_roaringish_packed( 203 | &self.token_to_token_id, 204 | &self.token_id_to_roaringish_packed, 205 | mmap_size, 206 | self.batch_id, 207 | )?; 208 | db.write_doc_id_to_document(rwtxn, &self.doc_ids, &self.documents)?; 209 | 210 | self.batch_id += 1; 211 | self.clear(); 212 | log::info!("Flush took {:?}", b.elapsed()); 213 | Ok(()) 214 | } 215 | 216 | /// Merges the tokens for all of the documents in the batch. 217 | /// This will create new tokens and consequently new token ids. 218 | /// 219 | /// The generation is done by merging up to [MAX_WINDOW_LEN] tokens at a time. 220 | /// We are only allowed to merge: 221 | /// * Common tokens with other common tokens. 222 | /// * Rare tokens with a common token. 223 | /// * Common tokens with a rare token. 224 | /// 225 | /// So it's impossible for the generated token to have two rare tokens. 226 | /// Also, rare tokens can only be in the first or last position, for example: 227 | /// 228 | /// # Examples 229 | /// ```text 230 | /// c c 231 | /// c c c 232 | /// r c 233 | /// r c c 234 | /// c r 235 | /// c c r 236 | /// ``` 237 | /// 238 | /// This will generate all possible combinations of the merging process.
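/// For instance, assuming `the` is the only common token, the document `the quick fox` produces exactly one merged token, `the quick`: the window starting at `quick` stops immediately because merging `quick` with `fox` would combine two rare tokens.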
239 | fn merge_common_tokens(&mut self, common_tokens: &HashSet<Box<str>>) { 240 | log::debug!("Merging common tokens"); 241 | if common_tokens.is_empty() { 242 | return; 243 | } 244 | 245 | let b = std::time::Instant::now(); 246 | for (tokenized_doc, doc_id) in self.tokenized_docs.iter().zip(self.doc_ids.iter()) { 247 | let mut token_id_to_positions: FxHashMap<u32, Vec<u32>> = FxHashMap::new(); 248 | let it = DecreasingWindows::new(tokenized_doc, MAX_WINDOW_LEN); 249 | for (pos, token_ids) in it.enumerate() { 250 | let token_id = token_ids[0]; 251 | let token = &self.token_id_to_token[token_id as usize]; 252 | let is_first_token_rare = !common_tokens.contains(token); 253 | 254 | for i in 1..token_ids.len() { 255 | let token_id = token_ids[i]; 256 | let token = &self.token_id_to_token[token_id as usize]; 257 | let is_token_rare = !common_tokens.contains(token); 258 | if is_first_token_rare && is_token_rare { 259 | break; 260 | } 261 | let token: String = token_ids[..i + 1] 262 | .iter() 263 | .map(|token_id| self.token_id_to_token[*token_id as usize].as_ref()) 264 | .intersperse(" ") 265 | .collect(); 266 | let token_id = Self::get_token_id( 267 | &token, 268 | &mut self.hllp_tokens, 269 | &mut self.token_to_token_id, 270 | &mut self.token_id_to_token, 271 | &mut self.token_id_to_roaringish_packed, 272 | &mut self.next_token_id, 273 | ); 274 | token_id_to_positions 275 | .entry(token_id) 276 | .or_default() 277 | .push(pos as u32); 278 | if is_token_rare { 279 | break; 280 | } 281 | } 282 | } 283 | 284 | for (token_id, positions) in token_id_to_positions.iter() { 285 | self.token_id_to_roaringish_packed[*token_id as usize].push(*doc_id, positions); 286 | } 287 | } 288 | log::debug!("Merge took {:?}", b.elapsed()); 289 | } 290 | } 291 | 292 | /// Responsible for indexing documents. 293 | pub struct Indexer { 294 | batch_size: Option<u32>, 295 | common_tokens: Option<CommonTokens>, 296 | } 297 | 298 | impl Indexer { 299 | /// Creates a new indexer. 300 | /// 301 | /// * If `batch_size` is [None] then the indexer will index all the documents in a single batch. 302 | /// * If `common_tokens` is [None] then no merging will happen. 303 | pub fn new(batch_size: Option<u32>, common_tokens: Option<CommonTokens>) -> Self { 304 | Self { 305 | batch_size, 306 | common_tokens, 307 | } 308 | } 309 | 310 | /// Generates the list of common tokens to be used 311 | /// in the merging phase. 312 | fn generate_common_tokens( 313 | &self, 314 | token_to_freq: &GxHashMap<Box<str>, u32>, 315 | ) -> HashSet<Box<str>> { 316 | let Some(common_tokens) = &self.common_tokens else { 317 | return HashSet::new(); 318 | }; 319 | match common_tokens { 320 | CommonTokens::List(tokens) => tokens 321 | .into_iter() 322 | .map(|t| t.clone()) 323 | .collect(), 324 | CommonTokens::FixedNum(max) => { 325 | let max = (*max as usize).min(token_to_freq.len()); 326 | let mut token_to_freq: Vec<_> = token_to_freq.iter().collect(); 327 | token_to_freq.sort_unstable_by_key(|(_, freq)| Reverse(*freq)); 328 | token_to_freq[0..max] 329 | .iter() 330 | .map(|(token, _)| (*token).clone()) 331 | .collect() 332 | } 333 | CommonTokens::Percentage(p) => { 334 | let max = (token_to_freq.len() as f64 * *p) as usize; 335 | let mut token_to_freq: Vec<_> = token_to_freq.iter().collect(); 336 | token_to_freq.sort_unstable_by_key(|(_, freq)| Reverse(*freq)); 337 | token_to_freq[0..max] 338 | .iter() 339 | .map(|(token, _)| (*token).clone()) 340 | .collect() 341 | } 342 | } 343 | } 344 | 345 | /// Indexes an iterator of documents.
346 | /// 347 | /// This iterator should essentially return a tuple `(&str, D)`, where 348 | /// `D` is the form of the document that will be serialized and stored in the database. 349 | /// 350 | /// So the content of the document (`&str`) can be different from the stored version (`D`). 351 | /// 352 | /// The type `D` is anything that can be serialized by [rkyv]. 353 | /// 354 | /// This returns a [Searcher] object and the number of indexed documents. 355 | pub fn index( 356 | &self, 357 | docs: I, 358 | path: P, 359 | db_size: usize, 360 | ) -> Result<(Searcher, u32), DbError> 361 | where 362 | S: AsRef, 363 | I: IntoIterator, 364 | D: Document, 365 | P: AsRef, 366 | { 367 | let path = path.as_ref(); 368 | let db = DB::truncate(path, db_size)?; 369 | let mut rwtxn = db.env.write_txn()?; 370 | 371 | let mut batch = Batch::new(); 372 | 373 | let batch_size = self.batch_size.unwrap_or(u32::MAX); 374 | let mut it = docs.into_iter(); 375 | 376 | let mut token_to_freq = GxHashMap::new(); 377 | let mut next_doc_id = 0; 378 | let mut mmap_size = 0; 379 | 380 | log::info!("Starting first batch"); 381 | // Index the first batch to generate the common tokens 382 | let b = std::time::Instant::now(); 383 | for (content, doc) in it.by_ref() { 384 | let doc_id = next_doc_id; 385 | next_doc_id += 1; 386 | 387 | batch.push(doc_id, content.as_ref(), doc, |token| { 388 | let (_, freq) = token_to_freq 389 | .raw_entry_mut() 390 | .from_key(token) 391 | .or_insert_with(|| (token.to_owned().into_boxed_str(), 0)); 392 | *freq += 1; 393 | }); 394 | 395 | if next_doc_id % batch_size == 0 { 396 | break; 397 | } 398 | } 399 | log::info!("First batch took {:?}", b.elapsed()); 400 | 401 | let common_tokens = self.generate_common_tokens(&token_to_freq); 402 | drop(token_to_freq); 403 | 404 | batch.flush(&db, &mut rwtxn, &common_tokens, &mut mmap_size)?; 405 | 406 | // Index the rest of the documents 407 | log::info!("Starting new batch"); 408 | let mut b = std::time::Instant::now(); 409 | for (content, doc) in it { 410 | let doc_id = next_doc_id; 411 | next_doc_id += 1; 412 | 413 | batch.push(doc_id, content.as_ref(), doc, |_| {}); 414 | 415 | if next_doc_id % batch_size == 0 { 416 | log::info!("Batch took {:?}", b.elapsed()); 417 | b = std::time::Instant::now(); 418 | batch.flush(&db, &mut rwtxn, &common_tokens, &mut mmap_size)?; 419 | log::info!("Starting new batch"); 420 | } 421 | } 422 | 423 | // Flush the last batch 424 | batch.flush(&db, &mut rwtxn, &common_tokens, &mut mmap_size)?; 425 | 426 | let number_of_distinct_tokens = batch.estimate_number_of_distinct_tokens(); 427 | log::debug!( 428 | "Approximation for the number of distinct tokens: {}", 429 | number_of_distinct_tokens 430 | ); 431 | 432 | // Write to db 433 | db.write_common_tokens(&mut rwtxn, &common_tokens)?; 434 | db.generate_mmap_file( 435 | number_of_distinct_tokens, 436 | mmap_size, 437 | batch.batch_id, 438 | &mut rwtxn, 439 | )?; 440 | 441 | let b = std::time::Instant::now(); 442 | log::info!("Commiting"); 443 | rwtxn.commit()?; 444 | log::info!("Commit took {:?}", b.elapsed()); 445 | 446 | let searcher = Searcher::new(path)?; 447 | Ok((searcher, next_doc_id)) 448 | } 449 | } 450 | -------------------------------------------------------------------------------- /src/roaringish.rs: -------------------------------------------------------------------------------- 1 | pub mod intersect; 2 | 3 | use intersect::{ 4 | Intersect, gallop_first::GallopIntersectFirst, gallop_second::GallopIntersectSecond, 5 | }; 6 | use rkyv::{Archive, Serialize, 
with::InlineAsBox}; 7 | use std::{ 8 | arch::x86_64::_mm256_mask_compressstoreu_epi32, 9 | fmt::{Binary, Debug, Display}, 10 | marker::PhantomData, 11 | mem::MaybeUninit, 12 | ops::Deref, 13 | simd::{LaneCount, Simd, SupportedLaneCount, cmp::SimdPartialEq, num::SimdUint}, 14 | sync::atomic::Ordering::Relaxed, 15 | }; 16 | 17 | use crate::Stats; 18 | use crate::{Intersection, allocator::Aligned64}; 19 | 20 | pub const MAX_VALUE: u32 = 16u32 * u16::MAX as u32; 21 | pub const ADD_ONE_GROUP: u64 = u16::MAX as u64 + 1; 22 | 23 | /// Group part of a position 24 | const fn group(val: u32) -> u16 { 25 | (val / 16) as u16 26 | } 27 | 28 | /// Value part of a position 29 | const fn value(val: u32) -> u16 { 30 | (val % 16) as u16 31 | } 32 | 33 | /// Group and value parts of a position 34 | const fn gv(val: u32) -> (u16, u16) { 35 | (group(val), value(val)) 36 | } 37 | 38 | /// Puts the document ID in the 32 MSBs of the packed representation 39 | const fn pack_doc_id(doc_id: u32) -> u64 { 40 | (doc_id as u64) << 32 41 | } 42 | 43 | /// Puts the group in the middle of the packed representation 44 | const fn pack_group(group: u16) -> u64 { 45 | (group as u64) << 16 46 | } 47 | 48 | /// Packs a value into its packed representation 49 | const fn pack_value(value: u16) -> u64 { 50 | 1 << value 51 | } 52 | 53 | /// Packs a document ID and group together (they should already be in their packed form) 54 | const fn pack_doc_id_group(packed_doc_id: u64, group: u16) -> u64 { 55 | packed_doc_id | pack_group(group) 56 | } 57 | 58 | /// Packs a document ID, group (they should already be in their packed form), 59 | /// also packs a value 60 | const fn pack(packed_doc_id: u64, group: u16, value: u16) -> u64 { 61 | pack_doc_id_group(packed_doc_id, group) | pack_value(value) 62 | } 63 | 64 | /// Clears the values part of the packed representation 65 | const fn clear_values(packed: u64) -> u64 { 66 | packed & !0xFFFF 67 | } 68 | 69 | /// Clears the group and values part of the packed representation 70 | const fn clear_group_values(packed: u64) -> u64 { 71 | packed & !0xFFFFFFFF 72 | } 73 | 74 | /// Clears the values part of the packed representation 75 | #[inline(always)] 76 | fn clear_values_simd<const N: usize>(packed: Simd<u64, N>) -> Simd<u64, N> 77 | where 78 | LaneCount<N>: SupportedLaneCount, 79 | { 80 | packed & Simd::splat(!0xFFFF) 81 | } 82 | 83 | /// Unpacks the document ID from the packed representation 84 | const fn unpack_doc_id(packed: u64) -> u32 { 85 | (packed >> 32) as u32 86 | } 87 | 88 | /// Unpacks the document ID from the packed representation 89 | #[allow(unused)] 90 | #[inline(always)] 91 | fn unpack_doc_id_simd<const N: usize>(packed: Simd<u64, N>) -> Simd<u32, N> 92 | where 93 | LaneCount<N>: SupportedLaneCount, 94 | { 95 | (packed >> Simd::splat(32)).cast() 96 | } 97 | 98 | /// Unpacks the group from the packed representation 99 | const fn unpack_group(packed: u64) -> u16 { 100 | (packed >> 16) as u16 101 | } 102 | 103 | /// Unpacks the values from the packed representation 104 | const fn unpack_values(packed: u64) -> u16 { 105 | packed as u16 106 | } 107 | 108 | /// Unpacks the values from the packed representation 109 | #[inline(always)] 110 | fn unpack_values_simd<const N: usize>(packed: Simd<u64, N>) -> Simd<u64, N> 111 | where 112 | LaneCount<N>: SupportedLaneCount, 113 | { 114 | packed & Simd::splat(0xFFFF) 115 | } 116 | 117 | /// Enum used to distinguish between owned and borrowed RoaringishPacked. 118 | /// Mainly used at the end of the indexing phase when we merge all of the 119 | /// batches together.
120 | pub enum RoaringishPackedKind<'a, A> { 121 | Owned(RoaringishPacked), 122 | Archived(&'a ArchivedBorrowRoaringishPacked<'a, A>), 123 | } 124 | 125 | impl<'a, A> RoaringishPackedKind<'a, A> { 126 | /// Bytes of the Roaringish Packed 127 | pub fn as_bytes(&self) -> &[u8] { 128 | match self { 129 | RoaringishPackedKind::Owned(packed) => unsafe { 130 | let (l, packed, r) = packed.0.align_to::<u8>(); 131 | assert!(l.is_empty()); 132 | assert!(r.is_empty()); 133 | packed 134 | }, 135 | RoaringishPackedKind::Archived(packed) => unsafe { 136 | let (l, packed, r) = packed.0.align_to::<u8>(); 137 | assert!(l.is_empty()); 138 | assert!(r.is_empty()); 139 | packed 140 | }, 141 | } 142 | } 143 | 144 | /// Concatenates two Roaringish Packed together 145 | pub fn concat<'b: 'a>(self, other: RoaringishPackedKind<'b, A>) -> RoaringishPackedKind<'b, A> { 146 | unsafe fn copy_data<T, U: Copy>(dest: &mut [MaybeUninit<T>], lhs: &[U], rhs: &[U]) { 147 | unsafe { 148 | let (l, buf, r) = dest.align_to_mut::<MaybeUninit<U>>(); 149 | assert!(l.is_empty()); 150 | assert!(r.is_empty()); 151 | 152 | let (l, lhs, r) = lhs.align_to::<MaybeUninit<U>>(); 153 | assert!(l.is_empty()); 154 | assert!(r.is_empty()); 155 | 156 | let (l, rhs, r) = rhs.align_to::<MaybeUninit<U>>(); 157 | assert!(l.is_empty()); 158 | assert!(r.is_empty()); 159 | 160 | buf[0..lhs.len()].copy_from_slice(lhs); 161 | buf[lhs.len()..].copy_from_slice(rhs); 162 | } 163 | } 164 | 165 | let r = match (self, other) { 166 | (RoaringishPackedKind::Owned(mut lhs), RoaringishPackedKind::Archived(rhs)) => { 167 | lhs.0.extend(rhs.0.iter().map(|v| v.to_native())); 168 | lhs 169 | } 170 | (RoaringishPackedKind::Archived(lhs), RoaringishPackedKind::Archived(rhs)) => { 171 | let n = lhs.0.len() + rhs.0.len(); 172 | let mut packed: Box<[MaybeUninit<u64>], _> = 173 | Box::new_uninit_slice_in(n, Aligned64::default()); 174 | 175 | unsafe { 176 | copy_data(&mut packed, &lhs.0, &rhs.0); 177 | let (p_packed, a0) = Box::into_raw_with_allocator(packed); 178 | RoaringishPacked(Vec::from_raw_parts_in(p_packed as *mut _, n, n, a0)) 179 | } 180 | } 181 | _ => panic!("This type of append should never happen"), 182 | }; 183 | RoaringishPackedKind::Owned(r) 184 | } 185 | } 186 | 187 | /// Main data structure used for phrase search. 188 | /// Here we store a compact representation of the 189 | /// document IDs and positions. 190 | /// 191 | /// The representation should be in the form: 192 | /// ```text 193 | /// document ID | group | values 194 | /// 32 bits | 16 bits | 16 bits 195 | /// ``` 196 | /// 197 | /// So the packed fits into 64 bits. 198 | /// 199 | /// The data structure should be ordered by the 200 | /// document ID and group. 201 | #[derive(PartialEq, Eq, Debug, Serialize, Archive)] 202 | #[repr(transparent)] 203 | pub struct RoaringishPacked(Vec<u64, Aligned64>); 204 | 205 | impl Deref for RoaringishPacked { 206 | type Target = Vec<u64, Aligned64>; 207 | 208 | fn deref(&self) -> &Self::Target { 209 | &self.0 210 | } 211 | } 212 | 213 | impl RoaringishPacked { 214 | /// Size occupied in bytes 215 | pub fn size_bytes(&self) -> usize { 216 | self.len() * std::mem::size_of::<u64>() 217 | } 218 | 219 | /// Adds a document with id `doc_id` and positions `pos` 220 | /// to the Roaringish Packed.
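/// For example, `push(5, &[0, 3, 18])` appends two packed words: `(5 << 32) | (0 << 16) | 0b1001` (positions 0 and 3 live in group 0) and `(5 << 32) | (1 << 16) | 0b0100` (position 18 is value 2 of group 1).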
221 | pub fn push(&mut self, doc_id: u32, pos: &[u32]) { 222 | let packed_doc_id = pack_doc_id(doc_id); 223 | 224 | let mut it = pos.iter().copied(); 225 | let Some(p) = it.next() else { 226 | return; 227 | }; 228 | 229 | self.0.reserve(pos.len()); 230 | 231 | unsafe { 232 | let (group, value) = gv(p); 233 | let packed = pack(packed_doc_id, group, value); 234 | 235 | self.0.push_within_capacity(packed).unwrap_unchecked(); 236 | } 237 | 238 | for p in it { 239 | let (group, value) = gv(p); 240 | let doc_id_group = pack_doc_id_group(packed_doc_id, group); 241 | let value = pack_value(value); 242 | let packed = doc_id_group | value; 243 | 244 | let last_doc_id_group = unsafe { clear_values(*self.0.last().unwrap_unchecked()) }; 245 | if last_doc_id_group == doc_id_group { 246 | unsafe { 247 | *self.0.last_mut().unwrap_unchecked() |= value; 248 | }; 249 | } else { 250 | unsafe { 251 | self.0.push_within_capacity(packed).unwrap_unchecked(); 252 | } 253 | } 254 | } 255 | } 256 | } 257 | 258 | impl Default for RoaringishPacked { 259 | fn default() -> Self { 260 | Self(Vec::new_in(Aligned64::default())) 261 | } 262 | } 263 | 264 | /// Type used to mark when the Roaringish Packed is aligned to 64 bytes 265 | #[derive(Clone, Copy, Debug)] 266 | pub struct Aligned; 267 | 268 | /// Type used to mark when the Roaringish Packed is unaligned 269 | #[derive(Clone, Copy, Debug)] 270 | pub struct Unaligned; 271 | 272 | /// Borrow version of the Roaringish Packed. Mainly used to 273 | /// interoperate with the Roaringish Packed retrieved from the DB. 274 | #[derive(Clone, Copy, Debug, Serialize, Archive)] 275 | pub struct BorrowRoaringishPacked<'a, A>(#[rkyv(with = InlineAsBox)] &'a [u64], PhantomData<A>); 276 | 277 | impl<A> Deref for BorrowRoaringishPacked<'_, A> { 278 | type Target = [u64]; 279 | 280 | fn deref(&self) -> &Self::Target { 281 | self.0 282 | } 283 | } 284 | 285 | impl<'a> BorrowRoaringishPacked<'a, Aligned> { 286 | /// Creates a new Roaringish Packed from 287 | /// the packed representation. 288 | /// 289 | /// Checks if it's aligned to 64 bytes. 290 | pub fn new_raw(packed: &'a [u64]) -> Self { 291 | assert!(packed.as_ptr().is_aligned_to(64)); 292 | Self(packed, PhantomData) 293 | } 294 | 295 | /// Creates a new Roaringish Packed from 296 | /// the packed representation.
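/// Unlike [Self::new_raw], no runtime alignment check is needed here: the `Aligned64` allocator already guarantees 64-byte alignment.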
297 | #[allow(clippy::ptr_arg)] 298 | pub fn new(packed: &'a Vec<u64, Aligned64>) -> Self { 299 | Self(packed, PhantomData) 300 | } 301 | 302 | #[inline(never)] 303 | pub fn intersect<I: Intersect>( 304 | self, 305 | mut rhs: Self, 306 | lhs_len: u32, 307 | stats: &Stats, 308 | ) -> RoaringishPacked { 309 | const FIRST_GALLOP_INTERSECT: usize = 650; 310 | const SECOND_GALLOP_INTERSECT: usize = 120; 311 | 312 | #[inline(always)] 313 | fn binary_search( 314 | lhs: &mut BorrowRoaringishPacked<'_, Aligned>, 315 | rhs: &mut BorrowRoaringishPacked<'_, Aligned>, 316 | ) { 317 | // skip the beginning of the slice 318 | let Some(first_lhs) = lhs.0.first() else { 319 | return; 320 | }; 321 | 322 | let Some(first_rhs) = rhs.0.first() else { 323 | return; 324 | }; 325 | 326 | let first_lhs = clear_group_values(*first_lhs); 327 | let first_rhs = clear_group_values(*first_rhs); 328 | 329 | match first_lhs.cmp(&first_rhs) { 330 | std::cmp::Ordering::Less => { 331 | let i = match lhs.0.binary_search_by_key(&first_rhs, |p| clear_values(*p)) { 332 | Ok(i) => i, 333 | Err(i) => i, 334 | }; 335 | let aligned_i = i / 8 * 8; 336 | *lhs = BorrowRoaringishPacked::new_raw(&lhs.0[aligned_i..]); 337 | } 338 | std::cmp::Ordering::Greater => { 339 | let i = match rhs.0.binary_search_by_key(&first_lhs, |p| clear_values(*p)) { 340 | Ok(i) => i, 341 | Err(i) => i, 342 | }; 343 | let aligned_i = i / 8 * 8; 344 | *rhs = BorrowRoaringishPacked::new_raw(&rhs.0[aligned_i..]); 345 | } 346 | std::cmp::Ordering::Equal => {} 347 | } 348 | } 349 | 350 | let mut lhs = self; 351 | 352 | if lhs.0.is_empty() || rhs.0.is_empty() { 353 | return RoaringishPacked::default(); 354 | } 355 | 356 | let b = std::time::Instant::now(); 357 | binary_search(&mut lhs, &mut rhs); 358 | stats 359 | .first_binary_search 360 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 361 | 362 | let b = std::time::Instant::now(); 363 | // this division can't fail, we just checked that both slices are non-empty 364 | let proportion = lhs.len().max(rhs.len()) / lhs.len().min(rhs.len()); 365 | if proportion >= FIRST_GALLOP_INTERSECT { 366 | let (packed, _) = GallopIntersectFirst::intersect::<true>(lhs, rhs, lhs_len, stats); 367 | let (msb_packed, _) = 368 | GallopIntersectFirst::intersect::<false>(lhs, rhs, lhs_len, stats); 369 | stats 370 | .first_intersect 371 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 372 | 373 | return Self::merge_results(packed, msb_packed, stats); 374 | } 375 | let (packed, msb_packed) = I::intersect::<true>(lhs, rhs, lhs_len, stats); 376 | stats 377 | .first_intersect 378 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 379 | 380 | let mut msb_packed = BorrowRoaringishPacked::new(&msb_packed); 381 | 382 | let b = std::time::Instant::now(); 383 | binary_search(&mut msb_packed, &mut rhs); 384 | stats 385 | .second_binary_search 386 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 387 | 388 | let b = std::time::Instant::now(); 389 | let proportion = msb_packed 390 | .len() 391 | .max(rhs.len()) 392 | .checked_div(msb_packed.len().min(rhs.len())); 393 | let (msb_packed, _) = match proportion { 394 | Some(proportion) => { 395 | if proportion >= SECOND_GALLOP_INTERSECT { 396 | GallopIntersectSecond::intersect::<false>(msb_packed, rhs, lhs_len, stats) 397 | } else { 398 | I::intersect::<false>(msb_packed, rhs, lhs_len, stats) 399 | } 400 | } 401 | None => I::intersect::<false>(msb_packed, rhs, lhs_len, stats), 402 | }; 403 | stats 404 | .second_intersect 405 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 406 | 407 | Self::merge_results(packed, msb_packed, stats) 408 | } 409 | 410 | /// Merges the results of the first and
second phase of the intersection. 411 | /// 412 | /// This function needs to be inline always, for some reason not inlining this 413 | /// function makes some queries' performance unpredictable 414 | #[inline(always)] 415 | fn merge_results( 416 | packed: Vec<u64, Aligned64>, 417 | msb_packed: Vec<u64, Aligned64>, 418 | stats: &Stats, 419 | ) -> RoaringishPacked { 420 | let b = std::time::Instant::now(); 421 | let capacity = packed.len() + msb_packed.len(); 422 | let mut r_packed = Box::new_uninit_slice_in(capacity, Aligned64::default()); 423 | let mut r_i = 0; 424 | let mut j = 0; 425 | // let's use the fact that the first element in `packed` is smaller than or 426 | // equal to the first element in `msb_packed` 427 | for pack in packed.iter().copied() { 428 | unsafe { 429 | let doc_id_group = clear_values(pack); 430 | let values = unpack_values(pack); 431 | 432 | // write from `msb_packed` until it's no longer smaller than the current `doc_id_group` 433 | while j < msb_packed.len() { 434 | let msb_pack = *msb_packed.get_unchecked(j); 435 | let msb_doc_id_group = clear_values(msb_pack); 436 | let msb_values = unpack_values(msb_pack); 437 | j += 1; 438 | 439 | if msb_doc_id_group >= doc_id_group { 440 | j -= 1; 441 | break; 442 | } 443 | 444 | if msb_values > 0 { 445 | r_packed.get_unchecked_mut(r_i).write(msb_pack); 446 | r_i += 1; 447 | } 448 | } 449 | 450 | // check to avoid writing elements where their values are 0 451 | let write = values > 0; 452 | if write { 453 | r_packed.get_unchecked_mut(r_i).write(pack); 454 | r_i += 1; 455 | } 456 | 457 | // avoids out of bounds read 458 | if j >= msb_packed.len() { 459 | continue; 460 | } 461 | 462 | // write the element from `msb_packed` that made the loop break 463 | // only if it's equal to the current `doc_id_group` 464 | let msb_pack = *msb_packed.get_unchecked(j); 465 | let msb_doc_id_group = clear_values(msb_pack); 466 | let msb_values = unpack_values(msb_pack); 467 | j += 1; 468 | if msb_doc_id_group != doc_id_group { 469 | j -= 1; 470 | continue; 471 | } 472 | 473 | if write { 474 | // in this case at least one bit was set in the intersection, 475 | // so we can just `or` the new value with the previous one 476 | let r = r_packed.get_unchecked_mut(r_i - 1).assume_init_mut(); 477 | *r |= msb_values as u64; 478 | } else if msb_values > 0 { 479 | // in this case no bit was set in the intersection, 480 | // so write as if it was new 481 | r_packed.get_unchecked_mut(r_i).write(msb_pack); 482 | r_i += 1; 483 | } 484 | } 485 | } 486 | stats 487 | .merge_phases_first_pass 488 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 489 | 490 | // finish the rest of the elements in `msb_packed` 491 | let b = std::time::Instant::now(); 492 | for msb_pack in msb_packed.iter().skip(j).copied() { 493 | unsafe { 494 | let msb_values = unpack_values(msb_pack); 495 | if msb_values > 0 { 496 | r_packed.get_unchecked_mut(r_i).write(msb_pack); 497 | r_i += 1; 498 | } 499 | } 500 | } 501 | stats 502 | .merge_phases_second_pass 503 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 504 | 505 | unsafe { 506 | let (p_packed, a0) = Box::into_raw_with_allocator(r_packed); 507 | let packed = Vec::from_raw_parts_in(p_packed as *mut _, r_i, capacity, a0); 508 | RoaringishPacked(packed) 509 | } 510 | } 511 | } 512 | 513 | impl<A> BorrowRoaringishPacked<'_, A> { 514 | /// Gets the distinct document IDs from the Roaringish Packed.
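/// For example, packed entries for `(doc 1, group 0)`, `(doc 1, group 2)` and `(doc 2, group 0)` yield `[1, 2]`; this relies on the packed data being sorted by document ID.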
515 | #[cfg(not(target_feature = "avx512f"))] 516 | #[inline(always)] 517 | pub fn get_doc_ids(&self, stats: &Stats) -> Vec<u32> { 518 | if self.0.is_empty() { 519 | return Vec::new(); 520 | } 521 | 522 | if self.0.len() == 1 { 523 | return vec![unpack_doc_id(self.0[0])]; 524 | } 525 | 526 | let b = std::time::Instant::now(); 527 | 528 | let mut doc_ids: Box<[MaybeUninit<u32>]> = Box::new_uninit_slice(self.0.len()); 529 | let mut i = 0; 530 | 531 | for [packed0, packed1] in self.0.array_windows::<2>() { 532 | let doc_id0 = unpack_doc_id(*packed0); 533 | let doc_id1 = unpack_doc_id(*packed1); 534 | if doc_id0 != doc_id1 { 535 | unsafe { doc_ids.get_unchecked_mut(i).write(doc_id0) }; 536 | i += 1; 537 | } 538 | } 539 | 540 | unsafe { 541 | doc_ids 542 | .get_unchecked_mut(i) 543 | .write(unpack_doc_id(*self.0.last().unwrap_unchecked())) 544 | }; 545 | i += 1; 546 | 547 | stats 548 | .get_doc_ids 549 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 550 | 551 | unsafe { Vec::from_raw_parts(Box::into_raw(doc_ids) as *mut _, i, self.0.len()) } 552 | } 553 | 554 | /// Gets the distinct document IDs from the Roaringish Packed. 555 | #[cfg(target_feature = "avx512f")] 556 | #[inline(always)] 557 | pub fn get_doc_ids(&self, stats: &Stats) -> Vec<u32> { 558 | if self.0.is_empty() { 559 | return Vec::new(); 560 | } 561 | 562 | if self.0.len() == 1 { 563 | return vec![unpack_doc_id(self.0[0])]; 564 | } 565 | 566 | let b = std::time::Instant::now(); 567 | 568 | let mut doc_ids: Box<[MaybeUninit<u32>]> = Box::new_uninit_slice(self.0.len()); 569 | let mut i = 0; 570 | 571 | unsafe { doc_ids.get_unchecked_mut(i).write(unpack_doc_id(self.0[0])) }; 572 | i += 1; 573 | 574 | let mut last_doc_id = unpack_doc_id(self.0[0]); 575 | let (l, m, r) = self.0.as_simd::<8>(); 576 | assert!(l.is_empty()); 577 | for packed in m { 578 | let doc_id = unpack_doc_id_simd(*packed); 579 | let rot = doc_id.rotate_elements_right::<1>(); 580 | let first = doc_id.as_array()[0]; 581 | let last = doc_id.as_array()[7]; 582 | 583 | let include_first = (first != last_doc_id) as u8; 584 | let mask = (doc_id.simd_ne(rot).to_bitmask() as u8 & !1) | include_first; 585 | 586 | unsafe { 587 | // TODO: avoid compressstore on zen4 588 | _mm256_mask_compressstoreu_epi32( 589 | doc_ids.as_mut_ptr().add(i) as *mut _, 590 | mask, 591 | doc_id.into(), 592 | ); 593 | } 594 | i += mask.count_ones() as usize; 595 | last_doc_id = last; 596 | } 597 | 598 | let j = r 599 | .iter() 600 | .take_while(|packed| unpack_doc_id(**packed) == last_doc_id) 601 | .count(); 602 | let r = &r[j..]; 603 | for [packed0, packed1] in r.array_windows::<2>() { 604 | let doc_id0 = unpack_doc_id(*packed0); 605 | let doc_id1 = unpack_doc_id(*packed1); 606 | if doc_id0 != doc_id1 { 607 | unsafe { doc_ids.get_unchecked_mut(i).write(doc_id0) }; 608 | i += 1; 609 | } 610 | } 611 | 612 | if !r.is_empty() { 613 | unsafe { 614 | doc_ids 615 | .get_unchecked_mut(i) 616 | .write(unpack_doc_id(*r.last().unwrap_unchecked())) 617 | }; 618 | i += 1; 619 | } 620 | 621 | stats 622 | .get_doc_ids 623 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 624 | 625 | unsafe { Vec::from_raw_parts(Box::into_raw(doc_ids) as *mut _, i, self.0.len()) } 626 | } 627 | } 628 | 629 | impl Binary for RoaringishPacked { 630 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 631 | let mut list = f.debug_list(); 632 | for packed in self.0.iter() { 633 | list.entry_with(|f| { 634 | let doc_id = unpack_doc_id(*packed); 635 | let group = unpack_group(*packed); 636 | let values = unpack_values(*packed);
637 | f.write_fmt(format_args!("{doc_id:032b} {group:016b} {values:016b}")) 638 | }); 639 | } 640 | 641 | list.finish() 642 | } 643 | } 644 | 645 | impl<A> Binary for BorrowRoaringishPacked<'_, A> { 646 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 647 | let mut list = f.debug_list(); 648 | for packed in self.0.iter() { 649 | list.entry_with(|f| { 650 | let doc_id = unpack_doc_id(*packed); 651 | let group = unpack_group(*packed); 652 | let values = unpack_values(*packed); 653 | f.write_fmt(format_args!("{doc_id:032b} {group:016b} {values:016b}")) 654 | }); 655 | } 656 | 657 | list.finish() 658 | } 659 | } 660 | 661 | impl Display for RoaringishPacked { 662 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 663 | let it = self.0.iter().flat_map(|packed| { 664 | let doc_id = unpack_doc_id(*packed); 665 | let group = unpack_group(*packed) as u32; 666 | let values = unpack_values(*packed); 667 | let s = group * 16; 668 | (0..16u32).filter_map(move |i| ((values >> i) & 1 == 1).then_some((doc_id, s + i))) 669 | }); 670 | f.debug_list().entries(it).finish() 671 | } 672 | } 673 | 674 | impl<A> Display for BorrowRoaringishPacked<'_, A> { 675 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 676 | let it = self.0.iter().flat_map(|packed| { 677 | let doc_id = unpack_doc_id(*packed); 678 | let group = unpack_group(*packed) as u32; 679 | let values = unpack_values(*packed); 680 | let s = group * 16; 681 | (0..16u32).filter_map(move |i| ((values >> i) & 1 == 1).then_some((doc_id, s + i))) 682 | }); 683 | f.debug_list().entries(it).finish() 684 | } 685 | } 686 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing.
3 | version = 4 4 | 5 | [[package]] 6 | name = "bincode" 7 | version = "1.3.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" 10 | dependencies = [ 11 | "serde", 12 | ] 13 | 14 | [[package]] 15 | name = "bitflags" 16 | version = "2.9.0" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" 19 | dependencies = [ 20 | "serde", 21 | ] 22 | 23 | [[package]] 24 | name = "bumpalo" 25 | version = "3.17.0" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" 28 | 29 | [[package]] 30 | name = "bytecheck" 31 | version = "0.8.1" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "50690fb3370fb9fe3550372746084c46f2ac8c9685c583d2be10eefd89d3d1a3" 34 | dependencies = [ 35 | "bytecheck_derive", 36 | "ptr_meta", 37 | "rancor", 38 | "simdutf8", 39 | ] 40 | 41 | [[package]] 42 | name = "bytecheck_derive" 43 | version = "0.8.1" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "efb7846e0cb180355c2dec69e721edafa36919850f1a9f52ffba4ebc0393cb71" 46 | dependencies = [ 47 | "proc-macro2", 48 | "quote", 49 | "syn", 50 | ] 51 | 52 | [[package]] 53 | name = "byteorder" 54 | version = "1.5.0" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 57 | 58 | [[package]] 59 | name = "bytes" 60 | version = "1.10.0" 61 | source = "registry+https://github.com/rust-lang/crates.io-index" 62 | checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" 63 | 64 | [[package]] 65 | name = "cc" 66 | version = "1.2.16" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" 69 | dependencies = [ 70 | "shlex", 71 | ] 72 | 73 | [[package]] 74 | name = "crossbeam-queue" 75 | version = "0.3.12" 76 | source = "registry+https://github.com/rust-lang/crates.io-index" 77 | checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" 78 | dependencies = [ 79 | "crossbeam-utils", 80 | ] 81 | 82 | [[package]] 83 | name = "crossbeam-utils" 84 | version = "0.8.21" 85 | source = "registry+https://github.com/rust-lang/crates.io-index" 86 | checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" 87 | 88 | [[package]] 89 | name = "displaydoc" 90 | version = "0.2.5" 91 | source = "registry+https://github.com/rust-lang/crates.io-index" 92 | checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" 93 | dependencies = [ 94 | "proc-macro2", 95 | "quote", 96 | "syn", 97 | ] 98 | 99 | [[package]] 100 | name = "doxygen-rs" 101 | version = "0.4.2" 102 | source = "registry+https://github.com/rust-lang/crates.io-index" 103 | checksum = "415b6ec780d34dcf624666747194393603d0373b7141eef01d12ee58881507d9" 104 | dependencies = [ 105 | "phf", 106 | ] 107 | 108 | [[package]] 109 | name = "equivalent" 110 | version = "1.0.2" 111 | source = "registry+https://github.com/rust-lang/crates.io-index" 112 | checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" 113 | 114 | [[package]] 115 | name = "form_urlencoded" 116 | version = "1.2.1" 117 | source = "registry+https://github.com/rust-lang/crates.io-index" 118 | 
checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" 119 | dependencies = [ 120 | "percent-encoding", 121 | ] 122 | 123 | [[package]] 124 | name = "fxhash" 125 | version = "0.2.1" 126 | source = "registry+https://github.com/rust-lang/crates.io-index" 127 | checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" 128 | dependencies = [ 129 | "byteorder", 130 | ] 131 | 132 | [[package]] 133 | name = "gxhash" 134 | version = "3.4.1" 135 | source = "registry+https://github.com/rust-lang/crates.io-index" 136 | checksum = "a197c9b654827513cf53842c5c6d3da2b4b35a785f8e0eff78bdf8e445aba1bb" 137 | dependencies = [ 138 | "rustversion", 139 | ] 140 | 141 | [[package]] 142 | name = "hashbrown" 143 | version = "0.15.2" 144 | source = "registry+https://github.com/rust-lang/crates.io-index" 145 | checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" 146 | 147 | [[package]] 148 | name = "heed" 149 | version = "0.21.0" 150 | source = "registry+https://github.com/rust-lang/crates.io-index" 151 | checksum = "bd54745cfacb7b97dee45e8fdb91814b62bccddb481debb7de0f9ee6b7bf5b43" 152 | dependencies = [ 153 | "bitflags", 154 | "byteorder", 155 | "heed-traits", 156 | "heed-types", 157 | "libc", 158 | "lmdb-master-sys", 159 | "once_cell", 160 | "page_size", 161 | "serde", 162 | "synchronoise", 163 | "url", 164 | ] 165 | 166 | [[package]] 167 | name = "heed-traits" 168 | version = "0.20.0" 169 | source = "registry+https://github.com/rust-lang/crates.io-index" 170 | checksum = "eb3130048d404c57ce5a1ac61a903696e8fcde7e8c2991e9fcfc1f27c3ef74ff" 171 | 172 | [[package]] 173 | name = "heed-types" 174 | version = "0.21.0" 175 | source = "registry+https://github.com/rust-lang/crates.io-index" 176 | checksum = "13c255bdf46e07fb840d120a36dcc81f385140d7191c76a7391672675c01a55d" 177 | dependencies = [ 178 | "bincode", 179 | "byteorder", 180 | "heed-traits", 181 | "serde", 182 | "serde_json", 183 | ] 184 | 185 | [[package]] 186 | name = "hyperloglogplus" 187 | version = "0.4.1" 188 | source = "registry+https://github.com/rust-lang/crates.io-index" 189 | checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3" 190 | dependencies = [ 191 | "serde", 192 | ] 193 | 194 | [[package]] 195 | name = "icu_collections" 196 | version = "1.5.0" 197 | source = "registry+https://github.com/rust-lang/crates.io-index" 198 | checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" 199 | dependencies = [ 200 | "displaydoc", 201 | "yoke", 202 | "zerofrom", 203 | "zerovec", 204 | ] 205 | 206 | [[package]] 207 | name = "icu_locid" 208 | version = "1.5.0" 209 | source = "registry+https://github.com/rust-lang/crates.io-index" 210 | checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" 211 | dependencies = [ 212 | "displaydoc", 213 | "litemap", 214 | "tinystr", 215 | "writeable", 216 | "zerovec", 217 | ] 218 | 219 | [[package]] 220 | name = "icu_locid_transform" 221 | version = "1.5.0" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" 224 | dependencies = [ 225 | "displaydoc", 226 | "icu_locid", 227 | "icu_locid_transform_data", 228 | "icu_provider", 229 | "tinystr", 230 | "zerovec", 231 | ] 232 | 233 | [[package]] 234 | name = "icu_locid_transform_data" 235 | version = "1.5.0" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 237 | checksum = 
"fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" 238 | 239 | [[package]] 240 | name = "icu_normalizer" 241 | version = "1.5.0" 242 | source = "registry+https://github.com/rust-lang/crates.io-index" 243 | checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" 244 | dependencies = [ 245 | "displaydoc", 246 | "icu_collections", 247 | "icu_normalizer_data", 248 | "icu_properties", 249 | "icu_provider", 250 | "smallvec", 251 | "utf16_iter", 252 | "utf8_iter", 253 | "write16", 254 | "zerovec", 255 | ] 256 | 257 | [[package]] 258 | name = "icu_normalizer_data" 259 | version = "1.5.0" 260 | source = "registry+https://github.com/rust-lang/crates.io-index" 261 | checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" 262 | 263 | [[package]] 264 | name = "icu_properties" 265 | version = "1.5.1" 266 | source = "registry+https://github.com/rust-lang/crates.io-index" 267 | checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" 268 | dependencies = [ 269 | "displaydoc", 270 | "icu_collections", 271 | "icu_locid_transform", 272 | "icu_properties_data", 273 | "icu_provider", 274 | "tinystr", 275 | "zerovec", 276 | ] 277 | 278 | [[package]] 279 | name = "icu_properties_data" 280 | version = "1.5.0" 281 | source = "registry+https://github.com/rust-lang/crates.io-index" 282 | checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" 283 | 284 | [[package]] 285 | name = "icu_provider" 286 | version = "1.5.0" 287 | source = "registry+https://github.com/rust-lang/crates.io-index" 288 | checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" 289 | dependencies = [ 290 | "displaydoc", 291 | "icu_locid", 292 | "icu_provider_macros", 293 | "stable_deref_trait", 294 | "tinystr", 295 | "writeable", 296 | "yoke", 297 | "zerofrom", 298 | "zerovec", 299 | ] 300 | 301 | [[package]] 302 | name = "icu_provider_macros" 303 | version = "1.5.0" 304 | source = "registry+https://github.com/rust-lang/crates.io-index" 305 | checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" 306 | dependencies = [ 307 | "proc-macro2", 308 | "quote", 309 | "syn", 310 | ] 311 | 312 | [[package]] 313 | name = "idna" 314 | version = "1.0.3" 315 | source = "registry+https://github.com/rust-lang/crates.io-index" 316 | checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" 317 | dependencies = [ 318 | "idna_adapter", 319 | "smallvec", 320 | "utf8_iter", 321 | ] 322 | 323 | [[package]] 324 | name = "idna_adapter" 325 | version = "1.2.0" 326 | source = "registry+https://github.com/rust-lang/crates.io-index" 327 | checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" 328 | dependencies = [ 329 | "icu_normalizer", 330 | "icu_properties", 331 | ] 332 | 333 | [[package]] 334 | name = "indexmap" 335 | version = "2.7.1" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" 338 | dependencies = [ 339 | "equivalent", 340 | "hashbrown", 341 | ] 342 | 343 | [[package]] 344 | name = "itoa" 345 | version = "1.0.15" 346 | source = "registry+https://github.com/rust-lang/crates.io-index" 347 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 348 | 349 | [[package]] 350 | name = "libc" 351 | version = "0.2.170" 352 | source = "registry+https://github.com/rust-lang/crates.io-index" 353 | checksum = 
"875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" 354 | 355 | [[package]] 356 | name = "litemap" 357 | version = "0.7.5" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856" 360 | 361 | [[package]] 362 | name = "lmdb-master-sys" 363 | version = "0.2.4" 364 | source = "registry+https://github.com/rust-lang/crates.io-index" 365 | checksum = "472c3760e2a8d0f61f322fb36788021bb36d573c502b50fa3e2bcaac3ec326c9" 366 | dependencies = [ 367 | "cc", 368 | "doxygen-rs", 369 | "libc", 370 | ] 371 | 372 | [[package]] 373 | name = "log" 374 | version = "0.4.26" 375 | source = "registry+https://github.com/rust-lang/crates.io-index" 376 | checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" 377 | 378 | [[package]] 379 | name = "memchr" 380 | version = "2.7.4" 381 | source = "registry+https://github.com/rust-lang/crates.io-index" 382 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 383 | 384 | [[package]] 385 | name = "memmap2" 386 | version = "0.9.5" 387 | source = "registry+https://github.com/rust-lang/crates.io-index" 388 | checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" 389 | dependencies = [ 390 | "libc", 391 | ] 392 | 393 | [[package]] 394 | name = "munge" 395 | version = "0.4.3" 396 | source = "registry+https://github.com/rust-lang/crates.io-index" 397 | checksum = "a0091202c98cf06da46c279fdf50cccb6b1c43b4521abdf6a27b4c7e71d5d9d7" 398 | dependencies = [ 399 | "munge_macro", 400 | ] 401 | 402 | [[package]] 403 | name = "munge_macro" 404 | version = "0.4.3" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "734799cf91479720b2f970c61a22850940dd91e27d4f02b1c6fc792778df2459" 407 | dependencies = [ 408 | "proc-macro2", 409 | "quote", 410 | "syn", 411 | ] 412 | 413 | [[package]] 414 | name = "once_cell" 415 | version = "1.20.3" 416 | source = "registry+https://github.com/rust-lang/crates.io-index" 417 | checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" 418 | 419 | [[package]] 420 | name = "page_size" 421 | version = "0.6.0" 422 | source = "registry+https://github.com/rust-lang/crates.io-index" 423 | checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da" 424 | dependencies = [ 425 | "libc", 426 | "winapi", 427 | ] 428 | 429 | [[package]] 430 | name = "percent-encoding" 431 | version = "2.3.1" 432 | source = "registry+https://github.com/rust-lang/crates.io-index" 433 | checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" 434 | 435 | [[package]] 436 | name = "phf" 437 | version = "0.11.3" 438 | source = "registry+https://github.com/rust-lang/crates.io-index" 439 | checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" 440 | dependencies = [ 441 | "phf_macros", 442 | "phf_shared", 443 | ] 444 | 445 | [[package]] 446 | name = "phf_generator" 447 | version = "0.11.3" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" 450 | dependencies = [ 451 | "phf_shared", 452 | "rand", 453 | ] 454 | 455 | [[package]] 456 | name = "phf_macros" 457 | version = "0.11.3" 458 | source = "registry+https://github.com/rust-lang/crates.io-index" 459 | checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" 460 | dependencies = [ 461 | "phf_generator", 462 | "phf_shared", 463 
| "proc-macro2", 464 | "quote", 465 | "syn", 466 | ] 467 | 468 | [[package]] 469 | name = "phf_shared" 470 | version = "0.11.3" 471 | source = "registry+https://github.com/rust-lang/crates.io-index" 472 | checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" 473 | dependencies = [ 474 | "siphasher", 475 | ] 476 | 477 | [[package]] 478 | name = "proc-macro2" 479 | version = "1.0.94" 480 | source = "registry+https://github.com/rust-lang/crates.io-index" 481 | checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" 482 | dependencies = [ 483 | "unicode-ident", 484 | ] 485 | 486 | [[package]] 487 | name = "ptr_meta" 488 | version = "0.3.0" 489 | source = "registry+https://github.com/rust-lang/crates.io-index" 490 | checksum = "fe9e76f66d3f9606f44e45598d155cb13ecf09f4a28199e48daf8c8fc937ea90" 491 | dependencies = [ 492 | "ptr_meta_derive", 493 | ] 494 | 495 | [[package]] 496 | name = "ptr_meta_derive" 497 | version = "0.3.0" 498 | source = "registry+https://github.com/rust-lang/crates.io-index" 499 | checksum = "ca414edb151b4c8d125c12566ab0d74dc9cdba36fb80eb7b848c15f495fd32d1" 500 | dependencies = [ 501 | "proc-macro2", 502 | "quote", 503 | "syn", 504 | ] 505 | 506 | [[package]] 507 | name = "quote" 508 | version = "1.0.39" 509 | source = "registry+https://github.com/rust-lang/crates.io-index" 510 | checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" 511 | dependencies = [ 512 | "proc-macro2", 513 | ] 514 | 515 | [[package]] 516 | name = "rancor" 517 | version = "0.1.0" 518 | source = "registry+https://github.com/rust-lang/crates.io-index" 519 | checksum = "caf5f7161924b9d1cea0e4cabc97c372cea92b5f927fc13c6bca67157a0ad947" 520 | dependencies = [ 521 | "ptr_meta", 522 | ] 523 | 524 | [[package]] 525 | name = "rand" 526 | version = "0.8.5" 527 | source = "registry+https://github.com/rust-lang/crates.io-index" 528 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 529 | dependencies = [ 530 | "rand_core", 531 | ] 532 | 533 | [[package]] 534 | name = "rand_core" 535 | version = "0.6.4" 536 | source = "registry+https://github.com/rust-lang/crates.io-index" 537 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 538 | 539 | [[package]] 540 | name = "rend" 541 | version = "0.5.2" 542 | source = "registry+https://github.com/rust-lang/crates.io-index" 543 | checksum = "a35e8a6bf28cd121053a66aa2e6a2e3eaffad4a60012179f0e864aa5ffeff215" 544 | dependencies = [ 545 | "bytecheck", 546 | ] 547 | 548 | [[package]] 549 | name = "rkyv" 550 | version = "0.8.10" 551 | source = "registry+https://github.com/rust-lang/crates.io-index" 552 | checksum = "1e147371c75553e1e2fcdb483944a8540b8438c31426279553b9a8182a9b7b65" 553 | dependencies = [ 554 | "bytecheck", 555 | "bytes", 556 | "hashbrown", 557 | "indexmap", 558 | "munge", 559 | "ptr_meta", 560 | "rancor", 561 | "rend", 562 | "rkyv_derive", 563 | "tinyvec", 564 | "uuid", 565 | ] 566 | 567 | [[package]] 568 | name = "rkyv_derive" 569 | version = "0.8.10" 570 | source = "registry+https://github.com/rust-lang/crates.io-index" 571 | checksum = "246b40ac189af6c675d124b802e8ef6d5246c53e17367ce9501f8f66a81abb7a" 572 | dependencies = [ 573 | "proc-macro2", 574 | "quote", 575 | "syn", 576 | ] 577 | 578 | [[package]] 579 | name = "rustversion" 580 | version = "1.0.20" 581 | source = "registry+https://github.com/rust-lang/crates.io-index" 582 | checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" 583 | 584 | [[package]] 585 | 
name = "ryu" 586 | version = "1.0.20" 587 | source = "registry+https://github.com/rust-lang/crates.io-index" 588 | checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 589 | 590 | [[package]] 591 | name = "serde" 592 | version = "1.0.218" 593 | source = "registry+https://github.com/rust-lang/crates.io-index" 594 | checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" 595 | dependencies = [ 596 | "serde_derive", 597 | ] 598 | 599 | [[package]] 600 | name = "serde_derive" 601 | version = "1.0.218" 602 | source = "registry+https://github.com/rust-lang/crates.io-index" 603 | checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" 604 | dependencies = [ 605 | "proc-macro2", 606 | "quote", 607 | "syn", 608 | ] 609 | 610 | [[package]] 611 | name = "serde_json" 612 | version = "1.0.140" 613 | source = "registry+https://github.com/rust-lang/crates.io-index" 614 | checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" 615 | dependencies = [ 616 | "itoa", 617 | "memchr", 618 | "ryu", 619 | "serde", 620 | ] 621 | 622 | [[package]] 623 | name = "shlex" 624 | version = "1.3.0" 625 | source = "registry+https://github.com/rust-lang/crates.io-index" 626 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 627 | 628 | [[package]] 629 | name = "simdphrase" 630 | version = "0.1.1" 631 | dependencies = [ 632 | "bumpalo", 633 | "fxhash", 634 | "gxhash", 635 | "heed", 636 | "hyperloglogplus", 637 | "log", 638 | "memmap2", 639 | "rkyv", 640 | "thiserror", 641 | "unicode-segmentation", 642 | ] 643 | 644 | [[package]] 645 | name = "simdutf8" 646 | version = "0.1.5" 647 | source = "registry+https://github.com/rust-lang/crates.io-index" 648 | checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" 649 | 650 | [[package]] 651 | name = "siphasher" 652 | version = "1.0.1" 653 | source = "registry+https://github.com/rust-lang/crates.io-index" 654 | checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" 655 | 656 | [[package]] 657 | name = "smallvec" 658 | version = "1.14.0" 659 | source = "registry+https://github.com/rust-lang/crates.io-index" 660 | checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" 661 | 662 | [[package]] 663 | name = "stable_deref_trait" 664 | version = "1.2.0" 665 | source = "registry+https://github.com/rust-lang/crates.io-index" 666 | checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" 667 | 668 | [[package]] 669 | name = "syn" 670 | version = "2.0.99" 671 | source = "registry+https://github.com/rust-lang/crates.io-index" 672 | checksum = "e02e925281e18ffd9d640e234264753c43edc62d64b2d4cf898f1bc5e75f3fc2" 673 | dependencies = [ 674 | "proc-macro2", 675 | "quote", 676 | "unicode-ident", 677 | ] 678 | 679 | [[package]] 680 | name = "synchronoise" 681 | version = "1.0.1" 682 | source = "registry+https://github.com/rust-lang/crates.io-index" 683 | checksum = "3dbc01390fc626ce8d1cffe3376ded2b72a11bb70e1c75f404a210e4daa4def2" 684 | dependencies = [ 685 | "crossbeam-queue", 686 | ] 687 | 688 | [[package]] 689 | name = "synstructure" 690 | version = "0.13.1" 691 | source = "registry+https://github.com/rust-lang/crates.io-index" 692 | checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" 693 | dependencies = [ 694 | "proc-macro2", 695 | "quote", 696 | "syn", 697 | ] 698 | 699 | [[package]] 700 | name = "thiserror" 701 | version = "2.0.12" 702 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 703 | checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" 704 | dependencies = [ 705 | "thiserror-impl", 706 | ] 707 | 708 | [[package]] 709 | name = "thiserror-impl" 710 | version = "2.0.12" 711 | source = "registry+https://github.com/rust-lang/crates.io-index" 712 | checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" 713 | dependencies = [ 714 | "proc-macro2", 715 | "quote", 716 | "syn", 717 | ] 718 | 719 | [[package]] 720 | name = "tinystr" 721 | version = "0.7.6" 722 | source = "registry+https://github.com/rust-lang/crates.io-index" 723 | checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" 724 | dependencies = [ 725 | "displaydoc", 726 | "zerovec", 727 | ] 728 | 729 | [[package]] 730 | name = "tinyvec" 731 | version = "1.9.0" 732 | source = "registry+https://github.com/rust-lang/crates.io-index" 733 | checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71" 734 | dependencies = [ 735 | "tinyvec_macros", 736 | ] 737 | 738 | [[package]] 739 | name = "tinyvec_macros" 740 | version = "0.1.1" 741 | source = "registry+https://github.com/rust-lang/crates.io-index" 742 | checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" 743 | 744 | [[package]] 745 | name = "unicode-ident" 746 | version = "1.0.18" 747 | source = "registry+https://github.com/rust-lang/crates.io-index" 748 | checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" 749 | 750 | [[package]] 751 | name = "unicode-segmentation" 752 | version = "1.12.0" 753 | source = "registry+https://github.com/rust-lang/crates.io-index" 754 | checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" 755 | 756 | [[package]] 757 | name = "url" 758 | version = "2.5.4" 759 | source = "registry+https://github.com/rust-lang/crates.io-index" 760 | checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" 761 | dependencies = [ 762 | "form_urlencoded", 763 | "idna", 764 | "percent-encoding", 765 | ] 766 | 767 | [[package]] 768 | name = "utf16_iter" 769 | version = "1.0.5" 770 | source = "registry+https://github.com/rust-lang/crates.io-index" 771 | checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" 772 | 773 | [[package]] 774 | name = "utf8_iter" 775 | version = "1.0.4" 776 | source = "registry+https://github.com/rust-lang/crates.io-index" 777 | checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" 778 | 779 | [[package]] 780 | name = "uuid" 781 | version = "1.15.1" 782 | source = "registry+https://github.com/rust-lang/crates.io-index" 783 | checksum = "e0f540e3240398cce6128b64ba83fdbdd86129c16a3aa1a3a252efd66eb3d587" 784 | 785 | [[package]] 786 | name = "winapi" 787 | version = "0.3.9" 788 | source = "registry+https://github.com/rust-lang/crates.io-index" 789 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 790 | dependencies = [ 791 | "winapi-i686-pc-windows-gnu", 792 | "winapi-x86_64-pc-windows-gnu", 793 | ] 794 | 795 | [[package]] 796 | name = "winapi-i686-pc-windows-gnu" 797 | version = "0.4.0" 798 | source = "registry+https://github.com/rust-lang/crates.io-index" 799 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 800 | 801 | [[package]] 802 | name = "winapi-x86_64-pc-windows-gnu" 803 | version = "0.4.0" 804 | source = "registry+https://github.com/rust-lang/crates.io-index" 805 | checksum = 
"712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 806 | 807 | [[package]] 808 | name = "write16" 809 | version = "1.0.0" 810 | source = "registry+https://github.com/rust-lang/crates.io-index" 811 | checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" 812 | 813 | [[package]] 814 | name = "writeable" 815 | version = "0.5.5" 816 | source = "registry+https://github.com/rust-lang/crates.io-index" 817 | checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" 818 | 819 | [[package]] 820 | name = "yoke" 821 | version = "0.7.5" 822 | source = "registry+https://github.com/rust-lang/crates.io-index" 823 | checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" 824 | dependencies = [ 825 | "serde", 826 | "stable_deref_trait", 827 | "yoke-derive", 828 | "zerofrom", 829 | ] 830 | 831 | [[package]] 832 | name = "yoke-derive" 833 | version = "0.7.5" 834 | source = "registry+https://github.com/rust-lang/crates.io-index" 835 | checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" 836 | dependencies = [ 837 | "proc-macro2", 838 | "quote", 839 | "syn", 840 | "synstructure", 841 | ] 842 | 843 | [[package]] 844 | name = "zerofrom" 845 | version = "0.1.6" 846 | source = "registry+https://github.com/rust-lang/crates.io-index" 847 | checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" 848 | dependencies = [ 849 | "zerofrom-derive", 850 | ] 851 | 852 | [[package]] 853 | name = "zerofrom-derive" 854 | version = "0.1.6" 855 | source = "registry+https://github.com/rust-lang/crates.io-index" 856 | checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" 857 | dependencies = [ 858 | "proc-macro2", 859 | "quote", 860 | "syn", 861 | "synstructure", 862 | ] 863 | 864 | [[package]] 865 | name = "zerovec" 866 | version = "0.10.4" 867 | source = "registry+https://github.com/rust-lang/crates.io-index" 868 | checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" 869 | dependencies = [ 870 | "yoke", 871 | "zerofrom", 872 | "zerovec-derive", 873 | ] 874 | 875 | [[package]] 876 | name = "zerovec-derive" 877 | version = "0.10.3" 878 | source = "registry+https://github.com/rust-lang/crates.io-index" 879 | checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" 880 | dependencies = [ 881 | "proc-macro2", 882 | "quote", 883 | "syn", 884 | ] 885 | -------------------------------------------------------------------------------- /src/db.rs: -------------------------------------------------------------------------------- 1 | use bumpalo::Bump; 2 | use gxhash::{HashMap as GxHashMap, HashMapExt}; 3 | use heed::{ 4 | Database, DatabaseFlags, Env, EnvFlags, EnvOpenOptions, PutFlags, RoTxn, RwTxn, Unspecified, 5 | types::Str, 6 | }; 7 | use memmap2::{Mmap, MmapMut}; 8 | use rkyv::{ 9 | Archive, Archived, Deserialize, Serialize, 10 | api::high::HighSerializer, 11 | de::Pool, 12 | deserialize, 13 | rancor::Strategy, 14 | ser::{allocator::ArenaHandle, writer::IoWriter}, 15 | util::AlignedVec, 16 | with::InlineAsBox, 17 | }; 18 | use std::{ 19 | cmp::Reverse, 20 | collections::{BinaryHeap, HashSet, hash_map::Entry}, 21 | fmt::Debug, 22 | fs::File, 23 | hash::Hash, 24 | io::BufWriter, 25 | num::NonZero, 26 | ops::Index, 27 | path::Path, 28 | sync::atomic::Ordering::Relaxed, 29 | }; 30 | 31 | use crate::{ 32 | BorrowRoaringishPacked, Intersection, RoaringishPacked, 33 | codecs::{NativeU32, ZeroCopyCodec}, 34 | error::{DbError, GetDocumentError, 
SearchError}, 35 | normalize, 36 | roaringish::{Aligned, ArchivedBorrowRoaringishPacked, RoaringishPackedKind, Unaligned}, 37 | stats::Stats, 38 | tokenize, 39 | }; 40 | 41 | struct Tokens { 42 | tokens: String, 43 | positions: Vec<(usize, usize)>, 44 | } 45 | 46 | impl Tokens { 47 | fn new(q: &str) -> Self { 48 | let q = normalize(q); 49 | let mut start = 0; 50 | let mut tokens = String::with_capacity(q.len() + 1); 51 | let mut positions = Vec::with_capacity(q.len() + 1); 52 | 53 | for token in tokenize(&q) { 54 | tokens.push_str(token); 55 | tokens.push(' '); 56 | 57 | let b = start; 58 | let e = b + token.len(); 59 | start = e + 1; 60 | positions.push((b, e)); 61 | } 62 | tokens.pop(); 63 | 64 | Self { tokens, positions } 65 | } 66 | 67 | fn as_ref(&self) -> RefTokens { 68 | RefTokens { 69 | tokens: &self.tokens, 70 | positions: &self.positions, 71 | } 72 | } 73 | } 74 | 75 | #[derive(Clone, Copy)] 76 | struct RefTokens<'a> { 77 | tokens: &'a str, 78 | positions: &'a [(usize, usize)], 79 | } 80 | 81 | impl RefTokens<'_> { 82 | fn len(&self) -> usize { 83 | self.positions.len() 84 | } 85 | 86 | fn is_empty(&self) -> bool { 87 | self.len() == 0 88 | } 89 | 90 | fn reserve_len(&self) -> usize { 91 | let n = MAX_WINDOW_LEN.get(); 92 | let l = self.len(); 93 | n * (l.max(n) - n + 1) + ((n - 1) * n) / 2 94 | } 95 | 96 | fn first(&self) -> Option<&str> { 97 | self.positions 98 | .first() 99 | .map(|(b, e)| unsafe { self.tokens.get_unchecked(*b..*e) }) 100 | } 101 | 102 | fn ref_token_iter(&self) -> impl Iterator<Item = Self> + '_ { 103 | (0..self.positions.len()).map(|i| Self { 104 | tokens: self.tokens, 105 | positions: &self.positions[i..i + 1], 106 | }) 107 | } 108 | 109 | fn iter(&self) -> impl Iterator<Item = &str> { 110 | self.positions 111 | .iter() 112 | .map(|(b, e)| unsafe { self.tokens.get_unchecked(*b..*e) }) 113 | } 114 | 115 | fn range(&self) -> (usize, usize) { 116 | let (b, _) = self.positions.first().unwrap_or(&(0, 0)); 117 | let (_, e) = self.positions.last().unwrap_or(&(0, 0)); 118 | (*b, *e) 119 | } 120 | 121 | fn tokens(&self) -> &str { 122 | let (b, e) = self.range(); 123 | unsafe { self.tokens.get_unchecked(b..e) } 124 | } 125 | 126 | fn split_at(&self, i: usize) -> (Self, Self) { 127 | let (l, r) = self.positions.split_at(i); 128 | ( 129 | Self { 130 | tokens: self.tokens, 131 | positions: l, 132 | }, 133 | Self { 134 | tokens: self.tokens, 135 | positions: r, 136 | }, 137 | ) 138 | } 139 | } 140 | 141 | impl PartialEq for RefTokens<'_> { 142 | fn eq(&self, other: &Self) -> bool { 143 | let t0 = self.tokens(); 144 | let t1 = other.tokens(); 145 | t0 == t1 146 | } 147 | } 148 | 149 | impl Eq for RefTokens<'_> {} 150 | 151 | impl Hash for RefTokens<'_> { 152 | fn hash<H: std::hash::Hasher>(&self, state: &mut H) { 153 | self.tokens().hash(state); 154 | } 155 | } 156 | 157 | impl Index<usize> for RefTokens<'_> { 158 | type Output = str; 159 | 160 | fn index(&self, index: usize) -> &Self::Output { 161 | let (b, e) = self.positions[index]; 162 | unsafe { self.tokens.get_unchecked(b..e) } 163 | } 164 | } 165 | 166 | impl Debug for RefTokens<'_> { 167 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 168 | f.debug_struct("RefTokens") 169 | .field("tokens", &self.tokens()) 170 | .field("positions", &self.positions) 171 | .finish() 172 | } 173 | } 174 | 175 | #[derive(Clone, Copy, Debug)] 176 | struct RefTokenLinkedList<'a, 'alloc> { 177 | tokens: RefTokens<'a>, 178 | next: Option<&'alloc RefTokenLinkedList<'a, 'alloc>>, 179 | } 180 | 181 | impl<'a, 'alloc> RefTokenLinkedList<'a, 'alloc> { 182 | fn iter<'b:
'alloc>(&'b self) -> RefTokenLinkedListIter<'a, 'alloc> { 183 | RefTokenLinkedListIter(Some(self)) 184 | } 185 | } 186 | 187 | struct RefTokenLinkedListIter<'a, 'alloc>(Option<&'alloc RefTokenLinkedList<'a, 'alloc>>); 188 | impl<'a, 'alloc> Iterator for RefTokenLinkedListIter<'a, 'alloc> { 189 | type Item = &'alloc RefTokens<'a>; 190 | 191 | fn next(&mut self) -> Option<Self::Item> { 192 | match self.0 { 193 | Some(linked_list) => { 194 | self.0 = linked_list.next; 195 | Some(&linked_list.tokens) 196 | } 197 | None => None, 198 | } 199 | } 200 | } 201 | 202 | #[derive(Archive, Serialize, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] 203 | struct BorrowStr<'a>(#[rkyv(with = InlineAsBox)] &'a str); 204 | 205 | mod db_constants { 206 | pub const DB_DOC_ID_TO_DOCUMENT: &str = "doc_id_to_document"; 207 | pub const DB_TOKEN_TO_OFFSETS: &str = "token_to_offsets"; 208 | pub const KEY_COMMON_TOKENS: &str = "common_tokens"; 209 | pub const FILE_ROARINGISH_PACKED: &str = "roaringish_packed"; 210 | pub const TEMP_FILE_TOKEN_TO_PACKED: &str = "temp_token_to_packed"; 211 | } 212 | 213 | pub const MAX_WINDOW_LEN: NonZero<usize> = unsafe { NonZero::new_unchecked(3) }; 214 | 215 | #[derive(Debug, Serialize, Archive)] 216 | struct Offset { 217 | begin: u64, 218 | len: u64, 219 | } 220 | 221 | /// Represents all types that can be stored in the database. 222 | /// 223 | /// This basically means that the type must be serializable by [rkyv]. 224 | pub trait Document: 225 | for<'a> Serialize<HighSerializer<AlignedVec, ArenaHandle<'a>, rkyv::rancor::Error>> 226 | + Archive 227 | + 'static 228 | { 229 | } 230 | impl<D> Document for D where 231 | Self: for<'a> Serialize<HighSerializer<AlignedVec, ArenaHandle<'a>, rkyv::rancor::Error>> 232 | + Archive 233 | + 'static 234 | { 235 | } 236 | 237 | pub struct DB<D: Document> { 238 | pub env: Env, 239 | db_main: Database<Unspecified, Unspecified>, 240 | db_doc_id_to_document: Database<NativeU32, ZeroCopyCodec<D>>, 241 | db_token_to_offsets: Database<Str, ZeroCopyCodec<Offset>>, 242 | } 243 | 244 | unsafe impl<D: Document> Send for DB<D> {} 245 | 246 | unsafe impl<D: Document> Sync for DB<D> {} 247 | 248 | impl<D: Document> DB<D> { 249 | pub fn truncate<P: AsRef<Path>>(path: P, db_size: usize) -> Result<Self, DbError> { 250 | let path = path.as_ref(); 251 | let _ = std::fs::remove_dir_all(path); 252 | std::fs::create_dir_all(path)?; 253 | 254 | let env = unsafe { 255 | EnvOpenOptions::new() 256 | .max_dbs(2) 257 | .map_size(db_size) 258 | .flags(EnvFlags::WRITE_MAP | EnvFlags::MAP_ASYNC) 259 | .open(path)?
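// WRITE_MAP | MAP_ASYNC makes LMDB write through a writable memory map
// with asynchronous flushes: much faster bulk indexing, at the cost of
// durability if the process dies mid-write. `map_size` is a fixed upper
// bound that LMDB never grows on its own, so callers should be generous
// when sizing it; a hypothetical caller (`MyDoc` standing in for any
// type implementing `Document`) might do:
//
//     let db = DB::<MyDoc>::truncate("./index", 10 << 30)?; // 10 GiB map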
260 | }; 261 | 262 | let mut wrtxn = env.write_txn()?; 263 | 264 | let db_main = env.create_database(&mut wrtxn, None)?; 265 | 266 | let db_doc_id_to_document = env 267 | .database_options() 268 | .types::<NativeU32, ZeroCopyCodec<D>>() 269 | .flags(DatabaseFlags::REVERSE_KEY) 270 | .name(db_constants::DB_DOC_ID_TO_DOCUMENT) 271 | .create(&mut wrtxn)?; 272 | 273 | let db_token_to_offsets = 274 | env.create_database(&mut wrtxn, Some(db_constants::DB_TOKEN_TO_OFFSETS))?; 275 | 276 | wrtxn.commit()?; 277 | 278 | Ok(Self { 279 | env, 280 | db_main, 281 | db_doc_id_to_document, 282 | db_token_to_offsets, 283 | }) 284 | } 285 | 286 | pub fn write_doc_id_to_document( 287 | &self, 288 | rwtxn: &mut RwTxn, 289 | doc_ids: &[u32], 290 | documents: &[D], 291 | ) -> Result<(), DbError> { 292 | log::debug!("Writing documents"); 293 | let b = std::time::Instant::now(); 294 | for (doc_id, document) in doc_ids.iter().zip(documents.iter()) { 295 | self.db_doc_id_to_document 296 | .put_with_flags(rwtxn, PutFlags::APPEND, doc_id, document)?; 297 | } 298 | log::debug!("Writing documents took {:?}", b.elapsed()); 299 | Ok(()) 300 | } 301 | 302 | pub fn write_token_to_roaringish_packed( 303 | &self, 304 | token_to_token_id: &GxHashMap<Box<str>, u32>, 305 | token_id_to_roaringish_packed: &[RoaringishPacked], 306 | mmap_size: &mut usize, 307 | batch_id: u32, 308 | ) -> Result<(), DbError> { 309 | log::debug!("Writing token to roaringish packed"); 310 | let b = std::time::Instant::now(); 311 | let mut token_to_packed: Vec<_> = token_to_token_id 312 | .iter() 313 | .map(|(token, token_id)| { 314 | let packed = &token_id_to_roaringish_packed[*token_id as usize]; 315 | *mmap_size += packed.size_bytes(); 316 | (BorrowStr(token), BorrowRoaringishPacked::new(packed)) 317 | }) 318 | .collect(); 319 | token_to_packed.sort_unstable_by(|(token0, _), (token1, _)| token0.cmp(token1)); 320 | 321 | let file_name = format!("{}_{batch_id}", db_constants::TEMP_FILE_TOKEN_TO_PACKED); 322 | let file = IoWriter::new(BufWriter::new( 323 | File::options() 324 | .create(true) 325 | .truncate(true) 326 | .read(true) 327 | .write(true) 328 | .open(self.env.path().join(file_name))?, 329 | )); 330 | rkyv::api::high::to_bytes_in::<_, rkyv::rancor::Error>(&token_to_packed, file)?; 331 | log::debug!("Writing token to roaringish packed took {:?}", b.elapsed()); 332 | Ok(()) 333 | } 334 | 335 | pub fn generate_mmap_file( 336 | &self, 337 | number_of_distinct_tokens: u64, 338 | mmap_size: usize, 339 | number_of_batches: u32, 340 | rwtxn: &mut RwTxn, 341 | ) -> Result<(), DbError> { 342 | #[inline(always)] 343 | unsafe fn write_to_mmap<const N: usize>( 344 | mmap: &mut MmapMut, 345 | mmap_offset: &mut usize, 346 | bytes: &[u8], 347 | ) -> Offset { 348 | unsafe { 349 | let ptr = mmap.as_ptr().add(*mmap_offset); 350 | let offset = ptr.align_offset(N); 351 | 352 | *mmap_offset += offset; 353 | mmap[*mmap_offset..*mmap_offset + bytes.len()].copy_from_slice(bytes); 354 | 355 | let begin = *mmap_offset; 356 | *mmap_offset += bytes.len(); 357 | Offset { 358 | begin: begin as u64, 359 | len: bytes.len() as u64, 360 | } 361 | } 362 | } 363 | 364 | log::info!("Merging roaringish packed files to generate the final memory map file"); 365 | let b = std::time::Instant::now(); 366 | let file = File::options() 367 | .create(true) 368 | .truncate(true) 369 | .read(true) 370 | .write(true) 371 | .open(self.env.path().join(db_constants::FILE_ROARINGISH_PACKED))?; 372 | let final_size = mmap_size as u64 + (number_of_distinct_tokens * 64); 373 | log::debug!("Creating file with size: {} bytes", final_size);
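// Every token's packed postings are aligned to a 64 byte boundary by
// write_to_mmap::<64> below, so the worst case adds up to 64 bytes of
// padding per distinct token; that is exactly the slack reserved by
// `final_size` above.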
374 | file.set_len(final_size)?; 375 | let mut mmap = unsafe { MmapMut::map_mut(&file)? }; 376 | let mut mmap_offset = 0; 377 | 378 | // we need to do this in 3 steps because of the borrow checker 379 | let files_mmaps = (0..number_of_batches) 380 | .map(|i| -> Result<Mmap, DbError> { 381 | let file_name = format!("{}_{i}", db_constants::TEMP_FILE_TOKEN_TO_PACKED); 382 | let file = File::options() 383 | .read(true) 384 | .open(self.env.path().join(file_name))?; 385 | unsafe { Ok(Mmap::map(&file)?) } 386 | }) 387 | .collect::<Result<Vec<_>, DbError>>()?; 388 | let files_data: Vec<_> = files_mmaps 389 | .iter() 390 | .map(|mmap| unsafe { 391 | rkyv::access_unchecked::< 392 | Archived<Vec<(BorrowStr<'_>, BorrowRoaringishPacked<'_, Unaligned>)>>, 393 | >(mmap) 394 | }) 395 | .collect(); 396 | let mut iters: Vec<_> = files_data 397 | .iter() 398 | .map(|tokens_to_packeds| tokens_to_packeds.iter()) 399 | .collect(); 400 | 401 | struct ToMerge<'a> { 402 | token: &'a ArchivedBorrowStr<'a>, 403 | packed: &'a ArchivedBorrowRoaringishPacked<'a, Unaligned>, 404 | i: usize, 405 | } 406 | impl PartialEq for ToMerge<'_> { 407 | fn eq(&self, other: &Self) -> bool { 408 | self.token.0 == other.token.0 && self.i == other.i 409 | } 410 | } 411 | impl Eq for ToMerge<'_> {} 412 | impl PartialOrd for ToMerge<'_> { 413 | fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { 414 | Some(self.cmp(other)) 415 | } 416 | } 417 | impl Ord for ToMerge<'_> { 418 | fn cmp(&self, other: &Self) -> std::cmp::Ordering { 419 | match self.token.0.cmp(&other.token.0) { 420 | std::cmp::Ordering::Equal => self.i.cmp(&other.i), 421 | ord => ord, 422 | } 423 | } 424 | } 425 | 426 | let mut heap = BinaryHeap::new(); 427 | for (i, it) in iters.iter_mut().enumerate() { 428 | if let Some(token_to_packed) = it.next() { 429 | heap.push(Reverse(ToMerge { 430 | token: &token_to_packed.0, 431 | packed: &token_to_packed.1, 432 | i, 433 | })) 434 | } 435 | } 436 | 437 | while let Some(token_to_packed) = heap.pop() { 438 | let to_merge = token_to_packed.0; 439 | if let Some(token_to_packed) = iters[to_merge.i].next() { 440 | heap.push(Reverse(ToMerge { 441 | token: &token_to_packed.0, 442 | packed: &token_to_packed.1, 443 | i: to_merge.i, 444 | })); 445 | } 446 | 447 | let mut packed_kind = RoaringishPackedKind::Archived(to_merge.packed); 448 | loop { 449 | let Some(next_to_merge) = heap.peek() else { 450 | break; 451 | }; 452 | 453 | if next_to_merge.0.token.0 != to_merge.token.0 { 454 | break; 455 | } 456 | 457 | // This pop can't fail because we peeked before 458 | let next_to_merge = heap.pop().unwrap().0; 459 | if let Some(token_to_packed) = iters[next_to_merge.i].next() { 460 | heap.push(Reverse(ToMerge { 461 | token: &token_to_packed.0, 462 | packed: &token_to_packed.1, 463 | i: next_to_merge.i, 464 | })); 465 | } 466 | 467 | let next_to_merge_kind = RoaringishPackedKind::Archived(next_to_merge.packed); 468 | packed_kind = packed_kind.concat(next_to_merge_kind); 469 | } 470 | 471 | if to_merge.token.0.len() > 511 { 472 | continue; 473 | } 474 | 475 | let packed = packed_kind.as_bytes(); 476 | let offset = unsafe { write_to_mmap::<64>(&mut mmap, &mut mmap_offset, packed) }; 477 | self.db_token_to_offsets.put_with_flags( 478 | rwtxn, 479 | PutFlags::APPEND, 480 | &to_merge.token.0, 481 | &offset, 482 | )?; 483 | } 484 | 485 | drop(iters); 486 | drop(files_data); 487 | drop(files_mmaps); 488 | 489 | log::debug!("Finished merging roaringish packed files"); 490 | log::debug!("Removing old files"); 491 | for i in 0..number_of_batches { 492 | let file_name = format!("{}_{i}",
db_constants::TEMP_FILE_TOKEN_TO_PACKED); 493 | std::fs::remove_file(self.env.path().join(file_name))?; 494 | } 495 | log::info!("Whole merging process took {:?}", b.elapsed()); 496 | 497 | Ok(()) 498 | } 499 | 500 | fn read_common_tokens( 501 | rotxn: &RoTxn, 502 | db_main: Database<Unspecified, Unspecified>, 503 | ) -> Result<HashSet<Box<str>>, DbError> { 504 | let k = db_main 505 | .remap_types::<Str, ZeroCopyCodec<HashSet<Box<str>>>>() 506 | .get(rotxn, db_constants::KEY_COMMON_TOKENS)? 507 | .ok_or_else(|| { 508 | DbError::KeyNotFound( 509 | db_constants::KEY_COMMON_TOKENS.to_string(), 510 | "main".to_string(), 511 | ) 512 | })?; 513 | 514 | Ok(deserialize::<_, rkyv::rancor::Error>(k)?) 515 | } 516 | 517 | pub fn write_common_tokens( 518 | &self, 519 | rwtxn: &mut RwTxn, 520 | common_tokens: &HashSet<Box<str>>, 521 | ) -> Result<(), DbError> { 522 | log::debug!("Writing common tokens"); 523 | let b = std::time::Instant::now(); 524 | self.db_main 525 | .remap_types::<Str, ZeroCopyCodec<HashSet<Box<str>>>>() 526 | .put(rwtxn, db_constants::KEY_COMMON_TOKENS, common_tokens)?; 527 | log::debug!("Writing common tokens took {:?}", b.elapsed()); 528 | Ok(()) 529 | } 530 | 531 | pub fn open<P: AsRef<Path>>(path: P) -> Result<(Self, HashSet<Box<str>>, Mmap), DbError> { 532 | let path = path.as_ref(); 533 | let env = unsafe { 534 | EnvOpenOptions::new() 535 | .max_dbs(2) 536 | .flags(EnvFlags::READ_ONLY) 537 | .open(path)? 538 | }; 539 | 540 | let rotxn = env.read_txn()?; 541 | 542 | let db_main = env 543 | .open_database(&rotxn, None)? 544 | .ok_or_else(|| DbError::DatabaseError("main".to_string()))?; 545 | 546 | let db_doc_id_to_document = env 547 | .database_options() 548 | .types::<NativeU32, ZeroCopyCodec<D>>() 549 | .flags(DatabaseFlags::REVERSE_KEY) 550 | .name(db_constants::DB_DOC_ID_TO_DOCUMENT) 551 | .open(&rotxn)? 552 | .ok_or_else(|| { 553 | DbError::DatabaseError(db_constants::DB_DOC_ID_TO_DOCUMENT.to_string()) 554 | })?; 555 | 556 | let db_token_to_offsets = env 557 | .open_database(&rotxn, Some(db_constants::DB_TOKEN_TO_OFFSETS))? 558 | .ok_or_else(|| DbError::DatabaseError(db_constants::DB_TOKEN_TO_OFFSETS.to_string()))?; 559 | 560 | let common_tokens = Self::read_common_tokens(&rotxn, db_main)?; 561 | 562 | rotxn.commit()?; 563 | 564 | let mmap_file = File::open(path.join(db_constants::FILE_ROARINGISH_PACKED))?; 565 | let mmap = unsafe { Mmap::map(&mmap_file)?
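// This maps the merged postings file written by generate_mmap_file.
// Searches resolve a token through db_token_to_offsets and then read
// its packed postings zero-copy out of this map: an Offset { begin, len }
// names the slice &mmap[begin..begin + len] (see
// get_roaringish_packed_from_offset below).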
}; 566 | 567 | Ok(( 568 | Self { 569 | env, 570 | db_main, 571 | db_doc_id_to_document, 572 | db_token_to_offsets, 573 | }, 574 | common_tokens, 575 | mmap, 576 | )) 577 | } 578 | 579 | // This function needs to be #[inline(never)]: for some reason, inlining it 580 | // makes the performance of some queries unpredictable 581 | #[inline(never)] 582 | fn merge_and_minimize_tokens<'a, 'b, 'alloc>( 583 | &self, 584 | rotxn: &RoTxn, 585 | tokens: RefTokens<'a>, 586 | common_tokens: &HashSet<Box<str>>, 587 | mmap: &'b Mmap, 588 | 589 | bump: &'alloc Bump, 590 | ) -> Result< 591 | ( 592 | Vec<RefTokens<'a>>, 593 | GxHashMap<RefTokens<'a>, BorrowRoaringishPacked<'b, Aligned>>, 594 | ), 595 | SearchError, 596 | > { 597 | #[inline(always)] 598 | fn check_before_recursion<'a, 'b, 'alloc, D: Document>( 599 | me: &DB<D>, 600 | rotxn: &RoTxn, 601 | tokens: RefTokens<'a>, 602 | token_to_packed: &mut GxHashMap<RefTokens<'a>, BorrowRoaringishPacked<'b, Aligned>>, 603 | mmap: &'b Mmap, 604 | memo_token_to_score_choices: &mut GxHashMap< 605 | RefTokens<'a>, 606 | (usize, &'alloc RefTokenLinkedList<'a, 'alloc>), 607 | >, 608 | bump: &'alloc Bump, 609 | ) -> Result<Option<usize>, SearchError> { 610 | if tokens.len() != 1 { 611 | return Ok(None); 612 | } 613 | 614 | let score = match token_to_packed.entry(tokens) { 615 | Entry::Occupied(e) => e.get().len(), 616 | Entry::Vacant(e) => { 617 | let packed = me.get_roaringish_packed(rotxn, &tokens[0], mmap)?; 618 | let score = packed.len(); 619 | e.insert(packed); 620 | 621 | let linked_list = bump.alloc(RefTokenLinkedList { tokens, next: None }); 622 | memo_token_to_score_choices.insert(tokens, (score, linked_list)); 623 | score 624 | } 625 | }; 626 | Ok(Some(score)) 627 | } 628 | 629 | #[allow(clippy::too_many_arguments)] 630 | fn inner_merge_and_minimize_tokens<'a, 'b, 'c, 'alloc, D: Document>( 631 | me: &DB<D>, 632 | rotxn: &RoTxn, 633 | tokens: RefTokens<'a>, 634 | common_tokens: &HashSet<Box<str>>, 635 | token_to_packed: &mut GxHashMap<RefTokens<'a>, BorrowRoaringishPacked<'b, Aligned>>, 636 | mmap: &'b Mmap, 637 | memo_token_to_score_choices: &mut GxHashMap< 638 | RefTokens<'a>, 639 | (usize, &'alloc RefTokenLinkedList<'a, 'alloc>), 640 | >, 641 | 642 | bump: &'alloc Bump, 643 | ) -> Result<usize, SearchError> { 644 | const { assert!(MAX_WINDOW_LEN.get() == 3) }; 645 | let mut final_score = usize::MAX; 646 | let mut best_token_choice = None; 647 | let mut best_rem_choice = None; 648 | 649 | // TODO: fix this, it looks ugly 650 | let mut end = tokens 651 | .iter() 652 | .skip(1) 653 | .take(MAX_WINDOW_LEN.get() - 1) 654 | .take_while(|t| common_tokens.contains(*t)) 655 | .count() 656 | + 2; 657 | if common_tokens.contains(&tokens[0]) { 658 | end += 1; 659 | } 660 | end = end.min(MAX_WINDOW_LEN.get() + 1).min(tokens.len() + 1); 661 | 662 | for i in (1..end).rev() { 663 | let (tokens, rem) = tokens.split_at(i); 664 | 665 | let score = match token_to_packed.entry(tokens) { 666 | Entry::Occupied(e) => e.get().len(), 667 | Entry::Vacant(e) => { 668 | let packed = me.get_roaringish_packed(rotxn, tokens.tokens(), mmap)?; 669 | let score = packed.len(); 670 | e.insert(packed); 671 | score 672 | } 673 | }; 674 | 675 | let mut rem_score = 0; 676 | if !rem.is_empty() { 677 | rem_score = match memo_token_to_score_choices.get(&rem) { 678 | Some(r) => r.0, 679 | None => { 680 | match check_before_recursion( 681 | me, 682 | rotxn, 683 | rem, 684 | token_to_packed, 685 | mmap, 686 | memo_token_to_score_choices, 687 | bump, 688 | )?
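// Memoized recursion on the remainder (always a suffix of the query):
// each suffix is scored once and cached in memo_token_to_score_choices,
// keeping the minimization linear in the number of tokens instead of
// exponential. Purely as an illustration (not data from a real index):
// with common tokens {"the", "of"}, the query
// ["the", "lord", "of", "the", "rings"] may come out segmented as
// ["the lord", "of the rings"] if those two merged windows have fewer
// postings than five single-token lookups.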
{ 689 | Some(score) => score, 690 | None => inner_merge_and_minimize_tokens( 691 | me, 692 | rotxn, 693 | rem, 694 | common_tokens, 695 | token_to_packed, 696 | mmap, 697 | memo_token_to_score_choices, 698 | bump, 699 | )?, 700 | } 701 | } 702 | }; 703 | if rem_score == 0 { 704 | return Err(SearchError::MergeAndMinimizeNotPossible); 705 | } 706 | } 707 | 708 | let calc_score = score + rem_score; 709 | if calc_score < final_score { 710 | final_score = calc_score; 711 | 712 | best_token_choice = Some(tokens); 713 | if let Some((_, rem_choices)) = memo_token_to_score_choices.get(&rem) { 714 | best_rem_choice = Some(*rem_choices); 715 | }; 716 | } 717 | } 718 | 719 | let choices = match (best_token_choice, best_rem_choice) { 720 | (None, None) => return Err(SearchError::MergeAndMinimizeNotPossible), 721 | (None, Some(_)) => return Err(SearchError::MergeAndMinimizeNotPossible), 722 | (Some(tokens), None) => bump.alloc(RefTokenLinkedList { tokens, next: None }), 723 | (Some(tokens), Some(rem)) => bump.alloc(RefTokenLinkedList { 724 | tokens, 725 | next: Some(rem), 726 | }), 727 | }; 728 | 729 | memo_token_to_score_choices.insert(tokens, (final_score, choices)); 730 | Ok(final_score) 731 | } 732 | 733 | // This function needs to be #[inline(never)]: for some reason, inlining it 734 | // makes the performance of some queries unpredictable 735 | #[inline(never)] 736 | fn no_common_tokens<'a, 'b, 'alloc, D: Document>( 737 | me: &DB<D>, 738 | rotxn: &RoTxn, 739 | tokens: RefTokens<'a>, 740 | mmap: &'b Mmap, 741 | ) -> Result< 742 | ( 743 | Vec<RefTokens<'a>>, 744 | GxHashMap<RefTokens<'a>, BorrowRoaringishPacked<'b, Aligned>>, 745 | ), 746 | SearchError, 747 | > { 748 | let l = tokens.len(); 749 | let mut token_to_packed = GxHashMap::with_capacity(l); 750 | let mut v = Vec::with_capacity(l); 751 | 752 | for token in tokens.ref_token_iter() { 753 | let packed = me.get_roaringish_packed(rotxn, token.tokens(), mmap)?; 754 | token_to_packed.insert(token, packed); 755 | v.push(token); 756 | } 757 | 758 | return Ok((v, token_to_packed)); 759 | } 760 | 761 | if common_tokens.is_empty() { 762 | return no_common_tokens(self, rotxn, tokens, mmap); 763 | } 764 | 765 | let len = tokens.reserve_len(); 766 | let mut memo_token_to_score_choices = GxHashMap::with_capacity(len); 767 | let mut token_to_packed = GxHashMap::with_capacity(len); 768 | 769 | let score = match check_before_recursion( 770 | self, 771 | rotxn, 772 | tokens, 773 | &mut token_to_packed, 774 | mmap, 775 | &mut memo_token_to_score_choices, 776 | bump, 777 | )?
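// Top-level entry into the minimization: a single-token query is
// answered directly by check_before_recursion, everything else goes
// through the recursion above. The bump allocator owns every
// RefTokenLinkedList node built along the way, so the winning
// segmentation survives until it is collected into a Vec below.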
{ 778 | Some(score) => score, 779 | None => inner_merge_and_minimize_tokens( 780 | self, 781 | rotxn, 782 | tokens, 783 | common_tokens, 784 | &mut token_to_packed, 785 | mmap, 786 | &mut memo_token_to_score_choices, 787 | bump, 788 | )?, 789 | }; 790 | 791 | if score == 0 { 792 | return Err(SearchError::MergeAndMinimizeNotPossible); 793 | } 794 | match memo_token_to_score_choices.remove(&tokens) { 795 | Some((_, choices)) => { 796 | let v = choices.iter().copied().collect(); 797 | Ok((v, token_to_packed)) 798 | } 799 | None => Err(SearchError::MergeAndMinimizeNotPossible), 800 | } 801 | } 802 | 803 | fn get_roaringish_packed_from_offset<'a>( 804 | offset: &ArchivedOffset, 805 | mmap: &'a Mmap, 806 | ) -> Result<BorrowRoaringishPacked<'a, Aligned>, SearchError> { 807 | let begin = offset.begin.to_native() as usize; 808 | let len = offset.len.to_native() as usize; 809 | let end = begin + len; 810 | let Some(packed) = &mmap.get(begin..end) else { 811 | return Err(SearchError::InternalError); 812 | }; 813 | let (l, packed, r) = unsafe { packed.align_to::<u64>() }; 814 | if !l.is_empty() || !r.is_empty() { 815 | return Err(SearchError::InternalError); 816 | } 817 | 818 | mmap.advise_range(memmap2::Advice::Sequential, begin, len) 819 | .map_err(|e| DbError::from(e))?; 820 | 821 | Ok(BorrowRoaringishPacked::new_raw(packed)) 822 | } 823 | 824 | #[inline(always)] 825 | pub fn get_roaringish_packed<'a>( 826 | &self, 827 | rotxn: &RoTxn, 828 | token: &str, 829 | mmap: &'a Mmap, 830 | ) -> Result<BorrowRoaringishPacked<'a, Aligned>, SearchError> { 831 | let offset = self 832 | .db_token_to_offsets 833 | .get(rotxn, token) 834 | .map_err(|e| DbError::from(e))?; 835 | match offset { 836 | Some(offset) => Self::get_roaringish_packed_from_offset(offset, mmap), 837 | None => Err(SearchError::TokenNotFound(token.to_string())), 838 | } 839 | } 840 | 841 | pub fn search<I: Intersection>( 842 | &self, 843 | q: &str, 844 | stats: &Stats, 845 | common_tokens: &HashSet<Box<str>>, 846 | mmap: &Mmap, 847 | ) -> Result<Vec<u32>, SearchError> { 848 | stats.iters.fetch_add(1, Relaxed); 849 | 850 | let b = std::time::Instant::now(); 851 | let tokens = Tokens::new(q); 852 | let tokens = tokens.as_ref(); 853 | stats 854 | .normalize_tokenize 855 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 856 | 857 | if tokens.is_empty() { 858 | return Err(SearchError::EmptyQuery); 859 | } 860 | 861 | let rotxn = self.env.read_txn().map_err(|e| DbError::from(e))?; 862 | if tokens.len() == 1 { 863 | // this can't fail, we just checked that the query has a token 864 | return Ok(self.get_roaringish_packed(&rotxn, tokens.first().unwrap(), mmap)?
865 | .get_doc_ids(stats)); 866 | } 867 | 868 | let b = std::time::Instant::now(); 869 | let bump = Bump::with_capacity(tokens.reserve_len() * 5); 870 | let (final_tokens, token_to_packed) = 871 | self.merge_and_minimize_tokens(&rotxn, tokens, common_tokens, mmap, &bump)?; 872 | stats 873 | .merge_minimize 874 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 875 | 876 | if final_tokens.is_empty() { 877 | return Err(SearchError::EmptyQuery); 878 | } 879 | 880 | if final_tokens.len() == 1 { 881 | return token_to_packed 882 | .get(&final_tokens[0]) 883 | .ok_or_else(|| SearchError::TokenNotFound(final_tokens[0].tokens().to_string())) 884 | .map(|p| p.get_doc_ids(stats)); 885 | } 886 | 887 | // at this point we know that we have at least 888 | // 2 tokens, so the loop below runs at least once 889 | // and leaves `i` with an in-bounds value 890 | let mut min = usize::MAX; 891 | let mut i = usize::MAX; 892 | for (j, ts) in final_tokens.array_windows::<2>().enumerate() { 893 | let l0 = token_to_packed 894 | .get(&ts[0]) 895 | .ok_or_else(|| SearchError::TokenNotFound(ts[0].tokens().to_string()))? 896 | .len(); 897 | 898 | let l1 = token_to_packed 899 | .get(&ts[1]) 900 | .ok_or_else(|| SearchError::TokenNotFound(ts[1].tokens().to_string()))? 901 | .len(); 902 | 903 | let l = l0 + l1; 904 | if l <= min { 905 | i = j; 906 | min = l; 907 | } 908 | } 909 | 910 | let lhs = &final_tokens[i]; 911 | let mut lhs_len = lhs.len() as u32; 912 | let lhs = token_to_packed 913 | .get(lhs) 914 | .ok_or_else(|| SearchError::TokenNotFound(lhs.tokens().to_string()))?; 915 | 916 | let rhs = &final_tokens[i + 1]; 917 | let mut rhs_len = rhs.len() as u32; 918 | let rhs = token_to_packed 919 | .get(rhs) 920 | .ok_or_else(|| SearchError::TokenNotFound(rhs.tokens().to_string()))?; 921 | 922 | let mut result = lhs.intersect::<I>(*rhs, lhs_len, stats); 923 | let mut result_borrow = BorrowRoaringishPacked::new(&result); 924 | 925 | let mut left_i = i.wrapping_sub(1); 926 | let mut right_i = i + 2; 927 | 928 | loop { 929 | let lhs = final_tokens.get(left_i); 930 | let rhs = final_tokens.get(right_i); 931 | match (lhs, rhs) { 932 | (Some(t_lhs), Some(t_rhs)) => { 933 | let lhs = token_to_packed 934 | .get(t_lhs) 935 | .ok_or_else(|| SearchError::TokenNotFound(t_lhs.tokens().to_string()))?; 936 | let rhs = token_to_packed 937 | .get(t_rhs) 938 | .ok_or_else(|| SearchError::TokenNotFound(t_rhs.tokens().to_string()))?; 939 | if lhs.len() <= rhs.len() { 940 | lhs_len += t_lhs.len() as u32; 941 | 942 | result = lhs.intersect::<I>(result_borrow, lhs_len, stats); 943 | result_borrow = BorrowRoaringishPacked::new(&result); 944 | 945 | left_i = left_i.wrapping_sub(1); 946 | } else { 947 | result = result_borrow.intersect::<I>(*rhs, rhs_len, stats); 948 | result_borrow = BorrowRoaringishPacked::new(&result); 949 | 950 | lhs_len += rhs_len; 951 | rhs_len = t_rhs.len() as u32; 952 | 953 | right_i += 1; 954 | } 955 | } 956 | (Some(t_lhs), None) => { 957 | let lhs = token_to_packed 958 | .get(t_lhs) 959 | .ok_or_else(|| SearchError::TokenNotFound(t_lhs.tokens().to_string()))?; 960 | lhs_len += t_lhs.len() as u32; 961 | 962 | result = lhs.intersect::<I>(result_borrow, lhs_len, stats); 963 | result_borrow = BorrowRoaringishPacked::new(&result); 964 | 965 | left_i = left_i.wrapping_sub(1); 966 | } 967 | (None, Some(t_rhs)) => { 968 | let rhs = token_to_packed 969 | .get(t_rhs) 970 | .ok_or_else(|| SearchError::TokenNotFound(t_rhs.tokens().to_string()))?; 971 | 972 | result = result_borrow.intersect::<I>(*rhs, rhs_len, stats); 973 | result_borrow =
BorrowRoaringishPacked::new(&result); 974 | 975 | lhs_len += rhs_len; 976 | rhs_len = t_rhs.len() as u32; 977 | 978 | right_i += 1; 979 | } 980 | (None, None) => break, 981 | } 982 | 983 | if result.is_empty() { 984 | return Err(SearchError::EmptyIntersection); 985 | } 986 | } 987 | 988 | Ok(result_borrow.get_doc_ids(stats)) 989 | } 990 | 991 | fn inner_get_archived_document<'a>( 992 | &self, 993 | rotxn: &'a RoTxn, 994 | doc_id: &u32, 995 | ) -> Result<&'a D::Archived, GetDocumentError> { 996 | self.db_doc_id_to_document 997 | .get(rotxn, doc_id) 998 | .map_err(|e| DbError::from(e))? 999 | .ok_or(GetDocumentError::DocumentNotFound(*doc_id)) 1000 | } 1001 | 1002 | pub fn get_archived_documents( 1003 | &self, 1004 | doc_ids: &[u32], 1005 | cb: impl FnOnce(Vec<&D::Archived>), 1006 | ) -> Result<(), GetDocumentError> { 1007 | let rotxn = self.env.read_txn().map_err(|e| DbError::from(e))?; 1008 | let docs = doc_ids 1009 | .into_iter() 1010 | .map(|doc_id| self.inner_get_archived_document(&rotxn, doc_id)) 1011 | .collect::<Result<Vec<_>, _>>()?; 1012 | 1013 | cb(docs); 1014 | 1015 | Ok(()) 1016 | } 1017 | 1018 | pub fn get_archived_document( 1019 | &self, 1020 | doc_id: u32, 1021 | cb: impl FnOnce(&D::Archived), 1022 | ) -> Result<(), GetDocumentError> { 1023 | let rotxn = self.env.read_txn().map_err(|e| DbError::from(e))?; 1024 | let doc = self.inner_get_archived_document(&rotxn, &doc_id)?; 1025 | 1026 | cb(doc); 1027 | 1028 | Ok(()) 1029 | } 1030 | 1031 | pub fn get_documents(&self, doc_ids: &[u32]) -> Result<Vec<D>, GetDocumentError> 1032 | where 1033 | <D as Archive>::Archived: Deserialize<D, Strategy<Pool, rkyv::rancor::Error>>, 1034 | { 1035 | let rotxn = self.env.read_txn().map_err(|e| DbError::from(e))?; 1036 | doc_ids 1037 | .into_iter() 1038 | .map(|doc_id| { 1039 | let archived = self.inner_get_archived_document(&rotxn, doc_id)?; 1040 | rkyv::deserialize::<D, rkyv::rancor::Error>(archived) 1041 | .map_err(|e| GetDocumentError::DbError(DbError::from(e))) 1042 | }) 1043 | .collect::<Result<Vec<_>, _>>() 1044 | } 1045 | 1046 | pub fn get_document(&self, doc_id: u32) -> Result<D, GetDocumentError> 1047 | where 1048 | <D as Archive>::Archived: Deserialize<D, Strategy<Pool, rkyv::rancor::Error>>, 1049 | { 1050 | let rotxn = self.env.read_txn().map_err(|e| DbError::from(e))?; 1051 | let archived = self.inner_get_archived_document(&rotxn, &doc_id)?; 1052 | rkyv::deserialize::<D, rkyv::rancor::Error>(archived) 1053 | .map_err(|e| GetDocumentError::DbError(DbError::from(e))) 1054 | } 1055 | } 1056 | --------------------------------------------------------------------------------
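How the pieces above fit together, as a hypothetical sketch (not a file from this repository: `Doc`, the index path, and the `Naive` intersection strategy are illustrative placeholders, and `Stats: Default` is assumed):

    // Open an existing index; besides the handle we get back the set of
    // common tokens and the memory map that `search` reads postings from.
    let (db, common_tokens, mmap) = DB::<Doc>::open("./index")?;
    let stats = Stats::default();

    // Phrase search returns matching document ids...
    let doc_ids = db.search::<Naive>("lord of the rings", &stats, &common_tokens, &mmap)?;

    // ...which can then be resolved to documents.
    let docs = db.get_documents(&doc_ids)?;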