├── .envrc
├── .cargo
│   └── config.toml
├── src
│   ├── codecs.rs
│   ├── utils.rs
│   ├── codecs
│   │   ├── native_u32.rs
│   │   └── zero_copy.rs
│   ├── decreasing_window_iter.rs
│   ├── allocator.rs
│   ├── error.rs
│   ├── lib.rs
│   ├── roaringish
│   │   ├── intersect
│   │   │   ├── gallop_first.rs
│   │   │   ├── gallop_second.rs
│   │   │   ├── naive.rs
│   │   │   └── simd.rs
│   │   └── intersect.rs
│   ├── searcher.rs
│   ├── stats.rs
│   ├── indexer.rs
│   ├── roaringish.rs
│   └── db.rs
├── .gitignore
├── LICENSE
├── Cargo.toml
├── flake.lock
├── flake.nix
├── README.md
└── Cargo.lock

/.envrc:
--------------------------------------------------------------------------------
use flake

--------------------------------------------------------------------------------
/.cargo/config.toml:
--------------------------------------------------------------------------------
[build]
rustdocflags = ["-C", "target-cpu=native"]

--------------------------------------------------------------------------------
/src/codecs.rs:
--------------------------------------------------------------------------------
mod native_u32;
mod zero_copy;
pub use native_u32::*;
pub use zero_copy::*;

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/target
/data*
/db*
**.mdb
**.tsv
.direnv
/benchmark
*.txt
/msmarco*
*.jsonl
roaringish_packed
**.asm
**.svg
**.data
/temp

--------------------------------------------------------------------------------
/src/utils.rs:
--------------------------------------------------------------------------------
use unicode_segmentation::UnicodeSegmentation;

/// Normalizes the input string by trimming leading and trailing
/// whitespace and converting it to lowercase.
pub fn normalize(s: &str) -> String {
    s.trim_start().trim_end().to_lowercase()
}

/// Tokenizes the input string by splitting it at word bounds,
/// also removing all tokens that Unicode considers whitespace.
pub fn tokenize(s: &str) -> impl Iterator<Item = &str> {
    s.split_word_bounds().filter(|t| {
        if !t.is_empty() {
            // This is safe because we know that `t` is not empty.
            return !t.chars().next().unwrap().is_whitespace();
        }
        false
    })
}

--------------------------------------------------------------------------------
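A quick illustration of what these two helpers produce (a hypothetical snippet, not part of the crate; both functions are crate-private):

```rust
// `normalize` trims and lowercases; `tokenize` splits at Unicode word
// boundaries and drops whitespace-only tokens, keeping punctuation.
let normalized = normalize("  Look at MY cat!  ");
assert_eq!(normalized, "look at my cat!");

let tokens: Vec<&str> = tokenize(&normalized).collect();
assert_eq!(tokens, ["look", "at", "my", "cat", "!"]);
```
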
/src/codecs/native_u32.rs:
--------------------------------------------------------------------------------
use std::borrow::Cow;

use heed::BoxedError;

pub struct NativeU32;

impl<'a> heed::BytesDecode<'a> for NativeU32 {
    type DItem = u32;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        unsafe { Ok(u32::from_ne_bytes(bytes.try_into().unwrap_unchecked())) }
    }
}

impl<'a> heed::BytesEncode<'a> for NativeU32 {
    type EItem = u32;

    fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let p = item as *const u32 as *const u8;
        let bytes = unsafe { std::slice::from_raw_parts(p, std::mem::size_of::<u32>()) };
        Ok(Cow::Borrowed(bytes))
    }
}

--------------------------------------------------------------------------------
/src/decreasing_window_iter.rs:
--------------------------------------------------------------------------------
use std::{iter::FusedIterator, num::NonZero};

pub struct DecreasingWindows<'a, T: 'a> {
    v: &'a [T],
    size: NonZero<usize>,
}
impl<'a, T: 'a> DecreasingWindows<'a, T> {
    #[inline]
    pub fn new(slice: &'a [T], size: NonZero<usize>) -> Self {
        Self { v: slice, size }
    }
}
impl<'a, T> Iterator for DecreasingWindows<'a, T> {
    type Item = &'a [T];

    #[inline]
    fn next(&mut self) -> Option<&'a [T]> {
        if self.size.get() > self.v.len() {
            self.size = NonZero::new(self.v.len())?;
        }

        let ret = Some(&self.v[..self.size.get()]);
        self.v = &self.v[1..];
        ret
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let size = self.v.len();
        (size, Some(size))
    }

    #[inline]
    fn count(self) -> usize {
        self.len()
    }
}
impl<T> ExactSizeIterator for DecreasingWindows<'_, T> {}
impl<T> FusedIterator for DecreasingWindows<'_, T> {}

--------------------------------------------------------------------------------
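Unlike `slice::windows`, this iterator keeps yielding progressively shorter windows once it reaches the tail of the slice, which is what lets the indexer merge shorter token runs near the end of a document. A small illustration (hypothetical usage; the type is crate-private):

```rust
use std::num::NonZero;

let data = [1, 2, 3, 4, 5];
let it = DecreasingWindows::new(&data, NonZero::new(3).unwrap());
let windows: Vec<&[i32]> = it.collect();
// Full-size windows first, then the tail shrinks down to a single element.
assert_eq!(
    windows,
    [&[1, 2, 3][..], &[2, 3, 4], &[3, 4, 5], &[4, 5], &[5]]
);
```
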
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2025 Gabriel Jorge Menezes

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/src/codecs/zero_copy.rs:
--------------------------------------------------------------------------------
use std::{borrow::Cow, marker::PhantomData};

use rkyv::{
    Archive, Archived, Serialize, api::high::HighSerializer, ser::allocator::ArenaHandle,
    util::AlignedVec,
};

pub struct ZeroCopyCodec<T>(PhantomData<T>)
where
    T: for<'a> Serialize<HighSerializer<AlignedVec, ArenaHandle<'a>, rkyv::rancor::Error>>
        + Archive;

impl<'a, T> heed::BytesEncode<'a> for ZeroCopyCodec<T>
where
    T: for<'b> Serialize<HighSerializer<AlignedVec, ArenaHandle<'b>, rkyv::rancor::Error>>
        + Archive
        + 'a,
{
    type EItem = T;

    fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, heed::BoxedError> {
        let bytes = rkyv::to_bytes(item).map(|bytes| Cow::Owned(bytes.to_vec()));

        Ok(bytes?)
    }
}

impl<'a, T> heed::BytesDecode<'a> for ZeroCopyCodec<T>
where
    T: for<'b> Serialize<HighSerializer<AlignedVec, ArenaHandle<'b>, rkyv::rancor::Error>>
        + Archive
        + 'a,
{
    type DItem = &'a T::Archived;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, heed::BoxedError> {
        unsafe { Ok(rkyv::access_unchecked::<T::Archived>(bytes)) }
    }
}

--------------------------------------------------------------------------------
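A sketch of the codec in use outside of a database, with a made-up `Doc` type. The `to_native()` call assumes rkyv 0.8's endian-wrapped integer types; the crate's `unaligned` feature (see Cargo.toml below) is what makes the unchecked access to a plain `Vec<u8>` buffer tolerable here:

```rust
use heed::{BytesDecode, BytesEncode};
use rkyv::{Archive, Serialize};

// Hypothetical document type; anything deriving rkyv's traits works.
#[derive(Archive, Serialize)]
struct Doc {
    id: u32,
    title: String,
}

fn roundtrip() -> Result<(), heed::BoxedError> {
    let doc = Doc { id: 7, title: "cats".into() };

    // Serialize with rkyv...
    let bytes = ZeroCopyCodec::<Doc>::bytes_encode(&doc)?;

    // ...and read the archived form straight out of the buffer,
    // without deserializing.
    let archived = ZeroCopyCodec::<Doc>::bytes_decode(&bytes)?;
    assert_eq!(archived.id.to_native(), 7);
    assert_eq!(archived.title.as_str(), "cats");
    Ok(())
}
```
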
"oxalica", 35 | "repo": "rust-overlay", 36 | "rev": "aefb7017d710f150970299685e8d8b549d653649", 37 | "type": "github" 38 | }, 39 | "original": { 40 | "owner": "oxalica", 41 | "repo": "rust-overlay", 42 | "type": "github" 43 | } 44 | } 45 | }, 46 | "root": "root", 47 | "version": 7 48 | } 49 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | inputs = { 3 | nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; 4 | rust-overlay = { 5 | url = "github:oxalica/rust-overlay"; 6 | inputs.nixpkgs.follows = "nixpkgs"; 7 | }; 8 | }; 9 | 10 | outputs = { self, nixpkgs, rust-overlay, ... }: 11 | let 12 | system = "x86_64-linux"; 13 | pkgs = import nixpkgs { 14 | inherit system; 15 | overlays = [ rust-overlay.overlays.default ]; 16 | }; 17 | rustbin = pkgs.rust-bin.selectLatestNightlyWith (toolchain: toolchain.default.override { 18 | extensions = [ "rust-src" "rust-analyzer" "miri" ]; 19 | }); 20 | 21 | clangVersion = "19"; 22 | in 23 | { 24 | devShells.${system}.default = pkgs.mkShell { 25 | packages = [ 26 | rustbin 27 | pkgs.cargo-show-asm 28 | pkgs.cargo-expand 29 | pkgs.cargo-flamegraph 30 | pkgs.cargo-valgrind 31 | pkgs.cargo-fuzz 32 | pkgs.cargo-pgo 33 | 34 | pkgs.openssl 35 | pkgs.pkg-config 36 | 37 | pkgs."clang_${clangVersion}" 38 | pkgs."llvmPackages_${clangVersion}".bintools 39 | pkgs."bolt_${clangVersion}" 40 | pkgs.cmake 41 | ]; 42 | 43 | LIBCLANG_PATH = pkgs.lib.makeLibraryPath [ pkgs."llvmPackages_${clangVersion}".libclang.lib ]; 44 | }; 45 | }; 46 | } 47 | -------------------------------------------------------------------------------- /src/allocator.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | alloc::{Allocator, alloc, dealloc}, 3 | ptr::NonNull, 4 | }; 5 | 6 | use rkyv::{Archive, Serialize}; 7 | 8 | #[derive(Default, Archive, Serialize)] 9 | pub struct AlignedAllocator; 10 | unsafe impl Allocator for AlignedAllocator { 11 | fn allocate( 12 | &self, 13 | layout: std::alloc::Layout, 14 | ) -> Result, std::alloc::AllocError> { 15 | const { assert!(N.is_power_of_two()) }; 16 | const { assert!(N != 0) }; 17 | unsafe { 18 | // This probably will never fail, if it does 19 | // what can I do ? Let it crash, something 20 | // went wrong. 21 | let p = alloc(layout.align_to(N).unwrap()); 22 | let s = std::ptr::slice_from_raw_parts_mut(p, layout.size()); 23 | #[cfg(debug_assertions)] 24 | return NonNull::new(s).ok_or(std::alloc::AllocError); 25 | 26 | #[cfg(not(debug_assertions))] 27 | return Ok(NonNull::new_unchecked(s)); 28 | } 29 | } 30 | 31 | unsafe fn deallocate(&self, ptr: std::ptr::NonNull, layout: std::alloc::Layout) { 32 | unsafe { 33 | // This probably will never fail, if it does 34 | // what can I do ? Let it crash, something 35 | // went wrong. 36 | dealloc(ptr.as_ptr(), layout.align_to(N).unwrap()); 37 | } 38 | } 39 | } 40 | 41 | pub type Aligned64 = AlignedAllocator<64>; 42 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | /// Possible errors that can occur while interacting with the database. 
/src/error.rs:
--------------------------------------------------------------------------------
use thiserror::Error;

/// Possible errors that can occur while interacting with the database.
#[derive(Error, Debug)]
pub enum DbError {
    #[error("Io error: {0}")]
    IoError(#[from] std::io::Error),

    #[error("Lmdb error: {0}")]
    LmdbError(#[from] heed::Error),

    #[error("Serialize error: {0}")]
    EncodingError(#[from] rkyv::rancor::Error),

    #[error("Database error: {0}")]
    DatabaseError(String),

    #[error("Key `{0}` not found in database `{1}`")]
    KeyNotFound(String, String),
}

/// Possible errors that can occur while searching.
#[derive(Error, Debug)]
pub enum SearchError {
    #[error("Db error: {0}")]
    DbError(#[from] DbError),

    #[error("Searched query is empty")]
    EmptyQuery,

    #[error("No combination found while trying to merge and minimize")]
    MergeAndMinimizeNotPossible,

    #[error("Token `{0}` not found in the database")]
    TokenNotFound(String),

    #[error("Empty Intersection")]
    EmptyIntersection,

    #[error("Catastrophic error has occurred")]
    InternalError,
}

/// Possible errors when trying to retrieve documents by their internal ID.
#[derive(Error, Debug)]
pub enum GetDocumentError {
    #[error("Db error: {0}")]
    DbError(#[from] DbError),

    #[error("Document with id `{0}` not found")]
    DocumentNotFound(u32),
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Simd Phrase Search

Extremely fast phrase search implementation.

## Overview

This implementation follows some of the ideas proposed in this
[blog post](https://softwaredoug.com/blog/2024/01/21/search-array-phrase-algorithm)
by [Doug Turnbull](https://softwaredoug.com/). The full explanation of how the internals
work can be found [here](https://gab-menezes.github.io/2025/01/13/using-the-most-unhinged-avx-512-instruction-to-make-the-fastest-phrase-search-algo.html).

This crate uses the [log] crate for logging during indexing.

It's highly recommended to compile this crate with `-C llvm-args=-align-all-functions=6`.

## Usage

```rust
use simdphrase::{CommonTokens, Indexer, SimdIntersect};

// Creates a new indexer that can be reused. It will index 300_000 documents
// in each batch and will use the top 50 most common tokens to speed up the
// search, by merging them.
let indexer = Indexer::new(Some(300_000), Some(CommonTokens::FixedNum(50)));

let docs = vec![
    ("look at my beautiful cat", 0),
    ("this is a document", 50),
    ("look at my dog", 25),
    ("look at my beautiful hamster", 35),
];
let index_name = "./index";
let db_size = 1024 * 1024;

// Indexes the documents in `docs`.
// The index will be created at `index_name` with the given `db_size`.
let (searcher, num_indexed_documents) = indexer.index(docs, index_name, db_size)?;

// Search for the phrase "at my beautiful".
let result = searcher.search::<SimdIntersect>("at my beautiful");

// This should return `[0, 35]`.
let documents = result.get_documents()?;
```

--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
#![feature(hash_raw_entry)]
#![feature(array_windows)]
#![feature(iter_intersperse)]
#![feature(debug_closure_helpers)]
#![feature(vec_push_within_capacity)]
#![feature(trivial_bounds)]
#![feature(portable_simd)]
#![feature(stdarch_x86_avx512)]
#![feature(avx512_target_feature)]
#![feature(allocator_api)]
#![feature(pointer_is_aligned_to)]

//! Extremely fast phrase search implementation.
//!
//! ## Overview
//!
//! This implementation follows some of the ideas proposed in this
//! [blog post](https://softwaredoug.com/blog/2024/01/21/search-array-phrase-algorithm)
//! by [Doug Turnbull](https://softwaredoug.com/). The full explanation of how the internals
//! work can be found [here](https://gab-menezes.github.io/2025/01/13/using-the-most-unhinged-avx-512-instruction-to-make-the-fastest-phrase-search-algo.html).
//!
//! This crate uses the [log] crate for logging during indexing.
//!
//! It's highly recommended to compile this crate with `-C llvm-args=-align-all-functions=6`.
//!
//! ## Usage
//!
//! ```rust
//! use simdphrase::{CommonTokens, Indexer, SimdIntersect};
//!
//! // Creates a new indexer that can be reused. It will index 300_000 documents
//! // in each batch and will use the top 50 most common tokens to speed up the
//! // search, by merging them.
//! let indexer = Indexer::new(Some(300_000), Some(CommonTokens::FixedNum(50)));
//!
//! let docs = vec![
//!     ("look at my beautiful cat", 0),
//!     ("this is a document", 50),
//!     ("look at my dog", 25),
//!     ("look at my beautiful hamster", 35),
//! ];
//! let index_name = "./index";
//! let db_size = 1024 * 1024;
//!
//! // Indexes the documents in `docs`.
//! // The index will be created at `index_name` with the given `db_size`.
//! let (searcher, num_indexed_documents) = indexer.index(docs, index_name, db_size)?;
//!
//! // Search for the phrase "at my beautiful".
//! let result = searcher.search::<SimdIntersect>("at my beautiful");
//!
//! // This should return `[0, 35]`.
//! let documents = result.get_documents()?;
//! ```

mod allocator;
mod codecs;
mod db;
mod decreasing_window_iter;
mod error;
mod indexer;
mod roaringish;
mod searcher;
mod stats;
mod utils;

use allocator::Aligned64;
use db::DB;
use roaringish::BorrowRoaringishPacked;
use roaringish::RoaringishPacked;
use utils::{normalize, tokenize};

pub use db::Document;
pub use error::{DbError, GetDocumentError, SearchError};
pub use indexer::CommonTokens;
pub use indexer::Indexer;
pub use stats::Stats;

pub use roaringish::intersect::naive::NaiveIntersect;

pub use roaringish::intersect::Intersection;
#[cfg(target_feature = "avx512f")]
pub use roaringish::intersect::simd::SimdIntersect;
pub use searcher::{SearchResult, Searcher};

--------------------------------------------------------------------------------
/src/roaringish/intersect/gallop_first.rs:
--------------------------------------------------------------------------------
use std::{mem::MaybeUninit, sync::atomic::Ordering::Relaxed};

use crate::{
    Aligned64, BorrowRoaringishPacked, Stats,
    roaringish::{ADD_ONE_GROUP, Aligned, clear_values, unpack_values},
};

use super::{Intersect, Intersection, private::IntersectSeal};

pub struct GallopIntersectFirst;
impl IntersectSeal for GallopIntersectFirst {}
impl Intersection for GallopIntersectFirst {}

impl Intersect for GallopIntersectFirst {
    fn inner_intersect<const FIRST: bool>(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,

        lhs_i: &mut usize,
        rhs_i: &mut usize,

        packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        i: &mut usize,

        _msb_packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        _j: &mut usize,

        add_to_group: u64,
        lhs_len: u16,
        _msb_mask: u16,
        lsb_mask: u16,

        stats: &Stats,
    ) {
        let b = std::time::Instant::now();

        while *lhs_i < lhs.len() && *rhs_i < rhs.len() {
            let mut lhs_delta = 1;
            let mut rhs_delta = 1;

            while *lhs_i < lhs.len()
                && clear_values(lhs[*lhs_i]) + add_to_group + if FIRST { 0 } else { ADD_ONE_GROUP }
                    < clear_values(rhs[*rhs_i])
            {
                *lhs_i += lhs_delta;
                lhs_delta *= 2;
            }
            *lhs_i -= lhs_delta / 2;

            while *rhs_i < rhs.len()
                && clear_values(rhs[*rhs_i])
                    < unsafe { clear_values(*lhs.get_unchecked(*lhs_i)) }
                        + add_to_group
                        + if FIRST { 0 } else { ADD_ONE_GROUP }
            {
                *rhs_i += rhs_delta;
                rhs_delta *= 2;
            }
            *rhs_i -= rhs_delta / 2;

            let lhs_packed = unsafe { *lhs.get_unchecked(*lhs_i) }
                + add_to_group
                + if FIRST { 0 } else { ADD_ONE_GROUP };
            let rhs_packed = unsafe { *rhs.get_unchecked(*rhs_i) };

            let lhs_doc_id_group = clear_values(lhs_packed);
            let rhs_doc_id_group = clear_values(rhs_packed);

            let lhs_values = unpack_values(lhs_packed);
            let rhs_values = unpack_values(rhs_packed);

            match lhs_doc_id_group.cmp(&rhs_doc_id_group) {
                std::cmp::Ordering::Less => *lhs_i += 1,
                std::cmp::Ordering::Greater => *rhs_i += 1,
                std::cmp::Ordering::Equal => {
                    let intersection = if FIRST {
                        (lhs_values << lhs_len) & rhs_values
                    } else {
                        lhs_values.rotate_left(lhs_len as u32) & lsb_mask & rhs_values
                    };
                    unsafe {
                        packed_result
                            .get_unchecked_mut(*i)
                            .write(lhs_doc_id_group | intersection as u64);
                    }
                    *i += (intersection > 0) as usize;

                    *lhs_i += 1;
                    *rhs_i += 1;
                }
            }
        }

        stats
            .first_intersect_gallop
            .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
    }

    fn intersection_buffer_size(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
    ) -> usize {
        lhs.0.len().min(rhs.0.len())
    }
}

--------------------------------------------------------------------------------
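The exponential probe-then-back-off pattern above, reduced to a standalone sketch (illustrative only; the real kernels gallop on the packed values with the position bits cleared):

```rust
/// Advance `i` past every element smaller than `target`, probing at
/// exponentially growing strides (1, 2, 4, ...), then back off half of the
/// final stride -- the same `delta`/`delta * 2` dance as the loops above.
fn gallop(haystack: &[u64], target: u64, mut i: usize) -> usize {
    let mut delta = 1;
    while i < haystack.len() && haystack[i] < target {
        i += delta;
        delta *= 2;
    }
    // `i - delta / 2` is the last probe known to be below `target`
    // (or the starting index), so the caller can safely resume from there.
    i - delta / 2
}
```
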
/src/roaringish/intersect.rs:
--------------------------------------------------------------------------------
use std::mem::MaybeUninit;

use crate::{Stats, allocator::Aligned64};

use super::{ADD_ONE_GROUP, Aligned, BorrowRoaringishPacked};

pub mod gallop_first;
pub mod gallop_second;
pub mod naive;
pub mod simd;

mod private {
    pub trait IntersectSeal {}
}

/// Allows a type to be used as an intersection algorithm when searching.
pub trait Intersection: Intersect {}

/// Necessary functions for an intersection algorithm.
///
/// The intersection is done in two phases, which is why
/// the functions have a `FIRST` const generic.
pub trait Intersect: private::IntersectSeal {
    /// Responsible for allocating the result buffers
    /// and computing the necessary values before starting
    /// the intersection.
    #[inline(never)]
    fn intersect<const FIRST: bool>(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
        lhs_len: u32,

        stats: &Stats,
    ) -> (Vec<u64, Aligned64>, Vec<u64, Aligned64>) {
        let mut lhs_i = 0;
        let mut rhs_i = 0;

        let buffer_size = Self::intersection_buffer_size(lhs, rhs);

        let mut i = 0;
        let mut packed_result: Box<[MaybeUninit<u64>], Aligned64> =
            Box::new_uninit_slice_in(buffer_size, Aligned64::default());

        let mut j = 0;
        let mut msb_packed_result: Box<[MaybeUninit<u64>], Aligned64> = if FIRST {
            Box::new_uninit_slice_in(lhs.0.len() + 1, Aligned64::default())
        } else {
            Box::new_uninit_slice_in(0, Aligned64::default())
        };

        let add_to_group = (lhs_len / 16) as u64 * ADD_ONE_GROUP;
        let lhs_len = (lhs_len % 16) as u16;

        let msb_mask = !(u16::MAX >> lhs_len);
        let lsb_mask = !(u16::MAX << lhs_len);

        Self::inner_intersect::<FIRST>(
            lhs,
            rhs,
            &mut lhs_i,
            &mut rhs_i,
            &mut packed_result,
            &mut i,
            &mut msb_packed_result,
            &mut j,
            add_to_group,
            lhs_len,
            msb_mask,
            lsb_mask,
            stats,
        );

        let (packed_result_ptr, a0) = Box::into_raw_with_allocator(packed_result);
        let (msb_packed_result_ptr, a1) = Box::into_raw_with_allocator(msb_packed_result);
        unsafe {
            (
                Vec::from_raw_parts_in(packed_result_ptr as *mut _, i, buffer_size, a0),
                if FIRST {
                    Vec::from_raw_parts_in(msb_packed_result_ptr as *mut _, j, lhs.0.len() + 1, a1)
                } else {
                    Vec::from_raw_parts_in(msb_packed_result_ptr as *mut _, 0, 0, a1)
                },
            )
        }
    }

    /// Performs the intersection.
    ///
    /// `msb_packed_result` has 0 capacity if `FIRST` is false.
    #[allow(clippy::too_many_arguments)]
    fn inner_intersect<const FIRST: bool>(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,

        lhs_i: &mut usize,
        rhs_i: &mut usize,

        packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        i: &mut usize,

        msb_packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        j: &mut usize,

        add_to_group: u64,
        lhs_len: u16,
        msb_mask: u16,
        lsb_mask: u16,

        stats: &Stats,
    );

    /// Size of the buffer needed to store the intersection.
    fn intersection_buffer_size(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
    ) -> usize;
}

--------------------------------------------------------------------------------
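Why there are two phases at all: each packed `u64` keys a (document, group) pair and carries a 16-bit position bitmask in its low bits. Phase one matches positions inside the same group; lhs positions that would shift past bit 15 are re-queued one group ahead (`ADD_ONE_GROUP`, `msb_packed_result`) and caught by phase two's rotate-and-mask. A worked example on bare `u16` masks, mirroring the expressions in the kernels (my reading of the code above, not an exact excerpt):

```rust
fn demo() {
    let lhs_len: u16 = 1; // the lhs phrase prefix is 1 position long
    let msb_mask = !(u16::MAX >> lhs_len); // 0x8000
    let lsb_mask = !(u16::MAX << lhs_len); // 0x0001

    // lhs occurs at position 3 of a group, rhs at position 4 of the same
    // group: phase one's plain shift finds it.
    let (lhs_values, rhs_values): (u16, u16) = (1 << 3, 1 << 4);
    assert_ne!((lhs_values << lhs_len) & rhs_values, 0);

    // lhs occurs at position 15 (the MSB), rhs at position 0 of the *next*
    // group: the shift would overflow, so the entry is queued one group
    // ahead and phase two's rotate + lsb_mask catches it.
    let (lhs_values, rhs_values): (u16, u16) = (1 << 15, 1 << 0);
    assert_ne!(lhs_values & msb_mask, 0); // goes into msb_packed_result
    assert_ne!(
        lhs_values.rotate_left(lhs_len as u32) & lsb_mask & rhs_values,
        0
    );
}
```
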
/src/roaringish/intersect/gallop_second.rs:
--------------------------------------------------------------------------------
use std::{mem::MaybeUninit, sync::atomic::Ordering::Relaxed};

use crate::{
    Aligned64, BorrowRoaringishPacked, Stats,
    roaringish::{Aligned, clear_values, unpack_values},
};

use super::{Intersect, Intersection, private::IntersectSeal};

pub struct GallopIntersectSecond;
impl IntersectSeal for GallopIntersectSecond {}
impl Intersection for GallopIntersectSecond {}

impl Intersect for GallopIntersectSecond {
    fn inner_intersect<const FIRST: bool>(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,

        lhs_i: &mut usize,
        rhs_i: &mut usize,

        packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        i: &mut usize,

        _msb_packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        _j: &mut usize,

        _add_to_group: u64,
        lhs_len: u16,
        _msb_mask: u16,
        lsb_mask: u16,

        stats: &Stats,
    ) {
        let b = std::time::Instant::now();

        while *lhs_i < lhs.len() && *rhs_i < rhs.len() {
            let mut lhs_delta = 1;
            let mut rhs_delta = 1;

            while *lhs_i < lhs.len() && clear_values(lhs[*lhs_i]) < clear_values(rhs[*rhs_i]) {
                *lhs_i += lhs_delta;
                lhs_delta *= 2;
            }
            *lhs_i -= lhs_delta / 2;

            while *rhs_i < rhs.len()
                && clear_values(rhs[*rhs_i]) < unsafe { clear_values(*lhs.get_unchecked(*lhs_i)) }
            {
                *rhs_i += rhs_delta;
                rhs_delta *= 2;
            }
            *rhs_i -= rhs_delta / 2;

            let lhs_packed = unsafe { *lhs.get_unchecked(*lhs_i) };
            let rhs_packed = unsafe { *rhs.get_unchecked(*rhs_i) };

            let lhs_doc_id_group = clear_values(lhs_packed);
            let rhs_doc_id_group = clear_values(rhs_packed);

            let lhs_values = unpack_values(lhs_packed);
            let rhs_values = unpack_values(rhs_packed);

            match lhs_doc_id_group.cmp(&rhs_doc_id_group) {
                std::cmp::Ordering::Less => *lhs_i += 1,
                std::cmp::Ordering::Greater => *rhs_i += 1,
                std::cmp::Ordering::Equal => {
                    let intersection =
                        lhs_values.rotate_left(lhs_len as u32) & lsb_mask & rhs_values;
                    unsafe {
                        packed_result
                            .get_unchecked_mut(*i)
                            .write(lhs_doc_id_group | intersection as u64);
                    }
                    *i += (intersection > 0) as usize;

                    *lhs_i += 1;
                    *rhs_i += 1;
                }
            }

            // // In micro benchmarking this version seems faster, but in the
            // // real use case it's slower

            // let intersection = lhs_values.rotate_left(lhs_len as u32) & lsb_mask & rhs_values;
            // if lhs_doc_id_group == rhs_doc_id_group && intersection > 0 {
            //     unsafe {
            //         packed_result
            //             .get_unchecked_mut(*i)
            //             .write(lhs_doc_id_group | intersection as u64);
            //     }
            //     *i += 1;
            // }

            // *lhs_i += (lhs_doc_id_group <= rhs_doc_id_group) as usize;
            // *rhs_i += (lhs_doc_id_group >= rhs_doc_id_group) as usize;
        }

        stats
            .second_intersect_gallop
            .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
    }

    fn intersection_buffer_size(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
    ) -> usize {
        lhs.0.len().min(rhs.0.len())
    }
}

--------------------------------------------------------------------------------
/src/roaringish/intersect/naive.rs:
--------------------------------------------------------------------------------
use std::{mem::MaybeUninit, sync::atomic::Ordering::Relaxed};

use crate::{
    Stats,
    allocator::Aligned64,
    roaringish::{ADD_ONE_GROUP, Aligned, BorrowRoaringishPacked, clear_values, unpack_values},
};

use super::{Intersect, Intersection, private::IntersectSeal};

/// Naive intersection algorithm.
pub struct NaiveIntersect;
impl IntersectSeal for NaiveIntersect {}
impl Intersection for NaiveIntersect {}

impl Intersect for NaiveIntersect {
    #[inline(always)]
    fn inner_intersect<const FIRST: bool>(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,

        lhs_i: &mut usize,
        rhs_i: &mut usize,

        packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        i: &mut usize,

        msb_packed_result: &mut Box<[MaybeUninit<u64>], Aligned64>,
        j: &mut usize,

        add_to_group: u64,
        lhs_len: u16,
        msb_mask: u16,
        lsb_mask: u16,

        stats: &Stats,
    ) {
        let b = std::time::Instant::now();

        while *lhs_i < lhs.0.len() && *rhs_i < rhs.0.len() {
            let lhs_packed =
                unsafe { *lhs.0.get_unchecked(*lhs_i) } + if FIRST { add_to_group } else { 0 };
            let lhs_doc_id_group = clear_values(lhs_packed);
            let lhs_values = unpack_values(lhs_packed);

            let rhs_packed = unsafe { *rhs.0.get_unchecked(*rhs_i) };
            let rhs_doc_id_group = clear_values(rhs_packed);
            let rhs_values = unpack_values(rhs_packed);

            match lhs_doc_id_group.cmp(&rhs_doc_id_group) {
                std::cmp::Ordering::Equal => {
                    unsafe {
                        if FIRST {
                            let intersection = (lhs_values << lhs_len) & rhs_values;
                            packed_result
                                .get_unchecked_mut(*i)
                                .write(lhs_doc_id_group | intersection as u64);

                            msb_packed_result
                                .get_unchecked_mut(*j)
                                .write(lhs_packed + ADD_ONE_GROUP);

                            *j += (lhs_values & msb_mask > 0) as usize;
                        } else {
                            let intersection =
                                lhs_values.rotate_left(lhs_len as u32) & lsb_mask & rhs_values;
                            packed_result
                                .get_unchecked_mut(*i)
                                .write(lhs_doc_id_group | intersection as u64);
                        }
                    }
                    *i += 1;
                    *lhs_i += 1;
                    *rhs_i += 1;
                }
                std::cmp::Ordering::Greater => *rhs_i += 1,
                std::cmp::Ordering::Less => {
                    if FIRST {
                        unsafe {
                            msb_packed_result
                                .get_unchecked_mut(*j)
                                .write(lhs_packed + ADD_ONE_GROUP);
                            *j += (lhs_values & msb_mask > 0) as usize;
                        }
                    }
                    *lhs_i += 1;
                }
            }
        }

        if FIRST {
            stats
                .first_intersect_naive
                .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
        } else {
            stats
                .second_intersect_naive
                .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
        }
    }

    fn intersection_buffer_size(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
    ) -> usize {
        lhs.0.len().min(rhs.0.len())
    }
}

--------------------------------------------------------------------------------
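Stripped of the bitmask work and the unsafe buffer writes, the naive kernel is a classic two-pointer merge over two sorted key streams; a sketch:

```rust
/// Two-pointer intersection over sorted keys (here standing in for the
/// packed doc-id+group values with the position bits cleared).
fn two_pointer_intersect(lhs: &[u64], rhs: &[u64]) -> Vec<u64> {
    let (mut i, mut j) = (0, 0);
    let mut out = Vec::new();
    while i < lhs.len() && j < rhs.len() {
        match lhs[i].cmp(&rhs[j]) {
            std::cmp::Ordering::Less => i += 1,
            std::cmp::Ordering::Greater => j += 1,
            std::cmp::Ordering::Equal => {
                out.push(lhs[i]);
                i += 1;
                j += 1;
            }
        }
    }
    out
}
```
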
/src/searcher.rs:
--------------------------------------------------------------------------------
use std::{collections::HashSet, path::Path};

use crate::{DB, DbError, Intersection, SearchError, Stats, db::Document, error::GetDocumentError};
use memmap2::Mmap;
use rkyv::{Archive, Deserialize, de::Pool, rancor::Strategy};

/// Final result of a search operation.
pub struct SearchResult<'a, D: Document>(pub Result<Vec<u32>, SearchError>, &'a Searcher<D>);
impl<D: Document> SearchResult<'_, D> {
    /// Number of documents that matched the search query.
    pub fn len(&self) -> Option<usize> {
        self.0.as_ref().map(|p| p.len()).ok()
    }

    /// Returns the internal document IDs that matched the search query.
    pub fn get_internal_document_ids(&self) -> Option<&[u32]> {
        self.0.as_ref().map(|p| p.as_slice()).ok()
    }

    /// Gets the archived version of the documents that matched the search query.
    ///
    /// This avoids having to deserialize, but it's necessary to use a callback
    /// due to the lifetime of the transaction.
    ///
    /// If you want the documents deserialized, use [Self::get_documents] instead.
    pub fn get_archived_documents(
        &self,
        cb: impl FnOnce(Vec<&D::Archived>),
    ) -> Result<(), GetDocumentError> {
        let Some(doc_ids) = self.get_internal_document_ids() else {
            return Ok(());
        };

        self.1.get_archived_documents(doc_ids, cb)
    }

    /// Gets the deserialized version of the documents that matched the search query.
    pub fn get_documents(&self) -> Result<Vec<D>, GetDocumentError>
    where
        <D as Archive>::Archived: Deserialize<D, Strategy<Pool, rkyv::rancor::Error>>,
    {
        let Some(doc_ids) = self.get_internal_document_ids() else {
            return Ok(Vec::new());
        };

        self.1.get_documents(doc_ids)
    }
}

/// Object responsible for searching the database.
pub struct Searcher<D: Document> {
    db: DB<D>,
    common_tokens: HashSet<Box<str>>,
    mmap: Mmap,
}

impl<D: Document> Searcher<D> {
    /// Create a new searcher object.
    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, DbError> {
        let (db, common_tokens, mmap) = DB::open(path)?;
        Ok(Self {
            db,
            common_tokens,
            mmap,
        })
    }

    /// Searches by the query `q`.
    pub fn search<I: Intersection>(&self, q: &str) -> SearchResult<D> {
        let stats = Stats::default();
        self.search_with_stats::<I>(q, &stats)
    }

    /// Searches by the query `q`, allowing the user to pass a [Stats] object.
    pub fn search_with_stats<I: Intersection>(&self, q: &str, stats: &Stats) -> SearchResult<D> {
        SearchResult(
            self.db
                .search::<I>(q, stats, &self.common_tokens, &self.mmap),
            self,
        )
    }

    /// Gets the archived version of the documents.
    ///
    /// This avoids having to deserialize, but it's necessary to use a callback
    /// due to the lifetime of the transaction.
    ///
    /// If you want the documents deserialized, use [Self::get_documents] instead.
    pub fn get_archived_documents(
        &self,
        doc_ids: &[u32],
        cb: impl FnOnce(Vec<&D::Archived>),
    ) -> Result<(), GetDocumentError> {
        self.db.get_archived_documents(doc_ids, cb)
    }

    /// Gets the archived version of a document.
    ///
    /// This avoids having to deserialize, but it's necessary to use a callback
    /// due to the lifetime of the transaction.
    ///
    /// If you want the document deserialized, use [Self::get_document] instead.
    pub fn get_archived_document(
        &self,
        doc_id: u32,
        cb: impl FnOnce(&D::Archived),
    ) -> Result<(), GetDocumentError> {
        self.db.get_archived_document(doc_id, cb)
    }

    /// Gets the deserialized version of the documents.
    pub fn get_documents(&self, doc_ids: &[u32]) -> Result<Vec<D>, GetDocumentError>
    where
        <D as Archive>::Archived: Deserialize<D, Strategy<Pool, rkyv::rancor::Error>>,
    {
        self.db.get_documents(doc_ids)
    }

    /// Gets the deserialized version of a document.
    pub fn get_document(&self, doc_id: u32) -> Result<D, GetDocumentError>
    where
        <D as Archive>::Archived: Deserialize<D, Strategy<Pool, rkyv::rancor::Error>>,
    {
        self.db.get_document(doc_id)
    }
}

--------------------------------------------------------------------------------
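A minimal sketch of driving this API, assuming a hypothetical `MyDoc` type that implements `Document` and an index that already exists at the given path:

```rust
use simdphrase::{NaiveIntersect, Searcher};

fn run() -> Result<(), Box<dyn std::error::Error>> {
    let searcher: Searcher<MyDoc> = Searcher::new("./index")?;

    let result = searcher.search::<NaiveIntersect>("look at my");
    if let Some(n) = result.len() {
        println!("{n} documents matched");
    }

    // Zero-copy access to the matched documents via their archived form;
    // the callback exists because of the transaction lifetime.
    result.get_archived_documents(|docs| {
        for _doc in docs {
            // inspect each `&MyDoc::Archived` here
        }
    })?;
    Ok(())
}
```
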
/src/stats.rs:
--------------------------------------------------------------------------------
use std::{
    fmt::Debug,
    sync::atomic::{AtomicU64, Ordering::Relaxed},
};

/// Time stats collected during search.
#[derive(Default)]
pub struct Stats {
    /// Time spent during normalization and tokenization.
    pub normalize_tokenize: AtomicU64,
    /// Time spent during merging and minimizing.
    pub merge_minimize: AtomicU64,
    /// Time spent during the first binary search.
    pub first_binary_search: AtomicU64,
    /// Time spent during the first intersect.
    pub first_intersect: AtomicU64,
    /// Time spent during the first intersect using SIMD.
    pub first_intersect_simd: AtomicU64,
    /// Time spent during the first intersect using the naive method.
    pub first_intersect_naive: AtomicU64,
    /// Time spent during the first intersect using the gallop method.
    pub first_intersect_gallop: AtomicU64,

    /// Time spent during the second binary search.
    pub second_binary_search: AtomicU64,
    /// Time spent during the second intersect.
    pub second_intersect: AtomicU64,
    /// Time spent during the second intersect using SIMD.
    pub second_intersect_simd: AtomicU64,
    /// Time spent during the second intersect using the naive method.
    pub second_intersect_naive: AtomicU64,
    /// Time spent during the second intersect using the gallop method.
    pub second_intersect_gallop: AtomicU64,

    /// Time spent during the first merge phase.
    pub merge_phases_first_pass: AtomicU64,
    /// Time spent during the second merge phase.
    pub merge_phases_second_pass: AtomicU64,

    /// Time spent getting document ids.
    pub get_doc_ids: AtomicU64,

    /// Number of calls to the search function.
    pub iters: AtomicU64,
}

impl Debug for Stats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let sum = self.normalize_tokenize.load(Relaxed)
            + self.merge_minimize.load(Relaxed)
            + self.first_binary_search.load(Relaxed)
            + self.first_intersect.load(Relaxed)
            + self.second_binary_search.load(Relaxed)
            + self.second_intersect.load(Relaxed)
            + self.merge_phases_first_pass.load(Relaxed)
            + self.merge_phases_second_pass.load(Relaxed)
            + self.get_doc_ids.load(Relaxed);
        let sum = sum as f64;

        let normalize_tokenize = self.normalize_tokenize.load(Relaxed) as f64;
        let merge = self.merge_minimize.load(Relaxed) as f64;
        let first_binary_search = self.first_binary_search.load(Relaxed) as f64;
        let first_intersect = self.first_intersect.load(Relaxed) as f64;
        let first_intersect_simd = self.first_intersect_simd.load(Relaxed) as f64;
        let first_intersect_naive = self.first_intersect_naive.load(Relaxed) as f64;
        let first_intersect_gallop = self.first_intersect_gallop.load(Relaxed) as f64;
        let second_binary_search = self.second_binary_search.load(Relaxed) as f64;
        let second_intersect = self.second_intersect.load(Relaxed) as f64;
        let second_intersect_simd = self.second_intersect_simd.load(Relaxed) as f64;
        let second_intersect_naive = self.second_intersect_naive.load(Relaxed) as f64;
        let second_intersect_gallop = self.second_intersect_gallop.load(Relaxed) as f64;
        let merge_phases_first_pass = self.merge_phases_first_pass.load(Relaxed) as f64;
        let merge_phases_second_pass = self.merge_phases_second_pass.load(Relaxed) as f64;
        let get_doc_ids = self.get_doc_ids.load(Relaxed) as f64;
        let iters = self.iters.load(Relaxed) as f64;

        let per_normalize_tokenize = normalize_tokenize / sum * 100f64;
        let per_merge = merge / sum * 100f64;
        let per_first_binary_search = first_binary_search / sum * 100f64;
        let per_first_intersect = first_intersect / sum * 100f64;
        let per_second_binary_search = second_binary_search / sum * 100f64;
        let per_second_intersect = second_intersect / sum * 100f64;
        let per_merge_phases_first_pass = merge_phases_first_pass / sum * 100f64;
        let per_merge_phases_second_pass = merge_phases_second_pass / sum * 100f64;
        let per_get_doc_ids = get_doc_ids / sum * 100f64;

        f.debug_struct("Stats")
            .field(
                "normalize_tokenize",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_normalize_tokenize:06.3}%)",
                    normalize_tokenize / 1000f64,
                    normalize_tokenize / iters,
                ),
            )
            .field(
                "merge_minimize",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_merge:06.3}%)",
                    merge / 1000f64,
                    merge / iters,
                ),
            )
            .field(
                "first_binary_search",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_first_binary_search:06.3}%)",
                    first_binary_search / 1000f64,
                    first_binary_search / iters,
                ),
            )
            .field(
                "first_intersect",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_first_intersect:06.3}%)",
                    first_intersect / 1000f64,
                    first_intersect / iters,
                ),
            )
            .field(
                "    first_intersect_simd",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    first_intersect_simd / 1000f64,
                    first_intersect_simd / iters,
                ),
            )
            .field(
                "    first_intersect_naive",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    first_intersect_naive / 1000f64,
                    first_intersect_naive / iters,
                ),
            )
            .field(
                "    first_intersect_gallop",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    first_intersect_gallop / 1000f64,
                    first_intersect_gallop / iters,
                ),
            )
            .field(
                "second_binary_search",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_second_binary_search:06.3}%)",
                    second_binary_search / 1000f64,
                    second_binary_search / iters,
                ),
            )
            .field(
                "second_intersect",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_second_intersect:06.3}%)",
                    second_intersect / 1000f64,
                    second_intersect / iters,
                ),
            )
            .field(
                "    second_intersect_simd",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    second_intersect_simd / 1000f64,
                    second_intersect_simd / iters,
                ),
            )
            .field(
                "    second_intersect_naive",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    second_intersect_naive / 1000f64,
                    second_intersect_naive / iters,
                ),
            )
            .field(
                "    second_intersect_gallop",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter)",
                    second_intersect_gallop / 1000f64,
                    second_intersect_gallop / iters,
                ),
            )
            .field(
                "merge_phases_first_pass",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_merge_phases_first_pass:06.3}%)",
                    merge_phases_first_pass / 1000f64,
                    merge_phases_first_pass / iters,
                ),
            )
            .field(
                "merge_phases_second_pass",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_merge_phases_second_pass:06.3}%)",
                    merge_phases_second_pass / 1000f64,
                    merge_phases_second_pass / iters,
                ),
            )
            .field(
                "get_doc_ids",
                &format_args!(
                    " ({:08.3}ms, {:08.3}us/iter, {per_get_doc_ids:06.3}%)",
                    get_doc_ids / 1000f64,
                    get_doc_ids / iters,
                ),
            )
            .finish()
    }
}

--------------------------------------------------------------------------------
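A sketch of collecting these stats across many queries and printing the accumulated breakdown (the `Debug` impl above reports ms totals, us/iter averages, and each phase's share of the total); `MyDoc` is again a hypothetical `Document` type:

```rust
use simdphrase::{SimdIntersect, Stats};

fn profile(searcher: &simdphrase::Searcher<MyDoc>, queries: &[&str]) {
    // One `Stats` shared across all queries; the counters are atomics,
    // so the same object could also be shared across threads.
    let stats = Stats::default();
    for q in queries {
        let _ = searcher.search_with_stats::<SimdIntersect>(q, &stats);
    }
    println!("{stats:?}");
}
```
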
/src/roaringish/intersect/simd.rs:
--------------------------------------------------------------------------------
#[allow(unused_imports)]
use std::{
    arch::x86_64::__m512i,
    mem::MaybeUninit,
    simd::{Simd, cmp::SimdPartialOrd},
};
use std::{
    arch::{
        asm,
        x86_64::{_mm512_load_epi64, _mm512_maskz_compress_epi64, _mm512_storeu_epi64},
    },
    sync::atomic::Ordering::Relaxed,
};

use crate::{
    Stats,
    roaringish::{
        ADD_ONE_GROUP, Aligned64, BorrowRoaringishPacked, clear_values, clear_values_simd,
        unpack_values_simd,
    },
};

use super::{Intersect, private::IntersectSeal};
use super::{Intersection, naive::NaiveIntersect};
use crate::roaringish::Aligned;

const N: usize = 8;

#[cfg(target_feature = "avx512vp2intersect")]
#[inline(always)]
unsafe fn vp2intersectq(a: __m512i, b: __m512i) -> (u8, u8) {
    unsafe {
        use std::arch::x86_64::__mmask8;

        let mut mask0: __mmask8;
        let mut mask1: __mmask8;
        asm!(
            "vp2intersectq k2, {0}, {1}",
            in(zmm_reg) a,
            in(zmm_reg) b,
            out("k2") mask0,
            out("k3") mask1,
            options(pure, nomem, nostack),
        );

        (mask0, mask1)
    }
}

#[cfg(not(target_feature = "avx512vp2intersect"))]
#[inline(always)]
unsafe fn vp2intersectq(a: __m512i, b: __m512i) -> (u8, u8) {
    use std::arch::x86_64::{
        _MM_PERM_BADC, _mm512_alignr_epi32, _mm512_cmpeq_epi64_mask, _mm512_shuffle_epi32,
    };

    unsafe {
        let a1 = _mm512_alignr_epi32::<4>(a, a);
        let a2 = _mm512_alignr_epi32::<8>(a, a);
        let a3 = _mm512_alignr_epi32::<12>(a, a);

        let b1 = _mm512_shuffle_epi32::<_MM_PERM_BADC>(b);

        let m00 = _mm512_cmpeq_epi64_mask(a, b);
        let m01 = _mm512_cmpeq_epi64_mask(a, b1);
        let m10 = _mm512_cmpeq_epi64_mask(a1, b);
        let m11 = _mm512_cmpeq_epi64_mask(a1, b1);
        let m20 = _mm512_cmpeq_epi64_mask(a2, b);
        let m21 = _mm512_cmpeq_epi64_mask(a2, b1);
        let m30 = _mm512_cmpeq_epi64_mask(a3, b);
        let m31 = _mm512_cmpeq_epi64_mask(a3, b1);

        let mask0 = m00
            | m01
            | (m10 | m11).rotate_left(2)
            | (m20 | m21).rotate_left(4)
            | (m30 | m31).rotate_left(6);

        let m0 = m00 | m10 | m20 | m30;
        let m1 = m01 | m11 | m21 | m31;
        let mask1 = m0 | ((0x55 & m1) << 1) | ((m1 >> 1) & 0x55);

        (mask0, mask1)
    }
}
{:08.3}us/iter)", 132 | first_intersect_naive / 1000f64, 133 | first_intersect_naive / iters, 134 | ), 135 | ) 136 | .field( 137 | " first_intersect_gallop", 138 | &format_args!( 139 | " ({:08.3}ms, {:08.3}us/iter)", 140 | first_intersect_gallop / 1000f64, 141 | first_intersect_gallop / iters, 142 | ), 143 | ) 144 | .field( 145 | "second_binary_search", 146 | &format_args!( 147 | " ({:08.3}ms, {:08.3}us/iter, {per_second_binary_search:06.3}%)", 148 | second_binary_search / 1000f64, 149 | second_binary_search / iters, 150 | ), 151 | ) 152 | .field( 153 | "second_intersect", 154 | &format_args!( 155 | " ({:08.3}ms, {:08.3}us/iter, {per_second_intersect:06.3}%)", 156 | second_intersect / 1000f64, 157 | second_intersect / iters, 158 | ), 159 | ) 160 | .field( 161 | " second_intersect_simd", 162 | &format_args!( 163 | " ({:08.3}ms, {:08.3}us/iter)", 164 | second_intersect_simd / 1000f64, 165 | second_intersect_simd / iters, 166 | ), 167 | ) 168 | .field( 169 | " second_intersect_naive", 170 | &format_args!( 171 | " ({:08.3}ms, {:08.3}us/iter)", 172 | second_intersect_naive / 1000f64, 173 | second_intersect_naive / iters, 174 | ), 175 | ) 176 | .field( 177 | " second_intersect_gallop", 178 | &format_args!( 179 | " ({:08.3}ms, {:08.3}us/iter)", 180 | second_intersect_gallop / 1000f64, 181 | second_intersect_gallop / iters, 182 | ), 183 | ) 184 | .field( 185 | "merge_phases_first_pass", 186 | &format_args!( 187 | " ({:08.3}ms, {:08.3}us/iter, {per_merge_phases_first_pass:06.3}%)", 188 | merge_phases_first_pass / 1000f64, 189 | merge_phases_first_pass / iters, 190 | ), 191 | ) 192 | .field( 193 | "merge_phases_second_pass", 194 | &format_args!( 195 | " ({:08.3}ms, {:08.3}us/iter, {per_merge_phases_second_pass:06.3}%)", 196 | merge_phases_second_pass / 1000f64, 197 | merge_phases_second_pass / iters, 198 | ), 199 | ) 200 | .field( 201 | "get_doc_ids", 202 | &format_args!( 203 | " ({:08.3}ms, {:08.3}us/iter, {per_get_doc_ids:06.3}%)", 204 | get_doc_ids / 1000f64, 205 | get_doc_ids / iters, 206 | ), 207 | ) 208 | .finish() 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /src/roaringish/intersect/simd.rs: -------------------------------------------------------------------------------- 1 | #[allow(unused_imports)] 2 | use std::{ 3 | arch::x86_64::__m512i, 4 | mem::MaybeUninit, 5 | simd::{Simd, cmp::SimdPartialOrd}, 6 | }; 7 | use std::{ 8 | arch::{ 9 | asm, 10 | x86_64::{_mm512_load_epi64, _mm512_maskz_compress_epi64, _mm512_storeu_epi64}, 11 | }, 12 | sync::atomic::Ordering::Relaxed, 13 | }; 14 | 15 | use crate::{ 16 | Stats, 17 | roaringish::{ 18 | ADD_ONE_GROUP, Aligned64, BorrowRoaringishPacked, clear_values, clear_values_simd, 19 | unpack_values_simd, 20 | }, 21 | }; 22 | 23 | use super::{Intersect, private::IntersectSeal}; 24 | use super::{Intersection, naive::NaiveIntersect}; 25 | use crate::roaringish::Aligned; 26 | 27 | const N: usize = 8; 28 | 29 | #[cfg(target_feature = "avx512vp2intersect")] 30 | #[inline(always)] 31 | unsafe fn vp2intersectq(a: __m512i, b: __m512i) -> (u8, u8) { 32 | unsafe { 33 | use std::arch::x86_64::__mmask8; 34 | 35 | let mut mask0: __mmask8; 36 | let mut mask1: __mmask8; 37 | asm!( 38 | "vp2intersectq k2, {0}, {1}", 39 | in(zmm_reg) a, 40 | in(zmm_reg) b, 41 | out("k2") mask0, 42 | out("k3") mask1, 43 | options(pure, nomem, nostack), 44 | ); 45 | 46 | (mask0, mask1) 47 | } 48 | } 49 | 50 | #[cfg(not(target_feature = "avx512vp2intersect"))] 51 | #[inline(always)] 52 | unsafe fn vp2intersectq(a: __m512i, b: __m512i) 

        // The first intersection will always fit into 4 pages, so there is no
        // need to manually align the loop, since its size is 197 bytes > 64*3 =
        // 192 bytes. If in the future we can reduce the size of the loop by at
        // least 5 bytes, we can fit it in 3 pages, the same way we fit the
        // second intersection.

        // Forces the alignment of the loop to be at the beginning of a 64-byte
        // page, making it fit in only 3 pages instead of 4 (up to 50% faster
        // execution). Since this function is inlined, the alignment of the loop
        // is based on the parent function's alignment, so this value will
        // change in the future; but assuming that functions will be 64-byte
        // aligned, it's fairly easy to find the new value once the code of the
        // parent function changes.
        if FIRST {
            for _ in 0..26 {
                unsafe {
                    asm!("nop");
                }
            }
        } else {
            for _ in 0..48 {
                unsafe {
                    asm!("nop");
                }
            }
        }

        while *lhs_i < lhs_packed.len() && *rhs_i < rhs_packed.len() {
            // Don't move this code around:
            // it leads to shit optimization by LLVM,
            // where it tries to create SIMD code but fucks up perf.
            //
            // Me and my homies hate LLVM
            let lhs_last = unsafe {
                clear_values(*lhs_packed.get_unchecked(*lhs_i + N - 1))
                    + if FIRST { add_to_group } else { 0 }
            };
            let rhs_last = unsafe { clear_values(*rhs_packed.get_unchecked(*rhs_i + N - 1)) };

            let (lhs_pack, rhs_pack): (Simd<u64, N>, Simd<u64, N>) = unsafe {
                let lhs_pack = _mm512_load_epi64(lhs_packed.as_ptr().add(*lhs_i) as *const _);
                let rhs_pack = _mm512_load_epi64(rhs_packed.as_ptr().add(*rhs_i) as *const _);
                (lhs_pack.into(), rhs_pack.into())
            };
            let lhs_pack = if FIRST {
                lhs_pack + simd_add_to_group
            } else {
                lhs_pack
            };

            let lhs_doc_id_group = clear_values_simd(lhs_pack);

            let rhs_doc_id_group = clear_values_simd(rhs_pack);
            let rhs_values = unpack_values_simd(rhs_pack);

            let (lhs_mask, rhs_mask) =
                unsafe { vp2intersectq(lhs_doc_id_group.into(), rhs_doc_id_group.into()) };

            if FIRST || lhs_mask > 0 {
                unsafe {
                    let lhs_pack_compress: Simd<u64, N> =
                        _mm512_maskz_compress_epi64(lhs_mask, lhs_pack.into()).into();
                    let doc_id_group_compress = clear_values_simd(lhs_pack_compress);
                    let lhs_values_compress = unpack_values_simd(lhs_pack_compress);

                    let rhs_values_compress: Simd<u64, N> =
                        _mm512_maskz_compress_epi64(rhs_mask, rhs_values.into()).into();

                    let intersection = if FIRST {
                        (lhs_values_compress << (lhs_len as u64)) & rhs_values_compress
                    } else {
                        rotl_u16(lhs_values_compress, lhs_len as u64)
                            & simd_lsb_mask
                            & rhs_values_compress
                    };

                    _mm512_storeu_epi64(
                        packed_result.as_mut_ptr().add(*i) as *mut _,
                        (doc_id_group_compress | intersection).into(),
                    );

                    *i += lhs_mask.count_ones() as usize;
                }
            }

            if FIRST {
                if lhs_last <= rhs_last {
                    unsafe {
                        analyze_msb(lhs_pack, msb_packed_result, j, simd_msb_mask);
                    }
                    *lhs_i += N;
                }
            } else {
                *lhs_i += N * (lhs_last <= rhs_last) as usize;
            }
            *rhs_i += N * (rhs_last <= lhs_last) as usize;
            need_to_analyze_msb = rhs_last < lhs_last;
        }

        if FIRST && need_to_analyze_msb && !(*lhs_i < lhs.0.len() && *rhs_i < rhs.0.len()) {
            unsafe {
                let lhs_pack: Simd<u64, N> =
                    _mm512_load_epi64(lhs_packed.as_ptr().add(*lhs_i) as *const _).into();
                analyze_msb(
                    lhs_pack + simd_add_to_group,
                    msb_packed_result,
                    j,
                    simd_msb_mask,
                );
            };
        }

        if FIRST {
            stats
                .first_intersect_simd
                .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
        } else {
            stats
                .second_intersect_simd
                .fetch_add(b.elapsed().as_micros() as u64, Relaxed);
        }

        NaiveIntersect::inner_intersect::<FIRST>(
            lhs,
            rhs,
            lhs_i,
            rhs_i,
            packed_result,
            i,
            msb_packed_result,
            j,
            add_to_group,
            lhs_len,
            msb_mask,
            lsb_mask,
            stats,
        );
    }

    fn intersection_buffer_size(
        lhs: BorrowRoaringishPacked<'_, Aligned>,
        rhs: BorrowRoaringishPacked<'_, Aligned>,
    ) -> usize {
        lhs.0.len().min(rhs.0.len()) + 1 + N
    }
}

--------------------------------------------------------------------------------
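For reference, a scalar model of what `vp2intersectq` computes: given two vectors of 8 `u64` lanes, it produces two bitmasks marking, in each input, the lanes that match any lane of the other input. The non-`vp2intersect` fallback above computes exactly this with rotations and compares:

```rust
fn vp2intersectq_scalar(a: [u64; 8], b: [u64; 8]) -> (u8, u8) {
    let (mut mask_a, mut mask_b) = (0u8, 0u8);
    for i in 0..8 {
        for j in 0..8 {
            if a[i] == b[j] {
                // Lane i of `a` intersects, and so does lane j of `b`.
                mask_a |= 1 << i;
                mask_b |= 1 << j;
            }
        }
    }
    (mask_a, mask_b)
}
```
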
/src/indexer.rs:
--------------------------------------------------------------------------------
use std::{cmp::Reverse, collections::HashSet, path::Path};

use crate::{
    RoaringishPacked, Searcher,
    db::{DB, Document, MAX_WINDOW_LEN},
    decreasing_window_iter::DecreasingWindows,
    error::DbError,
    roaringish::MAX_VALUE,
    utils::{normalize, tokenize},
};
use fxhash::FxHashMap;
use gxhash::{HashMap as GxHashMap, HashMapExt};
use heed::RwTxn;
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};

/// Specifies how the common tokens are treated during indexing.
#[derive(Debug)]
pub enum CommonTokens {
    /// Fixed list specified by the user.
    List(HashSet<String>),
    /// Top `n` most frequent tokens.
    FixedNum(u32),
    /// Percentage of the top `n` most frequent tokens.
    Percentage(f64),
}

/// Batch of documents to be indexed.
#[derive(Debug)]
struct Batch<D: Document> {
    /// Monotonically increasing batch id.
    batch_id: u32,

    /// Used to estimate the number of distinct tokens.
    hllp_tokens: HyperLogLogPlus<Box<str>, gxhash::GxBuildHasher>,

    /// Monotonically increasing token id (cleared after each batch).
    next_token_id: u32,

    /// Maps tokens to token ids (cleared after each batch).
    token_to_token_id: GxHashMap<Box<str>, u32>,

    /// Maps token ids to their roaringish packed data (cleared after each batch).
    ///
    /// This should be in sync with `token_id_to_token`.
    token_id_to_roaringish_packed: Vec<RoaringishPacked>,
    /// Maps token ids to tokens (cleared after each batch).
    ///
    /// This should be in sync with `token_id_to_roaringish_packed`.
    token_id_to_token: Vec<Box<str>>,

    // these 3 containers are in sync
    /// Document ids in the batch (cleared after each batch).
    ///
    /// This should be in sync with `documents` and `tokenized_docs`.
    doc_ids: Vec<u32>,
    /// Documents in the batch (cleared after each batch).
    ///
    /// This should be in sync with `doc_ids` and `tokenized_docs`.
    documents: Vec<D>,

    /// Tokenized representation of the documents in the batch (cleared after
    /// each batch). This representation is done by storing the token id.
    ///
    /// This should be in sync with `doc_ids` and `documents`.
    tokenized_docs: Vec<Vec<u32>>,
}

impl<D: Document> Batch<D> {
    /// Constructs a new batch.
    fn new() -> Self {
        Self {
            batch_id: 0,
            // This can't fail
            hllp_tokens: HyperLogLogPlus::new(18, gxhash::GxBuildHasher::default()).unwrap(),
            next_token_id: 0,
            token_to_token_id: GxHashMap::new(),
            token_id_to_roaringish_packed: Vec::new(),
            token_id_to_token: Vec::new(),
            doc_ids: Vec::new(),
            documents: Vec::new(),
            tokenized_docs: Vec::new(),
        }
    }

    /// Gets an overestimated number of distinct tokens.
    fn estimate_number_of_distinct_tokens(&mut self) -> u64 {
        (self.hllp_tokens.count() * 1.015f64) as u64
    }

    /// Clears the batch.
    ///
    /// This should be called after each flush and before the start of a new batch.
    fn clear(&mut self) {
        self.next_token_id = 0;
        self.token_to_token_id.clear();
        self.token_id_to_roaringish_packed.clear();
        self.token_id_to_token.clear();

        self.doc_ids.clear();
        self.documents.clear();
        self.tokenized_docs.clear();
    }

    /// Adds a document to the batch and starts the indexing process.
    ///
    /// `count_freq` is used to count the frequency of each token. This should
    /// only be used in the first batch, allowing us to generate the common tokens.
    fn push(&mut self, doc_id: u32, content: &str, doc: D, count_freq: impl FnMut(&str)) {
        let tokenized_doc = self.index_doc(content, doc_id, count_freq);
        self.doc_ids.push(doc_id);
        self.documents.push(doc);
        self.tokenized_docs.push(tokenized_doc);
    }

    /// Gets the token id for the input `token`. If the token is not present
    /// in the batch, it's added and a new token id is generated by
    /// incrementing the current value.
    fn get_token_id(
        token: &str,
        hllp_tokens: &mut HyperLogLogPlus<Box<str>, gxhash::GxBuildHasher>,
        token_to_token_id: &mut GxHashMap<Box<str>, u32>,
        token_id_to_token: &mut Vec<Box<str>>,
        token_id_to_roaringish_packed: &mut Vec<RoaringishPacked>,
        next_token_id: &mut u32,
    ) -> u32 {
        hllp_tokens.insert(token);

        let (_, token_id) = token_to_token_id
            .raw_entry_mut()
            .from_key(token)
            .or_insert_with(|| {
                let current_token_id = *next_token_id;
                *next_token_id += 1;
                (token.to_string().into_boxed_str(), current_token_id)
            });

        if *token_id as usize >= token_id_to_token.len() {
            token_id_to_token.push(token.to_string().into_boxed_str());
            token_id_to_roaringish_packed.push(RoaringishPacked::default());
        }

        *token_id
    }

    /// Indexes `content` for this `doc_id`.
    ///
    /// `count_freq` is used to count the frequency of each token. This should
    /// only be used in the first batch, allowing us to generate the common tokens.
149 | fn index_doc( 150 | &mut self, 151 | content: &str, 152 | doc_id: u32, 153 | mut count_freq: impl FnMut(&str), 154 | ) -> Vec<u32> { 155 | let mut tokenized_doc = Vec::new(); 156 | let mut token_id_to_positions: FxHashMap<u32, Vec<u32>> = FxHashMap::new(); 157 | let content = normalize(content); 158 | for (pos, token) in tokenize(&content).enumerate().take(MAX_VALUE as usize) { 159 | let token_id = Self::get_token_id( 160 | token, 161 | &mut self.hllp_tokens, 162 | &mut self.token_to_token_id, 163 | &mut self.token_id_to_token, 164 | &mut self.token_id_to_roaringish_packed, 165 | &mut self.next_token_id, 166 | ); 167 | 168 | count_freq(token); 169 | 170 | token_id_to_positions 171 | .entry(token_id) 172 | .or_default() 173 | .push(pos as u32); 174 | tokenized_doc.push(token_id); 175 | } 176 | 177 | for (token_id, positions) in token_id_to_positions.iter() { 178 | self.token_id_to_roaringish_packed[*token_id as usize].push(doc_id, positions); 179 | } 180 | tokenized_doc 181 | } 182 | 183 | /// Flushes the batch. 184 | fn flush( 185 | &mut self, 186 | db: &DB<D>, 187 | rwtxn: &mut RwTxn, 188 | common_tokens: &HashSet<Box<str>>, 189 | mmap_size: &mut usize, 190 | ) -> Result<(), DbError> { 191 | log::info!("Flushing batch"); 192 | let b = std::time::Instant::now(); 193 | 194 | // Nothing to do if the batch is empty. 195 | if self.doc_ids.is_empty() { 196 | log::debug!("Empty batch, nothing to flush"); 197 | return Ok(()); 198 | } 199 | 200 | self.merge_common_tokens(common_tokens); 201 | 202 | db.write_token_to_roaringish_packed( 203 | &self.token_to_token_id, 204 | &self.token_id_to_roaringish_packed, 205 | mmap_size, 206 | self.batch_id, 207 | )?; 208 | db.write_doc_id_to_document(rwtxn, &self.doc_ids, &self.documents)?; 209 | 210 | self.batch_id += 1; 211 | self.clear(); 212 | log::info!("Flush took {:?}", b.elapsed()); 213 | Ok(()) 214 | } 215 | 216 | /// Merges the tokens for all of the documents in the batch. 217 | /// This will create new tokens and consequently new token ids. 218 | /// 219 | /// The generation is done by merging up to [MAX_WINDOW_LEN] tokens at a time. 220 | /// We are only allowed to merge: 221 | /// * Common tokens with other common tokens. 222 | /// * Rare tokens with a common token. 223 | /// * Common tokens with a rare token. 224 | /// 225 | /// So it's impossible for the generated token to have two rare tokens. 226 | /// Also, rare tokens can only be in the first or last position, for example: 227 | /// 228 | /// # Examples 229 | /// ```text 230 | /// c c 231 | /// c c c 232 | /// r c 233 | /// r c c 234 | /// c r 235 | /// c c r 236 | /// ``` 237 | /// 238 | /// This will generate all possible combinations of the merging process.
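/// For instance, assuming `the` is the only common token, the document `the quick fox` produces exactly one merged token, `the quick`: the window starting at `quick` stops immediately because merging `quick` with `fox` would combine two rare tokens.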
239 | fn merge_common_tokens(&mut self, common_tokens: &HashSet<Box<str>>) { 240 | log::debug!("Merging common tokens"); 241 | if common_tokens.is_empty() { 242 | return; 243 | } 244 | 245 | let b = std::time::Instant::now(); 246 | for (tokenized_doc, doc_id) in self.tokenized_docs.iter().zip(self.doc_ids.iter()) { 247 | let mut token_id_to_positions: FxHashMap<u32, Vec<u32>> = FxHashMap::new(); 248 | let it = DecreasingWindows::new(tokenized_doc, MAX_WINDOW_LEN); 249 | for (pos, token_ids) in it.enumerate() { 250 | let token_id = token_ids[0]; 251 | let token = &self.token_id_to_token[token_id as usize]; 252 | let is_first_token_rare = !common_tokens.contains(token); 253 | 254 | for i in 1..token_ids.len() { 255 | let token_id = token_ids[i]; 256 | let token = &self.token_id_to_token[token_id as usize]; 257 | let is_token_rare = !common_tokens.contains(token); 258 | if is_first_token_rare && is_token_rare { 259 | break; 260 | } 261 | let token: String = token_ids[..i + 1] 262 | .iter() 263 | .map(|token_id| self.token_id_to_token[*token_id as usize].as_ref()) 264 | .intersperse(" ") 265 | .collect(); 266 | let token_id = Self::get_token_id( 267 | &token, 268 | &mut self.hllp_tokens, 269 | &mut self.token_to_token_id, 270 | &mut self.token_id_to_token, 271 | &mut self.token_id_to_roaringish_packed, 272 | &mut self.next_token_id, 273 | ); 274 | token_id_to_positions 275 | .entry(token_id) 276 | .or_default() 277 | .push(pos as u32); 278 | if is_token_rare { 279 | break; 280 | } 281 | } 282 | } 283 | 284 | for (token_id, positions) in token_id_to_positions.iter() { 285 | self.token_id_to_roaringish_packed[*token_id as usize].push(*doc_id, positions); 286 | } 287 | } 288 | log::debug!("Merge took {:?}", b.elapsed()); 289 | } 290 | } 291 | 292 | /// Responsible for indexing documents. 293 | pub struct Indexer { 294 | batch_size: Option<u32>, 295 | common_tokens: Option<CommonTokens>, 296 | } 297 | 298 | impl Indexer { 299 | /// Creates a new indexer. 300 | /// 301 | /// * If `batch_size` is [None] then the indexer will index all the documents in a single batch. 302 | /// * If `common_tokens` is [None] then no merging will happen. 303 | pub fn new(batch_size: Option<u32>, common_tokens: Option<CommonTokens>) -> Self { 304 | Self { 305 | batch_size, 306 | common_tokens, 307 | } 308 | } 309 | 310 | /// Generates the list of common tokens to be used 311 | /// in the merging phase. 312 | fn generate_common_tokens( 313 | &self, 314 | token_to_freq: &GxHashMap<Box<str>, u32>, 315 | ) -> HashSet<Box<str>> { 316 | let Some(common_tokens) = &self.common_tokens else { 317 | return HashSet::new(); 318 | }; 319 | match common_tokens { 320 | CommonTokens::List(tokens) => tokens 321 | .into_iter() 322 | .map(|t| t.clone()) 323 | .collect(), 324 | CommonTokens::FixedNum(max) => { 325 | let max = (*max as usize).min(token_to_freq.len()); 326 | let mut token_to_freq: Vec<_> = token_to_freq.iter().collect(); 327 | token_to_freq.sort_unstable_by_key(|(_, freq)| Reverse(*freq)); 328 | token_to_freq[0..max] 329 | .iter() 330 | .map(|(token, _)| (*token).clone()) 331 | .collect() 332 | } 333 | CommonTokens::Percentage(p) => { 334 | let max = (token_to_freq.len() as f64 * *p) as usize; 335 | let mut token_to_freq: Vec<_> = token_to_freq.iter().collect(); 336 | token_to_freq.sort_unstable_by_key(|(_, freq)| Reverse(*freq)); 337 | token_to_freq[0..max] 338 | .iter() 339 | .map(|(token, _)| (*token).clone()) 340 | .collect() 341 | } 342 | } 343 | } 344 | 345 | /// Indexes an iterator of documents.
346 | /// 347 | /// This iterator should essentially return a tuple `(&str, D)`, where 348 | /// `D` is the form of the document that will be serialized and stored in the database. 349 | /// 350 | /// So the content of the document (`&str`) can be different from the stored version (`D`). 351 | /// 352 | /// The type `D` is anything that can be serialized by [rkyv]. 353 | /// 354 | /// This returns a [Searcher] object and the number of indexed documents. 355 | pub fn index( 356 | &self, 357 | docs: I, 358 | path: P, 359 | db_size: usize, 360 | ) -> Result<(Searcher, u32), DbError> 361 | where 362 | S: AsRef, 363 | I: IntoIterator, 364 | D: Document, 365 | P: AsRef, 366 | { 367 | let path = path.as_ref(); 368 | let db = DB::truncate(path, db_size)?; 369 | let mut rwtxn = db.env.write_txn()?; 370 | 371 | let mut batch = Batch::new(); 372 | 373 | let batch_size = self.batch_size.unwrap_or(u32::MAX); 374 | let mut it = docs.into_iter(); 375 | 376 | let mut token_to_freq = GxHashMap::new(); 377 | let mut next_doc_id = 0; 378 | let mut mmap_size = 0; 379 | 380 | log::info!("Starting first batch"); 381 | // Index the first batch to generate the common tokens 382 | let b = std::time::Instant::now(); 383 | for (content, doc) in it.by_ref() { 384 | let doc_id = next_doc_id; 385 | next_doc_id += 1; 386 | 387 | batch.push(doc_id, content.as_ref(), doc, |token| { 388 | let (_, freq) = token_to_freq 389 | .raw_entry_mut() 390 | .from_key(token) 391 | .or_insert_with(|| (token.to_owned().into_boxed_str(), 0)); 392 | *freq += 1; 393 | }); 394 | 395 | if next_doc_id % batch_size == 0 { 396 | break; 397 | } 398 | } 399 | log::info!("First batch took {:?}", b.elapsed()); 400 | 401 | let common_tokens = self.generate_common_tokens(&token_to_freq); 402 | drop(token_to_freq); 403 | 404 | batch.flush(&db, &mut rwtxn, &common_tokens, &mut mmap_size)?; 405 | 406 | // Index the rest of the documents 407 | log::info!("Starting new batch"); 408 | let mut b = std::time::Instant::now(); 409 | for (content, doc) in it { 410 | let doc_id = next_doc_id; 411 | next_doc_id += 1; 412 | 413 | batch.push(doc_id, content.as_ref(), doc, |_| {}); 414 | 415 | if next_doc_id % batch_size == 0 { 416 | log::info!("Batch took {:?}", b.elapsed()); 417 | b = std::time::Instant::now(); 418 | batch.flush(&db, &mut rwtxn, &common_tokens, &mut mmap_size)?; 419 | log::info!("Starting new batch"); 420 | } 421 | } 422 | 423 | // Flush the last batch 424 | batch.flush(&db, &mut rwtxn, &common_tokens, &mut mmap_size)?; 425 | 426 | let number_of_distinct_tokens = batch.estimate_number_of_distinct_tokens(); 427 | log::debug!( 428 | "Approximation for the number of distinct tokens: {}", 429 | number_of_distinct_tokens 430 | ); 431 | 432 | // Write to db 433 | db.write_common_tokens(&mut rwtxn, &common_tokens)?; 434 | db.generate_mmap_file( 435 | number_of_distinct_tokens, 436 | mmap_size, 437 | batch.batch_id, 438 | &mut rwtxn, 439 | )?; 440 | 441 | let b = std::time::Instant::now(); 442 | log::info!("Commiting"); 443 | rwtxn.commit()?; 444 | log::info!("Commit took {:?}", b.elapsed()); 445 | 446 | let searcher = Searcher::new(path)?; 447 | Ok((searcher, next_doc_id)) 448 | } 449 | } 450 | -------------------------------------------------------------------------------- /src/roaringish.rs: -------------------------------------------------------------------------------- 1 | pub mod intersect; 2 | 3 | use intersect::{ 4 | Intersect, gallop_first::GallopIntersectFirst, gallop_second::GallopIntersectSecond, 5 | }; 6 | use rkyv::{Archive, Serialize, 
with::InlineAsBox}; 7 | use std::{ 8 | arch::x86_64::_mm256_mask_compressstoreu_epi32, 9 | fmt::{Binary, Debug, Display}, 10 | marker::PhantomData, 11 | mem::MaybeUninit, 12 | ops::Deref, 13 | simd::{LaneCount, Simd, SupportedLaneCount, cmp::SimdPartialEq, num::SimdUint}, 14 | sync::atomic::Ordering::Relaxed, 15 | }; 16 | 17 | use crate::Stats; 18 | use crate::{Intersection, allocator::Aligned64}; 19 | 20 | pub const MAX_VALUE: u32 = 16u32 * u16::MAX as u32; 21 | pub const ADD_ONE_GROUP: u64 = u16::MAX as u64 + 1; 22 | 23 | /// Group part of a position 24 | const fn group(val: u32) -> u16 { 25 | (val / 16) as u16 26 | } 27 | 28 | /// Value part of a position 29 | const fn value(val: u32) -> u16 { 30 | (val % 16) as u16 31 | } 32 | 33 | /// Group and value parts of a position 34 | const fn gv(val: u32) -> (u16, u16) { 35 | (group(val), value(val)) 36 | } 37 | 38 | /// Puts the document ID in the 32 MSBs of the packed representation 39 | const fn pack_doc_id(doc_id: u32) -> u64 { 40 | (doc_id as u64) << 32 41 | } 42 | 43 | /// Puts the group in the middle of the packed representation 44 | const fn pack_group(group: u16) -> u64 { 45 | (group as u64) << 16 46 | } 47 | 48 | /// Packs a value into its packed representation 49 | const fn pack_value(value: u16) -> u64 { 50 | 1 << value 51 | } 52 | 53 | /// Packs a document ID and group together (they should already be in their packed form) 54 | const fn pack_doc_id_group(packed_doc_id: u64, group: u16) -> u64 { 55 | packed_doc_id | pack_group(group) 56 | } 57 | 58 | /// Packs a document ID, group (they should already be in their packed form), 59 | /// also packs a value 60 | const fn pack(packed_doc_id: u64, group: u16, value: u16) -> u64 { 61 | pack_doc_id_group(packed_doc_id, group) | pack_value(value) 62 | } 63 | 64 | /// Clears the values part of the packed representation 65 | const fn clear_values(packed: u64) -> u64 { 66 | packed & !0xFFFF 67 | } 68 | 69 | /// Clears the group and values part of the packed representation 70 | const fn clear_group_values(packed: u64) -> u64 { 71 | packed & !0xFFFFFFFF 72 | } 73 | 74 | /// Clears the values part of the packed representation 75 | #[inline(always)] 76 | fn clear_values_simd<const N: usize>(packed: Simd<u64, N>) -> Simd<u64, N> 77 | where 78 | LaneCount<N>: SupportedLaneCount, 79 | { 80 | packed & Simd::splat(!0xFFFF) 81 | } 82 | 83 | /// Unpacks the document ID from the packed representation 84 | const fn unpack_doc_id(packed: u64) -> u32 { 85 | (packed >> 32) as u32 86 | } 87 | 88 | /// Unpacks the document ID from the packed representation 89 | #[allow(unused)] 90 | #[inline(always)] 91 | fn unpack_doc_id_simd<const N: usize>(packed: Simd<u64, N>) -> Simd<u32, N> 92 | where 93 | LaneCount<N>: SupportedLaneCount, 94 | { 95 | (packed >> Simd::splat(32)).cast() 96 | } 97 | 98 | /// Unpacks the group from the packed representation 99 | const fn unpack_group(packed: u64) -> u16 { 100 | (packed >> 16) as u16 101 | } 102 | 103 | /// Unpacks the values from the packed representation 104 | const fn unpack_values(packed: u64) -> u16 { 105 | packed as u16 106 | } 107 | 108 | /// Unpacks the values from the packed representation 109 | #[inline(always)] 110 | fn unpack_values_simd<const N: usize>(packed: Simd<u64, N>) -> Simd<u64, N> 111 | where 112 | LaneCount<N>: SupportedLaneCount, 113 | { 114 | packed & Simd::splat(0xFFFF) 115 | } 116 | 117 | /// Enum used to distinguish between owned and borrowed RoaringishPacked. 118 | /// Mainly used at the end of the indexing phase when we merge all of the 119 | /// batches together.
120 | pub enum RoaringishPackedKind<'a, A> { 121 | Owned(RoaringishPacked), 122 | Archived(&'a ArchivedBorrowRoaringishPacked<'a, A>), 123 | } 124 | 125 | impl<'a, A> RoaringishPackedKind<'a, A> { 126 | /// Bytes of the Roaringish Packed 127 | pub fn as_bytes(&self) -> &[u8] { 128 | match self { 129 | RoaringishPackedKind::Owned(packed) => unsafe { 130 | let (l, packed, r) = packed.0.align_to::<u8>(); 131 | assert!(l.is_empty()); 132 | assert!(r.is_empty()); 133 | packed 134 | }, 135 | RoaringishPackedKind::Archived(packed) => unsafe { 136 | let (l, packed, r) = packed.0.align_to::<u8>(); 137 | assert!(l.is_empty()); 138 | assert!(r.is_empty()); 139 | packed 140 | }, 141 | } 142 | } 143 | 144 | /// Concatenates two Roaringish Packed together 145 | pub fn concat<'b: 'a>(self, other: RoaringishPackedKind<'b, A>) -> RoaringishPackedKind<'b, A> { 146 | unsafe fn copy_data<T, U: Copy>(dest: &mut [MaybeUninit<T>], lhs: &[U], rhs: &[U]) { 147 | unsafe { 148 | let (l, buf, r) = dest.align_to_mut::<MaybeUninit<U>>(); 149 | assert!(l.is_empty()); 150 | assert!(r.is_empty()); 151 | 152 | let (l, lhs, r) = lhs.align_to::<MaybeUninit<U>>(); 153 | assert!(l.is_empty()); 154 | assert!(r.is_empty()); 155 | 156 | let (l, rhs, r) = rhs.align_to::<MaybeUninit<U>>(); 157 | assert!(l.is_empty()); 158 | assert!(r.is_empty()); 159 | 160 | buf[0..lhs.len()].copy_from_slice(lhs); 161 | buf[lhs.len()..].copy_from_slice(rhs); 162 | } 163 | } 164 | 165 | let r = match (self, other) { 166 | (RoaringishPackedKind::Owned(mut lhs), RoaringishPackedKind::Archived(rhs)) => { 167 | lhs.0.extend(rhs.0.iter().map(|v| v.to_native())); 168 | lhs 169 | } 170 | (RoaringishPackedKind::Archived(lhs), RoaringishPackedKind::Archived(rhs)) => { 171 | let n = lhs.0.len() + rhs.0.len(); 172 | let mut packed: Box<[MaybeUninit<u64>], _> = 173 | Box::new_uninit_slice_in(n, Aligned64::default()); 174 | 175 | unsafe { 176 | copy_data(&mut packed, &lhs.0, &rhs.0); 177 | let (p_packed, a0) = Box::into_raw_with_allocator(packed); 178 | RoaringishPacked(Vec::from_raw_parts_in(p_packed as *mut _, n, n, a0)) 179 | } 180 | } 181 | _ => panic!("This type of append should never happen"), 182 | }; 183 | RoaringishPackedKind::Owned(r) 184 | } 185 | } 186 | 187 | /// Main data structure used for phrase search. 188 | /// Here we store a compact representation of the 189 | /// document IDs and positions. 190 | /// 191 | /// The representation should be in the form: 192 | /// ```text 193 | /// document ID | group | values 194 | /// 32 bits | 16 bits | 16 bits 195 | /// ``` 196 | /// 197 | /// So the packed fits into 64 bits. 198 | /// 199 | /// The data structure should be ordered by the 200 | /// document ID and group. 201 | #[derive(PartialEq, Eq, Debug, Serialize, Archive)] 202 | #[repr(transparent)] 203 | pub struct RoaringishPacked(Vec<u64, Aligned64>); 204 | 205 | impl Deref for RoaringishPacked { 206 | type Target = Vec<u64, Aligned64>; 207 | 208 | fn deref(&self) -> &Self::Target { 209 | &self.0 210 | } 211 | } 212 | 213 | impl RoaringishPacked { 214 | /// Size occupied in bytes 215 | pub fn size_bytes(&self) -> usize { 216 | self.len() * std::mem::size_of::<u64>() 217 | } 218 | 219 | /// Adds a document with id `doc_id` and positions `pos` 220 | /// to the Roaringish Packed.
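/// For example, `push(5, &[0, 3, 18])` appends two packed words: `(5 << 32) | (0 << 16) | 0b1001` (positions 0 and 3 live in group 0) and `(5 << 32) | (1 << 16) | 0b0100` (position 18 is value 2 of group 1).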
221 | pub fn push(&mut self, doc_id: u32, pos: &[u32]) { 222 | let packed_doc_id = pack_doc_id(doc_id); 223 | 224 | let mut it = pos.iter().copied(); 225 | let Some(p) = it.next() else { 226 | return; 227 | }; 228 | 229 | self.0.reserve(pos.len()); 230 | 231 | unsafe { 232 | let (group, value) = gv(p); 233 | let packed = pack(packed_doc_id, group, value); 234 | 235 | self.0.push_within_capacity(packed).unwrap_unchecked(); 236 | } 237 | 238 | for p in it { 239 | let (group, value) = gv(p); 240 | let doc_id_group = pack_doc_id_group(packed_doc_id, group); 241 | let value = pack_value(value); 242 | let packed = doc_id_group | value; 243 | 244 | let last_doc_id_group = unsafe { clear_values(*self.0.last().unwrap_unchecked()) }; 245 | if last_doc_id_group == doc_id_group { 246 | unsafe { 247 | *self.0.last_mut().unwrap_unchecked() |= value; 248 | }; 249 | } else { 250 | unsafe { 251 | self.0.push_within_capacity(packed).unwrap_unchecked(); 252 | } 253 | } 254 | } 255 | } 256 | } 257 | 258 | impl Default for RoaringishPacked { 259 | fn default() -> Self { 260 | Self(Vec::new_in(Aligned64::default())) 261 | } 262 | } 263 | 264 | /// Type used to mark when the Roaringish Packed is aligned to 64 bytes 265 | #[derive(Clone, Copy, Debug)] 266 | pub struct Aligned; 267 | 268 | /// Type used to mark when the Roaringish Packed is unaligned 269 | #[derive(Clone, Copy, Debug)] 270 | pub struct Unaligned; 271 | 272 | /// Borrow version of the Roaringish Packed. Mainly used to 273 | /// interoperate with the Roaringish Packed retrieved from the DB. 274 | #[derive(Clone, Copy, Debug, Serialize, Archive)] 275 | pub struct BorrowRoaringishPacked<'a, A>(#[rkyv(with = InlineAsBox)] &'a [u64], PhantomData<A>); 276 | 277 | impl<A> Deref for BorrowRoaringishPacked<'_, A> { 278 | type Target = [u64]; 279 | 280 | fn deref(&self) -> &Self::Target { 281 | self.0 282 | } 283 | } 284 | 285 | impl<'a> BorrowRoaringishPacked<'a, Aligned> { 286 | /// Creates a new Roaringish Packed from 287 | /// the packed representation. 288 | /// 289 | /// Checks if it's aligned to 64 bytes. 290 | pub fn new_raw(packed: &'a [u64]) -> Self { 291 | assert!(packed.as_ptr().is_aligned_to(64)); 292 | Self(packed, PhantomData) 293 | } 294 | 295 | /// Creates a new Roaringish Packed from 296 | /// the packed representation.
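/// Unlike [Self::new_raw], no runtime alignment check is needed here: the `Aligned64` allocator already guarantees 64-byte alignment.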
297 | #[allow(clippy::ptr_arg)] 298 | pub fn new(packed: &'a Vec<u64, Aligned64>) -> Self { 299 | Self(packed, PhantomData) 300 | } 301 | 302 | #[inline(never)] 303 | pub fn intersect<I: Intersect>( 304 | self, 305 | mut rhs: Self, 306 | lhs_len: u32, 307 | stats: &Stats, 308 | ) -> RoaringishPacked { 309 | const FIRST_GALLOP_INTERSECT: usize = 650; 310 | const SECOND_GALLOP_INTERSECT: usize = 120; 311 | 312 | #[inline(always)] 313 | fn binary_search( 314 | lhs: &mut BorrowRoaringishPacked<'_, Aligned>, 315 | rhs: &mut BorrowRoaringishPacked<'_, Aligned>, 316 | ) { 317 | // skip the beginning of the slice 318 | let Some(first_lhs) = lhs.0.first() else { 319 | return; 320 | }; 321 | 322 | let Some(first_rhs) = rhs.0.first() else { 323 | return; 324 | }; 325 | 326 | let first_lhs = clear_group_values(*first_lhs); 327 | let first_rhs = clear_group_values(*first_rhs); 328 | 329 | match first_lhs.cmp(&first_rhs) { 330 | std::cmp::Ordering::Less => { 331 | let i = match lhs.0.binary_search_by_key(&first_rhs, |p| clear_values(*p)) { 332 | Ok(i) => i, 333 | Err(i) => i, 334 | }; 335 | let aligned_i = i / 8 * 8; 336 | *lhs = BorrowRoaringishPacked::new_raw(&lhs.0[aligned_i..]); 337 | } 338 | std::cmp::Ordering::Greater => { 339 | let i = match rhs.0.binary_search_by_key(&first_lhs, |p| clear_values(*p)) { 340 | Ok(i) => i, 341 | Err(i) => i, 342 | }; 343 | let aligned_i = i / 8 * 8; 344 | *rhs = BorrowRoaringishPacked::new_raw(&rhs.0[aligned_i..]); 345 | } 346 | std::cmp::Ordering::Equal => {} 347 | } 348 | } 349 | 350 | let mut lhs = self; 351 | 352 | if lhs.0.is_empty() || rhs.0.is_empty() { 353 | return RoaringishPacked::default(); 354 | } 355 | 356 | let b = std::time::Instant::now(); 357 | binary_search(&mut lhs, &mut rhs); 358 | stats 359 | .first_binary_search 360 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 361 | 362 | let b = std::time::Instant::now(); 363 | // this division can't fail, we just checked that both slices are non-empty 364 | let proportion = lhs.len().max(rhs.len()) / lhs.len().min(rhs.len()); 365 | if proportion >= FIRST_GALLOP_INTERSECT { 366 | let (packed, _) = GallopIntersectFirst::intersect::<true>(lhs, rhs, lhs_len, stats); 367 | let (msb_packed, _) = 368 | GallopIntersectFirst::intersect::<false>(lhs, rhs, lhs_len, stats); 369 | stats 370 | .first_intersect 371 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 372 | 373 | return Self::merge_results(packed, msb_packed, stats); 374 | } 375 | let (packed, msb_packed) = I::intersect::<true>(lhs, rhs, lhs_len, stats); 376 | stats 377 | .first_intersect 378 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 379 | 380 | let mut msb_packed = BorrowRoaringishPacked::new(&msb_packed); 381 | 382 | let b = std::time::Instant::now(); 383 | binary_search(&mut msb_packed, &mut rhs); 384 | stats 385 | .second_binary_search 386 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 387 | 388 | let b = std::time::Instant::now(); 389 | let proportion = msb_packed 390 | .len() 391 | .max(rhs.len()) 392 | .checked_div(msb_packed.len().min(rhs.len())); 393 | let (msb_packed, _) = match proportion { 394 | Some(proportion) => { 395 | if proportion >= SECOND_GALLOP_INTERSECT { 396 | GallopIntersectSecond::intersect::<false>(msb_packed, rhs, lhs_len, stats) 397 | } else { 398 | I::intersect::<false>(msb_packed, rhs, lhs_len, stats) 399 | } 400 | } 401 | None => I::intersect::<false>(msb_packed, rhs, lhs_len, stats), 402 | }; 403 | stats 404 | .second_intersect 405 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 406 | 407 | Self::merge_results(packed, msb_packed, stats) 408 | } 409 | 410 | /// Merges the results of the first and
second phase of the intersection. 411 | /// 412 | /// This function needs to be inline always, for some reason not inlining this 413 | /// function makes some queries' performance unpredictable 414 | #[inline(always)] 415 | fn merge_results( 416 | packed: Vec<u64, Aligned64>, 417 | msb_packed: Vec<u64, Aligned64>, 418 | stats: &Stats, 419 | ) -> RoaringishPacked { 420 | let b = std::time::Instant::now(); 421 | let capacity = packed.len() + msb_packed.len(); 422 | let mut r_packed = Box::new_uninit_slice_in(capacity, Aligned64::default()); 423 | let mut r_i = 0; 424 | let mut j = 0; 425 | // let's use the fact that the first element in `packed` is smaller than or 426 | // equal to the first element in `msb_packed` 427 | for pack in packed.iter().copied() { 428 | unsafe { 429 | let doc_id_group = clear_values(pack); 430 | let values = unpack_values(pack); 431 | 432 | // write from `msb_packed` until it's no longer smaller than the current `doc_id_group` 433 | while j < msb_packed.len() { 434 | let msb_pack = *msb_packed.get_unchecked(j); 435 | let msb_doc_id_group = clear_values(msb_pack); 436 | let msb_values = unpack_values(msb_pack); 437 | j += 1; 438 | 439 | if msb_doc_id_group >= doc_id_group { 440 | j -= 1; 441 | break; 442 | } 443 | 444 | if msb_values > 0 { 445 | r_packed.get_unchecked_mut(r_i).write(msb_pack); 446 | r_i += 1; 447 | } 448 | } 449 | 450 | // check to avoid writing elements where their values are 0 451 | let write = values > 0; 452 | if write { 453 | r_packed.get_unchecked_mut(r_i).write(pack); 454 | r_i += 1; 455 | } 456 | 457 | // avoids out of bounds read 458 | if j >= msb_packed.len() { 459 | continue; 460 | } 461 | 462 | // write the element from `msb_packed` that made the loop break 463 | // only if it's equal to the current `doc_id_group` 464 | let msb_pack = *msb_packed.get_unchecked(j); 465 | let msb_doc_id_group = clear_values(msb_pack); 466 | let msb_values = unpack_values(msb_pack); 467 | j += 1; 468 | if msb_doc_id_group != doc_id_group { 469 | j -= 1; 470 | continue; 471 | } 472 | 473 | if write { 474 | // in this case at least one bit was set in the intersection, 475 | // so we can just `or` the new value with the previous one 476 | let r = r_packed.get_unchecked_mut(r_i - 1).assume_init_mut(); 477 | *r |= msb_values as u64; 478 | } else if msb_values > 0 { 479 | // in this case no bit was set in the intersection, 480 | // so write as if it was new 481 | r_packed.get_unchecked_mut(r_i).write(msb_pack); 482 | r_i += 1; 483 | } 484 | } 485 | } 486 | stats 487 | .merge_phases_first_pass 488 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 489 | 490 | // finish the rest of the elements in `msb_packed` 491 | let b = std::time::Instant::now(); 492 | for msb_pack in msb_packed.iter().skip(j).copied() { 493 | unsafe { 494 | let msb_values = unpack_values(msb_pack); 495 | if msb_values > 0 { 496 | r_packed.get_unchecked_mut(r_i).write(msb_pack); 497 | r_i += 1; 498 | } 499 | } 500 | } 501 | stats 502 | .merge_phases_second_pass 503 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 504 | 505 | unsafe { 506 | let (p_packed, a0) = Box::into_raw_with_allocator(r_packed); 507 | let packed = Vec::from_raw_parts_in(p_packed as *mut _, r_i, capacity, a0); 508 | RoaringishPacked(packed) 509 | } 510 | } 511 | } 512 | 513 | impl<A> BorrowRoaringishPacked<'_, A> { 514 | /// Gets the distinct document IDs from the Roaringish Packed.
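/// For example, packed entries for `(doc 1, group 0)`, `(doc 1, group 2)` and `(doc 2, group 0)` yield `[1, 2]`; this relies on the packed data being sorted by document ID.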
515 | #[cfg(not(target_feature = "avx512f"))] 516 | #[inline(always)] 517 | pub fn get_doc_ids(&self, stats: &Stats) -> Vec<u32> { 518 | if self.0.is_empty() { 519 | return Vec::new(); 520 | } 521 | 522 | if self.0.len() == 1 { 523 | return vec![unpack_doc_id(self.0[0])]; 524 | } 525 | 526 | let b = std::time::Instant::now(); 527 | 528 | let mut doc_ids: Box<[MaybeUninit<u32>]> = Box::new_uninit_slice(self.0.len()); 529 | let mut i = 0; 530 | 531 | for [packed0, packed1] in self.0.array_windows::<2>() { 532 | let doc_id0 = unpack_doc_id(*packed0); 533 | let doc_id1 = unpack_doc_id(*packed1); 534 | if doc_id0 != doc_id1 { 535 | unsafe { doc_ids.get_unchecked_mut(i).write(doc_id0) }; 536 | i += 1; 537 | } 538 | } 539 | 540 | unsafe { 541 | doc_ids 542 | .get_unchecked_mut(i) 543 | .write(unpack_doc_id(*self.0.last().unwrap_unchecked())) 544 | }; 545 | i += 1; 546 | 547 | stats 548 | .get_doc_ids 549 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 550 | 551 | unsafe { Vec::from_raw_parts(Box::into_raw(doc_ids) as *mut _, i, self.0.len()) } 552 | } 553 | 554 | /// Gets the distinct document IDs from the Roaringish Packed. 555 | #[cfg(target_feature = "avx512f")] 556 | #[inline(always)] 557 | pub fn get_doc_ids(&self, stats: &Stats) -> Vec<u32> { 558 | if self.0.is_empty() { 559 | return Vec::new(); 560 | } 561 | 562 | if self.0.len() == 1 { 563 | return vec![unpack_doc_id(self.0[0])]; 564 | } 565 | 566 | let b = std::time::Instant::now(); 567 | 568 | let mut doc_ids: Box<[MaybeUninit<u32>]> = Box::new_uninit_slice(self.0.len()); 569 | let mut i = 0; 570 | 571 | unsafe { doc_ids.get_unchecked_mut(i).write(unpack_doc_id(self.0[0])) }; 572 | i += 1; 573 | 574 | let mut last_doc_id = unpack_doc_id(self.0[0]); 575 | let (l, m, r) = self.0.as_simd::<8>(); 576 | assert!(l.is_empty()); 577 | for packed in m { 578 | let doc_id = unpack_doc_id_simd(*packed); 579 | let rot = doc_id.rotate_elements_right::<1>(); 580 | let first = doc_id.as_array()[0]; 581 | let last = doc_id.as_array()[7]; 582 | 583 | let include_first = (first != last_doc_id) as u8; 584 | let mask = (doc_id.simd_ne(rot).to_bitmask() as u8 & !1) | include_first; 585 | 586 | unsafe { 587 | // TODO: avoid compressstore on zen4 588 | _mm256_mask_compressstoreu_epi32( 589 | doc_ids.as_mut_ptr().add(i) as *mut _, 590 | mask, 591 | doc_id.into(), 592 | ); 593 | } 594 | i += mask.count_ones() as usize; 595 | last_doc_id = last; 596 | } 597 | 598 | let j = r 599 | .iter() 600 | .take_while(|packed| unpack_doc_id(**packed) == last_doc_id) 601 | .count(); 602 | let r = &r[j..]; 603 | for [packed0, packed1] in r.array_windows::<2>() { 604 | let doc_id0 = unpack_doc_id(*packed0); 605 | let doc_id1 = unpack_doc_id(*packed1); 606 | if doc_id0 != doc_id1 { 607 | unsafe { doc_ids.get_unchecked_mut(i).write(doc_id0) }; 608 | i += 1; 609 | } 610 | } 611 | 612 | if !r.is_empty() { 613 | unsafe { 614 | doc_ids 615 | .get_unchecked_mut(i) 616 | .write(unpack_doc_id(*r.last().unwrap_unchecked())) 617 | }; 618 | i += 1; 619 | } 620 | 621 | stats 622 | .get_doc_ids 623 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 624 | 625 | unsafe { Vec::from_raw_parts(Box::into_raw(doc_ids) as *mut _, i, self.0.len()) } 626 | } 627 | } 628 | 629 | impl Binary for RoaringishPacked { 630 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 631 | let mut list = f.debug_list(); 632 | for packed in self.0.iter() { 633 | list.entry_with(|f| { 634 | let doc_id = unpack_doc_id(*packed); 635 | let group = unpack_group(*packed); 636 | let values = unpack_values(*packed);
637 | f.write_fmt(format_args!("{doc_id:032b} {group:016b} {values:016b}")) 638 | }); 639 | } 640 | 641 | list.finish() 642 | } 643 | } 644 | 645 | impl<A> Binary for BorrowRoaringishPacked<'_, A> { 646 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 647 | let mut list = f.debug_list(); 648 | for packed in self.0.iter() { 649 | list.entry_with(|f| { 650 | let doc_id = unpack_doc_id(*packed); 651 | let group = unpack_group(*packed); 652 | let values = unpack_values(*packed); 653 | f.write_fmt(format_args!("{doc_id:032b} {group:016b} {values:016b}")) 654 | }); 655 | } 656 | 657 | list.finish() 658 | } 659 | } 660 | 661 | impl Display for RoaringishPacked { 662 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 663 | let it = self.0.iter().flat_map(|packed| { 664 | let doc_id = unpack_doc_id(*packed); 665 | let group = unpack_group(*packed) as u32; 666 | let values = unpack_values(*packed); 667 | let s = group * 16; 668 | (0..16u32).filter_map(move |i| ((values >> i) & 1 == 1).then_some((doc_id, s + i))) 669 | }); 670 | f.debug_list().entries(it).finish() 671 | } 672 | } 673 | 674 | impl<A> Display for BorrowRoaringishPacked<'_, A> { 675 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 676 | let it = self.0.iter().flat_map(|packed| { 677 | let doc_id = unpack_doc_id(*packed); 678 | let group = unpack_group(*packed) as u32; 679 | let values = unpack_values(*packed); 680 | let s = group * 16; 681 | (0..16u32).filter_map(move |i| ((values >> i) & 1 == 1).then_some((doc_id, s + i))) 682 | }); 683 | f.debug_list().entries(it).finish() 684 | } 685 | } 686 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing.
3 | version = 4 4 | 5 | [[package]] 6 | name = "bincode" 7 | version = "1.3.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" 10 | dependencies = [ 11 | "serde", 12 | ] 13 | 14 | [[package]] 15 | name = "bitflags" 16 | version = "2.9.0" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" 19 | dependencies = [ 20 | "serde", 21 | ] 22 | 23 | [[package]] 24 | name = "bumpalo" 25 | version = "3.17.0" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" 28 | 29 | [[package]] 30 | name = "bytecheck" 31 | version = "0.8.1" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "50690fb3370fb9fe3550372746084c46f2ac8c9685c583d2be10eefd89d3d1a3" 34 | dependencies = [ 35 | "bytecheck_derive", 36 | "ptr_meta", 37 | "rancor", 38 | "simdutf8", 39 | ] 40 | 41 | [[package]] 42 | name = "bytecheck_derive" 43 | version = "0.8.1" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "efb7846e0cb180355c2dec69e721edafa36919850f1a9f52ffba4ebc0393cb71" 46 | dependencies = [ 47 | "proc-macro2", 48 | "quote", 49 | "syn", 50 | ] 51 | 52 | [[package]] 53 | name = "byteorder" 54 | version = "1.5.0" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 57 | 58 | [[package]] 59 | name = "bytes" 60 | version = "1.10.0" 61 | source = "registry+https://github.com/rust-lang/crates.io-index" 62 | checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" 63 | 64 | [[package]] 65 | name = "cc" 66 | version = "1.2.16" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" 69 | dependencies = [ 70 | "shlex", 71 | ] 72 | 73 | [[package]] 74 | name = "crossbeam-queue" 75 | version = "0.3.12" 76 | source = "registry+https://github.com/rust-lang/crates.io-index" 77 | checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" 78 | dependencies = [ 79 | "crossbeam-utils", 80 | ] 81 | 82 | [[package]] 83 | name = "crossbeam-utils" 84 | version = "0.8.21" 85 | source = "registry+https://github.com/rust-lang/crates.io-index" 86 | checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" 87 | 88 | [[package]] 89 | name = "displaydoc" 90 | version = "0.2.5" 91 | source = "registry+https://github.com/rust-lang/crates.io-index" 92 | checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" 93 | dependencies = [ 94 | "proc-macro2", 95 | "quote", 96 | "syn", 97 | ] 98 | 99 | [[package]] 100 | name = "doxygen-rs" 101 | version = "0.4.2" 102 | source = "registry+https://github.com/rust-lang/crates.io-index" 103 | checksum = "415b6ec780d34dcf624666747194393603d0373b7141eef01d12ee58881507d9" 104 | dependencies = [ 105 | "phf", 106 | ] 107 | 108 | [[package]] 109 | name = "equivalent" 110 | version = "1.0.2" 111 | source = "registry+https://github.com/rust-lang/crates.io-index" 112 | checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" 113 | 114 | [[package]] 115 | name = "form_urlencoded" 116 | version = "1.2.1" 117 | source = "registry+https://github.com/rust-lang/crates.io-index" 118 | 
checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" 119 | dependencies = [ 120 | "percent-encoding", 121 | ] 122 | 123 | [[package]] 124 | name = "fxhash" 125 | version = "0.2.1" 126 | source = "registry+https://github.com/rust-lang/crates.io-index" 127 | checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" 128 | dependencies = [ 129 | "byteorder", 130 | ] 131 | 132 | [[package]] 133 | name = "gxhash" 134 | version = "3.4.1" 135 | source = "registry+https://github.com/rust-lang/crates.io-index" 136 | checksum = "a197c9b654827513cf53842c5c6d3da2b4b35a785f8e0eff78bdf8e445aba1bb" 137 | dependencies = [ 138 | "rustversion", 139 | ] 140 | 141 | [[package]] 142 | name = "hashbrown" 143 | version = "0.15.2" 144 | source = "registry+https://github.com/rust-lang/crates.io-index" 145 | checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" 146 | 147 | [[package]] 148 | name = "heed" 149 | version = "0.21.0" 150 | source = "registry+https://github.com/rust-lang/crates.io-index" 151 | checksum = "bd54745cfacb7b97dee45e8fdb91814b62bccddb481debb7de0f9ee6b7bf5b43" 152 | dependencies = [ 153 | "bitflags", 154 | "byteorder", 155 | "heed-traits", 156 | "heed-types", 157 | "libc", 158 | "lmdb-master-sys", 159 | "once_cell", 160 | "page_size", 161 | "serde", 162 | "synchronoise", 163 | "url", 164 | ] 165 | 166 | [[package]] 167 | name = "heed-traits" 168 | version = "0.20.0" 169 | source = "registry+https://github.com/rust-lang/crates.io-index" 170 | checksum = "eb3130048d404c57ce5a1ac61a903696e8fcde7e8c2991e9fcfc1f27c3ef74ff" 171 | 172 | [[package]] 173 | name = "heed-types" 174 | version = "0.21.0" 175 | source = "registry+https://github.com/rust-lang/crates.io-index" 176 | checksum = "13c255bdf46e07fb840d120a36dcc81f385140d7191c76a7391672675c01a55d" 177 | dependencies = [ 178 | "bincode", 179 | "byteorder", 180 | "heed-traits", 181 | "serde", 182 | "serde_json", 183 | ] 184 | 185 | [[package]] 186 | name = "hyperloglogplus" 187 | version = "0.4.1" 188 | source = "registry+https://github.com/rust-lang/crates.io-index" 189 | checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3" 190 | dependencies = [ 191 | "serde", 192 | ] 193 | 194 | [[package]] 195 | name = "icu_collections" 196 | version = "1.5.0" 197 | source = "registry+https://github.com/rust-lang/crates.io-index" 198 | checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" 199 | dependencies = [ 200 | "displaydoc", 201 | "yoke", 202 | "zerofrom", 203 | "zerovec", 204 | ] 205 | 206 | [[package]] 207 | name = "icu_locid" 208 | version = "1.5.0" 209 | source = "registry+https://github.com/rust-lang/crates.io-index" 210 | checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" 211 | dependencies = [ 212 | "displaydoc", 213 | "litemap", 214 | "tinystr", 215 | "writeable", 216 | "zerovec", 217 | ] 218 | 219 | [[package]] 220 | name = "icu_locid_transform" 221 | version = "1.5.0" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" 224 | dependencies = [ 225 | "displaydoc", 226 | "icu_locid", 227 | "icu_locid_transform_data", 228 | "icu_provider", 229 | "tinystr", 230 | "zerovec", 231 | ] 232 | 233 | [[package]] 234 | name = "icu_locid_transform_data" 235 | version = "1.5.0" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 237 | checksum = 
"fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" 238 | 239 | [[package]] 240 | name = "icu_normalizer" 241 | version = "1.5.0" 242 | source = "registry+https://github.com/rust-lang/crates.io-index" 243 | checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" 244 | dependencies = [ 245 | "displaydoc", 246 | "icu_collections", 247 | "icu_normalizer_data", 248 | "icu_properties", 249 | "icu_provider", 250 | "smallvec", 251 | "utf16_iter", 252 | "utf8_iter", 253 | "write16", 254 | "zerovec", 255 | ] 256 | 257 | [[package]] 258 | name = "icu_normalizer_data" 259 | version = "1.5.0" 260 | source = "registry+https://github.com/rust-lang/crates.io-index" 261 | checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" 262 | 263 | [[package]] 264 | name = "icu_properties" 265 | version = "1.5.1" 266 | source = "registry+https://github.com/rust-lang/crates.io-index" 267 | checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" 268 | dependencies = [ 269 | "displaydoc", 270 | "icu_collections", 271 | "icu_locid_transform", 272 | "icu_properties_data", 273 | "icu_provider", 274 | "tinystr", 275 | "zerovec", 276 | ] 277 | 278 | [[package]] 279 | name = "icu_properties_data" 280 | version = "1.5.0" 281 | source = "registry+https://github.com/rust-lang/crates.io-index" 282 | checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" 283 | 284 | [[package]] 285 | name = "icu_provider" 286 | version = "1.5.0" 287 | source = "registry+https://github.com/rust-lang/crates.io-index" 288 | checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" 289 | dependencies = [ 290 | "displaydoc", 291 | "icu_locid", 292 | "icu_provider_macros", 293 | "stable_deref_trait", 294 | "tinystr", 295 | "writeable", 296 | "yoke", 297 | "zerofrom", 298 | "zerovec", 299 | ] 300 | 301 | [[package]] 302 | name = "icu_provider_macros" 303 | version = "1.5.0" 304 | source = "registry+https://github.com/rust-lang/crates.io-index" 305 | checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" 306 | dependencies = [ 307 | "proc-macro2", 308 | "quote", 309 | "syn", 310 | ] 311 | 312 | [[package]] 313 | name = "idna" 314 | version = "1.0.3" 315 | source = "registry+https://github.com/rust-lang/crates.io-index" 316 | checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" 317 | dependencies = [ 318 | "idna_adapter", 319 | "smallvec", 320 | "utf8_iter", 321 | ] 322 | 323 | [[package]] 324 | name = "idna_adapter" 325 | version = "1.2.0" 326 | source = "registry+https://github.com/rust-lang/crates.io-index" 327 | checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" 328 | dependencies = [ 329 | "icu_normalizer", 330 | "icu_properties", 331 | ] 332 | 333 | [[package]] 334 | name = "indexmap" 335 | version = "2.7.1" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" 338 | dependencies = [ 339 | "equivalent", 340 | "hashbrown", 341 | ] 342 | 343 | [[package]] 344 | name = "itoa" 345 | version = "1.0.15" 346 | source = "registry+https://github.com/rust-lang/crates.io-index" 347 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 348 | 349 | [[package]] 350 | name = "libc" 351 | version = "0.2.170" 352 | source = "registry+https://github.com/rust-lang/crates.io-index" 353 | checksum = 
"875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" 354 | 355 | [[package]] 356 | name = "litemap" 357 | version = "0.7.5" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856" 360 | 361 | [[package]] 362 | name = "lmdb-master-sys" 363 | version = "0.2.4" 364 | source = "registry+https://github.com/rust-lang/crates.io-index" 365 | checksum = "472c3760e2a8d0f61f322fb36788021bb36d573c502b50fa3e2bcaac3ec326c9" 366 | dependencies = [ 367 | "cc", 368 | "doxygen-rs", 369 | "libc", 370 | ] 371 | 372 | [[package]] 373 | name = "log" 374 | version = "0.4.26" 375 | source = "registry+https://github.com/rust-lang/crates.io-index" 376 | checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" 377 | 378 | [[package]] 379 | name = "memchr" 380 | version = "2.7.4" 381 | source = "registry+https://github.com/rust-lang/crates.io-index" 382 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 383 | 384 | [[package]] 385 | name = "memmap2" 386 | version = "0.9.5" 387 | source = "registry+https://github.com/rust-lang/crates.io-index" 388 | checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" 389 | dependencies = [ 390 | "libc", 391 | ] 392 | 393 | [[package]] 394 | name = "munge" 395 | version = "0.4.3" 396 | source = "registry+https://github.com/rust-lang/crates.io-index" 397 | checksum = "a0091202c98cf06da46c279fdf50cccb6b1c43b4521abdf6a27b4c7e71d5d9d7" 398 | dependencies = [ 399 | "munge_macro", 400 | ] 401 | 402 | [[package]] 403 | name = "munge_macro" 404 | version = "0.4.3" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "734799cf91479720b2f970c61a22850940dd91e27d4f02b1c6fc792778df2459" 407 | dependencies = [ 408 | "proc-macro2", 409 | "quote", 410 | "syn", 411 | ] 412 | 413 | [[package]] 414 | name = "once_cell" 415 | version = "1.20.3" 416 | source = "registry+https://github.com/rust-lang/crates.io-index" 417 | checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" 418 | 419 | [[package]] 420 | name = "page_size" 421 | version = "0.6.0" 422 | source = "registry+https://github.com/rust-lang/crates.io-index" 423 | checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da" 424 | dependencies = [ 425 | "libc", 426 | "winapi", 427 | ] 428 | 429 | [[package]] 430 | name = "percent-encoding" 431 | version = "2.3.1" 432 | source = "registry+https://github.com/rust-lang/crates.io-index" 433 | checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" 434 | 435 | [[package]] 436 | name = "phf" 437 | version = "0.11.3" 438 | source = "registry+https://github.com/rust-lang/crates.io-index" 439 | checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" 440 | dependencies = [ 441 | "phf_macros", 442 | "phf_shared", 443 | ] 444 | 445 | [[package]] 446 | name = "phf_generator" 447 | version = "0.11.3" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" 450 | dependencies = [ 451 | "phf_shared", 452 | "rand", 453 | ] 454 | 455 | [[package]] 456 | name = "phf_macros" 457 | version = "0.11.3" 458 | source = "registry+https://github.com/rust-lang/crates.io-index" 459 | checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" 460 | dependencies = [ 461 | "phf_generator", 462 | "phf_shared", 463 
| "proc-macro2", 464 | "quote", 465 | "syn", 466 | ] 467 | 468 | [[package]] 469 | name = "phf_shared" 470 | version = "0.11.3" 471 | source = "registry+https://github.com/rust-lang/crates.io-index" 472 | checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" 473 | dependencies = [ 474 | "siphasher", 475 | ] 476 | 477 | [[package]] 478 | name = "proc-macro2" 479 | version = "1.0.94" 480 | source = "registry+https://github.com/rust-lang/crates.io-index" 481 | checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" 482 | dependencies = [ 483 | "unicode-ident", 484 | ] 485 | 486 | [[package]] 487 | name = "ptr_meta" 488 | version = "0.3.0" 489 | source = "registry+https://github.com/rust-lang/crates.io-index" 490 | checksum = "fe9e76f66d3f9606f44e45598d155cb13ecf09f4a28199e48daf8c8fc937ea90" 491 | dependencies = [ 492 | "ptr_meta_derive", 493 | ] 494 | 495 | [[package]] 496 | name = "ptr_meta_derive" 497 | version = "0.3.0" 498 | source = "registry+https://github.com/rust-lang/crates.io-index" 499 | checksum = "ca414edb151b4c8d125c12566ab0d74dc9cdba36fb80eb7b848c15f495fd32d1" 500 | dependencies = [ 501 | "proc-macro2", 502 | "quote", 503 | "syn", 504 | ] 505 | 506 | [[package]] 507 | name = "quote" 508 | version = "1.0.39" 509 | source = "registry+https://github.com/rust-lang/crates.io-index" 510 | checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" 511 | dependencies = [ 512 | "proc-macro2", 513 | ] 514 | 515 | [[package]] 516 | name = "rancor" 517 | version = "0.1.0" 518 | source = "registry+https://github.com/rust-lang/crates.io-index" 519 | checksum = "caf5f7161924b9d1cea0e4cabc97c372cea92b5f927fc13c6bca67157a0ad947" 520 | dependencies = [ 521 | "ptr_meta", 522 | ] 523 | 524 | [[package]] 525 | name = "rand" 526 | version = "0.8.5" 527 | source = "registry+https://github.com/rust-lang/crates.io-index" 528 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 529 | dependencies = [ 530 | "rand_core", 531 | ] 532 | 533 | [[package]] 534 | name = "rand_core" 535 | version = "0.6.4" 536 | source = "registry+https://github.com/rust-lang/crates.io-index" 537 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 538 | 539 | [[package]] 540 | name = "rend" 541 | version = "0.5.2" 542 | source = "registry+https://github.com/rust-lang/crates.io-index" 543 | checksum = "a35e8a6bf28cd121053a66aa2e6a2e3eaffad4a60012179f0e864aa5ffeff215" 544 | dependencies = [ 545 | "bytecheck", 546 | ] 547 | 548 | [[package]] 549 | name = "rkyv" 550 | version = "0.8.10" 551 | source = "registry+https://github.com/rust-lang/crates.io-index" 552 | checksum = "1e147371c75553e1e2fcdb483944a8540b8438c31426279553b9a8182a9b7b65" 553 | dependencies = [ 554 | "bytecheck", 555 | "bytes", 556 | "hashbrown", 557 | "indexmap", 558 | "munge", 559 | "ptr_meta", 560 | "rancor", 561 | "rend", 562 | "rkyv_derive", 563 | "tinyvec", 564 | "uuid", 565 | ] 566 | 567 | [[package]] 568 | name = "rkyv_derive" 569 | version = "0.8.10" 570 | source = "registry+https://github.com/rust-lang/crates.io-index" 571 | checksum = "246b40ac189af6c675d124b802e8ef6d5246c53e17367ce9501f8f66a81abb7a" 572 | dependencies = [ 573 | "proc-macro2", 574 | "quote", 575 | "syn", 576 | ] 577 | 578 | [[package]] 579 | name = "rustversion" 580 | version = "1.0.20" 581 | source = "registry+https://github.com/rust-lang/crates.io-index" 582 | checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" 583 | 584 | [[package]] 585 | 
name = "ryu" 586 | version = "1.0.20" 587 | source = "registry+https://github.com/rust-lang/crates.io-index" 588 | checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 589 | 590 | [[package]] 591 | name = "serde" 592 | version = "1.0.218" 593 | source = "registry+https://github.com/rust-lang/crates.io-index" 594 | checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" 595 | dependencies = [ 596 | "serde_derive", 597 | ] 598 | 599 | [[package]] 600 | name = "serde_derive" 601 | version = "1.0.218" 602 | source = "registry+https://github.com/rust-lang/crates.io-index" 603 | checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" 604 | dependencies = [ 605 | "proc-macro2", 606 | "quote", 607 | "syn", 608 | ] 609 | 610 | [[package]] 611 | name = "serde_json" 612 | version = "1.0.140" 613 | source = "registry+https://github.com/rust-lang/crates.io-index" 614 | checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" 615 | dependencies = [ 616 | "itoa", 617 | "memchr", 618 | "ryu", 619 | "serde", 620 | ] 621 | 622 | [[package]] 623 | name = "shlex" 624 | version = "1.3.0" 625 | source = "registry+https://github.com/rust-lang/crates.io-index" 626 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 627 | 628 | [[package]] 629 | name = "simdphrase" 630 | version = "0.1.1" 631 | dependencies = [ 632 | "bumpalo", 633 | "fxhash", 634 | "gxhash", 635 | "heed", 636 | "hyperloglogplus", 637 | "log", 638 | "memmap2", 639 | "rkyv", 640 | "thiserror", 641 | "unicode-segmentation", 642 | ] 643 | 644 | [[package]] 645 | name = "simdutf8" 646 | version = "0.1.5" 647 | source = "registry+https://github.com/rust-lang/crates.io-index" 648 | checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" 649 | 650 | [[package]] 651 | name = "siphasher" 652 | version = "1.0.1" 653 | source = "registry+https://github.com/rust-lang/crates.io-index" 654 | checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" 655 | 656 | [[package]] 657 | name = "smallvec" 658 | version = "1.14.0" 659 | source = "registry+https://github.com/rust-lang/crates.io-index" 660 | checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" 661 | 662 | [[package]] 663 | name = "stable_deref_trait" 664 | version = "1.2.0" 665 | source = "registry+https://github.com/rust-lang/crates.io-index" 666 | checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" 667 | 668 | [[package]] 669 | name = "syn" 670 | version = "2.0.99" 671 | source = "registry+https://github.com/rust-lang/crates.io-index" 672 | checksum = "e02e925281e18ffd9d640e234264753c43edc62d64b2d4cf898f1bc5e75f3fc2" 673 | dependencies = [ 674 | "proc-macro2", 675 | "quote", 676 | "unicode-ident", 677 | ] 678 | 679 | [[package]] 680 | name = "synchronoise" 681 | version = "1.0.1" 682 | source = "registry+https://github.com/rust-lang/crates.io-index" 683 | checksum = "3dbc01390fc626ce8d1cffe3376ded2b72a11bb70e1c75f404a210e4daa4def2" 684 | dependencies = [ 685 | "crossbeam-queue", 686 | ] 687 | 688 | [[package]] 689 | name = "synstructure" 690 | version = "0.13.1" 691 | source = "registry+https://github.com/rust-lang/crates.io-index" 692 | checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" 693 | dependencies = [ 694 | "proc-macro2", 695 | "quote", 696 | "syn", 697 | ] 698 | 699 | [[package]] 700 | name = "thiserror" 701 | version = "2.0.12" 702 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 703 | checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" 704 | dependencies = [ 705 | "thiserror-impl", 706 | ] 707 | 708 | [[package]] 709 | name = "thiserror-impl" 710 | version = "2.0.12" 711 | source = "registry+https://github.com/rust-lang/crates.io-index" 712 | checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" 713 | dependencies = [ 714 | "proc-macro2", 715 | "quote", 716 | "syn", 717 | ] 718 | 719 | [[package]] 720 | name = "tinystr" 721 | version = "0.7.6" 722 | source = "registry+https://github.com/rust-lang/crates.io-index" 723 | checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" 724 | dependencies = [ 725 | "displaydoc", 726 | "zerovec", 727 | ] 728 | 729 | [[package]] 730 | name = "tinyvec" 731 | version = "1.9.0" 732 | source = "registry+https://github.com/rust-lang/crates.io-index" 733 | checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71" 734 | dependencies = [ 735 | "tinyvec_macros", 736 | ] 737 | 738 | [[package]] 739 | name = "tinyvec_macros" 740 | version = "0.1.1" 741 | source = "registry+https://github.com/rust-lang/crates.io-index" 742 | checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" 743 | 744 | [[package]] 745 | name = "unicode-ident" 746 | version = "1.0.18" 747 | source = "registry+https://github.com/rust-lang/crates.io-index" 748 | checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" 749 | 750 | [[package]] 751 | name = "unicode-segmentation" 752 | version = "1.12.0" 753 | source = "registry+https://github.com/rust-lang/crates.io-index" 754 | checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" 755 | 756 | [[package]] 757 | name = "url" 758 | version = "2.5.4" 759 | source = "registry+https://github.com/rust-lang/crates.io-index" 760 | checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" 761 | dependencies = [ 762 | "form_urlencoded", 763 | "idna", 764 | "percent-encoding", 765 | ] 766 | 767 | [[package]] 768 | name = "utf16_iter" 769 | version = "1.0.5" 770 | source = "registry+https://github.com/rust-lang/crates.io-index" 771 | checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" 772 | 773 | [[package]] 774 | name = "utf8_iter" 775 | version = "1.0.4" 776 | source = "registry+https://github.com/rust-lang/crates.io-index" 777 | checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" 778 | 779 | [[package]] 780 | name = "uuid" 781 | version = "1.15.1" 782 | source = "registry+https://github.com/rust-lang/crates.io-index" 783 | checksum = "e0f540e3240398cce6128b64ba83fdbdd86129c16a3aa1a3a252efd66eb3d587" 784 | 785 | [[package]] 786 | name = "winapi" 787 | version = "0.3.9" 788 | source = "registry+https://github.com/rust-lang/crates.io-index" 789 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 790 | dependencies = [ 791 | "winapi-i686-pc-windows-gnu", 792 | "winapi-x86_64-pc-windows-gnu", 793 | ] 794 | 795 | [[package]] 796 | name = "winapi-i686-pc-windows-gnu" 797 | version = "0.4.0" 798 | source = "registry+https://github.com/rust-lang/crates.io-index" 799 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 800 | 801 | [[package]] 802 | name = "winapi-x86_64-pc-windows-gnu" 803 | version = "0.4.0" 804 | source = "registry+https://github.com/rust-lang/crates.io-index" 805 | checksum = 
"712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 806 | 807 | [[package]] 808 | name = "write16" 809 | version = "1.0.0" 810 | source = "registry+https://github.com/rust-lang/crates.io-index" 811 | checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" 812 | 813 | [[package]] 814 | name = "writeable" 815 | version = "0.5.5" 816 | source = "registry+https://github.com/rust-lang/crates.io-index" 817 | checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" 818 | 819 | [[package]] 820 | name = "yoke" 821 | version = "0.7.5" 822 | source = "registry+https://github.com/rust-lang/crates.io-index" 823 | checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" 824 | dependencies = [ 825 | "serde", 826 | "stable_deref_trait", 827 | "yoke-derive", 828 | "zerofrom", 829 | ] 830 | 831 | [[package]] 832 | name = "yoke-derive" 833 | version = "0.7.5" 834 | source = "registry+https://github.com/rust-lang/crates.io-index" 835 | checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" 836 | dependencies = [ 837 | "proc-macro2", 838 | "quote", 839 | "syn", 840 | "synstructure", 841 | ] 842 | 843 | [[package]] 844 | name = "zerofrom" 845 | version = "0.1.6" 846 | source = "registry+https://github.com/rust-lang/crates.io-index" 847 | checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" 848 | dependencies = [ 849 | "zerofrom-derive", 850 | ] 851 | 852 | [[package]] 853 | name = "zerofrom-derive" 854 | version = "0.1.6" 855 | source = "registry+https://github.com/rust-lang/crates.io-index" 856 | checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" 857 | dependencies = [ 858 | "proc-macro2", 859 | "quote", 860 | "syn", 861 | "synstructure", 862 | ] 863 | 864 | [[package]] 865 | name = "zerovec" 866 | version = "0.10.4" 867 | source = "registry+https://github.com/rust-lang/crates.io-index" 868 | checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" 869 | dependencies = [ 870 | "yoke", 871 | "zerofrom", 872 | "zerovec-derive", 873 | ] 874 | 875 | [[package]] 876 | name = "zerovec-derive" 877 | version = "0.10.3" 878 | source = "registry+https://github.com/rust-lang/crates.io-index" 879 | checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" 880 | dependencies = [ 881 | "proc-macro2", 882 | "quote", 883 | "syn", 884 | ] 885 | -------------------------------------------------------------------------------- /src/db.rs: -------------------------------------------------------------------------------- 1 | use bumpalo::Bump; 2 | use gxhash::{HashMap as GxHashMap, HashMapExt}; 3 | use heed::{ 4 | Database, DatabaseFlags, Env, EnvFlags, EnvOpenOptions, PutFlags, RoTxn, RwTxn, Unspecified, 5 | types::Str, 6 | }; 7 | use memmap2::{Mmap, MmapMut}; 8 | use rkyv::{ 9 | Archive, Archived, Deserialize, Serialize, 10 | api::high::HighSerializer, 11 | de::Pool, 12 | deserialize, 13 | rancor::Strategy, 14 | ser::{allocator::ArenaHandle, writer::IoWriter}, 15 | util::AlignedVec, 16 | with::InlineAsBox, 17 | }; 18 | use std::{ 19 | cmp::Reverse, 20 | collections::{BinaryHeap, HashSet, hash_map::Entry}, 21 | fmt::Debug, 22 | fs::File, 23 | hash::Hash, 24 | io::BufWriter, 25 | num::NonZero, 26 | ops::Index, 27 | path::Path, 28 | sync::atomic::Ordering::Relaxed, 29 | }; 30 | 31 | use crate::{ 32 | BorrowRoaringishPacked, Intersection, RoaringishPacked, 33 | codecs::{NativeU32, ZeroCopyCodec}, 34 | error::{DbError, GetDocumentError, 
SearchError}, 35 | normalize, 36 | roaringish::{Aligned, ArchivedBorrowRoaringishPacked, RoaringishPackedKind, Unaligned}, 37 | stats::Stats, 38 | tokenize, 39 | }; 40 | 41 | struct Tokens { 42 | tokens: String, 43 | positions: Vec<(usize, usize)>, 44 | } 45 | 46 | impl Tokens { 47 | fn new(q: &str) -> Self { 48 | let q = normalize(q); 49 | let mut start = 0; 50 | let mut tokens = String::with_capacity(q.len() + 1); 51 | let mut positions = Vec::with_capacity(q.len() + 1); 52 | 53 | for token in tokenize(&q) { 54 | tokens.push_str(token); 55 | tokens.push(' '); 56 | 57 | let b = start; 58 | let e = b + token.len(); 59 | start = e + 1; 60 | positions.push((b, e)); 61 | } 62 | tokens.pop(); 63 | 64 | Self { tokens, positions } 65 | } 66 | 67 | fn as_ref(&self) -> RefTokens { 68 | RefTokens { 69 | tokens: &self.tokens, 70 | positions: &self.positions, 71 | } 72 | } 73 | } 74 | 75 | #[derive(Clone, Copy)] 76 | struct RefTokens<'a> { 77 | tokens: &'a str, 78 | positions: &'a [(usize, usize)], 79 | } 80 | 81 | impl RefTokens<'_> { 82 | fn len(&self) -> usize { 83 | self.positions.len() 84 | } 85 | 86 | fn is_empty(&self) -> bool { 87 | self.len() == 0 88 | } 89 | 90 | fn reserve_len(&self) -> usize { 91 | let n = MAX_WINDOW_LEN.get(); 92 | let l = self.len(); 93 | n * (l.max(n) - n + 1) + ((n - 1) * n) / 2 94 | } 95 | 96 | fn first(&self) -> Option<&str> { 97 | self.positions 98 | .first() 99 | .map(|(b, e)| unsafe { self.tokens.get_unchecked(*b..*e) }) 100 | } 101 | 102 | fn ref_token_iter(&self) -> impl Iterator<Item = Self> + '_ { 103 | (0..self.positions.len()).map(|i| Self { 104 | tokens: self.tokens, 105 | positions: &self.positions[i..i + 1], 106 | }) 107 | } 108 | 109 | fn iter(&self) -> impl Iterator<Item = &str> { 110 | self.positions 111 | .iter() 112 | .map(|(b, e)| unsafe { self.tokens.get_unchecked(*b..*e) }) 113 | } 114 | 115 | fn range(&self) -> (usize, usize) { 116 | let (b, _) = self.positions.first().unwrap_or(&(0, 0)); 117 | let (_, e) = self.positions.last().unwrap_or(&(0, 0)); 118 | (*b, *e) 119 | } 120 | 121 | fn tokens(&self) -> &str { 122 | let (b, e) = self.range(); 123 | unsafe { self.tokens.get_unchecked(b..e) } 124 | } 125 | 126 | fn split_at(&self, i: usize) -> (Self, Self) { 127 | let (l, r) = self.positions.split_at(i); 128 | ( 129 | Self { 130 | tokens: self.tokens, 131 | positions: l, 132 | }, 133 | Self { 134 | tokens: self.tokens, 135 | positions: r, 136 | }, 137 | ) 138 | } 139 | } 140 | 141 | impl PartialEq for RefTokens<'_> { 142 | fn eq(&self, other: &Self) -> bool { 143 | let t0 = self.tokens(); 144 | let t1 = other.tokens(); 145 | t0 == t1 146 | } 147 | } 148 | 149 | impl Eq for RefTokens<'_> {} 150 | 151 | impl Hash for RefTokens<'_> { 152 | fn hash<H: std::hash::Hasher>(&self, state: &mut H) { 153 | self.tokens().hash(state); 154 | } 155 | } 156 | 157 | impl Index<usize> for RefTokens<'_> { 158 | type Output = str; 159 | 160 | fn index(&self, index: usize) -> &Self::Output { 161 | let (b, e) = self.positions[index]; 162 | unsafe { self.tokens.get_unchecked(b..e) } 163 | } 164 | } 165 | 166 | impl Debug for RefTokens<'_> { 167 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 168 | f.debug_struct("RefTokens") 169 | .field("tokens", &self.tokens()) 170 | .field("positions", &self.positions) 171 | .finish() 172 | } 173 | } 174 | 175 | #[derive(Clone, Copy, Debug)] 176 | struct RefTokenLinkedList<'a, 'alloc> { 177 | tokens: RefTokens<'a>, 178 | next: Option<&'alloc RefTokenLinkedList<'a, 'alloc>>, 179 | } 180 | 181 | impl<'a, 'alloc> RefTokenLinkedList<'a, 'alloc> { 182 | fn iter<'b:
'alloc>(&'b self) -> RefTokenLinkedListIter<'a, 'alloc> { 183 | RefTokenLinkedListIter(Some(self)) 184 | } 185 | } 186 | 187 | struct RefTokenLinkedListIter<'a, 'alloc>(Option<&'alloc RefTokenLinkedList<'a, 'alloc>>); 188 | impl<'a, 'alloc> Iterator for RefTokenLinkedListIter<'a, 'alloc> { 189 | type Item = &'alloc RefTokens<'a>; 190 | 191 | fn next(&mut self) -> Option<Self::Item> { 192 | match self.0 { 193 | Some(linked_list) => { 194 | self.0 = linked_list.next; 195 | Some(&linked_list.tokens) 196 | } 197 | None => None, 198 | } 199 | } 200 | } 201 | 202 | #[derive(Archive, Serialize, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] 203 | struct BorrowStr<'a>(#[rkyv(with = InlineAsBox)] &'a str); 204 | 205 | mod db_constants { 206 | pub const DB_DOC_ID_TO_DOCUMENT: &str = "doc_id_to_document"; 207 | pub const DB_TOKEN_TO_OFFSETS: &str = "token_to_offsets"; 208 | pub const KEY_COMMON_TOKENS: &str = "common_tokens"; 209 | pub const FILE_ROARINGISH_PACKED: &str = "roaringish_packed"; 210 | pub const TEMP_FILE_TOKEN_TO_PACKED: &str = "temp_token_to_packed"; 211 | } 212 | 213 | pub const MAX_WINDOW_LEN: NonZero<usize> = unsafe { NonZero::new_unchecked(3) }; 214 | 215 | #[derive(Debug, Serialize, Archive)] 216 | struct Offset { 217 | begin: u64, 218 | len: u64, 219 | } 220 | 221 | /// Represents all types that can be stored in the database. 222 | /// 223 | /// This basically means that the type must be serializable by [rkyv]. 224 | pub trait Document: 225 | for<'a> Serialize<HighSerializer<AlignedVec, ArenaHandle<'a>, rkyv::rancor::Error>> 226 | + Archive 227 | + 'static 228 | { 229 | } 230 | impl<D> Document for D where 231 | Self: for<'a> Serialize<HighSerializer<AlignedVec, ArenaHandle<'a>, rkyv::rancor::Error>> 232 | + Archive 233 | + 'static 234 | { 235 | } 236 | 237 | pub struct DB<D: Document> { 238 | pub env: Env, 239 | db_main: Database<Unspecified, Unspecified>, 240 | db_doc_id_to_document: Database<NativeU32, ZeroCopyCodec<D>>, 241 | db_token_to_offsets: Database<Str, ZeroCopyCodec<Offset>>, 242 | } 243 | 244 | unsafe impl<D: Document> Send for DB<D> {} 245 | 246 | unsafe impl<D: Document> Sync for DB<D> {} 247 | 248 | impl<D: Document> DB<D> { 249 | pub fn truncate<P: AsRef<Path>>(path: P, db_size: usize) -> Result<Self, DbError> { 250 | let path = path.as_ref(); 251 | let _ = std::fs::remove_dir_all(path); 252 | std::fs::create_dir_all(path)?; 253 | 254 | let env = unsafe { 255 | EnvOpenOptions::new() 256 | .max_dbs(2) 257 | .map_size(db_size) 258 | .flags(EnvFlags::WRITE_MAP | EnvFlags::MAP_ASYNC) 259 | .open(path)?
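// WRITE_MAP | MAP_ASYNC makes LMDB write through a writable memory map
// with asynchronous flushes: much faster bulk indexing, at the cost of
// durability if the process dies mid-write. `map_size` is a fixed upper
// bound that LMDB never grows on its own, so callers should be generous
// when sizing it; a hypothetical caller (`MyDoc` standing in for any
// type implementing `Document`) might do:
//
//     let db = DB::<MyDoc>::truncate("./index", 10 << 30)?; // 10 GiB map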
260 | }; 261 | 262 | let mut wrtxn = env.write_txn()?; 263 | 264 | let db_main = env.create_database(&mut wrtxn, None)?; 265 | 266 | let db_doc_id_to_document = env 267 | .database_options() 268 | .types::<NativeU32, ZeroCopyCodec<D>>() 269 | .flags(DatabaseFlags::REVERSE_KEY) 270 | .name(db_constants::DB_DOC_ID_TO_DOCUMENT) 271 | .create(&mut wrtxn)?; 272 | 273 | let db_token_to_offsets = 274 | env.create_database(&mut wrtxn, Some(db_constants::DB_TOKEN_TO_OFFSETS))?; 275 | 276 | wrtxn.commit()?; 277 | 278 | Ok(Self { 279 | env, 280 | db_main, 281 | db_doc_id_to_document, 282 | db_token_to_offsets, 283 | }) 284 | } 285 | 286 | pub fn write_doc_id_to_document( 287 | &self, 288 | rwtxn: &mut RwTxn, 289 | doc_ids: &[u32], 290 | documents: &[D], 291 | ) -> Result<(), DbError> { 292 | log::debug!("Writing documents"); 293 | let b = std::time::Instant::now(); 294 | for (doc_id, document) in doc_ids.iter().zip(documents.iter()) { 295 | self.db_doc_id_to_document 296 | .put_with_flags(rwtxn, PutFlags::APPEND, doc_id, document)?; 297 | } 298 | log::debug!("Writing documents took {:?}", b.elapsed()); 299 | Ok(()) 300 | } 301 | 302 | pub fn write_token_to_roaringish_packed( 303 | &self, 304 | token_to_token_id: &GxHashMap<Box<str>, u32>, 305 | token_id_to_roaringish_packed: &[RoaringishPacked], 306 | mmap_size: &mut usize, 307 | batch_id: u32, 308 | ) -> Result<(), DbError> { 309 | log::debug!("Writing token to roaringish packed"); 310 | let b = std::time::Instant::now(); 311 | let mut token_to_packed: Vec<_> = token_to_token_id 312 | .iter() 313 | .map(|(token, token_id)| { 314 | let packed = &token_id_to_roaringish_packed[*token_id as usize]; 315 | *mmap_size += packed.size_bytes(); 316 | (BorrowStr(token), BorrowRoaringishPacked::new(packed)) 317 | }) 318 | .collect(); 319 | token_to_packed.sort_unstable_by(|(token0, _), (token1, _)| token0.cmp(token1)); 320 | 321 | let file_name = format!("{}_{batch_id}", db_constants::TEMP_FILE_TOKEN_TO_PACKED); 322 | let file = IoWriter::new(BufWriter::new( 323 | File::options() 324 | .create(true) 325 | .truncate(true) 326 | .read(true) 327 | .write(true) 328 | .open(self.env.path().join(file_name))?, 329 | )); 330 | rkyv::api::high::to_bytes_in::<_, rkyv::rancor::Error>(&token_to_packed, file)?; 331 | log::debug!("Writing token to roaringish packed took {:?}", b.elapsed()); 332 | Ok(()) 333 | } 334 | 335 | pub fn generate_mmap_file( 336 | &self, 337 | number_of_distinct_tokens: u64, 338 | mmap_size: usize, 339 | number_of_batches: u32, 340 | rwtxn: &mut RwTxn, 341 | ) -> Result<(), DbError> { 342 | #[inline(always)] 343 | unsafe fn write_to_mmap<const N: usize>( 344 | mmap: &mut MmapMut, 345 | mmap_offset: &mut usize, 346 | bytes: &[u8], 347 | ) -> Offset { 348 | unsafe { 349 | let ptr = mmap.as_ptr().add(*mmap_offset); 350 | let offset = ptr.align_offset(N); 351 | 352 | *mmap_offset += offset; 353 | mmap[*mmap_offset..*mmap_offset + bytes.len()].copy_from_slice(bytes); 354 | 355 | let begin = *mmap_offset; 356 | *mmap_offset += bytes.len(); 357 | Offset { 358 | begin: begin as u64, 359 | len: bytes.len() as u64, 360 | } 361 | } 362 | } 363 | 364 | log::info!("Merging roaringish packed files to generate the final memory map file"); 365 | let b = std::time::Instant::now(); 366 | let file = File::options() 367 | .create(true) 368 | .truncate(true) 369 | .read(true) 370 | .write(true) 371 | .open(self.env.path().join(db_constants::FILE_ROARINGISH_PACKED))?; 372 | let final_size = mmap_size as u64 + (number_of_distinct_tokens * 64); 373 | log::debug!("Creating file with size: {} bytes", final_size);
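// Every token's packed postings are aligned to a 64 byte boundary by
// write_to_mmap::<64> below, so the worst case adds up to 64 bytes of
// padding per distinct token; that is exactly the slack reserved by
// `final_size` above.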
374 | file.set_len(final_size)?; 375 | let mut mmap = unsafe { MmapMut::map_mut(&file)? }; 376 | let mut mmap_offset = 0; 377 | 378 | // we need to do this in 3 steps because of the borrow checker 379 | let files_mmaps = (0..number_of_batches) 380 | .map(|i| -> Result<Mmap, DbError> { 381 | let file_name = format!("{}_{i}", db_constants::TEMP_FILE_TOKEN_TO_PACKED); 382 | let file = File::options() 383 | .read(true) 384 | .open(self.env.path().join(file_name))?; 385 | unsafe { Ok(Mmap::map(&file)?) } 386 | }) 387 | .collect::<Result<Vec<_>, DbError>>()?; 388 | let files_data: Vec<_> = files_mmaps 389 | .iter() 390 | .map(|mmap| unsafe { 391 | rkyv::access_unchecked::< 392 | Archived<Vec<(BorrowStr<'_>, BorrowRoaringishPacked<'_, Unaligned>)>>, 393 | >(mmap) 394 | }) 395 | .collect(); 396 | let mut iters: Vec<_> = files_data 397 | .iter() 398 | .map(|tokens_to_packeds| tokens_to_packeds.iter()) 399 | .collect(); 400 | 401 | struct ToMerge<'a> { 402 | token: &'a ArchivedBorrowStr<'a>, 403 | packed: &'a ArchivedBorrowRoaringishPacked<'a, Unaligned>, 404 | i: usize, 405 | } 406 | impl PartialEq for ToMerge<'_> { 407 | fn eq(&self, other: &Self) -> bool { 408 | self.token.0 == other.token.0 && self.i == other.i 409 | } 410 | } 411 | impl Eq for ToMerge<'_> {} 412 | impl PartialOrd for ToMerge<'_> { 413 | fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { 414 | Some(self.cmp(other)) 415 | } 416 | } 417 | impl Ord for ToMerge<'_> { 418 | fn cmp(&self, other: &Self) -> std::cmp::Ordering { 419 | match self.token.0.cmp(&other.token.0) { 420 | std::cmp::Ordering::Equal => self.i.cmp(&other.i), 421 | ord => ord, 422 | } 423 | } 424 | } 425 | 426 | let mut heap = BinaryHeap::new(); 427 | for (i, it) in iters.iter_mut().enumerate() { 428 | if let Some(token_to_packed) = it.next() { 429 | heap.push(Reverse(ToMerge { 430 | token: &token_to_packed.0, 431 | packed: &token_to_packed.1, 432 | i, 433 | })) 434 | } 435 | } 436 | 437 | while let Some(token_to_packed) = heap.pop() { 438 | let to_merge = token_to_packed.0; 439 | if let Some(token_to_packed) = iters[to_merge.i].next() { 440 | heap.push(Reverse(ToMerge { 441 | token: &token_to_packed.0, 442 | packed: &token_to_packed.1, 443 | i: to_merge.i, 444 | })); 445 | } 446 | 447 | let mut packed_kind = RoaringishPackedKind::Archived(to_merge.packed); 448 | loop { 449 | let Some(next_to_merge) = heap.peek() else { 450 | break; 451 | }; 452 | 453 | if next_to_merge.0.token.0 != to_merge.token.0 { 454 | break; 455 | } 456 | 457 | // This pop can't fail because we peeked before 458 | let next_to_merge = heap.pop().unwrap().0; 459 | if let Some(token_to_packed) = iters[next_to_merge.i].next() { 460 | heap.push(Reverse(ToMerge { 461 | token: &token_to_packed.0, 462 | packed: &token_to_packed.1, 463 | i: next_to_merge.i, 464 | })); 465 | } 466 | 467 | let next_to_merge_kind = RoaringishPackedKind::Archived(next_to_merge.packed); 468 | packed_kind = packed_kind.concat(next_to_merge_kind); 469 | } 470 | 471 | if to_merge.token.0.len() > 511 { 472 | continue; 473 | } 474 | 475 | let packed = packed_kind.as_bytes(); 476 | let offset = unsafe { write_to_mmap::<64>(&mut mmap, &mut mmap_offset, packed) }; 477 | self.db_token_to_offsets.put_with_flags( 478 | rwtxn, 479 | PutFlags::APPEND, 480 | &to_merge.token.0, 481 | &offset, 482 | )?; 483 | } 484 | 485 | drop(iters); 486 | drop(files_data); 487 | drop(files_mmaps); 488 | 489 | log::debug!("Finished merging roaringish packed files"); 490 | log::debug!("Removing old files"); 491 | for i in 0..number_of_batches { 492 | let file_name = format!("{}_{i}",
db_constants::TEMP_FILE_TOKEN_TO_PACKED); 493 | std::fs::remove_file(self.env.path().join(file_name))?; 494 | } 495 | log::info!("Whole merging process took {:?}", b.elapsed()); 496 | 497 | Ok(()) 498 | } 499 | 500 | fn read_common_tokens( 501 | rotxn: &RoTxn, 502 | db_main: Database<Unspecified, Unspecified>, 503 | ) -> Result<HashSet<Box<str>>, DbError> { 504 | let k = db_main 505 | .remap_types::<Str, ZeroCopyCodec<HashSet<Box<str>>>>() 506 | .get(rotxn, db_constants::KEY_COMMON_TOKENS)? 507 | .ok_or_else(|| { 508 | DbError::KeyNotFound( 509 | db_constants::KEY_COMMON_TOKENS.to_string(), 510 | "main".to_string(), 511 | ) 512 | })?; 513 | 514 | Ok(deserialize::<_, rkyv::rancor::Error>(k)?) 515 | } 516 | 517 | pub fn write_common_tokens( 518 | &self, 519 | rwtxn: &mut RwTxn, 520 | common_tokens: &HashSet<Box<str>>, 521 | ) -> Result<(), DbError> { 522 | log::debug!("Writing common tokens"); 523 | let b = std::time::Instant::now(); 524 | self.db_main 525 | .remap_types::<Str, ZeroCopyCodec<HashSet<Box<str>>>>() 526 | .put(rwtxn, db_constants::KEY_COMMON_TOKENS, common_tokens)?; 527 | log::debug!("Writing common tokens took {:?}", b.elapsed()); 528 | Ok(()) 529 | } 530 | 531 | pub fn open<P: AsRef<Path>>(path: P) -> Result<(Self, HashSet<Box<str>>, Mmap), DbError> { 532 | let path = path.as_ref(); 533 | let env = unsafe { 534 | EnvOpenOptions::new() 535 | .max_dbs(2) 536 | .flags(EnvFlags::READ_ONLY) 537 | .open(path)? 538 | }; 539 | 540 | let rotxn = env.read_txn()?; 541 | 542 | let db_main = env 543 | .open_database(&rotxn, None)? 544 | .ok_or_else(|| DbError::DatabaseError("main".to_string()))?; 545 | 546 | let db_doc_id_to_document = env 547 | .database_options() 548 | .types::<NativeU32, ZeroCopyCodec<D>>() 549 | .flags(DatabaseFlags::REVERSE_KEY) 550 | .name(db_constants::DB_DOC_ID_TO_DOCUMENT) 551 | .open(&rotxn)? 552 | .ok_or_else(|| { 553 | DbError::DatabaseError(db_constants::DB_DOC_ID_TO_DOCUMENT.to_string()) 554 | })?; 555 | 556 | let db_token_to_offsets = env 557 | .open_database(&rotxn, Some(db_constants::DB_TOKEN_TO_OFFSETS))? 558 | .ok_or_else(|| DbError::DatabaseError(db_constants::DB_TOKEN_TO_OFFSETS.to_string()))?; 559 | 560 | let common_tokens = Self::read_common_tokens(&rotxn, db_main)?; 561 | 562 | rotxn.commit()?; 563 | 564 | let mmap_file = File::open(path.join(db_constants::FILE_ROARINGISH_PACKED))?; 565 | let mmap = unsafe { Mmap::map(&mmap_file)?
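// This maps the merged postings file written by generate_mmap_file.
// Searches resolve a token through db_token_to_offsets and then read
// its packed postings zero-copy out of this map: an Offset { begin, len }
// names the slice &mmap[begin..begin + len] (see
// get_roaringish_packed_from_offset below).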
}; 566 | 567 | Ok(( 568 | Self { 569 | env, 570 | db_main, 571 | db_doc_id_to_document, 572 | db_token_to_offsets, 573 | }, 574 | common_tokens, 575 | mmap, 576 | )) 577 | } 578 | 579 | // This function needs to be #[inline(never)]: for some reason, inlining it 580 | // makes the performance of some queries unpredictable 581 | #[inline(never)] 582 | fn merge_and_minimize_tokens<'a, 'b, 'alloc>( 583 | &self, 584 | rotxn: &RoTxn, 585 | tokens: RefTokens<'a>, 586 | common_tokens: &HashSet<Box<str>>, 587 | mmap: &'b Mmap, 588 | 589 | bump: &'alloc Bump, 590 | ) -> Result< 591 | ( 592 | Vec<RefTokens<'a>>, 593 | GxHashMap<RefTokens<'a>, BorrowRoaringishPacked<'b, Aligned>>, 594 | ), 595 | SearchError, 596 | > { 597 | #[inline(always)] 598 | fn check_before_recursion<'a, 'b, 'alloc, D: Document>( 599 | me: &DB<D>, 600 | rotxn: &RoTxn, 601 | tokens: RefTokens<'a>, 602 | token_to_packed: &mut GxHashMap<RefTokens<'a>, BorrowRoaringishPacked<'b, Aligned>>, 603 | mmap: &'b Mmap, 604 | memo_token_to_score_choices: &mut GxHashMap< 605 | RefTokens<'a>, 606 | (usize, &'alloc RefTokenLinkedList<'a, 'alloc>), 607 | >, 608 | bump: &'alloc Bump, 609 | ) -> Result<Option<usize>, SearchError> { 610 | if tokens.len() != 1 { 611 | return Ok(None); 612 | } 613 | 614 | let score = match token_to_packed.entry(tokens) { 615 | Entry::Occupied(e) => e.get().len(), 616 | Entry::Vacant(e) => { 617 | let packed = me.get_roaringish_packed(rotxn, &tokens[0], mmap)?; 618 | let score = packed.len(); 619 | e.insert(packed); 620 | 621 | let linked_list = bump.alloc(RefTokenLinkedList { tokens, next: None }); 622 | memo_token_to_score_choices.insert(tokens, (score, linked_list)); 623 | score 624 | } 625 | }; 626 | Ok(Some(score)) 627 | } 628 | 629 | #[allow(clippy::too_many_arguments)] 630 | fn inner_merge_and_minimize_tokens<'a, 'b, 'c, 'alloc, D: Document>( 631 | me: &DB<D>, 632 | rotxn: &RoTxn, 633 | tokens: RefTokens<'a>, 634 | common_tokens: &HashSet<Box<str>>, 635 | token_to_packed: &mut GxHashMap<RefTokens<'a>, BorrowRoaringishPacked<'b, Aligned>>, 636 | mmap: &'b Mmap, 637 | memo_token_to_score_choices: &mut GxHashMap< 638 | RefTokens<'a>, 639 | (usize, &'alloc RefTokenLinkedList<'a, 'alloc>), 640 | >, 641 | 642 | bump: &'alloc Bump, 643 | ) -> Result<usize, SearchError> { 644 | const { assert!(MAX_WINDOW_LEN.get() == 3) }; 645 | let mut final_score = usize::MAX; 646 | let mut best_token_choice = None; 647 | let mut best_rem_choice = None; 648 | 649 | // TODO: fix this, it looks ugly 650 | let mut end = tokens 651 | .iter() 652 | .skip(1) 653 | .take(MAX_WINDOW_LEN.get() - 1) 654 | .take_while(|t| common_tokens.contains(*t)) 655 | .count() 656 | + 2; 657 | if common_tokens.contains(&tokens[0]) { 658 | end += 1; 659 | } 660 | end = end.min(MAX_WINDOW_LEN.get() + 1).min(tokens.len() + 1); 661 | 662 | for i in (1..end).rev() { 663 | let (tokens, rem) = tokens.split_at(i); 664 | 665 | let score = match token_to_packed.entry(tokens) { 666 | Entry::Occupied(e) => e.get().len(), 667 | Entry::Vacant(e) => { 668 | let packed = me.get_roaringish_packed(rotxn, tokens.tokens(), mmap)?; 669 | let score = packed.len(); 670 | e.insert(packed); 671 | score 672 | } 673 | }; 674 | 675 | let mut rem_score = 0; 676 | if !rem.is_empty() { 677 | rem_score = match memo_token_to_score_choices.get(&rem) { 678 | Some(r) => r.0, 679 | None => { 680 | match check_before_recursion( 681 | me, 682 | rotxn, 683 | rem, 684 | token_to_packed, 685 | mmap, 686 | memo_token_to_score_choices, 687 | bump, 688 | )?
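// Memoized recursion on the remainder (always a suffix of the query):
// each suffix is scored once and cached in memo_token_to_score_choices,
// keeping the minimization linear in the number of tokens instead of
// exponential. Purely as an illustration (not data from a real index):
// with common tokens {"the", "of"}, the query
// ["the", "lord", "of", "the", "rings"] may come out segmented as
// ["the lord", "of the rings"] if those two merged windows have fewer
// postings than five single-token lookups.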
{ 689 | Some(score) => score, 690 | None => inner_merge_and_minimize_tokens( 691 | me, 692 | rotxn, 693 | rem, 694 | common_tokens, 695 | token_to_packed, 696 | mmap, 697 | memo_token_to_score_choices, 698 | bump, 699 | )?, 700 | } 701 | } 702 | }; 703 | if rem_score == 0 { 704 | return Err(SearchError::MergeAndMinimizeNotPossible); 705 | } 706 | } 707 | 708 | let calc_score = score + rem_score; 709 | if calc_score < final_score { 710 | final_score = calc_score; 711 | 712 | best_token_choice = Some(tokens); 713 | if let Some((_, rem_choices)) = memo_token_to_score_choices.get(&rem) { 714 | best_rem_choice = Some(*rem_choices); 715 | }; 716 | } 717 | } 718 | 719 | let choices = match (best_token_choice, best_rem_choice) { 720 | (None, None) => return Err(SearchError::MergeAndMinimizeNotPossible), 721 | (None, Some(_)) => return Err(SearchError::MergeAndMinimizeNotPossible), 722 | (Some(tokens), None) => bump.alloc(RefTokenLinkedList { tokens, next: None }), 723 | (Some(tokens), Some(rem)) => bump.alloc(RefTokenLinkedList { 724 | tokens, 725 | next: Some(rem), 726 | }), 727 | }; 728 | 729 | memo_token_to_score_choices.insert(tokens, (final_score, choices)); 730 | Ok(final_score) 731 | } 732 | 733 | // This function needs to be #[inline(never)]: for some reason, inlining it 734 | // makes the performance of some queries unpredictable 735 | #[inline(never)] 736 | fn no_common_tokens<'a, 'b, 'alloc, D: Document>( 737 | me: &DB<D>, 738 | rotxn: &RoTxn, 739 | tokens: RefTokens<'a>, 740 | mmap: &'b Mmap, 741 | ) -> Result< 742 | ( 743 | Vec<RefTokens<'a>>, 744 | GxHashMap<RefTokens<'a>, BorrowRoaringishPacked<'b, Aligned>>, 745 | ), 746 | SearchError, 747 | > { 748 | let l = tokens.len(); 749 | let mut token_to_packed = GxHashMap::with_capacity(l); 750 | let mut v = Vec::with_capacity(l); 751 | 752 | for token in tokens.ref_token_iter() { 753 | let packed = me.get_roaringish_packed(rotxn, token.tokens(), mmap)?; 754 | token_to_packed.insert(token, packed); 755 | v.push(token); 756 | } 757 | 758 | return Ok((v, token_to_packed)); 759 | } 760 | 761 | if common_tokens.is_empty() { 762 | return no_common_tokens(self, rotxn, tokens, mmap); 763 | } 764 | 765 | let len = tokens.reserve_len(); 766 | let mut memo_token_to_score_choices = GxHashMap::with_capacity(len); 767 | let mut token_to_packed = GxHashMap::with_capacity(len); 768 | 769 | let score = match check_before_recursion( 770 | self, 771 | rotxn, 772 | tokens, 773 | &mut token_to_packed, 774 | mmap, 775 | &mut memo_token_to_score_choices, 776 | bump, 777 | )?
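// Top-level entry into the minimization: a single-token query is
// answered directly by check_before_recursion, everything else goes
// through the recursion above. The bump allocator owns every
// RefTokenLinkedList node built along the way, so the winning
// segmentation survives until it is collected into a Vec below.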
{ 778 | Some(score) => score, 779 | None => inner_merge_and_minimize_tokens( 780 | self, 781 | rotxn, 782 | tokens, 783 | common_tokens, 784 | &mut token_to_packed, 785 | mmap, 786 | &mut memo_token_to_score_choices, 787 | bump, 788 | )?, 789 | }; 790 | 791 | if score == 0 { 792 | return Err(SearchError::MergeAndMinimizeNotPossible); 793 | } 794 | match memo_token_to_score_choices.remove(&tokens) { 795 | Some((_, choices)) => { 796 | let v = choices.iter().copied().collect(); 797 | Ok((v, token_to_packed)) 798 | } 799 | None => Err(SearchError::MergeAndMinimizeNotPossible), 800 | } 801 | } 802 | 803 | fn get_roaringish_packed_from_offset<'a>( 804 | offset: &ArchivedOffset, 805 | mmap: &'a Mmap, 806 | ) -> Result<BorrowRoaringishPacked<'a, Aligned>, SearchError> { 807 | let begin = offset.begin.to_native() as usize; 808 | let len = offset.len.to_native() as usize; 809 | let end = begin + len; 810 | let Some(packed) = &mmap.get(begin..end) else { 811 | return Err(SearchError::InternalError); 812 | }; 813 | let (l, packed, r) = unsafe { packed.align_to::<u64>() }; 814 | if !l.is_empty() || !r.is_empty() { 815 | return Err(SearchError::InternalError); 816 | } 817 | 818 | mmap.advise_range(memmap2::Advice::Sequential, begin, len) 819 | .map_err(|e| DbError::from(e))?; 820 | 821 | Ok(BorrowRoaringishPacked::new_raw(packed)) 822 | } 823 | 824 | #[inline(always)] 825 | pub fn get_roaringish_packed<'a>( 826 | &self, 827 | rotxn: &RoTxn, 828 | token: &str, 829 | mmap: &'a Mmap, 830 | ) -> Result<BorrowRoaringishPacked<'a, Aligned>, SearchError> { 831 | let offset = self 832 | .db_token_to_offsets 833 | .get(rotxn, token) 834 | .map_err(|e| DbError::from(e))?; 835 | match offset { 836 | Some(offset) => Self::get_roaringish_packed_from_offset(offset, mmap), 837 | None => Err(SearchError::TokenNotFound(token.to_string())), 838 | } 839 | } 840 | 841 | pub fn search<I: Intersection>( 842 | &self, 843 | q: &str, 844 | stats: &Stats, 845 | common_tokens: &HashSet<Box<str>>, 846 | mmap: &Mmap, 847 | ) -> Result<Vec<u32>, SearchError> { 848 | stats.iters.fetch_add(1, Relaxed); 849 | 850 | let b = std::time::Instant::now(); 851 | let tokens = Tokens::new(q); 852 | let tokens = tokens.as_ref(); 853 | stats 854 | .normalize_tokenize 855 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 856 | 857 | if tokens.is_empty() { 858 | return Err(SearchError::EmptyQuery); 859 | } 860 | 861 | let rotxn = self.env.read_txn().map_err(|e| DbError::from(e))?; 862 | if tokens.len() == 1 { 863 | // this can't fail, we just checked that the query has a token 864 | return Ok(self.get_roaringish_packed(&rotxn, tokens.first().unwrap(), mmap)?
865 | .get_doc_ids(stats)); 866 | } 867 | 868 | let b = std::time::Instant::now(); 869 | let bump = Bump::with_capacity(tokens.reserve_len() * 5); 870 | let (final_tokens, token_to_packed) = 871 | self.merge_and_minimize_tokens(&rotxn, tokens, common_tokens, mmap, &bump)?; 872 | stats 873 | .merge_minimize 874 | .fetch_add(b.elapsed().as_micros() as u64, Relaxed); 875 | 876 | if final_tokens.is_empty() { 877 | return Err(SearchError::EmptyQuery); 878 | } 879 | 880 | if final_tokens.len() == 1 { 881 | return token_to_packed 882 | .get(&final_tokens[0]) 883 | .ok_or_else(|| SearchError::TokenNotFound(final_tokens[0].tokens().to_string())) 884 | .map(|p| p.get_doc_ids(stats)); 885 | } 886 | 887 | // at this point we know that we have at least 888 | // 2 tokens, so the loop below runs at least once 889 | // and leaves `i` with an in-bounds value 890 | let mut min = usize::MAX; 891 | let mut i = usize::MAX; 892 | for (j, ts) in final_tokens.array_windows::<2>().enumerate() { 893 | let l0 = token_to_packed 894 | .get(&ts[0]) 895 | .ok_or_else(|| SearchError::TokenNotFound(ts[0].tokens().to_string()))? 896 | .len(); 897 | 898 | let l1 = token_to_packed 899 | .get(&ts[1]) 900 | .ok_or_else(|| SearchError::TokenNotFound(ts[1].tokens().to_string()))? 901 | .len(); 902 | 903 | let l = l0 + l1; 904 | if l <= min { 905 | i = j; 906 | min = l; 907 | } 908 | } 909 | 910 | let lhs = &final_tokens[i]; 911 | let mut lhs_len = lhs.len() as u32; 912 | let lhs = token_to_packed 913 | .get(lhs) 914 | .ok_or_else(|| SearchError::TokenNotFound(lhs.tokens().to_string()))?; 915 | 916 | let rhs = &final_tokens[i + 1]; 917 | let mut rhs_len = rhs.len() as u32; 918 | let rhs = token_to_packed 919 | .get(rhs) 920 | .ok_or_else(|| SearchError::TokenNotFound(rhs.tokens().to_string()))?; 921 | 922 | let mut result = lhs.intersect::<I>(*rhs, lhs_len, stats); 923 | let mut result_borrow = BorrowRoaringishPacked::new(&result); 924 | 925 | let mut left_i = i.wrapping_sub(1); 926 | let mut right_i = i + 2; 927 | 928 | loop { 929 | let lhs = final_tokens.get(left_i); 930 | let rhs = final_tokens.get(right_i); 931 | match (lhs, rhs) { 932 | (Some(t_lhs), Some(t_rhs)) => { 933 | let lhs = token_to_packed 934 | .get(t_lhs) 935 | .ok_or_else(|| SearchError::TokenNotFound(t_lhs.tokens().to_string()))?; 936 | let rhs = token_to_packed 937 | .get(t_rhs) 938 | .ok_or_else(|| SearchError::TokenNotFound(t_rhs.tokens().to_string()))?; 939 | if lhs.len() <= rhs.len() { 940 | lhs_len += t_lhs.len() as u32; 941 | 942 | result = lhs.intersect::<I>(result_borrow, lhs_len, stats); 943 | result_borrow = BorrowRoaringishPacked::new(&result); 944 | 945 | left_i = left_i.wrapping_sub(1); 946 | } else { 947 | result = result_borrow.intersect::<I>(*rhs, rhs_len, stats); 948 | result_borrow = BorrowRoaringishPacked::new(&result); 949 | 950 | lhs_len += rhs_len; 951 | rhs_len = t_rhs.len() as u32; 952 | 953 | right_i += 1; 954 | } 955 | } 956 | (Some(t_lhs), None) => { 957 | let lhs = token_to_packed 958 | .get(t_lhs) 959 | .ok_or_else(|| SearchError::TokenNotFound(t_lhs.tokens().to_string()))?; 960 | lhs_len += t_lhs.len() as u32; 961 | 962 | result = lhs.intersect::<I>(result_borrow, lhs_len, stats); 963 | result_borrow = BorrowRoaringishPacked::new(&result); 964 | 965 | left_i = left_i.wrapping_sub(1); 966 | } 967 | (None, Some(t_rhs)) => { 968 | let rhs = token_to_packed 969 | .get(t_rhs) 970 | .ok_or_else(|| SearchError::TokenNotFound(t_rhs.tokens().to_string()))?; 971 | 972 | result = result_borrow.intersect::<I>(*rhs, rhs_len, stats); 973 | result_borrow =
BorrowRoaringishPacked::new(&result); 974 | 975 | lhs_len += rhs_len; 976 | rhs_len = t_rhs.len() as u32; 977 | 978 | right_i += 1; 979 | } 980 | (None, None) => break, 981 | } 982 | 983 | if result.is_empty() { 984 | return Err(SearchError::EmptyIntersection); 985 | } 986 | } 987 | 988 | Ok(result_borrow.get_doc_ids(stats)) 989 | } 990 | 991 | fn inner_get_archived_document<'a>( 992 | &self, 993 | rotxn: &'a RoTxn, 994 | doc_id: &u32, 995 | ) -> Result<&'a D::Archived, GetDocumentError> { 996 | self.db_doc_id_to_document 997 | .get(rotxn, doc_id) 998 | .map_err(|e| DbError::from(e))? 999 | .ok_or(GetDocumentError::DocumentNotFound(*doc_id)) 1000 | } 1001 | 1002 | pub fn get_archived_documents( 1003 | &self, 1004 | doc_ids: &[u32], 1005 | cb: impl FnOnce(Vec<&D::Archived>), 1006 | ) -> Result<(), GetDocumentError> { 1007 | let rotxn = self.env.read_txn().map_err(|e| DbError::from(e))?; 1008 | let docs = doc_ids 1009 | .into_iter() 1010 | .map(|doc_id| self.inner_get_archived_document(&rotxn, doc_id)) 1011 | .collect::<Result<Vec<_>, _>>()?; 1012 | 1013 | cb(docs); 1014 | 1015 | Ok(()) 1016 | } 1017 | 1018 | pub fn get_archived_document( 1019 | &self, 1020 | doc_id: u32, 1021 | cb: impl FnOnce(&D::Archived), 1022 | ) -> Result<(), GetDocumentError> { 1023 | let rotxn = self.env.read_txn().map_err(|e| DbError::from(e))?; 1024 | let doc = self.inner_get_archived_document(&rotxn, &doc_id)?; 1025 | 1026 | cb(doc); 1027 | 1028 | Ok(()) 1029 | } 1030 | 1031 | pub fn get_documents(&self, doc_ids: &[u32]) -> Result<Vec<D>, GetDocumentError> 1032 | where 1033 | <D as Archive>::Archived: Deserialize<D, Strategy<Pool, rkyv::rancor::Error>>, 1034 | { 1035 | let rotxn = self.env.read_txn().map_err(|e| DbError::from(e))?; 1036 | doc_ids 1037 | .into_iter() 1038 | .map(|doc_id| { 1039 | let archived = self.inner_get_archived_document(&rotxn, doc_id)?; 1040 | rkyv::deserialize::<D, rkyv::rancor::Error>(archived) 1041 | .map_err(|e| GetDocumentError::DbError(DbError::from(e))) 1042 | }) 1043 | .collect::<Result<Vec<_>, _>>() 1044 | } 1045 | 1046 | pub fn get_document(&self, doc_id: u32) -> Result<D, GetDocumentError> 1047 | where 1048 | <D as Archive>::Archived: Deserialize<D, Strategy<Pool, rkyv::rancor::Error>>, 1049 | { 1050 | let rotxn = self.env.read_txn().map_err(|e| DbError::from(e))?; 1051 | let archived = self.inner_get_archived_document(&rotxn, &doc_id)?; 1052 | rkyv::deserialize::<D, rkyv::rancor::Error>(archived) 1053 | .map_err(|e| GetDocumentError::DbError(DbError::from(e))) 1054 | } 1055 | } 1056 | --------------------------------------------------------------------------------
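How the pieces above fit together, as a hypothetical sketch (not a file from this repository: `Doc`, the index path, and the `Naive` intersection strategy are illustrative placeholders, and `Stats: Default` is assumed):

    // Open an existing index; besides the handle we get back the set of
    // common tokens and the memory map that `search` reads postings from.
    let (db, common_tokens, mmap) = DB::<Doc>::open("./index")?;
    let stats = Stats::default();

    // Phrase search returns matching document ids...
    let doc_ids = db.search::<Naive>("lord of the rings", &stats, &common_tokens, &mmap)?;

    // ...which can then be resolved to documents.
    let docs = db.get_documents(&doc_ids)?;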