├── .github ├── dependabot.yml └── workflows │ └── rust.yml ├── .gitignore ├── Cargo.toml ├── LICENSE.txt ├── README.md ├── engine ├── Cargo.toml └── src │ ├── collectors.rs │ ├── lib.rs │ ├── searchable.rs │ └── streams.rs └── wasm-example ├── .cargo └── config ├── Cargo.toml ├── build_fst.js ├── download_data.sh ├── query_fst.js └── src └── lib.rs /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "cargo" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | - package-ecosystem: "github-actions" 8 | directory: "/" 9 | schedule: 10 | interval: "daily" 11 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Build 16 | run: cargo build --verbose 17 | - name: Run tests 18 | run: cargo test --verbose --all-features 19 | - uses: jetli/wasm-pack-action@v0.3.0 20 | - name: Build for wasm-example 21 | run: cd wasm-example && wasm-pack build . 
-t nodejs 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | target/ 3 | wasm-example/data 4 | wasm-example/pkg 5 | Cargo.lock -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "engine", 4 | "wasm-example", 5 | ] 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Maurus Cuelenaere 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | porigon 2 | === 3 | Lightweight FST-based autocompleter library written in Rust, targeting WebAssembly and data stored in-memory 4 | 5 | [![Build status](https://github.com/mcuelenaere/porigon/workflows/Rust/badge.svg)](https://github.com/mcuelenaere/porigon/actions) 6 | [![](http://meritbadge.herokuapp.com/porigon)](https://crates.io/crates/porigon) 7 | 8 | Licensed under MIT. 9 | 10 | ### Intended usecase 11 | 12 | The idea of this library is to have a lightweight, yet idiomatic API around the [fst crate](https://github.com/BurntSushi/fst) that allows you to construct, serialize/deserialize and query FSTs in an WebAssembly environment. It's an ideal starting point for building an autocompleter service that can be used on the web, the edge (eg Cloudflare Worker) or the backend-side (node.js). 13 | 14 | Existing solutions like eg [tantivy](https://github.com/tantivy-search/tantivy) are not fitting as they're too heavyweight (wasm binary size is over 1MB) or not compilable to WebAssembly. If you're looking for a more full fledged full-text search engine, take a look at the list of alternatives at the bottom. 15 | 16 | 17 | ### Documentation 18 | 19 | https://docs.rs/porigon 20 | 21 | 22 | ### Installation 23 | 24 | Simply add a corresponding entry to your `Cargo.toml` dependency list: 25 | 26 | ```toml,ignore 27 | [dependencies] 28 | porigon = "0.1.0" 29 | ``` 30 | 31 | ### Example 32 | 33 | This example demonstrates building a `Searchable` in memory, executing a `StartsWith` query 34 | against it and collecting the top 3 documents with `TopScoreCollector`. 
35 | 36 | ```rust 37 | use porigon::{Searchable, TopScoreCollector}; 38 | 39 | fn main() -> Result<(), Box> { 40 | let items = vec!( 41 | ("bar".as_bytes(), 1), 42 | ("foo".as_bytes(), 2), 43 | ("foobar".as_bytes(), 3) 44 | ); 45 | let searchable = Searchable::build_from_iter(items)?; 46 | 47 | let mut collector = TopScoreCollector::new(3); 48 | collector.consume_stream( 49 | searchable 50 | .starts_with("foo") 51 | .rescore(|_, index, _| index * 2) 52 | ); 53 | 54 | let docs = collector.top_documents(); 55 | assert_eq!(docs[0].index, 3); 56 | 57 | Ok(()) 58 | } 59 | ``` 60 | 61 | Check out the documentation or `wasm-example` for a more examples. 62 | 63 | ### Alternatives 64 | 65 | If you're looking for a more general-purpose full-text search engine, take a look at these alternatives: 66 | 67 | - https://github.com/tantivy-search/tantivy 68 | - https://github.com/meilisearch/MeiliSearch 69 | - https://github.com/toshi-search/Toshi 70 | -------------------------------------------------------------------------------- /engine/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "porigon" 3 | version = "0.4.0" 4 | authors = ["Maurus Cuelenaere "] 5 | description = "Lightweight FST-based autocompleter library, targeting WebAssembly and data stored in-memory" 6 | repository = "https://github.com/mcuelenaere/porigon" 7 | license = "MIT" 8 | edition = "2021" 9 | 10 | [lib] 11 | name = "porigon" 12 | 13 | [dependencies] 14 | fst = { version = "0.4", default-features = false } 15 | levenshtein_automata = "0.2" 16 | itertools = "0.10" 17 | maybe-owned = "0.3" 18 | serde = { version = "1.0", features = ["derive"], optional = true } 19 | rkyv = { version = "0.7", optional = true } 20 | q_compress = "0.9.3" 21 | thiserror = "1.0" -------------------------------------------------------------------------------- /engine/src/collectors.rs: 
-------------------------------------------------------------------------------- 1 | use crate::{Score, SearchStream}; 2 | use std::cmp::{Ordering, Reverse}; 3 | use std::collections::BinaryHeap; 4 | 5 | pub struct Document { 6 | pub index: u64, 7 | pub score: Score, 8 | } 9 | 10 | impl PartialOrd for Document { 11 | fn partial_cmp(&self, other: &Self) -> Option { 12 | self.score.partial_cmp(&other.score) 13 | } 14 | } 15 | 16 | impl Ord for Document { 17 | #[inline] 18 | fn cmp(&self, other: &Self) -> Ordering { 19 | self.score.cmp(&other.score) 20 | } 21 | } 22 | 23 | impl PartialEq for Document { 24 | fn eq(&self, other: &Self) -> bool { 25 | other.index == self.index 26 | } 27 | } 28 | 29 | impl Eq for Document {} 30 | 31 | /// Documents collector, keeping track of the top N items (based on their score). 32 | /// 33 | /// Internally, this uses a binary min-heap as an efficient manner of keeping track of the top N 34 | /// documents. 35 | pub struct TopScoreCollector { 36 | // heap should be a min-heap, so use Reverse to achieve this 37 | heap: BinaryHeap>, 38 | sorted_docs: Vec, 39 | limit: usize, 40 | } 41 | 42 | impl TopScoreCollector { 43 | /// Constructs a new `TopScoreCollector` with a hardcoded `limit` of documents. 44 | pub fn new(limit: usize) -> Self { 45 | TopScoreCollector { 46 | limit, 47 | sorted_docs: Vec::with_capacity(limit), 48 | heap: BinaryHeap::with_capacity(limit), 49 | } 50 | } 51 | 52 | /// Resets the internal state of the collector. 53 | pub fn reset(&mut self) { 54 | self.heap.clear(); 55 | } 56 | 57 | /// Consumes a `SearchStream`, collecting the items and only keeping the top N items (based on 58 | /// their score). 
59 | /// 60 | /// # Example 61 | /// ``` 62 | /// use porigon::{SearchableStorage, SearchStream, TopScoreCollector}; 63 | /// 64 | /// let storage = SearchableStorage::build_from_iter(vec!( 65 | /// ("foo", 1), 66 | /// ("foobar", 2), 67 | /// )).unwrap(); 68 | /// let searchable = storage.to_searchable().unwrap(); 69 | /// let mut collector = TopScoreCollector::new(1); 70 | /// 71 | /// collector.consume_stream( 72 | /// searchable 73 | /// .starts_with("foo") 74 | /// .rescore(|_, index, _| index * 2) 75 | /// ); 76 | /// collector.consume_stream( 77 | /// searchable 78 | /// .exact_match("foobar") 79 | /// ); 80 | /// assert_eq!(collector.top_documents()[0].index, 2); 81 | /// ``` 82 | pub fn consume_stream(&mut self, mut stream: S) 83 | where 84 | S: SearchStream, 85 | { 86 | while let Some((_, index, score)) = stream.next() { 87 | self.process_document(Document { score, index }); 88 | } 89 | } 90 | 91 | /// Collects a single item 92 | pub fn collect_item(&mut self, index: u64, score: Score) { 93 | self.process_document(Document { score, index }) 94 | } 95 | 96 | fn process_document(&mut self, doc: Document) { 97 | if let Some(Reverse(existing_doc)) = self.heap.iter().find(|other| other.0 == doc) { 98 | if doc.score > existing_doc.score { 99 | // The heap already contains this document, but it has a lower score than 100 | // the one we are trying to add now. Since BinaryHeap does not have a method 101 | // to remove an item, we'll manually pop all the items using sorted_docs as 102 | // a scratch space. 
103 | self.sorted_docs.clear(); 104 | while let Some(Reverse(item)) = self.heap.pop() { 105 | if item == doc { 106 | break; 107 | } 108 | self.sorted_docs.push(item); 109 | } 110 | self.heap.push(Reverse(doc)); 111 | while let Some(item) = self.sorted_docs.pop() { 112 | self.heap.push(Reverse(item)); 113 | } 114 | } 115 | return; 116 | } 117 | 118 | if self.heap.len() < self.limit { 119 | self.heap.push(Reverse(doc)); 120 | } else if let Some(mut head) = self.heap.peek_mut() { 121 | if head.0.score < doc.score { 122 | *head = Reverse(doc); 123 | } 124 | } 125 | } 126 | 127 | /// Returns a slice of the processed top documents, ordered by their score. 128 | pub fn top_documents(&mut self) -> &[Document] { 129 | self.sorted_docs.clear(); 130 | while let Some(doc) = self.heap.pop() { 131 | self.sorted_docs.push(doc.0) 132 | } 133 | self.sorted_docs.reverse(); 134 | self.sorted_docs.as_slice() 135 | } 136 | } 137 | 138 | #[cfg(test)] 139 | mod tests { 140 | use super::*; 141 | 142 | #[test] 143 | fn test_collector() { 144 | let mut collector = TopScoreCollector::new(5); 145 | for i in 0..10 { 146 | collector.process_document(Document { 147 | index: i, 148 | score: (i as Score), 149 | }); 150 | } 151 | let top_docs = collector.top_documents(); 152 | 153 | // check scores 154 | let scores: Vec = top_docs.iter().map(|doc| doc.score).collect(); 155 | assert_eq!(scores, vec!(9, 8, 7, 6, 5)); 156 | 157 | // check indices 158 | let indices: Vec = top_docs.iter().map(|doc| doc.index).collect(); 159 | assert_eq!(indices, vec!(9, 8, 7, 6, 5)); 160 | } 161 | 162 | #[test] 163 | fn test_collector_duplicates() { 164 | let mut collector = TopScoreCollector::new(7); 165 | for i in 0..10 { 166 | collector.process_document(Document { 167 | index: i, 168 | score: (i as Score), 169 | }); 170 | } 171 | collector.process_document(Document { 172 | index: 3, 173 | score: 6 as Score, 174 | }); 175 | collector.process_document(Document { 176 | index: 6, 177 | score: 2 as Score, 178 | }); 179 | 
collector.process_document(Document { 180 | index: 2, 181 | score: 8 as Score, 182 | }); 183 | let top_docs = collector.top_documents(); 184 | 185 | // check scores 186 | let scores: Vec = top_docs.iter().map(|doc| doc.score).collect(); 187 | assert_eq!(scores, vec!(9, 8, 8, 7, 6, 6, 5)); 188 | 189 | // check indices 190 | let indices: Vec = top_docs.iter().map(|doc| doc.index).collect(); 191 | assert_eq!(indices, vec!(9, 2, 8, 7, 6, 3, 5)); 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /engine/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod collectors; 2 | pub use levenshtein_automata as levenshtein; 3 | pub mod searchable; 4 | pub mod streams; 5 | 6 | pub use crate::{ 7 | collectors::TopScoreCollector, 8 | searchable::{Searchable, SearchableStorage}, 9 | streams::SearchStream, 10 | }; 11 | 12 | /// Score type, used for keeping a per-item score in `SearchStream`. 13 | pub type Score = u64; 14 | -------------------------------------------------------------------------------- /engine/src/searchable.rs: -------------------------------------------------------------------------------- 1 | use crate::streams::{DeduplicatedStream, SearchStream}; 2 | use crate::Score; 3 | use fst::automaton::{Automaton, Str, Subsequence}; 4 | use fst::{IntoStreamer, Map, Streamer}; 5 | use itertools::{process_results, Itertools}; 6 | use levenshtein_automata::{Distance as LevenshteinDistance, LevenshteinAutomatonBuilder}; 7 | use maybe_owned::MaybeOwned; 8 | use q_compress::errors::QCompressError; 9 | use std::collections::HashMap; 10 | use thiserror::Error; 11 | 12 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 13 | #[cfg_attr( 14 | feature = "rkyv", 15 | derive(rkyv::Archive, rkyv::Serialize, rkyv::Deserialize) 16 | )] 17 | pub struct Duplicates { 18 | offsets_map: HashMap, 19 | compressed_indices: Vec, 20 | } 21 | 22 | pub trait 
DuplicatesLookup { 23 | type Iter: Iterator; 24 | 25 | fn get(&self, key: u64) -> Option; 26 | } 27 | 28 | impl DuplicatesLookup for Duplicates { 29 | type Iter = std::vec::IntoIter; 30 | 31 | fn get(&self, key: u64) -> Option { 32 | self.offsets_map.get(&key).map(|offset| { 33 | let words = q_compress::BitWords::from(&self.compressed_indices); 34 | let mut reader = q_compress::BitReader::from(&words); 35 | let decompressor = q_compress::Decompressor::::default(); 36 | let flags = decompressor.header(&mut reader).unwrap(); 37 | reader.seek(*offset * 8); 38 | let chunk = decompressor.chunk(&mut reader, &flags).unwrap().unwrap(); 39 | chunk.nums.into_iter() 40 | }) 41 | } 42 | } 43 | 44 | #[cfg(feature = "rkyv")] 45 | impl DuplicatesLookup for ArchivedDuplicates { 46 | type Iter = std::vec::IntoIter; 47 | 48 | fn get(&self, key: u64) -> Option { 49 | self.offsets_map.get(&key).map(|offset| { 50 | let words = q_compress::BitWords::from(&self.compressed_indices); 51 | let mut reader = q_compress::BitReader::from(&words); 52 | let decompressor = q_compress::Decompressor::::default(); 53 | let flags = decompressor.header(&mut reader).unwrap(); 54 | reader.seek(*offset as usize * 8); 55 | let chunk = decompressor.chunk(&mut reader, &flags).unwrap().unwrap(); 56 | chunk.nums.into_iter() 57 | }) 58 | } 59 | } 60 | 61 | /// Structure that contains all underlying data needed to construct a `Searchable`. 
62 | /// 63 | /// NOTE: this struct is serializable 64 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 65 | #[cfg_attr( 66 | feature = "rkyv", 67 | derive(rkyv::Archive, rkyv::Serialize, rkyv::Deserialize) 68 | )] 69 | pub struct SearchableStorage { 70 | fst_data: Vec, 71 | duplicates: Duplicates, 72 | } 73 | 74 | #[derive(Error, Debug)] 75 | pub enum BuildError { 76 | #[error("could not build FST: {0}")] 77 | BuildFst(#[from] fst::Error), 78 | #[error("could not compress indices: {0}")] 79 | CompressIndices(#[from] QCompressError), 80 | } 81 | 82 | #[derive(Error, Debug)] 83 | pub enum LoadError { 84 | #[error("could not read FST: {0}")] 85 | ReadFst(#[from] fst::Error), 86 | } 87 | 88 | #[cfg(feature = "rkyv")] 89 | impl ArchivedSearchableStorage 90 | where 91 | SearchableStorage: rkyv::Archive, 92 | { 93 | pub fn to_searchable(&self) -> Result { 94 | Ok(ArchivedSearchable { 95 | map: Map::new(self.fst_data.as_slice())?, 96 | duplicates: &self.duplicates, 97 | }) 98 | } 99 | } 100 | 101 | impl SearchableStorage { 102 | pub fn to_searchable(&self) -> Result { 103 | Ok(Searchable { 104 | map: Map::new(self.fst_data.as_slice())?, 105 | duplicates: &self.duplicates, 106 | }) 107 | } 108 | 109 | /// Construct a Searchable from an `Iterator`. 110 | /// 111 | /// This expects the items to be pre-sorted in lexicographic order. If not, an error will be 112 | /// returned. 113 | /// 114 | /// This method can handle duplicate keys. 
115 | /// 116 | /// # Example 117 | /// ``` 118 | /// use porigon::SearchableStorage; 119 | /// 120 | /// let searchable = SearchableStorage::build_from_iter(vec!( 121 | /// ("bar", 1), 122 | /// ("foo", 2), 123 | /// )).unwrap(); 124 | /// ``` 125 | pub fn build_from_iter<'a, I>(iter: I) -> Result 126 | where 127 | I: IntoIterator, 128 | { 129 | let mut bit_writer = q_compress::BitWriter::default(); 130 | let compressor = q_compress::Compressor::::from_config(q_compress::CompressorConfig { 131 | delta_encoding_order: 1, 132 | compression_level: 6, 133 | }); 134 | compressor.header(&mut bit_writer)?; 135 | let header_offset = bit_writer.byte_size(); 136 | 137 | // group items by key and build map 138 | let mut offsets_map = HashMap::new(); 139 | let iter = iter.into_iter().group_by(|(key, _)| *key); 140 | let iter = iter.into_iter().map(|(key, mut group)| { 141 | let (_, first) = group.next().unwrap(); 142 | if let Some((_, second)) = group.next() { 143 | let mut indices = vec![first, second]; 144 | indices.extend(group.map(|(_, next)| next)); 145 | indices.sort(); 146 | 147 | let offset = bit_writer.byte_size(); 148 | compressor.chunk(indices.as_slice(), &mut bit_writer)?; 149 | offsets_map.insert(first, offset - header_offset); 150 | } 151 | 152 | Ok::<_, QCompressError>((key, first)) 153 | }); 154 | let map = process_results(iter, |iter| Map::from_iter(iter))??; 155 | 156 | compressor.footer(&mut bit_writer)?; 157 | 158 | Ok(Self { 159 | fst_data: map.into_fst().into_inner(), 160 | duplicates: Duplicates { 161 | offsets_map, 162 | compressed_indices: bit_writer.bytes(), 163 | }, 164 | }) 165 | } 166 | } 167 | 168 | /// Main entry point to querying FSTs. 169 | /// 170 | /// This is a thin wrapper around `fst::Map`, providing easy access to querying it in 171 | /// various ways (exact_match, starts_with, levenshtein, ...). 
172 | pub struct SearchableInner<'a, D: DuplicatesLookup> { 173 | map: Map<&'a [u8]>, 174 | duplicates: &'a D, 175 | } 176 | 177 | impl<'s, D: DuplicatesLookup> SearchableInner<'s, D> { 178 | fn create_stream<'a, A: 'a>(&'a self, automaton: A) -> impl SearchStream + 'a 179 | where 180 | A: Automaton, 181 | { 182 | struct Adapter<'m, A>(fst::map::Stream<'m, A>) 183 | where 184 | A: Automaton; 185 | 186 | impl<'m, A> SearchStream for Adapter<'m, A> 187 | where 188 | A: Automaton, 189 | { 190 | fn next(&mut self) -> Option<(&str, u64, Score)> { 191 | self.0.next().map(|(key, index)| { 192 | // SAFETY: we have built this FST containing only valid strings, so this should 193 | // always hold. 194 | let key = unsafe { std::str::from_utf8_unchecked(key) }; 195 | (key, index, 0) 196 | }) 197 | } 198 | } 199 | 200 | let stream = self.map.search(automaton).into_stream(); 201 | DeduplicatedStream::new(Adapter(stream), self.duplicates) 202 | } 203 | 204 | /// Creates a `SearchStream` from a `StartsWith` matcher for the given `query`. 205 | /// 206 | /// # Example 207 | /// ``` 208 | /// use porigon::{SearchableStorage, SearchStream}; 209 | /// 210 | /// let items = vec!( 211 | /// ("bar", 1), 212 | /// ("foo", 2), 213 | /// ("foo_bar", 3) 214 | /// ); 215 | /// let storage = SearchableStorage::build_from_iter(items).unwrap(); 216 | /// let searchable = storage.to_searchable().unwrap(); 217 | /// 218 | /// let mut strm = searchable.starts_with("foo"); 219 | /// assert_eq!(strm.next(), Some(("foo", 2, 0))); 220 | /// assert_eq!(strm.next(), Some(("foo_bar", 3, 0))); 221 | /// assert_eq!(strm.next(), None); 222 | /// ``` 223 | /// 224 | pub fn starts_with<'a>(&'a self, query: &'a str) -> impl SearchStream + 'a { 225 | let automaton = Str::new(query).starts_with(); 226 | self.create_stream(automaton) 227 | } 228 | 229 | /// Creates a `SearchStream` from an `ExactMatch` matcher for the given `query`. 
230 | /// 231 | /// # Example 232 | /// ``` 233 | /// use porigon::{SearchableStorage, SearchStream}; 234 | /// 235 | /// let items = vec!( 236 | /// ("bar", 1), 237 | /// ("foo", 2), 238 | /// ("foo_bar", 3) 239 | /// ); 240 | /// let storage = SearchableStorage::build_from_iter(items).unwrap(); 241 | /// let searchable = storage.to_searchable().unwrap(); 242 | /// 243 | /// let mut strm = searchable.exact_match("foo"); 244 | /// assert_eq!(strm.next(), Some(("foo", 2, 0))); 245 | /// assert_eq!(strm.next(), None); 246 | /// ``` 247 | /// 248 | pub fn exact_match<'a>(&'a self, query: &'a str) -> impl SearchStream + 'a { 249 | let automaton = Str::new(query); 250 | self.create_stream(automaton) 251 | } 252 | 253 | /// Creates a `SearchStream` from a `levenshtein_automata::DFA` matcher. 254 | /// 255 | /// This method supports both moving the DFA or passing a reference to it. 256 | /// 257 | /// # Example 258 | /// ``` 259 | /// use porigon::levenshtein::LevenshteinAutomatonBuilder; 260 | /// use porigon::{SearchableStorage, SearchStream}; 261 | /// 262 | /// let items = vec!( 263 | /// ("bar", 1), 264 | /// ("fob", 2), 265 | /// ("foo", 3), 266 | /// ("foo_bar", 4) 267 | /// ); 268 | /// let storage = SearchableStorage::build_from_iter(items).unwrap(); 269 | /// let searchable = storage.to_searchable().unwrap(); 270 | /// let levenshtein_builder = LevenshteinAutomatonBuilder::new(1, false); 271 | /// 272 | /// let dfa = levenshtein_builder.build_dfa("foo"); 273 | /// let mut strm = searchable.levenshtein(&dfa); 274 | /// assert_eq!(strm.next(), Some(("fob", 2, 0))); 275 | /// assert_eq!(strm.next(), Some(("foo", 3, 0))); 276 | /// assert_eq!(strm.next(), None); 277 | /// ``` 278 | /// 279 | pub fn levenshtein<'a, DFA>(&'a self, dfa: DFA) -> impl SearchStream + 'a 280 | where 281 | DFA: Into>, 282 | { 283 | struct Adapter<'a>(MaybeOwned<'a, levenshtein_automata::DFA>); 284 | 285 | impl<'a> fst::Automaton for Adapter<'a> { 286 | type State = u32; 287 | 288 | fn 
start(&self) -> u32 { 289 | self.0.initial_state() 290 | } 291 | 292 | fn is_match(&self, state: &u32) -> bool { 293 | match self.0.distance(*state) { 294 | LevenshteinDistance::Exact(_) => true, 295 | LevenshteinDistance::AtLeast(_) => false, 296 | } 297 | } 298 | 299 | fn can_match(&self, state: &u32) -> bool { 300 | *state != levenshtein_automata::SINK_STATE 301 | } 302 | 303 | fn accept(&self, state: &u32, byte: u8) -> u32 { 304 | self.0.transition(*state, byte) 305 | } 306 | } 307 | 308 | self.create_stream(Adapter(dfa.into())) 309 | } 310 | 311 | /// Creates a `SearchStream` for a `LevenshteinAutomatonBuilder` and the given `query`. 312 | /// 313 | /// # Example 314 | /// ``` 315 | /// use porigon::levenshtein::LevenshteinAutomatonBuilder; 316 | /// use porigon::{SearchableStorage, SearchStream}; 317 | /// 318 | /// let items = vec!( 319 | /// ("bar", 1), 320 | /// ("fob", 2), 321 | /// ("foo", 3), 322 | /// ("foo_bar", 4) 323 | /// ); 324 | /// let storage = SearchableStorage::build_from_iter(items).unwrap(); 325 | /// let searchable = storage.to_searchable().unwrap(); 326 | /// let levenshtein_builder = LevenshteinAutomatonBuilder::new(1, false); 327 | /// 328 | /// let dfa = levenshtein_builder.build_dfa("foo"); 329 | /// let mut strm = searchable.levenshtein_exact_match(&levenshtein_builder, "foo"); 330 | /// assert_eq!(strm.next(), Some(("fob", 2, 0))); 331 | /// assert_eq!(strm.next(), Some(("foo", 3, 0))); 332 | /// assert_eq!(strm.next(), None); 333 | /// ``` 334 | /// 335 | pub fn levenshtein_exact_match<'a>( 336 | &'a self, 337 | builder: &LevenshteinAutomatonBuilder, 338 | query: &'a str, 339 | ) -> impl SearchStream + 'a { 340 | self.levenshtein(builder.build_dfa(query)) 341 | } 342 | 343 | /// Creates a `SearchStream` for a `LevenshteinAutomatonBuilder` and the given `query`. 
344 | /// 345 | /// # Example 346 | /// ``` 347 | /// use porigon::levenshtein::LevenshteinAutomatonBuilder; 348 | /// use porigon::{SearchableStorage, SearchStream}; 349 | /// 350 | /// let items = vec!( 351 | /// ("bar", 1), 352 | /// ("fob", 2), 353 | /// ("foo", 3), 354 | /// ("foo_bar", 4) 355 | /// ); 356 | /// let storage = SearchableStorage::build_from_iter(items).unwrap(); 357 | /// let searchable = storage.to_searchable().unwrap(); 358 | /// let levenshtein_builder = LevenshteinAutomatonBuilder::new(1, false); 359 | /// 360 | /// let dfa = levenshtein_builder.build_dfa("foo"); 361 | /// let mut strm = searchable.levenshtein_starts_with(&levenshtein_builder, "foo"); 362 | /// assert_eq!(strm.next(), Some(("fob", 2, 0))); 363 | /// assert_eq!(strm.next(), Some(("foo", 3, 0))); 364 | /// assert_eq!(strm.next(), Some(("foo_bar", 4, 0))); 365 | /// assert_eq!(strm.next(), None); 366 | /// ``` 367 | /// 368 | pub fn levenshtein_starts_with<'a>( 369 | &'a self, 370 | builder: &LevenshteinAutomatonBuilder, 371 | query: &'a str, 372 | ) -> impl SearchStream + 'a { 373 | self.levenshtein(builder.build_prefix_dfa(query)) 374 | } 375 | 376 | /// Creates a `SearchStream` from a `Subsequence` matcher for the given `query`. 
377 | /// 378 | /// # Example 379 | /// ``` 380 | /// use porigon::{SearchableStorage, SearchStream}; 381 | /// 382 | /// let items = vec!(("bar_foo", 2), ("foo", 0), ("foo_bar", 1)); 383 | /// let storage = SearchableStorage::build_from_iter(items).unwrap(); 384 | /// let searchable = storage.to_searchable().unwrap(); 385 | /// 386 | /// let mut strm = searchable.subsequence("fb"); 387 | /// assert_eq!(strm.next(), Some(("foo_bar", 1, 0))); 388 | /// assert_eq!(strm.next(), None); 389 | /// ``` 390 | /// 391 | pub fn subsequence<'a>(&'a self, query: &'a str) -> impl SearchStream + 'a { 392 | let automaton = Subsequence::new(query); 393 | self.create_stream(automaton) 394 | } 395 | } 396 | 397 | pub type Searchable<'a> = SearchableInner<'a, Duplicates>; 398 | #[cfg(feature = "rkyv")] 399 | pub type ArchivedSearchable<'a> = SearchableInner<'a, ArchivedDuplicates>; 400 | 401 | #[cfg(test)] 402 | mod tests { 403 | use super::*; 404 | 405 | type TestResult = Result<(), Box>; 406 | 407 | trait SearchStreamIntoVec { 408 | fn into_vec(self) -> Vec<(String, u64, Score)>; 409 | } 410 | 411 | impl SearchStreamIntoVec for S { 412 | fn into_vec(mut self) -> Vec<(String, u64, Score)> { 413 | let mut items = Vec::new(); 414 | while let Some((key, index, score)) = self.next() { 415 | items.push((key.to_string(), index, score)); 416 | } 417 | items 418 | } 419 | } 420 | 421 | #[test] 422 | fn test_build() -> TestResult { 423 | let items = vec![("bar", 1), ("foo", 0)]; 424 | let storage = SearchableStorage::build_from_iter(items)?; 425 | let results = storage 426 | .to_searchable()? 
427 | .map 428 | .as_ref() 429 | .stream() 430 | .into_str_vec()?; 431 | assert_eq!(results.len(), 2); 432 | assert_eq!( 433 | results, 434 | vec![("bar".to_string(), 1), ("foo".to_string(), 0)] 435 | ); 436 | Ok(()) 437 | } 438 | 439 | #[test] 440 | fn test_searchable_exact_match() -> TestResult { 441 | let items = vec![("fo", 1), ("foo", 0), ("foobar", 2)]; 442 | let storage = SearchableStorage::build_from_iter(items)?; 443 | let searchable = storage.to_searchable()?; 444 | 445 | // negative match 446 | let results = searchable.exact_match("bar").into_vec(); 447 | assert_eq!(results.len(), 0); 448 | 449 | // positive match 450 | let results = searchable.exact_match("foo").into_vec(); 451 | assert_eq!(results.len(), 1); 452 | assert_eq!(results, vec!(("foo".to_string(), 0, 0))); 453 | 454 | Ok(()) 455 | } 456 | 457 | #[test] 458 | fn test_searchable_starts_with() -> TestResult { 459 | let items = vec![("fo", 1), ("foo", 0), ("foobar", 2)]; 460 | let storage = SearchableStorage::build_from_iter(items)?; 461 | let searchable = storage.to_searchable()?; 462 | 463 | // negative match 464 | let results = searchable.starts_with("b").into_vec(); 465 | assert_eq!(results.len(), 0); 466 | 467 | // positive match 468 | let results = searchable.starts_with("foo").into_vec(); 469 | assert_eq!(results.len(), 2); 470 | assert_eq!( 471 | results, 472 | vec!(("foo".to_string(), 0, 0), ("foobar".to_string(), 2, 0)) 473 | ); 474 | 475 | Ok(()) 476 | } 477 | 478 | #[test] 479 | fn test_searchable_subsequence() -> TestResult { 480 | let items = vec![("bar_foo", 2), ("foo", 0), ("foo_bar", 1)]; 481 | let storage = SearchableStorage::build_from_iter(items)?; 482 | let searchable = storage.to_searchable()?; 483 | 484 | // negative match 485 | let results = searchable.subsequence("m").into_vec(); 486 | assert_eq!(results.len(), 0); 487 | 488 | // positive match 489 | let results = searchable.subsequence("fb").into_vec(); 490 | assert_eq!(results.len(), 1); 491 | assert_eq!(results, 
// NOTE(review): this chunk is a flattened repo dump; code below is re-wrapped onto
// conventional lines. The enclosing `mod tests` header and the start of the test
// whose tail appears first are outside the visible range, so the leading fragment
// is preserved verbatim.
            vec!(("foo_bar".to_string(), 1, 0)));

        // other positive match
        let results = searchable.subsequence("bf").into_vec();
        assert_eq!(results.len(), 1);
        assert_eq!(results, vec!(("bar_foo".to_string(), 2, 0)));

        Ok(())
    }

    // rescore() must be able to derive a replacement score from either the
    // matched key or the document index.
    #[test]
    fn test_scored_stream() -> TestResult {
        let items = vec![("fo", 1), ("foo", 0), ("foobar", 2)];
        let storage = SearchableStorage::build_from_iter(items)?;
        let searchable = storage.to_searchable()?;

        // use key
        let results = searchable
            .starts_with("foo")
            .rescore(|key, _, _| key.len() as u64)
            .into_vec();
        assert_eq!(results.len(), 2);
        assert_eq!(
            results,
            vec!(("foo".to_string(), 0, 3), ("foobar".to_string(), 2, 6))
        );

        // use index
        let results = searchable
            .starts_with("foo")
            .rescore(|_, idx, _| (idx * 2) as u64)
            .into_vec();
        assert_eq!(results.len(), 2);
        assert_eq!(
            results,
            vec!(("foo".to_string(), 0, 0), ("foobar".to_string(), 2, 4))
        );

        Ok(())
    }

    // filter() keeps only entries accepted by the predicate; both the keeping
    // and the dropping direction are exercised.
    #[test]
    fn test_filter() -> TestResult {
        let items = vec![("fo", 1), ("foo", 0), ("foobar", 2)];
        let storage = SearchableStorage::build_from_iter(items)?;
        let searchable = storage.to_searchable()?;

        let results = searchable
            .starts_with("foo")
            .filter(|key, _, _| key != "foobar")
            .into_vec();
        assert_eq!(results.len(), 1);
        assert_eq!(results, vec!(("foo".to_string(), 0, 0)));

        let results = searchable
            .starts_with("foo")
            .filter(|key, _, _| key == "foobar")
            .into_vec();
        assert_eq!(results.len(), 1);
        assert_eq!(results, vec!(("foobar".to_string(), 2, 0)));

        Ok(())
    }

    // map() may rewrite the (key, index, score) triple; here the index is doubled.
    #[test]
    fn test_map() -> TestResult {
        let items = vec![("fo", 1), ("foo", 2), ("foobar", 3)];
        let storage = SearchableStorage::build_from_iter(items)?;
        let searchable = storage.to_searchable()?;

        let results = searchable
            .starts_with("foo")
            .map(|key, index, score| (key, index * 2, score))
            .into_vec();
        assert_eq!(results.len(), 2);
        assert_eq!(
            results,
            vec!(("foo".to_string(), 4, 0), ("foobar".to_string(), 6, 0))
        );

        Ok(())
    }

    // The same key stored twice must yield one result per stored index.
    #[test]
    fn test_duplicates() -> TestResult {
        let items = vec![("foo", 0), ("foo", 1), ("foobar", 2)];
        let storage = SearchableStorage::build_from_iter(items)?;
        let searchable = storage.to_searchable()?;

        let results = searchable.exact_match("foo").into_vec();
        assert_eq!(results.len(), 2);
        assert_eq!(
            results,
            vec!(("foo".to_string(), 0, 0), ("foo".to_string(), 1, 0))
        );

        Ok(())
    }

    // Fuzzy matching at edit distances 1 and 2, plus rescoring by the exact
    // distance reported by the DFA.
    #[test]
    fn test_levenshtein() -> TestResult {
        let items = vec![
            ("bar", 0),
            ("baz", 1),
            ("boz", 2),
            ("coz", 3),
            ("fob", 4),
            ("foo", 5),
            ("foobar", 6),
            ("something else", 7),
        ];
        let storage = SearchableStorage::build_from_iter(items)?;
        let searchable = storage.to_searchable()?;
        let dfa_builder_1 = LevenshteinAutomatonBuilder::new(1, false);
        let dfa_builder_2 = LevenshteinAutomatonBuilder::new(2, false);

        // distance 1, whole-key match: "fob" and "foo" are within 1 edit of "foo"
        let results = searchable
            .levenshtein_exact_match(&dfa_builder_1, "foo")
            .into_vec();
        assert_eq!(results.len(), 2);
        assert_eq!(
            results,
            vec!(("fob".to_string(), 4, 0), ("foo".to_string(), 5, 0))
        );

        // distance 1, prefix match: additionally picks up "foobar"
        let results = searchable
            .levenshtein_starts_with(&dfa_builder_1, "foo")
            .into_vec();
        assert_eq!(results.len(), 3);
        assert_eq!(
            results,
            vec!(
                ("fob".to_string(), 4, 0),
                ("foo".to_string(), 5, 0),
                ("foobar".to_string(), 6, 0)
            )
        );

        // distance 2, whole-key match against "bar"
        let results = searchable
            .levenshtein_exact_match(&dfa_builder_2, "bar")
            .into_vec();
        assert_eq!(results.len(), 3);
        assert_eq!(
            results,
            vec!(
                ("bar".to_string(), 0, 0),
                ("baz".to_string(), 1, 0),
                ("boz".to_string(), 2, 0)
            )
        );

        // raw DFA access: rescore each hit with its exact edit distance
        let dfa = dfa_builder_2.build_prefix_dfa("bar");
        let results = searchable
            .levenshtein(&dfa)
            .rescore(|key, _, _| match dfa.eval(key) {
                LevenshteinDistance::Exact(d) => d as u64,
                LevenshteinDistance::AtLeast(_) => 0,
            })
            .into_vec();
        assert_eq!(results.len(), 3);
        assert_eq!(
            results,
            vec!(
                ("bar".to_string(), 0, 0),
                ("baz".to_string(), 1, 1),
                ("boz".to_string(), 2, 2)
            )
        );

        Ok(())
    }
}
--------------------------------------------------------------------------------
/engine/src/streams.rs:
--------------------------------------------------------------------------------
use crate::searchable::DuplicatesLookup;
use crate::Score;

// NOTE(review): the generic parameter lists below (`<S, F>`) were stripped by the
// text extraction; they are reconstructed from the where-clauses — verify against
// the upstream repository.

/// Stream adapter that drops every entry rejected by a predicate.
/// Constructed via [`SearchStream::filter`].
pub struct FilteredStream<S, F>
where
    S: SearchStream,
    F: Fn(&str, u64, Score) -> bool,
{
    // Scratch buffer holding a copy of the current key (see borrow-checker
    // note in `next()`).
    cur_key: String,
    filter: F,
    wrapped: S,
}

impl<S, F> FilteredStream<S, F>
where
    S: SearchStream,
    F: Fn(&str, u64, Score) -> bool,
{
    /// Wraps `streamer`, keeping only entries for which `filter` returns true.
    pub fn new(streamer: S, filter: F) -> Self {
        Self {
            cur_key: String::new(),
            filter,
            wrapped: streamer,
        }
    }
}

impl<S, F> SearchStream for FilteredStream<S, F>
where
    S: SearchStream,
    F: Fn(&str, u64, Score) -> bool,
{
    fn next(&mut self) -> Option<(&str, u64, Score)> {
        let filter_fn = &self.filter;
        // Pull from the wrapped stream until an entry passes the predicate.
        while let Some((key, index, score)) = self.wrapped.next() {
            if !filter_fn(key, index, score) {
                continue;
            }

            // borrow checker workaround: we can't seem to pass the key as-is, so we make
            // an (useless) copy and return that instead
            self.cur_key.clear();
            self.cur_key.push_str(key);
            return Some((self.cur_key.as_str(),
// NOTE(review): continuation of FilteredStream::next() from the previous chunk;
// the leading fragment completes the `return Some((...))` expression.
                         index, score));
        }

        None
    }
}

// NOTE(review): the generic parameter lists below (`<S, F>`, `<Self, F>`,
// `Option<D::Iter>`) were stripped by the text extraction and are reconstructed
// from the where-clauses — verify against the upstream repository.

/// Stream adapter that rewrites each `(key, index, score)` triple with a
/// mapping closure. Constructed via [`SearchStream::map`].
pub struct MappedStream<S, F>
where
    S: SearchStream,
    F: Fn(&str, u64, Score) -> (&str, u64, Score),
{
    mapper: F,
    wrapped: S,
}

impl<S, F> MappedStream<S, F>
where
    S: SearchStream,
    F: Fn(&str, u64, Score) -> (&str, u64, Score),
{
    /// Wraps `streamer`, applying `mapper` to every emitted entry.
    pub fn new(streamer: S, mapper: F) -> Self {
        Self {
            mapper,
            wrapped: streamer,
        }
    }
}

impl<S, F> SearchStream for MappedStream<S, F>
where
    S: SearchStream,
    F: Fn(&str, u64, Score) -> (&str, u64, Score),
{
    fn next(&mut self) -> Option<(&str, u64, Score)> {
        let mapper_fn = &self.mapper;
        self.wrapped
            .next()
            .map(|(key, index, score)| mapper_fn(key, index, score))
    }
}

/// Stream adapter that replaces each entry's score with the value computed by
/// a scoring closure. Constructed via [`SearchStream::rescore`].
pub struct RescoredStream<S, F>
where
    S: SearchStream,
    F: Fn(&str, u64, Score) -> Score,
{
    scorer: F,
    wrapped: S,
}

impl<S, F> RescoredStream<S, F>
where
    S: SearchStream,
    F: Fn(&str, u64, Score) -> Score,
{
    /// Wraps `streamer`, recomputing every score with `scorer`.
    pub fn new(streamer: S, scorer: F) -> Self {
        Self {
            scorer,
            wrapped: streamer,
        }
    }
}

impl<S, F> SearchStream for RescoredStream<S, F>
where
    S: SearchStream,
    F: Fn(&str, u64, Score) -> Score,
{
    fn next(&mut self) -> Option<(&str, u64, Score)> {
        let scorer_fn = &self.scorer;
        self.wrapped
            .next()
            .map(|(key, index, score)| (key, index, scorer_fn(key, index, score)))
    }
}

/// Stream adapter that expands an entry whose index has registered duplicates
/// into one entry per duplicate index (same key, same score).
pub struct DeduplicatedStream<'a, S, D>
where
    S: SearchStream,
    D: DuplicatesLookup,
{
    // Key/score of the entry currently being expanded.
    cur_key: String,
    // Pending duplicate indices still to be emitted for `cur_key`.
    // NOTE(review): inner type stripped by extraction; presumed to be the
    // lookup's iterator associated type — confirm upstream.
    cur_iter: Option<D::Iter>,
    cur_score: Score,
    duplicates: &'a D,
    wrapped: S,
}

impl<'a, S, D> DeduplicatedStream<'a, S, D>
where
    S: SearchStream,
    D: DuplicatesLookup,
{
    /// Wraps `streamer`, expanding duplicate indices via `duplicates`.
    pub fn new(streamer: S, duplicates: &'a D) -> Self {
        Self {
            cur_key: String::new(),
            cur_iter: None,
            cur_score: 0,
            duplicates,
            wrapped: streamer,
        }
    }
}

impl<'a, S, D> SearchStream for DeduplicatedStream<'a, S, D>
where
    S: SearchStream,
    D: DuplicatesLookup,
{
    fn next(&mut self) -> Option<(&str, u64, Score)> {
        // First drain any pending duplicate indices from the previous entry.
        if let Some(iter) = &mut self.cur_iter {
            match iter.next() {
                Some(index) => return Some((self.cur_key.as_str(), index, self.cur_score)),
                None => {
                    self.cur_iter = None;
                    self.cur_key.clear();
                    self.cur_score = 0;
                }
            }
        }

        self.wrapped
            .next()
            .map(|(key, index, score)| match self.duplicates.get(index) {
                Some(mut dupes) => {
                    // The first duplicate index is returned right away; the
                    // remainder is buffered and drained on subsequent calls.
                    let index = dupes.next().unwrap();
                    self.cur_key.clear();
                    self.cur_key.push_str(key);
                    self.cur_iter = Some(dupes);
                    self.cur_score = score;
                    (key, index, score)
                }
                None => (key, index, score),
            })
    }
}

/// FST stream on which various operations can be chained.
pub trait SearchStream {
    /// Emits the next scored document in this stream, or `None` to indicate
    /// the stream has been exhausted.
    ///
    /// It is not specified what a stream does after `None` is emitted. In most
    /// cases, `None` should be emitted on every subsequent call.
    fn next(&mut self) -> Option<(&str, u64, Score)>;

    /// Scores a stream, using the given closure.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// use porigon::{SearchableStorage, SearchStream};
    ///
    /// let storage = SearchableStorage::build_from_iter(vec!(
    ///     ("foo", 0),
    ///     ("foobar", 1))
    /// ).unwrap();
    /// let searchable = storage.to_searchable().unwrap();
    /// let mut strm = searchable
    ///     .starts_with("foo")
    ///     .rescore(|key, _, _| key.len() as porigon::Score)
    /// ;
    /// assert_eq!(strm.next(), Some(("foo", 0, 3)));
    /// assert_eq!(strm.next(), Some(("foobar", 1, 6)));
    /// assert_eq!(strm.next(), None);
    /// ```
    ///
    /// You can also use this to build upon a previously set score:
    ///
    /// ```
    /// use porigon::{SearchableStorage, SearchStream};
    ///
    /// let storage = SearchableStorage::build_from_iter(vec!(
    ///     ("foo", 0),
    ///     ("foobar", 1))
    /// ).unwrap();
    /// let searchable = storage.to_searchable().unwrap();
    /// let mut strm = searchable
    ///     .starts_with("foo")
    ///     .rescore(|key, _, _| key.len() as porigon::Score)
    ///     .rescore(|_, index, old_score| (old_score << 16) | index)
    /// ;
    /// assert_eq!(strm.next(), Some(("foo", 0, 3 << 16)));
    /// assert_eq!(strm.next(), Some(("foobar", 1, (6 << 16) | 1)));
    /// assert_eq!(strm.next(), None);
    /// ```
    fn rescore<F>(self, func: F) -> RescoredStream<Self, F>
    where
        F: Fn(&str, u64, Score) -> Score,
        Self: Sized,
    {
        RescoredStream::new(self, func)
    }

    /// Filters a stream, using the given closure.
    ///
    /// # Example
    ///
    /// ```
    /// use porigon::{SearchableStorage, SearchStream};
    ///
    /// let storage = SearchableStorage::build_from_iter(vec!(
    ///     ("foo", 0),
    ///     ("foobar", 1))
    /// ).unwrap();
    /// let searchable = storage.to_searchable().unwrap();
    /// let mut strm = searchable
    ///     .starts_with("foo")
    ///     .filter(|key, _, _| key != "foobar")
    /// ;
    /// assert_eq!(strm.next(), Some(("foo", 0, 0)));
    /// assert_eq!(strm.next(), None);
    /// ```
    fn filter<F>(self, func: F) -> FilteredStream<Self, F>
    where
        F: Fn(&str, u64, Score) -> bool,
        Self: Sized,
    {
        FilteredStream::new(self, func)
    }

    /// Maps over a stream, using the given closure.
    ///
    /// This more of an advanced method, used for changing the stream's key or index. Most probably
    /// you want to use [rescore()](#method.rescore) instead.
    ///
    /// # Example
    ///
    /// ```
    /// use porigon::{SearchableStorage, SearchStream};
    ///
    /// let mut items = vec!(
    ///     ("this is a bar", 15),
    ///     ("is a bar", (1 << 32) | 15),
    ///     ("a bar", (1 << 32) | 15),
    ///     ("bar", (1 << 32) | 15),
    ///     ("barfoo", 16)
    /// );
    /// items.sort_by_key(|(key, _)| *key);
    /// let storage = SearchableStorage::build_from_iter(items).unwrap();
    /// let searchable = storage.to_searchable().unwrap();
    /// let mut strm = searchable
    ///     .starts_with("bar")
    ///     .map(|key, index, score| (key, index & !(1 << 32), score))
    /// ;
    /// assert_eq!(strm.next(), Some(("bar", 15, 0)));
    /// assert_eq!(strm.next(), Some(("barfoo", 16, 0)));
    /// assert_eq!(strm.next(), None);
    /// ```
    fn map<F>(self, func: F) -> MappedStream<Self, F>
    where
        F: Fn(&str, u64, Score) -> (&str, u64, Score),
        Self: Sized,
    {
        MappedStream::new(self, func)
    }
}
--------------------------------------------------------------------------------
/wasm-example/.cargo/config:
--------------------------------------------------------------------------------
# Always compile this crate for the wasm target.
[build]
target = "wasm32-unknown-unknown"
--------------------------------------------------------------------------------
/wasm-example/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "porigon-wasm-example"
version = "0.4.0"
# NOTE(review): the author e-mail address (angle-bracketed) was stripped by the
# text extraction.
authors = ["Maurus Cuelenaere "]
edition = "2021"
publish = false

[lib]
crate-type = ["cdylib", "rlib"]
doctest = false

[dependencies]
rkyv = "0.7.29"
rkyv_derive = "0.7.29"
serde = { version = "1.0", features = ["derive"] }
stats_alloc = "0.1.8"
porigon = { path = "../engine", features = ["rkyv"] }
wasm-bindgen = { version = "0.2", features = ["serde-serialize"] }
--------------------------------------------------------------------------------
/wasm-example/build_fst.js:
--------------------------------------------------------------------------------
// Builds the searchable FST and lookup table from the IMDb TSV datasets
// (downloaded by download_data.sh) and writes both to ./data.
const fs = require('fs');
const readline = require('readline');
const wasmExample = require('./pkg/porigon_wasm_example');

// Returns an async line iterator over the given file.
function openFile(filename) {
    return readline.createInterface({
        input: fs.createReadStream(filename),
        crlfDelay: Infinity,
    });
}

// Lower-cases and strips combining diacritics so queries and keys compare
// accent-insensitively.
function transliterate(s) {
    // https://towardsdatascience.com/difference-between-nfd-nfc-nfkd-and-nfkc-explained-with-python-code-e2631f96ae6c
    // https://en.wikipedia.org/wiki/Combining_character#Unicode_ranges
    return s.normalize("NFKD").replace(/[\u0300-\u036F\u1AB0-\u1AFF\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/g, "").toLowerCase();
}

// Title types excluded from the index.
const IGNORED_TYPES = ['short', 'video', 'tvEpisode', 'tvShort', 'tvSpecial'];

// Parses data/titles.tsv into:
//  - titles: [id, transliteratedTitle] pairs (original title added only when it
//    differs from the primary one)
//  - lookup: id -> { titleType, primaryTitle, startYear } for display
async function readFromTitles() {
    const strm = openFile('./data/titles.tsv');
    const titles = [],
        lookup = {};
    for await (const line of strm) {
        const [tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, genres] = line.split('\t', 9);
        if (tconst === 'tconst') {
            // header line, skip it
            continue;
        }

        if (IGNORED_TYPES.includes(titleType) || startYear === '\\N') {
            continue;
        }

        // "tt0000123" -> 123
        const id = parseInt(tconst.replace(/^tt0*/, ''));
        const transliteratedPrimaryTitle = transliterate(primaryTitle);
        const transliteratedOriginalTitle = transliterate(originalTitle);

        titles.push([id, transliteratedPrimaryTitle]);
        if (transliteratedPrimaryTitle !== transliteratedOriginalTitle) {
            titles.push([id, transliteratedOriginalTitle]);
        }
        lookup[id] = {
            titleType,
            primaryTitle,
            startYear,
        };
    }

    return { titles, lookup };
}

// Parses data/ratings.tsv; returns id -> rating*100 (integer) for ids already
// present in `lookup`, and also stores the float rating on the lookup entry.
async function readFromRatings(lookup) {
    const strm = openFile('./data/ratings.tsv');
    const ratings = {};
    for await (const line of strm) {
        const [tconst, averageRating, numVotes] = line.split('\t', 9);
        if (tconst === 'tconst') {
            // header line, skip it
            continue;
        }

        const id = parseInt(tconst.replace(/^tt0*/, ''));
        if (!(id in lookup)) {
            continue;
        }

        const rating = parseFloat(averageRating);
        ratings[id] = parseInt(rating * 100);
        lookup[id].rating = rating;
    }

    return ratings;
}

// Delegates FST construction to the wasm module.
async function buildFst(titles, ratings) {
    return wasmExample.build({ titles, ratings });
}

async function writeLookupToDisk(lookup) {
    fs.writeFileSync('./data/lookup.json', JSON.stringify(lookup), 'utf8');
}

async function writeFstToDisk(fst) {
    fs.writeFileSync('./data/fst.bin', fst);
}

// Runs `fn`, logging its duration and the current heap usage.
async function timeIt(label, fn) {
    console.log(label + '...');
    console.time(label + ' took');
    const ret = await fn();
    console.timeEnd(label + ' took');
    const memUsageInMb = Math.round(process.memoryUsage().heapUsed / 1024 / 1024);
    console.log(`Current memory usage: ${memUsageInMb} MB`);

    return ret;
}

async function main() {
    const { titles, lookup } = await timeIt('Reading from titles dataset', readFromTitles);
    const ratings = await timeIt('Reading from ratings dataset', () => readFromRatings(lookup));
    const fst = await timeIt('Building FST', () => buildFst(titles, ratings));
    await timeIt('Writing FST to disk', () => writeFstToDisk(fst));
    await timeIt('Writing lookup to disk', () => writeLookupToDisk(lookup));
}

main()
    .catch(err => {
        console.error(err);
        process.exit(1);
    });
--------------------------------------------------------------------------------
/wasm-example/download_data.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Fetches and unpacks the IMDb datasets consumed by build_fst.js.
set -e

rm -rf data
mkdir data

echo Downloading dataset...
wget -O data/titles.tsv.gz https://datasets.imdbws.com/title.basics.tsv.gz
wget -O data/ratings.tsv.gz https://datasets.imdbws.com/title.ratings.tsv.gz

echo Unzipping dataset...
gunzip data/titles.tsv.gz
gunzip data/ratings.tsv.gz

echo All done!
--------------------------------------------------------------------------------
/wasm-example/query_fst.js:
--------------------------------------------------------------------------------
// Interactive REPL over the prebuilt FST: reads queries from stdin and prints
// scored title matches resolved through the lookup table.
const fs = require('fs');
const readline = require('readline');
const { Searcher, memory_stats } = require('./pkg/porigon_wasm_example');

// Lower-cases and strips combining diacritics; must mirror the normalization
// used in build_fst.js so queries match the indexed keys.
function transliterate(s) {
    // https://towardsdatascience.com/difference-between-nfd-nfc-nfkd-and-nfkc-explained-with-python-code-e2631f96ae6c
    // https://en.wikipedia.org/wiki/Combining_character#Unicode_ranges
    return s.normalize("NFKD").replace(/[\u0300-\u036F\u1AB0-\u1AFF\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/g, "").toLowerCase();
}

function askUserForQuery(rl) {
    return new Promise((resolve) => {
        rl.question('Search query: ', resolve);
    })
}

async function main() {
    console.time('Reading lookup');
    const lookup = JSON.parse(fs.readFileSync('./data/lookup.json', 'utf8'));
    console.timeEnd('Reading lookup');
    console.time('Reading FST');
    const fst_data = fs.readFileSync('./data/fst.bin', null);
    console.timeEnd('Reading FST');
    console.time('Initing searcher');
    const searcher = new Searcher(fst_data, 10);
    console.timeEnd('Initing searcher');
    const memUsageInMb = Math.round(process.memoryUsage().heapUsed / 1024 / 1024);
    console.log(`Process memory usage: ${memUsageInMb}MB`);

    const rl = readline.createInterface({
        input: process.stdin,
        output: process.stdout,
        // Tab-completion backed by the searcher itself.
        completer: (line) => {
            const results = searcher.search(transliterate(line));
            const completions = Array.from(results).map(({ index }) => lookup[index].primaryTitle);
            const dedupedCompletions = Array.from(new Set(completions));
            return [dedupedCompletions, line];
        }
    });

    let query;
    do {
        console.log('');
        query = transliterate(await askUserForQuery(rl));
        if (!query) {
            break;
        }

        console.time('Querying FST took');
        const results = searcher.search(query);
        console.timeEnd('Querying FST took');

        const stats = memory_stats();
        const bytesUsedInMb = Math.round(stats.bytes_used / 1024 / 1024 * 100) / 100;
        console.log(`WASM memory usage: ${bytesUsedInMb}MB`);

        for (const { index, score } of results) {
            const { primaryTitle, titleType, rating, startYear } = lookup[index];
            const imdbId = 'tt' + index.toString().padStart(7, '0');
            const formattedRating = (rating || 0).toString().padStart(3, ' ');
            console.log(`  * {${formattedRating}} ${primaryTitle} [${startYear}] (https://imdb.com/title/${imdbId}/) [type=${titleType}, score=${score}]`);
        }
    } while (query);

    rl.close();
}

main()
    .catch(err => {
        console.error(err);
        process.exit(1);
    });

--------------------------------------------------------------------------------
/wasm-example/src/lib.rs:
--------------------------------------------------------------------------------
extern crate stats_alloc;
extern crate wasm_bindgen;

use porigon::levenshtein::LevenshteinAutomatonBuilder;
use porigon::{SearchStream, SearchableStorage, TopScoreCollector};
use rkyv::{
    archived_root,
    ser::{serializers::AllocSerializer, Serializer},
    Archive, Serialize,
};
use stats_alloc::{StatsAlloc, INSTRUMENTED_SYSTEM};
use std::alloc::System;
use std::collections::HashMap;
use wasm_bindgen::prelude::*;

// Instrumented system allocator so the JS side can query allocation statistics.
// NOTE(review): the `<System>` type argument was stripped by the text
// extraction and is reconstructed here — verify upstream.
#[global_allocator]
static GLOBAL: &StatsAlloc<System> = &INSTRUMENTED_SYSTEM;

/// Allocation statistics reported to JavaScript by [`memory_stats`].
#[derive(serde::Serialize)]
pub struct MemoryStats {
    pub allocations: usize,
    pub deallocations: usize,
    // Allocated minus deallocated bytes; signed so an imbalance is visible.
    pub bytes_used: isize,
}

/// Snapshot of the global allocator's statistics, serialized for JS.
#[wasm_bindgen]
pub fn memory_stats() -> JsValue {
    let stats = GLOBAL.stats();
    let result = MemoryStats {
        // reallocations are counted as allocations here
        allocations: stats.allocations + stats.reallocations,
        deallocations:
// NOTE(review): continuation of memory_stats() from the previous chunk; the
// leading fragment completes its struct literal. Several type arguments below
// (`Vec<u8>`, `HashMap<u64, u32>`, `archived_root::<SearchData>`,
// `Result<Vec<u8>, JsError>`, `Vec<SearchResult>`) were stripped by the text
// extraction and are reconstructed — verify against the upstream repository.
                       stats.deallocations,
        bytes_used: stats.bytes_allocated as isize - stats.bytes_deallocated as isize,
    };

    JsValue::from_serde(&result).unwrap()
}

/// Everything the searcher needs, serialized into one rkyv archive:
/// the FST storage plus the per-title rating table used for scoring.
#[derive(Archive, Serialize)]
struct SearchData {
    titles: SearchableStorage,
    ratings: HashMap<u64, u32>,
}

impl ArchivedSearchData {
    /// Reinterprets `bytes` as an archived `SearchData` without copying.
    pub fn from_bytes(bytes: &Vec<u8>) -> &Self {
        // SAFETY: callers must pass bytes produced by `SearchData::to_bytes`;
        // `archived_root` performs no validation of the archive.
        unsafe { archived_root::<SearchData>(bytes.as_slice()) }
    }
}

impl SearchData {
    /// Serializes `self` into an rkyv archive suitable for `from_bytes`.
    pub fn to_bytes(&self) -> Result<Vec<u8>, JsError> {
        let mut serializer = AllocSerializer::<4096>::default();
        serializer.serialize_value(self)?;
        Ok(serializer.into_serializer().into_inner().into_vec())
    }
}

/// JS-facing search handle: owns the archive bytes, a top-N collector and
/// pre-built Levenshtein automaton builders for distances 1 and 2.
#[wasm_bindgen]
pub struct Searcher {
    data: Vec<u8>,
    collector: TopScoreCollector,
    levenshtein_builder_1: LevenshteinAutomatonBuilder,
    levenshtein_builder_2: LevenshteinAutomatonBuilder,
}

/// One search hit as returned to JavaScript.
#[derive(serde::Serialize)]
pub struct SearchResult {
    pub index: u64,
    pub score: u64,
}

#[wasm_bindgen]
impl Searcher {
    /// `data` is an archive produced by [`build`]; `limit` caps the number of
    /// results returned per query.
    #[wasm_bindgen(constructor)]
    pub fn new(data: Vec<u8>, limit: usize) -> Searcher {
        let collector = TopScoreCollector::new(limit);
        let levenshtein_builder_1 = LevenshteinAutomatonBuilder::new(1, false);
        let levenshtein_builder_2 = LevenshteinAutomatonBuilder::new(2, false);
        Searcher {
            data,
            collector,
            levenshtein_builder_1,
            levenshtein_builder_2,
        }
    }

    /// Runs `query` through a cascade of matchers (exact, prefix, then fuzzy
    /// at distances 1 and 2), scoring each tier lower than the previous one
    /// and breaking ties by the title's rating.
    pub fn search(&mut self, query: &str) -> JsValue {
        let data = ArchivedSearchData::from_bytes(&self.data);
        let ratings = &data.ratings;
        // Missing rating defaults to 0.
        let get_rating_for = move |index| *ratings.get(&index).unwrap_or(&0) as u64;

        let titles = data.titles.to_searchable().unwrap();

        self.collector.reset();

        self.collector.consume_stream(
            titles
                .exact_match(query)
                .rescore(move |_, index, _| 50000 + get_rating_for(index)),
        );
        self.collector.consume_stream(
            titles
                .starts_with(query)
                .rescore(move |_, index, _| 40000 + get_rating_for(index)),
        );

        if query.len() > 3 {
            // running the levenshtein matchers on short query strings is quite expensive, so don't do that
            self.collector.consume_stream(
                titles
                    .levenshtein_exact_match(&self.levenshtein_builder_1, query)
                    .rescore(move |_, index, _| 30000 + get_rating_for(index)),
            );
            self.collector.consume_stream(
                titles
                    .levenshtein_starts_with(&self.levenshtein_builder_1, query)
                    .rescore(move |_, index, _| 20000 + get_rating_for(index)),
            );
            self.collector.consume_stream(
                titles
                    .levenshtein_exact_match(&self.levenshtein_builder_2, query)
                    .rescore(move |_, index, _| 10000 + get_rating_for(index)),
            );
            self.collector.consume_stream(
                titles
                    .levenshtein_starts_with(&self.levenshtein_builder_2, query)
                    .rescore(move |_, index, _| get_rating_for(index)),
            );
        }

        let results: Vec<SearchResult> = self
            .collector
            .top_documents()
            .iter()
            .map(|doc| SearchResult {
                score: doc.score,
                index: doc.index,
            })
            .collect();

        JsValue::from_serde(&results).unwrap()
    }
}

/// Input payload for [`build`], deserialized from a JS object.
#[derive(serde::Serialize, serde::Deserialize)]
pub struct BuildData {
    titles: Vec<(u64, String)>,
    ratings: HashMap<u64, u32>,
}

/// Builds the serialized `SearchData` archive from the raw title/rating data
/// assembled by build_fst.js.
#[wasm_bindgen]
pub fn build(val: &JsValue) -> Result<Vec<u8>, JsError> {
    let data: BuildData = val.into_serde()?;
    let build_searchable = |input: Vec<(u64, String)>| {
        // Flip (id, title) into (title, id) and sort by title: FST
        // construction requires keys in lexicographic order.
        let mut i: Vec<(&str, u64)> = input
            .iter()
            .map(|(key, val)| (val.as_str(), *key))
            .collect();
        i.sort_by_key(|(key, _)| *key);
        SearchableStorage::build_from_iter(i)
    };
    let search_data = SearchData {
        titles: build_searchable(data.titles)?,
        ratings: data.ratings,
    };

    Ok(search_data.to_bytes()?)
}
--------------------------------------------------------------------------------