├── rustfmt.toml ├── .gitignore ├── .github ├── dependabot.yml └── workflows │ └── ci.yaml ├── Cargo.toml ├── LICENSE ├── .devcontainer └── devcontainer.json ├── tests └── web.rs ├── src └── lib.rs └── README.md /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 80 2 | tab_spaces = 2 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | bin/ 5 | pkg/ 6 | wasm-pack.log 7 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: cargo 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "08:00" 8 | open-pull-requests-limit: 10 9 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "haiku-search" 3 | version = "0.1.0" 4 | authors = ["Luis Cardoso "] 5 | edition = "2021" 6 | description = "A fast and memory-efficient fuzzy search library for text documents." 
7 | readme = "README.md" 8 | license-file = "LICENSE" 9 | repository = "https://github.com/beowolx/haiku-search" 10 | keywords = ["fuzzy", "search", "text", "document"] 11 | categories = ["fuzzy", "search", "text", "document"] 12 | 13 | [lib] 14 | crate-type = ["cdylib", "rlib"] 15 | 16 | [dependencies] 17 | wasm-bindgen = "0.2.84" 18 | serde = { version = "1.0", features = ["derive"] } 19 | serde-wasm-bindgen = "0.6.5" 20 | 21 | [dev-dependencies] 22 | wasm-bindgen-test = "0.3.34" 23 | 24 | [profile.release] 25 | opt-level = "s" 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 Luis Cardoso 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 
26 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | name: Test 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | - uses: dtolnay/rust-toolchain@stable 12 | - uses: Swatinem/rust-cache@v2 13 | 14 | - name: Install wasm-pack 15 | run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh 16 | 17 | - name: Build 18 | run: wasm-pack build 19 | 20 | - name: Run Tests 21 | run: | 22 | cargo test 23 | wasm-pack test --headless --chrome 24 | wasm-pack test --headless --firefox 25 | 26 | 27 | fmt: 28 | name: Rustfmt 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v4 32 | - uses: dtolnay/rust-toolchain@stable 33 | with: 34 | components: rustfmt 35 | - name: Enforce formatting 36 | run: cargo fmt --check 37 | 38 | clippy: 39 | name: Clippy 40 | runs-on: ubuntu-latest 41 | steps: 42 | - uses: actions/checkout@v4 43 | - uses: dtolnay/rust-toolchain@stable 44 | with: 45 | components: clippy 46 | - uses: Swatinem/rust-cache@v2 47 | - name: Linting 48 | run: cargo clippy -- -D warnings -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/rust 3 | { 4 | "name": "Rust", 5 | // Or use a Dockerfile or Docker Compose file. 
More info: https://containers.dev/guide/dockerfile 6 | "image": "mcr.microsoft.com/devcontainers/rust:1-1-bullseye", 7 | "features": { 8 | "ghcr.io/devcontainers-contrib/features/node-asdf:0": {}, 9 | "ghcr.io/lumenpink/devcontainer-features/wasm-pack:0": {} 10 | } 11 | 12 | // Use 'mounts' to make the cargo cache persistent in a Docker Volume. 13 | // "mounts": [ 14 | // { 15 | // "source": "devcontainer-cargo-cache-${devcontainerId}", 16 | // "target": "/usr/local/cargo", 17 | // "type": "volume" 18 | // } 19 | // ] 20 | 21 | // Features to add to the dev container. More info: https://containers.dev/features. 22 | // "features": {}, 23 | 24 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 25 | // "forwardPorts": [], 26 | 27 | // Use 'postCreateCommand' to run commands after the container is created. 28 | // "postCreateCommand": "rustc --version", 29 | 30 | // Configure tool-specific properties. 31 | // "customizations": {}, 32 | 33 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 34 | // "remoteUser": "root" 35 | } 36 | -------------------------------------------------------------------------------- /tests/web.rs: -------------------------------------------------------------------------------- 1 | //! Test suite for the Web and headless browsers. 
2 |
3 | #![cfg(target_arch = "wasm32")]
4 |
5 | extern crate wasm_bindgen_test;
6 | use haiku_search::SearchConfig;
7 | use haiku_search::SearchEngine;
8 | use serde_wasm_bindgen;
9 | use wasm_bindgen_test::*;
10 |
11 | wasm_bindgen_test_configure!(run_in_browser);
12 |
13 | #[wasm_bindgen_test]
14 | fn test_bitap_exact_match() {
15 |   let config = SearchConfig::new(3, 0);
16 |   let engine = SearchEngine::new(
17 |     vec!["hello".to_string(), "hallo".to_string()],
18 |     serde_wasm_bindgen::to_value(&config).unwrap(),
19 |   );
20 |   let results = engine.search("hello");
21 |   assert!(results.iter().any(|result| result.text() == "hello"
22 |     && (result.score() - 1.0).abs() < f32::EPSILON));
23 | }
24 |
25 | #[wasm_bindgen_test]
26 | fn test_bitap_with_errors() {
27 |   let config = SearchConfig::new(3, 1);
28 |   let engine = SearchEngine::new(
29 |     vec!["Brian Doe".to_string(), "Brain Dose".to_string()],
30 |     serde_wasm_bindgen::to_value(&config).unwrap(),
31 |   );
32 |   let results = engine.search("Brian");
33 |   assert!(results
34 |     .iter()
35 |     .any(|result| result.text() == "Brian Doe" && result.score() >= 0.8));
36 | }
37 |
38 | #[wasm_bindgen_test]
39 | fn test_bitap_no_match() {
40 |   let config = SearchConfig::new(3, 0);
41 |   let engine = SearchEngine::new(
42 |     vec!["world".to_string()],
43 |     serde_wasm_bindgen::to_value(&config).unwrap(),
44 |   );
45 |   let results = engine.search("word");
46 |   assert!(results.is_empty());
47 | }
48 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 | use std::collections::HashMap;
3 | use wasm_bindgen::prelude::*;
4 |
5 | /// Longest query, in bytes, that the matcher accepts. Each pattern position
6 | /// takes one bit of a `usize` mask, and `usize` is 32 bits wide on wasm32.
7 | const MAX_PATTERN_LEN: usize = 32;
8 |
9 | /// Tunable parameters for indexing and matching.
10 | #[wasm_bindgen]
11 | #[derive(Serialize, Deserialize)]
12 | pub struct SearchConfig {
13 |   /// Number of characters in each indexed n-gram.
14 |   pub ngram_size: usize,
15 |   /// Maximum number of mismatching positions tolerated in a match.
16 |   pub max_distance: usize,
17 | }
18 |
19 | #[wasm_bindgen]
20 | impl SearchConfig {
21 |   /// Builds a configuration from an n-gram size and a mismatch budget.
22 |   pub fn new(ngram_size: usize, max_distance: usize) -> SearchConfig {
23 |     SearchConfig {
24 |       ngram_size,
25 |       max_distance,
26 |     }
27 |   }
28 |
29 |   /// Returns every window of `ngram_size` consecutive characters of `input`.
30 |   ///
31 |   /// The result is empty when `ngram_size` is zero or `input` has fewer than
32 |   /// `ngram_size` characters. Windows are taken over `char`s, not bytes, so
33 |   /// non-ASCII input cannot be sliced in the middle of a character (which
34 |   /// would panic). ASCII input yields the same n-grams as byte slicing.
35 |   pub fn generate_ngrams(&self, input: &str) -> Vec<String> {
36 |     // `windows(0)` panics, so a zero size is rejected up front.
37 |     if self.ngram_size == 0 {
38 |       return Vec::new();
39 |     }
40 |     let chars: Vec<char> = input.chars().collect();
41 |     chars
42 |       .windows(self.ngram_size)
43 |       .map(|window| window.iter().collect::<String>())
44 |       .collect()
45 |   }
46 | }
47 |
48 | /// One matching document together with its similarity score.
49 | #[wasm_bindgen]
50 | pub struct SearchResult {
51 |   text: String,
52 |   score: f32,
53 | }
54 |
55 | #[wasm_bindgen]
56 | impl SearchResult {
57 |   /// Creates a result from the matched document text and its score.
58 |   #[wasm_bindgen(constructor)]
59 |   pub fn new(text: String, score: f32) -> SearchResult {
60 |     SearchResult { text, score }
61 |   }
62 |
63 |   /// Returns a copy of the matched document text.
64 |   #[wasm_bindgen(getter)]
65 |   pub fn text(&self) -> String {
66 |     self.text.clone()
67 |   }
68 |
69 |   /// Returns the score stored at construction; the matcher produces values
70 |   /// of `1.0 - mismatches / pattern_length`, so `1.0` is an exact occurrence.
71 |   #[wasm_bindgen(getter)]
72 |   pub fn score(&self) -> f32 {
73 |     self.score
74 |   }
75 | }
76 |
77 | /// N-gram index over a fixed list of documents.
78 | #[wasm_bindgen]
79 | pub struct SearchEngine {
80 |   /// Maps each n-gram to the indices of the documents containing it.
81 |   index: HashMap<String, Vec<usize>>,
82 |   documents: Vec<String>,
83 |   config: SearchConfig,
84 | }
85 |
86 | #[wasm_bindgen]
87 | impl SearchEngine {
88 |   /// Builds an engine over `data` and indexes every document.
89 |   ///
90 |   /// Panics when `config` cannot be deserialized into a `SearchConfig`.
91 |   #[wasm_bindgen(constructor)]
92 |   pub fn new(data: Vec<String>, config: JsValue) -> SearchEngine {
93 |     let config: SearchConfig = serde_wasm_bindgen::from_value(config)
94 |       .expect("config must provide `ngram_size` and `max_distance`");
95 |     let mut engine = SearchEngine {
96 |       index: HashMap::new(),
97 |       documents: data,
98 |       config,
99 |     };
100 |     engine.index_documents();
101 |     engine
102 |   }
103 |
104 |   /// Records, for every n-gram of every document, the document's index.
105 |   fn index_documents(&mut self) {
106 |     for (id, text) in self.documents.iter().enumerate() {
107 |       for ngram in self.config.generate_ngrams(text) {
108 |         self.index.entry(ngram).or_default().push(id);
109 |       }
110 |     }
111 |   }
112 |
113 |   /// Returns the documents that contain an approximate occurrence of `query`.
114 |   ///
115 |   /// Documents sharing at least one n-gram with `query` are candidates; each
116 |   /// candidate is then checked by `bitap_search`. Results are ordered by
117 |   /// descending score, ties keeping document order. An empty query, or one
118 |   /// longer than `MAX_PATTERN_LEN` bytes, yields no results.
119 |   #[wasm_bindgen]
120 |   pub fn search(&self, query: &str) -> Vec<SearchResult> {
121 |     let pattern = query.as_bytes();
122 |     // Rejecting these up front keeps every `1 << i` below within the mask
123 |     // width; `bitap_search` would reject them for each candidate anyway.
124 |     if pattern.is_empty() || pattern.len() > MAX_PATTERN_LEN {
125 |       return Vec::new();
126 |     }
127 |
128 |     // Bit `i` of `pattern_mask[b]` is set when byte `b` sits at position `i`
129 |     // of the query. Indexing by byte can never leave the 256-entry table.
130 |     let mut pattern_mask = [0usize; 256];
131 |     for (i, &byte) in pattern.iter().enumerate() {
132 |       pattern_mask[byte as usize] |= 1 << i;
133 |     }
134 |
135 |     // Sorted, de-duplicated ids give a deterministic candidate order.
136 |     let mut candidate_ids: Vec<usize> = self
137 |       .config
138 |       .generate_ngrams(query)
139 |       .iter()
140 |       .filter_map(|ngram| self.index.get(ngram))
141 |       .flatten()
142 |       .copied()
143 |       .collect();
144 |     candidate_ids.sort_unstable();
145 |     candidate_ids.dedup();
146 |
147 |     let mut results: Vec<SearchResult> = candidate_ids
148 |       .into_iter()
149 |       .filter_map(|id| {
150 |         self.bitap_search(&self.documents[id], query, &pattern_mask)
151 |       })
152 |       .collect();
153 |     // Stable sort: equal scores stay in ascending document order.
154 |     results.sort_by(|a, b| b.score.total_cmp(&a.score));
155 |     results
156 |   }
157 |
158 |   /// Returns `text` as a result when some window of it, as long as
159 |   /// `pattern`, differs from `pattern` in at most `max_distance` byte
160 |   /// positions; the first such window decides the score.
161 |   ///
162 |   /// Returns `None` when `pattern` is empty or longer than
163 |   /// `MAX_PATTERN_LEN` bytes, or when `text` is shorter than `pattern`.
164 |   fn bitap_search(
165 |     &self,
166 |     text: &str,
167 |     pattern: &str,
168 |     pattern_mask: &[usize; 256],
169 |   ) -> Option<SearchResult> {
170 |     let pattern_len = pattern.len();
171 |     if pattern_len == 0 || pattern_len > MAX_PATTERN_LEN {
172 |       return None;
173 |     }
174 |
175 |     let max_mismatches = self.config.max_distance;
176 |     // Byte windows cannot split a `str` at a non-character boundary and
177 |     // yield nothing when `text` is shorter than `pattern`.
178 |     text.as_bytes().windows(pattern_len).find_map(|window| {
179 |       // Position `j` matches when the pattern holds this byte at `j`. Each
180 |       // position is judged on its own, so one mismatch does not also
181 |       // disqualify every position after it.
182 |       let mismatches = window
183 |         .iter()
184 |         .enumerate()
185 |         .filter(|&(j, &byte)| pattern_mask[byte as usize] & (1 << j) == 0)
186 |         .count();
187 |       (mismatches <= max_mismatches).then(|| {
188 |         let score = 1.0 - mismatches as f32 / pattern_len as f32;
189 |         SearchResult::new(text.to_string(), score)
190 |       })
191 |     })
192 |   }
193 | }
194 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Haiku-Search
2 |
3 | [![Build Status](https://github.com/beowolx/haiku/actions/workflows/ci.yaml/badge.svg)](https://github.com/beowolx/haiku/actions)
4 | [![npm](https://img.shields.io/npm/v/haiku-search)](https://www.npmjs.com/package/haiku-search)
5 |
6 | ## Overview
7 |
8 | Haiku-Search is a high-performance fuzzy search library designed for web applications. It is built using Rust and compiled to WebAssembly, providing lightning-fast search capabilities directly in the browser.
This combination allows Haiku-Search to execute complex search algorithms efficiently, taking advantage of Rust's performance and safety features. 9 | 10 | ## Powered by Rust and WebAssembly 11 | 12 | Haiku-Search leverages the power of Rust and WebAssembly to run directly in web browsers, offering significant performance improvements over traditional JavaScript search libraries. By compiling to WebAssembly, Haiku-Search provides near-native speed, making it ideal for applications requiring quick search responses on large datasets. 13 | 14 | ## Algorithms 15 | 16 | Haiku-Search implements two primary algorithms: 17 | 18 | - **N-gram Indexing**: This technique breaks down text into chunks (n-grams) and indexes them for quick lookup, allowing us to quickly narrow down potential matches. 19 | - **Bitap Algorithm**: For precise matching, Haiku-Search uses the [Bitap algorithm](https://en.wikipedia.org/wiki/Bitap_algorithm), which supports a configurable number of errors. It is effective for short to medium text lengths and allows for approximate string matching. 
20 | 21 | ## Installation 22 | 23 | To install, just run: 24 | 25 | ```bash 26 | npm install haiku-search 27 | ``` 28 | 29 | ## Usage Example 30 | 31 | Here is an example of how to use Haiku-Search without a bundler in your app: 32 | 33 | ```js 34 | import init, { 35 | SearchEngine, 36 | } from "./node_modules/haiku-search/haiku_search.js"; 37 | 38 | async function demoHaikuSearch() { 39 | // Initialize the WASM module; needed only once, and only when not using a bundler 40 | await init(); 41 | const data = ["Apple Pie", "Banana Bread", "Carrot Cake"]; 42 | const haikuEngine = new SearchEngine(data, { 43 | ngram_size: 3, 44 | max_distance: 1, 45 | }); 46 | 47 | const query = "Apple"; 48 | const result = await haikuEngine.search(query); 49 | 50 | console.log("Search result:", result[0].text); 51 | console.log("Score result:", result[0].score); 52 | } 53 | 54 | demoHaikuSearch(); 55 | ``` 56 | 57 | Currently, WASM modules are only supported by WebPack. So, if you want to use Haiku-Search in a web application, you will have to use WebPack. 58 | 59 | ## Performance Comparison to Fuse.js 60 | 61 | Haiku-Search is designed to be significantly faster than traditional JavaScript libraries like [Fuse.js](https://www.fusejs.io/). In benchmarks, Haiku-Search performs searches up to **13x faster than Fuse.js**. 62 | 63 | ![image](https://github.com/beowolx/haiku/assets/61982523/3684be93-0eb6-4138-9e81-a02ccc5e99d5) 64 | 65 | You can see this chart live here: [Haiku-Search vs Fuse.js Benchmark Results](https://beowolx.github.io/haiku-search/index.html). 66 | 67 | ## Known Limitations 68 | 69 | - **Unicode Support**: Haiku-Search does not currently support unicode characters, which may limit its use in applications requiring internationalization. 70 | - **Pattern Length**: The Bitap algorithm used in Haiku-Search supports patterns up to 32 characters long due to limitations in handling bit masks.
71 | 72 | ## Roadmap 73 | 74 | - Add unicode support 75 | - Improve memory footprint 76 | - Improve code documentation 77 | 78 | ## Contributing 79 | 80 | Contributions to Haiku-Search are welcome! If you're looking to contribute, please: 81 | 82 | - Check out the current issues on GitHub, especially those tagged as "good first issue". 83 | - Fork the repository and make your changes. 84 | - Write clear, commented code. 85 | - Ensure your changes pass all existing tests and add new tests for added functionality. 86 | - Submit a pull request with a detailed description of your changes. 87 | 88 | For major changes, please open an issue first to discuss what you would like to change. 89 | 90 | Thank you for your interest in improving Haiku-Search! 91 | --------------------------------------------------------------------------------