├── rustfmt.toml ├── Cargo.toml ├── .vscode └── settings.json ├── static ├── docfind.js ├── docfind_bg.wasm ├── documents.json ├── docfind_bg.wasm.br ├── install.sh ├── install.ps1 └── index.html ├── .gitignore ├── wasm ├── Cargo.toml ├── index.js └── src │ └── lib.rs ├── .gitattributes ├── cli ├── Cargo.toml └── src │ └── main.rs ├── scripts ├── build.sh ├── build-demo.sh └── version.sh ├── core ├── Cargo.toml ├── english.stop └── src │ ├── lib.rs │ └── tests.rs ├── .github └── workflows │ ├── copilot-setup-steps.yml │ ├── static.yml │ └── ci.yml ├── SECURITY.md ├── LICENSE ├── README.md └── Cargo.lock /rustfmt.toml: -------------------------------------------------------------------------------- 1 | hard_tabs = true 2 | tab_spaces = 2 -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["cli", "wasm"] 3 | resolver = "2" 4 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "git.branchProtection": [ 3 | "main" 4 | ] 5 | } -------------------------------------------------------------------------------- /static/docfind.js: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9521cfa6406ac37f8b768220abedeb6e8cbf9f0867b56fbdfd7033170b3c778f 3 | size 3497 4 | -------------------------------------------------------------------------------- /static/docfind_bg.wasm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:216664e1c84152d632f03915008ac7017d3d5ee61275c0bacbaaed0e57af0af4 3 | size 11484322 4 | -------------------------------------------------------------------------------- 
/static/documents.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f5c1094a3b9865f49cb56262f91dd4ee2f57c5ac960fbefbeaaa97a1a53f5c9d 3 | size 17145937 4 | -------------------------------------------------------------------------------- /static/docfind_bg.wasm.br: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7794cad68dbf03a4acd903b52cd1e05d025b243e384ade21f5ac7db405417570 3 | size 5197055 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | pkg 3 | # Temporary dataset files in scripts directory 4 | scripts/train.csv 5 | scripts/test.csv 6 | scripts/documents.json 7 | # Python cache 8 | __pycache__/ 9 | *.pyc -------------------------------------------------------------------------------- /wasm/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "docfind-wasm" 3 | version = "0.5.1" 4 | edition = "2024" 5 | 6 | [lib] 7 | crate-type = ["cdylib", "rlib"] 8 | 9 | [dependencies] 10 | docfind_core = { path = "../core", features = ["wasm"] } 11 | wasm-bindgen = "0.2" 12 | serde-wasm-bindgen = "0.6" 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Large demo files tracked with Git LFS 2 | static/docfind.js filter=lfs diff=lfs merge=lfs -text 3 | static/docfind_bg.wasm filter=lfs diff=lfs merge=lfs -text 4 | static/docfind_bg.wasm.br filter=lfs diff=lfs merge=lfs -text 5 | static/documents.json filter=lfs diff=lfs merge=lfs -text 6 | -------------------------------------------------------------------------------- /cli/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "docfind" 3 | version = "0.5.1" 4 | edition = "2024" 5 | 6 | [dependencies] 7 | docfind_core = { path = "../core", features = ["cli"] } 8 | serde_json = "1.0.145" 9 | wasm-encoder = { version = "0.240.0", features = ["wasmparser"] } 10 | wasmparser = "0.240.0" 11 | -------------------------------------------------------------------------------- /scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Build wasm template 5 | wasm-pack build wasm --out-name docfind --release --target web 6 | 7 | # Minify JavaScript 8 | npx --yes esbuild --bundle wasm/index.js --format=esm --minify --outfile=wasm/pkg/docfind.js --allow-overwrite 9 | 10 | # Then build CLI 11 | cargo build --release -p docfind 12 | -------------------------------------------------------------------------------- /wasm/index.js: -------------------------------------------------------------------------------- 1 | import _init, { search as _search } from './pkg/docfind.js'; 2 | 3 | let didInit = false; 4 | 5 | export function init() { 6 | return _init(); 7 | } 8 | 9 | export default async function search(needle, maxResults) { 10 | if (!didInit) { 11 | await _init(); 12 | didInit = true; 13 | } 14 | return _search(needle, maxResults); 15 | } -------------------------------------------------------------------------------- /core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "docfind_core" 3 | version = "0.5.1" 4 | edition = "2024" 5 | 6 | [dependencies] 7 | fst = { version = "0.4", features = ["levenshtein"] } 8 | serde = { version = "1.0.228", features = ["derive"] } 9 | postcard = { version = "1.1.3", features = ["alloc", "use-std"] } 10 | fsst-rs = "0.5.4" 11 | rake = { version = "0.3", optional = true } 12 | 13 | [dev-dependencies] 14 | serde_json = 
"1.0.145" 15 | 16 | [features] 17 | cli = ["rake"] 18 | wasm = [] 19 | 20 | [dev-dependencies.rake] 21 | version = "0.3" 22 | -------------------------------------------------------------------------------- /.github/workflows/copilot-setup-steps.yml: -------------------------------------------------------------------------------- 1 | name: "Copilot Setup Steps" 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | paths: 7 | - .github/workflows/copilot-setup-steps.yml 8 | pull_request: 9 | paths: 10 | - .github/workflows/copilot-setup-steps.yml 11 | 12 | jobs: 13 | copilot-setup-steps: 14 | runs-on: ubuntu-latest 15 | 16 | permissions: 17 | contents: read 18 | 19 | steps: 20 | - uses: actions/checkout@v5 21 | 22 | - name: Install wasm-pack 23 | run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh 24 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which 6 | includes all source code repositories in our GitHub organizations. 7 | 8 | **Please do not report security vulnerabilities through public GitHub issues.** 9 | 10 | For security reporting information, locations, contact information, and policies, 11 | please review the latest guidance for Microsoft repositories at 12 | [https://aka.ms/SECURITY.md](https://aka.ms/SECURITY.md). 13 | 14 | -------------------------------------------------------------------------------- /scripts/build-demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo "Building demo WASM files from static/documents.json..." 5 | 6 | # Build the docfind CLI first if needed 7 | if [ ! -f "target/release/docfind" ]; then 8 | echo "Building docfind CLI..." 
9 | ./scripts/build.sh 10 | fi 11 | 12 | # Generate WASM files from documents.json 13 | echo "Generating WASM files..." 14 | ./target/release/docfind static/documents.json static/ 15 | 16 | # Compress WASM with Brotli 17 | echo "Compressing WASM with Brotli..." 18 | brotli -k -f static/docfind_bg.wasm 19 | 20 | echo "Demo build completed successfully!" 21 | echo "" 22 | echo "Generated files:" 23 | ls -lh static/docfind.js static/docfind_bg.wasm static/docfind_bg.wasm.br 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) Microsoft Corporation 3 | 4 | All rights reserved. 5 | 6 | MIT License 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 9 | files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 10 | modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 11 | is furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 16 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 17 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 18 | OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
19 | -------------------------------------------------------------------------------- /wasm/src/lib.rs: -------------------------------------------------------------------------------- 1 | use docfind_core::Index; 2 | use std::sync::OnceLock; 3 | use wasm_bindgen::prelude::*; 4 | 5 | #[wasm_bindgen] 6 | extern "C" { 7 | #[wasm_bindgen(js_namespace = console)] 8 | fn log(msg: &str); 9 | } 10 | 11 | #[unsafe(no_mangle)] 12 | pub static mut INDEX_BASE: u32 = 0xdead_beef; 13 | 14 | #[unsafe(no_mangle)] 15 | pub static mut INDEX_LEN: u32 = 0xdead_beef; 16 | 17 | static INDEX: OnceLock<Index> = OnceLock::new(); 18 | 19 | /// Search the index for a query string 20 | /// Returns a JavaScript array of matching documents 21 | #[wasm_bindgen] 22 | pub fn search(query: &str, max_results: Option<usize>) -> Result<JsValue, JsValue> { 23 | let index = INDEX.get_or_init(|| { 24 | let raw_index = 25 | unsafe { std::slice::from_raw_parts(INDEX_BASE as *const u8, INDEX_LEN as usize) }; 26 | Index::from_bytes(raw_index).expect("Failed to deserialize index") 27 | }); 28 | 29 | let result = docfind_core::search(index, query, max_results.unwrap_or(10)) 30 | .map_err(|e| JsValue::from_str(&format!("Search failed: {}", e)))?; 31 | 32 | serde_wasm_bindgen::to_value(&result) 33 | .map_err(|e| JsValue::from_str(&format!("Failed to convert results to JS: {}", e))) 34 | } 35 | -------------------------------------------------------------------------------- /.github/workflows/static.yml: -------------------------------------------------------------------------------- 1 | # Workflow for building and deploying the docfind example to GitHub Pages 2 | name: Build and deploy example to Pages 3 | 4 | on: 5 | # Runs on pushes targeting the default branch 6 | push: 7 | branches: ["main"] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 13 | permissions: 14 | contents: read 15 | pages: write 16 | id-token: 
write 17 | 18 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 19 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 20 | concurrency: 21 | group: "pages" 22 | cancel-in-progress: false 23 | 24 | jobs: 25 | build-and-deploy: 26 | environment: 27 | name: github-pages 28 | url: ${{ steps.deployment.outputs.page_url }} 29 | runs-on: ubuntu-latest 30 | steps: 31 | - name: Checkout 32 | uses: actions/checkout@v4 33 | with: 34 | lfs: true 35 | 36 | - name: Setup Pages 37 | uses: actions/configure-pages@v5 38 | 39 | - name: Upload artifact 40 | uses: actions/upload-pages-artifact@v3 41 | with: 42 | path: 'static' 43 | 44 | - name: Deploy to GitHub Pages 45 | id: deployment 46 | uses: actions/deploy-pages@v4 47 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | tags: 7 | - 'v*.*.*' 8 | pull_request: 9 | branches: [ main ] 10 | 11 | # Cancel running builds if new commits are pushed to a PR 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref }} 14 | cancel-in-progress: ${{ github.event_name == 'pull_request' }} 15 | 16 | env: 17 | CARGO_TERM_COLOR: always 18 | 19 | jobs: 20 | test: 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | 26 | - name: Test 27 | run: cargo test 28 | working-directory: core 29 | 30 | build: 31 | needs: test 32 | strategy: 33 | matrix: 34 | include: 35 | - os: windows-latest 36 | target: x86_64-pc-windows-msvc 37 | - os: macos-latest 38 | target: x86_64-apple-darwin 39 | - os: macos-latest 40 | target: aarch64-apple-darwin 41 | - os: ubuntu-latest 42 | target: x86_64-unknown-linux-gnu 43 | - os: ubuntu-latest 44 | target: aarch64-unknown-linux-gnu 45 | - os: ubuntu-latest 46 | target: 
x86_64-unknown-linux-musl 47 | - os: ubuntu-latest 48 | target: aarch64-unknown-linux-musl 49 | 50 | runs-on: ${{ matrix.os }} 51 | 52 | steps: 53 | - uses: actions/checkout@v4 54 | 55 | - name: Install Rust target 56 | run: rustup target add ${{ matrix.target }} 57 | 58 | - name: Install cross-compilation tools (Linux ARM64) 59 | if: matrix.target == 'aarch64-unknown-linux-gnu' || matrix.target == 'aarch64-unknown-linux-musl' 60 | run: | 61 | sudo apt-get update 62 | sudo apt-get install -y gcc-aarch64-linux-gnu 63 | 64 | - name: Install musl tools 65 | if: contains(matrix.target, 'musl') 66 | run: | 67 | sudo apt-get update 68 | sudo apt-get install -y musl-tools 69 | 70 | - name: Install wasm-pack 71 | if: runner.os != 'Windows' 72 | run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh 73 | 74 | - name: Install wasm-pack (Windows) 75 | if: runner.os == 'Windows' 76 | run: npm install -g wasm-pack 77 | shell: pwsh 78 | 79 | - name: Build WASM template 80 | run: wasm-pack build wasm --out-name docfind --release --target web 81 | 82 | - name: Minify JavaScript 83 | run: npx --yes esbuild --bundle wasm/index.js --format=esm --minify --outfile=wasm/pkg/docfind.js --allow-overwrite 84 | 85 | - name: Build CLI 86 | run: cargo build --release -p docfind --target ${{ matrix.target }} 87 | env: 88 | CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER: aarch64-linux-gnu-gcc 89 | CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_LINKER: aarch64-linux-gnu-gcc 90 | 91 | - name: Package binaries (Unix) 92 | if: runner.os != 'Windows' 93 | run: | 94 | cd target/${{ matrix.target }}/release 95 | tar czf docfind-${{ matrix.target }}.tar.gz docfind 96 | mv docfind-${{ matrix.target }}.tar.gz ${{ github.workspace }} 97 | 98 | - name: Package binaries (Windows) 99 | if: runner.os == 'Windows' 100 | run: | 101 | cd target/${{ matrix.target }}/release 102 | 7z a docfind-${{ matrix.target }}.zip docfind.exe 103 | mv docfind-${{ matrix.target }}.zip ${{ github.workspace }} 104 | 
shell: pwsh 105 | 106 | - name: Upload artifacts 107 | uses: actions/upload-artifact@v4 108 | with: 109 | name: docfind-${{ matrix.target }} 110 | path: | 111 | docfind-${{ matrix.target }}.tar.gz 112 | docfind-${{ matrix.target }}.zip 113 | if-no-files-found: ignore 114 | 115 | release: 116 | if: startsWith(github.ref, 'refs/tags/') 117 | needs: build 118 | runs-on: ubuntu-latest 119 | permissions: 120 | contents: write 121 | 122 | steps: 123 | - name: Download all artifacts 124 | uses: actions/download-artifact@v4 125 | with: 126 | path: artifacts 127 | 128 | - name: Create Release 129 | uses: softprops/action-gh-release@v2 130 | with: 131 | files: artifacts/**/* 132 | draft: false 133 | prerelease: false 134 | generate_release_notes: true 135 | -------------------------------------------------------------------------------- /scripts/version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # Version bumping script for Cargo workspace 5 | # Usage: ./scripts/version.sh [major|minor|patch|<version>] 6 | 7 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 8 | PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" 9 | 10 | # Color codes for output 11 | RED='\033[0;31m' 12 | GREEN='\033[0;32m' 13 | YELLOW='\033[1;33m' 14 | NC='\033[0m' # No Color 15 | 16 | # Function to display usage 17 | usage() { 18 | echo "Usage: $0 [major|minor|patch|<version>]" 19 | echo "" 20 | echo "Examples:" 21 | echo " $0 patch # 0.2.0 -> 0.2.1" 22 | echo " $0 minor # 0.2.0 -> 0.3.0" 23 | echo " $0 major # 0.2.0 -> 1.0.0" 24 | echo " $0 1.5.2 # Set specific version" 25 | exit 1 26 | } 27 | 28 | # Function to extract current version from a Cargo.toml file 29 | get_version() { 30 | local file="$1" 31 | grep '^version = ' "$file" | head -1 | sed 's/version = "\(.*\)"/\1/' 32 | } 33 | 34 | # Function to validate semantic version format 35 | is_valid_version() { 36 | local version="$1" 37 | if [[ $version =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then 38 | return 0 39 | else 40 | return 1 41 | fi 42 | } 43 | 44 | # Function to bump version 45 | bump_version() { 46 | local current="$1" 47 | local bump_type="$2" 48 | 49 | IFS='.' read -r major minor patch <<< "$current" 50 | 51 | case "$bump_type" in 52 | major) 53 | echo "$((major + 1)).0.0" 54 | ;; 55 | minor) 56 | echo "${major}.$((minor + 1)).0" 57 | ;; 58 | patch) 59 | echo "${major}.${minor}.$((patch + 1))" 60 | ;; 61 | *) 62 | if is_valid_version "$bump_type"; then 63 | echo "$bump_type" 64 | else 65 | echo -e "${RED}Error: Invalid version format: $bump_type${NC}" >&2 66 | echo -e "${YELLOW}Version must be in format: X.Y.Z${NC}" >&2 67 | exit 1 68 | fi 69 | ;; 70 | esac 71 | } 72 | 73 | # Function to update version in a Cargo.toml file 74 | update_cargo_toml() { 75 | local file="$1" 76 | local new_version="$2" 77 | 78 | # Use awk to replace only the first occurrence of version line (fully compatible) 79 | if [[ "$OSTYPE" == "darwin"* ]]; then 80 | awk -v new_ver="$new_version" '/^version = / && !done { sub(/^version = ".*"/, "version = \"" new_ver "\""); done=1 } 1' "$file" > "$file.tmp" && mv "$file.tmp" "$file" 81 | else 82 | # GNU sed supports the 
0,/pattern/ syntax 83 | sed -i "0,/^version = \".*\"/s//version = \"$new_version\"/" "$file" 84 | fi 85 | 86 | echo -e "${GREEN}✓${NC} Updated $(basename $(dirname "$file"))/$(basename "$file")" 87 | } 88 | 89 | # Main script 90 | main() { 91 | if [ $# -eq 0 ]; then 92 | usage 93 | fi 94 | 95 | local bump_type="$1" 96 | 97 | # Find all Cargo.toml files with version fields 98 | CARGO_FILES=( 99 | "$PROJECT_ROOT/cli/Cargo.toml" 100 | "$PROJECT_ROOT/core/Cargo.toml" 101 | "$PROJECT_ROOT/wasm/Cargo.toml" 102 | ) 103 | 104 | # Get current version from the first file (cli) 105 | CURRENT_VERSION=$(get_version "${CARGO_FILES[0]}") 106 | 107 | if [ -z "$CURRENT_VERSION" ]; then 108 | echo -e "${RED}Error: Could not determine current version${NC}" 109 | exit 1 110 | fi 111 | 112 | echo -e "Current version: ${YELLOW}$CURRENT_VERSION${NC}" 113 | 114 | # Calculate new version 115 | NEW_VERSION=$(bump_version "$CURRENT_VERSION" "$bump_type") 116 | 117 | echo -e "New version: ${GREEN}$NEW_VERSION${NC}" 118 | echo "" 119 | 120 | # Confirm with user 121 | read -p "Update version to $NEW_VERSION? (y/N): " -n 1 -r 122 | echo 123 | if [[ ! $REPLY =~ ^[Yy]$ ]]; then 124 | echo "Aborted" 125 | exit 0 126 | fi 127 | 128 | # Update all Cargo.toml files 129 | echo "" 130 | echo "Updating Cargo.toml files..." 131 | for file in "${CARGO_FILES[@]}"; do 132 | if [ -f "$file" ]; then 133 | update_cargo_toml "$file" "$NEW_VERSION" 134 | else 135 | echo -e "${YELLOW}Warning: File not found: $file${NC}" 136 | fi 137 | done 138 | 139 | echo "" 140 | echo -e "${GREEN}Version updated successfully!${NC}" 141 | echo "" 142 | 143 | echo "" 144 | echo -e "${YELLOW}Changes not committed. 
You can review and commit manually:${NC}" 145 | echo " git add cli/Cargo.toml core/Cargo.toml wasm/Cargo.toml" 146 | echo " git commit -m 'Bump version to $NEW_VERSION'" 147 | echo " git tag -a v$NEW_VERSION -m 'Release version $NEW_VERSION'" 148 | echo " git push && git push --tags" 149 | } 150 | 151 | main "$@" 152 | -------------------------------------------------------------------------------- /core/english.stop: -------------------------------------------------------------------------------- 1 | #stop word list from SMART (Salton,1971). Available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop 2 | a 3 | a's 4 | able 5 | about 6 | above 7 | according 8 | accordingly 9 | across 10 | actually 11 | after 12 | afterwards 13 | again 14 | against 15 | ain't 16 | all 17 | allow 18 | allows 19 | almost 20 | alone 21 | along 22 | already 23 | also 24 | although 25 | always 26 | am 27 | among 28 | amongst 29 | an 30 | and 31 | another 32 | any 33 | anybody 34 | anyhow 35 | anyone 36 | anything 37 | anyway 38 | anyways 39 | anywhere 40 | apart 41 | appear 42 | appreciate 43 | appropriate 44 | are 45 | aren't 46 | around 47 | as 48 | aside 49 | ask 50 | asking 51 | associated 52 | at 53 | available 54 | away 55 | awfully 56 | b 57 | be 58 | became 59 | because 60 | become 61 | becomes 62 | becoming 63 | been 64 | before 65 | beforehand 66 | behind 67 | being 68 | believe 69 | below 70 | beside 71 | besides 72 | best 73 | better 74 | between 75 | beyond 76 | both 77 | brief 78 | but 79 | by 80 | c 81 | c'mon 82 | c's 83 | came 84 | can 85 | can't 86 | cannot 87 | cant 88 | cause 89 | causes 90 | certain 91 | certainly 92 | changes 93 | clearly 94 | co 95 | com 96 | come 97 | comes 98 | concerning 99 | consequently 100 | consider 101 | considering 102 | contain 103 | containing 104 | contains 105 | corresponding 106 | could 107 | couldn't 108 | course 109 | currently 110 | d 111 | definitely 112 | described 113 | despite 114 | did 115 | didn't 116 | different 117 | do 
118 | does 119 | doesn't 120 | doing 121 | don't 122 | done 123 | down 124 | downwards 125 | during 126 | e 127 | each 128 | edu 129 | eg 130 | eight 131 | either 132 | else 133 | elsewhere 134 | enough 135 | entirely 136 | especially 137 | et 138 | etc 139 | even 140 | ever 141 | every 142 | everybody 143 | everyone 144 | everything 145 | everywhere 146 | ex 147 | exactly 148 | example 149 | except 150 | f 151 | far 152 | few 153 | fifth 154 | first 155 | five 156 | followed 157 | following 158 | follows 159 | for 160 | former 161 | formerly 162 | forth 163 | four 164 | from 165 | further 166 | furthermore 167 | g 168 | get 169 | gets 170 | getting 171 | given 172 | gives 173 | go 174 | goes 175 | going 176 | gone 177 | got 178 | gotten 179 | greetings 180 | h 181 | had 182 | hadn't 183 | happens 184 | hardly 185 | has 186 | hasn't 187 | have 188 | haven't 189 | having 190 | he 191 | he's 192 | hello 193 | help 194 | hence 195 | her 196 | here 197 | here's 198 | hereafter 199 | hereby 200 | herein 201 | hereupon 202 | hers 203 | herself 204 | hi 205 | him 206 | himself 207 | his 208 | hither 209 | hopefully 210 | how 211 | howbeit 212 | however 213 | i 214 | i'd 215 | i'll 216 | i'm 217 | i've 218 | ie 219 | if 220 | ignored 221 | immediate 222 | in 223 | inasmuch 224 | inc 225 | indeed 226 | indicate 227 | indicated 228 | indicates 229 | inner 230 | insofar 231 | instead 232 | into 233 | inward 234 | is 235 | isn't 236 | it 237 | it'd 238 | it'll 239 | it's 240 | its 241 | itself 242 | j 243 | just 244 | k 245 | keep 246 | keeps 247 | kept 248 | know 249 | knows 250 | known 251 | l 252 | last 253 | lately 254 | later 255 | latter 256 | latterly 257 | least 258 | less 259 | lest 260 | let 261 | let's 262 | like 263 | liked 264 | likely 265 | little 266 | look 267 | looking 268 | looks 269 | ltd 270 | m 271 | mainly 272 | many 273 | may 274 | maybe 275 | me 276 | mean 277 | meanwhile 278 | merely 279 | might 280 | more 281 | moreover 282 | most 283 | mostly 284 | 
much 285 | must 286 | my 287 | myself 288 | n 289 | name 290 | namely 291 | nd 292 | near 293 | nearly 294 | necessary 295 | need 296 | needs 297 | neither 298 | never 299 | nevertheless 300 | new 301 | next 302 | nine 303 | no 304 | nobody 305 | non 306 | none 307 | noone 308 | nor 309 | normally 310 | not 311 | nothing 312 | novel 313 | now 314 | nowhere 315 | o 316 | obviously 317 | of 318 | off 319 | often 320 | oh 321 | ok 322 | okay 323 | old 324 | on 325 | once 326 | one 327 | ones 328 | only 329 | onto 330 | or 331 | other 332 | others 333 | otherwise 334 | ought 335 | our 336 | ours 337 | ourselves 338 | out 339 | outside 340 | over 341 | overall 342 | own 343 | p 344 | particular 345 | particularly 346 | per 347 | perhaps 348 | placed 349 | please 350 | plus 351 | possible 352 | presumably 353 | probably 354 | provides 355 | q 356 | que 357 | quite 358 | qv 359 | r 360 | rather 361 | rd 362 | re 363 | really 364 | reasonably 365 | regarding 366 | regardless 367 | regards 368 | relatively 369 | respectively 370 | right 371 | s 372 | said 373 | same 374 | saw 375 | say 376 | saying 377 | says 378 | second 379 | secondly 380 | see 381 | seeing 382 | seem 383 | seemed 384 | seeming 385 | seems 386 | seen 387 | self 388 | selves 389 | sensible 390 | sent 391 | serious 392 | seriously 393 | seven 394 | several 395 | shall 396 | she 397 | should 398 | shouldn't 399 | since 400 | six 401 | so 402 | some 403 | somebody 404 | somehow 405 | someone 406 | something 407 | sometime 408 | sometimes 409 | somewhat 410 | somewhere 411 | soon 412 | sorry 413 | specified 414 | specify 415 | specifying 416 | still 417 | sub 418 | such 419 | sup 420 | sure 421 | t 422 | t's 423 | take 424 | taken 425 | tell 426 | tends 427 | th 428 | than 429 | thank 430 | thanks 431 | thanx 432 | that 433 | that's 434 | thats 435 | the 436 | their 437 | theirs 438 | them 439 | themselves 440 | then 441 | thence 442 | there 443 | there's 444 | thereafter 445 | thereby 446 | therefore 447 | 
therein 448 | theres 449 | thereupon 450 | these 451 | they 452 | they'd 453 | they'll 454 | they're 455 | they've 456 | think 457 | third 458 | this 459 | thorough 460 | thoroughly 461 | those 462 | though 463 | three 464 | through 465 | throughout 466 | thru 467 | thus 468 | to 469 | together 470 | too 471 | took 472 | toward 473 | towards 474 | tried 475 | tries 476 | truly 477 | try 478 | trying 479 | twice 480 | two 481 | u 482 | un 483 | under 484 | unfortunately 485 | unless 486 | unlikely 487 | until 488 | unto 489 | up 490 | upon 491 | us 492 | use 493 | used 494 | useful 495 | uses 496 | using 497 | usually 498 | uucp 499 | v 500 | value 501 | various 502 | very 503 | via 504 | viz 505 | vs 506 | w 507 | want 508 | wants 509 | was 510 | wasn't 511 | way 512 | we 513 | we'd 514 | we'll 515 | we're 516 | we've 517 | welcome 518 | well 519 | went 520 | were 521 | weren't 522 | what 523 | what's 524 | whatever 525 | when 526 | whence 527 | whenever 528 | where 529 | where's 530 | whereafter 531 | whereas 532 | whereby 533 | wherein 534 | whereupon 535 | wherever 536 | whether 537 | which 538 | while 539 | whither 540 | who 541 | who's 542 | whoever 543 | whole 544 | whom 545 | whose 546 | why 547 | will 548 | willing 549 | wish 550 | with 551 | within 552 | without 553 | won't 554 | wonder 555 | would 556 | would 557 | wouldn't 558 | x 559 | y 560 | yes 561 | yet 562 | you 563 | you'd 564 | you'll 565 | you're 566 | you've 567 | your 568 | yours 569 | yourself 570 | yourselves 571 | z 572 | zero 573 | -------------------------------------------------------------------------------- /static/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # docfind installer script for Unix-like systems 3 | # Usage: curl -fsSL https://microsoft.github.io/docfind/install.sh | sh 4 | 5 | set -e 6 | 7 | # Configuration 8 | REPO="microsoft/docfind" 9 | BINARY_NAME="docfind" 10 | 
INSTALL_DIR="${DOCFIND_INSTALL_DIR:-$HOME/.local/bin}" 11 | 12 | # Colors for output 13 | RED='\033[0;31m' 14 | GREEN='\033[0;32m' 15 | YELLOW='\033[1;33m' 16 | NC='\033[0m' # No Color 17 | 18 | # Helper functions 19 | info() { 20 | printf "${GREEN}==>${NC} %s\n" "$1" 21 | } 22 | 23 | warn() { 24 | printf "${YELLOW}Warning:${NC} %s\n" "$1" 25 | } 26 | 27 | error() { 28 | printf "${RED}Error:${NC} %s\n" "$1" >&2 29 | exit 1 30 | } 31 | 32 | # Detect OS and architecture 33 | detect_platform() { 34 | OS="$(uname -s)" 35 | ARCH="$(uname -m)" 36 | 37 | case "$OS" in 38 | Linux*) 39 | PLATFORM="unknown-linux-musl" 40 | ;; 41 | Darwin*) 42 | PLATFORM="apple-darwin" 43 | ;; 44 | *) 45 | error "Unsupported operating system: $OS" 46 | ;; 47 | esac 48 | 49 | case "$ARCH" in 50 | x86_64|amd64) 51 | ARCH="x86_64" 52 | ;; 53 | aarch64|arm64) 54 | ARCH="aarch64" 55 | ;; 56 | *) 57 | error "Unsupported architecture: $ARCH" 58 | ;; 59 | esac 60 | 61 | TARGET="${ARCH}-${PLATFORM}" 62 | info "Detected platform: $TARGET" 63 | } 64 | 65 | # Get the current installed version 66 | get_current_version() { 67 | if command -v "$BINARY_NAME" >/dev/null 2>&1; then 68 | # Extract version from "docfind X.Y.Z" output 69 | CURRENT_VERSION=$("$BINARY_NAME" --version 2>/dev/null | sed -E 's/^[^ ]+ //') 70 | if [ -n "$CURRENT_VERSION" ]; then 71 | echo "$CURRENT_VERSION" 72 | fi 73 | fi 74 | } 75 | 76 | # Get the latest release version 77 | get_latest_version() { 78 | info "Fetching latest release..." 
79 | 80 | # Prepare auth header if GITHUB_TOKEN is set 81 | AUTH_HEADER="" 82 | if [ -n "$GITHUB_TOKEN" ]; then 83 | AUTH_HEADER="Authorization: Bearer $GITHUB_TOKEN" 84 | fi 85 | 86 | if command -v curl >/dev/null 2>&1; then 87 | if [ -n "$AUTH_HEADER" ]; then 88 | VERSION=$(curl -fsSL -H "$AUTH_HEADER" "https://api.github.com/repos/$REPO/releases/latest" | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/') 89 | else 90 | VERSION=$(curl -fsSL "https://api.github.com/repos/$REPO/releases/latest" | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/') 91 | fi 92 | elif command -v wget >/dev/null 2>&1; then 93 | if [ -n "$AUTH_HEADER" ]; then 94 | VERSION=$(wget -qO- --header="$AUTH_HEADER" "https://api.github.com/repos/$REPO/releases/latest" | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/') 95 | else 96 | VERSION=$(wget -qO- "https://api.github.com/repos/$REPO/releases/latest" | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/') 97 | fi 98 | else 99 | error "Neither curl nor wget found. Please install one of them." 100 | fi 101 | 102 | if [ -z "$VERSION" ]; then 103 | error "Failed to fetch latest version" 104 | fi 105 | 106 | info "Latest version: $VERSION" 107 | } 108 | 109 | # Download and install binary 110 | install_binary() { 111 | DOWNLOAD_URL="https://github.com/$REPO/releases/download/$VERSION/${BINARY_NAME}-${TARGET}.tar.gz" 112 | TEMP_FILE="/tmp/${BINARY_NAME}-${TARGET}.tar.gz" 113 | 114 | info "Downloading from $DOWNLOAD_URL..." 115 | 116 | if command -v curl >/dev/null 2>&1; then 117 | curl -fsSL "$DOWNLOAD_URL" -o "$TEMP_FILE" || error "Download failed" 118 | elif command -v wget >/dev/null 2>&1; then 119 | wget -q "$DOWNLOAD_URL" -O "$TEMP_FILE" || error "Download failed" 120 | fi 121 | 122 | # Create install directory if it doesn't exist 123 | if [ ! -d "$INSTALL_DIR" ]; then 124 | info "Creating directory $INSTALL_DIR..." 
125 | mkdir -p "$INSTALL_DIR" || error "Failed to create install directory" 126 | fi 127 | 128 | # Extract and install binary 129 | info "Extracting archive..." 130 | tar -xzf "$TEMP_FILE" -C "$INSTALL_DIR" || error "Failed to extract archive" 131 | 132 | info "Installing to $INSTALL_DIR/$BINARY_NAME..." 133 | chmod +x "$INSTALL_DIR/$BINARY_NAME" || error "Failed to make binary executable" 134 | 135 | # Clean up 136 | rm "$TEMP_FILE" 2>/dev/null || true 137 | 138 | info "Successfully installed $BINARY_NAME to $INSTALL_DIR" 139 | } 140 | 141 | # Check if install directory is in PATH 142 | check_path() { 143 | case ":$PATH:" in 144 | *":$INSTALL_DIR:"*) 145 | return 0 146 | ;; 147 | *) 148 | return 1 149 | ;; 150 | esac 151 | } 152 | 153 | # Print post-install instructions 154 | post_install() { 155 | echo "" 156 | info "Installation complete!" 157 | 158 | if ! check_path; then 159 | warn "$INSTALL_DIR is not in your PATH" 160 | echo "" 161 | echo "Add it to your PATH by adding this line to your shell profile:" 162 | echo " ${GREEN}export PATH=\"\$PATH:$INSTALL_DIR\"${NC}" 163 | echo "" 164 | 165 | # Detect shell and provide specific instructions 166 | SHELL_NAME="$(basename "$SHELL")" 167 | case "$SHELL_NAME" in 168 | bash) 169 | echo "For bash, add it to ~/.bashrc or ~/.bash_profile" 170 | ;; 171 | zsh) 172 | echo "For zsh, add it to ~/.zshrc" 173 | ;; 174 | fish) 175 | echo "For fish, run: ${GREEN}fish_add_path $INSTALL_DIR${NC}" 176 | ;; 177 | *) 178 | echo "Add it to your shell's configuration file" 179 | ;; 180 | esac 181 | echo "" 182 | echo "Then reload your shell or run: ${GREEN}source ~/.${SHELL_NAME}rc${NC}" 183 | else 184 | echo "You can now use '${GREEN}$BINARY_NAME${NC}' from anywhere!" 185 | fi 186 | 187 | echo "" 188 | echo "Try it out:" 189 | echo " ${GREEN}$BINARY_NAME --help${NC}" 190 | } 191 | 192 | # Main installation flow 193 | main() { 194 | info "Installing $BINARY_NAME..." 
195 | 196 | detect_platform 197 | get_latest_version 198 | 199 | # Check if already installed with the same version 200 | CURRENT_VERSION=$(get_current_version) 201 | if [ -n "$CURRENT_VERSION" ]; then 202 | info "Current version: $CURRENT_VERSION" 203 | # Strip 'v' prefix from VERSION if present for comparison 204 | LATEST_VERSION_NUM=$(echo "$VERSION" | sed 's/^v//') 205 | if [ "$CURRENT_VERSION" = "$LATEST_VERSION_NUM" ] || [ "$CURRENT_VERSION" = "$VERSION" ]; then 206 | info "$BINARY_NAME $CURRENT_VERSION is already installed (latest version)" 207 | echo "" 208 | echo "If you want to reinstall, please uninstall first:" 209 | echo " ${GREEN}rm \$(which $BINARY_NAME)${NC}" 210 | exit 0 211 | fi 212 | fi 213 | 214 | install_binary 215 | post_install 216 | } 217 | 218 | main 219 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # docfind 2 | 3 | A high-performance document search engine built in Rust with WebAssembly support. Combines full-text search using FST (Finite State Transducers) with FSST compression for efficient storage and fast fuzzy matching capabilities. 4 | 5 | ## Live Demo 6 | 7 | Check out the [interactive demo](https://microsoft.github.io/docfind/). The demo showcases docfind searching through 50,000 news articles from the AG News dataset, running entirely in your browser with WebAssembly. 
8 | 9 | **Demo Performance Metrics:** 10 | - **Dataset**: 50,000 news articles (AG News Classification Dataset) 11 | - **Dataset Size**: 17.14 MB ([uncompressed JSON](https://github.com/microsoft/docfind/raw/refs/heads/main/static/documents.json)) 12 | - **Index Size**: 11.48 MB ([WASM file](https://github.com/microsoft/docfind/raw/refs/heads/main/static/docfind_bg.wasm)) 13 | - **Compressed Size**: 5.20 MB ([compressed with Brotli](https://github.com/microsoft/docfind/raw/refs/heads/main/static/docfind_bg.wasm.br)) 14 | - **Index Build Time**: ~1.1 seconds 15 | - **Load Time**: ~100ms (depending on network and browser) 16 | - **Search Speed**: ~1-3ms per query 17 | 18 | ## Features 19 | 20 | - **Fast Fuzzy Search**: Uses FST for efficient keyword matching with Levenshtein distance support 21 | - **Compact Storage**: FSST compression reduces index size while maintaining fast decompression 22 | - **RAKE Keyword Extraction**: Automatic keyword extraction from document content using the RAKE algorithm 23 | - **WebAssembly Ready**: Compile to WASM for browser-based search with no server required 24 | - **Standalone CLI Tool**: Self-contained CLI tool to build a .wasm file out of a collection of documents, no Rust tooling required 25 | 26 | ## Installation 27 | 28 | ### Quick Install 29 | 30 | **macOS/Linux:** 31 | ```bash 32 | curl -fsSL https://microsoft.github.io/docfind/install.sh | sh 33 | ``` 34 | 35 | **Windows (PowerShell):** 36 | ```powershell 37 | irm https://microsoft.github.io/docfind/install.ps1 | iex 38 | ``` 39 | 40 | The installer will: 41 | - Download the latest release binary for your platform 42 | - Install it to `~/.local/bin` (Unix) or `~\.docfind\bin` (Windows) 43 | - Provide instructions for adding it to your PATH if needed 44 | 45 | ### Manual Installation 46 | 47 | Download the binary for your platform from the [latest release](https://github.com/microsoft/docfind/releases/latest): 48 | 49 | - **macOS (Intel)**: `docfind-x86_64-apple-darwin` 50 
| - **macOS (Apple Silicon)**: `docfind-aarch64-apple-darwin` 51 | - **Linux (x64)**: `docfind-x86_64-unknown-linux-musl` 52 | - **Linux (ARM64)**: `docfind-aarch64-unknown-linux-musl` 53 | - **Windows (x64)**: `docfind-x86_64-pc-windows-msvc.exe` 54 | - **Windows (ARM64)**: `docfind-aarch64-pc-windows-msvc.exe` 55 | 56 | Rename it to `docfind` (or `docfind.exe` on Windows), make it executable, and place it in your PATH. 57 | 58 | ### Building from Source 59 | 60 | #### Prerequisites 61 | 62 | Before building from source, ensure you have the following installed: 63 | 64 | 1. **Rust** - [rustup.rs](https://rustup.rs/) 65 | 2. **wasm-pack** - [drager.github.io/wasm-pack](https://drager.github.io/wasm-pack/) 66 | 3. **Node.js** - [nodejs.org](https://nodejs.org/) (required for esbuild) 67 | 68 | #### Build 69 | 70 | ```bash 71 | ./scripts/build.sh 72 | ``` 73 | 74 | The compiled binary will be available at `./target/release/docfind`. 75 | 76 | ## Usage 77 | 78 | ### Creating a Search Index 79 | 80 | Prepare a JSON file with your documents: 81 | 82 | ```json 83 | [ 84 | { 85 | "title": "Getting Started", 86 | "category": "docs", 87 | "href": "/docs/getting-started", 88 | "body": "This guide will help you get started." 89 | }, 90 | { 91 | "title": "API Reference", 92 | "category": "reference", 93 | "href": "/docs/api", 94 | "body": "Complete API documentation for all search functions and configuration options." 95 | } 96 | ] 97 | ``` 98 | 99 | Build the index and generate a WASM module: 100 | 101 | ```bash 102 | docfind documents.json output 103 | ``` 104 | 105 | This creates: 106 | - `output/docfind.js` - JavaScript bindings 107 | - `output/docfind_bg.wasm` - WebAssembly module with embedded index 108 | 109 | ### Using in the Browser 110 | 111 | ```html 112 | 118 | ``` 119 | 120 | ## How It Works 121 | 122 | ```mermaid 123 | flowchart LR 124 | A([documents.json]) --> B[docfind] 125 | B --> C[Keyword Extraction
RAKE] 126 | B --> E[FSST Compression<br/>document strings] 127 | C --> D[FST Map<br/>keywords → docs] 128 | D --> F[[Index]] 129 | E --> F 130 | F --> G([docfind_bg.wasm
+ docfind.js]) 131 | 132 | style A fill:#e1f5ff 133 | style G fill:#e1f5ff 134 | style F fill:#ffffcc 135 | ``` 136 | 137 | 1. **Indexing Phase** (CLI): 138 | - Extracts keywords from document titles, categories, and bodies 139 | - Uses RAKE algorithm to identify important multi-word phrases 140 | - Assigns relevance scores based on keyword source (metadata > title > body) 141 | - Builds an FST mapping keywords to document indices 142 | - Compresses all document strings using FSST 143 | - Serializes the index using Postcard (binary format) 144 | 145 | 2. **Embedding Phase** (CLI): 146 | - Parses the pre-compiled WASM module 147 | - Expands WASM memory to accommodate the index 148 | - Patches global variables (`INDEX_BASE`, `INDEX_LEN`) with actual values 149 | - Adds the index as a new data segment in the WASM binary 150 | 151 | 3. **Search Phase** (WASM): 152 | - Deserializes the embedded index on first use 153 | - Performs fuzzy matching using Levenshtein automaton 154 | - Combines results from multiple keywords with score accumulation 155 | - Decompresses matching document strings on demand 156 | - Returns ranked results as JavaScript objects 157 | 158 | ## Dependencies 159 | 160 | - **fst**: Fast finite state transducer library with Levenshtein support 161 | - **fsst-rs**: Fast string compression for text data 162 | - **rake**: Rapid Automatic Keyword Extraction algorithm 163 | - **serde/postcard**: Efficient serialization 164 | - **wasm-bindgen**: WebAssembly bindings for Rust 165 | - **wasm-encoder/wasmparser**: WASM manipulation tools 166 | 167 | ## Performance 168 | 169 | The combination of FST and FSST provides: 170 | - Sub-millisecond search times for typical queries 171 | - 60-80% compression ratio for document storage 172 | - Instant startup with lazy index loading 173 | - Zero network requests after initial load 174 | 175 | ## References 176 | 177 | ### Prior Art 178 | 179 | This project builds on the rich ecosystem of search technologies: 180 | 181 | 
- **[Algolia](https://www.algolia.com/)** - Server-side search-as-a-service platform 182 | - **[TypeSense](https://typesense.org/)** - Open-source server-side search engine 183 | - **[Lunr.js](https://lunrjs.com/)** - Client-side full-text search library for JavaScript 184 | - **[Stork Search](https://stork-search.net/)** - WebAssembly-powered search for static sites 185 | - **[Tinysearch](https://endler.dev/2019/tinysearch/)** - Minimalist WASM-based search engine 186 | 187 | ### Technical Foundations 188 | 189 | Key technologies and concepts that inspired and power docfind: 190 | 191 | - **[Finite State Transducers](https://burntsushi.net/transducers/)** - Andrew Gallant's comprehensive article on FSTs, the core data structure for efficient search 192 | - **[RAKE Algorithm](https://docs.rs/rake/latest/rake/)** - Rapid Automatic Keyword Extraction for identifying important phrases 193 | - **[FSST Compression](https://docs.rs/fsst-rs/latest/fsst/index.html)** - Fast Static Symbol Table compression for efficient text storage 194 | -------------------------------------------------------------------------------- /static/install.ps1: -------------------------------------------------------------------------------- 1 | # docfind installer script for Windows 2 | # Usage: irm https://microsoft.github.io/docfind/install.ps1 | iex 3 | 4 | $ErrorActionPreference = 'Stop' 5 | 6 | # Configuration 7 | $Repo = "microsoft/docfind" 8 | $BinaryName = "docfind" 9 | $InstallDir = if ($env:DOCFIND_INSTALL_DIR) { $env:DOCFIND_INSTALL_DIR } else { "$env:USERPROFILE\.docfind\bin" } 10 | 11 | # Helper functions 12 | function Write-Info { 13 | param([string]$Message) 14 | Write-Host "==> " -ForegroundColor Green -NoNewline 15 | Write-Host $Message 16 | } 17 | 18 | function Write-Warn { 19 | param([string]$Message) 20 | Write-Host "Warning: " -ForegroundColor Yellow -NoNewline 21 | Write-Host $Message 22 | } 23 | 24 | function Write-Error-Custom { 25 | param([string]$Message) 26 | Write-Host 
"Error: " -ForegroundColor Red -NoNewline 27 | Write-Host $Message 28 | exit 1 29 | } 30 | 31 | # Detect architecture 32 | function Get-Architecture { 33 | $arch = $env:PROCESSOR_ARCHITECTURE 34 | switch ($arch) { 35 | "AMD64" { return "x86_64" } 36 | "ARM64" { return "aarch64" } 37 | default { Write-Error-Custom "Unsupported architecture: $arch" } 38 | } 39 | } 40 | 41 | # Get the current installed version 42 | function Get-CurrentVersion { 43 | try { 44 | # Check if docfind is in PATH and can be executed 45 | $currentVersionOutput = & $BinaryName --version 2>&1 46 | if ($LASTEXITCODE -eq 0 -and $currentVersionOutput) { 47 | # Extract version from "docfind X.Y.Z" output 48 | $versionMatch = $currentVersionOutput -match "^$BinaryName\s+(.+)$" 49 | if ($versionMatch -and $Matches[1]) { 50 | return $Matches[1].Trim() 51 | } 52 | } 53 | } 54 | catch { 55 | # Binary not found or not executable 56 | } 57 | return $null 58 | } 59 | 60 | # Get the latest release version 61 | function Get-LatestVersion { 62 | Write-Info "Fetching latest release..." 
63 | 64 | try { 65 | # Prepare headers for authentication if GITHUB_TOKEN is set 66 | $headers = @{} 67 | if ($env:GITHUB_TOKEN) { 68 | $headers["Authorization"] = "Bearer $env:GITHUB_TOKEN" 69 | } 70 | 71 | $response = if ($headers.Count -gt 0) { 72 | Invoke-RestMethod -Uri "https://api.github.com/repos/$Repo/releases/latest" -Headers $headers 73 | } else { 74 | Invoke-RestMethod -Uri "https://api.github.com/repos/$Repo/releases/latest" 75 | } 76 | 77 | $version = $response.tag_name 78 | 79 | if (-not $version) { 80 | Write-Error-Custom "Failed to fetch latest version" 81 | } 82 | 83 | Write-Info "Latest version: $version" 84 | return $version 85 | } 86 | catch { 87 | Write-Error-Custom "Failed to fetch release information: $_" 88 | } 89 | } 90 | 91 | # Download and install binary 92 | function Install-Binary { 93 | param( 94 | [string]$Version, 95 | [string]$Target 96 | ) 97 | 98 | $fileName = "${BinaryName}-${Target}.zip" 99 | $downloadUrl = "https://github.com/$Repo/releases/download/$Version/$fileName" 100 | $tempFile = Join-Path $env:TEMP $fileName 101 | $tempExtractDir = Join-Path $env:TEMP "docfind-extract" 102 | 103 | Write-Info "Downloading from $downloadUrl..." 104 | 105 | try { 106 | Invoke-WebRequest -Uri $downloadUrl -OutFile $tempFile -UseBasicParsing 107 | } 108 | catch { 109 | Write-Error-Custom "Download failed: $_" 110 | } 111 | 112 | # Create install directory if it doesn't exist 113 | if (-not (Test-Path $InstallDir)) { 114 | Write-Info "Creating directory $InstallDir..." 115 | New-Item -ItemType Directory -Path $InstallDir -Force | Out-Null 116 | } 117 | 118 | # Extract archive 119 | Write-Info "Extracting archive..." 
120 | try { 121 | # Clean up temp extract directory if it exists 122 | if (Test-Path $tempExtractDir) { 123 | Remove-Item -Path $tempExtractDir -Recurse -Force 124 | } 125 | New-Item -ItemType Directory -Path $tempExtractDir -Force | Out-Null 126 | 127 | Expand-Archive -Path $tempFile -DestinationPath $tempExtractDir -Force 128 | } 129 | catch { 130 | Write-Error-Custom "Failed to extract archive: $_" 131 | } 132 | 133 | # Install binary 134 | $destination = Join-Path $InstallDir "${BinaryName}.exe" 135 | $extractedBinary = Join-Path $tempExtractDir "${BinaryName}.exe" 136 | Write-Info "Installing to $destination..." 137 | 138 | try { 139 | Move-Item -Path $extractedBinary -Destination $destination -Force 140 | } 141 | catch { 142 | Write-Error-Custom "Failed to install binary: $_" 143 | } 144 | 145 | # Clean up 146 | try { 147 | Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue 148 | Remove-Item -Path $tempExtractDir -Recurse -Force -ErrorAction SilentlyContinue 149 | } 150 | catch { 151 | # Ignore cleanup errors 152 | } 153 | 154 | Write-Info "Successfully installed $BinaryName to $InstallDir" 155 | } 156 | 157 | # Check if install directory is in PATH 158 | function Test-InPath { 159 | param([string]$Directory) 160 | 161 | $pathDirs = $env:PATH -split ';' 162 | return $pathDirs -contains $Directory 163 | } 164 | 165 | # Add directory to PATH 166 | function Add-ToPath { 167 | param([string]$Directory) 168 | 169 | Write-Info "Adding $Directory to your PATH..." 
170 | 171 | try { 172 | # Get current user PATH 173 | $currentPath = [Environment]::GetEnvironmentVariable("PATH", "User") 174 | 175 | if ($currentPath -notlike "*$Directory*") { 176 | $newPath = if ($currentPath) { "$currentPath;$Directory" } else { $Directory } 177 | [Environment]::SetEnvironmentVariable("PATH", $newPath, "User") 178 | 179 | # Update current session PATH 180 | $env:PATH = "$env:PATH;$Directory" 181 | 182 | Write-Info "Added $Directory to PATH" 183 | return $true 184 | } 185 | else { 186 | Write-Info "$Directory is already in PATH" 187 | return $false 188 | } 189 | } 190 | catch { 191 | Write-Warn "Failed to add to PATH automatically: $_" 192 | return $false 193 | } 194 | } 195 | 196 | # Print post-install instructions 197 | function Show-PostInstall { 198 | param([bool]$PathUpdated) 199 | 200 | Write-Host "" 201 | Write-Info "Installation complete!" 202 | Write-Host "" 203 | 204 | if ($PathUpdated) { 205 | Write-Host "The installation directory has been added to your PATH." 206 | Write-Host "You may need to restart your terminal for the changes to take effect." 207 | Write-Host "" 208 | Write-Host "In a new terminal, you can run:" -ForegroundColor Cyan 209 | Write-Host " $BinaryName --help" -ForegroundColor Green 210 | } 211 | else { 212 | if (-not (Test-InPath $InstallDir)) { 213 | Write-Warn "$InstallDir is not in your PATH" 214 | Write-Host "" 215 | Write-Host "To add it permanently, run this in an elevated PowerShell:" -ForegroundColor Cyan 216 | Write-Host " [Environment]::SetEnvironmentVariable('PATH', `$env:PATH + ';$InstallDir', 'User')" -ForegroundColor Green 217 | Write-Host "" 218 | Write-Host "Or add it to your current session:" -ForegroundColor Cyan 219 | Write-Host " `$env:PATH += ';$InstallDir'" -ForegroundColor Green 220 | Write-Host "" 221 | } 222 | else { 223 | Write-Host "You can now use '$BinaryName' from anywhere!" 
-ForegroundColor Cyan 224 | Write-Host "" 225 | Write-Host "Try it out:" -ForegroundColor Cyan 226 | Write-Host " $BinaryName --help" -ForegroundColor Green 227 | } 228 | } 229 | } 230 | 231 | # Main installation flow 232 | function Main { 233 | Write-Info "Installing $BinaryName..." 234 | 235 | $arch = Get-Architecture 236 | $target = "${arch}-pc-windows-msvc" 237 | Write-Info "Detected platform: $target" 238 | 239 | $version = Get-LatestVersion 240 | 241 | # Check if already installed with the same version 242 | $currentVersion = Get-CurrentVersion 243 | if ($currentVersion) { 244 | Write-Info "Current version: $currentVersion" 245 | # Strip 'v' prefix from version if present for comparison 246 | $latestVersionNum = $version -replace '^v', '' 247 | if ($currentVersion -eq $latestVersionNum -or $currentVersion -eq $version) { 248 | Write-Info "$BinaryName $currentVersion is already installed (latest version)" 249 | Write-Host "" 250 | Write-Host "If you want to reinstall, please uninstall first:" -ForegroundColor Cyan 251 | Write-Host " Remove-Item (Get-Command $BinaryName).Path" -ForegroundColor Green 252 | exit 0 253 | } 254 | } 255 | 256 | Install-Binary -Version $version -Target $target 257 | 258 | $pathUpdated = $false 259 | if (-not (Test-InPath $InstallDir)) { 260 | $pathUpdated = Add-ToPath -Directory $InstallDir 261 | } 262 | 263 | Show-PostInstall -PathUpdated $pathUpdated 264 | } 265 | 266 | # Run the installer 267 | try { 268 | Main 269 | } 270 | catch { 271 | Write-Error-Custom "Installation failed: $_" 272 | } 273 | -------------------------------------------------------------------------------- /core/src/lib.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | #[cfg(any(feature = "cli", feature = "wasm", test))] 4 | use std::collections::HashMap; 5 | 6 | /// A minimal FSST-compressed vector of UTF-8 strings with random access. 
7 | #[derive(Debug, Clone, Serialize, Deserialize)] 8 | pub struct FsstStrVec { 9 | // FSST dictionary we trained (as raw bytes for compact serde) 10 | dict_syms: Vec<[u8; 8]>, 11 | dict_lens: Vec, 12 | // Concatenated compressed payload and per-item offsets 13 | offsets: Vec, // offsets[i] = start of item i in `data` 14 | data: Vec, 15 | } 16 | 17 | impl FsstStrVec { 18 | /// Train FSST on `strings` and build the compressed vector. 19 | #[cfg(any(feature = "cli", test))] 20 | fn from_strings(strings: &[impl AsRef]) -> Self { 21 | // 1) Train a compressor on the corpus. 22 | let sample: Vec<&[u8]> = strings.iter().map(|s| s.as_ref().as_bytes()).collect(); 23 | let compressor = fsst::Compressor::train(&sample); 24 | 25 | // Keep dictionary for later decoding. 26 | let syms: Vec = compressor.symbol_table().to_vec(); 27 | let lens: Vec = compressor.symbol_lengths().to_vec(); 28 | 29 | // 2) Compress each string independently; store offsets + bytes. 30 | let mut offsets = Vec::with_capacity(strings.len()); 31 | let mut data = Vec::new(); 32 | for s in strings { 33 | offsets.push(data.len() as u32); 34 | let c = compressor.compress(s.as_ref().as_bytes()); 35 | data.extend_from_slice(&c); 36 | } 37 | 38 | // 3) Store symbol table as raw bytes for compact serialization. 39 | let dict_syms: Vec<[u8; 8]> = syms 40 | .into_iter() 41 | .map(|sym| u64::to_le_bytes(sym.to_u64())) 42 | .collect(); 43 | 44 | Self { 45 | dict_syms, 46 | dict_lens: lens, 47 | offsets, 48 | data, 49 | } 50 | } 51 | 52 | /// Number of strings 53 | pub fn len(&self) -> usize { 54 | self.offsets.len() 55 | } 56 | 57 | /// Random access: decode item i into an owned String. 
58 | pub fn get(&self, i: usize) -> Option { 59 | if i >= self.len() { 60 | return None; 61 | } 62 | let start = self.offsets[i] as usize; 63 | let end = if i + 1 < self.len() { 64 | self.offsets[i + 1] as usize 65 | } else { 66 | self.data.len() 67 | }; 68 | let codes = &self.data[start..end]; 69 | 70 | // Rebuild a Decompressor on-demand. (You can cache this in the struct if you 71 | // read frequently; it's cheap either way.) 72 | let syms: Vec = self 73 | .dict_syms 74 | .iter() 75 | .map(fsst::Symbol::from_slice) 76 | .collect(); 77 | let decomp = fsst::Decompressor::new(&syms, &self.dict_lens); 78 | 79 | let bytes = decomp.decompress(codes); 80 | Some(String::from_utf8(bytes).expect("FSST preserves UTF-8 for UTF-8 input")) 81 | } 82 | } 83 | 84 | #[derive(Debug, serde::Serialize, serde::Deserialize)] 85 | #[serde(rename_all = "camelCase")] 86 | pub struct Document { 87 | pub title: String, 88 | pub category: String, 89 | pub href: String, 90 | pub body: String, 91 | pub keywords: Option>, 92 | } 93 | 94 | #[derive(Debug, serde::Serialize, serde::Deserialize)] 95 | pub struct Index { 96 | /// FST vector for keyword to entry index 97 | fst: Vec, 98 | 99 | /// FSST string vector of all document strings 100 | document_strings: FsstStrVec, 101 | 102 | /// Vector of keyword to document index entries 103 | keyword_to_documents: Vec>, 104 | } 105 | 106 | impl Index { 107 | pub fn from_bytes(bytes: &[u8]) -> Result> { 108 | let index: Index = postcard::from_bytes(bytes)?; 109 | Ok(index) 110 | } 111 | 112 | pub fn to_bytes(&self) -> Result, Box> { 113 | Ok(postcard::to_allocvec(self)?) 
114 | } 115 | } 116 | 117 | #[cfg(any(feature = "cli", test))] 118 | pub fn build_index(documents: Vec) -> Result> { 119 | use std::collections::HashSet; 120 | 121 | let stop_words = include_str!("../english.stop") 122 | .lines() 123 | .filter(|line| !line.is_empty() && !line.starts_with('#')) 124 | .map(|line| line.to_lowercase()) 125 | .collect::>(); 126 | 127 | let sw = rake::StopWords::from(stop_words); 128 | let rake = rake::Rake::new(sw.clone()); 129 | 130 | let mut strings: Vec<&str> = Vec::new(); 131 | let mut keywords_to_documents: HashMap> = HashMap::new(); 132 | let mut doc_index_map: HashMap<&str, usize> = HashMap::new(); 133 | 134 | for (doc_index, doc) in documents.iter().enumerate() { 135 | doc_index_map.insert(&doc.href, doc_index); 136 | strings.push(&doc.title); 137 | strings.push(&doc.category); 138 | strings.push(&doc.href); 139 | strings.push(&doc.body); 140 | 141 | let mut keyword_set: HashSet = HashSet::new(); 142 | let mut keywords: Vec<(String, f64)> = Vec::new(); 143 | 144 | // Add explicit keywords from document metadata 145 | if let Some(kw) = &doc.keywords { 146 | for k in kw { 147 | let keyword = k 148 | .trim_matches(|c: char| !c.is_alphanumeric()) 149 | .to_lowercase(); 150 | if !keyword.is_empty() && !sw.contains(&keyword.clone()) && !keyword_set.contains(&keyword) 151 | { 152 | keywords.push((keyword.clone(), 100.0)); 153 | keyword_set.insert(keyword.clone()); 154 | } 155 | } 156 | } 157 | 158 | // add keywords from title 159 | let title_keywords = doc 160 | .title 161 | .split_whitespace() 162 | .map(|w| { 163 | w.trim_matches(|c: char| !c.is_alphanumeric()) 164 | .to_lowercase() 165 | }) 166 | .filter(|w| !w.is_empty() && !sw.contains(&w.clone())) 167 | .collect::>(); // deduplicate 168 | 169 | for tk in title_keywords { 170 | if !keyword_set.contains(&tk) { 171 | keywords.push((tk.clone(), 90.0)); 172 | keyword_set.insert(tk.clone()); 173 | } 174 | } 175 | 176 | let body_keywords = rake.run_fragments(vec![doc.body.as_str()]); 
177 | let mut single_word_budget = 5; 178 | let mut double_word_budget = 3; 179 | 180 | for k in &body_keywords { 181 | let keyword = k.keyword.to_lowercase(); 182 | 183 | // continue if keyword is already in title keywords 184 | if keyword_set.contains(&keyword) { 185 | continue; 186 | } 187 | 188 | let whitespace_count = k.keyword.matches(' ').count(); 189 | 190 | if whitespace_count == 0 && single_word_budget > 0 { 191 | single_word_budget -= 1; 192 | } else if whitespace_count == 1 && double_word_budget > 0 { 193 | double_word_budget -= 1; 194 | } else { 195 | continue; 196 | } 197 | 198 | keywords.push((keyword.clone(), k.score)); 199 | keyword_set.insert(keyword.clone()); 200 | 201 | if single_word_budget == 0 && double_word_budget == 0 { 202 | break; 203 | } 204 | } 205 | 206 | for k in keywords.iter() { 207 | keywords_to_documents 208 | .entry(k.0.clone()) 209 | .or_default() 210 | .push((doc, k.1)); 211 | } 212 | } 213 | 214 | println!("Extracted {} unique keywords", keywords_to_documents.len()); 215 | 216 | let mut fst_builder = fst::MapBuilder::memory(); 217 | let mut keyword_to_documents: Vec> = Vec::new(); 218 | let mut keywords: Vec = keywords_to_documents.keys().cloned().collect(); 219 | keywords.sort(); 220 | 221 | for (index, keyword) in keywords.iter().enumerate() { 222 | fst_builder.insert(keyword, index as u64)?; 223 | 224 | let mut doc_scores = keywords_to_documents.get(keyword).unwrap().clone(); 225 | doc_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); 226 | 227 | let entry = doc_scores 228 | .iter() 229 | .map(|(doc, score)| (doc_index_map[doc.href.as_str()], *score as u8)) 230 | .collect::>(); 231 | 232 | keyword_to_documents.push(entry); 233 | } 234 | 235 | let fst = fst_builder.into_inner().unwrap(); 236 | let document_strings = FsstStrVec::from_strings(&strings); 237 | 238 | Ok(Index { 239 | fst, 240 | document_strings, 241 | keyword_to_documents, 242 | }) 243 | } 244 | 245 | #[cfg(any(feature = "wasm", test))] 246 | pub fn search( 
247 | index: &Index, 248 | query: &str, 249 | max_results: usize, 250 | ) -> Result, Box> { 251 | use fst::automaton::Levenshtein; 252 | use fst::map::OpBuilder; 253 | use fst::{Automaton, Streamer}; 254 | use std::collections::HashSet; 255 | 256 | let map = fst::Map::new(&index.fst)?; 257 | 258 | let mut query_words: HashSet = query 259 | .split_whitespace() 260 | .map(|w| { 261 | w.trim_matches(|c: char| !c.is_alphanumeric()) 262 | .to_lowercase() 263 | }) 264 | .filter(|w| !w.is_empty()) 265 | .collect(); 266 | 267 | query_words.insert(query.to_lowercase()); 268 | 269 | let mut keywords: Vec<(String, u64)> = Vec::new(); 270 | 271 | for query_word in query_words { 272 | use fst::automaton::Str; 273 | 274 | let lev = Levenshtein::new(query_word.as_str(), 1)?; 275 | let prefix = Str::new(query_word.as_str()).starts_with(); 276 | 277 | let mut op = OpBuilder::new() 278 | .add(map.search(lev)) 279 | .add(map.search(prefix)) 280 | .union(); 281 | 282 | while let Some((keyword, indexed_value)) = op.next() { 283 | let keyword_str = String::from_utf8(keyword.to_vec())?; 284 | let score = indexed_value.to_vec().get(0).unwrap().value; 285 | keywords.push((keyword_str, score)); 286 | } 287 | } 288 | 289 | // Sort keywords by length (shorter first) 290 | keywords.sort_by_key(|(kw, _)| kw.len()); 291 | 292 | let mut documents: HashMap = HashMap::new(); 293 | 294 | for (_, keyword_index) in keywords { 295 | let documents_matching_keyword = &index.keyword_to_documents[keyword_index as usize]; 296 | 297 | for (document_index, score) in documents_matching_keyword { 298 | let entry = documents.entry(*document_index).or_insert(0); 299 | *entry = entry.saturating_add(*score); 300 | } 301 | } 302 | 303 | // sort documents by score (descending), then by document index (ascending) for stable ordering 304 | let mut documents: Vec<(usize, u8)> = documents.into_iter().collect(); 305 | documents.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); 306 | 
documents.truncate(max_results); 307 | 308 | let mut result: Vec = Vec::new(); 309 | 310 | for (document_index, _score) in documents { 311 | let title = index 312 | .document_strings 313 | .get(document_index * 4) 314 | .ok_or_else(|| "Failed to get document title")?; 315 | let category = index 316 | .document_strings 317 | .get(document_index * 4 + 1) 318 | .ok_or_else(|| "Failed to get document category")?; 319 | let href = index 320 | .document_strings 321 | .get(document_index * 4 + 2) 322 | .ok_or_else(|| "Failed to get document href")?; 323 | let body = index 324 | .document_strings 325 | .get(document_index * 4 + 3) 326 | .ok_or_else(|| "Failed to get document body")?; 327 | 328 | let document = Document { 329 | title, 330 | category, 331 | href, 332 | body, 333 | keywords: None, 334 | }; 335 | 336 | result.push(document); 337 | } 338 | 339 | Ok(result) 340 | } 341 | 342 | #[cfg(test)] 343 | mod tests; 344 | -------------------------------------------------------------------------------- /static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | docfind - Fast Document Search Demo 8 | 154 | 155 | 156 | 157 |
158 | 159 |
160 | 165 |
166 | 167 | 168 |
169 |
170 | 171 |
172 |
173 |
174 | 175 |
176 |

177 | Powered by docfind 178 |

179 |
180 |
181 | 182 | 329 | 330 | 331 | -------------------------------------------------------------------------------- /cli/src/main.rs: -------------------------------------------------------------------------------- 1 | use docfind_core::Document; 2 | use std::io::Write; 3 | use std::path::Path; 4 | use std::{collections::HashMap, fs::File}; 5 | use wasm_encoder::{ConstExpr, DataSection, MemorySection, MemoryType}; 6 | use wasmparser::{Parser, Payload}; 7 | 8 | #[derive(Debug)] 9 | enum WasmDataSegment { 10 | Passive(Vec), 11 | Active { 12 | memory_index: u32, 13 | offset: ConstExpr, 14 | data: Vec, 15 | i32const_offset: Option, 16 | }, 17 | } 18 | 19 | /// Represents different types of WASM sections we care about 20 | #[derive(Debug)] 21 | enum WasmSection { 22 | Data(Vec), 23 | DataCount(u32), 24 | Memory, 25 | Raw { id: u8, data: Vec }, 26 | } 27 | 28 | /// Convert a wasmparser ConstExpr to a wasm_encoder ConstExpr 29 | fn convert_const_expr( 30 | expr: &wasmparser::ConstExpr, 31 | ) -> Result> { 32 | let mut ops_reader = expr.get_operators_reader(); 33 | 34 | // We'll handle the most common cases 35 | if !ops_reader.eof() { 36 | let op = ops_reader.read()?; 37 | match op { 38 | wasmparser::Operator::I32Const { value } => return Ok(ConstExpr::i32_const(value)), 39 | wasmparser::Operator::I64Const { value } => return Ok(ConstExpr::i64_const(value)), 40 | wasmparser::Operator::F32Const { value } => { 41 | // Convert wasmparser Ieee32 to wasm_encoder Ieee32 42 | let f32_val = f32::from_bits(value.bits()); 43 | return Ok(ConstExpr::f32_const(f32_val.into())); 44 | } 45 | wasmparser::Operator::F64Const { value } => { 46 | // Convert wasmparser Ieee64 to wasm_encoder Ieee64 47 | let f64_val = f64::from_bits(value.bits()); 48 | return Ok(ConstExpr::f64_const(f64_val.into())); 49 | } 50 | wasmparser::Operator::GlobalGet { global_index } => { 51 | return Ok(ConstExpr::global_get(global_index)); 52 | } 53 | wasmparser::Operator::RefNull { hty } => { 54 | // Convert heap type 55 
| let heap_type = match hty { 56 | wasmparser::HeapType::Concrete(_) => wasm_encoder::HeapType::Concrete(0), 57 | _ => wasm_encoder::HeapType::Abstract { 58 | shared: false, 59 | ty: wasm_encoder::AbstractHeapType::Func, 60 | }, 61 | }; 62 | return Ok(ConstExpr::ref_null(heap_type)); 63 | } 64 | wasmparser::Operator::RefFunc { function_index } => { 65 | return Ok(ConstExpr::ref_func(function_index)); 66 | } 67 | _ => { 68 | // For other operators, use raw with empty bytes 69 | return Ok(ConstExpr::raw(vec![])); 70 | } 71 | } 72 | } 73 | 74 | Ok(ConstExpr::raw(vec![])) 75 | } 76 | 77 | fn main() -> Result<(), Box> { 78 | let debug = std::env::var("DOCFIND_DEBUG").is_ok(); 79 | let args: Vec = std::env::args().collect(); 80 | 81 | // Handle --version flag 82 | if args.len() == 2 && (args[1] == "--version" || args[1] == "-v") { 83 | println!("docfind {}", env!("CARGO_PKG_VERSION")); 84 | std::process::exit(0); 85 | } 86 | 87 | if args.len() != 3 { 88 | eprintln!("Usage: {} ", args[0]); 89 | std::process::exit(1); 90 | } 91 | 92 | let input_path = &args[1]; 93 | let output_dir = &args[2]; 94 | if debug { 95 | eprintln!("[docfind] CWD: {:?}", std::env::current_dir()?); 96 | eprintln!("[docfind] input_path: {}", input_path); 97 | eprintln!("[docfind] output_dir: {}", output_dir); 98 | } 99 | let documents_file = File::open(input_path)?; 100 | let documents: Vec = serde_json::from_reader(documents_file)?; 101 | 102 | let start = std::time::Instant::now(); 103 | let index = docfind_core::build_index(documents)?; 104 | let duration = start.elapsed(); 105 | if debug { 106 | eprintln!("[docfind] Indexing completed in: {:?}", duration); 107 | } else { 108 | println!("Indexing completed in: {:?}", duration); 109 | } 110 | 111 | let start = std::time::Instant::now(); 112 | let mut sections: Vec = Vec::new(); 113 | 114 | let mut old_memory_page_count: u64 = 0; 115 | let mut index_base_global_index: Option = None; 116 | let mut index_len_global_index: Option = None; 117 | let mut 
i32_globals: HashMap = HashMap::new(); 118 | 119 | let docfind_js: &[u8] = include_bytes!("../../wasm/pkg/docfind.js"); 120 | let docfind_bg_wasm: &[u8] = include_bytes!("../../wasm/pkg/docfind_bg.wasm"); 121 | if debug { 122 | eprintln!("[docfind] Embedded JS size: {} bytes", docfind_js.len()); 123 | eprintln!( 124 | "[docfind] Embedded WASM size: {} bytes", 125 | docfind_bg_wasm.len() 126 | ); 127 | } 128 | 129 | for payload in Parser::new(0).parse_all(docfind_bg_wasm) { 130 | let payload = payload?; 131 | 132 | // process i32 const data sections differently 133 | if let Payload::DataSection(reader) = payload { 134 | let mut data_segments: Vec = Vec::new(); 135 | 136 | for data in reader { 137 | let data = data?; 138 | 139 | match data.kind { 140 | wasmparser::DataKind::Passive => { 141 | data_segments.push(WasmDataSegment::Passive(data.data.to_vec())); 142 | } 143 | wasmparser::DataKind::Active { 144 | memory_index, 145 | offset_expr, 146 | } => { 147 | let const_expr = convert_const_expr(&offset_expr)?; 148 | let i32const_offset = if let wasmparser::Operator::I32Const { value } = 149 | offset_expr.get_operators_reader().read()? 150 | { 151 | Some(value) 152 | } else { 153 | None 154 | }; 155 | 156 | data_segments.push(WasmDataSegment::Active { 157 | memory_index, 158 | offset: const_expr, 159 | data: data.data.to_vec(), 160 | i32const_offset, 161 | }); 162 | } 163 | } 164 | } 165 | 166 | sections.push(WasmSection::Data(data_segments)); 167 | } else if let Payload::DataCountSection { count, .. 
} = payload { 168 | sections.push(WasmSection::DataCount(count)); 169 | } else if let Payload::MemorySection(reader) = payload { 170 | for memory in reader { 171 | old_memory_page_count = memory?.initial as u64; 172 | } 173 | sections.push(WasmSection::Memory); 174 | } else { 175 | if let Some((id, data)) = payload.as_section() { 176 | sections.push(WasmSection::Raw { 177 | id, 178 | data: docfind_bg_wasm[data.start..data.end].to_vec(), 179 | }); 180 | } 181 | 182 | match payload { 183 | Payload::ExportSection(reader) => { 184 | for export in reader { 185 | let export = export?; 186 | if export.name == "INDEX_BASE" { 187 | index_base_global_index = Some(export.index); 188 | } else if export.name == "INDEX_LEN" { 189 | index_len_global_index = Some(export.index); 190 | } 191 | } 192 | } 193 | Payload::GlobalSection(reader) => { 194 | for (idx, global) in reader.into_iter().enumerate() { 195 | let global = global?; 196 | let mut ops_reader = global.init_expr.get_operators_reader(); 197 | 198 | if !ops_reader.eof() { 199 | if let Ok(wasmparser::Operator::I32Const { value }) = ops_reader.read() { 200 | i32_globals.insert(idx as u32, value); 201 | } 202 | } 203 | } 204 | } 205 | _ => {} 206 | } 207 | } 208 | } 209 | 210 | let index_base_global_index = 211 | index_base_global_index.expect("Could not find INDEX_BASE global index"); 212 | let index_len_global_index = 213 | index_len_global_index.expect("Could not find INDEX_LEN global index"); 214 | if debug { 215 | eprintln!( 216 | "[docfind] INDEX_BASE global index: {}", 217 | index_base_global_index 218 | ); 219 | eprintln!( 220 | "[docfind] INDEX_LEN global index: {}", 221 | index_len_global_index 222 | ); 223 | } 224 | 225 | let index_base_global_address = i32_globals 226 | .get(&index_base_global_index) 227 | .expect("Could not find INDEX_BASE global value"); 228 | 229 | let index_len_global_address = i32_globals 230 | .get(&index_len_global_index) 231 | .expect("Could not find INDEX_LEN global value"); 232 | if 
debug { 233 | eprintln!( 234 | "[docfind] INDEX_BASE address: {}", 235 | index_base_global_address 236 | ); 237 | eprintln!("[docfind] INDEX_LEN address: {}", index_len_global_address); 238 | } 239 | 240 | let raw_index: Vec = index.to_bytes()?; // will embed into wasm 241 | if debug { 242 | eprintln!("[docfind] Index size: {} bytes", raw_index.len()); 243 | } else { 244 | println!("Index size: {} bytes", raw_index.len()); 245 | } 246 | 247 | let new_memory_page_count = old_memory_page_count + (raw_index.len() as u64 / 0x10000) + 1; 248 | let index_base = old_memory_page_count * 0x10000; 249 | if debug { 250 | eprintln!("[docfind] Old memory pages: {}", old_memory_page_count); 251 | eprintln!("[docfind] New memory pages: {}", new_memory_page_count); 252 | eprintln!("[docfind] Index base address: {}", index_base); 253 | } 254 | 255 | let mut encoder = wasm_encoder::Module::new(); 256 | 257 | for section in sections { 258 | match section { 259 | WasmSection::DataCount(count) => { 260 | encoder.section(&wasm_encoder::DataCountSection { count: count + 1 }); 261 | } 262 | WasmSection::Data(data_segments) => { 263 | let mut data_section = DataSection::new(); 264 | 265 | for segment in data_segments { 266 | match segment { 267 | WasmDataSegment::Passive(data) => { 268 | data_section.passive(data.iter().copied()); 269 | } 270 | WasmDataSegment::Active { 271 | memory_index, 272 | offset, 273 | data, 274 | i32const_offset, 275 | } => { 276 | if let Some(i32_offset) = i32const_offset { 277 | let start = i32_offset; 278 | let end = i32_offset + (data.len() as i32); 279 | 280 | // Patch the data if it contains the INDEX_BASE or INDEX_LEN addresses 281 | if index_base_global_address >= &start && index_base_global_address < &end { 282 | assert!( 283 | index_len_global_address >= &start && index_len_global_address < &end, 284 | "INDEX_LEN address not in data segment!" 
285 | ); 286 | 287 | let mut data = data; 288 | 289 | let base_relative_offset = (index_base_global_address - start) as usize; 290 | data[base_relative_offset..base_relative_offset + 4] 291 | .copy_from_slice(&(index_base as i32).to_le_bytes()); 292 | 293 | let length_relative_offset = (index_len_global_address - start) as usize; 294 | data[length_relative_offset..length_relative_offset + 4] 295 | .copy_from_slice(&(raw_index.len() as i32).to_le_bytes()); 296 | 297 | data_section.active(memory_index, &offset, data); 298 | continue; 299 | } 300 | } 301 | 302 | data_section.active(memory_index, &offset, data); 303 | } 304 | } 305 | } 306 | 307 | data_section.active( 308 | 0, 309 | &ConstExpr::i32_const(index_base as i32), 310 | raw_index.iter().copied(), 311 | ); 312 | 313 | encoder.section(&data_section); 314 | } 315 | WasmSection::Memory => { 316 | let mut new_memory_section = MemorySection::new(); 317 | new_memory_section.memory(MemoryType { 318 | minimum: new_memory_page_count, 319 | maximum: None, 320 | memory64: false, 321 | shared: false, 322 | page_size_log2: None, 323 | }); 324 | encoder.section(&new_memory_section); 325 | } 326 | WasmSection::Raw { id, data } => { 327 | encoder.section(&wasm_encoder::RawSection { id, data: &data }); 328 | } 329 | } 330 | } 331 | 332 | let wasm_bytes = encoder.finish(); 333 | wasmparser::Validator::new().validate_all(&wasm_bytes)?; 334 | 335 | let output_dir = Path::new(output_dir); 336 | std::fs::create_dir_all(output_dir)?; 337 | 338 | let mut output_js = File::create(output_dir.join("docfind.js"))?; 339 | output_js.write_all(docfind_js)?; 340 | 341 | let mut output_wasm = File::create(output_dir.join("docfind_bg.wasm"))?; 342 | output_wasm.write_all(&wasm_bytes)?; 343 | 344 | let duration = start.elapsed(); 345 | println!("WASM creation completed in: {:?}", duration); 346 | 347 | Ok(()) 348 | } 349 | -------------------------------------------------------------------------------- /Cargo.lock: 
-------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "atomic-polyfill" 16 | version = "1.0.3" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "8cf2bce30dfe09ef0bfaef228b9d414faaf7e563035494d7fe092dba54b300f4" 19 | dependencies = [ 20 | "critical-section", 21 | ] 22 | 23 | [[package]] 24 | name = "bitflags" 25 | version = "2.10.0" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" 28 | 29 | [[package]] 30 | name = "bumpalo" 31 | version = "3.19.0" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" 34 | 35 | [[package]] 36 | name = "byteorder" 37 | version = "1.5.0" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 40 | 41 | [[package]] 42 | name = "cfg-if" 43 | version = "1.0.4" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" 46 | 47 | [[package]] 48 | name = "cobs" 49 | version = "0.3.0" 50 | source = "registry+https://github.com/rust-lang/crates.io-index" 51 | checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1" 52 | dependencies = [ 53 | "thiserror", 54 | ] 55 | 56 | [[package]] 57 | name = "critical-section" 58 | version = "1.2.0" 59 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 60 | checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b" 61 | 62 | [[package]] 63 | name = "docfind" 64 | version = "0.5.1" 65 | dependencies = [ 66 | "docfind_core", 67 | "serde_json", 68 | "wasm-encoder", 69 | "wasmparser", 70 | ] 71 | 72 | [[package]] 73 | name = "docfind-wasm" 74 | version = "0.5.1" 75 | dependencies = [ 76 | "docfind_core", 77 | "serde-wasm-bindgen", 78 | "wasm-bindgen", 79 | ] 80 | 81 | [[package]] 82 | name = "docfind_core" 83 | version = "0.5.1" 84 | dependencies = [ 85 | "fsst-rs", 86 | "fst", 87 | "postcard", 88 | "rake", 89 | "serde", 90 | "serde_json", 91 | ] 92 | 93 | [[package]] 94 | name = "embedded-io" 95 | version = "0.4.0" 96 | source = "registry+https://github.com/rust-lang/crates.io-index" 97 | checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced" 98 | 99 | [[package]] 100 | name = "embedded-io" 101 | version = "0.6.1" 102 | source = "registry+https://github.com/rust-lang/crates.io-index" 103 | checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d" 104 | 105 | [[package]] 106 | name = "equivalent" 107 | version = "1.0.2" 108 | source = "registry+https://github.com/rust-lang/crates.io-index" 109 | checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" 110 | 111 | [[package]] 112 | name = "foldhash" 113 | version = "0.1.5" 114 | source = "registry+https://github.com/rust-lang/crates.io-index" 115 | checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" 116 | 117 | [[package]] 118 | name = "fsst-rs" 119 | version = "0.5.4" 120 | source = "registry+https://github.com/rust-lang/crates.io-index" 121 | checksum = "ab195789b87bb56fce91b3617e44d36dbba68a4c8d736ef48767187932a5161b" 122 | 123 | [[package]] 124 | name = "fst" 125 | version = "0.4.7" 126 | source = "registry+https://github.com/rust-lang/crates.io-index" 127 | checksum = 
"7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a" 128 | dependencies = [ 129 | "utf8-ranges", 130 | ] 131 | 132 | [[package]] 133 | name = "hash32" 134 | version = "0.2.1" 135 | source = "registry+https://github.com/rust-lang/crates.io-index" 136 | checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" 137 | dependencies = [ 138 | "byteorder", 139 | ] 140 | 141 | [[package]] 142 | name = "hashbrown" 143 | version = "0.15.5" 144 | source = "registry+https://github.com/rust-lang/crates.io-index" 145 | checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" 146 | dependencies = [ 147 | "foldhash", 148 | "serde", 149 | ] 150 | 151 | [[package]] 152 | name = "hashbrown" 153 | version = "0.16.0" 154 | source = "registry+https://github.com/rust-lang/crates.io-index" 155 | checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" 156 | 157 | [[package]] 158 | name = "heapless" 159 | version = "0.7.17" 160 | source = "registry+https://github.com/rust-lang/crates.io-index" 161 | checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f" 162 | dependencies = [ 163 | "atomic-polyfill", 164 | "hash32", 165 | "rustc_version", 166 | "serde", 167 | "spin", 168 | "stable_deref_trait", 169 | ] 170 | 171 | [[package]] 172 | name = "indexmap" 173 | version = "2.12.0" 174 | source = "registry+https://github.com/rust-lang/crates.io-index" 175 | checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" 176 | dependencies = [ 177 | "equivalent", 178 | "hashbrown 0.16.0", 179 | "serde", 180 | "serde_core", 181 | ] 182 | 183 | [[package]] 184 | name = "itoa" 185 | version = "1.0.15" 186 | source = "registry+https://github.com/rust-lang/crates.io-index" 187 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 188 | 189 | [[package]] 190 | name = "js-sys" 191 | version = "0.3.81" 192 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 193 | checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" 194 | dependencies = [ 195 | "once_cell", 196 | "wasm-bindgen", 197 | ] 198 | 199 | [[package]] 200 | name = "lazy_static" 201 | version = "1.5.0" 202 | source = "registry+https://github.com/rust-lang/crates.io-index" 203 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 204 | 205 | [[package]] 206 | name = "leb128fmt" 207 | version = "0.1.0" 208 | source = "registry+https://github.com/rust-lang/crates.io-index" 209 | checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" 210 | 211 | [[package]] 212 | name = "lock_api" 213 | version = "0.4.14" 214 | source = "registry+https://github.com/rust-lang/crates.io-index" 215 | checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" 216 | dependencies = [ 217 | "scopeguard", 218 | ] 219 | 220 | [[package]] 221 | name = "log" 222 | version = "0.4.28" 223 | source = "registry+https://github.com/rust-lang/crates.io-index" 224 | checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" 225 | 226 | [[package]] 227 | name = "memchr" 228 | version = "2.7.6" 229 | source = "registry+https://github.com/rust-lang/crates.io-index" 230 | checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" 231 | 232 | [[package]] 233 | name = "once_cell" 234 | version = "1.21.3" 235 | source = "registry+https://github.com/rust-lang/crates.io-index" 236 | checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" 237 | 238 | [[package]] 239 | name = "postcard" 240 | version = "1.1.3" 241 | source = "registry+https://github.com/rust-lang/crates.io-index" 242 | checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24" 243 | dependencies = [ 244 | "cobs", 245 | "embedded-io 0.4.0", 246 | "embedded-io 0.6.1", 247 | "heapless", 248 | "serde", 249 | ] 250 | 251 | 
[[package]] 252 | name = "proc-macro2" 253 | version = "1.0.101" 254 | source = "registry+https://github.com/rust-lang/crates.io-index" 255 | checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" 256 | dependencies = [ 257 | "unicode-ident", 258 | ] 259 | 260 | [[package]] 261 | name = "quote" 262 | version = "1.0.41" 263 | source = "registry+https://github.com/rust-lang/crates.io-index" 264 | checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" 265 | dependencies = [ 266 | "proc-macro2", 267 | ] 268 | 269 | [[package]] 270 | name = "rake" 271 | version = "0.3.6" 272 | source = "registry+https://github.com/rust-lang/crates.io-index" 273 | checksum = "5a0a7b4878cdfa9c73657cf8479a1f2430104b21991db7940e97ab000056f0a1" 274 | dependencies = [ 275 | "lazy_static", 276 | "regex", 277 | "serde", 278 | ] 279 | 280 | [[package]] 281 | name = "regex" 282 | version = "1.12.2" 283 | source = "registry+https://github.com/rust-lang/crates.io-index" 284 | checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" 285 | dependencies = [ 286 | "aho-corasick", 287 | "memchr", 288 | "regex-automata", 289 | "regex-syntax", 290 | ] 291 | 292 | [[package]] 293 | name = "regex-automata" 294 | version = "0.4.13" 295 | source = "registry+https://github.com/rust-lang/crates.io-index" 296 | checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" 297 | dependencies = [ 298 | "aho-corasick", 299 | "memchr", 300 | "regex-syntax", 301 | ] 302 | 303 | [[package]] 304 | name = "regex-syntax" 305 | version = "0.8.8" 306 | source = "registry+https://github.com/rust-lang/crates.io-index" 307 | checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" 308 | 309 | [[package]] 310 | name = "rustc_version" 311 | version = "0.4.1" 312 | source = "registry+https://github.com/rust-lang/crates.io-index" 313 | checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" 314 | 
dependencies = [ 315 | "semver", 316 | ] 317 | 318 | [[package]] 319 | name = "rustversion" 320 | version = "1.0.22" 321 | source = "registry+https://github.com/rust-lang/crates.io-index" 322 | checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" 323 | 324 | [[package]] 325 | name = "ryu" 326 | version = "1.0.20" 327 | source = "registry+https://github.com/rust-lang/crates.io-index" 328 | checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 329 | 330 | [[package]] 331 | name = "scopeguard" 332 | version = "1.2.0" 333 | source = "registry+https://github.com/rust-lang/crates.io-index" 334 | checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" 335 | 336 | [[package]] 337 | name = "semver" 338 | version = "1.0.27" 339 | source = "registry+https://github.com/rust-lang/crates.io-index" 340 | checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" 341 | 342 | [[package]] 343 | name = "serde" 344 | version = "1.0.228" 345 | source = "registry+https://github.com/rust-lang/crates.io-index" 346 | checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" 347 | dependencies = [ 348 | "serde_core", 349 | "serde_derive", 350 | ] 351 | 352 | [[package]] 353 | name = "serde-wasm-bindgen" 354 | version = "0.6.5" 355 | source = "registry+https://github.com/rust-lang/crates.io-index" 356 | checksum = "8302e169f0eddcc139c70f139d19d6467353af16f9fce27e8c30158036a1e16b" 357 | dependencies = [ 358 | "js-sys", 359 | "serde", 360 | "wasm-bindgen", 361 | ] 362 | 363 | [[package]] 364 | name = "serde_core" 365 | version = "1.0.228" 366 | source = "registry+https://github.com/rust-lang/crates.io-index" 367 | checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" 368 | dependencies = [ 369 | "serde_derive", 370 | ] 371 | 372 | [[package]] 373 | name = "serde_derive" 374 | version = "1.0.228" 375 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 376 | checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" 377 | dependencies = [ 378 | "proc-macro2", 379 | "quote", 380 | "syn", 381 | ] 382 | 383 | [[package]] 384 | name = "serde_json" 385 | version = "1.0.145" 386 | source = "registry+https://github.com/rust-lang/crates.io-index" 387 | checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" 388 | dependencies = [ 389 | "itoa", 390 | "memchr", 391 | "ryu", 392 | "serde", 393 | "serde_core", 394 | ] 395 | 396 | [[package]] 397 | name = "spin" 398 | version = "0.9.8" 399 | source = "registry+https://github.com/rust-lang/crates.io-index" 400 | checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" 401 | dependencies = [ 402 | "lock_api", 403 | ] 404 | 405 | [[package]] 406 | name = "stable_deref_trait" 407 | version = "1.2.1" 408 | source = "registry+https://github.com/rust-lang/crates.io-index" 409 | checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" 410 | 411 | [[package]] 412 | name = "syn" 413 | version = "2.0.107" 414 | source = "registry+https://github.com/rust-lang/crates.io-index" 415 | checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b" 416 | dependencies = [ 417 | "proc-macro2", 418 | "quote", 419 | "unicode-ident", 420 | ] 421 | 422 | [[package]] 423 | name = "thiserror" 424 | version = "2.0.17" 425 | source = "registry+https://github.com/rust-lang/crates.io-index" 426 | checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" 427 | dependencies = [ 428 | "thiserror-impl", 429 | ] 430 | 431 | [[package]] 432 | name = "thiserror-impl" 433 | version = "2.0.17" 434 | source = "registry+https://github.com/rust-lang/crates.io-index" 435 | checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" 436 | dependencies = [ 437 | "proc-macro2", 438 | "quote", 439 | "syn", 440 | ] 441 | 442 | 
[[package]] 443 | name = "unicode-ident" 444 | version = "1.0.19" 445 | source = "registry+https://github.com/rust-lang/crates.io-index" 446 | checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" 447 | 448 | [[package]] 449 | name = "utf8-ranges" 450 | version = "1.0.5" 451 | source = "registry+https://github.com/rust-lang/crates.io-index" 452 | checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" 453 | 454 | [[package]] 455 | name = "wasm-bindgen" 456 | version = "0.2.104" 457 | source = "registry+https://github.com/rust-lang/crates.io-index" 458 | checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" 459 | dependencies = [ 460 | "cfg-if", 461 | "once_cell", 462 | "rustversion", 463 | "wasm-bindgen-macro", 464 | "wasm-bindgen-shared", 465 | ] 466 | 467 | [[package]] 468 | name = "wasm-bindgen-backend" 469 | version = "0.2.104" 470 | source = "registry+https://github.com/rust-lang/crates.io-index" 471 | checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" 472 | dependencies = [ 473 | "bumpalo", 474 | "log", 475 | "proc-macro2", 476 | "quote", 477 | "syn", 478 | "wasm-bindgen-shared", 479 | ] 480 | 481 | [[package]] 482 | name = "wasm-bindgen-macro" 483 | version = "0.2.104" 484 | source = "registry+https://github.com/rust-lang/crates.io-index" 485 | checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" 486 | dependencies = [ 487 | "quote", 488 | "wasm-bindgen-macro-support", 489 | ] 490 | 491 | [[package]] 492 | name = "wasm-bindgen-macro-support" 493 | version = "0.2.104" 494 | source = "registry+https://github.com/rust-lang/crates.io-index" 495 | checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" 496 | dependencies = [ 497 | "proc-macro2", 498 | "quote", 499 | "syn", 500 | "wasm-bindgen-backend", 501 | "wasm-bindgen-shared", 502 | ] 503 | 504 | [[package]] 505 | name = "wasm-bindgen-shared" 506 | version = "0.2.104" 
507 | source = "registry+https://github.com/rust-lang/crates.io-index" 508 | checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" 509 | dependencies = [ 510 | "unicode-ident", 511 | ] 512 | 513 | [[package]] 514 | name = "wasm-encoder" 515 | version = "0.240.0" 516 | source = "registry+https://github.com/rust-lang/crates.io-index" 517 | checksum = "06d642d8c5ecc083aafe9ceb32809276a304547a3a6eeecceb5d8152598bc71f" 518 | dependencies = [ 519 | "leb128fmt", 520 | "wasmparser", 521 | ] 522 | 523 | [[package]] 524 | name = "wasmparser" 525 | version = "0.240.0" 526 | source = "registry+https://github.com/rust-lang/crates.io-index" 527 | checksum = "b722dcf61e0ea47440b53ff83ccb5df8efec57a69d150e4f24882e4eba7e24a4" 528 | dependencies = [ 529 | "bitflags", 530 | "hashbrown 0.15.5", 531 | "indexmap", 532 | "semver", 533 | "serde", 534 | ] 535 | -------------------------------------------------------------------------------- /core/src/tests.rs: -------------------------------------------------------------------------------- 1 | mod tests { 2 | use crate::Index; 3 | use crate::{Document, FsstStrVec}; 4 | use crate::{build_index, search}; 5 | 6 | // ======================================================================== 7 | // SECTION 1: Basic Sanity Tests - FsstStrVec 8 | // ======================================================================== 9 | 10 | #[test] 11 | fn test_fsst_str_vec_basic() { 12 | // Simple sanity test for FsstStrVec 13 | 14 | let strings = vec!["hello", "world", "rust", "search"]; 15 | let vec = FsstStrVec::from_strings(&strings); 16 | 17 | assert_eq!(vec.len(), 4); 18 | 19 | assert_eq!(vec.get(0), Some("hello".to_string())); 20 | assert_eq!(vec.get(1), Some("world".to_string())); 21 | assert_eq!(vec.get(2), Some("rust".to_string())); 22 | assert_eq!(vec.get(3), Some("search".to_string())); 23 | } 24 | 25 | #[test] 26 | fn test_fsst_str_vec_out_of_bounds() { 27 | // Test that getting an out-of-bounds index returns None 28 | 
let strings = vec!["hello", "world"]; 29 | let vec = FsstStrVec::from_strings(&strings); 30 | 31 | assert_eq!(vec.get(5), None); 32 | assert_eq!(vec.get(100), None); 33 | } 34 | 35 | #[test] 36 | fn test_fsst_str_vec_empty() { 37 | // Test behavior with empty vector 38 | let strings: Vec<&str> = vec![]; 39 | let vec = FsstStrVec::from_strings(&strings); 40 | 41 | assert_eq!(vec.len(), 0); 42 | assert_eq!(vec.get(0), None); 43 | } 44 | 45 | #[test] 46 | fn test_fsst_str_vec_single_item() { 47 | // Test with a single string 48 | let strings = vec!["solo"]; 49 | let vec = FsstStrVec::from_strings(&strings); 50 | 51 | assert_eq!(vec.len(), 1); 52 | assert_eq!(vec.get(0), Some("solo".to_string())); 53 | assert_eq!(vec.get(1), None); 54 | } 55 | 56 | #[test] 57 | fn test_fsst_str_vec_long_strings() { 58 | // Test with longer strings to verify compression works 59 | let strings = vec![ 60 | "This is a much longer string that should compress well with FSST", 61 | "Another long string with similar patterns and repeated words", 62 | "The third long string continues the pattern with more text", 63 | ]; 64 | let vec = FsstStrVec::from_strings(&strings); 65 | 66 | assert_eq!(vec.len(), 3); 67 | assert_eq!(vec.get(0), Some(strings[0].to_string())); 68 | assert_eq!(vec.get(1), Some(strings[1].to_string())); 69 | assert_eq!(vec.get(2), Some(strings[2].to_string())); 70 | } 71 | 72 | #[test] 73 | fn test_fsst_str_vec_unicode() { 74 | // Test with Unicode strings 75 | let strings = vec!["Hello 世界", "Rust 🦀", "Café ☕"]; 76 | let vec = FsstStrVec::from_strings(&strings); 77 | 78 | assert_eq!(vec.len(), 3); 79 | assert_eq!(vec.get(0), Some("Hello 世界".to_string())); 80 | assert_eq!(vec.get(1), Some("Rust 🦀".to_string())); 81 | assert_eq!(vec.get(2), Some("Café ☕".to_string())); 82 | } 83 | 84 | // ======================================================================== 85 | // SECTION 2: Document Structure Tests 86 | // 
======================================================================== 87 | 88 | #[test] 89 | fn test_document_creation() { 90 | // Test that we can create Document structs 91 | let doc = Document { 92 | title: "Test Document".to_string(), 93 | category: "Test".to_string(), 94 | href: "/test".to_string(), 95 | body: "This is a test document body".to_string(), 96 | keywords: Some(vec!["test".to_string(), "document".to_string()]), 97 | }; 98 | 99 | assert_eq!(doc.title, "Test Document"); 100 | assert_eq!(doc.category, "Test"); 101 | assert_eq!(doc.href, "/test"); 102 | assert_eq!(doc.body, "This is a test document body"); 103 | assert_eq!( 104 | doc.keywords, 105 | Some(vec!["test".to_string(), "document".to_string()]) 106 | ); 107 | } 108 | 109 | #[test] 110 | fn test_document_serialization() { 111 | // Test document serialization/deserialization 112 | let doc = Document { 113 | title: "Test".to_string(), 114 | category: "Category".to_string(), 115 | href: "/link".to_string(), 116 | body: "Body text".to_string(), 117 | keywords: Some(vec!["test".to_string(), "example".to_string()]), 118 | }; 119 | 120 | let serialized = serde_json::to_string(&doc).unwrap(); 121 | let deserialized: Document = serde_json::from_str(&serialized).unwrap(); 122 | 123 | assert_eq!(doc.title, deserialized.title); 124 | assert_eq!(doc.category, deserialized.category); 125 | assert_eq!(doc.href, deserialized.href); 126 | assert_eq!(doc.body, deserialized.body); 127 | assert_eq!(doc.keywords, deserialized.keywords); 128 | } 129 | 130 | // ======================================================================== 131 | // SECTION 3: Index Building Tests 132 | // ======================================================================== 133 | 134 | #[test] 135 | fn test_build_index_simple() { 136 | // Test building a simple index with a few documents 137 | 138 | let documents = vec![ 139 | Document { 140 | title: "Rust Programming".to_string(), 141 | category: "Documentation".to_string(), 142 | 
href: "/docs/rust".to_string(), 143 | body: "Learn Rust programming language".to_string(), 144 | keywords: Some(vec!["rust".to_string(), "programming".to_string()]), 145 | }, 146 | Document { 147 | title: "Python Guide".to_string(), 148 | category: "Documentation".to_string(), 149 | href: "/docs/python".to_string(), 150 | body: "Python is a versatile programming language".to_string(), 151 | keywords: Some(vec!["python".to_string(), "guide".to_string()]), 152 | }, 153 | ]; 154 | 155 | let index = build_index(documents); 156 | assert!(index.is_ok()); 157 | 158 | let index = index.unwrap(); 159 | assert_eq!(index.document_strings.len(), 8); // 4 strings per document * 2 documents 160 | } 161 | 162 | #[test] 163 | fn test_build_index_empty() { 164 | // Test building an index with no documents 165 | let documents: Vec = vec![]; 166 | let index = build_index(documents); 167 | assert!(index.is_ok()); 168 | 169 | let index = index.unwrap(); 170 | assert_eq!(index.document_strings.len(), 0); 171 | } 172 | 173 | #[test] 174 | fn test_build_index_single_document() { 175 | // Test building an index with a single document 176 | let documents = vec![Document { 177 | title: "Single Document".to_string(), 178 | category: "Test".to_string(), 179 | href: "/single".to_string(), 180 | body: "This is the only document".to_string(), 181 | keywords: Some(vec!["single".to_string(), "document".to_string()]), 182 | }]; 183 | 184 | let index = build_index(documents); 185 | assert!(index.is_ok()); 186 | 187 | let index = index.unwrap(); 188 | assert_eq!(index.document_strings.len(), 4); // title, category, href, body 189 | } 190 | 191 | #[test] 192 | fn test_build_index_duplicate_titles() { 193 | // Test with documents that have similar or duplicate titles 194 | let documents = vec![ 195 | Document { 196 | title: "Getting Started".to_string(), 197 | category: "Guide".to_string(), 198 | href: "/guide1".to_string(), 199 | body: "First guide".to_string(), 200 | keywords: 
Some(vec!["getting".to_string(), "started".to_string()]), 201 | }, 202 | Document { 203 | title: "Getting Started".to_string(), 204 | category: "Tutorial".to_string(), 205 | href: "/tutorial1".to_string(), 206 | body: "First tutorial".to_string(), 207 | keywords: Some(vec!["getting".to_string(), "started".to_string()]), 208 | }, 209 | ]; 210 | 211 | let index = build_index(documents); 212 | assert!(index.is_ok()); 213 | } 214 | 215 | // ======================================================================== 216 | // SECTION 4: Index Serialization Tests 217 | // ======================================================================== 218 | 219 | #[test] 220 | fn test_index_serialization() { 221 | // Test that we can serialize and deserialize an index 222 | 223 | let documents = vec![Document { 224 | title: "Test Document".to_string(), 225 | category: "Test".to_string(), 226 | href: "/test".to_string(), 227 | body: "This is a test document".to_string(), 228 | keywords: Some(vec!["test".to_string(), "document".to_string()]), 229 | }]; 230 | 231 | let index = build_index(documents).unwrap(); 232 | 233 | // Create a buffer to serialize to 234 | let buffer = index.to_bytes().unwrap(); 235 | assert!(!buffer.is_empty()); 236 | 237 | // Try to deserialize from the buffer 238 | let deserialized = Index::from_bytes(&buffer); 239 | assert!(deserialized.is_ok()); 240 | 241 | let deserialized_index = deserialized.unwrap(); 242 | assert_eq!( 243 | deserialized_index.document_strings.len(), 244 | index.document_strings.len() 245 | ); 246 | } 247 | 248 | #[test] 249 | fn test_index_serialization_roundtrip() { 250 | // Test that we can serialize and deserialize multiple times 251 | let documents = vec![ 252 | Document { 253 | title: "Document One".to_string(), 254 | category: "Category A".to_string(), 255 | href: "/doc1".to_string(), 256 | body: "Content for document one".to_string(), 257 | keywords: Some(vec!["document".to_string(), "one".to_string()]), 258 | }, 259 | Document { 
260 | title: "Document Two".to_string(), 261 | category: "Category B".to_string(), 262 | href: "/doc2".to_string(), 263 | body: "Content for document two".to_string(), 264 | keywords: Some(vec!["document".to_string(), "two".to_string()]), 265 | }, 266 | ]; 267 | 268 | let original_index = build_index(documents).unwrap(); 269 | 270 | // First roundtrip 271 | let buffer1 = original_index.to_bytes().unwrap(); 272 | let index1 = Index::from_bytes(&buffer1).unwrap(); 273 | 274 | // Second roundtrip 275 | let buffer2 = index1.to_bytes().unwrap(); 276 | let index2 = Index::from_bytes(&buffer2).unwrap(); 277 | 278 | // Verify the data is consistent 279 | assert_eq!( 280 | index2.document_strings.len(), 281 | original_index.document_strings.len() 282 | ); 283 | } 284 | 285 | // ======================================================================== 286 | // SECTION 5: Simple Search Tests 287 | // ======================================================================== 288 | 289 | #[test] 290 | fn test_search_single_word() { 291 | // Test searching for a single word 292 | let documents = vec![ 293 | Document { 294 | title: "Rust Programming".to_string(), 295 | category: "Documentation".to_string(), 296 | href: "/docs/rust".to_string(), 297 | body: "Learn Rust programming language".to_string(), 298 | keywords: Some(vec!["rust".to_string(), "programming".to_string()]), 299 | }, 300 | Document { 301 | title: "Python Guide".to_string(), 302 | category: "Documentation".to_string(), 303 | href: "/docs/python".to_string(), 304 | body: "Python is a versatile programming language".to_string(), 305 | keywords: Some(vec!["python".to_string(), "guide".to_string()]), 306 | }, 307 | ]; 308 | 309 | let index = build_index(documents).unwrap(); 310 | let results = search(&index, "Rust", 10).unwrap(); 311 | 312 | assert!(!results.is_empty()); 313 | assert_eq!(results[0].title, "Rust Programming"); 314 | assert_eq!(results[0].href, "/docs/rust"); 315 | } 316 | 317 | #[test] 318 | fn 
test_search_case_insensitive() { 319 | // Test that search is case-insensitive 320 | let documents = vec![Document { 321 | title: "JavaScript Tutorial".to_string(), 322 | category: "Tutorials".to_string(), 323 | href: "/tutorials/javascript".to_string(), 324 | body: "Learn JavaScript programming".to_string(), 325 | keywords: Some(vec!["javascript".to_string(), "tutorial".to_string()]), 326 | }]; 327 | 328 | let index = build_index(documents).unwrap(); 329 | 330 | let results_lower = search(&index, "javascript", 10).unwrap(); 331 | let results_upper = search(&index, "JAVASCRIPT", 10).unwrap(); 332 | let results_mixed = search(&index, "JavaScript", 10).unwrap(); 333 | 334 | assert!(!results_lower.is_empty()); 335 | assert!(!results_upper.is_empty()); 336 | assert!(!results_mixed.is_empty()); 337 | 338 | // All should find the same document 339 | assert_eq!(results_lower[0].href, "/tutorials/javascript"); 340 | assert_eq!(results_upper[0].href, "/tutorials/javascript"); 341 | assert_eq!(results_mixed[0].href, "/tutorials/javascript"); 342 | } 343 | 344 | #[test] 345 | fn test_search_no_results() { 346 | // Test searching for something that doesn't exist 347 | let documents = vec![Document { 348 | title: "Rust Programming".to_string(), 349 | category: "Documentation".to_string(), 350 | href: "/docs/rust".to_string(), 351 | body: "Learn Rust programming language".to_string(), 352 | keywords: Some(vec!["rust".to_string(), "programming".to_string()]), 353 | }]; 354 | 355 | let index = build_index(documents).unwrap(); 356 | let results = search(&index, "NonexistentKeyword", 10).unwrap(); 357 | 358 | assert!(results.is_empty()); 359 | } 360 | 361 | #[test] 362 | fn test_search_empty_query() { 363 | // Test searching with an empty query 364 | let documents = vec![Document { 365 | title: "Test Document".to_string(), 366 | category: "Test".to_string(), 367 | href: "/test".to_string(), 368 | body: "Test content".to_string(), 369 | keywords: Some(vec!["test".to_string(), 
"document".to_string()]), 370 | }]; 371 | 372 | let index = build_index(documents).unwrap(); 373 | let results = search(&index, "", 10).unwrap(); 374 | 375 | // Empty query should return no results (or possibly all results depending on implementation) 376 | // Just verify it doesn't crash 377 | assert!(results.len() <= 1); 378 | } 379 | 380 | // ======================================================================== 381 | // SECTION 6: Multi-word and Phrase Search Tests 382 | // ======================================================================== 383 | 384 | #[test] 385 | fn test_search_multiple_words() { 386 | // Test searching for multiple words 387 | let documents = vec![ 388 | Document { 389 | title: "VS Code Extensions".to_string(), 390 | category: "Documentation".to_string(), 391 | href: "/docs/extensions".to_string(), 392 | body: "Learn how to create VS Code extensions with comprehensive guides".to_string(), 393 | keywords: Some(vec![ 394 | "vs".to_string(), 395 | "code".to_string(), 396 | "extensions".to_string(), 397 | ]), 398 | }, 399 | Document { 400 | title: "VS Code Settings".to_string(), 401 | category: "Documentation".to_string(), 402 | href: "/docs/settings".to_string(), 403 | body: "Configure your VS Code settings for optimal development experience".to_string(), 404 | keywords: Some(vec![ 405 | "vs".to_string(), 406 | "code".to_string(), 407 | "settings".to_string(), 408 | ]), 409 | }, 410 | Document { 411 | title: "Python Guide".to_string(), 412 | category: "Documentation".to_string(), 413 | href: "/docs/python".to_string(), 414 | body: "Python is a versatile programming language".to_string(), 415 | keywords: Some(vec!["python".to_string(), "guide".to_string()]), 416 | }, 417 | ]; 418 | 419 | let index = build_index(documents).unwrap(); 420 | let results = search(&index, "VS Code", 10).unwrap(); 421 | 422 | // Should find both VS Code documents 423 | assert!(results.len() >= 2); 424 | assert!(results.iter().any(|d| d.href == 
"/docs/extensions")); 425 | assert!(results.iter().any(|d| d.href == "/docs/settings")); 426 | } 427 | 428 | #[test] 429 | fn test_search_partial_word_match() { 430 | // Test that partial word matches work 431 | let documents = vec![Document { 432 | title: "Debugging in VS Code".to_string(), 433 | category: "Documentation".to_string(), 434 | href: "/docs/debugging".to_string(), 435 | body: "Debug your applications with powerful debugging tools".to_string(), 436 | keywords: Some(vec![ 437 | "debugging".to_string(), 438 | "vs".to_string(), 439 | "code".to_string(), 440 | ]), 441 | }]; 442 | 443 | let index = build_index(documents).unwrap(); 444 | let results = search(&index, "debug", 10).unwrap(); 445 | 446 | // Should find documents with "debugging" and "debug" 447 | assert!(!results.is_empty()); 448 | } 449 | 450 | // ======================================================================== 451 | // SECTION 7: Ranking and Relevance Tests 452 | // ======================================================================== 453 | 454 | #[test] 455 | fn test_search_title_match_ranks_higher() { 456 | // Test that title matches rank higher than body matches 457 | let documents = vec![ 458 | Document { 459 | title: "Python Tutorial".to_string(), 460 | category: "Tutorials".to_string(), 461 | href: "/tutorials/python".to_string(), 462 | body: "Learn programming with this tutorial".to_string(), 463 | keywords: Some(vec!["python".to_string(), "tutorial".to_string()]), 464 | }, 465 | Document { 466 | title: "Getting Started".to_string(), 467 | category: "Documentation".to_string(), 468 | href: "/docs/start".to_string(), 469 | body: "This guide covers Python basics and advanced features".to_string(), 470 | keywords: Some(vec!["getting".to_string(), "started".to_string()]), 471 | }, 472 | ]; 473 | 474 | let index = build_index(documents).unwrap(); 475 | let results = search(&index, "Python", 10).unwrap(); 476 | 477 | // Document with "Python" in title should rank first 478 | 
assert!(!results.is_empty()); 479 | assert_eq!(results[0].href, "/tutorials/python"); 480 | } 481 | 482 | #[test] 483 | fn test_search_multiple_keyword_matches() { 484 | // Test that documents matching multiple keywords rank higher 485 | let documents = vec![ 486 | Document { 487 | title: "VS Code Debugging".to_string(), 488 | category: "Documentation".to_string(), 489 | href: "/docs/debugging".to_string(), 490 | body: "Debug VS Code extensions".to_string(), 491 | keywords: Some(vec![ 492 | "vs".to_string(), 493 | "code".to_string(), 494 | "debugging".to_string(), 495 | ]), 496 | }, 497 | Document { 498 | title: "VS Code Overview".to_string(), 499 | category: "Documentation".to_string(), 500 | href: "/docs/overview".to_string(), 501 | body: "Introduction to the editor".to_string(), 502 | keywords: Some(vec![ 503 | "vs".to_string(), 504 | "code".to_string(), 505 | "overview".to_string(), 506 | ]), 507 | }, 508 | Document { 509 | title: "Debugging Guide".to_string(), 510 | category: "Tutorials".to_string(), 511 | href: "/tutorials/debug".to_string(), 512 | body: "General debugging techniques".to_string(), 513 | keywords: Some(vec!["debugging".to_string(), "guide".to_string()]), 514 | }, 515 | ]; 516 | 517 | let index = build_index(documents).unwrap(); 518 | let results = search(&index, "VS Code debugging", 10).unwrap(); 519 | 520 | // Document with all three keywords should rank first 521 | assert!(!results.is_empty()); 522 | assert_eq!(results[0].href, "/docs/debugging"); 523 | } 524 | 525 | #[test] 526 | fn test_search_max_results_limit() { 527 | // Test that max_results parameter limits results correctly 528 | let documents = vec![ 529 | Document { 530 | title: "Guide One".to_string(), 531 | category: "Guides".to_string(), 532 | href: "/guide1".to_string(), 533 | body: "First guide about programming".to_string(), 534 | keywords: Some(vec!["guide".to_string(), "one".to_string()]), 535 | }, 536 | Document { 537 | title: "Guide Two".to_string(), 538 | category: 
"Guides".to_string(), 539 | href: "/guide2".to_string(), 540 | body: "Second guide about programming".to_string(), 541 | keywords: Some(vec!["guide".to_string(), "two".to_string()]), 542 | }, 543 | Document { 544 | title: "Guide Three".to_string(), 545 | category: "Guides".to_string(), 546 | href: "/guide3".to_string(), 547 | body: "Third guide about programming".to_string(), 548 | keywords: Some(vec!["guide".to_string(), "three".to_string()]), 549 | }, 550 | Document { 551 | title: "Guide Four".to_string(), 552 | category: "Guides".to_string(), 553 | href: "/guide4".to_string(), 554 | body: "Fourth guide about programming".to_string(), 555 | keywords: Some(vec!["guide".to_string(), "four".to_string()]), 556 | }, 557 | ]; 558 | 559 | let index = build_index(documents).unwrap(); 560 | 561 | let results_2 = search(&index, "guide", 2).unwrap(); 562 | let results_3 = search(&index, "guide", 3).unwrap(); 563 | let results_10 = search(&index, "guide", 10).unwrap(); 564 | 565 | assert!(results_2.len() <= 2); 566 | assert!(results_3.len() <= 3); 567 | assert!(results_10.len() <= 10); 568 | } 569 | 570 | // ======================================================================== 571 | // SECTION 8: Complex Search Query Tests 572 | // ======================================================================== 573 | 574 | #[test] 575 | fn test_search_technical_terms() { 576 | // Test searching for technical terms and acronyms 577 | let documents = vec![ 578 | Document { 579 | title: "TypeScript Configuration".to_string(), 580 | category: "Documentation".to_string(), 581 | href: "/docs/typescript".to_string(), 582 | body: "Configure TypeScript with tsconfig.json for your project".to_string(), 583 | keywords: Some(vec!["typescript".to_string(), "configuration".to_string()]), 584 | }, 585 | Document { 586 | title: "JavaScript Basics".to_string(), 587 | category: "Tutorials".to_string(), 588 | href: "/tutorials/javascript".to_string(), 589 | body: "Learn JavaScript 
fundamentals".to_string(), 590 | keywords: Some(vec!["javascript".to_string(), "basics".to_string()]), 591 | }, 592 | Document { 593 | title: "Language Support".to_string(), 594 | category: "Documentation".to_string(), 595 | href: "/docs/languages".to_string(), 596 | body: "VS Code supports TypeScript, JavaScript, and many other languages".to_string(), 597 | keywords: Some(vec!["language".to_string(), "support".to_string()]), 598 | }, 599 | ]; 600 | 601 | let index = build_index(documents).unwrap(); 602 | let results = search(&index, "TypeScript", 10).unwrap(); 603 | 604 | assert!(!results.is_empty()); 605 | assert!(results.iter().any(|d| d.href == "/docs/typescript")); 606 | } 607 | 608 | #[test] 609 | fn test_search_with_special_characters() { 610 | // Test searching with special characters 611 | let documents = vec![ 612 | Document { 613 | title: "C++ Programming".to_string(), 614 | category: "Documentation".to_string(), 615 | href: "/docs/cpp".to_string(), 616 | body: "Learn C++ programming language".to_string(), 617 | keywords: Some(vec!["c++".to_string(), "programming".to_string()]), 618 | }, 619 | Document { 620 | title: "C# Guide".to_string(), 621 | category: "Documentation".to_string(), 622 | href: "/docs/csharp".to_string(), 623 | body: "C# development with .NET".to_string(), 624 | keywords: Some(vec!["c#".to_string(), "guide".to_string()]), 625 | }, 626 | ]; 627 | 628 | let index = build_index(documents).unwrap(); 629 | let results_cpp = search(&index, "C++", 10); 630 | let results_csharp = search(&index, "C#", 10); 631 | 632 | // Should handle special characters gracefully 633 | assert!(results_cpp.is_ok() as bool); 634 | assert!(results_csharp.is_ok() as bool); 635 | } 636 | 637 | #[test] 638 | fn test_search_compound_keywords() { 639 | // Test searching for compound keywords and multi-word phrases 640 | let documents = vec![ 641 | Document { 642 | title: "Remote Development Setup".to_string(), 643 | category: "Tutorials".to_string(), 644 | href: 
"/tutorials/remote-dev".to_string(), 645 | body: "Set up remote development environment for distributed teams".to_string(), 646 | keywords: Some(vec![ 647 | "remote".to_string(), 648 | "development".to_string(), 649 | "setup".to_string(), 650 | ]), 651 | }, 652 | Document { 653 | title: "Development Environment".to_string(), 654 | category: "Documentation".to_string(), 655 | href: "/docs/environment".to_string(), 656 | body: "Configure your local development environment".to_string(), 657 | keywords: Some(vec!["development".to_string(), "environment".to_string()]), 658 | }, 659 | Document { 660 | title: "Remote Connections".to_string(), 661 | category: "Documentation".to_string(), 662 | href: "/docs/remote".to_string(), 663 | body: "Connect to remote servers and containers".to_string(), 664 | keywords: Some(vec!["remote".to_string(), "connections".to_string()]), 665 | }, 666 | ]; 667 | 668 | let index = build_index(documents).unwrap(); 669 | let results = search(&index, "remote development", 10).unwrap(); 670 | 671 | // Should find the document that has both keywords together 672 | assert!(!results.is_empty()); 673 | assert_eq!(results[0].href, "/tutorials/remote-dev"); 674 | } 675 | 676 | #[test] 677 | fn test_search_with_stopwords() { 678 | // Test that common stop words don't interfere with search 679 | let documents = vec![Document { 680 | title: "Getting Started with VS Code".to_string(), 681 | category: "Tutorials".to_string(), 682 | href: "/tutorials/start".to_string(), 683 | body: "This is a guide to help you get started with the editor".to_string(), 684 | keywords: Some(vec![ 685 | "getting".to_string(), 686 | "started".to_string(), 687 | "vs".to_string(), 688 | "code".to_string(), 689 | ]), 690 | }]; 691 | 692 | let index = build_index(documents).unwrap(); 693 | let results = search(&index, "getting started with vscode", 10).unwrap(); 694 | 695 | // Should find results despite stop words like "with", "the", "a" 696 | assert!(!results.is_empty()); 697 | } 
698 | 699 | #[test] 700 | fn test_search_with_numbers() { 701 | // Test searching with version numbers and numeric values 702 | let documents = vec![ 703 | Document { 704 | title: "Node.js 18 Features".to_string(), 705 | category: "Updates".to_string(), 706 | href: "/updates/nodejs18".to_string(), 707 | body: "New features in Node.js version 18 release".to_string(), 708 | keywords: Some(vec![ 709 | "node.js".to_string(), 710 | "18".to_string(), 711 | "features".to_string(), 712 | ]), 713 | }, 714 | Document { 715 | title: "Node.js 16 Support".to_string(), 716 | category: "Updates".to_string(), 717 | href: "/updates/nodejs16".to_string(), 718 | body: "Long-term support for Node.js 16".to_string(), 719 | keywords: Some(vec![ 720 | "node.js".to_string(), 721 | "16".to_string(), 722 | "support".to_string(), 723 | ]), 724 | }, 725 | ]; 726 | 727 | let index = build_index(documents).unwrap(); 728 | let results = search(&index, "nodejs 18", 10).unwrap(); 729 | 730 | assert!(!results.is_empty()); 731 | // Should find the Node.js 18 document 732 | assert!(results.iter().any(|d| d.href.contains("nodejs18"))); 733 | } 734 | 735 | #[test] 736 | fn test_search_long_query() { 737 | // Test with a longer, more natural language query 738 | let documents = vec![ 739 | Document { 740 | title: "Remote SSH Extension".to_string(), 741 | category: "Extensions".to_string(), 742 | href: "/extensions/remote-ssh".to_string(), 743 | body: "Connect to remote servers via SSH and develop directly on remote machines" 744 | .to_string(), 745 | keywords: Some(vec![ 746 | "remote".to_string(), 747 | "ssh".to_string(), 748 | "extension".to_string(), 749 | ]), 750 | }, 751 | Document { 752 | title: "SSH Key Setup".to_string(), 753 | category: "Documentation".to_string(), 754 | href: "/docs/ssh-keys".to_string(), 755 | body: "Configure SSH keys for secure remote connections".to_string(), 756 | keywords: Some(vec![ 757 | "ssh".to_string(), 758 | "key".to_string(), 759 | "setup".to_string(), 760 | ]), 
761 | }, 762 | ]; 763 | 764 | let index = build_index(documents).unwrap(); 765 | let results = search(&index, "how do i connect to a remote server using ssh", 10).unwrap(); 766 | 767 | // Should extract relevant keywords and find documents 768 | assert!(!results.is_empty()); 769 | } 770 | 771 | // ======================================================================== 772 | // SECTION 9: Edge Cases and Stress Tests 773 | // ======================================================================== 774 | 775 | #[test] 776 | fn test_search_many_documents() { 777 | // Test with a larger number of documents 778 | let mut documents = Vec::new(); 779 | for i in 0..100 { 780 | documents.push(Document { 781 | title: format!("Document {}", i).to_string(), 782 | category: format!("Category {}", i % 10).to_string(), 783 | href: format!("/doc{}", i).to_string(), 784 | body: format!("This is document number {} with some content", i).to_string(), 785 | keywords: Some(vec![format!("document{}", i).to_string()]), 786 | }); 787 | } 788 | 789 | // Add a special document to search for 790 | documents.push(Document { 791 | title: "Special Search Target".to_string(), 792 | category: "Test".to_string(), 793 | href: "/special".to_string(), 794 | body: "This document should be easy to find".to_string(), 795 | keywords: Some(vec!["special".to_string(), "target".to_string()]), 796 | }); 797 | 798 | let index = build_index(documents).unwrap(); 799 | let results = search(&index, "special target", 10).unwrap(); 800 | 801 | assert!(!results.is_empty()); 802 | assert_eq!(results[0].href, "/special"); 803 | } 804 | 805 | #[test] 806 | fn test_search_empty_fields() { 807 | // Test with documents that have empty fields 808 | let documents = vec![ 809 | Document { 810 | title: "".to_string(), 811 | category: "Empty Title".to_string(), 812 | href: "/empty1".to_string(), 813 | body: "This document has no title".to_string(), 814 | keywords: Some(vec!["empty".to_string()]), 815 | }, 816 | Document { 817 
| title: "Empty Body".to_string(), 818 | category: "Test".to_string(), 819 | href: "/empty2".to_string(), 820 | body: "".to_string(), 821 | keywords: Some(vec!["empty".to_string(), "body".to_string()]), 822 | }, 823 | ]; 824 | 825 | let index = build_index(documents); 826 | assert!(index.is_ok()); 827 | 828 | let results = search(&index.unwrap(), "empty", 10).unwrap(); 829 | // Should handle empty fields gracefully 830 | assert!(!results.is_empty()); 831 | } 832 | 833 | #[test] 834 | fn test_search_whitespace_handling() { 835 | // Test that extra whitespace doesn't break search 836 | let documents = vec![Document { 837 | title: "Whitespace Test".to_string(), 838 | category: "Test".to_string(), 839 | href: "/whitespace".to_string(), 840 | body: "Multiple spaces between words".to_string(), 841 | keywords: Some(vec!["whitespace".to_string(), "test".to_string()]), 842 | }]; 843 | 844 | let index = build_index(documents).unwrap(); 845 | let results = search(&index, " whitespace test ", 10).unwrap(); 846 | 847 | assert!(!results.is_empty()); 848 | } 849 | 850 | #[test] 851 | fn test_search_with_typo() -> Result<(), Box> { 852 | let document_strings = FsstStrVec::from_strings(&vec![ 853 | "Document 1", 854 | "Docs", 855 | "/doc1", 856 | "This is the first document.", 857 | "Document 2", 858 | "Docs", 859 | "/doc2", 860 | "This is the second document.", 861 | "Document 3", 862 | "Docs", 863 | "/doc3", 864 | "This is the third document.", 865 | ]); 866 | 867 | let keyword_to_documents: Vec> = vec![ 868 | vec![(1, 1)], // "language" appears in doc 1 869 | vec![(0, 10), (2, 4)], // "programming" appears in doc 0 and 2 870 | vec![(0, 5), (1, 3)], // "rust" appears in doc 0 and 1 871 | ]; 872 | 873 | let mut fst_builder = fst::MapBuilder::memory(); 874 | fst_builder.insert("language", 0).unwrap(); 875 | fst_builder.insert("programming", 1).unwrap(); 876 | fst_builder.insert("rust", 2).unwrap(); 877 | let fst = fst_builder.into_inner()?; 878 | 879 | let index = Index { 880 | 
fst, 881 | document_strings, 882 | keyword_to_documents, 883 | }; 884 | 885 | let results = search(&index, "lamguage", 10)?; 886 | assert_eq!(results.len(), 1, "Expected 1 result for 'lamguage'"); 887 | 888 | Ok(()) 889 | } 890 | } 891 | --------------------------------------------------------------------------------