├── rustfmt.toml
├── Cargo.toml
├── .vscode
    └── settings.json
├── static
    ├── docfind.js
    ├── docfind_bg.wasm
    ├── documents.json
    ├── docfind_bg.wasm.br
    ├── install.sh
    ├── install.ps1
    └── index.html
├── .gitignore
├── wasm
    ├── Cargo.toml
    ├── index.js
    └── src
    │   └── lib.rs
├── .gitattributes
├── cli
    ├── Cargo.toml
    └── src
    │   └── main.rs
├── scripts
    ├── build.sh
    ├── build-demo.sh
    └── version.sh
├── core
    ├── Cargo.toml
    ├── english.stop
    └── src
    │   ├── lib.rs
    │   └── tests.rs
├── .github
    └── workflows
    │   ├── copilot-setup-steps.yml
    │   ├── static.yml
    │   └── ci.yml
├── SECURITY.md
├── LICENSE
├── README.md
└── Cargo.lock


/rustfmt.toml:
--------------------------------------------------------------------------------
1 | hard_tabs = true
2 | tab_spaces = 2


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [workspace]
2 | members = ["cli", "wasm"]
3 | resolver = "2"
4 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |   "git.branchProtection": [
3 |     "main"
4 |   ]
5 | }


--------------------------------------------------------------------------------
/static/docfind.js:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9521cfa6406ac37f8b768220abedeb6e8cbf9f0867b56fbdfd7033170b3c778f
3 | size 3497
4 | 


--------------------------------------------------------------------------------
/static/docfind_bg.wasm:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:216664e1c84152d632f03915008ac7017d3d5ee61275c0bacbaaed0e57af0af4
3 | size 11484322
4 | 


--------------------------------------------------------------------------------
/static/documents.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f5c1094a3b9865f49cb56262f91dd4ee2f57c5ac960fbefbeaaa97a1a53f5c9d
3 | size 17145937
4 | 


--------------------------------------------------------------------------------
/static/docfind_bg.wasm.br:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7794cad68dbf03a4acd903b52cd1e05d025b243e384ade21f5ac7db405417570
3 | size 5197055
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | pkg
3 | # Temporary dataset files in scripts directory
4 | scripts/train.csv
5 | scripts/test.csv
6 | scripts/documents.json
7 | # Python cache
8 | __pycache__/
9 | *.pyc


--------------------------------------------------------------------------------
/wasm/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "docfind-wasm"
 3 | version = "0.5.1"
 4 | edition = "2024"
 5 | 
 6 | [lib]
 7 | crate-type = ["cdylib", "rlib"]
 8 | 
 9 | [dependencies]
10 | docfind_core = { path = "../core", features = ["wasm"] }
11 | wasm-bindgen = "0.2"
12 | serde-wasm-bindgen = "0.6"
13 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Large demo files tracked with Git LFS
2 | static/docfind.js filter=lfs diff=lfs merge=lfs -text
3 | static/docfind_bg.wasm filter=lfs diff=lfs merge=lfs -text
4 | static/docfind_bg.wasm.br filter=lfs diff=lfs merge=lfs -text
5 | static/documents.json filter=lfs diff=lfs merge=lfs -text
6 | 


--------------------------------------------------------------------------------
/cli/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "docfind"
 3 | version = "0.5.1"
 4 | edition = "2024"
 5 | 
 6 | [dependencies]
 7 | docfind_core = { path = "../core", features = ["cli"] }
 8 | serde_json = "1.0.145"
 9 | wasm-encoder = { version = "0.240.0", features = ["wasmparser"] }
10 | wasmparser = "0.240.0"
11 | 


--------------------------------------------------------------------------------
/scripts/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | # Builds, in order: the wasm package, the bundled JS wrapper, and the release CLI. All paths are relative, so run this from the repository root.
 4 | # Build the `wasm` crate in release mode for the web target; --out-name makes the generated files docfind.*
 5 | wasm-pack build wasm --out-name docfind --release --target web
 6 | 
 7 | # Bundle and minify wasm/index.js as an ES module. index.js imports ./pkg/docfind.js, which is also the output path, hence --allow-overwrite
 8 | npx --yes esbuild --bundle wasm/index.js --format=esm --minify --outfile=wasm/pkg/docfind.js --allow-overwrite
 9 | 
10 | # Finally build the `docfind` CLI package in release mode (presumably it consumes the artifacts built above; confirm in cli/src)
11 | cargo build --release -p docfind
12 | 


--------------------------------------------------------------------------------
/wasm/index.js:
--------------------------------------------------------------------------------
 1 | import _init, { search as _search } from './pkg/docfind.js';
 2 | 
 3 | let didInit = false;
 4 | 
 5 | export function init() {
 6 |   return _init();
 7 | }
 8 | 
 9 | export default async function search(needle, maxResults) {
10 |   if (!didInit) {
11 |     await _init();
12 |     didInit = true;
13 |   }
14 |   return _search(needle, maxResults);
15 | }


--------------------------------------------------------------------------------
/core/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "docfind_core"
 3 | version = "0.5.1"
 4 | edition = "2024"
 5 | 
 6 | [dependencies]
 7 | fst = { version = "0.4", features = ["levenshtein"] }
 8 | serde = { version = "1.0.228", features = ["derive"] }
 9 | postcard = { version = "1.1.3", features = ["alloc", "use-std"] }
10 | fsst-rs = "0.5.4"
11 | rake = { version = "0.3", optional = true }
12 | 
13 | [dev-dependencies]
14 | serde_json = "1.0.145"
15 | 
16 | [features]
17 | cli = ["rake"]
18 | wasm = []
19 | 
20 | [dev-dependencies.rake]
21 | version = "0.3"
22 | 


--------------------------------------------------------------------------------
/.github/workflows/copilot-setup-steps.yml:
--------------------------------------------------------------------------------
 1 | name: "Copilot Setup Steps"
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |   push:
 6 |     paths:
 7 |       - .github/workflows/copilot-setup-steps.yml
 8 |   pull_request:
 9 |     paths:
10 |       - .github/workflows/copilot-setup-steps.yml
11 | 
12 | jobs:
13 |   copilot-setup-steps:
14 |     runs-on: ubuntu-latest
15 | 
16 |     permissions:
17 |       contents: read
18 | 
19 |     steps:
20 |       - uses: actions/checkout@v5
21 | 
22 |       - name: Install wasm-pack
23 |         run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
24 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V1.0.0 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which
 6 | includes all source code repositories in our GitHub organizations.
 7 | 
 8 | **Please do not report security vulnerabilities through public GitHub issues.**
 9 | 
10 | For security reporting information, locations, contact information, and policies,
11 | please review the latest guidance for Microsoft repositories at
12 | [https://aka.ms/SECURITY.md](https://aka.ms/SECURITY.md).
13 | 
14 | <!-- END MICROSOFT SECURITY.MD BLOCK -->


--------------------------------------------------------------------------------
/scripts/build-demo.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | # Regenerates the demo assets under static/ from static/documents.json. All paths are relative, so run this from the repository root.
 4 | echo "Building demo WASM files from static/documents.json..."
 5 | 
 6 | # Build the CLI only when no release binary exists yet; an existing (possibly stale) binary is reused as-is
 7 | if [ ! -f "target/release/docfind" ]; then
 8 |     echo "Building docfind CLI..."
 9 |     ./scripts/build.sh
10 | fi
11 | 
12 | # Run the CLI on documents.json with static/ as its second argument; it is expected to produce the files listed at the end of this script
13 | echo "Generating WASM files..."
14 | ./target/release/docfind static/documents.json static/
15 | 
16 | # Compress the WASM with Brotli: -k keeps the uncompressed .wasm, -f overwrites an existing .br
17 | echo "Compressing WASM with Brotli..."
18 | brotli -k -f static/docfind_bg.wasm
19 | # Report the generated artifacts and their sizes
20 | echo "Demo build completed successfully!"
21 | echo ""
22 | echo "Generated files:"
23 | ls -lh static/docfind.js static/docfind_bg.wasm static/docfind_bg.wasm.br
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | 
 2 | Copyright (c) Microsoft Corporation
 3 | 
 4 | All rights reserved. 
 5 | 
 6 | MIT License
 7 | 
 8 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 
 9 | files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
10 | modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
11 | is furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
17 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
18 | OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 | 


--------------------------------------------------------------------------------
/wasm/src/lib.rs:
--------------------------------------------------------------------------------
 1 | use docfind_core::Index;
 2 | use std::sync::OnceLock;
 3 | use wasm_bindgen::prelude::*;
 4 | // Import of the JavaScript `console.log` function. NOTE(review): nothing in this file calls it.
 5 | #[wasm_bindgen]
 6 | extern "C" {
 7 | 	#[wasm_bindgen(js_namespace = console)]
 8 | 	fn log(msg: &str);
 9 | }
10 | /// Address `search` reads the serialized index from. `0xdead_beef` is a placeholder; presumably overwritten when an index is embedded (TODO confirm where).
11 | #[unsafe(no_mangle)]
12 | pub static mut INDEX_BASE: u32 = 0xdead_beef;
13 | /// Length in bytes of the serialized index, read together with `INDEX_BASE`. Uses the same placeholder value.
14 | #[unsafe(no_mangle)]
15 | pub static mut INDEX_LEN: u32 = 0xdead_beef;
16 | /// Deserialized index, built on the first `search` call and reused by later calls.
17 | static INDEX: OnceLock<Index> = OnceLock::new();
18 | /// Searches the embedded index for `query`; `max_results` defaults to 10 when `None`.
19 | /// On first use the index is deserialized from `INDEX_LEN` bytes at `INDEX_BASE` and cached in `INDEX`; a failed deserialization panics.
20 | /// Returns the search results converted to a JS value, or `Err` with a message string if the search or the conversion fails.
21 | #[wasm_bindgen]
22 | pub fn search(query: &str, max_results: Option<usize>) -> Result<JsValue, JsValue> {
23 | 	let index = INDEX.get_or_init(|| { // closure runs only while INDEX is still empty
24 | 		let raw_index = // NOTE(review): sound only if INDEX_BASE/INDEX_LEN describe readable memory; nothing here checks that the 0xdead_beef placeholders were replaced
25 | 			unsafe { std::slice::from_raw_parts(INDEX_BASE as *const u8, INDEX_LEN as usize) };
26 | 		Index::from_bytes(raw_index).expect("Failed to deserialize index")
27 | 	});
28 | 	// Search errors are stringified into a JsValue and returned to the caller.
29 | 	let result = docfind_core::search(index, query, max_results.unwrap_or(10))
30 | 		.map_err(|e| JsValue::from_str(&format!("Search failed: {}", e)))?;
31 | 	// Serialize the result for JavaScript; conversion errors are reported the same way.
32 | 	serde_wasm_bindgen::to_value(&result)
33 | 		.map_err(|e| JsValue::from_str(&format!("Failed to convert results to JS: {}", e)))
34 | }
35 | 


--------------------------------------------------------------------------------
/.github/workflows/static.yml:
--------------------------------------------------------------------------------
 1 | # Workflow for building and deploying the docfind example to GitHub Pages
 2 | name: Build and deploy example to Pages
 3 | 
 4 | on:
 5 |   # Runs on pushes targeting the default branch
 6 |   push:
 7 |     branches: ["main"]
 8 | 
 9 |   # Allows you to run this workflow manually from the Actions tab
10 |   workflow_dispatch:
11 | 
12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
13 | permissions:
14 |   contents: read
15 |   pages: write
16 |   id-token: write
17 | 
18 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
19 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
20 | concurrency:
21 |   group: "pages"
22 |   cancel-in-progress: false
23 | 
24 | jobs:
25 |   build-and-deploy:
26 |     environment:
27 |       name: github-pages
28 |       url: ${{ steps.deployment.outputs.page_url }}
29 |     runs-on: ubuntu-latest
30 |     steps:
31 |       - name: Checkout
32 |         uses: actions/checkout@v4
33 |         with:
34 |           lfs: true
35 | 
36 |       - name: Setup Pages
37 |         uses: actions/configure-pages@v5
38 | 
39 |       - name: Upload artifact
40 |         uses: actions/upload-pages-artifact@v3
41 |         with:
42 |           path: 'static'
43 | 
44 |       - name: Deploy to GitHub Pages
45 |         id: deployment
46 |         uses: actions/deploy-pages@v4
47 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
  1 | name: CI
  2 | 
  3 | on:
  4 |   push:
  5 |     branches: [ main ]
  6 |     tags:
  7 |       - 'v*.*.*'
  8 |   pull_request:
  9 |     branches: [ main ]
 10 | 
 11 | # Cancel running builds if new commits are pushed to a PR
 12 | concurrency:
 13 |   group: ${{ github.workflow }}-${{ github.ref }}
 14 |   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 15 | 
 16 | env:
 17 |   CARGO_TERM_COLOR: always
 18 | 
 19 | jobs:
 20 |   test:
 21 |     runs-on: ubuntu-latest
 22 |     
 23 |     steps:
 24 |     - uses: actions/checkout@v4
 25 | 
 26 |     - name: Test
 27 |       run: cargo test
 28 |       working-directory: core
 29 | 
 30 |   build:
 31 |     needs: test
 32 |     strategy:
 33 |       matrix:
 34 |         include:
 35 |           - os: windows-latest
 36 |             target: x86_64-pc-windows-msvc
 37 |           - os: macos-latest
 38 |             target: x86_64-apple-darwin
 39 |           - os: macos-latest
 40 |             target: aarch64-apple-darwin
 41 |           - os: ubuntu-latest
 42 |             target: x86_64-unknown-linux-gnu
 43 |           - os: ubuntu-latest
 44 |             target: aarch64-unknown-linux-gnu
 45 |           - os: ubuntu-latest
 46 |             target: x86_64-unknown-linux-musl
 47 |           - os: ubuntu-latest
 48 |             target: aarch64-unknown-linux-musl
 49 |     
 50 |     runs-on: ${{ matrix.os }}
 51 |     
 52 |     steps:
 53 |     - uses: actions/checkout@v4
 54 |     
 55 |     - name: Install Rust target
 56 |       run: rustup target add ${{ matrix.target }}
 57 |     
 58 |     - name: Install cross-compilation tools (Linux ARM64)
 59 |       if: matrix.target == 'aarch64-unknown-linux-gnu' || matrix.target == 'aarch64-unknown-linux-musl'
 60 |       run: |
 61 |         sudo apt-get update
 62 |         sudo apt-get install -y gcc-aarch64-linux-gnu
 63 |     
 64 |     - name: Install musl tools
 65 |       if: contains(matrix.target, 'musl')
 66 |       run: |
 67 |         sudo apt-get update
 68 |         sudo apt-get install -y musl-tools
 69 |     
 70 |     - name: Install wasm-pack
 71 |       if: runner.os != 'Windows'
 72 |       run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
 73 |     
 74 |     - name: Install wasm-pack (Windows)
 75 |       if: runner.os == 'Windows'
 76 |       run: npm install -g wasm-pack
 77 |       shell: pwsh
 78 |     
 79 |     - name: Build WASM template
 80 |       run: wasm-pack build wasm --out-name docfind --release --target web
 81 |     
 82 |     - name: Minify JavaScript
 83 |       run: npx --yes esbuild --bundle wasm/index.js --format=esm --minify --outfile=wasm/pkg/docfind.js --allow-overwrite
 84 |     
 85 |     - name: Build CLI
 86 |       run: cargo build --release -p docfind --target ${{ matrix.target }}
 87 |       env:
 88 |         CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER: aarch64-linux-gnu-gcc
 89 |         CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_LINKER: aarch64-linux-gnu-gcc
 90 |     
 91 |     - name: Package binaries (Unix)
 92 |       if: runner.os != 'Windows'
 93 |       run: |
 94 |         cd target/${{ matrix.target }}/release
 95 |         tar czf docfind-${{ matrix.target }}.tar.gz docfind
 96 |         mv docfind-${{ matrix.target }}.tar.gz ${{ github.workspace }}
 97 |     
 98 |     - name: Package binaries (Windows)
 99 |       if: runner.os == 'Windows'
100 |       run: |
101 |         cd target/${{ matrix.target }}/release
102 |         7z a docfind-${{ matrix.target }}.zip docfind.exe
103 |         mv docfind-${{ matrix.target }}.zip ${{ github.workspace }}
104 |       shell: pwsh
105 |     
106 |     - name: Upload artifacts
107 |       uses: actions/upload-artifact@v4
108 |       with:
109 |         name: docfind-${{ matrix.target }}
110 |         path: |
111 |           docfind-${{ matrix.target }}.tar.gz
112 |           docfind-${{ matrix.target }}.zip
113 |         if-no-files-found: ignore
114 | 
115 |   release:
116 |     if: startsWith(github.ref, 'refs/tags/')
117 |     needs: build
118 |     runs-on: ubuntu-latest
119 |     permissions:
120 |       contents: write
121 |     
122 |     steps:
123 |     - name: Download all artifacts
124 |       uses: actions/download-artifact@v4
125 |       with:
126 |         path: artifacts
127 |     
128 |     - name: Create Release
129 |       uses: softprops/action-gh-release@v2
130 |       with:
131 |         files: artifacts/**/*
132 |         draft: false
133 |         prerelease: false
134 |         generate_release_notes: true
135 | 


--------------------------------------------------------------------------------
/scripts/version.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | set -euo pipefail
  3 | # Exit on errors, unset variables, and failures anywhere in a pipeline.
  4 | # Version bumping script for the Cargo workspace: sets one shared version in the cli, core and wasm Cargo.toml files
  5 | # Usage: ./scripts/version.sh [major|minor|patch|<version>]
  6 | # Resolve paths from this script's own location so it works from any current directory
  7 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  8 | PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
  9 | 
 10 | # ANSI color codes for output; printed via `echo -e`, which interprets the \033 escapes
 11 | RED='\033[0;31m'
 12 | GREEN='\033[0;32m'
 13 | YELLOW='\033[1;33m'
 14 | NC='\033[0m' # No Color
 15 | 
 16 | # Print usage help and examples to stdout, then exit the whole script with status 1
 17 | usage() {
 18 |     echo "Usage: $0 [major|minor|patch|<version>]"
 19 |     echo ""
 20 |     echo "Examples:"
 21 |     echo "  $0 patch       # 0.2.0 -> 0.2.1"
 22 |     echo "  $0 minor       # 0.2.0 -> 0.3.0"
 23 |     echo "  $0 major       # 0.2.0 -> 1.0.0"
 24 |     echo "  $0 1.5.2       # Set specific version"
 25 |     exit 1
 26 | }
 27 | 
 28 | # Print the quoted value of the first line in file $1 that starts with `version = `. Matching is line-based and ignores TOML sections.
 29 | get_version() {
 30 |     local file="$1"
 31 |     grep '^version = ' "$file" | head -1 | sed 's/version = "\(.*\)"/\1/'  # with `set -o pipefail`, a grep with no match makes this pipeline fail
 32 | }
 33 | 
 34 | # Return 0 when $1 is exactly three dot-separated runs of digits (X.Y.Z), 1 otherwise; pre-release and build suffixes are rejected
 35 | is_valid_version() {
 36 |     local version="$1"
 37 |     if [[ $version =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
 38 |         return 0
 39 |     else
 40 |         return 1
 41 |     fi
 42 | }
 43 | 
 44 | # Print the version that results from applying $2 (major|minor|patch|X.Y.Z) to current version $1; on an invalid $2, print an error to stderr and exit 1
 45 | bump_version() {
 46 |     local current="$1"
 47 |     local bump_type="$2"
 48 |     # Split the current version on dots. `current` is not validated here, and major/minor/patch are not declared local.
 49 |     IFS='.' read -r major minor patch <<< "$current"
 50 |     # A keyword bumps one component and resets the lower ones; any other argument must itself be a full X.Y.Z version.
 51 |     case "$bump_type" in
 52 |         major)
 53 |             echo "$((major + 1)).0.0"
 54 |             ;;
 55 |         minor)
 56 |             echo "${major}.$((minor + 1)).0"
 57 |             ;;
 58 |         patch)
 59 |             echo "${major}.${minor}.$((patch + 1))"
 60 |             ;;
 61 |         *)
 62 |             if is_valid_version "$bump_type"; then
 63 |                 echo "$bump_type"
 64 |             else
 65 |                 echo -e "${RED}Error: Invalid version format: $bump_type${NC}" >&2
 66 |                 echo -e "${YELLOW}Version must be in format: X.Y.Z${NC}" >&2
 67 |                 exit 1
 68 |             fi
 69 |             ;;
 70 |     esac
 71 | }
 72 | 
 73 | # Replace the first `version = "..."` line of Cargo.toml file $1 with version $2, then print which file was updated
 74 | update_cargo_toml() {
 75 |     local file="$1"
 76 |     local new_version="$2"
 77 |     
 78 |     # Rewrite only the first matching line so later `version = ` lines in the file stay untouched. macOS uses awk via a temp file.
 79 |     if [[ "$OSTYPE" == "darwin"* ]]; then
 80 |         awk -v new_ver="$new_version" '/^version = / && !done { sub(/^version = ".*"/, "version = \"" new_ver "\""); done=1 } 1' "$file" > "$file.tmp" && mv "$file.tmp" "$file"
 81 |     else
 82 |         # GNU sed supports the 0,/pattern/ address form, which limits the substitution to the first match
 83 |         sed -i "0,/^version = \".*\"/s//version = \"$new_version\"/" "$file"
 84 |     fi
 85 |     
 86 |     echo -e "${GREEN}✓${NC} Updated $(basename "$(dirname "$file")")/$(basename "$file")"
 87 | }
 88 | 
 89 | # Entry point: compute the new version from $1, ask for confirmation, rewrite the three crate manifests, and print suggested git commands
 90 | main() {
 91 |     if [ $# -eq 0 ]; then
 92 |         usage
 93 |     fi
 94 |     
 95 |     local bump_type="$1"
 96 |     
 97 |     # Fixed list of crate manifests to update (hard-coded, not discovered by scanning the workspace)
 98 |     CARGO_FILES=(
 99 |         "$PROJECT_ROOT/cli/Cargo.toml"
100 |         "$PROJECT_ROOT/core/Cargo.toml"
101 |         "$PROJECT_ROOT/wasm/Cargo.toml"
102 |     )
103 |     
104 |     # Get current version from the first file (cli); the other manifests are not read, only overwritten
105 |     CURRENT_VERSION=$(get_version "${CARGO_FILES[0]}")
106 |     # NOTE(review): under `set -euo pipefail` a failing grep inside get_version may abort at the assignment above, before this check runs (confirm)
107 |     if [ -z "$CURRENT_VERSION" ]; then
108 |         echo -e "${RED}Error: Could not determine current version${NC}"
109 |         exit 1
110 |     fi
111 |     
112 |     echo -e "Current version: ${YELLOW}$CURRENT_VERSION${NC}"
113 |     
114 |     # Calculate new version; bump_version exits non-zero on invalid input, which stops the script here
115 |     NEW_VERSION=$(bump_version "$CURRENT_VERSION" "$bump_type")
116 |     
117 |     echo -e "New version:     ${GREEN}$NEW_VERSION${NC}"
118 |     echo ""
119 |     
120 |     # Confirm with user: reads a single key press; anything other than y/Y aborts with exit status 0
121 |     read -p "Update version to $NEW_VERSION? (y/N): " -n 1 -r
122 |     echo
123 |     if [[ ! $REPLY =~ ^[Yy]$ ]]; then
124 |         echo "Aborted"
125 |         exit 0
126 |     fi
127 |     
128 |     # Update all listed Cargo.toml files; a missing file only produces a warning
129 |     echo ""
130 |     echo "Updating Cargo.toml files..."
131 |     for file in "${CARGO_FILES[@]}"; do
132 |         if [ -f "$file" ]; then
133 |             update_cargo_toml "$file" "$NEW_VERSION"
134 |         else
135 |             echo -e "${YELLOW}Warning: File not found: $file${NC}"
136 |         fi
137 |     done
138 |     
139 |     echo ""
140 |     echo -e "${GREEN}Version updated successfully!${NC}"
141 |     echo ""
142 |     # Nothing is committed or tagged by this script; the commands below are only printed. NOTE(review): Cargo.lock is not part of the suggested `git add` (confirm whether it should be).
143 |     echo ""
144 |     echo -e "${YELLOW}Changes not committed. You can review and commit manually:${NC}"
145 |     echo "  git add cli/Cargo.toml core/Cargo.toml wasm/Cargo.toml"
146 |     echo "  git commit -m 'Bump version to $NEW_VERSION'"
147 |     echo "  git tag -a v$NEW_VERSION -m 'Release version $NEW_VERSION'"
148 |     echo "  git push && git push --tags"
149 | }
150 | 
151 | main "$@"
152 | 


--------------------------------------------------------------------------------
/core/english.stop:
--------------------------------------------------------------------------------
  1 | #stop word list from SMART (Salton,1971).  Available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop
  2 | a
  3 | a's
  4 | able
  5 | about
  6 | above
  7 | according
  8 | accordingly
  9 | across
 10 | actually
 11 | after
 12 | afterwards
 13 | again
 14 | against
 15 | ain't
 16 | all
 17 | allow
 18 | allows
 19 | almost
 20 | alone
 21 | along
 22 | already
 23 | also
 24 | although
 25 | always
 26 | am
 27 | among
 28 | amongst
 29 | an
 30 | and
 31 | another
 32 | any
 33 | anybody
 34 | anyhow
 35 | anyone
 36 | anything
 37 | anyway
 38 | anyways
 39 | anywhere
 40 | apart
 41 | appear
 42 | appreciate
 43 | appropriate
 44 | are
 45 | aren't
 46 | around
 47 | as
 48 | aside
 49 | ask
 50 | asking
 51 | associated
 52 | at
 53 | available
 54 | away
 55 | awfully
 56 | b
 57 | be
 58 | became
 59 | because
 60 | become
 61 | becomes
 62 | becoming
 63 | been
 64 | before
 65 | beforehand
 66 | behind
 67 | being
 68 | believe
 69 | below
 70 | beside
 71 | besides
 72 | best
 73 | better
 74 | between
 75 | beyond
 76 | both
 77 | brief
 78 | but
 79 | by
 80 | c
 81 | c'mon
 82 | c's
 83 | came
 84 | can
 85 | can't
 86 | cannot
 87 | cant
 88 | cause
 89 | causes
 90 | certain
 91 | certainly
 92 | changes
 93 | clearly
 94 | co
 95 | com
 96 | come
 97 | comes
 98 | concerning
 99 | consequently
100 | consider
101 | considering
102 | contain
103 | containing
104 | contains
105 | corresponding
106 | could
107 | couldn't
108 | course
109 | currently
110 | d
111 | definitely
112 | described
113 | despite
114 | did
115 | didn't
116 | different
117 | do
118 | does
119 | doesn't
120 | doing
121 | don't
122 | done
123 | down
124 | downwards
125 | during
126 | e
127 | each
128 | edu
129 | eg
130 | eight
131 | either
132 | else
133 | elsewhere
134 | enough
135 | entirely
136 | especially
137 | et
138 | etc
139 | even
140 | ever
141 | every
142 | everybody
143 | everyone
144 | everything
145 | everywhere
146 | ex
147 | exactly
148 | example
149 | except
150 | f
151 | far
152 | few
153 | fifth
154 | first
155 | five
156 | followed
157 | following
158 | follows
159 | for
160 | former
161 | formerly
162 | forth
163 | four
164 | from
165 | further
166 | furthermore
167 | g
168 | get
169 | gets
170 | getting
171 | given
172 | gives
173 | go
174 | goes
175 | going
176 | gone
177 | got
178 | gotten
179 | greetings
180 | h
181 | had
182 | hadn't
183 | happens
184 | hardly
185 | has
186 | hasn't
187 | have
188 | haven't
189 | having
190 | he
191 | he's
192 | hello
193 | help
194 | hence
195 | her
196 | here
197 | here's
198 | hereafter
199 | hereby
200 | herein
201 | hereupon
202 | hers
203 | herself
204 | hi
205 | him
206 | himself
207 | his
208 | hither
209 | hopefully
210 | how
211 | howbeit
212 | however
213 | i
214 | i'd
215 | i'll
216 | i'm
217 | i've
218 | ie
219 | if
220 | ignored
221 | immediate
222 | in
223 | inasmuch
224 | inc
225 | indeed
226 | indicate
227 | indicated
228 | indicates
229 | inner
230 | insofar
231 | instead
232 | into
233 | inward
234 | is
235 | isn't
236 | it
237 | it'd
238 | it'll
239 | it's
240 | its
241 | itself
242 | j
243 | just
244 | k
245 | keep
246 | keeps
247 | kept
248 | know
249 | knows
250 | known
251 | l
252 | last
253 | lately
254 | later
255 | latter
256 | latterly
257 | least
258 | less
259 | lest
260 | let
261 | let's
262 | like
263 | liked
264 | likely
265 | little
266 | look
267 | looking
268 | looks
269 | ltd
270 | m
271 | mainly
272 | many
273 | may
274 | maybe
275 | me
276 | mean
277 | meanwhile
278 | merely
279 | might
280 | more
281 | moreover
282 | most
283 | mostly
284 | much
285 | must
286 | my
287 | myself
288 | n
289 | name
290 | namely
291 | nd
292 | near
293 | nearly
294 | necessary
295 | need
296 | needs
297 | neither
298 | never
299 | nevertheless
300 | new
301 | next
302 | nine
303 | no
304 | nobody
305 | non
306 | none
307 | noone
308 | nor
309 | normally
310 | not
311 | nothing
312 | novel
313 | now
314 | nowhere
315 | o
316 | obviously
317 | of
318 | off
319 | often
320 | oh
321 | ok
322 | okay
323 | old
324 | on
325 | once
326 | one
327 | ones
328 | only
329 | onto
330 | or
331 | other
332 | others
333 | otherwise
334 | ought
335 | our
336 | ours
337 | ourselves
338 | out
339 | outside
340 | over
341 | overall
342 | own
343 | p
344 | particular
345 | particularly
346 | per
347 | perhaps
348 | placed
349 | please
350 | plus
351 | possible
352 | presumably
353 | probably
354 | provides
355 | q
356 | que
357 | quite
358 | qv
359 | r
360 | rather
361 | rd
362 | re
363 | really
364 | reasonably
365 | regarding
366 | regardless
367 | regards
368 | relatively
369 | respectively
370 | right
371 | s
372 | said
373 | same
374 | saw
375 | say
376 | saying
377 | says
378 | second
379 | secondly
380 | see
381 | seeing
382 | seem
383 | seemed
384 | seeming
385 | seems
386 | seen
387 | self
388 | selves
389 | sensible
390 | sent
391 | serious
392 | seriously
393 | seven
394 | several
395 | shall
396 | she
397 | should
398 | shouldn't
399 | since
400 | six
401 | so
402 | some
403 | somebody
404 | somehow
405 | someone
406 | something
407 | sometime
408 | sometimes
409 | somewhat
410 | somewhere
411 | soon
412 | sorry
413 | specified
414 | specify
415 | specifying
416 | still
417 | sub
418 | such
419 | sup
420 | sure
421 | t
422 | t's
423 | take
424 | taken
425 | tell
426 | tends
427 | th
428 | than
429 | thank
430 | thanks
431 | thanx
432 | that
433 | that's
434 | thats
435 | the
436 | their
437 | theirs
438 | them
439 | themselves
440 | then
441 | thence
442 | there
443 | there's
444 | thereafter
445 | thereby
446 | therefore
447 | therein
448 | theres
449 | thereupon
450 | these
451 | they
452 | they'd
453 | they'll
454 | they're
455 | they've
456 | think
457 | third
458 | this
459 | thorough
460 | thoroughly
461 | those
462 | though
463 | three
464 | through
465 | throughout
466 | thru
467 | thus
468 | to
469 | together
470 | too
471 | took
472 | toward
473 | towards
474 | tried
475 | tries
476 | truly
477 | try
478 | trying
479 | twice
480 | two
481 | u
482 | un
483 | under
484 | unfortunately
485 | unless
486 | unlikely
487 | until
488 | unto
489 | up
490 | upon
491 | us
492 | use
493 | used
494 | useful
495 | uses
496 | using
497 | usually
498 | uucp
499 | v
500 | value
501 | various
502 | very
503 | via
504 | viz
505 | vs
506 | w
507 | want
508 | wants
509 | was
510 | wasn't
511 | way
512 | we
513 | we'd
514 | we'll
515 | we're
516 | we've
517 | welcome
518 | well
519 | went
520 | were
521 | weren't
522 | what
523 | what's
524 | whatever
525 | when
526 | whence
527 | whenever
528 | where
529 | where's
530 | whereafter
531 | whereas
532 | whereby
533 | wherein
534 | whereupon
535 | wherever
536 | whether
537 | which
538 | while
539 | whither
540 | who
541 | who's
542 | whoever
543 | whole
544 | whom
545 | whose
546 | why
547 | will
548 | willing
549 | wish
550 | with
551 | within
552 | without
553 | won't
554 | wonder
555 | would
556 | would
557 | wouldn't
558 | x
559 | y
560 | yes
561 | yet
562 | you
563 | you'd
564 | you'll
565 | you're
566 | you've
567 | your
568 | yours
569 | yourself
570 | yourselves
571 | z
572 | zero
573 | 


--------------------------------------------------------------------------------
/static/install.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | # docfind installer script for Unix-like systems
  3 | # Usage: curl -fsSL https://microsoft.github.io/docfind/install.sh | sh
  4 | # Optional environment: DOCFIND_INSTALL_DIR overrides the install directory; GITHUB_TOKEN, if set, is sent as a Bearer token with the GitHub API request.
  5 | set -e
  6 | 
  7 | # Configuration: GitHub repository, binary name, and install directory (defaults to $HOME/.local/bin)
  8 | REPO="microsoft/docfind"
  9 | BINARY_NAME="docfind"
 10 | INSTALL_DIR="${DOCFIND_INSTALL_DIR:-$HOME/.local/bin}"
 11 | 
 12 | # Colors for output; these are spliced into printf format strings below, and printf is what interprets the \033 escapes
 13 | RED='\033[0;31m'
 14 | GREEN='\033[0;32m'
 15 | YELLOW='\033[1;33m'
 16 | NC='\033[0m' # No Color
 17 | 
 18 | # Helper functions. info: print message $1 to stdout behind a green "==>" marker
 19 | info() {
 20 |     printf "${GREEN}==>${NC} %s\n" "$1"
 21 | }
 22 | # warn: print message $1 behind a yellow "Warning:" label (to stdout, not stderr)
 23 | warn() {
 24 |     printf "${YELLOW}Warning:${NC} %s\n" "$1"
 25 | }
 26 | # error: print message $1 to stderr behind a red "Error:" label, then exit the script with status 1
 27 | error() {
 28 |     printf "${RED}Error:${NC} %s\n" "$1" >&2
 29 |     exit 1
 30 | }
 31 | 
 32 | # Map `uname` output to a release target name. Sets the globals OS, ARCH, PLATFORM and TARGET; exits via error() on an unsupported OS or architecture
 33 | detect_platform() {
 34 |     OS="$(uname -s)"
 35 |     ARCH="$(uname -m)"
 36 |     # Linux always maps to the musl target; no gnu target is ever selected here
 37 |     case "$OS" in
 38 |         Linux*)
 39 |             PLATFORM="unknown-linux-musl"
 40 |             ;;
 41 |         Darwin*)
 42 |             PLATFORM="apple-darwin"
 43 |             ;;
 44 |         *)
 45 |             error "Unsupported operating system: $OS"
 46 |             ;;
 47 |     esac
 48 |     # Normalize architecture aliases: amd64 -> x86_64, arm64 -> aarch64
 49 |     case "$ARCH" in
 50 |         x86_64|amd64)
 51 |             ARCH="x86_64"
 52 |             ;;
 53 |         aarch64|arm64)
 54 |             ARCH="aarch64"
 55 |             ;;
 56 |         *)
 57 |             error "Unsupported architecture: $ARCH"
 58 |             ;;
 59 |     esac
 60 |     # TARGET is "<arch>-<platform>", e.g. x86_64-unknown-linux-musl
 61 |     TARGET="${ARCH}-${PLATFORM}"
 62 |     info "Detected platform: $TARGET"
 63 | }
 64 | 
 65 | # Get the current installed version
 66 | get_current_version() {
 67 |     if command -v "$BINARY_NAME" >/dev/null 2>&1; then
 68 |         # Extract version from "docfind X.Y.Z" output
 69 |         CURRENT_VERSION=$("$BINARY_NAME" --version 2>/dev/null | sed -E 's/^[^ ]+ //')
 70 |         if [ -n "$CURRENT_VERSION" ]; then
 71 |             echo "$CURRENT_VERSION"
 72 |         fi
 73 |     fi
 74 | }
 75 | 
 76 | # Get the latest release version
 77 | get_latest_version() {
 78 |     info "Fetching latest release..."
 79 |     
 80 |     # Prepare auth header if GITHUB_TOKEN is set
 81 |     AUTH_HEADER=""
 82 |     if [ -n "$GITHUB_TOKEN" ]; then
 83 |         AUTH_HEADER="Authorization: Bearer $GITHUB_TOKEN"
 84 |     fi
 85 |     
 86 |     if command -v curl >/dev/null 2>&1; then
 87 |         if [ -n "$AUTH_HEADER" ]; then
 88 |             VERSION=$(curl -fsSL -H "$AUTH_HEADER" "https://api.github.com/repos/$REPO/releases/latest" | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/')
 89 |         else
 90 |             VERSION=$(curl -fsSL "https://api.github.com/repos/$REPO/releases/latest" | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/')
 91 |         fi
 92 |     elif command -v wget >/dev/null 2>&1; then
 93 |         if [ -n "$AUTH_HEADER" ]; then
 94 |             VERSION=$(wget -qO- --header="$AUTH_HEADER" "https://api.github.com/repos/$REPO/releases/latest" | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/')
 95 |         else
 96 |             VERSION=$(wget -qO- "https://api.github.com/repos/$REPO/releases/latest" | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/')
 97 |         fi
 98 |     else
 99 |         error "Neither curl nor wget found. Please install one of them."
100 |     fi
101 |     
102 |     if [ -z "$VERSION" ]; then
103 |         error "Failed to fetch latest version"
104 |     fi
105 |     
106 |     info "Latest version: $VERSION"
107 | }
108 | 
# Download the release archive for TARGET/VERSION and install the binary
# into INSTALL_DIR.
#
# The archive is downloaded and unpacked inside a private temporary
# directory (created with mktemp, so its name is not predictable), and only
# the binary itself is moved into INSTALL_DIR. The temporary directory is
# removed on success and on every failure path. Any failure is reported
# through error(), which exits.
install_binary() {
    ARCHIVE_NAME="${BINARY_NAME}-${TARGET}.tar.gz"
    DOWNLOAD_URL="https://github.com/$REPO/releases/download/$VERSION/$ARCHIVE_NAME"

    TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/${BINARY_NAME}.XXXXXX") \
        || error "Failed to create temporary directory"
    TEMP_FILE="$TEMP_DIR/$ARCHIVE_NAME"

    info "Downloading from $DOWNLOAD_URL..."

    if command -v curl >/dev/null 2>&1; then
        curl -fsSL "$DOWNLOAD_URL" -o "$TEMP_FILE" \
            || { rm -rf "$TEMP_DIR"; error "Download failed"; }
    elif command -v wget >/dev/null 2>&1; then
        wget -q "$DOWNLOAD_URL" -O "$TEMP_FILE" \
            || { rm -rf "$TEMP_DIR"; error "Download failed"; }
    else
        rm -rf "$TEMP_DIR"
        error "Neither curl nor wget found. Please install one of them."
    fi

    # Create install directory if it doesn't exist
    if [ ! -d "$INSTALL_DIR" ]; then
        info "Creating directory $INSTALL_DIR..."
        mkdir -p "$INSTALL_DIR" \
            || { rm -rf "$TEMP_DIR"; error "Failed to create install directory"; }
    fi

    # Unpack next to the archive so stray archive members never land in
    # INSTALL_DIR.
    info "Extracting archive..."
    tar -xzf "$TEMP_FILE" -C "$TEMP_DIR" \
        || { rm -rf "$TEMP_DIR"; error "Failed to extract archive"; }

    if [ ! -f "$TEMP_DIR/$BINARY_NAME" ]; then
        rm -rf "$TEMP_DIR"
        error "Archive does not contain $BINARY_NAME"
    fi

    info "Installing to $INSTALL_DIR/$BINARY_NAME..."
    chmod +x "$TEMP_DIR/$BINARY_NAME" \
        || { rm -rf "$TEMP_DIR"; error "Failed to make binary executable"; }
    mv -f "$TEMP_DIR/$BINARY_NAME" "$INSTALL_DIR/$BINARY_NAME" \
        || { rm -rf "$TEMP_DIR"; error "Failed to install binary"; }

    # Clean up
    rm -rf "$TEMP_DIR" 2>/dev/null || true

    info "Successfully installed $BINARY_NAME to $INSTALL_DIR"
}
140 | 
# Succeed (status 0) when INSTALL_DIR is one of the colon-separated entries
# of PATH, fail (status 1) otherwise. PATH is wrapped in colons so the first
# and last entries match the same ":dir:" pattern as the middle ones.
check_path() {
    case ":$PATH:" in
        *":$INSTALL_DIR:"*) return 0 ;;
    esac
    return 1
}
152 | 
# Print post-install instructions.
#
# When INSTALL_DIR is not on PATH, shows the export line to add and a hint
# that depends on the user's login shell ($SHELL). Coloured fragments are
# printed with printf: the colour variables hold literal "\033..." text,
# which printf expands in its format string, whereas `echo` only expands it
# in some shells and prints the raw backslash sequence in others.
post_install() {
    echo ""
    info "Installation complete!"

    if ! check_path; then
        warn "$INSTALL_DIR is not in your PATH"
        echo ""
        echo "Add it to your PATH by adding this line to your shell profile:"
        printf "  ${GREEN}%s${NC}\n" "export PATH=\"\$PATH:$INSTALL_DIR\""
        echo ""

        # Detect shell and provide specific instructions. RELOAD_HINT is only
        # set for shells whose rc file is known, so no non-existent file
        # (e.g. ~/.fishrc) is ever suggested.
        SHELL_NAME="$(basename "${SHELL:-sh}")"
        RELOAD_HINT=""
        case "$SHELL_NAME" in
            bash)
                echo "For bash, add it to ~/.bashrc or ~/.bash_profile"
                RELOAD_HINT="source ~/.bashrc"
                ;;
            zsh)
                echo "For zsh, add it to ~/.zshrc"
                RELOAD_HINT="source ~/.zshrc"
                ;;
            fish)
                printf "For fish, run: ${GREEN}%s${NC}\n" "fish_add_path $INSTALL_DIR"
                ;;
            *)
                echo "Add it to your shell's configuration file"
                ;;
        esac
        echo ""
        if [ -n "$RELOAD_HINT" ]; then
            printf "Then reload your shell or run: ${GREEN}%s${NC}\n" "$RELOAD_HINT"
        else
            echo "Then reload your shell"
        fi
    else
        printf "You can now use '${GREEN}%s${NC}' from anywhere!\n" "$BINARY_NAME"
    fi

    echo ""
    echo "Try it out:"
    printf "  ${GREEN}%s${NC}\n" "$BINARY_NAME --help"
}
191 | 
# Main installation flow: detect the platform, look up the latest release,
# stop early (exit 0) when that version is already installed, otherwise
# download/install the binary and print the post-install instructions.
main() {
    info "Installing $BINARY_NAME..."

    detect_platform
    get_latest_version

    # Check if already installed with the same version
    CURRENT_VERSION=$(get_current_version)
    if [ -n "$CURRENT_VERSION" ]; then
        info "Current version: $CURRENT_VERSION"
        # Compare against the tag both with and without a leading "v".
        LATEST_VERSION_NUM=${VERSION#v}
        if [ "$CURRENT_VERSION" = "$LATEST_VERSION_NUM" ] || [ "$CURRENT_VERSION" = "$VERSION" ]; then
            info "$BINARY_NAME $CURRENT_VERSION is already installed (latest version)"
            echo ""
            echo "If you want to reinstall, please uninstall first:"
            # printf expands the colour escapes in every shell (echo does not),
            # and the hint uses `command -v`, the lookup this script itself
            # relies on, instead of the non-POSIX `which`.
            printf "  ${GREEN}%s${NC}\n" "rm \"\$(command -v $BINARY_NAME)\""
            exit 0
        fi
    fi

    install_binary
    post_install
}

main
219 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # docfind
  2 | 
  3 | A high-performance document search engine built in Rust with WebAssembly support. Combines full-text search using FST (Finite State Transducers) with FSST compression for efficient storage and fast fuzzy matching capabilities.
  4 | 
  5 | ## Live Demo
  6 | 
  7 | Check out the [interactive demo](https://microsoft.github.io/docfind/). The demo showcases docfind searching through 50,000 news articles from the AG News dataset, running entirely in your browser with WebAssembly.
  8 | 
  9 | **Demo Performance Metrics:**
 10 | - **Dataset**: 50,000 news articles (AG News Classification Dataset)
 11 | - **Dataset Size**: 17.14 MB ([uncompressed JSON](https://github.com/microsoft/docfind/raw/refs/heads/main/static/documents.json))
 12 | - **Index Size**: 11.48 MB ([WASM file](https://github.com/microsoft/docfind/raw/refs/heads/main/static/docfind_bg.wasm))
 13 | - **Compressed Size**: 5.20 MB ([compressed with Brotli](https://github.com/microsoft/docfind/raw/refs/heads/main/static/docfind_bg.wasm.br))
 14 | - **Index Build Time**: ~1.1 seconds
 15 | - **Load Time**: ~100ms (depending on network and browser)
 16 | - **Search Speed**: ~1-3ms per query
 17 | 
 18 | ## Features
 19 | 
 20 | - **Fast Fuzzy Search**: Uses FST for efficient keyword matching with Levenshtein distance support
 21 | - **Compact Storage**: FSST compression reduces index size while maintaining fast decompression
 22 | - **RAKE Keyword Extraction**: Automatic keyword extraction from document content using the RAKE algorithm
 23 | - **WebAssembly Ready**: Compile to WASM for browser-based search with no server required
 24 | - **Standalone CLI Tool**: Self-contained CLI tool to build a .wasm file out of a collection of documents, no Rust tooling required
 25 | 
 26 | ## Installation
 27 | 
 28 | ### Quick Install
 29 | 
 30 | **macOS/Linux:**
 31 | ```bash
 32 | curl -fsSL https://microsoft.github.io/docfind/install.sh | sh
 33 | ```
 34 | 
 35 | **Windows (PowerShell):**
 36 | ```powershell
 37 | irm https://microsoft.github.io/docfind/install.ps1 | iex
 38 | ```
 39 | 
 40 | The installer will:
 41 | - Download the latest release binary for your platform
 42 | - Install it to `~/.local/bin` (Unix) or `~\.docfind\bin` (Windows)
 43 | - Provide instructions for adding it to your PATH if needed
 44 | 
 45 | ### Manual Installation
 46 | 
 47 | Download the binary for your platform from the [latest release](https://github.com/microsoft/docfind/releases/latest):
 48 | 
 49 | - **macOS (Intel)**: `docfind-x86_64-apple-darwin`
 50 | - **macOS (Apple Silicon)**: `docfind-aarch64-apple-darwin`
 51 | - **Linux (x64)**: `docfind-x86_64-unknown-linux-musl`
 52 | - **Linux (ARM64)**: `docfind-aarch64-unknown-linux-musl`
 53 | - **Windows (x64)**: `docfind-x86_64-pc-windows-msvc.exe`
 54 | - **Windows (ARM64)**: `docfind-aarch64-pc-windows-msvc.exe`
 55 | 
 56 | Rename it to `docfind` (or `docfind.exe` on Windows), make it executable, and place it in your PATH.
 57 | 
 58 | ### Building from Source
 59 | 
 60 | #### Prerequisites
 61 | 
 62 | Before building from source, ensure you have the following installed:
 63 | 
 64 | 1. **Rust** - [rustup.rs](https://rustup.rs/)
 65 | 2. **wasm-pack** - [drager.github.io/wasm-pack](https://drager.github.io/wasm-pack/)
 66 | 3. **Node.js** - [nodejs.org](https://nodejs.org/) (required for esbuild)
 67 | 
 68 | #### Build
 69 | 
 70 | ```bash
 71 | ./scripts/build.sh
 72 | ```
 73 | 
 74 | The compiled binary will be available at `./target/release/docfind`.
 75 | 
 76 | ## Usage
 77 | 
 78 | ### Creating a Search Index
 79 | 
 80 | Prepare a JSON file with your documents:
 81 | 
 82 | ```json
 83 | [
 84 |   {
 85 |     "title": "Getting Started",
 86 |     "category": "docs",
 87 |     "href": "/docs/getting-started",
 88 |     "body": "This guide will help you get started."
 89 |   },
 90 |   {
 91 |     "title": "API Reference",
 92 |     "category": "reference",
 93 |     "href": "/docs/api",
 94 |     "body": "Complete API documentation for all search functions and configuration options."
 95 |   }
 96 | ]
 97 | ```
 98 | 
 99 | Build the index and generate a WASM module:
100 | 
101 | ```bash
102 | docfind documents.json output
103 | ```
104 | 
105 | This creates:
106 | - `output/docfind.js` - JavaScript bindings
107 | - `output/docfind_bg.wasm` - WebAssembly module with embedded index
108 | 
109 | ### Using in the Browser
110 | 
111 | ```html
112 | <script type="module">
113 |   import search from './docfind.js';
114 |   
115 |   const documents = await search('needle');
116 |   console.log(documents);
117 | </script>
118 | ```
119 | 
120 | ## How It Works
121 | 
122 | ```mermaid
123 | flowchart LR
124 |     A([documents.json]) --> B[docfind]
125 |     B --> C[Keyword Extraction<br/>RAKE]
126 |     B --> E[FSST Compression<br/>document strings]
127 |     C --> D[FST Map<br/>keywords → docs]
128 |     D --> F[[Index]]
129 |     E --> F
130 |     F --> G([docfind_bg.wasm<br/>+ docfind.js])
131 |     
132 |     style A fill:#e1f5ff
133 |     style G fill:#e1f5ff
134 |     style F fill:#ffffcc
135 | ```
136 | 
137 | 1. **Indexing Phase** (CLI):
138 |    - Extracts keywords from each document's optional `keywords` list, its title, and its body
139 |    - Uses RAKE algorithm to identify important multi-word phrases
140 |    - Assigns relevance scores based on keyword source (metadata > title > body)
141 |    - Builds an FST mapping keywords to document indices
142 |    - Compresses all document strings using FSST
143 |    - Serializes the index using Postcard (binary format)
144 | 
145 | 2. **Embedding Phase** (CLI):
146 |    - Parses the pre-compiled WASM module
147 |    - Expands WASM memory to accommodate the index
148 |    - Patches global variables (`INDEX_BASE`, `INDEX_LEN`) with actual values
149 |    - Adds the index as a new data segment in the WASM binary
150 | 
151 | 3. **Search Phase** (WASM):
152 |    - Deserializes the embedded index on first use
153 |    - Performs fuzzy matching using a Levenshtein automaton
154 |    - Combines results from multiple keywords with score accumulation
155 |    - Decompresses matching document strings on demand
156 |    - Returns ranked results as JavaScript objects
157 | 
158 | ## Dependencies
159 | 
160 | - **fst**: Fast finite state transducer library with Levenshtein support
161 | - **fsst-rs**: Fast string compression for text data
162 | - **rake**: Rapid Automatic Keyword Extraction algorithm
163 | - **serde/postcard**: Efficient serialization
164 | - **wasm-bindgen**: WebAssembly bindings for Rust
165 | - **wasm-encoder/wasmparser**: WASM manipulation tools
166 | 
167 | ## Performance
168 | 
169 | The combination of FST and FSST provides:
170 | - Millisecond-scale search times for typical queries (~1-3ms in the demo above)
171 | - 60-80% compression ratio for document storage
172 | - Instant startup with lazy index loading
173 | - Zero network requests after initial load
174 | 
175 | ## References
176 | 
177 | ### Prior Art
178 | 
179 | This project builds on the rich ecosystem of search technologies:
180 | 
181 | - **[Algolia](https://www.algolia.com/)** - Server-side search-as-a-service platform
182 | - **[TypeSense](https://typesense.org/)** - Open-source server-side search engine
183 | - **[Lunr.js](https://lunrjs.com/)** - Client-side full-text search library for JavaScript
184 | - **[Stork Search](https://stork-search.net/)** - WebAssembly-powered search for static sites
185 | - **[Tinysearch](https://endler.dev/2019/tinysearch/)** - Minimalist WASM-based search engine
186 | 
187 | ### Technical Foundations
188 | 
189 | Key technologies and concepts that inspired and power docfind:
190 | 
191 | - **[Finite State Transducers](https://burntsushi.net/transducers/)** - Andrew Gallant's comprehensive article on FSTs, the core data structure for efficient search
192 | - **[RAKE Algorithm](https://docs.rs/rake/latest/rake/)** - Rapid Automatic Keyword Extraction for identifying important phrases
193 | - **[FSST Compression](https://docs.rs/fsst-rs/latest/fsst/index.html)** - Fast Static Symbol Table compression for efficient text storage
194 | 


--------------------------------------------------------------------------------
/static/install.ps1:
--------------------------------------------------------------------------------
  1 | # docfind installer script for Windows
  2 | # Usage: irm https://microsoft.github.io/docfind/install.ps1 | iex
  3 | 
  4 | $ErrorActionPreference = 'Stop'
  5 | 
  6 | # Configuration
  7 | $Repo = "microsoft/docfind"
  8 | $BinaryName = "docfind"
  9 | $InstallDir = if ($env:DOCFIND_INSTALL_DIR) { $env:DOCFIND_INSTALL_DIR } else { "$env:USERPROFILE\.docfind\bin" }
 10 | 
 11 | # Helper functions
 12 | function Write-Info {
 13 |     param([string]$Message)
 14 |     Write-Host "==> " -ForegroundColor Green -NoNewline
 15 |     Write-Host $Message
 16 | }
 17 | 
 18 | function Write-Warn {
 19 |     param([string]$Message)
 20 |     Write-Host "Warning: " -ForegroundColor Yellow -NoNewline
 21 |     Write-Host $Message
 22 | }
 23 | 
 24 | function Write-Error-Custom {
 25 |     param([string]$Message)
 26 |     Write-Host "Error: " -ForegroundColor Red -NoNewline
 27 |     Write-Host $Message
 28 |     exit 1
 29 | }
 30 | 
 31 | # Detect architecture
 32 | function Get-Architecture {
 33 |     $arch = $env:PROCESSOR_ARCHITECTURE
 34 |     switch ($arch) {
 35 |         "AMD64" { return "x86_64" }
 36 |         "ARM64" { return "aarch64" }
 37 |         default { Write-Error-Custom "Unsupported architecture: $arch" }
 38 |     }
 39 | }
 40 | 
 41 | # Get the current installed version
 42 | function Get-CurrentVersion {
 43 |     try {
 44 |         # Check if docfind is in PATH and can be executed
 45 |         $currentVersionOutput = & $BinaryName --version 2>&1
 46 |         if ($LASTEXITCODE -eq 0 -and $currentVersionOutput) {
 47 |             # Extract version from "docfind X.Y.Z" output
 48 |             $versionMatch = $currentVersionOutput -match "^$BinaryName\s+(.+)$"
 49 |             if ($versionMatch -and $Matches[1]) {
 50 |                 return $Matches[1].Trim()
 51 |             }
 52 |         }
 53 |     }
 54 |     catch {
 55 |         # Binary not found or not executable
 56 |     }
 57 |     return $null
 58 | }
 59 | 
 60 | # Get the latest release version
 61 | function Get-LatestVersion {
 62 |     Write-Info "Fetching latest release..."
 63 |     
 64 |     try {
 65 |         # Prepare headers for authentication if GITHUB_TOKEN is set
 66 |         $headers = @{}
 67 |         if ($env:GITHUB_TOKEN) {
 68 |             $headers["Authorization"] = "Bearer $env:GITHUB_TOKEN"
 69 |         }
 70 |         
 71 |         $response = if ($headers.Count -gt 0) {
 72 |             Invoke-RestMethod -Uri "https://api.github.com/repos/$Repo/releases/latest" -Headers $headers
 73 |         } else {
 74 |             Invoke-RestMethod -Uri "https://api.github.com/repos/$Repo/releases/latest"
 75 |         }
 76 |         
 77 |         $version = $response.tag_name
 78 |         
 79 |         if (-not $version) {
 80 |             Write-Error-Custom "Failed to fetch latest version"
 81 |         }
 82 |         
 83 |         Write-Info "Latest version: $version"
 84 |         return $version
 85 |     }
 86 |     catch {
 87 |         Write-Error-Custom "Failed to fetch release information: $_"
 88 |     }
 89 | }
 90 | 
 91 | # Download and install binary
 92 | function Install-Binary {
 93 |     param(
 94 |         [string]$Version,
 95 |         [string]$Target
 96 |     )
 97 |     
 98 |     $fileName = "${BinaryName}-${Target}.zip"
 99 |     $downloadUrl = "https://github.com/$Repo/releases/download/$Version/$fileName"
100 |     $tempFile = Join-Path $env:TEMP $fileName
101 |     $tempExtractDir = Join-Path $env:TEMP "docfind-extract"
102 |     
103 |     Write-Info "Downloading from $downloadUrl..."
104 |     
105 |     try {
106 |         Invoke-WebRequest -Uri $downloadUrl -OutFile $tempFile -UseBasicParsing
107 |     }
108 |     catch {
109 |         Write-Error-Custom "Download failed: $_"
110 |     }
111 |     
112 |     # Create install directory if it doesn't exist
113 |     if (-not (Test-Path $InstallDir)) {
114 |         Write-Info "Creating directory $InstallDir..."
115 |         New-Item -ItemType Directory -Path $InstallDir -Force | Out-Null
116 |     }
117 |     
118 |     # Extract archive
119 |     Write-Info "Extracting archive..."
120 |     try {
121 |         # Clean up temp extract directory if it exists
122 |         if (Test-Path $tempExtractDir) {
123 |             Remove-Item -Path $tempExtractDir -Recurse -Force
124 |         }
125 |         New-Item -ItemType Directory -Path $tempExtractDir -Force | Out-Null
126 |         
127 |         Expand-Archive -Path $tempFile -DestinationPath $tempExtractDir -Force
128 |     }
129 |     catch {
130 |         Write-Error-Custom "Failed to extract archive: $_"
131 |     }
132 |     
133 |     # Install binary
134 |     $destination = Join-Path $InstallDir "${BinaryName}.exe"
135 |     $extractedBinary = Join-Path $tempExtractDir "${BinaryName}.exe"
136 |     Write-Info "Installing to $destination..."
137 |     
138 |     try {
139 |         Move-Item -Path $extractedBinary -Destination $destination -Force
140 |     }
141 |     catch {
142 |         Write-Error-Custom "Failed to install binary: $_"
143 |     }
144 |     
145 |     # Clean up
146 |     try {
147 |         Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue
148 |         Remove-Item -Path $tempExtractDir -Recurse -Force -ErrorAction SilentlyContinue
149 |     }
150 |     catch {
151 |         # Ignore cleanup errors
152 |     }
153 |     
154 |     Write-Info "Successfully installed $BinaryName to $InstallDir"
155 | }
156 | 
# Return $true when $Directory is one of the entries of the current
# process PATH, $false otherwise.
#
# Entries are compared case-insensitively after removing trailing path
# separators, so "C:\tools\bin\" on PATH matches "C:\tools\bin". Empty
# entries (from stray ";;") are skipped.
function Test-InPath {
    param([string]$Directory)

    $wanted = $Directory.TrimEnd('\', '/')
    foreach ($entry in ($env:PATH -split ';')) {
        if ($entry -and ($entry.TrimEnd('\', '/') -eq $wanted)) {
            return $true
        }
    }
    return $false
}
164 | 
# Append $Directory to the user-level PATH (and to the current session's
# PATH) unless the user-level PATH already lists it.
#
# Returns $true when PATH was changed, $false when the directory was already
# present or the update failed (a warning is printed in the latter case).
#
# Presence is decided per PATH entry (case-insensitive, trailing separators
# ignored) rather than with a wildcard substring test, which would treat
# "C:\x\bin" as present when only "C:\x\bin-old" is listed and would
# misinterpret wildcard characters such as "[" in the directory name.
function Add-ToPath {
    param([string]$Directory)

    Write-Info "Adding $Directory to your PATH..."

    try {
        # Get current user PATH
        $currentPath = [Environment]::GetEnvironmentVariable("PATH", "User")

        $wanted = $Directory.TrimEnd('\', '/')
        $alreadyListed = $false
        foreach ($entry in ("$currentPath" -split ';')) {
            if ($entry -and ($entry.TrimEnd('\', '/') -eq $wanted)) {
                $alreadyListed = $true
                break
            }
        }

        if (-not $alreadyListed) {
            # Avoid producing ";;" when the stored value ends with a separator.
            $newPath = if ($currentPath) { "$($currentPath.TrimEnd(';'));$Directory" } else { $Directory }
            [Environment]::SetEnvironmentVariable("PATH", $newPath, "User")

            # Update current session PATH
            $env:PATH = "$env:PATH;$Directory"

            Write-Info "Added $Directory to PATH"
            return $true
        }
        else {
            Write-Info "$Directory is already in PATH"
            return $false
        }
    }
    catch {
        Write-Warn "Failed to add to PATH automatically: $_"
        return $false
    }
}
195 | 
# Print post-install instructions.
#
# $PathUpdated - $true when the installer just added $InstallDir to PATH.
#
# When PATH was not updated and $InstallDir is still missing from it, the
# user is shown how to add it. The permanent variant appends to the stored
# user-level PATH (not to $env:PATH, which also contains the machine-level
# entries and would copy them into the user value); changing the User scope
# does not require an elevated prompt.
function Show-PostInstall {
    param([bool]$PathUpdated)

    Write-Host ""
    Write-Info "Installation complete!"
    Write-Host ""

    if ($PathUpdated) {
        Write-Host "The installation directory has been added to your PATH."
        Write-Host "You may need to restart your terminal for the changes to take effect."
        Write-Host ""
        Write-Host "In a new terminal, you can run:" -ForegroundColor Cyan
        Write-Host "  $BinaryName --help" -ForegroundColor Green
    }
    else {
        if (-not (Test-InPath $InstallDir)) {
            Write-Warn "$InstallDir is not in your PATH"
            Write-Host ""
            Write-Host "To add it permanently, run:" -ForegroundColor Cyan
            Write-Host "  [Environment]::SetEnvironmentVariable('PATH', [Environment]::GetEnvironmentVariable('PATH', 'User') + ';$InstallDir', 'User')" -ForegroundColor Green
            Write-Host ""
            Write-Host "Or add it to your current session:" -ForegroundColor Cyan
            Write-Host "  `$env:PATH += ';$InstallDir'" -ForegroundColor Green
            Write-Host ""
        }
        else {
            Write-Host "You can now use '$BinaryName' from anywhere!" -ForegroundColor Cyan
            Write-Host ""
            Write-Host "Try it out:" -ForegroundColor Cyan
            Write-Host "  $BinaryName --help" -ForegroundColor Green
        }
    }
}
230 | 
# Main installation flow: detect the platform, look up the latest release,
# stop early (exit 0) when that version is already installed, otherwise
# install the binary, make sure its directory is on PATH, and print the
# post-install instructions.
function Main {
    Write-Info "Installing $BinaryName..."

    $target = "$(Get-Architecture)-pc-windows-msvc"
    Write-Info "Detected platform: $target"

    $latest = Get-LatestVersion

    # Skip the installation when the installed version equals the latest tag,
    # compared both with and without a leading "v".
    $installed = Get-CurrentVersion
    if ($installed) {
        Write-Info "Current version: $installed"
        $latestWithoutPrefix = $latest -replace '^v', ''
        $isCurrent = ($installed -eq $latestWithoutPrefix) -or ($installed -eq $latest)
        if ($isCurrent) {
            Write-Info "$BinaryName $installed is already installed (latest version)"
            Write-Host ""
            Write-Host "If you want to reinstall, please uninstall first:" -ForegroundColor Cyan
            Write-Host "  Remove-Item (Get-Command $BinaryName).Path" -ForegroundColor Green
            exit 0
        }
    }

    Install-Binary -Version $latest -Target $target

    # Only try to modify PATH when the directory is not already on it.
    $pathUpdated = $false
    if (-not (Test-InPath $InstallDir)) {
        $pathUpdated = Add-ToPath -Directory $InstallDir
    }

    Show-PostInstall -PathUpdated $pathUpdated
}

# Run the installer; anything that escapes Main is reported as a failure.
try {
    Main
}
catch {
    Write-Error-Custom "Installation failed: $_"
}
273 | 


--------------------------------------------------------------------------------
/core/src/lib.rs:
--------------------------------------------------------------------------------
  1 | use serde::{Deserialize, Serialize};
  2 | 
  3 | #[cfg(any(feature = "cli", feature = "wasm", test))]
  4 | use std::collections::HashMap;
  5 | 
  6 | /// A minimal FSST-compressed vector of UTF-8 strings with random access.
  7 | #[derive(Debug, Clone, Serialize, Deserialize)]
  8 | pub struct FsstStrVec {
  9 | 	// FSST dictionary we trained (as raw bytes for compact serde)
 10 | 	dict_syms: Vec<[u8; 8]>,
 11 | 	dict_lens: Vec<u8>,
 12 | 	// Concatenated compressed payload and per-item offsets
 13 | 	offsets: Vec<u32>, // offsets[i] = start of item i in `data`
 14 | 	data: Vec<u8>,
 15 | }
 16 | 
 17 | impl FsstStrVec {
 18 | 	/// Train FSST on `strings` and build the compressed vector.
 19 | 	#[cfg(any(feature = "cli", test))]
 20 | 	fn from_strings(strings: &[impl AsRef<str>]) -> Self {
 21 | 		// 1) Train a compressor on the corpus.
 22 | 		let sample: Vec<&[u8]> = strings.iter().map(|s| s.as_ref().as_bytes()).collect();
 23 | 		let compressor = fsst::Compressor::train(&sample);
 24 | 
 25 | 		// Keep dictionary for later decoding.
 26 | 		let syms: Vec<fsst::Symbol> = compressor.symbol_table().to_vec();
 27 | 		let lens: Vec<u8> = compressor.symbol_lengths().to_vec();
 28 | 
 29 | 		// 2) Compress each string independently; store offsets + bytes.
 30 | 		let mut offsets = Vec::with_capacity(strings.len());
 31 | 		let mut data = Vec::new();
 32 | 		for s in strings {
 33 | 			offsets.push(data.len() as u32);
 34 | 			let c = compressor.compress(s.as_ref().as_bytes());
 35 | 			data.extend_from_slice(&c);
 36 | 		}
 37 | 
 38 | 		// 3) Store symbol table as raw bytes for compact serialization.
 39 | 		let dict_syms: Vec<[u8; 8]> = syms
 40 | 			.into_iter()
 41 | 			.map(|sym| u64::to_le_bytes(sym.to_u64()))
 42 | 			.collect();
 43 | 
 44 | 		Self {
 45 | 			dict_syms,
 46 | 			dict_lens: lens,
 47 | 			offsets,
 48 | 			data,
 49 | 		}
 50 | 	}
 51 | 
 52 | 	/// Number of strings
 53 | 	pub fn len(&self) -> usize {
 54 | 		self.offsets.len()
 55 | 	}
 56 | 
 57 | 	/// Random access: decode item i into an owned String.
 58 | 	pub fn get(&self, i: usize) -> Option<String> {
 59 | 		if i >= self.len() {
 60 | 			return None;
 61 | 		}
 62 | 		let start = self.offsets[i] as usize;
 63 | 		let end = if i + 1 < self.len() {
 64 | 			self.offsets[i + 1] as usize
 65 | 		} else {
 66 | 			self.data.len()
 67 | 		};
 68 | 		let codes = &self.data[start..end];
 69 | 
 70 | 		// Rebuild a Decompressor on-demand. (You can cache this in the struct if you
 71 | 		// read frequently; it's cheap either way.)
 72 | 		let syms: Vec<fsst::Symbol> = self
 73 | 			.dict_syms
 74 | 			.iter()
 75 | 			.map(fsst::Symbol::from_slice)
 76 | 			.collect();
 77 | 		let decomp = fsst::Decompressor::new(&syms, &self.dict_lens);
 78 | 
 79 | 		let bytes = decomp.decompress(codes);
 80 | 		Some(String::from_utf8(bytes).expect("FSST preserves UTF-8 for UTF-8 input"))
 81 | 	}
 82 | }
 83 | 
 84 | #[derive(Debug, serde::Serialize, serde::Deserialize)]
 85 | #[serde(rename_all = "camelCase")]
 86 | pub struct Document {
 87 | 	pub title: String,
 88 | 	pub category: String,
 89 | 	pub href: String,
 90 | 	pub body: String,
 91 | 	pub keywords: Option<Vec<String>>,
 92 | }
 93 | 
 94 | #[derive(Debug, serde::Serialize, serde::Deserialize)]
 95 | pub struct Index {
 96 | 	/// FST vector for keyword to entry index
 97 | 	fst: Vec<u8>,
 98 | 
 99 | 	/// FSST string vector of all document strings
100 | 	document_strings: FsstStrVec,
101 | 
102 | 	/// Vector of keyword to document index entries
103 | 	keyword_to_documents: Vec<Vec<(usize, u8)>>,
104 | }
105 | 
106 | impl Index {
107 | 	pub fn from_bytes(bytes: &[u8]) -> Result<Self, Box<dyn std::error::Error>> {
108 | 		let index: Index = postcard::from_bytes(bytes)?;
109 | 		Ok(index)
110 | 	}
111 | 
112 | 	pub fn to_bytes(&self) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
113 | 		Ok(postcard::to_allocvec(self)?)
114 | 	}
115 | }
116 | 
/// Build a search [`Index`] from `documents`.
///
/// For every document, keywords are collected from three sources; a keyword
/// already produced by an earlier source is not added again:
///
/// 1. the explicit `keywords` list (score 100),
/// 2. the words of the title (score 90),
/// 3. RAKE phrases extracted from the body (RAKE's own score), limited to
///    five single-word and three two-word phrases.
///
/// Explicit and title keywords are trimmed of non-alphanumeric characters at
/// both ends, lower-cased, and dropped when empty or a stop word.
///
/// Documents are identified by their position in `documents`, so two
/// documents sharing the same `href` are still indexed separately.
///
/// # Errors
///
/// Returns an error when the keyword FST cannot be built.
#[cfg(any(feature = "cli", test))]
pub fn build_index(documents: Vec<Document>) -> Result<Index, Box<dyn std::error::Error>> {
	use std::collections::HashSet;

	let stop_words = include_str!("../english.stop")
		.lines()
		.filter(|line| !line.is_empty() && !line.starts_with('#'))
		.map(|line| line.to_lowercase())
		.collect::<HashSet<String>>();

	let sw = rake::StopWords::from(stop_words);
	let rake = rake::Rake::new(sw.clone());

	// Shared normalisation for explicit and title keywords.
	let normalize = |word: &str| {
		word
			.trim_matches(|c: char| !c.is_alphanumeric())
			.to_lowercase()
	};

	let mut strings: Vec<&str> = Vec::with_capacity(documents.len() * 4);
	// keyword -> (document index, score), in document order.
	let mut keywords_to_documents: HashMap<String, Vec<(usize, f64)>> = HashMap::new();

	for (doc_index, doc) in documents.iter().enumerate() {
		strings.push(&doc.title);
		strings.push(&doc.category);
		strings.push(&doc.href);
		strings.push(&doc.body);

		// `keyword_set` deduplicates across the three sources.
		let mut keyword_set: HashSet<String> = HashSet::new();
		let mut keywords: Vec<(String, f64)> = Vec::new();

		// Add explicit keywords from document metadata
		for raw in doc.keywords.iter().flatten() {
			let keyword = normalize(raw);
			if !keyword.is_empty() && !sw.contains(&keyword) && keyword_set.insert(keyword.clone()) {
				keywords.push((keyword, 100.0));
			}
		}

		// add keywords from title
		for word in doc.title.split_whitespace() {
			let keyword = normalize(word);
			if !keyword.is_empty() && !sw.contains(&keyword) && keyword_set.insert(keyword.clone()) {
				keywords.push((keyword, 90.0));
			}
		}

		let body_keywords = rake.run_fragments(vec![doc.body.as_str()]);
		let mut single_word_budget = 5;
		let mut double_word_budget = 3;

		for k in &body_keywords {
			let keyword = k.keyword.to_lowercase();

			// skip keywords already contributed by metadata or title
			if keyword_set.contains(&keyword) {
				continue;
			}

			// Only one- and two-word phrases are kept, each within its budget.
			let whitespace_count = k.keyword.matches(' ').count();

			if whitespace_count == 0 && single_word_budget > 0 {
				single_word_budget -= 1;
			} else if whitespace_count == 1 && double_word_budget > 0 {
				double_word_budget -= 1;
			} else {
				continue;
			}

			keywords.push((keyword.clone(), k.score));
			keyword_set.insert(keyword);

			if single_word_budget == 0 && double_word_budget == 0 {
				break;
			}
		}

		for (keyword, score) in keywords {
			keywords_to_documents
				.entry(keyword)
				.or_default()
				.push((doc_index, score));
		}
	}

	println!("Extracted {} unique keywords", keywords_to_documents.len());

	// The FST builder requires keys in lexicographic byte order.
	let mut entries: Vec<(String, Vec<(usize, f64)>)> = keywords_to_documents.into_iter().collect();
	entries.sort_unstable_by(|a, b| a.0.cmp(&b.0));

	let mut fst_builder = fst::MapBuilder::memory();
	let mut keyword_to_documents: Vec<Vec<(usize, u8)>> = Vec::with_capacity(entries.len());

	for (index, (keyword, mut doc_scores)) in entries.into_iter().enumerate() {
		fst_builder.insert(&keyword, index as u64)?;

		// Highest score first; the sort is stable, so equal scores keep
		// document order. Incomparable (NaN) scores are treated as equal
		// instead of panicking.
		doc_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

		// The float -> u8 `as` cast saturates at 0 and 255.
		let entry = doc_scores
			.into_iter()
			.map(|(doc_index, score)| (doc_index, score as u8))
			.collect::<Vec<(usize, u8)>>();

		keyword_to_documents.push(entry);
	}

	let fst = fst_builder.into_inner()?;
	let document_strings = FsstStrVec::from_strings(&strings);

	Ok(Index {
		fst,
		document_strings,
		keyword_to_documents,
	})
}
244 | 
/// Searches `index` for documents matching `query`.
///
/// The query is split on whitespace; every word is stripped of leading and
/// trailing non-alphanumeric characters and lowercased. The complete lowercased
/// query is additionally tried as a single term so multi-word keywords can
/// match. Each term matches indexed keywords either by prefix or within a
/// Levenshtein distance of 1.
///
/// Scores of all matching keywords are summed per document (saturating at
/// `u8::MAX`). Documents are returned by descending score, ties broken by
/// ascending document index, truncated to `max_results`.
///
/// # Errors
///
/// Returns an error if the FST bytes in `index` are invalid, if a matched
/// keyword is not valid UTF-8, or if a document's strings are missing from
/// `index.document_strings`.
#[cfg(any(feature = "wasm", test))]
pub fn search(
	index: &Index,
	query: &str,
	max_results: usize,
) -> Result<Vec<Document>, Box<dyn std::error::Error>> {
	use fst::automaton::{Levenshtein, Str};
	use fst::map::OpBuilder;
	use fst::{Automaton, Streamer};
	use std::collections::HashSet;

	let map = fst::Map::new(&index.fst)?;

	let mut query_words: HashSet<String> = query
		.split_whitespace()
		.map(|w| {
			w.trim_matches(|c: char| !c.is_alphanumeric())
				.to_lowercase()
		})
		.filter(|w| !w.is_empty())
		.collect();

	query_words.insert(query.to_lowercase());

	// (keyword, index into `index.keyword_to_documents`)
	let mut keywords: Vec<(String, u64)> = Vec::new();

	for query_word in query_words {
		let prefix = Str::new(query_word.as_str()).starts_with();
		let mut matchers = OpBuilder::new().add(map.search(prefix));

		// Building the fuzzy automaton fails when it would grow too large (long
		// terms). Degrade to prefix-only matching for that term instead of
		// failing the whole search; an exact match is still found via the prefix.
		if let Ok(lev) = Levenshtein::new(query_word.as_str(), 1) {
			matchers = matchers.add(map.search(lev));
		}

		let mut op = matchers.union();

		while let Some((keyword, indexed_value)) = op.next() {
			// Every stream searches the same map, so all entries for a key carry
			// the same value; the first one is enough.
			if let Some(first) = indexed_value.first() {
				keywords.push((String::from_utf8(keyword.to_vec())?, first.value));
			}
		}
	}

	// Sort keywords by length (shorter first)
	keywords.sort_by_key(|(kw, _)| kw.len());

	let mut documents: HashMap<usize, u8> = HashMap::new();

	for (_, keyword_index) in keywords {
		let documents_matching_keyword = &index.keyword_to_documents[keyword_index as usize];

		for (document_index, score) in documents_matching_keyword {
			let entry = documents.entry(*document_index).or_insert(0);
			*entry = entry.saturating_add(*score);
		}
	}

	// sort documents by score (descending), then by document index (ascending) for stable ordering
	let mut documents: Vec<(usize, u8)> = documents.into_iter().collect();
	documents.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
	documents.truncate(max_results);

	let mut result: Vec<Document> = Vec::with_capacity(documents.len());

	for (document_index, _score) in documents {
		// Four consecutive strings are read per document: title, category, href, body.
		let base = document_index * 4;

		let title = index
			.document_strings
			.get(base)
			.ok_or("Failed to get document title")?;
		let category = index
			.document_strings
			.get(base + 1)
			.ok_or("Failed to get document category")?;
		let href = index
			.document_strings
			.get(base + 2)
			.ok_or("Failed to get document href")?;
		let body = index
			.document_strings
			.get(base + 3)
			.ok_or("Failed to get document body")?;

		result.push(Document {
			title,
			category,
			href,
			body,
			keywords: None,
		});
	}

	Ok(result)
}
341 | 
342 | #[cfg(test)]
343 | mod tests;
344 | 


--------------------------------------------------------------------------------
/static/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | 
  4 | <head>
  5 |     <meta charset="UTF-8">
  6 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
  7 |     <title>docfind - Fast Document Search Demo</title>
  8 |     <style>
  9 |         * {
 10 |             box-sizing: border-box;
 11 |             margin: 0;
 12 |             padding: 0;
 13 |         }
 14 | 
 15 |         body {
 16 |             font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
 17 |             line-height: 1.6;
 18 |             color: #333;
 19 |             background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
 20 |             min-height: 100vh;
 21 |             padding: 20px;
 22 |         }
 23 | 
 24 |         .container {
 25 |             max-width: 900px;
 26 |             margin: 0 auto;
 27 |         }
 28 | 
 29 |         .search-container {
 30 |             background: white;
 31 |             border-radius: 12px;
 32 |             box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
 33 |             padding: 20px;
 34 |             margin-bottom: 20px;
 35 |         }
 36 | 
 37 |         .search-box {
 38 |             position: relative;
 39 |             margin-bottom: 10px;
 40 |         }
 41 | 
 42 |         input[type="search"] {
 43 |             width: 100%;
 44 |             padding: 12px 16px;
 45 |             font-size: 1em;
 46 |             border: 2px solid #e0e0e0;
 47 |             border-radius: 8px;
 48 |             outline: none;
 49 |             transition: border-color 0.3s;
 50 |         }
 51 | 
 52 |         input[type="search"]:focus {
 53 |             border-color: #667eea;
 54 |         }
 55 | 
 56 |         .search-info {
 57 |             display: flex;
 58 |             justify-content: space-between;
 59 |             align-items: center;
 60 |             color: #666;
 61 |             font-size: 0.9em;
 62 |             margin-top: 10px;
 63 |         }
 64 | 
 65 |         .loading {
 66 |             color: #667eea;
 67 |             font-weight: 500;
 68 |         }
 69 | 
 70 |         .results-container {
 71 |             background: white;
 72 |             border-radius: 12px;
 73 |             box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
 74 |             padding: 20px;
 75 |             min-height: 200px;
 76 |         }
 77 | 
 78 |         .result-item {
 79 |             padding: 12px;
 80 |             border-bottom: 1px solid #f0f0f0;
 81 |             transition: background-color 0.2s;
 82 |         }
 83 | 
 84 |         .result-item:last-child {
 85 |             border-bottom: none;
 86 |         }
 87 | 
 88 |         .result-item:hover {
 89 |             background-color: #f8f9fa;
 90 |         }
 91 | 
 92 |         .result-title {
 93 |             font-size: 1.1em;
 94 |             font-weight: 600;
 95 |             color: #667eea;
 96 |             margin-bottom: 6px;
 97 |         }
 98 | 
 99 |         .result-body {
100 |             color: #555;
101 |             line-height: 1.5;
102 |             margin-top: 6px;
103 |             font-size: 0.95em;
104 |         }
105 | 
106 |         .result-score {
107 |             font-size: 0.8em;
108 |             color: #999;
109 |             margin-top: 6px;
110 |         }
111 | 
112 |         .no-results {
113 |             text-align: center;
114 |             color: #999;
115 |             padding: 40px 20px;
116 |         }
117 | 
118 |         .no-results-icon {
119 |             font-size: 3em;
120 |             margin-bottom: 10px;
121 |         }
122 | 
123 |         .error {
124 |             background: #fee;
125 |             color: #c33;
126 |             padding: 15px;
127 |             border-radius: 8px;
128 |             margin-bottom: 20px;
129 |         }
130 | 
131 |         footer {
132 |             text-align: center;
133 |             color: white;
134 |             margin-top: 40px;
135 |             opacity: 0.9;
136 |         }
137 | 
138 |         footer a {
139 |             color: white;
140 |             text-decoration: underline;
141 |         }
142 | 
143 |         @media (max-width: 600px) {
144 |             h1 {
145 |                 font-size: 1.5em;
146 |             }
147 | 
148 |             .search-container,
149 |             .results-container {
150 |                 padding: 15px;
151 |             }
152 |         }
153 |     </style>
154 | </head>
155 | 
156 | <body>
157 |     <div class="container">
158 | 
159 |         <div class="search-container">
160 |             <div class="search-box">
161 |                 <input type="search" id="search-input"
162 |                     placeholder="Search the AG News dataset... Try 'sports', 'technology', 'business'..."
163 |                     autocomplete="off" autofocus>
164 |             </div>
165 |             <div class="search-info">
166 |                 <span id="result-count"></span>
167 |                 <span id="search-time"></span>
168 |             </div>
169 |         </div>
170 | 
171 |         <div class="results-container">
172 |             <div id="results"></div>
173 |         </div>
174 | 
175 |         <footer>
176 |             <p>
177 |                 Powered by <a href="https://github.com/microsoft/docfind" target="_blank">docfind</a>
178 |             </p>
179 |         </footer>
180 |     </div>
181 | 
182 |     <script type="module">
183 |         import search, { init } from './docfind.js';
184 | 
185 |         let isReady = false;
186 | 
187 |         // Pre-load the library on startup
188 |         async function initSearch() {
189 |             try {
190 |                 await init();
191 |             } catch (error) {
192 |                 console.warn('Failed to pre-load search library:', error);
193 |             }
194 | 
195 |             // Show welcome message
196 |             showWelcomeMessage();
197 | 
198 |             // Enable search
199 |             document.getElementById('search-input').disabled = false;
200 |             isReady = true;
201 |         }
202 | 
203 |         function showWelcomeMessage() {
204 |             const resultsDiv = document.getElementById('results');
205 |             resultsDiv.innerHTML = `
206 |                 <div style="text-align: center; padding: 40px 20px; color: #666;">
207 |                     <div style="font-size: 3em; margin-bottom: 20px;">👋</div>
208 |                     <h2 style="margin-bottom: 15px; color: #333;">Welcome to docfind!</h2>
209 |                     <p style="max-width: 500px; margin: 0 auto; line-height: 1.6;">
210 |                         This demo searches through thousands of news articles from the AG News dataset.
211 |                         Try searching for topics like "sports", "technology", "business", or "world news".
212 |                     </p>
213 |                     <p style="margin-top: 20px; font-size: 0.9em; color: #999;">
214 |                         The entire search index is running in your browser with WebAssembly - no server requests needed!
215 |                     </p>
216 |                 </div>
217 |             `;
218 |         }
219 | 
220 |         function showError(message) {
221 |             const resultsDiv = document.getElementById('results');
222 |             resultsDiv.innerHTML = `<div class="error">${message}</div>`;
223 |         }
224 | 
225 |         async function performSearch(query) {
226 |             if (!isReady) {
227 |                 document.getElementById('result-count').textContent = 'Initializing...';
228 |                 return;
229 |             }
230 | 
231 |             if (!query.trim()) {
232 |                 showWelcomeMessage();
233 |                 document.getElementById('result-count').textContent = '';
234 |                 document.getElementById('search-time').textContent = '';
235 |                 return;
236 |             }
237 | 
238 |             const searchStart = performance.now();
239 | 
240 |             try {
241 |                 const results = await search(query, 100);
242 |                 const searchTime = (performance.now() - searchStart).toFixed(2);
243 | 
244 |                 displayResults(results, query, searchTime);
245 |             } catch (error) {
246 |                 showError('Search failed: ' + error.message);
247 |             }
248 |         }
249 | 
250 |         function displayResults(results, query, searchTime) {
251 |             const resultsDiv = document.getElementById('results');
252 |             const resultCount = document.getElementById('result-count');
253 |             const searchTimeSpan = document.getElementById('search-time');
254 | 
255 |             // Display "100+" if we hit the limit
256 |             const countText = results.length >= 100 ? '100+' : results.length;
257 |             resultCount.textContent = `${countText} result${results.length !== 1 ? 's' : ''}`;
258 |             searchTimeSpan.innerHTML = `<span class="loading">⚡ ${searchTime}ms</span>`;
259 | 
260 |             if (results.length === 0) {
261 |                 resultsDiv.innerHTML = `
262 |                     <div class="no-results">
263 |                         <div class="no-results-icon">🔍</div>
264 |                         <div>No results found for "${escapeHtml(query)}"</div>
265 |                         <div style="margin-top: 10px; font-size: 0.9em;">Try different keywords or check your spelling</div>
266 |                     </div>
267 |                 `;
268 |                 return;
269 |             }
270 | 
271 |             let html = '';
272 |             results.forEach((result, index) => {
273 |                 const scoreDisplay = (result.score !== undefined && result.score !== null)
274 |                     ? `<div class="result-score">Relevance: ${result.score.toFixed(2)}</div>`
275 |                     : '';
276 | 
277 |                 html += `
278 |                     <div class="result-item">
279 |                         <div class="result-title">${highlightQuery(escapeHtml(result.title), query)}</div>
280 |                         <div class="result-body">${highlightQuery(escapeHtml(truncate(result.body, 250)), query)}</div>
281 |                         ${scoreDisplay}
282 |                     </div>
283 |                 `;
284 |             });
285 | 
286 |             resultsDiv.innerHTML = html;
287 |         }
288 | 
289 |         function escapeHtml(text) {
290 |             const div = document.createElement('div');
291 |             div.textContent = text;
292 |             return div.innerHTML;
293 |         }
294 | 
295 |         function highlightQuery(text, query) {
296 |             if (!query.trim()) return text;
297 | 
298 |             const words = query.toLowerCase().split(/\s+/);
299 |             let highlighted = text;
300 | 
301 |             words.forEach(word => {
302 |                 const regex = new RegExp(`(${word})`, 'gi');
303 |                 highlighted = highlighted.replace(regex, '<strong style="background-color: #fff3cd; font-weight: 600;">$1</strong>');
304 |             });
305 | 
306 |             return highlighted;
307 |         }
308 | 
309 |         function truncate(text, maxLength) {
310 |             if (text.length <= maxLength) return text;
311 |             return text.substring(0, maxLength).trim() + '...';
312 |         }
313 | 
314 |         // Set up event listeners
315 |         const searchInput = document.getElementById('search-input');
316 |         searchInput.disabled = true; // Disable until ready
317 | 
318 |         let searchTimeout;
319 |         searchInput.addEventListener('input', (e) => {
320 |             clearTimeout(searchTimeout);
321 |             searchTimeout = setTimeout(() => {
322 |                 performSearch(e.target.value);
323 |             }, 10); // 10ms debounce
324 |         });
325 | 
326 |         // Initialize on load
327 |         initSearch();
328 |     </script>
329 | </body>
330 | 
331 | </html>


--------------------------------------------------------------------------------
/cli/src/main.rs:
--------------------------------------------------------------------------------
  1 | use docfind_core::Document;
  2 | use std::io::Write;
  3 | use std::path::Path;
  4 | use std::{collections::HashMap, fs::File};
  5 | use wasm_encoder::{ConstExpr, DataSection, MemorySection, MemoryType};
  6 | use wasmparser::{Parser, Payload};
  7 | 
/// One segment of a WASM data section, captured while parsing the embedded
/// module so it can be re-encoded later.
#[derive(Debug)]
enum WasmDataSegment {
	/// Passive segment: just the segment's bytes.
	Passive(Vec<u8>),
	/// Active segment, tied to a memory and an offset expression.
	Active {
		/// Index of the memory the segment is placed in.
		memory_index: u32,
		/// Offset expression, already converted to its `wasm_encoder` form.
		offset: ConstExpr,
		/// The segment's bytes.
		data: Vec<u8>,
		/// The offset value when the offset expression starts with an
		/// `i32.const`, `None` otherwise.
		i32const_offset: Option<i32>,
	},
}
 18 | 
/// Represents different types of WASM sections we care about
#[derive(Debug)]
enum WasmSection {
	/// Data section, decoded into its individual segments.
	Data(Vec<WasmDataSegment>),
	/// Data-count section with the segment count read from the input module.
	DataCount(u32),
	/// Marker for the memory section; `main` writes a freshly built memory
	/// section in its place instead of copying the original bytes.
	Memory,
	/// Any other section, kept as its section id plus raw bytes.
	Raw { id: u8, data: Vec<u8> },
}
 27 | 
 28 | /// Convert a wasmparser ConstExpr to a wasm_encoder ConstExpr
 29 | fn convert_const_expr(
 30 | 	expr: &wasmparser::ConstExpr,
 31 | ) -> Result<ConstExpr, Box<dyn std::error::Error>> {
 32 | 	let mut ops_reader = expr.get_operators_reader();
 33 | 
 34 | 	// We'll handle the most common cases
 35 | 	if !ops_reader.eof() {
 36 | 		let op = ops_reader.read()?;
 37 | 		match op {
 38 | 			wasmparser::Operator::I32Const { value } => return Ok(ConstExpr::i32_const(value)),
 39 | 			wasmparser::Operator::I64Const { value } => return Ok(ConstExpr::i64_const(value)),
 40 | 			wasmparser::Operator::F32Const { value } => {
 41 | 				// Convert wasmparser Ieee32 to wasm_encoder Ieee32
 42 | 				let f32_val = f32::from_bits(value.bits());
 43 | 				return Ok(ConstExpr::f32_const(f32_val.into()));
 44 | 			}
 45 | 			wasmparser::Operator::F64Const { value } => {
 46 | 				// Convert wasmparser Ieee64 to wasm_encoder Ieee64
 47 | 				let f64_val = f64::from_bits(value.bits());
 48 | 				return Ok(ConstExpr::f64_const(f64_val.into()));
 49 | 			}
 50 | 			wasmparser::Operator::GlobalGet { global_index } => {
 51 | 				return Ok(ConstExpr::global_get(global_index));
 52 | 			}
 53 | 			wasmparser::Operator::RefNull { hty } => {
 54 | 				// Convert heap type
 55 | 				let heap_type = match hty {
 56 | 					wasmparser::HeapType::Concrete(_) => wasm_encoder::HeapType::Concrete(0),
 57 | 					_ => wasm_encoder::HeapType::Abstract {
 58 | 						shared: false,
 59 | 						ty: wasm_encoder::AbstractHeapType::Func,
 60 | 					},
 61 | 				};
 62 | 				return Ok(ConstExpr::ref_null(heap_type));
 63 | 			}
 64 | 			wasmparser::Operator::RefFunc { function_index } => {
 65 | 				return Ok(ConstExpr::ref_func(function_index));
 66 | 			}
 67 | 			_ => {
 68 | 				// For other operators, use raw with empty bytes
 69 | 				return Ok(ConstExpr::raw(vec![]));
 70 | 			}
 71 | 		}
 72 | 	}
 73 | 
 74 | 	Ok(ConstExpr::raw(vec![]))
 75 | }
 76 | 
/// Entry point of the `docfind` CLI.
///
/// Usage: `docfind <documents.json> <outdir>`, or `--version` / `-v`.
///
/// Reads the documents from the JSON file, builds the search index, and embeds
/// the serialized index into the bundled `docfind_bg.wasm`: the memory section
/// is enlarged, the index is appended as an extra active data segment, and the
/// bytes at the INDEX_BASE / INDEX_LEN addresses are patched with the index
/// location and length. The resulting `docfind_bg.wasm` and the bundled
/// `docfind.js` are written to the output directory.
///
/// Setting the `DOCFIND_DEBUG` environment variable prints extra diagnostics
/// to stderr.
fn main() -> Result<(), Box<dyn std::error::Error>> {
	let debug = std::env::var("DOCFIND_DEBUG").is_ok();
	let args: Vec<String> = std::env::args().collect();

	// Handle --version flag
	if args.len() == 2 && (args[1] == "--version" || args[1] == "-v") {
		println!("docfind {}", env!("CARGO_PKG_VERSION"));
		std::process::exit(0);
	}

	if args.len() != 3 {
		eprintln!("Usage: {} <documents.json> <outdir>", args[0]);
		std::process::exit(1);
	}

	let input_path = &args[1];
	let output_dir = &args[2];
	if debug {
		eprintln!("[docfind] CWD: {:?}", std::env::current_dir()?);
		eprintln!("[docfind] input_path: {}", input_path);
		eprintln!("[docfind] output_dir: {}", output_dir);
	}
	let documents_file = File::open(input_path)?;
	let documents: Vec<Document> = serde_json::from_reader(documents_file)?;

	// Phase 1: build the search index.
	let start = std::time::Instant::now();
	let index = docfind_core::build_index(documents)?;
	let duration = start.elapsed();
	if debug {
		eprintln!("[docfind] Indexing completed in: {:?}", duration);
	} else {
		println!("Indexing completed in: {:?}", duration);
	}

	// Phase 2: parse the embedded WASM module into sections we can re-emit.
	let start = std::time::Instant::now();
	let mut sections: Vec<WasmSection> = Vec::new();

	let mut old_memory_page_count: u64 = 0;
	let mut index_base_global_index: Option<u32> = None;
	let mut index_len_global_index: Option<u32> = None;
	// Globals whose initializer starts with an `i32.const`, keyed by their
	// position within the global section.
	let mut i32_globals: HashMap<u32, i32> = HashMap::new();

	// Both artifacts are embedded into the CLI binary at compile time.
	let docfind_js: &[u8] = include_bytes!("../../wasm/pkg/docfind.js");
	let docfind_bg_wasm: &[u8] = include_bytes!("../../wasm/pkg/docfind_bg.wasm");
	if debug {
		eprintln!("[docfind] Embedded JS size: {} bytes", docfind_js.len());
		eprintln!(
			"[docfind] Embedded WASM size: {} bytes",
			docfind_bg_wasm.len()
		);
	}

	for payload in Parser::new(0).parse_all(docfind_bg_wasm) {
		let payload = payload?;

		// process i32 const data sections differently
		if let Payload::DataSection(reader) = payload {
			let mut data_segments: Vec<WasmDataSegment> = Vec::new();

			for data in reader {
				let data = data?;

				match data.kind {
					wasmparser::DataKind::Passive => {
						data_segments.push(WasmDataSegment::Passive(data.data.to_vec()));
					}
					wasmparser::DataKind::Active {
						memory_index,
						offset_expr,
					} => {
						let const_expr = convert_const_expr(&offset_expr)?;
						// Remember the literal offset (if any) so the segment's address
						// range can be computed when patching below.
						let i32const_offset = if let wasmparser::Operator::I32Const { value } =
							offset_expr.get_operators_reader().read()?
						{
							Some(value)
						} else {
							None
						};

						data_segments.push(WasmDataSegment::Active {
							memory_index,
							offset: const_expr,
							data: data.data.to_vec(),
							i32const_offset,
						});
					}
				}
			}

			sections.push(WasmSection::Data(data_segments));
		} else if let Payload::DataCountSection { count, .. } = payload {
			sections.push(WasmSection::DataCount(count));
		} else if let Payload::MemorySection(reader) = payload {
			// If several memories are declared, the last one's initial size wins.
			for memory in reader {
				old_memory_page_count = memory?.initial as u64;
			}
			sections.push(WasmSection::Memory);
		} else {
			// Every other section is copied through byte-for-byte.
			if let Some((id, data)) = payload.as_section() {
				sections.push(WasmSection::Raw {
					id,
					data: docfind_bg_wasm[data.start..data.end].to_vec(),
				});
			}

			// Export and global sections are additionally inspected to locate
			// INDEX_BASE / INDEX_LEN.
			match payload {
				Payload::ExportSection(reader) => {
					for export in reader {
						let export = export?;
						// NOTE(review): the export kind is not checked here; this assumes
						// both names are exported as globals — confirm.
						if export.name == "INDEX_BASE" {
							index_base_global_index = Some(export.index);
						} else if export.name == "INDEX_LEN" {
							index_len_global_index = Some(export.index);
						}
					}
				}
				Payload::GlobalSection(reader) => {
					// NOTE(review): `idx` counts entries of the global section only, while
					// `export.index` above is used as the lookup key; these presumably
					// agree only when the module imports no globals — confirm.
					for (idx, global) in reader.into_iter().enumerate() {
						let global = global?;
						let mut ops_reader = global.init_expr.get_operators_reader();

						if !ops_reader.eof() {
							if let Ok(wasmparser::Operator::I32Const { value }) = ops_reader.read() {
								i32_globals.insert(idx as u32, value);
							}
						}
					}
				}
				_ => {}
			}
		}
	}

	let index_base_global_index =
		index_base_global_index.expect("Could not find INDEX_BASE global index");
	let index_len_global_index =
		index_len_global_index.expect("Could not find INDEX_LEN global index");
	if debug {
		eprintln!(
			"[docfind] INDEX_BASE global index: {}",
			index_base_global_index
		);
		eprintln!(
			"[docfind] INDEX_LEN global index: {}",
			index_len_global_index
		);
	}

	// The i32 values of the two globals are used below as addresses that are
	// compared against the address ranges of the active data segments.
	let index_base_global_address = i32_globals
		.get(&index_base_global_index)
		.expect("Could not find INDEX_BASE global value");

	let index_len_global_address = i32_globals
		.get(&index_len_global_index)
		.expect("Could not find INDEX_LEN global value");
	if debug {
		eprintln!(
			"[docfind] INDEX_BASE address: {}",
			index_base_global_address
		);
		eprintln!("[docfind] INDEX_LEN address: {}", index_len_global_address);
	}

	let raw_index: Vec<u8> = index.to_bytes()?; // will embed into wasm
	if debug {
		eprintln!("[docfind] Index size: {} bytes", raw_index.len());
	} else {
		println!("Index size: {} bytes", raw_index.len());
	}

	// Grow the memory by enough 0x10000-byte pages to hold the index (the `+ 1`
	// covers the partial last page) and place the index right after the
	// original memory.
	let new_memory_page_count = old_memory_page_count + (raw_index.len() as u64 / 0x10000) + 1;
	let index_base = old_memory_page_count * 0x10000;
	if debug {
		eprintln!("[docfind] Old memory pages: {}", old_memory_page_count);
		eprintln!("[docfind] New memory pages: {}", new_memory_page_count);
		eprintln!("[docfind] Index base address: {}", index_base);
	}

	// Phase 3: re-encode the module, section by section, in the original order.
	let mut encoder = wasm_encoder::Module::new();

	for section in sections {
		match section {
			WasmSection::DataCount(count) => {
				// One more segment: the index appended to the data section below.
				encoder.section(&wasm_encoder::DataCountSection { count: count + 1 });
			}
			WasmSection::Data(data_segments) => {
				let mut data_section = DataSection::new();

				for segment in data_segments {
					match segment {
						WasmDataSegment::Passive(data) => {
							data_section.passive(data.iter().copied());
						}
						WasmDataSegment::Active {
							memory_index,
							offset,
							data,
							i32const_offset,
						} => {
							if let Some(i32_offset) = i32const_offset {
								// Address range [start, end) covered by this segment.
								let start = i32_offset;
								let end = i32_offset + (data.len() as i32);

								// Patch the data if it contains the INDEX_BASE or INDEX_LEN addresses
								if index_base_global_address >= &start && index_base_global_address < &end {
									assert!(
										index_len_global_address >= &start && index_len_global_address < &end,
										"INDEX_LEN address not in data segment!"
									);

									let mut data = data;

									// Overwrite 4 bytes at each address with the little-endian
									// i32 index base / index length.
									let base_relative_offset = (index_base_global_address - start) as usize;
									data[base_relative_offset..base_relative_offset + 4]
										.copy_from_slice(&(index_base as i32).to_le_bytes());

									let length_relative_offset = (index_len_global_address - start) as usize;
									data[length_relative_offset..length_relative_offset + 4]
										.copy_from_slice(&(raw_index.len() as i32).to_le_bytes());

									data_section.active(memory_index, &offset, data);
									continue;
								}
							}

							data_section.active(memory_index, &offset, data);
						}
					}
				}

				// Append the serialized index as a new active segment in memory 0,
				// placed at `index_base`.
				data_section.active(
					0,
					&ConstExpr::i32_const(index_base as i32),
					raw_index.iter().copied(),
				);

				encoder.section(&data_section);
			}
			WasmSection::Memory => {
				// NOTE(review): the new memory type is built from scratch; only the
				// enlarged minimum is set, so any maximum or flags of the original
				// memory are not carried over — confirm this is intended.
				let mut new_memory_section = MemorySection::new();
				new_memory_section.memory(MemoryType {
					minimum: new_memory_page_count,
					maximum: None,
					memory64: false,
					shared: false,
					page_size_log2: None,
				});
				encoder.section(&new_memory_section);
			}
			WasmSection::Raw { id, data } => {
				encoder.section(&wasm_encoder::RawSection { id, data: &data });
			}
		}
	}

	// Validate the rewritten module before anything is written to disk.
	let wasm_bytes = encoder.finish();
	wasmparser::Validator::new().validate_all(&wasm_bytes)?;

	let output_dir = Path::new(output_dir);
	std::fs::create_dir_all(output_dir)?;

	let mut output_js = File::create(output_dir.join("docfind.js"))?;
	output_js.write_all(docfind_js)?;

	let mut output_wasm = File::create(output_dir.join("docfind_bg.wasm"))?;
	output_wasm.write_all(&wasm_bytes)?;

	let duration = start.elapsed();
	println!("WASM creation completed in: {:?}", duration);

	Ok(())
}
349 | 


--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
  1 | # This file is automatically @generated by Cargo.
  2 | # It is not intended for manual editing.
  3 | version = 4
  4 | 
  5 | [[package]]
  6 | name = "aho-corasick"
  7 | version = "1.1.3"
  8 | source = "registry+https://github.com/rust-lang/crates.io-index"
  9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
 10 | dependencies = [
 11 |  "memchr",
 12 | ]
 13 | 
 14 | [[package]]
 15 | name = "atomic-polyfill"
 16 | version = "1.0.3"
 17 | source = "registry+https://github.com/rust-lang/crates.io-index"
 18 | checksum = "8cf2bce30dfe09ef0bfaef228b9d414faaf7e563035494d7fe092dba54b300f4"
 19 | dependencies = [
 20 |  "critical-section",
 21 | ]
 22 | 
 23 | [[package]]
 24 | name = "bitflags"
 25 | version = "2.10.0"
 26 | source = "registry+https://github.com/rust-lang/crates.io-index"
 27 | checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
 28 | 
 29 | [[package]]
 30 | name = "bumpalo"
 31 | version = "3.19.0"
 32 | source = "registry+https://github.com/rust-lang/crates.io-index"
 33 | checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
 34 | 
 35 | [[package]]
 36 | name = "byteorder"
 37 | version = "1.5.0"
 38 | source = "registry+https://github.com/rust-lang/crates.io-index"
 39 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 40 | 
 41 | [[package]]
 42 | name = "cfg-if"
 43 | version = "1.0.4"
 44 | source = "registry+https://github.com/rust-lang/crates.io-index"
 45 | checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
 46 | 
 47 | [[package]]
 48 | name = "cobs"
 49 | version = "0.3.0"
 50 | source = "registry+https://github.com/rust-lang/crates.io-index"
 51 | checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1"
 52 | dependencies = [
 53 |  "thiserror",
 54 | ]
 55 | 
 56 | [[package]]
 57 | name = "critical-section"
 58 | version = "1.2.0"
 59 | source = "registry+https://github.com/rust-lang/crates.io-index"
 60 | checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b"
 61 | 
 62 | [[package]]
 63 | name = "docfind"
 64 | version = "0.5.1"
 65 | dependencies = [
 66 |  "docfind_core",
 67 |  "serde_json",
 68 |  "wasm-encoder",
 69 |  "wasmparser",
 70 | ]
 71 | 
 72 | [[package]]
 73 | name = "docfind-wasm"
 74 | version = "0.5.1"
 75 | dependencies = [
 76 |  "docfind_core",
 77 |  "serde-wasm-bindgen",
 78 |  "wasm-bindgen",
 79 | ]
 80 | 
 81 | [[package]]
 82 | name = "docfind_core"
 83 | version = "0.5.1"
 84 | dependencies = [
 85 |  "fsst-rs",
 86 |  "fst",
 87 |  "postcard",
 88 |  "rake",
 89 |  "serde",
 90 |  "serde_json",
 91 | ]
 92 | 
 93 | [[package]]
 94 | name = "embedded-io"
 95 | version = "0.4.0"
 96 | source = "registry+https://github.com/rust-lang/crates.io-index"
 97 | checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced"
 98 | 
 99 | [[package]]
100 | name = "embedded-io"
101 | version = "0.6.1"
102 | source = "registry+https://github.com/rust-lang/crates.io-index"
103 | checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d"
104 | 
105 | [[package]]
106 | name = "equivalent"
107 | version = "1.0.2"
108 | source = "registry+https://github.com/rust-lang/crates.io-index"
109 | checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
110 | 
111 | [[package]]
112 | name = "foldhash"
113 | version = "0.1.5"
114 | source = "registry+https://github.com/rust-lang/crates.io-index"
115 | checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
116 | 
117 | [[package]]
118 | name = "fsst-rs"
119 | version = "0.5.4"
120 | source = "registry+https://github.com/rust-lang/crates.io-index"
121 | checksum = "ab195789b87bb56fce91b3617e44d36dbba68a4c8d736ef48767187932a5161b"
122 | 
123 | [[package]]
124 | name = "fst"
125 | version = "0.4.7"
126 | source = "registry+https://github.com/rust-lang/crates.io-index"
127 | checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a"
128 | dependencies = [
129 |  "utf8-ranges",
130 | ]
131 | 
132 | [[package]]
133 | name = "hash32"
134 | version = "0.2.1"
135 | source = "registry+https://github.com/rust-lang/crates.io-index"
136 | checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67"
137 | dependencies = [
138 |  "byteorder",
139 | ]
140 | 
141 | [[package]]
142 | name = "hashbrown"
143 | version = "0.15.5"
144 | source = "registry+https://github.com/rust-lang/crates.io-index"
145 | checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
146 | dependencies = [
147 |  "foldhash",
148 |  "serde",
149 | ]
150 | 
151 | [[package]]
152 | name = "hashbrown"
153 | version = "0.16.0"
154 | source = "registry+https://github.com/rust-lang/crates.io-index"
155 | checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d"
156 | 
157 | [[package]]
158 | name = "heapless"
159 | version = "0.7.17"
160 | source = "registry+https://github.com/rust-lang/crates.io-index"
161 | checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f"
162 | dependencies = [
163 |  "atomic-polyfill",
164 |  "hash32",
165 |  "rustc_version",
166 |  "serde",
167 |  "spin",
168 |  "stable_deref_trait",
169 | ]
170 | 
171 | [[package]]
172 | name = "indexmap"
173 | version = "2.12.0"
174 | source = "registry+https://github.com/rust-lang/crates.io-index"
175 | checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f"
176 | dependencies = [
177 |  "equivalent",
178 |  "hashbrown 0.16.0",
179 |  "serde",
180 |  "serde_core",
181 | ]
182 | 
183 | [[package]]
184 | name = "itoa"
185 | version = "1.0.15"
186 | source = "registry+https://github.com/rust-lang/crates.io-index"
187 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
188 | 
189 | [[package]]
190 | name = "js-sys"
191 | version = "0.3.81"
192 | source = "registry+https://github.com/rust-lang/crates.io-index"
193 | checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305"
194 | dependencies = [
195 |  "once_cell",
196 |  "wasm-bindgen",
197 | ]
198 | 
199 | [[package]]
200 | name = "lazy_static"
201 | version = "1.5.0"
202 | source = "registry+https://github.com/rust-lang/crates.io-index"
203 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
204 | 
205 | [[package]]
206 | name = "leb128fmt"
207 | version = "0.1.0"
208 | source = "registry+https://github.com/rust-lang/crates.io-index"
209 | checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
210 | 
211 | [[package]]
212 | name = "lock_api"
213 | version = "0.4.14"
214 | source = "registry+https://github.com/rust-lang/crates.io-index"
215 | checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
216 | dependencies = [
217 |  "scopeguard",
218 | ]
219 | 
220 | [[package]]
221 | name = "log"
222 | version = "0.4.28"
223 | source = "registry+https://github.com/rust-lang/crates.io-index"
224 | checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
225 | 
226 | [[package]]
227 | name = "memchr"
228 | version = "2.7.6"
229 | source = "registry+https://github.com/rust-lang/crates.io-index"
230 | checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
231 | 
232 | [[package]]
233 | name = "once_cell"
234 | version = "1.21.3"
235 | source = "registry+https://github.com/rust-lang/crates.io-index"
236 | checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
237 | 
238 | [[package]]
239 | name = "postcard"
240 | version = "1.1.3"
241 | source = "registry+https://github.com/rust-lang/crates.io-index"
242 | checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24"
243 | dependencies = [
244 |  "cobs",
245 |  "embedded-io 0.4.0",
246 |  "embedded-io 0.6.1",
247 |  "heapless",
248 |  "serde",
249 | ]
250 | 
251 | [[package]]
252 | name = "proc-macro2"
253 | version = "1.0.101"
254 | source = "registry+https://github.com/rust-lang/crates.io-index"
255 | checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
256 | dependencies = [
257 |  "unicode-ident",
258 | ]
259 | 
260 | [[package]]
261 | name = "quote"
262 | version = "1.0.41"
263 | source = "registry+https://github.com/rust-lang/crates.io-index"
264 | checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
265 | dependencies = [
266 |  "proc-macro2",
267 | ]
268 | 
269 | [[package]]
270 | name = "rake"
271 | version = "0.3.6"
272 | source = "registry+https://github.com/rust-lang/crates.io-index"
273 | checksum = "5a0a7b4878cdfa9c73657cf8479a1f2430104b21991db7940e97ab000056f0a1"
274 | dependencies = [
275 |  "lazy_static",
276 |  "regex",
277 |  "serde",
278 | ]
279 | 
280 | [[package]]
281 | name = "regex"
282 | version = "1.12.2"
283 | source = "registry+https://github.com/rust-lang/crates.io-index"
284 | checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
285 | dependencies = [
286 |  "aho-corasick",
287 |  "memchr",
288 |  "regex-automata",
289 |  "regex-syntax",
290 | ]
291 | 
292 | [[package]]
293 | name = "regex-automata"
294 | version = "0.4.13"
295 | source = "registry+https://github.com/rust-lang/crates.io-index"
296 | checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
297 | dependencies = [
298 |  "aho-corasick",
299 |  "memchr",
300 |  "regex-syntax",
301 | ]
302 | 
303 | [[package]]
304 | name = "regex-syntax"
305 | version = "0.8.8"
306 | source = "registry+https://github.com/rust-lang/crates.io-index"
307 | checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
308 | 
309 | [[package]]
310 | name = "rustc_version"
311 | version = "0.4.1"
312 | source = "registry+https://github.com/rust-lang/crates.io-index"
313 | checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
314 | dependencies = [
315 |  "semver",
316 | ]
317 | 
318 | [[package]]
319 | name = "rustversion"
320 | version = "1.0.22"
321 | source = "registry+https://github.com/rust-lang/crates.io-index"
322 | checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
323 | 
324 | [[package]]
325 | name = "ryu"
326 | version = "1.0.20"
327 | source = "registry+https://github.com/rust-lang/crates.io-index"
328 | checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
329 | 
330 | [[package]]
331 | name = "scopeguard"
332 | version = "1.2.0"
333 | source = "registry+https://github.com/rust-lang/crates.io-index"
334 | checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
335 | 
336 | [[package]]
337 | name = "semver"
338 | version = "1.0.27"
339 | source = "registry+https://github.com/rust-lang/crates.io-index"
340 | checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
341 | 
342 | [[package]]
343 | name = "serde"
344 | version = "1.0.228"
345 | source = "registry+https://github.com/rust-lang/crates.io-index"
346 | checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
347 | dependencies = [
348 |  "serde_core",
349 |  "serde_derive",
350 | ]
351 | 
352 | [[package]]
353 | name = "serde-wasm-bindgen"
354 | version = "0.6.5"
355 | source = "registry+https://github.com/rust-lang/crates.io-index"
356 | checksum = "8302e169f0eddcc139c70f139d19d6467353af16f9fce27e8c30158036a1e16b"
357 | dependencies = [
358 |  "js-sys",
359 |  "serde",
360 |  "wasm-bindgen",
361 | ]
362 | 
363 | [[package]]
364 | name = "serde_core"
365 | version = "1.0.228"
366 | source = "registry+https://github.com/rust-lang/crates.io-index"
367 | checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
368 | dependencies = [
369 |  "serde_derive",
370 | ]
371 | 
372 | [[package]]
373 | name = "serde_derive"
374 | version = "1.0.228"
375 | source = "registry+https://github.com/rust-lang/crates.io-index"
376 | checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
377 | dependencies = [
378 |  "proc-macro2",
379 |  "quote",
380 |  "syn",
381 | ]
382 | 
383 | [[package]]
384 | name = "serde_json"
385 | version = "1.0.145"
386 | source = "registry+https://github.com/rust-lang/crates.io-index"
387 | checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
388 | dependencies = [
389 |  "itoa",
390 |  "memchr",
391 |  "ryu",
392 |  "serde",
393 |  "serde_core",
394 | ]
395 | 
396 | [[package]]
397 | name = "spin"
398 | version = "0.9.8"
399 | source = "registry+https://github.com/rust-lang/crates.io-index"
400 | checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
401 | dependencies = [
402 |  "lock_api",
403 | ]
404 | 
405 | [[package]]
406 | name = "stable_deref_trait"
407 | version = "1.2.1"
408 | source = "registry+https://github.com/rust-lang/crates.io-index"
409 | checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
410 | 
411 | [[package]]
412 | name = "syn"
413 | version = "2.0.107"
414 | source = "registry+https://github.com/rust-lang/crates.io-index"
415 | checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b"
416 | dependencies = [
417 |  "proc-macro2",
418 |  "quote",
419 |  "unicode-ident",
420 | ]
421 | 
422 | [[package]]
423 | name = "thiserror"
424 | version = "2.0.17"
425 | source = "registry+https://github.com/rust-lang/crates.io-index"
426 | checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8"
427 | dependencies = [
428 |  "thiserror-impl",
429 | ]
430 | 
431 | [[package]]
432 | name = "thiserror-impl"
433 | version = "2.0.17"
434 | source = "registry+https://github.com/rust-lang/crates.io-index"
435 | checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913"
436 | dependencies = [
437 |  "proc-macro2",
438 |  "quote",
439 |  "syn",
440 | ]
441 | 
442 | [[package]]
443 | name = "unicode-ident"
444 | version = "1.0.19"
445 | source = "registry+https://github.com/rust-lang/crates.io-index"
446 | checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d"
447 | 
448 | [[package]]
449 | name = "utf8-ranges"
450 | version = "1.0.5"
451 | source = "registry+https://github.com/rust-lang/crates.io-index"
452 | checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba"
453 | 
454 | [[package]]
455 | name = "wasm-bindgen"
456 | version = "0.2.104"
457 | source = "registry+https://github.com/rust-lang/crates.io-index"
458 | checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d"
459 | dependencies = [
460 |  "cfg-if",
461 |  "once_cell",
462 |  "rustversion",
463 |  "wasm-bindgen-macro",
464 |  "wasm-bindgen-shared",
465 | ]
466 | 
467 | [[package]]
468 | name = "wasm-bindgen-backend"
469 | version = "0.2.104"
470 | source = "registry+https://github.com/rust-lang/crates.io-index"
471 | checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19"
472 | dependencies = [
473 |  "bumpalo",
474 |  "log",
475 |  "proc-macro2",
476 |  "quote",
477 |  "syn",
478 |  "wasm-bindgen-shared",
479 | ]
480 | 
481 | [[package]]
482 | name = "wasm-bindgen-macro"
483 | version = "0.2.104"
484 | source = "registry+https://github.com/rust-lang/crates.io-index"
485 | checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119"
486 | dependencies = [
487 |  "quote",
488 |  "wasm-bindgen-macro-support",
489 | ]
490 | 
491 | [[package]]
492 | name = "wasm-bindgen-macro-support"
493 | version = "0.2.104"
494 | source = "registry+https://github.com/rust-lang/crates.io-index"
495 | checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7"
496 | dependencies = [
497 |  "proc-macro2",
498 |  "quote",
499 |  "syn",
500 |  "wasm-bindgen-backend",
501 |  "wasm-bindgen-shared",
502 | ]
503 | 
504 | [[package]]
505 | name = "wasm-bindgen-shared"
506 | version = "0.2.104"
507 | source = "registry+https://github.com/rust-lang/crates.io-index"
508 | checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1"
509 | dependencies = [
510 |  "unicode-ident",
511 | ]
512 | 
513 | [[package]]
514 | name = "wasm-encoder"
515 | version = "0.240.0"
516 | source = "registry+https://github.com/rust-lang/crates.io-index"
517 | checksum = "06d642d8c5ecc083aafe9ceb32809276a304547a3a6eeecceb5d8152598bc71f"
518 | dependencies = [
519 |  "leb128fmt",
520 |  "wasmparser",
521 | ]
522 | 
523 | [[package]]
524 | name = "wasmparser"
525 | version = "0.240.0"
526 | source = "registry+https://github.com/rust-lang/crates.io-index"
527 | checksum = "b722dcf61e0ea47440b53ff83ccb5df8efec57a69d150e4f24882e4eba7e24a4"
528 | dependencies = [
529 |  "bitflags",
530 |  "hashbrown 0.15.5",
531 |  "indexmap",
532 |  "semver",
533 |  "serde",
534 | ]
535 | 


--------------------------------------------------------------------------------
/core/src/tests.rs:
--------------------------------------------------------------------------------
  1 | mod tests {
  2 | 	use crate::Index;
  3 | 	use crate::{Document, FsstStrVec};
  4 | 	use crate::{build_index, search};
  5 | 
  6 | 	// ========================================================================
  7 | 	// SECTION 1: Basic Sanity Tests - FsstStrVec
  8 | 	// ========================================================================
  9 | 
 10 | 	/// Smoke test: four short words survive a trip through `FsstStrVec`.
 11 | 	#[test]
 12 | 	fn test_fsst_str_vec_basic() {
 13 | 		let words = vec!["hello", "world", "rust", "search"];
 14 | 		let packed = FsstStrVec::from_strings(&words);
 15 | 
 16 | 		// Length and per-slot contents must mirror the input order.
 17 | 		assert_eq!(packed.len(), 4);
 18 | 
 19 | 		assert_eq!(packed.get(0).as_deref(), Some("hello"));
 20 | 		assert_eq!(packed.get(1).as_deref(), Some("world"));
 21 | 		assert_eq!(packed.get(2).as_deref(), Some("rust"));
 22 | 		assert_eq!(packed.get(3).as_deref(), Some("search"));
 23 | 	}
 24 | 
 25 | 	/// Indices past the end of an `FsstStrVec` must yield `None`.
 26 | 	#[test]
 27 | 	fn test_fsst_str_vec_out_of_bounds() {
 28 | 		let pair = vec!["hello", "world"];
 29 | 		let packed = FsstStrVec::from_strings(&pair);
 30 | 		// The input holds two strings, so both probed indices lie beyond it.
 31 | 		assert!(packed.get(5).is_none());
 32 | 		assert!(packed.get(100).is_none());
 33 | 	}
 34 | 
 35 | 	/// An `FsstStrVec` built from no strings is empty and has no slot 0.
 36 | 	#[test]
 37 | 	fn test_fsst_str_vec_empty() {
 38 | 		let nothing = Vec::<&str>::new();
 39 | 		let packed = FsstStrVec::from_strings(&nothing);
 40 | 
 41 | 		assert_eq!(packed.len(), 0);
 42 | 		assert!(packed.get(0).is_none());
 43 | 	}
 44 | 
 45 | 	/// A one-element `FsstStrVec` exposes slot 0 and nothing after it.
 46 | 	#[test]
 47 | 	fn test_fsst_str_vec_single_item() {
 48 | 		let only = vec!["solo"];
 49 | 		let packed = FsstStrVec::from_strings(&only);
 50 | 
 51 | 		assert_eq!(packed.len(), 1);
 52 | 		assert_eq!(packed.get(0).as_deref(), Some("solo"));
 53 | 		assert!(packed.get(1).is_none());
 54 | 	}
 55 | 
 56 | 	/// Sentence-length inputs must decode to exactly the text that went in.
 57 | 	#[test]
 58 | 	fn test_fsst_str_vec_long_strings() {
 59 | 		let sentences = vec![
 60 | 			"This is a much longer string that should compress well with FSST",
 61 | 			"Another long string with similar patterns and repeated words",
 62 | 			"The third long string continues the pattern with more text",
 63 | 		];
 64 | 		let packed = FsstStrVec::from_strings(&sentences);
 65 | 
 66 | 		assert_eq!(packed.len(), 3);
 67 | 		assert_eq!(packed.get(0).as_deref(), Some(sentences[0]));
 68 | 		assert_eq!(packed.get(1).as_deref(), Some(sentences[1]));
 69 | 		assert_eq!(packed.get(2).as_deref(), Some(sentences[2]));
 70 | 	}
 71 | 
 72 | 	/// Non-ASCII text (CJK, emoji, accents) must come back unchanged.
 73 | 	#[test]
 74 | 	fn test_fsst_str_vec_unicode() {
 75 | 		let samples = vec!["Hello 世界", "Rust 🦀", "Café ☕"];
 76 | 		let packed = FsstStrVec::from_strings(&samples);
 77 | 
 78 | 		assert_eq!(packed.len(), 3);
 79 | 		assert_eq!(packed.get(0).as_deref(), Some("Hello 世界"));
 80 | 		assert_eq!(packed.get(1).as_deref(), Some("Rust 🦀"));
 81 | 		assert_eq!(packed.get(2).as_deref(), Some("Café ☕"));
 82 | 	}
 83 | 
 84 | 	// ========================================================================
 85 | 	// SECTION 2: Document Structure Tests
 86 | 	// ========================================================================
 87 | 
 88 | 	/// A `Document` literal keeps every field exactly as it was written.
 89 | 	#[test]
 90 | 	fn test_document_creation() {
 91 | 		let tags = vec![String::from("test"), String::from("document")];
 92 | 		let doc = Document {
 93 | 			title: String::from("Test Document"),
 94 | 			category: String::from("Test"),
 95 | 			href: String::from("/test"),
 96 | 			body: String::from("This is a test document body"),
 97 | 			keywords: Some(tags.clone()),
 98 | 		};
 99 | 
100 | 		assert_eq!(doc.title, "Test Document");
101 | 		assert_eq!(doc.category, "Test");
102 | 		assert_eq!(doc.href, "/test");
103 | 		assert_eq!(doc.body, "This is a test document body");
104 | 
105 | 		// The optional keyword list is stored as given.
106 | 		assert_eq!(doc.keywords, Some(tags));
107 | 	}
108 | 
109 | 	/// Encoding a `Document` as JSON and decoding it again keeps all fields.
110 | 	#[test]
111 | 	fn test_document_serialization() {
112 | 		let original = Document {
113 | 			title: String::from("Test"),
114 | 			category: String::from("Category"),
115 | 			href: String::from("/link"),
116 | 			body: String::from("Body text"),
117 | 			keywords: Some(vec![String::from("test"), String::from("example")]),
118 | 		};
119 | 
120 | 		let json = serde_json::to_string(&original).unwrap();
121 | 		let decoded = serde_json::from_str::<Document>(&json).unwrap();
122 | 
123 | 		assert_eq!(original.title, decoded.title);
124 | 		assert_eq!(original.category, decoded.category);
125 | 		assert_eq!(original.href, decoded.href);
126 | 		assert_eq!(original.body, decoded.body);
127 | 		assert_eq!(original.keywords, decoded.keywords);
128 | 	}
129 | 
130 | 	// ========================================================================
131 | 	// SECTION 3: Index Building Tests
132 | 	// ========================================================================
133 | 
134 | 	/// Building an index from two documents stores four strings per document.
135 | 	#[test]
136 | 	fn test_build_index_simple() {
137 | 		let documents = vec![
138 | 			Document {
139 | 				title: "Rust Programming".to_string(),
140 | 				category: "Documentation".to_string(),
141 | 				href: "/docs/rust".to_string(),
142 | 				body: "Learn Rust programming language".to_string(),
143 | 				keywords: Some(vec!["rust".to_string(), "programming".to_string()]),
144 | 			},
145 | 			Document {
146 | 				title: "Python Guide".to_string(),
147 | 				category: "Documentation".to_string(),
148 | 				href: "/docs/python".to_string(),
149 | 				body: "Python is a versatile programming language".to_string(),
150 | 				keywords: Some(vec!["python".to_string(), "guide".to_string()]),
151 | 			},
152 | 		];
153 | 
154 | 		// `expect` (rather than `assert!(is_ok())` + `unwrap`) keeps the error
155 | 		// value in the panic message if the build ever fails.
156 | 		let index = build_index(documents).expect("build_index should succeed");
157 | 
158 | 		// 2 documents * 4 stored strings each.
159 | 		assert_eq!(index.document_strings.len(), 8);
160 | 	}
161 | 
162 | 	/// An empty document list still builds, yielding an index with no strings.
163 | 	#[test]
164 | 	fn test_build_index_empty() {
165 | 		let documents: Vec<Document> = Vec::new();
166 | 
167 | 		// `expect` surfaces the underlying error if the build fails, which a
168 | 		// bare `assert!(is_ok())` would hide.
169 | 		let index = build_index(documents).expect("empty input should build");
170 | 		assert_eq!(index.document_strings.len(), 0);
171 | 	}
172 | 
173 | 	/// One document contributes exactly four strings to the index.
174 | 	#[test]
175 | 	fn test_build_index_single_document() {
176 | 		let documents = vec![Document {
177 | 			title: "Single Document".to_string(),
178 | 			category: "Test".to_string(),
179 | 			href: "/single".to_string(),
180 | 			body: "This is the only document".to_string(),
181 | 			keywords: Some(vec!["single".to_string(), "document".to_string()]),
182 | 		}];
183 | 
184 | 		// Unwrap via `expect` so a build failure reports the actual error.
185 | 		let index = build_index(documents).expect("build_index should succeed");
186 | 
187 | 		// Stored strings: title, category, href, body.
188 | 		assert_eq!(index.document_strings.len(), 4);
189 | 	}
190 | 
191 | 	/// Two documents sharing a title (and keywords) must not break indexing.
192 | 	#[test]
193 | 	fn test_build_index_duplicate_titles() {
194 | 		let documents = vec![
195 | 			Document {
196 | 				title: "Getting Started".to_string(),
197 | 				category: "Guide".to_string(),
198 | 				href: "/guide1".to_string(),
199 | 				body: "First guide".to_string(),
200 | 				keywords: Some(vec!["getting".to_string(), "started".to_string()]),
201 | 			},
202 | 			Document {
203 | 				title: "Getting Started".to_string(),
204 | 				category: "Tutorial".to_string(),
205 | 				href: "/tutorial1".to_string(),
206 | 				body: "First tutorial".to_string(),
207 | 				keywords: Some(vec!["getting".to_string(), "started".to_string()]),
208 | 			},
209 | 		];
210 | 
211 | 		// Only success is checked; `expect` prints the error if the build fails.
212 | 		build_index(documents).expect("duplicate titles should be accepted");
213 | 	}
214 | 
215 | 	// ========================================================================
216 | 	// SECTION 4: Index Serialization Tests
217 | 	// ========================================================================
218 | 
219 | 	/// An index written with `to_bytes` can be read back with `from_bytes`.
220 | 	#[test]
221 | 	fn test_index_serialization() {
222 | 		let documents = vec![Document {
223 | 			title: "Test Document".to_string(),
224 | 			category: "Test".to_string(),
225 | 			href: "/test".to_string(),
226 | 			body: "This is a test document".to_string(),
227 | 			keywords: Some(vec!["test".to_string(), "document".to_string()]),
228 | 		}];
229 | 
230 | 		let index = build_index(documents).unwrap();
231 | 
232 | 		// Serialize; the encoded form must contain at least some bytes.
233 | 		let buffer = index.to_bytes().unwrap();
234 | 		assert!(!buffer.is_empty());
235 | 
236 | 		// Deserialize via `expect` so a decoding failure reports its error
237 | 		// instead of a bare "assertion failed: deserialized.is_ok()".
238 | 		let deserialized_index =
239 | 			Index::from_bytes(&buffer).expect("serialized index should deserialize");
240 | 
241 | 		// The restored index holds the same number of document strings.
242 | 		assert_eq!(
243 | 			deserialized_index.document_strings.len(),
244 | 			index.document_strings.len()
245 | 		);
246 | 	}
247 | 
248 | 	/// Serializing and deserializing twice in a row keeps the string count.
249 | 	#[test]
250 | 	fn test_index_serialization_roundtrip() {
251 | 		let corpus = vec![
252 | 			Document {
253 | 				title: String::from("Document One"),
254 | 				category: String::from("Category A"),
255 | 				href: String::from("/doc1"),
256 | 				body: String::from("Content for document one"),
257 | 				keywords: Some(vec![String::from("document"), String::from("one")]),
258 | 			},
259 | 			Document {
260 | 				title: String::from("Document Two"),
261 | 				category: String::from("Category B"),
262 | 				href: String::from("/doc2"),
263 | 				body: String::from("Content for document two"),
264 | 				keywords: Some(vec![String::from("document"), String::from("two")]),
265 | 			},
266 | 		];
267 | 
268 | 		let built = build_index(corpus).unwrap();
269 | 
270 | 		// Pass 1: built index -> bytes -> index.
271 | 		let first_bytes = built.to_bytes().unwrap();
272 | 		let first_copy = Index::from_bytes(&first_bytes).unwrap();
273 | 
274 | 		// Pass 2: repeat the trip starting from the first copy.
275 | 		let second_bytes = first_copy.to_bytes().unwrap();
276 | 		let second_copy = Index::from_bytes(&second_bytes).unwrap();
277 | 
278 | 		// After both passes the number of stored strings is unchanged.
279 | 		assert_eq!(
280 | 			second_copy.document_strings.len(),
281 | 			built.document_strings.len()
282 | 		);
283 | 	}
284 | 
285 | 	// ========================================================================
286 | 	// SECTION 5: Simple Search Tests
287 | 	// ========================================================================
288 | 
289 | 	/// A one-word query puts the document about that word at the top.
290 | 	#[test]
291 | 	fn test_search_single_word() {
292 | 		let corpus = vec![
293 | 			Document {
294 | 				title: String::from("Rust Programming"),
295 | 				category: String::from("Documentation"),
296 | 				href: String::from("/docs/rust"),
297 | 				body: String::from("Learn Rust programming language"),
298 | 				keywords: Some(vec![String::from("rust"), String::from("programming")]),
299 | 			},
300 | 			Document {
301 | 				title: String::from("Python Guide"),
302 | 				category: String::from("Documentation"),
303 | 				href: String::from("/docs/python"),
304 | 				body: String::from("Python is a versatile programming language"),
305 | 				keywords: Some(vec![String::from("python"), String::from("guide")]),
306 | 			},
307 | 		];
308 | 
309 | 		let index = build_index(corpus).unwrap();
310 | 		let hits = search(&index, "Rust", 10).unwrap();
311 | 
312 | 		assert!(!hits.is_empty());
313 | 		assert_eq!(hits[0].title, "Rust Programming");
314 | 		assert_eq!(hits[0].href, "/docs/rust");
315 | 	}
316 | 
317 | 	/// Query casing must not matter: every spelling finds the same document.
318 | 	#[test]
319 | 	fn test_search_case_insensitive() {
320 | 		let corpus = vec![Document {
321 | 			title: String::from("JavaScript Tutorial"),
322 | 			category: String::from("Tutorials"),
323 | 			href: String::from("/tutorials/javascript"),
324 | 			body: String::from("Learn JavaScript programming"),
325 | 			keywords: Some(vec![String::from("javascript"), String::from("tutorial")]),
326 | 		}];
327 | 
328 | 		let index = build_index(corpus).unwrap();
329 | 
330 | 		let lower = search(&index, "javascript", 10).unwrap();
331 | 		let upper = search(&index, "JAVASCRIPT", 10).unwrap();
332 | 		let mixed = search(&index, "JavaScript", 10).unwrap();
333 | 
334 | 		assert!(!lower.is_empty());
335 | 		assert!(!upper.is_empty());
336 | 		assert!(!mixed.is_empty());
337 | 
338 | 		// The top hit is the single indexed page, whatever the casing.
339 | 		assert_eq!(lower[0].href, "/tutorials/javascript");
340 | 		assert_eq!(upper[0].href, "/tutorials/javascript");
341 | 		assert_eq!(mixed[0].href, "/tutorials/javascript");
342 | 	}
343 | 
344 | 	/// A term that appears in no document produces an empty result list.
345 | 	#[test]
346 | 	fn test_search_no_results() {
347 | 		let corpus = vec![Document {
348 | 			title: String::from("Rust Programming"),
349 | 			category: String::from("Documentation"),
350 | 			href: String::from("/docs/rust"),
351 | 			body: String::from("Learn Rust programming language"),
352 | 			keywords: Some(vec![String::from("rust"), String::from("programming")]),
353 | 		}];
354 | 
355 | 		let index = build_index(corpus).unwrap();
356 | 		let hits = search(&index, "NonexistentKeyword", 10).unwrap();
357 | 
358 | 		assert!(hits.is_empty());
359 | 	}
360 | 
361 | 	#[test]
362 | 	fn test_search_empty_query() {
363 | 		// Test searching with an empty query
364 | 		let documents = vec![Document {
365 | 			title: "Test Document".to_string(),
366 | 			category: "Test".to_string(),
367 | 			href: "/test".to_string(),
368 | 			body: "Test content".to_string(),
369 | 			keywords: Some(vec!["test".to_string(), "document".to_string()]),
370 | 		}];
371 | 
372 | 		let index = build_index(documents).unwrap();
373 | 		let results = search(&index, "", 10).unwrap();
374 | 
375 | 		// Empty query should return no results (or possibly all results depending on implementation)
376 | 		// Just verify it doesn't crash
377 | 		assert!(results.len() <= 1);
378 | 	}
379 | 
380 | 	// ========================================================================
381 | 	// SECTION 6: Multi-word and Phrase Search Tests
382 | 	// ========================================================================
383 | 
384 | 	/// A two-word query returns every document that is about both words.
385 | 	#[test]
386 | 	fn test_search_multiple_words() {
387 | 		let corpus = vec![
388 | 			Document {
389 | 				title: String::from("VS Code Extensions"),
390 | 				category: String::from("Documentation"),
391 | 				href: String::from("/docs/extensions"),
392 | 				body: String::from("Learn how to create VS Code extensions with comprehensive guides"),
393 | 				keywords: Some(vec![
394 | 					String::from("vs"),
395 | 					String::from("code"),
396 | 					String::from("extensions"),
397 | 				]),
398 | 			},
399 | 			Document {
400 | 				title: String::from("VS Code Settings"),
401 | 				category: String::from("Documentation"),
402 | 				href: String::from("/docs/settings"),
403 | 				body: String::from("Configure your VS Code settings for optimal development experience"),
404 | 				keywords: Some(vec![
405 | 					String::from("vs"),
406 | 					String::from("code"),
407 | 					String::from("settings"),
408 | 				]),
409 | 			},
410 | 			Document {
411 | 				title: String::from("Python Guide"),
412 | 				category: String::from("Documentation"),
413 | 				href: String::from("/docs/python"),
414 | 				body: String::from("Python is a versatile programming language"),
415 | 				keywords: Some(vec![String::from("python"), String::from("guide")]),
416 | 			},
417 | 		];
418 | 
419 | 		let index = build_index(corpus).unwrap();
420 | 		let hits = search(&index, "VS Code", 10).unwrap();
421 | 
422 | 		// Both VS Code pages must be present; their relative order is not checked.
423 | 		assert!(hits.len() >= 2);
424 | 		assert!(hits.iter().any(|doc| doc.href == "/docs/extensions"));
425 | 		assert!(hits.iter().any(|doc| doc.href == "/docs/settings"));
426 | 	}
427 | 
428 | 	/// Searching for "debug" must hit a document whose keyword is "debugging".
429 | 	#[test]
430 | 	fn test_search_partial_word_match() {
431 | 		let corpus = vec![Document {
432 | 			title: String::from("Debugging in VS Code"),
433 | 			category: String::from("Documentation"),
434 | 			href: String::from("/docs/debugging"),
435 | 			body: String::from("Debug your applications with powerful debugging tools"),
436 | 			keywords: Some(vec![
437 | 				String::from("debugging"),
438 | 				String::from("vs"),
439 | 				String::from("code"),
440 | 			]),
441 | 		}];
442 | 
443 | 		let index = build_index(corpus).unwrap();
444 | 		let hits = search(&index, "debug", 10).unwrap();
445 | 
446 | 		// The query is shorter than the keyword "debugging" yet must still hit.
447 | 		assert!(!hits.is_empty());
448 | 	}
449 | 
450 | 	// ========================================================================
451 | 	// SECTION 7: Ranking and Relevance Tests
452 | 	// ========================================================================
453 | 
454 | 	/// A document with the query word in its title outranks a body-only match.
455 | 	#[test]
456 | 	fn test_search_title_match_ranks_higher() {
457 | 		let corpus = vec![
458 | 			Document {
459 | 				title: String::from("Python Tutorial"),
460 | 				category: String::from("Tutorials"),
461 | 				href: String::from("/tutorials/python"),
462 | 				body: String::from("Learn programming with this tutorial"),
463 | 				keywords: Some(vec![String::from("python"), String::from("tutorial")]),
464 | 			},
465 | 			Document {
466 | 				title: String::from("Getting Started"),
467 | 				category: String::from("Documentation"),
468 | 				href: String::from("/docs/start"),
469 | 				body: String::from("This guide covers Python basics and advanced features"),
470 | 				keywords: Some(vec![String::from("getting"), String::from("started")]),
471 | 			},
472 | 		];
473 | 
474 | 		let index = build_index(corpus).unwrap();
475 | 		let hits = search(&index, "Python", 10).unwrap();
476 | 
477 | 		// The page titled "Python Tutorial" is expected in first position.
478 | 		assert!(!hits.is_empty());
479 | 		assert_eq!(hits[0].href, "/tutorials/python");
480 | 	}
481 | 
482 | 	#[test]
483 | 	fn test_search_multiple_keyword_matches() {
484 | 		// Test that documents matching multiple keywords rank higher
485 | 		let documents = vec![
486 | 			Document {
487 | 				title: "VS Code Debugging".to_string(),
488 | 				category: "Documentation".to_string(),
489 | 				href: "/docs/debugging".to_string(),
490 | 				body: "Debug VS Code extensions".to_string(),
491 | 				keywords: Some(vec![
492 | 					"vs".to_string(),
493 | 					"code".to_string(),
494 | 					"debugging".to_string(),
495 | 				]),
496 | 			},
497 | 			Document {
498 | 				title: "VS Code Overview".to_string(),
499 | 				category: "Documentation".to_string(),
500 | 				href: "/docs/overview".to_string(),
501 | 				body: "Introduction to the editor".to_string(),
502 | 				keywords: Some(vec![
503 | 					"vs".to_string(),
504 | 					"code".to_string(),
505 | 					"overview".to_string(),
506 | 				]),
507 | 			},
508 | 			Document {
509 | 				title: "Debugging Guide".to_string(),
510 | 				category: "Tutorials".to_string(),
511 | 				href: "/tutorials/debug".to_string(),
512 | 				body: "General debugging techniques".to_string(),
513 | 				keywords: Some(vec!["debugging".to_string(), "guide".to_string()]),
514 | 			},
515 | 		];
516 | 
517 | 		let index = build_index(documents).unwrap();
518 | 		let results = search(&index, "VS Code debugging", 10).unwrap();
519 | 
520 | 		// Document with all three keywords should rank first
521 | 		assert!(!results.is_empty());
522 | 		assert_eq!(results[0].href, "/docs/debugging");
523 | 	}
524 | 
525 | 	#[test]
526 | 	fn test_search_max_results_limit() {
527 | 		// Test that max_results parameter limits results correctly
528 | 		let documents = vec![
529 | 			Document {
530 | 				title: "Guide One".to_string(),
531 | 				category: "Guides".to_string(),
532 | 				href: "/guide1".to_string(),
533 | 				body: "First guide about programming".to_string(),
534 | 				keywords: Some(vec!["guide".to_string(), "one".to_string()]),
535 | 			},
536 | 			Document {
537 | 				title: "Guide Two".to_string(),
538 | 				category: "Guides".to_string(),
539 | 				href: "/guide2".to_string(),
540 | 				body: "Second guide about programming".to_string(),
541 | 				keywords: Some(vec!["guide".to_string(), "two".to_string()]),
542 | 			},
543 | 			Document {
544 | 				title: "Guide Three".to_string(),
545 | 				category: "Guides".to_string(),
546 | 				href: "/guide3".to_string(),
547 | 				body: "Third guide about programming".to_string(),
548 | 				keywords: Some(vec!["guide".to_string(), "three".to_string()]),
549 | 			},
550 | 			Document {
551 | 				title: "Guide Four".to_string(),
552 | 				category: "Guides".to_string(),
553 | 				href: "/guide4".to_string(),
554 | 				body: "Fourth guide about programming".to_string(),
555 | 				keywords: Some(vec!["guide".to_string(), "four".to_string()]),
556 | 			},
557 | 		];
558 | 
559 | 		let index = build_index(documents).unwrap();
560 | 
561 | 		let results_2 = search(&index, "guide", 2).unwrap();
562 | 		let results_3 = search(&index, "guide", 3).unwrap();
563 | 		let results_10 = search(&index, "guide", 10).unwrap();
564 | 
565 | 		assert!(results_2.len() <= 2);
566 | 		assert!(results_3.len() <= 3);
567 | 		assert!(results_10.len() <= 10);
568 | 	}
569 | 
570 | 	// ========================================================================
571 | 	// SECTION 8: Complex Search Query Tests
572 | 	// ========================================================================
573 | 
574 | 	#[test]
575 | 	fn test_search_technical_terms() {
576 | 		// Test searching for technical terms and acronyms
577 | 		let documents = vec![
578 | 			Document {
579 | 				title: "TypeScript Configuration".to_string(),
580 | 				category: "Documentation".to_string(),
581 | 				href: "/docs/typescript".to_string(),
582 | 				body: "Configure TypeScript with tsconfig.json for your project".to_string(),
583 | 				keywords: Some(vec!["typescript".to_string(), "configuration".to_string()]),
584 | 			},
585 | 			Document {
586 | 				title: "JavaScript Basics".to_string(),
587 | 				category: "Tutorials".to_string(),
588 | 				href: "/tutorials/javascript".to_string(),
589 | 				body: "Learn JavaScript fundamentals".to_string(),
590 | 				keywords: Some(vec!["javascript".to_string(), "basics".to_string()]),
591 | 			},
592 | 			Document {
593 | 				title: "Language Support".to_string(),
594 | 				category: "Documentation".to_string(),
595 | 				href: "/docs/languages".to_string(),
596 | 				body: "VS Code supports TypeScript, JavaScript, and many other languages".to_string(),
597 | 				keywords: Some(vec!["language".to_string(), "support".to_string()]),
598 | 			},
599 | 		];
600 | 
601 | 		let index = build_index(documents).unwrap();
602 | 		let results = search(&index, "TypeScript", 10).unwrap();
603 | 
604 | 		assert!(!results.is_empty());
605 | 		assert!(results.iter().any(|d| d.href == "/docs/typescript"));
606 | 	}
607 | 
608 | 	#[test]
609 | 	fn test_search_with_special_characters() {
610 | 		// Test searching with special characters
611 | 		let documents = vec![
612 | 			Document {
613 | 				title: "C++ Programming".to_string(),
614 | 				category: "Documentation".to_string(),
615 | 				href: "/docs/cpp".to_string(),
616 | 				body: "Learn C++ programming language".to_string(),
617 | 				keywords: Some(vec!["c++".to_string(), "programming".to_string()]),
618 | 			},
619 | 			Document {
620 | 				title: "C# Guide".to_string(),
621 | 				category: "Documentation".to_string(),
622 | 				href: "/docs/csharp".to_string(),
623 | 				body: "C# development with .NET".to_string(),
624 | 				keywords: Some(vec!["c#".to_string(), "guide".to_string()]),
625 | 			},
626 | 		];
627 | 
628 | 		let index = build_index(documents).unwrap();
629 | 		let results_cpp = search(&index, "C++", 10);
630 | 		let results_csharp = search(&index, "C#", 10);
631 | 
632 | 		// Should handle special characters gracefully
633 | 		assert!(results_cpp.is_ok() as bool);
634 | 		assert!(results_csharp.is_ok() as bool);
635 | 	}
636 | 
637 | 	#[test]
638 | 	fn test_search_compound_keywords() {
639 | 		// Test searching for compound keywords and multi-word phrases
640 | 		let documents = vec![
641 | 			Document {
642 | 				title: "Remote Development Setup".to_string(),
643 | 				category: "Tutorials".to_string(),
644 | 				href: "/tutorials/remote-dev".to_string(),
645 | 				body: "Set up remote development environment for distributed teams".to_string(),
646 | 				keywords: Some(vec![
647 | 					"remote".to_string(),
648 | 					"development".to_string(),
649 | 					"setup".to_string(),
650 | 				]),
651 | 			},
652 | 			Document {
653 | 				title: "Development Environment".to_string(),
654 | 				category: "Documentation".to_string(),
655 | 				href: "/docs/environment".to_string(),
656 | 				body: "Configure your local development environment".to_string(),
657 | 				keywords: Some(vec!["development".to_string(), "environment".to_string()]),
658 | 			},
659 | 			Document {
660 | 				title: "Remote Connections".to_string(),
661 | 				category: "Documentation".to_string(),
662 | 				href: "/docs/remote".to_string(),
663 | 				body: "Connect to remote servers and containers".to_string(),
664 | 				keywords: Some(vec!["remote".to_string(), "connections".to_string()]),
665 | 			},
666 | 		];
667 | 
668 | 		let index = build_index(documents).unwrap();
669 | 		let results = search(&index, "remote development", 10).unwrap();
670 | 
671 | 		// Should find the document that has both keywords together
672 | 		assert!(!results.is_empty());
673 | 		assert_eq!(results[0].href, "/tutorials/remote-dev");
674 | 	}
675 | 
676 | 	#[test]
677 | 	fn test_search_with_stopwords() {
678 | 		// Test that common stop words don't interfere with search
679 | 		let documents = vec![Document {
680 | 			title: "Getting Started with VS Code".to_string(),
681 | 			category: "Tutorials".to_string(),
682 | 			href: "/tutorials/start".to_string(),
683 | 			body: "This is a guide to help you get started with the editor".to_string(),
684 | 			keywords: Some(vec![
685 | 				"getting".to_string(),
686 | 				"started".to_string(),
687 | 				"vs".to_string(),
688 | 				"code".to_string(),
689 | 			]),
690 | 		}];
691 | 
692 | 		let index = build_index(documents).unwrap();
693 | 		let results = search(&index, "getting started with vscode", 10).unwrap();
694 | 
695 | 		// Should find results despite stop words like "with", "the", "a"
696 | 		assert!(!results.is_empty());
697 | 	}
698 | 
699 | 	#[test]
700 | 	fn test_search_with_numbers() {
701 | 		// Test searching with version numbers and numeric values
702 | 		let documents = vec![
703 | 			Document {
704 | 				title: "Node.js 18 Features".to_string(),
705 | 				category: "Updates".to_string(),
706 | 				href: "/updates/nodejs18".to_string(),
707 | 				body: "New features in Node.js version 18 release".to_string(),
708 | 				keywords: Some(vec![
709 | 					"node.js".to_string(),
710 | 					"18".to_string(),
711 | 					"features".to_string(),
712 | 				]),
713 | 			},
714 | 			Document {
715 | 				title: "Node.js 16 Support".to_string(),
716 | 				category: "Updates".to_string(),
717 | 				href: "/updates/nodejs16".to_string(),
718 | 				body: "Long-term support for Node.js 16".to_string(),
719 | 				keywords: Some(vec![
720 | 					"node.js".to_string(),
721 | 					"16".to_string(),
722 | 					"support".to_string(),
723 | 				]),
724 | 			},
725 | 		];
726 | 
727 | 		let index = build_index(documents).unwrap();
728 | 		let results = search(&index, "nodejs 18", 10).unwrap();
729 | 
730 | 		assert!(!results.is_empty());
731 | 		// Should find the Node.js 18 document
732 | 		assert!(results.iter().any(|d| d.href.contains("nodejs18")));
733 | 	}
734 | 
735 | 	#[test]
736 | 	fn test_search_long_query() {
737 | 		// Test with a longer, more natural language query
738 | 		let documents = vec![
739 | 			Document {
740 | 				title: "Remote SSH Extension".to_string(),
741 | 				category: "Extensions".to_string(),
742 | 				href: "/extensions/remote-ssh".to_string(),
743 | 				body: "Connect to remote servers via SSH and develop directly on remote machines"
744 | 					.to_string(),
745 | 				keywords: Some(vec![
746 | 					"remote".to_string(),
747 | 					"ssh".to_string(),
748 | 					"extension".to_string(),
749 | 				]),
750 | 			},
751 | 			Document {
752 | 				title: "SSH Key Setup".to_string(),
753 | 				category: "Documentation".to_string(),
754 | 				href: "/docs/ssh-keys".to_string(),
755 | 				body: "Configure SSH keys for secure remote connections".to_string(),
756 | 				keywords: Some(vec![
757 | 					"ssh".to_string(),
758 | 					"key".to_string(),
759 | 					"setup".to_string(),
760 | 				]),
761 | 			},
762 | 		];
763 | 
764 | 		let index = build_index(documents).unwrap();
765 | 		let results = search(&index, "how do i connect to a remote server using ssh", 10).unwrap();
766 | 
767 | 		// Should extract relevant keywords and find documents
768 | 		assert!(!results.is_empty());
769 | 	}
770 | 
771 | 	// ========================================================================
772 | 	// SECTION 9: Edge Cases and Stress Tests
773 | 	// ========================================================================
774 | 
775 | 	#[test]
776 | 	fn test_search_many_documents() {
777 | 		// Test with a larger number of documents
778 | 		let mut documents = Vec::new();
779 | 		for i in 0..100 {
780 | 			documents.push(Document {
781 | 				title: format!("Document {}", i).to_string(),
782 | 				category: format!("Category {}", i % 10).to_string(),
783 | 				href: format!("/doc{}", i).to_string(),
784 | 				body: format!("This is document number {} with some content", i).to_string(),
785 | 				keywords: Some(vec![format!("document{}", i).to_string()]),
786 | 			});
787 | 		}
788 | 
789 | 		// Add a special document to search for
790 | 		documents.push(Document {
791 | 			title: "Special Search Target".to_string(),
792 | 			category: "Test".to_string(),
793 | 			href: "/special".to_string(),
794 | 			body: "This document should be easy to find".to_string(),
795 | 			keywords: Some(vec!["special".to_string(), "target".to_string()]),
796 | 		});
797 | 
798 | 		let index = build_index(documents).unwrap();
799 | 		let results = search(&index, "special target", 10).unwrap();
800 | 
801 | 		assert!(!results.is_empty());
802 | 		assert_eq!(results[0].href, "/special");
803 | 	}
804 | 
805 | 	#[test]
806 | 	fn test_search_empty_fields() {
807 | 		// Test with documents that have empty fields
808 | 		let documents = vec![
809 | 			Document {
810 | 				title: "".to_string(),
811 | 				category: "Empty Title".to_string(),
812 | 				href: "/empty1".to_string(),
813 | 				body: "This document has no title".to_string(),
814 | 				keywords: Some(vec!["empty".to_string()]),
815 | 			},
816 | 			Document {
817 | 				title: "Empty Body".to_string(),
818 | 				category: "Test".to_string(),
819 | 				href: "/empty2".to_string(),
820 | 				body: "".to_string(),
821 | 				keywords: Some(vec!["empty".to_string(), "body".to_string()]),
822 | 			},
823 | 		];
824 | 
825 | 		let index = build_index(documents);
826 | 		assert!(index.is_ok());
827 | 
828 | 		let results = search(&index.unwrap(), "empty", 10).unwrap();
829 | 		// Should handle empty fields gracefully
830 | 		assert!(!results.is_empty());
831 | 	}
832 | 
833 | 	#[test]
834 | 	fn test_search_whitespace_handling() {
835 | 		// Test that extra whitespace doesn't break search
836 | 		let documents = vec![Document {
837 | 			title: "Whitespace   Test".to_string(),
838 | 			category: "Test".to_string(),
839 | 			href: "/whitespace".to_string(),
840 | 			body: "Multiple   spaces   between   words".to_string(),
841 | 			keywords: Some(vec!["whitespace".to_string(), "test".to_string()]),
842 | 		}];
843 | 
844 | 		let index = build_index(documents).unwrap();
845 | 		let results = search(&index, "  whitespace  test  ", 10).unwrap();
846 | 
847 | 		assert!(!results.is_empty());
848 | 	}
849 | 
850 | 	#[test]
851 | 	fn test_search_with_typo() -> Result<(), Box<dyn std::error::Error>> {
852 | 		let document_strings = FsstStrVec::from_strings(&vec![
853 | 			"Document 1",
854 | 			"Docs",
855 | 			"/doc1",
856 | 			"This is the first document.",
857 | 			"Document 2",
858 | 			"Docs",
859 | 			"/doc2",
860 | 			"This is the second document.",
861 | 			"Document 3",
862 | 			"Docs",
863 | 			"/doc3",
864 | 			"This is the third document.",
865 | 		]);
866 | 
867 | 		let keyword_to_documents: Vec<Vec<(usize, u8)>> = vec![
868 | 			vec![(1, 1)],          // "language" appears in doc 1
869 | 			vec![(0, 10), (2, 4)], // "programming" appears in doc 0 and 2
870 | 			vec![(0, 5), (1, 3)],  // "rust" appears in doc 0 and 1
871 | 		];
872 | 
873 | 		let mut fst_builder = fst::MapBuilder::memory();
874 | 		fst_builder.insert("language", 0).unwrap();
875 | 		fst_builder.insert("programming", 1).unwrap();
876 | 		fst_builder.insert("rust", 2).unwrap();
877 | 		let fst = fst_builder.into_inner()?;
878 | 
879 | 		let index = Index {
880 | 			fst,
881 | 			document_strings,
882 | 			keyword_to_documents,
883 | 		};
884 | 
885 | 		let results = search(&index, "lamguage", 10)?;
886 | 		assert_eq!(results.len(), 1, "Expected 1 result for 'lamguage'");
887 | 
888 | 		Ok(())
889 | 	}
890 | }
891 | 


--------------------------------------------------------------------------------