├── .github
│   └── workflows
│       └── rust.yml
├── .gitignore
├── CHANGELOG.md
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── examples
│   ├── demo.rs
│   └── perf_test.rs
└── src
    └── lib.rs

/.github/workflows/rust.yml:
--------------------------------------------------------------------------------
name: Rust

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

env:
  CARGO_TERM_COLOR: always

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Build
      run: cargo build --verbose
    - name: Run tests
      run: cargo test --verbose

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/target

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
# Changelog

## [0.1.0] - 2024-12-28
- Initial release
- Single-file storage implementation
- Support for Euclidean and Cosine distance
- Three-layer search structure

--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "bincode"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
dependencies = [
 "serde",
]

[[package]]
name = "bytemuck"
version = "1.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef657dfab802224e671f5818e9a4935f9b1957ed18e58292690cc39e7a4092a3"

[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"

[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
 "crossbeam-epoch",
 "crossbeam-utils",
]

[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
 "crossbeam-utils",
]

[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"

[[package]]
name = "diskann-rs"
version = "0.1.0"
dependencies = [
 "bincode",
 "bytemuck",
 "memmap2",
 "rand",
 "rayon",
 "serde",
 "thiserror",
]

[[package]]
name = "either"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"

[[package]]
name = "getrandom"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
 "cfg-if",
 "libc",
 "wasi",
]

[[package]]
name = "libc"
version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"

[[package]]
name = "memmap2"
version = "0.5.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327"
dependencies = [
 "libc",
]

[[package]]
name = "ppv-lite86"
version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
dependencies = [
 "zerocopy",
]

[[package]]
name = "proc-macro2"
version = "1.0.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
dependencies = [
 "unicode-ident",
]

[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
 "proc-macro2",
]

[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
 "libc",
 "rand_chacha",
 "rand_core",
]

[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
 "ppv-lite86",
 "rand_core",
]

[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
 "getrandom",
]

[[package]]
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
 "either",
 "rayon-core",
]

[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
 "crossbeam-deque",
 "crossbeam-utils",
]

[[package]]
name = "serde"
version = "1.0.216"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e"
dependencies = [
 "serde_derive",
]

[[package]]
name = "serde_derive"
version = "1.0.216"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "syn"
version = "2.0.91"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d53cbcb5a243bd33b7858b1d7f4aca2153490815872d86d955d6ea29f743c035"
dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
]

[[package]]
name = "thiserror"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [
 "thiserror-impl",
]

[[package]]
name = "thiserror-impl"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "unicode-ident"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"

[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

[[package]]
name = "zerocopy"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
 "byteorder",
 "zerocopy-derive",
]

[[package]]
name = "zerocopy-derive"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "diskann-rs"
version = "0.1.0"
edition = "2021"
description = "A Rust implementation of DiskANN (Disk-based Approximate Nearest Neighbor search) featuring a 3-layer index architecture and parallel query processing. This project provides an efficient and scalable solution for large-scale vector similarity search with single-file storage."
license = "MIT"
repository = "https://github.com/lukaesch/diskann-rs"

[dependencies]
rand = "0.8"
memmap2 = "0.5"
serde = { version = "1.0", features = ["derive"] }
bincode = "1.3"
thiserror = "1.0"
bytemuck = "1.14"
rayon = "1.7"

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Lukas Schmyrczyk

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DiskANN Implementation in Rust

[![Rust](https://github.com/lukaesch/diskann-rs/actions/workflows/rust.yml/badge.svg?branch=main)](https://github.com/lukaesch/diskann-rs/actions/workflows/rust.yml)

A Rust implementation of DiskANN (Disk-based Approximate Nearest Neighbor search) featuring a 3-layer index architecture and parallel query processing. This project provides an efficient and scalable solution for large-scale vector similarity search with single-file storage.

## Overview

This implementation provides a memory-efficient approach to similarity search by:
- Using a 3-layer hierarchical index structure for faster search
- Storing all data in a single file using memory mapping
- Supporting both Euclidean distance and Cosine similarity
- Managing adjacency lists for graph-based search
- Implementing parallel query processing
- Supporting large-scale datasets that don't fit in RAM

## Features

- Three-layer hierarchical index structure:
  - Top layer (L0): Smallest, most selective layer
  - Middle layer (L1): Intermediate connectivity
  - Base layer (L2): Complete dataset
- Single-file storage format for simplified deployment
- Choice of distance metrics:
  - Euclidean distance
  - Cosine similarity
- Cluster-based graph construction for meaningful adjacency
- Parallel query processing using rayon
- Memory-mapped file access for handling large datasets
- Comprehensive error handling with custom error types

## Usage

### Building a New Index

```rust
use diskann_rs::{SingleFileDiskANN, DistanceMetric};

let index = SingleFileDiskANN::build_index_singlefile(
    1_000_000,                 // number of vectors
    128,                       // dimension
    32,                        // max neighbors per node
    0.01,                      // fraction of vectors in top layer
    0.1,                       // fraction of vectors in middle layer
    DistanceMetric::Euclidean, // or DistanceMetric::Cosine
    "index.db",                // single file to store everything
)?;
```

### Opening an Existing Index

```rust
let index = SingleFileDiskANN::open_index_singlefile("index.db")?;
```

### Searching the Index

```rust
// Prepare your query vector; its length must match the index dimension
let query = vec![0.0_f32; 128];

// Search for nearest neighbors
let k = 10;          // number of neighbors to return
let beam_width = 64; // search beam width
let neighbors = index.search(&query, k, beam_width);
```
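
Both `k` and `beam_width` trade accuracy against latency: a wider beam explores more candidates per layer and typically improves recall at a roughly linear cost in distance computations. One practical way to pick a value is to sweep it and watch when the result set stops changing; a minimal sketch, reusing the `index` and `query` bindings from above:

```rust
// Hypothetical tuning loop: widen the beam until the results stabilize.
for beam_width in [16, 32, 64, 128] {
    let neighbors = index.search(&query, 10, beam_width);
    println!("beam_width={beam_width} => {neighbors:?}");
}
```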

### Parallel Search

```rust
use rayon::prelude::*;
use std::sync::Arc;

// Create shared index reference
let index = Arc::new(index);

// Perform parallel queries
let results: Vec<Vec<u32>> = query_batch
    .par_iter()
    .map(|query| index.search(query, k, beam_width))
    .collect();
```

## Performance Characteristics

- Memory Usage: O(1) for vector storage due to memory mapping
- Disk Space: Single file containing:
  - Vectors: num_vectors * dimension * 4 bytes
  - Adjacency Lists: Varies by layer size and max_degree
  - Metadata: Small overhead
- Search Time: sublinear in practice, driven by the hierarchical structure and the bounded beam width
- Parallel Processing: Scales with available CPU cores
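
From the layout above you can estimate the index file size before building it. A back-of-the-envelope sketch (the 1 MiB metadata reserve and 4-byte ids mirror the constants in `src/lib.rs`; the small `size_l1 >= size_l0` clamp in the builder is ignored here):

```rust
/// Rough on-disk size estimate for an index built with the given parameters.
fn estimated_index_bytes(
    num_vectors: u64,
    dim: u64,
    max_degree: u64,
    fraction_top: f64,
    fraction_mid: f64,
) -> u64 {
    let header = 1024 * 1024;            // metadata region reserved before the vectors
    let vectors = num_vectors * dim * 4; // f32 payload
    let per_node = max_degree * 4;       // u32 neighbor ids per node
    let l0 = (num_vectors as f64 * fraction_top).ceil() as u64;
    let l1 = (num_vectors as f64 * fraction_mid).ceil() as u64;
    header + vectors + (l0 + l1 + num_vectors) * per_node
}
```

For the 1M-vector, 128-dimensional, degree-32 configuration shown earlier this works out to roughly 655 MB (512 MB of vectors plus about 142 MB of adjacency lists).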

## Building and Testing

```bash
# Build the library
cargo build --release

# Run tests
cargo test

# Run the demo example
cargo run --release --example demo
```

## Current Status

This implementation features:
- [x] Single-file storage format
- [x] 3-layer hierarchical index structure
- [x] Multiple distance metrics support
- [x] Cluster-based graph construction
- [x] Parallel query processing
- [x] Memory-mapped I/O
- [x] Comprehensive test suite

## Future Improvements

1. Add more distance metrics
2. Implement dynamic index updates
3. Add parameter auto-tuning
4. Expand benchmarking suite
5. Add more examples
6. Improve documentation

## Contributing

Contributions are welcome! Please feel free to:
- Open issues for bugs or feature requests
- Submit PRs for improvements
- Share ideas for optimization

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## References

- Original DiskANN paper: [DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node](https://www.microsoft.com/en-us/research/publication/diskann-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node/)

--------------------------------------------------------------------------------
/examples/demo.rs:
--------------------------------------------------------------------------------
// examples/demo.rs
use diskann_rs::{DiskAnnError, DistanceMetric, SingleFileDiskANN};
use std::sync::Arc;

fn main() -> Result<(), DiskAnnError> {
    let singlefile_path = "diskann.db";
    let num_vectors = 100_000;
    let dim = 128;
    let max_degree = 32;
    let fraction_top = 0.01;
    let fraction_mid = 0.1;
    let distance_metric = DistanceMetric::Cosine;

    // Build if missing
    if !std::path::Path::new(singlefile_path).exists() {
        println!("Building single-file diskann at {singlefile_path}...");
        let index = SingleFileDiskANN::build_index_singlefile(
            num_vectors,
            dim,
            max_degree,
            fraction_top,
            fraction_mid,
            distance_metric,
            singlefile_path,
        )?;
        println!("Build done. Index dimension = {}", index.dim);
    } else {
        println!("Index file {singlefile_path} already exists, skipping build.");
    }

    // Open
    let index = Arc::new(SingleFileDiskANN::open_index_singlefile(singlefile_path)?);

    // Query (a real query should contain `dim` values)
    let query = vec![0.1, 0.2, 0.3 /* ... up to dim */];
    let k = 10;
    let beam_width = 64;
    let neighbors = index.search(&query, k, beam_width);
    println!("Neighbors for the sample query = {:?}", neighbors);

    Ok(())
}

--------------------------------------------------------------------------------
/examples/perf_test.rs:
--------------------------------------------------------------------------------
use diskann_rs::{DiskAnnError, DistanceMetric, SingleFileDiskANN};
use rand::prelude::*;
use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
use std::sync::Arc;
use std::time::Instant;

fn main() -> Result<(), DiskAnnError> {
    const NUM_VECTORS: usize = 1_000_000;
    const DIM: usize = 1536;
    const MAX_DEGREE: usize = 32;
    const FRACTION_TOP: f64 = 0.01;
    const FRACTION_MID: f64 = 0.1;
    let distance_metric = DistanceMetric::Cosine;

    let singlefile_path = "diskann_parallel.db";

    // Build if missing
    if !std::path::Path::new(singlefile_path).exists() {
        println!(
            "Building single-file index with parallel adjacency + distance={:?}",
            distance_metric
        );
        let start = Instant::now();
        let _index = SingleFileDiskANN::build_index_singlefile(
            NUM_VECTORS,
            DIM,
            MAX_DEGREE,
            FRACTION_TOP,
            FRACTION_MID,
            distance_metric,
            singlefile_path,
        )?;
        let elapsed = start.elapsed().as_secs_f32();
        println!("Done building index in {:.2} s", elapsed);
    } else {
        println!(
            "Index file {} already exists, skipping build.",
            singlefile_path
        );
    }

    // open
    let open_start = Instant::now();
    let index = Arc::new(SingleFileDiskANN::open_index_singlefile(singlefile_path)?);
    let open_time = open_start.elapsed().as_secs_f32();
    println!(
        "Opened index with {} vectors, dim={}, metric={:?} in {:.2} s",
        index.num_vectors, index.dim, index.distance_metric, open_time
    );

    // Create queries
    let queries = 5;
    let k = 10;
    let beam_width = 64;

    // Generate all queries in a batch
    let mut rng = rand::thread_rng();
    let mut query_batch: Vec<Vec<f32>> = Vec::with_capacity(queries);
    for _ in 0..queries {
        let q: Vec<f32> = (0..index.dim).map(|_| rng.gen()).collect();
        query_batch.push(q);
    }

    // Now run queries in parallel
    let search_start = Instant::now();
    query_batch.par_iter().enumerate().for_each(|(i, query)| {
        let neighbors = index.search(query, k, beam_width);
        println!("Query {i} => top-{k} neighbors = {:?}", neighbors);
    });
    let search_time = search_start.elapsed().as_secs_f32();
    println!("Performed {queries} queries in {:.2} s", search_time);
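
    // Derived throughput (added for convenience): wall-clock queries/sec across
    // the Rayon pool. The println! inside the parallel loop adds some overhead,
    // so treat this as a lower bound.
    println!("~{:.1} queries/s", queries as f32 / search_time);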

    Ok(())
}

--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
//! # DiskAnnRS
//!
//! A DiskANN-like Rust library implementing approximate nearest neighbor search with
//! single-file storage support. The library provides both Euclidean distance and
//! Cosine similarity metrics, with a three-layer hierarchical search structure.
//!
//! ## Features
//!
//! - Single-file storage format
//! - Support for both Euclidean and Cosine distance metrics
//! - Parallel index construction using Rayon
//! - Memory-mapped file access for efficient searches
//! - Three-layer hierarchical search structure
//!
//! ## Example
//!
//! ```rust,no_run
//! use diskann_rs::{SingleFileDiskANN, DistanceMetric};
//!
//! // Build a new index
//! let index = SingleFileDiskANN::build_index_singlefile(
//!     1000,  // number of vectors
//!     128,   // dimensionality
//!     32,    // maximum degree
//!     0.1,   // fraction of vectors in top layer
//!     0.2,   // fraction of vectors in middle layer
//!     DistanceMetric::Euclidean,
//!     "index.db"
//! ).unwrap();
//!
//! // Search the index
//! let query = vec![0.0; 128]; // your query vector
//! let neighbors = index.search(&query, 10, 64); // find top 10 with beam width 64
//! ```

use bytemuck;
use memmap2::Mmap;
use rand::prelude::{Rng, SliceRandom};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::hash::Hash;
use std::{
    fs::{File, OpenOptions},
    io::{Seek, SeekFrom, Write},
    os::unix::fs::FileExt, // for read_at / write_at
    time::Instant,
};
use thiserror::Error;

/// Custom error type for DiskAnnRS operations
#[derive(Debug, Error)]
pub enum DiskAnnError {
    /// Represents I/O errors during file operations
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Represents serialization/deserialization errors
    #[error("Serialization error: {0}")]
    Bincode(#[from] bincode::Error),

    /// Represents index-specific errors
    #[error("Index error: {0}")]
    IndexError(String),
}

/// Supported distance metrics for vector comparison
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub enum DistanceMetric {
    /// Standard Euclidean distance
    Euclidean,
    /// Cosine similarity (converted to distance as 1 - similarity)
    Cosine,
}

/// Internal metadata structure stored in the index file
#[derive(Serialize, Deserialize, Debug)]
struct SingleFileMetadata {
    dim: usize,
    num_vectors: usize,
    max_degree: usize,
    fraction_top: f64,
    fraction_mid: f64,
    distance_metric: DistanceMetric,
    layer0_ids: Vec<u32>,
    layer1_ids: Vec<u32>,
    vectors_offset: u64,
    adjacency_offset: u64,
    offset_layer0: usize,
    offset_layer1: usize,
    offset_layer2: usize,
}

/// Main struct representing a DiskANN index
pub struct SingleFileDiskANN {
    /// Dimensionality of vectors in the index
    pub dim: usize,
    /// Number of vectors in the index
    pub num_vectors: usize,
    /// Maximum number of edges per node
    pub max_degree: usize,
    /// Fraction of vectors in top layer
    pub fraction_top: f64,
    /// Fraction of vectors in middle layer
    pub fraction_mid: f64,
    /// Distance metric used by this index
    pub distance_metric: DistanceMetric,

    layer0_ids: Vec<u32>,
    layer1_ids: Vec<u32>,
    offset_layer0: usize,
    offset_layer1: usize,
    offset_layer2: usize,
    vectors_offset: u64,
    adjacency_offset: u64,
    mmap: Mmap,
}

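// On-disk layout, as written by `build_index_singlefile` and read back by
// `open_index_singlefile` (summary comment for orientation):
//
//   offset 0           u64 (little-endian): byte length of the metadata blob
//   offset 8           bincode-serialized `SingleFileMetadata`
//   offset 1 MiB       vector data: `num_vectors * dim` f32 values, row-major
//   after the vectors  adjacency lists for layer 0, layer 1, then the base
//                      layer, `max_degree` u32 neighbor ids per node
//                      (0 pads short lists)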
impl SingleFileDiskANN {
    /// Builds a new single-file index with the specified parameters
    ///
    /// # Arguments
    ///
    /// * `num_vectors` - Number of vectors to store
    /// * `dim` - Dimensionality of the vectors
    /// * `max_degree` - Maximum number of edges per node
    /// * `fraction_top` - Fraction of vectors in top layer
    /// * `fraction_mid` - Fraction of vectors in middle layer
    /// * `distance_metric` - Distance metric to use
    /// * `singlefile_path` - Path where the index file will be created
    ///
    /// # Returns
    ///
    /// Returns `Result<Self, DiskAnnError>`
    pub fn build_index_singlefile(
        num_vectors: usize,
        dim: usize,
        max_degree: usize,
        fraction_top: f64,
        fraction_mid: f64,
        distance_metric: DistanceMetric,
        singlefile_path: &str,
    ) -> Result<Self, DiskAnnError> {
        let mut file = OpenOptions::new()
            .create(true)
            .write(true)
            .read(true)
            .truncate(true)
            .open(singlefile_path)?;

        let vectors_offset = 1024 * 1024;
        let total_vector_bytes = (num_vectors as u64) * (dim as u64) * 4;

        println!(
            "Generating {num_vectors} random vectors of dim {dim} => ~{:.2} GB on disk...",
            (num_vectors as f64 * dim as f64 * 4.0) / (1024.0 * 1024.0 * 1024.0)
        );
        let chunk_size = 100_000;
        let mut rng = rand::thread_rng();
        let mut buffer = Vec::with_capacity(chunk_size * dim);

        let gen_start = Instant::now();
        let mut written = 0usize;
        while written < num_vectors {
            buffer.clear();
            let remaining = num_vectors - written;
            let batch = remaining.min(chunk_size);
            for _ in 0..batch {
                for _ in 0..dim {
                    let val: f32 = rng.gen();
                    buffer.push(val);
                }
            }
            let bytes = bytemuck::cast_slice(&buffer);
            let offset = vectors_offset + (written * dim * 4) as u64;
            file.write_at(bytes, offset)?;
            written += batch;
        }
        let gen_time = gen_start.elapsed().as_secs_f32();
        println!("Vector generation took {gen_time:.2} s");

        let mut all_ids: Vec<u32> = (0..num_vectors as u32).collect();
        all_ids.shuffle(&mut rng);
        let size_l0 = (num_vectors as f64 * fraction_top).ceil() as usize;
        let size_l1 = (num_vectors as f64 * fraction_mid).ceil() as usize;
        let size_l1 = size_l1.max(size_l0);

        let l0slice = &all_ids[..size_l0];
        let l1slice = &all_ids[..size_l1];
        let mut layer0_ids = l0slice.to_vec();
        let mut layer1_ids = l1slice.to_vec();
        layer0_ids.sort_unstable();
        layer1_ids.sort_unstable();
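
        // Both layers are prefixes of the same shuffled id list, so layer 0 is
        // always a subset of layer 1. For example, with num_vectors = 1000,
        // fraction_top = 0.01 and fraction_mid = 0.1, layer 0 gets 10 ids and
        // layer 1 gets those same 10 plus 90 more.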

        let bytes_per_node = max_degree * 4;
        let offset_layer0 = 0;
        let offset_layer1 = size_l0 * bytes_per_node;
        let offset_layer2 = offset_layer1 + (size_l1 * bytes_per_node);
        let total_adj_bytes = offset_layer2 + (num_vectors * bytes_per_node);

        let adjacency_offset = vectors_offset + total_vector_bytes;
        let adjacency_end = adjacency_offset + total_adj_bytes as u64;
        file.set_len(adjacency_end)?;

        let cluster_count = 20;
        let centroids =
            pick_random_centroids(cluster_count, &file, vectors_offset, dim, num_vectors)?;

        let build_start = Instant::now();
        build_layer_adjacency_parallel(
            &file,
            adjacency_offset,
            offset_layer0,
            &layer0_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
        )?;
        build_layer_adjacency_parallel(
            &file,
            adjacency_offset,
            offset_layer1,
            &layer1_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
        )?;
        let base_ids: Vec<u32> = (0..num_vectors as u32).collect();
        build_layer_adjacency_parallel(
            &file,
            adjacency_offset,
            offset_layer2,
            &base_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
        )?;
        let build_time = build_start.elapsed().as_secs_f32();
        println!("Parallel adjacency build took {build_time:.2} s");

        let metadata = SingleFileMetadata {
            dim,
            num_vectors,
            max_degree,
            fraction_top,
            fraction_mid,
            distance_metric,
            layer0_ids,
            layer1_ids,
            vectors_offset,
            adjacency_offset,
            offset_layer0,
            offset_layer1,
            offset_layer2,
        };
        let md_bytes = bincode::serialize(&metadata)?;
        file.seek(SeekFrom::Start(0))?;
        let md_len = md_bytes.len() as u64;
        file.write_all(&md_len.to_le_bytes())?;
        file.write_all(&md_bytes)?;
        file.sync_all()?;

        let mmap = unsafe { memmap2::Mmap::map(&file)? };

        Ok(Self {
            dim,
            num_vectors,
            max_degree,
            fraction_top,
            fraction_mid,
            distance_metric: metadata.distance_metric,
            layer0_ids: metadata.layer0_ids,
            layer1_ids: metadata.layer1_ids,
            offset_layer0: metadata.offset_layer0,
            offset_layer1: metadata.offset_layer1,
            offset_layer2: metadata.offset_layer2,
            vectors_offset: metadata.vectors_offset,
            adjacency_offset: metadata.adjacency_offset,
            mmap,
        })
    }

    /// Opens an existing index file
    ///
    /// # Arguments
    ///
    /// * `path` - Path to the index file
    ///
    /// # Returns
    ///
    /// Returns `Result<Self, DiskAnnError>`
    pub fn open_index_singlefile(path: &str) -> Result<Self, DiskAnnError> {
        let file = OpenOptions::new().read(true).write(false).open(path)?;
        let mut buf8 = [0u8; 8];
        file.read_at(&mut buf8, 0)?;
        let md_len = u64::from_le_bytes(buf8);
        let mut md_bytes = vec![0u8; md_len as usize];
        file.read_at(&mut md_bytes, 8)?;
        let metadata: SingleFileMetadata = bincode::deserialize(&md_bytes)?;

        let mmap = unsafe { memmap2::Mmap::map(&file)? };

        Ok(Self {
            dim: metadata.dim,
            num_vectors: metadata.num_vectors,
            max_degree: metadata.max_degree,
            fraction_top: metadata.fraction_top,
            fraction_mid: metadata.fraction_mid,
            distance_metric: metadata.distance_metric,
            layer0_ids: metadata.layer0_ids,
            layer1_ids: metadata.layer1_ids,
            offset_layer0: metadata.offset_layer0,
            offset_layer1: metadata.offset_layer1,
            offset_layer2: metadata.offset_layer2,
            vectors_offset: metadata.vectors_offset,
            adjacency_offset: metadata.adjacency_offset,
            mmap,
        })
    }

    /// Searches the index for nearest neighbors
    ///
    /// # Arguments
    ///
    /// * `query` - Query vector
    /// * `k` - Number of nearest neighbors to return
    /// * `beam_width` - Beam width for the search
    ///
    /// # Returns
    ///
    /// Returns a vector of node IDs representing the nearest neighbors
    pub fn search(&self, query: &[f32], k: usize, beam_width: usize) -> Vec<u32> {
        // Probe the small upper layers (entry points; their results are not
        // reused by the chunked base-layer scan below)
        let _l0 = self.search_layer(query, &self.layer0_ids, self.offset_layer0, beam_width, 1);
        let _l1 = self.search_layer(query, &self.layer1_ids, self.offset_layer1, beam_width, 1);

        // Use chunks for base layer
        const CHUNK_SIZE: usize = 100_000;
        let mut results = Vec::with_capacity(k);
        for chunk_start in (0..self.num_vectors).step_by(CHUNK_SIZE) {
            let chunk_end = (chunk_start + CHUNK_SIZE).min(self.num_vectors);
            let chunk_ids: Vec<u32> = (chunk_start..chunk_end).map(|x| x as u32).collect();
            let chunk_results =
                self.search_layer(query, &chunk_ids, self.offset_layer2, beam_width, k);
            results.extend(chunk_results);
        }
        results.sort_by(|&a, &b| {
            let da = self.distance_to(query, a as usize);
            let db = self.distance_to(query, b as usize);
            da.partial_cmp(&db).unwrap()
        });
        results.truncate(k);
        results
    }

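    /// Greedy beam search within a single layer: seeds the beam at the closest
    /// of `layer_ids`, then repeatedly expands the current candidates'
    /// adjacency lists, keeping the best `beam_width` nodes seen so far and
    /// finally returning the top `k`.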
    fn search_layer(
        &self,
        query: &[f32],
        layer_ids: &[u32],
        layer_offset: usize,
        beam_width: usize,
        k: usize,
    ) -> Vec<u32> {
        if layer_ids.is_empty() {
            return vec![];
        }
        use std::cmp::{Ordering, Reverse};
        use std::collections::BinaryHeap;

        #[derive(Clone)]
        struct Candidate {
            dist: f32,
            node_id: u32,
        }
        impl PartialEq for Candidate {
            fn eq(&self, other: &Self) -> bool {
                self.dist == other.dist
            }
        }
        impl Eq for Candidate {}
        impl PartialOrd for Candidate {
            fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
                other.dist.partial_cmp(&self.dist) // Min-heap: smaller distance is "greater"
            }
        }
        impl Ord for Candidate {
            fn cmp(&self, other: &Self) -> Ordering {
                self.partial_cmp(other).unwrap_or(Ordering::Equal)
            }
        }

        // Find starting point
        let mut best_id = layer_ids[0];
        let mut best_dist = self.distance_to(query, best_id as usize);
        for &candidate_id in layer_ids.iter().skip(1) {
            let d = self.distance_to(query, candidate_id as usize);
            if d < best_dist {
                best_dist = d;
                best_id = candidate_id;
            }
        }
        let start_id = best_id;

        // Initialize
        let mut visited = vec![false; layer_ids.len()];
        let id_to_idx = layer_ids
            .iter()
            .enumerate()
            .map(|(i, &nid)| (nid, i))
            .collect::<std::collections::HashMap<u32, usize>>();

        let mut current = BinaryHeap::new();
        current.push(Candidate {
            dist: best_dist,
            node_id: start_id,
        });
        if let Some(&idx) = id_to_idx.get(&start_id) {
            visited[idx] = true;
        }

        // `best` is wrapped in Reverse so that the heap's top is the *farthest*
        // candidate: pop() then evicts the worst, keeping the closest
        // beam_width seen so far (Candidate's own Ord is inverted to give
        // min-heap behavior for `current`/`next`).
        let mut best = BinaryHeap::new();
        best.push(Reverse(Candidate {
            dist: best_dist,
            node_id: start_id,
        }));

        // Beam search with a maximum number of iterations
        let max_iterations = 100; // Adjust based on experimentation
        for _ in 0..max_iterations {
            let mut next = BinaryHeap::new();
            while let Some(current_cand) = current.pop() {
                let neighbors = self.get_layer_neighbors(current_cand.node_id, layer_offset);
                for &nbr in neighbors {
                    if nbr == 0 {
                        continue;
                    }
                    if let Some(&nbr_idx) = id_to_idx.get(&nbr) {
                        if !visited[nbr_idx] {
                            visited[nbr_idx] = true;
                            let d = self.distance_to(query, nbr as usize);
                            let cand = Candidate {
                                dist: d,
                                node_id: nbr,
                            };
                            next.push(cand.clone());
                            best.push(Reverse(cand));
                            if best.len() > beam_width {
                                best.pop(); // Evict the farthest; keep top beam_width
                            }
                        }
                    }
                }
            }
            // Prepare next iteration: take top beam_width from next
            current.clear();
            while current.len() < beam_width && !next.is_empty() {
                if let Some(cand) = next.pop() {
                    current.push(cand);
                }
            }
            if current.is_empty() {
                break;
            }
        }

        let mut final_vec: Vec<Candidate> = best.into_vec().into_iter().map(|r| r.0).collect();
        final_vec.sort_by(|a, b| a.dist.partial_cmp(&b.dist).unwrap());
        final_vec.truncate(k);
        final_vec.into_iter().map(|c| c.node_id).collect()
    }

    fn get_layer_neighbors(&self, node_id: u32, layer_offset: usize) -> &[u32] {
        let node_off = layer_offset + (node_id as usize * self.max_degree * 4);
        let start = (self.adjacency_offset as usize) + node_off;
        let end = start + (self.max_degree * 4);
        let bytes = &self.mmap[start..end];
        bytemuck::cast_slice(bytes)
    }

    fn distance_to(&self, query: &[f32], idx: usize) -> f32 {
        let vector_offset = self.vectors_offset + (idx * self.dim * 4) as u64;
        let start = vector_offset as usize;
        let end = start + (self.dim * 4);
        let bytes = &self.mmap[start..end];
        let vecf: &[f32] = bytemuck::cast_slice(bytes);

        match self.distance_metric {
            DistanceMetric::Euclidean => euclidean_distance(query, vecf),
            DistanceMetric::Cosine => 1.0 - cosine_similarity(query, vecf),
        }
    }
}

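/// Builds the adjacency lists for one layer: assigns every node to its nearest
/// centroid, then, within each cluster bucket, links each node to its
/// `max_degree` nearest neighbors among a random sample of that bucket
/// (neighbor ids are padded with 0 when fewer are available).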
fn build_layer_adjacency_parallel(
    file: &File,
    adjacency_offset: u64,
    layer_offset: usize,
    layer_ids: &[u32],
    dim: usize,
    max_degree: usize,
    vectors_offset: u64,
    centroids: &[(usize, Vec<f32>)],
    distance_metric: DistanceMetric,
) -> Result<(), DiskAnnError> {
    if layer_ids.is_empty() {
        return Ok(());
    }

    let node_assignments: Vec<(usize, u32)> = layer_ids
        .par_iter()
        .map(|&nid| {
            let nv = read_vector(file, vectors_offset, dim, nid as usize).unwrap();
            let mut best_c = 0;
            let mut best_d = f32::MAX;
            for (cidx, (_, cvec)) in centroids.iter().enumerate() {
                let d = match distance_metric {
                    DistanceMetric::Euclidean => euclidean_distance(&nv, cvec),
                    DistanceMetric::Cosine => 1.0 - cosine_similarity(&nv, cvec),
                };
                if d < best_d {
                    best_d = d;
                    best_c = cidx;
                }
            }
            (best_c, nid)
        })
        .collect();

    let cluster_count = centroids.len();
    let mut buckets = vec![Vec::new(); cluster_count];
    for (cidx, nid) in node_assignments {
        buckets[cidx].push(nid);
    }

    buckets.into_par_iter().for_each(|bucket| {
        if bucket.len() <= 1 {
            return;
        }
        let mut rng = rand::thread_rng();
        let sample_size = 256.min(bucket.len());
        let mut sample_ids = bucket.clone();
        sample_ids.shuffle(&mut rng);
        sample_ids.truncate(sample_size);

        let sample_vecs: Vec<(u32, Vec<f32>)> = sample_ids
            .iter()
            .map(|&sid| {
                let v = read_vector(file, vectors_offset, dim, sid as usize).unwrap();
                (sid, v)
            })
            .collect();

        bucket.par_iter().for_each(|&nid| {
            let nv = read_vector(file, vectors_offset, dim, nid as usize).unwrap();
            let mut dists = Vec::with_capacity(sample_vecs.len());
            for (sid, sv) in &sample_vecs {
                if *sid != nid {
                    let d = match distance_metric {
                        DistanceMetric::Euclidean => euclidean_distance(&nv, sv),
                        DistanceMetric::Cosine => 1.0 - cosine_similarity(&nv, sv),
                    };
                    dists.push((*sid, d));
                }
            }
            dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
            dists.truncate(max_degree);

            let mut nbrs: Vec<u32> = dists.iter().map(|(id, _)| *id).collect();
            while nbrs.len() < max_degree {
                nbrs.push(0);
            }
            let node_off = layer_offset + (nid as usize * max_degree * 4);
            let off = adjacency_offset + node_off as u64;
            let bytes = bytemuck::cast_slice(&nbrs);
            file.write_at(bytes, off).unwrap();
        });
    });

    Ok(())
}

fn pick_random_centroids(
    cluster_count: usize,
    file: &File,
    vectors_offset: u64,
    dim: usize,
    num_vectors: usize,
) -> Result<Vec<(usize, Vec<f32>)>, DiskAnnError> {
    let mut rng = rand::thread_rng();
    let mut cents = Vec::with_capacity(cluster_count);
    for _ in 0..cluster_count {
        let id = rng.gen_range(0..num_vectors);
        let vec = read_vector(file, vectors_offset, dim, id)?;
        cents.push((id, vec));
    }
    Ok(cents)
}

fn read_vector(
    file: &File,
    vectors_offset: u64,
    dim: usize,
    idx: usize,
) -> Result<Vec<f32>, DiskAnnError> {
    let off = vectors_offset + (idx * dim * 4) as u64;
    let mut buf = vec![0u8; dim * 4];
    file.read_at(&mut buf, off)?;
    let floats: &[f32] = bytemuck::cast_slice(&buf);
    Ok(floats.to_vec())
}

/// Computes Euclidean distance between two vectors
fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b.iter())
        .map(|(x, y)| (x - y) * (x - y))
        .sum::<f32>()
        .sqrt()
}

/// Computes cosine similarity between two vectors
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    let mut dot = 0.0;
    let mut norm_a = 0.0;
    let mut norm_b = 0.0;
    for (x, y) in a.iter().zip(b.iter()) {
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    dot / (norm_a.sqrt() * norm_b.sqrt())
}

/// Basic unit tests
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::*;

    // We'll define a small set of 5 2D vectors in a static array.
    // This is purely deterministic.
    const TEST_VECTORS_2D: &[[f32; 2]] =
        &[[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [0.5, 0.5]];

    /// A special builder that does not generate random vectors, but writes
    /// the `TEST_VECTORS_2D` to the file, uses cluster_count=1 or 2,
    /// and no partial sampling for the adjacency build.
    fn build_index_singlefile_for_test(
        dim: usize,
        distance_metric: DistanceMetric,
        file_path: &str,
    ) -> Result<SingleFileDiskANN, DiskAnnError> {
        // 1) Create/truncate the file
        let mut file = std::fs::OpenOptions::new()
            .create(true)
            .write(true)
            .read(true)
            .truncate(true)
            .open(file_path)?;

        let num_vectors = TEST_VECTORS_2D.len();
        let vectors_offset = 1024 * 1024;
        let total_vector_bytes = (num_vectors as u64) * (dim as u64) * 4;
        file.set_len(vectors_offset + total_vector_bytes)?;

        // 2) Write the fixed vectors into the file
        // We'll assume dim=2 matches TEST_VECTORS_2D
        for (i, vec2) in TEST_VECTORS_2D.iter().enumerate() {
            let offset = vectors_offset + (i * dim * 4) as u64;
            let bytes = bytemuck::cast_slice(vec2);
            file.write_at(bytes, offset)?;
        }

        // 3) We'll place all vectors in a single "cluster" => cluster_count=1
        // So adjacency is effectively complete (we won't skip anything).
        // Or if you want 2 clusters, do cluster_count=2 but it usually won't matter
        let cluster_count = 1;

        // We'll store IDs in [0..num_vectors)
        let layer0_ids: Vec<u32> = (0..num_vectors as u32).collect();
        let layer1_ids = layer0_ids.clone();
        let fraction_top = 0.2;
        let fraction_mid = 0.4;
        let max_degree = 2; // for the small test
        let bytes_per_node = max_degree * 4;

        let offset_layer0 = 0;
        let offset_layer1 = layer0_ids.len() * bytes_per_node;
        let offset_layer2 = offset_layer1 + (layer1_ids.len() * bytes_per_node);

        let adjacency_offset = vectors_offset + total_vector_bytes;
        let total_adj_bytes = offset_layer2 + (num_vectors * bytes_per_node);
        file.set_len(adjacency_offset + total_adj_bytes as u64)?;

        // 4) Build adjacency *without partial sampling*
        // We'll treat the entire bucket. That ensures a "complete" adjacency for a small set.
        let centroids =
            pick_predefined_centroids(cluster_count, &file, vectors_offset, dim, distance_metric)?;

        // Build adjacency for layer0 => all vectors
        build_layer_adjacency_test(
            &file,
            adjacency_offset,
            offset_layer0,
            &layer0_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
            /* no partial sampling = entire bucket */
        )?;

        // same for layer1 => same IDs
        build_layer_adjacency_test(
            &file,
            adjacency_offset,
            offset_layer1,
            &layer1_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
            /* no partial sampling = entire bucket */
        )?;

        // layer2 => also same set
        let base_ids: Vec<u32> = (0..num_vectors as u32).collect();
        build_layer_adjacency_test(
            &file,
            adjacency_offset,
            offset_layer2,
            &base_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
            /* no partial sampling */
        )?;

        // 5) Write metadata
        let md = SingleFileMetadata {
            dim,
            num_vectors,
            max_degree,
            fraction_top,
            fraction_mid,
            distance_metric,
            layer0_ids: layer0_ids.clone(),
            layer1_ids: layer1_ids.clone(),
            vectors_offset,
            adjacency_offset,
            offset_layer0,
            offset_layer1,
            offset_layer2,
        };
        let md_bytes = bincode::serialize(&md)?;
        file.seek(std::io::SeekFrom::Start(0))?;
        let md_len = md_bytes.len() as u64;
        file.write_all(&md_len.to_le_bytes())?;
        file.write_all(&md_bytes)?;
        file.sync_all()?;

        let mmap = unsafe { memmap2::Mmap::map(&file)? };

        // Return the struct
        Ok(SingleFileDiskANN {
            dim,
            num_vectors,
            max_degree: md.max_degree,
            fraction_top: md.fraction_top,
            fraction_mid: md.fraction_mid,
            distance_metric: md.distance_metric,
            layer0_ids: md.layer0_ids,
            layer1_ids: md.layer1_ids,
            offset_layer0: md.offset_layer0,
            offset_layer1: md.offset_layer1,
            offset_layer2: md.offset_layer2,
            vectors_offset: md.vectors_offset,
            adjacency_offset: md.adjacency_offset,
            mmap,
        })
    }

    /// We define a small "centroid" for cluster_count=1 => we just pick e.g. the first vector
    fn pick_predefined_centroids(
        cluster_count: usize,
        file: &std::fs::File,
        vectors_offset: u64,
        dim: usize,
        _distance_metric: DistanceMetric,
    ) -> Result<Vec<(usize, Vec<f32>)>, DiskAnnError> {
        // If cluster_count=1, let's just pick the first vector
        let mut out = Vec::new();
        for i in 0..cluster_count {
            // We'll pick i-th
            let v = read_vector(file, vectors_offset, dim, i)?;
            out.push((i, v));
        }
        Ok(out)
    }

    /// A specialized adjacency builder that uses the entire bucket for small tests
    fn build_layer_adjacency_test(
        file: &std::fs::File,
        adjacency_offset: u64,
        layer_offset: usize,
        layer_ids: &[u32],
        dim: usize,
        max_degree: usize,
        vectors_offset: u64,
        _centroids: &[(usize, Vec<f32>)],
        distance_metric: DistanceMetric,
        // no partial sampling => entire bucket
    ) -> Result<(), DiskAnnError> {
        if layer_ids.is_empty() {
            return Ok(());
        }
        // single cluster approach => everything in 1 bucket
        let mut bucket = Vec::new();
        for &nid in layer_ids {
            bucket.push(nid);
        }

        // sample = entire bucket
        let sample_vecs: Vec<(u32, Vec<f32>)> = bucket
            .iter()
            .map(|&sid| {
                let v = read_vector(file, vectors_offset, dim, sid as usize).unwrap();
                (sid, v)
            })
            .collect();

        // adjacency for each node => top max_degree from entire bucket minus itself
        for &nid in bucket.iter() {
            let nv = read_vector(file, vectors_offset, dim, nid as usize).unwrap();
            // compute distance to all other nodes in bucket
            let mut dists = Vec::with_capacity(sample_vecs.len());
            for (sid, sv) in &sample_vecs {
                if *sid != nid {
                    let d = match distance_metric {
                        DistanceMetric::Euclidean => euclidean_distance(&nv, sv),
                        DistanceMetric::Cosine => 1.0 - cosine_similarity(&nv, sv),
                    };
                    dists.push((*sid, d));
                }
            }
            dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
            dists.truncate(max_degree);

            let mut nbrs: Vec<u32> = dists.iter().map(|(id, _)| *id).collect();
            while nbrs.len() < max_degree {
                nbrs.push(0);
            }

            let node_off = layer_offset + (nid as usize * max_degree * 4);
            let off = adjacency_offset + node_off as u64;
            let bytes = bytemuck::cast_slice(&nbrs);
            file.write_at(bytes, off).unwrap();
        }

        Ok(())
    }

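    // A small added sanity check for the distance functions themselves,
    // using hand-computable values (3-4-5 triangle; orthogonal unit vectors).
    #[test]
    fn test_distance_functions() {
        // Euclidean: distance from the origin to (3, 4) is 5.
        let d = euclidean_distance(&[0.0, 0.0], &[3.0, 4.0]);
        assert!((d - 5.0).abs() < 1e-6);

        // Cosine: orthogonal vectors have similarity 0, parallel ones 1.
        assert!(cosine_similarity(&[1.0, 0.0], &[0.0, 1.0]).abs() < 1e-6);
        assert!((cosine_similarity(&[2.0, 0.0], &[5.0, 0.0]) - 1.0).abs() < 1e-6);

        // The zero vector is defined to have similarity 0 with anything.
        assert_eq!(cosine_similarity(&[0.0, 0.0], &[1.0, 0.0]), 0.0);
    }
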
    // test_small_euclidean using the above "hard-coded" approach
    #[test]
    fn test_small_euclidean() -> Result<(), DiskAnnError> {
        let tmpfile = "test_small_euclid.db";
        if std::path::Path::new(tmpfile).exists() {
            std::fs::remove_file(tmpfile).unwrap();
        }

        // build w/ no random + single cluster => adjacency is effectively "complete"
        let index = build_index_singlefile_for_test(
            2, // dim
            DistanceMetric::Euclidean,
            tmpfile,
        )?;

        // We'll pick vector 0 as the query
        let query = index.get_vector(0)?;
        let k = 2;
        let beam_width = 4;
        let neighbors = index.search(&query, k, beam_width);

        // Manually compute actual top-2 by Euclidean among our TEST_VECTORS_2D
        // We know them from e.g. [0,0], [1,0], [0,1], [1,1], [0.5,0.5]
        // In this approach, let's do it generically:
        let n = index.num_vectors;
        let mut dists: Vec<(usize, f32)> = (0..n)
            .map(|i| {
                let v = index.get_vector(i)?;
                Ok((i, euclidean_distance(&query, &v)))
            })
            .collect::<Result<Vec<_>, DiskAnnError>>()?;

        dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        let correct_ids: Vec<usize> = dists[..k].iter().map(|(i, _)| *i).collect();

        let set1: std::collections::HashSet<_> = correct_ids.into_iter().collect();
        let set2: std::collections::HashSet<_> = neighbors.iter().map(|&x| x as usize).collect();
        assert_eq!(set1, set2);

        Ok(())
    }

    #[test]
    fn test_small_cosine() -> Result<(), DiskAnnError> {
        let tmpfile = "test_small_cosine.db";
        if std::path::Path::new(tmpfile).exists() {
            std::fs::remove_file(tmpfile).unwrap();
        }

        let index = build_index_singlefile_for_test(
            2, // dim
            DistanceMetric::Cosine,
            tmpfile,
        )?;

        // Use vector[1] = [1.0, 0.0] as query instead of vector[0]
        let query = index.get_vector(1)?;
        let k = 2;
        let beam_width = 4;
        let neighbors = index.search(&query, k, beam_width);

        // compute actual top-2 by (1 - cos)
        let n = index.num_vectors;
        let mut dists: Vec<(usize, f32)> = (0..n)
            .map(|i| {
                let v = index.get_vector(i)?;
                let sim = cosine_similarity(&query, &v);
                Ok((i, 1.0 - sim)) // interpret distance = 1 - cos
            })
            .collect::<Result<Vec<_>, DiskAnnError>>()?;

        dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        let correct_ids: Vec<usize> = dists[..k].iter().map(|(i, _)| *i).collect();

        let set1: HashSet<_> = setify(correct_ids);
        let set2: HashSet<_> = setify(neighbors.iter().map(|&x| x as usize));
        assert_eq!(set1, set2);

        Ok(())
    }

    // small convenience fn to build a hashset from an iterator
    fn setify<I>(iter: I) -> HashSet<<I as IntoIterator>::Item>
    where
        I: IntoIterator,
        <I as IntoIterator>::Item: Eq + Hash,
    {
        iter.into_iter().collect()
    }

    // reuses your get_vector for the test
    impl SingleFileDiskANN {
        pub fn get_vector(&self, idx: usize) -> Result<Vec<f32>, DiskAnnError> {
            let vector_offset = self.vectors_offset + (idx * self.dim * 4) as u64;
            let start = vector_offset as usize;
            let end = start + (self.dim * 4);
            let bytes = &self.mmap[start..end];
            let vecf: &[f32] = bytemuck::cast_slice(bytes);
            Ok(vecf.to_vec())
        }
    }
}
--------------------------------------------------------------------------------