├── .github
│   └── workflows
│       └── rust.yml
├── .gitignore
├── CHANGELOG.md
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── examples
│   ├── demo.rs
│   └── perf_test.rs
└── src
    └── lib.rs

/.github/workflows/rust.yml:
--------------------------------------------------------------------------------
name: Rust

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

env:
  CARGO_TERM_COLOR: always

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Build
      run: cargo build --verbose
    - name: Run tests
      run: cargo test --verbose

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/target

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
# Changelog

## [0.1.0] - 2024-12-28
- Initial release
- Single-file storage implementation
- Support for Euclidean and Cosine distance
- Three-layer search structure

--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "bincode"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
dependencies = [
 "serde",
]

[[package]]
name = "bytemuck"
version = "1.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef657dfab802224e671f5818e9a4935f9b1957ed18e58292690cc39e7a4092a3"

[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"

[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
 "crossbeam-epoch",
 "crossbeam-utils",
]

[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
 "crossbeam-utils",
]

[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"

[[package]]
name = "diskann-rs"
version = "0.1.0"
dependencies = [
 "bincode",
 "bytemuck",
 "memmap2",
 "rand",
 "rayon",
 "serde",
 "thiserror",
]

[[package]]
name = "either"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"

[[package]]
name = "getrandom"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
 "cfg-if",
 "libc",
 "wasi",
]

[[package]]
name = "libc"
version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"

[[package]]
name = "memmap2"
version = "0.5.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327"
dependencies = [
 "libc",
]

[[package]]
name = "ppv-lite86"
version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
dependencies = [
 "zerocopy",
]

[[package]]
name = "proc-macro2"
version = "1.0.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
dependencies = [
 "unicode-ident",
]

[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
 "proc-macro2",
]

[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
 "libc",
 "rand_chacha",
 "rand_core",
]

[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
 "ppv-lite86",
 "rand_core",
]

[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
 "getrandom",
]

[[package]]
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
 "either",
 "rayon-core",
]

[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
 "crossbeam-deque",
 "crossbeam-utils",
]

[[package]]
name = "serde"
version = "1.0.216"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e"
dependencies = [
 "serde_derive",
]

[[package]]
name = "serde_derive"
version = "1.0.216"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "syn"
version = "2.0.91"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d53cbcb5a243bd33b7858b1d7f4aca2153490815872d86d955d6ea29f743c035"
dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
]

[[package]]
name = "thiserror"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [
 "thiserror-impl",
]

[[package]]
name = "thiserror-impl"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "unicode-ident"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"

[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

[[package]]
name = "zerocopy"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
 "byteorder",
 "zerocopy-derive",
]

[[package]]
name = "zerocopy-derive"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "diskann-rs"
version = "0.1.0"
edition = "2021"
description = "A Rust implementation of DiskANN (Disk-based Approximate Nearest Neighbor search) featuring a 3-layer index architecture and parallel query processing. This project provides an efficient and scalable solution for large-scale vector similarity search with single-file storage."
license = "MIT"
repository = "https://github.com/lukaesch/diskann-rs"

[dependencies]
rand = "0.8"
memmap2 = "0.5"
serde = { version = "1.0", features = ["derive"] }
bincode = "1.3"
thiserror = "1.0"
bytemuck = "1.14"
rayon = "1.7"

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Lukas Schmyrczyk

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DiskANN Implementation in Rust

[![Rust](https://github.com/lukaesch/diskann-rs/actions/workflows/rust.yml/badge.svg?branch=main)](https://github.com/lukaesch/diskann-rs/actions/workflows/rust.yml)

A Rust implementation of DiskANN (Disk-based Approximate Nearest Neighbor search) featuring a 3-layer index architecture and parallel query processing. This project provides an efficient and scalable solution for large-scale vector similarity search with single-file storage.

## Overview

This implementation provides a memory-efficient approach to similarity search by:
- Using a 3-layer hierarchical index structure for faster search
- Storing all data in a single file using memory mapping
- Supporting both Euclidean distance and Cosine similarity
- Managing adjacency lists for graph-based search
- Implementing parallel query processing
- Supporting large-scale datasets that don't fit in RAM

## Features

- Three-layer hierarchical index structure:
  - Top layer (L0): Smallest, most selective layer
  - Middle layer (L1): Intermediate connectivity
  - Base layer (L2): Complete dataset
- Single-file storage format for simplified deployment
- Choice of distance metrics:
  - Euclidean distance
  - Cosine similarity
- Cluster-based graph construction for meaningful adjacency
- Parallel query processing using rayon
- Memory-mapped file access for handling large datasets
- Comprehensive error handling with custom error types

## Usage

### Building a New Index

```rust
use diskann_rs::{SingleFileDiskANN, DistanceMetric};

let index = SingleFileDiskANN::build_index_singlefile(
    1_000_000,                 // number of vectors
    128,                       // dimension
    32,                        // max neighbors per node
    0.01,                      // fraction of vectors in top layer
    0.1,                       // fraction of vectors in middle layer
    DistanceMetric::Euclidean, // or DistanceMetric::Cosine
    "index.db",                // single file to store everything
)?;
```

### Opening an Existing Index

```rust
let index = SingleFileDiskANN::open_index_singlefile("index.db")?;
```

### Searching the Index

```rust
// Prepare your query vector; its length must match the index dimension
let query = vec![0.0_f32; 128];

// Search for nearest neighbors
let k = 10;          // number of neighbors to return
let beam_width = 64; // search beam width
let neighbors = index.search(&query, k, beam_width);
```
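
Both `k` and `beam_width` trade accuracy against latency: a wider beam explores more candidates per layer and typically improves recall at a roughly linear cost in distance computations. One practical way to pick a value is to sweep it and watch when the result set stops changing; a minimal sketch, reusing the `index` and `query` bindings from above:

```rust
// Hypothetical tuning loop: widen the beam until the results stabilize.
for beam_width in [16, 32, 64, 128] {
    let neighbors = index.search(&query, 10, beam_width);
    println!("beam_width={beam_width} => {neighbors:?}");
}
```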

### Parallel Search

```rust
use rayon::prelude::*;
use std::sync::Arc;

// Create shared index reference
let index = Arc::new(index);

// Perform parallel queries
let results: Vec<Vec<u32>> = query_batch
    .par_iter()
    .map(|query| index.search(query, k, beam_width))
    .collect();
```

## Performance Characteristics

- Memory Usage: O(1) for vector storage due to memory mapping
- Disk Space: Single file containing:
  - Vectors: num_vectors * dimension * 4 bytes
  - Adjacency Lists: Varies by layer size and max_degree
  - Metadata: Small overhead
- Search Time: sublinear in practice, driven by the hierarchical structure and the bounded beam width
- Parallel Processing: Scales with available CPU cores
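
From the layout above you can estimate the index file size before building it. A back-of-the-envelope sketch (the 1 MiB metadata reserve and 4-byte ids mirror the constants in `src/lib.rs`; the small `size_l1 >= size_l0` clamp in the builder is ignored here):

```rust
/// Rough on-disk size estimate for an index built with the given parameters.
fn estimated_index_bytes(
    num_vectors: u64,
    dim: u64,
    max_degree: u64,
    fraction_top: f64,
    fraction_mid: f64,
) -> u64 {
    let header = 1024 * 1024;            // metadata region reserved before the vectors
    let vectors = num_vectors * dim * 4; // f32 payload
    let per_node = max_degree * 4;       // u32 neighbor ids per node
    let l0 = (num_vectors as f64 * fraction_top).ceil() as u64;
    let l1 = (num_vectors as f64 * fraction_mid).ceil() as u64;
    header + vectors + (l0 + l1 + num_vectors) * per_node
}
```

For the 1M-vector, 128-dimensional, degree-32 configuration shown earlier this works out to roughly 655 MB (512 MB of vectors plus about 142 MB of adjacency lists).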

## Building and Testing

```bash
# Build the library
cargo build --release

# Run tests
cargo test

# Run the demo example
cargo run --release --example demo
```

## Current Status

This implementation features:
- [x] Single-file storage format
- [x] 3-layer hierarchical index structure
- [x] Multiple distance metrics support
- [x] Cluster-based graph construction
- [x] Parallel query processing
- [x] Memory-mapped I/O
- [x] Comprehensive test suite

## Future Improvements

1. Add more distance metrics
2. Implement dynamic index updates
3. Add parameter auto-tuning
4. Expand benchmarking suite
5. Add more examples
6. Improve documentation

## Contributing

Contributions are welcome! Please feel free to:
- Open issues for bugs or feature requests
- Submit PRs for improvements
- Share ideas for optimization

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## References

- Original DiskANN paper: [DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node](https://www.microsoft.com/en-us/research/publication/diskann-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node/)

--------------------------------------------------------------------------------
/examples/demo.rs:
--------------------------------------------------------------------------------
// examples/demo.rs
use diskann_rs::{DiskAnnError, DistanceMetric, SingleFileDiskANN};
use std::sync::Arc;

fn main() -> Result<(), DiskAnnError> {
    let singlefile_path = "diskann.db";
    let num_vectors = 100_000;
    let dim = 128;
    let max_degree = 32;
    let fraction_top = 0.01;
    let fraction_mid = 0.1;
    let distance_metric = DistanceMetric::Cosine;

    // Build if missing
    if !std::path::Path::new(singlefile_path).exists() {
        println!("Building single-file diskann at {singlefile_path}...");
        let index = SingleFileDiskANN::build_index_singlefile(
            num_vectors,
            dim,
            max_degree,
            fraction_top,
            fraction_mid,
            distance_metric,
            singlefile_path,
        )?;
        println!("Build done. Index dimension = {}", index.dim);
    } else {
        println!("Index file {singlefile_path} already exists, skipping build.");
    }

    // Open
    let index = Arc::new(SingleFileDiskANN::open_index_singlefile(singlefile_path)?);

    // Query (a real query should contain `dim` values)
    let query = vec![0.1, 0.2, 0.3 /* ... up to dim */];
    let k = 10;
    let beam_width = 64;
    let neighbors = index.search(&query, k, beam_width);
    println!("Neighbors for the sample query = {:?}", neighbors);

    Ok(())
}

--------------------------------------------------------------------------------
/examples/perf_test.rs:
--------------------------------------------------------------------------------
use diskann_rs::{DiskAnnError, DistanceMetric, SingleFileDiskANN};
use rand::prelude::*;
use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
use std::sync::Arc;
use std::time::Instant;

fn main() -> Result<(), DiskAnnError> {
    const NUM_VECTORS: usize = 1_000_000;
    const DIM: usize = 1536;
    const MAX_DEGREE: usize = 32;
    const FRACTION_TOP: f64 = 0.01;
    const FRACTION_MID: f64 = 0.1;
    let distance_metric = DistanceMetric::Cosine;

    let singlefile_path = "diskann_parallel.db";

    // Build if missing
    if !std::path::Path::new(singlefile_path).exists() {
        println!(
            "Building single-file index with parallel adjacency + distance={:?}",
            distance_metric
        );
        let start = Instant::now();
        let _index = SingleFileDiskANN::build_index_singlefile(
            NUM_VECTORS,
            DIM,
            MAX_DEGREE,
            FRACTION_TOP,
            FRACTION_MID,
            distance_metric,
            singlefile_path,
        )?;
        let elapsed = start.elapsed().as_secs_f32();
        println!("Done building index in {:.2} s", elapsed);
    } else {
        println!(
            "Index file {} already exists, skipping build.",
            singlefile_path
        );
    }

    // open
    let open_start = Instant::now();
    let index = Arc::new(SingleFileDiskANN::open_index_singlefile(singlefile_path)?);
    let open_time = open_start.elapsed().as_secs_f32();
    println!(
        "Opened index with {} vectors, dim={}, metric={:?} in {:.2} s",
        index.num_vectors, index.dim, index.distance_metric, open_time
    );

    // Create queries
    let queries = 5;
    let k = 10;
    let beam_width = 64;

    // Generate all queries in a batch
    let mut rng = rand::thread_rng();
    let mut query_batch: Vec<Vec<f32>> = Vec::with_capacity(queries);
    for _ in 0..queries {
        let q: Vec<f32> = (0..index.dim).map(|_| rng.gen()).collect();
        query_batch.push(q);
    }

    // Now run queries in parallel
    let search_start = Instant::now();
    query_batch.par_iter().enumerate().for_each(|(i, query)| {
        let neighbors = index.search(query, k, beam_width);
        println!("Query {i} => top-{k} neighbors = {:?}", neighbors);
    });
    let search_time = search_start.elapsed().as_secs_f32();
    println!("Performed {queries} queries in {:.2} s", search_time);
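
    // Derived throughput (added for convenience): wall-clock queries/sec across
    // the Rayon pool. The println! inside the parallel loop adds some overhead,
    // so treat this as a lower bound.
    println!("~{:.1} queries/s", queries as f32 / search_time);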

    Ok(())
}

--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
//! # DiskAnnRS
//!
//! A DiskANN-like Rust library implementing approximate nearest neighbor search with
//! single-file storage support. The library provides both Euclidean distance and
//! Cosine similarity metrics, with a three-layer hierarchical search structure.
//!
//! ## Features
//!
//! - Single-file storage format
//! - Support for both Euclidean and Cosine distance metrics
//! - Parallel index construction using Rayon
//! - Memory-mapped file access for efficient searches
//! - Three-layer hierarchical search structure
//!
//! ## Example
//!
//! ```rust,no_run
//! use diskann_rs::{SingleFileDiskANN, DistanceMetric};
//!
//! // Build a new index
//! let index = SingleFileDiskANN::build_index_singlefile(
//!     1000,  // number of vectors
//!     128,   // dimensionality
//!     32,    // maximum degree
//!     0.1,   // fraction of vectors in top layer
//!     0.2,   // fraction of vectors in middle layer
//!     DistanceMetric::Euclidean,
//!     "index.db"
//! ).unwrap();
//!
//! // Search the index
//! let query = vec![0.0; 128]; // your query vector
//! let neighbors = index.search(&query, 10, 64); // find top 10 with beam width 64
//! ```

use bytemuck;
use memmap2::Mmap;
use rand::prelude::{Rng, SliceRandom};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::hash::Hash;
use std::{
    fs::{File, OpenOptions},
    io::{Seek, SeekFrom, Write},
    os::unix::fs::FileExt, // for read_at / write_at
    time::Instant,
};
use thiserror::Error;

/// Custom error type for DiskAnnRS operations
#[derive(Debug, Error)]
pub enum DiskAnnError {
    /// Represents I/O errors during file operations
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Represents serialization/deserialization errors
    #[error("Serialization error: {0}")]
    Bincode(#[from] bincode::Error),

    /// Represents index-specific errors
    #[error("Index error: {0}")]
    IndexError(String),
}

/// Supported distance metrics for vector comparison
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub enum DistanceMetric {
    /// Standard Euclidean distance
    Euclidean,
    /// Cosine similarity (converted to distance as 1 - similarity)
    Cosine,
}

/// Internal metadata structure stored in the index file
#[derive(Serialize, Deserialize, Debug)]
struct SingleFileMetadata {
    dim: usize,
    num_vectors: usize,
    max_degree: usize,
    fraction_top: f64,
    fraction_mid: f64,
    distance_metric: DistanceMetric,
    layer0_ids: Vec<u32>,
    layer1_ids: Vec<u32>,
    vectors_offset: u64,
    adjacency_offset: u64,
    offset_layer0: usize,
    offset_layer1: usize,
    offset_layer2: usize,
}

/// Main struct representing a DiskANN index
pub struct SingleFileDiskANN {
    /// Dimensionality of vectors in the index
    pub dim: usize,
    /// Number of vectors in the index
    pub num_vectors: usize,
    /// Maximum number of edges per node
    pub max_degree: usize,
    /// Fraction of vectors in top layer
    pub fraction_top: f64,
    /// Fraction of vectors in middle layer
    pub fraction_mid: f64,
    /// Distance metric used by this index
    pub distance_metric: DistanceMetric,

    layer0_ids: Vec<u32>,
    layer1_ids: Vec<u32>,
    offset_layer0: usize,
    offset_layer1: usize,
    offset_layer2: usize,
    vectors_offset: u64,
    adjacency_offset: u64,
    mmap: Mmap,
}

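// On-disk layout, as written by `build_index_singlefile` and read back by
// `open_index_singlefile` (summary comment for orientation):
//
//   offset 0           u64 (little-endian): byte length of the metadata blob
//   offset 8           bincode-serialized `SingleFileMetadata`
//   offset 1 MiB       vector data: `num_vectors * dim` f32 values, row-major
//   after the vectors  adjacency lists for layer 0, layer 1, then the base
//                      layer, `max_degree` u32 neighbor ids per node
//                      (0 pads short lists)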
impl SingleFileDiskANN {
    /// Builds a new single-file index with the specified parameters
    ///
    /// # Arguments
    ///
    /// * `num_vectors` - Number of vectors to store
    /// * `dim` - Dimensionality of the vectors
    /// * `max_degree` - Maximum number of edges per node
    /// * `fraction_top` - Fraction of vectors in top layer
    /// * `fraction_mid` - Fraction of vectors in middle layer
    /// * `distance_metric` - Distance metric to use
    /// * `singlefile_path` - Path where the index file will be created
    ///
    /// # Returns
    ///
    /// Returns `Result<Self, DiskAnnError>`
    pub fn build_index_singlefile(
        num_vectors: usize,
        dim: usize,
        max_degree: usize,
        fraction_top: f64,
        fraction_mid: f64,
        distance_metric: DistanceMetric,
        singlefile_path: &str,
    ) -> Result<Self, DiskAnnError> {
        let mut file = OpenOptions::new()
            .create(true)
            .write(true)
            .read(true)
            .truncate(true)
            .open(singlefile_path)?;

        let vectors_offset = 1024 * 1024;
        let total_vector_bytes = (num_vectors as u64) * (dim as u64) * 4;

        println!(
            "Generating {num_vectors} random vectors of dim {dim} => ~{:.2} GB on disk...",
            (num_vectors as f64 * dim as f64 * 4.0) / (1024.0 * 1024.0 * 1024.0)
        );
        let chunk_size = 100_000;
        let mut rng = rand::thread_rng();
        let mut buffer = Vec::with_capacity(chunk_size * dim);

        let gen_start = Instant::now();
        let mut written = 0usize;
        while written < num_vectors {
            buffer.clear();
            let remaining = num_vectors - written;
            let batch = remaining.min(chunk_size);
            for _ in 0..batch {
                for _ in 0..dim {
                    let val: f32 = rng.gen();
                    buffer.push(val);
                }
            }
            let bytes = bytemuck::cast_slice(&buffer);
            let offset = vectors_offset + (written * dim * 4) as u64;
            file.write_at(bytes, offset)?;
            written += batch;
        }
        let gen_time = gen_start.elapsed().as_secs_f32();
        println!("Vector generation took {gen_time:.2} s");

        let mut all_ids: Vec<u32> = (0..num_vectors as u32).collect();
        all_ids.shuffle(&mut rng);
        let size_l0 = (num_vectors as f64 * fraction_top).ceil() as usize;
        let size_l1 = (num_vectors as f64 * fraction_mid).ceil() as usize;
        let size_l1 = size_l1.max(size_l0);

        let l0slice = &all_ids[..size_l0];
        let l1slice = &all_ids[..size_l1];
        let mut layer0_ids = l0slice.to_vec();
        let mut layer1_ids = l1slice.to_vec();
        layer0_ids.sort_unstable();
        layer1_ids.sort_unstable();
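
        // Both layers are prefixes of the same shuffled id list, so layer 0 is
        // always a subset of layer 1. For example, with num_vectors = 1000,
        // fraction_top = 0.01 and fraction_mid = 0.1, layer 0 gets 10 ids and
        // layer 1 gets those same 10 plus 90 more.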

        let bytes_per_node = max_degree * 4;
        let offset_layer0 = 0;
        let offset_layer1 = size_l0 * bytes_per_node;
        let offset_layer2 = offset_layer1 + (size_l1 * bytes_per_node);
        let total_adj_bytes = offset_layer2 + (num_vectors * bytes_per_node);

        let adjacency_offset = vectors_offset + total_vector_bytes;
        let adjacency_end = adjacency_offset + total_adj_bytes as u64;
        file.set_len(adjacency_end)?;

        let cluster_count = 20;
        let centroids =
            pick_random_centroids(cluster_count, &file, vectors_offset, dim, num_vectors)?;

        let build_start = Instant::now();
        build_layer_adjacency_parallel(
            &file,
            adjacency_offset,
            offset_layer0,
            &layer0_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
        )?;
        build_layer_adjacency_parallel(
            &file,
            adjacency_offset,
            offset_layer1,
            &layer1_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
        )?;
        let base_ids: Vec<u32> = (0..num_vectors as u32).collect();
        build_layer_adjacency_parallel(
            &file,
            adjacency_offset,
            offset_layer2,
            &base_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
        )?;
        let build_time = build_start.elapsed().as_secs_f32();
        println!("Parallel adjacency build took {build_time:.2} s");

        let metadata = SingleFileMetadata {
            dim,
            num_vectors,
            max_degree,
            fraction_top,
            fraction_mid,
            distance_metric,
            layer0_ids,
            layer1_ids,
            vectors_offset,
            adjacency_offset,
            offset_layer0,
            offset_layer1,
            offset_layer2,
        };
        let md_bytes = bincode::serialize(&metadata)?;
        file.seek(SeekFrom::Start(0))?;
        let md_len = md_bytes.len() as u64;
        file.write_all(&md_len.to_le_bytes())?;
        file.write_all(&md_bytes)?;
        file.sync_all()?;

        let mmap = unsafe { memmap2::Mmap::map(&file)? };

        Ok(Self {
            dim,
            num_vectors,
            max_degree,
            fraction_top,
            fraction_mid,
            distance_metric: metadata.distance_metric,
            layer0_ids: metadata.layer0_ids,
            layer1_ids: metadata.layer1_ids,
            offset_layer0: metadata.offset_layer0,
            offset_layer1: metadata.offset_layer1,
            offset_layer2: metadata.offset_layer2,
            vectors_offset: metadata.vectors_offset,
            adjacency_offset: metadata.adjacency_offset,
            mmap,
        })
    }

    /// Opens an existing index file
    ///
    /// # Arguments
    ///
    /// * `path` - Path to the index file
    ///
    /// # Returns
    ///
    /// Returns `Result<Self, DiskAnnError>`
    pub fn open_index_singlefile(path: &str) -> Result<Self, DiskAnnError> {
        let file = OpenOptions::new().read(true).write(false).open(path)?;
        let mut buf8 = [0u8; 8];
        file.read_at(&mut buf8, 0)?;
        let md_len = u64::from_le_bytes(buf8);
        let mut md_bytes = vec![0u8; md_len as usize];
        file.read_at(&mut md_bytes, 8)?;
        let metadata: SingleFileMetadata = bincode::deserialize(&md_bytes)?;

        let mmap = unsafe { memmap2::Mmap::map(&file)? };

        Ok(Self {
            dim: metadata.dim,
            num_vectors: metadata.num_vectors,
            max_degree: metadata.max_degree,
            fraction_top: metadata.fraction_top,
            fraction_mid: metadata.fraction_mid,
            distance_metric: metadata.distance_metric,
            layer0_ids: metadata.layer0_ids,
            layer1_ids: metadata.layer1_ids,
            offset_layer0: metadata.offset_layer0,
            offset_layer1: metadata.offset_layer1,
            offset_layer2: metadata.offset_layer2,
            vectors_offset: metadata.vectors_offset,
            adjacency_offset: metadata.adjacency_offset,
            mmap,
        })
    }

    /// Searches the index for nearest neighbors
    ///
    /// # Arguments
    ///
    /// * `query` - Query vector
    /// * `k` - Number of nearest neighbors to return
    /// * `beam_width` - Beam width for the search
    ///
    /// # Returns
    ///
    /// Returns a vector of node IDs representing the nearest neighbors
    pub fn search(&self, query: &[f32], k: usize, beam_width: usize) -> Vec<u32> {
        // Probe the small upper layers (entry points; their results are not
        // reused by the chunked base-layer scan below)
        let _l0 = self.search_layer(query, &self.layer0_ids, self.offset_layer0, beam_width, 1);
        let _l1 = self.search_layer(query, &self.layer1_ids, self.offset_layer1, beam_width, 1);

        // Use chunks for base layer
        const CHUNK_SIZE: usize = 100_000;
        let mut results = Vec::with_capacity(k);
        for chunk_start in (0..self.num_vectors).step_by(CHUNK_SIZE) {
            let chunk_end = (chunk_start + CHUNK_SIZE).min(self.num_vectors);
            let chunk_ids: Vec<u32> = (chunk_start..chunk_end).map(|x| x as u32).collect();
            let chunk_results =
                self.search_layer(query, &chunk_ids, self.offset_layer2, beam_width, k);
            results.extend(chunk_results);
        }
        results.sort_by(|&a, &b| {
            let da = self.distance_to(query, a as usize);
            let db = self.distance_to(query, b as usize);
            da.partial_cmp(&db).unwrap()
        });
        results.truncate(k);
        results
    }

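    /// Greedy beam search within a single layer: seeds the beam at the closest
    /// of `layer_ids`, then repeatedly expands the current candidates'
    /// adjacency lists, keeping the best `beam_width` nodes seen so far and
    /// finally returning the top `k`.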
    fn search_layer(
        &self,
        query: &[f32],
        layer_ids: &[u32],
        layer_offset: usize,
        beam_width: usize,
        k: usize,
    ) -> Vec<u32> {
        if layer_ids.is_empty() {
            return vec![];
        }
        use std::cmp::{Ordering, Reverse};
        use std::collections::BinaryHeap;

        #[derive(Clone)]
        struct Candidate {
            dist: f32,
            node_id: u32,
        }
        impl PartialEq for Candidate {
            fn eq(&self, other: &Self) -> bool {
                self.dist == other.dist
            }
        }
        impl Eq for Candidate {}
        impl PartialOrd for Candidate {
            fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
                other.dist.partial_cmp(&self.dist) // Min-heap: smaller distance is "greater"
            }
        }
        impl Ord for Candidate {
            fn cmp(&self, other: &Self) -> Ordering {
                self.partial_cmp(other).unwrap_or(Ordering::Equal)
            }
        }

        // Find starting point
        let mut best_id = layer_ids[0];
        let mut best_dist = self.distance_to(query, best_id as usize);
        for &candidate_id in layer_ids.iter().skip(1) {
            let d = self.distance_to(query, candidate_id as usize);
            if d < best_dist {
                best_dist = d;
                best_id = candidate_id;
            }
        }
        let start_id = best_id;

        // Initialize
        let mut visited = vec![false; layer_ids.len()];
        let id_to_idx = layer_ids
            .iter()
            .enumerate()
            .map(|(i, &nid)| (nid, i))
            .collect::<std::collections::HashMap<u32, usize>>();

        let mut current = BinaryHeap::new();
        current.push(Candidate {
            dist: best_dist,
            node_id: start_id,
        });
        if let Some(&idx) = id_to_idx.get(&start_id) {
            visited[idx] = true;
        }

        // `best` is wrapped in Reverse so that the heap's top is the *farthest*
        // candidate: pop() then evicts the worst, keeping the closest
        // beam_width seen so far (Candidate's own Ord is inverted to give
        // min-heap behavior for `current`/`next`).
        let mut best = BinaryHeap::new();
        best.push(Reverse(Candidate {
            dist: best_dist,
            node_id: start_id,
        }));

        // Beam search with a maximum number of iterations
        let max_iterations = 100; // Adjust based on experimentation
        for _ in 0..max_iterations {
            let mut next = BinaryHeap::new();
            while let Some(current_cand) = current.pop() {
                let neighbors = self.get_layer_neighbors(current_cand.node_id, layer_offset);
                for &nbr in neighbors {
                    if nbr == 0 {
                        continue;
                    }
                    if let Some(&nbr_idx) = id_to_idx.get(&nbr) {
                        if !visited[nbr_idx] {
                            visited[nbr_idx] = true;
                            let d = self.distance_to(query, nbr as usize);
                            let cand = Candidate {
                                dist: d,
                                node_id: nbr,
                            };
                            next.push(cand.clone());
                            best.push(Reverse(cand));
                            if best.len() > beam_width {
                                best.pop(); // Evict the farthest; keep top beam_width
                            }
                        }
                    }
                }
            }
            // Prepare next iteration: take top beam_width from next
            current.clear();
            while current.len() < beam_width && !next.is_empty() {
                if let Some(cand) = next.pop() {
                    current.push(cand);
                }
            }
            if current.is_empty() {
                break;
            }
        }

        let mut final_vec: Vec<Candidate> = best.into_vec().into_iter().map(|r| r.0).collect();
        final_vec.sort_by(|a, b| a.dist.partial_cmp(&b.dist).unwrap());
        final_vec.truncate(k);
        final_vec.into_iter().map(|c| c.node_id).collect()
    }

    fn get_layer_neighbors(&self, node_id: u32, layer_offset: usize) -> &[u32] {
        let node_off = layer_offset + (node_id as usize * self.max_degree * 4);
        let start = (self.adjacency_offset as usize) + node_off;
        let end = start + (self.max_degree * 4);
        let bytes = &self.mmap[start..end];
        bytemuck::cast_slice(bytes)
    }

    fn distance_to(&self, query: &[f32], idx: usize) -> f32 {
        let vector_offset = self.vectors_offset + (idx * self.dim * 4) as u64;
        let start = vector_offset as usize;
        let end = start + (self.dim * 4);
        let bytes = &self.mmap[start..end];
        let vecf: &[f32] = bytemuck::cast_slice(bytes);

        match self.distance_metric {
            DistanceMetric::Euclidean => euclidean_distance(query, vecf),
            DistanceMetric::Cosine => 1.0 - cosine_similarity(query, vecf),
        }
    }
}

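/// Builds the adjacency lists for one layer: assigns every node to its nearest
/// centroid, then, within each cluster bucket, links each node to its
/// `max_degree` nearest neighbors among a random sample of that bucket
/// (neighbor ids are padded with 0 when fewer are available).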
fn build_layer_adjacency_parallel(
    file: &File,
    adjacency_offset: u64,
    layer_offset: usize,
    layer_ids: &[u32],
    dim: usize,
    max_degree: usize,
    vectors_offset: u64,
    centroids: &[(usize, Vec<f32>)],
    distance_metric: DistanceMetric,
) -> Result<(), DiskAnnError> {
    if layer_ids.is_empty() {
        return Ok(());
    }

    let node_assignments: Vec<(usize, u32)> = layer_ids
        .par_iter()
        .map(|&nid| {
            let nv = read_vector(file, vectors_offset, dim, nid as usize).unwrap();
            let mut best_c = 0;
            let mut best_d = f32::MAX;
            for (cidx, (_, cvec)) in centroids.iter().enumerate() {
                let d = match distance_metric {
                    DistanceMetric::Euclidean => euclidean_distance(&nv, cvec),
                    DistanceMetric::Cosine => 1.0 - cosine_similarity(&nv, cvec),
                };
                if d < best_d {
                    best_d = d;
                    best_c = cidx;
                }
            }
            (best_c, nid)
        })
        .collect();

    let cluster_count = centroids.len();
    let mut buckets = vec![Vec::new(); cluster_count];
    for (cidx, nid) in node_assignments {
        buckets[cidx].push(nid);
    }

    buckets.into_par_iter().for_each(|bucket| {
        if bucket.len() <= 1 {
            return;
        }
        let mut rng = rand::thread_rng();
        let sample_size = 256.min(bucket.len());
        let mut sample_ids = bucket.clone();
        sample_ids.shuffle(&mut rng);
        sample_ids.truncate(sample_size);

        let sample_vecs: Vec<(u32, Vec<f32>)> = sample_ids
            .iter()
            .map(|&sid| {
                let v = read_vector(file, vectors_offset, dim, sid as usize).unwrap();
                (sid, v)
            })
            .collect();

        bucket.par_iter().for_each(|&nid| {
            let nv = read_vector(file, vectors_offset, dim, nid as usize).unwrap();
            let mut dists = Vec::with_capacity(sample_vecs.len());
            for (sid, sv) in &sample_vecs {
                if *sid != nid {
                    let d = match distance_metric {
                        DistanceMetric::Euclidean => euclidean_distance(&nv, sv),
                        DistanceMetric::Cosine => 1.0 - cosine_similarity(&nv, sv),
                    };
                    dists.push((*sid, d));
                }
            }
            dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
            dists.truncate(max_degree);

            let mut nbrs: Vec<u32> = dists.iter().map(|(id, _)| *id).collect();
            while nbrs.len() < max_degree {
                nbrs.push(0);
            }
            let node_off = layer_offset + (nid as usize * max_degree * 4);
            let off = adjacency_offset + node_off as u64;
            let bytes = bytemuck::cast_slice(&nbrs);
            file.write_at(bytes, off).unwrap();
        });
    });

    Ok(())
}

fn pick_random_centroids(
    cluster_count: usize,
    file: &File,
    vectors_offset: u64,
    dim: usize,
    num_vectors: usize,
) -> Result<Vec<(usize, Vec<f32>)>, DiskAnnError> {
    let mut rng = rand::thread_rng();
    let mut cents = Vec::with_capacity(cluster_count);
    for _ in 0..cluster_count {
        let id = rng.gen_range(0..num_vectors);
        let vec = read_vector(file, vectors_offset, dim, id)?;
        cents.push((id, vec));
    }
    Ok(cents)
}

fn read_vector(
    file: &File,
    vectors_offset: u64,
    dim: usize,
    idx: usize,
) -> Result<Vec<f32>, DiskAnnError> {
    let off = vectors_offset + (idx * dim * 4) as u64;
    let mut buf = vec![0u8; dim * 4];
    file.read_at(&mut buf, off)?;
    let floats: &[f32] = bytemuck::cast_slice(&buf);
    Ok(floats.to_vec())
}

/// Computes Euclidean distance between two vectors
fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b.iter())
        .map(|(x, y)| (x - y) * (x - y))
        .sum::<f32>()
        .sqrt()
}

/// Computes cosine similarity between two vectors
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    let mut dot = 0.0;
    let mut norm_a = 0.0;
    let mut norm_b = 0.0;
    for (x, y) in a.iter().zip(b.iter()) {
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    dot / (norm_a.sqrt() * norm_b.sqrt())
}

/// Basic unit tests
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::*;

    // We'll define a small set of 5 2D vectors in a static array.
    // This is purely deterministic.
    const TEST_VECTORS_2D: &[[f32; 2]] =
        &[[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [0.5, 0.5]];

    /// A special builder that does not generate random vectors, but writes
    /// the `TEST_VECTORS_2D` to the file, uses cluster_count=1 or 2,
    /// and no partial sampling for the adjacency build.
    fn build_index_singlefile_for_test(
        dim: usize,
        distance_metric: DistanceMetric,
        file_path: &str,
    ) -> Result<SingleFileDiskANN, DiskAnnError> {
        // 1) Create/truncate the file
        let mut file = std::fs::OpenOptions::new()
            .create(true)
            .write(true)
            .read(true)
            .truncate(true)
            .open(file_path)?;

        let num_vectors = TEST_VECTORS_2D.len();
        let vectors_offset = 1024 * 1024;
        let total_vector_bytes = (num_vectors as u64) * (dim as u64) * 4;
        file.set_len(vectors_offset + total_vector_bytes)?;

        // 2) Write the fixed vectors into the file
        // We'll assume dim=2 matches TEST_VECTORS_2D
        for (i, vec2) in TEST_VECTORS_2D.iter().enumerate() {
            let offset = vectors_offset + (i * dim * 4) as u64;
            let bytes = bytemuck::cast_slice(vec2);
            file.write_at(bytes, offset)?;
        }

        // 3) We'll place all vectors in a single "cluster" => cluster_count=1
        // So adjacency is effectively complete (we won't skip anything).
        // Or if you want 2 clusters, do cluster_count=2 but it usually won't matter
        let cluster_count = 1;

        // We'll store IDs in [0..num_vectors)
        let layer0_ids: Vec<u32> = (0..num_vectors as u32).collect();
        let layer1_ids = layer0_ids.clone();
        let fraction_top = 0.2;
        let fraction_mid = 0.4;
        let max_degree = 2; // for the small test
        let bytes_per_node = max_degree * 4;

        let offset_layer0 = 0;
        let offset_layer1 = layer0_ids.len() * bytes_per_node;
        let offset_layer2 = offset_layer1 + (layer1_ids.len() * bytes_per_node);

        let adjacency_offset = vectors_offset + total_vector_bytes;
        let total_adj_bytes = offset_layer2 + (num_vectors * bytes_per_node);
        file.set_len(adjacency_offset + total_adj_bytes as u64)?;

        // 4) Build adjacency *without partial sampling*
        // We'll treat the entire bucket. That ensures a "complete" adjacency for a small set.
        let centroids =
            pick_predefined_centroids(cluster_count, &file, vectors_offset, dim, distance_metric)?;

        // Build adjacency for layer0 => all vectors
        build_layer_adjacency_test(
            &file,
            adjacency_offset,
            offset_layer0,
            &layer0_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
            /* no partial sampling = entire bucket */
        )?;

        // same for layer1 => same IDs
        build_layer_adjacency_test(
            &file,
            adjacency_offset,
            offset_layer1,
            &layer1_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
            /* no partial sampling = entire bucket */
        )?;

        // layer2 => also same set
        let base_ids: Vec<u32> = (0..num_vectors as u32).collect();
        build_layer_adjacency_test(
            &file,
            adjacency_offset,
            offset_layer2,
            &base_ids,
            dim,
            max_degree,
            vectors_offset,
            &centroids,
            distance_metric,
            /* no partial sampling */
        )?;

        // 5) Write metadata
        let md = SingleFileMetadata {
            dim,
            num_vectors,
            max_degree,
            fraction_top,
            fraction_mid,
            distance_metric,
            layer0_ids: layer0_ids.clone(),
            layer1_ids: layer1_ids.clone(),
            vectors_offset,
            adjacency_offset,
            offset_layer0,
            offset_layer1,
            offset_layer2,
        };
        let md_bytes = bincode::serialize(&md)?;
        file.seek(std::io::SeekFrom::Start(0))?;
        let md_len = md_bytes.len() as u64;
        file.write_all(&md_len.to_le_bytes())?;
        file.write_all(&md_bytes)?;
        file.sync_all()?;

        let mmap = unsafe { memmap2::Mmap::map(&file)? };

        // Return the struct
        Ok(SingleFileDiskANN {
            dim,
            num_vectors,
            max_degree: md.max_degree,
            fraction_top: md.fraction_top,
            fraction_mid: md.fraction_mid,
            distance_metric: md.distance_metric,
            layer0_ids: md.layer0_ids,
            layer1_ids: md.layer1_ids,
            offset_layer0: md.offset_layer0,
            offset_layer1: md.offset_layer1,
            offset_layer2: md.offset_layer2,
            vectors_offset: md.vectors_offset,
            adjacency_offset: md.adjacency_offset,
            mmap,
        })
    }

    /// We define a small "centroid" for cluster_count=1 => we just pick e.g. the first vector
    fn pick_predefined_centroids(
        cluster_count: usize,
        file: &std::fs::File,
        vectors_offset: u64,
        dim: usize,
        _distance_metric: DistanceMetric,
    ) -> Result<Vec<(usize, Vec<f32>)>, DiskAnnError> {
        // If cluster_count=1, let's just pick the first vector
        let mut out = Vec::new();
        for i in 0..cluster_count {
            // We'll pick i-th
            let v = read_vector(file, vectors_offset, dim, i)?;
            out.push((i, v));
        }
        Ok(out)
    }

    /// A specialized adjacency builder that uses the entire bucket for small tests
    fn build_layer_adjacency_test(
        file: &std::fs::File,
        adjacency_offset: u64,
        layer_offset: usize,
        layer_ids: &[u32],
        dim: usize,
        max_degree: usize,
        vectors_offset: u64,
        _centroids: &[(usize, Vec<f32>)],
        distance_metric: DistanceMetric,
        // no partial sampling => entire bucket
    ) -> Result<(), DiskAnnError> {
        if layer_ids.is_empty() {
            return Ok(());
        }
        // single cluster approach => everything in 1 bucket
        let mut bucket = Vec::new();
        for &nid in layer_ids {
            bucket.push(nid);
        }

        // sample = entire bucket
        let sample_vecs: Vec<(u32, Vec<f32>)> = bucket
            .iter()
            .map(|&sid| {
                let v = read_vector(file, vectors_offset, dim, sid as usize).unwrap();
                (sid, v)
            })
            .collect();

        // adjacency for each node => top max_degree from entire bucket minus itself
        for &nid in bucket.iter() {
            let nv = read_vector(file, vectors_offset, dim, nid as usize).unwrap();
            // compute distance to all other nodes in bucket
            let mut dists = Vec::with_capacity(sample_vecs.len());
            for (sid, sv) in &sample_vecs {
                if *sid != nid {
                    let d = match distance_metric {
                        DistanceMetric::Euclidean => euclidean_distance(&nv, sv),
                        DistanceMetric::Cosine => 1.0 - cosine_similarity(&nv, sv),
                    };
                    dists.push((*sid, d));
                }
            }
            dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
            dists.truncate(max_degree);

            let mut nbrs: Vec<u32> = dists.iter().map(|(id, _)| *id).collect();
            while nbrs.len() < max_degree {
                nbrs.push(0);
            }

            let node_off = layer_offset + (nid as usize * max_degree * 4);
            let off = adjacency_offset + node_off as u64;
            let bytes = bytemuck::cast_slice(&nbrs);
            file.write_at(bytes, off).unwrap();
        }

        Ok(())
    }

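    // A small added sanity check for the distance functions themselves,
    // using hand-computable values (3-4-5 triangle; orthogonal unit vectors).
    #[test]
    fn test_distance_functions() {
        // Euclidean: distance from the origin to (3, 4) is 5.
        let d = euclidean_distance(&[0.0, 0.0], &[3.0, 4.0]);
        assert!((d - 5.0).abs() < 1e-6);

        // Cosine: orthogonal vectors have similarity 0, parallel ones 1.
        assert!(cosine_similarity(&[1.0, 0.0], &[0.0, 1.0]).abs() < 1e-6);
        assert!((cosine_similarity(&[2.0, 0.0], &[5.0, 0.0]) - 1.0).abs() < 1e-6);

        // The zero vector is defined to have similarity 0 with anything.
        assert_eq!(cosine_similarity(&[0.0, 0.0], &[1.0, 0.0]), 0.0);
    }
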
    // test_small_euclidean using the above "hard-coded" approach
    #[test]
    fn test_small_euclidean() -> Result<(), DiskAnnError> {
        let tmpfile = "test_small_euclid.db";
        if std::path::Path::new(tmpfile).exists() {
            std::fs::remove_file(tmpfile).unwrap();
        }

        // build w/ no random + single cluster => adjacency is effectively "complete"
        let index = build_index_singlefile_for_test(
            2, // dim
            DistanceMetric::Euclidean,
            tmpfile,
        )?;

        // We'll pick vector 0 as the query
        let query = index.get_vector(0)?;
        let k = 2;
        let beam_width = 4;
        let neighbors = index.search(&query, k, beam_width);

        // Manually compute actual top-2 by Euclidean among our TEST_VECTORS_2D
        // We know them from e.g. [0,0], [1,0], [0,1], [1,1], [0.5,0.5]
        // In this approach, let's do it generically:
        let n = index.num_vectors;
        let mut dists: Vec<(usize, f32)> = (0..n)
            .map(|i| {
                let v = index.get_vector(i)?;
                Ok((i, euclidean_distance(&query, &v)))
            })
            .collect::<Result<Vec<_>, DiskAnnError>>()?;

        dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        let correct_ids: Vec<usize> = dists[..k].iter().map(|(i, _)| *i).collect();

        let set1: std::collections::HashSet<_> = correct_ids.into_iter().collect();
        let set2: std::collections::HashSet<_> = neighbors.iter().map(|&x| x as usize).collect();
        assert_eq!(set1, set2);

        Ok(())
    }

    #[test]
    fn test_small_cosine() -> Result<(), DiskAnnError> {
        let tmpfile = "test_small_cosine.db";
        if std::path::Path::new(tmpfile).exists() {
            std::fs::remove_file(tmpfile).unwrap();
        }

        let index = build_index_singlefile_for_test(
            2, // dim
            DistanceMetric::Cosine,
            tmpfile,
        )?;

        // Use vector[1] = [1.0, 0.0] as query instead of vector[0]
        let query = index.get_vector(1)?;
        let k = 2;
        let beam_width = 4;
        let neighbors = index.search(&query, k, beam_width);

        // compute actual top-2 by (1 - cos)
        let n = index.num_vectors;
        let mut dists: Vec<(usize, f32)> = (0..n)
            .map(|i| {
                let v = index.get_vector(i)?;
                let sim = cosine_similarity(&query, &v);
                Ok((i, 1.0 - sim)) // interpret distance = 1 - cos
            })
            .collect::<Result<Vec<_>, DiskAnnError>>()?;

        dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        let correct_ids: Vec<usize> = dists[..k].iter().map(|(i, _)| *i).collect();

        let set1: HashSet<_> = setify(correct_ids);
        let set2: HashSet<_> = setify(neighbors.iter().map(|&x| x as usize));
        assert_eq!(set1, set2);

        Ok(())
    }

    // small convenience fn to build a hashset from an iterator
    fn setify<I>(iter: I) -> HashSet<<I as IntoIterator>::Item>
    where
        I: IntoIterator,
        <I as IntoIterator>::Item: Eq + Hash,
    {
        iter.into_iter().collect()
    }

    // reuses your get_vector for the test
    impl SingleFileDiskANN {
        pub fn get_vector(&self, idx: usize) -> Result<Vec<f32>, DiskAnnError> {
            let vector_offset = self.vectors_offset + (idx * self.dim * 4) as u64;
            let start = vector_offset as usize;
            let end = start + (self.dim * 4);
            let bytes = &self.mmap[start..end];
            let vecf: &[f32] = bytemuck::cast_slice(bytes);
            Ok(vecf.to_vec())
        }
    }
}
--------------------------------------------------------------------------------