├── .github └── workflows │ └── ci.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── benches ├── basic.rs ├── compare.rs └── concurrent.rs ├── src └── lib.rs └── tests ├── basic.rs └── concurrent.rs /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: ["**"] 6 | pull_request: 7 | branches: ["**"] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | rust: 15 | name: Rust checks 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v4 20 | 21 | - name: Install Rust (stable) 22 | uses: dtolnay/rust-toolchain@stable 23 | with: 24 | components: rustfmt, clippy 25 | 26 | - name: Cache cargo 27 | uses: Swatinem/rust-cache@v2 28 | with: 29 | cache-on-failure: true 30 | 31 | - name: Format 32 | run: cargo fmt --all -- --check 33 | 34 | - name: Clippy 35 | run: cargo clippy --lib --tests -- -D warnings 36 | 37 | - name: Build 38 | run: cargo build --verbose 39 | 40 | - name: Test 41 | run: cargo test --verbose 42 | 43 | rust-32: 44 | name: Rust checks (i686) 45 | runs-on: ubuntu-latest 46 | steps: 47 | - name: Checkout 48 | uses: actions/checkout@v4 49 | 50 | - name: Install Rust (stable) 51 | uses: dtolnay/rust-toolchain@stable 52 | with: 53 | targets: i686-unknown-linux-gnu 54 | 55 | - name: Install dependencies 56 | run: sudo apt-get update && sudo apt-get install -y gcc-multilib 57 | 58 | - name: Cache cargo 59 | uses: Swatinem/rust-cache@v2 60 | with: 61 | cache-on-failure: true 62 | 63 | - name: Build 64 | run: cargo build --verbose --target i686-unknown-linux-gnu 65 | 66 | - name: Test 67 | run: cargo test --verbose --target i686-unknown-linux-gnu 68 | 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | debug 2 | target 3 | Cargo.lock 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "atomic-cuckoo-filter" 3 | version = "0.2.0" 4 | edition = "2024" 5 | description = "Highly concurrent Cuckoo Filter" 6 | license = "MIT" 7 | repository = "https://github.com/farhadi/atomic-cuckoo-filter" 8 | readme = "README.md" 9 | 10 | [dependencies] 11 | derive_builder = "0.20" 12 | parking_lot_core = "0.9" 13 | rand = "0.9" 14 | thiserror = "2.0" 15 | 16 | [dev-dependencies] 17 | ahash = "0.8" 18 | cuckoofilter = "0.5" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Ali Farhadi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Atomic Cuckoo Filter 2 | 3 | A high-performance, lock-free concurrent cuckoo filter implementation in Rust for efficient set membership testing. 4 | 5 | [![Crates.io](https://img.shields.io/crates/v/atomic-cuckoo-filter.svg)](https://crates.io/crates/atomic-cuckoo-filter) 6 | [![Documentation](https://docs.rs/atomic-cuckoo-filter/badge.svg)](https://docs.rs/atomic-cuckoo-filter) 7 | [![CI](https://img.shields.io/github/actions/workflow/status/farhadi/atomic-cuckoo-filter/ci.yml?branch=main&style=flat-square&logo=github)](https://github.com/farhadi/atomic-cuckoo-filter/actions) 8 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) 9 | 10 | ## Overview 11 | 12 | This crate provides a sophisticated implementation of a cuckoo filter - a probabilistic data structure 13 | for fast set membership testing. Unlike traditional implementations, this version uses **lock-free** 14 | atomic operations and is designed for high-concurrency environments. 15 | 16 | ## Key Features 17 | 18 | ✨ **Lock-Free Concurrency**: All operations use atomic compare-exchange loops instead of traditional locks 19 | 🚀 **High Performance**: Optimized for multi-threaded environments with minimal blocking 20 | 🔍 **No False Negatives**: Items that were inserted are guaranteed to be found 21 | 🎯 **Controllable False Positives**: Configurable fingerprint size to tune accuracy 22 | 📦 **Space Efficient**: ~20-30% less memory usage than Bloom filters for the same false positive rate 23 | 🗑️ **Deletion Support**: Unlike Bloom filters, inserted items can be safely removed 24 | ⏱️ **Bounded Lookup Time**: Always at most 2 bucket checks maximum 25 | 🔧 **Highly Configurable**: Customizable capacity, fingerprint size, bucket size, and eviction limits 26 | 27 | ## Quick Start 28 | 29 | Add this to your `Cargo.toml`: 30 | 31 | ```toml 32 | [dependencies] 33 | atomic-cuckoo-filter = "0.2" 34 | ``` 35 | 36 | ### Basic Usage 37 | 38 | ```rust 39 | use atomic_cuckoo_filter::CuckooFilter; 40 | 41 | // Create a filter with default settings 42 | let filter = CuckooFilter::new(); 43 | 44 | // Insert items 45 | filter.insert(&"hello").unwrap(); 46 | filter.insert(&"world").unwrap(); 47 | filter.insert(&42).unwrap(); 48 | 49 | // Check membership 50 | assert!(filter.contains(&"hello")); 51 | assert!(filter.contains(&42)); 52 | assert!(!filter.contains(&"rust")); 53 | 54 | // Remove items 55 | assert!(filter.remove(&"hello")); 56 | assert!(!filter.contains(&"hello")); 57 | 58 | // Count occurrences (not meant to be used as a counting filter, but to detect duplicates or hash collisions) 59 | filter.insert(&"duplicate").unwrap(); 60 | filter.insert(&"duplicate").unwrap(); 61 | assert_eq!(filter.count(&"duplicate"), 2); 62 | 63 | println!("Filter contains {} items", filter.len()); 64 | 65 | // Unique Insertions (Atomically check and insert items) 66 | // 
Returns Ok(true) if inserted, Ok(false) if already present 67 | match filter.insert_unique(&"item") { 68 | Ok(true) => println!("Item was inserted"), 69 | Ok(false) => println!("Item already existed"), 70 | Err(e) => println!("Filter is full: {}", e), 71 | } 72 | ``` 73 | 74 | ### Custom Configuration 75 | 76 | ```rust 77 | use atomic_cuckoo_filter::CuckooFilter; 78 | 79 | let filter = CuckooFilter::builder() 80 | .capacity(1_000_000) // Target capacity 81 | .fingerprint_size(16) // Bits per fingerprint (4, 8, 16, or 32) 82 | .bucket_size(4) // Fingerprints per bucket 83 | .max_evictions(500) // Maximum eviction chain length 84 | .build() 85 | .unwrap(); 86 | ``` 87 | 88 | ### Custom Hash Functions 89 | 90 | ```rust 91 | use ahash::AHasher; 92 | 93 | let filter = CuckooFilterBuilder::::default() 94 | .capacity(1024) 95 | .build() 96 | .unwrap(); 97 | ``` 98 | 99 | ### Concurrent Usage 100 | 101 | The filter is designed for high-concurrency scenarios: 102 | 103 | ```rust 104 | use atomic_cuckoo_filter::CuckooFilter; 105 | use std::sync::Arc; 106 | use std::thread; 107 | 108 | let filter = Arc::new(CuckooFilter::with_capacity(100_000)); 109 | 110 | // Spawn multiple threads for concurrent operations 111 | let mut handles = vec![]; 112 | 113 | // Writer threads 114 | for i in 0..4 { 115 | let filter_clone = Arc::clone(&filter); 116 | handles.push(thread::spawn(move || { 117 | for j in 0..1000 { 118 | let item = format!("item_{}_{}", i, j); 119 | filter_clone.insert(&item).unwrap(); 120 | } 121 | })); 122 | } 123 | 124 | // Reader threads 125 | for i in 0..4 { 126 | let filter_clone = Arc::clone(&filter); 127 | handles.push(thread::spawn(move || { 128 | for j in 0..1000 { 129 | let item = format!("item_{}_{}", i, j); 130 | while !filter_clone.contains(&item) {}; 131 | } 132 | })); 133 | } 134 | 135 | // Wait for all threads to complete 136 | for handle in handles { 137 | handle.join().unwrap(); 138 | } 139 | 140 | println!("Final filter size: {}", filter.len()); 141 | ``` 142 | 143 | ## Configuration Options 144 | 145 | | Parameter | Description | Valid Values | Default | 146 | |-----------|-------------|--------------|---------| 147 | | `capacity` | Target number of items | Any positive integer | 1,048,576 | 148 | | `fingerprint_size` | Bits per fingerprint | 4, 8, 16, or 32 | 16 | 149 | | `bucket_size` | Fingerprints per bucket | Any positive integer | 4 | 150 | | `max_evictions` | Max eviction chain length | Any integer ≥ 0 | 500 | 151 | 152 | ### Choosing Parameters 153 | 154 | **Fingerprint Size**: Larger fingerprints = fewer false positives but more memory usage 155 | 156 | **Bucket Size**: Larger buckets = Faster inserts (fewer evictions), but slower lookups, and slightly higher FPR 157 | 158 | **Max Evictions**: 159 | - 0 = No evictions (faster but may fail to insert occasionally) 160 | - Higher values = Better space utilization but slower inserts when load factor is high 161 | 162 | ## Concurrency Model 163 | 164 | All operations use atomic compare-exchange loops instead of traditional locks, with optimistic 165 | concurrency control for read operations. The only exception is when inserting with evictions, 166 | where an atomic-based lock is used to ensure consistency. 
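In practice the read path is a version-validated retry loop: snapshot a version counter, perform the lock-free read, and retry only if an exclusive multi-step update happened in between. The sketch below illustrates that general pattern with plain standard-library atomics; it is a simplified stand-in, not the filter's actual internals.

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

// Illustrative pattern only: a version-validated "optimistic" read.
// Writers are assumed to bump `version` after any multi-step update of `data`.
fn optimistic_read(version: &AtomicUsize, data: &AtomicUsize) -> usize {
    loop {
        let before = version.load(Ordering::Acquire); // snapshot the version
        let value = data.load(Ordering::Acquire);     // lock-free read of the data
        if version.load(Ordering::Acquire) == before {
            return value; // no conflicting update observed; the read is consistent
        }
        // the version changed mid-read; retry with a fresh snapshot
    }
}

let version = AtomicUsize::new(0);
let data = AtomicUsize::new(7);
assert_eq!(optimistic_read(&version, &data), 7);
```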
167 | 168 | ## Error Handling 169 | 170 | The main error type is `Error::NotEnoughSpace`, returned when the filter cannot accommodate more items: 171 | 172 | ```rust 173 | use atomic_cuckoo_filter::{CuckooFilter, Error}; 174 | 175 | let small_filter = CuckooFilter::builder() 176 | .capacity(10) 177 | .max_evictions(0) // Disable evictions 178 | .build() 179 | .unwrap(); 180 | 181 | // Fill the filter 182 | for i in 0..20 { 183 | match small_filter.insert(&i) { 184 | Ok(()) => println!("Inserted {}", i), 185 | Err(Error::NotEnoughSpace) => { 186 | println!("Filter is full at {} items", small_filter.len()); 187 | break; 188 | } 189 | } 190 | } 191 | ``` 192 | 193 | ## Testing 194 | 195 | Run the test suite: 196 | 197 | ```bash 198 | # Unit tests 199 | cargo test 200 | 201 | # Benchmarks 202 | cargo bench 203 | ``` 204 | 205 | ## Benchmarks 206 | 207 | - Environment: rustc 1.90.0-nightly (ace633090 2025-07-23), Apple M4 Pro 208 | - Command: `cargo +nightly bench -- --nocapture` 209 | 210 | Basic (single-threaded): 211 | 212 | ``` 213 | contains_false ~ 39.36 ns/iter 214 | contains_true ~ 24.60 ns/iter 215 | contains_with_max_evictions_0 ~ 20.37 ns/iter 216 | insert_and_remove ~ 111.66 ns/iter 217 | insert_and_remove_with_max_evictions_0 ~ 68.10 ns/iter 218 | insert_into_full_filter ~ 27.20 µs/iter 219 | insert_unique ~ 25.10 ns/iter 220 | ``` 221 | 222 | Concurrent (multi-threaded): 223 | 224 | ``` 225 | concurrent_contains ~ 36.93 ns/iter 226 | concurrent_contains_under_write_contention ~ 138.05 ns/iter 227 | ``` 228 | 229 | Comparison suite: 230 | 231 | This suite uses the reference [cuckoofilter](https://crates.io/crates/cuckoofilter) crate (dev-dependency `cuckoofilter = "0.5"`) as the baseline for comparison. 232 | 233 | ``` 234 | concurrent_contains ~ 3.34 µs/iter 235 | concurrent_contains_under_write_contention ~ 2.49 µs/iter 236 | contains_false ~ 14.31 ns/iter 237 | contains_true ~ 26.78 ns/iter 238 | insert_and_remove ~ 83.70 ns/iter 239 | insert_into_full_filter ~ 16.63 µs/iter 240 | insert_unique ~ 27.15 ns/iter 241 | ``` 242 | 243 | ## Safety and Guarantees 244 | 245 | - **Thread Safety**: All operations are thread-safe and can be called concurrently 246 | - **Memory Safety**: No unsafe code in the public API (uses `parking_lot_core` internally) 247 | 248 | ## License 249 | 250 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 251 | -------------------------------------------------------------------------------- /benches/basic.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | use atomic_cuckoo_filter::CuckooFilter; 6 | use test::Bencher; 7 | 8 | /// Benchmarks basic single-threaded insert and remove performance of the atomic 9 | /// cuckoo filter. This provides baseline performance metrics for the lock-free 10 | /// implementation without any concurrent access. 11 | /// 12 | /// Setup: 131k capacity filter with 8-bit fingerprints 13 | /// Test: Continuous insert/remove cycle with a sliding window of 100k items 14 | #[bench] 15 | fn insert_and_remove(b: &mut Bencher) { 16 | let filter = CuckooFilter::builder() 17 | .capacity(131072) 18 | .fingerprint_size(8) 19 | .build() 20 | .unwrap(); 21 | let mut i = 0; 22 | b.iter(|| { 23 | i += 1; 24 | let _ = filter.insert(&i); 25 | filter.remove(&(i - 100000)); // Remove item from 100k iterations ago 26 | }); 27 | } 28 | 29 | /// Benchmarks single-threaded insert_unique performance. 
insert_unique provides 30 | /// atomic test-and-insert semantics, ensuring items are only inserted if they 31 | /// don't already exist in the filter. 32 | /// 33 | /// Setup: 131k capacity filter with 8-bit fingerprints, initially empty 34 | /// Test: Continuous insert_unique operations with incrementing u16 values 35 | #[bench] 36 | fn insert_unique(b: &mut Bencher) { 37 | let filter = CuckooFilter::builder() 38 | .capacity(131072) 39 | .fingerprint_size(8) 40 | .build() 41 | .unwrap(); 42 | let mut i: u16 = 0; 43 | b.iter(|| { 44 | i += 1; 45 | let _ = filter.insert_unique(&i); 46 | }); 47 | } 48 | 49 | /// Benchmarks insert/remove performance when the filter is configured with 50 | /// zero evictions allowed. In this case, the filter can do all operations 51 | /// atomically without any locks. 52 | /// 53 | /// Setup: 131k capacity filter with max_evictions=0 and 8-bit fingerprints 54 | /// Test: Insert/remove cycle with sliding window, no eviction attempts allowed 55 | #[bench] 56 | fn insert_and_remove_with_max_evictions_0(b: &mut Bencher) { 57 | let filter = CuckooFilter::builder() 58 | .capacity(131072) 59 | .max_evictions(0) // No evictions allowed - faster failure on collisions 60 | .fingerprint_size(8) 61 | .build() 62 | .unwrap(); 63 | let mut i = 0; 64 | b.iter(|| { 65 | i += 1; 66 | let _ = filter.insert(&i); 67 | filter.remove(&(i - 100000)) // Remove item from 100k iterations ago 68 | }); 69 | } 70 | 71 | /// Benchmarks insert performance as the filter becomes increasingly full. 72 | /// This tests how performance degrades as the load factor increases and 73 | /// hash collisions become more frequent, requiring more eviction attempts. 74 | /// 75 | /// Setup: 131k capacity filter with 8-bit fingerprints, initially empty 76 | /// Test: Continuous insertions without any removes until filter reaches capacity 77 | #[bench] 78 | fn insert_into_full_filter(b: &mut Bencher) { 79 | let filter = CuckooFilter::builder() 80 | .capacity(131072) 81 | .fingerprint_size(8) 82 | .build() 83 | .unwrap(); 84 | let mut i = 0; 85 | b.iter(|| { 86 | i += 1; 87 | let _ = filter.insert(&i); 88 | }); 89 | } 90 | 91 | /// Benchmarks contains() performance when querying for items that exist in the filter. 92 | /// This tests positive lookup performance with a fully-populated filter, measuring 93 | /// the cost of successful hash table lookups. 94 | /// 95 | /// Setup: 131k capacity filter pre-populated with all u16 values (0-65535) 96 | /// Test: Cycling through contains() calls for values that definitely exist 97 | #[bench] 98 | fn contains_true(b: &mut Bencher) { 99 | let filter = CuckooFilter::builder() 100 | .capacity(131072) 101 | .fingerprint_size(8) 102 | .build() 103 | .unwrap(); 104 | // Pre-populate with all possible u16 values 105 | for i in 0..=65535u16 { 106 | filter.insert(&i).unwrap(); 107 | } 108 | let mut i: u16 = 0; 109 | b.iter(|| { 110 | i += 1; 111 | filter.contains(&i); 112 | }); 113 | } 114 | 115 | /// Benchmarks contains() performance when querying for items that don't exist. 116 | /// This tests negative lookup performance with an empty filter, measuring 117 | /// the cost of failed hash table lookups. 
118 | /// 119 | /// Setup: 131k capacity filter with 8-bit fingerprints, completely empty 120 | /// Test: Continuous contains() calls for items that definitely don't exist 121 | #[bench] 122 | fn contains_false(b: &mut Bencher) { 123 | let filter = CuckooFilter::builder() 124 | .capacity(131072) 125 | .fingerprint_size(8) 126 | .build() 127 | .unwrap(); 128 | let mut i: u16 = 0; 129 | b.iter(|| { 130 | i += 1; 131 | filter.contains(&i); 132 | }); 133 | } 134 | 135 | /// Benchmarks contains() performance with a filter configured for zero evictions. 136 | /// In this case, the filter can do all operations atomically without any optimistic concurrency control. 137 | /// 138 | /// Setup: 131k capacity filter with max_evictions=0, attempt to insert all u16 values 139 | /// Test: Contains() calls for values that may or may not exist (depending on insertion success) 140 | #[bench] 141 | fn contains_with_max_evictions_0(b: &mut Bencher) { 142 | let filter = CuckooFilter::builder() 143 | .capacity(131072) 144 | .max_evictions(0) // No evictions - some insertions may fail due to collisions 145 | .build() 146 | .unwrap(); 147 | // Attempt to insert all u16 values (some may fail due to collisions) 148 | for i in 0..=65535u16 { 149 | let _ = filter.insert(&i); 150 | } 151 | let mut i: u16 = 0; 152 | b.iter(|| { 153 | i += 1; 154 | filter.contains(&i); 155 | }); 156 | } 157 | -------------------------------------------------------------------------------- /benches/compare.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | use cuckoofilter::CuckooFilter; 6 | use std::hash::DefaultHasher; 7 | use std::sync::RwLock; 8 | use std::sync::atomic::AtomicBool; 9 | use std::sync::{Arc, atomic::Ordering}; 10 | use std::thread; 11 | use test::Bencher; 12 | 13 | /// Benchmarks basic single-threaded insert and remove performance using the 14 | /// reference cuckoofilter implementation. This provides a baseline for 15 | /// comparing against the atomic implementation. 16 | /// 17 | /// Setup: 131k capacity filter using DefaultHasher 18 | /// Test: Continuous insert/remove cycle with a sliding window of 100k items 19 | #[bench] 20 | fn insert_and_remove(b: &mut Bencher) { 21 | let mut filter = CuckooFilter::::with_capacity(131072); 22 | let mut i = 0; 23 | b.iter(|| { 24 | i += 1; 25 | let _ = filter.add(&i); 26 | filter.delete(&(i - 100000)); 27 | }); 28 | } 29 | 30 | /// Benchmarks single-threaded test_and_add performance (insert_unique equivalent). 31 | /// test_and_add ensures an item is only inserted if it doesn't already exist. 32 | /// 33 | /// Setup: 131k capacity filter, initially empty 34 | /// Test: Continuous test_and_add operations with incrementing u16 values 35 | #[bench] 36 | fn insert_unique(b: &mut Bencher) { 37 | let mut filter = CuckooFilter::::with_capacity(131072); 38 | let mut i: u16 = 0; 39 | b.iter(|| { 40 | i += 1; 41 | let _ = filter.test_and_add(&i); 42 | }); 43 | } 44 | 45 | /// Benchmarks insert performance when the filter becomes increasingly full. 46 | /// This tests how performance degrades as the filter reaches capacity and 47 | /// hash collisions become more frequent. 
48 | /// 49 | /// Setup: 131k capacity filter, initially empty 50 | /// Test: Continuous insertions until filter is full (no removes) 51 | #[bench] 52 | fn insert_into_full_filter(b: &mut Bencher) { 53 | let mut filter = CuckooFilter::::with_capacity(131072); 54 | let mut i = 0; 55 | b.iter(|| { 56 | i += 1; 57 | let _ = filter.add(&i); 58 | }); 59 | } 60 | 61 | /// Benchmarks contains() performance when querying for items that exist in the filter. 62 | /// This tests positive lookup performance with a fully-populated filter. 63 | /// 64 | /// Setup: 131k capacity filter pre-populated with all u16 values (0-65535) 65 | /// Test: Cycling through contains() calls for values that definitely exist 66 | #[bench] 67 | fn contains_true(b: &mut Bencher) { 68 | let mut filter = CuckooFilter::::with_capacity(131072); 69 | // Pre-populate with all possible u16 values 70 | for i in 0..=65535u16 { 71 | filter.add(&i).unwrap(); 72 | } 73 | let mut i: u16 = 0; 74 | b.iter(|| { 75 | i += 1; 76 | filter.contains(&i); 77 | }); 78 | } 79 | 80 | /// Benchmarks contains() performance when querying for items that don't exist. 81 | /// This tests negative lookup performance with an empty filter. 82 | /// 83 | /// Setup: 131k capacity filter, completely empty 84 | /// Test: Continuous contains() calls for items that definitely don't exist 85 | #[bench] 86 | fn contains_false(b: &mut Bencher) { 87 | let filter = CuckooFilter::::with_capacity(131072); 88 | let mut i: u16 = 0; 89 | b.iter(|| { 90 | i += 1; 91 | filter.contains(&i); 92 | }); 93 | } 94 | 95 | /// Benchmarks concurrent read performance using RwLock-protected filter. 96 | /// This tests read scalability compared to the lock-free atomic implementation. 97 | /// 98 | /// Setup: 131k capacity filter with 100k pre-inserted items, protected by RwLock 99 | /// Scenario: 10 background threads doing continuous reads while main thread benchmarks reads 100 | /// Note: Uses read locks for all contains() operations 101 | #[bench] 102 | fn concurrent_contains(b: &mut Bencher) { 103 | let filter = Arc::new(RwLock::new(CuckooFilter::::with_capacity( 104 | 131072, 105 | ))); 106 | let stop_flag = Arc::new(AtomicBool::new(false)); 107 | let mut handles = vec![]; 108 | 109 | // Pre-populate with 100k items (even numbers) using write lock 110 | { 111 | let f = filter.clone(); 112 | let mut f = f.write().unwrap(); 113 | for i in 0..100000 { 114 | f.add(&(i * 2)).unwrap(); 115 | } 116 | } 117 | 118 | // Start 10 background threads doing continuous contains() with read locks 119 | for _ in 0..10 { 120 | let f = filter.clone(); 121 | let stop = stop_flag.clone(); 122 | handles.push(thread::spawn(move || { 123 | let mut i = 0; 124 | while !stop.load(Ordering::Relaxed) { 125 | if i == 200000 { 126 | i = 0; 127 | } else { 128 | i += 1; 129 | } 130 | f.read().unwrap().contains(&i); 131 | } 132 | })); 133 | } 134 | 135 | // Benchmark contains() performance using read locks 136 | let mut i = 0; 137 | b.iter(|| { 138 | if i == 200000 { 139 | i = 0; 140 | } else { 141 | i += 1; 142 | } 143 | filter.read().unwrap().contains(&i); 144 | }); 145 | 146 | // Clean up background threads 147 | stop_flag.store(true, Ordering::Relaxed); 148 | for h in handles { 149 | h.join().unwrap(); 150 | } 151 | } 152 | 153 | /// Benchmarks read performance while background threads are writing using RwLock. 154 | /// This tests read/write contention compared to the lock-free atomic implementation. 
155 | /// 156 | /// Setup: 131k capacity filter protected by RwLock 157 | /// Scenario: 10 background threads doing write operations (insert/remove) 158 | /// while main thread benchmarks read performance 159 | #[bench] 160 | fn concurrent_contains_under_write_contention(b: &mut Bencher) { 161 | let filter = Arc::new(RwLock::new(CuckooFilter::::with_capacity( 162 | 131072, 163 | ))); 164 | let stop_flag = Arc::new(AtomicBool::new(false)); 165 | let mut handles = vec![]; 166 | 167 | // Start 10 background threads doing write operations (insert/remove) 168 | for c in 0..10 { 169 | let f = filter.clone(); 170 | let stop = stop_flag.clone(); 171 | handles.push(thread::spawn(move || { 172 | let mut i: u16 = c; 173 | while !stop.load(Ordering::Relaxed) { 174 | i += 10; 175 | let _ = f.write().unwrap().add(&i); 176 | f.write().unwrap().delete(&(i - 10000)); 177 | } 178 | })); 179 | } 180 | 181 | // Benchmark read performance under write contention 182 | let mut i: u16 = 0; 183 | b.iter(|| { 184 | i += 1; 185 | filter.read().unwrap().contains(&i); 186 | }); 187 | 188 | // Clean up background threads 189 | stop_flag.store(true, Ordering::Relaxed); 190 | for h in handles { 191 | h.join().unwrap(); 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /benches/concurrent.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | use atomic_cuckoo_filter::CuckooFilter; 6 | use std::sync::atomic::AtomicBool; 7 | use std::sync::{Arc, atomic::Ordering}; 8 | use std::thread; 9 | use test::Bencher; 10 | 11 | /// Benchmarks concurrent read performance (contains() calls) while multiple background 12 | /// threads are also performing reads. This tests the filter's ability to handle 13 | /// high-concurrency read workloads without contention. 14 | /// 15 | /// Setup: 131k capacity filter with 100k pre-inserted items 16 | /// Scenario: 10 background threads continuously calling contains() while main thread benchmarks contains() 17 | #[bench] 18 | fn concurrent_contains(b: &mut Bencher) { 19 | let filter = Arc::new( 20 | CuckooFilter::builder() 21 | .capacity(131072) 22 | .fingerprint_size(8) 23 | .build() 24 | .unwrap(), 25 | ); 26 | let stop_flag = Arc::new(AtomicBool::new(false)); 27 | let mut handles = vec![]; 28 | 29 | // Pre-populate with 100k items (even numbers) 30 | for i in 0..100000 { 31 | filter.insert(&(i * 2)).unwrap(); 32 | } 33 | 34 | // Start 10 background threads doing continuous contains() calls 35 | for _ in 0..10 { 36 | let f = filter.clone(); 37 | let stop = stop_flag.clone(); 38 | handles.push(thread::spawn(move || { 39 | let mut i = 0; 40 | while !stop.load(Ordering::Relaxed) { 41 | if i == 200000 { 42 | i = 0; 43 | } else { 44 | i += 1; 45 | } 46 | f.contains(&i); 47 | } 48 | })) 49 | } 50 | 51 | // Benchmark contains() calls in main thread 52 | let mut i = 0; 53 | b.iter(|| { 54 | if i == 200000 { 55 | i = 0; 56 | } else { 57 | i += 1; 58 | } 59 | filter.contains(&i); 60 | }); 61 | 62 | // Clean up background threads 63 | stop_flag.store(true, Ordering::Relaxed); 64 | for h in handles { 65 | h.join().unwrap(); 66 | } 67 | } 68 | 69 | /// Benchmarks contains() performance while background threads are actively 70 | /// inserting and removing items. This tests read performance under write contention. 
71 | /// 72 | /// Setup: 131k capacity filter, initially empty 73 | /// Scenario: 10 background threads inserting new items and removing old ones, 74 | /// while main thread benchmarks contains() performance 75 | #[bench] 76 | fn concurrent_contains_under_write_contention(b: &mut Bencher) { 77 | let filter = Arc::new( 78 | CuckooFilter::builder() 79 | .capacity(131072) 80 | .fingerprint_size(8) 81 | .build() 82 | .unwrap(), 83 | ); 84 | let stop_flag = Arc::new(AtomicBool::new(false)); 85 | let mut handles = vec![]; 86 | 87 | // Start 10 background threads doing insert/remove operations 88 | for c in 0..10 { 89 | let f = filter.clone(); 90 | let stop = stop_flag.clone(); 91 | handles.push(thread::spawn(move || { 92 | let mut i: u16 = c; 93 | while !stop.load(Ordering::Relaxed) { 94 | i += 10; 95 | let _ = f.insert(&i); 96 | f.remove(&(i - 10000)); 97 | } 98 | })) 99 | } 100 | 101 | // Benchmark contains() calls while background threads are modifying the filter 102 | let mut i: u16 = 0; 103 | b.iter(|| { 104 | i += 1; 105 | filter.contains(&i); 106 | }); 107 | 108 | // Clean up background threads 109 | stop_flag.store(true, Ordering::Relaxed); 110 | for h in handles { 111 | h.join().unwrap(); 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Lock-Free Concurrent Cuckoo Filter Implementation 2 | // A high-performance probabilistic data structure for efficient set membership testing 3 | // with better space efficiency than Bloom filters, support for deletions, and 4 | // fully concurrent operations using atomic operations and lock-free algorithms. 5 | 6 | use derive_builder::Builder; 7 | use rand::Rng; 8 | use std::collections::HashSet; 9 | use std::collections::hash_map::DefaultHasher; 10 | use std::hash::{Hash, Hasher}; 11 | use std::hint; 12 | use std::marker::PhantomData; 13 | use std::sync::atomic::{AtomicUsize, Ordering}; 14 | 15 | /// Maximum number of spin-loop iterations before parking a thread. 16 | /// This balances CPU usage vs. latency - spinning avoids kernel calls for 17 | /// short waits, but we park threads to avoid wasting CPU on long waits. 18 | const MAX_SPIN: usize = 100; 19 | 20 | /// Error type for Cuckoo Filter insert operation 21 | #[derive(Debug, thiserror::Error, PartialEq)] 22 | pub enum Error { 23 | /// Returned when the filter is full and cannot accommodate more elements 24 | #[error("Not enough space to store this item.")] 25 | NotEnoughSpace, 26 | } 27 | 28 | /// Types of locks that can be acquired on the filter 29 | #[derive(PartialEq)] 30 | pub enum LockKind { 31 | /// Optimistic version tracking - does not block other operations but captures 32 | /// a version number to detect if data changed during the operation 33 | Optimistic, 34 | /// Exclusive among writers only - prevents other writers but allows concurrent readers 35 | WriterExclusive, 36 | /// Fully exclusive access - blocks all other operations (used only during evictions) 37 | FullyExclusive, 38 | } 39 | 40 | /// A sophisticated lock implementation designed for concurrent cuckoo filter operations. 41 | /// 42 | /// This is NOT a traditional mutex but an atomic-based synchronization mechanism that 43 | /// enables three distinct concurrency modes: 44 | /// 45 | /// 1. **Optimistic locks**: Allow maximum concurrency - multiple readers and writers 46 | /// can proceed simultaneously. Used for optimistic reads that detect data races. 
47 | /// 48 | /// 2. **WriterExclusive locks**: Mutual exclusion among writers only - prevents 49 | /// concurrent modifications but allows concurrent reads. 50 | /// 51 | /// 3. **FullyExclusive locks**: Complete mutual exclusion - blocks all operations. 52 | /// Only used during complex eviction chains to ensure consistency. 53 | /// 54 | /// ## Version Encoding Scheme 55 | /// The atomic usize encodes both lock state and version information: 56 | /// - **Bits 0-1**: Lock kind (0=Optimistic, 1=WriterExclusive, 2=FullyExclusive) 57 | /// - **Bits 2-63**: Version counter (incremented on FullyExclusive release) 58 | /// 59 | /// This allows optimistic readers to detect when their read might be stale by 60 | /// comparing version numbers before and after the operation. 61 | pub struct Lock<'a> { 62 | /// Reference to the shared atomic value encoding lock state and version 63 | atomic: &'a AtomicUsize, 64 | /// Snapshot of the atomic value when this lock was acquired. 65 | /// Used for optimistic concurrency control and version tracking. 66 | /// The lower 2 bits indicate lock type, upper bits track version changes. 67 | version: usize, 68 | /// The type of lock held by this instance 69 | kind: LockKind, 70 | /// Counter for spin attempts before transitioning to thread parking. 71 | /// Implements adaptive spinning to balance latency vs CPU usage. 72 | retry: usize, 73 | } 74 | 75 | impl<'a> Lock<'a> { 76 | /// Create a new lock of the specified kind 77 | /// Blocks until the lock can be acquired 78 | fn new(atomic: &'a AtomicUsize, kind: LockKind) -> Self { 79 | let mut lock = Self { 80 | atomic, 81 | version: 0, 82 | kind, 83 | retry: 0, 84 | }; 85 | match lock.kind { 86 | LockKind::Optimistic => loop { 87 | // For optimistic locks, we can proceed as long as there's no FullyExclusive lock 88 | lock.version = atomic.load(Ordering::Relaxed); 89 | if Self::kind(lock.version) != LockKind::FullyExclusive { 90 | return lock; 91 | } 92 | lock.spin_or_park() 93 | }, 94 | _ => loop { 95 | // For writer exclusive and fully exclusive locks, we need to ensure no exclusive lock is acquired 96 | lock.version = atomic.load(Ordering::Relaxed); 97 | if Self::kind(lock.version) != LockKind::Optimistic { 98 | lock.spin_or_park(); 99 | continue; 100 | } 101 | // Update lower bits of the version: 1 for WriterExclusive, 2 for FullyExclusive 102 | let new_version = if lock.kind == LockKind::WriterExclusive { 103 | lock.version + 1 104 | } else { 105 | lock.version + 2 106 | }; 107 | if atomic 108 | .compare_exchange_weak( 109 | lock.version, 110 | new_version, 111 | Ordering::Release, 112 | Ordering::Relaxed, 113 | ) 114 | .is_ok() 115 | { 116 | return lock; 117 | } 118 | }, 119 | } 120 | } 121 | 122 | /// Upgrade a WriterExclusive lock to a FullyExclusive lock 123 | /// This assumes the current thread holds the writer exclusive lock. 
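    ///
    /// As an illustration (numbers are hypothetical): if the WriterExclusive lock was
    /// acquired when the atomic held `0b1000` (version counter 2, kind Optimistic),
    /// acquisition stored `0b1001` (kind WriterExclusive) while `self.version` kept the
    /// snapshot `0b1000`. `upgrade` then stores `self.version + 2 = 0b1010`
    /// (kind FullyExclusive), and the eventual drop stores `self.version + 4 = 0b1100`,
    /// bumping the version counter to 3 and invalidating any optimistic readers that
    /// started before the upgrade.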
124 | fn upgrade(&mut self) { 125 | self.atomic.store(self.version + 2, Ordering::Release); 126 | self.kind = LockKind::FullyExclusive; 127 | } 128 | 129 | /// Check if the lock is outdated (version changed) or a FullyExclusive lock is acquired 130 | /// Used for optimistic concurrency control 131 | fn is_outdated(&self) -> bool { 132 | let version = self.atomic.load(Ordering::Acquire); 133 | Self::kind(version) == LockKind::FullyExclusive || version >> 2 != self.version >> 2 134 | } 135 | 136 | /// Get the key for parking a thread 137 | /// Different keys are used for optimistic and exclusive locks 138 | fn park_key(&self) -> usize { 139 | let key = self.atomic.as_ptr() as usize; 140 | match self.kind { 141 | LockKind::Optimistic => key, 142 | _ => key + 1, 143 | } 144 | } 145 | 146 | /// Spin or park the thread when waiting for a lock 147 | fn spin_or_park(&mut self) { 148 | if self.retry > MAX_SPIN { 149 | // After MAX_SPIN attempts, park the thread 150 | self.retry = 0; 151 | unsafe { 152 | parking_lot_core::park( 153 | self.park_key(), 154 | || self.atomic.load(Ordering::Acquire) == self.version, 155 | || (), 156 | |_, _| (), 157 | parking_lot_core::DEFAULT_PARK_TOKEN, 158 | None, 159 | ); 160 | } 161 | } else { 162 | // Otherwise, spin 163 | self.retry += 1; 164 | hint::spin_loop(); 165 | } 166 | } 167 | 168 | /// Extract the lock kind from the lower 2 bits of a version value 169 | fn kind(version: usize) -> LockKind { 170 | match version & 0b11 { 171 | 0 => LockKind::Optimistic, 172 | 1 => LockKind::WriterExclusive, 173 | 2 => LockKind::FullyExclusive, 174 | _ => panic!("Invalid Lock"), 175 | } 176 | } 177 | } 178 | 179 | impl Drop for Lock<'_> { 180 | /// Release the lock when it goes out of scope 181 | fn drop(&mut self) { 182 | match self.kind { 183 | LockKind::Optimistic => return, // No need to do anything for Optimistic locks 184 | LockKind::WriterExclusive => { 185 | // For WriterExclusive locks, release the lock without incrementing the version 186 | self.atomic.store(self.version, Ordering::Release); 187 | } 188 | LockKind::FullyExclusive => { 189 | // For FullyExclusive locks, increment the version to invalidate Optimistic locks 190 | self.atomic.store(self.version + 4, Ordering::Release); 191 | } 192 | } 193 | 194 | // Unpark waiting threads 195 | let optimistic_key = self.atomic.as_ptr() as usize; 196 | let exclusive_key = optimistic_key + 1; 197 | unsafe { 198 | // Unpark all waiting optimistic locks 199 | parking_lot_core::unpark_all(optimistic_key, parking_lot_core::DEFAULT_UNPARK_TOKEN); 200 | // Unpark one waiting exclusive lock (either WriterExclusive or FullyExclusive) 201 | parking_lot_core::unpark_one(exclusive_key, |_| parking_lot_core::DEFAULT_UNPARK_TOKEN); 202 | } 203 | } 204 | } 205 | 206 | /// A highly concurrent lock-free probabilistic data structure for set membership testing. 207 | /// 208 | /// ## What Makes It "Cuckoo" 209 | /// 210 | /// Named after the cuckoo bird's behavior of displacing other birds' eggs, this filter 211 | /// uses **cuckoo hashing** where each item can be stored in one of two possible locations. 212 | /// When both locations are full, existing items are "evicted" (like cuckoo eggs) and 213 | /// relocated to their alternate position, creating eviction chains. 214 | /// 215 | /// ## Algorithm Overview 216 | /// 217 | /// 1. **Fingerprints**: Items are reduced to small fingerprints (4-32 bits) instead of 218 | /// storing full keys, providing excellent space efficiency. 219 | /// 220 | /// 2. 
**Dual Hashing**: Each item has two possible bucket locations computed from its hash. 221 | /// This provides better space efficiency and flexibility when inserting and removing items. 222 | /// 223 | /// 3. **Eviction Chains**: When both buckets are full, a random item is evicted from one 224 | /// bucket and moved to its alternate location, potentially triggering a chain of evictions. 225 | /// 226 | /// 4. **Lock-Free Concurrency**: All operations use atomic compare-exchange loops instead 227 | /// of traditional locks, with optimistic concurrency control for read operations. 228 | /// The only exception is when inserting with evictions, where a FullyExclusive lock is used 229 | /// to ensure consistency. 230 | /// 231 | /// ## Key Advantages Over Bloom Filters 232 | /// 233 | /// - **Deletions supported**: Items can be removed without false negatives 234 | /// - **Better space efficiency**: ~20-30% less memory for same false positive rate 235 | /// - **Bounded lookup time**: Always at most 2 bucket checks, never more 236 | /// - **High concurrency**: Lock-free design enables excellent parallel performance 237 | /// 238 | /// ## Concurrency Model 239 | /// 240 | /// - **Reads**: Optimistic, can proceed concurrently with most operations 241 | /// - **Simple writes**: Use atomic compare-exchange loops without blocking other operations 242 | /// - **WriterExclusive locks**: Used for removing items, and for unique insertions 243 | /// - **Complex evictions**: Use FullyExclusive locks to ensure consistency 244 | /// 245 | /// ## Time Complexity 246 | /// 247 | /// - **Lookup**: O(1) 248 | /// - **Deletion**: O(1) 249 | /// - **Insertion**: Amortized O(1) due to eviction chains, but the number of evictions is bounded 250 | #[derive(Debug, Builder)] 251 | #[builder( 252 | pattern = "owned", 253 | build_fn(private, name = "base_build", validate = "Self::validate") 254 | )] 255 | pub struct CuckooFilter 256 | where 257 | H: Hasher + Default, 258 | { 259 | // Configuration parameters 260 | /// Maximum number of elements the filter can store 261 | #[builder(default = "1048576")] 262 | capacity: usize, 263 | 264 | /// Size of fingerprints in bits (must be 4, 8, 16, or 32) 265 | #[builder(default = "16")] 266 | fingerprint_size: usize, 267 | 268 | /// Number of fingerprints per bucket 269 | #[builder(default = "4")] 270 | bucket_size: usize, 271 | 272 | /// Maximum number of evictions to try before giving up 273 | #[builder(default = "500")] 274 | max_evictions: usize, 275 | 276 | // Internal values - automatically derived from the configuration 277 | /// Number of fingerprints that can be stored in a single atomic value 278 | #[builder(setter(skip))] 279 | fingerprints_per_atomic: usize, 280 | 281 | /// Number of buckets in the filter (power of 2) 282 | #[builder(setter(skip))] 283 | num_buckets: usize, 284 | 285 | /// Bit mask for extracting fingerprints 286 | #[builder(setter(skip))] 287 | fingerprint_mask: usize, 288 | 289 | /// Storage for buckets, implemented as a vector of atomic values 290 | #[builder(setter(skip))] 291 | buckets: Vec, 292 | 293 | /// Atomic value used for locking 294 | #[builder(setter(skip))] 295 | lock: AtomicUsize, 296 | 297 | /// Counter for the number of elements in the filter 298 | #[builder(setter(skip))] 299 | counter: AtomicUsize, 300 | 301 | /// Phantom data for the hasher type 302 | #[builder(setter(skip))] 303 | _hasher: PhantomData, 304 | } 305 | 306 | impl CuckooFilter { 307 | /// Insert an item into the filter 308 | /// 309 | /// This operation first attempts 
a direct insertion without acquiring a lock. 310 | /// If that fails due to bucket collisions, it falls back to the eviction-based 311 | /// insertion algorithm which may require a write lock. 312 | /// 313 | /// Concurrent operations are safely handled through atomic operations. 314 | /// 315 | /// Returns Ok(()) if the item was inserted, or Error::NotEnoughSpace if the filter is full 316 | pub fn insert(&self, item: &T) -> Result<(), Error> { 317 | let (index, fingerprint) = self.index_and_fingerprint(item); 318 | self.try_insert(index, fingerprint).or_else(|error| { 319 | let lock = self.lock(LockKind::WriterExclusive).ok_or(error)?; 320 | self.insert_with_evictions(index, fingerprint, lock) 321 | }) 322 | } 323 | 324 | /// Check if an item is in the filter and insert it if is not present (atomically) 325 | /// 326 | /// This method combines lookup and insert into a single atomic operation, 327 | /// ensuring thread safety and consistency even with concurrent operations. 328 | /// 329 | /// Returns Ok(true) if the item was inserted, Ok(false) if it was already present, 330 | /// or Error::NotEnoughSpace if the filter is full 331 | pub fn insert_unique(&self, item: &T) -> Result { 332 | let (index, fingerprint) = self.index_and_fingerprint(item); 333 | if self.lookup_fingerprint(index, fingerprint).is_some() { 334 | return Ok(false); 335 | } 336 | let lock = Lock::new(&self.lock, LockKind::WriterExclusive); 337 | if self.lookup_fingerprint(index, fingerprint).is_some() { 338 | return Ok(false); 339 | } 340 | self.try_insert(index, fingerprint) 341 | .or_else(|error| { 342 | if self.max_evictions == 0 { 343 | return Err(error); 344 | } 345 | self.insert_with_evictions(index, fingerprint, lock) 346 | }) 347 | .map(|_| true) 348 | } 349 | 350 | /// Counts the number of occurrences of an item in the filter. 351 | /// 352 | /// # Notes 353 | /// - This is not a counting filter; it simply counts matching fingerprints in both candidate buckets. 354 | /// - Useful for detecting duplicates or hash collisions, not for precise multiset membership. 355 | /// - The count is limited by the filter's structure: at most `bucket_size * 2` per item. 356 | /// - This method may count false positives due to hash collisions. 357 | pub fn count(&self, item: &T) -> usize { 358 | let (index, fingerprint) = self.index_and_fingerprint(item); 359 | let alt_index = self.alt_index(index, fingerprint); 360 | self.atomic_read( 361 | || { 362 | self.read_bucket(index, Ordering::Acquire) 363 | .filter(|&f| f == fingerprint) 364 | .count() 365 | + self 366 | .read_bucket(alt_index, Ordering::Acquire) 367 | .filter(|&f| f == fingerprint) 368 | .count() 369 | }, 370 | None, 371 | ) 372 | } 373 | 374 | /// Attempts to remove an item from the filter. 375 | /// 376 | /// Returns `true` if the item was successfully removed, or `false` if it was not found. 377 | /// 378 | /// Note: 379 | /// - An item should only be removed if it was previously added. Removing a non-existent 380 | /// item may inadvertently remove a different item due to hash collisions inherent to 381 | /// cuckoo filters. 
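    ///
    /// # Example
    ///
    /// A minimal usage sketch:
    ///
    /// ```
    /// use atomic_cuckoo_filter::CuckooFilter;
    ///
    /// let filter = CuckooFilter::with_capacity(1024);
    /// assert!(!filter.remove(&"missing"));   // nothing to remove from an empty filter
    ///
    /// filter.insert(&"present").unwrap();
    /// assert!(filter.remove(&"present"));    // removes the previously inserted item
    /// assert!(!filter.contains(&"present")); // and it is gone
    /// ```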
382 | pub fn remove(&self, item: &T) -> bool { 383 | let (index, fingerprint) = self.index_and_fingerprint(item); 384 | while let Some((index, sub_index)) = self.lookup_fingerprint(index, fingerprint) { 385 | let _lock = self.lock(LockKind::WriterExclusive); 386 | if self.update_bucket(index, sub_index, fingerprint, 0, Ordering::Release) { 387 | return true; 388 | } 389 | } 390 | false 391 | } 392 | 393 | /// Check if an item is in the filter 394 | /// 395 | /// Returns `true` if the item is possibly in the filter (may have false positives), 396 | /// `false` if it is definitely not in the filter 397 | pub fn contains(&self, item: &T) -> bool { 398 | let (index, fingerprint) = self.index_and_fingerprint(item); 399 | self.atomic_read( 400 | || self.lookup_fingerprint(index, fingerprint).is_some(), 401 | Some(true), 402 | ) 403 | } 404 | 405 | /// Get the number of elements in the filter 406 | pub fn len(&self) -> usize { 407 | self.counter.load(Ordering::Acquire) 408 | } 409 | 410 | /// Check if the filter is empty 411 | pub fn is_empty(&self) -> bool { 412 | self.len() == 0 413 | } 414 | 415 | /// Get the capacity of the filter 416 | pub fn capacity(&self) -> usize { 417 | self.capacity 418 | } 419 | 420 | /// Clear the filter, removing all elements 421 | pub fn clear(&self) { 422 | let _lock = self.lock(LockKind::WriterExclusive); 423 | for atomic in &self.buckets { 424 | let old_value = atomic.swap(0, Ordering::Release); 425 | let removed = (0..self.fingerprints_per_atomic) 426 | .filter(|i| (old_value >> (i * self.fingerprint_size)) & self.fingerprint_mask != 0) 427 | .count(); 428 | if removed > 0 { 429 | self.counter.fetch_sub(removed, Ordering::Release); 430 | } 431 | } 432 | } 433 | 434 | /// Compute the hash of an item 435 | /// Uses the generic hasher H for flexibility and performance 436 | fn hash(&self, data: &T) -> u64 { 437 | let mut hasher = ::default(); 438 | data.hash(&mut hasher); 439 | hasher.finish() 440 | } 441 | 442 | /// Compute the bucket index and fingerprint for an item. 443 | /// 444 | /// 1. **Hash the item**: Use the configured hasher to get a 64-bit hash 445 | /// 2. **Extract fingerprint**: Use multiplication + shift for high-quality 446 | /// distribution across the fingerprint space, then add 1 to avoid zero. 447 | /// 3. **Extract index**: Use bitwise AND with (num_buckets-1) since num_buckets 448 | /// is always a power of 2, providing perfect hash distribution. 
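    ///
    /// As a worked example (illustrative, assuming 16-bit fingerprints and 1024 buckets):
    /// a hash of `2^63` yields `fingerprint = ((2^63 * 0xFFFF) >> 64) + 1 = 32768` and
    /// `index = 2^63 & (1024 - 1) = 0`.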
449 | /// 450 | /// ## Why This Design 451 | /// 452 | /// - **Non-zero fingerprints**: Adding 1 ensures fingerprints are never 0, 453 | /// so 0 can represent empty slots without ambiguity 454 | /// - **Independent bits**: Index uses lower hash bits, fingerprint uses 455 | /// different bits via multiplication, avoiding correlation 456 | /// - **Uniform distribution**: Both index and fingerprint are uniformly 457 | /// distributed across their respective ranges 458 | /// 459 | /// Returns (index, fingerprint) where: 460 | /// - index is the primary bucket index (0 to num_buckets-1) 461 | /// - fingerprint is a compact hash of the item (1 to fingerprint_mask) 462 | fn index_and_fingerprint(&self, item: &T) -> (usize, usize) { 463 | let hash = self.hash(item); 464 | // Compute fingerprint using multiplication and shift for better distribution 465 | let fingerprint = ((hash as u128 * self.fingerprint_mask as u128) >> 64) + 1; 466 | // Compute index using modulo num_buckets (optimized with bitwise AND since num_buckets is a power of 2) 467 | let index = hash as usize & (self.num_buckets - 1); 468 | (index, fingerprint as usize) 469 | } 470 | 471 | /// Computes the alternative bucket index for a given fingerprint using cuckoo hashing. 472 | /// 473 | /// In cuckoo hashing, each item can reside in one of two possible buckets. This function 474 | /// deterministically computes the alternate bucket index from the current index and fingerprint. 475 | /// 476 | /// Properties: 477 | /// 1. Symmetry: `alt_index(alt_index(i, f), f) == i` for any index `i` and fingerprint `f`. 478 | /// 2. Distinctness: For any fingerprint, the two indices are always different. 479 | /// 3. Uniformity: The mapping distributes fingerprints evenly across all buckets. 480 | fn alt_index(&self, index: usize, fingerprint: usize) -> usize { 481 | index ^ (self.hash(&fingerprint) as usize & (self.num_buckets - 1)) 482 | } 483 | 484 | /// Look up a fingerprint at its primary or alternative index 485 | /// Returns `Some((index, sub_index))` if found, None otherwise 486 | fn lookup_fingerprint(&self, index: usize, fingerprint: usize) -> Option<(usize, usize)> { 487 | // First check the primary bucket 488 | self.read_bucket(index, Ordering::Acquire) 489 | .position(|fp| fp == fingerprint) 490 | .map(|sub_index| (index, sub_index)) 491 | .or_else(|| { 492 | // Then check the alternative bucket 493 | let alt_index = self.alt_index(index, fingerprint); 494 | self.read_bucket(alt_index, Ordering::Acquire) 495 | .position(|fp| fp == fingerprint) 496 | .map(|sub_index| (alt_index, sub_index)) 497 | }) 498 | } 499 | 500 | /// Try to insert a fingerprint at its primary or alternative index 501 | /// Returns `Ok(())` if successful, `Error::NotEnoughSpace` if both buckets are full 502 | fn try_insert(&self, index: usize, fingerprint: usize) -> Result<(), Error> { 503 | self.insert_at_index(index, fingerprint).or_else(|_| { 504 | let alt_index = self.alt_index(index, fingerprint); 505 | self.insert_at_index(alt_index, fingerprint) 506 | }) 507 | } 508 | 509 | /// Try to insert a fingerprint at a specific index 510 | /// Returns Ok(()) if successful, Err(Error::NotEnoughSpace) if the bucket is full 511 | fn insert_at_index(&self, index: usize, fingerprint: usize) -> Result<(), Error> { 512 | loop { 513 | let sub_index = self 514 | .read_bucket(index, Ordering::Relaxed) 515 | .position(|i| i == 0) 516 | .ok_or(Error::NotEnoughSpace)?; 517 | 518 | if self.update_bucket(index, sub_index, 0, fingerprint, Ordering::Release) { 519 | return 
Ok(()); 520 | } 521 | } 522 | } 523 | 524 | /// Insert a fingerprint using cuckoo eviction chains when both buckets are full. 525 | /// 526 | /// This method is invoked only as a fallback when direct insertion fails, preserving 527 | /// the optimistic, lock-free fast path for the common case. 528 | /// 529 | /// # Cuckoo Eviction Algorithm 530 | /// 531 | /// When both possible locations for an item are full: 532 | /// 1. **Randomly select** an existing item from one of the full buckets 533 | /// 2. **Evict** that item and insert our new item in its place 534 | /// 3. **Relocate** the evicted item to its alternate location 535 | /// 4. **Repeat** if the alternate location is also full (eviction chain) 536 | /// 5. **Succeed** when we find an empty slot, or **fail** after max_evictions 537 | /// 538 | /// # Implementation Details 539 | /// 540 | /// - **Eviction tracking**: Collects a sequence of planned evictions, which are 541 | /// atomically applied only if the chain succeeds, ensuring atomicity and consistency. 542 | /// - **Lock upgrading**: Starts with a `WriterExclusive` lock, upgrading to 543 | /// `FullyExclusive` only when actually applying the eviction chain, maximizing 544 | /// read concurrency during planning. 545 | /// - **Loop prevention**: Uses a map to track which sub-indices have been tried 546 | /// in each bucket, to ensure early detection of loops in eviction chains. 547 | fn insert_with_evictions( 548 | &self, 549 | mut index: usize, 550 | mut fingerprint: usize, 551 | mut lock: Lock, 552 | ) -> Result<(), Error> { 553 | let mut rng = rand::rng(); 554 | let mut insertions = Vec::with_capacity(self.max_evictions.min(32)); 555 | let mut used_slots = HashSet::with_capacity(self.max_evictions.min(32)); 556 | while insertions.len() <= self.max_evictions { 557 | // Choose a sub-index in this bucket whose global slot has not been used yet in the plan 558 | let base_slot = index * self.bucket_size; 559 | let mut sub_index = rng.random_range(0..self.bucket_size); 560 | if used_slots.contains(&(base_slot + sub_index)) { 561 | sub_index = (0..self.bucket_size) 562 | .find(|&i| !used_slots.contains(&(base_slot + i))) 563 | .ok_or(Error::NotEnoughSpace)?; 564 | } 565 | used_slots.insert(base_slot + sub_index); 566 | insertions.push((index, sub_index, fingerprint)); 567 | 568 | // Evict the fingerprint at the chosen sub-index 569 | fingerprint = self 570 | .read_bucket(index, Ordering::Relaxed) 571 | .nth(sub_index) 572 | .unwrap(); 573 | // Find the alternative index for the evicted fingerprint 574 | index = self.alt_index(index, fingerprint); 575 | 576 | if self.insert_at_index(index, fingerprint).is_ok() { 577 | // Successfully inserted the fingerprint, now apply all evictions 578 | lock.upgrade(); 579 | let mut evicted = fingerprint; 580 | while let Some((index, sub_index, fingerprint)) = insertions.pop() { 581 | self.update_bucket(index, sub_index, evicted, fingerprint, Ordering::Relaxed); 582 | evicted = fingerprint; 583 | } 584 | return Ok(()); 585 | } 586 | } 587 | // Reached the maximum number of evictions, give up 588 | Err(Error::NotEnoughSpace) 589 | } 590 | 591 | /// Atomically read all fingerprints from a bucket using lock-free bit manipulation. 
592 | /// 593 | /// ## Memory Layout Complexity 594 | /// 595 | /// Fingerprints are tightly packed in memory across multiple atomic usize values: 596 | /// - Each bucket contains `bucket_size` fingerprints 597 | /// - Each fingerprint is `fingerprint_size` bits 598 | /// - Multiple fingerprints are packed into each atomic usize 599 | /// - Buckets may span across multiple atomic values 600 | /// 601 | /// ## Algorithm Steps 602 | /// 603 | /// 1. Calculate which atomic values contain this bucket's data 604 | /// 2. Atomically load each relevant atomic value (using Acquire ordering) 605 | /// 3. Extract fingerprints using bit manipulation and masking 606 | /// 4. Handle boundary cases where buckets span multiple atomics 607 | /// 5. Skip any padding bits and return exactly `bucket_size` fingerprints 608 | /// 609 | /// This is completely lock-free - multiple threads can read concurrently, 610 | /// and reads can proceed even during writes (though they might see 611 | /// intermediate states that get resolved by retry logic). 612 | /// 613 | /// Returns an Iterator over the fingerprints in the bucket, (0 = empty slot). 614 | fn read_bucket(&self, index: usize, ordering: Ordering) -> impl Iterator { 615 | let fingerprint_index = index * self.bucket_size; 616 | let bit_index = fingerprint_index * self.fingerprint_size; 617 | let start_index = bit_index / usize::BITS as usize; 618 | let skip_bits = bit_index % usize::BITS as usize; 619 | let skip_fingerprints = skip_bits >> self.fingerprint_size.trailing_zeros(); 620 | // No need to calculate end_index; just iterate from start_index to the end of the bucket 621 | self.buckets[start_index..] 622 | .iter() 623 | .flat_map(move |atomic| { 624 | let atomic_value = atomic.load(ordering); 625 | (0..self.fingerprints_per_atomic).map(move |i| { 626 | (atomic_value 627 | >> (self.fingerprint_size * (self.fingerprints_per_atomic - i - 1))) 628 | & self.fingerprint_mask 629 | }) 630 | }) 631 | .skip(skip_fingerprints) 632 | .take(self.bucket_size) 633 | } 634 | 635 | /// Atomically update a single fingerprint using lock-free compare-exchange. 636 | /// 637 | /// ## Lock-Free Update Algorithm 638 | /// 639 | /// 1. **Locate the target**: Calculate which atomic usize contains the fingerprint 640 | /// and the exact bit position within that atomic value 641 | /// 2. **Read current state**: Load the current atomic value 642 | /// 3. **Verify expectation**: Check that the target position contains `old_value` 643 | /// 4. **Atomic update**: Use compare_exchange_weak to atomically replace `old_value` 644 | /// with `new_value`, but only if the atomic hasn't changed since step 2 645 | /// 5. **Retry on conflict**: If another thread modified the atomic concurrently, 646 | /// restart from step 2 647 | /// 648 | /// ## Concurrency Safety 649 | /// 650 | /// - Uses `compare_exchange_weak` which can fail spuriously on some architectures 651 | /// but is more efficient than the strong version 652 | /// - Employs Release ordering on success to ensure other threads see the change 653 | /// - Updates the global counter atomically to maintain consistency 654 | /// - Returns false if the expected `old_value` is no longer present (indicating 655 | /// another thread already modified this slot) 656 | /// 657 | /// Returns `true` if update succeeded, `false` if the slot no longer contains 658 | /// the expected `old_value` due to concurrent modification. 
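    ///
    /// As an illustration (assuming the default 16-bit fingerprints, 4-slot buckets,
    /// and a 64-bit `usize`, i.e. 4 fingerprints per atomic word): updating
    /// `index = 3`, `sub_index = 2` gives `bit_index = (3 * 4 + 2) * 16 = 224`, so the
    /// target fingerprint lives in atomic word `224 / 64 = 3`, offset `224 % 64 = 32`
    /// bits from the most significant end, and is isolated with
    /// `shift = 64 - 16 - 32 = 16` and the mask `0xFFFF << 16`.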
659 | fn update_bucket( 660 | &self, 661 | index: usize, 662 | sub_index: usize, 663 | old_value: usize, 664 | new_value: usize, 665 | ordering: Ordering, 666 | ) -> bool { 667 | let bit_index = (index * self.bucket_size + sub_index) * self.fingerprint_size; 668 | let atomic_index = bit_index / usize::BITS as usize; 669 | let skip_bits = bit_index % usize::BITS as usize; 670 | let shift = usize::BITS as usize - self.fingerprint_size - skip_bits; 671 | let fingerprint_mask = self.fingerprint_mask << shift; 672 | let atomic = &self.buckets[atomic_index]; 673 | 674 | loop { 675 | let atomic_value = atomic.load(Ordering::Relaxed); 676 | if (atomic_value & fingerprint_mask) >> shift != old_value { 677 | // The expected fingerprint is not present in the atomic value 678 | return false; 679 | } 680 | let new_atomic_value = (atomic_value & !fingerprint_mask) | (new_value << shift); 681 | if atomic 682 | .compare_exchange_weak(atomic_value, new_atomic_value, ordering, Ordering::Relaxed) 683 | .is_ok() 684 | { 685 | // Update the counter based on the change 686 | match (old_value, new_value) { 687 | (0, _) => self.counter.fetch_add(1, Ordering::Release), 688 | (_, 0) => self.counter.fetch_sub(1, Ordering::Release), 689 | (_, _) => 0, 690 | }; 691 | return true; 692 | } 693 | } 694 | } 695 | 696 | /// Acquires a lock on the filter, if necessary. 697 | /// 698 | /// A lock is only required when evictions are enabled (i.e., `max_evictions > 0`). 699 | /// If `max_evictions` is set to 0, no lock is acquired. 700 | /// 701 | /// Returns `Some(Lock)` if a lock is needed, or `None` if no locking is required. 702 | pub fn lock(&self, kind: LockKind) -> Option> { 703 | if self.max_evictions == 0 { 704 | None 705 | } else { 706 | Some(Lock::new(&self.lock, kind)) 707 | } 708 | } 709 | 710 | /// Execute a read operation with optimistic concurrency control and automatic retry. 711 | /// 712 | /// This is the cornerstone of the lock-free design, implementing a sophisticated 713 | /// optimistic concurrency protocol that allows reads to proceed concurrently with 714 | /// most write operations. 715 | /// 716 | /// ## Optimistic Concurrency Protocol 717 | /// 718 | /// 1. **Snapshot version**: Acquire an Optimistic lock (capturing version number) 719 | /// 2. **Execute read**: Run the provided function without any blocking 720 | /// 3. **Validate consistency**: Check if version changed or FullyExclusive lock acquired 721 | /// 4. **Retry or return**: If data may be stale, retry; otherwise return result 722 | /// 723 | /// ## How It Works 724 | /// 725 | /// - **WriterExclusive operations**: Don't invalidate optimistic reads because they 726 | /// coordinate through atomic compare-exchange operations that are linearizable 727 | /// - **FullyExclusive operations**: Do invalidate optimistic reads because they 728 | /// perform complex multi-step updates that require consistency 729 | /// - **Early return optimization**: For operations that can short-circuit (like 730 | /// `contains()` returning true), we skip version validation as an optimization 731 | /// 732 | /// This pattern is essential for achieving lock-free performance while maintaining 733 | /// correctness in the presence of concurrent modifications. 
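    ///
    /// For example, `contains()` calls this as
    /// `self.atomic_read(|| self.lookup_fingerprint(index, fingerprint).is_some(), Some(true))`:
    /// a `true` result is returned immediately, while a `false` result is only trusted
    /// if the version did not change, since a concurrent eviction chain could briefly
    /// move a fingerprint between its two buckets while the read was in progress.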
734 |     fn atomic_read<F, T>(&self, fun: F, early_return: Option<T>) -> T
735 |     where
736 |         F: Fn() -> T,
737 |         T: PartialEq,
738 |     {
739 |         if self.max_evictions == 0 {
740 |             return fun();
741 |         }
742 |         loop {
743 |             let lock = Lock::new(&self.lock, LockKind::Optimistic);
744 |             let result = fun();
745 |             if Some(&result) == early_return.as_ref() || !lock.is_outdated() {
746 |                 return result;
747 |             }
748 |         }
749 |     }
750 | }
751 |
752 | impl CuckooFilter {
753 |     /// Create a new CuckooFilterBuilder with default settings
754 |     pub fn builder() -> CuckooFilterBuilder {
755 |         CuckooFilterBuilder::default()
756 |     }
757 |
758 |     /// Create a new CuckooFilter with default settings
759 |     pub fn new() -> CuckooFilter {
760 |         Self::builder().build().unwrap()
761 |     }
762 |
763 |     /// Create a new CuckooFilter with the specified capacity
764 |     pub fn with_capacity(capacity: usize) -> CuckooFilter {
765 |         Self::builder().capacity(capacity).build().unwrap()
766 |     }
767 | }
768 |
769 | impl Default for CuckooFilter {
770 |     /// Create a new CuckooFilter with default settings
771 |     fn default() -> Self {
772 |         Self::new()
773 |     }
774 | }
775 |
776 | impl CuckooFilterBuilder {
777 |     /// Validate the builder configuration
778 |     fn validate(&self) -> Result<(), String> {
779 |         if let Some(fingerprint_size) = self.fingerprint_size
780 |             && ![4, 8, 16, 32].contains(&fingerprint_size)
781 |         {
782 |             return Err("Invalid fingerprint_size".into());
783 |         }
784 |         if self.bucket_size == Some(0) {
785 |             return Err("bucket_size must be greater than zero".into());
786 |         }
787 |         if self.capacity == Some(0) {
788 |             return Err("capacity must be greater than zero".into());
789 |         }
790 |         Ok(())
791 |     }
792 |
793 |     /// Build a CuckooFilter with the specified configuration
794 |     pub fn build(self) -> Result, CuckooFilterBuilderError> {
795 |         let mut cuckoo_filter = self.base_build()?;
796 |         // Calculate the number of buckets (power of 2)
797 |         cuckoo_filter.num_buckets = cuckoo_filter
798 |             .capacity
799 |             .div_ceil(cuckoo_filter.bucket_size)
800 |             .next_power_of_two();
801 |         // Adjust the capacity to match the actual number of buckets
802 |         cuckoo_filter.capacity = cuckoo_filter.num_buckets * cuckoo_filter.bucket_size;
803 |         // Calculate the fingerprint mask
804 |         cuckoo_filter.fingerprint_mask = ((1u64 << cuckoo_filter.fingerprint_size) - 1) as usize;
805 |         // Calculate the number of fingerprints per atomic value
806 |         cuckoo_filter.fingerprints_per_atomic =
807 |             usize::BITS as usize / cuckoo_filter.fingerprint_size;
808 |         // Calculate the total number of atomic values needed
809 |         let bit_size = cuckoo_filter.capacity * cuckoo_filter.fingerprint_size;
810 |         let atomic_size = bit_size.div_ceil(usize::BITS as usize);
811 |         // Initialize the buckets
812 |         cuckoo_filter.buckets = (0..atomic_size).map(|_| AtomicUsize::new(0)).collect();
813 |         Ok(cuckoo_filter)
814 |     }
815 | }
816 |
--------------------------------------------------------------------------------
/tests/basic.rs:
--------------------------------------------------------------------------------
1 | use ahash::AHasher;
2 | use atomic_cuckoo_filter::{CuckooFilter, CuckooFilterBuilder};
3 | // Helper function to create test data
4 | fn test_items(count: usize) -> Vec<String> {
5 |     (0..count).map(|i| format!("test_item_{i}")).collect()
6 | }
7 |
8 | #[test]
9 | fn test_new_filter() {
10 |     let filter = CuckooFilter::new();
11 |     assert_eq!(filter.len(), 0);
12 |     assert!(filter.is_empty());
13 |     assert_eq!(filter.capacity(), 1048576); // Default capacity
14 | }
15 |
16 | #[test]
17 | fn 
test_with_capacity() { 18 | let filter = CuckooFilter::with_capacity(1000); 19 | assert_eq!(filter.len(), 0); 20 | assert!(filter.is_empty()); 21 | assert_eq!(filter.capacity(), 1024); // Rounded up to power of 2 22 | } 23 | 24 | #[test] 25 | fn test_builder_default() { 26 | let filter = CuckooFilter::builder().build().unwrap(); 27 | assert_eq!(filter.len(), 0); 28 | assert!(filter.is_empty()); 29 | } 30 | 31 | #[test] 32 | fn test_builder_custom_config() { 33 | let filter = CuckooFilter::builder() 34 | .capacity(2048) 35 | .fingerprint_size(8) 36 | .bucket_size(2) 37 | .max_evictions(100) 38 | .build() 39 | .unwrap(); 40 | 41 | assert_eq!(filter.len(), 0); 42 | assert_eq!(filter.capacity(), 2048); 43 | } 44 | 45 | #[test] 46 | fn test_builder_validation_invalid_fingerprint_size() { 47 | let result = CuckooFilter::builder() 48 | .fingerprint_size(7) // Invalid: must be 4, 8, 16, or 32 49 | .build(); 50 | 51 | assert!(result.is_err()); 52 | assert!( 53 | result 54 | .unwrap_err() 55 | .to_string() 56 | .contains("Invalid fingerprint_size") 57 | ); 58 | } 59 | 60 | #[test] 61 | fn test_builder_validation_zero_bucket_size() { 62 | let result = CuckooFilter::builder().bucket_size(0).build(); 63 | 64 | assert!(result.is_err()); 65 | assert!( 66 | result 67 | .unwrap_err() 68 | .to_string() 69 | .contains("bucket_size must be greater than zero") 70 | ); 71 | } 72 | 73 | #[test] 74 | fn test_builder_validation_zero_capacity() { 75 | let result = CuckooFilter::builder().capacity(0).build(); 76 | 77 | assert!(result.is_err()); 78 | assert!( 79 | result 80 | .unwrap_err() 81 | .to_string() 82 | .contains("capacity must be greater than zero") 83 | ); 84 | } 85 | 86 | #[test] 87 | fn test_empty_filter_operations() { 88 | let filter = CuckooFilter::with_capacity(1024); 89 | 90 | // Test operations on empty filter 91 | assert!(!filter.contains(&"nonexistent")); 92 | assert_eq!(filter.count(&"nonexistent"), 0); 93 | assert!(!filter.remove(&"nonexistent")); 94 | assert_eq!(filter.len(), 0); 95 | assert!(filter.is_empty()); 96 | } 97 | 98 | #[test] 99 | fn test_basic_insert_contains() { 100 | let filter = CuckooFilter::with_capacity(1024); 101 | let item = "test_item"; 102 | 103 | assert!(!filter.contains(&item)); 104 | assert!(filter.insert(&item).is_ok()); 105 | assert!(filter.contains(&item)); 106 | assert_eq!(filter.len(), 1); 107 | assert!(!filter.is_empty()); 108 | } 109 | 110 | #[test] 111 | fn test_insert_duplicate_items() { 112 | let filter = CuckooFilter::with_capacity(1024); 113 | let item = "duplicate_item"; 114 | 115 | // Insert same item multiple times 116 | assert!(filter.insert(&item).is_ok()); 117 | assert!(filter.insert(&item).is_ok()); 118 | assert!(filter.insert(&item).is_ok()); 119 | 120 | assert!(filter.contains(&item)); 121 | assert_eq!(filter.count(&item), 3); 122 | assert_eq!(filter.len(), 3); 123 | } 124 | 125 | #[test] 126 | fn test_insert_unique() { 127 | let filter = CuckooFilter::with_capacity(1024); 128 | let item = "unique_item"; 129 | 130 | // First insertion should succeed 131 | assert_eq!(filter.insert_unique(&item), Ok(true)); 132 | assert_eq!(filter.count(&item), 1); 133 | 134 | // Second insertion should return false (already exists) 135 | assert_eq!(filter.insert_unique(&item), Ok(false)); 136 | assert_eq!(filter.count(&item), 1); 137 | assert_eq!(filter.len(), 1); 138 | } 139 | 140 | #[test] 141 | fn test_remove_existing_item() { 142 | let filter = CuckooFilter::with_capacity(1024); 143 | let item = "removable_item"; 144 | 145 | // Insert and then remove 146 | 
assert!(filter.insert(&item).is_ok()); 147 | assert!(filter.contains(&item)); 148 | assert!(filter.remove(&item)); 149 | assert!(!filter.contains(&item)); 150 | assert_eq!(filter.len(), 0); 151 | 152 | // Trying to remove again should return false 153 | assert!(!filter.remove(&item)); 154 | } 155 | 156 | #[test] 157 | fn test_remove_duplicate_items() { 158 | let filter = CuckooFilter::with_capacity(1024); 159 | let item = "dup_removable"; 160 | 161 | // Insert multiple copies 162 | assert!(filter.insert(&item).is_ok()); 163 | assert!(filter.insert(&item).is_ok()); 164 | assert!(filter.insert(&item).is_ok()); 165 | assert_eq!(filter.count(&item), 3); 166 | 167 | // Remove one at a time 168 | assert!(filter.remove(&item)); 169 | assert_eq!(filter.count(&item), 2); 170 | assert!(filter.remove(&item)); 171 | assert_eq!(filter.count(&item), 1); 172 | assert!(filter.remove(&item)); 173 | assert_eq!(filter.count(&item), 0); 174 | assert!(!filter.contains(&item)); 175 | } 176 | 177 | #[test] 178 | fn test_clear() { 179 | let filter = CuckooFilter::with_capacity(1024); 180 | let items = test_items(100); 181 | 182 | // Insert many items 183 | for item in &items { 184 | assert!(filter.insert(item).is_ok()); 185 | } 186 | assert_eq!(filter.len(), 100); 187 | 188 | // Clear all items 189 | filter.clear(); 190 | assert_eq!(filter.len(), 0); 191 | assert!(filter.is_empty()); 192 | 193 | // Verify all items are gone 194 | for item in &items { 195 | assert!(!filter.contains(item)); 196 | } 197 | } 198 | 199 | #[test] 200 | fn test_count_functionality() { 201 | let filter = CuckooFilter::with_capacity(1024); 202 | let item = "countable_item"; 203 | 204 | assert_eq!(filter.count(&item), 0); 205 | 206 | // Add items and verify count increases 207 | for i in 1..=5 { 208 | assert!(filter.insert(&item).is_ok()); 209 | assert_eq!(filter.count(&item), i); 210 | } 211 | 212 | // Remove items and verify count decreases 213 | for i in (1..=5).rev() { 214 | assert!(filter.remove(&item)); 215 | assert_eq!(filter.count(&item), i - 1); 216 | } 217 | } 218 | 219 | #[test] 220 | fn test_different_item_types() { 221 | let filter = CuckooFilter::with_capacity(1024); 222 | 223 | // Test with different types that implement Hash 224 | assert!(filter.insert(&42i32).is_ok()); 225 | assert!(filter.insert(&"string").is_ok()); 226 | assert!(filter.insert(&vec![1, 2, 3]).is_ok()); 227 | assert!(filter.insert(&(1, 2, 3)).is_ok()); 228 | 229 | assert!(filter.contains(&42i32)); 230 | assert!(filter.contains(&"string")); 231 | assert!(filter.contains(&vec![1, 2, 3])); 232 | assert!(filter.contains(&(1, 2, 3))); 233 | 234 | assert_eq!(filter.len(), 4); 235 | } 236 | 237 | #[test] 238 | fn test_false_positives() { 239 | let filter = CuckooFilter::builder() 240 | .capacity(1024) 241 | .fingerprint_size(8) // Smaller fingerprint = higher false positive rate 242 | .build() 243 | .unwrap(); 244 | 245 | // Insert known items 246 | let known_items: Vec = (0..500).collect(); 247 | for item in &known_items { 248 | assert!(filter.insert(item).is_ok()); 249 | } 250 | 251 | // Test with unknown items 252 | let unknown_items: Vec = (1000..2000).collect(); 253 | let false_positives = unknown_items 254 | .iter() 255 | .filter(|item| filter.contains(item)) 256 | .count(); 257 | 258 | // Should have some false positives but not too many 259 | assert!(false_positives > 0); 260 | assert!(false_positives < 50); // Less than 10% false positive rate 261 | } 262 | 263 | #[test] 264 | fn test_no_false_negatives() { 265 | let filter = 
CuckooFilter::with_capacity(1024); 266 | let items = test_items(1024); 267 | 268 | // Insert items and filter out the ones that failed to insert 269 | let inserted_items = items 270 | .into_iter() 271 | .filter(|item| filter.insert(item).is_ok()) 272 | .collect::>(); 273 | 274 | // All inserted items should be found (no false negatives) 275 | for item in inserted_items { 276 | assert!(filter.contains(&item), "False negative for item: {item}"); 277 | } 278 | } 279 | 280 | #[test] 281 | fn test_full_filter_insertion() { 282 | let filter = CuckooFilter::builder() 283 | .capacity(16) // Very small capacity 284 | .max_evictions(0) // No evictions 285 | .build() 286 | .unwrap(); 287 | 288 | let mut successful_inserts = 0; 289 | 290 | // Try to insert many items 291 | for i in 0..100 { 292 | if filter.insert(&i).is_ok() { 293 | successful_inserts += 1; 294 | } else { 295 | break; // Filter is full 296 | } 297 | } 298 | 299 | // Should fill up and then start failing 300 | assert!(successful_inserts <= filter.capacity()); 301 | assert!(successful_inserts > 0); 302 | assert_eq!(filter.len(), successful_inserts); 303 | } 304 | 305 | #[test] 306 | fn test_eviction_behavior() { 307 | let filter_no_evict = CuckooFilter::builder() 308 | .capacity(1024) 309 | .max_evictions(0) 310 | .build() 311 | .unwrap(); 312 | 313 | let filter_10_evict = CuckooFilter::builder() 314 | .capacity(1024) 315 | .max_evictions(10) 316 | .build() 317 | .unwrap(); 318 | 319 | let filter_100_evict = CuckooFilter::builder() 320 | .capacity(1024) 321 | .max_evictions(100) 322 | .build() 323 | .unwrap(); 324 | 325 | let mut no_evict_count = 0; 326 | let mut evict_10_count = 0; 327 | let mut evict_100_count = 0; 328 | 329 | for i in 0..1024 { 330 | if filter_no_evict.insert(&i).is_ok() { 331 | no_evict_count += 1; 332 | } 333 | if filter_10_evict.insert(&i).is_ok() { 334 | evict_10_count += 1; 335 | } 336 | if filter_100_evict.insert(&i).is_ok() { 337 | evict_100_count += 1; 338 | } 339 | } 340 | 341 | // Filter with evictions should accommodate more items 342 | assert!(no_evict_count < evict_10_count); 343 | assert!(evict_10_count < evict_100_count); 344 | assert_eq!(filter_no_evict.len(), no_evict_count); 345 | assert_eq!(filter_10_evict.len(), evict_10_count); 346 | assert_eq!(filter_100_evict.len(), evict_100_count); 347 | } 348 | 349 | #[test] 350 | fn test_fingerprint_sizes() { 351 | let sizes = [4, 8, 16, 32]; 352 | 353 | for &size in &sizes { 354 | let filter = CuckooFilter::builder() 355 | .capacity(1024) 356 | .fingerprint_size(size) 357 | .build() 358 | .unwrap(); 359 | 360 | // insert items to ensure the filter is fully loaded 361 | let mut i = 0; 362 | while filter.len() < 1024 { 363 | let _ = filter.insert(&i); 364 | i += 1; 365 | } 366 | 367 | // test false positive rate 368 | let non_existing_items = 10000..110000; 369 | let false_positives = non_existing_items.filter(|i| filter.contains(i)).count(); 370 | // Calculate the expected false positive rate (FPR) based on fingerprint size 371 | let expected_fpr = 1.0 - (1.0 - 1.0 / (1u64 << size) as f64).powi(8); 372 | let fpr = false_positives as f64 / 100000.0; 373 | if size == 4 { 374 | assert_eq!(expected_fpr, 0.4032805261667818); 375 | } else if size == 8 { 376 | assert_eq!(expected_fpr, 0.030826075519044704); 377 | } else if size == 16 { 378 | assert_eq!(expected_fpr, 0.00012206379344092966); 379 | } else if size == 32 { 380 | assert_eq!(expected_fpr, 0.000000001862645149230957); 381 | } 382 | // Allow a small margin due to randomness 383 | let tolerance = 
expected_fpr * 0.1 + 0.0001;
384 |
385 |         assert!(
386 |             (fpr - expected_fpr).abs() < tolerance,
387 |             "Observed FPR ({fpr}) deviates too much from expected FPR ({expected_fpr}) for fingerprint size {size} ({false_positives} false positives)"
388 |         );
389 |     }
390 | }
391 |
392 | #[test]
393 | fn test_bucket_sizes() {
394 |     let sizes = [1, 2, 4, 8];
395 |
396 |     for &size in &sizes {
397 |         let filter = CuckooFilter::builder()
398 |             .capacity(1024)
399 |             .bucket_size(size)
400 |             .build()
401 |             .unwrap();
402 |
403 |         // Should be able to insert items regardless of bucket size
404 |         for i in 0..100 {
405 |             assert!(filter.insert(&i).is_ok());
406 |         }
407 |
408 |         // Should be able to find all items
409 |         for i in 0..100 {
410 |             assert!(filter.contains(&i));
411 |         }
412 |
413 |         assert_eq!(filter.len(), 100);
414 |     }
415 | }
416 |
417 | #[test]
418 | fn test_custom_hasher() {
419 |     // Test that we can use different hashers
420 |     let filter = CuckooFilterBuilder::<AHasher>::default()
421 |         .capacity(1024)
422 |         .build()
423 |         .unwrap();
424 |
425 |     let items = test_items(100);
426 |     for item in &items {
427 |         assert!(filter.insert(item).is_ok());
428 |     }
429 |
430 |     for item in &items {
431 |         assert!(filter.contains(item));
432 |     }
433 |
434 |     assert_eq!(filter.len(), 100);
435 | }
436 |
--------------------------------------------------------------------------------
/tests/concurrent.rs:
--------------------------------------------------------------------------------
1 | use atomic_cuckoo_filter::CuckooFilter;
2 | use std::sync::Arc;
3 | use std::thread;
4 |
5 | #[test]
6 | fn test_concurrent_reads() {
7 |     let filter = Arc::new(CuckooFilter::with_capacity(1024));
8 |
9 |     // Insert test data
10 |     for i in 0..100 {
11 |         assert!(filter.insert(&i).is_ok());
12 |     }
13 |
14 |     let mut handles = vec![];
15 |
16 |     // Spawn multiple reader threads
17 |     for _ in 0..5 {
18 |         let filter_clone = Arc::clone(&filter);
19 |         handles.push(thread::spawn(move || {
20 |             for i in 0..100 {
21 |                 assert!(filter_clone.contains(&i));
22 |             }
23 |         }));
24 |     }
25 |
26 |     // All reads should succeed
27 |     for handle in handles {
28 |         handle.join().unwrap();
29 |     }
30 | }
31 |
32 | #[test]
33 | fn test_concurrent_insert() {
34 |     let filter = Arc::new(CuckooFilter::with_capacity(10000));
35 |     let mut handles = vec![];
36 |
37 |     // Spawn writer threads
38 |     for thread_id in 0..5 {
39 |         let filter_clone = Arc::clone(&filter);
40 |         handles.push(thread::spawn(move || {
41 |             for i in 0..100 {
42 |                 let item = format!("thread_{thread_id}_item_{i}");
43 |                 filter_clone.insert(&item).unwrap();
44 |             }
45 |         }));
46 |     }
47 |
48 |     // Wait for all threads
49 |     for handle in handles {
50 |         handle.join().unwrap();
51 |     }
52 |
53 |     // check if all items are inserted
54 |     for thread_id in 0..5 {
55 |         for i in 0..100 {
56 |             let item = format!("thread_{thread_id}_item_{i}");
57 |             assert!(filter.contains(&item));
58 |         }
59 |     }
60 |
61 |     // Should have inserted 500 items total
62 |     assert_eq!(filter.len(), 500);
63 | }
64 |
65 | #[test]
66 | fn test_concurrent_insert_unique() {
67 |     let filter = Arc::new(CuckooFilter::with_capacity(131072));
68 |     let mut handles = vec![];
69 |
70 |     for _ in 0..5 {
71 |         let filter_clone = filter.clone();
72 |         handles.push(thread::spawn(move || {
73 |             (0..100000)
74 |                 .filter(|i| filter_clone.insert_unique(i).unwrap())
75 |                 .count()
76 |         }));
77 |     }
78 |
79 |     let inserted: usize = handles.into_iter().map(|h| h.join().unwrap()).sum();
80 |
81 |     for i in 0..100000 {
82 |         assert!(filter.contains(&i));
83 |     }
84 |
85 | 
assert_eq!(inserted, filter.len()); 86 | 87 | // inserted items might be less than 100000 due to false positives 88 | assert!(inserted <= 100000); 89 | } 90 | 91 | #[test] 92 | fn concurrent_remove() { 93 | let filter = Arc::new(CuckooFilter::with_capacity(131072)); 94 | let mut handles = vec![]; 95 | 96 | for i in 0..100000 { 97 | assert!(filter.insert(&i).is_ok()); 98 | } 99 | 100 | for _ in 0..5 { 101 | let f = filter.clone(); 102 | handles.push(thread::spawn(move || { 103 | (0..100000).filter(|i| f.remove(i)).count() 104 | })); 105 | } 106 | 107 | let removed: usize = handles.into_iter().map(|h| h.join().unwrap()).sum(); 108 | assert_eq!(removed, 100000) 109 | } 110 | 111 | #[test] 112 | fn test_concurrent_insert_and_remove() { 113 | let filter = Arc::new(CuckooFilter::with_capacity(10000)); 114 | let mut handles = vec![]; 115 | 116 | // Spawn writer threads 117 | for thread_id in 0..5 { 118 | let filter_clone = Arc::clone(&filter); 119 | handles.push(thread::spawn(move || { 120 | for i in 0..100 { 121 | let item = format!("thread_{thread_id}_item_{i}"); 122 | filter_clone.insert(&item).unwrap(); 123 | } 124 | })); 125 | } 126 | 127 | // Spawn remover threads 128 | for thread_id in 0..5 { 129 | let filter_clone = Arc::clone(&filter); 130 | handles.push(thread::spawn(move || { 131 | for i in 0..100 { 132 | let item = format!("thread_{thread_id}_item_{i}"); 133 | while !filter_clone.remove(&item) {} 134 | } 135 | })); 136 | } 137 | 138 | // Wait for all threads 139 | for handle in handles { 140 | handle.join().unwrap(); 141 | } 142 | 143 | // Should have removed all items 144 | assert_eq!(filter.len(), 0); 145 | } 146 | --------------------------------------------------------------------------------
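As a closing illustration, the sizing arithmetic performed by `CuckooFilterBuilder::build` in `src/lib.rs` above can be checked in isolation. The snippet below is a minimal sketch and not part of the crate: the `bucket_size` and `fingerprint_size` values are assumed for illustration, and a 64-bit target is assumed. Under these assumptions it reproduces the `filter.capacity() == 1024` expectation that `tests/basic.rs` has for `with_capacity(1000)`.

```rust
// A standalone sketch of the builder's sizing math (illustrative constants only).
fn main() {
    let requested_capacity = 1000usize;
    let bucket_size = 4usize; // fingerprints per bucket (assumed)
    let fingerprint_size = 16usize; // bits per fingerprint (assumed)

    // Buckets are rounded up to a power of two, so the usable capacity
    // can end up slightly larger than requested (1000 -> 1024 here).
    let num_buckets = requested_capacity
        .div_ceil(bucket_size)
        .next_power_of_two();
    let capacity = num_buckets * bucket_size;

    // Fingerprints are packed into 64-bit atomic words.
    let fingerprints_per_atomic = usize::BITS as usize / fingerprint_size;
    let atomic_size = (capacity * fingerprint_size).div_ceil(usize::BITS as usize);

    assert_eq!(num_buckets, 256);
    assert_eq!(capacity, 1024);
    assert_eq!(fingerprints_per_atomic, 4);
    assert_eq!(atomic_size, 256);
    println!("{num_buckets} buckets, {capacity} slots, {atomic_size} atomic words");
}
```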