├── .github
│   └── workflows
│       └── ci.yml
├── .gitignore
├── Cargo.toml
├── LICENCE.md
├── README.md
├── benches
│   └── benches.rs
├── fuzz
│   ├── .gitignore
│   ├── Cargo.toml
│   └── fuzz_targets
│       ├── fuzz_fingerprint.rs
│       └── fuzz_qfilter.rs
└── src
    ├── lib.rs
    └── stable_hasher.rs

--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Continuous integration
2 | 
3 | on:
4 |   push:
5 |     branches: [master]
6 |   pull_request:
7 | 
8 | jobs:
9 |   test:
10 |     name: Tests
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: styfle/cancel-workflow-action@0.10.0
14 |         with:
15 |           access_token: ${{ github.token }}
16 |       - uses: actions/checkout@v2
17 |       - uses: actions-rs/toolchain@v1
18 |         with:
19 |           profile: minimal
20 |           toolchain: stable
21 |           override: true
22 |       - run: cargo test
23 |       - run: cargo test --all-features
24 | 
25 |   fuzz-tests:
26 |     name: Fuzz tests
27 |     runs-on: ubuntu-latest
28 |     steps:
29 |       - uses: styfle/cancel-workflow-action@0.10.0
30 |         with:
31 |           access_token: ${{ github.token }}
32 |       - uses: actions/checkout@v2
33 |       - uses: actions-rs/toolchain@v1
34 |         with:
35 |           profile: minimal
36 |           toolchain: nightly
37 |           override: true
38 |       - run: cargo install cargo-fuzz
39 |       - run: for fuzz_test in `cargo fuzz list`; do cargo fuzz run $fuzz_test -- -max_total_time=180 -detect_leaks=0 -len_control=0 || exit 1; done
40 | 
41 |   lint:
42 |     name: Rustfmt & Clippy
43 |     runs-on: ubuntu-latest
44 |     steps:
45 |       - uses: styfle/cancel-workflow-action@0.10.0
46 |         with:
47 |           access_token: ${{ github.token }}
48 |       - uses: actions/checkout@v2
49 |       - uses: actions-rs/toolchain@v1
50 |         with:
51 |           profile: minimal
52 |           toolchain: stable
53 |           override: true
54 |       - run: rustup component add rustfmt
55 |       - uses: actions-rs/cargo@v1
56 |         with:
57 |           command: fmt
58 |           args: --all -- --check
59 |       - run: rustup component add clippy
60 |       - uses: actions-rs/cargo@v1
61 |         with:
62 |           command: clippy
63 |           args: -- -D warnings
64 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | debug/
4 | target/
5 | 
6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
8 | Cargo.lock
9 | 
10 | # These are backup files generated by rustfmt
11 | **/*.rs.bk

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "qfilter"
3 | version = "0.2.5"
4 | description = "Efficient bloom-filter-like data structure, based on the Rank Select Quotient Filter (RSQF)"
5 | repository = "https://github.com/arthurprs/qfilter"
6 | authors = ["Arthur Silva "]
7 | edition = "2021"
8 | license = "MIT"
9 | keywords = ["rsqf", "cqf", "quotient-filter", "bloom-filter", "cuckoo-filter"]
10 | categories = ["data-structures"]
11 | readme = "README.md"
12 | exclude = ["fuzz", "benches"]
13 | 
14 | [features]
15 | default = []
16 | # Enable to support running on x64 cpus released before 2008
17 | legacy_x86_64_support = []
18 | jsonschema = ["schemars"]
19 | serde = ["dep:serde", "dep:serde_bytes"]
20 | 
21 | [dependencies]
22 | xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
23 | serde = { optional = true, version = "1", features = ["derive"] }
24 | serde_bytes = { optional = true, version = "0.11" }
25 | schemars = { optional = true, version = "0.8" }
26 | 
27 | [dev-dependencies]
28 | serde_cbor = "0.11"
29 | qfilter01 = { package = "qfilter", version = "0.1", features = ["serde"] }
30 | 
31 | [profile.bench]
32 | opt-level = 3
33 | debug = true
34 | 
35 | [lints.rust]
36 | unexpected_cfgs = { level = "warn", check-cfg = ['cfg(fuzzing)'] }
37 | 
38 | [package.metadata.docs.rs]
39 | rustdoc-args = ["--cfg", "docsrs"]
40 | 

--------------------------------------------------------------------------------
/LICENCE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Arthur Silva
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Qfilter
2 | 
3 | [![Crates.io](https://img.shields.io/crates/v/qfilter.svg)](https://crates.io/crates/qfilter)
4 | [![Docs](https://docs.rs/qfilter/badge.svg)](https://docs.rs/qfilter/latest)
5 | [![CI](https://github.com/arthurprs/qfilter/actions/workflows/ci.yml/badge.svg)](https://github.com/arthurprs/qfilter/actions/workflows/ci.yml)
6 | 
7 | Efficient bloom-filter-like data structure, based on the [Rank Select Quotient Filter (RSQF)](https://dl.acm.org/doi/pdf/10.1145/3035918.3035963).
8 | 
9 | This is a small and flexible general-purpose [AMQ-Filter](https://en.wikipedia.org/wiki/Approximate_Membership_Query_Filter).
10 | It not only supports approximate membership testing like a bloom filter but also deletions, merging,
11 | resizing and [serde](https://crates.io/crates/serde) serialization.
12 | 
13 | * High performance
14 | * Supports removals
15 | * Extremely compact, more so than comparable filters
16 | * Can be created with a small initial capacity and grow as needed
17 | * (De)Serializable with [serde](https://crates.io/crates/serde)
18 | * Portable Rust implementation
19 | * Only verifiable usages of unsafe
20 | 
21 | This data structure is a succinct hash table that can store fingerprints in a very compact way.
22 | Fingerprints are similar to hash values, but are possibly truncated.
23 | The reason for false positives is that multiple items can map to the same fingerprint.
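A fingerprint splits into a bucket index (the quotient) and a stored remainder. The sketch below is purely illustrative (the crate's actual bit layout is an internal detail) and `split_fingerprint` is a hypothetical helper, not part of the API:

```rust
// Keep the top `q + r` bits of the hash as the fingerprint, then use the
// high `q` bits as the bucket index (quotient) and store only the low
// `r` bits (the remainder) in that bucket's run.
// Assumes q + r <= 64 and r < 64.
fn split_fingerprint(hash: u64, q: u32, r: u32) -> (u64, u64) {
    let fingerprint = hash >> (64 - (q + r));
    (fingerprint >> r, fingerprint & ((1u64 << r) - 1))
}
```
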
24 | For more information see the [quotient filter Wikipedia page](https://en.wikipedia.org/wiki/Quotient_filter)
25 | that describes a similar but less optimized version of the data structure.
26 | The actual implementation is based on the [Rank Select Quotient Filter (RSQF)](https://dl.acm.org/doi/pdf/10.1145/3035918.3035963).
27 | 
28 | The public API also exposes a fingerprint API, which can be used to succinctly store u64 hash values.
29 | 
30 | ### Example
31 | 
32 | ```rust
33 | let mut f = qfilter::Filter::new(1000000, 0.01).unwrap();
34 | for i in 0..1000 {
35 |     f.insert(i).unwrap();
36 | }
37 | for i in 0..1000 {
38 |     assert!(f.contains(i));
39 | }
40 | ```
41 | 
42 | ### Hasher
43 | 
44 | The hashing algorithm used is [xxhash3](https://crates.io/crates/xxhash-rust), which offers both high performance and stability across platforms.
45 | 
46 | ### Filter size
47 | 
48 | For a given capacity and error probability, the RSQF may require significantly less space than the equivalent bloom filter or other AMQ-Filters.
49 | 
50 | | Bits per item | Error probability when full | Bits per item (cont.) | Error (cont.) |
51 | |:---:|:---:|:---:|:---:|
52 | | 3.125 | 0.362 | 19.125 | 6.87e-06 |
53 | | 4.125 | 0.201 | 20.125 | 3.43e-06 |
54 | | 5.125 | 0.106 | 21.125 | 1.72e-06 |
55 | | 6.125 | 0.0547 | 22.125 | 8.58e-07 |
56 | | 7.125 | 0.0277 | 23.125 | 4.29e-07 |
57 | | 8.125 | 0.014 | 24.125 | 2.15e-07 |
58 | | 9.125 | 0.00701 | 25.125 | 1.07e-07 |
59 | | 10.125 | 0.00351 | 26.125 | 5.36e-08 |
60 | | 11.125 | 0.00176 | 27.125 | 2.68e-08 |
61 | | 12.125 | 0.000879 | 28.125 | 1.34e-08 |
62 | | 13.125 | 0.000439 | 29.125 | 6.71e-09 |
63 | | 14.125 | 0.00022 | 30.125 | 3.35e-09 |
64 | | 15.125 | 0.00011 | 31.125 | 1.68e-09 |
65 | | 16.125 | 5.49e-05 | 32.125 | 8.38e-10 |
66 | | 17.125 | 2.75e-05 | .. | .. |
67 | | 18.125 | 1.37e-05 | .. | .. |
68 | 
69 | ### Compatibility between versions 0.1 and 0.2
70 | 
71 | Version 0.2 changed public APIs (e.g. fallible constructors), which required a major version bump.
72 | 
73 | Serialization is bidirectionally compatible between versions 0.1 and 0.2.
74 | 
75 | ### Not implemented
76 | 
77 | * [ ] Fingerprint attached values
78 | * [ ] Counting with fingerprint attached values, not fingerprint duplication
79 | * [ ] More advanced growth strategies (InfiniFilter).
80 | 
81 | ### Legacy x86_64 CPUs support
82 | 
83 | The implementation assumes the `popcnt` instruction (equivalent to `integer.count_ones()`) is present
84 | when compiling for x86_64 targets. This is theoretically not guaranteed as the instruction is only
85 | available on AMD/Intel CPUs released after 2007/2008. If that's not the case, the `Filter` constructor will panic.
86 | 
87 | Support for such legacy x86_64 CPUs can be optionally enabled with the `legacy_x86_64_support` feature,
88 | which incurs a ~10% performance penalty.
89 | 
90 | ### License
91 | 
92 | This project is licensed under the MIT license.
93 | -------------------------------------------------------------------------------- /benches/benches.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | extern crate test; 3 | 4 | use qfilter::*; 5 | use test::Bencher; 6 | 7 | #[bench] 8 | fn bench_new(b: &mut Bencher) { 9 | b.iter(|| Filter::new(1000, 0.005).unwrap()); 10 | } 11 | #[bench] 12 | fn bench_get_ok_medium(b: &mut Bencher) { 13 | let mut f = Filter::new(100000, 0.01).unwrap(); 14 | for i in 0..f.capacity() { 15 | f.insert_duplicated(&i).unwrap(); 16 | } 17 | let mut i = 0; 18 | b.iter(|| { 19 | i += 1; 20 | f.contains(&i) 21 | }) 22 | } 23 | 24 | #[bench] 25 | fn bench_get_nok_medium(b: &mut Bencher) { 26 | let mut f = Filter::new(100000, 0.01).unwrap(); 27 | for i in 0..f.capacity() { 28 | f.insert_duplicated(&i).unwrap(); 29 | } 30 | let mut i = f.capacity(); 31 | b.iter(|| { 32 | i += 1; 33 | f.contains(&i) 34 | }) 35 | } 36 | 37 | #[bench] 38 | fn bench_grow(b: &mut Bencher) { 39 | b.iter(|| { 40 | let mut f = Filter::new(10000, 0.01).unwrap(); 41 | for i in 0..f.capacity() { 42 | f.insert_duplicated(i).unwrap(); 43 | } 44 | f 45 | }); 46 | } 47 | 48 | #[bench] 49 | fn bench_grow_from_90pct(b: &mut Bencher) { 50 | let mut f = Filter::new(10000, 0.01).unwrap(); 51 | for i in 0..f.capacity() / 10 * 9 { 52 | f.insert_duplicated(i).unwrap(); 53 | } 54 | b.iter(|| { 55 | let mut f = f.clone(); 56 | for i in f.len()..f.capacity() { 57 | f.insert_duplicated(i).unwrap(); 58 | } 59 | f 60 | }); 61 | } 62 | 63 | #[bench] 64 | fn bench_grow_resizeable(b: &mut Bencher) { 65 | b.iter(|| { 66 | let mut f = Filter::new_resizeable(0, 10000, 0.01).unwrap(); 67 | for i in 0u64.. { 68 | if f.insert_duplicated(i).is_err() { 69 | break; 70 | } 71 | } 72 | assert_eq!(f.len(), 10000u64.next_power_of_two() * 19 / 20); 73 | f 74 | }); 75 | } 76 | 77 | #[bench] 78 | fn bench_shrink(b: &mut Bencher) { 79 | let mut f = Filter::new(10000, 0.01).unwrap(); 80 | for i in 0..f.capacity() { 81 | let _ = f.insert(i); 82 | } 83 | b.iter(|| { 84 | let mut f = f.clone(); 85 | for i in 0..f.capacity() { 86 | f.remove(i); 87 | } 88 | f 89 | }); 90 | } 91 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "qfilter-fuzz" 3 | version = "0.0.0" 4 | authors = ["Automatically generated"] 5 | publish = false 6 | edition = "2021" 7 | 8 | [package.metadata] 9 | cargo-fuzz = true 10 | 11 | [dependencies] 12 | libfuzzer-sys = {version = "0.4", features = ["arbitrary-derive"] } 13 | 14 | [dependencies.qfilter] 15 | path = ".." 
16 | 
17 | # Prevent this from interfering with workspaces
18 | [workspace]
19 | members = ["."]
20 | 
21 | [[bin]]
22 | name = "fuzz_qfilter"
23 | path = "fuzz_targets/fuzz_qfilter.rs"
24 | test = false
25 | doc = false
26 | 
27 | [[bin]]
28 | name = "fuzz_fingerprint"
29 | path = "fuzz_targets/fuzz_fingerprint.rs"
30 | test = false
31 | doc = false
32 | 

--------------------------------------------------------------------------------
/fuzz/fuzz_targets/fuzz_fingerprint.rs:
--------------------------------------------------------------------------------
1 | #![no_main]
2 | use libfuzzer_sys::arbitrary;
3 | use libfuzzer_sys::arbitrary::Arbitrary;
4 | use libfuzzer_sys::fuzz_target;
5 | 
6 | const FUZZ_REMOVES: bool = true;
7 | const CHECK_EVERY: usize = 8;
8 | const CHECK_SHRUNK: bool = true;
9 | 
10 | #[derive(Debug, Arbitrary)]
11 | struct Input {
12 |     cap: u16,
13 |     fp_size: u8,
14 |     ops: Vec<(bool, u16)>,
15 | }
16 | 
17 | fuzz_target!(|input: Input| {
18 |     let Input { cap, ops, fp_size } = input;
19 |     // The "Model", tracks the count for each item
20 |     let mut counts = [0u64; (u16::MAX as usize) + 1];
21 |     let Ok(mut f) = qfilter::Filter::with_fingerprint_size(cap as u64, fp_size.clamp(7, 64)) else {
22 |         return;
23 |     };
24 |     for i in 0..ops.len() {
25 |         // print_sample(&counts);
26 |         // dbg!(ops[i]);
27 | 
28 |         let (add, item) = ops[i];
29 |         let item = item as u64;
30 |         if !FUZZ_REMOVES || add {
31 |             if f.insert_fingerprint(true, item).is_err() {
32 |                 continue;
33 |             }
34 |             counts[item as usize] += 1;
35 |         } else if counts[item as usize] != 0 && f.remove_fingerprint(item) {
36 |             counts[item as usize] -= 1;
37 |         } else {
38 |             continue;
39 |         }
40 | 
41 |         if i % CHECK_EVERY == 0 {
42 |             for &(_add, e) in &ops[..=i] {
43 |                 let min = counts[e as usize];
44 |                 // Since we can only check for >= due to collisions skip min = 0
45 |                 if min != 0 {
46 |                     let est = f.count_fingerprint(e as u64);
47 |                     assert!(est >= min, "{e}: est {est} < min {min}");
48 |                 }
49 |             }
50 |         }
51 |     }
52 | 
53 |     for shrunk in [false, true] {
54 |         for &(_add, e) in &ops {
55 |             let min = counts[e as usize];
56 |             let est = f.count_fingerprint(e as u64);
57 |             assert!(est >= min, "{e}: est {est} < min {min} shrunk {shrunk:?}");
58 |         }
59 |         let prints = f.fingerprints().collect::<Vec<_>>();
60 |         let mut expected_prints = counts
61 |             .iter()
62 |             .enumerate()
63 |             .flat_map(|(i, n)| {
64 |                 let t = (i as u64) << (64 - f.fingerprint_size()) >> (64 - f.fingerprint_size());
65 |                 std::iter::repeat(t).take(*n as usize)
66 |             })
67 |             .collect::<Vec<_>>();
68 |         expected_prints.sort_unstable();
69 |         assert_eq!(prints.len(), f.len() as usize);
70 |         assert_eq!(prints, expected_prints);
71 |         if !CHECK_SHRUNK {
72 |             break;
73 |         }
74 |         f.shrink_to_fit();
75 |     }
76 | });
77 | 
78 | #[allow(dead_code)]
79 | fn print_sample(counts: &[u64]) {
80 |     print!("[");
81 |     for (i, c) in counts.iter().copied().enumerate() {
82 |         if c != 0 {
83 |             print!("({i}u16, {c}), ");
84 |         }
85 |     }
86 |     println!("]");
87 | }

--------------------------------------------------------------------------------
/fuzz/fuzz_targets/fuzz_qfilter.rs:
--------------------------------------------------------------------------------
1 | #![no_main]
2 | use libfuzzer_sys::arbitrary;
3 | use libfuzzer_sys::arbitrary::Arbitrary;
4 | use libfuzzer_sys::fuzz_target;
5 | 
6 | const FUZZ_REMOVES: bool = true;
7 | const CHECK_EVERY: usize = 8;
8 | const CHECK_SHRUNK: bool = true;
9 | 
10 | #[derive(Debug, Arbitrary)]
11 | struct Input {
12 |     cap: u16,
13 |     max_cap: u16,
14 |     fp_exp: u16,
15 |     ops: Vec<(bool, u16)>,
16 | }
17 | 
18 | fuzz_target!(|input: Input| {
19 |     let Input {
20 |         cap,
21 |         max_cap,
22 |         fp_exp,
23 |         ops,
24 |     } = input;
25 |     let max_cap = max_cap.max(cap) as u64;
26 |     let cap = cap as u64;
27 |     let fp = 2f64.powi(-(fp_exp.leading_ones() as i32));
28 |     // The "Model", tracks the count for each item
29 |     let mut counts = [0u64; (u16::MAX as usize) + 1];
30 |     let mut f = qfilter::Filter::new_resizeable(cap, max_cap, fp).unwrap();
31 |     for i in 0..ops.len() {
32 |         // print_sample(&counts);
33 |         // dbg!(ops[i]);
34 | 
35 |         let (add, item) = ops[i];
36 |         if !FUZZ_REMOVES || add {
37 |             if f.insert_duplicated(item).is_err() {
38 |                 continue;
39 |             }
40 |             counts[item as usize] += 1;
41 |         } else if counts[item as usize] != 0 && f.remove(item) {
42 |             counts[item as usize] -= 1;
43 |         } else {
44 |             continue;
45 |         }
46 | 
47 |         if i % CHECK_EVERY == 0 {
48 |             for &(_add, e) in &ops[..=i] {
49 |                 let min = counts[e as usize];
50 |                 // Since we can only check for >= due to collisions skip min = 0
51 |                 if min != 0 {
52 |                     let est = f.count(e);
53 |                     assert!(est >= min, "{e}: est {est} < min {min}");
54 |                 }
55 |             }
56 |         }
57 |     }
58 | 
59 |     for shrunk in [false, true] {
60 |         for &(_add, e) in &ops {
61 |             let min = counts[e as usize];
62 |             let est = f.count(e);
63 |             assert!(est >= min, "{e}: est {est} < min {min} shrunk {shrunk:?}");
64 |         }
65 |         if !CHECK_SHRUNK {
66 |             break;
67 |         }
68 |         f.shrink_to_fit();
69 |     }
70 | });
71 | 
72 | #[allow(dead_code)]
73 | fn print_sample(counts: &[u64]) {
74 |     print!("[");
75 |     for (i, c) in counts.iter().copied().enumerate() {
76 |         if c != 0 {
77 |             print!("({i}u16, {c}), ");
78 |         }
79 |     }
80 |     println!("]");
81 | }

--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! Approximate Membership Query Filter ([AMQ-Filter](https://en.wikipedia.org/wiki/Approximate_Membership_Query_Filter))
2 | //! based on the [Rank Select Quotient Filter (RSQF)](https://dl.acm.org/doi/pdf/10.1145/3035918.3035963).
3 | //!
4 | //! This is a small and flexible general-purpose AMQ-Filter. It not only supports approximate membership testing like a bloom filter
5 | //! but also deletions, merging, resizing and [serde](https://crates.io/crates/serde) serialization.
6 | //!
7 | //! ### Example
8 | //!
9 | //! ```rust
10 | //! let mut f = qfilter::Filter::new(1000000, 0.01).unwrap();
11 | //! for i in 0..1000 {
12 | //!     f.insert(i).unwrap();
13 | //! }
14 | //! for i in 0..1000 {
15 | //!     assert!(f.contains(i));
16 | //! }
17 | //! ```
18 | //!
19 | //! ### Hasher
20 | //!
21 | //! The hashing algorithm used is [xxhash3](https://crates.io/crates/xxhash-rust)
22 | //! which offers both high performance and stability across platforms.
23 | //!
24 | //! ### Filter size
25 | //!
26 | //! For a given capacity and error probability, the RSQF may require significantly less space than the equivalent bloom filter or other AMQ-Filters.
27 | //!
28 | //! | Bits per item | Error probability when full | Bits per item (cont.) | Error (cont.) |
29 | //! |:---:|:---:|:---:|:---:|
30 | //! | 3.125 | 0.362 | 19.125 | 6.87e-06 |
31 | //! | 4.125 | 0.201 | 20.125 | 3.43e-06 |
32 | //! | 5.125 | 0.106 | 21.125 | 1.72e-06 |
33 | //! | 6.125 | 0.0547 | 22.125 | 8.58e-07 |
34 | //! | 7.125 | 0.0277 | 23.125 | 4.29e-07 |
35 | //! | 8.125 | 0.014 | 24.125 | 2.15e-07 |
36 | //! | 9.125 | 0.00701 | 25.125 | 1.07e-07 |
37 | //! | 10.125 | 0.00351 | 26.125 | 5.36e-08 |
38 | //! | 11.125 | 0.00176 | 27.125 | 2.68e-08 |
39 | //! | 12.125 | 0.000879 | 28.125 | 1.34e-08 |
40 | //! | 13.125 | 0.000439 | 29.125 | 6.71e-09 |
41 | //! | 14.125 | 0.00022 | 30.125 | 3.35e-09 |
42 | //! | 15.125 | 0.00011 | 31.125 | 1.68e-09 |
43 | //! | 16.125 | 5.49e-05 | 32.125 | 8.38e-10 |
44 | //! | 17.125 | 2.75e-05 | .. | .. |
45 | //! | 18.125 | 1.37e-05 | .. | .. |
46 | //!
47 | //! ### Legacy x86_64 CPUs support
48 | //!
49 | //! The implementation assumes the `popcnt` instruction (equivalent to `integer.count_ones()`) is present
50 | //! when compiling for x86_64 targets. This is theoretically not guaranteed as the instruction is only
51 | //! available on AMD/Intel CPUs released after 2007/2008. If that's not the case, the Filter constructor will panic.
52 | //!
53 | //! Support for such legacy x86_64 CPUs can be optionally enabled with the `legacy_x86_64_support` feature,
54 | //! which incurs a ~10% performance penalty.
55 | #![cfg_attr(docsrs, feature(doc_auto_cfg))]
56 | 
57 | use std::{
58 |     cmp::Ordering,
59 |     hash::{Hash, Hasher},
60 |     num::{NonZeroU64, NonZeroU8},
61 |     ops::{RangeBounds, RangeFrom},
62 | };
63 | 
64 | #[cfg(feature = "jsonschema")]
65 | use schemars::JsonSchema;
66 | #[cfg(feature = "serde")]
67 | use serde::{Deserialize, Serialize};
68 | use stable_hasher::StableHasher;
69 | 
70 | mod stable_hasher;
71 | 
72 | /// Approximate Membership Query Filter (AMQ-Filter) based on the Rank Select Quotient Filter (RSQF).
73 | ///
74 | /// This data structure is similar to a hash table that stores fingerprints in a very compact way.
75 | /// Fingerprints are similar to hash values, but are possibly truncated.
76 | /// The reason for false positives is that multiple items can map to the same fingerprint.
77 | /// For more information see the [quotient filter Wikipedia page](https://en.wikipedia.org/wiki/Quotient_filter)
78 | /// that describes a similar but less optimized version of the data structure.
79 | /// The actual implementation is based on the [Rank Select Quotient Filter (RSQF)](https://dl.acm.org/doi/pdf/10.1145/3035918.3035963).
80 | ///
81 | /// The public API also exposes a fingerprint API, which can be used to succinctly store u64
82 | /// hash values.
83 | #[derive(Clone)]
84 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
85 | #[cfg_attr(feature = "jsonschema", derive(JsonSchema))]
86 | pub struct Filter {
87 |     #[cfg_attr(
88 |         feature = "serde",
89 |         serde(
90 |             rename = "b",
91 |             serialize_with = "serde_bytes::serialize",
92 |             deserialize_with = "serde_bytes::deserialize"
93 |         )
94 |     )]
95 |     buffer: Box<[u8]>,
96 |     #[cfg_attr(feature = "serde", serde(rename = "l"))]
97 |     len: u64,
98 |     #[cfg_attr(feature = "serde", serde(rename = "q"))]
99 |     qbits: NonZeroU8,
100 |     #[cfg_attr(feature = "serde", serde(rename = "r"))]
101 |     rbits: NonZeroU8,
102 |     #[cfg_attr(
103 |         feature = "serde",
104 |         serde(rename = "g", skip_serializing_if = "Option::is_none", default)
105 |     )]
106 |     max_qbits: Option<NonZeroU8>,
107 | }
108 | 
109 | #[derive(Debug)]
110 | #[non_exhaustive]
111 | pub enum Error {
112 |     /// The filter cannot fit another fingerprint
113 |     CapacityExceeded,
114 |     /// The fingerprint sizes are not compatible
115 |     IncompatibleFingerprintSize,
116 |     /// The specified filter cannot be constructed with 64 bit hashes
117 |     NotEnoughFingerprintBits,
118 |     /// Capacity is too large. Filter::MAX_CAPACITY = 2^59 * 19 / 20.
119 |     CapacityTooLarge,
120 | }
121 | 
122 | impl std::fmt::Display for Error {
123 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
124 |         write!(f, "{self:?}")
125 |     }
126 | }
127 | 
128 | impl std::error::Error for Error {}
129 | 
130 | #[derive(Debug)]
131 | struct Block {
132 |     offset: u64,
133 |     occupieds: u64,
134 |     runends: u64,
135 | }
136 | 
137 | trait BitExt {
138 |     fn is_bit_set(&self, i: usize) -> bool;
139 |     fn set_bit(&mut self, i: usize);
140 |     fn clear_bit(&mut self, i: usize);
141 |     fn shift_right(&self, bits: usize, b: &Self, b_start: usize, b_end: usize) -> Self;
142 |     fn shift_left(&self, bits: usize, b: &Self, b_start: usize, b_end: usize) -> Self;
143 |     /// Number of set bits (1s) in the range
144 |     fn popcnt(&self, range: impl RangeBounds<u64>) -> u64;
145 |     /// Index of nth set bits in the range
146 |     fn select(&self, range: RangeFrom<u64>, n: u64) -> Option<u64>;
147 | 
148 |     #[inline]
149 |     fn update_bit(&mut self, i: usize, value: bool) {
150 |         if value {
151 |             self.set_bit(i)
152 |         } else {
153 |             self.clear_bit(i)
154 |         }
155 |     }
156 | }
157 | 
158 | impl BitExt for u64 {
159 |     #[inline]
160 |     fn is_bit_set(&self, i: usize) -> bool {
161 |         (*self & (1 << i)) != 0
162 |     }
163 | 
164 |     #[inline]
165 |     fn set_bit(&mut self, i: usize) {
166 |         *self |= 1 << i
167 |     }
168 | 
169 |     #[inline]
170 |     fn clear_bit(&mut self, i: usize) {
171 |         *self &= !(1 << i)
172 |     }
173 | 
174 |     #[inline]
175 |     fn shift_right(&self, bits: usize, b: &Self, b_start: usize, b_end: usize) -> Self {
176 |         let bitmask = |n| !u64::MAX.checked_shl(n).unwrap_or(0);
177 |         let a_component = *self >> (64 - bits); // select the highest `bits` from A to become lowest
178 |         let b_shifted_mask = bitmask((b_end - b_start) as u32) << b_start;
179 |         let b_shifted = ((b_shifted_mask & b) << bits) & b_shifted_mask;
180 |         let b_mask = !b_shifted_mask;
181 | 
182 |         a_component | b_shifted | (b & b_mask)
183 |     }
184 | 
185 |     #[inline]
186 |     fn shift_left(&self, bits: usize, b: &Self, b_start: usize, b_end: usize) -> Self {
187 |         let bitmask = |n| !u64::MAX.checked_shl(n).unwrap_or(0);
188 |         let a_component = *self << (64 - bits); // select the lowest `bits` from A to become highest
189 |         let b_shifted_mask = bitmask((b_end - b_start) as u32) << b_start;
190 |         let b_shifted = ((b_shifted_mask & b) >> bits) & b_shifted_mask;
191 |         let b_mask = !b_shifted_mask;
192 | 
193 |         a_component | b_shifted | (b & b_mask)
194 |     }
195 | 
196 |     #[inline]
197 |     fn popcnt(&self, range: impl RangeBounds<u64>) -> u64 {
198 |         let mut v = match range.start_bound() {
199 |             std::ops::Bound::Included(&i) => *self >> i << i,
200 |             std::ops::Bound::Excluded(&i) => *self >> (i + 1) << (i + 1),
201 |             _ => *self,
202 |         };
203 |         v = match range.end_bound() {
204 |             std::ops::Bound::Included(&i) if i < 63 => v & ((2 << i) - 1),
205 |             std::ops::Bound::Excluded(&i) if i <= 63 => v & ((1 << i) - 1),
206 |             _ => v,
207 |         };
208 | 
209 |         #[cfg(all(
210 |             target_arch = "x86_64",
211 |             not(feature = "legacy_x86_64_support"),
212 |             not(target_feature = "popcnt")
213 |         ))]
214 |         let result = unsafe {
215 |             // Using intrinsics introduces a function call, and the resulting code
216 |             // ends up slower than the inline assembly below.
217 |             // Any calls to is_x86_feature_detected also significantly affect performance.
218 |             // Given this is available on all x64 CPUs starting in 2008, we assume it's present
219 |             // (unless legacy_x86_64_support is set) and panic elsewhere otherwise.
220 |             let popcnt;
221 |             std::arch::asm!(
222 |                 "popcnt {popcnt}, {v}",
223 |                 v = in(reg) v,
224 |                 popcnt = out(reg) popcnt,
225 |                 options(pure, nomem, nostack)
226 |             );
227 |             popcnt
228 |         };
229 |         #[cfg(any(
230 |             not(target_arch = "x86_64"),
231 |             feature = "legacy_x86_64_support",
232 |             target_feature = "popcnt"
233 |         ))]
234 |         let result = v.count_ones() as u64;
235 | 
236 |         result
237 |     }
238 | 
239 |     #[inline]
240 |     fn select(&self, range: RangeFrom<u64>, n: u64) -> Option<u64> {
241 |         debug_assert!(range.start < 64);
242 |         let v = *self >> range.start << range.start;
243 | 
244 |         #[cfg_attr(target_arch = "x86_64", cold)]
245 |         #[cfg_attr(not(target_arch = "x86_64"), inline)]
246 |         fn fallback(mut v: u64, n: u64) -> Option<u64> {
247 |             for _ in 0..n / 8 {
248 |                 for _ in 0..8 {
249 |                     v &= v.wrapping_sub(1); // remove the least significant bit
250 |                 }
251 |             }
252 |             for _ in 0..n % 8 {
253 |                 v &= v.wrapping_sub(1); // remove the least significant bit
254 |             }
255 | 
256 |             if v == 0 {
257 |                 None
258 |             } else {
259 |                 Some(v.trailing_zeros() as u64)
260 |             }
261 |         }
262 | 
263 |         #[cfg(target_arch = "x86_64")]
264 |         let result = {
265 |             // TODO: AMD CPUs up to Zen2 have slow BMI implementations
266 |             if std::is_x86_feature_detected!("bmi2") {
267 |                 // This is the equivalent intrinsics version of the inline assembly below.
268 |                 // #[target_feature(enable = "bmi1")]
269 |                 // #[target_feature(enable = "bmi2")]
270 |                 // #[inline]
271 |                 // unsafe fn select_bmi2(x: u64, k: u64) -> Option<u64> {
272 |                 //     use std::arch::x86_64::{_pdep_u64, _tzcnt_u64};
273 |                 //     let result = _tzcnt_u64(_pdep_u64(1 << k, x));
274 |                 //     if result != 64 {
275 |                 //         Some(result)
276 |                 //     } else {
277 |                 //         None
278 |                 //     }
279 |                 // }
280 |                 // unsafe { select_bmi2(v, n) }
281 | 
282 |                 let result: u64;
283 |                 unsafe {
284 |                     std::arch::asm!(
285 |                         "mov {tmp}, 1",
286 |                         "shlx {tmp}, {tmp}, {n}",
287 |                         "pdep {tmp}, {tmp}, {v}",
288 |                         "tzcnt {tmp}, {tmp}",
289 |                         n = in(reg) n,
290 |                         v = in(reg) v,
291 |                         tmp = out(reg) result,
292 |                         options(pure, nomem, nostack)
293 |                     );
294 |                 }
295 |                 if result != 64 {
296 |                     Some(result)
297 |                 } else {
298 |                     None
299 |                 }
300 |             } else {
301 |                 fallback(v, n)
302 |             }
303 |         };
304 |         #[cfg(not(target_arch = "x86_64"))]
305 |         let result = fallback(v, n);
306 | 
307 |         result
308 |     }
309 | }
310 | 
311 | trait CastNonZeroU8 {
312 |     fn u64(&self) -> u64;
313 |     fn usize(&self) -> usize;
314 | }
315 | 
316 | impl CastNonZeroU8 for NonZeroU8 {
317 |     #[inline]
318 |     fn u64(&self) -> u64 {
319 |         self.get() as u64
320 |     }
321 | 
322 |     #[inline]
323 |     fn usize(&self) -> usize {
324 |         self.get() as usize
325 |     }
326 | }
327 | 
328 | /// An iterator over the fingerprints of a `Filter`.
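///
/// A usage sketch, obtaining the iterator via [`Filter::fingerprints`]:
///
/// ```rust
/// let mut f = qfilter::Filter::new(100, 0.01).unwrap();
/// f.insert(1u64).unwrap();
/// f.insert(2u64).unwrap();
/// // Yields the stored (truncated) fingerprints, one per admitted fingerprint.
/// let prints: Vec<u64> = f.fingerprints().collect();
/// assert_eq!(prints.len(), f.len() as usize);
/// ```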
329 | pub struct FingerprintIter<'a> {
330 |     filter: &'a Filter,
331 |     q_bucket_idx: u64,
332 |     r_bucket_idx: u64,
333 |     remaining: u64,
334 | }
335 | 
336 | impl<'a> FingerprintIter<'a> {
337 |     fn new(filter: &'a Filter) -> Self {
338 |         let mut iter = FingerprintIter {
339 |             filter,
340 |             q_bucket_idx: 0,
341 |             r_bucket_idx: 0,
342 |             remaining: filter.len,
343 |         };
344 |         if !filter.is_empty() {
345 |             while !filter.is_occupied(iter.q_bucket_idx) {
346 |                 iter.q_bucket_idx += 1;
347 |             }
348 |             iter.r_bucket_idx = filter.run_start(iter.q_bucket_idx);
349 |         }
350 |         iter
351 |     }
352 | }
353 | 
354 | impl Iterator for FingerprintIter<'_> {
355 |     type Item = u64;
356 | 
357 |     fn next(&mut self) -> Option<Self::Item> {
358 |         if let Some(r) = self.remaining.checked_sub(1) {
359 |             self.remaining = r;
360 |         } else {
361 |             return None;
362 |         }
363 |         let hash = (self.q_bucket_idx << self.filter.rbits.get())
364 |             | self.filter.get_remainder(self.r_bucket_idx);
365 | 
366 |         if self.filter.is_runend(self.r_bucket_idx) {
367 |             self.q_bucket_idx += 1;
368 |             while !self.filter.is_occupied(self.q_bucket_idx) {
369 |                 self.q_bucket_idx += 1;
370 |             }
371 |             self.r_bucket_idx = (self.r_bucket_idx + 1).max(self.q_bucket_idx);
372 |         } else {
373 |             self.r_bucket_idx += 1;
374 |         }
375 | 
376 |         Some(hash)
377 |     }
378 | }
379 | 
380 | impl Filter {
381 |     /// Maximum log2 number of slots that can be used in the filter.
382 |     /// Effectively, the largest power of 2 that can be multiplied by 19 without overflowing u64.
383 |     const MAX_QBITS: u8 = 59;
384 | 
385 |     /// Maximum number of items that can be stored in the filter: ceil(2^59 * 19 / 20)
386 |     pub const MAX_CAPACITY: u64 = (2u64.pow(Self::MAX_QBITS as u32) * 19).div_ceil(20);
387 | 
388 |     /// Creates a new filter that can hold at least `capacity` items
389 |     /// with a desired error rate of `fp_rate` (clamped to (0, 0.5]).
390 |     ///
391 |     /// Errors if capacity is too large or if the specified filter isn't achievable using 64 bit hashes.
392 |     #[inline]
393 |     pub fn new(capacity: u64, fp_rate: f64) -> Result<Self, Error> {
394 |         Self::new_resizeable(capacity, capacity, fp_rate)
395 |     }
396 | 
397 |     /// Calculates the number of slots needed to fit the desired fingerprints with 95% occupation.
398 |     /// Returns the number of slots needed rounded to the next power of two, but always >= 64.
399 |     fn calculate_needed_slots(desired: u64) -> Result<u64, Error> {
400 |         let mut slots = desired
401 |             .checked_next_power_of_two()
402 |             .ok_or(Error::CapacityTooLarge)?
403 |             .max(64);
404 |         loop {
405 |             let capacity = slots
406 |                 .checked_mul(19)
407 |                 .ok_or(Error::CapacityTooLarge)?
408 |                 .div_ceil(20);
409 |             if capacity >= desired {
410 |                 return Ok(slots);
411 |             }
412 |             slots = slots.checked_mul(2).ok_or(Error::CapacityTooLarge)?;
413 |         }
414 |     }
415 | 
416 |     /// Creates a new filter that can hold at least `initial_capacity` items initially
417 |     /// and can resize to hold at least `max_capacity` when fully grown.
418 |     /// The desired error rate `fp_rate` (clamped to (0, 0.5]) applies to the fully grown filter.
419 |     ///
420 |     /// This works by storing fingerprints large enough to satisfy the maximum requirements,
421 |     /// so smaller filters will actually have lower error rates, which will increase
422 |     /// (up to `fp_rate`) as the filter grows. In practice every time the filter doubles in
423 |     /// capacity its error rate also doubles.
424 |     ///
425 |     /// Errors if max_capacity is too large or if the specified filter isn't achievable using 64 bit hashes.
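    ///
    /// A usage sketch:
    ///
    /// ```rust
    /// let mut f = qfilter::Filter::new_resizeable(100, 10_000, 0.01).unwrap();
    /// let initial = f.capacity();
    /// // Inserts beyond the current capacity trigger growth automatically.
    /// for i in 0..1_000u64 {
    ///     f.insert(i).unwrap();
    /// }
    /// assert!(f.capacity() > initial);
    /// assert!(f.contains(500u64));
    /// ```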
426 |     pub fn new_resizeable(
427 |         initial_capacity: u64,
428 |         max_capacity: u64,
429 |         fp_rate: f64,
430 |     ) -> Result<Self, Error> {
431 |         assert!(max_capacity >= initial_capacity);
432 |         let slots_for_capacity = Self::calculate_needed_slots(initial_capacity)?;
433 |         let qbits = slots_for_capacity.trailing_zeros() as u8;
434 |         let slots_for_max_capacity = Self::calculate_needed_slots(max_capacity)?;
435 |         let max_qbits = slots_for_max_capacity.trailing_zeros() as u8;
436 |         let fp_rate = fp_rate.clamp(f64::MIN_POSITIVE, 0.5);
437 |         let rbits = (-fp_rate.log2()).round().max(1.0) as u8 + (max_qbits - qbits);
438 |         let mut result = Self::with_qr(qbits.try_into().unwrap(), rbits.try_into().unwrap())?;
439 |         if max_qbits > qbits {
440 |             result.max_qbits = Some(max_qbits.try_into().unwrap());
441 |         }
442 |         Ok(result)
443 |     }
444 | 
445 |     /// Creates a new resizeable filter that can hold at least `initial_capacity` items initially while
446 |     /// utilizing a fingerprint bit size of `fingerprint_bits` (7..=64). Normally this function is only
447 |     /// useful if the filter is being used to manually store fingerprints.
448 |     pub fn with_fingerprint_size(
449 |         initial_capacity: u64,
450 |         fingerprint_bits: u8,
451 |     ) -> Result<Self, Error> {
452 |         if !(7..=64).contains(&fingerprint_bits) {
453 |             return Err(Error::NotEnoughFingerprintBits);
454 |         }
455 |         let slots_for_capacity = Self::calculate_needed_slots(initial_capacity)?;
456 |         let qbits = slots_for_capacity.trailing_zeros() as u8;
457 |         if fingerprint_bits <= qbits {
458 |             return Err(Error::NotEnoughFingerprintBits);
459 |         }
460 |         let rbits = fingerprint_bits - qbits;
461 |         let mut result = Self::with_qr(qbits.try_into().unwrap(), rbits.try_into().unwrap())?;
462 |         if rbits > 1 {
463 |             result.max_qbits = Some((qbits + rbits - 1).min(Self::MAX_QBITS).try_into().unwrap());
464 |         }
465 |         Ok(result)
466 |     }
467 | 
468 |     fn with_qr(qbits: NonZeroU8, rbits: NonZeroU8) -> Result<Self, Error> {
469 |         Self::check_cpu_support();
470 |         if qbits.get() + rbits.get() > 64 {
471 |             return Err(Error::NotEnoughFingerprintBits);
472 |         }
473 |         let num_slots = 1 << qbits.get();
474 |         let num_blocks = num_slots / 64;
475 |         assert_ne!(num_blocks, 0);
476 |         let block_bytes_size = 1 + 16 + 64 * rbits.u64() / 8;
477 |         let buffer_bytes = num_blocks * block_bytes_size;
478 |         let buffer = vec![0u8; buffer_bytes.try_into().unwrap()].into_boxed_slice();
479 |         Ok(Self {
480 |             buffer,
481 |             qbits,
482 |             rbits,
483 |             len: 0,
484 |             max_qbits: None,
485 |         })
486 |     }
487 | 
488 |     fn check_cpu_support() {
489 |         #[cfg(all(
490 |             target_arch = "x86_64",
491 |             not(feature = "legacy_x86_64_support"),
492 |             not(target_feature = "popcnt")
493 |         ))]
494 |         assert!(
495 |             std::is_x86_feature_detected!("popcnt"),
496 |             "CPU doesn't support the popcnt instruction"
497 |         );
498 |     }
499 | 
500 |     /// The internal fingerprint size in bits.
501 |     #[inline]
502 |     pub fn fingerprint_size(&self) -> u8 {
503 |         self.qbits.get() + self.rbits.get()
504 |     }
505 | 
506 |     /// Whether the filter is empty.
507 |     #[inline]
508 |     pub fn is_empty(&self) -> bool {
509 |         self.len == 0
510 |     }
511 | 
512 |     /// Current number of fingerprints admitted to the filter.
513 |     #[inline]
514 |     pub fn len(&self) -> u64 {
515 |         self.len
516 |     }
517 | 
518 |     /// Current memory usage in bytes.
519 |     #[inline]
520 |     pub fn memory_usage(&self) -> usize {
521 |         self.buffer.len()
522 |     }
523 | 
524 |     /// Resets/Clears the filter.
525 |     pub fn clear(&mut self) {
526 |         self.buffer.fill(0);
527 |         self.len = 0;
528 |     }
529 | 
530 |     /// Maximum filter capacity.
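    ///
    /// For a resizeable filter this is the capacity once fully grown; a small sketch:
    ///
    /// ```rust
    /// let f = qfilter::Filter::new_resizeable(100, 10_000, 0.01).unwrap();
    /// assert!(f.capacity_resizeable() >= 10_000);
    /// assert!(f.capacity() < f.capacity_resizeable());
    /// ```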
531 | #[inline] 532 | pub fn capacity_resizeable(&self) -> u64 { 533 | // Overflow is not possible here as it'd have overflowed in the constructor. 534 | ((1u64 << self.max_qbits.unwrap_or(self.qbits).get()) * 19).div_ceil(20) 535 | } 536 | 537 | /// Current filter capacity. 538 | #[inline] 539 | pub fn capacity(&self) -> u64 { 540 | if cfg!(fuzzing) { 541 | // 100% occupancy is not realistic but stresses the algorithm much more. 542 | // To generate real counter examples this "pessimisation" must be removed. 543 | self.total_buckets().get() 544 | } else { 545 | // Up to 95% occupancy 546 | // 19/20 == 0.95 547 | // Overflow is not possible here as it'd have overflowed in the constructor. 548 | (self.total_buckets().get() * 19).div_ceil(20) 549 | } 550 | } 551 | 552 | /// Max error ratio when at the resizeable capacity (len == resizeable_capacity). 553 | pub fn max_error_ratio_resizeable(&self) -> f64 { 554 | let extra_rbits = self.max_qbits.unwrap_or(self.qbits).get() - self.qbits.get(); 555 | 2f64.powi(-((self.rbits.get() - extra_rbits) as i32)) 556 | } 557 | 558 | /// Max error ratio when at full capacity (len == capacity). 559 | pub fn max_error_ratio(&self) -> f64 { 560 | 2f64.powi(-(self.rbits.get() as i32)) 561 | } 562 | 563 | /// Current error ratio at the current occupancy. 564 | pub fn current_error_ratio(&self) -> f64 { 565 | let occupancy = self.len as f64 / self.total_buckets().get() as f64; 566 | 1.0 - std::f64::consts::E.powf(-occupancy / 2f64.powi(self.rbits.get() as i32)) 567 | } 568 | 569 | #[inline] 570 | fn block_byte_size(&self) -> usize { 571 | 1 + 8 + 8 + 64 * self.rbits.usize() / 8 572 | } 573 | 574 | #[inline] 575 | fn set_block_runends(&mut self, block_num: u64, runends: u64) { 576 | let block_num = block_num % self.total_blocks(); 577 | let block_start = block_num as usize * self.block_byte_size(); 578 | let block_bytes: &mut [u8; 1 + 8 + 8] = (&mut self.buffer[block_start..][..1 + 8 + 8]) 579 | .try_into() 580 | .unwrap(); 581 | block_bytes[1 + 8..1 + 8 + 8].copy_from_slice(&runends.to_le_bytes()); 582 | } 583 | 584 | #[inline] 585 | fn raw_block(&self, block_num: u64) -> Block { 586 | let block_num = block_num % self.total_blocks(); 587 | let block_start = block_num as usize * self.block_byte_size(); 588 | let block_bytes: &[u8; 1 + 8 + 8] = 589 | &self.buffer[block_start..][..1 + 8 + 8].try_into().unwrap(); 590 | Block { 591 | offset: block_bytes[0] as u64, 592 | occupieds: u64::from_le_bytes(block_bytes[1..1 + 8].try_into().unwrap()), 593 | runends: u64::from_le_bytes(block_bytes[1 + 8..1 + 8 + 8].try_into().unwrap()), 594 | } 595 | } 596 | 597 | #[inline] 598 | fn block(&self, block_num: u64) -> Block { 599 | let block_num = block_num % self.total_blocks(); 600 | let block_start = block_num as usize * self.block_byte_size(); 601 | let block_bytes: &[u8; 1 + 8 + 8] = 602 | &self.buffer[block_start..][..1 + 8 + 8].try_into().unwrap(); 603 | let offset = { 604 | if block_bytes[0] < u8::MAX { 605 | block_bytes[0] as u64 606 | } else { 607 | self.calc_offset(block_num) 608 | } 609 | }; 610 | Block { 611 | offset, 612 | occupieds: u64::from_le_bytes(block_bytes[1..1 + 8].try_into().unwrap()), 613 | runends: u64::from_le_bytes(block_bytes[1 + 8..1 + 8 + 8].try_into().unwrap()), 614 | } 615 | } 616 | 617 | #[inline] 618 | fn adjust_block_offset(&mut self, block_num: u64, inc: bool) { 619 | let block_num = block_num % self.total_blocks(); 620 | let block_start = block_num as usize * self.block_byte_size(); 621 | let offset = &mut self.buffer[block_start]; 622 | if 
inc {
623 |             *offset = offset.saturating_add(1);
624 |         } else if *offset != u8::MAX {
625 |             *offset -= 1;
626 |         } else {
627 |             self.buffer[block_start] = self.calc_offset(block_num).try_into().unwrap_or(u8::MAX);
628 |         }
629 |     }
630 | 
631 |     #[inline]
632 |     fn inc_offsets(&mut self, start_bucket: u64, end_bucket: u64) {
633 |         let original_block = start_bucket / 64;
634 |         let mut last_affected_block = end_bucket / 64;
635 |         if end_bucket < start_bucket {
636 |             last_affected_block += self.total_blocks().get();
637 |         }
638 |         for b in original_block + 1..=last_affected_block {
639 |             self.adjust_block_offset(b, true);
640 |         }
641 |     }
642 | 
643 |     #[inline]
644 |     fn dec_offsets(&mut self, start_bucket: u64, end_bucket: u64) {
645 |         let original_block = start_bucket / 64;
646 |         let mut last_affected_block = end_bucket / 64;
647 |         if end_bucket < start_bucket {
648 |             last_affected_block += self.total_blocks().get();
649 |         }
650 | 
651 |         // As an edge case we may decrement the offsets of 2+ blocks and the block B' offset
652 |         // may be saturated and depend on a previous Block B" with a non saturated offset.
653 |         // But B" offset may also(!) be affected by the decrement operation, so we must
654 |         // decrement B" offset first, before the remaining offsets.
655 |         if last_affected_block - original_block >= 2
656 |             && self.raw_block(original_block + 1).offset >= u8::MAX as u64
657 |         {
658 |             // last affected block offset is always <= 64 (BLOCK SIZE)
659 |             // otherwise the decrement operation would be affecting a subsequent block
660 |             debug_assert!(self.raw_block(last_affected_block).offset <= 64);
661 |             self.adjust_block_offset(last_affected_block, false);
662 |             last_affected_block -= 1;
663 |         }
664 |         for b in original_block + 1..=last_affected_block {
665 |             self.adjust_block_offset(b, false);
666 |         }
667 | 
668 |         #[cfg(fuzzing)]
669 |         self.validate_offsets(original_block, last_affected_block);
670 |     }
671 | 
672 |     #[cfg(any(fuzzing, test))]
673 |     fn validate_offsets(&mut self, original_block: u64, last_affected_block: u64) {
674 |         for b in original_block..=last_affected_block {
675 |             let raw_offset = self.raw_block(b).offset;
676 |             let offset = self.calc_offset(b);
677 |             debug_assert!(
678 |                 (raw_offset >= u8::MAX as u64 && offset >= u8::MAX as u64)
679 |                     || (offset == raw_offset),
680 |                 "block {} offset {} calc {}",
681 |                 b,
682 |                 raw_offset,
683 |                 offset,
684 |             );
685 |         }
686 |     }
687 | 
688 |     #[inline(always)]
689 |     fn is_occupied(&self, hash_bucket_idx: u64) -> bool {
690 |         let hash_bucket_idx = hash_bucket_idx % self.total_buckets();
691 |         let block_start = (hash_bucket_idx / 64) as usize * self.block_byte_size();
692 |         let occupieds = u64::from_le_bytes(self.buffer[block_start + 1..][..8].try_into().unwrap());
693 |         occupieds.is_bit_set((hash_bucket_idx % 64) as usize)
694 |     }
695 | 
696 |     #[inline(always)]
697 |     fn set_occupied(&mut self, hash_bucket_idx: u64, value: bool) {
698 |         let hash_bucket_idx = hash_bucket_idx % self.total_buckets();
699 |         let block_start = (hash_bucket_idx / 64) as usize * self.block_byte_size();
700 |         let mut occupieds =
701 |             u64::from_le_bytes(self.buffer[block_start + 1..][..8].try_into().unwrap());
702 |         occupieds.update_bit((hash_bucket_idx % 64) as usize, value);
703 |         self.buffer[block_start + 1..][..8].copy_from_slice(&occupieds.to_le_bytes());
704 |     }
705 | 
706 |     #[inline(always)]
707 |     fn is_runend(&self, hash_bucket_idx: u64) -> bool {
708 |         let hash_bucket_idx = hash_bucket_idx % self.total_buckets();
709 |         let block_start = (hash_bucket_idx / 64) as usize * 
self.block_byte_size(); 710 | let runends = 711 | u64::from_le_bytes(self.buffer[block_start + 1 + 8..][..8].try_into().unwrap()); 712 | runends.is_bit_set((hash_bucket_idx % 64) as usize) 713 | } 714 | 715 | #[inline(always)] 716 | fn set_runend(&mut self, hash_bucket_idx: u64, value: bool) { 717 | let hash_bucket_idx = hash_bucket_idx % self.total_buckets(); 718 | let block_start = (hash_bucket_idx / 64) as usize * self.block_byte_size(); 719 | let mut runends = 720 | u64::from_le_bytes(self.buffer[block_start + 1 + 8..][..8].try_into().unwrap()); 721 | runends.update_bit((hash_bucket_idx % 64) as usize, value); 722 | self.buffer[block_start + 1 + 8..][..8].copy_from_slice(&runends.to_le_bytes()); 723 | } 724 | 725 | #[inline(always)] 726 | fn get_remainder(&self, hash_bucket_idx: u64) -> u64 { 727 | debug_assert!(self.rbits.get() > 0 && self.rbits.get() < 64); 728 | let hash_bucket_idx = hash_bucket_idx % self.total_buckets(); 729 | let remainders_start = (hash_bucket_idx / 64) as usize * self.block_byte_size() + 1 + 8 + 8; 730 | let start_bit_idx = self.rbits.usize() * (hash_bucket_idx % 64) as usize; 731 | let end_bit_idx = start_bit_idx + self.rbits.usize(); 732 | let start_u64 = start_bit_idx / 64; 733 | let num_rem_parts = 1 + (end_bit_idx > (start_u64 + 1) * 64) as usize; 734 | let rem_parts_bytes = &self.buffer[remainders_start + start_u64 * 8..][..num_rem_parts * 8]; 735 | let extra_low = start_bit_idx - start_u64 * 64; 736 | let extra_high = ((start_u64 + 1) * 64).saturating_sub(end_bit_idx); 737 | let rem_part = u64::from_le_bytes(rem_parts_bytes[..8].try_into().unwrap()); 738 | // zero high bits & truncate low bits 739 | let mut remainder = (rem_part << extra_high) >> (extra_high + extra_low); 740 | if let Some(rem_part) = rem_parts_bytes.get(8..16) { 741 | let remaining_bits = end_bit_idx - (start_u64 + 1) * 64; 742 | let rem_part = u64::from_le_bytes(rem_part.try_into().unwrap()); 743 | remainder |= 744 | (rem_part & !(u64::MAX << remaining_bits)) << (self.rbits.usize() - remaining_bits); 745 | } 746 | debug_assert!(remainder.leading_zeros() >= 64 - self.rbits.get() as u32); 747 | remainder 748 | } 749 | 750 | #[inline(always)] 751 | fn set_remainder(&mut self, hash_bucket_idx: u64, remainder: u64) { 752 | debug_assert!(self.rbits.get() > 0 && self.rbits.get() < 64); 753 | debug_assert!(remainder.leading_zeros() >= 64 - self.rbits.get() as u32); 754 | let hash_bucket_idx = hash_bucket_idx % self.total_buckets(); 755 | let remainders_start = (hash_bucket_idx / 64) as usize * self.block_byte_size() + 1 + 8 + 8; 756 | let start_bit_idx = self.rbits.usize() * (hash_bucket_idx % 64) as usize; 757 | let end_bit_idx = start_bit_idx + self.rbits.usize(); 758 | let start_u64 = start_bit_idx / 64; 759 | let num_rem_parts = 1 + (end_bit_idx > (start_u64 + 1) * 64) as usize; 760 | let rem_parts_bytes = 761 | &mut self.buffer[remainders_start + start_u64 * 8..][..num_rem_parts * 8]; 762 | let mut rem_part = u64::from_le_bytes(rem_parts_bytes[..8].try_into().unwrap()); 763 | let extra_low = start_bit_idx - start_u64 * 64; 764 | let extra_high = ((start_u64 + 1) * 64).saturating_sub(end_bit_idx); 765 | // zero region we'll copy remainder bits in 766 | rem_part &= !((u64::MAX << extra_low) & (u64::MAX >> extra_high)); 767 | let low_bits_to_copy = 64 - extra_high - extra_low; 768 | rem_part |= (remainder & !(u64::MAX << low_bits_to_copy)) << extra_low; 769 | rem_parts_bytes[..8].copy_from_slice(&rem_part.to_le_bytes()); 770 | if rem_parts_bytes.len() < 16 { 771 | return; 772 | } 773 | 774 | 
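        // Note: the remainder straddles two u64 words here. The low bits were written
        // into the first word above; the lines below write the remaining high bits
        // into the second word.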
let remaining_bits = end_bit_idx - (start_u64 + 1) * 64; 775 | rem_part = u64::from_le_bytes(rem_parts_bytes[8..16].try_into().unwrap()); 776 | // zero region we'll copy remainder bits in 777 | rem_part &= u64::MAX << remaining_bits; 778 | rem_part |= remainder >> (self.rbits.usize() - remaining_bits); 779 | rem_parts_bytes[8..16].copy_from_slice(&rem_part.to_le_bytes()); 780 | } 781 | 782 | #[inline] 783 | fn get_rem_u64(&self, rem_u64: u64) -> u64 { 784 | let rbits = NonZeroU64::from(self.rbits); 785 | let bucket_block_idx = (rem_u64 / rbits) % self.total_blocks(); 786 | let bucket_rem_u64 = (rem_u64 % rbits) as usize; 787 | let bucket_rem_start = (bucket_block_idx as usize * self.block_byte_size()) + 1 + 8 + 8; 788 | u64::from_le_bytes( 789 | self.buffer[bucket_rem_start + bucket_rem_u64 * 8..][..8] 790 | .try_into() 791 | .unwrap(), 792 | ) 793 | } 794 | 795 | #[inline] 796 | fn set_rem_u64(&mut self, rem_u64: u64, rem: u64) { 797 | let rbits = NonZeroU64::from(self.rbits); 798 | let bucket_block_idx = (rem_u64 / rbits) % self.total_blocks(); 799 | let bucket_rem_u64 = (rem_u64 % rbits) as usize; 800 | let bucket_rem_start = (bucket_block_idx as usize * self.block_byte_size()) + 1 + 8 + 8; 801 | self.buffer[bucket_rem_start + bucket_rem_u64 * 8..][..8] 802 | .copy_from_slice(&rem.to_le_bytes()); 803 | } 804 | 805 | fn shift_remainders_by_1(&mut self, start: u64, end_inc: u64) { 806 | let end = if end_inc < start { 807 | end_inc + self.total_buckets().get() + 1 808 | } else { 809 | end_inc + 1 810 | }; 811 | let mut end_u64 = end * self.rbits.u64() / 64; 812 | let mut bend = (end * self.rbits.u64() % 64) as usize; 813 | let start_u64 = start * self.rbits.u64() / 64; 814 | let bstart = (start * self.rbits.u64() % 64) as usize; 815 | while end_u64 != start_u64 { 816 | let prev_rem_u64 = self.get_rem_u64(end_u64 - 1); 817 | let mut rem_u64 = self.get_rem_u64(end_u64); 818 | rem_u64 = prev_rem_u64.shift_right(self.rbits.usize(), &rem_u64, 0, bend); 819 | self.set_rem_u64(end_u64, rem_u64); 820 | end_u64 -= 1; 821 | bend = 64; 822 | } 823 | let mut rem_u64 = self.get_rem_u64(start_u64); 824 | rem_u64 = 0u64.shift_right(self.rbits.usize(), &rem_u64, bstart, bend); 825 | self.set_rem_u64(start_u64, rem_u64); 826 | } 827 | 828 | fn shift_remainders_back_by_1(&mut self, start: u64, end_inc: u64) { 829 | let end = if end_inc < start { 830 | end_inc + self.total_buckets().get() + 1 831 | } else { 832 | end_inc + 1 833 | }; 834 | let end_u64 = end * self.rbits.u64() / 64; 835 | let bend = (end * self.rbits.u64() % 64) as usize; 836 | let mut start_u64 = start * self.rbits.u64() / 64; 837 | let mut bstart = (start * self.rbits.u64() % 64) as usize; 838 | while end_u64 != start_u64 { 839 | let next_rem_u64 = self.get_rem_u64(start_u64 + 1); 840 | let mut rem_u64 = self.get_rem_u64(start_u64); 841 | rem_u64 = next_rem_u64.shift_left(self.rbits.usize(), &rem_u64, bstart, 64); 842 | self.set_rem_u64(start_u64, rem_u64); 843 | start_u64 += 1; 844 | bstart = 0; 845 | } 846 | let mut rem_u64 = self.get_rem_u64(end_u64); 847 | rem_u64 = 0u64.shift_left(self.rbits.usize(), &rem_u64, bstart, bend); 848 | self.set_rem_u64(end_u64, rem_u64); 849 | } 850 | 851 | fn shift_runends_by_1(&mut self, start: u64, end_inc: u64) { 852 | let end = if end_inc < start { 853 | end_inc + self.total_buckets().get() + 1 854 | } else { 855 | end_inc + 1 856 | }; 857 | let mut end_block = end / 64; 858 | let mut bend = (end % 64) as usize; 859 | let start_block = start / 64; 860 | let bstart = (start % 64) as usize; 861 | while 
end_block != start_block {
862 |             let prev_block_runends = self.raw_block(end_block - 1).runends;
863 |             let mut block_runends = self.raw_block(end_block).runends;
864 |             block_runends = prev_block_runends.shift_right(1, &block_runends, 0, bend);
865 |             self.set_block_runends(end_block, block_runends);
866 |             end_block -= 1;
867 |             bend = 64;
868 |         }
869 |         let mut block_runends = self.raw_block(start_block).runends;
870 |         block_runends = 0u64.shift_right(1, &block_runends, bstart, bend);
871 |         self.set_block_runends(start_block, block_runends);
872 |     }
873 | 
874 |     fn shift_runends_back_by_1(&mut self, start: u64, end_inc: u64) {
875 |         let end = if end_inc < start {
876 |             end_inc + self.total_buckets().get() + 1
877 |         } else {
878 |             end_inc + 1
879 |         };
880 |         let end_block = end / 64;
881 |         let bend = (end % 64) as usize;
882 |         let mut start_block = start / 64;
883 |         let mut bstart = (start % 64) as usize;
884 |         while start_block != end_block {
885 |             let next_block_runends = self.raw_block(start_block + 1).runends;
886 |             let mut block_runends = self.raw_block(start_block).runends;
887 |             block_runends = next_block_runends.shift_left(1, &block_runends, bstart, 64);
888 |             self.set_block_runends(start_block, block_runends);
889 |             start_block += 1;
890 |             bstart = 0;
891 |         }
892 |         let mut block_runends = self.raw_block(end_block).runends;
893 |         block_runends = 0u64.shift_left(1, &block_runends, bstart, bend);
894 |         self.set_block_runends(end_block, block_runends);
895 |     }
896 | 
897 |     #[cold]
898 |     #[inline(never)]
899 |     fn calc_offset(&self, block_num: u64) -> u64 {
900 |         // The block offset can be calculated as the difference between its position and runstart.
901 |         let block_start = (block_num * 64) % self.total_buckets();
902 |         let mut run_start = self.run_start(block_start);
903 |         if run_start < block_start {
904 |             run_start += self.total_buckets().get();
905 |         }
906 |         run_start - block_start
907 |     }
908 | 
909 |     /// Start idx of the run (inclusive)
910 |     #[inline]
911 |     fn run_start(&self, hash_bucket_idx: u64) -> u64 {
912 |         // runstart is equivalent to the runend of the previous bucket + 1.
913 |         let prev_bucket = hash_bucket_idx.wrapping_sub(1) % self.total_buckets();
914 |         (self.run_end(prev_bucket) + 1) % self.total_buckets()
915 |     }
916 | 
917 |     /// End idx of the run (inclusive).
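    ///
    /// An illustrative sketch (not actual data) of the rank+select search:
    ///
    /// ```text
    /// buckets:    0  1  2  3  4  5 ...
    /// occupieds:  0  1  0  1  0  0 ...   (buckets 1 and 3 have runs)
    /// runends:    0  0  1  0  0  1 ...   (their runs end at slots 2 and 5)
    /// ```
    ///
    /// Bucket 1 is the 1st occupied bucket (rank 0), so its run ends at the 1st runend
    /// bit (slot 2); bucket 3 is the 2nd, ending at the 2nd runend bit (slot 5).
    /// The search below ranks the occupied bits up to `hash_bucket_idx` and then
    /// selects the matching runend bit, scanning forward block by block.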
918 |     fn run_end(&self, hash_bucket_idx: u64) -> u64 {
919 |         let hash_bucket_idx = hash_bucket_idx % self.total_buckets();
920 |         let bucket_block_idx = hash_bucket_idx / 64;
921 |         let bucket_intrablock_offset = hash_bucket_idx % 64;
922 |         let bucket_block = self.block(bucket_block_idx);
923 |         let bucket_intrablock_rank = bucket_block.occupieds.popcnt(..=bucket_intrablock_offset);
924 |         // No occupied buckets all the way to bucket_intrablock_offset
925 |         // which also means hash_bucket_idx isn't occupied
926 |         if bucket_intrablock_rank == 0 {
927 |             return if bucket_block.offset <= bucket_intrablock_offset {
928 |                 // hash_bucket_idx points to an empty bucket unaffected by block offset,
929 |                 // thus end == start
930 |                 hash_bucket_idx
931 |             } else {
932 |                 // hash_bucket_idx falls within the section occupied by the offset,
933 |                 // thus end == last bucket of offset section
934 |                 (bucket_block_idx * 64 + bucket_block.offset - 1) % self.total_buckets()
935 |             };
936 |         }
937 | 
938 |         // Must search runends to figure out the end of the run
939 |         let mut runend_block_idx = bucket_block_idx + bucket_block.offset / 64;
940 |         let mut runend_ignore_bits = bucket_block.offset % 64;
941 |         let mut runend_block = self.raw_block(runend_block_idx);
942 |         // Try to find the runend for the bucket in this block.
943 |         // We're looking for the runend_rank'th bit set (0 based)
944 |         let mut runend_rank = bucket_intrablock_rank - 1;
945 |         let mut runend_block_offset = runend_block
946 |             .runends
947 |             .select(runend_ignore_bits.., runend_rank);
948 | 
949 |         if let Some(runend_block_offset) = runend_block_offset {
950 |             let runend_idx = runend_block_idx * 64 + runend_block_offset;
951 |             return runend_idx.max(hash_bucket_idx) % self.total_buckets();
952 |         }
953 |         // There were not enough runend bits set, keep looking...
954 |         loop {
955 |             // subtract any runend bits found
956 |             runend_rank -= runend_block.runends.popcnt(runend_ignore_bits..);
957 |             // move to the next block
958 |             runend_block_idx += 1;
959 |             runend_ignore_bits = 0;
960 |             runend_block = self.raw_block(runend_block_idx);
961 |             runend_block_offset = runend_block
962 |                 .runends
963 |                 .select(runend_ignore_bits.., runend_rank);
964 | 
965 |             if let Some(runend_block_offset) = runend_block_offset {
966 |                 let runend_idx = runend_block_idx * 64 + runend_block_offset;
967 |                 return runend_idx.max(hash_bucket_idx) % self.total_buckets();
968 |             }
969 |         }
970 |     }
971 | 
972 |     /// Returns whether item is present (probabilistically) in the filter.
973 |     pub fn contains<T: Hash>(&self, item: T) -> bool {
974 |         self.contains_fingerprint(self.hash(item))
975 |     }
976 | 
977 |     /// Returns whether the fingerprint is present (probabilistically) in the filter.
978 |     pub fn contains_fingerprint(&self, hash: u64) -> bool {
979 |         let (hash_bucket_idx, hash_remainder) = self.calc_qr(hash);
980 |         if !self.is_occupied(hash_bucket_idx) {
981 |             return false;
982 |         }
983 |         let mut runstart_idx = self.run_start(hash_bucket_idx);
984 |         loop {
985 |             if hash_remainder == self.get_remainder(runstart_idx) {
986 |                 return true;
987 |             }
988 |             if self.is_runend(runstart_idx) {
989 |                 return false;
990 |             }
991 |             runstart_idx += 1;
992 |         }
993 |     }
994 | 
995 |     /// Returns the number of times the item appears (probabilistically) in the filter.
996 |     pub fn count<T: Hash>(&mut self, item: T) -> u64 {
997 |         self.count_fingerprint(self.hash(item))
998 |     }
999 | 
1000 |     /// Returns the number of times the fingerprint appears (probabilistically) in the filter.
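    ///
    /// A counting sketch:
    ///
    /// ```rust
    /// let mut f = qfilter::Filter::new(100, 0.01).unwrap();
    /// f.insert_fingerprint(true, 42).unwrap();
    /// f.insert_fingerprint(true, 42).unwrap();
    /// // Collisions can only inflate the estimate, so it's always >= the true count.
    /// assert!(f.count_fingerprint(42) >= 2);
    /// ```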
1001 |     pub fn count_fingerprint(&mut self, hash: u64) -> u64 {
1002 |         let (hash_bucket_idx, hash_remainder) = self.calc_qr(hash);
1003 |         if !self.is_occupied(hash_bucket_idx) {
1004 |             return 0;
1005 |         }
1006 | 
1007 |         let mut count = 0u64;
1008 |         let mut runstart_idx = self.run_start(hash_bucket_idx);
1009 |         loop {
1010 |             if hash_remainder == self.get_remainder(runstart_idx) {
1011 |                 count += 1;
1012 |             }
1013 |             if self.is_runend(runstart_idx) {
1014 |                 return count;
1015 |             }
1016 |             runstart_idx += 1;
1017 |         }
1018 |     }
1019 | 
1020 |     #[inline]
1021 |     fn offset_lower_bound(&self, hash_bucket_idx: u64) -> u64 {
1022 |         let bucket_block_idx = hash_bucket_idx / 64;
1023 |         let bucket_intrablock_offset = hash_bucket_idx % 64;
1024 |         let bucket_block = self.raw_block(bucket_block_idx);
1025 |         let num_occupied = bucket_block.occupieds.popcnt(..=bucket_intrablock_offset);
1026 |         if bucket_block.offset <= bucket_intrablock_offset {
1027 |             num_occupied
1028 |                 - bucket_block
1029 |                     .runends
1030 |                     .popcnt(bucket_block.offset..bucket_intrablock_offset)
1031 |         } else {
1032 |             bucket_block.offset + num_occupied - bucket_intrablock_offset
1033 |         }
1034 |     }
1035 | 
1036 |     fn find_first_empty_slot(&self, mut hash_bucket_idx: u64) -> u64 {
1037 |         loop {
1038 |             let olb = self.offset_lower_bound(hash_bucket_idx);
1039 |             if olb == 0 {
1040 |                 return hash_bucket_idx % self.total_buckets();
1041 |             }
1042 |             hash_bucket_idx += olb;
1043 |         }
1044 |     }
1045 | 
1046 |     fn find_first_not_shifted_slot(&self, mut hash_bucket_idx: u64) -> u64 {
1047 |         loop {
1048 |             let run_end = self.run_end(hash_bucket_idx);
1049 |             if run_end == hash_bucket_idx {
1050 |                 return hash_bucket_idx;
1051 |             }
1052 |             hash_bucket_idx = run_end;
1053 |         }
1054 |     }
1055 | 
1056 |     /// Removes `item` from the filter.
1057 |     /// Returns whether item was actually found and removed.
1058 |     ///
1059 |     /// Note that removing an item that wasn't previously added to the filter
1060 |     /// may introduce **false negatives**. This is because it could be removing
1061 |     /// fingerprints from a colliding item!
1062 |     pub fn remove<T: Hash>(&mut self, item: T) -> bool {
1063 |         self.remove_fingerprint(self.hash(item))
1064 |     }
1065 | 
1066 |     /// Removes the fingerprint specified by `hash` from the filter.
1067 |     /// Returns whether a fingerprint was actually found and removed.
1068 |     ///
1069 |     /// Note that removing a fingerprint that wasn't previously added to the filter
1070 |     /// may introduce false negatives. This is because it could be removing
1071 |     /// fingerprints from a colliding hash!
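    ///
    /// A removal sketch using the fingerprint API:
    ///
    /// ```rust
    /// let mut f = qfilter::Filter::new(100, 0.01).unwrap();
    /// let hash = 0xABCDu64;
    /// f.insert_fingerprint(false, hash).unwrap();
    /// assert!(f.remove_fingerprint(hash));
    /// // A second removal finds nothing left to remove.
    /// assert!(!f.remove_fingerprint(hash));
    /// ```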
1072 | pub fn remove_fingerprint(&mut self, hash: u64) -> bool { 1073 | let (hash_bucket_idx, hash_remainder) = self.calc_qr(hash); 1074 | if !self.is_occupied(hash_bucket_idx) { 1075 | return false; 1076 | } 1077 | let mut run_start = self.run_start(hash_bucket_idx); 1078 | // adjust run_start so we can have 1079 | // hash_bucket_idx <= run_start <= found_idx <= run_end 1080 | if run_start < hash_bucket_idx { 1081 | run_start += self.total_buckets().get(); 1082 | } 1083 | let mut run_end = run_start; 1084 | let mut found_idx = None; 1085 | let found_idx = loop { 1086 | if hash_remainder == self.get_remainder(run_end) { 1087 | found_idx = Some(run_end); 1088 | } 1089 | if self.is_runend(run_end) { 1090 | if let Some(i) = found_idx { 1091 | break i; 1092 | } else { 1093 | return false; 1094 | }; 1095 | } 1096 | run_end += 1; 1097 | }; 1098 | 1099 | let mut last_bucket_shifted_run_end = run_end; 1100 | if last_bucket_shifted_run_end != hash_bucket_idx { 1101 | last_bucket_shifted_run_end = self.find_first_not_shifted_slot(run_end); 1102 | if last_bucket_shifted_run_end < run_end { 1103 | last_bucket_shifted_run_end += self.total_buckets().get(); 1104 | } 1105 | } 1106 | 1107 | // run_end points to the end of the run (inclusive) which contains the target remainder (found_idx). 1108 | // If we had a single remainder in the run, the run is no more. 1109 | if run_end == run_start { 1110 | self.set_occupied(hash_bucket_idx, false); 1111 | } else { 1112 | // More than one remainder in the run. 1113 | // If the removed remainder is the last one in the run, 1114 | // the one before it becomes the new runend. 1115 | if found_idx == run_end { 1116 | self.set_runend(run_end - 1, true); 1117 | } 1118 | } 1119 | if found_idx != last_bucket_shifted_run_end { 1120 | self.set_remainder(found_idx, 0); 1121 | self.shift_remainders_back_by_1(found_idx, last_bucket_shifted_run_end); 1122 | self.shift_runends_back_by_1(found_idx, last_bucket_shifted_run_end); 1123 | } 1124 | self.set_runend(last_bucket_shifted_run_end, false); 1125 | self.set_remainder(last_bucket_shifted_run_end, 0); 1126 | self.dec_offsets(hash_bucket_idx, last_bucket_shifted_run_end); 1127 | self.len -= 1; 1128 | true 1129 | } 1130 | 1131 | /// Inserts `item` in the filter, even if it already appears to be in the filter. 1132 | /// This works by inserting a possibly duplicated fingerprint in the filter. 1133 | /// 1134 | /// This function should be used when the filter is also subject to removals 1135 | /// and the item is known to not have been added to the filter before (or was removed). 1136 | /// 1137 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 1138 | #[inline] 1139 | pub fn insert_duplicated<T: Hash>(&mut self, item: T) -> Result<(), Error> { 1140 | self.insert_counting(u64::MAX, item).map(|_| ()) 1141 | } 1142 | 1143 | /// Inserts `item` in the filter if it's not already present (probabilistically). 1144 | /// Note that membership is probabilistic, so this function may return false positives 1145 | /// but never false negatives. 1146 | /// 1147 | /// Returns `Ok(true)` if the item was successfully added to the filter. 1148 | /// Returns `Ok(false)` if the item is already contained (probabilistically) in the filter. 1149 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 
1150 | #[inline] 1151 | pub fn insert<T: Hash>(&mut self, item: T) -> Result<bool, Error> { 1152 | self.insert_counting(1, item).map(|count| count == 0) 1153 | } 1154 | 1155 | /// Inserts `item` in the filter, even if it already appears to be in the filter. 1156 | /// This works by inserting a possibly duplicated fingerprint in the filter. 1157 | /// The argument `max_count` specifies how many duplicates can be inserted. 1158 | /// 1159 | /// Returns `Ok(count)` of how many equal fingerprints _were_ in the filter. So if the item 1160 | /// was already in the filter `C` times, another insertion was performed only if `C < max_count`. 1161 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 1162 | pub fn insert_counting<T: Hash>(&mut self, max_count: u64, item: T) -> Result<u64, Error> { 1163 | let hash = self.hash(item); 1164 | match self.insert_impl(max_count, hash) { 1165 | Ok(count) => Ok(count), 1166 | Err(_) => { 1167 | self.grow_if_possible()?; 1168 | self.insert_impl(max_count, hash) 1169 | } 1170 | } 1171 | } 1172 | 1173 | /// Inserts the fingerprint specified by `hash` in the filter. 1174 | /// `duplicate` specifies if the fingerprint should be added even if it's already in the filter. 1175 | /// 1176 | /// Note that this function will automatically grow the filter if needed. 1177 | /// The implementation uses the first [`Self::fingerprint_size`] bits of `hash` to place the fingerprint in the appropriate slot. 1178 | /// The remaining bits are ignored and will be returned as 0 if the fingerprint is retrieved via [`Self::fingerprints`]. 1179 | /// 1180 | /// Returns `Ok(true)` if the item was successfully added to the filter. 1181 | /// Returns `Ok(false)` if the item is already contained (probabilistically) in the filter. Only possible if `duplicate` is `false`. 1182 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 1183 | #[inline] 1184 | pub fn insert_fingerprint(&mut self, duplicate: bool, hash: u64) -> Result<bool, Error> { 1185 | let max_count = if duplicate { u64::MAX } else { 1 }; 1186 | self.insert_fingerprint_counting(max_count, hash) 1187 | .map(|count| count < max_count) 1188 | } 1189 | 1190 | /// Inserts the fingerprint specified by `hash` in the filter. 1191 | /// `max_count` specifies how many occurrences of the fingerprint can be added to the filter. 1192 | /// 1193 | /// Note that this function will automatically grow the filter if needed. 1194 | /// The implementation uses the first [`Self::fingerprint_size`] bits of `hash` to place the fingerprint in the appropriate slot. 1195 | /// The remaining bits are ignored and will be returned as 0 if the fingerprint is retrieved via [`Self::fingerprints`]. 1196 | /// 1197 | /// Returns `Ok(count)` of how many equal fingerprints _were_ in the filter. So if the item 1198 | /// was already in the filter `C` times, another insertion was performed only if `C < max_count`. 1199 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 1200 | pub fn insert_fingerprint_counting(&mut self, max_count: u64, hash: u64) -> Result<u64, Error> { 1201 | match self.insert_impl(max_count, hash) { 1202 | Ok(count) => Ok(count), 1203 | Err(_) => { 1204 | self.grow_if_possible()?; 1205 | self.insert_impl(max_count, hash) 1206 | } 1207 | } 1208 | } 1209 | 1210 | /// Inserts the fingerprint specified by `hash` in the filter. 1211 | /// `max_count` specifies how many occurrences of the fingerprint can be added to the filter. 1212 | /// It's up to the caller to grow the filter if needed and retry the insert. 
1213 | /// 1214 | /// Returns `Ok(count)` of how many equal fingerprints _were_ in the filter. 1215 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 1216 | fn insert_impl(&mut self, max_count: u64, hash: u64) -> Result<u64, Error> { 1217 | enum Operation { 1218 | NewRun, 1219 | BeforeRunend, 1220 | NewRunend, 1221 | } 1222 | 1223 | let (hash_bucket_idx, hash_remainder) = self.calc_qr(hash); 1224 | if self.offset_lower_bound(hash_bucket_idx) == 0 { 1225 | if self.len >= self.capacity() { 1226 | return Err(Error::CapacityExceeded); 1227 | } 1228 | debug_assert!(!self.is_occupied(hash_bucket_idx)); 1229 | debug_assert!(!self.is_runend(hash_bucket_idx)); 1230 | self.set_occupied(hash_bucket_idx, true); 1231 | self.set_runend(hash_bucket_idx, true); 1232 | self.set_remainder(hash_bucket_idx, hash_remainder); 1233 | self.len += 1; 1234 | return Ok(0); 1235 | } 1236 | 1237 | let mut runstart_idx = self.run_start(hash_bucket_idx); 1238 | let mut runend_idx = self.run_end(hash_bucket_idx); 1239 | let mut fingerprint_count = 0; 1240 | let insert_idx; 1241 | let operation; 1242 | if self.is_occupied(hash_bucket_idx) { 1243 | // adjust runend so it's >= runstart even if it wrapped around 1244 | if runend_idx < runstart_idx { 1245 | runend_idx += self.total_buckets().get(); 1246 | } 1247 | while runstart_idx <= runend_idx { 1248 | match self.get_remainder(runstart_idx).cmp(&hash_remainder) { 1249 | Ordering::Equal => { 1250 | fingerprint_count += 1; 1251 | if fingerprint_count >= max_count { 1252 | return Ok(fingerprint_count); 1253 | } 1254 | } 1255 | Ordering::Greater => break, 1256 | Ordering::Less => (), 1257 | } 1258 | 1259 | runstart_idx += 1; 1260 | } 1261 | 1262 | if runstart_idx > runend_idx { 1263 | /* the new remainder is >= any remainder in the run */ 1264 | operation = Operation::NewRunend; 1265 | insert_idx = runstart_idx % self.total_buckets(); 1266 | } else { 1267 | /* there are larger remainders already in the run */ 1268 | operation = Operation::BeforeRunend; /* insert before the current runend */ 1269 | insert_idx = runstart_idx % self.total_buckets(); 1270 | } 1271 | } else { 1272 | insert_idx = (runend_idx + 1) % self.total_buckets(); 1273 | operation = Operation::NewRun; /* insert into empty bucket */ 1274 | } 1275 | 1276 | if self.len >= self.capacity() { 1277 | return Err(Error::CapacityExceeded); 1278 | } 1279 | let empty_slot_idx = self.find_first_empty_slot(runend_idx + 1); 1280 | if insert_idx != empty_slot_idx { 1281 | self.shift_remainders_by_1(insert_idx, empty_slot_idx); 1282 | self.shift_runends_by_1(insert_idx, empty_slot_idx); 1283 | } 1284 | self.set_remainder(insert_idx, hash_remainder); 1285 | match operation { 1286 | Operation::NewRun => { 1287 | /* insert into empty bucket */ 1288 | self.set_runend(insert_idx, true); 1289 | self.set_occupied(hash_bucket_idx, true); 1290 | } 1291 | Operation::NewRunend => { 1292 | /* the new remainder is >= any remainder in the run */ 1293 | self.set_runend(insert_idx.wrapping_sub(1) % self.total_buckets(), false); 1294 | self.set_runend(insert_idx, true); 1295 | } 1296 | Operation::BeforeRunend => { /* there are larger remainders already in the run */ } 1297 | } 1298 | 1299 | self.inc_offsets(hash_bucket_idx, empty_slot_idx); 1300 | self.len += 1; 1301 | Ok(fingerprint_count) 1302 | } 1303 | 1304 | /// Returns an iterator over the fingerprints stored in the filter. 1305 | /// 1306 | /// Fingerprints will be returned in ascending order. 
1307 | pub fn fingerprints(&self) -> FingerprintIter { 1308 | FingerprintIter::new(self) 1309 | } 1310 | 1311 | /// Shrinks the capacity of the filter as much as possible while preserving 1312 | /// the false positive ratios and fingerprint size. 1313 | pub fn shrink_to_fit(&mut self) { 1314 | if self.total_blocks().get() > 1 && self.len() <= self.capacity() / 2 { 1315 | let mut new = Self::with_qr( 1316 | (self.qbits.get() - 1).try_into().unwrap(), 1317 | (self.rbits.get() + 1).try_into().unwrap(), 1318 | ) 1319 | .unwrap(); 1320 | new.max_qbits = self.max_qbits; 1321 | for hash in self.fingerprints() { 1322 | let _ = new.insert_fingerprint(true, hash); 1323 | } 1324 | debug_assert_eq!(new.len, self.len); 1325 | debug_assert_eq!(new.fingerprint_size(), self.fingerprint_size()); 1326 | *self = new; 1327 | } 1328 | } 1329 | 1330 | /// Merges `other` filter into `self`. 1331 | /// 1332 | /// `keep_duplicates` specifies whether duplicated fingerprints should be stored; 1333 | /// this is normally only useful if the filter is being used for counting. 1334 | /// 1335 | /// Note that the `other` filter must have a fingerprint size >= `self`'s fingerprint size, 1336 | /// otherwise the function will fail with `Err(Error::IncompatibleFingerprintSize)`. 1337 | /// This is the case for filters created with the same parameters or if the `other` 1338 | /// filter has a lower target false positive ratio. 1339 | /// 1340 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot merge all items. 1341 | /// Note that in this case items could have already been added and the filter is left 1342 | /// full but in an otherwise valid state. 1343 | pub fn merge(&mut self, keep_duplicates: bool, other: &Self) -> Result<(), Error> { 1344 | if other.fingerprint_size() < self.fingerprint_size() { 1345 | return Err(Error::IncompatibleFingerprintSize); 1346 | } 1347 | let max_count = if keep_duplicates { u64::MAX } else { 1 }; 1348 | for hash in other.fingerprints() { 1349 | self.insert_impl(max_count, hash)?; 1350 | } 1351 | Ok(()) 1352 | } 1353 | 1354 | #[inline] 1355 | fn grow_if_possible(&mut self) -> Result<(), Error> { 1356 | if let Some(m) = self.max_qbits { 1357 | if m > self.qbits { 1358 | self.grow(); 1359 | return Ok(()); 1360 | } 1361 | } 1362 | Err(Error::CapacityExceeded) 1363 | } 1364 | 1365 | #[cold] 1366 | #[inline(never)] 1367 | fn grow(&mut self) { 1368 | let qbits = self.qbits.checked_add(1).unwrap(); 1369 | let rbits = NonZeroU8::new(self.rbits.get() - 1).unwrap(); 1370 | let mut new = Self::with_qr(qbits, rbits).unwrap(); 1371 | new.max_qbits = self.max_qbits; 1372 | for hash in self.fingerprints() { 1373 | new.insert_fingerprint(true, hash).unwrap(); 1374 | } 1375 | assert_eq!(self.len, new.len); 1376 | *self = new; 1377 | } 1378 | 1379 | #[inline] 1380 | fn hash<T: Hash>(&self, item: T) -> u64 { 1381 | let mut hasher = StableHasher::new(); 1382 | item.hash(&mut hasher); 1383 | hasher.finish() 1384 | } 1385 | 1386 | #[inline] 1387 | fn calc_qr(&self, hash: u64) -> (u64, u64) { 1388 | let hash_bucket_idx = (hash >> self.rbits.get()) & ((1 << self.qbits.get()) - 1); 1389 | let remainder = hash & ((1 << self.rbits.get()) - 1); 1390 | (hash_bucket_idx, remainder) 1391 | } 1392 | 1393 | #[inline] 1394 | fn total_blocks(&self) -> NonZeroU64 { 1395 | // The way this is calculated ensures the compiler sees that the result is both != 0 and a power of 2, 1396 | // both of which allow the optimizer to generate much faster division/remainder code. 
1397 | #[cfg(any(debug_assertions, fuzzing))] 1398 | { 1399 | NonZeroU64::new((1u64 << self.qbits.get()) / 64).unwrap() 1400 | } 1401 | #[cfg(not(any(debug_assertions, fuzzing)))] 1402 | { 1403 | // Safety: All filters have at least 1 block (each with 64 slots) 1404 | unsafe { NonZeroU64::new_unchecked((1u64 << self.qbits.get()) / 64) } 1405 | } 1406 | } 1407 | 1408 | #[inline] 1409 | fn total_buckets(&self) -> NonZeroU64 { 1410 | NonZeroU64::new(1 << self.qbits.get()).unwrap() 1411 | } 1412 | 1413 | #[doc(hidden)] 1414 | #[cfg(any(fuzzing, test))] 1415 | pub fn printout(&self) { 1416 | eprintln!( 1417 | "=== q {} r {} len {} cap {} ===", 1418 | self.qbits, 1419 | self.rbits, 1420 | self.len(), 1421 | self.capacity() 1422 | ); 1423 | for b in 0..self.total_blocks().get() { 1424 | let block = self.raw_block(b); 1425 | eprintln!( 1426 | "block {} offset {:?}\noccup {:064b}\nrunen {:064b}", 1427 | b, block.offset, block.occupieds, block.runends 1428 | ); 1429 | eprintln!( 1430 | " 3210987654321098765432109876543210987654321098765432109876543210 {}", 1431 | b * 64 1432 | ); 1433 | eprint!("rem "); 1434 | for i in (0..64).rev() { 1435 | let r = self.get_remainder(b * 64 + i); 1436 | eprint!("{}", r % 100 / 10); 1437 | } 1438 | eprint!("\nrem "); 1439 | for i in (0..64).rev() { 1440 | let r = self.get_remainder(b * 64 + i); 1441 | eprint!("{}", r % 10); 1442 | } 1443 | eprintln!(); // keep all printout output on stderr 1444 | } 1445 | eprintln!("==="); 1446 | } 1447 | } 1448 | 1449 | impl std::fmt::Debug for Filter { 1450 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 1451 | f.debug_struct("Filter") 1452 | .field("buffer", &"[..]") 1453 | .field("len", &self.len) 1454 | .field("qbits", &self.qbits) 1455 | .field("rbits", &self.rbits) 1456 | .field("max_qbits", &self.max_qbits) 1457 | .finish() 1458 | } 1459 | } 1460 | 1461 | #[cfg(test)] 1462 | mod tests { 1463 | use super::*; 1464 | 1465 | #[test] 1466 | fn run_end_simple() { 1467 | let mut f = Filter::new(50, 0.01).unwrap(); 1468 | f.set_occupied(5, true); 1469 | f.set_runend(5, true); 1470 | assert_eq!(f.run_end(4), 4); 1471 | assert_eq!(f.run_end(5), 5); 1472 | assert_eq!(f.run_end(6), 6); 1473 | 1474 | f.set_occupied(6, true); 1475 | f.set_runend(6, true); 1476 | assert_eq!(f.run_end(4), 4); 1477 | assert_eq!(f.run_end(5), 5); 1478 | assert_eq!(f.run_end(6), 6); 1479 | 1480 | f.set_runend(6, false); 1481 | f.set_runend(7, true); 1482 | assert_eq!(f.run_end(4), 4); 1483 | assert_eq!(f.run_end(5), 5); 1484 | assert_eq!(f.run_end(6), 7); 1485 | 1486 | f.set_runend(7, false); 1487 | f.set_runend(8, true); 1488 | assert_eq!(f.run_end(4), 4); 1489 | assert_eq!(f.run_end(5), 5); 1490 | assert_eq!(f.run_end(6), 8); 1491 | 1492 | f.set_occupied(10, true); 1493 | f.set_runend(12, true); 1494 | f.set_occupied(12, true); 1495 | f.set_runend(13, true); 1496 | assert_eq!(f.run_end(10), 12); 1497 | assert_eq!(f.run_end(12), 13); 1498 | 1499 | f.set_occupied(11, true); 1500 | f.set_runend(14, true); 1501 | assert_eq!(f.run_end(10), 12); 1502 | assert_eq!(f.run_end(11), 13); 1503 | assert_eq!(f.run_end(12), 14); 1504 | } 1505 | 1506 | #[test] 1507 | fn run_end_eob() { 1508 | let mut f = Filter::new(50, 0.01).unwrap(); 1509 | assert_eq!(f.total_buckets().get(), 64); 1510 | f.set_occupied(63, true); 1511 | f.set_runend(63, true); 1512 | assert_eq!(f.run_end(62), 62); 1513 | assert_eq!(f.run_end(63), 63); 1514 | assert_eq!(f.find_first_empty_slot(62), 62); 1515 | assert_eq!(f.find_first_empty_slot(63), 0); 1516 | } 1517 | 1518 | #[test] 1519 | fn run_end_crossing() { 
1520 | let mut f = Filter::new(50, 0.01).unwrap(); 1521 | f.set_occupied(0, true); 1522 | f.set_runend(0, true); 1523 | f.set_occupied(63, true); 1524 | f.set_runend(63, true); 1525 | assert_eq!(f.run_end(0), 0); 1526 | assert_eq!(f.run_end(1), 1); 1527 | assert_eq!(f.run_end(62), 62); 1528 | assert_eq!(f.run_end(63), 63); 1529 | 1530 | f.set_runend(63, false); 1531 | f.set_runend(1, true); 1532 | f.adjust_block_offset(1, true); 1533 | assert_eq!(f.run_end(0), 1); 1534 | assert_eq!(f.run_end(1), 1); 1535 | assert_eq!(f.run_end(62), 62); 1536 | assert_eq!(f.run_end(63), 0); 1537 | 1538 | f.set_runend(1, false); 1539 | f.set_runend(2, true); 1540 | assert_eq!(f.run_end(63), 0); 1541 | assert_eq!(f.run_end(0), 2); 1542 | assert_eq!(f.run_end(1), 2); 1543 | 1544 | f.set_runend(2, false); 1545 | f.set_runend(3, true); 1546 | assert_eq!(f.run_end(63), 0); 1547 | assert_eq!(f.run_end(1), 3); 1548 | assert_eq!(f.run_end(2), 3); 1549 | 1550 | f.set_occupied(65, true); 1551 | f.set_runend(68, true); 1552 | assert_eq!(f.run_end(63), 0); 1553 | assert_eq!(f.run_end(0), 3); 1554 | assert_eq!(f.run_end(1), 4); 1555 | } 1556 | 1557 | #[test] 1558 | fn test_insert_duplicated() { 1559 | for cap in [100, 200, 500, 1000] { 1560 | let mut f = Filter::new(cap, 0.01).unwrap(); 1561 | for i in 0..f.capacity() / 2 { 1562 | f.insert_duplicated(-1).unwrap(); 1563 | f.insert_duplicated(i).unwrap(); 1564 | assert!(f.count(-1) >= i); 1565 | assert!(f.count(i) >= 1); 1566 | } 1567 | } 1568 | } 1569 | 1570 | #[test] 1571 | fn test_insert_duplicated_two() { 1572 | for s in 0..10 { 1573 | for c in [200, 800, 1500] { 1574 | let mut f = Filter::new(c, 0.001).unwrap(); 1575 | for i in 0..f.capacity() / 2 { 1576 | f.insert_duplicated(-1).unwrap(); 1577 | assert_eq!(f.count(-1), i + 1); 1578 | assert_eq!(f.count(s), i); 1579 | f.insert_duplicated(s).unwrap(); 1580 | assert_eq!(f.count(-1), i + 1); 1581 | assert_eq!(f.count(s), i + 1); 1582 | } 1583 | } 1584 | } 1585 | } 1586 | 1587 | #[test] 1588 | fn test_insert_duplicated_one() { 1589 | for s in 0..10 { 1590 | for cap in [100, 200, 500, 1000] { 1591 | let mut f = Filter::new(cap, 0.01).unwrap(); 1592 | for i in 0..f.capacity() { 1593 | f.insert_duplicated(s).unwrap(); 1594 | assert!(f.count(s) > i); 1595 | } 1596 | assert_eq!(f.count(s), f.capacity()); 1597 | } 1598 | } 1599 | } 1600 | 1601 | #[test] 1602 | fn test_auto_resize_two() { 1603 | let mut f = Filter::new_resizeable(50, 1000, 0.01).unwrap(); 1604 | for _ in 0..50 { 1605 | f.insert_duplicated(0).unwrap(); 1606 | } 1607 | for _ in 0..3 { 1608 | f.insert_duplicated(1).unwrap(); 1609 | } 1610 | f.grow(); 1611 | f.grow(); 1612 | f.grow(); 1613 | assert_eq!(f.count(0), 50); 1614 | assert_eq!(f.count(1), 3); 1615 | } 1616 | 1617 | #[test] 1618 | fn test_new_resizeable() { 1619 | let mut f = Filter::new_resizeable(100, 100, 0.01).unwrap(); 1620 | assert!(f.grow_if_possible().is_err()); 1621 | let mut f = Filter::new_resizeable(0, 100, 0.01).unwrap(); 1622 | assert!(f.grow_if_possible().is_ok()); 1623 | } 1624 | 1625 | #[test] 1626 | #[should_panic] 1627 | fn test_new_capacity_overflow() { 1628 | Filter::new_resizeable(100, u64::MAX, 0.01).unwrap(); 1629 | } 1630 | 1631 | #[test] 1632 | #[should_panic] 1633 | fn test_new_hash_overflow() { 1634 | Filter::new_resizeable(100, u64::MAX / 20, 0.01).unwrap(); 1635 | } 1636 | 1637 | #[test] 1638 | fn test_auto_resize_one() { 1639 | let mut f = Filter::new_resizeable(100, 500, 0.01).unwrap(); 1640 | for i in 0u64.. 
{ 1641 | if f.insert_duplicated(i).is_err() { 1642 | assert_eq!(f.len(), i); 1643 | break; 1644 | } 1645 | } 1646 | assert!(f.len() >= 500); 1647 | for i in 0u64..f.len() { 1648 | assert!(f.contains(i), "{}", i); 1649 | } 1650 | } 1651 | 1652 | #[test] 1653 | fn test_remainders_and_shifts() { 1654 | let mut f = Filter::new(200, 0.01).unwrap(); 1655 | let c = f.capacity(); 1656 | for j in 0..c { 1657 | f.set_remainder(j, 0b1011101); 1658 | assert_eq!(f.get_remainder(j), 0b1011101); 1659 | f.set_runend(j, true); 1660 | assert!(f.is_runend(j)); 1661 | } 1662 | for j in 0..c { 1663 | f.set_remainder(j, 0b1111111); 1664 | assert_eq!(f.get_remainder(j), 0b1111111); 1665 | f.set_runend(j, false); 1666 | assert!(!f.is_runend(j)); 1667 | } 1668 | for j in 0..c { 1669 | f.set_remainder(j, 0b1101101); 1670 | assert_eq!(f.get_remainder(j), 0b1101101); 1671 | f.set_runend(j, true); 1672 | assert!(f.is_runend(j)); 1673 | } 1674 | f.shift_remainders_by_1(0, c); 1675 | f.shift_runends_by_1(0, c); 1676 | 1677 | for j in 1..=c { 1678 | assert_eq!(f.get_remainder(j), 0b1101101); 1679 | } 1680 | assert!(!f.is_runend(0)); 1681 | for j in 1..=c { 1682 | assert_eq!(f.get_remainder(j), 0b1101101); 1683 | assert!(f.is_runend(j)); 1684 | } 1685 | } 1686 | 1687 | #[test] 1688 | fn test_remove() { 1689 | for fp in [0.0001, 0.00001, 0.000001] { 1690 | for cap in [0, 100, 200, 400, 1000] { 1691 | let mut f = Filter::new(cap, fp).unwrap(); 1692 | dbg!(f.rbits, f.capacity()); 1693 | let c = f.capacity(); 1694 | for i in 0..c { 1695 | assert!(f.insert(i).unwrap()); 1696 | } 1697 | assert_eq!(f.len(), c); 1698 | for i in 0..c { 1699 | for j in 0..c { 1700 | assert_eq!(f.count(j), (j >= i) as u64, "{}", j); 1701 | } 1702 | // f.printout(); 1703 | assert!(f.remove(i)); 1704 | // f.printout(); 1705 | } 1706 | assert!(f.is_empty()); 1707 | } 1708 | } 1709 | } 1710 | #[test] 1711 | fn test_remove_dup_one() { 1712 | for s in 0..10 { 1713 | for cap in [0, 100, 200, 500, 1000] { 1714 | let mut f = Filter::new(cap, 0.0001).unwrap(); 1715 | let c = f.capacity(); 1716 | for _ in 0..c { 1717 | f.insert_duplicated(s).unwrap(); 1718 | } 1719 | assert_eq!(f.len(), c); 1720 | for i in 0..c { 1721 | assert_eq!(f.count(s), c - i); 1722 | assert!(f.remove(s)); 1723 | } 1724 | assert!(f.is_empty()); 1725 | } 1726 | } 1727 | } 1728 | #[test] 1729 | fn test_remove_dup_two() { 1730 | for s in 0..10 { 1731 | dbg!(s); 1732 | for cap in [100, 200, 500, 1000] { 1733 | let mut f = Filter::new(cap, 0.0001).unwrap(); 1734 | let c = f.capacity(); 1735 | for _ in 0..c / 2 { 1736 | f.insert_duplicated(-1).unwrap(); 1737 | f.insert_duplicated(s).unwrap(); 1738 | } 1739 | assert_eq!(f.count(-1), c / 2); 1740 | assert_eq!(f.count(s), c / 2); 1741 | for i in 0..c / 2 { 1742 | assert_eq!(f.count(-1), c / 2 - i); 1743 | assert_eq!(f.count(s), c / 2 - i); 1744 | assert!(f.remove(-1)); 1745 | assert_eq!(f.count(-1), c / 2 - i - 1); 1746 | assert_eq!(f.count(s), c / 2 - i); 1747 | assert!(f.remove(s)); 1748 | assert_eq!(f.count(-1), c / 2 - i - 1); 1749 | assert_eq!(f.count(s), c / 2 - i - 1); 1750 | } 1751 | assert!(f.is_empty()); 1752 | } 1753 | } 1754 | } 1755 | 1756 | #[test] 1757 | fn test_it_works() { 1758 | for fp_rate_arg in [0.01, 0.001, 0.0001] { 1759 | let mut f = Filter::new(100_000, fp_rate_arg).unwrap(); 1760 | assert!(!f.contains(0)); 1761 | assert_eq!(f.len(), 0); 1762 | for i in 0..f.capacity() { 1763 | f.insert_duplicated(i).unwrap(); 1764 | } 1765 | for i in 0..f.capacity() { 1766 | assert!(f.contains(i)); 1767 | } 1768 | let est_fp_rate = 1769 
| (f.capacity()..).take(50_000).filter(|i| f.contains(i)).count() as f64 / 50_000.0; // probe keys beyond the inserted 0..capacity range 1770 | dbg!(f.max_error_ratio(), est_fp_rate); 1771 | assert!(est_fp_rate <= f.max_error_ratio()); 1772 | } 1773 | } 1774 | 1775 | #[test] 1776 | fn test_with_fingerprint_size_resizes() { 1777 | let mut f = Filter::with_fingerprint_size(0, 8).unwrap(); 1778 | assert_eq!(f.fingerprint_size(), 8); 1779 | assert_eq!(f.capacity_resizeable(), (128u64 * 19).div_ceil(20)); 1780 | assert_eq!(f.capacity(), (64u64 * 19).div_ceil(20)); 1781 | for i in 0..f.capacity_resizeable() { 1782 | f.insert_fingerprint(false, i).unwrap(); 1783 | } 1784 | assert_eq!(f.len(), f.capacity_resizeable()); 1785 | assert!(f 1786 | .insert_fingerprint(false, f.capacity_resizeable()) 1787 | .is_err()); 1788 | } 1789 | 1790 | #[test] 1791 | fn test_with_fingerprint_size() { 1792 | let fingerprints = [ 1793 | 0u64, 1794 | 0, 1795 | 1, 1796 | 1, 1797 | 1, 1798 | 1, 1799 | 1, 1800 | 0x777777777777, 1801 | u32::MAX as u64 - 1, 1802 | u32::MAX as u64 - 1, 1803 | u32::MAX as u64, 1804 | u64::MAX - 1, 1805 | u64::MAX - 1, 1806 | u64::MAX, 1807 | u64::MAX, 1808 | ]; 1809 | for fip_size in [7, 16, 24, 31, 49, 64] { 1810 | let mut filter = Filter::with_fingerprint_size(1, fip_size).unwrap(); 1811 | for h in fingerprints { 1812 | filter.insert_fingerprint(true, h).unwrap(); 1813 | } 1814 | let out: Vec<u64> = filter.fingerprints().collect::<Vec<_>>(); 1815 | let mut expect = fingerprints.map(|h| h << (64 - fip_size) >> (64 - fip_size)); 1816 | expect.sort_unstable(); 1817 | assert_eq!(out, expect); 1818 | } 1819 | } 1820 | 1821 | #[test] 1822 | fn test_merge() { 1823 | fn test(mut f1: Filter, mut f2: Filter, mut f3: Filter) { 1824 | assert!(f1.merge(true, &f1.clone()).is_ok()); 1825 | assert!(f1.merge(true, &f2).is_ok()); 1826 | assert!(f1.merge(true, &f3).is_ok()); 1827 | assert!(f2.merge(true, &f1).is_err()); 1828 | assert!(f2.merge(true, &f2.clone()).is_ok()); 1829 | assert!(f2.merge(true, &f3).is_ok()); 1830 | assert!(f3.merge(true, &f1).is_err()); 1831 | assert!(f3.merge(true, &f2).is_err()); 1832 | assert!(f3.merge(true, &f3.clone()).is_ok()); 1833 | 1834 | f1.insert_fingerprint(true, 1).unwrap(); 1835 | f2.insert_fingerprint(true, 1).unwrap(); 1836 | f2.insert_fingerprint(true, 2).unwrap(); 1837 | f3.insert_fingerprint(true, 1).unwrap(); 1838 | f3.insert_fingerprint(true, 2).unwrap(); 1839 | f3.insert_fingerprint(true, 3).unwrap(); 1840 | assert_eq!(f1.len(), 1); 1841 | assert_eq!(f2.len(), 2); 1842 | assert_eq!(f3.len(), 3); 1843 | 1844 | f1.merge(false, &f1.clone()).unwrap(); 1845 | assert_eq!(f1.len(), 1); 1846 | f1.merge(true, &f2.clone()).unwrap(); 1847 | assert_eq!(f1.len(), 3); 1848 | f1.merge(false, &f3.clone()).unwrap(); 1849 | assert_eq!(f1.len(), 4); 1850 | 1851 | for _ in f1.len()..f1.capacity() { 1852 | f1.insert_fingerprint(true, 1).unwrap(); 1853 | } 1854 | assert_eq!(f1.len(), f1.capacity()); 1855 | assert!(matches!( 1856 | f1.insert_impl(u64::MAX, 1), 1857 | Err(Error::CapacityExceeded) 1858 | )); 1859 | assert!(matches!( 1860 | f1.merge(true, &f1.clone()), 1861 | Err(Error::CapacityExceeded) 1862 | )); 1863 | assert!(matches!(f1.insert_fingerprint(false, 1), Ok(false))); 1864 | assert!(matches!(f1.merge(false, &f1.clone()), Ok(()))); 1865 | } 1866 | test( 1867 | Filter::with_fingerprint_size(1, 10).unwrap(), 1868 | Filter::with_fingerprint_size(1, 11).unwrap(), 1869 | Filter::with_fingerprint_size(1, 12).unwrap(), 1870 | ); 1871 | test( 1872 | Filter::new(1, 0.01).unwrap(), 1873 | Filter::new(1, 0.001).unwrap(), 1874 | Filter::new(1, 
0.0001).unwrap(), 1875 | ); 1876 | } 1877 | 1878 | #[cfg(feature = "serde")] 1879 | #[test] 1880 | fn test_serde() { 1881 | for capacity in [100, 1000, 10000] { 1882 | for fp_ratio in [0.2, 0.1, 0.01, 0.001, 0.0001] { 1883 | let mut f = Filter::new(capacity, fp_ratio).unwrap(); 1884 | for i in 0..f.capacity() { 1885 | f.insert(i).unwrap(); 1886 | } 1887 | 1888 | let ser = serde_cbor::to_vec(&f).unwrap(); 1889 | f = serde_cbor::from_slice(&ser).unwrap(); 1890 | for i in 0..f.capacity() { 1891 | f.contains(i); 1892 | } 1893 | dbg!( 1894 | f.current_error_ratio(), 1895 | f.max_error_ratio(), 1896 | f.capacity(), 1897 | f.len(), 1898 | ser.len() 1899 | ); 1900 | } 1901 | } 1902 | } 1903 | 1904 | #[test] 1905 | fn test_dec_offset_edge_case() { 1906 | // case found in fuzz testing 1907 | #[rustfmt::skip] 1908 | let sample = [(0u16, 287), (2u16, 1), (9u16, 2), (10u16, 1), (53u16, 5), (61u16, 5), (127u16, 2), (232u16, 1), (255u16, 21), (314u16, 2), (317u16, 2), (384u16, 2), (511u16, 3), (512u16, 2), (1599u16, 2), (2303u16, 5), (2559u16, 2), (2568u16, 3), (2815u16, 2), (6400u16, 2), (9211u16, 2), (9728u16, 2), (10790u16, 1), (10794u16, 94), (10797u16, 2), (10999u16, 2), (11007u16, 2), (11520u16, 1), (12800u16, 4), (12842u16, 2), (13823u16, 1), (14984u16, 2), (15617u16, 2), (15871u16, 4), (16128u16, 3), (16383u16, 2), (16394u16, 1), (18167u16, 2), (23807u16, 1), (32759u16, 2)]; 1909 | let mut f = Filter::new(400, 0.1).unwrap(); 1910 | for (i, c) in sample { 1911 | for _ in 0..c { 1912 | f.insert_duplicated(i).unwrap(); 1913 | } 1914 | } 1915 | assert_eq!(f.raw_block(2).offset, 3); 1916 | assert_eq!(f.raw_block(3).offset, u8::MAX as u64); 1917 | f.validate_offsets(0, f.total_buckets().get()); 1918 | f.remove(0u16); 1919 | assert_eq!(f.raw_block(2).offset, 2); 1920 | assert_eq!(f.raw_block(3).offset, 254); 1921 | f.validate_offsets(0, f.total_buckets().get()); 1922 | } 1923 | 1924 | #[test] 1925 | fn test_capacity_edge_cases() { 1926 | for n in 1..32 { 1927 | let base = (1 << n) * 19 / 20; 1928 | // Test numbers around the edge 1929 | for i in [base - 1, base, base + 1] { 1930 | let filter = Filter::new(i, 0.01).unwrap(); 1931 | assert!( 1932 | filter.capacity() >= i, 1933 | "Requested capacity {} but got {}", 1934 | i, 1935 | filter.capacity() 1936 | ); 1937 | assert_eq!(filter.capacity(), filter.capacity_resizeable()); 1938 | } 1939 | } 1940 | } 1941 | 1942 | #[test] 1943 | fn test_max_capacity() { 1944 | for i in 7..=64 { 1945 | let f = Filter::with_fingerprint_size(0, i).unwrap(); 1946 | assert!(f.capacity() <= f.capacity_resizeable()); 1947 | assert_eq!( 1948 | f.capacity_resizeable(), 1949 | ((1u64 << (i - 1).min(Filter::MAX_QBITS)) * 19).div_ceil(20) 1950 | ); 1951 | } 1952 | for i in 1..Filter::MAX_QBITS { 1953 | let f = Filter::new_resizeable(0, 2u64.pow(i as u32), 0.5).unwrap(); 1954 | assert_eq!(f.capacity(), 61); 1955 | assert!(f.capacity() <= f.capacity_resizeable()); 1956 | } 1957 | // Test the maximum capacity 1958 | let f = Filter::new_resizeable(0, Filter::MAX_CAPACITY, 0.5).unwrap(); 1959 | assert_eq!(f.capacity(), 61); 1960 | assert_eq!(f.capacity_resizeable(), Filter::MAX_CAPACITY); 1961 | // Test the maximum capacity + 1, which should fail 1962 | Filter::new_resizeable(0, Filter::MAX_CAPACITY + 1, 0.5).unwrap_err(); 1963 | } 1964 | } 1965 | -------------------------------------------------------------------------------- /src/stable_hasher.rs: -------------------------------------------------------------------------------- 1 | use std::hash::Hasher; 2 | 3 | /// Wrapper over a 
hasher that provides stable output across platforms 4 | /// Based on https://github.com/rust-lang/rust/blob/c0955a34bcb17f0b31d7b86522a520ebe7fa93ac/src/librustc_data_structures/stable_hasher.rs#L78-L166 5 | /// 6 | /// To that end we always convert integers to little-endian format before 7 | /// hashing and the architecture dependent `isize` and `usize` types are 8 | /// extended to 64 bits if needed. 9 | pub struct StableHasher { 10 | /// Using xxh3-64 with default seed/secret as the portable hasher. 11 | state: xxhash_rust::xxh3::Xxh3Default, 12 | } 13 | 14 | impl StableHasher { 15 | #[inline] 16 | pub fn new() -> Self { 17 | Self { 18 | state: xxhash_rust::xxh3::Xxh3Default::new(), 19 | } 20 | } 21 | } 22 | 23 | impl Hasher for StableHasher { 24 | #[inline] 25 | fn finish(&self) -> u64 { 26 | self.state.finish() 27 | } 28 | 29 | #[inline] 30 | fn write(&mut self, bytes: &[u8]) { 31 | self.state.write(bytes); 32 | } 33 | 34 | #[inline] 35 | fn write_u8(&mut self, i: u8) { 36 | self.state.write_u8(i); 37 | } 38 | 39 | #[inline] 40 | fn write_u16(&mut self, i: u16) { 41 | self.state.write_u16(i.to_le()); 42 | } 43 | 44 | #[inline] 45 | fn write_u32(&mut self, i: u32) { 46 | self.state.write_u32(i.to_le()); 47 | } 48 | 49 | #[inline] 50 | fn write_u64(&mut self, i: u64) { 51 | self.state.write_u64(i.to_le()); 52 | } 53 | 54 | #[inline] 55 | fn write_u128(&mut self, i: u128) { 56 | self.state.write_u128(i.to_le()); 57 | } 58 | 59 | #[inline] 60 | fn write_usize(&mut self, i: usize) { 61 | // Always treat usize as u64 so we get the same results on 32 and 64 bit 62 | // platforms. This is important for symbol hashes when cross compiling, 63 | // for example. 64 | self.state.write_u64((i as u64).to_le()); 65 | } 66 | 67 | #[inline] 68 | fn write_i8(&mut self, i: i8) { 69 | self.state.write_i8(i); 70 | } 71 | 72 | #[inline] 73 | fn write_i16(&mut self, i: i16) { 74 | self.state.write_i16(i.to_le()); 75 | } 76 | 77 | #[inline] 78 | fn write_i32(&mut self, i: i32) { 79 | self.state.write_i32(i.to_le()); 80 | } 81 | 82 | #[inline] 83 | fn write_i64(&mut self, i: i64) { 84 | self.state.write_i64(i.to_le()); 85 | } 86 | 87 | #[inline] 88 | fn write_i128(&mut self, i: i128) { 89 | self.state.write_i128(i.to_le()); 90 | } 91 | 92 | #[inline] 93 | fn write_isize(&mut self, i: isize) { 94 | // Always treat isize as i64 so we get the same results on 32 and 64 bit 95 | // platforms. This is important for symbol hashes when cross compiling, 96 | // for example. 97 | self.state.write_i64((i as i64).to_le()); 98 | } 99 | } 100 | --------------------------------------------------------------------------------
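A quick usage sketch of the public API defined in src/lib.rs above. This is illustrative only and not part of the repository's sources; it assumes `qfilter` is added as a dependency and uses only the `Filter` methods shown in this listing (`new`, `insert`, `contains`, `insert_duplicated`, `count`, `remove`):

```rust
use qfilter::Filter;

fn main() -> Result<(), qfilter::Error> {
    // A filter sized for ~1000 items with a 1% target false positive ratio.
    let mut f = Filter::new(1000, 0.01).unwrap();

    // `insert` returns Ok(true) when the item wasn't already (probabilistically) present.
    assert!(f.insert("item")?);
    assert!(f.contains("item"));

    // `insert_duplicated` always adds another fingerprint, which enables counting.
    f.insert_duplicated("item")?;
    assert!(f.count("item") >= 2);

    // Removals should only be used for items known to have been inserted,
    // otherwise they may introduce false negatives (see the `remove` docs above).
    assert!(f.remove("item"));
    Ok(())
}
```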