├── .github
│   └── workflows
│       └── ci.yml
├── .gitignore
├── Cargo.toml
├── LICENCE.md
├── README.md
├── benches
│   └── benches.rs
├── fuzz
│   ├── .gitignore
│   ├── Cargo.toml
│   └── fuzz_targets
│       ├── fuzz_fingerprint.rs
│       └── fuzz_qfilter.rs
└── src
    ├── lib.rs
    └── stable_hasher.rs

--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Continuous integration
2 | 
3 | on:
4 |   push:
5 |     branches: [master]
6 |   pull_request:
7 | 
8 | jobs:
9 |   test:
10 |     name: Tests
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: styfle/cancel-workflow-action@0.10.0
14 |         with:
15 |           access_token: ${{ github.token }}
16 |       - uses: actions/checkout@v2
17 |       - uses: actions-rs/toolchain@v1
18 |         with:
19 |           profile: minimal
20 |           toolchain: stable
21 |           override: true
22 |       - run: cargo test
23 |       - run: cargo test --all-features
24 | 
25 |   fuzz-tests:
26 |     name: Fuzz tests
27 |     runs-on: ubuntu-latest
28 |     steps:
29 |       - uses: styfle/cancel-workflow-action@0.10.0
30 |         with:
31 |           access_token: ${{ github.token }}
32 |       - uses: actions/checkout@v2
33 |       - uses: actions-rs/toolchain@v1
34 |         with:
35 |           profile: minimal
36 |           toolchain: nightly
37 |           override: true
38 |       - run: cargo install cargo-fuzz
39 |       - run: for fuzz_test in `cargo fuzz list`; do cargo fuzz run $fuzz_test -- -max_total_time=180 -detect_leaks=0 -len_control=0 || exit 1; done
40 | 
41 |   lint:
42 |     name: Rustfmt & Clippy
43 |     runs-on: ubuntu-latest
44 |     steps:
45 |       - uses: styfle/cancel-workflow-action@0.10.0
46 |         with:
47 |           access_token: ${{ github.token }}
48 |       - uses: actions/checkout@v2
49 |       - uses: actions-rs/toolchain@v1
50 |         with:
51 |           profile: minimal
52 |           toolchain: stable
53 |           override: true
54 |       - run: rustup component add rustfmt
55 |       - uses: actions-rs/cargo@v1
56 |         with:
57 |           command: fmt
58 |           args: --all -- --check
59 |       - run: rustup component add clippy
60 |       - uses: actions-rs/cargo@v1
61 |         with:
62 |           command: clippy
63 |           args: -- -D warnings
64 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | debug/
4 | target/
5 | 
6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
8 | Cargo.lock
9 | 
10 | # These are backup files generated by rustfmt
11 | **/*.rs.bk

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "qfilter"
3 | version = "0.2.5"
4 | description = "Efficient bloom-filter-like data structure, based on the Rank Select Quotient Filter (RSQF)"
5 | repository = "https://github.com/arthurprs/qfilter"
6 | authors = ["Arthur Silva "]
7 | edition = "2021"
8 | license = "MIT"
9 | keywords = ["rsqf", "cqf", "quotient-filter", "bloom-filter", "cuckoo-filter"]
10 | categories = ["data-structures"]
11 | readme = "README.md"
12 | exclude = ["fuzz", "benches"]
13 | 
14 | [features]
15 | default = []
16 | # Enable to support running on x64 cpus released before 2008
17 | legacy_x86_64_support = []
18 | jsonschema = ["schemars"]
19 | serde = ["dep:serde", "dep:serde_bytes"]
20 | 
21 | [dependencies]
22 | xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
23 | serde = { optional = true, version = "1", features = ["derive"] }
24 | serde_bytes = { optional = true, version = "0.11" }
25 | schemars = { optional = true, version = "0.8" }
26 | 
27 | [dev-dependencies]
28 | serde_cbor = "0.11"
29 | qfilter01 = { package = "qfilter", version = "0.1", features = ["serde"] }
30 | 
31 | [profile.bench]
32 | opt-level = 3
33 | debug = true
34 | 
35 | [lints.rust]
36 | unexpected_cfgs = { level = "warn", check-cfg = ['cfg(fuzzing)'] }
37 | 
38 | [package.metadata.docs.rs]
39 | rustdoc-args = ["--cfg", "docsrs"]
40 | 

--------------------------------------------------------------------------------
/LICENCE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Arthur Silva
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Qfilter
2 | 
3 | [![Crates.io](https://img.shields.io/crates/v/qfilter.svg)](https://crates.io/crates/qfilter)
4 | [![Docs](https://docs.rs/qfilter/badge.svg)](https://docs.rs/qfilter/latest)
5 | [![CI](https://github.com/arthurprs/qfilter/actions/workflows/ci.yml/badge.svg)](https://github.com/arthurprs/qfilter/actions/workflows/ci.yml)
6 | 
7 | Efficient bloom-filter-like data structure, based on the [Rank Select Quotient Filter (RSQF)](https://dl.acm.org/doi/pdf/10.1145/3035918.3035963).
8 | 
9 | This is a small and flexible general-purpose [AMQ-Filter](https://en.wikipedia.org/wiki/Approximate_Membership_Query_Filter).
10 | It not only supports approximate membership testing like a bloom filter but also deletions, merging,
11 | resizing and [serde](https://crates.io/crates/serde) serialization.
12 | 
13 | * High performance
14 | * Supports removals
15 | * Extremely compact, more so than comparable filters
16 | * Can be created with a small initial capacity and grow as needed
17 | * (De)Serializable with [serde](https://crates.io/crates/serde)
18 | * Portable Rust implementation
19 | * Only verifiable usages of unsafe
20 | 
21 | This data structure is a succinct hash table that can store fingerprints in a very compact way.
22 | Fingerprints are similar to hash values, but are possibly truncated.
23 | The reason for false positives is that multiple items can map to the same fingerprint.
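A fingerprint splits into a bucket index (the quotient) and a stored remainder. The sketch below is purely illustrative (the crate's actual bit layout is an internal detail) and `split_fingerprint` is a hypothetical helper, not part of the API:

```rust
// Keep the top `q + r` bits of the hash as the fingerprint, then use the
// high `q` bits as the bucket index (quotient) and store only the low
// `r` bits (the remainder) in that bucket's run.
// Assumes q + r <= 64 and r < 64.
fn split_fingerprint(hash: u64, q: u32, r: u32) -> (u64, u64) {
    let fingerprint = hash >> (64 - (q + r));
    (fingerprint >> r, fingerprint & ((1u64 << r) - 1))
}
```
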
24 | For more information see the [quotient filter Wikipedia page](https://en.wikipedia.org/wiki/Quotient_filter)
25 | that describes a similar but less optimized version of the data structure.
26 | The actual implementation is based on the [Rank Select Quotient Filter (RSQF)](https://dl.acm.org/doi/pdf/10.1145/3035918.3035963).
27 | 
28 | The public API also exposes a fingerprint API, which can be used to succinctly store u64 hash values.
29 | 
30 | ### Example
31 | 
32 | ```rust
33 | let mut f = qfilter::Filter::new(1000000, 0.01).unwrap();
34 | for i in 0..1000 {
35 |     f.insert(i).unwrap();
36 | }
37 | for i in 0..1000 {
38 |     assert!(f.contains(i));
39 | }
40 | ```
41 | 
42 | ### Hasher
43 | 
44 | The hashing algorithm used is [xxhash3](https://crates.io/crates/xxhash-rust), which offers both high performance and stability across platforms.
45 | 
46 | ### Filter size
47 | 
48 | For a given capacity and error probability, the RSQF may require significantly less space than the equivalent bloom filter or other AMQ-Filters.
49 | 
50 | | Bits per item | Error probability when full | Bits per item (cont.) | Error (cont.) |
51 | |:---:|:---:|:---:|:---:|
52 | | 3.125 | 0.362 | 19.125 | 6.87e-06 |
53 | | 4.125 | 0.201 | 20.125 | 3.43e-06 |
54 | | 5.125 | 0.106 | 21.125 | 1.72e-06 |
55 | | 6.125 | 0.0547 | 22.125 | 8.58e-07 |
56 | | 7.125 | 0.0277 | 23.125 | 4.29e-07 |
57 | | 8.125 | 0.014 | 24.125 | 2.15e-07 |
58 | | 9.125 | 0.00701 | 25.125 | 1.07e-07 |
59 | | 10.125 | 0.00351 | 26.125 | 5.36e-08 |
60 | | 11.125 | 0.00176 | 27.125 | 2.68e-08 |
61 | | 12.125 | 0.000879 | 28.125 | 1.34e-08 |
62 | | 13.125 | 0.000439 | 29.125 | 6.71e-09 |
63 | | 14.125 | 0.00022 | 30.125 | 3.35e-09 |
64 | | 15.125 | 0.00011 | 31.125 | 1.68e-09 |
65 | | 16.125 | 5.49e-05 | 32.125 | 8.38e-10 |
66 | | 17.125 | 2.75e-05 | .. | .. |
67 | | 18.125 | 1.37e-05 | .. | .. |
68 | 
69 | ### Compatibility between versions 0.1 and 0.2
70 | 
71 | Version 0.2 changed public APIs (e.g. fallible constructors), which required a major version bump.
72 | 
73 | Serialization is bidirectionally compatible between versions 0.1 and 0.2.
74 | 
75 | ### Not implemented
76 | 
77 | * [ ] Fingerprint attached values
78 | * [ ] Counting with fingerprint attached values, not fingerprint duplication
79 | * [ ] More advanced growth strategies (InfiniFilter).
80 | 
81 | ### Legacy x86_64 CPUs support
82 | 
83 | The implementation assumes the `popcnt` instruction (equivalent to `integer.count_ones()`) is present
84 | when compiling for x86_64 targets. This is theoretically not guaranteed as the instruction is only
85 | available on AMD/Intel CPUs released after 2007/2008. If that's not the case, the `Filter` constructor will panic.
86 | 
87 | Support for such legacy x86_64 CPUs can be optionally enabled with the `legacy_x86_64_support` feature,
88 | which incurs a ~10% performance penalty.
89 | 
90 | ### License
91 | 
92 | This project is licensed under the MIT license.
93 | -------------------------------------------------------------------------------- /benches/benches.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | extern crate test; 3 | 4 | use qfilter::*; 5 | use test::Bencher; 6 | 7 | #[bench] 8 | fn bench_new(b: &mut Bencher) { 9 | b.iter(|| Filter::new(1000, 0.005).unwrap()); 10 | } 11 | #[bench] 12 | fn bench_get_ok_medium(b: &mut Bencher) { 13 | let mut f = Filter::new(100000, 0.01).unwrap(); 14 | for i in 0..f.capacity() { 15 | f.insert_duplicated(&i).unwrap(); 16 | } 17 | let mut i = 0; 18 | b.iter(|| { 19 | i += 1; 20 | f.contains(&i) 21 | }) 22 | } 23 | 24 | #[bench] 25 | fn bench_get_nok_medium(b: &mut Bencher) { 26 | let mut f = Filter::new(100000, 0.01).unwrap(); 27 | for i in 0..f.capacity() { 28 | f.insert_duplicated(&i).unwrap(); 29 | } 30 | let mut i = f.capacity(); 31 | b.iter(|| { 32 | i += 1; 33 | f.contains(&i) 34 | }) 35 | } 36 | 37 | #[bench] 38 | fn bench_grow(b: &mut Bencher) { 39 | b.iter(|| { 40 | let mut f = Filter::new(10000, 0.01).unwrap(); 41 | for i in 0..f.capacity() { 42 | f.insert_duplicated(i).unwrap(); 43 | } 44 | f 45 | }); 46 | } 47 | 48 | #[bench] 49 | fn bench_grow_from_90pct(b: &mut Bencher) { 50 | let mut f = Filter::new(10000, 0.01).unwrap(); 51 | for i in 0..f.capacity() / 10 * 9 { 52 | f.insert_duplicated(i).unwrap(); 53 | } 54 | b.iter(|| { 55 | let mut f = f.clone(); 56 | for i in f.len()..f.capacity() { 57 | f.insert_duplicated(i).unwrap(); 58 | } 59 | f 60 | }); 61 | } 62 | 63 | #[bench] 64 | fn bench_grow_resizeable(b: &mut Bencher) { 65 | b.iter(|| { 66 | let mut f = Filter::new_resizeable(0, 10000, 0.01).unwrap(); 67 | for i in 0u64.. { 68 | if f.insert_duplicated(i).is_err() { 69 | break; 70 | } 71 | } 72 | assert_eq!(f.len(), 10000u64.next_power_of_two() * 19 / 20); 73 | f 74 | }); 75 | } 76 | 77 | #[bench] 78 | fn bench_shrink(b: &mut Bencher) { 79 | let mut f = Filter::new(10000, 0.01).unwrap(); 80 | for i in 0..f.capacity() { 81 | let _ = f.insert(i); 82 | } 83 | b.iter(|| { 84 | let mut f = f.clone(); 85 | for i in 0..f.capacity() { 86 | f.remove(i); 87 | } 88 | f 89 | }); 90 | } 91 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "qfilter-fuzz" 3 | version = "0.0.0" 4 | authors = ["Automatically generated"] 5 | publish = false 6 | edition = "2021" 7 | 8 | [package.metadata] 9 | cargo-fuzz = true 10 | 11 | [dependencies] 12 | libfuzzer-sys = {version = "0.4", features = ["arbitrary-derive"] } 13 | 14 | [dependencies.qfilter] 15 | path = ".." 
16 | 
17 | # Prevent this from interfering with workspaces
18 | [workspace]
19 | members = ["."]
20 | 
21 | [[bin]]
22 | name = "fuzz_qfilter"
23 | path = "fuzz_targets/fuzz_qfilter.rs"
24 | test = false
25 | doc = false
26 | 
27 | [[bin]]
28 | name = "fuzz_fingerprint"
29 | path = "fuzz_targets/fuzz_fingerprint.rs"
30 | test = false
31 | doc = false
32 | 

--------------------------------------------------------------------------------
/fuzz/fuzz_targets/fuzz_fingerprint.rs:
--------------------------------------------------------------------------------
1 | #![no_main]
2 | use libfuzzer_sys::arbitrary;
3 | use libfuzzer_sys::arbitrary::Arbitrary;
4 | use libfuzzer_sys::fuzz_target;
5 | 
6 | const FUZZ_REMOVES: bool = true;
7 | const CHECK_EVERY: usize = 8;
8 | const CHECK_SHRUNK: bool = true;
9 | 
10 | #[derive(Debug, Arbitrary)]
11 | struct Input {
12 |     cap: u16,
13 |     fp_size: u8,
14 |     ops: Vec<(bool, u16)>,
15 | }
16 | 
17 | fuzz_target!(|input: Input| {
18 |     let Input { cap, ops, fp_size } = input;
19 |     // The "Model", tracks the count for each item
20 |     let mut counts = [0u64; (u16::MAX as usize) + 1];
21 |     let Ok(mut f) = qfilter::Filter::with_fingerprint_size(cap as u64, fp_size.clamp(7, 64)) else {
22 |         return;
23 |     };
24 |     for i in 0..ops.len() {
25 |         // print_sample(&counts);
26 |         // dbg!(ops[i]);
27 | 
28 |         let (add, item) = ops[i];
29 |         let item = item as u64;
30 |         if !FUZZ_REMOVES || add {
31 |             if f.insert_fingerprint(true, item).is_err() {
32 |                 continue;
33 |             }
34 |             counts[item as usize] += 1;
35 |         } else if counts[item as usize] != 0 && f.remove_fingerprint(item) {
36 |             counts[item as usize] -= 1;
37 |         } else {
38 |             continue;
39 |         }
40 | 
41 |         if i % CHECK_EVERY == 0 {
42 |             for &(_add, e) in &ops[..=i] {
43 |                 let min = counts[e as usize];
44 |                 // Since we can only check for >= due to collisions skip min = 0
45 |                 if min != 0 {
46 |                     let est = f.count_fingerprint(e as u64);
47 |                     assert!(est >= min, "{e}: est {est} < min {min}");
48 |                 }
49 |             }
50 |         }
51 |     }
52 | 
53 |     for shrunk in [false, true] {
54 |         for &(_add, e) in &ops {
55 |             let min = counts[e as usize];
56 |             let est = f.count_fingerprint(e as u64);
57 |             assert!(est >= min, "{e}: est {est} < min {min} shrunk {shrunk:?}");
58 |         }
59 |         let prints = f.fingerprints().collect::<Vec<_>>();
60 |         let mut expected_prints = counts
61 |             .iter()
62 |             .enumerate()
63 |             .flat_map(|(i, n)| {
64 |                 let t = (i as u64) << (64 - f.fingerprint_size()) >> (64 - f.fingerprint_size());
65 |                 std::iter::repeat(t).take(*n as usize)
66 |             })
67 |             .collect::<Vec<_>>();
68 |         expected_prints.sort_unstable();
69 |         assert_eq!(prints.len(), f.len() as usize);
70 |         assert_eq!(prints, expected_prints);
71 |         if !CHECK_SHRUNK {
72 |             break;
73 |         }
74 |         f.shrink_to_fit();
75 |     }
76 | });
77 | 
78 | #[allow(dead_code)]
79 | fn print_sample(counts: &[u64]) {
80 |     print!("[");
81 |     for (i, c) in counts.iter().copied().enumerate() {
82 |         if c != 0 {
83 |             print!("({i}u16, {c}), ");
84 |         }
85 |     }
86 |     println!("]");
87 | }

--------------------------------------------------------------------------------
/fuzz/fuzz_targets/fuzz_qfilter.rs:
--------------------------------------------------------------------------------
1 | #![no_main]
2 | use libfuzzer_sys::arbitrary;
3 | use libfuzzer_sys::arbitrary::Arbitrary;
4 | use libfuzzer_sys::fuzz_target;
5 | 
6 | const FUZZ_REMOVES: bool = true;
7 | const CHECK_EVERY: usize = 8;
8 | const CHECK_SHRUNK: bool = true;
9 | 
10 | #[derive(Debug, Arbitrary)]
11 | struct Input {
12 |     cap: u16,
13 |     max_cap: u16,
14 |     fp_exp: u16,
15 |     ops: Vec<(bool, u16)>,
16 | }
17 | 
18 | fuzz_target!(|input: Input| {
19 |     let Input {
20 |         cap,
21 |         max_cap,
22 |         fp_exp,
23 |         ops,
24 |     } = input;
25 |     let max_cap = max_cap.max(cap) as u64;
26 |     let cap = cap as u64;
27 |     let fp = 2f64.powi(-(fp_exp.leading_ones() as i32));
28 |     // The "Model", tracks the count for each item
29 |     let mut counts = [0u64; (u16::MAX as usize) + 1];
30 |     let mut f = qfilter::Filter::new_resizeable(cap, max_cap, fp).unwrap();
31 |     for i in 0..ops.len() {
32 |         // print_sample(&counts);
33 |         // dbg!(ops[i]);
34 | 
35 |         let (add, item) = ops[i];
36 |         if !FUZZ_REMOVES || add {
37 |             if f.insert_duplicated(item).is_err() {
38 |                 continue;
39 |             }
40 |             counts[item as usize] += 1;
41 |         } else if counts[item as usize] != 0 && f.remove(item) {
42 |             counts[item as usize] -= 1;
43 |         } else {
44 |             continue;
45 |         }
46 | 
47 |         if i % CHECK_EVERY == 0 {
48 |             for &(_add, e) in &ops[..=i] {
49 |                 let min = counts[e as usize];
50 |                 // Since we can only check for >= due to collisions skip min = 0
51 |                 if min != 0 {
52 |                     let est = f.count(e);
53 |                     assert!(est >= min, "{e}: est {est} < min {min}");
54 |                 }
55 |             }
56 |         }
57 |     }
58 | 
59 |     for shrunk in [false, true] {
60 |         for &(_add, e) in &ops {
61 |             let min = counts[e as usize];
62 |             let est = f.count(e);
63 |             assert!(est >= min, "{e}: est {est} < min {min} shrunk {shrunk:?}");
64 |         }
65 |         if !CHECK_SHRUNK {
66 |             break;
67 |         }
68 |         f.shrink_to_fit();
69 |     }
70 | });
71 | 
72 | #[allow(dead_code)]
73 | fn print_sample(counts: &[u64]) {
74 |     print!("[");
75 |     for (i, c) in counts.iter().copied().enumerate() {
76 |         if c != 0 {
77 |             print!("({i}u16, {c}), ");
78 |         }
79 |     }
80 |     println!("]");
81 | }

--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! Approximate Membership Query Filter ([AMQ-Filter](https://en.wikipedia.org/wiki/Approximate_Membership_Query_Filter))
2 | //! based on the [Rank Select Quotient Filter (RSQF)](https://dl.acm.org/doi/pdf/10.1145/3035918.3035963).
3 | //!
4 | //! This is a small and flexible general-purpose AMQ-Filter. It not only supports approximate membership testing like a bloom filter
5 | //! but also deletions, merging, resizing and [serde](https://crates.io/crates/serde) serialization.
6 | //!
7 | //! ### Example
8 | //!
9 | //! ```rust
10 | //! let mut f = qfilter::Filter::new(1000000, 0.01).unwrap();
11 | //! for i in 0..1000 {
12 | //!     f.insert(i).unwrap();
13 | //! }
14 | //! for i in 0..1000 {
15 | //!     assert!(f.contains(i));
16 | //! }
17 | //! ```
18 | //!
19 | //! ### Hasher
20 | //!
21 | //! The hashing algorithm used is [xxhash3](https://crates.io/crates/xxhash-rust)
22 | //! which offers both high performance and stability across platforms.
23 | //!
24 | //! ### Filter size
25 | //!
26 | //! For a given capacity and error probability, the RSQF may require significantly less space than the equivalent bloom filter or other AMQ-Filters.
27 | //!
28 | //! | Bits per item | Error probability when full | Bits per item (cont.) | Error (cont.) |
29 | //! |:---:|:---:|:---:|:---:|
30 | //! | 3.125 | 0.362 | 19.125 | 6.87e-06 |
31 | //! | 4.125 | 0.201 | 20.125 | 3.43e-06 |
32 | //! | 5.125 | 0.106 | 21.125 | 1.72e-06 |
33 | //! | 6.125 | 0.0547 | 22.125 | 8.58e-07 |
34 | //! | 7.125 | 0.0277 | 23.125 | 4.29e-07 |
35 | //! | 8.125 | 0.014 | 24.125 | 2.15e-07 |
36 | //! | 9.125 | 0.00701 | 25.125 | 1.07e-07 |
37 | //! | 10.125 | 0.00351 | 26.125 | 5.36e-08 |
38 | //! | 11.125 | 0.00176 | 27.125 | 2.68e-08 |
39 | //! | 12.125 | 0.000879 | 28.125 | 1.34e-08 |
40 | //! | 13.125 | 0.000439 | 29.125 | 6.71e-09 |
41 | //! | 14.125 | 0.00022 | 30.125 | 3.35e-09 |
42 | //! | 15.125 | 0.00011 | 31.125 | 1.68e-09 |
43 | //! | 16.125 | 5.49e-05 | 32.125 | 8.38e-10 |
44 | //! | 17.125 | 2.75e-05 | .. | .. |
45 | //! | 18.125 | 1.37e-05 | .. | .. |
46 | //!
47 | //! ### Legacy x86_64 CPUs support
48 | //!
49 | //! The implementation assumes the `popcnt` instruction (equivalent to `integer.count_ones()`) is present
50 | //! when compiling for x86_64 targets. This is theoretically not guaranteed as the instruction is only
51 | //! available on AMD/Intel CPUs released after 2007/2008. If that's not the case, the Filter constructor will panic.
52 | //!
53 | //! Support for such legacy x86_64 CPUs can be optionally enabled with the `legacy_x86_64_support` feature,
54 | //! which incurs a ~10% performance penalty.
55 | #![cfg_attr(docsrs, feature(doc_auto_cfg))]
56 | 
57 | use std::{
58 |     cmp::Ordering,
59 |     hash::{Hash, Hasher},
60 |     num::{NonZeroU64, NonZeroU8},
61 |     ops::{RangeBounds, RangeFrom},
62 | };
63 | 
64 | #[cfg(feature = "jsonschema")]
65 | use schemars::JsonSchema;
66 | #[cfg(feature = "serde")]
67 | use serde::{Deserialize, Serialize};
68 | use stable_hasher::StableHasher;
69 | 
70 | mod stable_hasher;
71 | 
72 | /// Approximate Membership Query Filter (AMQ-Filter) based on the Rank Select Quotient Filter (RSQF).
73 | ///
74 | /// This data structure is similar to a hash table that stores fingerprints in a very compact way.
75 | /// Fingerprints are similar to hash values, but are possibly truncated.
76 | /// The reason for false positives is that multiple items can map to the same fingerprint.
77 | /// For more information see the [quotient filter Wikipedia page](https://en.wikipedia.org/wiki/Quotient_filter)
78 | /// that describes a similar but less optimized version of the data structure.
79 | /// The actual implementation is based on the [Rank Select Quotient Filter (RSQF)](https://dl.acm.org/doi/pdf/10.1145/3035918.3035963).
80 | ///
81 | /// The public API also exposes a fingerprint API, which can be used to succinctly store u64
82 | /// hash values.
83 | #[derive(Clone)]
84 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
85 | #[cfg_attr(feature = "jsonschema", derive(JsonSchema))]
86 | pub struct Filter {
87 |     #[cfg_attr(
88 |         feature = "serde",
89 |         serde(
90 |             rename = "b",
91 |             serialize_with = "serde_bytes::serialize",
92 |             deserialize_with = "serde_bytes::deserialize"
93 |         )
94 |     )]
95 |     buffer: Box<[u8]>,
96 |     #[cfg_attr(feature = "serde", serde(rename = "l"))]
97 |     len: u64,
98 |     #[cfg_attr(feature = "serde", serde(rename = "q"))]
99 |     qbits: NonZeroU8,
100 |     #[cfg_attr(feature = "serde", serde(rename = "r"))]
101 |     rbits: NonZeroU8,
102 |     #[cfg_attr(
103 |         feature = "serde",
104 |         serde(rename = "g", skip_serializing_if = "Option::is_none", default)
105 |     )]
106 |     max_qbits: Option<NonZeroU8>,
107 | }
108 | 
109 | #[derive(Debug)]
110 | #[non_exhaustive]
111 | pub enum Error {
112 |     /// The filter cannot fit another fingerprint
113 |     CapacityExceeded,
114 |     /// The fingerprint sizes are not compatible
115 |     IncompatibleFingerprintSize,
116 |     /// The specified filter cannot be constructed with 64 bit hashes
117 |     NotEnoughFingerprintBits,
118 |     /// Capacity is too large. Filter::MAX_CAPACITY = 2^59 * 19 / 20.
119 |     CapacityTooLarge,
120 | }
121 | 
122 | impl std::fmt::Display for Error {
123 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
124 |         write!(f, "{self:?}")
125 |     }
126 | }
127 | 
128 | impl std::error::Error for Error {}
129 | 
130 | #[derive(Debug)]
131 | struct Block {
132 |     offset: u64,
133 |     occupieds: u64,
134 |     runends: u64,
135 | }
136 | 
137 | trait BitExt {
138 |     fn is_bit_set(&self, i: usize) -> bool;
139 |     fn set_bit(&mut self, i: usize);
140 |     fn clear_bit(&mut self, i: usize);
141 |     fn shift_right(&self, bits: usize, b: &Self, b_start: usize, b_end: usize) -> Self;
142 |     fn shift_left(&self, bits: usize, b: &Self, b_start: usize, b_end: usize) -> Self;
143 |     /// Number of set bits (1s) in the range
144 |     fn popcnt(&self, range: impl RangeBounds<u64>) -> u64;
145 |     /// Index of nth set bits in the range
146 |     fn select(&self, range: RangeFrom<u64>, n: u64) -> Option<u64>;
147 | 
148 |     #[inline]
149 |     fn update_bit(&mut self, i: usize, value: bool) {
150 |         if value {
151 |             self.set_bit(i)
152 |         } else {
153 |             self.clear_bit(i)
154 |         }
155 |     }
156 | }
157 | 
158 | impl BitExt for u64 {
159 |     #[inline]
160 |     fn is_bit_set(&self, i: usize) -> bool {
161 |         (*self & (1 << i)) != 0
162 |     }
163 | 
164 |     #[inline]
165 |     fn set_bit(&mut self, i: usize) {
166 |         *self |= 1 << i
167 |     }
168 | 
169 |     #[inline]
170 |     fn clear_bit(&mut self, i: usize) {
171 |         *self &= !(1 << i)
172 |     }
173 | 
174 |     #[inline]
175 |     fn shift_right(&self, bits: usize, b: &Self, b_start: usize, b_end: usize) -> Self {
176 |         let bitmask = |n| !u64::MAX.checked_shl(n).unwrap_or(0);
177 |         let a_component = *self >> (64 - bits); // select the highest `bits` from A to become lowest
178 |         let b_shifted_mask = bitmask((b_end - b_start) as u32) << b_start;
179 |         let b_shifted = ((b_shifted_mask & b) << bits) & b_shifted_mask;
180 |         let b_mask = !b_shifted_mask;
181 | 
182 |         a_component | b_shifted | (b & b_mask)
183 |     }
184 | 
185 |     #[inline]
186 |     fn shift_left(&self, bits: usize, b: &Self, b_start: usize, b_end: usize) -> Self {
187 |         let bitmask = |n| !u64::MAX.checked_shl(n).unwrap_or(0);
188 |         let a_component = *self << (64 - bits); // select the lowest `bits` from A to become highest
189 |         let b_shifted_mask = bitmask((b_end - b_start) as u32) << b_start;
190 |         let b_shifted = ((b_shifted_mask & b) >> bits) & b_shifted_mask;
191 |         let b_mask = !b_shifted_mask;
192 | 
193 |         a_component | b_shifted | (b & b_mask)
194 |     }
195 | 
196 |     #[inline]
197 |     fn popcnt(&self, range: impl RangeBounds<u64>) -> u64 {
198 |         let mut v = match range.start_bound() {
199 |             std::ops::Bound::Included(&i) => *self >> i << i,
200 |             std::ops::Bound::Excluded(&i) => *self >> (i + 1) << (i + 1),
201 |             _ => *self,
202 |         };
203 |         v = match range.end_bound() {
204 |             std::ops::Bound::Included(&i) if i < 63 => v & ((2 << i) - 1),
205 |             std::ops::Bound::Excluded(&i) if i <= 63 => v & ((1 << i) - 1),
206 |             _ => v,
207 |         };
208 | 
209 |         #[cfg(all(
210 |             target_arch = "x86_64",
211 |             not(feature = "legacy_x86_64_support"),
212 |             not(target_feature = "popcnt")
213 |         ))]
214 |         let result = unsafe {
215 |             // Using intrinsics introduces a function call, and the resulting code
216 |             // ends up slower than the inline assembly below.
217 |             // Any calls to is_x86_feature_detected also significantly affect performance.
218 |             // Given this is available on all x64 CPUs starting in 2008, we assume it's present
219 |             // (unless legacy_x86_64_support is set) and panic elsewhere otherwise.
220 |             let popcnt;
221 |             std::arch::asm!(
222 |                 "popcnt {popcnt}, {v}",
223 |                 v = in(reg) v,
224 |                 popcnt = out(reg) popcnt,
225 |                 options(pure, nomem, nostack)
226 |             );
227 |             popcnt
228 |         };
229 |         #[cfg(any(
230 |             not(target_arch = "x86_64"),
231 |             feature = "legacy_x86_64_support",
232 |             target_feature = "popcnt"
233 |         ))]
234 |         let result = v.count_ones() as u64;
235 | 
236 |         result
237 |     }
238 | 
239 |     #[inline]
240 |     fn select(&self, range: RangeFrom<u64>, n: u64) -> Option<u64> {
241 |         debug_assert!(range.start < 64);
242 |         let v = *self >> range.start << range.start;
243 | 
244 |         #[cfg_attr(target_arch = "x86_64", cold)]
245 |         #[cfg_attr(not(target_arch = "x86_64"), inline)]
246 |         fn fallback(mut v: u64, n: u64) -> Option<u64> {
247 |             for _ in 0..n / 8 {
248 |                 for _ in 0..8 {
249 |                     v &= v.wrapping_sub(1); // remove the least significant bit
250 |                 }
251 |             }
252 |             for _ in 0..n % 8 {
253 |                 v &= v.wrapping_sub(1); // remove the least significant bit
254 |             }
255 | 
256 |             if v == 0 {
257 |                 None
258 |             } else {
259 |                 Some(v.trailing_zeros() as u64)
260 |             }
261 |         }
262 | 
263 |         #[cfg(target_arch = "x86_64")]
264 |         let result = {
265 |             // TODO: AMD CPUs up to Zen2 have slow BMI implementations
266 |             if std::is_x86_feature_detected!("bmi2") {
267 |                 // This is the equivalent intrinsics version of the inline assembly below.
268 |                 // #[target_feature(enable = "bmi1")]
269 |                 // #[target_feature(enable = "bmi2")]
270 |                 // #[inline]
271 |                 // unsafe fn select_bmi2(x: u64, k: u64) -> Option<u64> {
272 |                 //     use std::arch::x86_64::{_pdep_u64, _tzcnt_u64};
273 |                 //     let result = _tzcnt_u64(_pdep_u64(1 << k, x));
274 |                 //     if result != 64 {
275 |                 //         Some(result)
276 |                 //     } else {
277 |                 //         None
278 |                 //     }
279 |                 // }
280 |                 // unsafe { select_bmi2(v, n) }
281 | 
282 |                 let result: u64;
283 |                 unsafe {
284 |                     std::arch::asm!(
285 |                         "mov {tmp}, 1",
286 |                         "shlx {tmp}, {tmp}, {n}",
287 |                         "pdep {tmp}, {tmp}, {v}",
288 |                         "tzcnt {tmp}, {tmp}",
289 |                         n = in(reg) n,
290 |                         v = in(reg) v,
291 |                         tmp = out(reg) result,
292 |                         options(pure, nomem, nostack)
293 |                     );
294 |                 }
295 |                 if result != 64 {
296 |                     Some(result)
297 |                 } else {
298 |                     None
299 |                 }
300 |             } else {
301 |                 fallback(v, n)
302 |             }
303 |         };
304 |         #[cfg(not(target_arch = "x86_64"))]
305 |         let result = fallback(v, n);
306 | 
307 |         result
308 |     }
309 | }
310 | 
311 | trait CastNonZeroU8 {
312 |     fn u64(&self) -> u64;
313 |     fn usize(&self) -> usize;
314 | }
315 | 
316 | impl CastNonZeroU8 for NonZeroU8 {
317 |     #[inline]
318 |     fn u64(&self) -> u64 {
319 |         self.get() as u64
320 |     }
321 | 
322 |     #[inline]
323 |     fn usize(&self) -> usize {
324 |         self.get() as usize
325 |     }
326 | }
327 | 
328 | /// An iterator over the fingerprints of a `Filter`.
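///
/// A usage sketch, obtaining the iterator via [`Filter::fingerprints`]:
///
/// ```rust
/// let mut f = qfilter::Filter::new(100, 0.01).unwrap();
/// f.insert(1u64).unwrap();
/// f.insert(2u64).unwrap();
/// // Yields the stored (truncated) fingerprints, one per admitted fingerprint.
/// let prints: Vec<u64> = f.fingerprints().collect();
/// assert_eq!(prints.len(), f.len() as usize);
/// ```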
329 | pub struct FingerprintIter<'a> {
330 |     filter: &'a Filter,
331 |     q_bucket_idx: u64,
332 |     r_bucket_idx: u64,
333 |     remaining: u64,
334 | }
335 | 
336 | impl<'a> FingerprintIter<'a> {
337 |     fn new(filter: &'a Filter) -> Self {
338 |         let mut iter = FingerprintIter {
339 |             filter,
340 |             q_bucket_idx: 0,
341 |             r_bucket_idx: 0,
342 |             remaining: filter.len,
343 |         };
344 |         if !filter.is_empty() {
345 |             while !filter.is_occupied(iter.q_bucket_idx) {
346 |                 iter.q_bucket_idx += 1;
347 |             }
348 |             iter.r_bucket_idx = filter.run_start(iter.q_bucket_idx);
349 |         }
350 |         iter
351 |     }
352 | }
353 | 
354 | impl Iterator for FingerprintIter<'_> {
355 |     type Item = u64;
356 | 
357 |     fn next(&mut self) -> Option<Self::Item> {
358 |         if let Some(r) = self.remaining.checked_sub(1) {
359 |             self.remaining = r;
360 |         } else {
361 |             return None;
362 |         }
363 |         let hash = (self.q_bucket_idx << self.filter.rbits.get())
364 |             | self.filter.get_remainder(self.r_bucket_idx);
365 | 
366 |         if self.filter.is_runend(self.r_bucket_idx) {
367 |             self.q_bucket_idx += 1;
368 |             while !self.filter.is_occupied(self.q_bucket_idx) {
369 |                 self.q_bucket_idx += 1;
370 |             }
371 |             self.r_bucket_idx = (self.r_bucket_idx + 1).max(self.q_bucket_idx);
372 |         } else {
373 |             self.r_bucket_idx += 1;
374 |         }
375 | 
376 |         Some(hash)
377 |     }
378 | }
379 | 
380 | impl Filter {
381 |     /// Maximum log2 number of slots that can be used in the filter.
382 |     /// Effectively, the largest power of 2 that can be multiplied by 19 without overflowing u64.
383 |     const MAX_QBITS: u8 = 59;
384 | 
385 |     /// Maximum number of items that can be stored in the filter: ceil(2^59 * 19 / 20)
386 |     pub const MAX_CAPACITY: u64 = (2u64.pow(Self::MAX_QBITS as u32) * 19).div_ceil(20);
387 | 
388 |     /// Creates a new filter that can hold at least `capacity` items
389 |     /// with a desired error rate of `fp_rate` (clamped to (0, 0.5]).
390 |     ///
391 |     /// Errors if capacity is too large or if the specified filter isn't achievable using 64 bit hashes.
392 |     #[inline]
393 |     pub fn new(capacity: u64, fp_rate: f64) -> Result<Self, Error> {
394 |         Self::new_resizeable(capacity, capacity, fp_rate)
395 |     }
396 | 
397 |     /// Calculates the number of slots needed to fit the desired fingerprints with 95% occupation.
398 |     /// Returns the number of slots needed rounded to the next power of two, but always >= 64.
399 |     fn calculate_needed_slots(desired: u64) -> Result<u64, Error> {
400 |         let mut slots = desired
401 |             .checked_next_power_of_two()
402 |             .ok_or(Error::CapacityTooLarge)?
403 |             .max(64);
404 |         loop {
405 |             let capacity = slots
406 |                 .checked_mul(19)
407 |                 .ok_or(Error::CapacityTooLarge)?
408 |                 .div_ceil(20);
409 |             if capacity >= desired {
410 |                 return Ok(slots);
411 |             }
412 |             slots = slots.checked_mul(2).ok_or(Error::CapacityTooLarge)?;
413 |         }
414 |     }
415 | 
416 |     /// Creates a new filter that can hold at least `initial_capacity` items initially
417 |     /// and can resize to hold at least `max_capacity` when fully grown.
418 |     /// The desired error rate `fp_rate` (clamped to (0, 0.5]) applies to the fully grown filter.
419 |     ///
420 |     /// This works by storing fingerprints large enough to satisfy the maximum requirements,
421 |     /// so smaller filters will actually have lower error rates, which will increase
422 |     /// (up to `fp_rate`) as the filter grows. In practice every time the filter doubles in
423 |     /// capacity its error rate also doubles.
424 |     ///
425 |     /// Errors if max_capacity is too large or if the specified filter isn't achievable using 64 bit hashes.
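    ///
    /// A usage sketch:
    ///
    /// ```rust
    /// let mut f = qfilter::Filter::new_resizeable(100, 10_000, 0.01).unwrap();
    /// let initial = f.capacity();
    /// // Inserts beyond the current capacity trigger growth automatically.
    /// for i in 0..1_000u64 {
    ///     f.insert(i).unwrap();
    /// }
    /// assert!(f.capacity() > initial);
    /// assert!(f.contains(500u64));
    /// ```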
426 |     pub fn new_resizeable(
427 |         initial_capacity: u64,
428 |         max_capacity: u64,
429 |         fp_rate: f64,
430 |     ) -> Result<Self, Error> {
431 |         assert!(max_capacity >= initial_capacity);
432 |         let slots_for_capacity = Self::calculate_needed_slots(initial_capacity)?;
433 |         let qbits = slots_for_capacity.trailing_zeros() as u8;
434 |         let slots_for_max_capacity = Self::calculate_needed_slots(max_capacity)?;
435 |         let max_qbits = slots_for_max_capacity.trailing_zeros() as u8;
436 |         let fp_rate = fp_rate.clamp(f64::MIN_POSITIVE, 0.5);
437 |         let rbits = (-fp_rate.log2()).round().max(1.0) as u8 + (max_qbits - qbits);
438 |         let mut result = Self::with_qr(qbits.try_into().unwrap(), rbits.try_into().unwrap())?;
439 |         if max_qbits > qbits {
440 |             result.max_qbits = Some(max_qbits.try_into().unwrap());
441 |         }
442 |         Ok(result)
443 |     }
444 | 
445 |     /// Creates a new resizeable filter that can hold at least `initial_capacity` items initially while
446 |     /// utilizing a fingerprint bit size of `fingerprint_bits` (7..=64). Normally this function is only
447 |     /// useful if the filter is being used to manually store fingerprints.
448 |     pub fn with_fingerprint_size(
449 |         initial_capacity: u64,
450 |         fingerprint_bits: u8,
451 |     ) -> Result<Self, Error> {
452 |         if !(7..=64).contains(&fingerprint_bits) {
453 |             return Err(Error::NotEnoughFingerprintBits);
454 |         }
455 |         let slots_for_capacity = Self::calculate_needed_slots(initial_capacity)?;
456 |         let qbits = slots_for_capacity.trailing_zeros() as u8;
457 |         if fingerprint_bits <= qbits {
458 |             return Err(Error::NotEnoughFingerprintBits);
459 |         }
460 |         let rbits = fingerprint_bits - qbits;
461 |         let mut result = Self::with_qr(qbits.try_into().unwrap(), rbits.try_into().unwrap())?;
462 |         if rbits > 1 {
463 |             result.max_qbits = Some((qbits + rbits - 1).min(Self::MAX_QBITS).try_into().unwrap());
464 |         }
465 |         Ok(result)
466 |     }
467 | 
468 |     fn with_qr(qbits: NonZeroU8, rbits: NonZeroU8) -> Result<Self, Error> {
469 |         Self::check_cpu_support();
470 |         if qbits.get() + rbits.get() > 64 {
471 |             return Err(Error::NotEnoughFingerprintBits);
472 |         }
473 |         let num_slots = 1 << qbits.get();
474 |         let num_blocks = num_slots / 64;
475 |         assert_ne!(num_blocks, 0);
476 |         let block_bytes_size = 1 + 16 + 64 * rbits.u64() / 8;
477 |         let buffer_bytes = num_blocks * block_bytes_size;
478 |         let buffer = vec![0u8; buffer_bytes.try_into().unwrap()].into_boxed_slice();
479 |         Ok(Self {
480 |             buffer,
481 |             qbits,
482 |             rbits,
483 |             len: 0,
484 |             max_qbits: None,
485 |         })
486 |     }
487 | 
488 |     fn check_cpu_support() {
489 |         #[cfg(all(
490 |             target_arch = "x86_64",
491 |             not(feature = "legacy_x86_64_support"),
492 |             not(target_feature = "popcnt")
493 |         ))]
494 |         assert!(
495 |             std::is_x86_feature_detected!("popcnt"),
496 |             "CPU doesn't support the popcnt instruction"
497 |         );
498 |     }
499 | 
500 |     /// The internal fingerprint size in bits.
501 |     #[inline]
502 |     pub fn fingerprint_size(&self) -> u8 {
503 |         self.qbits.get() + self.rbits.get()
504 |     }
505 | 
506 |     /// Whether the filter is empty.
507 |     #[inline]
508 |     pub fn is_empty(&self) -> bool {
509 |         self.len == 0
510 |     }
511 | 
512 |     /// Current number of fingerprints admitted to the filter.
513 |     #[inline]
514 |     pub fn len(&self) -> u64 {
515 |         self.len
516 |     }
517 | 
518 |     /// Current memory usage in bytes.
519 |     #[inline]
520 |     pub fn memory_usage(&self) -> usize {
521 |         self.buffer.len()
522 |     }
523 | 
524 |     /// Resets/Clears the filter.
525 |     pub fn clear(&mut self) {
526 |         self.buffer.fill(0);
527 |         self.len = 0;
528 |     }
529 | 
530 |     /// Maximum filter capacity.
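    ///
    /// For a resizeable filter this is the capacity once fully grown; a small sketch:
    ///
    /// ```rust
    /// let f = qfilter::Filter::new_resizeable(100, 10_000, 0.01).unwrap();
    /// assert!(f.capacity_resizeable() >= 10_000);
    /// assert!(f.capacity() < f.capacity_resizeable());
    /// ```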
531 | #[inline] 532 | pub fn capacity_resizeable(&self) -> u64 { 533 | // Overflow is not possible here as it'd have overflowed in the constructor. 534 | ((1u64 << self.max_qbits.unwrap_or(self.qbits).get()) * 19).div_ceil(20) 535 | } 536 | 537 | /// Current filter capacity. 538 | #[inline] 539 | pub fn capacity(&self) -> u64 { 540 | if cfg!(fuzzing) { 541 | // 100% occupancy is not realistic but stresses the algorithm much more. 542 | // To generate real counter examples this "pessimisation" must be removed. 543 | self.total_buckets().get() 544 | } else { 545 | // Up to 95% occupancy 546 | // 19/20 == 0.95 547 | // Overflow is not possible here as it'd have overflowed in the constructor. 548 | (self.total_buckets().get() * 19).div_ceil(20) 549 | } 550 | } 551 | 552 | /// Max error ratio when at the resizeable capacity (len == resizeable_capacity). 553 | pub fn max_error_ratio_resizeable(&self) -> f64 { 554 | let extra_rbits = self.max_qbits.unwrap_or(self.qbits).get() - self.qbits.get(); 555 | 2f64.powi(-((self.rbits.get() - extra_rbits) as i32)) 556 | } 557 | 558 | /// Max error ratio when at full capacity (len == capacity). 559 | pub fn max_error_ratio(&self) -> f64 { 560 | 2f64.powi(-(self.rbits.get() as i32)) 561 | } 562 | 563 | /// Current error ratio at the current occupancy. 564 | pub fn current_error_ratio(&self) -> f64 { 565 | let occupancy = self.len as f64 / self.total_buckets().get() as f64; 566 | 1.0 - std::f64::consts::E.powf(-occupancy / 2f64.powi(self.rbits.get() as i32)) 567 | } 568 | 569 | #[inline] 570 | fn block_byte_size(&self) -> usize { 571 | 1 + 8 + 8 + 64 * self.rbits.usize() / 8 572 | } 573 | 574 | #[inline] 575 | fn set_block_runends(&mut self, block_num: u64, runends: u64) { 576 | let block_num = block_num % self.total_blocks(); 577 | let block_start = block_num as usize * self.block_byte_size(); 578 | let block_bytes: &mut [u8; 1 + 8 + 8] = (&mut self.buffer[block_start..][..1 + 8 + 8]) 579 | .try_into() 580 | .unwrap(); 581 | block_bytes[1 + 8..1 + 8 + 8].copy_from_slice(&runends.to_le_bytes()); 582 | } 583 | 584 | #[inline] 585 | fn raw_block(&self, block_num: u64) -> Block { 586 | let block_num = block_num % self.total_blocks(); 587 | let block_start = block_num as usize * self.block_byte_size(); 588 | let block_bytes: &[u8; 1 + 8 + 8] = 589 | &self.buffer[block_start..][..1 + 8 + 8].try_into().unwrap(); 590 | Block { 591 | offset: block_bytes[0] as u64, 592 | occupieds: u64::from_le_bytes(block_bytes[1..1 + 8].try_into().unwrap()), 593 | runends: u64::from_le_bytes(block_bytes[1 + 8..1 + 8 + 8].try_into().unwrap()), 594 | } 595 | } 596 | 597 | #[inline] 598 | fn block(&self, block_num: u64) -> Block { 599 | let block_num = block_num % self.total_blocks(); 600 | let block_start = block_num as usize * self.block_byte_size(); 601 | let block_bytes: &[u8; 1 + 8 + 8] = 602 | &self.buffer[block_start..][..1 + 8 + 8].try_into().unwrap(); 603 | let offset = { 604 | if block_bytes[0] < u8::MAX { 605 | block_bytes[0] as u64 606 | } else { 607 | self.calc_offset(block_num) 608 | } 609 | }; 610 | Block { 611 | offset, 612 | occupieds: u64::from_le_bytes(block_bytes[1..1 + 8].try_into().unwrap()), 613 | runends: u64::from_le_bytes(block_bytes[1 + 8..1 + 8 + 8].try_into().unwrap()), 614 | } 615 | } 616 | 617 | #[inline] 618 | fn adjust_block_offset(&mut self, block_num: u64, inc: bool) { 619 | let block_num = block_num % self.total_blocks(); 620 | let block_start = block_num as usize * self.block_byte_size(); 621 | let offset = &mut self.buffer[block_start]; 622 | if 
inc {
623 |             *offset = offset.saturating_add(1);
624 |         } else if *offset != u8::MAX {
625 |             *offset -= 1;
626 |         } else {
627 |             self.buffer[block_start] = self.calc_offset(block_num).try_into().unwrap_or(u8::MAX);
628 |         }
629 |     }
630 | 
631 |     #[inline]
632 |     fn inc_offsets(&mut self, start_bucket: u64, end_bucket: u64) {
633 |         let original_block = start_bucket / 64;
634 |         let mut last_affected_block = end_bucket / 64;
635 |         if end_bucket < start_bucket {
636 |             last_affected_block += self.total_blocks().get();
637 |         }
638 |         for b in original_block + 1..=last_affected_block {
639 |             self.adjust_block_offset(b, true);
640 |         }
641 |     }
642 | 
643 |     #[inline]
644 |     fn dec_offsets(&mut self, start_bucket: u64, end_bucket: u64) {
645 |         let original_block = start_bucket / 64;
646 |         let mut last_affected_block = end_bucket / 64;
647 |         if end_bucket < start_bucket {
648 |             last_affected_block += self.total_blocks().get();
649 |         }
650 | 
651 |         // As an edge case we may decrement the offsets of 2+ blocks and the block B' offset
652 |         // may be saturated and depend on a previous Block B" with a non saturated offset.
653 |         // But B" offset may also(!) be affected by the decrement operation, so we must
654 |         // decrement B" offset first, before the remaining offsets.
655 |         if last_affected_block - original_block >= 2
656 |             && self.raw_block(original_block + 1).offset >= u8::MAX as u64
657 |         {
658 |             // last affected block offset is always <= 64 (BLOCK SIZE)
659 |             // otherwise the decrement operation would be affecting a subsequent block
660 |             debug_assert!(self.raw_block(last_affected_block).offset <= 64);
661 |             self.adjust_block_offset(last_affected_block, false);
662 |             last_affected_block -= 1;
663 |         }
664 |         for b in original_block + 1..=last_affected_block {
665 |             self.adjust_block_offset(b, false);
666 |         }
667 | 
668 |         #[cfg(fuzzing)]
669 |         self.validate_offsets(original_block, last_affected_block);
670 |     }
671 | 
672 |     #[cfg(any(fuzzing, test))]
673 |     fn validate_offsets(&mut self, original_block: u64, last_affected_block: u64) {
674 |         for b in original_block..=last_affected_block {
675 |             let raw_offset = self.raw_block(b).offset;
676 |             let offset = self.calc_offset(b);
677 |             debug_assert!(
678 |                 (raw_offset >= u8::MAX as u64 && offset >= u8::MAX as u64)
679 |                     || (offset == raw_offset),
680 |                 "block {} offset {} calc {}",
681 |                 b,
682 |                 raw_offset,
683 |                 offset,
684 |             );
685 |         }
686 |     }
687 | 
688 |     #[inline(always)]
689 |     fn is_occupied(&self, hash_bucket_idx: u64) -> bool {
690 |         let hash_bucket_idx = hash_bucket_idx % self.total_buckets();
691 |         let block_start = (hash_bucket_idx / 64) as usize * self.block_byte_size();
692 |         let occupieds = u64::from_le_bytes(self.buffer[block_start + 1..][..8].try_into().unwrap());
693 |         occupieds.is_bit_set((hash_bucket_idx % 64) as usize)
694 |     }
695 | 
696 |     #[inline(always)]
697 |     fn set_occupied(&mut self, hash_bucket_idx: u64, value: bool) {
698 |         let hash_bucket_idx = hash_bucket_idx % self.total_buckets();
699 |         let block_start = (hash_bucket_idx / 64) as usize * self.block_byte_size();
700 |         let mut occupieds =
701 |             u64::from_le_bytes(self.buffer[block_start + 1..][..8].try_into().unwrap());
702 |         occupieds.update_bit((hash_bucket_idx % 64) as usize, value);
703 |         self.buffer[block_start + 1..][..8].copy_from_slice(&occupieds.to_le_bytes());
704 |     }
705 | 
706 |     #[inline(always)]
707 |     fn is_runend(&self, hash_bucket_idx: u64) -> bool {
708 |         let hash_bucket_idx = hash_bucket_idx % self.total_buckets();
709 |         let block_start = (hash_bucket_idx / 64) as usize * 
self.block_byte_size(); 710 | let runends = 711 | u64::from_le_bytes(self.buffer[block_start + 1 + 8..][..8].try_into().unwrap()); 712 | runends.is_bit_set((hash_bucket_idx % 64) as usize) 713 | } 714 | 715 | #[inline(always)] 716 | fn set_runend(&mut self, hash_bucket_idx: u64, value: bool) { 717 | let hash_bucket_idx = hash_bucket_idx % self.total_buckets(); 718 | let block_start = (hash_bucket_idx / 64) as usize * self.block_byte_size(); 719 | let mut runends = 720 | u64::from_le_bytes(self.buffer[block_start + 1 + 8..][..8].try_into().unwrap()); 721 | runends.update_bit((hash_bucket_idx % 64) as usize, value); 722 | self.buffer[block_start + 1 + 8..][..8].copy_from_slice(&runends.to_le_bytes()); 723 | } 724 | 725 | #[inline(always)] 726 | fn get_remainder(&self, hash_bucket_idx: u64) -> u64 { 727 | debug_assert!(self.rbits.get() > 0 && self.rbits.get() < 64); 728 | let hash_bucket_idx = hash_bucket_idx % self.total_buckets(); 729 | let remainders_start = (hash_bucket_idx / 64) as usize * self.block_byte_size() + 1 + 8 + 8; 730 | let start_bit_idx = self.rbits.usize() * (hash_bucket_idx % 64) as usize; 731 | let end_bit_idx = start_bit_idx + self.rbits.usize(); 732 | let start_u64 = start_bit_idx / 64; 733 | let num_rem_parts = 1 + (end_bit_idx > (start_u64 + 1) * 64) as usize; 734 | let rem_parts_bytes = &self.buffer[remainders_start + start_u64 * 8..][..num_rem_parts * 8]; 735 | let extra_low = start_bit_idx - start_u64 * 64; 736 | let extra_high = ((start_u64 + 1) * 64).saturating_sub(end_bit_idx); 737 | let rem_part = u64::from_le_bytes(rem_parts_bytes[..8].try_into().unwrap()); 738 | // zero high bits & truncate low bits 739 | let mut remainder = (rem_part << extra_high) >> (extra_high + extra_low); 740 | if let Some(rem_part) = rem_parts_bytes.get(8..16) { 741 | let remaining_bits = end_bit_idx - (start_u64 + 1) * 64; 742 | let rem_part = u64::from_le_bytes(rem_part.try_into().unwrap()); 743 | remainder |= 744 | (rem_part & !(u64::MAX << remaining_bits)) << (self.rbits.usize() - remaining_bits); 745 | } 746 | debug_assert!(remainder.leading_zeros() >= 64 - self.rbits.get() as u32); 747 | remainder 748 | } 749 | 750 | #[inline(always)] 751 | fn set_remainder(&mut self, hash_bucket_idx: u64, remainder: u64) { 752 | debug_assert!(self.rbits.get() > 0 && self.rbits.get() < 64); 753 | debug_assert!(remainder.leading_zeros() >= 64 - self.rbits.get() as u32); 754 | let hash_bucket_idx = hash_bucket_idx % self.total_buckets(); 755 | let remainders_start = (hash_bucket_idx / 64) as usize * self.block_byte_size() + 1 + 8 + 8; 756 | let start_bit_idx = self.rbits.usize() * (hash_bucket_idx % 64) as usize; 757 | let end_bit_idx = start_bit_idx + self.rbits.usize(); 758 | let start_u64 = start_bit_idx / 64; 759 | let num_rem_parts = 1 + (end_bit_idx > (start_u64 + 1) * 64) as usize; 760 | let rem_parts_bytes = 761 | &mut self.buffer[remainders_start + start_u64 * 8..][..num_rem_parts * 8]; 762 | let mut rem_part = u64::from_le_bytes(rem_parts_bytes[..8].try_into().unwrap()); 763 | let extra_low = start_bit_idx - start_u64 * 64; 764 | let extra_high = ((start_u64 + 1) * 64).saturating_sub(end_bit_idx); 765 | // zero region we'll copy remainder bits in 766 | rem_part &= !((u64::MAX << extra_low) & (u64::MAX >> extra_high)); 767 | let low_bits_to_copy = 64 - extra_high - extra_low; 768 | rem_part |= (remainder & !(u64::MAX << low_bits_to_copy)) << extra_low; 769 | rem_parts_bytes[..8].copy_from_slice(&rem_part.to_le_bytes()); 770 | if rem_parts_bytes.len() < 16 { 771 | return; 772 | } 773 | 774 | 
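        // Note: the remainder straddles two u64 words here. The low bits were written
        // into the first word above; the lines below write the remaining high bits
        // into the second word.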
let remaining_bits = end_bit_idx - (start_u64 + 1) * 64; 775 | rem_part = u64::from_le_bytes(rem_parts_bytes[8..16].try_into().unwrap()); 776 | // zero region we'll copy remainder bits in 777 | rem_part &= u64::MAX << remaining_bits; 778 | rem_part |= remainder >> (self.rbits.usize() - remaining_bits); 779 | rem_parts_bytes[8..16].copy_from_slice(&rem_part.to_le_bytes()); 780 | } 781 | 782 | #[inline] 783 | fn get_rem_u64(&self, rem_u64: u64) -> u64 { 784 | let rbits = NonZeroU64::from(self.rbits); 785 | let bucket_block_idx = (rem_u64 / rbits) % self.total_blocks(); 786 | let bucket_rem_u64 = (rem_u64 % rbits) as usize; 787 | let bucket_rem_start = (bucket_block_idx as usize * self.block_byte_size()) + 1 + 8 + 8; 788 | u64::from_le_bytes( 789 | self.buffer[bucket_rem_start + bucket_rem_u64 * 8..][..8] 790 | .try_into() 791 | .unwrap(), 792 | ) 793 | } 794 | 795 | #[inline] 796 | fn set_rem_u64(&mut self, rem_u64: u64, rem: u64) { 797 | let rbits = NonZeroU64::from(self.rbits); 798 | let bucket_block_idx = (rem_u64 / rbits) % self.total_blocks(); 799 | let bucket_rem_u64 = (rem_u64 % rbits) as usize; 800 | let bucket_rem_start = (bucket_block_idx as usize * self.block_byte_size()) + 1 + 8 + 8; 801 | self.buffer[bucket_rem_start + bucket_rem_u64 * 8..][..8] 802 | .copy_from_slice(&rem.to_le_bytes()); 803 | } 804 | 805 | fn shift_remainders_by_1(&mut self, start: u64, end_inc: u64) { 806 | let end = if end_inc < start { 807 | end_inc + self.total_buckets().get() + 1 808 | } else { 809 | end_inc + 1 810 | }; 811 | let mut end_u64 = end * self.rbits.u64() / 64; 812 | let mut bend = (end * self.rbits.u64() % 64) as usize; 813 | let start_u64 = start * self.rbits.u64() / 64; 814 | let bstart = (start * self.rbits.u64() % 64) as usize; 815 | while end_u64 != start_u64 { 816 | let prev_rem_u64 = self.get_rem_u64(end_u64 - 1); 817 | let mut rem_u64 = self.get_rem_u64(end_u64); 818 | rem_u64 = prev_rem_u64.shift_right(self.rbits.usize(), &rem_u64, 0, bend); 819 | self.set_rem_u64(end_u64, rem_u64); 820 | end_u64 -= 1; 821 | bend = 64; 822 | } 823 | let mut rem_u64 = self.get_rem_u64(start_u64); 824 | rem_u64 = 0u64.shift_right(self.rbits.usize(), &rem_u64, bstart, bend); 825 | self.set_rem_u64(start_u64, rem_u64); 826 | } 827 | 828 | fn shift_remainders_back_by_1(&mut self, start: u64, end_inc: u64) { 829 | let end = if end_inc < start { 830 | end_inc + self.total_buckets().get() + 1 831 | } else { 832 | end_inc + 1 833 | }; 834 | let end_u64 = end * self.rbits.u64() / 64; 835 | let bend = (end * self.rbits.u64() % 64) as usize; 836 | let mut start_u64 = start * self.rbits.u64() / 64; 837 | let mut bstart = (start * self.rbits.u64() % 64) as usize; 838 | while end_u64 != start_u64 { 839 | let next_rem_u64 = self.get_rem_u64(start_u64 + 1); 840 | let mut rem_u64 = self.get_rem_u64(start_u64); 841 | rem_u64 = next_rem_u64.shift_left(self.rbits.usize(), &rem_u64, bstart, 64); 842 | self.set_rem_u64(start_u64, rem_u64); 843 | start_u64 += 1; 844 | bstart = 0; 845 | } 846 | let mut rem_u64 = self.get_rem_u64(end_u64); 847 | rem_u64 = 0u64.shift_left(self.rbits.usize(), &rem_u64, bstart, bend); 848 | self.set_rem_u64(end_u64, rem_u64); 849 | } 850 | 851 | fn shift_runends_by_1(&mut self, start: u64, end_inc: u64) { 852 | let end = if end_inc < start { 853 | end_inc + self.total_buckets().get() + 1 854 | } else { 855 | end_inc + 1 856 | }; 857 | let mut end_block = end / 64; 858 | let mut bend = (end % 64) as usize; 859 | let start_block = start / 64; 860 | let bstart = (start % 64) as usize; 861 | while 
end_block != start_block {
862 |             let prev_block_runends = self.raw_block(end_block - 1).runends;
863 |             let mut block_runends = self.raw_block(end_block).runends;
864 |             block_runends = prev_block_runends.shift_right(1, &block_runends, 0, bend);
865 |             self.set_block_runends(end_block, block_runends);
866 |             end_block -= 1;
867 |             bend = 64;
868 |         }
869 |         let mut block_runends = self.raw_block(start_block).runends;
870 |         block_runends = 0u64.shift_right(1, &block_runends, bstart, bend);
871 |         self.set_block_runends(start_block, block_runends);
872 |     }
873 | 
874 |     fn shift_runends_back_by_1(&mut self, start: u64, end_inc: u64) {
875 |         let end = if end_inc < start {
876 |             end_inc + self.total_buckets().get() + 1
877 |         } else {
878 |             end_inc + 1
879 |         };
880 |         let end_block = end / 64;
881 |         let bend = (end % 64) as usize;
882 |         let mut start_block = start / 64;
883 |         let mut bstart = (start % 64) as usize;
884 |         while start_block != end_block {
885 |             let next_block_runends = self.raw_block(start_block + 1).runends;
886 |             let mut block_runends = self.raw_block(start_block).runends;
887 |             block_runends = next_block_runends.shift_left(1, &block_runends, bstart, 64);
888 |             self.set_block_runends(start_block, block_runends);
889 |             start_block += 1;
890 |             bstart = 0;
891 |         }
892 |         let mut block_runends = self.raw_block(end_block).runends;
893 |         block_runends = 0u64.shift_left(1, &block_runends, bstart, bend);
894 |         self.set_block_runends(end_block, block_runends);
895 |     }
896 | 
897 |     #[cold]
898 |     #[inline(never)]
899 |     fn calc_offset(&self, block_num: u64) -> u64 {
900 |         // The block offset can be calculated as the difference between its position and runstart.
901 |         let block_start = (block_num * 64) % self.total_buckets();
902 |         let mut run_start = self.run_start(block_start);
903 |         if run_start < block_start {
904 |             run_start += self.total_buckets().get();
905 |         }
906 |         run_start - block_start
907 |     }
908 | 
909 |     /// Start idx of the run (inclusive)
910 |     #[inline]
911 |     fn run_start(&self, hash_bucket_idx: u64) -> u64 {
912 |         // runstart is equivalent to the runend of the previous bucket + 1.
913 |         let prev_bucket = hash_bucket_idx.wrapping_sub(1) % self.total_buckets();
914 |         (self.run_end(prev_bucket) + 1) % self.total_buckets()
915 |     }
916 | 
917 |     /// End idx of the run (inclusive).
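    ///
    /// An illustrative sketch (not actual data) of the rank+select search:
    ///
    /// ```text
    /// buckets:    0  1  2  3  4  5 ...
    /// occupieds:  0  1  0  1  0  0 ...   (buckets 1 and 3 have runs)
    /// runends:    0  0  1  0  0  1 ...   (their runs end at slots 2 and 5)
    /// ```
    ///
    /// Bucket 1 is the 1st occupied bucket (rank 0), so its run ends at the 1st runend
    /// bit (slot 2); bucket 3 is the 2nd, ending at the 2nd runend bit (slot 5).
    /// The search below ranks the occupied bits up to `hash_bucket_idx` and then
    /// selects the matching runend bit, scanning forward block by block.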
918 |     fn run_end(&self, hash_bucket_idx: u64) -> u64 {
919 |         let hash_bucket_idx = hash_bucket_idx % self.total_buckets();
920 |         let bucket_block_idx = hash_bucket_idx / 64;
921 |         let bucket_intrablock_offset = hash_bucket_idx % 64;
922 |         let bucket_block = self.block(bucket_block_idx);
923 |         let bucket_intrablock_rank = bucket_block.occupieds.popcnt(..=bucket_intrablock_offset);
924 |         // No occupied buckets all the way to bucket_intrablock_offset
925 |         // which also means hash_bucket_idx isn't occupied
926 |         if bucket_intrablock_rank == 0 {
927 |             return if bucket_block.offset <= bucket_intrablock_offset {
928 |                 // hash_bucket_idx points to an empty bucket unaffected by block offset,
929 |                 // thus end == start
930 |                 hash_bucket_idx
931 |             } else {
932 |                 // hash_bucket_idx falls within the section occupied by the offset,
933 |                 // thus end == last bucket of offset section
934 |                 (bucket_block_idx * 64 + bucket_block.offset - 1) % self.total_buckets()
935 |             };
936 |         }
937 | 
938 |         // Must search runends to figure out the end of the run
939 |         let mut runend_block_idx = bucket_block_idx + bucket_block.offset / 64;
940 |         let mut runend_ignore_bits = bucket_block.offset % 64;
941 |         let mut runend_block = self.raw_block(runend_block_idx);
942 |         // Try to find the runend for the bucket in this block.
943 |         // We're looking for the runend_rank'th bit set (0 based)
944 |         let mut runend_rank = bucket_intrablock_rank - 1;
945 |         let mut runend_block_offset = runend_block
946 |             .runends
947 |             .select(runend_ignore_bits.., runend_rank);
948 | 
949 |         if let Some(runend_block_offset) = runend_block_offset {
950 |             let runend_idx = runend_block_idx * 64 + runend_block_offset;
951 |             return runend_idx.max(hash_bucket_idx) % self.total_buckets();
952 |         }
953 |         // There were not enough runend bits set, keep looking...
954 |         loop {
955 |             // subtract any runend bits found
956 |             runend_rank -= runend_block.runends.popcnt(runend_ignore_bits..);
957 |             // move to the next block
958 |             runend_block_idx += 1;
959 |             runend_ignore_bits = 0;
960 |             runend_block = self.raw_block(runend_block_idx);
961 |             runend_block_offset = runend_block
962 |                 .runends
963 |                 .select(runend_ignore_bits.., runend_rank);
964 | 
965 |             if let Some(runend_block_offset) = runend_block_offset {
966 |                 let runend_idx = runend_block_idx * 64 + runend_block_offset;
967 |                 return runend_idx.max(hash_bucket_idx) % self.total_buckets();
968 |             }
969 |         }
970 |     }
971 | 
972 |     /// Returns whether item is present (probabilistically) in the filter.
973 |     pub fn contains<T: Hash>(&self, item: T) -> bool {
974 |         self.contains_fingerprint(self.hash(item))
975 |     }
976 | 
977 |     /// Returns whether the fingerprint is present (probabilistically) in the filter.
978 |     pub fn contains_fingerprint(&self, hash: u64) -> bool {
979 |         let (hash_bucket_idx, hash_remainder) = self.calc_qr(hash);
980 |         if !self.is_occupied(hash_bucket_idx) {
981 |             return false;
982 |         }
983 |         let mut runstart_idx = self.run_start(hash_bucket_idx);
984 |         loop {
985 |             if hash_remainder == self.get_remainder(runstart_idx) {
986 |                 return true;
987 |             }
988 |             if self.is_runend(runstart_idx) {
989 |                 return false;
990 |             }
991 |             runstart_idx += 1;
992 |         }
993 |     }
994 | 
995 |     /// Returns the number of times the item appears (probabilistically) in the filter.
996 |     pub fn count<T: Hash>(&mut self, item: T) -> u64 {
997 |         self.count_fingerprint(self.hash(item))
998 |     }
999 | 
1000 |     /// Returns the number of times the fingerprint appears (probabilistically) in the filter.
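    ///
    /// A counting sketch:
    ///
    /// ```rust
    /// let mut f = qfilter::Filter::new(100, 0.01).unwrap();
    /// f.insert_fingerprint(true, 42).unwrap();
    /// f.insert_fingerprint(true, 42).unwrap();
    /// // Collisions can only inflate the estimate, so it's always >= the true count.
    /// assert!(f.count_fingerprint(42) >= 2);
    /// ```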
1001 |     pub fn count_fingerprint(&mut self, hash: u64) -> u64 {
1002 |         let (hash_bucket_idx, hash_remainder) = self.calc_qr(hash);
1003 |         if !self.is_occupied(hash_bucket_idx) {
1004 |             return 0;
1005 |         }
1006 | 
1007 |         let mut count = 0u64;
1008 |         let mut runstart_idx = self.run_start(hash_bucket_idx);
1009 |         loop {
1010 |             if hash_remainder == self.get_remainder(runstart_idx) {
1011 |                 count += 1;
1012 |             }
1013 |             if self.is_runend(runstart_idx) {
1014 |                 return count;
1015 |             }
1016 |             runstart_idx += 1;
1017 |         }
1018 |     }
1019 | 
1020 |     #[inline]
1021 |     fn offset_lower_bound(&self, hash_bucket_idx: u64) -> u64 {
1022 |         let bucket_block_idx = hash_bucket_idx / 64;
1023 |         let bucket_intrablock_offset = hash_bucket_idx % 64;
1024 |         let bucket_block = self.raw_block(bucket_block_idx);
1025 |         let num_occupied = bucket_block.occupieds.popcnt(..=bucket_intrablock_offset);
1026 |         if bucket_block.offset <= bucket_intrablock_offset {
1027 |             num_occupied
1028 |                 - bucket_block
1029 |                     .runends
1030 |                     .popcnt(bucket_block.offset..bucket_intrablock_offset)
1031 |         } else {
1032 |             bucket_block.offset + num_occupied - bucket_intrablock_offset
1033 |         }
1034 |     }
1035 | 
1036 |     fn find_first_empty_slot(&self, mut hash_bucket_idx: u64) -> u64 {
1037 |         loop {
1038 |             let olb = self.offset_lower_bound(hash_bucket_idx);
1039 |             if olb == 0 {
1040 |                 return hash_bucket_idx % self.total_buckets();
1041 |             }
1042 |             hash_bucket_idx += olb;
1043 |         }
1044 |     }
1045 | 
1046 |     fn find_first_not_shifted_slot(&self, mut hash_bucket_idx: u64) -> u64 {
1047 |         loop {
1048 |             let run_end = self.run_end(hash_bucket_idx);
1049 |             if run_end == hash_bucket_idx {
1050 |                 return hash_bucket_idx;
1051 |             }
1052 |             hash_bucket_idx = run_end;
1053 |         }
1054 |     }
1055 | 
1056 |     /// Removes `item` from the filter.
1057 |     /// Returns whether item was actually found and removed.
1058 |     ///
1059 |     /// Note that removing an item that wasn't previously added to the filter
1060 |     /// may introduce **false negatives**. This is because it could be removing
1061 |     /// fingerprints from a colliding item!
1062 |     pub fn remove<T: Hash>(&mut self, item: T) -> bool {
1063 |         self.remove_fingerprint(self.hash(item))
1064 |     }
1065 | 
1066 |     /// Removes the fingerprint specified by `hash` from the filter.
1067 |     /// Returns whether a fingerprint was actually found and removed.
1068 |     ///
1069 |     /// Note that removing a fingerprint that wasn't previously added to the filter
1070 |     /// may introduce false negatives. This is because it could be removing
1071 |     /// fingerprints from a colliding hash!
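    ///
    /// A removal sketch using the fingerprint API:
    ///
    /// ```rust
    /// let mut f = qfilter::Filter::new(100, 0.01).unwrap();
    /// let hash = 0xABCDu64;
    /// f.insert_fingerprint(false, hash).unwrap();
    /// assert!(f.remove_fingerprint(hash));
    /// // A second removal finds nothing left to remove.
    /// assert!(!f.remove_fingerprint(hash));
    /// ```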
1072 | pub fn remove_fingerprint(&mut self, hash: u64) -> bool { 1073 | let (hash_bucket_idx, hash_remainder) = self.calc_qr(hash); 1074 | if !self.is_occupied(hash_bucket_idx) { 1075 | return false; 1076 | } 1077 | let mut run_start = self.run_start(hash_bucket_idx); 1078 | // adjust run_start so we can have 1079 | // hash_bucket_idx <= run_start <= found_idx <= run_end 1080 | if run_start < hash_bucket_idx { 1081 | run_start += self.total_buckets().get(); 1082 | } 1083 | let mut run_end = run_start; 1084 | let mut found_idx = None; 1085 | let found_idx = loop { 1086 | if hash_remainder == self.get_remainder(run_end) { 1087 | found_idx = Some(run_end); 1088 | } 1089 | if self.is_runend(run_end) { 1090 | if let Some(i) = found_idx { 1091 | break i; 1092 | } else { 1093 | return false; 1094 | }; 1095 | } 1096 | run_end += 1; 1097 | }; 1098 | 1099 | let mut last_bucket_shifted_run_end = run_end; 1100 | if last_bucket_shifted_run_end != hash_bucket_idx { 1101 | last_bucket_shifted_run_end = self.find_first_not_shifted_slot(run_end); 1102 | if last_bucket_shifted_run_end < run_end { 1103 | last_bucket_shifted_run_end += self.total_buckets().get(); 1104 | } 1105 | } 1106 | 1107 | // run_end points to the end of the run (inclusive) which contains the target remainder (found_idx). 1108 | // If we had a single remainder in the run, the run is no more. 1109 | if run_end == run_start { 1110 | self.set_occupied(hash_bucket_idx, false); 1111 | } else { 1112 | // More than one remainder in the run. 1113 | // If the removed remainder is the last one in the run, 1114 | // the one before it becomes the new runend. 1115 | if found_idx == run_end { 1116 | self.set_runend(run_end - 1, true); 1117 | } 1118 | } 1119 | if found_idx != last_bucket_shifted_run_end { 1120 | self.set_remainder(found_idx, 0); 1121 | self.shift_remainders_back_by_1(found_idx, last_bucket_shifted_run_end); 1122 | self.shift_runends_back_by_1(found_idx, last_bucket_shifted_run_end); 1123 | } 1124 | self.set_runend(last_bucket_shifted_run_end, false); 1125 | self.set_remainder(last_bucket_shifted_run_end, 0); 1126 | self.dec_offsets(hash_bucket_idx, last_bucket_shifted_run_end); 1127 | self.len -= 1; 1128 | true 1129 | } 1130 | 1131 | /// Inserts `item` in the filter, even if it already appears to be in the filter. 1132 | /// This works by inserting a possibly duplicated fingerprint in the filter. 1133 | /// 1134 | /// This function should be used when the filter is also subject to removals 1135 | /// and the item is known to not have been added to the filter before (or was removed). 1136 | /// 1137 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 1138 | #[inline] 1139 | pub fn insert_duplicated<T: Hash>(&mut self, item: T) -> Result<(), Error> { 1140 | self.insert_counting(u64::MAX, item).map(|_| ()) 1141 | } 1142 | 1143 | /// Inserts `item` in the filter if it's not already present (probabilistically). 1144 | /// Note that membership is probabilistic, so this function may return false positives 1145 | /// but never false negatives. 1146 | /// 1147 | /// Returns `Ok(true)` if the item was successfully added to the filter. 1148 | /// Returns `Ok(false)` if the item is already contained (probabilistically) in the filter. 1149 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 
1150 | #[inline] 1151 | pub fn insert<T: Hash>(&mut self, item: T) -> Result<bool, Error> { 1152 | self.insert_counting(1, item).map(|count| count == 0) 1153 | } 1154 | 1155 | /// Inserts `item` in the filter, even if it already appears to be in the filter. 1156 | /// This works by inserting a possibly duplicated fingerprint in the filter. 1157 | /// The argument `max_count` specifies how many duplicates can be inserted. 1158 | /// 1159 | /// Returns `Ok(count)` of how many equal fingerprints _were_ in the filter. So if the item 1160 | /// was already in the filter `C` times, another insertion was performed only if `C < max_count`. 1161 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 1162 | pub fn insert_counting<T: Hash>(&mut self, max_count: u64, item: T) -> Result<u64, Error> { 1163 | let hash = self.hash(item); 1164 | match self.insert_impl(max_count, hash) { 1165 | Ok(count) => Ok(count), 1166 | Err(_) => { 1167 | self.grow_if_possible()?; 1168 | self.insert_impl(max_count, hash) 1169 | } 1170 | } 1171 | } 1172 | 1173 | /// Inserts the fingerprint specified by `hash` in the filter. 1174 | /// `duplicate` specifies if the fingerprint should be added even if it's already in the filter. 1175 | /// 1176 | /// Note that this function will automatically grow the filter if needed. 1177 | /// The implementation uses the first [`Self::fingerprint_size`] bits of `hash` to place the fingerprint in the appropriate slot. 1178 | /// The remaining bits are ignored and will be returned as 0 if the fingerprint is retrieved via [`Self::fingerprints`]. 1179 | /// 1180 | /// Returns `Ok(true)` if the item was successfully added to the filter. 1181 | /// Returns `Ok(false)` if the item is already contained (probabilistically) in the filter. Only possible if `duplicate` is `false`. 1182 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 1183 | #[inline] 1184 | pub fn insert_fingerprint(&mut self, duplicate: bool, hash: u64) -> Result<bool, Error> { 1185 | let max_count = if duplicate { u64::MAX } else { 1 }; 1186 | self.insert_fingerprint_counting(max_count, hash) 1187 | .map(|count| count < max_count) 1188 | } 1189 | 1190 | /// Inserts the fingerprint specified by `hash` in the filter. 1191 | /// `max_count` specifies how many occurrences of the fingerprint can be added to the filter. 1192 | /// 1193 | /// Note that this function will automatically grow the filter if needed. 1194 | /// The implementation uses the first [`Self::fingerprint_size`] bits of `hash` to place the fingerprint in the appropriate slot. 1195 | /// The remaining bits are ignored and will be returned as 0 if the fingerprint is retrieved via [`Self::fingerprints`]. 1196 | /// 1197 | /// Returns `Ok(count)` of how many equal fingerprints _were_ in the filter. So if the item 1198 | /// was already in the filter `C` times, another insertion was performed only if `C < max_count`. 1199 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 1200 | pub fn insert_fingerprint_counting(&mut self, max_count: u64, hash: u64) -> Result<u64, Error> { 1201 | match self.insert_impl(max_count, hash) { 1202 | Ok(count) => Ok(count), 1203 | Err(_) => { 1204 | self.grow_if_possible()?; 1205 | self.insert_impl(max_count, hash) 1206 | } 1207 | } 1208 | } 1209 | 1210 | /// Inserts the fingerprint specified by `hash` in the filter. 1211 | /// `max_count` specifies how many occurrences of the fingerprint can be added to the filter. 1212 | /// It's up to the caller to grow the filter if needed and retry the insert. 
1213 | /// 1214 | /// Returns `Ok(count)` of how many equal fingerprints _were_ in the filter. 1215 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot admit the new item. 1216 | fn insert_impl(&mut self, max_count: u64, hash: u64) -> Result<u64, Error> { 1217 | enum Operation { 1218 | NewRun, 1219 | BeforeRunend, 1220 | NewRunend, 1221 | } 1222 | 1223 | let (hash_bucket_idx, hash_remainder) = self.calc_qr(hash); 1224 | if self.offset_lower_bound(hash_bucket_idx) == 0 { 1225 | if self.len >= self.capacity() { 1226 | return Err(Error::CapacityExceeded); 1227 | } 1228 | debug_assert!(!self.is_occupied(hash_bucket_idx)); 1229 | debug_assert!(!self.is_runend(hash_bucket_idx)); 1230 | self.set_occupied(hash_bucket_idx, true); 1231 | self.set_runend(hash_bucket_idx, true); 1232 | self.set_remainder(hash_bucket_idx, hash_remainder); 1233 | self.len += 1; 1234 | return Ok(0); 1235 | } 1236 | 1237 | let mut runstart_idx = self.run_start(hash_bucket_idx); 1238 | let mut runend_idx = self.run_end(hash_bucket_idx); 1239 | let mut fingerprint_count = 0; 1240 | let insert_idx; 1241 | let operation; 1242 | if self.is_occupied(hash_bucket_idx) { 1243 | // adjust runend so it's >= runstart even if it wrapped around 1244 | if runend_idx < runstart_idx { 1245 | runend_idx += self.total_buckets().get(); 1246 | } 1247 | while runstart_idx <= runend_idx { 1248 | match self.get_remainder(runstart_idx).cmp(&hash_remainder) { 1249 | Ordering::Equal => { 1250 | fingerprint_count += 1; 1251 | if fingerprint_count >= max_count { 1252 | return Ok(fingerprint_count); 1253 | } 1254 | } 1255 | Ordering::Greater => break, 1256 | Ordering::Less => (), 1257 | } 1258 | 1259 | runstart_idx += 1; 1260 | } 1261 | 1262 | if runstart_idx > runend_idx { 1263 | /* the new remainder is >= any remainder in the run */ 1264 | operation = Operation::NewRunend; 1265 | insert_idx = runstart_idx % self.total_buckets(); 1266 | } else { 1267 | /* there are larger remainders already in the run */ 1268 | operation = Operation::BeforeRunend; /* insert before the current runend */ 1269 | insert_idx = runstart_idx % self.total_buckets(); 1270 | } 1271 | } else { 1272 | insert_idx = (runend_idx + 1) % self.total_buckets(); 1273 | operation = Operation::NewRun; /* insert into empty bucket */ 1274 | } 1275 | 1276 | if self.len >= self.capacity() { 1277 | return Err(Error::CapacityExceeded); 1278 | } 1279 | let empty_slot_idx = self.find_first_empty_slot(runend_idx + 1); 1280 | if insert_idx != empty_slot_idx { 1281 | self.shift_remainders_by_1(insert_idx, empty_slot_idx); 1282 | self.shift_runends_by_1(insert_idx, empty_slot_idx); 1283 | } 1284 | self.set_remainder(insert_idx, hash_remainder); 1285 | match operation { 1286 | Operation::NewRun => { 1287 | /* insert into empty bucket */ 1288 | self.set_runend(insert_idx, true); 1289 | self.set_occupied(hash_bucket_idx, true); 1290 | } 1291 | Operation::NewRunend => { 1292 | /* the new remainder is >= any remainder in the run */ 1293 | self.set_runend(insert_idx.wrapping_sub(1) % self.total_buckets(), false); 1294 | self.set_runend(insert_idx, true); 1295 | } 1296 | Operation::BeforeRunend => { /* there are larger remainders already in the run */ } 1297 | } 1298 | 1299 | self.inc_offsets(hash_bucket_idx, empty_slot_idx); 1300 | self.len += 1; 1301 | Ok(fingerprint_count) 1302 | } 1303 | 1304 | /// Returns an iterator over the fingerprints stored in the filter. 1305 | /// 1306 | /// Fingerprints will be returned in ascending order. 
1307 | pub fn fingerprints(&self) -> FingerprintIter { 1308 | FingerprintIter::new(self) 1309 | } 1310 | 1311 | /// Shrinks the capacity of the filter as much as possible while preserving 1312 | /// the false positive ratios and fingerprint size. 1313 | pub fn shrink_to_fit(&mut self) { 1314 | if self.total_blocks().get() > 1 && self.len() <= self.capacity() / 2 { 1315 | let mut new = Self::with_qr( 1316 | (self.qbits.get() - 1).try_into().unwrap(), 1317 | (self.rbits.get() + 1).try_into().unwrap(), 1318 | ) 1319 | .unwrap(); 1320 | new.max_qbits = self.max_qbits; 1321 | for hash in self.fingerprints() { 1322 | let _ = new.insert_fingerprint(true, hash); 1323 | } 1324 | debug_assert_eq!(new.len, self.len); 1325 | debug_assert_eq!(new.fingerprint_size(), self.fingerprint_size()); 1326 | *self = new; 1327 | } 1328 | } 1329 | 1330 | /// Merges `other` filter into `self`. 1331 | /// 1332 | /// `keep_duplicates` specifies whether duplicated fingerprints should be stored; 1333 | /// this is normally only useful if the filter is being used for counting. 1334 | /// 1335 | /// Note that the `other` filter must have a fingerprint size >= `self`'s fingerprint size, 1336 | /// otherwise the function will fail with `Err(Error::IncompatibleFingerprintSize)`. 1337 | /// This is the case for filters created with the same parameters or if the `other` 1338 | /// filter has a lower target false positive ratio. 1339 | /// 1340 | /// Returns `Err(Error::CapacityExceeded)` if the filter cannot merge all items. 1341 | /// Note that in this case items could have already been added and the filter is left 1342 | /// full but in an otherwise valid state. 1343 | pub fn merge(&mut self, keep_duplicates: bool, other: &Self) -> Result<(), Error> { 1344 | if other.fingerprint_size() < self.fingerprint_size() { 1345 | return Err(Error::IncompatibleFingerprintSize); 1346 | } 1347 | let max_count = if keep_duplicates { u64::MAX } else { 1 }; 1348 | for hash in other.fingerprints() { 1349 | self.insert_impl(max_count, hash)?; 1350 | } 1351 | Ok(()) 1352 | } 1353 | 1354 | #[inline] 1355 | fn grow_if_possible(&mut self) -> Result<(), Error> { 1356 | if let Some(m) = self.max_qbits { 1357 | if m > self.qbits { 1358 | self.grow(); 1359 | return Ok(()); 1360 | } 1361 | } 1362 | Err(Error::CapacityExceeded) 1363 | } 1364 | 1365 | #[cold] 1366 | #[inline(never)] 1367 | fn grow(&mut self) { 1368 | let qbits = self.qbits.checked_add(1).unwrap(); 1369 | let rbits = NonZeroU8::new(self.rbits.get() - 1).unwrap(); 1370 | let mut new = Self::with_qr(qbits, rbits).unwrap(); 1371 | new.max_qbits = self.max_qbits; 1372 | for hash in self.fingerprints() { 1373 | new.insert_fingerprint(true, hash).unwrap(); 1374 | } 1375 | assert_eq!(self.len, new.len); 1376 | *self = new; 1377 | } 1378 | 1379 | #[inline] 1380 | fn hash<T: Hash>(&self, item: T) -> u64 { 1381 | let mut hasher = StableHasher::new(); 1382 | item.hash(&mut hasher); 1383 | hasher.finish() 1384 | } 1385 | 1386 | #[inline] 1387 | fn calc_qr(&self, hash: u64) -> (u64, u64) { 1388 | let hash_bucket_idx = (hash >> self.rbits.get()) & ((1 << self.qbits.get()) - 1); 1389 | let remainder = hash & ((1 << self.rbits.get()) - 1); 1390 | (hash_bucket_idx, remainder) 1391 | } 1392 | 1393 | #[inline] 1394 | fn total_blocks(&self) -> NonZeroU64 { 1395 | // The way this is calculated ensures the compiler sees that the result is both != 0 and a power of 2, 1396 | // both of which allow the optimizer to generate much faster division/remainder code. 
1397 | #[cfg(any(debug_assertions, fuzzing))] 1398 | { 1399 | NonZeroU64::new((1u64 << self.qbits.get()) / 64).unwrap() 1400 | } 1401 | #[cfg(not(any(debug_assertions, fuzzing)))] 1402 | { 1403 | // Safety: All filters have at least 1 block (each with 64 slots) 1404 | unsafe { NonZeroU64::new_unchecked((1u64 << self.qbits.get()) / 64) } 1405 | } 1406 | } 1407 | 1408 | #[inline] 1409 | fn total_buckets(&self) -> NonZeroU64 { 1410 | NonZeroU64::new(1 << self.qbits.get()).unwrap() 1411 | } 1412 | 1413 | #[doc(hidden)] 1414 | #[cfg(any(fuzzing, test))] 1415 | pub fn printout(&self) { 1416 | eprintln!( 1417 | "=== q {} r {} len {} cap {} ===", 1418 | self.qbits, 1419 | self.rbits, 1420 | self.len(), 1421 | self.capacity() 1422 | ); 1423 | for b in 0..self.total_blocks().get() { 1424 | let block = self.raw_block(b); 1425 | eprintln!( 1426 | "block {} offset {:?}\noccup {:064b}\nrunen {:064b}", 1427 | b, block.offset, block.occupieds, block.runends 1428 | ); 1429 | eprintln!( 1430 | " 3210987654321098765432109876543210987654321098765432109876543210 {}", 1431 | b * 64 1432 | ); 1433 | eprint!("rem "); 1434 | for i in (0..64).rev() { 1435 | let r = self.get_remainder(b * 64 + i); 1436 | eprint!("{}", r % 100 / 10); 1437 | } 1438 | eprint!("\nrem "); 1439 | for i in (0..64).rev() { 1440 | let r = self.get_remainder(b * 64 + i); 1441 | eprint!("{}", r % 10); 1442 | } 1443 | eprintln!(); // keep all printout output on stderr 1444 | } 1445 | eprintln!("==="); 1446 | } 1447 | } 1448 | 1449 | impl std::fmt::Debug for Filter { 1450 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 1451 | f.debug_struct("Filter") 1452 | .field("buffer", &"[..]") 1453 | .field("len", &self.len) 1454 | .field("qbits", &self.qbits) 1455 | .field("rbits", &self.rbits) 1456 | .field("max_qbits", &self.max_qbits) 1457 | .finish() 1458 | } 1459 | } 1460 | 1461 | #[cfg(test)] 1462 | mod tests { 1463 | use super::*; 1464 | 1465 | #[test] 1466 | fn run_end_simple() { 1467 | let mut f = Filter::new(50, 0.01).unwrap(); 1468 | f.set_occupied(5, true); 1469 | f.set_runend(5, true); 1470 | assert_eq!(f.run_end(4), 4); 1471 | assert_eq!(f.run_end(5), 5); 1472 | assert_eq!(f.run_end(6), 6); 1473 | 1474 | f.set_occupied(6, true); 1475 | f.set_runend(6, true); 1476 | assert_eq!(f.run_end(4), 4); 1477 | assert_eq!(f.run_end(5), 5); 1478 | assert_eq!(f.run_end(6), 6); 1479 | 1480 | f.set_runend(6, false); 1481 | f.set_runend(7, true); 1482 | assert_eq!(f.run_end(4), 4); 1483 | assert_eq!(f.run_end(5), 5); 1484 | assert_eq!(f.run_end(6), 7); 1485 | 1486 | f.set_runend(7, false); 1487 | f.set_runend(8, true); 1488 | assert_eq!(f.run_end(4), 4); 1489 | assert_eq!(f.run_end(5), 5); 1490 | assert_eq!(f.run_end(6), 8); 1491 | 1492 | f.set_occupied(10, true); 1493 | f.set_runend(12, true); 1494 | f.set_occupied(12, true); 1495 | f.set_runend(13, true); 1496 | assert_eq!(f.run_end(10), 12); 1497 | assert_eq!(f.run_end(12), 13); 1498 | 1499 | f.set_occupied(11, true); 1500 | f.set_runend(14, true); 1501 | assert_eq!(f.run_end(10), 12); 1502 | assert_eq!(f.run_end(11), 13); 1503 | assert_eq!(f.run_end(12), 14); 1504 | } 1505 | 1506 | #[test] 1507 | fn run_end_eob() { 1508 | let mut f = Filter::new(50, 0.01).unwrap(); 1509 | assert_eq!(f.total_buckets().get(), 64); 1510 | f.set_occupied(63, true); 1511 | f.set_runend(63, true); 1512 | assert_eq!(f.run_end(62), 62); 1513 | assert_eq!(f.run_end(63), 63); 1514 | assert_eq!(f.find_first_empty_slot(62), 62); 1515 | assert_eq!(f.find_first_empty_slot(63), 0); 1516 | } 1517 | 1518 | #[test] 1519 | fn run_end_crossing() { 
1520 | let mut f = Filter::new(50, 0.01).unwrap(); 1521 | f.set_occupied(0, true); 1522 | f.set_runend(0, true); 1523 | f.set_occupied(63, true); 1524 | f.set_runend(63, true); 1525 | assert_eq!(f.run_end(0), 0); 1526 | assert_eq!(f.run_end(1), 1); 1527 | assert_eq!(f.run_end(62), 62); 1528 | assert_eq!(f.run_end(63), 63); 1529 | 1530 | f.set_runend(63, false); 1531 | f.set_runend(1, true); 1532 | f.adjust_block_offset(1, true); 1533 | assert_eq!(f.run_end(0), 1); 1534 | assert_eq!(f.run_end(1), 1); 1535 | assert_eq!(f.run_end(62), 62); 1536 | assert_eq!(f.run_end(63), 0); 1537 | 1538 | f.set_runend(1, false); 1539 | f.set_runend(2, true); 1540 | assert_eq!(f.run_end(63), 0); 1541 | assert_eq!(f.run_end(0), 2); 1542 | assert_eq!(f.run_end(1), 2); 1543 | 1544 | f.set_runend(2, false); 1545 | f.set_runend(3, true); 1546 | assert_eq!(f.run_end(63), 0); 1547 | assert_eq!(f.run_end(1), 3); 1548 | assert_eq!(f.run_end(2), 3); 1549 | 1550 | f.set_occupied(65, true); 1551 | f.set_runend(68, true); 1552 | assert_eq!(f.run_end(63), 0); 1553 | assert_eq!(f.run_end(0), 3); 1554 | assert_eq!(f.run_end(1), 4); 1555 | } 1556 | 1557 | #[test] 1558 | fn test_insert_duplicated() { 1559 | for cap in [100, 200, 500, 1000] { 1560 | let mut f = Filter::new(cap, 0.01).unwrap(); 1561 | for i in 0..f.capacity() / 2 { 1562 | f.insert_duplicated(-1).unwrap(); 1563 | f.insert_duplicated(i).unwrap(); 1564 | assert!(f.count(-1) >= i); 1565 | assert!(f.count(i) >= 1); 1566 | } 1567 | } 1568 | } 1569 | 1570 | #[test] 1571 | fn test_insert_duplicated_two() { 1572 | for s in 0..10 { 1573 | for c in [200, 800, 1500] { 1574 | let mut f = Filter::new(c, 0.001).unwrap(); 1575 | for i in 0..f.capacity() / 2 { 1576 | f.insert_duplicated(-1).unwrap(); 1577 | assert_eq!(f.count(-1), i + 1); 1578 | assert_eq!(f.count(s), i); 1579 | f.insert_duplicated(s).unwrap(); 1580 | assert_eq!(f.count(-1), i + 1); 1581 | assert_eq!(f.count(s), i + 1); 1582 | } 1583 | } 1584 | } 1585 | } 1586 | 1587 | #[test] 1588 | fn test_insert_duplicated_one() { 1589 | for s in 0..10 { 1590 | for cap in [100, 200, 500, 1000] { 1591 | let mut f = Filter::new(cap, 0.01).unwrap(); 1592 | for i in 0..f.capacity() { 1593 | f.insert_duplicated(s).unwrap(); 1594 | assert!(f.count(s) > i); 1595 | } 1596 | assert_eq!(f.count(s), f.capacity()); 1597 | } 1598 | } 1599 | } 1600 | 1601 | #[test] 1602 | fn test_auto_resize_two() { 1603 | let mut f = Filter::new_resizeable(50, 1000, 0.01).unwrap(); 1604 | for _ in 0..50 { 1605 | f.insert_duplicated(0).unwrap(); 1606 | } 1607 | for _ in 0..3 { 1608 | f.insert_duplicated(1).unwrap(); 1609 | } 1610 | f.grow(); 1611 | f.grow(); 1612 | f.grow(); 1613 | assert_eq!(f.count(0), 50); 1614 | assert_eq!(f.count(1), 3); 1615 | } 1616 | 1617 | #[test] 1618 | fn test_new_resizeable() { 1619 | let mut f = Filter::new_resizeable(100, 100, 0.01).unwrap(); 1620 | assert!(f.grow_if_possible().is_err()); 1621 | let mut f = Filter::new_resizeable(0, 100, 0.01).unwrap(); 1622 | assert!(f.grow_if_possible().is_ok()); 1623 | } 1624 | 1625 | #[test] 1626 | #[should_panic] 1627 | fn test_new_capacity_overflow() { 1628 | Filter::new_resizeable(100, u64::MAX, 0.01).unwrap(); 1629 | } 1630 | 1631 | #[test] 1632 | #[should_panic] 1633 | fn test_new_hash_overflow() { 1634 | Filter::new_resizeable(100, u64::MAX / 20, 0.01).unwrap(); 1635 | } 1636 | 1637 | #[test] 1638 | fn test_auto_resize_one() { 1639 | let mut f = Filter::new_resizeable(100, 500, 0.01).unwrap(); 1640 | for i in 0u64.. 
{ 1641 | if f.insert_duplicated(i).is_err() { 1642 | assert_eq!(f.len(), i); 1643 | break; 1644 | } 1645 | } 1646 | assert!(f.len() >= 500); 1647 | for i in 0u64..f.len() { 1648 | assert!(f.contains(i), "{}", i); 1649 | } 1650 | } 1651 | 1652 | #[test] 1653 | fn test_remainders_and_shifts() { 1654 | let mut f = Filter::new(200, 0.01).unwrap(); 1655 | let c = f.capacity(); 1656 | for j in 0..c { 1657 | f.set_remainder(j, 0b1011101); 1658 | assert_eq!(f.get_remainder(j), 0b1011101); 1659 | f.set_runend(j, true); 1660 | assert!(f.is_runend(j)); 1661 | } 1662 | for j in 0..c { 1663 | f.set_remainder(j, 0b1111111); 1664 | assert_eq!(f.get_remainder(j), 0b1111111); 1665 | f.set_runend(j, false); 1666 | assert!(!f.is_runend(j)); 1667 | } 1668 | for j in 0..c { 1669 | f.set_remainder(j, 0b1101101); 1670 | assert_eq!(f.get_remainder(j), 0b1101101); 1671 | f.set_runend(j, true); 1672 | assert!(f.is_runend(j)); 1673 | } 1674 | f.shift_remainders_by_1(0, c); 1675 | f.shift_runends_by_1(0, c); 1676 | 1677 | for j in 1..=c { 1678 | assert_eq!(f.get_remainder(j), 0b1101101); 1679 | } 1680 | assert!(!f.is_runend(0)); 1681 | for j in 1..=c { 1682 | assert_eq!(f.get_remainder(j), 0b1101101); 1683 | assert!(f.is_runend(j)); 1684 | } 1685 | } 1686 | 1687 | #[test] 1688 | fn test_remove() { 1689 | for fp in [0.0001, 0.00001, 0.000001] { 1690 | for cap in [0, 100, 200, 400, 1000] { 1691 | let mut f = Filter::new(cap, fp).unwrap(); 1692 | dbg!(f.rbits, f.capacity()); 1693 | let c = f.capacity(); 1694 | for i in 0..c { 1695 | assert!(f.insert(i).unwrap()); 1696 | } 1697 | assert_eq!(f.len(), c); 1698 | for i in 0..c { 1699 | for j in 0..c { 1700 | assert_eq!(f.count(j), (j >= i) as u64, "{}", j); 1701 | } 1702 | // f.printout(); 1703 | assert!(f.remove(i)); 1704 | // f.printout(); 1705 | } 1706 | assert!(f.is_empty()); 1707 | } 1708 | } 1709 | } 1710 | #[test] 1711 | fn test_remove_dup_one() { 1712 | for s in 0..10 { 1713 | for cap in [0, 100, 200, 500, 1000] { 1714 | let mut f = Filter::new(cap, 0.0001).unwrap(); 1715 | let c = f.capacity(); 1716 | for _ in 0..c { 1717 | f.insert_duplicated(s).unwrap(); 1718 | } 1719 | assert_eq!(f.len(), c); 1720 | for i in 0..c { 1721 | assert_eq!(f.count(s), c - i); 1722 | assert!(f.remove(s)); 1723 | } 1724 | assert!(f.is_empty()); 1725 | } 1726 | } 1727 | } 1728 | #[test] 1729 | fn test_remove_dup_two() { 1730 | for s in 0..10 { 1731 | dbg!(s); 1732 | for cap in [100, 200, 500, 1000] { 1733 | let mut f = Filter::new(cap, 0.0001).unwrap(); 1734 | let c = f.capacity(); 1735 | for _ in 0..c / 2 { 1736 | f.insert_duplicated(-1).unwrap(); 1737 | f.insert_duplicated(s).unwrap(); 1738 | } 1739 | assert_eq!(f.count(-1), c / 2); 1740 | assert_eq!(f.count(s), c / 2); 1741 | for i in 0..c / 2 { 1742 | assert_eq!(f.count(-1), c / 2 - i); 1743 | assert_eq!(f.count(s), c / 2 - i); 1744 | assert!(f.remove(-1)); 1745 | assert_eq!(f.count(-1), c / 2 - i - 1); 1746 | assert_eq!(f.count(s), c / 2 - i); 1747 | assert!(f.remove(s)); 1748 | assert_eq!(f.count(-1), c / 2 - i - 1); 1749 | assert_eq!(f.count(s), c / 2 - i - 1); 1750 | } 1751 | assert!(f.is_empty()); 1752 | } 1753 | } 1754 | } 1755 | 1756 | #[test] 1757 | fn test_it_works() { 1758 | for fp_rate_arg in [0.01, 0.001, 0.0001] { 1759 | let mut f = Filter::new(100_000, fp_rate_arg).unwrap(); 1760 | assert!(!f.contains(0)); 1761 | assert_eq!(f.len(), 0); 1762 | for i in 0..f.capacity() { 1763 | f.insert_duplicated(i).unwrap(); 1764 | } 1765 | for i in 0..f.capacity() { 1766 | assert!(f.contains(i)); 1767 | } 1768 | let est_fp_rate = 1769 
| (f.capacity()..).take(50_000).filter(|i| f.contains(i)).count() as f64 / 50_000.0; // probe keys beyond the inserted 0..capacity range 1770 | dbg!(f.max_error_ratio(), est_fp_rate); 1771 | assert!(est_fp_rate <= f.max_error_ratio()); 1772 | } 1773 | } 1774 | 1775 | #[test] 1776 | fn test_with_fingerprint_size_resizes() { 1777 | let mut f = Filter::with_fingerprint_size(0, 8).unwrap(); 1778 | assert_eq!(f.fingerprint_size(), 8); 1779 | assert_eq!(f.capacity_resizeable(), (128u64 * 19).div_ceil(20)); 1780 | assert_eq!(f.capacity(), (64u64 * 19).div_ceil(20)); 1781 | for i in 0..f.capacity_resizeable() { 1782 | f.insert_fingerprint(false, i).unwrap(); 1783 | } 1784 | assert_eq!(f.len(), f.capacity_resizeable()); 1785 | assert!(f 1786 | .insert_fingerprint(false, f.capacity_resizeable()) 1787 | .is_err()); 1788 | } 1789 | 1790 | #[test] 1791 | fn test_with_fingerprint_size() { 1792 | let fingerprints = [ 1793 | 0u64, 1794 | 0, 1795 | 1, 1796 | 1, 1797 | 1, 1798 | 1, 1799 | 1, 1800 | 0x777777777777, 1801 | u32::MAX as u64 - 1, 1802 | u32::MAX as u64 - 1, 1803 | u32::MAX as u64, 1804 | u64::MAX - 1, 1805 | u64::MAX - 1, 1806 | u64::MAX, 1807 | u64::MAX, 1808 | ]; 1809 | for fip_size in [7, 16, 24, 31, 49, 64] { 1810 | let mut filter = Filter::with_fingerprint_size(1, fip_size).unwrap(); 1811 | for h in fingerprints { 1812 | filter.insert_fingerprint(true, h).unwrap(); 1813 | } 1814 | let out: Vec<u64> = filter.fingerprints().collect::<Vec<_>>(); 1815 | let mut expect = fingerprints.map(|h| h << (64 - fip_size) >> (64 - fip_size)); 1816 | expect.sort_unstable(); 1817 | assert_eq!(out, expect); 1818 | } 1819 | } 1820 | 1821 | #[test] 1822 | fn test_merge() { 1823 | fn test(mut f1: Filter, mut f2: Filter, mut f3: Filter) { 1824 | assert!(f1.merge(true, &f1.clone()).is_ok()); 1825 | assert!(f1.merge(true, &f2).is_ok()); 1826 | assert!(f1.merge(true, &f3).is_ok()); 1827 | assert!(f2.merge(true, &f1).is_err()); 1828 | assert!(f2.merge(true, &f2.clone()).is_ok()); 1829 | assert!(f2.merge(true, &f3).is_ok()); 1830 | assert!(f3.merge(true, &f1).is_err()); 1831 | assert!(f3.merge(true, &f2).is_err()); 1832 | assert!(f3.merge(true, &f3.clone()).is_ok()); 1833 | 1834 | f1.insert_fingerprint(true, 1).unwrap(); 1835 | f2.insert_fingerprint(true, 1).unwrap(); 1836 | f2.insert_fingerprint(true, 2).unwrap(); 1837 | f3.insert_fingerprint(true, 1).unwrap(); 1838 | f3.insert_fingerprint(true, 2).unwrap(); 1839 | f3.insert_fingerprint(true, 3).unwrap(); 1840 | assert_eq!(f1.len(), 1); 1841 | assert_eq!(f2.len(), 2); 1842 | assert_eq!(f3.len(), 3); 1843 | 1844 | f1.merge(false, &f1.clone()).unwrap(); 1845 | assert_eq!(f1.len(), 1); 1846 | f1.merge(true, &f2.clone()).unwrap(); 1847 | assert_eq!(f1.len(), 3); 1848 | f1.merge(false, &f3.clone()).unwrap(); 1849 | assert_eq!(f1.len(), 4); 1850 | 1851 | for _ in f1.len()..f1.capacity() { 1852 | f1.insert_fingerprint(true, 1).unwrap(); 1853 | } 1854 | assert_eq!(f1.len(), f1.capacity()); 1855 | assert!(matches!( 1856 | f1.insert_impl(u64::MAX, 1), 1857 | Err(Error::CapacityExceeded) 1858 | )); 1859 | assert!(matches!( 1860 | f1.merge(true, &f1.clone()), 1861 | Err(Error::CapacityExceeded) 1862 | )); 1863 | assert!(matches!(f1.insert_fingerprint(false, 1), Ok(false))); 1864 | assert!(matches!(f1.merge(false, &f1.clone()), Ok(()))); 1865 | } 1866 | test( 1867 | Filter::with_fingerprint_size(1, 10).unwrap(), 1868 | Filter::with_fingerprint_size(1, 11).unwrap(), 1869 | Filter::with_fingerprint_size(1, 12).unwrap(), 1870 | ); 1871 | test( 1872 | Filter::new(1, 0.01).unwrap(), 1873 | Filter::new(1, 0.001).unwrap(), 1874 | Filter::new(1, 
0.0001).unwrap(), 1875 | ); 1876 | } 1877 | 1878 | #[cfg(feature = "serde")] 1879 | #[test] 1880 | fn test_serde() { 1881 | for capacity in [100, 1000, 10000] { 1882 | for fp_ratio in [0.2, 0.1, 0.01, 0.001, 0.0001] { 1883 | let mut f = Filter::new(capacity, fp_ratio).unwrap(); 1884 | for i in 0..f.capacity() { 1885 | f.insert(i).unwrap(); 1886 | } 1887 | 1888 | let ser = serde_cbor::to_vec(&f).unwrap(); 1889 | f = serde_cbor::from_slice(&ser).unwrap(); 1890 | for i in 0..f.capacity() { 1891 | f.contains(i); 1892 | } 1893 | dbg!( 1894 | f.current_error_ratio(), 1895 | f.max_error_ratio(), 1896 | f.capacity(), 1897 | f.len(), 1898 | ser.len() 1899 | ); 1900 | } 1901 | } 1902 | } 1903 | 1904 | #[test] 1905 | fn test_dec_offset_edge_case() { 1906 | // case found in fuzz testing 1907 | #[rustfmt::skip] 1908 | let sample = [(0u16, 287), (2u16, 1), (9u16, 2), (10u16, 1), (53u16, 5), (61u16, 5), (127u16, 2), (232u16, 1), (255u16, 21), (314u16, 2), (317u16, 2), (384u16, 2), (511u16, 3), (512u16, 2), (1599u16, 2), (2303u16, 5), (2559u16, 2), (2568u16, 3), (2815u16, 2), (6400u16, 2), (9211u16, 2), (9728u16, 2), (10790u16, 1), (10794u16, 94), (10797u16, 2), (10999u16, 2), (11007u16, 2), (11520u16, 1), (12800u16, 4), (12842u16, 2), (13823u16, 1), (14984u16, 2), (15617u16, 2), (15871u16, 4), (16128u16, 3), (16383u16, 2), (16394u16, 1), (18167u16, 2), (23807u16, 1), (32759u16, 2)]; 1909 | let mut f = Filter::new(400, 0.1).unwrap(); 1910 | for (i, c) in sample { 1911 | for _ in 0..c { 1912 | f.insert_duplicated(i).unwrap(); 1913 | } 1914 | } 1915 | assert_eq!(f.raw_block(2).offset, 3); 1916 | assert_eq!(f.raw_block(3).offset, u8::MAX as u64); 1917 | f.validate_offsets(0, f.total_buckets().get()); 1918 | f.remove(0u16); 1919 | assert_eq!(f.raw_block(2).offset, 2); 1920 | assert_eq!(f.raw_block(3).offset, 254); 1921 | f.validate_offsets(0, f.total_buckets().get()); 1922 | } 1923 | 1924 | #[test] 1925 | fn test_capacity_edge_cases() { 1926 | for n in 1..32 { 1927 | let base = (1 << n) * 19 / 20; 1928 | // Test numbers around the edge 1929 | for i in [base - 1, base, base + 1] { 1930 | let filter = Filter::new(i, 0.01).unwrap(); 1931 | assert!( 1932 | filter.capacity() >= i, 1933 | "Requested capacity {} but got {}", 1934 | i, 1935 | filter.capacity() 1936 | ); 1937 | assert_eq!(filter.capacity(), filter.capacity_resizeable()); 1938 | } 1939 | } 1940 | } 1941 | 1942 | #[test] 1943 | fn test_max_capacity() { 1944 | for i in 7..=64 { 1945 | let f = Filter::with_fingerprint_size(0, i).unwrap(); 1946 | assert!(f.capacity() <= f.capacity_resizeable()); 1947 | assert_eq!( 1948 | f.capacity_resizeable(), 1949 | ((1u64 << (i - 1).min(Filter::MAX_QBITS)) * 19).div_ceil(20) 1950 | ); 1951 | } 1952 | for i in 1..Filter::MAX_QBITS { 1953 | let f = Filter::new_resizeable(0, 2u64.pow(i as u32), 0.5).unwrap(); 1954 | assert_eq!(f.capacity(), 61); 1955 | assert!(f.capacity() <= f.capacity_resizeable()); 1956 | } 1957 | // Test the maximum capacity 1958 | let f = Filter::new_resizeable(0, Filter::MAX_CAPACITY, 0.5).unwrap(); 1959 | assert_eq!(f.capacity(), 61); 1960 | assert_eq!(f.capacity_resizeable(), Filter::MAX_CAPACITY); 1961 | // Test the maximum capacity + 1, which should fail 1962 | Filter::new_resizeable(0, Filter::MAX_CAPACITY + 1, 0.5).unwrap_err(); 1963 | } 1964 | } 1965 | -------------------------------------------------------------------------------- /src/stable_hasher.rs: -------------------------------------------------------------------------------- 1 | use std::hash::Hasher; 2 | 3 | /// Wrapper over a 
hasher that provides stable output across platforms 4 | /// Based on https://github.com/rust-lang/rust/blob/c0955a34bcb17f0b31d7b86522a520ebe7fa93ac/src/librustc_data_structures/stable_hasher.rs#L78-L166 5 | /// 6 | /// To that end we always convert integers to little-endian format before 7 | /// hashing and the architecture dependent `isize` and `usize` types are 8 | /// extended to 64 bits if needed. 9 | pub struct StableHasher { 10 | /// Using xxh3-64 with default seed/secret as the portable hasher. 11 | state: xxhash_rust::xxh3::Xxh3Default, 12 | } 13 | 14 | impl StableHasher { 15 | #[inline] 16 | pub fn new() -> Self { 17 | Self { 18 | state: xxhash_rust::xxh3::Xxh3Default::new(), 19 | } 20 | } 21 | } 22 | 23 | impl Hasher for StableHasher { 24 | #[inline] 25 | fn finish(&self) -> u64 { 26 | self.state.finish() 27 | } 28 | 29 | #[inline] 30 | fn write(&mut self, bytes: &[u8]) { 31 | self.state.write(bytes); 32 | } 33 | 34 | #[inline] 35 | fn write_u8(&mut self, i: u8) { 36 | self.state.write_u8(i); 37 | } 38 | 39 | #[inline] 40 | fn write_u16(&mut self, i: u16) { 41 | self.state.write_u16(i.to_le()); 42 | } 43 | 44 | #[inline] 45 | fn write_u32(&mut self, i: u32) { 46 | self.state.write_u32(i.to_le()); 47 | } 48 | 49 | #[inline] 50 | fn write_u64(&mut self, i: u64) { 51 | self.state.write_u64(i.to_le()); 52 | } 53 | 54 | #[inline] 55 | fn write_u128(&mut self, i: u128) { 56 | self.state.write_u128(i.to_le()); 57 | } 58 | 59 | #[inline] 60 | fn write_usize(&mut self, i: usize) { 61 | // Always treat usize as u64 so we get the same results on 32 and 64 bit 62 | // platforms. This is important for symbol hashes when cross compiling, 63 | // for example. 64 | self.state.write_u64((i as u64).to_le()); 65 | } 66 | 67 | #[inline] 68 | fn write_i8(&mut self, i: i8) { 69 | self.state.write_i8(i); 70 | } 71 | 72 | #[inline] 73 | fn write_i16(&mut self, i: i16) { 74 | self.state.write_i16(i.to_le()); 75 | } 76 | 77 | #[inline] 78 | fn write_i32(&mut self, i: i32) { 79 | self.state.write_i32(i.to_le()); 80 | } 81 | 82 | #[inline] 83 | fn write_i64(&mut self, i: i64) { 84 | self.state.write_i64(i.to_le()); 85 | } 86 | 87 | #[inline] 88 | fn write_i128(&mut self, i: i128) { 89 | self.state.write_i128(i.to_le()); 90 | } 91 | 92 | #[inline] 93 | fn write_isize(&mut self, i: isize) { 94 | // Always treat isize as i64 so we get the same results on 32 and 64 bit 95 | // platforms. This is important for symbol hashes when cross compiling, 96 | // for example. 97 | self.state.write_i64((i as i64).to_le()); 98 | } 99 | } 100 | --------------------------------------------------------------------------------
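A quick usage sketch of the public API defined in src/lib.rs above. This is illustrative only and not part of the repository's sources; it assumes `qfilter` is added as a dependency and uses only the `Filter` methods shown in this listing (`new`, `insert`, `contains`, `insert_duplicated`, `count`, `remove`):

```rust
use qfilter::Filter;

fn main() -> Result<(), qfilter::Error> {
    // A filter sized for ~1000 items with a 1% target false positive ratio.
    let mut f = Filter::new(1000, 0.01).unwrap();

    // `insert` returns Ok(true) when the item wasn't already (probabilistically) present.
    assert!(f.insert("item")?);
    assert!(f.contains("item"));

    // `insert_duplicated` always adds another fingerprint, which enables counting.
    f.insert_duplicated("item")?;
    assert!(f.count("item") >= 2);

    // Removals should only be used for items known to have been inserted,
    // otherwise they may introduce false negatives (see the `remove` docs above).
    assert!(f.remove("item"));
    Ok(())
}
```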