├── .gitignore
├── .travis.yml
├── Cargo.toml
├── Makefile
├── README.md
├── benches
    ├── buckets.rs
    └── stable.rs
└── src
    ├── buckets.rs
    ├── fnv.rs
    ├── lib.rs
    └── stable.rs


/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | **/*.rs.bk
3 | 
4 | Cargo.lock
5 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: rust
 2 | rust: 1.36.0
 3 | dist: xenial
 4 | sudo: true
 5 | cache:
 6 |   cargo: true
 7 |   timeout: 1024
 8 | 
 9 | notifications:
10 |   email: false
11 | 
12 | before_cache:
13 |   - rm -rf $HOME/.cargo/registry
14 |   - rm -rf ./target/debug/incremental/
15 |   - rm -rf ./target/release/incremental/
16 | 
17 | matrix:
18 |   include:
19 |     - name: Linter on macOS
20 |       os: osx
21 |       env: CACHE_NAME=linters
22 |       install:
23 |         - cargo fmt --version || travis_retry rustup component add rustfmt
24 |         - cargo clippy --version || travis_retry rustup component add clippy
25 |       script:
26 |         - make check-whitespaces
27 |         - make fmt
28 |         - make clippy
29 |     - name: Test on macOS
30 |       os: osx
31 |       env: CACHE_NAME=Test
32 |       script:
33 |         - make test
34 |     - name: Test on macOS
35 |       os: osx
36 |       env: CACHE_NAME=Test
37 |       script:
38 |         - make test
39 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "stable-bloom-filter"
 3 | version = "0.3.0"
 4 | authors = ["u2 <zhangyaning1985@gmail.com>"]
 5 | edition = "2018"
 6 | license = "MIT"
 7 | description = "A Rust-implementation of a stable Bloom filter for filtering duplicates out of data streams."
 8 | repository = "https://github.com/u2/stable-bloom-filter"
 9 | 
10 | [dependencies]
11 | rand = "0.7"
12 | 
13 | [dev-dependencies]
14 | criterion = "0.2"
15 | float-cmp = "0.5"
16 | rand = "0.7"
17 | 
18 | [[bench]]
19 | name = "buckets"
20 | harness = false
21 | 
22 | [[bench]]
23 | name = "stable"
24 | harness = false
25 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL = /bin/sh
 2 | 
 3 | test:
 4 | 	cargo test --all -- --nocapture
 5 | 
 6 | fmt:
 7 | 	cargo fmt --all -- --check
 8 | 
 9 | clippy:
10 | 	cargo clippy --all --all-targets --all-features
11 | 
12 | check-whitespaces:
13 | 	git diff-index --check --cached $$(git rev-parse --verify master 2>/dev/null || echo "1e9e43b42759e784e18c61fabaae6f9ab2dc20b7") --
14 | 
15 | bench:
16 | 	cargo bench
17 | 
18 | .PHONY: test fmt clippy check-whitespaces bench
19 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Stable Bloom-Filter
 2 | 
 3 | A Rust-implementation of a stable Bloom filter for filtering duplicates out of data streams, port of [BoomFilters](https://github.com/tylertreat/BoomFilters)
 4 | 
 5 | [![Travis CI]](https://travis-ci.com/u2/stable-bloom-filter) [![Stable Badge]](https://crates.io/crates/stable-bloom-filter)
 6 | 
 7 | [Travis CI]: https://img.shields.io/travis/com/u2/stable-bloom-filter.svg
 8 | [Stable Badge]: https://img.shields.io/crates/v/stable-bloom-filter.svg
 9 | 
10 | This is an implementation of Stable Bloom Filters as described by Deng and Rafiei in Approximately Detecting Duplicates for Streaming Data using Stable Bloom Filters.
11 | 
12 | A Stable Bloom Filter (SBF) continuously evicts stale information so that it has room for more recent elements. Like traditional Bloom filters, an SBF has a non-zero probability of false positives, which is controlled by several parameters. Unlike the classic Bloom filter, an SBF has a tight upper bound on the rate of false positives while introducing a non-zero rate of false negatives. The false-positive rate of a classic Bloom filter eventually reaches 1, after which all queries result in a false positive. The stable-point property of an SBF means the false-positive rate asymptotically approaches a configurable fixed constant. A classic Bloom filter is actually a special case of SBF where the eviction rate is zero and the cell size is one, so this provides support for them as well (in addition to bitset-based Bloom filters).
13 | 
14 | Stable Bloom Filters are useful for cases where the size of the data set isn't known a priori and memory is bounded. For example, an SBF can be used to deduplicate events from an unbounded event stream with a specified upper bound on false positives and minimal false negatives.
15 | 
16 | ## Usage
17 | 
18 | ```toml
19 | stable-bloom-filter = "0.3"
20 | ```
21 | 
22 | ```rust
use stable_bloom_filter::stable::StableBloomFilter;
use stable_bloom_filter::Filter;
24 | 
25 | let mut f = StableBloomFilter::new_default(10_000, 0.01);
26 | assert!(!f.test(b"a"));
27 | 
28 | f.add(b"a");
29 | assert!(f.test(b"a"));
30 | 
31 | assert!(f.test_and_add(b"a"));
32 | 
33 | assert!(!f.test_and_add(b"b"));
34 | assert!(f.test(b"a"));
35 | 
36 | assert!(f.test(b"b"));
37 | 
38 | assert!(!f.test(b"c"));
39 | ```
40 | 


--------------------------------------------------------------------------------
/benches/buckets.rs:
--------------------------------------------------------------------------------
 1 | use criterion::{criterion_group, criterion_main, Criterion, Fun};
 2 | use rand::{thread_rng, Rng};
 3 | use stable_bloom_filter::buckets::Buckets;
 4 | 
 5 | fn bench(c: &mut Criterion) {
 6 |     let increment = Fun::new("Increment", |b, _| {
 7 |         let mut buckets = Buckets::new(10_000, 8);
 8 |         let mut rng = thread_rng();
 9 |         let mut data = vec![];
10 | 
11 |         for _ in 0..2_500_000 {
12 |             let r: usize = rng.gen_range(0, 10_000);
13 |             data.push(r);
14 |         }
15 | 
16 |         b.iter(|| {
17 |             for i in data.iter() {
18 |                 buckets.increment(*i, 1);
19 |             }
20 |         })
21 |     });
22 | 
23 |     let set = Fun::new("Set", |b, _| {
24 |         let mut buckets = Buckets::new(1000, 8);
25 |         let mut rng = thread_rng();
26 |         let mut data = vec![];
27 | 
28 |         for _ in 0..100_000 {
29 |             let r: usize = rng.gen_range(0, 1000);
30 |             let v: u8 = rng.gen_range(0, 255);
31 |             data.push((r, v));
32 |         }
33 | 
34 |         b.iter(|| {
35 |             for (i, v) in data.iter() {
36 |                 buckets.set(*i, *v);
37 |             }
38 |         })
39 |     });
40 | 
41 |     let get = Fun::new("Get", |b, _| {
42 |         let mut buckets = Buckets::new(1000, 8);
43 |         let mut rng = thread_rng();
44 |         let mut data = vec![];
45 | 
46 |         for i in 0..=1000 {
47 |             let v: u8 = rng.gen_range(0, 255);
48 |             buckets.set(i % 1000, v);
49 |         }
50 | 
51 |         for _ in 0..100_000 {
52 |             let r: usize = rng.gen_range(0, 1000);
53 |             data.push(r);
54 |         }
55 | 
56 |         b.iter(|| {
57 |             for i in 0..1000 {
58 |                 buckets.get(i);
59 |             }
60 |         })
61 |     });
62 | 
63 |     let functions = vec![increment, set, get];
64 |     c.bench_functions("Buckets", functions, 0);
65 | }
66 | 
67 | criterion_group!(benches, bench);
68 | criterion_main!(benches);
69 | 


--------------------------------------------------------------------------------
/benches/stable.rs:
--------------------------------------------------------------------------------
 1 | use criterion::{criterion_group, criterion_main, Criterion, Fun};
 2 | use stable_bloom_filter::stable::StableBloomFilter;
 3 | use stable_bloom_filter::Filter;
 4 | 
 5 | fn bench(c: &mut Criterion) {
 6 |     let add = Fun::new("Add", |b, _| {
 7 |         let mut s = StableBloomFilter::new_default(200, 0.01);
 8 |         let mut data = Vec::new();
 9 |         for i in 0..100_000 {
10 |             data.push(i.to_string().into_bytes());
11 |         }
12 | 
13 |         b.iter(|| {
14 |             for i in data.iter() {
15 |                 s.add(i);
16 |             }
17 |         })
18 |     });
19 | 
20 |     let test = Fun::new("Test", |b, _| {
21 |         let s = StableBloomFilter::new_default(200, 0.01);
22 |         let mut data = Vec::new();
23 |         for i in 0..100_000 {
24 |             data.push(i.to_string().into_bytes());
25 |         }
26 | 
27 |         b.iter(|| {
28 |             for i in data.iter() {
29 |                 s.test(i);
30 |             }
31 |         })
32 |     });
33 | 
34 |     let test_and_add = Fun::new("TestAndAdd", |b, _| {
35 |         let mut s = StableBloomFilter::new_default(200, 0.01);
36 |         let mut data = Vec::new();
37 |         for i in 0..100_000 {
38 |             data.push(i.to_string().into_bytes());
39 |         }
40 | 
41 |         b.iter(|| {
42 |             for i in data.iter() {
43 |                 s.test_and_add(i);
44 |             }
45 |         })
46 |     });
47 | 
48 |     let functions = vec![add, test, test_and_add];
49 |     c.bench_functions("StableBloomFilter", functions, 0);
50 | }
51 | 
52 | criterion_group!(benches, bench);
53 | criterion_main!(benches);
54 | 


--------------------------------------------------------------------------------
/src/buckets.rs:
--------------------------------------------------------------------------------
  1 | /// Buckets is a fast, space-efficient array of buckets where each bucket can
  2 | /// store up to a configured maximum value.
  3 | pub struct Buckets {
  4 |     data: Vec<u8>,
  5 |     bucket_size: u8,
  6 |     max: u8,
  7 |     count: usize,
  8 | }
  9 | 
 10 | impl Buckets {
 11 |     /// Creates a new Buckets with the provided number of buckets where
 12 |     /// each bucket is the specified number of bits.
 13 |     pub fn new(count: usize, bucket_size: u8) -> Self {
 14 |         if bucket_size > 8 {
 15 |             panic!("max bucket_size is 8");
 16 |         }
 17 |         Buckets {
 18 |             count,
 19 |             bucket_size,
 20 |             data: vec![0; (count * usize::from(bucket_size) + 7) / 8],
 21 |             max: ((1u16 << u16::from(bucket_size)) - 1) as u8,
 22 |         }
 23 |     }
 24 | 
 25 |     /// Returns the maximum value that can be stored in a bucket.
 26 |     pub fn max_bucket_value(&self) -> u8 {
 27 |         self.max
 28 |     }
 29 | 
 30 |     /// Returns the number of buckets.
 31 |     pub fn count(&self) -> usize {
 32 |         self.count
 33 |     }
 34 | 
 35 |     /// Decrease the value in the specified bucket by the provided delta.
 36 |     /// The value is clamped to zero and the maximum bucket value.
 37 |     /// Returns itself to allow for chaining.
 38 |     #[inline]
 39 |     pub fn decrease(&mut self, bucket: usize, delta: u8) -> &Self {
 40 |         let val = (self.get_bits(bucket * usize::from(self.bucket_size), self.bucket_size) as u8)
 41 |             .saturating_sub(delta);
 42 | 
 43 |         self.set_bits(
 44 |             (bucket as u32) * u32::from(self.bucket_size),
 45 |             self.bucket_size,
 46 |             val,
 47 |         );
 48 |         self
 49 |     }
 50 | 
 51 |     /// Increment the value in the specified bucket by the provided delta.
 52 |     /// The value is clamped to zero and the maximum bucket value.
 53 |     /// Returns itself to allow for chaining.
 54 |     #[inline]
 55 |     pub fn increment(&mut self, bucket: usize, delta: u8) -> &Self {
 56 |         let val = (self.get_bits(bucket * usize::from(self.bucket_size), self.bucket_size) as u8)
 57 |             .saturating_add(delta)
 58 |             .min(self.max);
 59 | 
 60 |         self.set_bits(
 61 |             (bucket as u32) * u32::from(self.bucket_size),
 62 |             self.bucket_size,
 63 |             val,
 64 |         );
 65 |         self
 66 |     }
 67 | 
 68 |     /// Set the bucket value. The value is clamped to zero and the maximum
 69 |     /// bucket value. Returns itself to allow for chaining.
 70 |     #[inline]
 71 |     pub fn set(&mut self, bucket: usize, value: u8) -> &Self {
 72 |         let value = value.min(self.max);
 73 | 
 74 |         self.set_bits(
 75 |             (bucket as u32) * u32::from(self.bucket_size),
 76 |             self.bucket_size,
 77 |             value,
 78 |         );
 79 |         self
 80 |     }
 81 | 
 82 |     /// Returns the value in the specified bucket.
 83 |     #[inline]
 84 |     pub fn get(&self, bucket: usize) -> u8 {
 85 |         self.get_bits(bucket * usize::from(self.bucket_size), self.bucket_size) as u8
 86 |     }
 87 | 
 88 |     /// Reset restores the Buckets to the original state.
 89 |     /// Returns itself to allow for chaining.
 90 |     pub fn reset(&mut self) -> &Self {
 91 |         self.data = vec![0; (self.count * usize::from(self.bucket_size) + 7) / 8];
 92 |         self
 93 |     }
 94 | 
 95 |     /// Returns the bits at the specified offset and length.
 96 |     #[inline]
 97 |     fn get_bits(&self, offset: usize, length: u8) -> u32 {
 98 |         let byte_index = offset / 8;
 99 |         let byte_offset = offset % 8;
100 |         if byte_offset as u8 + length > 8 {
101 |             let rem = 8 - byte_offset as u8;
102 |             return self.get_bits(offset, rem)
103 |                 | (self.get_bits(offset + rem as usize, length - rem) << rem);
104 |         }
105 | 
106 |         let bit_mask = (1 << length) - 1;
107 |         (u32::from(self.data[byte_index as usize]) & (bit_mask << byte_offset) as u32)
108 |             >> byte_offset
109 |     }
110 | 
111 |     /// setBits sets bits at the specified offset and length.
112 |     #[inline]
113 |     fn set_bits(&mut self, offset: u32, length: u8, bits: u8) {
114 |         let byte_index = offset / 8;
115 |         let byte_offset = offset % 8;
116 |         if byte_offset as u8 + length > 8 {
117 |             let rem = 8 - byte_offset as u8;
118 |             self.set_bits(offset, rem, bits);
119 |             self.set_bits(offset + u32::from(rem), length - rem, bits >> rem);
120 |             return;
121 |         }
122 | 
123 |         let bit_mask: u32 = (1 << length) - 1;
124 |         self.data[byte_index as usize] =
125 |             (u32::from(self.data[byte_index as usize]) & !(bit_mask << byte_offset)) as u8;
126 |         self.data[byte_index as usize] = (u32::from(self.data[byte_index as usize])
127 |             | ((u32::from(bits) & bit_mask) << byte_offset))
128 |             as u8;
129 |     }
130 | }
131 | 
#[cfg(test)]
mod tests {
    use super::Buckets;

    // `max_bucket_value` reports the largest value that fits in a bucket of
    // the configured bit width.
    #[test]
    fn test_max_bucket_value() {
        assert_eq!(Buckets::new(10, 2).max_bucket_value(), 3);
    }

    // `count` reports the number of buckets requested at construction.
    #[test]
    fn test_buckets_count() {
        assert_eq!(Buckets::new(10, 2).count(), 10);
    }

    // `increment`, `decrease`, `set` and `get` agree with each other and clamp
    // to zero and to the bucket maximum, for several bucket widths.
    #[test]
    fn test_buckets_increment_decrease_and_get_and_set() {
        // (bucket_size, value handed to `set`, value expected back after clamping)
        let cases: [(u8, u8, u8); 3] = [(2, 100, 3), (3, 100, 7), (8, 255, 255)];

        for &(bucket_size, requested, stored) in cases.iter() {
            let mut b = Buckets::new(5, bucket_size);

            b.increment(0, 1);
            assert_eq!(b.get(0), 1);

            // Decreasing an empty bucket must stay at zero.
            b.decrease(1, 1);
            assert_eq!(b.get(1), 0);

            b.set(2, requested);
            assert_eq!(b.get(2), stored);

            b.increment(3, 2);
            assert_eq!(b.get(3), 2);
        }
    }

    // `reset` puts every bucket back to zero.
    #[test]
    fn test_buckets_reset() {
        let mut b = Buckets::new(5, 2);

        (0..5).for_each(|i| {
            b.increment(i, 1);
        });

        b.reset();

        (0..5).for_each(|i| assert_eq!(b.get(i), 0));
    }
}
216 | 


--------------------------------------------------------------------------------
/src/fnv.rs:
--------------------------------------------------------------------------------
 1 | use std::hash::Hasher;
 2 | 
 3 | #[derive(Clone)]
 4 | pub struct FnvHasher(u64);
 5 | 
 6 | impl Default for FnvHasher {
 7 |     #[inline]
 8 |     fn default() -> FnvHasher {
 9 |         FnvHasher(0xcbf2_9ce4_8422_2325)
10 |     }
11 | }
12 | 
13 | impl FnvHasher {
14 |     /// Create an FNV hasher starting with a state corresponding
15 |     /// to the hash `key`.
16 |     #[inline]
17 |     pub fn with_key(key: u64) -> FnvHasher {
18 |         FnvHasher(key)
19 |     }
20 | }
21 | 
22 | impl Hasher for FnvHasher {
23 |     #[inline]
24 |     fn finish(&self) -> u64 {
25 |         self.0
26 |     }
27 | 
28 |     #[inline]
29 |     fn write(&mut self, bytes: &[u8]) {
30 |         let FnvHasher(mut hash) = *self;
31 | 
32 |         for byte in bytes.iter() {
33 |             hash ^= u64::from(*byte);
34 |             hash = hash.wrapping_mul(0x0100_0000_01b3);
35 |         }
36 | 
37 |         *self = FnvHasher(hash);
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
 1 | // StableBloomFilter implements a Stable Bloom Filter as described by Deng and
 2 | // Rafiei in Approximately Detecting Duplicates for Streaming Data using Stable
 3 | // Bloom Filters:
 4 | //
 5 | // http://webdocs.cs.ualberta.ca/~drafiei/papers/DupDet06Sigmod.pdf
 6 | //
 7 | // A Stable Bloom Filter (SBF) continuously evicts stale information so that it
 8 | // has room for more recent elements. Like traditional Bloom filters, an SBF
 9 | // has a non-zero probability of false positives, which is controlled by
10 | // several parameters. Unlike the classic Bloom filter, an SBF has a tight
11 | // upper bound on the rate of false positives while introducing a non-zero rate
12 | // of false negatives. The false-positive rate of a classic Bloom filter
13 | // eventually reaches 1, after which all queries result in a false positive.
14 | // The stable-point property of an SBF means the false-positive rate
15 | // asymptotically approaches a configurable fixed constant. A classic Bloom
16 | // filter is actually a special case of SBF where the eviction rate is zero, so
17 | // this package provides support for them as well.
18 | //
19 | // Stable Bloom Filters are useful for cases where the size of the data set
20 | // isn't known a priori, which is a requirement for traditional Bloom filters,
21 | // and memory is bounded.  For example, an SBF can be used to deduplicate
22 | // events from an unbounded event stream with a specified upper bound on false
23 | // positives and minimal false negatives.
24 | pub mod buckets;
25 | pub mod fnv;
26 | pub mod stable;
27 | 
/// Operations shared by the byte-string membership filters in this crate.
pub trait Filter {
    /// Reports whether `data` is considered a member of the filter.
    fn test(&self, data: &[u8]) -> bool;

    /// Inserts `data` into the filter and hands back a shared reference to the
    /// filter itself.
    fn add(&mut self, data: &[u8]) -> &Self;

    /// Inserts `data` and reports whether it was considered a member before
    /// this call.
    fn test_and_add(&mut self, data: &[u8]) -> bool;
}
35 | 
/// Computes how many hash functions a Bloom filter should use to reach the
/// requested false-positive rate: `ceil(log2(1 / fp_rate))`.
pub(crate) fn optimal_k(fp_rate: f64) -> usize {
    let inverse_rate = 1.0 / fp_rate;
    let hashes = inverse_rate.log2().ceil();
    hashes as usize
}
41 | 
/// Returns the optimal number of cells to decrement, p, per
/// iteration for the provided parameters of an SBF.
///
/// * `m` - number of cells in the filter
/// * `k` - number of hash functions
/// * `d` - bits per cell (the cell maximum is `2^d - 1`)
/// * `fp_rate` - target false-positive rate
///
/// The result is at least 1 and, for `m >= 1`, never larger than `m`.
pub(crate) fn optimal_stable_p(m: usize, k: usize, d: u8, fp_rate: f64) -> usize {
    let max = (2_u64.pow(u32::from(d)) - 1) as f64;
    let sub_denom = (1.0 - fp_rate.powf(1.0 / (k as f64))).powf(1.0 / max);
    let denom = (1.0 / sub_denom - 1.0) * (1.0 / (k as f64) - 1.0 / (m as f64));

    let p = 1.0 / denom;

    // Clamp while still in floating point. When `k == m` the denominator is
    // zero and `p` is infinite; casting that straight to `usize` would yield
    // `usize::MAX` and make every decrement pass effectively endless.
    if p.is_nan() || p < 1.0 {
        1
    } else if p >= m as f64 {
        // More than `m` cells per pass is meaningless, so cap at `m`.
        m
    } else {
        p as usize
    }
}
57 | 


--------------------------------------------------------------------------------
/src/stable.rs:
--------------------------------------------------------------------------------
  1 | use crate::buckets::Buckets;
  2 | use crate::fnv::FnvHasher;
  3 | use crate::Filter;
  4 | use crate::{optimal_k, optimal_stable_p};
  5 | use rand::{thread_rng, Rng};
  6 | use std::hash::Hasher;
  7 | 
  8 | pub struct StableBloomFilter {
  9 |     /// filter data
 10 |     cells: Buckets,
 11 |     /// hash function (kernel for all k functions)
 12 |     hash: FnvHasher,
 13 |     /// number of cells
 14 |     m: usize,
 15 |     /// number of cells to decrement
 16 |     p: usize,
 17 |     /// number of hash functions
 18 |     k: usize,
 19 |     /// cell max value
 20 |     max: u8,
 21 |     /// buffer used to cache indices
 22 |     index_buffer: Vec<usize>,
 23 | }
 24 | 
 25 | impl StableBloomFilter {
 26 |     /// Creates a new Stable Bloom Filter with m cells and d
 27 |     /// bits allocated per cell optimized for the target false-positive rate. Use
 28 |     /// default if you don't want to calculate d.
 29 |     pub fn new(m: usize, d: u8, fp_rate: f64) -> Self {
 30 |         let mut k = optimal_k(fp_rate) / 2;
 31 |         if k > m {
 32 |             k = m;
 33 |         } else if k == 0 {
 34 |             k = 1;
 35 |         }
 36 | 
 37 |         let cells = Buckets::new(m, d);
 38 | 
 39 |         StableBloomFilter {
 40 |             hash: FnvHasher::default(),
 41 |             m,
 42 |             k,
 43 |             p: optimal_stable_p(m, k, d, fp_rate),
 44 |             max: cells.max_bucket_value(),
 45 |             cells,
 46 |             index_buffer: vec![0; k],
 47 |         }
 48 |     }
 49 | 
 50 |     /// Creates a new Stable Bloom Filter with m 1-bit
 51 |     /// cells and which is optimized for cases where there is no prior knowledge of
 52 |     /// the input data stream while maintaining an upper bound using the provided
 53 |     /// rate of false positives.
 54 |     pub fn new_default(m: usize, fp_rate: f64) -> Self {
 55 |         Self::new(m, 1, fp_rate)
 56 |     }
 57 | 
 58 |     /// NewUnstableBloomFilter creates a new special case of Stable Bloom Filter
 59 |     /// which is a traditional Bloom filter with m bits and an optimal number of
 60 |     /// hash functions for the target false-positive rate. Unlike the stable
 61 |     /// variant, data is not evicted and a cell contains a maximum of 1 hash value.
 62 |     pub fn new_unstable(m: usize, fp_rate: f64) -> Self {
 63 |         let cells = Buckets::new(m, 1);
 64 |         let k = optimal_k(fp_rate);
 65 | 
 66 |         StableBloomFilter {
 67 |             hash: FnvHasher::default(),
 68 |             m,
 69 |             k,
 70 |             p: 0,
 71 |             max: cells.max_bucket_value(),
 72 |             cells,
 73 |             index_buffer: vec![0; k],
 74 |         }
 75 |     }
 76 | 
 77 |     /// Returns the number of cells in the Stable Bloom Filter.
 78 |     pub fn cells(&self) -> usize {
 79 |         self.m
 80 |     }
 81 | 
 82 |     /// Returns the number of hash functions.
 83 |     pub fn k(&self) -> usize {
 84 |         self.k
 85 |     }
 86 | 
 87 |     /// Returns the number of cells decremented on every add.
 88 |     pub fn p(&self) -> usize {
 89 |         self.p
 90 |     }
 91 | 
 92 |     pub fn max(&self) -> u8 {
 93 |         self.max
 94 |     }
 95 | 
 96 |     /// Returns the limit of the expected fraction of zeros in the
 97 |     /// Stable Bloom Filter when the number of iterations goes to infinity. When
 98 |     /// this limit is reached, the Stable Bloom Filter is considered stable.
 99 |     pub fn stable_point(&self) -> f64 {
100 |         let sub_denom = (self.p as f64) * ((1.0 / (self.k as f64)) - (1.0 / (self.m as f64)));
101 |         let denom = 1.0 + 1.0 / sub_denom;
102 |         let base = 1.0 / denom;
103 | 
104 |         base.powf(f64::from(self.max))
105 |     }
106 | 
107 |     /// Returns the upper bound on false positives when the filter
108 |     /// has become stable.
109 |     pub fn false_positive_rate(&self) -> f64 {
110 |         (1.0 - self.stable_point()).powf(self.k as f64)
111 |     }
112 | 
113 |     #[inline]
114 |     pub fn hash_kernel(&self, data: &[u8]) -> (u32, u32) {
115 |         let mut hasher = self.hash.clone();
116 |         hasher.write(data);
117 |         let hash: u64 = hasher.finish();
118 |         let lower = hash as u32;
119 |         let upper = (hash >> 32) as u32;
120 |         (lower, upper)
121 |     }
122 | 
123 |     /// Restores the Stable Bloom Filter to its original state. It returns the
124 |     /// filter to allow for chaining.
125 |     pub fn reset(&mut self) -> &Self {
126 |         self.cells.reset();
127 |         self
128 |     }
129 | 
130 |     /// Will decrement a random cell and (p-1) adjacent cells by 1. This
131 |     /// is faster than generating p random numbers. Although the processes of
132 |     /// picking the p cells are not independent, each cell has a probability of p/m
133 |     /// for being picked at each iteration, which means the properties still hold.
134 |     #[inline]
135 |     pub fn decrement(&mut self) {
136 |         let mut rng = thread_rng();
137 |         let r: usize = rng.gen_range(0, self.m);
138 | 
139 |         for i in 0..(self.p) {
140 |             let idx = (r + i) % self.m;
141 |             self.cells.decrease(idx, 1);
142 |         }
143 |     }
144 | }
145 | 
146 | impl Filter for StableBloomFilter {
147 |     /// Will test for membership of the data and returns true if it is a
148 |     /// member, false if not. This is a probabilistic test, meaning there is a
149 |     /// non-zero probability of false positives and false negatives.
150 |     #[inline]
151 |     fn test(&self, data: &[u8]) -> bool {
152 |         let (lower, upper) = self.hash_kernel(data);
153 |         for i in 0..(self.k) {
154 |             if self
155 |                 .cells
156 |                 .get((lower as usize + upper as usize * i) % self.m)
157 |                 == 0
158 |             {
159 |                 return false;
160 |             }
161 |         }
162 |         true
163 |     }
164 | 
165 |     /// Will add the data to the Stable Bloom Filter. It returns the filter to
166 |     /// allow for chaining.
167 |     #[inline]
168 |     fn add(&mut self, data: &[u8]) -> &Self {
169 |         // Randomly decrement p cells to make room for new elements.
170 |         self.decrement();
171 |         let (lower, upper) = self.hash_kernel(data);
172 | 
173 |         for i in 0..(self.k) {
174 |             self.cells
175 |                 .set((lower as usize + upper as usize * i) % self.m, self.max);
176 |         }
177 | 
178 |         self
179 |     }
180 | 
181 |     /// Is equivalent to calling Test followed by Add. It returns true if
182 |     /// the data is a member, false if not.
183 |     #[inline]
184 |     fn test_and_add(&mut self, data: &[u8]) -> bool {
185 |         let (lower, upper) = self.hash_kernel(data);
186 |         let mut member = true;
187 | 
188 |         // If any of the K cells are 0, then it's not a member.
189 |         for i in 0..(self.k) {
190 |             self.index_buffer[i] = (lower as usize + upper as usize * i) % self.m;
191 |             if self.cells.get(self.index_buffer[i]) == 0 {
192 |                 member = false;
193 |             }
194 |         }
195 | 
196 |         // Randomly decrement p cells to make room for new elements.
197 |         self.decrement();
198 |         // Set the K cells to max.
199 |         for i in self.index_buffer.iter() {
200 |             self.cells.set(*i, self.max);
201 |         }
202 | 
203 |         member
204 |     }
205 | }
206 | 
#[cfg(test)]
mod tests {
    use super::StableBloomFilter;
    use crate::optimal_k;
    use crate::Filter;
    use float_cmp::ApproxEq;
    use std::f64;

    /// Rounds `val` to `places` decimal places, rounding up when the
    /// fractional remainder is at least `round_on` and down otherwise.
    fn round(val: f64, round_on: f64, places: usize) -> f64 {
        let scale = (10.0_f64).powf(places as f64);
        let scaled = scale * val;
        let fraction = scaled - scaled.floor();

        if fraction >= round_on {
            scaled.ceil() / scale
        } else {
            scaled.floor() / scale
        }
    }

    // `new_unstable` builds a filter with p = 0, max = 1 and the full optimal
    // number of hash functions.
    #[test]
    fn test_new_unstable() {
        let filter = StableBloomFilter::new_unstable(100, 0.1);

        assert_eq!(filter.k, optimal_k(0.1));
        assert_eq!(filter.m, 100);
        assert_eq!(filter.p(), 0);
        assert_eq!(filter.max(), 1);
    }

    // `cells` reports the number of cells, m.
    #[test]
    fn test_cells() {
        assert_eq!(StableBloomFilter::new(100, 1, 0.1).cells(), 100);
    }

    // `k` reports the number of hash functions.
    #[test]
    fn test_k() {
        assert_eq!(StableBloomFilter::new(100, 1, 0.01).k(), 3);
    }

    // `test`, `add` and `test_and_add` behave consistently with each other.
    #[test]
    fn test_test_and_add() {
        let mut filter = StableBloomFilter::new_default(1_000, 0.01);
        assert!(!filter.test(b"a"));

        filter.add(b"a");
        assert!(filter.test(b"a"));

        assert!(filter.test_and_add(b"a"));

        assert!(!filter.test_and_add(b"b"));
        assert!(filter.test(b"a"));

        assert!(filter.test(b"b"));

        assert!(!filter.test(b"c"));

        for n in 0..1_000_000 {
            filter.test_and_add(n.to_string().as_bytes());
        }

        // `a` should have been evicted.
        assert!(!filter.test(b"a"));
    }

    // After many insertions the observed fraction of zero cells matches
    // `stable_point`.
    #[test]
    fn test_stable_point() {
        let mut filter = StableBloomFilter::new(1000, 1, 0.1);
        for n in 0..1_000_000 {
            filter.add(n.to_string().as_bytes());
        }

        let zeros = (0..filter.m)
            .filter(|&cell| filter.cells.get(cell) == 0)
            .count();

        let actual = round(zeros as f64 / (filter.m as f64), 0.5, 1);
        let expected = round(filter.stable_point(), 0.5, 1);

        assert!(actual.approx_eq(expected, (f64::EPSILON, 1)));

        // A classic Bloom filter is a special case of SBF where p is 0 and max
        // is 1. It doesn't have a stable point.
        let classic = StableBloomFilter::new_unstable(1000, 0.1);
        assert!(classic.stable_point().approx_eq(0.0, (f64::EPSILON, 1)));
    }

    // `false_positive_rate` returns the upper bound on false positives for
    // stable filters.
    #[test]
    fn test_false_positive_rate() {
        let filter = StableBloomFilter::new_default(1000, 0.01);
        let rate = round(filter.false_positive_rate(), 0.5, 2);

        assert!(rate.approx_eq(0.01, (f64::EPSILON, 1)));

        // Classic Bloom filters have an unbounded rate of false positives. Once
        // they become full, every query returns a false positive.
        let classic = StableBloomFilter::new_unstable(1000, 0.1);
        assert!(classic
            .false_positive_rate()
            .approx_eq(1.0, (f64::EPSILON, 1)));
    }

    // `reset` sets every cell to zero.
    #[test]
    fn test_reset() {
        let mut filter = StableBloomFilter::new_default(1000, 0.01);

        for n in 0..1000 {
            filter.add(n.to_string().as_bytes());
        }

        filter.reset();

        for cell in 0..(filter.m) {
            assert_eq!(filter.cells.get(cell), 0);
        }
    }
}
341 | 


--------------------------------------------------------------------------------