├── .gitignore ├── .travis.yml ├── Cargo.toml ├── Makefile ├── README.md ├── benches ├── buckets.rs └── stable.rs └── src ├── buckets.rs ├── fnv.rs ├── lib.rs └── stable.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | 4 | Cargo.lock 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | rust: 1.36.0 3 | dist: xenial 4 | sudo: true 5 | cache: 6 | cargo: true 7 | timeout: 1024 8 | 9 | notifications: 10 | email: false 11 | 12 | before_cache: 13 | - rm -rf $HOME/.cargo/registry 14 | - rm -rf ./target/debug/incremental/ 15 | - rm -rf ./target/release/incremental/ 16 | 17 | matrix: 18 | include: 19 | - name: Linter on macOS 20 | os: osx 21 | env: CACHE_NAME=linters 22 | install: 23 | - cargo fmt --version || travis_retry rustup component add rustfmt 24 | - cargo clippy --version || travis_retry rustup component add clippy 25 | script: 26 | - make check-whitespaces 27 | - make fmt 28 | - make clippy 29 | - name: Test on macOS 30 | os: osx 31 | env: CACHE_NAME=Test 32 | script: 33 | - make test 34 | - name: Test on macOS 35 | os: osx 36 | env: CACHE_NAME=Test 37 | script: 38 | - make test 39 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "stable-bloom-filter" 3 | version = "0.3.0" 4 | authors = ["u2 "] 5 | edition = "2018" 6 | license = "MIT" 7 | description = "A Rust-implementation of a stable Bloom filter for filtering duplicates out of data streams." 8 | repository = "https://github.com/u2/stable-bloom-filter" 9 | 10 | [dependencies] 11 | rand = "0.7" 12 | 13 | [dev-dependencies] 14 | criterion = "0.2" 15 | float-cmp = "0.5" 16 | rand = "0.7" 17 | 18 | [[bench]] 19 | name = "buckets" 20 | harness = false 21 | 22 | [[bench]] 23 | name = "stable" 24 | harness = false 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL = /bin/sh 2 | 3 | test: 4 | cargo test --all -- --nocapture 5 | 6 | fmt: 7 | cargo fmt --all -- --check 8 | 9 | clippy: 10 | cargo clippy --all --all-targets --all-features 11 | 12 | check-whitespaces: 13 | git diff-index --check --cached $$(git rev-parse --verify master 2>/dev/null || echo "1e9e43b42759e784e18c61fabaae6f9ab2dc20b7") -- 14 | 15 | bench: 16 | cargo bench 17 | 18 | .PHONY: test fmt clippy check-whitespaces bench 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stable Bloom-Filter 2 | 3 | A Rust-implementation of a stable Bloom filter for filtering duplicates out of data streams, port of [BoomFilters](https://github.com/tylertreat/BoomFilters) 4 | 5 | [![Travis CI]](https://travis-ci.com/u2/stable-bloom-filter) [![Stable Badge]](https://crates.io/crates/stable-bloom-filter) 6 | 7 | [Travis CI]: https://img.shields.io/travis/com/u2/stable-bloom-filter.svg 8 | [Stable Badge]: https://img.shields.io/crates/v/stable-bloom-filter.svg 9 | 10 | This is an implementation of Stable Bloom Filters as described by Deng and Rafiei in Approximately Detecting Duplicates for Streaming Data using Stable Bloom Filters. 11 | 12 | A Stable Bloom Filter (SBF) continuously evicts stale information so that it has room for more recent elements. Like traditional Bloom filters, an SBF has a non-zero probability of false positives, which is controlled by several parameters. Unlike the classic Bloom filter, an SBF has a tight upper bound on the rate of false positives while introducing a non-zero rate of false negatives. The false-positive rate of a classic Bloom filter eventually reaches 1, after which all queries result in a false positive. The stable-point property of an SBF means the false-positive rate asymptotically approaches a configurable fixed constant. A classic Bloom filter is actually a special case of SBF where the eviction rate is zero and the cell size is one, so this provides support for them as well (in addition to bitset-based Bloom filters). 13 | 14 | Stable Bloom Filters are useful for cases where the size of the data set isn't known a priori and memory is bounded. For example, an SBF can be used to deduplicate events from an unbounded event stream with a specified upper bound on false positives and minimal false negatives. 15 | 16 | ## Usage 17 | 18 | ```toml 19 | stable-bloom-filter = "0.3" 20 | ``` 21 | 22 | ```rust 23 | use stable-bloom-filter::StableBloomFilter; 24 | 25 | let mut f = StableBloomFilter::new_default(10_000, 0.01); 26 | assert!(!f.test(b"a")); 27 | 28 | f.add(b"a"); 29 | assert!(f.test(b"a")); 30 | 31 | assert!(f.test_and_add(b"a")); 32 | 33 | assert!(!f.test_and_add(b"b")); 34 | assert!(f.test(b"a")); 35 | 36 | assert!(f.test(b"b")); 37 | 38 | assert!(!f.test(b"c")); 39 | ``` 40 | -------------------------------------------------------------------------------- /benches/buckets.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion, Fun}; 2 | use rand::{thread_rng, Rng}; 3 | use stable_bloom_filter::buckets::Buckets; 4 | 5 | fn bench(c: &mut Criterion) { 6 | let increment = Fun::new("Increment", |b, _| { 7 | let mut buckets = Buckets::new(10_000, 8); 8 | let mut rng = thread_rng(); 9 | let mut data = vec![]; 10 | 11 | for _ in 0..2_500_000 { 12 | let r: usize = rng.gen_range(0, 10_000); 13 | data.push(r); 14 | } 15 | 16 | b.iter(|| { 17 | for i in data.iter() { 18 | buckets.increment(*i, 1); 19 | } 20 | }) 21 | }); 22 | 23 | let set = Fun::new("Set", |b, _| { 24 | let mut buckets = Buckets::new(1000, 8); 25 | let mut rng = thread_rng(); 26 | let mut data = vec![]; 27 | 28 | for _ in 0..100_000 { 29 | let r: usize = rng.gen_range(0, 1000); 30 | let v: u8 = rng.gen_range(0, 255); 31 | data.push((r, v)); 32 | } 33 | 34 | b.iter(|| { 35 | for (i, v) in data.iter() { 36 | buckets.set(*i, *v); 37 | } 38 | }) 39 | }); 40 | 41 | let get = Fun::new("Get", |b, _| { 42 | let mut buckets = Buckets::new(1000, 8); 43 | let mut rng = thread_rng(); 44 | let mut data = vec![]; 45 | 46 | for i in 0..=1000 { 47 | let v: u8 = rng.gen_range(0, 255); 48 | buckets.set(i % 1000, v); 49 | } 50 | 51 | for _ in 0..100_000 { 52 | let r: usize = rng.gen_range(0, 1000); 53 | data.push(r); 54 | } 55 | 56 | b.iter(|| { 57 | for i in 0..1000 { 58 | buckets.get(i); 59 | } 60 | }) 61 | }); 62 | 63 | let functions = vec![increment, set, get]; 64 | c.bench_functions("Buckets", functions, 0); 65 | } 66 | 67 | criterion_group!(benches, bench); 68 | criterion_main!(benches); 69 | -------------------------------------------------------------------------------- /benches/stable.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion, Fun}; 2 | use stable_bloom_filter::stable::StableBloomFilter; 3 | use stable_bloom_filter::Filter; 4 | 5 | fn bench(c: &mut Criterion) { 6 | let add = Fun::new("Add", |b, _| { 7 | let mut s = StableBloomFilter::new_default(200, 0.01); 8 | let mut data = Vec::new(); 9 | for i in 0..100_000 { 10 | data.push(i.to_string().into_bytes()); 11 | } 12 | 13 | b.iter(|| { 14 | for i in data.iter() { 15 | s.add(i); 16 | } 17 | }) 18 | }); 19 | 20 | let test = Fun::new("Test", |b, _| { 21 | let s = StableBloomFilter::new_default(200, 0.01); 22 | let mut data = Vec::new(); 23 | for i in 0..100_000 { 24 | data.push(i.to_string().into_bytes()); 25 | } 26 | 27 | b.iter(|| { 28 | for i in data.iter() { 29 | s.test(i); 30 | } 31 | }) 32 | }); 33 | 34 | let test_and_add = Fun::new("TestAndAdd", |b, _| { 35 | let mut s = StableBloomFilter::new_default(200, 0.01); 36 | let mut data = Vec::new(); 37 | for i in 0..100_000 { 38 | data.push(i.to_string().into_bytes()); 39 | } 40 | 41 | b.iter(|| { 42 | for i in data.iter() { 43 | s.test_and_add(i); 44 | } 45 | }) 46 | }); 47 | 48 | let functions = vec![add, test, test_and_add]; 49 | c.bench_functions("StableBloomFilter", functions, 0); 50 | } 51 | 52 | criterion_group!(benches, bench); 53 | criterion_main!(benches); 54 | -------------------------------------------------------------------------------- /src/buckets.rs: -------------------------------------------------------------------------------- 1 | /// Buckets is a fast, space-efficient array of buckets where each bucket can 2 | /// store up to a configured maximum value. 3 | pub struct Buckets { 4 | data: Vec, 5 | bucket_size: u8, 6 | max: u8, 7 | count: usize, 8 | } 9 | 10 | impl Buckets { 11 | /// Creates a new Buckets with the provided number of buckets where 12 | /// each bucket is the specified number of bits. 13 | pub fn new(count: usize, bucket_size: u8) -> Self { 14 | if bucket_size > 8 { 15 | panic!("max bucket_size is 8"); 16 | } 17 | Buckets { 18 | count, 19 | bucket_size, 20 | data: vec![0; (count * usize::from(bucket_size) + 7) / 8], 21 | max: ((1u16 << u16::from(bucket_size)) - 1) as u8, 22 | } 23 | } 24 | 25 | /// Returns the maximum value that can be stored in a bucket. 26 | pub fn max_bucket_value(&self) -> u8 { 27 | self.max 28 | } 29 | 30 | /// Returns the number of buckets. 31 | pub fn count(&self) -> usize { 32 | self.count 33 | } 34 | 35 | /// Decrease the value in the specified bucket by the provided delta. 36 | /// The value is clamped to zero and the maximum bucket value. 37 | /// Returns itself to allow for chaining. 38 | #[inline] 39 | pub fn decrease(&mut self, bucket: usize, delta: u8) -> &Self { 40 | let val = (self.get_bits(bucket * usize::from(self.bucket_size), self.bucket_size) as u8) 41 | .saturating_sub(delta); 42 | 43 | self.set_bits( 44 | (bucket as u32) * u32::from(self.bucket_size), 45 | self.bucket_size, 46 | val, 47 | ); 48 | self 49 | } 50 | 51 | /// Increment the value in the specified bucket by the provided delta. 52 | /// The value is clamped to zero and the maximum bucket value. 53 | /// Returns itself to allow for chaining. 54 | #[inline] 55 | pub fn increment(&mut self, bucket: usize, delta: u8) -> &Self { 56 | let val = (self.get_bits(bucket * usize::from(self.bucket_size), self.bucket_size) as u8) 57 | .saturating_add(delta) 58 | .min(self.max); 59 | 60 | self.set_bits( 61 | (bucket as u32) * u32::from(self.bucket_size), 62 | self.bucket_size, 63 | val, 64 | ); 65 | self 66 | } 67 | 68 | /// Set the bucket value. The value is clamped to zero and the maximum 69 | /// bucket value. Returns itself to allow for chaining. 70 | #[inline] 71 | pub fn set(&mut self, bucket: usize, value: u8) -> &Self { 72 | let value = value.min(self.max); 73 | 74 | self.set_bits( 75 | (bucket as u32) * u32::from(self.bucket_size), 76 | self.bucket_size, 77 | value, 78 | ); 79 | self 80 | } 81 | 82 | /// Returns the value in the specified bucket. 83 | #[inline] 84 | pub fn get(&self, bucket: usize) -> u8 { 85 | self.get_bits(bucket * usize::from(self.bucket_size), self.bucket_size) as u8 86 | } 87 | 88 | /// Reset restores the Buckets to the original state. 89 | /// Returns itself to allow for chaining. 90 | pub fn reset(&mut self) -> &Self { 91 | self.data = vec![0; (self.count * usize::from(self.bucket_size) + 7) / 8]; 92 | self 93 | } 94 | 95 | /// Returns the bits at the specified offset and length. 96 | #[inline] 97 | fn get_bits(&self, offset: usize, length: u8) -> u32 { 98 | let byte_index = offset / 8; 99 | let byte_offset = offset % 8; 100 | if byte_offset as u8 + length > 8 { 101 | let rem = 8 - byte_offset as u8; 102 | return self.get_bits(offset, rem) 103 | | (self.get_bits(offset + rem as usize, length - rem) << rem); 104 | } 105 | 106 | let bit_mask = (1 << length) - 1; 107 | (u32::from(self.data[byte_index as usize]) & (bit_mask << byte_offset) as u32) 108 | >> byte_offset 109 | } 110 | 111 | /// setBits sets bits at the specified offset and length. 112 | #[inline] 113 | fn set_bits(&mut self, offset: u32, length: u8, bits: u8) { 114 | let byte_index = offset / 8; 115 | let byte_offset = offset % 8; 116 | if byte_offset as u8 + length > 8 { 117 | let rem = 8 - byte_offset as u8; 118 | self.set_bits(offset, rem, bits); 119 | self.set_bits(offset + u32::from(rem), length - rem, bits >> rem); 120 | return; 121 | } 122 | 123 | let bit_mask: u32 = (1 << length) - 1; 124 | self.data[byte_index as usize] = 125 | (u32::from(self.data[byte_index as usize]) & !(bit_mask << byte_offset)) as u8; 126 | self.data[byte_index as usize] = (u32::from(self.data[byte_index as usize]) 127 | | ((u32::from(bits) & bit_mask) << byte_offset)) 128 | as u8; 129 | } 130 | } 131 | 132 | #[cfg(test)] 133 | mod tests { 134 | use super::Buckets; 135 | 136 | // Ensures that MaxBucketValue returns the correct maximum based on the bucket 137 | // size. 138 | #[test] 139 | fn test_max_bucket_value() { 140 | let b = Buckets::new(10, 2); 141 | assert_eq!(b.max_bucket_value(), 3); 142 | } 143 | 144 | // Ensures that Count returns the number of buckets. 145 | #[test] 146 | fn test_buckets_count() { 147 | let b = Buckets::new(10, 2); 148 | assert_eq!(b.count(), 10); 149 | } 150 | 151 | // Ensures that Increment increments the bucket value by the correct delta and 152 | // clamps to zero and the maximum, Get returns the correct bucket value, and 153 | // Set sets the bucket value correctly. 154 | #[test] 155 | fn test_buckets_increment_decrease_and_get_and_set() { 156 | // bucket_size = 2 157 | let mut b = Buckets::new(5, 2); 158 | 159 | let _b = b.increment(0, 1); 160 | assert_eq!(b.get(0), 1); 161 | 162 | let _b = b.decrease(1, 1); 163 | assert_eq!(b.get(1), 0); 164 | 165 | let _b = b.set(2, 100); 166 | assert_eq!(b.get(2), 3); 167 | 168 | let _b = b.increment(3, 2); 169 | assert_eq!(b.get(3), 2); 170 | // bucket_size = 3 171 | let mut b = Buckets::new(5, 3); 172 | 173 | let _b = b.increment(0, 1); 174 | assert_eq!(b.get(0), 1); 175 | 176 | let _b = b.decrease(1, 1); 177 | assert_eq!(b.get(1), 0); 178 | 179 | let _b = b.set(2, 100); 180 | assert_eq!(b.get(2), 7); 181 | 182 | let _b = b.increment(3, 2); 183 | assert_eq!(b.get(3), 2); 184 | // bucket_size = 8 185 | let mut b = Buckets::new(5, 8); 186 | 187 | let _b = b.increment(0, 1); 188 | assert_eq!(b.get(0), 1); 189 | 190 | let _b = b.decrease(1, 1); 191 | assert_eq!(b.get(1), 0); 192 | 193 | let _b = b.set(2, 255); 194 | assert_eq!(b.get(2), 255); 195 | 196 | let _b = b.increment(3, 2); 197 | assert_eq!(b.get(3), 2); 198 | } 199 | 200 | // Ensures that Reset restores the Buckets to the original state. 201 | #[test] 202 | fn test_buckets_reset() { 203 | let mut b = Buckets::new(5, 2); 204 | 205 | for i in 0..5 { 206 | b.increment(i, 1); 207 | } 208 | 209 | let _b = b.reset(); 210 | 211 | for i in 0..5 { 212 | assert_eq!(b.get(i), 0); 213 | } 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /src/fnv.rs: -------------------------------------------------------------------------------- 1 | use std::hash::Hasher; 2 | 3 | #[derive(Clone)] 4 | pub struct FnvHasher(u64); 5 | 6 | impl Default for FnvHasher { 7 | #[inline] 8 | fn default() -> FnvHasher { 9 | FnvHasher(0xcbf2_9ce4_8422_2325) 10 | } 11 | } 12 | 13 | impl FnvHasher { 14 | /// Create an FNV hasher starting with a state corresponding 15 | /// to the hash `key`. 16 | #[inline] 17 | pub fn with_key(key: u64) -> FnvHasher { 18 | FnvHasher(key) 19 | } 20 | } 21 | 22 | impl Hasher for FnvHasher { 23 | #[inline] 24 | fn finish(&self) -> u64 { 25 | self.0 26 | } 27 | 28 | #[inline] 29 | fn write(&mut self, bytes: &[u8]) { 30 | let FnvHasher(mut hash) = *self; 31 | 32 | for byte in bytes.iter() { 33 | hash ^= u64::from(*byte); 34 | hash = hash.wrapping_mul(0x0100_0000_01b3); 35 | } 36 | 37 | *self = FnvHasher(hash); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // StableBloomFilter implements a Stable Bloom Filter as described by Deng and 2 | // Rafiei in Approximately Detecting Duplicates for Streaming Data using Stable 3 | // Bloom Filters: 4 | // 5 | // http://webdocs.cs.ualberta.ca/~drafiei/papers/DupDet06Sigmod.pdf 6 | // 7 | // A Stable Bloom Filter (SBF) continuously evicts stale information so that it 8 | // has room for more recent elements. Like traditional Bloom filters, an SBF 9 | // has a non-zero probability of false positives, which is controlled by 10 | // several parameters. Unlike the classic Bloom filter, an SBF has a tight 11 | // upper bound on the rate of false positives while introducing a non-zero rate 12 | // of false negatives. The false-positive rate of a classic Bloom filter 13 | // eventually reaches 1, after which all queries result in a false positive. 14 | // The stable-point property of an SBF means the false-positive rate 15 | // asymptotically approaches a configurable fixed constant. A classic Bloom 16 | // filter is actually a special case of SBF where the eviction rate is zero, so 17 | // this package provides support for them as well. 18 | // 19 | // Stable Bloom Filters are useful for cases where the size of the data set 20 | // isn't known a priori, which is a requirement for traditional Bloom filters, 21 | // and memory is bounded. For example, an SBF can be used to deduplicate 22 | // events from an unbounded event stream with a specified upper bound on false 23 | // positives and minimal false negatives. 24 | pub mod buckets; 25 | pub mod fnv; 26 | pub mod stable; 27 | 28 | pub trait Filter { 29 | fn test(&self, _data: &[u8]) -> bool; 30 | 31 | fn add(&mut self, _data: &[u8]) -> &Self; 32 | 33 | fn test_and_add(&mut self, _data: &[u8]) -> bool; 34 | } 35 | 36 | /// Calculates the optimal number of hash functions to use for a Bloom 37 | /// filter based on the desired rate of false positives. 38 | pub(crate) fn optimal_k(fp_rate: f64) -> usize { 39 | (1.0 / fp_rate).log2().ceil() as usize 40 | } 41 | 42 | /// Returns the optimal number of cells to decrement, p, per 43 | /// iteration for the provided parameters of an SBF. 44 | pub(crate) fn optimal_stable_p(m: usize, k: usize, d: u8, fp_rate: f64) -> usize { 45 | let max = (2_u64.pow(u32::from(d)) - 1) as f64; 46 | let sub_denom = (1.0 - fp_rate.powf(1.0 / (k as f64))).powf(1.0 / max); 47 | let denom = (1.0 / sub_denom - 1.0) * (1.0 / (k as f64) - 1.0 / (m as f64)); 48 | 49 | let mut p = (1.0 / denom) as usize; 50 | 51 | if p == 0 { 52 | p = 1; 53 | } 54 | 55 | p 56 | } 57 | -------------------------------------------------------------------------------- /src/stable.rs: -------------------------------------------------------------------------------- 1 | use crate::buckets::Buckets; 2 | use crate::fnv::FnvHasher; 3 | use crate::Filter; 4 | use crate::{optimal_k, optimal_stable_p}; 5 | use rand::{thread_rng, Rng}; 6 | use std::hash::Hasher; 7 | 8 | pub struct StableBloomFilter { 9 | /// filter data 10 | cells: Buckets, 11 | /// hash function (kernel for all k functions) 12 | hash: FnvHasher, 13 | /// number of cells 14 | m: usize, 15 | /// number of cells to decrement 16 | p: usize, 17 | /// number of hash functions 18 | k: usize, 19 | /// cell max value 20 | max: u8, 21 | /// buffer used to cache indices 22 | index_buffer: Vec, 23 | } 24 | 25 | impl StableBloomFilter { 26 | /// Creates a new Stable Bloom Filter with m cells and d 27 | /// bits allocated per cell optimized for the target false-positive rate. Use 28 | /// default if you don't want to calculate d. 29 | pub fn new(m: usize, d: u8, fp_rate: f64) -> Self { 30 | let mut k = optimal_k(fp_rate) / 2; 31 | if k > m { 32 | k = m; 33 | } else if k == 0 { 34 | k = 1; 35 | } 36 | 37 | let cells = Buckets::new(m, d); 38 | 39 | StableBloomFilter { 40 | hash: FnvHasher::default(), 41 | m, 42 | k, 43 | p: optimal_stable_p(m, k, d, fp_rate), 44 | max: cells.max_bucket_value(), 45 | cells, 46 | index_buffer: vec![0; k], 47 | } 48 | } 49 | 50 | /// Creates a new Stable Bloom Filter with m 1-bit 51 | /// cells and which is optimized for cases where there is no prior knowledge of 52 | /// the input data stream while maintaining an upper bound using the provided 53 | /// rate of false positives. 54 | pub fn new_default(m: usize, fp_rate: f64) -> Self { 55 | Self::new(m, 1, fp_rate) 56 | } 57 | 58 | /// NewUnstableBloomFilter creates a new special case of Stable Bloom Filter 59 | /// which is a traditional Bloom filter with m bits and an optimal number of 60 | /// hash functions for the target false-positive rate. Unlike the stable 61 | /// variant, data is not evicted and a cell contains a maximum of 1 hash value. 62 | pub fn new_unstable(m: usize, fp_rate: f64) -> Self { 63 | let cells = Buckets::new(m, 1); 64 | let k = optimal_k(fp_rate); 65 | 66 | StableBloomFilter { 67 | hash: FnvHasher::default(), 68 | m, 69 | k, 70 | p: 0, 71 | max: cells.max_bucket_value(), 72 | cells, 73 | index_buffer: vec![0; k], 74 | } 75 | } 76 | 77 | /// Returns the number of cells in the Stable Bloom Filter. 78 | pub fn cells(&self) -> usize { 79 | self.m 80 | } 81 | 82 | /// Returns the number of hash functions. 83 | pub fn k(&self) -> usize { 84 | self.k 85 | } 86 | 87 | /// Returns the number of cells decremented on every add. 88 | pub fn p(&self) -> usize { 89 | self.p 90 | } 91 | 92 | pub fn max(&self) -> u8 { 93 | self.max 94 | } 95 | 96 | /// Returns the limit of the expected fraction of zeros in the 97 | /// Stable Bloom Filter when the number of iterations goes to infinity. When 98 | /// this limit is reached, the Stable Bloom Filter is considered stable. 99 | pub fn stable_point(&self) -> f64 { 100 | let sub_denom = (self.p as f64) * ((1.0 / (self.k as f64)) - (1.0 / (self.m as f64))); 101 | let denom = 1.0 + 1.0 / sub_denom; 102 | let base = 1.0 / denom; 103 | 104 | base.powf(f64::from(self.max)) 105 | } 106 | 107 | /// Returns the upper bound on false positives when the filter 108 | /// has become stable. 109 | pub fn false_positive_rate(&self) -> f64 { 110 | (1.0 - self.stable_point()).powf(self.k as f64) 111 | } 112 | 113 | #[inline] 114 | pub fn hash_kernel(&self, data: &[u8]) -> (u32, u32) { 115 | let mut hasher = self.hash.clone(); 116 | hasher.write(data); 117 | let hash: u64 = hasher.finish(); 118 | let lower = hash as u32; 119 | let upper = (hash >> 32) as u32; 120 | (lower, upper) 121 | } 122 | 123 | /// Restores the Stable Bloom Filter to its original state. It returns the 124 | /// filter to allow for chaining. 125 | pub fn reset(&mut self) -> &Self { 126 | self.cells.reset(); 127 | self 128 | } 129 | 130 | /// Will decrement a random cell and (p-1) adjacent cells by 1. This 131 | /// is faster than generating p random numbers. Although the processes of 132 | /// picking the p cells are not independent, each cell has a probability of p/m 133 | /// for being picked at each iteration, which means the properties still hold. 134 | #[inline] 135 | pub fn decrement(&mut self) { 136 | let mut rng = thread_rng(); 137 | let r: usize = rng.gen_range(0, self.m); 138 | 139 | for i in 0..(self.p) { 140 | let idx = (r + i) % self.m; 141 | self.cells.decrease(idx, 1); 142 | } 143 | } 144 | } 145 | 146 | impl Filter for StableBloomFilter { 147 | /// Will test for membership of the data and returns true if it is a 148 | /// member, false if not. This is a probabilistic test, meaning there is a 149 | /// non-zero probability of false positives and false negatives. 150 | #[inline] 151 | fn test(&self, data: &[u8]) -> bool { 152 | let (lower, upper) = self.hash_kernel(data); 153 | for i in 0..(self.k) { 154 | if self 155 | .cells 156 | .get((lower as usize + upper as usize * i) % self.m) 157 | == 0 158 | { 159 | return false; 160 | } 161 | } 162 | true 163 | } 164 | 165 | /// Will add the data to the Stable Bloom Filter. It returns the filter to 166 | /// allow for chaining. 167 | #[inline] 168 | fn add(&mut self, data: &[u8]) -> &Self { 169 | // Randomly decrement p cells to make room for new elements. 170 | self.decrement(); 171 | let (lower, upper) = self.hash_kernel(data); 172 | 173 | for i in 0..(self.k) { 174 | self.cells 175 | .set((lower as usize + upper as usize * i) % self.m, self.max); 176 | } 177 | 178 | self 179 | } 180 | 181 | /// Is equivalent to calling Test followed by Add. It returns true if 182 | /// the data is a member, false if not. 183 | #[inline] 184 | fn test_and_add(&mut self, data: &[u8]) -> bool { 185 | let (lower, upper) = self.hash_kernel(data); 186 | let mut member = true; 187 | 188 | // If any of the K cells are 0, then it's not a member. 189 | for i in 0..(self.k) { 190 | self.index_buffer[i] = (lower as usize + upper as usize * i) % self.m; 191 | if self.cells.get(self.index_buffer[i]) == 0 { 192 | member = false; 193 | } 194 | } 195 | 196 | // Randomly decrement p cells to make room for new elements. 197 | self.decrement(); 198 | // Set the K cells to max. 199 | for i in self.index_buffer.iter() { 200 | self.cells.set(*i, self.max); 201 | } 202 | 203 | member 204 | } 205 | } 206 | 207 | #[cfg(test)] 208 | mod tests { 209 | use super::StableBloomFilter; 210 | use crate::optimal_k; 211 | use crate::Filter; 212 | use float_cmp::ApproxEq; 213 | use std::f64; 214 | 215 | fn round(val: f64, round_on: f64, places: usize) -> f64 { 216 | let pow = (10.0_f64).powf(places as f64); 217 | let digit = pow * val; 218 | let div = digit - digit.floor(); 219 | let round = if div >= round_on { 220 | digit.ceil() 221 | } else { 222 | digit.floor() 223 | }; 224 | 225 | round / pow 226 | } 227 | 228 | // Ensures that new_unstable creates a Stable Bloom Filter with p=0, 229 | // max=1 and k hash functions. 230 | #[test] 231 | fn test_new_unstable() { 232 | let f = StableBloomFilter::new_unstable(100, 0.1); 233 | let k = optimal_k(0.1); 234 | 235 | assert_eq!(f.k, k); 236 | assert_eq!(f.m, 100); 237 | assert_eq!(f.p(), 0); 238 | assert_eq!(f.max(), 1); 239 | } 240 | 241 | // Ensures that Cells returns the number of cells, m, in the Stable Bloom 242 | // Filter. 243 | #[test] 244 | fn test_cells() { 245 | let f = StableBloomFilter::new(100, 1, 0.1); 246 | 247 | assert_eq!(f.cells(), 100); 248 | } 249 | 250 | // Ensures that K returns the number of hash functions in the Stable Bloom 251 | // Filter. 252 | #[test] 253 | fn test_k() { 254 | let f = StableBloomFilter::new(100, 1, 0.01); 255 | assert_eq!(f.k(), 3); 256 | } 257 | 258 | // Ensures that Test, Add, and TestAndAdd behave correctly. 259 | #[test] 260 | fn test_test_and_add() { 261 | let mut f = StableBloomFilter::new_default(1_000, 0.01); 262 | assert!(!f.test(b"a")); 263 | 264 | f.add(b"a"); 265 | assert!(f.test(b"a")); 266 | 267 | assert!(f.test_and_add(b"a")); 268 | 269 | assert!(!f.test_and_add(b"b")); 270 | assert!(f.test(b"a")); 271 | 272 | assert!(f.test(b"b")); 273 | 274 | assert!(!f.test(b"c")); 275 | 276 | for i in 0..1_000_000 { 277 | f.test_and_add(i.to_string().as_bytes()); 278 | } 279 | 280 | // `a` should have been evicted. 281 | assert!(!f.test(b"a")); 282 | } 283 | 284 | // Ensures that StablePoint returns the expected fraction of zeros for large 285 | // iterations. 286 | #[test] 287 | fn test_stable_point() { 288 | let mut f = StableBloomFilter::new(1000, 1, 0.1); 289 | for i in 0..1_000_000 { 290 | f.add(i.to_string().as_bytes()); 291 | } 292 | 293 | let mut zero = 0; 294 | for i in 0..(f.m) { 295 | if f.cells.get(i) == 0 { 296 | zero += 1; 297 | } 298 | } 299 | 300 | let actual = round(f64::from(zero) / (f.m as f64), 0.5, 1); 301 | let expected = round(f.stable_point(), 0.5, 1); 302 | 303 | assert!(actual.approx_eq(expected, (f64::EPSILON, 1))); 304 | // A classic Bloom filter is a special case of SBF where P is 0 and max is 305 | // 1. It doesn't have a stable point. 306 | let bf = StableBloomFilter::new_unstable(1000, 0.1); 307 | assert!(bf.stable_point().approx_eq(0.0, (f64::EPSILON, 1))); 308 | } 309 | 310 | // Ensures that FalsePositiveRate returns the upper bound on false positives 311 | // for stable filters. 312 | #[test] 313 | fn test_false_positive_rate() { 314 | let f = StableBloomFilter::new_default(1000, 0.01); 315 | let fps = round(f.false_positive_rate(), 0.5, 2); 316 | 317 | assert!(fps.approx_eq(0.01, (f64::EPSILON, 1))); 318 | 319 | // Classic Bloom filters have an unbounded rate of false positives. Once 320 | // they become full, every query returns a false positive. 321 | let bf = StableBloomFilter::new_unstable(1000, 0.1); 322 | assert!(bf.false_positive_rate().approx_eq(1.0, (f64::EPSILON, 1))); 323 | } 324 | 325 | // Ensures that Reset sets every cell to zero. 326 | #[test] 327 | fn test_reset() { 328 | let mut f = StableBloomFilter::new_default(1000, 0.01); 329 | 330 | for i in 0..1000 { 331 | f.add(i.to_string().as_bytes()); 332 | } 333 | 334 | f.reset(); 335 | 336 | for i in 0..(f.m) { 337 | assert_eq!(f.cells.get(i), 0); 338 | } 339 | } 340 | } 341 | --------------------------------------------------------------------------------