├── LICENSE.txt ├── README.md ├── bloom_test.go └── bloom.go /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Bloom filters 2 | ------------- 3 | 4 | A Bloom filter is a representation of a set of _n_ items, where the main 5 | requirement is to make membership queries; _i.e._, whether an item is a 6 | member of a set. 7 | 8 | A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large 9 | multiple of the cardinality of the set to represent) and _k_, the number of hashing 10 | functions on elements of the set. (The actual hashing functions are important, too, 11 | but this is not a parameter for this implementation). A Bloom filter is backed by 12 | a BitSet; a key is represented in the filter by setting the bits at each value of the 13 | hashing functions (modulo _m_). Set membership is done by _testing_ whether the 14 | bits at each value of the hashing functions (again, modulo _m_) are set. If so, 15 | the item is reported as being in the set. If the item is actually in the set, a Bloom filter will 16 | never fail (the true positive rate is 1.0); but it is susceptible to false 17 | positives. The art is to choose _k_ and _m_ correctly. 18 | 19 | In this implementation, the hashing function used is FNV, a non-cryptographic 20 | hashing function which is part of the Go standard library (hash/fnv). For an item, the 21 | 64-bit FNV hash is computed, and the upper and lower 32-bit halves, call them h1 and 22 | h2, are used. Then, the _i_th hashing function is: 23 | 24 | h1 + h2*i 25 | 26 | Thus, the underlying hash function, FNV, is only called once per key. 27 | 28 | This implementation accepts keys for setting and testing as []byte. 
Thus, to 29 | add a string item, "Love": 30 | 31 | uint n = 1000 32 | filter := bloom.New(20*n, 5) // load of 20, 5 keys 33 | filter.Add([]byte("Love")) 34 | 35 | Similarly, to test if "Love" is in bloom: 36 | 37 | if filter.Test([]byte("Love")) 38 | 39 | For numeric data, I recommend that you look into the binary/encoding library. But, 40 | for example, to add a uint32 to the filter: 41 | 42 | i := uint32(100) 43 | n1 := make([]byte,4) 44 | binary.BigEndian.PutUint32(n1,i) 45 | f.Add(n1) 46 | 47 | Finally, there is a method to estimate the false positive rate of a particular 48 | bloom filter for a set of size _n_: 49 | 50 | if filter.EstimateFalsePositiveRate(1000) > 0.001 51 | 52 | Given the particular hashing scheme, it's best to be empirical about this. Note 53 | that estimating the FP rate will clear the Bloom filter. 54 | 55 | Discussion here: [Bloom filter](https://groups.google.com/d/topic/golang-nuts/6MktecKi1bE/discussion) -------------------------------------------------------------------------------- /bloom_test.go: -------------------------------------------------------------------------------- 1 | package bloom 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "testing" 7 | ) 8 | 9 | func TestBasic(t *testing.T) { 10 | f := New(1000, 4) 11 | n1 := []byte("Bess") 12 | n2 := []byte("Jane") 13 | n3 := []byte("Emma") 14 | f.Add(n1) 15 | n3a := f.TestAndAdd(n3) 16 | n1b := f.Test(n1) 17 | n2b := f.Test(n2) 18 | n3b := f.Test(n3) 19 | if !n1b { 20 | t.Errorf("%v should be in.", n1) 21 | } 22 | if n2b { 23 | t.Errorf("%v should not be in.", n2) 24 | } 25 | if n3a { 26 | t.Errorf("%v should not be in the first time we look.", n3) 27 | } 28 | if !n3b { 29 | t.Errorf("%v should be in the second time we look.", n3) 30 | } 31 | } 32 | 33 | func TestBasicUint32(t *testing.T) { 34 | f := New(1000, 4) 35 | n1 := make([]byte, 4) 36 | n2 := make([]byte, 4) 37 | n3 := make([]byte, 4) 38 | n4 := make([]byte, 4) 39 | binary.BigEndian.PutUint32(n1, 100) 40 | 
binary.BigEndian.PutUint32(n2, 101) 41 | binary.BigEndian.PutUint32(n3, 102) 42 | binary.BigEndian.PutUint32(n4, 103) 43 | f.Add(n1) 44 | n3a := f.TestAndAdd(n3) 45 | n1b := f.Test(n1) 46 | n2b := f.Test(n2) 47 | n3b := f.Test(n3) 48 | f.Test(n4) 49 | if !n1b { 50 | t.Errorf("%v should be in.", n1) 51 | } 52 | if n2b { 53 | t.Errorf("%v should not be in.", n2) 54 | } 55 | if n3a { 56 | t.Errorf("%v should not be in the first time we look.", n3) 57 | } 58 | if !n3b { 59 | t.Errorf("%v should be in the second time we look.", n3) 60 | } 61 | } 62 | 63 | func TestDirect20_5(t *testing.T) { 64 | n := uint(10000) 65 | k := uint(5) 66 | load := uint(20) 67 | f := New(n*load, k) 68 | fp_rate := f.EstimateFalsePositiveRate(n) 69 | if fp_rate > 0.0001 { 70 | t.Errorf("False positive rate too high: load=%v, k=%v, %f", load, k, fp_rate) 71 | } 72 | } 73 | 74 | func TestDirect15_10(t *testing.T) { 75 | n := uint(10000) 76 | k := uint(10) 77 | load := uint(15) 78 | f := New(n*load, k) 79 | fp_rate := f.EstimateFalsePositiveRate(n) 80 | if fp_rate > 0.0001 { 81 | t.Errorf("False positive rate too high: load=%v, k=%v, %f", load, k, fp_rate) 82 | } 83 | } 84 | 85 | func TestEstimated10_0001(t *testing.T) { 86 | n := uint(10000) 87 | fp := 0.0001 88 | m, k := estimateParameters(n, fp) 89 | f := NewWithEstimates(n, fp) 90 | fp_rate := f.EstimateFalsePositiveRate(n) 91 | if fp_rate > fp { 92 | t.Errorf("False positive rate too high: n: %v, fp: %f, n: %v, k: %v result: %f", n, fp, m, k, fp_rate) 93 | } 94 | } 95 | 96 | func TestEstimated10_001(t *testing.T) { 97 | n := uint(10000) 98 | fp := 0.001 99 | m, k := estimateParameters(n, fp) 100 | f := NewWithEstimates(n, fp) 101 | fp_rate := f.EstimateFalsePositiveRate(n) 102 | if fp_rate > fp { 103 | t.Errorf("False positive rate too high: n: %v, fp: %f, n: %v, k: %v result: %f", n, fp, m, k, fp_rate) 104 | } 105 | } 106 | 107 | func BenchmarkDirect(b *testing.B) { 108 | n := uint(10000) 109 | max_k := uint(10) 110 | max_load := uint(20) 
111 | fmt.Printf("m/n") 112 | for k := uint(2); k <= max_k; k++ { 113 | fmt.Printf("\tk=%v", k) 114 | } 115 | fmt.Println() 116 | for load := uint(2); load <= max_load; load++ { 117 | fmt.Print(load) 118 | for k := uint(2); k <= max_k; k++ { 119 | f := New(n*load, k) 120 | fp_rate := f.EstimateFalsePositiveRate(n) 121 | fmt.Printf("\t%f", fp_rate) 122 | } 123 | fmt.Println() 124 | } 125 | } 126 | 127 | func BenchmarkEstimted(b *testing.B) { 128 | for n := uint(5000); n <= 50000; n += 5000 { 129 | fmt.Printf("%v", n) 130 | for fp := 0.1; fp >= 0.00001; fp /= 10.0 { 131 | fmt.Printf("\t%f", fp) 132 | m, k := estimateParameters(n, fp) 133 | f := NewWithEstimates(n, fp) 134 | fp_rate := f.EstimateFalsePositiveRate(n) 135 | fmt.Printf("\t%v\t%v\t%f", m, k, fp_rate) 136 | } 137 | fmt.Println() 138 | } 139 | } 140 | 141 | func BenchmarkSeparateTestAndAdd(b *testing.B) { 142 | f := NewWithEstimates(uint(b.N), 0.0001) 143 | key := make([]byte, 100) 144 | b.ResetTimer() 145 | for i := 0; i < b.N; i++ { 146 | binary.BigEndian.PutUint32(key, uint32(i)) 147 | f.Test(key) 148 | f.Add(key) 149 | } 150 | } 151 | 152 | func BenchmarkCombinedTestAndAdd(b *testing.B) { 153 | f := NewWithEstimates(uint(b.N), 0.0001) 154 | key := make([]byte, 100) 155 | b.ResetTimer() 156 | for i := 0; i < b.N; i++ { 157 | binary.BigEndian.PutUint32(key, uint32(i)) 158 | f.TestAndAdd(key) 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /bloom.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.package bloom 4 | 5 | /* 6 | A Bloom filter is a representation of a set of _n_ items, where the main 7 | requirement is to make membership queries; _i.e._, whether an item is a 8 | member of a set. 
9 | 10 | A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large 11 | multiple of the cardinality of the set to represent) and _k_, the number of hashing 12 | functions on elements of the set. (The actual hashing functions are important, too, 13 | but this is not a parameter for this implementation). A Bloom filter is backed by 14 | a BitSet; a key is represented in the filter by setting the bits at each value of the 15 | hashing functions (modulo _m_). Set membership is done by _testing_ whether the 16 | bits at each value of the hashing functions (again, modulo _m_) are set. If so, 17 | the item is reported as being in the set. If the item is actually in the set, a Bloom filter will 18 | never fail (the true positive rate is 1.0); but it is susceptible to false 19 | positives. The art is to choose _k_ and _m_ correctly. 20 | 21 | In this implementation, the hashing function used is FNV, a non-cryptographic 22 | hashing function which is part of the Go standard library (hash/fnv). For an item, the 23 | 64-bit FNV hash is computed, and the upper and lower 32-bit halves, call them h1 and 24 | h2, are used. Then, the _i_th hashing function is: 25 | 26 | h1 + h2*i 27 | 28 | Thus, the underlying hash function, FNV, is only called once per key. 29 | 30 | This implementation accepts keys for setting and testing as []byte. Thus, to 31 | add a string item, "Love": 32 | 33 | n := uint(1000) 34 | filter := bloom.New(20*n, 5) // load of 20, 5 hash functions 35 | filter.Add([]byte("Love")) 36 | 37 | Similarly, to test if "Love" is in bloom: 38 | 39 | if filter.Test([]byte("Love")) 40 | 41 | For numeric data, I recommend that you look into the encoding/binary package. 
But, 42 | for example, to add a uint32 to the filter: 43 | 44 | i := uint32(100) 45 | n1 := make([]byte,4) 46 | binary.BigEndian.PutUint32(n1,i) 47 | f.Add(n1) 48 | 49 | Finally, there is a method to estimate the false positive rate of a particular 50 | bloom filter for a set of size _n_: 51 | 52 | if filter.EstimateFalsePositiveRate(1000) > 0.001 53 | 54 | Given the particular hashing scheme, it's best to be empirical about this. Note 55 | that estimating the FP rate will clear the Bloom filter. 56 | */ 57 | package bloom 58 | 59 | import ( 60 | "encoding/binary" 61 | "github.com/willf/bitset" 62 | "hash" 63 | "hash/fnv" 64 | "math" 65 | //"fmt" 66 | ) 67 | 68 | type BloomFilter struct { 69 | m uint 70 | k uint 71 | b *bitset.BitSet 72 | hasher hash.Hash64 73 | } 74 | 75 | // Create a new Bloom filter with _m_ bits and _k_ hashing functions 76 | func New(m uint, k uint) *BloomFilter { 77 | return &BloomFilter{m, k, bitset.New(m), fnv.New64()} 78 | } 79 | 80 | // estimate parameters. Based on https://bitbucket.org/ww/bloom/src/829aa19d01d9/bloom.go 81 | // used with permission. 
82 | func estimateParameters(n uint, p float64) (m uint, k uint) { 83 | m = uint(-1 * float64(n) * math.Log(p) / math.Pow(math.Log(2), 2)) 84 | k = uint(math.Ceil(math.Log(2) * float64(m) / float64(n))) 85 | return 86 | } 87 | 88 | // Create a new Bloom filter for about n items with fp 89 | // false positive rate 90 | func NewWithEstimates(n uint, fp float64) *BloomFilter { 91 | m, k := estimateParameters(n, fp) 92 | return New(m, k) 93 | } 94 | 95 | // Return the capacity, _m_, of a Bloom filter 96 | func (b *BloomFilter) Cap() uint { 97 | return b.m 98 | } 99 | 100 | // Return the number of hash functions used 101 | func (b *BloomFilter) K() uint { 102 | return b.k 103 | } 104 | 105 | // get the two basic hash function values for data 106 | func (f *BloomFilter) base_hashes(data []byte) (a uint32, b uint32) { 107 | f.hasher.Reset() 108 | f.hasher.Write(data) 109 | sum := f.hasher.Sum(nil) 110 | upper := sum[0:4] 111 | lower := sum[4:8] 112 | a = binary.BigEndian.Uint32(lower) 113 | b = binary.BigEndian.Uint32(upper) 114 | return 115 | } 116 | 117 | // get the _k_ locations to set/test in the underlying bitset 118 | func (f *BloomFilter) locations(data []byte) (locs []uint) { 119 | locs = make([]uint, f.k) 120 | a, b := f.base_hashes(data) 121 | ua := uint(a) 122 | ub := uint(b) 123 | //fmt.Println(ua, ub) 124 | for i := uint(0); i < f.k; i++ { 125 | locs[i] = (ua + ub*i) % f.m 126 | } 127 | //fmt.Println(data, "->", locs) 128 | return 129 | } 130 | 131 | // Add data to the Bloom Filter. 
Returns the filter (allows chaining) 132 | func (f *BloomFilter) Add(data []byte) *BloomFilter { 133 | for _, loc := range f.locations(data) { 134 | f.b.Set(loc) 135 | } 136 | return f 137 | } 138 | 139 | // Tests for the presence of data in the Bloom filter 140 | func (f *BloomFilter) Test(data []byte) bool { 141 | for _, loc := range f.locations(data) { 142 | if !f.b.Test(loc) { 143 | return false 144 | } 145 | } 146 | return true 147 | } 148 | 149 | // Equivalent to calling Test(data) then Add(data). Returns the result of Test. 150 | func (f *BloomFilter) TestAndAdd(data []byte) bool { 151 | present := true 152 | for _, loc := range f.locations(data) { 153 | if !f.b.Test(loc) { 154 | present = false 155 | } 156 | f.b.Set(loc) 157 | } 158 | return present 159 | } 160 | 161 | // Clear all the data in a Bloom filter, removing all keys 162 | func (f *BloomFilter) ClearAll() *BloomFilter { 163 | f.b.ClearAll() 164 | return f 165 | } 166 | 167 | // Estimate, for a BloomFilter with a limit of m bytes 168 | // and k hash functions, what the false positive rate will be 169 | // whilst storing n entries; runs 10k tests 170 | func (f *BloomFilter) EstimateFalsePositiveRate(n uint) (fp_rate float64) { 171 | f.ClearAll() 172 | n1 := make([]byte, 4) 173 | for i := uint32(0); i < uint32(n); i++ { 174 | binary.BigEndian.PutUint32(n1, i) 175 | f.Add(n1) 176 | } 177 | fp := 0 178 | // test 10k numbers 179 | for i := uint32(0); i < uint32(10000); i++ { 180 | binary.BigEndian.PutUint32(n1, i+uint32(n)+1) 181 | if f.Test(n1) { 182 | fp++ 183 | } 184 | } 185 | fp_rate = float64(fp) / float64(100) 186 | f.ClearAll() 187 | return 188 | } 189 | --------------------------------------------------------------------------------