├── LICENSE.txt
├── README.md
├── bloom_test.go
└── bloom.go


/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013 The Go Authors. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 |    * Redistributions of source code must retain the above copyright
 8 | notice, this list of conditions and the following disclaimer.
 9 |    * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 |    * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Bloom filters
 2 | -------------
 3 | 
 4 | A Bloom filter is a representation of a set of _n_ items, where the main
 5 | requirement is to make membership queries; _i.e._, whether an item is a 
 6 | member of a set.
 7 | 
 8 | A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large
 9 | multiple of the cardinality of the set to represent) and _k_, the number of hashing
10 | functions on elements of the set. (The actual hashing functions are important, too,
11 | but this is not a parameter for this implementation). A Bloom filter is backed by
12 | a BitSet; a key is represented in the filter by setting the bits at each value of the 
13 | hashing functions (modulo _m_). Set membership is done by _testing_ whether the
14 | bits at each value of the hashing functions (again, modulo _m_) are set. If so,
15 | the item is in the set. If the item is actually in the set, a Bloom filter will
16 | never fail (the true positive rate is 1.0); but it is susceptible to false
17 | positives. The art is to choose _k_ and _m_ correctly.
18 | 
19 | In this implementation, the hashing function used is FNV, a non-cryptographic
20 | hashing function which is part of the Go standard library (hash/fnv). For an item, the
21 | 64-bit FNV hash is computed, and upper and lower 32 bit numbers, call them h1 and
22 | h2, are used. Then, the _i_th hashing function is:
23 | 
24 |     h1 + h2*i
25 |     
26 | Thus, the underlying hash function, FNV, is only called once per key.
27 | 
28 | This implementation accepts keys for setting and testing as []byte. Thus, to
29 | add a string item, "Love":
30 | 
31 |     n := uint(1000)
32 |     filter := bloom.New(20*n, 5) // load of 20, 5 hash functions
33 |     filter.Add([]byte("Love"))
34 |     
35 | Similarly, to test if "Love" is in bloom:
36 | 
37 |     if filter.Test([]byte("Love"))
38 |     
39 | For numeric data, I recommend that you look into the encoding/binary package. But,
40 | for example, to add a uint32 to the filter:
41 | 
42 |     i := uint32(100)
43 |     n1 := make([]byte,4)
44 |     binary.BigEndian.PutUint32(n1,i)
45 |     filter.Add(n1)
46 | 
47 | Finally, there is a method to estimate the false positive rate of a particular
48 | bloom filter for a set of size _n_:
49 | 
50 |     if filter.EstimateFalsePositiveRate(1000) > 0.001 
51 |     
52 | Given the particular hashing scheme, it's best to be empirical about this. Note
53 | that estimating the FP rate will clear the Bloom filter.
54 |                                                          
55 | Discussion here: [Bloom filter](https://groups.google.com/d/topic/golang-nuts/6MktecKi1bE/discussion)


--------------------------------------------------------------------------------
/bloom_test.go:
--------------------------------------------------------------------------------
  1 | package bloom
  2 | 
  3 | import (
  4 | 	"encoding/binary"
  5 | 	"fmt"
  6 | 	"testing"
  7 | )
  8 | 
  9 | func TestBasic(t *testing.T) {
 10 | 	f := New(1000, 4)
 11 | 	n1 := []byte("Bess")
 12 | 	n2 := []byte("Jane")
 13 | 	n3 := []byte("Emma")
 14 | 	f.Add(n1)
 15 | 	n3a := f.TestAndAdd(n3)
 16 | 	n1b := f.Test(n1)
 17 | 	n2b := f.Test(n2)
 18 | 	n3b := f.Test(n3)
 19 | 	if !n1b {
 20 | 		t.Errorf("%v should be in.", n1)
 21 | 	}
 22 | 	if n2b {
 23 | 		t.Errorf("%v should not be in.", n2)
 24 | 	}
 25 | 	if n3a {
 26 | 		t.Errorf("%v should not be in the first time we look.", n3)
 27 | 	}
 28 | 	if !n3b {
 29 | 		t.Errorf("%v should be in the second time we look.", n3)
 30 | 	}
 31 | }
 32 | 
 33 | func TestBasicUint32(t *testing.T) {
 34 | 	f := New(1000, 4)
 35 | 	n1 := make([]byte, 4)
 36 | 	n2 := make([]byte, 4)
 37 | 	n3 := make([]byte, 4)
 38 | 	n4 := make([]byte, 4)
 39 | 	binary.BigEndian.PutUint32(n1, 100)
 40 | 	binary.BigEndian.PutUint32(n2, 101)
 41 | 	binary.BigEndian.PutUint32(n3, 102)
 42 | 	binary.BigEndian.PutUint32(n4, 103)
 43 | 	f.Add(n1)
 44 | 	n3a := f.TestAndAdd(n3)
 45 | 	n1b := f.Test(n1)
 46 | 	n2b := f.Test(n2)
 47 | 	n3b := f.Test(n3)
 48 | 	f.Test(n4)
 49 | 	if !n1b {
 50 | 		t.Errorf("%v should be in.", n1)
 51 | 	}
 52 | 	if n2b {
 53 | 		t.Errorf("%v should not be in.", n2)
 54 | 	}
 55 | 	if n3a {
 56 | 		t.Errorf("%v should not be in the first time we look.", n3)
 57 | 	}
 58 | 	if !n3b {
 59 | 		t.Errorf("%v should be in the second time we look.", n3)
 60 | 	}
 61 | }
 62 | 
 63 | func TestDirect20_5(t *testing.T) {
 64 | 	n := uint(10000)
 65 | 	k := uint(5)
 66 | 	load := uint(20)
 67 | 	f := New(n*load, k)
 68 | 	fp_rate := f.EstimateFalsePositiveRate(n)
 69 | 	if fp_rate > 0.0001 {
 70 | 		t.Errorf("False positive rate too high: load=%v, k=%v, %f", load, k, fp_rate)
 71 | 	}
 72 | }
 73 | 
 74 | func TestDirect15_10(t *testing.T) {
 75 | 	n := uint(10000)
 76 | 	k := uint(10)
 77 | 	load := uint(15)
 78 | 	f := New(n*load, k)
 79 | 	fp_rate := f.EstimateFalsePositiveRate(n)
 80 | 	if fp_rate > 0.0001 {
 81 | 		t.Errorf("False positive rate too high: load=%v, k=%v, %f", load, k, fp_rate)
 82 | 	}
 83 | }
 84 | 
 85 | func TestEstimated10_0001(t *testing.T) {
 86 | 	n := uint(10000)
 87 | 	fp := 0.0001
 88 | 	m, k := estimateParameters(n, fp)
 89 | 	f := NewWithEstimates(n, fp)
 90 | 	fp_rate := f.EstimateFalsePositiveRate(n)
 91 | 	if fp_rate > fp {
 92 | 		t.Errorf("False positive rate too high: n: %v, fp: %f, n: %v, k: %v result: %f", n, fp, m, k, fp_rate)
 93 | 	}
 94 | }
 95 | 
 96 | func TestEstimated10_001(t *testing.T) {
 97 | 	n := uint(10000)
 98 | 	fp := 0.001
 99 | 	m, k := estimateParameters(n, fp)
100 | 	f := NewWithEstimates(n, fp)
101 | 	fp_rate := f.EstimateFalsePositiveRate(n)
102 | 	if fp_rate > fp {
103 | 		t.Errorf("False positive rate too high: n: %v, fp: %f, n: %v, k: %v result: %f", n, fp, m, k, fp_rate)
104 | 	}
105 | }
106 | 
107 | func BenchmarkDirect(b *testing.B) {
108 | 	n := uint(10000)
109 | 	max_k := uint(10)
110 | 	max_load := uint(20)
111 | 	fmt.Printf("m/n")
112 | 	for k := uint(2); k <= max_k; k++ {
113 | 		fmt.Printf("\tk=%v", k)
114 | 	}
115 | 	fmt.Println()
116 | 	for load := uint(2); load <= max_load; load++ {
117 | 		fmt.Print(load)
118 | 		for k := uint(2); k <= max_k; k++ {
119 | 			f := New(n*load, k)
120 | 			fp_rate := f.EstimateFalsePositiveRate(n)
121 | 			fmt.Printf("\t%f", fp_rate)
122 | 		}
123 | 		fmt.Println()
124 | 	}
125 | }
126 | 
127 | func BenchmarkEstimted(b *testing.B) {
128 | 	for n := uint(5000); n <= 50000; n += 5000 {
129 | 		fmt.Printf("%v", n)
130 | 		for fp := 0.1; fp >= 0.00001; fp /= 10.0 {
131 | 			fmt.Printf("\t%f", fp)
132 | 			m, k := estimateParameters(n, fp)
133 | 			f := NewWithEstimates(n, fp)
134 | 			fp_rate := f.EstimateFalsePositiveRate(n)
135 | 			fmt.Printf("\t%v\t%v\t%f", m, k, fp_rate)
136 | 		}
137 | 		fmt.Println()
138 | 	}
139 | }
140 | 
141 | func BenchmarkSeparateTestAndAdd(b *testing.B) {
142 | 	f := NewWithEstimates(uint(b.N), 0.0001)
143 | 	key := make([]byte, 100)
144 | 	b.ResetTimer()
145 | 	for i := 0; i < b.N; i++ {
146 | 		binary.BigEndian.PutUint32(key, uint32(i))
147 | 		f.Test(key)
148 | 		f.Add(key)
149 | 	}
150 | }
151 | 
152 | func BenchmarkCombinedTestAndAdd(b *testing.B) {
153 | 	f := NewWithEstimates(uint(b.N), 0.0001)
154 | 	key := make([]byte, 100)
155 | 	b.ResetTimer()
156 | 	for i := 0; i < b.N; i++ {
157 | 		binary.BigEndian.PutUint32(key, uint32(i))
158 | 		f.TestAndAdd(key)
159 | 	}
160 | }
161 | 


--------------------------------------------------------------------------------
/bloom.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2013 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | /*
  6 | A Bloom filter is a representation of a set of _n_ items, where the main
  7 | requirement is to make membership queries; _i.e._, whether an item is a 
  8 | member of a set.
  9 | 
 10 | A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large
 11 | multiple of the cardinality of the set to represent) and _k_, the number of hashing
 12 | functions on elements of the set. (The actual hashing functions are important, too,
 13 | but this is not a parameter for this implementation). A Bloom filter is backed by
 14 | a BitSet; a key is represented in the filter by setting the bits at each value of the 
 15 | hashing functions (modulo _m_). Set membership is done by _testing_ whether the
 16 | bits at each value of the hashing functions (again, modulo _m_) are set. If so,
 17 | the item is in the set. If the item is actually in the set, a Bloom filter will
 18 | never fail (the true positive rate is 1.0); but it is susceptible to false
 19 | positives. The art is to choose _k_ and _m_ correctly.
 20 | 
 21 | In this implementation, the hashing function used is FNV, a non-cryptographic
 22 | hashing function which is part of the Go standard library (hash/fnv). For an item, the
 23 | 64-bit FNV hash is computed, and upper and lower 32 bit numbers, call them h1 and
 24 | h2, are used. Then, the _i_th hashing function is:
 25 | 
 26 |     h1 + h2*i
 27 | 
 28 | Thus, the underlying hash function, FNV, is only called once per key.
 29 | 
 30 | This implementation accepts keys for setting and testing as []byte. Thus, to
 31 | add a string item, "Love":
 32 | 
 33 |     n := uint(1000)
 34 |     filter := bloom.New(20*n, 5) // load of 20, 5 hash functions
 35 |     filter.Add([]byte("Love"))
 36 | 
 37 | Similarly, to test if "Love" is in bloom:
 38 | 
 39 |     if filter.Test([]byte("Love"))
 40 | 
 41 | For numeric data, I recommend that you look into the encoding/binary package. But,
 42 | for example, to add a uint32 to the filter:
 43 | 
 44 |     i := uint32(100)
 45 |     n1 := make([]byte,4)
 46 |     binary.BigEndian.PutUint32(n1,i)
 47 |     filter.Add(n1)
 48 | 
 49 | Finally, there is a method to estimate the false positive rate of a particular
 50 | bloom filter for a set of size _n_:
 51 | 
 52 |     if filter.EstimateFalsePositiveRate(1000) > 0.001 
 53 | 
 54 | Given the particular hashing scheme, it's best to be empirical about this. Note
 55 | that estimating the FP rate will clear the Bloom filter.
 56 | */
 57 | package bloom
 58 | 
 59 | import (
 60 | 	"encoding/binary"
 61 | 	"github.com/willf/bitset"
 62 | 	"hash"
 63 | 	"hash/fnv"
 64 | 	"math"
 65 | 	//"fmt"
 66 | )
 67 | 
// BloomFilter is a Bloom filter over []byte keys. Each key is mapped to k
// bit positions (taken modulo m) in the backing bit set.
type BloomFilter struct {
	m      uint           // number of bits; every location is reduced modulo m
	k      uint           // number of locations set or tested per key
	b      *bitset.BitSet // backing bit storage (New creates it with bitset.New(m))
	hasher hash.Hash64    // 64-bit hash (New installs fnv.New64()); reset before each key
}
 74 | 
 75 | // Create a new Bloom filter with _m_ bits and _k_ hashing functions 
 76 | func New(m uint, k uint) *BloomFilter {
 77 | 	return &BloomFilter{m, k, bitset.New(m), fnv.New64()}
 78 | }
 79 | 
 80 | // estimate parameters. Based on https://bitbucket.org/ww/bloom/src/829aa19d01d9/bloom.go
 81 | // used with permission.
 82 | func estimateParameters(n uint, p float64) (m uint, k uint) {
 83 | 	m = uint(-1 * float64(n) * math.Log(p) / math.Pow(math.Log(2), 2))
 84 | 	k = uint(math.Ceil(math.Log(2) * float64(m) / float64(n)))
 85 | 	return
 86 | }
 87 | 
 88 | // Create a new Bloom filter for about n items with fp 
 89 | // false positive rate
 90 | func NewWithEstimates(n uint, fp float64) *BloomFilter {
 91 | 	m, k := estimateParameters(n, fp)
 92 | 	return New(m, k)
 93 | }
 94 | 
 95 | // Return the capacity, _m_, of a Bloom filter
 96 | func (b *BloomFilter) Cap() uint {
 97 | 	return b.m
 98 | }
 99 | 
100 | // Return the number of hash functions used
101 | func (b *BloomFilter) K() uint {
102 | 	return b.k
103 | }
104 | 
105 | // get the two basic hash function values for data
106 | func (f *BloomFilter) base_hashes(data []byte) (a uint32, b uint32) {
107 | 	f.hasher.Reset()
108 | 	f.hasher.Write(data)
109 | 	sum := f.hasher.Sum(nil)
110 | 	upper := sum[0:4]
111 | 	lower := sum[4:8]
112 | 	a = binary.BigEndian.Uint32(lower)
113 | 	b = binary.BigEndian.Uint32(upper)
114 | 	return
115 | }
116 | 
117 | // get the _k_ locations to set/test in the underlying bitset
118 | func (f *BloomFilter) locations(data []byte) (locs []uint) {
119 | 	locs = make([]uint, f.k)
120 | 	a, b := f.base_hashes(data)
121 | 	ua := uint(a)
122 | 	ub := uint(b)
123 | 	//fmt.Println(ua, ub)
124 | 	for i := uint(0); i < f.k; i++ {
125 | 		locs[i] = (ua + ub*i) % f.m
126 | 	}
127 | 	//fmt.Println(data, "->", locs)
128 | 	return
129 | }
130 | 
131 | // Add data to the Bloom Filter. Returns the filter (allows chaining)
132 | func (f *BloomFilter) Add(data []byte) *BloomFilter {
133 | 	for _, loc := range f.locations(data) {
134 | 		f.b.Set(loc)
135 | 	}
136 | 	return f
137 | }
138 | 
139 | // Tests for the presence of data in the Bloom filter
140 | func (f *BloomFilter) Test(data []byte) bool {
141 | 	for _, loc := range f.locations(data) {
142 | 		if !f.b.Test(loc) {
143 | 			return false
144 | 		}
145 | 	}
146 | 	return true
147 | }
148 | 
149 | // Equivalent to calling Test(data) then Add(data).  Returns the result of Test.
150 | func (f *BloomFilter) TestAndAdd(data []byte) bool {
151 | 	present := true
152 | 	for _, loc := range f.locations(data) {
153 | 		if !f.b.Test(loc) {
154 | 			present = false
155 | 		}
156 | 		f.b.Set(loc)
157 | 	}
158 | 	return present
159 | }
160 | 
161 | // Clear all the data in a Bloom filter, removing all keys
162 | func (f *BloomFilter) ClearAll() *BloomFilter {
163 | 	f.b.ClearAll()
164 | 	return f
165 | }
166 | 
167 | // Estimate, for a BloomFilter with a limit of m bytes
168 | // and k hash functions, what the false positive rate will be
169 | // whilst storing n entries; runs 10k tests
170 | func (f *BloomFilter) EstimateFalsePositiveRate(n uint) (fp_rate float64) {
171 | 	f.ClearAll()
172 | 	n1 := make([]byte, 4)
173 | 	for i := uint32(0); i < uint32(n); i++ {
174 | 		binary.BigEndian.PutUint32(n1, i)
175 | 		f.Add(n1)
176 | 	}
177 | 	fp := 0
178 | 	// test 10k numbers
179 | 	for i := uint32(0); i < uint32(10000); i++ {
180 | 		binary.BigEndian.PutUint32(n1, i+uint32(n)+1)
181 | 		if f.Test(n1) {
182 | 			fp++
183 | 		}
184 | 	}
185 | 	fp_rate = float64(fp) / float64(100)
186 | 	f.ClearAll()
187 | 	return
188 | }
189 | 


--------------------------------------------------------------------------------