├── README.md
├── filter.go
├── filter_test.go
└── go.mod


/README.md:
--------------------------------------------------------------------------------
 1 | ## cuckoofilter
 2 | 
 3 | This is an implementation of a data structure known as a **cuckoo
 4 | filter**.  The data structure is described in a paper called *[Cuckoo
 5 | Filter: Practically Better Than
 6 | Bloom](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf)* by
 7 | Bin Fan, David G. Andersen, Michael Kaminsky and Michael
 8 | D. Mitzenmacher.
 9 | 
10 | Cuckoo filters, like Bloom filters, are probabilistic data structures
11 | useful for determining whether a piece of data is present in a set.
12 | Like Bloom filters, cuckoo filters do not store the key being looked
13 | up, or the value of the data, so they are appropriate only for
14 | checking whether the primary data source should be queried.
15 | 
16 | Cuckoo filters (and Bloom filters) can return false positives when
17 | checking for presence, but will never return a false negative.
18 | 
19 | Unlike (standard, non-counting) Bloom filters, data can be deleted
20 | from cuckoo filters.
21 | 
22 | To use,
23 | 
24 | ```go
25 | maxKeys := uint32(1000000)
26 | f := cuckoofilter.New(maxKeys)
27 | 
28 | f.Add([]byte("hello"))
29 | f.Add([]byte("world"))
30 | 
 31 | f.Contains([]byte("hello")) // => true
 32 | f.Contains([]byte("earth")) // => true (if false positive) or false
 33 | 
 34 | f.Delete([]byte("world"))
35 | ```
36 | 
37 | That's pretty much it.  API docs are available [on
38 | GoDoc](https://godoc.org/github.com/joeshaw/cuckoofilter).
39 | 


--------------------------------------------------------------------------------
/filter.go:
--------------------------------------------------------------------------------
  1 | // Package cuckoofilter implements cuckoo filters from the paper
  2 | // "Cuckoo Filter: Practically Better Than Bloom" by Fan et al.
  3 | // https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf
  4 | package cuckoofilter
  5 | 
  6 | import (
  7 | 	"encoding/binary"
  8 | 	"errors"
  9 | 	"math/rand"
 10 | 
 11 | 	"github.com/zhenjl/cityhash"
 12 | )
 13 | 
 14 | // 4 entries per bucket is suggested by the paper in section 5.1,
 15 | // "Optimal bucket size"
 16 | const entriesPerBucket = 4
 17 | 
 18 | // With 4 entries per bucket, we can expect up to 95% load factor
 19 | const loadFactor = 0.95
 20 | 
 21 | // Length of fingerprints in bits
 22 | const fpBits = 16
 23 | 
 24 | // Arbitrarily chosen value
 25 | const maxDisplacements = 500
 26 | 
 27 | // ErrTooFull is returned when a filter is too full and needs to be
 28 | // resized.
 29 | var ErrTooFull = errors.New("cuckoo filter too full")
 30 | 
 31 | // Filter is an implementation of a cuckoo filter.
 32 | type Filter struct {
 33 | 	nBuckets uint32
 34 | 	table    [][entriesPerBucket]uint16
 35 | }
 36 | 
 37 | func nearestPowerOfTwo(val uint32) uint32 {
 38 | 	for i := uint32(0); i < 32; i++ {
 39 | 		if pow := uint32(1) << i; pow >= val {
 40 | 			return pow
 41 | 		}
 42 | 	}
 43 | 
 44 | 	panic("will never happen")
 45 | }
 46 | 
 47 | // New returns a new cuckoo filter sized for the maximum number of
 48 | // keys passed in as maxKeys.
 49 | func New(maxKeys uint32) *Filter {
 50 | 	nBuckets := nearestPowerOfTwo(maxKeys / entriesPerBucket)
 51 | 
 52 | 	// If load factor is above the max value, we'll likely hit the
 53 | 	// max number of fingerprint displacements.  In that case,
 54 | 	// expand the number of buckets.
 55 | 	if float64(maxKeys)/float64(nBuckets)/entriesPerBucket > loadFactor {
 56 | 		nBuckets <<= 1
 57 | 	}
 58 | 
 59 | 	f := &Filter{
 60 | 		nBuckets: nBuckets,
 61 | 		table:    make([][entriesPerBucket]uint16, nBuckets),
 62 | 	}
 63 | 
 64 | 	return f
 65 | }
 66 | 
 67 | func hash(data []byte) uint64 {
 68 | 	return cityhash.CityHash64(data, uint32(len(data)))
 69 | }
 70 | 
 71 | func (f *Filter) bucketIndex(hv uint32) uint32 {
 72 | 	return hv % f.nBuckets
 73 | }
 74 | 
 75 | func (f *Filter) fingerprint(hv uint32) uint16 {
 76 | 	fp := uint16(hv & ((1 << fpBits) - 1))
 77 | 
 78 | 	// gross
 79 | 	if fp == 0 {
 80 | 		fp = 1
 81 | 	}
 82 | 
 83 | 	return fp
 84 | }
 85 | 
 86 | func (f *Filter) alternateIndex(idx uint32, fp uint16) uint32 {
 87 | 	d := make([]byte, 2)
 88 | 	binary.LittleEndian.PutUint16(d, fp)
 89 | 	hv := hash(d)
 90 | 	return f.bucketIndex(idx ^ uint32(hv))
 91 | }
 92 | 
 93 | func (f *Filter) matchPosition(idx uint32, fp uint16) int {
 94 | 	for i := 0; i < entriesPerBucket; i++ {
 95 | 		if f.table[idx][i] == fp {
 96 | 			return i
 97 | 		}
 98 | 	}
 99 | 
100 | 	return -1
101 | }
102 | 
103 | func (f *Filter) emptyPosition(idx uint32) int {
104 | 	return f.matchPosition(idx, 0)
105 | }
106 | 
107 | // Add adds an element to the cuckoo filter.  If the filter is too
108 | // heavily loaded, ErrTooFull may be returned, which signifies that
109 | // the filter must be rebuilt with an increased maxKeys parameter.
110 | func (f *Filter) Add(d []byte) error {
111 | 	h := hash(d)
112 | 
113 | 	fp := f.fingerprint(uint32(h))
114 | 	i1 := f.bucketIndex(uint32(h >> 32))
115 | 	i2 := f.alternateIndex(i1, fp)
116 | 
117 | 	if i := f.emptyPosition(i1); i != -1 {
118 | 		f.table[i1][i] = fp
119 | 		return nil
120 | 	}
121 | 
122 | 	if i := f.emptyPosition(i2); i != -1 {
123 | 		f.table[i2][i] = fp
124 | 		return nil
125 | 	}
126 | 
127 | 	// Choose which index to use randomly
128 | 	idx := [2]uint32{i1, i2}[rand.Intn(2)]
129 | 
130 | 	for i := 0; i < maxDisplacements; i++ {
131 | 		j := uint32(rand.Intn(entriesPerBucket))
132 | 
133 | 		fp, f.table[idx][j] = f.table[idx][j], fp
134 | 		idx = f.alternateIndex(idx, fp)
135 | 
136 | 		if ni := f.emptyPosition(idx); ni != -1 {
137 | 			f.table[idx][ni] = fp
138 | 			return nil
139 | 		}
140 | 	}
141 | 
142 | 	return ErrTooFull
143 | }
144 | 
145 | // Contains returns whether an element may be present in the set.
146 | // Cuckoo filters are probablistic data structures which can return
147 | // false positives.  False negatives are not possible.
148 | func (f *Filter) Contains(d []byte) bool {
149 | 	h := hash(d)
150 | 
151 | 	fp := f.fingerprint(uint32(h))
152 | 	i1 := f.bucketIndex(uint32(h >> 32))
153 | 	i2 := f.alternateIndex(i1, fp)
154 | 
155 | 	return f.matchPosition(i1, fp) != -1 || f.matchPosition(i2, fp) != -1
156 | }
157 | 
158 | // Delete deletes an element from the set.  To delete an item safely,
159 | // it must have been previously inserted.  Deleting a non-inserted
160 | // item might unintentionally remove a real, different item.
161 | func (f *Filter) Delete(d []byte) bool {
162 | 	h := hash(d)
163 | 
164 | 	fp := f.fingerprint(uint32(h))
165 | 	i1 := f.bucketIndex(uint32(h >> 32))
166 | 	i2 := f.alternateIndex(i1, fp)
167 | 
168 | 	if i := f.matchPosition(i1, fp); i != -1 {
169 | 		f.table[i1][i] = 0
170 | 		return true
171 | 	}
172 | 
173 | 	if i := f.matchPosition(i2, fp); i != -1 {
174 | 		f.table[i2][i] = 0
175 | 		return true
176 | 	}
177 | 
178 | 	return false
179 | }
180 | 


--------------------------------------------------------------------------------
/filter_test.go:
--------------------------------------------------------------------------------
  1 | package cuckoofilter
  2 | 
  3 | import (
  4 | 	"encoding/binary"
  5 | 	"fmt"
  6 | 	"testing"
  7 | )
  8 | 
  9 | func TestXor(t *testing.T) {
 10 | 	f := New(1000000)
 11 | 	h := hash([]byte("foo"))
 12 | 	fp := f.fingerprint(uint32(h))
 13 | 	i1 := f.bucketIndex(uint32(h >> 32))
 14 | 	i2 := f.alternateIndex(i1, fp)
 15 | 
 16 | 	fmt.Println("f = fingerprint(x):", fp)
 17 | 	fmt.Println("i1 = hash(x):", i1)
 18 | 	fmt.Println("i2 = i1 XOR hash(f)", i2)
 19 | 	fmt.Println("i1 = i2 XOR hash(f)", f.alternateIndex(i2, fp))
 20 | 
 21 | 	if actual := f.alternateIndex(i2, fp); actual != i1 {
 22 | 		t.Fatalf("expected %d, got %d", i1, actual)
 23 | 	}
 24 | }
 25 | 
 26 | func key(i int) []byte {
 27 | 	d := make([]byte, 8)
 28 | 	binary.LittleEndian.PutUint64(d, uint64(i))
 29 | 	return d
 30 | }
 31 | 
 32 | func TestFilter(t *testing.T) {
 33 | 	//rand.Seed(time.Now().UnixNano())
 34 | 	n := 100000
 35 | 	f := New(uint32(n))
 36 | 	for i := 0; i < n; i++ {
 37 | 		err := f.Add(key(i))
 38 | 		if err != nil {
 39 | 			t.Fatalf("%d: expected nil, got %s", i, err)
 40 | 		}
 41 | 	}
 42 | 
 43 | 	for i := 0; i < n; i++ {
 44 | 		if !f.Contains(key(i)) {
 45 | 			t.Fatalf("%d: expected true, got false", i)
 46 | 		}
 47 | 	}
 48 | 
 49 | 	falseCount := 0
 50 | 	for i := n; i < n*2; i++ {
 51 | 		if f.Contains(key(i)) {
 52 | 			falseCount++
 53 | 		}
 54 | 	}
 55 | 	fmt.Printf("False positive rate (before deletes): %d / %d\n", falseCount, n)
 56 | 
 57 | 	// Remove half the keys, make sure the still-existing ones
 58 | 	// always return true for Contains
 59 | 
 60 | 	for i := 0; i < n/2; i++ {
 61 | 		f.Delete(key(i))
 62 | 	}
 63 | 
 64 | 	for i := n / 2; i < n; i++ {
 65 | 		if !f.Contains(key(i)) {
 66 | 			t.Fatalf("%d: expected true, got false", i)
 67 | 		}
 68 | 	}
 69 | 
 70 | 	falseCount = 0
 71 | 	for i := n; i < n*2; i++ {
 72 | 		if f.Contains(key(i)) {
 73 | 			falseCount++
 74 | 		}
 75 | 	}
 76 | 	fmt.Printf("False positive rate (after deletes): %d / %d\n", falseCount, n)
 77 | }
 78 | 
 79 | func benchmarkNew(b *testing.B, maxKeys uint32) {
 80 | 	for i := 0; i < b.N; i++ {
 81 | 		New(maxKeys)
 82 | 	}
 83 | }
 84 | 
 85 | func BenchmarkNew1(b *testing.B)     { benchmarkNew(b, 1) }
 86 | func BenchmarkNew10(b *testing.B)    { benchmarkNew(b, 10) }
 87 | func BenchmarkNew100(b *testing.B)   { benchmarkNew(b, 100) }
 88 | func BenchmarkNew1000(b *testing.B)  { benchmarkNew(b, 1000) }
 89 | func BenchmarkNew10000(b *testing.B) { benchmarkNew(b, 10000) }
 90 | 
 91 | func BenchmarkAdd(b *testing.B) {
 92 | 	f := New(uint32(b.N))
 93 | 	for i := 0; i < b.N; i++ {
 94 | 		f.Add(key(i))
 95 | 	}
 96 | }
 97 | 
 98 | func BenchmarkContains(b *testing.B) {
 99 | 	f := New(uint32(b.N))
100 | 	for i := 0; i < b.N; i++ {
101 | 		f.Add(key(i))
102 | 	}
103 | 
104 | 	b.ResetTimer()
105 | 	for i := 0; i < b.N; i++ {
106 | 		f.Contains(key(i))
107 | 	}
108 | }
109 | 
110 | func BenchmarkDelete(b *testing.B) {
111 | 	f := New(uint32(b.N))
112 | 	for i := 0; i < b.N; i++ {
113 | 		f.Add(key(i))
114 | 	}
115 | 
116 | 	b.ResetTimer()
117 | 	for i := 0; i < b.N; i++ {
118 | 		f.Delete(key(i))
119 | 	}
120 | }
121 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/joeshaw/cuckoofilter
2 | 
3 | go 1.12
4 | 
5 | require github.com/zhenjl/cityhash v0.0.0-20131128155616-cdd6a94144ab
6 | 


--------------------------------------------------------------------------------