├── README.md ├── filter.go ├── filter_test.go └── go.mod /README.md: -------------------------------------------------------------------------------- 1 | ## cuckoofilter 2 | 3 | This is an implementation of a data structure known as a **cuckoo 4 | filter**. The data structure is described in a paper called *[Cuckoo 5 | Filter: Practically Better Than 6 | Bloom](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf)* by 7 | Bin Fan, David G. Andersen, Michael Kaminsky and Michael 8 | D. Mitzenmacher. 9 | 10 | Cuckoo filters, like Bloom filters, are probabilistic data structures 11 | useful for determining whether a piece of data is present in a set. 12 | Like Bloom filters, cuckoo filters do not store the key being looked 13 | up, or the value of the data, so they are appropriate only for 14 | checking whether the primary data source should be queried. 15 | 16 | Cuckoo filters (and Bloom filters) can return false positives when 17 | checking for presence, but will never return a false negative. 18 | 19 | Unlike (standard, non-counting) Bloom filters, data can be deleted 20 | from cuckoo filters. 21 | 22 | To use, 23 | 24 | ```go 25 | maxKeys := uint32(1000000) 26 | f := cuckoofilter.New(maxKeys) 27 | 28 | f.Add([]byte("hello")) 29 | f.Add([]byte("world")) 30 | 31 | f.Contains([]byte("hello")) // => true 32 | f.Contains([]byte("earth")) // => true (if false positive) or false 33 | 34 | f.Delete([]byte("world")) 35 | ``` 36 | 37 | That's pretty much it. API docs are available [on 38 | GoDoc](https://godoc.org/github.com/joeshaw/cuckoofilter). 39 | -------------------------------------------------------------------------------- /filter.go: -------------------------------------------------------------------------------- 1 | // Package cuckoofilter implements cuckoo filters from the paper 2 | // "Cuckoo Filter: Practically Better Than Bloom" by Fan et al. 
3 | // https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf 4 | package cuckoofilter 5 | 6 | import ( 7 | "encoding/binary" 8 | "errors" 9 | "math/rand" 10 | 11 | "github.com/zhenjl/cityhash" 12 | ) 13 | 14 | // 4 entries per bucket is suggested by the paper in section 5.1, 15 | // "Optimal bucket size" 16 | const entriesPerBucket = 4 17 | 18 | // With 4 entries per bucket, we can expect up to 95% load factor 19 | const loadFactor = 0.95 20 | 21 | // Length of fingerprints in bits 22 | const fpBits = 16 23 | 24 | // Arbitrarily chosen value 25 | const maxDisplacements = 500 26 | 27 | // ErrTooFull is returned when a filter is too full and needs to be 28 | // resized. 29 | var ErrTooFull = errors.New("cuckoo filter too full") 30 | 31 | // Filter is an implementation of a cuckoo filter. 32 | type Filter struct { 33 | nBuckets uint32 34 | table [][entriesPerBucket]uint16 35 | } 36 | 37 | func nearestPowerOfTwo(val uint32) uint32 { 38 | for i := uint32(0); i < 32; i++ { 39 | if pow := uint32(1) << i; pow >= val { 40 | return pow 41 | } 42 | } 43 | 44 | panic("will never happen") 45 | } 46 | 47 | // New returns a new cuckoo filter sized for the maximum number of 48 | // keys passed in as maxKeys. 49 | func New(maxKeys uint32) *Filter { 50 | nBuckets := nearestPowerOfTwo(maxKeys / entriesPerBucket) 51 | 52 | // If load factor is above the max value, we'll likely hit the 53 | // max number of fingerprint displacements. In that case, 54 | // expand the number of buckets. 
55 | if float64(maxKeys)/float64(nBuckets)/entriesPerBucket > loadFactor { 56 | nBuckets <<= 1 57 | } 58 | 59 | f := &Filter{ 60 | nBuckets: nBuckets, 61 | table: make([][entriesPerBucket]uint16, nBuckets), 62 | } 63 | 64 | return f 65 | } 66 | 67 | func hash(data []byte) uint64 { 68 | return cityhash.CityHash64(data, uint32(len(data))) 69 | } 70 | 71 | func (f *Filter) bucketIndex(hv uint32) uint32 { 72 | return hv % f.nBuckets 73 | } 74 | 75 | func (f *Filter) fingerprint(hv uint32) uint16 { 76 | fp := uint16(hv & ((1 << fpBits) - 1)) 77 | 78 | // gross 79 | if fp == 0 { 80 | fp = 1 81 | } 82 | 83 | return fp 84 | } 85 | 86 | func (f *Filter) alternateIndex(idx uint32, fp uint16) uint32 { 87 | d := make([]byte, 2) 88 | binary.LittleEndian.PutUint16(d, fp) 89 | hv := hash(d) 90 | return f.bucketIndex(idx ^ uint32(hv)) 91 | } 92 | 93 | func (f *Filter) matchPosition(idx uint32, fp uint16) int { 94 | for i := 0; i < entriesPerBucket; i++ { 95 | if f.table[idx][i] == fp { 96 | return i 97 | } 98 | } 99 | 100 | return -1 101 | } 102 | 103 | func (f *Filter) emptyPosition(idx uint32) int { 104 | return f.matchPosition(idx, 0) 105 | } 106 | 107 | // Add adds an element to the cuckoo filter. If the filter is too 108 | // heavily loaded, ErrTooFull may be returned, which signifies that 109 | // the filter must be rebuilt with an increased maxKeys parameter. 
110 | func (f *Filter) Add(d []byte) error { 111 | h := hash(d) 112 | 113 | fp := f.fingerprint(uint32(h)) 114 | i1 := f.bucketIndex(uint32(h >> 32)) 115 | i2 := f.alternateIndex(i1, fp) 116 | 117 | if i := f.emptyPosition(i1); i != -1 { 118 | f.table[i1][i] = fp 119 | return nil 120 | } 121 | 122 | if i := f.emptyPosition(i2); i != -1 { 123 | f.table[i2][i] = fp 124 | return nil 125 | } 126 | 127 | // Choose which index to use randomly 128 | idx := [2]uint32{i1, i2}[rand.Intn(2)] 129 | 130 | for i := 0; i < maxDisplacements; i++ { 131 | j := uint32(rand.Intn(entriesPerBucket)) 132 | 133 | fp, f.table[idx][j] = f.table[idx][j], fp 134 | idx = f.alternateIndex(idx, fp) 135 | 136 | if ni := f.emptyPosition(idx); ni != -1 { 137 | f.table[idx][ni] = fp 138 | return nil 139 | } 140 | } 141 | 142 | return ErrTooFull 143 | } 144 | 145 | // Contains returns whether an element may be present in the set. 146 | // Cuckoo filters are probablistic data structures which can return 147 | // false positives. False negatives are not possible. 148 | func (f *Filter) Contains(d []byte) bool { 149 | h := hash(d) 150 | 151 | fp := f.fingerprint(uint32(h)) 152 | i1 := f.bucketIndex(uint32(h >> 32)) 153 | i2 := f.alternateIndex(i1, fp) 154 | 155 | return f.matchPosition(i1, fp) != -1 || f.matchPosition(i2, fp) != -1 156 | } 157 | 158 | // Delete deletes an element from the set. To delete an item safely, 159 | // it must have been previously inserted. Deleting a non-inserted 160 | // item might unintentionally remove a real, different item. 
161 | func (f *Filter) Delete(d []byte) bool { 162 | h := hash(d) 163 | 164 | fp := f.fingerprint(uint32(h)) 165 | i1 := f.bucketIndex(uint32(h >> 32)) 166 | i2 := f.alternateIndex(i1, fp) 167 | 168 | if i := f.matchPosition(i1, fp); i != -1 { 169 | f.table[i1][i] = 0 170 | return true 171 | } 172 | 173 | if i := f.matchPosition(i2, fp); i != -1 { 174 | f.table[i2][i] = 0 175 | return true 176 | } 177 | 178 | return false 179 | } 180 | -------------------------------------------------------------------------------- /filter_test.go: -------------------------------------------------------------------------------- 1 | package cuckoofilter 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "testing" 7 | ) 8 | 9 | func TestXor(t *testing.T) { 10 | f := New(1000000) 11 | h := hash([]byte("foo")) 12 | fp := f.fingerprint(uint32(h)) 13 | i1 := f.bucketIndex(uint32(h >> 32)) 14 | i2 := f.alternateIndex(i1, fp) 15 | 16 | fmt.Println("f = fingerprint(x):", fp) 17 | fmt.Println("i1 = hash(x):", i1) 18 | fmt.Println("i2 = i1 XOR hash(f)", i2) 19 | fmt.Println("i1 = i2 XOR hash(f)", f.alternateIndex(i2, fp)) 20 | 21 | if actual := f.alternateIndex(i2, fp); actual != i1 { 22 | t.Fatalf("expected %d, got %d", i1, actual) 23 | } 24 | } 25 | 26 | func key(i int) []byte { 27 | d := make([]byte, 8) 28 | binary.LittleEndian.PutUint64(d, uint64(i)) 29 | return d 30 | } 31 | 32 | func TestFilter(t *testing.T) { 33 | //rand.Seed(time.Now().UnixNano()) 34 | n := 100000 35 | f := New(uint32(n)) 36 | for i := 0; i < n; i++ { 37 | err := f.Add(key(i)) 38 | if err != nil { 39 | t.Fatalf("%d: expected nil, got %s", i, err) 40 | } 41 | } 42 | 43 | for i := 0; i < n; i++ { 44 | if !f.Contains(key(i)) { 45 | t.Fatalf("%d: expected true, got false", i) 46 | } 47 | } 48 | 49 | falseCount := 0 50 | for i := n; i < n*2; i++ { 51 | if f.Contains(key(i)) { 52 | falseCount++ 53 | } 54 | } 55 | fmt.Printf("False positive rate (before deletes): %d / %d\n", falseCount, n) 56 | 57 | // Remove half the 
keys, make sure the still-existing ones 58 | // always return true for Contains 59 | 60 | for i := 0; i < n/2; i++ { 61 | f.Delete(key(i)) 62 | } 63 | 64 | for i := n / 2; i < n; i++ { 65 | if !f.Contains(key(i)) { 66 | t.Fatalf("%d: expected true, got false", i) 67 | } 68 | } 69 | 70 | falseCount = 0 71 | for i := n; i < n*2; i++ { 72 | if f.Contains(key(i)) { 73 | falseCount++ 74 | } 75 | } 76 | fmt.Printf("False positive rate (after deletes): %d / %d\n", falseCount, n) 77 | } 78 | 79 | func benchmarkNew(b *testing.B, maxKeys uint32) { 80 | for i := 0; i < b.N; i++ { 81 | New(maxKeys) 82 | } 83 | } 84 | 85 | func BenchmarkNew1(b *testing.B) { benchmarkNew(b, 1) } 86 | func BenchmarkNew10(b *testing.B) { benchmarkNew(b, 10) } 87 | func BenchmarkNew100(b *testing.B) { benchmarkNew(b, 100) } 88 | func BenchmarkNew1000(b *testing.B) { benchmarkNew(b, 1000) } 89 | func BenchmarkNew10000(b *testing.B) { benchmarkNew(b, 10000) } 90 | 91 | func BenchmarkAdd(b *testing.B) { 92 | f := New(uint32(b.N)) 93 | for i := 0; i < b.N; i++ { 94 | f.Add(key(i)) 95 | } 96 | } 97 | 98 | func BenchmarkContains(b *testing.B) { 99 | f := New(uint32(b.N)) 100 | for i := 0; i < b.N; i++ { 101 | f.Add(key(i)) 102 | } 103 | 104 | b.ResetTimer() 105 | for i := 0; i < b.N; i++ { 106 | f.Contains(key(i)) 107 | } 108 | } 109 | 110 | func BenchmarkDelete(b *testing.B) { 111 | f := New(uint32(b.N)) 112 | for i := 0; i < b.N; i++ { 113 | f.Add(key(i)) 114 | } 115 | 116 | b.ResetTimer() 117 | for i := 0; i < b.N; i++ { 118 | f.Delete(key(i)) 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/joeshaw/cuckoofilter 2 | 3 | go 1.12 4 | 5 | require github.com/zhenjl/cityhash v0.0.0-20131128155616-cdd6a94144ab 6 | --------------------------------------------------------------------------------