├── .travis.yml ├── const.go ├── filter_test.go ├── bucket.go ├── LICENSE ├── hash.go ├── README.md └── filter.go /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - 1.5 4 | -------------------------------------------------------------------------------- /const.go: -------------------------------------------------------------------------------- 1 | // zheng-ji.info 2 | 3 | package cuckoo 4 | 5 | const ( 6 | // NotFound const 7 | NotFound = -1 8 | 9 | // SlotSize const 10 | SlotSize = 4 11 | 12 | // SignatureSize const 13 | SignatureSize = 1 14 | 15 | // MaxCuckooCount max times to try when collision 16 | MaxCuckooCount = 800 17 | ) 18 | -------------------------------------------------------------------------------- /filter_test.go: -------------------------------------------------------------------------------- 1 | // zheng-ji.info 2 | 3 | package cuckoo 4 | 5 | import ( 6 | "testing" 7 | ) 8 | 9 | func TestCuckoo(t *testing.T) { 10 | filter := NewFilter(10) 11 | t.Log(getCeilingCap(uint64(10)) / SlotSize) 12 | 13 | filter.Insert([]byte("zheng-ji")) 14 | filter.Insert([]byte("scut")) 15 | filter.Insert([]byte("coder")) 16 | filter.Insert([]byte("stupid")) 17 | 18 | t.Log(filter.buckets) 19 | t.Log(filter.Size()) 20 | 21 | if filter.Find([]byte("stupid")) { 22 | t.Log("exist") 23 | } else { 24 | t.Log("Not exist") 25 | } 26 | 27 | filter.Del([]byte("stupid")) 28 | if filter.Find([]byte("stupid")) { 29 | t.Log("exist") 30 | } else { 31 | t.Log("Not exist") 32 | } 33 | 34 | t.Log(filter.buckets) 35 | t.Log(filter.Size()) 36 | } 37 | -------------------------------------------------------------------------------- /bucket.go: -------------------------------------------------------------------------------- 1 | // zheng-ji.info 2 | 3 | package cuckoo 4 | 5 | // Signature Type,mean FingerPrint 6 | type Signature [SignatureSize]byte 7 | 8 | // Bucket Type, has slotsize signature 9 | type Bucket 
[SlotSize]Signature 10 | 11 | // Empty Signature 12 | var Empty = Signature{0} 13 | 14 | func (bk *Bucket) insert(sign Signature) bool { 15 | for index, vsign := range bk { 16 | if vsign == Empty { 17 | bk[index] = sign 18 | return true 19 | } 20 | } 21 | return false 22 | } 23 | 24 | func (bk *Bucket) del(sign Signature) bool { 25 | for index, vsign := range bk { 26 | if vsign == sign { 27 | bk[index] = Empty 28 | return true 29 | } 30 | } 31 | return false 32 | } 33 | 34 | func (bk *Bucket) lookupIndex(sign Signature) int { 35 | for index, vsign := range bk { 36 | if vsign == sign { 37 | return index 38 | } 39 | } 40 | return NotFound 41 | } 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 zheng-ji.info 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /hash.go: -------------------------------------------------------------------------------- 1 | // zheng-ji.info 2 | 3 | package cuckoo 4 | 5 | import ( 6 | "encoding/binary" 7 | "hash/fnv" 8 | "math" 9 | ) 10 | 11 | func getCeilingCap(capacity uint64) uint { 12 | num := 1 13 | for ; capacity/2 != 0; capacity = capacity / 2 { 14 | num++ 15 | } 16 | return uint(math.Pow(2, float64(num))) 17 | } 18 | 19 | func genSignature(data []byte) Signature { 20 | hashInstance := fnv.New64() 21 | hashInstance.Reset() 22 | hashInstance.Write(data) 23 | hash := hashInstance.Sum(nil) 24 | sign := Signature{} 25 | for i := 0; i < SignatureSize; i++ { 26 | sign[i] = hash[i] 27 | } 28 | if sign == Empty { 29 | sign[0] ^= 1 30 | } 31 | return sign 32 | } 33 | 34 | func genFirstIndex(sign Signature, numBuckets uint) uint { 35 | bytes := make([]byte, 64, 64) 36 | for i, b := range sign { 37 | bytes[i] = b 38 | } 39 | hash := binary.LittleEndian.Uint64(bytes) 40 | return uint(hash) & (numBuckets - 1) 41 | } 42 | 43 | func genBackupIndex(sign Signature, numBuckets uint) uint { 44 | bytes := make([]byte, 64, 64) 45 | for i, b := range sign { 46 | bytes[i] = b 47 | } 48 | hash := binary.BigEndian.Uint64(bytes) 49 | return uint(hash) & (numBuckets - 1) 50 | } 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## goCuckoo 2 | 3 | [![Build Status](https://travis-ci.org/zheng-ji/goCuckoo.svg)](https://travis-ci.org/zheng-ji/goCuckoo) 4 | [![GoDoc](https://godoc.org/github.com/zheng-ji/goCuckoo?status.svg)](https://godoc.org/github.com/zheng-ji/goCuckoo) 5 | [![Go Report Card](https://goreportcard.com/badge/github.com/zheng-ji/goCuckoo)](https://goreportcard.com/report/github.com/zheng-ji/goCuckoo) 6 | 7 | 8 | A Cuckoo hashing, substituting for bloom filter. 
Written in Go 9 | 10 | 一个 CuckooFilter 的 Go 库, BloomFilter 的替代物 11 | 12 | ![goCuckoo](https://cloud.githubusercontent.com/assets/1414745/17084380/8c3a4896-51ee-11e6-869e-b087226cc5ce.jpg) 13 | 14 | Description 15 | ----------- 16 | 17 | 面对海量数据,我们需要一个索引数据结构,用来帮助查询,快速判断数据记录是否存在,这类数据结构叫过滤器,常用的选择是 `Bloom Filter`. 而 `Cuckoo Filter` 是它的优化变种。 18 | 19 | `Bloom Filter` 的位图模式有两个问题: 20 | 21 | * 误报,它能判断元素一定不存在,但只能判断可能存在,因为存在其它元素被映射到部分相同位上,导致该位置1,那么一个不存在的元素可能会被误报成存在; 22 | * 漏报,如果删除了某个元素,导致该映射位被置0,那么本来存在的元素会被漏报成不存在。 23 | 24 | `Cuckoo Filter`,可以确保该元素存在的必然性,又可以在不违背此前提下删除任意元素,仅仅比 `Bloom Filter` 牺牲了微量空间效率。 它的数据模型: 25 | 26 | * 每个元素对应两个哈希算法,在哈希碰撞时会启用备用哈希算法。 27 | * 每一个桶是有4路的槽,每个槽对应一个指纹。 28 | 29 | ![model](https://cloud.githubusercontent.com/assets/1414745/17103421/c97635e0-52b0-11e6-83ac-1b1fdbb5d31c.png) 30 | 31 | 32 | Feature 33 | -------- 34 | 35 | * Deletion Support 36 | * Fast Lookup O(1) 37 | * High Space Utilization, 4-way set-associative table: > 95% entries occupied 38 | * Substituting for Bloom Filters 39 | 40 | 41 | Installation 42 | ------------- 43 | 44 | ``` 45 | go get github.com/zheng-ji/goCuckoo 46 | ``` 47 | 48 | Example 49 | ------- 50 | 51 | ```go 52 | import ( 53 | "fmt" 54 | "github.com/zheng-ji/goCuckoo" 55 | ) 56 | 57 | func main() { 58 | // specify capacity 59 | filter := cuckoo.NewFilter(10000) 60 | 61 | filter.Insert([]byte("zheng-ji")) 62 | filter.Insert([]byte("stupid")) 63 | filter.Insert([]byte("coder")) 64 | 65 | if filter.Find([]byte("stupid")) { 66 | fmt.Println("exist") 67 | } else { 68 | fmt.Println("Not exist") 69 | } 70 | 71 | filter.Del([]byte("stupid")) 72 | fmt.Println(filter.Size()) 73 | } 74 | ``` 75 | 76 | Documentation 77 | ------------- 78 | 79 | - [CMU Paper](http://www.cs.cmu.edu/~binfan/papers/conext14_cuckoofilter.pdf) 80 | - [CMU PPT](http://www.cs.cmu.edu/~binfan/papers/conext14_cuckoofilter.pptx) 81 | - [CoolShell Article](http://coolshell.cn/articles/17225.html) 82 | 83 | License 84 | ------- 85 | 86 | Copyright (c) 2016 by 
[zheng-ji](http://zheng-ji.info) released under MIT License. 87 | -------------------------------------------------------------------------------- /filter.go: -------------------------------------------------------------------------------- 1 | // zheng-ji.info 2 | 3 | package cuckoo 4 | 5 | import ( 6 | "math/rand" 7 | "sync" 8 | ) 9 | 10 | // Filter struct 11 | type Filter struct { 12 | num int 13 | buckets []Bucket 14 | lock *sync.Mutex 15 | } 16 | 17 | // NewFilter Init a Filter with capacity 18 | func NewFilter(capacity uint) *Filter { 19 | capacity = getCeilingCap(uint64(capacity)) / SlotSize 20 | if capacity == 0 { 21 | capacity = 1 22 | } 23 | buckets := make([]Bucket, capacity, capacity) 24 | for i := range buckets { 25 | buckets[i] = [SlotSize]Signature{} 26 | } 27 | return &Filter{ 28 | buckets: buckets, 29 | num: 0, 30 | lock: new(sync.Mutex), 31 | } 32 | } 33 | 34 | // Find Func,check an entry exist or not 35 | func (filter *Filter) Find(data []byte) bool { 36 | sign := genSignature(data) 37 | firstIndex := genFirstIndex(sign, uint(len(filter.buckets))) 38 | backupIndex := genBackupIndex(sign, uint(len(filter.buckets))) 39 | 40 | bk1 := &filter.buckets[firstIndex] 41 | bk2 := &filter.buckets[backupIndex] 42 | 43 | if bk1.lookupIndex(sign) != NotFound || bk2.lookupIndex(sign) != NotFound { 44 | return true 45 | } 46 | return false 47 | } 48 | 49 | // Insert Func,Insert an entry 50 | func (filter *Filter) Insert(data []byte) bool { 51 | filter.lock.Lock() 52 | defer filter.lock.Unlock() 53 | 54 | sign := genSignature(data) 55 | firstIndex := genFirstIndex(sign, uint(len(filter.buckets))) 56 | backupIndex := genBackupIndex(sign, uint(len(filter.buckets))) 57 | bk1 := &filter.buckets[firstIndex] 58 | bk2 := &filter.buckets[backupIndex] 59 | if bk1.insert(sign) || bk2.insert(sign) { 60 | filter.num++ 61 | return true 62 | } 63 | return filter.resolveCollision(sign, backupIndex) 64 | } 65 | 66 | func (filter *Filter) resolveCollision(sign Signature, index 
uint) bool { 67 | for i := 0; i < MaxCuckooCount; i++ { 68 | j := rand.Intn(SlotSize) 69 | tmpsign := sign 70 | sign = filter.buckets[index][j] 71 | filter.buckets[index][j] = tmpsign 72 | index = genBackupIndex(sign, uint(len(filter.buckets))) 73 | bk := &filter.buckets[index] 74 | if bk.insert(sign) { 75 | filter.num++ 76 | return true 77 | } 78 | } 79 | return false 80 | } 81 | 82 | // Del Func:delete entry 83 | func (filter *Filter) Del(data []byte) bool { 84 | filter.lock.Lock() 85 | defer filter.lock.Unlock() 86 | 87 | sign := genSignature(data) 88 | firstIndex := genFirstIndex(sign, uint(len(filter.buckets))) 89 | backupIndex := genBackupIndex(sign, uint(len(filter.buckets))) 90 | bk1 := &filter.buckets[firstIndex] 91 | bk2 := &filter.buckets[backupIndex] 92 | return bk1.del(sign) || bk2.del(sign) 93 | } 94 | 95 | // Size Fun: get size of Filter's element 96 | func (filter *Filter) Size() int { 97 | return filter.num 98 | } 99 | --------------------------------------------------------------------------------