├── minsketch.go ├── minsketch8.go ├── README.md ├── abacus.go └── 199506.svg /minsketch.go: -------------------------------------------------------------------------------- 1 | package abacus 2 | 3 | import "sync" 4 | import ( 5 | "github.com/spaolacci/murmur3" 6 | "unsafe" 7 | ) 8 | 9 | type CountType uint32 10 | const Max = ^(CountType(0)) 11 | 12 | type Sketch struct { 13 | Width uint32 14 | Depth uint32 15 | Count [][]CountType 16 | mutex sync.RWMutex 17 | } 18 | 19 | func sizeOfCell() uintptr{ 20 | var a CountType 21 | return unsafe.Sizeof(a) 22 | } 23 | 24 | func NewSketch(width, depth uint32) (sk *Sketch) { 25 | sk = &Sketch{ 26 | Width: width, 27 | Depth: depth, 28 | Count: make([][]CountType, depth), 29 | } 30 | for i := uint32(0); i < depth; i++ { 31 | sk.Count[i] = make([]CountType, width) 32 | } 33 | return sk 34 | } 35 | 36 | func (sk *Sketch) Incr(dat []byte) (min CountType) { 37 | return sk.Add(dat, 1) 38 | } 39 | 40 | func (sk *Sketch) positions(dat []byte) (pos []uint32) { 41 | // reference: https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/membership/Filter.java 42 | hash1 := murmur3.Sum32WithSeed(dat, 0) 43 | hash2 := murmur3.Sum32WithSeed(dat, hash1) 44 | pos = make([]uint32, sk.Depth) 45 | for i := uint32(0); i < sk.Depth; i++ { 46 | pos[i] = (hash1 + i*hash2) % sk.Width 47 | } 48 | return pos 49 | } 50 | 51 | func (sk *Sketch) Add(dat []byte, cnt CountType) (min CountType) { 52 | pos := sk.positions(dat) 53 | min = sk.query(pos) 54 | 55 | min += cnt 56 | 57 | sk.mutex.Lock() 58 | for i := uint32(0); i < sk.Depth; i++ { 59 | v := sk.Count[i][pos[i]] 60 | if v < min { 61 | sk.Count[i][pos[i]] = min 62 | } 63 | } 64 | sk.mutex.Unlock() 65 | 66 | return min 67 | } 68 | 69 | func (sk *Sketch) Query(dat []byte) (min CountType) { 70 | pos := sk.positions(dat) 71 | return sk.query(pos) 72 | } 73 | 74 | func (sk *Sketch) query(pos []uint32) (min CountType) { 75 | min = Max 76 | 77 | sk.mutex.RLock() 78 | 
for i := uint32(0); i < sk.Depth; i++ { 79 | v := sk.Count[i][pos[i]] 80 | if min > v { 81 | min = v 82 | } 83 | } 84 | sk.mutex.RUnlock() 85 | 86 | return min 87 | } -------------------------------------------------------------------------------- /minsketch8.go: -------------------------------------------------------------------------------- 1 | package abacus 2 | 3 | import "sync" 4 | import ( 5 | "github.com/spaolacci/murmur3" 6 | "unsafe" 7 | ) 8 | 9 | type CountTypeLog8 uint8 10 | const MaxLog8 = ^(CountTypeLog8(0)) 11 | 12 | type SketchLog8 struct { 13 | Width uint32 14 | Depth uint32 15 | Count [][]CountTypeLog8 16 | mutex sync.RWMutex 17 | } 18 | 19 | func sizeOfCellLog8() uintptr{ 20 | var a CountTypeLog8 21 | return unsafe.Sizeof(a) 22 | } 23 | 24 | func NewSketchLog8(width, depth uint32) (sk *SketchLog8) { 25 | sk = &SketchLog8{ 26 | Width: width, 27 | Depth: depth, 28 | Count: make([][]CountTypeLog8, depth), 29 | } 30 | for i := uint32(0); i < depth; i++ { 31 | sk.Count[i] = make([]CountTypeLog8, width) 32 | } 33 | return sk 34 | } 35 | 36 | func (sk *SketchLog8) Incr(dat []byte) (min CountTypeLog8) { 37 | return sk.Add(dat, 1) 38 | } 39 | 40 | func (sk *SketchLog8) positions(dat []byte) (pos []uint32) { 41 | // reference: https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/membership/Filter.java 42 | hash1 := murmur3.Sum32WithSeed(dat, 0) 43 | hash2 := murmur3.Sum32WithSeed(dat, hash1) 44 | pos = make([]uint32, sk.Depth) 45 | for i := uint32(0); i < sk.Depth; i++ { 46 | pos[i] = (hash1 + i*hash2) % sk.Width 47 | } 48 | return pos 49 | } 50 | 51 | func (sk *SketchLog8) Add(dat []byte, cnt CountTypeLog8) (min CountTypeLog8) { 52 | pos := sk.positions(dat) 53 | min = sk.query(pos) 54 | 55 | min += cnt 56 | 57 | sk.mutex.Lock() 58 | for i := uint32(0); i < sk.Depth; i++ { 59 | v := sk.Count[i][pos[i]] 60 | if v < min { 61 | sk.Count[i][pos[i]] = min 62 | } 63 | } 64 | sk.mutex.Unlock() 65 | 66 | return min 67 | 
} 68 | 69 | func (sk *SketchLog8) Query(dat []byte) (min CountTypeLog8) { 70 | pos := sk.positions(dat) 71 | return sk.query(pos) 72 | } 73 | 74 | func (sk *SketchLog8) query(pos []uint32) (min CountTypeLog8) { 75 | min = MaxLog8 76 | 77 | sk.mutex.RLock() 78 | for i := uint32(0); i < sk.Depth; i++ { 79 | v := sk.Count[i][pos[i]] 80 | if min > v { 81 | min = v 82 | } 83 | } 84 | sk.mutex.RUnlock() 85 | 86 | return min 87 | } 88 | 89 | 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Abacus 4 | 5 | Abacus let you count item frequencies in big datasets with a fixed amount of memory. 6 | 7 | Unlike a regular counter it trades off accuracy for memory. 8 | This is useful for particular tasks, for example in NLP/ML related tasks you might want to count millions of items 9 | however approximate counts are good enough. 10 | 11 | Example: 12 | 13 | ```go 14 | counter := abacus.New(maxMemoryMB=10) // abacus will use max 10MB to store your counts 15 | counter.Update([]string{"item1", "item2", "item2"}) 16 | counter.Counts("item1") // 1 , counts for "item1" 17 | counter.Total() // 3 ,Total number of counts (sum of counts of all elements) 18 | counter.Cardinality() // 2 , How many different items are there? 19 | ``` 20 | 21 | Abacus lets you define how much memory you want to use and you go from there counting items. 22 | Of course there are some limitations, and if you set the memory threshold too low, you might get innacurate counts. 23 | 24 | ## Benchmarks 25 | 26 | - Counting bigrams (words) from [Wiki corpus](http://www.cs.upc.edu/~nlp/wikicorpus/). 
- Compared memory and accuracy of `Abacus` vs. using a `map[string]int`.

Results on half of the English Wiki corpus:

| Corpus | Data Structure | Used Memory | Accuracy |
|---------|-----------------|-----------------|-----------|
| Half of Wiki corpus (English) | Abacus (1000MB) | 1.75GB | 96% |
| Half of Wiki corpus (English) | Abacus (Log8) (200MB) | 369MB | 70% |
| Half of Wiki corpus (English) | Abacus (Log8) (400MB) | 407MB | 98% |
| Half of Wiki corpus (English) | Map | 3.3GB | 100% |

Results on the complete English Wiki corpus:

| Corpus | Data Structure | Used Memory | Accuracy |
|---------|-----------------|-----------------|-----------|
| Complete Wiki corpus (English) | Abacus (2200MB) | 3.63GB | 98% |
| Complete Wiki corpus (English) | Abacus (500MB) | 741MB | 15% |
| Complete Wiki corpus (English) | Abacus (Log8) (500MB) | 760MB | 90% |
| Complete Wiki corpus (English) | Abacus (Log8) (700MB) | 889MB | 97% |
| Complete Wiki corpus (English) | Map | 10.46GB | 100% |

Note: This is me playing with Golang again, heavily based on [Bounter](https://github.com/RaRe-Technologies/bounter)

## Under the hood

### Count–min sketch

Used to count item frequencies.
57 | 58 | ### HyperLogLog 59 | 60 | Used to calculate the cardinality 61 | 62 | ----------- 63 | 64 | Icon made by [free-icon](https://www.flaticon.com/free-icon/) 65 | -------------------------------------------------------------------------------- /abacus.go: -------------------------------------------------------------------------------- 1 | package abacus 2 | 3 | import ( 4 | "github.com/sasha-s/go-hll" 5 | "github.com/spaolacci/murmur3" 6 | "math/big" 7 | ) 8 | 9 | type Abacus interface{ 10 | Counts(key string) (CountType, error) 11 | Update(items []string) error 12 | Total() (*big.Int, error) 13 | Cardinality() (CountType, error) 14 | } 15 | 16 | func widthAndDepthFromSize(sizeMB uint) (uint32, uint32){ 17 | width := uint64(uint64(sizeMB*1000000) / uint64( 2 * 8 * sizeOfCell() )) 18 | depth :=(uint64(sizeMB)*1000000) / (width * uint64(sizeOfCell())) 19 | return uint32(width), uint32(depth) 20 | } 21 | 22 | type memoryAbacus struct{ 23 | MaxMemorySize uint 24 | s *Sketch 25 | h hll.HLL 26 | total *big.Int 27 | } 28 | 29 | func (a *memoryAbacus) Counts(key string) (CountType, error) { 30 | return a.s.Query([]byte(key)), nil 31 | } 32 | 33 | func (a *memoryAbacus) Update(items []string) error { 34 | for _, key := range items { 35 | a.s.Incr([]byte(key)) 36 | a.h.Add(uint64(murmur3.Sum32([]byte(key)))) 37 | a.total = a.total.Add(big.NewInt(1), a.total) 38 | } 39 | 40 | return nil 41 | } 42 | 43 | func (a *memoryAbacus) Total() (*big.Int, error){ 44 | return a.total, nil 45 | } 46 | 47 | func (a *memoryAbacus) Cardinality() (CountType, error){ 48 | return CountType(a.h.EstimateCardinality()),nil 49 | } 50 | 51 | func New(maxMemoryMB uint) memoryAbacus { 52 | w, d := widthAndDepthFromSize(maxMemoryMB) 53 | sketch := NewSketch(w, d) 54 | s, _ := hll.SizeByP(16) 55 | h := make(hll.HLL, s) 56 | a:= memoryAbacus{ MaxMemorySize: maxMemoryMB, s:sketch, h:h, total: big.NewInt(0)} 57 | return a 58 | } 59 | 60 | 61 | type memoryAbacusLog8 struct{ 62 | MaxMemorySize uint 63 
| s *SketchLog8 64 | h hll.HLL 65 | total *big.Int 66 | } 67 | 68 | func (a *memoryAbacusLog8) Counts(key string) (CountType, error) { 69 | return CountType(a.s.Query([]byte(key))), nil 70 | } 71 | 72 | func (a *memoryAbacusLog8) Update(items []string) error { 73 | for _, key := range items { 74 | a.s.Incr([]byte(key)) 75 | a.h.Add(uint64(murmur3.Sum32([]byte(key)))) 76 | a.total = a.total.Add(big.NewInt(1), a.total) 77 | } 78 | 79 | return nil 80 | } 81 | 82 | func (a *memoryAbacusLog8) Total() (*big.Int, error){ 83 | return a.total, nil 84 | } 85 | 86 | func (a *memoryAbacusLog8) Cardinality() (CountType, error){ 87 | return CountType(a.h.EstimateCardinality()),nil 88 | } 89 | 90 | func widthAndDepthFromSizeLog8(sizeMB uint) (uint32, uint32){ 91 | width := uint64(uint64(sizeMB*1000000) / uint64( 2 * 8 * sizeOfCellLog8() )) 92 | depth :=(uint64(sizeMB)*1000000) / (width * uint64(sizeOfCellLog8())) 93 | return uint32(width), uint32(depth) 94 | } 95 | 96 | func NewAbacus8Log(maxMemoryMB uint) memoryAbacusLog8 { 97 | w, d := widthAndDepthFromSizeLog8(maxMemoryMB) 98 | sketch := NewSketchLog8(w, d) 99 | s, _ := hll.SizeByP(16) 100 | h := make(hll.HLL, s) 101 | a:= memoryAbacusLog8{ MaxMemorySize: maxMemoryMB, s:sketch, h:h, total: big.NewInt(0)} 102 | return a 103 | } 104 | -------------------------------------------------------------------------------- /199506.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 9 | 11 | 13 | 14 | 16 | 18 | 19 | 21 | 22 | 23 | 25 | 26 | 27 | 29 | 30 | 31 | 33 | 34 | 36 | 38 | 39 | 41 | 42 | 43 | 45 | 46 | 47 | 49 | 50 | 51 | 53 | 54 | 56 | 58 | 59 | 61 | 63 | 64 | 66 | 68 | 69 | 71 | 73 | 74 | 76 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | --------------------------------------------------------------------------------