├── minsketch.go
├── minsketch8.go
├── README.md
├── abacus.go
└── 199506.svg
/minsketch.go:
--------------------------------------------------------------------------------
1 | package abacus
2 |
3 | import "sync"
4 | import (
5 | "github.com/spaolacci/murmur3"
6 | "unsafe"
7 | )
8 |
// CountType is the integer type of one Sketch cell and of the estimates
// returned by Sketch.Add and Sketch.Query.
type CountType uint32

// Max is the largest value a CountType can hold (every bit set).
const Max = ^CountType(0)
11 |
12 | type Sketch struct {
13 | Width uint32
14 | Depth uint32
15 | Count [][]CountType
16 | mutex sync.RWMutex
17 | }
18 |
19 | func sizeOfCell() uintptr{
20 | var a CountType
21 | return unsafe.Sizeof(a)
22 | }
23 |
24 | func NewSketch(width, depth uint32) (sk *Sketch) {
25 | sk = &Sketch{
26 | Width: width,
27 | Depth: depth,
28 | Count: make([][]CountType, depth),
29 | }
30 | for i := uint32(0); i < depth; i++ {
31 | sk.Count[i] = make([]CountType, width)
32 | }
33 | return sk
34 | }
35 |
36 | func (sk *Sketch) Incr(dat []byte) (min CountType) {
37 | return sk.Add(dat, 1)
38 | }
39 |
40 | func (sk *Sketch) positions(dat []byte) (pos []uint32) {
41 | // reference: https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/membership/Filter.java
42 | hash1 := murmur3.Sum32WithSeed(dat, 0)
43 | hash2 := murmur3.Sum32WithSeed(dat, hash1)
44 | pos = make([]uint32, sk.Depth)
45 | for i := uint32(0); i < sk.Depth; i++ {
46 | pos[i] = (hash1 + i*hash2) % sk.Width
47 | }
48 | return pos
49 | }
50 |
51 | func (sk *Sketch) Add(dat []byte, cnt CountType) (min CountType) {
52 | pos := sk.positions(dat)
53 | min = sk.query(pos)
54 |
55 | min += cnt
56 |
57 | sk.mutex.Lock()
58 | for i := uint32(0); i < sk.Depth; i++ {
59 | v := sk.Count[i][pos[i]]
60 | if v < min {
61 | sk.Count[i][pos[i]] = min
62 | }
63 | }
64 | sk.mutex.Unlock()
65 |
66 | return min
67 | }
68 |
69 | func (sk *Sketch) Query(dat []byte) (min CountType) {
70 | pos := sk.positions(dat)
71 | return sk.query(pos)
72 | }
73 |
74 | func (sk *Sketch) query(pos []uint32) (min CountType) {
75 | min = Max
76 |
77 | sk.mutex.RLock()
78 | for i := uint32(0); i < sk.Depth; i++ {
79 | v := sk.Count[i][pos[i]]
80 | if min > v {
81 | min = v
82 | }
83 | }
84 | sk.mutex.RUnlock()
85 |
86 | return min
87 | }
--------------------------------------------------------------------------------
/minsketch8.go:
--------------------------------------------------------------------------------
1 | package abacus
2 |
3 | import "sync"
4 | import (
5 | "github.com/spaolacci/murmur3"
6 | "unsafe"
7 | )
8 |
// CountTypeLog8 is the 8-bit integer type of one SketchLog8 cell and of the
// estimates returned by SketchLog8.Add and SketchLog8.Query.
type CountTypeLog8 uint8

// MaxLog8 is the largest value a CountTypeLog8 can hold (every bit set).
const MaxLog8 = ^CountTypeLog8(0)
11 |
12 | type SketchLog8 struct {
13 | Width uint32
14 | Depth uint32
15 | Count [][]CountTypeLog8
16 | mutex sync.RWMutex
17 | }
18 |
19 | func sizeOfCellLog8() uintptr{
20 | var a CountTypeLog8
21 | return unsafe.Sizeof(a)
22 | }
23 |
24 | func NewSketchLog8(width, depth uint32) (sk *SketchLog8) {
25 | sk = &SketchLog8{
26 | Width: width,
27 | Depth: depth,
28 | Count: make([][]CountTypeLog8, depth),
29 | }
30 | for i := uint32(0); i < depth; i++ {
31 | sk.Count[i] = make([]CountTypeLog8, width)
32 | }
33 | return sk
34 | }
35 |
36 | func (sk *SketchLog8) Incr(dat []byte) (min CountTypeLog8) {
37 | return sk.Add(dat, 1)
38 | }
39 |
40 | func (sk *SketchLog8) positions(dat []byte) (pos []uint32) {
41 | // reference: https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/membership/Filter.java
42 | hash1 := murmur3.Sum32WithSeed(dat, 0)
43 | hash2 := murmur3.Sum32WithSeed(dat, hash1)
44 | pos = make([]uint32, sk.Depth)
45 | for i := uint32(0); i < sk.Depth; i++ {
46 | pos[i] = (hash1 + i*hash2) % sk.Width
47 | }
48 | return pos
49 | }
50 |
51 | func (sk *SketchLog8) Add(dat []byte, cnt CountTypeLog8) (min CountTypeLog8) {
52 | pos := sk.positions(dat)
53 | min = sk.query(pos)
54 |
55 | min += cnt
56 |
57 | sk.mutex.Lock()
58 | for i := uint32(0); i < sk.Depth; i++ {
59 | v := sk.Count[i][pos[i]]
60 | if v < min {
61 | sk.Count[i][pos[i]] = min
62 | }
63 | }
64 | sk.mutex.Unlock()
65 |
66 | return min
67 | }
68 |
69 | func (sk *SketchLog8) Query(dat []byte) (min CountTypeLog8) {
70 | pos := sk.positions(dat)
71 | return sk.query(pos)
72 | }
73 |
74 | func (sk *SketchLog8) query(pos []uint32) (min CountTypeLog8) {
75 | min = MaxLog8
76 |
77 | sk.mutex.RLock()
78 | for i := uint32(0); i < sk.Depth; i++ {
79 | v := sk.Count[i][pos[i]]
80 | if min > v {
81 | min = v
82 | }
83 | }
84 | sk.mutex.RUnlock()
85 |
86 | return min
87 | }
88 |
89 |
90 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Abacus
4 |
5 | Abacus lets you count item frequencies in big datasets with a fixed amount of memory.
6 |
7 | Unlike a regular counter it trades off accuracy for memory.
8 | This is useful for particular tasks: for example, in NLP/ML-related tasks you might want to count millions of items
9 | where approximate counts are good enough.
10 |
11 | Example:
12 |
13 | ```go
14 | counter := abacus.New(10) // maxMemoryMB = 10: abacus will use max 10MB to store your counts
15 | counter.Update([]string{"item1", "item2", "item2"})
16 | counter.Counts("item1") // 1 , counts for "item1"
17 | counter.Total() // 3 ,Total number of counts (sum of counts of all elements)
18 | counter.Cardinality() // 2 , How many different items are there?
19 | ```
20 |
21 | Abacus lets you define how much memory you want to use and you go from there counting items.
22 | Of course there are some limitations, and if you set the memory threshold too low, you might get inaccurate counts.
23 |
24 | ## Benchmarks
25 |
26 | - Counting bigrams (words) from [Wiki corpus](http://www.cs.upc.edu/~nlp/wikicorpus/).
27 | - Compared memory and accuracy of `Abacus` vs using a `map[string]int`
28 |
29 |
30 | Corpus Data Structure Used Memory Accuracy
31 |
32 | | Corpus | Data Structure | Used Memory | Accuracy |
33 | |---------|-----------------|-----------------|-----------|
34 | | Half of Wiki corpus (English) | Abacus (1000MB) | 1.75GB | 96% |
35 | | Half of Wiki corpus (English) | Abacus (Log8) (200MB) | 369MB | 70% |
36 | | Half of Wiki corpus (English) | Abacus (Log8) (400MB) | 407MB | 98% |
37 | | Half of Wiki corpus (English) | Map | 3.3GB | 100% |
38 |
39 | | Corpus | Data Structure | Used Memory | Accuracy |
40 | |---------|-----------------|-----------------|-----------|
41 | | Complete Wiki corpus (English) | Abacus (2200MB) | 3.63GB | 98% |
42 | | Complete Wiki corpus (English) | Abacus (500MB) | 741MB | 15% |
43 | | Complete Wiki corpus (English) | Abacus (Log8) (500MB) | 760MB | 90% |
44 | | Complete Wiki corpus (English) | Abacus (Log8) (700MB) | 889MB | 97% |
45 | | Complete Wiki corpus (English) | Map | 10.46GB | 100% |
46 |
47 | Note: This is me playing with Golang again, heavily based on [Bounter](https://github.com/RaRe-Technologies/bounter)
48 |
49 |
50 |
51 |
52 | ## Under the hood
53 |
54 | ### Count–min sketch
55 |
56 | Used to count item frequencies.
57 |
58 | ### HyperLogLog
59 |
60 | Used to calculate the cardinality
61 |
62 | -----------
63 |
64 | Icon made by [free-icon](https://www.flaticon.com/free-icon/)
65 |
--------------------------------------------------------------------------------
/abacus.go:
--------------------------------------------------------------------------------
1 | package abacus
2 |
3 | import (
4 | "github.com/sasha-s/go-hll"
5 | "github.com/spaolacci/murmur3"
6 | "math/big"
7 | )
8 |
9 | type Abacus interface{
10 | Counts(key string) (CountType, error)
11 | Update(items []string) error
12 | Total() (*big.Int, error)
13 | Cardinality() (CountType, error)
14 | }
15 |
16 | func widthAndDepthFromSize(sizeMB uint) (uint32, uint32){
17 | width := uint64(uint64(sizeMB*1000000) / uint64( 2 * 8 * sizeOfCell() ))
18 | depth :=(uint64(sizeMB)*1000000) / (width * uint64(sizeOfCell()))
19 | return uint32(width), uint32(depth)
20 | }
21 |
22 | type memoryAbacus struct{
23 | MaxMemorySize uint
24 | s *Sketch
25 | h hll.HLL
26 | total *big.Int
27 | }
28 |
29 | func (a *memoryAbacus) Counts(key string) (CountType, error) {
30 | return a.s.Query([]byte(key)), nil
31 | }
32 |
33 | func (a *memoryAbacus) Update(items []string) error {
34 | for _, key := range items {
35 | a.s.Incr([]byte(key))
36 | a.h.Add(uint64(murmur3.Sum32([]byte(key))))
37 | a.total = a.total.Add(big.NewInt(1), a.total)
38 | }
39 |
40 | return nil
41 | }
42 |
43 | func (a *memoryAbacus) Total() (*big.Int, error){
44 | return a.total, nil
45 | }
46 |
47 | func (a *memoryAbacus) Cardinality() (CountType, error){
48 | return CountType(a.h.EstimateCardinality()),nil
49 | }
50 |
51 | func New(maxMemoryMB uint) memoryAbacus {
52 | w, d := widthAndDepthFromSize(maxMemoryMB)
53 | sketch := NewSketch(w, d)
54 | s, _ := hll.SizeByP(16)
55 | h := make(hll.HLL, s)
56 | a:= memoryAbacus{ MaxMemorySize: maxMemoryMB, s:sketch, h:h, total: big.NewInt(0)}
57 | return a
58 | }
59 |
60 |
61 | type memoryAbacusLog8 struct{
62 | MaxMemorySize uint
63 | s *SketchLog8
64 | h hll.HLL
65 | total *big.Int
66 | }
67 |
68 | func (a *memoryAbacusLog8) Counts(key string) (CountType, error) {
69 | return CountType(a.s.Query([]byte(key))), nil
70 | }
71 |
72 | func (a *memoryAbacusLog8) Update(items []string) error {
73 | for _, key := range items {
74 | a.s.Incr([]byte(key))
75 | a.h.Add(uint64(murmur3.Sum32([]byte(key))))
76 | a.total = a.total.Add(big.NewInt(1), a.total)
77 | }
78 |
79 | return nil
80 | }
81 |
82 | func (a *memoryAbacusLog8) Total() (*big.Int, error){
83 | return a.total, nil
84 | }
85 |
86 | func (a *memoryAbacusLog8) Cardinality() (CountType, error){
87 | return CountType(a.h.EstimateCardinality()),nil
88 | }
89 |
90 | func widthAndDepthFromSizeLog8(sizeMB uint) (uint32, uint32){
91 | width := uint64(uint64(sizeMB*1000000) / uint64( 2 * 8 * sizeOfCellLog8() ))
92 | depth :=(uint64(sizeMB)*1000000) / (width * uint64(sizeOfCellLog8()))
93 | return uint32(width), uint32(depth)
94 | }
95 |
96 | func NewAbacus8Log(maxMemoryMB uint) memoryAbacusLog8 {
97 | w, d := widthAndDepthFromSizeLog8(maxMemoryMB)
98 | sketch := NewSketchLog8(w, d)
99 | s, _ := hll.SizeByP(16)
100 | h := make(hll.HLL, s)
101 | a:= memoryAbacusLog8{ MaxMemorySize: maxMemoryMB, s:sketch, h:h, total: big.NewInt(0)}
102 | return a
103 | }
104 |
--------------------------------------------------------------------------------
/199506.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
109 |
--------------------------------------------------------------------------------