├── .gitignore
├── LICENSE
├── README.md
├── vhll.go
└── vhll_test.go
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
2 | *.o
3 | *.a
4 | *.so
5 |
6 | # Folders
7 | _obj
8 | _test
9 |
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 |
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 |
20 | _testmain.go
21 |
22 | *.exe
23 | *.test
24 | *.prof
25 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Seif Lotfy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Virtual HyperLogLog
2 |
3 | A virtual HyperLogLog is a highly compact virtual maximum likelihood Sketch for counting big network data.
4 |
5 | TL;DR: Multiple HyperLogLogs in one HyperLogLog, by sharing bits amongst each other (down to 0.1 bits in theory per register)
6 |
7 | Long version:
8 | The data structure is based on the paper (see below), which proposes a new method, called virtual maximum likelihood sketches, to reduce the memory consumption of cardinality estimation on a large number of flows. It embodies two ideas. The first idea is called virtual sketches, which uses no more than two bits per sketch on average, while retaining the functional equivalence to an FM sketch. The second idea is called virtual sketch vectors, which combine the sketches of all flows into a mixed common pool. Together, these two ideas can drastically reduce the overall memory overhead. Based on virtual sketches and virtual vectors, we design a cardinality estimation solution with an online operation module and an offline estimation module.
9 |
10 |
11 | For details about the algorithm, and for citations, please refer to this paper for now:
12 |
13 | ["Hyper-Compact Virtual Estimators for Big Network Data Based on Register Sharing" by Qingjun Xiao, Shigang Chen, Min Chen & Yibei Ling](http://www.cise.ufl.edu/~min/paper/sigmetrics15.pdf)
14 |
15 | ## Note
16 | This implementation uses a static register size of 1 byte instead of 6 bits. It is still under development: the main concept is implemented, but it still needs optimization.
17 |
18 | ## Example usage
19 | ```go
20 |
21 | import "github.com/seiflotfy/vhll"
22 |
23 | v, _ := vhll.NewVHLL(18, 12)
24 |
25 | // repeat several times for higher accuracy since this is a maximum likelihood sketch
26 | v.Insert([]byte("first flow"), []byte("some data"))
27 |
28 | count := v.Estimate([]byte("first flow"))
29 | // count == +/- 13% for now
30 |
31 | count := cf.GetTotalCardinality()
32 | // count == +/- 13% for now
33 |
34 | ```
35 |
--------------------------------------------------------------------------------
/vhll.go:
--------------------------------------------------------------------------------
1 | package vhll
2 |
3 | import (
4 | "errors"
5 | "math"
6 |
7 | metro "github.com/dgryski/go-metro"
8 | )
9 |
// alpha returns the bias-correction constant used by the cardinality
// estimators for a sketch made of m registers.
//
// Sizes 16, 32 and 64 map to fixed constants; every other size uses the
// closed-form approximation 0.7213 / (1 + 1.079/m).
func alpha(m float64) float64 {
	if m == 16 {
		return 0.673
	}
	if m == 32 {
		return 0.697
	}
	if m == 64 {
		return 0.709
	}
	return 0.7213 / (1 + 1.079/m)
}
21 |
// zeros reports how many entries of registers are equal to 0. The count is
// returned as a float64 so it can be used directly in floating-point
// estimator arithmetic.
func zeros(registers []uint8) (z float64) {
	for i := 0; i < len(registers); i++ {
		if registers[i] != 0 {
			continue
		}
		z++
	}
	return z
}
30 |
// beta evaluates the bias-correction term that the estimators add to the
// harmonic register sum. ez is the number of registers that are still zero.
//
// The result is -0.370393911*ez plus a degree-7 polynomial (no constant
// term) in zl = ln(ez + 1).
//
// NOTE(review): the coefficients look like the fitted constants of the
// LogLog-Beta estimator for one specific precision — confirm their origin
// before using other sketch sizes.
func beta(ez float64) float64 {
	// Coefficients of zl^1 .. zl^7, lowest power first.
	coeffs := [...]float64{
		0.070471823,
		0.17393686,
		0.16339839,
		-0.09237745,
		0.03738027,
		-0.005384159,
		0.00042419,
	}

	zl := math.Log(ez + 1)
	acc := -0.370393911 * ez
	// Terms are accumulated in increasing power order, matching the order of
	// the written-out polynomial.
	for i, c := range coeffs {
		acc += c * math.Pow(zl, float64(i+1))
	}
	return acc
}
42 |
// rho returns the 1-based position of the leftmost 1-bit of val, i.e. the
// number of leading zero bits plus one: rho(1<<63) == 1 and rho(1) == 64.
//
// When val is 0 there is no set bit to find; rho then returns 65 (all 64
// bits were zero) instead of scanning forever.
func rho(val uint64) (r uint8) {
	const msb = uint64(1) << 63
	// Bounding the scan to the 64 available bits guarantees termination even
	// for a zero input, which an unbounded "shift until the top bit is set"
	// loop would never leave.
	for r = 1; r <= 64 && val&msb == 0; r++ {
		val <<= 1
	}
	return r
}
51 |
52 | func hash(e []byte) uint64 {
53 | return metro.Hash64(e, 1337)
54 | }
55 |
// sumAndZeros walks register once and returns two values:
//
//   - the sum of 2^-val over every register value val, and
//   - the number of registers whose value is 0.
//
// Both are returned as float64 for direct use in the estimator formulas.
func sumAndZeros(register []uint8) (float64, float64) {
	var harmonic, empty float64
	for i := 0; i < len(register); i++ {
		rank := register[i]
		if rank == 0 {
			empty++
		}
		harmonic += 1.0 / math.Pow(2.0, float64(rank))
	}
	return harmonic, empty
}
67 |
// VHLL is a virtual HyperLogLog: one shared pool of registers from which a
// per-flow virtual sketch is selected by hashing the flow identifier.
// Values are normally created with NewVHLL.
type VHLL struct {
	// M is the shared pool of physical registers; NewVHLL allocates it with
	// m entries.
	M []uint8

	// m is the number of physical registers, s the number of virtual
	// registers that make up one flow's sketch, and log2s the base-2
	// logarithm of s (the virtual precision passed to NewVHLL).
	m, s, log2s uint64

	// mAlpha and sAlpha are the alpha correction constants for sketches of
	// m and s registers respectively.
	mAlpha, sAlpha float64
}
77 |
78 | func (v *VHLL) hashi(i uint64, f []byte) uint64 {
79 | return metro.Hash64(f, i) % v.m
80 | }
81 |
82 | // NewVHLL ...
83 | func NewVHLL(precision, vPrecision uint8) (*VHLL, error) {
84 | if precision < 9 {
85 | return nil, errors.New("precision needs to be >= 9")
86 | }
87 | if vPrecision < 8 || vPrecision > 12 {
88 | return nil, errors.New("virtual precision needs to be >= 8 and <= 10")
89 | }
90 | if precision < vPrecision {
91 | return nil, errors.New("virtual precision needs to be > precision")
92 | }
93 | m := uint64(math.Pow(2, float64(precision)))
94 | s := uint64(math.Pow(2, float64(vPrecision)))
95 | return &VHLL{
96 | M: make([]uint8, m, m),
97 | m: m,
98 | s: s,
99 | log2s: uint64(vPrecision),
100 | mAlpha: alpha(float64(m)),
101 | sAlpha: alpha(float64(s)),
102 | }, nil
103 | }
104 |
105 | // Insert ...
106 | func (v *VHLL) Insert(f []byte, e []byte) {
107 | he := hash(e)
108 | p := he % v.s
109 | q := he << v.log2s
110 | r := rho(q)
111 | index := metro.Hash64(f, p) % v.m
112 | if r > v.M[index] {
113 | v.M[index] = r
114 | }
115 | }
116 |
117 | // Estimate ...
118 | func (v *VHLL) Estimate(f []byte) uint64 {
119 | M := make([]uint8, v.s, v.s)
120 | for i := range M {
121 | index := metro.Hash64(f, uint64(i)) % v.m
122 | M[i] = v.M[index]
123 | }
124 |
125 | sum, ez := sumAndZeros(M)
126 | s := float64(v.s)
127 | beta := beta(ez)
128 | ns := (v.sAlpha * s * (s - ez) / (beta + sum))
129 |
130 | // estimate error
131 | m := float64(v.m)
132 | n := float64(v.totalCardinality())
133 | e := ns - (s * n / m)
134 |
135 | // rounding
136 | return uint64(e + 0.5)
137 | }
138 |
139 | func (v *VHLL) totalCardinality() uint64 {
140 | sum, ez := sumAndZeros(v.M)
141 | m := float64(len(v.M))
142 | beta := beta(ez)
143 | return uint64(v.mAlpha * m * (m - ez) / (beta + sum))
144 | }
145 |
--------------------------------------------------------------------------------
/vhll_test.go:
--------------------------------------------------------------------------------
1 | package vhll
2 |
3 | import (
4 | "strconv"
5 | "testing"
6 | )
7 |
8 | func TestVHLL(t *testing.T) {
9 |
10 | vhll, _ := NewVHLL(18, 12)
11 | for i := uint(0); i <= 1000000; i++ {
12 | for j := uint(1); j <= 5; j++ {
13 | if i%j == 0 {
14 | id := []byte(strconv.Itoa(int(j)))
15 | vhll.Insert([]byte(id), []byte(strconv.Itoa(int(i))))
16 | }
17 | }
18 | }
19 |
20 | expected := make(map[uint]uint)
21 | expected[1] = 1000000
22 | expected[2] = 500000
23 | expected[3] = 333333
24 | expected[4] = 250000
25 | expected[5] = 200000
26 |
27 | for j := uint(1); j <= 5; j++ {
28 | id := []byte(strconv.Itoa(int(j)))
29 | card := float64(vhll.Estimate(id))
30 | p4 := float64(4 * card / 100)
31 | if float64(card) > float64(expected[j])+p4 || float64(card) < float64(expected[j])-p4 {
32 | t.Error("Expected error < 4 percent, got count for", j, "=", expected[j], "got", card)
33 | }
34 | }
35 | }
36 |
--------------------------------------------------------------------------------