// Repository layout:
//
//	├── README
//	├── capacity.go
//	├── capacity_test.go
//	├── coin.go
//	├── coin_test.go
//	├── kll.go
//	├── kll_test.go
//	├── sampler.go
//	├── serialization.go
//	└── serialization_test.go
//
// /README:
// --------------------------------------------------------------------------------
// go-kll: almost optimal streaming quantiles
//
// godoc: https://godoc.org/github.com/dgryski/go-kll
//
// --------------------------------------------------------------------------------
// /capacity.go:
// --------------------------------------------------------------------------------
// package kll
//
// import "math"

// computeHeight returns (2/3)^h.
//
// Values for 0 <= h < len(heightsCache) come from the precomputed table; any
// other h (including a negative one, which previously indexed the table out of
// range and panicked) is computed with math.Pow.
func computeHeight(h int) float64 {
	if h >= 0 && h < len(heightsCache) {
		return heightsCache[h]
	}
	return math.Pow(2.0/3.0, float64(h))
}

// heightsCache[i] holds math.Pow(2.0/3.0, i); TestComputeHeight checks every
// entry against that expression.
var heightsCache = [...]float64{
	1,
	0.6666666666666666,
	0.4444444444444444,
	0.2962962962962963,
	0.19753086419753085,
	0.1316872427983539,
	0.0877914951989026,
	0.05852766346593506,
	0.039018442310623375,
	0.026012294873748915,
	0.01734152991583261,
	0.011561019943888407,
	0.007707346629258938,
	0.005138231086172625,
	0.00342548739078175,
	0.0022836582605211663,
	0.0015224388403474443,
	0.0010149592268982961,
	0.0006766394845988641,
	0.00045109298973257606,
	0.0003007286598217174,
	0.00020048577321447823,
	0.0001336571821429855,
	8.910478809532365e-05,
	5.9403192063549106e-05,
	3.960212804236607e-05,
	2.640141869491071e-05,
	1.760094579660714e-05,
	1.1733963864404761e-05,
	7.82264257626984e-06,
	5.21509505084656e-06,
	3.4767300338977064e-06,
	2.3178200225984708e-06,
	1.5452133483989804e-06,
	1.030142232265987e-06,
	6.867614881773246e-07,
	4.5784099211821645e-07,
	3.0522732807881095e-07,
	2.0348488538587396e-07,
	1.356565902572493e-07,
	9.04377268381662e-08,
	6.02918178921108e-08,
	4.019454526140719e-08,
	2.67963635076048e-08,
	1.78642423384032e-08,
	1.1909494892268798e-08,
	7.939663261512532e-09,
	5.293108841008354e-09,
	3.528739227338903e-09,
	2.352492818225935e-09,
	1.5683285454839568e-09,
	1.0455523636559712e-09,
	6.970349091039809e-10,
	4.646899394026538e-10,
	3.097932929351026e-10,
	2.0652886195673503e-10,
	1.3768590797115669e-10,
	9.179060531410445e-11,
	6.119373687606963e-11,
	4.0795824584046424e-11,
	2.7197216389364282e-11,
	1.813147759290952e-11,
	1.2087651728606347e-11,
	8.058434485737563e-12,
	5.372289657158376e-12,
	3.581526438105584e-12,
	2.3876842920703892e-12,
	1.5917895280469262e-12,
	1.0611930186979508e-12,
	7.074620124653005e-13,
	4.716413416435336e-13,
	3.1442756109568906e-13,
	2.0961837406379272e-13,
	1.3974558270919513e-13,
	9.316372180613009e-14,
	6.21091478707534e-14,
	4.140609858050226e-14,
	2.760406572033484e-14,
	1.8402710480223226e-14,
	1.226847365348215e-14,
	8.178982435654766e-15,
	5.452654957103177e-15,
	3.635103304735452e-15,
	2.423402203156968e-15,
	1.615601468771312e-15,
	1.0770676458475411e-15,
	7.180450972316942e-16,
	4.786967314877961e-16,
	3.191311543251974e-16,
	2.1275410288346492e-16,
	1.418360685889766e-16,
	9.455737905931773e-17,
	6.303825270621183e-17,
	4.2025501804141215e-17,
	2.801700120276081e-17,
	1.8678000801840538e-17,
	1.2452000534560357e-17,
	8.301333689706904e-18,
	5.534222459804603e-18,
	3.6894816398697355e-18,
	2.459654426579824e-18,
	1.6397696177198825e-18,
	1.0931797451465883e-18,
	7.287864967643922e-19,
	4.858576645095947e-19,
	3.239051096730632e-19,
	2.1593673978204208e-19,
	1.439578265213614e-19,
	9.597188434757427e-20,
	6.398125623171617e-20,
	4.2654170821144116e-20,
	2.843611388076274e-20,
	1.8957409253841826e-20,
	1.263827283589455e-20,
	8.4255152239297e-21,
	5.6170101492864665e-21,
	3.744673432857645e-21,
	2.4964489552384296e-21,
	1.6642993034922866e-21,
	1.1095328689948576e-21,
	7.39688579329905e-22,
	4.931257195532699e-22,
	3.2875047970218e-22,
	2.1916698646812e-22,
	1.4611132431208001e-22,
	9.740754954138666e-23,
	6.493836636092445e-23,
	4.3292244240616286e-23,
}

// --------------------------------------------------------------------------------
// /capacity_test.go:
// --------------------------------------------------------------------------------
// package kll
//
// import (
//	"math"
//	"testing"
// )

// TestComputeHeight verifies that every cached entry equals the value
// math.Pow produces for the same exponent.
func TestComputeHeight(t *testing.T) {
	for i := range heightsCache {
		computed := math.Pow((2.0 / 3.0), float64(i))
		if heightsCache[i] != computed {
			t.Fatalf("cache bad: %v: %v != %v", i, heightsCache[i], computed)
		}
	}
}

// --------------------------------------------------------------------------------
// /coin.go:
// --------------------------------------------------------------------------------
// package kll
//
// import "math/rand"

// 64-bit xorshift multiply rng from http://vigna.di.unimi.it/ftp/papers/xorshift.pdf
//
// xorshiftMult64 maps the generator state x to the next value. A zero input
// yields zero, so callers must seed with a non-zero state.
func xorshiftMult64(x uint64) uint64 {
	x ^= x >> 12 // a
	x ^= x << 25 // b
	x ^= x >> 27 // c
	return x * 2685821657736338717
}

// coin is a simple struct to let us get random bools and make minimum calls
// to the random number generator.
15 | type coin struct { 16 | st uint64 17 | mask uint64 18 | } 19 | 20 | // v is either 0 or 1 21 | func (c *coin) toss() (v int) { 22 | if c.mask == 0 { 23 | if c.st == 0 { 24 | c.st = uint64(rand.Int63()) 25 | } 26 | c.st = xorshiftMult64(c.st) 27 | c.mask = 1 28 | } 29 | if c.st&c.mask > 0 { 30 | v = 1 31 | } 32 | c.mask <<= 1 33 | return v 34 | } 35 | -------------------------------------------------------------------------------- /coin_test.go: -------------------------------------------------------------------------------- 1 | package kll 2 | 3 | import ( 4 | "math/rand" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestCoin(t *testing.T) { 10 | // set up a coin that should return alternating bits 11 | c := coin{ 12 | st: 0xaaaaaaaaaaaaaaaa, 13 | mask: 1, 14 | } 15 | 16 | for i := 0; i < 64; i++ { 17 | if v := c.toss(); v != i&1 { 18 | t.Fatalf("toss %d: %d != %d", i, v, i&1) 19 | } 20 | } 21 | } 22 | 23 | func TestCoinMany(t *testing.T) { 24 | rng := rand.New(rand.NewSource(time.Now().UnixNano())) 25 | c := coin{ 26 | st: uint64(rng.Int63()), 27 | mask: 0, 28 | } 29 | t.Logf("state: 0x%016x", c.st) 30 | 31 | pos := 0 32 | for i := 0; i < 1000; i++ { 33 | v := c.toss() 34 | if v != 0 && v != 1 { 35 | t.Fatal("invalid value from coin:", v) 36 | } 37 | if v == 1 { 38 | pos++ 39 | } 40 | } 41 | 42 | t.Logf("pos: %v", pos) 43 | 44 | // someone can do the binomial/normal, but i expect somewhere between 45 | // 400 and 600 will never fail. 
46 | if pos < 400 || pos > 600 { 47 | t.Fatal("abornmal bias in flips:", pos) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /kll.go: -------------------------------------------------------------------------------- 1 | // Package kll implements the KLL streaming quantiles sketch 2 | /* 3 | http://arxiv.org/pdf/1603.05346v1.pdf 4 | */ 5 | package kll 6 | 7 | import ( 8 | "math" 9 | "sort" 10 | ) 11 | 12 | // Sketch is a streaming quantiles sketch 13 | type Sketch struct { 14 | compactors []compactor 15 | k int 16 | H int 17 | size int 18 | maxSize int 19 | 20 | co coin 21 | } 22 | 23 | // New returns a new Sketch. k controls the maximum memory used by the stream, which is 3*k + lg(n). 24 | func New(k int) *Sketch { 25 | s := Sketch{ 26 | k: k, 27 | } 28 | s.grow() 29 | return &s 30 | } 31 | 32 | func (s *Sketch) grow() { 33 | s.compactors = append(s.compactors, compactor{}) 34 | s.H = len(s.compactors) 35 | 36 | s.maxSize = 0 37 | for h := 0; h < s.H; h++ { 38 | s.maxSize += s.capacity(h) 39 | } 40 | } 41 | 42 | func (s *Sketch) capacity(h int) int { 43 | return int(math.Ceil(float64(s.k)*computeHeight(s.H-h-1))) + 1 44 | } 45 | 46 | // Update adds x to the stream. 
47 | func (s *Sketch) Update(x float64) { 48 | s.compactors[0] = append(s.compactors[0], x) 49 | s.size++ 50 | s.compact() 51 | } 52 | 53 | func (s *Sketch) compact() { 54 | for s.size >= s.maxSize { 55 | for h := 0; h < len(s.compactors); h++ { 56 | if len(s.compactors[h]) >= s.capacity(h) { 57 | if h+1 >= s.H { 58 | s.grow() 59 | } 60 | 61 | prev_h := len(s.compactors[h]) 62 | prev_h1 := len(s.compactors[h+1]) 63 | 64 | s.compactors[h+1] = s.compactors[h].compact( 65 | &s.co, s.compactors[h+1]) 66 | 67 | s.size += len(s.compactors[h]) - prev_h 68 | s.size += len(s.compactors[h+1]) - prev_h1 69 | 70 | if s.size < s.maxSize { 71 | break 72 | } 73 | } 74 | } 75 | } 76 | } 77 | 78 | func (s *Sketch) updateSize() { 79 | s.size = 0 80 | for _, c := range s.compactors { 81 | s.size += len(c) 82 | } 83 | } 84 | 85 | // Merge merges a second sketch into this one 86 | func (s *Sketch) Merge(t *Sketch) { 87 | for s.H < t.H { 88 | s.grow() 89 | } 90 | 91 | for h, c := range t.compactors { 92 | s.compactors[h] = append(s.compactors[h], c...) 93 | } 94 | 95 | s.updateSize() 96 | s.compact() 97 | } 98 | 99 | // Rank estimates the rank of the value x in the stream. 100 | func (s *Sketch) Rank(x float64) int { 101 | var r int 102 | for h, c := range s.compactors { 103 | for _, v := range c { 104 | if v <= x { 105 | r += 1 << uint(h) 106 | } 107 | } 108 | } 109 | return r 110 | } 111 | 112 | func (s *Sketch) Count() int { 113 | var n int 114 | for h, c := range s.compactors { 115 | n += len(c) * (1 << uint(h)) 116 | } 117 | return n 118 | } 119 | 120 | // Quantile estimates the quantile of the value x in the stream. 
121 | func (s *Sketch) Quantile(x float64) float64 { 122 | var r, n int 123 | for h, c := range s.compactors { 124 | for _, v := range c { 125 | w := 1 << uint(h) 126 | if v <= x { 127 | r += w 128 | } 129 | n += w 130 | } 131 | } 132 | return float64(r) / float64(n) 133 | } 134 | 135 | type CDF []Quantile 136 | 137 | func (q CDF) Len() int { return len(q) } 138 | 139 | func (q CDF) Less(i int, j int) bool { return q[i].V < q[j].V } 140 | 141 | func (q CDF) Swap(i int, j int) { q[i], q[j] = q[j], q[i] } 142 | 143 | type Quantile struct { 144 | Q float64 145 | V float64 146 | } 147 | 148 | func (s *Sketch) CDF() CDF { 149 | q := make(CDF, 0, s.size) 150 | 151 | var totalW float64 152 | for h, c := range s.compactors { 153 | weight := float64(int(1 << uint(h))) 154 | for _, v := range c { 155 | q = append(q, Quantile{Q: weight, V: v}) 156 | } 157 | totalW += float64(len(c)) * weight 158 | } 159 | 160 | sort.Sort(q) 161 | 162 | var curW float64 163 | for i := range q { 164 | curW += q[i].Q 165 | q[i].Q = curW / totalW 166 | } 167 | 168 | return q 169 | } 170 | 171 | // Quantile estimates the quantile of the value x in the stream. 172 | func (q CDF) Quantile(x float64) float64 { 173 | idx := sort.Search(len(q), func(i int) bool { return q[i].V >= x }) 174 | if idx == 0 { 175 | return 0 176 | } 177 | return q[idx-1].Q 178 | } 179 | 180 | // Query estimates the value given quantile p. 181 | func (q CDF) Query(p float64) float64 { 182 | idx := sort.Search(len(q), func(i int) bool { return q[i].Q >= p }) 183 | if idx == len(q) { 184 | return q[len(q)-1].V 185 | } 186 | return q[idx].V 187 | } 188 | 189 | // QuantileLI estimates the quantile of the value x in the stream using linear interpolation. 
190 | func (q CDF) QuantileLI(x float64) float64 { 191 | idx := sort.Search(len(q), func(i int) bool { return q[i].V >= x }) 192 | if idx == len(q) { 193 | return 1 194 | } 195 | if idx == 0 { 196 | return 0 197 | } 198 | // a < x <= b 199 | a, aq := q[idx-1].V, q[idx-1].Q 200 | b, bq := q[idx].V, q[idx].Q 201 | return ((a-x)*bq + (x-b)*aq) / (a - b) 202 | } 203 | 204 | // QueryLI estimates the value given quantile p using linear interpolation. 205 | func (q CDF) QueryLI(p float64) float64 { 206 | idx := sort.Search(len(q), func(i int) bool { return q[i].Q >= p }) 207 | if idx == len(q) { 208 | return q[len(q)-1].V 209 | } 210 | if idx == 0 { 211 | return q[0].V 212 | } 213 | // aq < p <= b 214 | a, aq := q[idx-1].V, q[idx-1].Q 215 | b, bq := q[idx].V, q[idx].Q 216 | return ((aq-p)*b + (p-bq)*a) / (aq - bq) 217 | } 218 | 219 | type compactor []float64 220 | 221 | func (c *compactor) compact(co *coin, dst []float64) []float64 { 222 | l := len(*c) 223 | 224 | if l == 0 || l == 1 { 225 | } else if l == 2 { 226 | c := *c 227 | if c[0] > c[1] { 228 | c[0], c[1] = c[1], c[0] 229 | } 230 | } else if l > 100 { 231 | sort.Float64s([]float64(*c)) 232 | } else { 233 | c.insertionSort() 234 | } 235 | 236 | free := cap(dst) - len(dst) 237 | if free < len(*c)/2 { 238 | extra := len(*c)/2 - free 239 | newdst := make([]float64, len(dst), cap(dst)+extra) 240 | copy(newdst, dst) 241 | dst = newdst 242 | } 243 | 244 | // choose either the evens or the odds 245 | offs := co.toss() 246 | for len(*c) >= 2 { 247 | l := len(*c) - 2 248 | dst = append(dst, (*c)[l+offs]) 249 | *c = (*c)[:l] 250 | } 251 | 252 | return dst 253 | } 254 | 255 | func (c compactor) insertionSort() { 256 | l := len(c) 257 | for i := 1; i < l; i++ { 258 | v := c[i] 259 | j := i 260 | for ; j > 0 && c[j-1] > v; j-- { 261 | } 262 | if j == i { 263 | continue 264 | } 265 | copy(c[j+1:], c[j:i]) 266 | c[j] = v 267 | } 268 | } 269 | -------------------------------------------------------------------------------- 
/kll_test.go: -------------------------------------------------------------------------------- 1 | package kll 2 | 3 | import ( 4 | "math/rand" 5 | "sort" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | func benchmarkAdd(b *testing.B, cons func() float64, k int) { 11 | // generate the random data 12 | values := make([]float64, b.N) 13 | for i := range values { 14 | values[i] = cons() 15 | } 16 | r := New(k) 17 | 18 | b.ResetTimer() 19 | b.ReportAllocs() 20 | 21 | for i := 0; i < b.N; i++ { 22 | r.Update(values[i]) 23 | } 24 | } 25 | 26 | func BenchmarkAddNormal_1(b *testing.B) { 27 | benchmarkAdd(b, rand.NormFloat64, 1) 28 | } 29 | 30 | func BenchmarkAddNormal_5(b *testing.B) { 31 | benchmarkAdd(b, rand.NormFloat64, 5) 32 | } 33 | 34 | func BenchmarkAddNormal_10(b *testing.B) { 35 | benchmarkAdd(b, rand.NormFloat64, 10) 36 | } 37 | 38 | func BenchmarkAddNormal_100(b *testing.B) { 39 | benchmarkAdd(b, rand.NormFloat64, 100) 40 | } 41 | 42 | func BenchmarkAddNormal_1000(b *testing.B) { 43 | benchmarkAdd(b, rand.NormFloat64, 1000) 44 | } 45 | 46 | func TestCompactorInsertionSort(t *testing.T) { 47 | rng := rand.New(rand.NewSource(time.Now().UnixNano())) 48 | 49 | for _, dup := range []bool{false, true} { 50 | for _, l := range []int{0, 1, 2, 3, 5, 8, 1 << 5, 1 << 10} { 51 | for i := 0; i < 100; i++ { 52 | c := make(compactor, l) 53 | for i := range c { 54 | if dup && i%2 == 1 { 55 | c[i] = c[i-1] 56 | } else { 57 | c[i] = rng.NormFloat64() 58 | } 59 | } 60 | cp := make(compactor, l) 61 | copy(cp, c) 62 | c.insertionSort() 63 | if !sort.Float64sAreSorted([]float64(c)) { 64 | t.Fatalf("failed to sort: %v", c) 65 | } 66 | sort.Float64s(cp) 67 | for i, v := range c { 68 | if v != cp[i] { 69 | t.Fatalf("failed to sort: %f!=%f @%d\nexpected: %v, got %v", cp[i], v, i, cp, c) 70 | } 71 | } 72 | } 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /sampler.go: 
-------------------------------------------------------------------------------- 1 | package kll 2 | 3 | import "math/rand" 4 | 5 | type sampler struct { 6 | h uint16 7 | w uint64 8 | y float64 9 | } 10 | 11 | func (s *sampler) update(x float64, w uint64, to []float64) []float64 { 12 | ph := uint64(1 << s.h) 13 | switch { 14 | case s.w+w <= ph: 15 | s.w += w 16 | if rand.Float64()*float64(w) < float64(s.w) { 17 | s.y = x 18 | } 19 | if s.w == ph { 20 | s.w = 0 21 | return append(to, s.y) 22 | } 23 | case s.w < w: 24 | if rand.Float64()*float64(w) < float64(ph) { 25 | return append(to, x) 26 | } 27 | default: // W >= w 28 | s.w = w 29 | s.y = x 30 | if rand.Float64()*float64(w) < float64(ph) { 31 | return append(to, x) 32 | } 33 | } 34 | return to 35 | } 36 | 37 | func (s *sampler) grow() { 38 | s.h++ 39 | } 40 | -------------------------------------------------------------------------------- /serialization.go: -------------------------------------------------------------------------------- 1 | package kll 2 | 3 | import ( 4 | "bytes" 5 | "encoding/gob" 6 | "unsafe" 7 | ) 8 | 9 | // we know that compactor is really a []float64, and we want to refer to them 10 | // in the state, so we can just unsafely convert them. 11 | 12 | func compactorsAsFloats(c []compactor) [][]float64 { 13 | return *(*[][]float64)(unsafe.Pointer(&c)) 14 | } 15 | 16 | func floatsAsCompactors(f [][]float64) []compactor { 17 | return *(*[]compactor)(unsafe.Pointer(&f)) 18 | } 19 | 20 | // State represents the state of the Sketch. It is used for serializing and 21 | // deserializing to disk. 22 | type State struct { 23 | Compactors [][]float64 24 | K int 25 | H int 26 | Size int 27 | MaxSize int 28 | Count int 29 | } 30 | 31 | // State returns the current state of the Sketch. The state is invalid if any 32 | // other methods of the Sketch are called, and it must not be mutated. 
33 | func (s *Sketch) State() State { 34 | return State{ 35 | Compactors: compactorsAsFloats(s.compactors), 36 | K: s.k, 37 | H: s.H, 38 | Size: s.size, 39 | MaxSize: s.maxSize, 40 | } 41 | } 42 | 43 | // SetState sets the state of the Sketch to the passed State. The memory is 44 | // shared, so the passed State is invalid to be read from or written to after 45 | // this call. 46 | func (s *Sketch) SetState(state State) { 47 | s.compactors = floatsAsCompactors(state.Compactors) 48 | s.k = state.K 49 | s.H = state.H 50 | s.size = state.Size 51 | s.maxSize = state.MaxSize 52 | } 53 | 54 | // MarshalBinary implements encoding.BinaryMarshaler. 55 | func (s *Sketch) MarshalBinary() ([]byte, error) { 56 | var buf bytes.Buffer 57 | err := gob.NewEncoder(&buf).Encode(s.State()) 58 | if err != nil { 59 | return nil, err 60 | } 61 | return buf.Bytes(), nil 62 | } 63 | 64 | // UnmarshalBinary implements encoding.BinaryUnmarshaler. 65 | func (s *Sketch) UnmarshalBinary(data []byte) error { 66 | var r State 67 | err := gob.NewDecoder(bytes.NewReader(data)).Decode(&r) 68 | if err != nil { 69 | return err 70 | } 71 | s.SetState(r) 72 | return nil 73 | } 74 | -------------------------------------------------------------------------------- /serialization_test.go: -------------------------------------------------------------------------------- 1 | package kll 2 | 3 | import ( 4 | "math/rand" 5 | "testing" 6 | ) 7 | 8 | var stateBlackhole State 9 | 10 | func BenchmarkGetState(b *testing.B) { 11 | const k = 1000 12 | r := New(k) 13 | for i := 0; i < 100*k; i++ { 14 | r.Update(rand.NormFloat64()) 15 | } 16 | 17 | b.ResetTimer() 18 | b.ReportAllocs() 19 | 20 | for i := 0; i < b.N; i++ { 21 | stateBlackhole = r.State() 22 | } 23 | } 24 | --------------------------------------------------------------------------------