├── .gitignore
├── go.mod
├── utils.go
├── .circleci
    └── config.yml
├── demo
    └── main.go
├── LICENSE
├── stream_example_test.go
├── buffer.go
├── buffer_test.go
├── README.md
├── go.sum
├── sketch.go
├── summary.go
├── summary_test.go
└── sketch_test.go


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binaries for programs and plugins
 2 | *.exe
 3 | *.exe~
 4 | *.dll
 5 | *.so
 6 | *.dylib
 7 | 
 8 | # Test binary, build with `go test -c`
 9 | *.test
10 | 
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/axiomhq/quantiles
 2 | 
 3 | go 1.12
 4 | 
 5 | require (
 6 | 	github.com/beorn7/perks v1.0.0
 7 | 	github.com/gogo/protobuf v1.3.2 // indirect
 8 | 	github.com/pkg/errors v0.8.1 // indirect
 9 | 	github.com/stretchr/testify v1.3.0
10 | 	github.com/stripe/veneur v12.0.0+incompatible
11 | )
12 | 


--------------------------------------------------------------------------------
/utils.go:
--------------------------------------------------------------------------------
 1 | package quantiles
 2 | 
 3 | func maxInt64(a, b int64) int64 {
 4 | 	if a > b {
 5 | 		return a
 6 | 	}
 7 | 	return b
 8 | }
 9 | 
// maxFloat64 returns a when a > b and b otherwise.
// Because the comparison is false whenever either operand is NaN,
// b is returned in that case.
func maxFloat64(a, b float64) float64 {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}
16 | 
// minFloat64 returns a when a < b and b otherwise.
// Because the comparison is false whenever either operand is NaN,
// b is returned in that case.
func minFloat64(a, b float64) float64 {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
23 | 


--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | jobs:
 3 |   build:
 4 |     docker:
 5 |       - image: circleci/golang:1.12
 6 |     steps:
 7 |       - checkout
 8 |       - run:
 9 |           name: Run tests
10 |           command: |
11 |             go get gotest.tools/gotestsum@v0.4.0
12 |             mkdir -p test-results/gotestsum
13 |             gotestsum --junitfile test-results/gotestsum/results.xml -f short-verbose -- ./...
14 |       - run: 
15 |           name: Run benchmarks
16 |           command: go test -bench .
17 |       - store_test_results:
18 |           path: test-results
19 | 


--------------------------------------------------------------------------------
/demo/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"time"
 6 | 
 7 | 	"github.com/axiomhq/quantiles"
 8 | 	"github.com/beorn7/perks/quantile"
 9 | 	"github.com/stripe/veneur/tdigest"
10 | )
11 | 
// bToMb converts a byte count to whole mebibytes, discarding any remainder.
func bToMb(b uint64) uint64 {
	const bytesPerMiB = 1024 * 1024
	return b / bytesPerMiB
}
15 | 
16 | func veneur() {
17 | 	t := tdigest.NewMerging(20, false)
18 | 	now := time.Now()
19 | 	for i := 0.0; i < 1e6; i++ {
20 | 		t.Add(i, 1.0)
21 | 	}
22 | 	fmt.Println("veneur:", time.Since(now))
23 | }
24 | 
25 | func axiom() {
26 | 	qstream, _ := quantiles.New(0.01, 1000)
27 | 	now := time.Now()
28 | 	for i := 0.0; i < 1e6; i++ {
29 | 		if err := qstream.Push(i, 1.0); err != nil {
30 | 			panic(err)
31 | 		}
32 | 	}
33 | 	fmt.Println("axiom:", time.Since(now))
34 | }
35 | 
36 | func prom() {
37 | 	tstream := quantile.NewLowBiased(0.01)
38 | 	now := time.Now()
39 | 	for i := 0.0; i < 1e6; i++ {
40 | 		tstream.Insert(i)
41 | 	}
42 | 	fmt.Println("prometheus:", time.Since(now))
43 | }
44 | 
// main runs the three insertion timings one after another; each function
// prints its own result line.
func main() {
	veneur()
	prom()
	axiom()
}
50 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Axiom Inc.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/stream_example_test.go:
--------------------------------------------------------------------------------
 1 | package quantiles_test
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	"github.com/axiomhq/quantiles"
 7 | )
 8 | 
// Example pushes the values 0..999999 (each with weight 1) into a default
// sketch, prints the level-1 approximation error while the sketch is still
// open, finalizes it, and then prints quantiles obtained both from the sketch
// and from its final summary. The trailing "Output:" comment is compared
// against the real output by `go test`.
func Example() {
	sketch := quantiles.NewDefault()
	for i := 0.0; i < 1e6; i++ {
		if err := sketch.Push(i, 1.0); err != nil {
			panic(err)
		}
	}
	// Queried before Finalize, so this is the error of summary level 1.
	fmt.Print("ApproximationError:")
	fmt.Println(sketch.ApproximationError(1))

	// The Generate* calls below return an error unless Finalize ran first.
	fmt.Print("Finalize:")
	fmt.Println(sketch.Finalize())

	fmt.Print("GenerateQuantiles(4):")
	fmt.Println(sketch.GenerateQuantiles(4))

	fmt.Print("GenerateQuantiles(10):")
	fmt.Println(sketch.GenerateQuantiles(10))

	// The final summary can be queried directly as well.
	sum, err := sketch.FinalSummary()
	if err != nil {
		panic(err)
	}
	fmt.Print("GenerateQuantiles(4):")
	fmt.Println(sum.GenerateQuantiles(4))

	// Output:
	// ApproximationError:0.006218905472636816 <nil>
	// Finalize:<nil>
	// GenerateQuantiles(4):[0 249854 499710 749566 999999] <nil>
	// GenerateQuantiles(10):[0 98302 200702 299006 401406 499710 598014 700414 798718 900094 999999] <nil>
	// GenerateQuantiles(4):[0 249854 499710 749566 999999]
}
42 | 


--------------------------------------------------------------------------------
/buffer.go:
--------------------------------------------------------------------------------
 1 | package quantiles
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"sort"
 6 | )
 7 | 
 8 | // byValue implements sort.Interface based on the value field.
 9 | type byValue []bufEntry
10 | 
11 | func (a byValue) Len() int           { return len(a) }
12 | func (a byValue) Less(i, j int) bool { return a[i].value < a[j].value }
13 | func (a byValue) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
14 | 
15 | // bufEntry ...
16 | type bufEntry struct {
17 | 	value  float64
18 | 	weight float64
19 | }
20 | 
// buffer is a fixed-capacity staging area for (value, weight) entries.
type buffer struct {
	vec     byValue // backing storage, allocated with length maxSize
	maxSize int64   // capacity; the buffer is full once curSize reaches it
	curSize int64   // number of leading slots of vec currently in use
}
26 | 
27 | func newBuffer(blockSize, maxElements int64) (*buffer, error) {
28 | 	maxSize := blockSize << 1
29 | 	if maxSize > maxElements {
30 | 		maxSize = maxElements
31 | 	}
32 | 
33 | 	if maxSize <= 0 {
34 | 		return nil, fmt.Errorf("Invalid buffer specification: (%v, %v)", blockSize, maxElements)
35 | 	}
36 | 
37 | 	return &buffer{
38 | 		maxSize: maxSize,
39 | 		curSize: 0,
40 | 		vec:     make([]bufEntry, maxSize),
41 | 	}, nil
42 | }
43 | 
44 | func (buf *buffer) clone() *buffer {
45 | 	newBuffer := &buffer{
46 | 		maxSize: buf.maxSize,
47 | 		curSize: buf.curSize,
48 | 		vec:     make([]bufEntry, buf.maxSize),
49 | 	}
50 | 	for i, e := range buf.vec {
51 | 		newBuffer.vec[i] = e
52 | 	}
53 | 	return newBuffer
54 | }
55 | 
56 | func (buf *buffer) push(value, weight float64) error {
57 | 	//QCHECK magic
58 | 	if buf.isFull() {
59 | 		return fmt.Errorf("Buffer already full: %v", buf.maxSize)
60 | 	}
61 | 
62 | 	if weight > 0 {
63 | 		buf.vec[buf.curSize] = bufEntry{value, weight}
64 | 		buf.curSize++
65 | 	}
66 | 	return nil
67 | }
68 | 
69 | // generateEntryList returns a sorted vector view of the base buffer and clears the buffer.
70 | // Callers should minimize how often this is called, ideally only right after
71 | // the buffer becomes full.
72 | func (buf *buffer) generateEntryList() []bufEntry {
73 | 	sort.Sort(buf.vec[:buf.curSize])
74 | 	ret := buf.vec[:buf.curSize]
75 | 	buf.vec = make([]bufEntry, buf.maxSize)
76 | 	if buf.curSize == 0 {
77 | 		return ret
78 | 	}
79 | 	buf.curSize = 0
80 | 	numEntries := 0
81 | 	for i := 1; i < len(ret); i++ {
82 | 		if ret[i].value != ret[i-1].value {
83 | 			numEntries++
84 | 			ret[numEntries] = ret[i]
85 | 		} else {
86 | 			ret[numEntries].weight += ret[i].weight
87 | 		}
88 | 	}
89 | 	return ret[:numEntries+1]
90 | }
91 | 
92 | // isFull ...
93 | func (buf *buffer) isFull() bool {
94 | 	return buf.curSize >= buf.maxSize
95 | }
96 | 


--------------------------------------------------------------------------------
/buffer_test.go:
--------------------------------------------------------------------------------
  1 | package quantiles
  2 | 
  3 | import (
  4 | 	"math/rand"
  5 | 	"reflect"
  6 | 	"testing"
  7 | )
  8 | 
  9 | func TestBufferInvalid(t *testing.T) {
 10 | 	if _, err := newBuffer(2, 0); err == nil {
 11 | 		t.Error("expected error, got nil")
 12 | 	}
 13 | 	if _, err := newBuffer(0, 2); err == nil {
 14 | 		t.Error("expected error, got nil")
 15 | 	}
 16 | }
 17 | 
 18 | func TestBufferPushEntryNotFull(t *testing.T) {
 19 | 	buf, err := newBuffer(2, 100)
 20 | 	if err != nil {
 21 | 		t.Error("expected no err, got", err)
 22 | 	}
 23 | 	buf.push(5, 9)
 24 | 	buf.push(2, 3)
 25 | 	buf.push(-1, 7)
 26 | 	buf.push(3, 0)
 27 | 
 28 | 	if buf.isFull() {
 29 | 		t.Error("expected not full, got full")
 30 | 	}
 31 | 	if val := len(buf.vec); val == 2 {
 32 | 		t.Error("expected 3, got full", val)
 33 | 	}
 34 | }
 35 | 
 36 | func TestBufferPushEntryFull(t *testing.T) {
 37 | 	buf, err := newBuffer(2, 100)
 38 | 	if err != nil {
 39 | 		t.Error("expected no err, got", err)
 40 | 	}
 41 | 	buf.push(5, 9)
 42 | 	buf.push(2, 3)
 43 | 	buf.push(-1, 7)
 44 | 	buf.push(2, 1)
 45 | 
 46 | 	expected := []bufEntry{}
 47 | 	expected = append(expected, bufEntry{-1, 7})
 48 | 	expected = append(expected, bufEntry{2, 4})
 49 | 	expected = append(expected, bufEntry{5, 9})
 50 | 
 51 | 	if !buf.isFull() {
 52 | 		t.Error("expected full, got not full")
 53 | 	}
 54 | 	if got := buf.generateEntryList(); !reflect.DeepEqual(expected, got) {
 55 | 		t.Errorf("expected %v, got %v", expected, got)
 56 | 	}
 57 | }
 58 | func TestBufferPushEntryFullDeath(t *testing.T) {
 59 | 	buf, err := newBuffer(2, 100)
 60 | 	if err != nil {
 61 | 		t.Error("expected no err, got", err)
 62 | 	}
 63 | 	buf.push(5, 9)
 64 | 	buf.push(2, 3)
 65 | 	buf.push(-1, 7)
 66 | 	buf.push(2, 1)
 67 | 
 68 | 	expected := []bufEntry{}
 69 | 	expected = append(expected, bufEntry{-1, 7})
 70 | 	expected = append(expected, bufEntry{2, 4})
 71 | 	expected = append(expected, bufEntry{5, 9})
 72 | 
 73 | 	if !buf.isFull() {
 74 | 		t.Error("expected full, got not full")
 75 | 	}
 76 | 	if err := buf.push(6, 6); err == nil {
 77 | 		t.Error("expected buffer already full")
 78 | 	}
 79 | }
 80 | 
 81 | func push(n int) error {
 82 | 	buf, _ := newBuffer(int64(n), int64(n))
 83 | 	for i := 0; i < n; i++ {
 84 | 		if err := buf.push(rand.Float64(), rand.Float64()); err != nil {
 85 | 			return err
 86 | 		}
 87 | 	}
 88 | 	return nil
 89 | }
 90 | 
// BenchmarkPush100 measures the push helper with n = 100.
func BenchmarkPush100(b *testing.B) {
	// Run push(100) b.N times, stopping at the first error.
	for n := 0; n < b.N; n++ {
		if err := push(100); err != nil {
			b.Error(err)
			return
		}
	}
}
100 | 
// BenchmarkPush1000 measures the push helper with n = 1000.
func BenchmarkPush1000(b *testing.B) {
	// Run push(1000) b.N times, stopping at the first error.
	for n := 0; n < b.N; n++ {
		if err := push(1000); err != nil {
			b.Error(err)
			return
		}
	}
}
110 | 
// BenchmarkPush10000 measures the push helper with n = 10000.
func BenchmarkPush10000(b *testing.B) {
	// Run push(10000) b.N times, stopping at the first error.
	for n := 0; n < b.N; n++ {
		if err := push(10000); err != nil {
			b.Error(err)
			return
		}
	}
}
120 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # quantiles - Optimal Quantile Approximation in Streams
 2 | [![GoDoc](https://godoc.org/github.com/axiomhq/quantiles?status.svg)](https://godoc.org/github.com/axiomhq/quantiles) [![CircleCI](https://circleci.com/gh/axiomhq/quantiles/tree/master.svg?style=svg)](https://circleci.com/gh/axiomhq/quantiles/tree/master)
 3 | 
 4 | This is a translation of [TensorFlow's quantile helper class](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/boosted_trees/lib/quantiles); it aims to compute approximate quantiles with error bound guarantees for weighted data sets.
 5 | This implementation is an adaptation of techniques from the following papers:
 6 | * (2001) [Space-efficient online computation of quantile summaries](http://infolab.stanford.edu/~datar/courses/cs361a/papers/quantiles.pdf).
 7 | * (2004) [Power-conserving computation of order-statistics over sensor networks](http://www.cis.upenn.edu/~mbgreen/papers/pods04.pdf).
 8 | * (2007) [A fast algorithm for approximate quantiles in high speed data streams](http://web.cs.ucla.edu/~weiwang/paper/SSDBM07_2.pdf).
 9 | * (2016) [XGBoost: A Scalable Tree Boosting System](https://arxiv.org/pdf/1603.02754.pdf).
10 | 
11 | #### The key ideas at play are the following:
12 | * Maintain an in-memory multi-level quantile summary in a way to guarantee
13 |   a maximum approximation error of `eps * W` per bucket where `W` is the total
14 |   weight across all points in the input dataset.
15 | * Two base operations are defined: `MERGE` and `COMPRESS`. `MERGE` combines two
16 |   summaries guaranteeing a `epsNew = max(eps1, eps2)`. `COMPRESS` compresses
17 |   a summary to `b + 1` elements guaranteeing `epsNew = epsOld + 1/b`.
18 | * `b * sizeof(summary entry)` must ideally be small enough to fit in an
19 |   average CPU L2 cache.
20 | * To distribute this algorithm while maintaining error bounds, we need
21 |   the worker-computed summaries to have no more than `eps / h` error
22 |   where h is the height of the distributed computation graph which
23 |   is 2 for an MR with no combiner.
24 | 
25 | We mainly want to max out IO bw by ensuring we're not compute-bound and
26 | using a reasonable amount of RAM.
27 | 
28 | #### Complexity:
29 | * Compute: `O(n * log(1/eps * log(eps * n)))`.
30 | * Memory: `O(1/eps * log^2(eps * n))` <- for one worker streaming through the entire dataset.
31 | 
32 | An epsilon value of zero would make the algorithm extremely inefficient and
33 | is therefore disallowed.
34 | 
35 | 
36 | ## Example Usage
37 | ```go
38 | package quantiles_test
39 | 
40 | import (
41 | 	"fmt"
42 | 
43 | 	"github.com/axiomhq/quantiles"
44 | )
45 | 
46 | func Example() {
47 | 	sketch := quantiles.NewDefault()
48 | 	for i := 0.0; i < 1e6; i++ {
49 | 		if err := sketch.Push(i, 1.0); err != nil {
50 | 			panic(err)
51 | 		}
52 | 	}
53 | 	fmt.Print("ApproximationError:") 	
54 | 	fmt.Println(sketch.ApproximationError(1))  // 0.006218905472636816 <nil>
55 | 
56 | 	fmt.Print("Finalize:") 
57 | 	fmt.Println(sketch.Finalize())            // <nil>
58 | 
59 |  
60 | 	fmt.Print("GenerateQuantiles(4):")         
61 | 	fmt.Println(sketch.GenerateQuantiles(4))  // [0 249854 499710 749566 999999] <nil>
62 | 
63 | 
64 | 	fmt.Print("GenerateQuantiles(10):")
65 | 	fmt.Println(sketch.GenerateQuantiles(10)) // [0 98302 200702 299006 401406 499710 598014 700414 798718 900094 999999] <nil>
66 | 
67 | 	sum, err := sketch.FinalSummary()
68 | 	if err != nil {
69 | 		panic(err)
70 | 	}
71 | 	fmt.Print("GenerateQuantiles(4):")
72 | 	fmt.Println(sum.GenerateQuantiles(4))     // [0 249854 499710 749566 999999]
73 | }
74 | ```
75 | 
76 | ## TODO
77 | * [x] Implement an online estimator without the need of finalizing the stream
78 | * [x] Add proper documentation
79 | * [ ] Benchmark
80 | * [ ] Add serialization
81 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/beorn7/perks v1.0.0 h1:HWo1m869IqiPhD389kmkxeTalrjNbbJTC8LXupb+sl0=
 2 | github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
 3 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
 4 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 5 | github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
 6 | github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
 7 | github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
 8 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
 9 | github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
10 | github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
11 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
12 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
13 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
14 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
15 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
16 | github.com/stripe/veneur v12.0.0+incompatible h1:goZhHLUUxzN7gbJlaULhoLEd3PAyvB6CjXmEkfsSQ/k=
17 | github.com/stripe/veneur v12.0.0+incompatible/go.mod h1:oEfQGGOeGcs/N7jAfByGwjGGAh1X9tF2gYpU5Nzuljk=
18 | github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
19 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
20 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
21 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
22 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
23 | golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
24 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
25 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
26 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
27 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
28 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
29 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
30 | golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
31 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
32 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
33 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
34 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
35 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
36 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
37 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
38 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
39 | golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
40 | golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
41 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
42 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
43 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
44 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
45 | 


--------------------------------------------------------------------------------
/sketch.go:
--------------------------------------------------------------------------------
  1 | package quantiles
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"math"
  6 | )
  7 | 
// errFinalized is returned by Sketch operations that are rejected once the
// sketch has been finalized.
var errFinalized = fmt.Errorf("Finalize() already called")
  9 | 
// Sketch accumulates weighted values pushed one at a time and, after
// Finalize, answers quantile queries from a single merged summary.
type Sketch struct {
	eps           float64    // approximation error given to New; passed to every compress call
	maxLevels     int64      // level count computed by getQuantileSpecs
	blockSize     int64      // block size computed by getQuantileSpecs; compress target and propagation threshold
	buffer        *buffer    // staging buffer for pushed entries
	localSummary  *Summary   // working summary; holds the merged result after Finalize
	summaryLevels []*Summary // one summary per level; emptied by Finalize
	finalized     bool       // set by Finalize; further pushes are rejected
	n             uint64     // incremented by each Push that gets past the buffer insert
}
 21 | 
 22 | // NewDefault returns a new Sketch with the eps = 0.01 and maxElements 1000
 23 | func NewDefault() *Sketch {
 24 | 	stream, _ := New(0.01, 1000)
 25 | 	return stream
 26 | }
 27 | 
 28 | // New returns a new Sketch for a given eps and maxElements
 29 | func New(eps float64, maxElements int64) (*Sketch, error) {
 30 | 	if eps <= 0 {
 31 | 		return nil, fmt.Errorf("an epsilon value of zero is not allowed")
 32 | 	}
 33 | 
 34 | 	maxLevels, blockSize, err := getQuantileSpecs(eps, maxElements)
 35 | 	if err != nil {
 36 | 		return nil, err
 37 | 	}
 38 | 
 39 | 	buffer, err := newBuffer(blockSize, maxElements)
 40 | 	if err != nil {
 41 | 		return nil, err
 42 | 	}
 43 | 
 44 | 	stream := &Sketch{
 45 | 		eps:           eps,
 46 | 		buffer:        buffer,
 47 | 		finalized:     false,
 48 | 		maxLevels:     maxLevels,
 49 | 		blockSize:     blockSize,
 50 | 		localSummary:  newSummary(),
 51 | 		summaryLevels: []*Summary{},
 52 | 	}
 53 | 	return stream, nil
 54 | }
 55 | 
 56 | func (stream *Sketch) clone() *Sketch {
 57 | 	newStream := &Sketch{
 58 | 		eps:           stream.eps,
 59 | 		buffer:        stream.buffer.clone(),
 60 | 		finalized:     stream.finalized,
 61 | 		maxLevels:     stream.maxLevels,
 62 | 		blockSize:     stream.blockSize,
 63 | 		localSummary:  stream.localSummary.clone(),
 64 | 		summaryLevels: stream.summaryLevels,
 65 | 	}
 66 | 	for i, sum := range stream.summaryLevels {
 67 | 		newStream.summaryLevels[i] = sum.clone()
 68 | 	}
 69 | 	return newStream
 70 | }
 71 | 
 72 | // Push a value and a weight into the stream
 73 | func (stream *Sketch) Push(value float64, weight float64) error {
 74 | 	// Validate state.
 75 | 	var err error
 76 | 	if stream.finalized {
 77 | 		return errFinalized
 78 | 	}
 79 | 
 80 | 	if err = stream.buffer.push(value, weight); err != nil {
 81 | 		return err
 82 | 	}
 83 | 
 84 | 	if stream.buffer.isFull() {
 85 | 		err = stream.pushBuffer(stream.buffer)
 86 | 	}
 87 | 	stream.n++
 88 | 	return err
 89 | }
 90 | 
 91 | func (stream *Sketch) pushBuffer(buf *buffer) error {
 92 | 	// Validate state.
 93 | 	if stream.finalized {
 94 | 		return errFinalized
 95 | 	}
 96 | 	stream.localSummary.buildFromBufferEntries(buf.generateEntryList())
 97 | 	stream.localSummary.compress(stream.blockSize, stream.eps)
 98 | 	return stream.propagateLocalSummary()
 99 | }
100 | 
101 | // PushSummary pushes full summary while maintaining approximation error invariants.
102 | func (stream *Sketch) PushSummary(summary []SumEntry) error {
103 | 	// Validate state.
104 | 	if stream.finalized {
105 | 		return errFinalized
106 | 	}
107 | 	stream.localSummary.buildFromSummaryEntries(summary)
108 | 	stream.localSummary.compress(stream.blockSize, stream.eps)
109 | 	return stream.propagateLocalSummary()
110 | }
111 | 
112 | // Finalize flushes approximator and finalizes state.
113 | func (stream *Sketch) Finalize() error {
114 | 	// Validate state.
115 | 	if stream.finalized {
116 | 		return errFinalized
117 | 	}
118 | 
119 | 	// Flush any remaining buffer elements.
120 | 	stream.pushBuffer(stream.buffer)
121 | 
122 | 	// Create final merged summary
123 | 	stream.localSummary.Clear()
124 | 	for _, summary := range stream.summaryLevels {
125 | 		stream.localSummary.Merge(summary)
126 | 	}
127 | 	stream.localSummary.n = stream.n
128 | 
129 | 	stream.summaryLevels = []*Summary{}
130 | 	stream.finalized = true
131 | 	return nil
132 | }
133 | 
// propagateLocalSummary pushes the local summary up through the summary
// levels, starting at level 0, until it settles at one of them.
//
// It returns errFinalized if the sketch is finalized and does nothing when
// the local summary is empty. On success the local summary has been replaced
// by a fresh one whose n is set to the sketch's current push count.
func (stream *Sketch) propagateLocalSummary() error {
	// Validate state.
	if stream.finalized {
		return errFinalized
	}

	// No-op if there's nothing to add.
	if stream.localSummary.Size() <= 0 {
		return nil
	}

	for level, settled := int64(0), false; !settled; level++ {
		// Ensure we have enough depth: add an empty summary for a level that
		// does not exist yet.
		if int64(len(stream.summaryLevels)) <= level {
			stream.summaryLevels = append(stream.summaryLevels, &Summary{})
		}

		// Merge this level's summary into the local summary.
		currentSummary := stream.summaryLevels[level]
		stream.localSummary.Merge(currentSummary)

		// Check if we need to compress and propagate summary higher.
		// The local summary settles here if this level was empty or the
		// merged result has at most blockSize+1 entries.
		if currentSummary.Size() == 0 ||
			stream.localSummary.Size() <= stream.blockSize+1 {
			// Copy the struct into the level's existing Summary, then start
			// a fresh local summary so the two do not share a pointer.
			*currentSummary = *(stream.localSummary)
			stream.localSummary = newSummary()
			settled = true
		} else {
			// Compress, empty current level and propagate.
			stream.localSummary.compress(stream.blockSize, stream.eps)
			currentSummary.Clear()
		}
	}
	// The loop only exits after settling, so this stamps the fresh summary.
	stream.localSummary.n = stream.n
	return nil
}
174 | 
175 | // Quantile ...
176 | func (stream *Sketch) Quantile(q float64) (float64, error) {
177 | 	if !stream.finalized {
178 | 		return 0, fmt.Errorf("Finalize() must be called before generating quantiles")
179 | 	}
180 | 	return stream.localSummary.Quantile(q)
181 | }
182 | 
183 | /*
184 | GenerateQuantiles generates requested number of quantiles after finalizing stream.
185 | The returned quantiles can be queried using std::lower_bound to get
186 | the bucket for a given value.
187 | */
188 | func (stream *Sketch) GenerateQuantiles(numQuantiles int64) ([]float64, error) {
189 | 	if !stream.finalized {
190 | 		return nil, fmt.Errorf("Finalize() must be called before generating quantiles")
191 | 	}
192 | 	return stream.localSummary.GenerateQuantiles(numQuantiles), nil
193 | }
194 | 
195 | /*
196 | GenerateBoundaries generates requested number of boundaries after finalizing stream.
197 | The returned boundaries can be queried using std::lower_bound to get
198 | the bucket for a given value.
199 | The boundaries, while still guaranteeing approximation bounds, don't
200 | necessarily represent the actual quantiles of the distribution.
201 | Boundaries are preferable over quantiles when the caller is less
202 | interested in the actual quantiles distribution and more interested in
203 | getting a representative sample of boundary values.
204 | */
205 | func (stream *Sketch) GenerateBoundaries(numBoundaries int64) ([]float64, error) {
206 | 	if !stream.finalized {
207 | 		return nil, fmt.Errorf("Finalize() must be called before generating quantiles")
208 | 	}
209 | 	return stream.localSummary.GenerateBoundaries(numBoundaries), nil
210 | }
211 | 
212 | /*
213 | ApproximationError calculates approximation error for the specified level.
214 | If the passed level is negative, the approximation error for the entire
215 | summary is returned. Note that after Finalize is called, only the overall
216 | error is available.
217 | */
218 | func (stream *Sketch) ApproximationError(level int64) (float64, error) {
219 | 	if stream.finalized {
220 | 		if level > 0 {
221 | 			return 0, fmt.Errorf("only overall error is available after Finalize()")
222 | 		}
223 | 		return stream.localSummary.ApproximationError(), nil
224 | 	}
225 | 
226 | 	if len(stream.summaryLevels) == 0 {
227 | 		// No error even if base buffer isn't empty.
228 | 		return 0, nil
229 | 	}
230 | 
231 | 	// If level is negative, we get the approximation error
232 | 	// for the top-most level which is the max approximation error
233 | 	// in all summaries by construction.
234 | 	if level < 0 {
235 | 		level = int64(len(stream.summaryLevels)) - 1
236 | 	}
237 | 	if level >= int64(len(stream.summaryLevels)) {
238 | 		return 0, fmt.Errorf("invalid level")
239 | 	}
240 | 	return stream.summaryLevels[level].ApproximationError(), nil
241 | }
242 | 
243 | // MaxDepth ...
244 | func (stream *Sketch) MaxDepth() int {
245 | 	return len(stream.summaryLevels)
246 | }
247 | 
248 | // FinalSummary ...
249 | func (stream *Sketch) FinalSummary() (*Summary, error) {
250 | 	if !stream.finalized {
251 | 		return nil, fmt.Errorf("Finalize() must be called before generating quantiles")
252 | 	}
253 | 	return stream.localSummary, nil
254 | }
255 | 
256 | /*
257 | Helper method which, given the desired approximation error
258 | and an upper bound on the number of elements, computes the optimal
259 | number of levels and block size and returns them in the tuple.
260 | */
261 | func getQuantileSpecs(eps float64, maxElements int64) (int64, int64, error) {
262 | 	var (
263 | 		maxLevel  int64 = 1
264 | 		blockSize int64 = 2
265 | 	)
266 | 	if eps < 0 || eps >= 1 {
267 | 		return maxLevel, blockSize, fmt.Errorf("eps should be element of [0, 1)")
268 | 	}
269 | 	if maxElements <= 0 {
270 | 		return maxLevel, blockSize, fmt.Errorf("maxElements should be > 0")
271 | 	}
272 | 
273 | 	if eps <= math.SmallestNonzeroFloat64 {
274 | 		// Exact quantile computation at the expense of RAM.
275 | 		maxLevel = 1
276 | 		blockSize = maxInt64(maxElements, 2)
277 | 	} else {
278 | 		// The bottom-most level will become full at most
279 | 		// (max_elements / block_size) times, the level above will become full
280 | 		// (max_elements / 2 * block_size) times and generally level l becomes
281 | 		// full (max_elements / 2^l * block_size) times until the last
282 | 		// level max_level becomes full at most once meaning when the inequality
283 | 		// (2^max_level * block_size >= max_elements) is satisfied.
284 | 		// In what follows, we jointly solve for max_level and block_size by
285 | 		// gradually increasing the level until the inequality above is satisfied.
286 | 		// We could alternatively set max_level = ceil(log2(eps * max_elements));
287 | 		// and block_size = ceil(max_level / eps) + 1 but that tends to give more
288 | 		// pessimistic bounds and wastes RAM needlessly.
289 | 
290 | 		blockSize = 2
291 | 		for maxLevel = 1; (uint64(1)<<uint64(maxLevel))*uint64(blockSize) < uint64(maxElements); maxLevel++ {
292 | 			// Update upper bound on block size at current level, we always
293 | 			// increase the estimate by 2 to hold the min/max elements seen so far.
294 | 			blockSize = int64(math.Ceil(float64(maxLevel)/eps) + 1)
295 | 		}
296 | 	}
297 | 	return maxLevel, maxInt64(blockSize, 2), nil
298 | }
299 | 


--------------------------------------------------------------------------------
/summary.go:
--------------------------------------------------------------------------------
  1 | package quantiles
  2 | 
  3 | import "fmt"
  4 | 
// SumEntry is a single entry of a Summary: a value, its weight, and the
// minimum and maximum rank recorded for that value.
type SumEntry struct {
	value   float64 // the entry's value
	weight  float64 // weight attached to value
	minRank float64 // lower rank bound; buildFromBufferEntries sets it to the cumulative weight before this entry
	maxRank float64 // upper rank bound; buildFromBufferEntries sets it to minRank plus this entry's weight
}
 12 | 
// Value returns the entry's value.
func (se SumEntry) Value() float64 {
	return se.value
}
 17 | 
// Weight returns the entry's weight.
func (se SumEntry) Weight() float64 {
	return se.weight
}
 22 | 
// MaxRank returns the entry's maximum rank.
func (se SumEntry) MaxRank() float64 {
	return se.maxRank
}
 27 | 
// MinRank returns the entry's minimum rank.
func (se SumEntry) MinRank() float64 {
	return se.minRank
}
 32 | 
// prevMaxRank returns the entry's maximum rank with its own weight removed
// (maxRank - weight).
func (se SumEntry) prevMaxRank() float64 {
	return se.maxRank - se.weight
}
 36 | 
// nextMinRank returns the entry's minimum rank with its own weight added
// (minRank + weight).
func (se SumEntry) nextMinRank() float64 {
	return se.minRank + se.weight
}
 40 | 
// Summary summarizes the stream entries as a slice of SumEntry values.
type Summary struct {
	entries   []SumEntry // summary entries; Merge assumes they are sorted by value
	n         uint64     // read by Quantile as the number of quantiles; not assigned in summary.go
	quantiles []float64  // GenerateQuantiles output cached by Quantile on first use
}
 47 | 
 48 | // newSummary ...
 49 | func newSummary() *Summary {
 50 | 	return &Summary{
 51 | 		entries: make([]SumEntry, 0),
 52 | 	}
 53 | }
 54 | 
 55 | func (sum *Summary) clone() *Summary {
 56 | 	newSum := &Summary{
 57 | 		entries: make([]SumEntry, len(sum.entries)),
 58 | 	}
 59 | 	for i, entry := range sum.entries {
 60 | 		newSum.entries[i] = entry
 61 | 	}
 62 | 	return newSum
 63 | }
 64 | 
 65 | func (sum *Summary) buildFromBufferEntries(bes []bufEntry) {
 66 | 	sum.entries = make([]SumEntry, len(bes))
 67 | 	cumWeight := 0.0
 68 | 	for i, entry := range bes {
 69 | 		curWeight := entry.weight
 70 | 		sum.entries[i] = SumEntry{
 71 | 			value:   entry.value,
 72 | 			weight:  entry.weight,
 73 | 			minRank: cumWeight,
 74 | 			maxRank: cumWeight + curWeight,
 75 | 		}
 76 | 		cumWeight += curWeight
 77 | 	}
 78 | }
 79 | 
// buildFromSummaryEntries makes ses the summary's entries. The slice is
// stored as-is, not copied, so the summary shares its backing array with the
// caller's slice.
func (sum *Summary) buildFromSummaryEntries(ses []SumEntry) {
	sum.entries = ses
}
 83 | 
 84 | // Merge another summary into the this summary (great for esimating quantiles over several streams)
 85 | func (sum *Summary) Merge(other *Summary) {
 86 | 	otherEntries := other.entries
 87 | 	if len(otherEntries) == 0 {
 88 | 		return
 89 | 	}
 90 | 	if len(sum.entries) == 0 {
 91 | 		sum.entries = otherEntries
 92 | 		return
 93 | 	}
 94 | 
 95 | 	baseEntries := sum.entries
 96 | 	sum.entries = make([]SumEntry, len(baseEntries)+len(otherEntries))
 97 | 
 98 | 	// Merge entries maintaining ranks. The idea is to stack values
 99 | 	// in order which we can do in linear time as the two summaries are
100 | 	// already sorted. We keep track of the next lower rank from either
101 | 	// summary and update it as we pop elements from the summaries.
102 | 	// We handle the special case when the next two elements from either
103 | 	// summary are equal, in which case we just merge the two elements
104 | 	// and simultaneously update both ranks.
105 | 	var (
106 | 		i            int
107 | 		j            int
108 | 		nextMinRank1 float64
109 | 		nextMinRank2 float64
110 | 	)
111 | 
112 | 	num := 0
113 | 	for i != len(baseEntries) && j != len(otherEntries) {
114 | 		it1 := baseEntries[i]
115 | 		it2 := otherEntries[j]
116 | 		if it1.value < it2.value {
117 | 			sum.entries[num] = SumEntry{
118 | 				value: it1.value, weight: it1.weight,
119 | 				minRank: it1.minRank + nextMinRank2,
120 | 				maxRank: it1.maxRank + it2.prevMaxRank(),
121 | 			}
122 | 			nextMinRank1 = it1.nextMinRank()
123 | 			i++
124 | 		} else if it1.value > it2.value {
125 | 			sum.entries[num] = SumEntry{
126 | 				value: it2.value, weight: it2.weight,
127 | 				minRank: it2.minRank + nextMinRank1,
128 | 				maxRank: it2.maxRank + it1.prevMaxRank(),
129 | 			}
130 | 			nextMinRank2 = it2.nextMinRank()
131 | 			j++
132 | 		} else {
133 | 			sum.entries[num] = SumEntry{
134 | 				value: it1.value, weight: it1.weight + it2.weight,
135 | 				minRank: it1.minRank + it2.minRank,
136 | 				maxRank: it1.maxRank + it2.maxRank,
137 | 			}
138 | 			nextMinRank1 = it1.nextMinRank()
139 | 			nextMinRank2 = it2.nextMinRank()
140 | 			i++
141 | 			j++
142 | 		}
143 | 		num++
144 | 	}
145 | 
146 | 	// Fill in any residual.
147 | 	for i != len(baseEntries) {
148 | 		it1 := baseEntries[i]
149 | 		sum.entries[num] = SumEntry{
150 | 			value: it1.value, weight: it1.weight,
151 | 			minRank: it1.minRank + nextMinRank2,
152 | 			maxRank: it1.maxRank + otherEntries[len(otherEntries)-1].maxRank,
153 | 		}
154 | 		i++
155 | 		num++
156 | 	}
157 | 	for j != len(otherEntries) {
158 | 		it2 := otherEntries[j]
159 | 		sum.entries[num] = SumEntry{
160 | 			value: it2.value, weight: it2.weight,
161 | 			minRank: it2.minRank + nextMinRank1,
162 | 			maxRank: it2.maxRank + baseEntries[len(baseEntries)-1].maxRank,
163 | 		}
164 | 		j++
165 | 		num++
166 | 	}
167 | 	sum.entries = sum.entries[:num]
168 | 
169 | }
170 | 
171 | func (sum *Summary) compress(sizeHint int64, minEps float64) {
172 | 	// No-op if we're already within the size requirement.
173 | 	sizeHint = maxInt64(sizeHint, 2)
174 | 	if int64(len(sum.entries)) <= sizeHint {
175 | 		return
176 | 	}
177 | 
178 | 	// First compute the max error bound delta resulting from this compression.
179 | 	epsDelta := sum.TotalWeight() * maxFloat64(1/float64(sizeHint), minEps)
180 | 
181 | 	// Compress elements ensuring approximation bounds and elements diversity are both maintained.
182 | 	var (
183 | 		addAccumulator int64
184 | 		addStep        = int64(len(sum.entries))
185 | 	)
186 | 
187 | 	wi := 1
188 | 	li := wi
189 | 
190 | 	for ri := 0; ri+1 != len(sum.entries); {
191 | 		ni := ri + 1
192 | 		for ni != len(sum.entries) && addAccumulator < addStep &&
193 | 			sum.entries[ni].prevMaxRank()-sum.entries[ri].nextMinRank() <= epsDelta {
194 | 			addAccumulator += sizeHint
195 | 			ni++
196 | 		}
197 | 		if sum.entries[ri] == sum.entries[ni-1] {
198 | 			ri++
199 | 		} else {
200 | 			ri = ni - 1
201 | 		}
202 | 
203 | 		sum.entries[wi] = sum.entries[ri]
204 | 		wi++
205 | 		li = ri
206 | 		addAccumulator -= addStep
207 | 	}
208 | 
209 | 	if li+1 != len(sum.entries) {
210 | 		sum.entries[wi] = sum.entries[len(sum.entries)-1]
211 | 		wi++
212 | 	}
213 | 
214 | 	sum.entries = sum.entries[:wi]
215 | }
216 | 
217 | // GenerateBoundaries ...
218 | func (sum *Summary) GenerateBoundaries(numBoundaries int64) []float64 {
219 | 	// To construct the boundaries we first run a soft compress over a copy
220 | 	// of the summary and retrieve the values.
221 | 	// The resulting boundaries are guaranteed to both contain at least
222 | 	// num_boundaries unique elements and maintain approximation bounds.
223 | 	if len(sum.entries) == 0 {
224 | 		return []float64{}
225 | 	}
226 | 
227 | 	// Generate soft compressed summary.
228 | 	compressedSummary := &Summary{}
229 | 	compressedSummary.buildFromSummaryEntries(sum.entries)
230 | 	// Set an epsilon for compression that's at most 1.0 / num_boundaries
231 | 	// more than epsilon of original our summary since the compression operation
232 | 	// adds ~1.0/num_boundaries to final approximation error.
233 | 	compressionEps := sum.ApproximationError() + 1.0/float64(numBoundaries)
234 | 	compressedSummary.compress(numBoundaries, compressionEps)
235 | 
236 | 	// Return boundaries.
237 | 	output := make([]float64, len(compressedSummary.entries))
238 | 	for _, entry := range compressedSummary.entries {
239 | 		output = append(output, entry.value)
240 | 	}
241 | 	return output
242 | }
243 | 
244 | // Quantile returns the value for quantile q
245 | func (sum *Summary) Quantile(q float64) (float64, error) {
246 | 	// To construct the desired n-quantiles we repetitively query n ranks from the
247 | 	// original summary. The following algorithm is an efficient cache-friendly
248 | 	// O(n) implementation of that idea which avoids the cost of the repetitive
249 | 	// full rank queries O(nlogn).
250 | 	if q < 0 || q > 1 {
251 | 		return 0, fmt.Errorf("expected 0 <= q <= 1, got q = %v", q)
252 | 	}
253 | 	numQuantiles := int64(sum.n)
254 | 	if numQuantiles == 0 {
255 | 		return 0, nil
256 | 	}
257 | 	if len(sum.quantiles) == 0 {
258 | 		sum.quantiles = sum.GenerateQuantiles(numQuantiles + 1)
259 | 	}
260 | 	qIdx := int(float64(numQuantiles)*q + 0.5)
261 | 	return sum.quantiles[qIdx], nil
262 | }
263 | 
264 | // GenerateQuantiles returns a slice of float64 of size numQuantiles+1, the ith entry is the `i * 1/numQuantiles+1` quantile
265 | func (sum *Summary) GenerateQuantiles(numQuantiles int64) []float64 {
266 | 	// To construct the desired n-quantiles we repetitively query n ranks from the
267 | 	// original summary. The following algorithm is an efficient cache-friendly
268 | 	// O(n) implementation of that idea which avoids the cost of the repetitive
269 | 	// full rank queries O(nlogn).
270 | 	if len(sum.entries) == 0 {
271 | 		return []float64{}
272 | 	}
273 | 	if numQuantiles < 2 {
274 | 		numQuantiles = 2
275 | 	}
276 | 	curIdx := 0
277 | 	output := make([]float64, numQuantiles+1)
278 | 	for rank := 0.0; rank <= float64(numQuantiles); rank++ {
279 | 		d2 := 2 * (rank * sum.entries[len(sum.entries)-1].maxRank / float64(numQuantiles))
280 | 		nextIdx := curIdx + 1
281 | 		for nextIdx < len(sum.entries) && d2 >= sum.entries[nextIdx].minRank+sum.entries[nextIdx].maxRank {
282 | 			nextIdx++
283 | 		}
284 | 		curIdx = nextIdx - 1
285 | 		// Determine insertion order.
286 | 		if nextIdx == len(sum.entries) || d2 < sum.entries[curIdx].nextMinRank()+sum.entries[nextIdx].prevMaxRank() {
287 | 			output[int(rank)] = sum.entries[curIdx].value
288 | 		} else {
289 | 			output[int(rank)] = sum.entries[nextIdx].value
290 | 		}
291 | 	}
292 | 	return output
293 | }
294 | 
295 | // ApproximationError ...
296 | func (sum *Summary) ApproximationError() float64 {
297 | 	if len(sum.entries) == 0 {
298 | 		return 0
299 | 	}
300 | 
301 | 	var maxGap float64
302 | 	for i := 1; i < len(sum.entries); i++ {
303 | 		it := sum.entries[i]
304 | 		if tmp := it.maxRank - it.minRank - it.weight; tmp > maxGap {
305 | 			maxGap = tmp
306 | 		}
307 | 		if tmp := it.prevMaxRank() - sum.entries[i-1].nextMinRank(); tmp > maxGap {
308 | 			maxGap = tmp
309 | 		}
310 | 	}
311 | 	return maxGap / sum.TotalWeight()
312 | }
313 | 
314 | // MinValue returns the min weight value of the summary
315 | func (sum *Summary) MinValue() float64 {
316 | 	if len(sum.entries) != 0 {
317 | 		return sum.entries[0].value
318 | 	}
319 | 	return 0
320 | }
321 | 
322 | // MaxValue returns the max weight value of the summary
323 | func (sum *Summary) MaxValue() float64 {
324 | 	if len(sum.entries) != 0 {
325 | 		return sum.entries[len(sum.entries)-1].value
326 | 	}
327 | 	return 0
328 | }
329 | 
330 | // TotalWeight returns the total weight of the summary
331 | func (sum *Summary) TotalWeight() float64 {
332 | 	if len(sum.entries) != 0 {
333 | 		return sum.entries[len(sum.entries)-1].maxRank
334 | 	}
335 | 	return 0
336 | }
337 | 
// Size returns the number of entries in the summary.
func (sum *Summary) Size() int64 {
	return int64(len(sum.entries))
}
342 | 
343 | // Clear reset the summary
344 | func (sum *Summary) Clear() {
345 | 	sum.entries = []SumEntry{}
346 | }
347 | 
// Entries returns all summary entries. The summary's own slice is returned,
// not a copy.
func (sum *Summary) Entries() []SumEntry {
	return sum.entries
}
352 | 


--------------------------------------------------------------------------------
/summary_test.go:
--------------------------------------------------------------------------------
  1 | package quantiles
  2 | 
  3 | import (
  4 | 	"math/rand"
  5 | 	"testing"
  6 | 
  7 | 	"github.com/stretchr/testify/assert"
  8 | )
  9 | 
// SummaryDummy is the fixture shared by the summary tests: two buffers that
// setup fills with fixed (value, weight) pairs, the min/max value and total
// weight expected for each buffer, and an embedded Summary.
type SummaryDummy struct {
	buffer1 *buffer
	buffer2 *buffer

	// Expected properties of buffer1's contents.
	buffer1MinValue    float64
	buffer1MaxValue    float64
	buffer1TotalWeight float64

	// Expected properties of buffer2's contents.
	buffer2MinValue    float64
	buffer2MaxValue    float64
	buffer2TotalWeight float64

	*Summary
}
 24 | 
 25 | func NewWeightedQuantilesSummaryDummy() (*SummaryDummy, error) {
 26 | 	sum := &Summary{
 27 | 		entries: make([]SumEntry, 0),
 28 | 	}
 29 | 	wqsd := &SummaryDummy{
 30 | 		Summary:            sum,
 31 | 		buffer1MinValue:    -13,
 32 | 		buffer1MaxValue:    21,
 33 | 		buffer1TotalWeight: 45,
 34 | 		buffer2MinValue:    -7,
 35 | 		buffer2MaxValue:    11,
 36 | 		buffer2TotalWeight: 30,
 37 | 	}
 38 | 	if err := wqsd.setup(); err != nil {
 39 | 		return nil, err
 40 | 	}
 41 | 	return wqsd, nil
 42 | }
 43 | 
 44 | func (wqsd *SummaryDummy) setup() error {
 45 | 	var err error
 46 | 	wqsd.buffer1, err = newBuffer(10, 1000)
 47 | 	if err != nil {
 48 | 		return err
 49 | 	}
 50 | 	for _, val := range [][2]float64{
 51 | 		[2]float64{5, 9},
 52 | 		[2]float64{2, 3},
 53 | 		[2]float64{-1, 7},
 54 | 		[2]float64{-7, 1},
 55 | 		[2]float64{3, 2},
 56 | 		[2]float64{-2, 3},
 57 | 		[2]float64{21, 8},
 58 | 		[2]float64{-13, 4},
 59 | 		[2]float64{8, 2},
 60 | 		[2]float64{-5, 6},
 61 | 	} {
 62 | 		if err := wqsd.buffer1.push(val[0], val[1]); err != nil {
 63 | 			return err
 64 | 		}
 65 | 	}
 66 | 
 67 | 	wqsd.buffer2, err = newBuffer(7, 1000)
 68 | 	if err != nil {
 69 | 		return err
 70 | 	}
 71 | 	for _, val := range [][2]float64{
 72 | 		[2]float64{9, 2},
 73 | 		[2]float64{-7, 3},
 74 | 		[2]float64{2, 1},
 75 | 		[2]float64{4, 13},
 76 | 		[2]float64{0, 5},
 77 | 		[2]float64{-5, 3},
 78 | 		[2]float64{11, 3},
 79 | 	} {
 80 | 		if err := wqsd.buffer2.push(val[0], val[1]); err != nil {
 81 | 			return err
 82 | 		}
 83 | 	}
 84 | 	return nil
 85 | }
 86 | 
 87 | func TestSummaryBuildFromBuffer(t *testing.T) {
 88 | 	wqsd, err := NewWeightedQuantilesSummaryDummy()
 89 | 	if err != nil {
 90 | 		t.Error(err)
 91 | 	}
 92 | 	sum := &Summary{}
 93 | 	sum.buildFromBufferEntries(wqsd.buffer1.generateEntryList())
 94 | 
 95 | 	// We expect no approximation error because no compress operation occurred.
 96 | 	if approx := sum.ApproximationError(); approx != 0 {
 97 | 		t.Error("expected no approximation error, got", approx)
 98 | 	}
 99 | 
100 | 	entries := sum.entries
101 | 
102 | 	// First element's rmin should be zero.
103 | 	// EXPECT_EQ(summary.MinValue(), buffer1_min_value_)
104 | 	if val := sum.MinValue(); val != wqsd.buffer1MinValue {
105 | 		t.Error("first element's rmin should be zero, got", val)
106 | 	}
107 | 	// EXPECT_EQ(entries.front(), SummaryEntry(-13, 4, 0, 4))
108 | 	exp := SumEntry{
109 | 		value: -13, weight: 4, minRank: 0, maxRank: 4,
110 | 	}
111 | 	if val := entries[0]; val != exp {
112 | 		t.Errorf("expected %v, got %v", exp, val)
113 | 	}
114 | 
115 | 	// Last element's rmax should be cumulative weight.
116 | 	// EXPECT_EQ(summary.MaxValue(), buffer1_max_value_)
117 | 	if val := sum.MaxValue(); val != wqsd.buffer1MaxValue {
118 | 		t.Errorf("expected %v, got %v", wqsd.buffer1MaxValue, val)
119 | 	}
120 | 
121 | 	//EXPECT_EQ(entries.back(), SummaryEntry(21, 8, 37, 45))
122 | 	exp = SumEntry{
123 | 		value: 21, weight: 8, minRank: 37, maxRank: 45,
124 | 	}
125 | 	if val := entries[len(entries)-1]; val != exp {
126 | 		t.Errorf("expected %v, got %v", exp, val)
127 | 	}
128 | 
129 | 	// Check total weight.
130 | 	// EXPECT_EQ(summary.TotalWeight(), buffer1_total_weight_)
131 | 	if val := sum.TotalWeight(); val != wqsd.buffer1TotalWeight {
132 | 		t.Errorf("expected %v, got %v", wqsd.buffer1TotalWeight, val)
133 | 	}
134 | }
135 | 
136 | func TestSummaryCompressSeparately(t *testing.T) {
137 | 	wqsd, err := NewWeightedQuantilesSummaryDummy()
138 | 	if err != nil {
139 | 		t.Error(err)
140 | 	}
141 | 	entryList := wqsd.buffer1.generateEntryList()
142 | 	for newSize := int64(9); newSize >= 2; newSize-- {
143 | 		sum := &Summary{}
144 | 		sum.buildFromBufferEntries(entryList)
145 | 		sum.compress(newSize, 0)
146 | 
147 | 		// Expect a max approximation error of 1 / n
148 | 		// ie. eps0 + 1/n but eps0 = 0.
149 | 
150 | 		// EXPECT_TRUE(summary.Size() >= new_size && summary.Size() <= new_size + 2);
151 | 		if val := sum.Size(); val < newSize {
152 | 			t.Errorf("expected val >= newSize, got %v < %v", val, newSize)
153 | 		} else if val > newSize+2 {
154 | 			t.Errorf("expected val <= newSize+2, got %v > %v", val, newSize+2)
155 | 		}
156 | 
157 | 		// EXPECT_LE(summary.ApproximationError(), 1.0 / new_size);
158 | 		if approx := sum.ApproximationError(); approx > 1.0/float64(newSize) {
159 | 			t.Errorf("expected approx <= newSize, got %v > %v", approx, 1.0/float64(newSize))
160 | 		}
161 | 
162 | 		// Min/Max elements and total weight should not change.
163 | 		// EXPECT_EQ(summary.MinValue(), buffer1_min_value_)
164 | 		if sum.MinValue() != wqsd.buffer1MinValue {
165 | 			t.Errorf("expected %v, got %v", wqsd.buffer1MinValue, sum.MinValue())
166 | 		}
167 | 		// EXPECT_EQ(summary.MaxValue(), buffer1_max_value_)
168 | 		if sum.MaxValue() != wqsd.buffer1MaxValue {
169 | 			t.Errorf("expected %v, got %v", wqsd.buffer1MaxValue, sum.MaxValue())
170 | 		}
171 | 		// EXPECT_EQ(summary.TotalWeight(), buffer1_total_weight_)
172 | 		if sum.TotalWeight() != wqsd.buffer1TotalWeight {
173 | 			t.Errorf("expected %v, got %v", wqsd.buffer1TotalWeight, sum.TotalWeight())
174 | 		}
175 | 	}
176 | }
177 | func TestSummaryCompressSequentially(t *testing.T) {
178 | 	wqsd, err := NewWeightedQuantilesSummaryDummy()
179 | 	if err != nil {
180 | 		t.Error(err)
181 | 	}
182 | 	entryList := wqsd.buffer1.generateEntryList()
183 | 	sum := &Summary{}
184 | 	sum.buildFromBufferEntries(entryList)
185 | 	for newSize := int64(9); newSize >= 2; newSize -= 2 {
186 | 
187 | 		prevEps := sum.ApproximationError()
188 | 		sum.compress(newSize, 0)
189 | 
190 | 		// Expect a max approximation error of prev_eps + 1 / n.
191 | 
192 | 		// EXPECT_TRUE(summary.Size() >= new_size && summary.Size() <= new_size + 2);
193 | 		if val := sum.Size(); val < newSize {
194 | 			t.Errorf("expected val >= newSize, got %v < %v", val, newSize)
195 | 		} else if val > newSize+2 {
196 | 			t.Errorf("expected val <= newSize+2, got %v > %v", val, newSize+2)
197 | 		}
198 | 
199 | 		// EXPECT_LE(summary.ApproximationError(), 1.0 / new_size);
200 | 		if approx := sum.ApproximationError(); approx > prevEps+1.0/float64(newSize) {
201 | 			t.Errorf("expected approx <= newSize, got %v > %v", approx, prevEps+1.0/float64(newSize))
202 | 		}
203 | 
204 | 		// Min/Max elements and total weight should not change.
205 | 		// EXPECT_EQ(summary.MinValue(), buffer1_min_value_)
206 | 		if sum.MinValue() != wqsd.buffer1MinValue {
207 | 			t.Errorf("expected %v, got %v", wqsd.buffer1MinValue, sum.MinValue())
208 | 		}
209 | 		// EXPECT_EQ(summary.MaxValue(), buffer1_max_value_)
210 | 		if sum.MaxValue() != wqsd.buffer1MaxValue {
211 | 			t.Errorf("expected %v, got %v", wqsd.buffer1MaxValue, sum.MaxValue())
212 | 		}
213 | 		// EXPECT_EQ(summary.TotalWeight(), buffer1_total_weight_)
214 | 		if sum.TotalWeight() != wqsd.buffer1TotalWeight {
215 | 			t.Errorf("expected %v, got %v", wqsd.buffer1TotalWeight, sum.TotalWeight())
216 | 		}
217 | 	}
218 | }
219 | 
220 | func TestSummaryCompressRandomized(t *testing.T) {
221 | 	var (
222 | 		prevSize int64 = 1
223 | 		size     int64 = 2
224 | 		maxValue       = float64(1 << 20)
225 | 	)
226 | 
227 | 	for size < (1 << 16) {
228 | 		buffer, err := newBuffer(size, size<<4)
229 | 		if err != nil {
230 | 			t.Error("expected no error, got", err)
231 | 		}
232 | 		for i := int64(0); i < size; i++ {
233 | 			buffer.push(
234 | 				rand.Float64()*maxValue,
235 | 				rand.Float64()*maxValue,
236 | 			)
237 | 		}
238 | 
239 | 		sum := &Summary{}
240 | 		sum.buildFromBufferEntries(buffer.generateEntryList())
241 | 		newSize := maxInt64(rand.Int63n(size), 2)
242 | 		sum.compress(newSize, 0)
243 | 
244 | 		// EXPECT_TRUE(summary.Size() >= new_size && summary.Size() <= new_size + 2);
245 | 		if val := sum.Size(); val < newSize {
246 | 			t.Errorf("expected val >= newSize, got %v < %v", val, newSize)
247 | 		} else if val > newSize+2 {
248 | 			t.Errorf("expected val <= newSize+2, got %v > %v", val, newSize+2)
249 | 		}
250 | 
251 | 		// EXPECT_LE(summary.ApproximationError(), 1.0 / new_size);
252 | 		if approx := sum.ApproximationError(); approx > 1.0/float64(newSize) {
253 | 			t.Errorf("expected approx <= newSize, got %v > %v", approx, 1.0/float64(newSize))
254 | 		}
255 | 
256 | 		lastSize := size
257 | 		size += prevSize
258 | 		prevSize = lastSize
259 | 	}
260 | }
261 | 
262 | func TestSummaryMergeSymmetry(t *testing.T) {
263 | 	assert := assert.New(t)
264 | 
265 | 	wqsd, err := NewWeightedQuantilesSummaryDummy()
266 | 	if err != nil {
267 | 		t.Error(err)
268 | 	}
269 | 
270 | 	list1 := wqsd.buffer1.generateEntryList()
271 | 	list2 := wqsd.buffer2.generateEntryList()
272 | 	sum1 := &Summary{}
273 | 	sum1.buildFromBufferEntries(list1)
274 | 	sum2 := &Summary{}
275 | 	sum2.buildFromBufferEntries(list2)
276 | 
277 | 	sum1.Merge(sum2)
278 | 	assert.Equal(sum1.ApproximationError(), 0.0)
279 | 	assert.Equal(sum1.MinValue(),
280 | 		minFloat64(wqsd.buffer1MinValue, wqsd.buffer2MinValue))
281 | 
282 | 	assert.Equal(sum1.MaxValue(),
283 | 		maxFloat64(wqsd.buffer1MaxValue, wqsd.buffer2MaxValue))
284 | 	assert.Equal(sum1.TotalWeight(),
285 | 		wqsd.buffer1TotalWeight+wqsd.buffer2TotalWeight)
286 | 	assert.Equal(sum1.Size(), int64(14))
287 | 
288 | 	sum1.buildFromBufferEntries(list1)
289 | 	sum2.Merge(sum1)
290 | 	assert.Equal(sum2.ApproximationError(), 0.0)
291 | 	assert.Equal(sum2.MinValue(),
292 | 		minFloat64(wqsd.buffer1MinValue, wqsd.buffer2MinValue))
293 | 	assert.Equal(sum2.MaxValue(),
294 | 		maxFloat64(wqsd.buffer1MaxValue, wqsd.buffer2MaxValue))
295 | 	assert.Equal(sum2.TotalWeight(),
296 | 		wqsd.buffer1TotalWeight+wqsd.buffer2TotalWeight)
297 | 	assert.Equal(sum2.Size(), int64(14))
298 | }
299 | 
300 | func TestSummaryCompressThenMerge(t *testing.T) {
301 | 	assert := assert.New(t)
302 | 	wqsd, err := NewWeightedQuantilesSummaryDummy()
303 | 	if err != nil {
304 | 		t.Error(err)
305 | 	}
306 | 
307 | 	sum1 := &Summary{}
308 | 	sum1.buildFromBufferEntries(wqsd.buffer1.generateEntryList())
309 | 	sum2 := &Summary{}
310 | 	sum2.buildFromBufferEntries(wqsd.buffer2.generateEntryList())
311 | 
312 | 	sum1.compress(5, 0)
313 | 	eps1 := 1.0 / 5
314 | 	// EXPECT_LE(summary.ApproximationError(), 1.0 / new_size);
315 | 	if approx := sum1.ApproximationError(); approx > eps1 {
316 | 		t.Errorf("expected approx <= newSize, got %v > %v", approx, eps1)
317 | 	}
318 | 	sum2.compress(3, 0)
319 | 	eps2 := 1.0 / 3
320 | 	// EXPECT_LE(summary.ApproximationError(), 1.0 / new_size);
321 | 	if approx := sum1.ApproximationError(); approx > eps1 {
322 | 		t.Errorf("expected approx <= newSize, got %v > %v", approx, eps2)
323 | 	}
324 | 
325 | 	// Merge guarantees an approximation error of max(eps1, eps2).
326 | 	// Merge summary 2 into 1 and verify.
327 | 	sum1.Merge(sum2)
328 | 	if approx := sum1.ApproximationError(); approx > maxFloat64(eps1, eps2) {
329 | 		t.Errorf("expected approx <= newSize, got %v > %v", approx, maxFloat64(eps1, eps2))
330 | 	}
331 | 	assert.Equal(sum1.MinValue(),
332 | 		minFloat64(wqsd.buffer1MinValue, wqsd.buffer2MinValue))
333 | 	assert.Equal(sum1.MaxValue(),
334 | 		maxFloat64(wqsd.buffer1MaxValue, wqsd.buffer2MaxValue))
335 | 	assert.Equal(sum1.TotalWeight(),
336 | 		wqsd.buffer1TotalWeight+wqsd.buffer2TotalWeight)
337 | }
338 | 


--------------------------------------------------------------------------------
/sketch_test.go:
--------------------------------------------------------------------------------
  1 | package quantiles
  2 | 
  3 | import (
  4 | 	"math"
  5 | 	"math/rand"
  6 | 	"testing"
  7 | 
  8 | 	"github.com/stretchr/testify/assert"
  9 | )
 10 | 
// tuple holds the two int64 results of getQuantileSpecs (max level, block
// size) so the tests can compare them with a single assertion.
type tuple [2]int64
 12 | 
 13 | func TestInvalidEps(t *testing.T) {
 14 | 	assert := assert.New(t)
 15 | 	_, _, err := getQuantileSpecs(-0.01, 0)
 16 | 	assert.Error(err)
 17 | 	_, _, err = getQuantileSpecs(1.01, 0)
 18 | 	assert.Error(err)
 19 | }
 20 | func TestZeroEps(t *testing.T) {
 21 | 	assert := assert.New(t)
 22 | 	var (
 23 | 		tup tuple
 24 | 		err error
 25 | 	)
 26 | 	tup[0], tup[1], err = getQuantileSpecs(0, 0)
 27 | 	assert.Error(err)
 28 | 	tup[0], tup[1], err = getQuantileSpecs(0, 1)
 29 | 	assert.Equal(tup, tuple{1, 2})
 30 | 	tup[0], tup[1], err = getQuantileSpecs(0, 20)
 31 | 	assert.Equal(tup, tuple{1, 20})
 32 | }
 33 | func TestNonZeroEps(t *testing.T) {
 34 | 	assert := assert.New(t)
 35 | 	var (
 36 | 		tup tuple
 37 | 		err error
 38 | 	)
 39 | 	tup[0], tup[1], err = getQuantileSpecs(0.01, 0)
 40 | 	assert.Error(err)
 41 | 	tup[0], tup[1], err = getQuantileSpecs(0.1, 320)
 42 | 	assert.Equal(tup, tuple{4, 31})
 43 | 	tup[0], tup[1], err = getQuantileSpecs(0.01, 25600)
 44 | 	assert.Equal(tup, tuple{6, 501})
 45 | 	tup[0], tup[1], err = getQuantileSpecs(0.01, 104857600)
 46 | 	assert.Equal(tup, tuple{17, 1601})
 47 | 	tup[0], tup[1], err = getQuantileSpecs(0.1, 104857600)
 48 | 	assert.Equal(tup, tuple{20, 191})
 49 | 	tup[0], tup[1], err = getQuantileSpecs(0.01, 1<<40)
 50 | 	assert.Equal(tup, tuple{29, 2801})
 51 | 	tup[0], tup[1], err = getQuantileSpecs(0.001, 1<<40)
 52 | 	assert.Equal(tup, tuple{26, 25001})
 53 | }
 54 | 
 55 | func generateFixedUniformSummary(workerID int32, maxElements int64, totalWeight *float64, stream *Sketch) error {
 56 | 	for i := int64(0); i < maxElements; i++ {
 57 | 		x := float64(i) / float64(maxElements)
 58 | 		if err := stream.Push(x, 1); err != nil {
 59 | 			return err
 60 | 		}
 61 | 		*totalWeight++
 62 | 	}
 63 | 	return stream.Finalize()
 64 | }
 65 | 
 66 | func generateRandUniformFixedWeightsSummary(workerID int32, maxElements int64, totalWeight *float64, stream *Sketch) error {
 67 | 	for i := int64(0); i < maxElements; i++ {
 68 | 		x := rand.Float64()
 69 | 		stream.Push(x, 1)
 70 | 		*totalWeight++
 71 | 	}
 72 | 	return stream.Finalize()
 73 | }
 74 | 
 75 | func generateFixedNonUniformSummary(workerID int32, maxElements int64, totalWeight *float64, stream *Sketch) error {
 76 | 	for i := int64(0); i < maxElements; i++ {
 77 | 		x := float64(i) / float64(maxElements)
 78 | 		stream.Push(x, x)
 79 | 		*totalWeight += x
 80 | 	}
 81 | 	return stream.Finalize()
 82 | }
 83 | 
 84 | func generateRandUniformRandWeightsSummary(workerID int32, maxElements int64, totalWeight *float64, stream *Sketch) error {
 85 | 	for i := int64(0); i < maxElements; i++ {
 86 | 		x := rand.Float64()
 87 | 		w := rand.Float64()
 88 | 		stream.Push(x, w)
 89 | 		*totalWeight += w
 90 | 	}
 91 | 	return stream.Finalize()
 92 | }
 93 | 
// workerSummaryGeneratorFunc is the signature shared by the stream generators
// in this file: given a worker id, an element count, a running total-weight
// accumulator and a sketch, it feeds the sketch and reports any error.
type workerSummaryGeneratorFunc func(int32, int64, *float64, *Sketch) error
 95 | 
 96 | func testSingleWorkerStreams(t *testing.T, eps float64, maxElements int64,
 97 | 	workerSummaryGenerator workerSummaryGeneratorFunc,
 98 | 	expectedQuantiles []float64, quantilesMatcherEpsilon float64) {
 99 | 
100 | 	totalWeight := 0.0
101 | 	stream, err := New(eps, maxElements)
102 | 	if err != nil {
103 | 		t.Error("expected no error, got ", err)
104 | 		return
105 | 	}
106 | 	if err := workerSummaryGenerator(0, maxElements, &totalWeight, stream); err != nil {
107 | 		t.Error("expected no error, got ", err)
108 | 		return
109 | 	}
110 | 
111 | 	// Ensure we didn't lose track of any elements and are
112 | 	// within approximation error bound.
113 | 	if val, err := stream.ApproximationError(0); err != nil {
114 | 		t.Error("expected no error, got ", err)
115 | 		return
116 | 	} else if val > eps {
117 | 		t.Errorf("expected val <= %v, got %v > %v", eps, val, eps)
118 | 		return
119 | 	}
120 | 
121 | 	sum, err := stream.FinalSummary()
122 | 	if err != nil {
123 | 		t.Error("expected no error, got ", err)
124 | 		return
125 | 	}
126 | 	w := sum.TotalWeight()
127 | 	if math.Abs(totalWeight-w) > 1e-6 {
128 | 		t.Errorf("expected %v <= %v", math.Abs(totalWeight-w), 1e-6)
129 | 		return
130 | 	}
131 | 
132 | 	// Verify expected quantiles.
133 | 	actuals, err := stream.GenerateQuantiles(int64(len(expectedQuantiles) - 1))
134 | 	if err != nil {
135 | 		t.Error("expected no error, got ", err)
136 | 		return
137 | 	}
138 | 	for i, eq := range expectedQuantiles {
139 | 		if val := math.Abs(actuals[i] - eq); val > quantilesMatcherEpsilon {
140 | 			t.Errorf("expected %v <= %v", val, quantilesMatcherEpsilon)
141 | 			return
142 | 		}
143 | 	}
144 | }
145 | 
146 | // Stream generators.
147 | func generateOneValue(workerID int32, maxElements int64, totalWeight *float64, stream *Sketch) error {
148 | 	stream.Push(10, 1)
149 | 	*totalWeight++
150 | 	return stream.Finalize()
151 | }
152 | 
153 | // Stream generators.
154 | func generateOneZeroWeightedValue(workerID int32, maxElements int64, totalWeight *float64, stream *Sketch) error {
155 | 	stream.Push(10, 0)
156 | 	return stream.Finalize()
157 | }
158 | 
159 | func TestStreamOneValue(t *testing.T) {
160 | 	var (
161 | 		eps         = 0.01
162 | 		maxElements = int64(1 << 16)
163 | 	)
164 | 	testSingleWorkerStreams(t, eps, maxElements, generateOneValue,
165 | 		[]float64{10.0, 10.0, 10.0, 10.0, 10.0}, 1e-2)
166 | }
167 | 
168 | func TestStreamOneZeroWeightValue(t *testing.T) {
169 | 	var (
170 | 		eps         = 0.01
171 | 		maxElements = int64(1 << 16)
172 | 	)
173 | 	testSingleWorkerStreams(t, eps, maxElements, generateOneZeroWeightedValue,
174 | 		[]float64{}, 1e-2)
175 | }
176 | 
177 | func TestStreamFixedUniform(t *testing.T) {
178 | 	var (
179 | 		eps         = 0.01
180 | 		maxElements = int64(1 << 16)
181 | 	)
182 | 	testSingleWorkerStreams(t, eps, maxElements, generateFixedUniformSummary,
183 | 		[]float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, 1e-2)
184 | }
185 | 
186 | func TestStreamFixedNonUniform(t *testing.T) {
187 | 	var (
188 | 		eps         = 0.01
189 | 		maxElements = int64(1 << 16)
190 | 	)
191 | 	testSingleWorkerStreams(t, eps, maxElements, generateFixedNonUniformSummary,
192 | 		[]float64{0, math.Sqrt(0.1), math.Sqrt(0.2), math.Sqrt(0.3), math.Sqrt(0.4), math.Sqrt(0.5), math.Sqrt(0.6), math.Sqrt(0.7), math.Sqrt(0.8), math.Sqrt(0.9), 1.0}, 1e-2)
193 | }
194 | 
195 | func TestStreamRandUniformFixedWeights(t *testing.T) {
196 | 	var (
197 | 		eps         = 0.01
198 | 		maxElements = int64(1 << 16)
199 | 	)
200 | 	testSingleWorkerStreams(t, eps, maxElements, generateRandUniformFixedWeightsSummary,
201 | 		[]float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, 1e-2)
202 | }
203 | 
204 | func TestStreamRandUniformRandWeights(t *testing.T) {
205 | 	var (
206 | 		eps         = 0.01
207 | 		maxElements = int64(1 << 16)
208 | 	)
209 | 	testSingleWorkerStreams(t, eps, maxElements, generateRandUniformRandWeightsSummary,
210 | 		[]float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, 1e-2)
211 | }
212 | 
213 | // Distributed tests.
214 | func testDistributedStreams(t *testing.T, numWorkers int32, eps float64, maxElements int64,
215 | 	workerSummaryGenerator workerSummaryGeneratorFunc,
216 | 	expectedQuantiles []float64, quantilesMatcherEpsilon float64) {
217 | 
218 | 	// Simulate streams on each worker running independently
219 | 	totalWeight := 0.0
220 | 	workerSummaries := [][]SumEntry{}
221 | 	for i := int32(0); i < numWorkers; i++ {
222 | 		stream, err := New(eps/2, maxElements)
223 | 		if err != nil {
224 | 			t.Error("expected no error, got", err)
225 | 			return
226 | 		}
227 | 		workerSummaryGenerator(i, maxElements/int64(numWorkers), &totalWeight, stream)
228 | 		sum, err := stream.FinalSummary()
229 | 		if err != nil {
230 | 			t.Error("expected no error, got ", err)
231 | 			return
232 | 		}
233 | 		workerSummaries = append(workerSummaries, sum.entries)
234 | 	}
235 | 
236 | 	// In the accumulation phase, we aggregate the summaries from each worker
237 | 	// and build an overall summary while maintaining error bounds by ensuring we
238 | 	// don't increase the error by more than eps / 2.
239 | 	reducerStream, err := New(eps, maxElements)
240 | 	if err != nil {
241 | 		t.Error("expected no error, got ", err)
242 | 		return
243 | 	}
244 | 	for _, summary := range workerSummaries {
245 | 		if err := reducerStream.PushSummary(summary); err != nil {
246 | 			t.Error("expected no error, got", err)
247 | 			return
248 | 		}
249 | 	}
250 | 	if err := reducerStream.Finalize(); err != nil {
251 | 		t.Error("expected no error, got", err)
252 | 		return
253 | 	}
254 | 
255 | 	// Ensure we didn't lose track of any elements and are
256 | 	// within approximation error bound.
257 | 	if val, err := reducerStream.ApproximationError(0); err != nil {
258 | 		t.Error("expected no error, got ", err)
259 | 		return
260 | 	} else if val > eps {
261 | 		t.Errorf("expected val <= %v, got %v > %v", eps, val, eps)
262 | 		return
263 | 	}
264 | 
265 | 	sum, err := reducerStream.FinalSummary()
266 | 	if err != nil {
267 | 		t.Error("expected no error, got ", err)
268 | 		return
269 | 	}
270 | 	w := sum.TotalWeight()
271 | 	if math.Abs(totalWeight-w) > totalWeight {
272 | 		t.Errorf("expected %v <= %v", math.Abs(totalWeight-w), totalWeight)
273 | 		return
274 | 	}
275 | 
276 | 	// Verify expected quantiles.
277 | 	actuals, err := reducerStream.GenerateQuantiles(int64(len(expectedQuantiles) - 1))
278 | 	if err != nil {
279 | 		t.Error("expected no error, got ", err)
280 | 		return
281 | 	}
282 | 	for i, eq := range expectedQuantiles {
283 | 		if val := math.Abs(actuals[i] - eq); val > quantilesMatcherEpsilon {
284 | 			t.Errorf("expected %v <= %v", val, quantilesMatcherEpsilon)
285 | 			return
286 | 		}
287 | 	}
288 | }
289 | 
290 | func TestStreamFixedUniformDistributed(t *testing.T) {
291 | 	var (
292 | 		numWorkers  int32 = 10
293 | 		eps               = 0.01
294 | 		maxElements       = int64(numWorkers) * int64(1<<16)
295 | 	)
296 | 	testDistributedStreams(t, numWorkers, eps, maxElements, generateFixedUniformSummary,
297 | 		[]float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, 1e-2)
298 | }
299 | 
300 | func TestStreamFixedNonUniformDistributed(t *testing.T) {
301 | 	var (
302 | 		numWorkers  int32 = 10
303 | 		eps               = 0.01
304 | 		maxElements       = int64(numWorkers) * int64(1<<16)
305 | 	)
306 | 	testDistributedStreams(t, numWorkers, eps, maxElements, generateFixedNonUniformSummary,
307 | 		[]float64{0, math.Sqrt(0.1), math.Sqrt(0.2), math.Sqrt(0.3), math.Sqrt(0.4), math.Sqrt(0.5), math.Sqrt(0.6), math.Sqrt(0.7), math.Sqrt(0.8), math.Sqrt(0.9), 1.0}, 1e-2)
308 | 
309 | }
310 | 
311 | func TestRandUniformFixedWeightsDistributed(t *testing.T) {
312 | 	var (
313 | 		numWorkers  int32 = 10
314 | 		eps               = 0.01
315 | 		maxElements       = int64(numWorkers) * int64(1<<16)
316 | 	)
317 | 	testDistributedStreams(t, numWorkers, eps, maxElements, generateRandUniformFixedWeightsSummary,
318 | 		[]float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, 1e-2)
319 | }
320 | 
321 | func TestRandUniformRandWeightsDistributed(t *testing.T) {
322 | 	var (
323 | 		numWorkers  int32 = 10
324 | 		eps               = 0.01
325 | 		maxElements       = int64(numWorkers) * int64(1<<16)
326 | 	)
327 | 	testDistributedStreams(t, numWorkers, eps, maxElements, generateRandUniformRandWeightsSummary,
328 | 		[]float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, 1e-2)
329 | }
330 | 
331 | func TestSketchMedian(t *testing.T) {
332 | 	assert := assert.New(t)
333 | 	q, _ := New(0.5, 1000)
334 | 
335 | 	for i := 0; i < 402; i++ {
336 | 		q.Push(10, 1)
337 | 	}
338 | 
339 | 	for i := 0; i < 401; i++ {
340 | 		q.Push(5, 1)
341 | 	}
342 | 	// make sure median is 6
343 | 	q.Push(6, 1)
344 | 	q.Push(6, 1)
345 | 
346 | 	exp := map[float64]float64{
347 | 		0.1: 5,
348 | 		0.2: 5,
349 | 		0.3: 5,
350 | 		0.4: 5,
351 | 		0.5: 6,
352 | 		0.6: 10,
353 | 		0.7: 10,
354 | 		0.8: 10,
355 | 		0.9: 10,
356 | 	}
357 | 
358 | 	err := q.Finalize()
359 | 	assert.NoError(err)
360 | 	for i, val := range exp {
361 | 		x, err := q.Quantile(i)
362 | 		assert.NoError(err)
363 | 		assert.Equal(val, x)
364 | 	}
365 | 
366 | }
367 | 


--------------------------------------------------------------------------------