├── .github └── workflows │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── benchmarks_test.go ├── docs └── compression_benchmark.png ├── fuzz_test.go ├── go.mod ├── go.sum ├── serde.go ├── serde_test.go ├── tdigest.go └── tdigest_test.go /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | on: [push] 2 | name: Unit tests 3 | jobs: 4 | test: 5 | strategy: 6 | matrix: 7 | go-version: [1.16.x, 1.17.x] 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Install Go 11 | uses: actions/setup-go@v2 12 | with: 13 | go-version: ${{ matrix.go-version }} 14 | - name: Checkout code 15 | uses: actions/checkout@v2 16 | - name: Run tests 17 | run: go test ./... 18 | 19 | fuzz: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: Install Go 23 | uses: actions/setup-go@v2 24 | with: 25 | stable: 'false' 26 | go-version: 1.18.0-beta1 27 | - name: Checkout code 28 | uses: actions/checkout@v2 29 | - name: Run fuzzing tests 30 | run: go test -fuzz Fuzz -fuzztime 60s ./... 
31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.test 2 | 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Spencer Nelson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Archived Status # 2 | 3 | **I don't plan on making updates to this package. 
I recommend using https://github.com/influxdata/tdigest instead.** 4 | 5 | --- 6 | 7 | # tdigest # 8 | [![GoDoc](https://godoc.org/github.com/spenczar/tdigest?status.svg)](https://godoc.org/github.com/spenczar/tdigest) [![Build Status](https://travis-ci.org/spenczar/tdigest.svg)](https://travis-ci.org/spenczar/tdigest) 9 | 10 | This is a Go implementation of Ted Dunning's 11 | [t-digest](https://github.com/tdunning/t-digest), which is a clever 12 | data structure/algorithm for computing approximate quantiles of a 13 | stream of data. 14 | 15 | You should use this if you want to efficiently compute extreme rank 16 | statistics of a large stream of data, like the 99.9th percentile. 17 | 18 | ## Usage ## 19 | 20 | An example is available in the Godoc which shows the API: 21 | 22 | ```go 23 | func ExampleTDigest() { 24 | rand.Seed(5678) 25 | values := make(chan float64) 26 | 27 | // Generate 100k uniform random data between 0 and 100 28 | var ( 29 | n int = 100000 30 | min, max float64 = 0, 100 31 | ) 32 | go func() { 33 | for i := 0; i < n; i++ { 34 | values <- min + rand.Float64()*(max-min) 35 | } 36 | close(values) 37 | }() 38 | 39 | // Pass the values through a TDigest. 40 | td := New() 41 | 42 | for val := range values { 43 | // Add the value with weight 1 44 | td.Add(val, 1) 45 | } 46 | 47 | // Print the 50th, 90th, 99th, 99.9th, and 99.99th percentiles 48 | fmt.Printf("50th: %.5f\n", td.Quantile(0.5)) 49 | fmt.Printf("90th: %.5f\n", td.Quantile(0.9)) 50 | fmt.Printf("99th: %.5f\n", td.Quantile(0.99)) 51 | fmt.Printf("99.9th: %.5f\n", td.Quantile(0.999)) 52 | fmt.Printf("99.99th: %.5f\n", td.Quantile(0.9999)) 53 | // Output: 54 | // 50th: 48.74854 55 | // 90th: 89.79825 56 | // 99th: 98.92954 57 | // 99.9th: 99.90189 58 | // 99.99th: 99.98740 59 | } 60 | ``` 61 | 62 | ## Algorithm ## 63 | 64 | For example, in the Real World, the stream of data might be *service 65 | timings*, measuring how long a server takes to respond to clients. 
You 66 | can feed this stream of data through a t-digest and get out 67 | approximations of any quantile you like: the 50th percentile or 95th 68 | percentile or 99th or 99.99th or 28.31th are all computable. 69 | 70 | Exact quantiles would require that you hold all the data in memory, 71 | but the t-digest can hold a small fraction - often just a few 72 | kilobytes to represent many millions of datapoints. Measurements of 73 | the compression ratio show that compression improves super-linearly as 74 | more datapoints are fed into the t-digest. 75 | 76 | How good are the approximations? Well, it depends, but they tend to be 77 | quite good, especially out towards extreme percentiles like the 99th 78 | or 99.9th; Ted Dunning found errors of just a few parts per million at 79 | the 99.9th and 0.1th percentiles. 80 | 81 | Error will be largest in the middle - the median is the least accurate 82 | point in the t-digest. 83 | 84 | The actual precision can be controlled with the `compression` 85 | parameter passed to the constructor function `NewWithCompression` in 86 | this package. Lower `compression` parameters will result in poorer 87 | compression, but will improve performance in estimating quantiles. If 88 | you care deeply about tuning such things, experiment with the 89 | compression ratio. 90 | 91 | ## Benchmarks ## 92 | 93 | Data compresses well, with compression ratios of around 20 for small 94 | datasets (1k datapoints) and 500 for largeish ones (1M 95 | datapoints). The precise compression ratio depends a bit on your 96 | data's distribution - exponential data does well, while ordered data 97 | does poorly: 98 | 99 | ![compression benchmark](docs/compression_benchmark.png) 100 | 101 | In general, adding a datapoint takes about 1 to 4 microseconds on my 102 | 2014 Macbook Pro. This is fast enough for many purposes, but if you 103 | have any concern, you should just run the benchmarks on your targeted 104 | syste. You can do that with `go test -bench . ./...`. 
105 | 106 | Quantiles are very, very quick to calculate, and typically take tens 107 | of nanoseconds. They might take up to a few hundred nanoseconds for 108 | large, poorly compressed (read: ordered) datasets, but in general, you 109 | don't have to worry about the speed of calls to Quantile. 110 | -------------------------------------------------------------------------------- /benchmarks_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "math/rand" 5 | "testing" 6 | ) 7 | 8 | const rngSeed = 1234567 9 | 10 | type valueSource interface { 11 | Next() float64 12 | } 13 | 14 | func benchmarkAdd(b *testing.B, n int, src valueSource) { 15 | valsToAdd := make([]float64, n) 16 | 17 | d := NewWithCompression(100) 18 | for i := 0; i < n; i++ { 19 | v := src.Next() 20 | valsToAdd[i] = v 21 | d.Add(v, 1) 22 | } 23 | 24 | b.ResetTimer() 25 | for i := 0; i < b.N; i++ { 26 | d.Add(valsToAdd[i%n], 1) 27 | } 28 | b.StopTimer() 29 | } 30 | 31 | func benchmarkQuantile(b *testing.B, n int, src valueSource) { 32 | quantilesToCheck := make([]float64, n) 33 | 34 | d := NewWithCompression(100) 35 | for i := 0; i < n; i++ { 36 | v := src.Next() 37 | quantilesToCheck[i] = v 38 | d.Add(v, 1) 39 | } 40 | 41 | b.ResetTimer() 42 | for i := 0; i < b.N; i++ { 43 | _ = d.Quantile(quantilesToCheck[i%n]) 44 | } 45 | b.StopTimer() 46 | } 47 | 48 | type orderedValues struct { 49 | last float64 50 | } 51 | 52 | func (ov *orderedValues) Next() float64 { 53 | ov.last += 1 54 | return ov.last 55 | } 56 | 57 | func BenchmarkAdd_1k_Ordered(b *testing.B) { 58 | benchmarkAdd(b, 1000, &orderedValues{}) 59 | } 60 | 61 | func BenchmarkAdd_10k_Ordered(b *testing.B) { 62 | benchmarkAdd(b, 10000, &orderedValues{}) 63 | } 64 | 65 | func BenchmarkAdd_100k_Ordered(b *testing.B) { 66 | benchmarkAdd(b, 100000, &orderedValues{}) 67 | } 68 | 69 | func BenchmarkQuantile_1k_Ordered(b *testing.B) { 70 | benchmarkQuantile(b, 1000, 
&orderedValues{}) 71 | } 72 | 73 | func BenchmarkQuantile_10k_Ordered(b *testing.B) { 74 | benchmarkQuantile(b, 10000, &orderedValues{}) 75 | } 76 | 77 | func BenchmarkQuantile_100k_Ordered(b *testing.B) { 78 | benchmarkQuantile(b, 100000, &orderedValues{}) 79 | } 80 | 81 | type zipfValues struct { 82 | z *rand.Zipf 83 | } 84 | 85 | func newZipfValues() *zipfValues { 86 | r := rand.New(rand.NewSource(rngSeed)) 87 | z := rand.NewZipf(r, 1.2, 1, 1024*1024) 88 | return &zipfValues{ 89 | z: z, 90 | } 91 | } 92 | 93 | func (zv *zipfValues) Next() float64 { 94 | return float64(zv.z.Uint64()) 95 | } 96 | 97 | func BenchmarkAdd_1k_Zipfian(b *testing.B) { 98 | benchmarkAdd(b, 1000, newZipfValues()) 99 | } 100 | 101 | func BenchmarkAdd_10k_Zipfian(b *testing.B) { 102 | benchmarkAdd(b, 10000, newZipfValues()) 103 | } 104 | 105 | func BenchmarkAdd_100k_Zipfian(b *testing.B) { 106 | benchmarkAdd(b, 100000, newZipfValues()) 107 | } 108 | 109 | func BenchmarkQuantile_1k_Zipfian(b *testing.B) { 110 | benchmarkQuantile(b, 1000, newZipfValues()) 111 | } 112 | 113 | func BenchmarkQuantile_10k_Zipfian(b *testing.B) { 114 | benchmarkQuantile(b, 10000, newZipfValues()) 115 | } 116 | 117 | func BenchmarkQuantile_100k_Zipfian(b *testing.B) { 118 | benchmarkQuantile(b, 100000, newZipfValues()) 119 | } 120 | 121 | type uniformValues struct { 122 | r *rand.Rand 123 | } 124 | 125 | func newUniformValues() *uniformValues { 126 | return &uniformValues{rand.New(rand.NewSource(rngSeed))} 127 | } 128 | 129 | func (uv *uniformValues) Next() float64 { 130 | return uv.r.Float64() 131 | } 132 | 133 | func BenchmarkAdd_1k_Uniform(b *testing.B) { 134 | benchmarkAdd(b, 1000, newUniformValues()) 135 | } 136 | 137 | func BenchmarkAdd_10k_Uniform(b *testing.B) { 138 | benchmarkAdd(b, 10000, newUniformValues()) 139 | } 140 | 141 | func BenchmarkAdd_100k_Uniform(b *testing.B) { 142 | benchmarkAdd(b, 100000, newUniformValues()) 143 | } 144 | 145 | func BenchmarkQuantile_1k_Uniform(b *testing.B) { 146 | 
benchmarkQuantile(b, 1000, newUniformValues()) 147 | } 148 | 149 | func BenchmarkQuantile_10k_Uniform(b *testing.B) { 150 | benchmarkQuantile(b, 10000, newUniformValues()) 151 | } 152 | 153 | func BenchmarkQuantile_100k_Uniform(b *testing.B) { 154 | benchmarkQuantile(b, 100000, newUniformValues()) 155 | } 156 | 157 | type normalValues struct { 158 | r *rand.Rand 159 | } 160 | 161 | func newNormalValues() *normalValues { 162 | return &normalValues{rand.New(rand.NewSource(rngSeed))} 163 | } 164 | 165 | func (uv *normalValues) Next() float64 { 166 | return uv.r.NormFloat64() 167 | } 168 | 169 | func BenchmarkAdd_1k_Normal(b *testing.B) { 170 | benchmarkAdd(b, 1000, newNormalValues()) 171 | } 172 | 173 | func BenchmarkAdd_10k_Normal(b *testing.B) { 174 | benchmarkAdd(b, 10000, newNormalValues()) 175 | } 176 | 177 | func BenchmarkAdd_100k_Normal(b *testing.B) { 178 | benchmarkAdd(b, 100000, newNormalValues()) 179 | } 180 | 181 | func BenchmarkQuantile_1k_Normal(b *testing.B) { 182 | benchmarkQuantile(b, 1000, newNormalValues()) 183 | } 184 | 185 | func BenchmarkQuantile_10k_Normal(b *testing.B) { 186 | benchmarkQuantile(b, 10000, newNormalValues()) 187 | } 188 | 189 | func BenchmarkQuantile_100k_Normal(b *testing.B) { 190 | benchmarkQuantile(b, 100000, newNormalValues()) 191 | } 192 | -------------------------------------------------------------------------------- /docs/compression_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spenczar/tdigest/b766947010a67d71100f172226d6023e56ec9f3c/docs/compression_benchmark.png -------------------------------------------------------------------------------- /fuzz_test.go: -------------------------------------------------------------------------------- 1 | //go:build go1.18 2 | // +build go1.18 3 | 4 | package tdigest 5 | 6 | import ( 7 | "bytes" 8 | "testing" 9 | ) 10 | 11 | // Past cases that revealed panics. 
12 | var fuzzFailures = [][]byte{ 13 | []byte{ 14 | 0x01, 0x00, 0x00, 0x00, 0x30, 0x30, 0x30, 0x30, 15 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 16 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 17 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0xfc, 18 | }, 19 | []byte{ 20 | 0x01, 0x00, 0x00, 0x00, 0xdb, 0x46, 0x5f, 0xbd, 21 | 0xdb, 0x46, 0x00, 0xbd, 0xe0, 0xdf, 0xca, 0xab, 22 | 0x37, 0x31, 0x37, 0x32, 0x37, 0x33, 0x37, 0x34, 23 | 0x37, 0x35, 0x37, 0x36, 0x37, 0x37, 0x37, 0x38, 24 | 0x37, 0x39, 0x28, 25 | }, 26 | []byte{ 27 | 0x80, 0x0c, 0x01, 0x00, 0x00, 0x00, 0x30, 0x30, 28 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x02, 0x00, 29 | 0x00, 0x00, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 30 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 31 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 32 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 33 | 0x30, 0xbf, 34 | }, 35 | []byte{ 36 | 0x80, 0x0c, 0x01, 0x00, 0x00, 0x00, 0x30, 0x30, 37 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x02, 0x00, 38 | 0x00, 0x00, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 39 | 0x30, 0x63, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 40 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 41 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 42 | 0x30, 0x4e, 43 | }, 44 | []byte{ 45 | 0x80, 0x0c, 0x01, 0x00, 0x00, 0x00, 0x30, 0x30, 46 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x02, 0x00, 47 | 0x00, 0x00, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 48 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 49 | 0x30, 0x00, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 50 | 0x30, 0x00, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 51 | 0x92, 0x00, 52 | }, 53 | } 54 | 55 | func FuzzRoundTrip(f *testing.F) { 56 | 57 | for _, data := range fuzzFailures { 58 | f.Add(data) 59 | } 60 | f.Fuzz(func(t *testing.T, data []byte) { 61 | v := new(TDigest) 62 | err := v.UnmarshalBinary(data) 63 | if err != nil { 64 | // Input is not valid; skip it. 
65 | t.Skip() 66 | } 67 | 68 | t.Logf("input: %v", data) 69 | remarshaled, err := v.MarshalBinary() 70 | if err != nil { 71 | t.Fatalf("marshal error for valid data: %v", err) 72 | } 73 | 74 | if !bytes.HasPrefix(data, remarshaled) { 75 | t.Logf("tdigest: %s", v.debugStr()) 76 | t.Fatal("remarshaling does not round-trip") 77 | } 78 | 79 | for q := float64(0.1); q <= 1.0; q += 0.05 { 80 | prev, this := v.Quantile(q-0.1), v.Quantile(q) 81 | if prev-this > 1e-100 { // Floating point math makes this slightly imprecise. 82 | t.Logf("tdigest: %s", v.debugStr()) 83 | t.Logf("q: %v", q) 84 | t.Logf("prev: %v", prev) 85 | t.Logf("this: %v", this) 86 | t.Fatal("quantiles should only increase") 87 | } 88 | } 89 | v.Add(1, 1) 90 | 91 | }) 92 | } 93 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/spenczar/tdigest/v2 2 | 3 | go 1.17 4 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spenczar/tdigest/b766947010a67d71100f172226d6023e56ec9f3c/go.sum -------------------------------------------------------------------------------- /serde.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "fmt" 7 | "io" 8 | "math" 9 | ) 10 | 11 | const ( 12 | magic = int16(0xc80) 13 | encodingVersion = int32(1) 14 | ) 15 | 16 | func marshalBinary(d *TDigest) ([]byte, error) { 17 | buf := bytes.NewBuffer(nil) 18 | w := &binaryBufferWriter{buf: buf} 19 | w.writeValue(magic) 20 | w.writeValue(encodingVersion) 21 | w.writeValue(d.compression) 22 | w.writeValue(int32(len(d.centroids))) 23 | for _, c := range d.centroids { 24 | w.writeValue(c.count) 25 | w.writeValue(c.mean) 26 | } 27 | 28 | if w.err 
!= nil { 29 | return nil, w.err 30 | } 31 | return buf.Bytes(), nil 32 | } 33 | 34 | func unmarshalBinary(d *TDigest, p []byte) error { 35 | var ( 36 | mv int16 37 | ev int32 38 | n int32 39 | ) 40 | r := &binaryReader{r: bytes.NewReader(p)} 41 | r.readValue(&mv) 42 | if r.err != nil { 43 | return r.err 44 | } 45 | if mv != magic { 46 | return fmt.Errorf("data corruption detected: invalid header magic value 0x%04x", mv) 47 | } 48 | r.readValue(&ev) 49 | if r.err != nil { 50 | return r.err 51 | } 52 | if ev != encodingVersion { 53 | return fmt.Errorf("data corruption detected: invalid encoding version %d", ev) 54 | } 55 | r.readValue(&d.compression) 56 | r.readValue(&n) 57 | if r.err != nil { 58 | return r.err 59 | } 60 | if n < 0 { 61 | return fmt.Errorf("data corruption detected: number of centroids cannot be negative, have %v", n) 62 | 63 | } 64 | if n > 1<<20 { 65 | return fmt.Errorf("invalid n, cannot be greater than 2^20: %v", n) 66 | } 67 | d.centroids = make([]*centroid, int(n)) 68 | for i := 0; i < int(n); i++ { 69 | c := new(centroid) 70 | r.readValue(&c.count) 71 | r.readValue(&c.mean) 72 | if r.err != nil { 73 | return r.err 74 | } 75 | if c.count < 0 { 76 | return fmt.Errorf("data corruption detected: negative count: %d", c.count) 77 | } 78 | if math.IsNaN(c.mean) { 79 | return fmt.Errorf("data corruption detected: NaN mean not permitted") 80 | } 81 | if math.IsInf(c.mean, 0) { 82 | return fmt.Errorf("data corruption detected: Inf mean not permitted") 83 | } 84 | if i > 0 { 85 | prev := d.centroids[i-1] 86 | if c.mean < prev.mean { 87 | return fmt.Errorf("data corruption detected: centroid %d has lower mean (%v) than preceding centroid %d (%v)", i, c.mean, i-1, prev.mean) 88 | } 89 | } 90 | d.centroids[i] = c 91 | if c.count > math.MaxInt64-d.countTotal { 92 | return fmt.Errorf("data corruption detected: centroid total size overflow") 93 | } 94 | d.countTotal += c.count 95 | } 96 | 97 | if n := r.r.Len(); n > 0 { 98 | return fmt.Errorf("found %d 
unexpected bytes trailing the tdigest", n) 99 | } 100 | 101 | return nil 102 | } 103 | 104 | type binaryBufferWriter struct { 105 | buf *bytes.Buffer 106 | err error 107 | } 108 | 109 | func (w *binaryBufferWriter) writeValue(v interface{}) { 110 | if w.err != nil { 111 | return 112 | } 113 | w.err = binary.Write(w.buf, binary.LittleEndian, v) 114 | } 115 | 116 | type binaryReader struct { 117 | r *bytes.Reader 118 | err error 119 | } 120 | 121 | func (r *binaryReader) readValue(v interface{}) { 122 | if r.err != nil { 123 | return 124 | } 125 | r.err = binary.Read(r.r, binary.LittleEndian, v) 126 | if r.err == io.EOF { 127 | r.err = io.ErrUnexpectedEOF 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /serde_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "errors" 5 | "io" 6 | "reflect" 7 | "testing" 8 | ) 9 | 10 | func TestMarshalRoundTrip(t *testing.T) { 11 | testcase := func(in *TDigest) func(*testing.T) { 12 | return func(t *testing.T) { 13 | b, err := in.MarshalBinary() 14 | if err != nil { 15 | t.Fatalf("MarshalBinary err: %v", err) 16 | } 17 | out := new(TDigest) 18 | err = out.UnmarshalBinary(b) 19 | if err != nil { 20 | t.Fatalf("UnmarshalBinary err: %v", err) 21 | } 22 | if !reflect.DeepEqual(in, out) { 23 | t.Errorf("marshaling round trip resulted in changes") 24 | t.Logf("in: %+v", in) 25 | t.Logf("out: %+v", out) 26 | } 27 | } 28 | } 29 | t.Run("empty", testcase(New())) 30 | t.Run("1 value", testcase(simpleTDigest(1))) 31 | t.Run("1000 values", testcase(simpleTDigest(1000))) 32 | 33 | d := New() 34 | d.Add(1, 1) 35 | d.Add(1, 1) 36 | d.Add(0, 1) 37 | t.Run("1, 1, 0 input", testcase(d)) 38 | } 39 | 40 | func TestUnmarshalErrors(t *testing.T) { 41 | testcase := func(in []byte, wantErr error) func(*testing.T) { 42 | return func(t *testing.T) { 43 | have := new(TDigest) 44 | err := unmarshalBinary(have, in) 45 | if err != 
nil { 46 | if wantErr == nil { 47 | t.Fatalf("unexpected unmarshal err: %v", err) 48 | } 49 | if err.Error() != wantErr.Error() { 50 | t.Fatalf("wrong error, want=%q, have=%q", wantErr.Error(), err.Error()) 51 | } else { 52 | return 53 | } 54 | } else if wantErr != nil { 55 | t.Fatalf("expected err=%q, got nil", wantErr.Error()) 56 | } 57 | } 58 | } 59 | t.Run("nil", testcase( 60 | nil, 61 | io.ErrUnexpectedEOF, 62 | )) 63 | t.Run("bad magic", testcase( 64 | []byte{ 65 | 0x80, 0x0d, 66 | }, 67 | errors.New("data corruption detected: invalid header magic value 0x0d80"), 68 | )) 69 | t.Run("incomplete encoding", testcase( 70 | []byte{ 71 | 0x80, 0x0c, 72 | 0x00, 73 | }, 74 | io.ErrUnexpectedEOF, 75 | )) 76 | t.Run("bad encoding", testcase( 77 | []byte{ 78 | 0x80, 0x0c, 79 | 0xFF, 0xFF, 0xFF, 0xFF, 80 | }, 81 | errors.New("data corruption detected: invalid encoding version -1"), 82 | )) 83 | t.Run("incomplete compression", testcase( 84 | []byte{ 85 | 0x80, 0x0c, 86 | 0x01, 0x00, 0x00, 0x00, 87 | 0x00, 0x00, 88 | }, 89 | io.ErrUnexpectedEOF, 90 | )) 91 | t.Run("incomplete n", testcase( 92 | []byte{ 93 | 0x80, 0x0c, 94 | 0x01, 0x00, 0x00, 0x00, 95 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 96 | 0x00, 97 | }, 98 | io.ErrUnexpectedEOF, 99 | )) 100 | t.Run("negative n", testcase( 101 | []byte{ 102 | 0x80, 0x0c, 103 | 0x01, 0x00, 0x00, 0x00, 104 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 105 | 0xFF, 0xFF, 0xFF, 0xFF, 106 | }, 107 | errors.New("data corruption detected: number of centroids cannot be negative, have -1"), 108 | )) 109 | t.Run("huge n", testcase( 110 | []byte{ 111 | 0x80, 0x0c, 112 | 0x01, 0x00, 0x00, 0x00, 113 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 114 | 0xFF, 0xFF, 0xFF, 0x7F, 115 | }, 116 | errors.New("invalid n, cannot be greater than 2^20: 2147483647"), 117 | )) 118 | t.Run("missing centroids", testcase( 119 | []byte{ 120 | 0x80, 0x0c, 121 | 0x01, 0x00, 0x00, 0x00, 122 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 123 | 0x01, 
0x00, 0x00, 0x00, 124 | }, 125 | io.ErrUnexpectedEOF, 126 | )) 127 | t.Run("partial centroid", testcase( 128 | []byte{ 129 | 0x80, 0x0c, 130 | 0x01, 0x00, 0x00, 0x00, 131 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 132 | 0x01, 0x00, 0x00, 0x00, 133 | 0x01, 0x00, 0x00, 0x00, 134 | }, 135 | io.ErrUnexpectedEOF, 136 | )) 137 | t.Run("negative count", testcase( 138 | []byte{ 139 | 0x80, 0x0c, 140 | 0x01, 0x00, 0x00, 0x00, 141 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 142 | 0x01, 0x00, 0x00, 0x00, 143 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 144 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 145 | }, 146 | errors.New("data corruption detected: negative count: -1"), 147 | )) 148 | t.Run("decreasing means", testcase( 149 | []byte{ 150 | 0x80, 0x0c, 151 | 0x01, 0x00, 0x00, 0x00, 152 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 153 | 0x02, 0x00, 0x00, 0x00, 154 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 155 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 156 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 157 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 158 | }, 159 | errors.New("data corruption detected: centroid 1 has lower mean (1) than preceding centroid 0 (2)"), 160 | )) 161 | t.Run("nan mean", testcase( 162 | []byte{ 163 | 0x80, 0x0c, 164 | 0x01, 0x00, 0x00, 0x00, 165 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 166 | 0x01, 0x00, 0x00, 0x00, 167 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 168 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 169 | }, 170 | errors.New("data corruption detected: NaN mean not permitted"), 171 | )) 172 | t.Run("+inf mean", testcase( 173 | []byte{ 174 | 0x80, 0x0c, 175 | 0x01, 0x00, 0x00, 0x00, 176 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 177 | 0x01, 0x00, 0x00, 0x00, 178 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 179 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x7F, 180 | }, 181 | errors.New("data corruption detected: Inf mean not permitted"), 182 | )) 183 | 
t.Run("-inf mean", testcase( 184 | []byte{ 185 | 0x80, 0x0c, 186 | 0x01, 0x00, 0x00, 0x00, 187 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 188 | 0x01, 0x00, 0x00, 0x00, 189 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 190 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xFF, 191 | }, 192 | errors.New("data corruption detected: Inf mean not permitted"), 193 | )) 194 | t.Run("total size overflow", testcase( 195 | []byte{ 196 | 0x80, 0x0c, 197 | 0x01, 0x00, 0x00, 0x00, 198 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 199 | 0x02, 0x00, 0x00, 0x00, 200 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 201 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 202 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 203 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 204 | }, 205 | errors.New("data corruption detected: centroid total size overflow"), 206 | )) 207 | t.Run("trailing bytes", testcase( 208 | []byte{ 209 | 0x80, 0x0c, 210 | 0x01, 0x00, 0x00, 0x00, 211 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 212 | 0x02, 0x00, 0x00, 0x00, 213 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 214 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 215 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 216 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 217 | 0x00, 218 | }, 219 | errors.New("found 1 unexpected bytes trailing the tdigest"), 220 | )) 221 | } 222 | 223 | func TestUnmarshal(t *testing.T) { 224 | testcase := func(in []byte, want *TDigest) func(*testing.T) { 225 | return func(t *testing.T) { 226 | have := new(TDigest) 227 | err := unmarshalBinary(have, in) 228 | if err != nil { 229 | t.Fatalf("unexpected unmarshal err: %v", err) 230 | } 231 | if !reflect.DeepEqual(have, want) { 232 | t.Error("unmarshal did not produce expected digest") 233 | t.Logf("want=%s", want.debugStr()) 234 | t.Logf("have=%s", have.debugStr()) 235 | } 236 | } 237 | } 238 | t.Run("no centroids", testcase( 239 | []byte{ 240 | 0x80, 0x0c, 241 | 0x01, 0x00, 0x00, 0x00, 242 | 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 243 | 0x00, 0x00, 0x00, 0x00, 244 | }, 245 | &TDigest{ 246 | centroids: make([]*centroid, 0), 247 | compression: 100, 248 | countTotal: 0, 249 | }, 250 | )) 251 | t.Run("one centroid", testcase( 252 | []byte{ 253 | 0x80, 0x0c, 254 | 0x01, 0x00, 0x00, 0x00, 255 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 256 | 0x01, 0x00, 0x00, 0x00, 257 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 258 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 259 | }, 260 | &TDigest{ 261 | centroids: []*centroid{ 262 | ¢roid{ 263 | count: 1, 264 | mean: 1, 265 | }, 266 | }, 267 | compression: 100, 268 | countTotal: 1, 269 | }, 270 | )) 271 | t.Run("two centroids", testcase( 272 | []byte{ 273 | 0x80, 0x0c, 274 | 0x01, 0x00, 0x00, 0x00, 275 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 276 | 0x02, 0x00, 0x00, 0x00, 277 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 278 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 279 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 280 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 281 | }, 282 | &TDigest{ 283 | centroids: []*centroid{ 284 | ¢roid{ 285 | count: 1, 286 | mean: 1, 287 | }, 288 | ¢roid{ 289 | count: 1, 290 | mean: 2, 291 | }, 292 | }, 293 | compression: 100, 294 | countTotal: 2, 295 | }, 296 | )) 297 | } 298 | -------------------------------------------------------------------------------- /tdigest.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "math/rand" 7 | ) 8 | 9 | // centroid is a simple container for a mean,count pair. 10 | type centroid struct { 11 | mean float64 12 | count int64 13 | } 14 | 15 | func (c *centroid) String() string { 16 | return fmt.Sprintf("c{%f x%d}", c.mean, c.count) 17 | } 18 | 19 | // A TDigest is an efficient data structure for computing streaming approximate 20 | // quantiles of a dataset. 
21 | type TDigest struct { 22 | centroids []*centroid 23 | compression float64 24 | countTotal int64 25 | } 26 | 27 | // New produces a new TDigest using the default compression level of 28 | // 100. 29 | func New() *TDigest { 30 | return NewWithCompression(100) 31 | } 32 | 33 | // NewWithCompression produces a new TDigest with a specific 34 | // compression level. The input compression value, which should be >= 35 | // 1.0, will control how aggressively the TDigest compresses data 36 | // together. 37 | // 38 | // The original TDigest paper suggests using a value of 100 for a good 39 | // balance between precision and efficiency. It will land at very 40 | // small (think like 1e-6 percentile points) errors at extreme points 41 | // in the distribution, and compression ratios of around 500 for large 42 | // data sets (1 millionish datapoints). 43 | func NewWithCompression(compression float64) *TDigest { 44 | return &TDigest{ 45 | centroids: make([]*centroid, 0), 46 | compression: compression, 47 | countTotal: 0, 48 | } 49 | } 50 | 51 | // Find the indexes of centroids which have the minimum distance to the 52 | // input value. 53 | // 54 | // TODO: Use a better data structure to avoid this loop. 55 | func (d *TDigest) nearest(val float64) []int { 56 | var ( 57 | nearestDist float64 = math.Inf(+1) 58 | thisDist float64 59 | delta float64 60 | result []int = make([]int, 0) 61 | ) 62 | for i, c := range d.centroids { 63 | thisDist = val - c.mean 64 | if thisDist < 0 { 65 | thisDist *= -1 66 | } 67 | 68 | delta = thisDist - nearestDist 69 | switch { 70 | case delta < 0: 71 | // we have a new winner! 
72 | nearestDist = thisDist 73 | result = result[0:0] // wipe result 74 | result = append(result, i) 75 | case delta == 0: 76 | // we have a tie 77 | result = append(result, i) 78 | default: 79 | // Since d.centroids is sorted by mean, this means we 80 | // have passed the best spot, so we may as well break 81 | break 82 | } 83 | } 84 | return result 85 | } 86 | 87 | // returns the maximum weight that can be placed at specified index 88 | func (d *TDigest) weightLimit(idx int) int64 { 89 | ptile := d.quantileOf(idx) 90 | limit := int64(4 * d.compression * ptile * (1 - ptile) * float64(len(d.centroids))) 91 | return limit 92 | } 93 | 94 | // checks whether the centroid has room for more weight 95 | func (d *TDigest) centroidHasRoom(idx int) bool { 96 | return d.centroids[idx].count < d.weightLimit(idx) 97 | } 98 | 99 | // find which centroid to add the value to (by index) 100 | func (d *TDigest) findAddTarget(val float64) int { 101 | nearest := d.nearest(val) 102 | // There could be no centroids yet, one centroid which is the 'nearest', or 103 | // multiple centroids that are equidistant. 104 | switch len(nearest) { 105 | case 0: 106 | // There are no centroids at all. Return -1, signaling that we should add a 107 | // new centroid. 108 | return -1 109 | case 1: 110 | // When there is exactly one centroid which is the 'nearest' one, return it 111 | // if it has room. 112 | if d.centroidHasRoom(nearest[0]) { 113 | return nearest[0] 114 | } 115 | return -1 116 | default: 117 | // Multiple eligible centroids to add to. They must be equidistant 118 | // from this value. Four cases are possible: 119 | // 120 | // 1. All eligible centroids' means are less than val 121 | // 2. All eligible centroids' means are greater than val 122 | // 3. All eligible centroids' means are exactly equal to val 123 | // 4. Some eligible centroids' means are less than val, some are greater 124 | // 125 | // If 1, then we should take the highest indexed centroid to preserve 126 | // ordering. 
If 2, we should take the lowest for the same reason. If 2, we 127 | // can pick randomly among the ones that have room, since they are 128 | // indistinguishable. If 4, we should first trim down to having just 2 129 | // eligible centroids and then can pick randomly. 130 | 131 | // First, establish which of the 4 cases we have. 132 | var anyLesser, anyGreater bool 133 | for _, c := range nearest { 134 | m := d.centroids[c].mean 135 | if m < val { 136 | anyLesser = true 137 | } else if m > val { 138 | anyGreater = true 139 | } 140 | } 141 | 142 | switch { 143 | case anyLesser && !anyGreater: 144 | // case 1: all are less, none are greater. Take highest one. 145 | c := max(nearest) 146 | if d.centroidHasRoom(c) { 147 | return c 148 | } 149 | return -1 150 | 151 | case !anyLesser && anyGreater: 152 | // case 2: all are greater, none are less. Take the lowest one. 153 | c := min(nearest) 154 | if d.centroidHasRoom(c) { 155 | return c 156 | } 157 | return -1 158 | 159 | case !anyLesser && !anyGreater: 160 | // case 3: all are equal. Take a random one that has room. 161 | var eligible []int 162 | for _, c := range nearest { 163 | if d.centroidHasRoom(c) { 164 | eligible = append(eligible, c) 165 | } 166 | } 167 | if len(eligible) == 0 { 168 | return -1 169 | } 170 | if len(eligible) == 1 { 171 | return eligible[0] 172 | } 173 | return eligible[rand.Intn(len(eligible))] 174 | 175 | default: 176 | // case 4: It's a mixed bag. We need to first trim down to the two 177 | // innermost centroids which straddle the value. 178 | var lower, upper int 179 | for _, c := range nearest { 180 | m := d.centroids[c].mean 181 | if m < val { 182 | lower = c 183 | } else if m > val { 184 | upper = c 185 | break 186 | } 187 | } 188 | // Now, check which has room. If both do, pick randomly. 
189 | lowerHasRoom := d.centroidHasRoom(lower) 190 | upperHasRoom := d.centroidHasRoom(upper) 191 | switch { 192 | case !lowerHasRoom && !upperHasRoom: 193 | return -1 194 | case lowerHasRoom && !upperHasRoom: 195 | return lower 196 | case !lowerHasRoom && upperHasRoom: 197 | return upper 198 | default: 199 | if rand.Intn(2) == 1 { 200 | return lower 201 | } else { 202 | return upper 203 | } 204 | } 205 | } 206 | } 207 | } 208 | 209 | func (d *TDigest) addNewCentroid(mean float64, weight int64) { 210 | var idx int = len(d.centroids) 211 | 212 | for i, c := range d.centroids { 213 | // add in sorted order 214 | if mean < c.mean { 215 | idx = i 216 | break 217 | } 218 | } 219 | 220 | d.centroids = append(d.centroids, nil) 221 | copy(d.centroids[idx+1:], d.centroids[idx:]) 222 | d.centroids[idx] = ¢roid{mean, weight} 223 | } 224 | 225 | // Add will add a value to the TDigest, updating all quantiles. A 226 | // weight can be specified; use weight of 1 if you don't care about 227 | // weighting your dataset. 228 | // 229 | // Add will ignore input values of NaN or Inf. 230 | func (d *TDigest) Add(val float64, weight int) { 231 | if math.IsNaN(val) || math.IsInf(val, 0) { 232 | return 233 | } 234 | d.add(val, int64(weight)) 235 | } 236 | 237 | func (d *TDigest) add(val float64, weight int64) { 238 | d.countTotal += weight 239 | var idx = d.findAddTarget(val) 240 | 241 | if idx == -1 { 242 | d.addNewCentroid(val, weight) 243 | return 244 | } 245 | 246 | c := d.centroids[idx] 247 | 248 | limit := d.weightLimit(idx) 249 | // how much weight will we be adding? 
250 | // if adding this node to this centroid would put it over the 251 | // weight limit, just add the most we can and recur with the remainder 252 | if c.count+weight > limit { 253 | add := limit - c.count 254 | if add < 0 { 255 | // this node was already overweight 256 | add = 0 257 | } 258 | remainder := weight - add 259 | 260 | c.count += add 261 | c.mean = c.mean + float64(add)*(val-c.mean)/float64(c.count) 262 | 263 | d.add(val, remainder) 264 | } else { 265 | c.count += weight 266 | c.mean = c.mean + float64(weight)*(val-c.mean)/float64(c.count) 267 | } 268 | } 269 | 270 | // returns the approximate quantile that a particular centroid 271 | // represents 272 | func (d *TDigest) quantileOf(idx int) float64 { 273 | var total int64 274 | for _, c := range d.centroids[:idx] { 275 | total += c.count 276 | } 277 | return (float64(d.centroids[idx].count/2) + float64(total)) / float64(d.countTotal) 278 | } 279 | 280 | // Quantile(q) will estimate the qth quantile value of the dataset. The input 281 | // value of q should be in the range [0.0, 1.0]; if it is outside that range, it 282 | // will be clipped into it automatically. 283 | // 284 | // Calling Quantile on a TDigest with no data will return NaN. 285 | func (d *TDigest) Quantile(q float64) float64 { 286 | var n = len(d.centroids) 287 | if n == 0 { 288 | return math.NaN() 289 | } 290 | if n == 1 { 291 | return d.centroids[0].mean 292 | } 293 | 294 | if q < 0 { 295 | q = 0 296 | } else if q > 1 { 297 | q = 1 298 | } 299 | 300 | // rescale into count units instead of 0 to 1 units 301 | q = float64(d.countTotal) * q 302 | // find the first centroid which straddles q 303 | var ( 304 | qTotal float64 = 0 305 | i int 306 | ) 307 | for i = 0; i < n && float64(d.centroids[i].count)/2+qTotal < q; i++ { 308 | qTotal += float64(d.centroids[i].count) 309 | } 310 | 311 | if i == 0 { 312 | // special case 1: the targeted quantile is before the 313 | // left-most centroid. 
extrapolate from the slope from 314 | // centroid0 to centroid1. 315 | c0 := d.centroids[0] 316 | c1 := d.centroids[1] 317 | slope := (c1.mean - c0.mean) / (float64(c1.count)/2 + float64(c0.count)/2) 318 | deltaQ := q - float64(c0.count)/2 // this is negative 319 | return c0.mean + slope*deltaQ 320 | } 321 | if i == n { 322 | // special case 2: the targeted quantile is from the 323 | // right-most centroid. extrapolate from the slope at the 324 | // right edge. 325 | c0 := d.centroids[n-2] 326 | c1 := d.centroids[n-1] 327 | slope := (c1.mean - c0.mean) / (float64(c1.count)/2 + float64(c0.count)/2) 328 | deltaQ := q - (qTotal - float64(c1.count)/2) 329 | return c1.mean + slope*deltaQ 330 | } 331 | // common case: targeted quantile is between 2 centroids 332 | c0 := d.centroids[i-1] 333 | c1 := d.centroids[i] 334 | slope := (c1.mean - c0.mean) / (float64(c1.count)/2 + float64(c0.count)/2) 335 | deltaQ := q - (float64(c1.count)/2 + qTotal) 336 | return c1.mean + slope*deltaQ 337 | } 338 | 339 | // MergeInto(other) will add all of the data within a TDigest into other, 340 | // combining them into one larger TDigest. 341 | func (d *TDigest) MergeInto(other *TDigest) { 342 | // Add each centroid in d into other. They should be added in 343 | // random order. 344 | addOrder := rand.Perm(len(d.centroids)) 345 | for _, idx := range addOrder { 346 | c := d.centroids[idx] 347 | // gradually write up the volume written so that the tdigest doesnt overload early 348 | added := int64(0) 349 | for i := int64(1); i < 10; i++ { 350 | toAdd := i * 2 351 | if added+i > c.count { 352 | toAdd = c.count - added 353 | } 354 | other.add(c.mean, toAdd) 355 | added += toAdd 356 | if added >= c.count { 357 | break 358 | } 359 | } 360 | if added < c.count { 361 | other.add(c.mean, c.count-added) 362 | } 363 | other.add(c.mean, c.count) 364 | } 365 | } 366 | 367 | // MarshalBinary serializes d as a sequence of bytes, suitable to be 368 | // deserialized later with UnmarshalBinary. 
369 | func (d *TDigest) MarshalBinary() ([]byte, error) { 370 | return marshalBinary(d) 371 | } 372 | 373 | // UnmarshalBinary populates d with the parsed contents of p, which should have 374 | // been created with a call to MarshalBinary. 375 | func (d *TDigest) UnmarshalBinary(p []byte) error { 376 | return unmarshalBinary(d, p) 377 | } 378 | 379 | // Render a TDigest's internal state for test logging output purposes. 380 | func (d *TDigest) debugStr() string { 381 | var centroids = "[]*centroids{" 382 | 383 | for _, c := range d.centroids { 384 | centroids += fmt.Sprintf("¢roid{mean: %f, count: %d},", c.mean, c.count) 385 | } 386 | centroids += "}" 387 | 388 | return fmt.Sprintf("TDigest{compression: %f, countTotal: %d, centroids: %s", d.compression, d.countTotal, centroids) 389 | 390 | } 391 | 392 | func max(ii []int) int { 393 | max := ii[0] 394 | if len(ii) == 1 { 395 | return max 396 | } 397 | for _, v := range ii[1:] { 398 | if v > max { 399 | max = v 400 | } 401 | } 402 | return max 403 | } 404 | 405 | func min(ii []int) int { 406 | min := ii[0] 407 | if len(ii) == 1 { 408 | return min 409 | } 410 | for _, v := range ii[1:] { 411 | if v < min { 412 | min = v 413 | } 414 | } 415 | return min 416 | 417 | } 418 | -------------------------------------------------------------------------------- /tdigest_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "math/rand" 7 | "reflect" 8 | "testing" 9 | ) 10 | 11 | func TestFindNearest(t *testing.T) { 12 | type testcase struct { 13 | centroids []*centroid 14 | val float64 15 | want []int 16 | } 17 | 18 | testcases := []testcase{ 19 | {[]*centroid{{0, 1}, {1, 1}, {2, 1}}, -1, []int{0}}, 20 | {[]*centroid{{0, 1}, {1, 1}, {2, 1}}, 0, []int{0}}, 21 | {[]*centroid{{0, 1}, {1, 1}, {2, 1}}, 1, []int{1}}, 22 | {[]*centroid{{0, 1}, {1, 1}, {2, 1}}, 2, []int{2}}, 23 | {[]*centroid{{0, 1}, {1, 1}, {2, 1}}, 3, []int{2}}, 24 | 
		// equidistant between two centroids: both indexes are returned
		{[]*centroid{{0, 1}, {2, 1}}, 1, []int{0, 1}},
		{[]*centroid{}, 1, []int{}},
	}

	for i, tc := range testcases {
		d := TDigest{centroids: tc.centroids}
		have := d.nearest(tc.val)
		if len(tc.want) == 0 {
			// nearest may return nil or an empty slice for an empty
			// digest; either is acceptable here.
			if len(have) != 0 {
				t.Errorf("TDigest.nearest wrong test=%d, have=%v, want=%v", i, have, tc.want)
			}
		} else {
			if !reflect.DeepEqual(tc.want, have) {
				t.Errorf("TDigest.nearest wrong test=%d, have=%v, want=%v", i, have, tc.want)
			}
		}
	}
}

// BenchmarkFindNearest measures nearest() on a 500-value digest,
// cycling val through the observed range.
func BenchmarkFindNearest(b *testing.B) {
	n := 500
	d := simpleTDigest(n)

	b.ResetTimer()
	var val float64
	for i := int64(0); i < int64(b.N); i++ {
		val = float64(i % d.countTotal)
		_ = d.nearest(val)
	}
}

// TestFindAddTarget exercises every branch of findAddTarget: the empty
// digest, a single nearest centroid with/without room, and all four
// multi-candidate cases (all lesser / all greater / all equal / mixed).
func TestFindAddTarget(t *testing.T) {
	// testcase builds a subtest around a fixed centroid set; countTotal
	// is kept consistent with the centroid counts so weightLimit
	// computes sensible limits (compression is 1 throughout).
	testcase := func(in []*centroid, val float64, want int) func(*testing.T) {
		return func(t *testing.T) {
			d := TDigest{centroids: in, compression: 1}
			for _, c := range in {
				d.countTotal += c.count
			}
			have := d.findAddTarget(val)
			if have != want {
				t.Errorf("TDigest.findAddTarget wrong have=%v, want=%v", have, want)
			}
		}
	}
	t.Run("empty digest", testcase(nil, 1, -1))
	t.Run("exactly one with room", testcase(
		[]*centroid{{0.0, 1}, {1.0, 1}, {2.0, 1}},
		1, 1))
	t.Run("exactly one without room", testcase(
		[]*centroid{{0.0, 1}, {1.0, 3}, {2.0, 1}},
		1, -1))
	t.Run("multiple candidates", func(t *testing.T) {
		t.Run("all lesser", func(t *testing.T) {
			t.Run("with room", testcase(
				[]*centroid{{0.0, 1}, {1.0, 1}, {1.0, 3}, {2.0, 1}},
				1.1, 2))
			t.Run("without room", testcase(
				[]*centroid{{0.0, 1}, {1.0, 1}, {1.0, 4}, {2.0, 1}},
				1.1, -1))
		})
		t.Run("all greater", func(t *testing.T) {
			t.Run("with room", testcase(
				[]*centroid{{0.0, 1}, {1.0, 1}, {1.0, 3}, {2.0, 1}},
				0.9, 1))
			t.Run("without room", testcase(
				[]*centroid{{0.0, 1}, {1.0, 3}, {1.0, 4}, {2.0, 1}},
				0.9, -1))
		})
		t.Run("all equal", func(t *testing.T) {
			t.Run("with room in none", testcase(
				[]*centroid{{0.0, 1}, {1.0, 3}, {1.0, 3}, {2.0, 1}},
				1.0, -1))
			t.Run("with room in one", testcase(
				[]*centroid{{0.0, 1}, {1.0, 2}, {1.0, 3}, {2.0, 1}},
				1.0, 1))
			t.Run("with room in multiple", func(t *testing.T) {
				// the pick is random among eligible centroids, so
				// accept either index.
				d := TDigest{
					centroids:   []*centroid{{0.0, 1}, {1.0, 1}, {1.0, 2}, {2.0, 1}},
					compression: 1,
				}
				for _, c := range d.centroids {
					d.countTotal += c.count
				}
				have := d.findAddTarget(1.0)
				if have != 1 && have != 2 {
					t.Errorf("TDigest.findAddTarget wrong have=%v, want=1 or 2", have)
				}
			})
		})
		t.Run("both greater and lesser", func(t *testing.T) {
			t.Run("with room below", testcase(
				[]*centroid{{0.0, 1}, {0.8, 1}, {0.8, 1}, {1.0, 6}, {1.0, 1}, {2.0, 1}},
				0.9, 2))
			t.Run("with room above", testcase(
				[]*centroid{{0.0, 1}, {0.8, 1}, {0.8, 6}, {1.0, 1}, {1.0, 1}, {2.0, 1}},
				0.9, 3))
			t.Run("with no room", testcase(
				[]*centroid{{0.0, 1}, {0.8, 6}, {0.8, 6}, {1.0, 6}, {1.0, 1}, {2.0, 1}},
				0.9, -1))
			t.Run("with room above and below", func(t *testing.T) {
				// random pick between the straddling pair: accept 2 or 3.
				d := TDigest{
					centroids: []*centroid{
						{0.0, 1}, {0.8, 1}, {0.8, 1},
						{1.0, 1}, {1.0, 1}, {2.0, 1}},
					compression: 1,
				}
				for _, c := range d.centroids {
					d.countTotal += c.count
				}
				have := d.findAddTarget(0.9)
				if have != 2 && have != 3 {
					t.Errorf("TDigest.findAddTarget wrong have=%v, want=2 or 3", have)
				}
			})
		})
	})
}

// adding a new centroid should maintain sorted order
func TestAddNewCentroid(t *testing.T) {
	type testcase struct {
		centroidVals []float64
		add          float64
		want         []float64
	}
	testcases := []testcase{
		{[]float64{}, 1, []float64{1}},
		{[]float64{1}, 2, []float64{1, 2}},
		{[]float64{1, 2}, 1.5, []float64{1, 1.5, 2}},
		{[]float64{1, 1.5, 2}, -1, []float64{-1, 1, 1.5, 2}},
		{[]float64{1, 1.5, 2}, 3, []float64{1, 1.5, 2, 3}},
		{[]float64{1, 1.5, 2}, 1.6, []float64{1, 1.5, 1.6, 2}},
	}

	for i, tc := range testcases {
		d := tdFromMeans(tc.centroidVals)
		d.addNewCentroid(tc.add, 1)

		// collect the resulting means to compare against the expected
		// sorted order
		have := make([]float64, len(d.centroids))
		for i, c := range d.centroids {
			have[i] = c.mean
		}

		if !reflect.DeepEqual(tc.want, have) {
			t.Errorf("TDigest.addNewCentroid wrong test=%d, have=%v, want=%v", i, have, tc.want)
		}
	}
}

// verifyCentroidOrder fails the test if cs's centroids are not sorted
// by mean (ascending).
func verifyCentroidOrder(t *testing.T, cs *TDigest) {
	if len(cs.centroids) < 2 {
		return
	}
	last := cs.centroids[0]
	for i, c := range cs.centroids[1:] {
		if c.mean < last.mean {
			t.Errorf("centroid %d lt %d: %v < %v", i+1, i, c.mean, last.mean)
		}
		last = c
	}
}

// TestQuantileOrder is a regression test: Add must never break the
// sorted-by-mean invariant of the centroid list.
func TestQuantileOrder(t *testing.T) {
	// stumbled upon in real world application: adding a 1 to this
	// resulted in the 6th centroid getting incremented instead of the
	// 7th.
	d := &TDigest{
		countTotal:  14182,
		compression: 100,
		centroids: []*centroid{
			&centroid{0.000000, 1},
			&centroid{0.000000, 564},
			&centroid{0.000000, 1140},
			&centroid{0.000000, 1713},
			&centroid{0.000000, 2380},
			&centroid{0.000000, 2688},
			&centroid{0.000000, 1262},
			&centroid{2.005758, 1563},
			&centroid{30.499251, 1336},
			&centroid{381.533509, 761},
			&centroid{529.600000, 5},
			&centroid{1065.294118, 17},
			&centroid{2266.444444, 36},
			&centroid{4268.809783, 368},
			&centroid{14964.148148, 27},
			&centroid{41024.579618, 157},
			&centroid{124311.192308, 52},
			&centroid{219674.636364, 22},
			&centroid{310172.775000, 40},
			&centroid{412388.642857, 14},
			&centroid{582867.000000, 16},
			&centroid{701434.777778, 9},
			&centroid{869363.800000, 5},
			&centroid{968264.000000, 1},
			&centroid{987100.666667, 3},
			&centroid{1029895.000000, 1},
			&centroid{1034640.000000, 1},
		},
	}
	d.Add(1.0, 1)
	verifyCentroidOrder(t, d)
}

// TestQuantile pins quantileOf's arithmetic — including its integer
// halving of the centroid count — against hand-computed values.
func TestQuantile(t *testing.T) {
	type testcase struct {
		weights []int64
		idx     int
		want    float64
	}
	testcases := []testcase{
		{[]int64{1, 1, 1, 1}, 0, 0.0},
		{[]int64{1, 1, 1, 1}, 1, 0.25},
		{[]int64{1, 1, 1, 1}, 2, 0.5},
		{[]int64{1, 1, 1, 1}, 3, 0.75},

		{[]int64{5, 1, 1, 1}, 0, 0.250},
		{[]int64{5, 1, 1, 1}, 1, 0.625},
		{[]int64{5, 1, 1, 1}, 2, 0.750},
		{[]int64{5, 1, 1, 1}, 3, 0.875},

		{[]int64{1, 1, 1, 5}, 0, 0.0},
		{[]int64{1, 1, 1, 5}, 1, 0.125},
		{[]int64{1, 1, 1, 5}, 2, 0.250},
		{[]int64{1, 1, 1, 5}, 3, 0.625},
	}

	for i, tc := range testcases {
		d := tdFromWeights(tc.weights)
		have := d.quantileOf(tc.idx)
		if have != tc.want {
			t.Errorf("TDigest.quantile wrong test=%d, have=%.3f, want=%.3f", i, have, tc.want)
		}
	}
}

// TestAddValue feeds a fixed sequence of values (including NaN and
// ±Inf, which must be ignored) through Add and checks the exact
// centroid state after every step.
func TestAddValue(t *testing.T) {
	type testcase struct {
		value float64
		weight int
		want   []*centroid
	}

	testcases := []testcase{
		{1.0, 1, []*centroid{{1, 1}}},
		{0.0, 1, []*centroid{{0, 1}, {1, 1}}},
		{2.0, 1, []*centroid{{0, 1}, {1, 1}, {2, 1}}},
		{3.0, 1, []*centroid{{0, 1}, {1, 1}, {2.5, 2}}},
		{4.0, 1, []*centroid{{0, 1}, {1, 1}, {2.5, 2}, {4, 1}}},
		// NaN and infinities must be silently ignored: state unchanged.
		{math.NaN(), 1, []*centroid{{0, 1}, {1, 1}, {2.5, 2}, {4, 1}}},
		{math.Inf(-1), 1, []*centroid{{0, 1}, {1, 1}, {2.5, 2}, {4, 1}}},
		{math.Inf(+1), 1, []*centroid{{0, 1}, {1, 1}, {2.5, 2}, {4, 1}}},
	}

	d := NewWithCompression(1)
	for i, tc := range testcases {
		d.Add(tc.value, tc.weight)
		if !reflect.DeepEqual(d.centroids, tc.want) {
			t.Fatalf("TDigest.addValue unexpected state step=%d, have=%v, want=%v", i, d.centroids, tc.want)
		}
	}
}

// TestQuantileValue checks interpolation in Quantile against values
// worked out by hand for a fixed centroid set, covering both
// extrapolation special cases and the common between-centroids case.
func TestQuantileValue(t *testing.T) {
	d := NewWithCompression(1)
	d.countTotal = 8
	d.centroids = []*centroid{{0.5, 3}, {1, 1}, {2, 2}, {3, 1}, {8, 1}}

	type testcase struct {
		q    float64
		want float64
	}

	// correct values, determined by hand with pen and paper for this set of centroids
	testcases := []testcase{
		{0.0, 5.0 / 40.0},
		{0.1, 13.0 / 40.0},
		{0.2, 21.0 / 40.0},
		{0.3, 29.0 / 40.0},
		{0.4, 37.0 / 40.0},
		{0.5, 20.0 / 15.0},
		{0.6, 28.0 / 15.0},
		{0.7, 36.0 / 15.0},
		{0.8, 44.0 / 15.0},
		{0.9, 13.0 / 2.0},
		{1.0, 21.0 / 2.0},
	}

	var epsilon = 1e-8

	for i, tc := range testcases {
		have := d.Quantile(tc.q)
		if math.Abs(have-tc.want) > epsilon {
			t.Errorf("TDigest.Quantile wrong step=%d, have=%v, want=%v",
				i, have, tc.want)
		}
	}
}

// BenchmarkFindAddTarget measures findAddTarget on a 500-value digest.
func BenchmarkFindAddTarget(b *testing.B) {
	n := 500
	d := simpleTDigest(n)

	b.ResetTimer()
	var val float64
	for i := int64(0); i < int64(b.N); i++ {
		val = float64(i % d.countTotal)
		_ = d.findAddTarget(val)
	}
}

// add the values [0,n) to a centroid set, equal weights
func simpleTDigest(n int) *TDigest {
	d := NewWithCompression(1.0)
	for i := 0; i < n; i++ {
		d.Add(float64(i), 1)
	}
	return d
}

// tdFromMeans builds a digest directly from a sorted list of means,
// one unit of weight per centroid (bypasses Add).
func tdFromMeans(means []float64) *TDigest {
	centroids := make([]*centroid, len(means))
	for i, m := range means {
		centroids[i] = &centroid{m, 1}
	}
	d := NewWithCompression(1.0)
	d.centroids = centroids
	d.countTotal = int64(len(centroids))
	return d
}

// tdFromWeights builds a digest with centroid means 0,1,2,... and the
// given weights (bypasses Add).
func tdFromWeights(weights []int64) *TDigest {
	centroids := make([]*centroid, len(weights))
	countTotal := int64(0)
	for i, w := range weights {
		centroids[i] = &centroid{float64(i), w}
		countTotal += w
	}
	d := NewWithCompression(1.0)
	d.centroids = centroids
	d.countTotal = countTotal
	return d
}

// ExampleTDigest demonstrates typical usage.
// NOTE(review): there is no "// Output:" comment, so go test compiles
// this example but does not verify its printed output.
func ExampleTDigest() {
	rand.Seed(5678)
	values := make(chan float64)

	// Generate 100k uniform random data between 0 and 100
	var (
		n        int     = 100000
		min, max float64 = 0, 100
	)
	go func() {
		for i := 0; i < n; i++ {
			values <- min + rand.Float64()*(max-min)
		}
		close(values)
	}()

	// Pass the values through a TDigest, compression parameter 100
	td := New()

	for val := range values {
		// Add the value with weight 1
		td.Add(val, 1)
	}

	// Print the 50th, 90th, 99th, 99.9th, and 99.99th percentiles
	fmt.Printf("50th: %.5f\n", td.Quantile(0.5))
	fmt.Printf("90th: %.5f\n", td.Quantile(0.9))
	fmt.Printf("99th: %.5f\n", td.Quantile(0.99))
	fmt.Printf("99.9th: %.5f\n", td.Quantile(0.999))
	fmt.Printf("99.99th: %.5f\n", td.Quantile(0.9999))
}

// TestMerge splits a uniform stream across two digests, merges both
// into a third, and logs the resulting quantiles for inspection.
// NOTE(review): this test only logs — it asserts nothing about the
// merged quantiles.
func TestMerge(t *testing.T) {
	values := make(chan float64)

	// Generate 100k uniform random data between 0 and 100
	var (
		n        int     = 100000
		min, max float64 = 0, 100
	)
	go func() {
		for i := 0; i < n; i++ {
			values <- min + rand.Float64()*(max-min)
		}
		close(values)
	}()

	// Pass the values through two TDigests
	td1 := New()
	td2 := New()

	i := 0
	for val := range values {
		// Add the value with weight 1. Alternate between the digests.
		if i%2 == 0 {
			td1.Add(val, 1)
		} else {
			td2.Add(val, 1)
		}
		i += 1
	}

	rand.Seed(2)
	// merge both into a third tdigest.
	td := New()
	td1.MergeInto(td)
	td2.MergeInto(td)
	t.Logf("10th: %.5f\n", td1.Quantile(0.1))
	t.Logf("50th: %.5f\n", td1.Quantile(0.5))
	t.Logf("90th: %.5f\n", td1.Quantile(0.9))
	t.Logf("99th: %.5f\n", td1.Quantile(0.99))
	t.Logf("99.9th: %.5f\n", td1.Quantile(0.999))
	t.Logf("99.99th: %.5f\n", td1.Quantile(0.9999))

	t.Logf("10th: %.5f\n", td2.Quantile(0.1))
	t.Logf("50th: %.5f\n", td2.Quantile(0.5))
	t.Logf("90th: %.5f\n", td2.Quantile(0.9))
	t.Logf("99th: %.5f\n", td2.Quantile(0.99))
	t.Logf("99.9th: %.5f\n", td2.Quantile(0.999))
	t.Logf("99.99th: %.5f\n", td2.Quantile(0.9999))

	t.Logf("10th: %.5f\n", td.Quantile(0.1))
	t.Logf("50th: %.5f\n", td.Quantile(0.5))
	t.Logf("90th: %.5f\n", td.Quantile(0.9))
	t.Logf("99th: %.5f\n", td.Quantile(0.99))
	t.Logf("99.9th: %.5f\n", td.Quantile(0.999))
	t.Logf("99.99th: %.5f\n", td.Quantile(0.9999))
}
--------------------------------------------------------------------------------