├── .github └── workflows │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── benchmarks_test.go ├── docs └── compression_benchmark.png ├── fuzz_test.go ├── go.mod ├── go.sum ├── serde.go ├── serde_test.go ├── tdigest.go └── tdigest_test.go /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | on: [push] 2 | name: Unit tests 3 | jobs: 4 | test: 5 | strategy: 6 | matrix: 7 | go-version: [1.16.x, 1.17.x] 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Install Go 11 | uses: actions/setup-go@v2 12 | with: 13 | go-version: ${{ matrix.go-version }} 14 | - name: Checkout code 15 | uses: actions/checkout@v2 16 | - name: Run tests 17 | run: go test ./... 18 | 19 | fuzz: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: Install Go 23 | uses: actions/setup-go@v2 24 | with: 25 | stable: 'false' 26 | go-version: 1.18.0-beta1 27 | - name: Checkout code 28 | uses: actions/checkout@v2 29 | - name: Run fuzzing tests 30 | run: go test -fuzz Fuzz -fuzztime 60s ./... 
31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.test 2 | 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Spencer Nelson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Archived Status # 2 | 3 | **I don't plan on making updates to this package. 
I recommend using https://github.com/influxdata/tdigest instead.** 4 | 5 | --- 6 | 7 | # tdigest # 8 | [![GoDoc](https://godoc.org/github.com/spenczar/tdigest?status.svg)](https://godoc.org/github.com/spenczar/tdigest) [![Build Status](https://travis-ci.org/spenczar/tdigest.svg)](https://travis-ci.org/spenczar/tdigest) 9 | 10 | This is a Go implementation of Ted Dunning's 11 | [t-digest](https://github.com/tdunning/t-digest), which is a clever 12 | data structure/algorithm for computing approximate quantiles of a 13 | stream of data. 14 | 15 | You should use this if you want to efficiently compute extreme rank 16 | statistics of a large stream of data, like the 99.9th percentile. 17 | 18 | ## Usage ## 19 | 20 | An example is available in the Godoc which shows the API: 21 | 22 | ```go 23 | func ExampleTDigest() { 24 | rand.Seed(5678) 25 | values := make(chan float64) 26 | 27 | // Generate 100k uniform random data between 0 and 100 28 | var ( 29 | n int = 100000 30 | min, max float64 = 0, 100 31 | ) 32 | go func() { 33 | for i := 0; i < n; i++ { 34 | values <- min + rand.Float64()*(max-min) 35 | } 36 | close(values) 37 | }() 38 | 39 | // Pass the values through a TDigest. 40 | td := New() 41 | 42 | for val := range values { 43 | // Add the value with weight 1 44 | td.Add(val, 1) 45 | } 46 | 47 | // Print the 50th, 90th, 99th, 99.9th, and 99.99th percentiles 48 | fmt.Printf("50th: %.5f\n", td.Quantile(0.5)) 49 | fmt.Printf("90th: %.5f\n", td.Quantile(0.9)) 50 | fmt.Printf("99th: %.5f\n", td.Quantile(0.99)) 51 | fmt.Printf("99.9th: %.5f\n", td.Quantile(0.999)) 52 | fmt.Printf("99.99th: %.5f\n", td.Quantile(0.9999)) 53 | // Output: 54 | // 50th: 48.74854 55 | // 90th: 89.79825 56 | // 99th: 98.92954 57 | // 99.9th: 99.90189 58 | // 99.99th: 99.98740 59 | } 60 | ``` 61 | 62 | ## Algorithm ## 63 | 64 | For example, in the Real World, the stream of data might be *service 65 | timings*, measuring how long a server takes to respond to clients. 
You 66 | can feed this stream of data through a t-digest and get out 67 | approximations of any quantile you like: the 50th percentile or 95th 68 | percentile or 99th or 99.99th or 28.31th are all computable. 69 | 70 | Exact quantiles would require that you hold all the data in memory, 71 | but the t-digest can hold a small fraction - often just a few 72 | kilobytes to represent many millions of datapoints. Measurements of 73 | the compression ratio show that compression improves super-linearly as 74 | more datapoints are fed into the t-digest. 75 | 76 | How good are the approximations? Well, it depends, but they tend to be 77 | quite good, especially out towards extreme percentiles like the 99th 78 | or 99.9th; Ted Dunning found errors of just a few parts per million at 79 | the 99.9th and 0.1th percentiles. 80 | 81 | Error will be largest in the middle - the median is the least accurate 82 | point in the t-digest. 83 | 84 | The actual precision can be controlled with the `compression` 85 | parameter passed to the constructor function `NewWithCompression` in 86 | this package. Lower `compression` parameters will result in poorer 87 | compression, but will improve performance in estimating quantiles. If 88 | you care deeply about tuning such things, experiment with the 89 | compression ratio. 90 | 91 | ## Benchmarks ## 92 | 93 | Data compresses well, with compression ratios of around 20 for small 94 | datasets (1k datapoints) and 500 for largeish ones (1M 95 | datapoints). The precise compression ratio depends a bit on your 96 | data's distribution - exponential data does well, while ordered data 97 | does poorly: 98 | 99 | ![compression benchmark](docs/compression_benchmark.png) 100 | 101 | In general, adding a datapoint takes about 1 to 4 microseconds on my 102 | 2014 Macbook Pro. This is fast enough for many purposes, but if you 103 | have any concern, you should just run the benchmarks on your targeted 104 | syste. You can do that with `go test -bench . ./...`. 
105 | 106 | Quantiles are very, very quick to calculate, and typically take tens 107 | of nanoseconds. They might take up to a few hundred nanoseconds for 108 | large, poorly compressed (read: ordered) datasets, but in general, you 109 | don't have to worry about the speed of calls to Quantile. 110 | -------------------------------------------------------------------------------- /benchmarks_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "math/rand" 5 | "testing" 6 | ) 7 | 8 | const rngSeed = 1234567 9 | 10 | type valueSource interface { 11 | Next() float64 12 | } 13 | 14 | func benchmarkAdd(b *testing.B, n int, src valueSource) { 15 | valsToAdd := make([]float64, n) 16 | 17 | d := NewWithCompression(100) 18 | for i := 0; i < n; i++ { 19 | v := src.Next() 20 | valsToAdd[i] = v 21 | d.Add(v, 1) 22 | } 23 | 24 | b.ResetTimer() 25 | for i := 0; i < b.N; i++ { 26 | d.Add(valsToAdd[i%n], 1) 27 | } 28 | b.StopTimer() 29 | } 30 | 31 | func benchmarkQuantile(b *testing.B, n int, src valueSource) { 32 | quantilesToCheck := make([]float64, n) 33 | 34 | d := NewWithCompression(100) 35 | for i := 0; i < n; i++ { 36 | v := src.Next() 37 | quantilesToCheck[i] = v 38 | d.Add(v, 1) 39 | } 40 | 41 | b.ResetTimer() 42 | for i := 0; i < b.N; i++ { 43 | _ = d.Quantile(quantilesToCheck[i%n]) 44 | } 45 | b.StopTimer() 46 | } 47 | 48 | type orderedValues struct { 49 | last float64 50 | } 51 | 52 | func (ov *orderedValues) Next() float64 { 53 | ov.last += 1 54 | return ov.last 55 | } 56 | 57 | func BenchmarkAdd_1k_Ordered(b *testing.B) { 58 | benchmarkAdd(b, 1000, &orderedValues{}) 59 | } 60 | 61 | func BenchmarkAdd_10k_Ordered(b *testing.B) { 62 | benchmarkAdd(b, 10000, &orderedValues{}) 63 | } 64 | 65 | func BenchmarkAdd_100k_Ordered(b *testing.B) { 66 | benchmarkAdd(b, 100000, &orderedValues{}) 67 | } 68 | 69 | func BenchmarkQuantile_1k_Ordered(b *testing.B) { 70 | benchmarkQuantile(b, 1000, 
&orderedValues{}) 71 | } 72 | 73 | func BenchmarkQuantile_10k_Ordered(b *testing.B) { 74 | benchmarkQuantile(b, 10000, &orderedValues{}) 75 | } 76 | 77 | func BenchmarkQuantile_100k_Ordered(b *testing.B) { 78 | benchmarkQuantile(b, 100000, &orderedValues{}) 79 | } 80 | 81 | type zipfValues struct { 82 | z *rand.Zipf 83 | } 84 | 85 | func newZipfValues() *zipfValues { 86 | r := rand.New(rand.NewSource(rngSeed)) 87 | z := rand.NewZipf(r, 1.2, 1, 1024*1024) 88 | return &zipfValues{ 89 | z: z, 90 | } 91 | } 92 | 93 | func (zv *zipfValues) Next() float64 { 94 | return float64(zv.z.Uint64()) 95 | } 96 | 97 | func BenchmarkAdd_1k_Zipfian(b *testing.B) { 98 | benchmarkAdd(b, 1000, newZipfValues()) 99 | } 100 | 101 | func BenchmarkAdd_10k_Zipfian(b *testing.B) { 102 | benchmarkAdd(b, 10000, newZipfValues()) 103 | } 104 | 105 | func BenchmarkAdd_100k_Zipfian(b *testing.B) { 106 | benchmarkAdd(b, 100000, newZipfValues()) 107 | } 108 | 109 | func BenchmarkQuantile_1k_Zipfian(b *testing.B) { 110 | benchmarkQuantile(b, 1000, newZipfValues()) 111 | } 112 | 113 | func BenchmarkQuantile_10k_Zipfian(b *testing.B) { 114 | benchmarkQuantile(b, 10000, newZipfValues()) 115 | } 116 | 117 | func BenchmarkQuantile_100k_Zipfian(b *testing.B) { 118 | benchmarkQuantile(b, 100000, newZipfValues()) 119 | } 120 | 121 | type uniformValues struct { 122 | r *rand.Rand 123 | } 124 | 125 | func newUniformValues() *uniformValues { 126 | return &uniformValues{rand.New(rand.NewSource(rngSeed))} 127 | } 128 | 129 | func (uv *uniformValues) Next() float64 { 130 | return uv.r.Float64() 131 | } 132 | 133 | func BenchmarkAdd_1k_Uniform(b *testing.B) { 134 | benchmarkAdd(b, 1000, newUniformValues()) 135 | } 136 | 137 | func BenchmarkAdd_10k_Uniform(b *testing.B) { 138 | benchmarkAdd(b, 10000, newUniformValues()) 139 | } 140 | 141 | func BenchmarkAdd_100k_Uniform(b *testing.B) { 142 | benchmarkAdd(b, 100000, newUniformValues()) 143 | } 144 | 145 | func BenchmarkQuantile_1k_Uniform(b *testing.B) { 146 | 
benchmarkQuantile(b, 1000, newUniformValues()) 147 | } 148 | 149 | func BenchmarkQuantile_10k_Uniform(b *testing.B) { 150 | benchmarkQuantile(b, 10000, newUniformValues()) 151 | } 152 | 153 | func BenchmarkQuantile_100k_Uniform(b *testing.B) { 154 | benchmarkQuantile(b, 100000, newUniformValues()) 155 | } 156 | 157 | type normalValues struct { 158 | r *rand.Rand 159 | } 160 | 161 | func newNormalValues() *normalValues { 162 | return &normalValues{rand.New(rand.NewSource(rngSeed))} 163 | } 164 | 165 | func (uv *normalValues) Next() float64 { 166 | return uv.r.NormFloat64() 167 | } 168 | 169 | func BenchmarkAdd_1k_Normal(b *testing.B) { 170 | benchmarkAdd(b, 1000, newNormalValues()) 171 | } 172 | 173 | func BenchmarkAdd_10k_Normal(b *testing.B) { 174 | benchmarkAdd(b, 10000, newNormalValues()) 175 | } 176 | 177 | func BenchmarkAdd_100k_Normal(b *testing.B) { 178 | benchmarkAdd(b, 100000, newNormalValues()) 179 | } 180 | 181 | func BenchmarkQuantile_1k_Normal(b *testing.B) { 182 | benchmarkQuantile(b, 1000, newNormalValues()) 183 | } 184 | 185 | func BenchmarkQuantile_10k_Normal(b *testing.B) { 186 | benchmarkQuantile(b, 10000, newNormalValues()) 187 | } 188 | 189 | func BenchmarkQuantile_100k_Normal(b *testing.B) { 190 | benchmarkQuantile(b, 100000, newNormalValues()) 191 | } 192 | -------------------------------------------------------------------------------- /docs/compression_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spenczar/tdigest/b766947010a67d71100f172226d6023e56ec9f3c/docs/compression_benchmark.png -------------------------------------------------------------------------------- /fuzz_test.go: -------------------------------------------------------------------------------- 1 | //go:build go1.18 2 | // +build go1.18 3 | 4 | package tdigest 5 | 6 | import ( 7 | "bytes" 8 | "testing" 9 | ) 10 | 11 | // Past cases that revealed panics. 
12 | var fuzzFailures = [][]byte{ 13 | []byte{ 14 | 0x01, 0x00, 0x00, 0x00, 0x30, 0x30, 0x30, 0x30, 15 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 16 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 17 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0xfc, 18 | }, 19 | []byte{ 20 | 0x01, 0x00, 0x00, 0x00, 0xdb, 0x46, 0x5f, 0xbd, 21 | 0xdb, 0x46, 0x00, 0xbd, 0xe0, 0xdf, 0xca, 0xab, 22 | 0x37, 0x31, 0x37, 0x32, 0x37, 0x33, 0x37, 0x34, 23 | 0x37, 0x35, 0x37, 0x36, 0x37, 0x37, 0x37, 0x38, 24 | 0x37, 0x39, 0x28, 25 | }, 26 | []byte{ 27 | 0x80, 0x0c, 0x01, 0x00, 0x00, 0x00, 0x30, 0x30, 28 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x02, 0x00, 29 | 0x00, 0x00, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 30 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 31 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 32 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 33 | 0x30, 0xbf, 34 | }, 35 | []byte{ 36 | 0x80, 0x0c, 0x01, 0x00, 0x00, 0x00, 0x30, 0x30, 37 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x02, 0x00, 38 | 0x00, 0x00, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 39 | 0x30, 0x63, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 40 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 41 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 42 | 0x30, 0x4e, 43 | }, 44 | []byte{ 45 | 0x80, 0x0c, 0x01, 0x00, 0x00, 0x00, 0x30, 0x30, 46 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x02, 0x00, 47 | 0x00, 0x00, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 48 | 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 49 | 0x30, 0x00, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 50 | 0x30, 0x00, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 51 | 0x92, 0x00, 52 | }, 53 | } 54 | 55 | func FuzzRoundTrip(f *testing.F) { 56 | 57 | for _, data := range fuzzFailures { 58 | f.Add(data) 59 | } 60 | f.Fuzz(func(t *testing.T, data []byte) { 61 | v := new(TDigest) 62 | err := v.UnmarshalBinary(data) 63 | if err != nil { 64 | // Input is not valid; skip it. 
65 | t.Skip() 66 | } 67 | 68 | t.Logf("input: %v", data) 69 | remarshaled, err := v.MarshalBinary() 70 | if err != nil { 71 | t.Fatalf("marshal error for valid data: %v", err) 72 | } 73 | 74 | if !bytes.HasPrefix(data, remarshaled) { 75 | t.Logf("tdigest: %s", v.debugStr()) 76 | t.Fatal("remarshaling does not round-trip") 77 | } 78 | 79 | for q := float64(0.1); q <= 1.0; q += 0.05 { 80 | prev, this := v.Quantile(q-0.1), v.Quantile(q) 81 | if prev-this > 1e-100 { // Floating point math makes this slightly imprecise. 82 | t.Logf("tdigest: %s", v.debugStr()) 83 | t.Logf("q: %v", q) 84 | t.Logf("prev: %v", prev) 85 | t.Logf("this: %v", this) 86 | t.Fatal("quantiles should only increase") 87 | } 88 | } 89 | v.Add(1, 1) 90 | 91 | }) 92 | } 93 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/spenczar/tdigest/v2 2 | 3 | go 1.17 4 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spenczar/tdigest/b766947010a67d71100f172226d6023e56ec9f3c/go.sum -------------------------------------------------------------------------------- /serde.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "fmt" 7 | "io" 8 | "math" 9 | ) 10 | 11 | const ( 12 | magic = int16(0xc80) 13 | encodingVersion = int32(1) 14 | ) 15 | 16 | func marshalBinary(d *TDigest) ([]byte, error) { 17 | buf := bytes.NewBuffer(nil) 18 | w := &binaryBufferWriter{buf: buf} 19 | w.writeValue(magic) 20 | w.writeValue(encodingVersion) 21 | w.writeValue(d.compression) 22 | w.writeValue(int32(len(d.centroids))) 23 | for _, c := range d.centroids { 24 | w.writeValue(c.count) 25 | w.writeValue(c.mean) 26 | } 27 | 28 | if w.err 
!= nil { 29 | return nil, w.err 30 | } 31 | return buf.Bytes(), nil 32 | } 33 | 34 | func unmarshalBinary(d *TDigest, p []byte) error { 35 | var ( 36 | mv int16 37 | ev int32 38 | n int32 39 | ) 40 | r := &binaryReader{r: bytes.NewReader(p)} 41 | r.readValue(&mv) 42 | if r.err != nil { 43 | return r.err 44 | } 45 | if mv != magic { 46 | return fmt.Errorf("data corruption detected: invalid header magic value 0x%04x", mv) 47 | } 48 | r.readValue(&ev) 49 | if r.err != nil { 50 | return r.err 51 | } 52 | if ev != encodingVersion { 53 | return fmt.Errorf("data corruption detected: invalid encoding version %d", ev) 54 | } 55 | r.readValue(&d.compression) 56 | r.readValue(&n) 57 | if r.err != nil { 58 | return r.err 59 | } 60 | if n < 0 { 61 | return fmt.Errorf("data corruption detected: number of centroids cannot be negative, have %v", n) 62 | 63 | } 64 | if n > 1<<20 { 65 | return fmt.Errorf("invalid n, cannot be greater than 2^20: %v", n) 66 | } 67 | d.centroids = make([]*centroid, int(n)) 68 | for i := 0; i < int(n); i++ { 69 | c := new(centroid) 70 | r.readValue(&c.count) 71 | r.readValue(&c.mean) 72 | if r.err != nil { 73 | return r.err 74 | } 75 | if c.count < 0 { 76 | return fmt.Errorf("data corruption detected: negative count: %d", c.count) 77 | } 78 | if math.IsNaN(c.mean) { 79 | return fmt.Errorf("data corruption detected: NaN mean not permitted") 80 | } 81 | if math.IsInf(c.mean, 0) { 82 | return fmt.Errorf("data corruption detected: Inf mean not permitted") 83 | } 84 | if i > 0 { 85 | prev := d.centroids[i-1] 86 | if c.mean < prev.mean { 87 | return fmt.Errorf("data corruption detected: centroid %d has lower mean (%v) than preceding centroid %d (%v)", i, c.mean, i-1, prev.mean) 88 | } 89 | } 90 | d.centroids[i] = c 91 | if c.count > math.MaxInt64-d.countTotal { 92 | return fmt.Errorf("data corruption detected: centroid total size overflow") 93 | } 94 | d.countTotal += c.count 95 | } 96 | 97 | if n := r.r.Len(); n > 0 { 98 | return fmt.Errorf("found %d 
unexpected bytes trailing the tdigest", n) 99 | } 100 | 101 | return nil 102 | } 103 | 104 | type binaryBufferWriter struct { 105 | buf *bytes.Buffer 106 | err error 107 | } 108 | 109 | func (w *binaryBufferWriter) writeValue(v interface{}) { 110 | if w.err != nil { 111 | return 112 | } 113 | w.err = binary.Write(w.buf, binary.LittleEndian, v) 114 | } 115 | 116 | type binaryReader struct { 117 | r *bytes.Reader 118 | err error 119 | } 120 | 121 | func (r *binaryReader) readValue(v interface{}) { 122 | if r.err != nil { 123 | return 124 | } 125 | r.err = binary.Read(r.r, binary.LittleEndian, v) 126 | if r.err == io.EOF { 127 | r.err = io.ErrUnexpectedEOF 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /serde_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "errors" 5 | "io" 6 | "reflect" 7 | "testing" 8 | ) 9 | 10 | func TestMarshalRoundTrip(t *testing.T) { 11 | testcase := func(in *TDigest) func(*testing.T) { 12 | return func(t *testing.T) { 13 | b, err := in.MarshalBinary() 14 | if err != nil { 15 | t.Fatalf("MarshalBinary err: %v", err) 16 | } 17 | out := new(TDigest) 18 | err = out.UnmarshalBinary(b) 19 | if err != nil { 20 | t.Fatalf("UnmarshalBinary err: %v", err) 21 | } 22 | if !reflect.DeepEqual(in, out) { 23 | t.Errorf("marshaling round trip resulted in changes") 24 | t.Logf("in: %+v", in) 25 | t.Logf("out: %+v", out) 26 | } 27 | } 28 | } 29 | t.Run("empty", testcase(New())) 30 | t.Run("1 value", testcase(simpleTDigest(1))) 31 | t.Run("1000 values", testcase(simpleTDigest(1000))) 32 | 33 | d := New() 34 | d.Add(1, 1) 35 | d.Add(1, 1) 36 | d.Add(0, 1) 37 | t.Run("1, 1, 0 input", testcase(d)) 38 | } 39 | 40 | func TestUnmarshalErrors(t *testing.T) { 41 | testcase := func(in []byte, wantErr error) func(*testing.T) { 42 | return func(t *testing.T) { 43 | have := new(TDigest) 44 | err := unmarshalBinary(have, in) 45 | if err != 
nil { 46 | if wantErr == nil { 47 | t.Fatalf("unexpected unmarshal err: %v", err) 48 | } 49 | if err.Error() != wantErr.Error() { 50 | t.Fatalf("wrong error, want=%q, have=%q", wantErr.Error(), err.Error()) 51 | } else { 52 | return 53 | } 54 | } else if wantErr != nil { 55 | t.Fatalf("expected err=%q, got nil", wantErr.Error()) 56 | } 57 | } 58 | } 59 | t.Run("nil", testcase( 60 | nil, 61 | io.ErrUnexpectedEOF, 62 | )) 63 | t.Run("bad magic", testcase( 64 | []byte{ 65 | 0x80, 0x0d, 66 | }, 67 | errors.New("data corruption detected: invalid header magic value 0x0d80"), 68 | )) 69 | t.Run("incomplete encoding", testcase( 70 | []byte{ 71 | 0x80, 0x0c, 72 | 0x00, 73 | }, 74 | io.ErrUnexpectedEOF, 75 | )) 76 | t.Run("bad encoding", testcase( 77 | []byte{ 78 | 0x80, 0x0c, 79 | 0xFF, 0xFF, 0xFF, 0xFF, 80 | }, 81 | errors.New("data corruption detected: invalid encoding version -1"), 82 | )) 83 | t.Run("incomplete compression", testcase( 84 | []byte{ 85 | 0x80, 0x0c, 86 | 0x01, 0x00, 0x00, 0x00, 87 | 0x00, 0x00, 88 | }, 89 | io.ErrUnexpectedEOF, 90 | )) 91 | t.Run("incomplete n", testcase( 92 | []byte{ 93 | 0x80, 0x0c, 94 | 0x01, 0x00, 0x00, 0x00, 95 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 96 | 0x00, 97 | }, 98 | io.ErrUnexpectedEOF, 99 | )) 100 | t.Run("negative n", testcase( 101 | []byte{ 102 | 0x80, 0x0c, 103 | 0x01, 0x00, 0x00, 0x00, 104 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 105 | 0xFF, 0xFF, 0xFF, 0xFF, 106 | }, 107 | errors.New("data corruption detected: number of centroids cannot be negative, have -1"), 108 | )) 109 | t.Run("huge n", testcase( 110 | []byte{ 111 | 0x80, 0x0c, 112 | 0x01, 0x00, 0x00, 0x00, 113 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 114 | 0xFF, 0xFF, 0xFF, 0x7F, 115 | }, 116 | errors.New("invalid n, cannot be greater than 2^20: 2147483647"), 117 | )) 118 | t.Run("missing centroids", testcase( 119 | []byte{ 120 | 0x80, 0x0c, 121 | 0x01, 0x00, 0x00, 0x00, 122 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 123 | 0x01, 
0x00, 0x00, 0x00, 124 | }, 125 | io.ErrUnexpectedEOF, 126 | )) 127 | t.Run("partial centroid", testcase( 128 | []byte{ 129 | 0x80, 0x0c, 130 | 0x01, 0x00, 0x00, 0x00, 131 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 132 | 0x01, 0x00, 0x00, 0x00, 133 | 0x01, 0x00, 0x00, 0x00, 134 | }, 135 | io.ErrUnexpectedEOF, 136 | )) 137 | t.Run("negative count", testcase( 138 | []byte{ 139 | 0x80, 0x0c, 140 | 0x01, 0x00, 0x00, 0x00, 141 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 142 | 0x01, 0x00, 0x00, 0x00, 143 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 144 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 145 | }, 146 | errors.New("data corruption detected: negative count: -1"), 147 | )) 148 | t.Run("decreasing means", testcase( 149 | []byte{ 150 | 0x80, 0x0c, 151 | 0x01, 0x00, 0x00, 0x00, 152 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 153 | 0x02, 0x00, 0x00, 0x00, 154 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 155 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 156 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 157 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 158 | }, 159 | errors.New("data corruption detected: centroid 1 has lower mean (1) than preceding centroid 0 (2)"), 160 | )) 161 | t.Run("nan mean", testcase( 162 | []byte{ 163 | 0x80, 0x0c, 164 | 0x01, 0x00, 0x00, 0x00, 165 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 166 | 0x01, 0x00, 0x00, 0x00, 167 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 168 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 169 | }, 170 | errors.New("data corruption detected: NaN mean not permitted"), 171 | )) 172 | t.Run("+inf mean", testcase( 173 | []byte{ 174 | 0x80, 0x0c, 175 | 0x01, 0x00, 0x00, 0x00, 176 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 177 | 0x01, 0x00, 0x00, 0x00, 178 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 179 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x7F, 180 | }, 181 | errors.New("data corruption detected: Inf mean not permitted"), 182 | )) 183 | 
t.Run("-inf mean", testcase( 184 | []byte{ 185 | 0x80, 0x0c, 186 | 0x01, 0x00, 0x00, 0x00, 187 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 188 | 0x01, 0x00, 0x00, 0x00, 189 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 190 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xFF, 191 | }, 192 | errors.New("data corruption detected: Inf mean not permitted"), 193 | )) 194 | t.Run("total size overflow", testcase( 195 | []byte{ 196 | 0x80, 0x0c, 197 | 0x01, 0x00, 0x00, 0x00, 198 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 199 | 0x02, 0x00, 0x00, 0x00, 200 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 201 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 202 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 203 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 204 | }, 205 | errors.New("data corruption detected: centroid total size overflow"), 206 | )) 207 | t.Run("trailing bytes", testcase( 208 | []byte{ 209 | 0x80, 0x0c, 210 | 0x01, 0x00, 0x00, 0x00, 211 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 212 | 0x02, 0x00, 0x00, 0x00, 213 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 214 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 215 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 216 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 217 | 0x00, 218 | }, 219 | errors.New("found 1 unexpected bytes trailing the tdigest"), 220 | )) 221 | } 222 | 223 | func TestUnmarshal(t *testing.T) { 224 | testcase := func(in []byte, want *TDigest) func(*testing.T) { 225 | return func(t *testing.T) { 226 | have := new(TDigest) 227 | err := unmarshalBinary(have, in) 228 | if err != nil { 229 | t.Fatalf("unexpected unmarshal err: %v", err) 230 | } 231 | if !reflect.DeepEqual(have, want) { 232 | t.Error("unmarshal did not produce expected digest") 233 | t.Logf("want=%s", want.debugStr()) 234 | t.Logf("have=%s", have.debugStr()) 235 | } 236 | } 237 | } 238 | t.Run("no centroids", testcase( 239 | []byte{ 240 | 0x80, 0x0c, 241 | 0x01, 0x00, 0x00, 0x00, 242 | 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 243 | 0x00, 0x00, 0x00, 0x00, 244 | }, 245 | &TDigest{ 246 | centroids: make([]*centroid, 0), 247 | compression: 100, 248 | countTotal: 0, 249 | }, 250 | )) 251 | t.Run("one centroid", testcase( 252 | []byte{ 253 | 0x80, 0x0c, 254 | 0x01, 0x00, 0x00, 0x00, 255 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 256 | 0x01, 0x00, 0x00, 0x00, 257 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 258 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 259 | }, 260 | &TDigest{ 261 | centroids: []*centroid{ 262 | ¢roid{ 263 | count: 1, 264 | mean: 1, 265 | }, 266 | }, 267 | compression: 100, 268 | countTotal: 1, 269 | }, 270 | )) 271 | t.Run("two centroids", testcase( 272 | []byte{ 273 | 0x80, 0x0c, 274 | 0x01, 0x00, 0x00, 0x00, 275 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x40, 276 | 0x02, 0x00, 0x00, 0x00, 277 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 278 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, 279 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 280 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 281 | }, 282 | &TDigest{ 283 | centroids: []*centroid{ 284 | ¢roid{ 285 | count: 1, 286 | mean: 1, 287 | }, 288 | ¢roid{ 289 | count: 1, 290 | mean: 2, 291 | }, 292 | }, 293 | compression: 100, 294 | countTotal: 2, 295 | }, 296 | )) 297 | } 298 | -------------------------------------------------------------------------------- /tdigest.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "math/rand" 7 | ) 8 | 9 | // centroid is a simple container for a mean,count pair. 10 | type centroid struct { 11 | mean float64 12 | count int64 13 | } 14 | 15 | func (c *centroid) String() string { 16 | return fmt.Sprintf("c{%f x%d}", c.mean, c.count) 17 | } 18 | 19 | // A TDigest is an efficient data structure for computing streaming approximate 20 | // quantiles of a dataset. 
21 | type TDigest struct { 22 | centroids []*centroid 23 | compression float64 24 | countTotal int64 25 | } 26 | 27 | // New produces a new TDigest using the default compression level of 28 | // 100. 29 | func New() *TDigest { 30 | return NewWithCompression(100) 31 | } 32 | 33 | // NewWithCompression produces a new TDigest with a specific 34 | // compression level. The input compression value, which should be >= 35 | // 1.0, will control how aggressively the TDigest compresses data 36 | // together. 37 | // 38 | // The original TDigest paper suggests using a value of 100 for a good 39 | // balance between precision and efficiency. It will land at very 40 | // small (think like 1e-6 percentile points) errors at extreme points 41 | // in the distribution, and compression ratios of around 500 for large 42 | // data sets (1 millionish datapoints). 43 | func NewWithCompression(compression float64) *TDigest { 44 | return &TDigest{ 45 | centroids: make([]*centroid, 0), 46 | compression: compression, 47 | countTotal: 0, 48 | } 49 | } 50 | 51 | // Find the indexes of centroids which have the minimum distance to the 52 | // input value. 53 | // 54 | // TODO: Use a better data structure to avoid this loop. 55 | func (d *TDigest) nearest(val float64) []int { 56 | var ( 57 | nearestDist float64 = math.Inf(+1) 58 | thisDist float64 59 | delta float64 60 | result []int = make([]int, 0) 61 | ) 62 | for i, c := range d.centroids { 63 | thisDist = val - c.mean 64 | if thisDist < 0 { 65 | thisDist *= -1 66 | } 67 | 68 | delta = thisDist - nearestDist 69 | switch { 70 | case delta < 0: 71 | // we have a new winner! 
72 | nearestDist = thisDist 73 | result = result[0:0] // wipe result 74 | result = append(result, i) 75 | case delta == 0: 76 | // we have a tie 77 | result = append(result, i) 78 | default: 79 | // Since d.centroids is sorted by mean, this means we 80 | // have passed the best spot, so we may as well break 81 | break 82 | } 83 | } 84 | return result 85 | } 86 | 87 | // returns the maximum weight that can be placed at specified index 88 | func (d *TDigest) weightLimit(idx int) int64 { 89 | ptile := d.quantileOf(idx) 90 | limit := int64(4 * d.compression * ptile * (1 - ptile) * float64(len(d.centroids))) 91 | return limit 92 | } 93 | 94 | // checks whether the centroid has room for more weight 95 | func (d *TDigest) centroidHasRoom(idx int) bool { 96 | return d.centroids[idx].count < d.weightLimit(idx) 97 | } 98 | 99 | // find which centroid to add the value to (by index) 100 | func (d *TDigest) findAddTarget(val float64) int { 101 | nearest := d.nearest(val) 102 | // There could be no centroids yet, one centroid which is the 'nearest', or 103 | // multiple centroids that are equidistant. 104 | switch len(nearest) { 105 | case 0: 106 | // There are no centroids at all. Return -1, signaling that we should add a 107 | // new centroid. 108 | return -1 109 | case 1: 110 | // When there is exactly one centroid which is the 'nearest' one, return it 111 | // if it has room. 112 | if d.centroidHasRoom(nearest[0]) { 113 | return nearest[0] 114 | } 115 | return -1 116 | default: 117 | // Multiple eligible centroids to add to. They must be equidistant 118 | // from this value. Four cases are possible: 119 | // 120 | // 1. All eligible centroids' means are less than val 121 | // 2. All eligible centroids' means are greater than val 122 | // 3. All eligible centroids' means are exactly equal to val 123 | // 4. Some eligible centroids' means are less than val, some are greater 124 | // 125 | // If 1, then we should take the highest indexed centroid to preserve 126 | // ordering. 
If 2, we should take the lowest for the same reason. If 2, we 127 | // can pick randomly among the ones that have room, since they are 128 | // indistinguishable. If 4, we should first trim down to having just 2 129 | // eligible centroids and then can pick randomly. 130 | 131 | // First, establish which of the 4 cases we have. 132 | var anyLesser, anyGreater bool 133 | for _, c := range nearest { 134 | m := d.centroids[c].mean 135 | if m < val { 136 | anyLesser = true 137 | } else if m > val { 138 | anyGreater = true 139 | } 140 | } 141 | 142 | switch { 143 | case anyLesser && !anyGreater: 144 | // case 1: all are less, none are greater. Take highest one. 145 | c := max(nearest) 146 | if d.centroidHasRoom(c) { 147 | return c 148 | } 149 | return -1 150 | 151 | case !anyLesser && anyGreater: 152 | // case 2: all are greater, none are less. Take the lowest one. 153 | c := min(nearest) 154 | if d.centroidHasRoom(c) { 155 | return c 156 | } 157 | return -1 158 | 159 | case !anyLesser && !anyGreater: 160 | // case 3: all are equal. Take a random one that has room. 161 | var eligible []int 162 | for _, c := range nearest { 163 | if d.centroidHasRoom(c) { 164 | eligible = append(eligible, c) 165 | } 166 | } 167 | if len(eligible) == 0 { 168 | return -1 169 | } 170 | if len(eligible) == 1 { 171 | return eligible[0] 172 | } 173 | return eligible[rand.Intn(len(eligible))] 174 | 175 | default: 176 | // case 4: It's a mixed bag. We need to first trim down to the two 177 | // innermost centroids which straddle the value. 178 | var lower, upper int 179 | for _, c := range nearest { 180 | m := d.centroids[c].mean 181 | if m < val { 182 | lower = c 183 | } else if m > val { 184 | upper = c 185 | break 186 | } 187 | } 188 | // Now, check which has room. If both do, pick randomly. 
189 | lowerHasRoom := d.centroidHasRoom(lower) 190 | upperHasRoom := d.centroidHasRoom(upper) 191 | switch { 192 | case !lowerHasRoom && !upperHasRoom: 193 | return -1 194 | case lowerHasRoom && !upperHasRoom: 195 | return lower 196 | case !lowerHasRoom && upperHasRoom: 197 | return upper 198 | default: 199 | if rand.Intn(2) == 1 { 200 | return lower 201 | } else { 202 | return upper 203 | } 204 | } 205 | } 206 | } 207 | } 208 | 209 | func (d *TDigest) addNewCentroid(mean float64, weight int64) { 210 | var idx int = len(d.centroids) 211 | 212 | for i, c := range d.centroids { 213 | // add in sorted order 214 | if mean < c.mean { 215 | idx = i 216 | break 217 | } 218 | } 219 | 220 | d.centroids = append(d.centroids, nil) 221 | copy(d.centroids[idx+1:], d.centroids[idx:]) 222 | d.centroids[idx] = ¢roid{mean, weight} 223 | } 224 | 225 | // Add will add a value to the TDigest, updating all quantiles. A 226 | // weight can be specified; use weight of 1 if you don't care about 227 | // weighting your dataset. 228 | // 229 | // Add will ignore input values of NaN or Inf. 230 | func (d *TDigest) Add(val float64, weight int) { 231 | if math.IsNaN(val) || math.IsInf(val, 0) { 232 | return 233 | } 234 | d.add(val, int64(weight)) 235 | } 236 | 237 | func (d *TDigest) add(val float64, weight int64) { 238 | d.countTotal += weight 239 | var idx = d.findAddTarget(val) 240 | 241 | if idx == -1 { 242 | d.addNewCentroid(val, weight) 243 | return 244 | } 245 | 246 | c := d.centroids[idx] 247 | 248 | limit := d.weightLimit(idx) 249 | // how much weight will we be adding? 
250 | // if adding this node to this centroid would put it over the 251 | // weight limit, just add the most we can and recur with the remainder 252 | if c.count+weight > limit { 253 | add := limit - c.count 254 | if add < 0 { 255 | // this node was already overweight 256 | add = 0 257 | } 258 | remainder := weight - add 259 | 260 | c.count += add 261 | c.mean = c.mean + float64(add)*(val-c.mean)/float64(c.count) 262 | 263 | d.add(val, remainder) 264 | } else { 265 | c.count += weight 266 | c.mean = c.mean + float64(weight)*(val-c.mean)/float64(c.count) 267 | } 268 | } 269 | 270 | // returns the approximate quantile that a particular centroid 271 | // represents 272 | func (d *TDigest) quantileOf(idx int) float64 { 273 | var total int64 274 | for _, c := range d.centroids[:idx] { 275 | total += c.count 276 | } 277 | return (float64(d.centroids[idx].count/2) + float64(total)) / float64(d.countTotal) 278 | } 279 | 280 | // Quantile(q) will estimate the qth quantile value of the dataset. The input 281 | // value of q should be in the range [0.0, 1.0]; if it is outside that range, it 282 | // will be clipped into it automatically. 283 | // 284 | // Calling Quantile on a TDigest with no data will return NaN. 285 | func (d *TDigest) Quantile(q float64) float64 { 286 | var n = len(d.centroids) 287 | if n == 0 { 288 | return math.NaN() 289 | } 290 | if n == 1 { 291 | return d.centroids[0].mean 292 | } 293 | 294 | if q < 0 { 295 | q = 0 296 | } else if q > 1 { 297 | q = 1 298 | } 299 | 300 | // rescale into count units instead of 0 to 1 units 301 | q = float64(d.countTotal) * q 302 | // find the first centroid which straddles q 303 | var ( 304 | qTotal float64 = 0 305 | i int 306 | ) 307 | for i = 0; i < n && float64(d.centroids[i].count)/2+qTotal < q; i++ { 308 | qTotal += float64(d.centroids[i].count) 309 | } 310 | 311 | if i == 0 { 312 | // special case 1: the targeted quantile is before the 313 | // left-most centroid. 
extrapolate from the slope from 314 | // centroid0 to centroid1. 315 | c0 := d.centroids[0] 316 | c1 := d.centroids[1] 317 | slope := (c1.mean - c0.mean) / (float64(c1.count)/2 + float64(c0.count)/2) 318 | deltaQ := q - float64(c0.count)/2 // this is negative 319 | return c0.mean + slope*deltaQ 320 | } 321 | if i == n { 322 | // special case 2: the targeted quantile is from the 323 | // right-most centroid. extrapolate from the slope at the 324 | // right edge. 325 | c0 := d.centroids[n-2] 326 | c1 := d.centroids[n-1] 327 | slope := (c1.mean - c0.mean) / (float64(c1.count)/2 + float64(c0.count)/2) 328 | deltaQ := q - (qTotal - float64(c1.count)/2) 329 | return c1.mean + slope*deltaQ 330 | } 331 | // common case: targeted quantile is between 2 centroids 332 | c0 := d.centroids[i-1] 333 | c1 := d.centroids[i] 334 | slope := (c1.mean - c0.mean) / (float64(c1.count)/2 + float64(c0.count)/2) 335 | deltaQ := q - (float64(c1.count)/2 + qTotal) 336 | return c1.mean + slope*deltaQ 337 | } 338 | 339 | // MergeInto(other) will add all of the data within a TDigest into other, 340 | // combining them into one larger TDigest. 341 | func (d *TDigest) MergeInto(other *TDigest) { 342 | // Add each centroid in d into other. They should be added in 343 | // random order. 344 | addOrder := rand.Perm(len(d.centroids)) 345 | for _, idx := range addOrder { 346 | c := d.centroids[idx] 347 | // gradually write up the volume written so that the tdigest doesnt overload early 348 | added := int64(0) 349 | for i := int64(1); i < 10; i++ { 350 | toAdd := i * 2 351 | if added+i > c.count { 352 | toAdd = c.count - added 353 | } 354 | other.add(c.mean, toAdd) 355 | added += toAdd 356 | if added >= c.count { 357 | break 358 | } 359 | } 360 | if added < c.count { 361 | other.add(c.mean, c.count-added) 362 | } 363 | other.add(c.mean, c.count) 364 | } 365 | } 366 | 367 | // MarshalBinary serializes d as a sequence of bytes, suitable to be 368 | // deserialized later with UnmarshalBinary. 
369 | func (d *TDigest) MarshalBinary() ([]byte, error) { 370 | return marshalBinary(d) 371 | } 372 | 373 | // UnmarshalBinary populates d with the parsed contents of p, which should have 374 | // been created with a call to MarshalBinary. 375 | func (d *TDigest) UnmarshalBinary(p []byte) error { 376 | return unmarshalBinary(d, p) 377 | } 378 | 379 | // Render a TDigest's internal state for test logging output purposes. 380 | func (d *TDigest) debugStr() string { 381 | var centroids = "[]*centroids{" 382 | 383 | for _, c := range d.centroids { 384 | centroids += fmt.Sprintf("¢roid{mean: %f, count: %d},", c.mean, c.count) 385 | } 386 | centroids += "}" 387 | 388 | return fmt.Sprintf("TDigest{compression: %f, countTotal: %d, centroids: %s", d.compression, d.countTotal, centroids) 389 | 390 | } 391 | 392 | func max(ii []int) int { 393 | max := ii[0] 394 | if len(ii) == 1 { 395 | return max 396 | } 397 | for _, v := range ii[1:] { 398 | if v > max { 399 | max = v 400 | } 401 | } 402 | return max 403 | } 404 | 405 | func min(ii []int) int { 406 | min := ii[0] 407 | if len(ii) == 1 { 408 | return min 409 | } 410 | for _, v := range ii[1:] { 411 | if v < min { 412 | min = v 413 | } 414 | } 415 | return min 416 | 417 | } 418 | -------------------------------------------------------------------------------- /tdigest_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "math/rand" 7 | "reflect" 8 | "testing" 9 | ) 10 | 11 | func TestFindNearest(t *testing.T) { 12 | type testcase struct { 13 | centroids []*centroid 14 | val float64 15 | want []int 16 | } 17 | 18 | testcases := []testcase{ 19 | {[]*centroid{{0, 1}, {1, 1}, {2, 1}}, -1, []int{0}}, 20 | {[]*centroid{{0, 1}, {1, 1}, {2, 1}}, 0, []int{0}}, 21 | {[]*centroid{{0, 1}, {1, 1}, {2, 1}}, 1, []int{1}}, 22 | {[]*centroid{{0, 1}, {1, 1}, {2, 1}}, 2, []int{2}}, 23 | {[]*centroid{{0, 1}, {1, 1}, {2, 1}}, 3, []int{2}}, 24 | 
		// equidistant between two centroids: both indexes are returned
		{[]*centroid{{0, 1}, {2, 1}}, 1, []int{0, 1}},
		{[]*centroid{}, 1, []int{}},
	}

	for i, tc := range testcases {
		d := TDigest{centroids: tc.centroids}
		have := d.nearest(tc.val)
		if len(tc.want) == 0 {
			// nearest may return nil or an empty slice for an empty
			// digest; either is acceptable here.
			if len(have) != 0 {
				t.Errorf("TDigest.nearest wrong test=%d, have=%v, want=%v", i, have, tc.want)
			}
		} else {
			if !reflect.DeepEqual(tc.want, have) {
				t.Errorf("TDigest.nearest wrong test=%d, have=%v, want=%v", i, have, tc.want)
			}
		}
	}
}

// BenchmarkFindNearest measures nearest() on a 500-value digest,
// cycling val through the observed range.
func BenchmarkFindNearest(b *testing.B) {
	n := 500
	d := simpleTDigest(n)

	b.ResetTimer()
	var val float64
	for i := int64(0); i < int64(b.N); i++ {
		val = float64(i % d.countTotal)
		_ = d.nearest(val)
	}
}

// TestFindAddTarget exercises every branch of findAddTarget: the empty
// digest, a single nearest centroid with/without room, and all four
// multi-candidate cases (all lesser / all greater / all equal / mixed).
func TestFindAddTarget(t *testing.T) {
	// testcase builds a subtest around a fixed centroid set; countTotal
	// is kept consistent with the centroid counts so weightLimit
	// computes sensible limits (compression is 1 throughout).
	testcase := func(in []*centroid, val float64, want int) func(*testing.T) {
		return func(t *testing.T) {
			d := TDigest{centroids: in, compression: 1}
			for _, c := range in {
				d.countTotal += c.count
			}
			have := d.findAddTarget(val)
			if have != want {
				t.Errorf("TDigest.findAddTarget wrong have=%v, want=%v", have, want)
			}
		}
	}
	t.Run("empty digest", testcase(nil, 1, -1))
	t.Run("exactly one with room", testcase(
		[]*centroid{{0.0, 1}, {1.0, 1}, {2.0, 1}},
		1, 1))
	t.Run("exactly one without room", testcase(
		[]*centroid{{0.0, 1}, {1.0, 3}, {2.0, 1}},
		1, -1))
	t.Run("multiple candidates", func(t *testing.T) {
		t.Run("all lesser", func(t *testing.T) {
			t.Run("with room", testcase(
				[]*centroid{{0.0, 1}, {1.0, 1}, {1.0, 3}, {2.0, 1}},
				1.1, 2))
			t.Run("without room", testcase(
				[]*centroid{{0.0, 1}, {1.0, 1}, {1.0, 4}, {2.0, 1}},
				1.1, -1))
		})
		t.Run("all greater", func(t *testing.T) {
			t.Run("with room", testcase(
				[]*centroid{{0.0, 1}, {1.0, 1}, {1.0, 3}, {2.0, 1}},
				0.9, 1))
			t.Run("without room", testcase(
				[]*centroid{{0.0, 1}, {1.0, 3}, {1.0, 4}, {2.0, 1}},
				0.9, -1))
		})
		t.Run("all equal", func(t *testing.T) {
			t.Run("with room in none", testcase(
				[]*centroid{{0.0, 1}, {1.0, 3}, {1.0, 3}, {2.0, 1}},
				1.0, -1))
			t.Run("with room in one", testcase(
				[]*centroid{{0.0, 1}, {1.0, 2}, {1.0, 3}, {2.0, 1}},
				1.0, 1))
			t.Run("with room in multiple", func(t *testing.T) {
				// the pick is random among eligible centroids, so
				// accept either index.
				d := TDigest{
					centroids:   []*centroid{{0.0, 1}, {1.0, 1}, {1.0, 2}, {2.0, 1}},
					compression: 1,
				}
				for _, c := range d.centroids {
					d.countTotal += c.count
				}
				have := d.findAddTarget(1.0)
				if have != 1 && have != 2 {
					t.Errorf("TDigest.findAddTarget wrong have=%v, want=1 or 2", have)
				}
			})
		})
		t.Run("both greater and lesser", func(t *testing.T) {
			t.Run("with room below", testcase(
				[]*centroid{{0.0, 1}, {0.8, 1}, {0.8, 1}, {1.0, 6}, {1.0, 1}, {2.0, 1}},
				0.9, 2))
			t.Run("with room above", testcase(
				[]*centroid{{0.0, 1}, {0.8, 1}, {0.8, 6}, {1.0, 1}, {1.0, 1}, {2.0, 1}},
				0.9, 3))
			t.Run("with no room", testcase(
				[]*centroid{{0.0, 1}, {0.8, 6}, {0.8, 6}, {1.0, 6}, {1.0, 1}, {2.0, 1}},
				0.9, -1))
			t.Run("with room above and below", func(t *testing.T) {
				// random pick between the straddling pair: accept 2 or 3.
				d := TDigest{
					centroids: []*centroid{
						{0.0, 1}, {0.8, 1}, {0.8, 1},
						{1.0, 1}, {1.0, 1}, {2.0, 1}},
					compression: 1,
				}
				for _, c := range d.centroids {
					d.countTotal += c.count
				}
				have := d.findAddTarget(0.9)
				if have != 2 && have != 3 {
					t.Errorf("TDigest.findAddTarget wrong have=%v, want=2 or 3", have)
				}
			})
		})
	})
}

// adding a new centroid should maintain sorted order
func TestAddNewCentroid(t *testing.T) {
	type testcase struct {
		centroidVals []float64
		add          float64
		want         []float64
	}
	testcases := []testcase{
		{[]float64{}, 1, []float64{1}},
		{[]float64{1}, 2, []float64{1, 2}},
		{[]float64{1, 2}, 1.5, []float64{1, 1.5, 2}},
		{[]float64{1, 1.5, 2}, -1, []float64{-1, 1, 1.5, 2}},
		{[]float64{1, 1.5, 2}, 3, []float64{1, 1.5, 2, 3}},
		{[]float64{1, 1.5, 2}, 1.6, []float64{1, 1.5, 1.6, 2}},
	}

	for i, tc := range testcases {
		d := tdFromMeans(tc.centroidVals)
		d.addNewCentroid(tc.add, 1)

		// collect the resulting means to compare against the expected
		// sorted order
		have := make([]float64, len(d.centroids))
		for i, c := range d.centroids {
			have[i] = c.mean
		}

		if !reflect.DeepEqual(tc.want, have) {
			t.Errorf("TDigest.addNewCentroid wrong test=%d, have=%v, want=%v", i, have, tc.want)
		}
	}
}

// verifyCentroidOrder fails the test if cs's centroids are not sorted
// by mean (ascending).
func verifyCentroidOrder(t *testing.T, cs *TDigest) {
	if len(cs.centroids) < 2 {
		return
	}
	last := cs.centroids[0]
	for i, c := range cs.centroids[1:] {
		if c.mean < last.mean {
			t.Errorf("centroid %d lt %d: %v < %v", i+1, i, c.mean, last.mean)
		}
		last = c
	}
}

// TestQuantileOrder is a regression test: Add must never break the
// sorted-by-mean invariant of the centroid list.
func TestQuantileOrder(t *testing.T) {
	// stumbled upon in real world application: adding a 1 to this
	// resulted in the 6th centroid getting incremented instead of the
	// 7th.
	d := &TDigest{
		countTotal:  14182,
		compression: 100,
		centroids: []*centroid{
			&centroid{0.000000, 1},
			&centroid{0.000000, 564},
			&centroid{0.000000, 1140},
			&centroid{0.000000, 1713},
			&centroid{0.000000, 2380},
			&centroid{0.000000, 2688},
			&centroid{0.000000, 1262},
			&centroid{2.005758, 1563},
			&centroid{30.499251, 1336},
			&centroid{381.533509, 761},
			&centroid{529.600000, 5},
			&centroid{1065.294118, 17},
			&centroid{2266.444444, 36},
			&centroid{4268.809783, 368},
			&centroid{14964.148148, 27},
			&centroid{41024.579618, 157},
			&centroid{124311.192308, 52},
			&centroid{219674.636364, 22},
			&centroid{310172.775000, 40},
			&centroid{412388.642857, 14},
			&centroid{582867.000000, 16},
			&centroid{701434.777778, 9},
			&centroid{869363.800000, 5},
			&centroid{968264.000000, 1},
			&centroid{987100.666667, 3},
			&centroid{1029895.000000, 1},
			&centroid{1034640.000000, 1},
		},
	}
	d.Add(1.0, 1)
	verifyCentroidOrder(t, d)
}

// TestQuantile pins quantileOf's arithmetic — including its integer
// halving of the centroid count — against hand-computed values.
func TestQuantile(t *testing.T) {
	type testcase struct {
		weights []int64
		idx     int
		want    float64
	}
	testcases := []testcase{
		{[]int64{1, 1, 1, 1}, 0, 0.0},
		{[]int64{1, 1, 1, 1}, 1, 0.25},
		{[]int64{1, 1, 1, 1}, 2, 0.5},
		{[]int64{1, 1, 1, 1}, 3, 0.75},

		{[]int64{5, 1, 1, 1}, 0, 0.250},
		{[]int64{5, 1, 1, 1}, 1, 0.625},
		{[]int64{5, 1, 1, 1}, 2, 0.750},
		{[]int64{5, 1, 1, 1}, 3, 0.875},

		{[]int64{1, 1, 1, 5}, 0, 0.0},
		{[]int64{1, 1, 1, 5}, 1, 0.125},
		{[]int64{1, 1, 1, 5}, 2, 0.250},
		{[]int64{1, 1, 1, 5}, 3, 0.625},
	}

	for i, tc := range testcases {
		d := tdFromWeights(tc.weights)
		have := d.quantileOf(tc.idx)
		if have != tc.want {
			t.Errorf("TDigest.quantile wrong test=%d, have=%.3f, want=%.3f", i, have, tc.want)
		}
	}
}

// TestAddValue feeds a fixed sequence of values (including NaN and
// ±Inf, which must be ignored) through Add and checks the exact
// centroid state after every step.
func TestAddValue(t *testing.T) {
	type testcase struct {
		value float64
		weight int
		want   []*centroid
	}

	testcases := []testcase{
		{1.0, 1, []*centroid{{1, 1}}},
		{0.0, 1, []*centroid{{0, 1}, {1, 1}}},
		{2.0, 1, []*centroid{{0, 1}, {1, 1}, {2, 1}}},
		{3.0, 1, []*centroid{{0, 1}, {1, 1}, {2.5, 2}}},
		{4.0, 1, []*centroid{{0, 1}, {1, 1}, {2.5, 2}, {4, 1}}},
		// NaN and infinities must be silently ignored: state unchanged.
		{math.NaN(), 1, []*centroid{{0, 1}, {1, 1}, {2.5, 2}, {4, 1}}},
		{math.Inf(-1), 1, []*centroid{{0, 1}, {1, 1}, {2.5, 2}, {4, 1}}},
		{math.Inf(+1), 1, []*centroid{{0, 1}, {1, 1}, {2.5, 2}, {4, 1}}},
	}

	d := NewWithCompression(1)
	for i, tc := range testcases {
		d.Add(tc.value, tc.weight)
		if !reflect.DeepEqual(d.centroids, tc.want) {
			t.Fatalf("TDigest.addValue unexpected state step=%d, have=%v, want=%v", i, d.centroids, tc.want)
		}
	}
}

// TestQuantileValue checks interpolation in Quantile against values
// worked out by hand for a fixed centroid set, covering both
// extrapolation special cases and the common between-centroids case.
func TestQuantileValue(t *testing.T) {
	d := NewWithCompression(1)
	d.countTotal = 8
	d.centroids = []*centroid{{0.5, 3}, {1, 1}, {2, 2}, {3, 1}, {8, 1}}

	type testcase struct {
		q    float64
		want float64
	}

	// correct values, determined by hand with pen and paper for this set of centroids
	testcases := []testcase{
		{0.0, 5.0 / 40.0},
		{0.1, 13.0 / 40.0},
		{0.2, 21.0 / 40.0},
		{0.3, 29.0 / 40.0},
		{0.4, 37.0 / 40.0},
		{0.5, 20.0 / 15.0},
		{0.6, 28.0 / 15.0},
		{0.7, 36.0 / 15.0},
		{0.8, 44.0 / 15.0},
		{0.9, 13.0 / 2.0},
		{1.0, 21.0 / 2.0},
	}

	var epsilon = 1e-8

	for i, tc := range testcases {
		have := d.Quantile(tc.q)
		if math.Abs(have-tc.want) > epsilon {
			t.Errorf("TDigest.Quantile wrong step=%d, have=%v, want=%v",
				i, have, tc.want)
		}
	}
}

// BenchmarkFindAddTarget measures findAddTarget on a 500-value digest.
func BenchmarkFindAddTarget(b *testing.B) {
	n := 500
	d := simpleTDigest(n)

	b.ResetTimer()
	var val float64
	for i := int64(0); i < int64(b.N); i++ {
		val = float64(i % d.countTotal)
		_ = d.findAddTarget(val)
	}
}

// add the values [0,n) to a centroid set, equal weights
func simpleTDigest(n int) *TDigest {
	d := NewWithCompression(1.0)
	for i := 0; i < n; i++ {
		d.Add(float64(i), 1)
	}
	return d
}

// tdFromMeans builds a digest directly from a sorted list of means,
// one unit of weight per centroid (bypasses Add).
func tdFromMeans(means []float64) *TDigest {
	centroids := make([]*centroid, len(means))
	for i, m := range means {
		centroids[i] = &centroid{m, 1}
	}
	d := NewWithCompression(1.0)
	d.centroids = centroids
	d.countTotal = int64(len(centroids))
	return d
}

// tdFromWeights builds a digest with centroid means 0,1,2,... and the
// given weights (bypasses Add).
func tdFromWeights(weights []int64) *TDigest {
	centroids := make([]*centroid, len(weights))
	countTotal := int64(0)
	for i, w := range weights {
		centroids[i] = &centroid{float64(i), w}
		countTotal += w
	}
	d := NewWithCompression(1.0)
	d.centroids = centroids
	d.countTotal = countTotal
	return d
}

// ExampleTDigest demonstrates typical usage.
// NOTE(review): there is no "// Output:" comment, so go test compiles
// this example but does not verify its printed output.
func ExampleTDigest() {
	rand.Seed(5678)
	values := make(chan float64)

	// Generate 100k uniform random data between 0 and 100
	var (
		n        int     = 100000
		min, max float64 = 0, 100
	)
	go func() {
		for i := 0; i < n; i++ {
			values <- min + rand.Float64()*(max-min)
		}
		close(values)
	}()

	// Pass the values through a TDigest, compression parameter 100
	td := New()

	for val := range values {
		// Add the value with weight 1
		td.Add(val, 1)
	}

	// Print the 50th, 90th, 99th, 99.9th, and 99.99th percentiles
	fmt.Printf("50th: %.5f\n", td.Quantile(0.5))
	fmt.Printf("90th: %.5f\n", td.Quantile(0.9))
	fmt.Printf("99th: %.5f\n", td.Quantile(0.99))
	fmt.Printf("99.9th: %.5f\n", td.Quantile(0.999))
	fmt.Printf("99.99th: %.5f\n", td.Quantile(0.9999))
}

// TestMerge splits a uniform stream across two digests, merges both
// into a third, and logs the resulting quantiles for inspection.
// NOTE(review): this test only logs — it asserts nothing about the
// merged quantiles.
func TestMerge(t *testing.T) {
	values := make(chan float64)

	// Generate 100k uniform random data between 0 and 100
	var (
		n        int     = 100000
		min, max float64 = 0, 100
	)
	go func() {
		for i := 0; i < n; i++ {
			values <- min + rand.Float64()*(max-min)
		}
		close(values)
	}()

	// Pass the values through two TDigests
	td1 := New()
	td2 := New()

	i := 0
	for val := range values {
		// Add the value with weight 1. Alternate between the digests.
		if i%2 == 0 {
			td1.Add(val, 1)
		} else {
			td2.Add(val, 1)
		}
		i += 1
	}

	rand.Seed(2)
	// merge both into a third tdigest.
	td := New()
	td1.MergeInto(td)
	td2.MergeInto(td)
	t.Logf("10th: %.5f\n", td1.Quantile(0.1))
	t.Logf("50th: %.5f\n", td1.Quantile(0.5))
	t.Logf("90th: %.5f\n", td1.Quantile(0.9))
	t.Logf("99th: %.5f\n", td1.Quantile(0.99))
	t.Logf("99.9th: %.5f\n", td1.Quantile(0.999))
	t.Logf("99.99th: %.5f\n", td1.Quantile(0.9999))

	t.Logf("10th: %.5f\n", td2.Quantile(0.1))
	t.Logf("50th: %.5f\n", td2.Quantile(0.5))
	t.Logf("90th: %.5f\n", td2.Quantile(0.9))
	t.Logf("99th: %.5f\n", td2.Quantile(0.99))
	t.Logf("99.9th: %.5f\n", td2.Quantile(0.999))
	t.Logf("99.99th: %.5f\n", td2.Quantile(0.9999))

	t.Logf("10th: %.5f\n", td.Quantile(0.1))
	t.Logf("50th: %.5f\n", td.Quantile(0.5))
	t.Logf("90th: %.5f\n", td.Quantile(0.9))
	t.Logf("99th: %.5f\n", td.Quantile(0.99))
	t.Logf("99.9th: %.5f\n", td.Quantile(0.999))
	t.Logf("99.99th: %.5f\n", td.Quantile(0.9999))
}
--------------------------------------------------------------------------------