├── .config └── caca.ini ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── go.mod ├── go.sum ├── options.go ├── options_test.go ├── rng.go ├── serialization.go ├── serialization_test.go ├── summary.go ├── summary_test.go ├── tdigest.go └── tdigest_test.go /.config/caca.ini: -------------------------------------------------------------------------------- 1 | [meta] 2 | state = pinned 3 | 4 | [link "Issues"] 5 | href = https://github.com/caio/go-tdigest/issues 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | go-tdigest.test 3 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | First and foremost: **thank you very much** for your interest in this 4 | project. Feel free to skip all this and open your issue / pull request 5 | if reading contribution guidelines is too much for you at this point. 6 | We value your contribution a lot more than we value your ability to 7 | follow rules (and thankfully we can afford to take this approach given 8 | this project's demand). 9 | 10 | Any kind of contribution is welcome. We can always use better docs and 11 | tests (and code, of course). If you think you can improve this project 12 | in any dimension _let's talk_ :-) 13 | 14 | ## Guidelines 15 | 16 | Be kind and respectful in all your interactions with people inside 17 | (outside too!) this community; There is no excuse for not showing 18 | basic decency. Sarcasm and generally unconstructive remarks are **not 19 | welcome**. 
20 | 21 | ### Issues 22 | 23 | When opening and interacting with issues please: 24 | 25 | - Be as clear as possible 26 | - Provide examples if you can 27 | 28 | ### Pull Requests 29 | 30 | We expect that pull requests: 31 | 32 | - Have [good commit messages][commits] 33 | - Contain tests for new features 34 | - Target and can be cleanly merged with the `master` branch 35 | - Pass the tests 36 | 37 | [commits]: https://www.git-scm.com/book/en/v2/Distributed-Git-Contributing-to-a-Project#_commit_guidelines 38 | 39 | ### Project Management 40 | 41 | Don't bother with labels, milestones, assignments, etc. We don't make 42 | use of those. 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Caio Romão Costa Nascimento 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # T-Digest 2 | 3 | A fast map-reduce and parallel streaming friendly data-structure for accurate 4 | quantile approximation. 5 | 6 | This package provides an implementation of Ted Dunning's t-digest data 7 | structure in Go. 8 | 9 | [![GoDoc](https://godoc.org/github.com/caio/go-tdigest?status.svg)](http://godoc.org/github.com/caio/go-tdigest) 10 | [![Go Report Card](https://goreportcard.com/badge/github.com/caio/go-tdigest)](https://goreportcard.com/report/github.com/caio/go-tdigest) 11 | 12 | ## Project Status 13 | 14 | This project is actively maintained. We are happy to collaborate on features 15 | and issues if/when they arrive. 16 | 17 | ## Installation 18 | 19 | This package uses go modules. Our releases are tagged and signed following 20 | the [Semantic Versioning][semver] scheme. 
21 | 22 | go get github.com/caio/go-tdigest/v4 23 | 24 | 25 | [semver]: http://semver.org/ 26 | 27 | ## Example Usage 28 | 29 | ```go 30 | package main 31 | 32 | import ( 33 | "fmt" 34 | "math/rand" 35 | 36 | "github.com/caio/go-tdigest/v4" 37 | ) 38 | 39 | func main() { 40 | // Analogue to tdigest.New(tdigest.Compression(100)) 41 | t, _ := tdigest.New() 42 | 43 | for i := 0; i < 10000; i++ { 44 | // Analogue to t.AddWeighted(rand.Float64(), 1) 45 | t.Add(rand.Float64()) 46 | } 47 | 48 | fmt.Printf("p(.5) = %.6f\n", t.Quantile(0.5)) 49 | fmt.Printf("CDF(Quantile(.5)) = %.6f\n", t.CDF(t.Quantile(0.5))) 50 | } 51 | ``` 52 | 53 | ## Configuration 54 | 55 | You can configure your digest upon creation with options documented 56 | at [options.go](options.go). Example: 57 | 58 | ```go 59 | // Construct a digest with compression=200 and its own 60 | // (thread-unsafe) RNG seeded with 0xCA10: 61 | digest, _ := tdigest.New( 62 | tdigest.Compression(200), 63 | tdigest.LocalRandomNumberGenerator(0xCA10), 64 | ) 65 | ``` 66 | 67 | ## References 68 | 69 | This is a port of the [reference][1] implementation with some ideas borrowed 70 | from the [python version][2]. If you wanna get a quick grasp of how it works 71 | and why it's useful, [this video and companion article is pretty helpful][3]. 
72 | 73 | [1]: https://github.com/tdunning/t-digest 74 | [2]: https://github.com/CamDavidsonPilon/tdigest 75 | [3]: https://www.mapr.com/blog/better-anomaly-detection-t-digest-whiteboard-walkthrough 76 | 77 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/caio/go-tdigest/v4 2 | 3 | go 1.18 4 | 5 | require ( 6 | github.com/leesper/go_rng v0.0.0-20190531154944-a612b043e353 7 | gonum.org/v1/gonum v0.11.0 8 | ) 9 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/leesper/go_rng v0.0.0-20190531154944-a612b043e353 h1:X/79QL0b4YJVO5+OsPH9rF2u428CIrGL/jLmPsoOQQ4= 2 | github.com/leesper/go_rng v0.0.0-20190531154944-a612b043e353/go.mod h1:N0SVk0uhy+E1PZ3C9ctsPRlvOPAFPkCNlcPBDkt0N3U= 3 | golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3 h1:n9HxLrNxWWtEb1cA950nuEEj3QnKbtsCJ6KjcgisNUs= 4 | gonum.org/v1/gonum v0.11.0 h1:f1IJhK4Km5tBJmaiJXtk/PkL4cdVX6J+tGiM187uT5E= 5 | gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA= 6 | -------------------------------------------------------------------------------- /options.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import "errors" 4 | 5 | type tdigestOption func(*TDigest) error 6 | 7 | // Compression sets the digest compression 8 | // 9 | // The compression parameter rules the threshold in which samples are 10 | // merged together - the more often distinct samples are merged the more 11 | // precision is lost. Compression should be tuned according to your data 12 | // distribution, but a value of 100 (the default) is often good enough. 
13 | // 14 | // A higher compression value means holding more centroids in memory 15 | // (thus: better precision), which means a bigger serialization payload, 16 | // higher memory footprint and slower addition of new samples. 17 | // 18 | // Compression must be a value greater of equal to 1, will yield an 19 | // error otherwise. 20 | func Compression(compression float64) tdigestOption { // nolint 21 | return func(t *TDigest) error { 22 | if compression < 1 { 23 | return errors.New("Compression should be >= 1") 24 | } 25 | t.compression = compression 26 | return nil 27 | } 28 | } 29 | 30 | // RandomNumberGenerator sets the RNG to be used internally 31 | // 32 | // This allows changing which random number source is used when using 33 | // the TDigest structure (rngs are used when deciding which candidate 34 | // centroid to merge with and when compressing or merging with 35 | // another digest for it increases accuracy). This functionality is 36 | // particularly useful for testing or when you want to disconnect 37 | // your sample collection from the (default) shared random source 38 | // to minimize lock contention. 39 | func RandomNumberGenerator(rng RNG) tdigestOption { // nolint 40 | return func(t *TDigest) error { 41 | t.rng = rng 42 | return nil 43 | } 44 | } 45 | 46 | // LocalRandomNumberGenerator makes the TDigest use the default 47 | // `math/random` functions but with an unshared source that is 48 | // seeded with the given `seed` parameter. 49 | func LocalRandomNumberGenerator(seed int64) tdigestOption { // nolint 50 | return RandomNumberGenerator(newLocalRNG(seed)) 51 | } 52 | -------------------------------------------------------------------------------- /options_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import "testing" 4 | 5 | func TestDefaults(t *testing.T) { 6 | digest, err := New() 7 | 8 | if err != nil { 9 | t.Errorf("Creating a default TDigest should never error out. 
Got %s", err) 10 | } 11 | 12 | if digest.compression != 100 { 13 | t.Errorf("The default compression should be 100") 14 | } 15 | } 16 | 17 | func TestCompression(t *testing.T) { 18 | digest, _ := New(Compression(40)) 19 | if digest.compression != 40 { 20 | t.Errorf("The compression option should change the new digest compression") 21 | } 22 | 23 | digest, err := New(Compression(0)) 24 | if err == nil || digest != nil { 25 | t.Errorf("Trying to create a digest with bad compression should give an error") 26 | } 27 | } 28 | 29 | func TestRandomNumberGenerator(t *testing.T) { 30 | const numTests = 100 31 | 32 | // Create two digests with unshared rngs seeded with 33 | // the same seed 34 | t1, _ := New(RandomNumberGenerator(newLocalRNG(0xDEADBEE))) 35 | t2, _ := New(LocalRandomNumberGenerator(0xDEADBEE)) 36 | 37 | // So that they should emit the same values when called 38 | // at the same frequency 39 | for i := 0; i < numTests; i++ { 40 | if t1.rng.Float32() != t2.rng.Float32() || 41 | t1.rng.Intn(10) != t2.rng.Intn(10) { 42 | t.Errorf("r1 and r2 should be distinct RNGs returning the same values") 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /rng.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "math/rand" 5 | ) 6 | 7 | // RNG is an interface that wraps the needed random number 8 | // generator calls that tdigest uses during its runtime 9 | type RNG interface { 10 | Float32() float32 11 | Intn(int) int 12 | } 13 | 14 | type globalRNG struct{} 15 | 16 | func (r globalRNG) Float32() float32 { 17 | return rand.Float32() 18 | } 19 | 20 | func (r globalRNG) Intn(i int) int { 21 | return rand.Intn(i) 22 | } 23 | 24 | type localRNG struct { 25 | localRand *rand.Rand 26 | } 27 | 28 | func newLocalRNG(seed int64) *localRNG { 29 | return &localRNG{ 30 | localRand: rand.New(rand.NewSource(seed)), 31 | } 32 | } 33 | 34 | func (r *localRNG) 
Float32() float32 { 35 | return r.localRand.Float32() 36 | } 37 | 38 | func (r *localRNG) Intn(i int) int { 39 | return r.localRand.Intn(i) 40 | } 41 | -------------------------------------------------------------------------------- /serialization.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "errors" 7 | "fmt" 8 | "math" 9 | ) 10 | 11 | const smallEncoding int32 = 2 12 | 13 | var endianess = binary.BigEndian 14 | 15 | // AsBytes serializes the digest into a byte array so it can be 16 | // saved to disk or sent over the wire. 17 | func (t TDigest) AsBytes() ([]byte, error) { 18 | // TODO get rid of the (now) useless error 19 | return t.ToBytes(make([]byte, t.requiredSize())), nil 20 | } 21 | 22 | func (t *TDigest) requiredSize() int { 23 | return 16 + (4 * len(t.summary.means)) + (len(t.summary.counts) * binary.MaxVarintLen64) 24 | } 25 | 26 | // ToBytes serializes into the supplied slice, avoiding allocation if the slice 27 | // is large enough. The result slice is returned. 28 | func (t *TDigest) ToBytes(b []byte) []byte { 29 | requiredSize := t.requiredSize() 30 | if cap(b) < requiredSize { 31 | b = make([]byte, requiredSize) 32 | } 33 | 34 | // The binary.Put* functions helpfully don't extend the slice for you, they 35 | // just panic if it's not already long enough. So pre-set the slice length; 36 | // we'll return it with the actual encoded length. 
37 | b = b[:cap(b)] 38 | 39 | endianess.PutUint32(b[0:4], uint32(smallEncoding)) 40 | endianess.PutUint64(b[4:12], math.Float64bits(t.compression)) 41 | endianess.PutUint32(b[12:16], uint32(t.summary.Len())) 42 | 43 | var x float64 44 | idx := 16 45 | for _, mean := range t.summary.means { 46 | delta := mean - x 47 | x = mean 48 | endianess.PutUint32(b[idx:], math.Float32bits(float32(delta))) 49 | idx += 4 50 | } 51 | 52 | for _, count := range t.summary.counts { 53 | idx += binary.PutUvarint(b[idx:], count) 54 | } 55 | return b[:idx] 56 | } 57 | 58 | // FromBytes reads a byte buffer with a serialized digest (from AsBytes) 59 | // and deserializes it. 60 | // 61 | // This function creates a new tdigest instance with the provided options, 62 | // but ignores the compression setting since the correct value comes 63 | // from the buffer. 64 | func FromBytes(buf *bytes.Reader, options ...tdigestOption) (*TDigest, error) { 65 | var encoding int32 66 | err := binary.Read(buf, endianess, &encoding) 67 | if err != nil { 68 | return nil, err 69 | } 70 | 71 | if encoding != smallEncoding { 72 | return nil, fmt.Errorf("unsupported encoding version: %d", encoding) 73 | } 74 | 75 | t, err := newWithoutSummary(options...) 
76 | 77 | if err != nil { 78 | return nil, err 79 | } 80 | 81 | var compression float64 82 | err = binary.Read(buf, endianess, &compression) 83 | if err != nil { 84 | return nil, err 85 | } 86 | 87 | t.compression = compression 88 | 89 | var numCentroids int32 90 | err = binary.Read(buf, endianess, &numCentroids) 91 | if err != nil { 92 | return nil, err 93 | } 94 | 95 | if numCentroids < 0 || numCentroids > 1<<22 { 96 | return nil, errors.New("bad number of centroids in serialization") 97 | } 98 | 99 | t.summary = newSummary(int(numCentroids)) 100 | t.summary.means = t.summary.means[:numCentroids] 101 | t.summary.counts = t.summary.counts[:numCentroids] 102 | 103 | var x float64 104 | for i := 0; i < int(numCentroids); i++ { 105 | var delta float32 106 | err = binary.Read(buf, endianess, &delta) 107 | if err != nil { 108 | return nil, err 109 | } 110 | x += float64(delta) 111 | t.summary.means[i] = x 112 | } 113 | 114 | for i := 0; i < int(numCentroids); i++ { 115 | count, err := decodeUint(buf) 116 | if err != nil { 117 | return nil, err 118 | } 119 | t.summary.counts[i] = count 120 | t.count += count 121 | } 122 | 123 | return t, nil 124 | } 125 | 126 | // FromBytes deserializes into the supplied TDigest struct, re-using 127 | // and overwriting any existing buffers. 128 | // 129 | // This method reinitializes the digest from the provided buffer 130 | // discarding any previously collected data. Notice that in case 131 | // of errors this may leave the digest in a unusable state. 
132 | func (t *TDigest) FromBytes(buf []byte) error { 133 | if len(buf) < 16 { 134 | return errors.New("buffer too small for deserialization") 135 | } 136 | 137 | encoding := int32(endianess.Uint32(buf)) 138 | if encoding != smallEncoding { 139 | return fmt.Errorf("unsupported encoding version: %d", encoding) 140 | } 141 | 142 | compression := math.Float64frombits(endianess.Uint64(buf[4:12])) 143 | numCentroids := int(endianess.Uint32(buf[12:16])) 144 | if numCentroids < 0 || numCentroids > 1<<22 { 145 | return errors.New("bad number of centroids in serialization") 146 | } 147 | 148 | if len(buf) < 16+(4*numCentroids) { 149 | return errors.New("buffer too small for deserialization") 150 | } 151 | 152 | t.count = 0 153 | t.compression = compression 154 | if t.summary == nil || 155 | cap(t.summary.means) < numCentroids || 156 | cap(t.summary.counts) < numCentroids { 157 | t.summary = newSummary(numCentroids) 158 | } 159 | t.summary.means = t.summary.means[:numCentroids] 160 | t.summary.counts = t.summary.counts[:numCentroids] 161 | 162 | idx := 16 163 | var x float64 164 | for i := 0; i < numCentroids; i++ { 165 | delta := math.Float32frombits(endianess.Uint32(buf[idx:])) 166 | idx += 4 167 | x += float64(delta) 168 | t.summary.means[i] = x 169 | } 170 | 171 | for i := 0; i < numCentroids; i++ { 172 | count, read := binary.Uvarint(buf[idx:]) 173 | if read < 1 { 174 | return errors.New("error decoding varint, this TDigest is now invalid") 175 | } 176 | 177 | idx += read 178 | 179 | t.summary.counts[i] = count 180 | t.count += count 181 | } 182 | 183 | if idx != len(buf) { 184 | return errors.New("buffer has unread data") 185 | } 186 | return nil 187 | } 188 | 189 | func encodeUint(buf *bytes.Buffer, n uint64) error { 190 | var b [binary.MaxVarintLen64]byte 191 | 192 | l := binary.PutUvarint(b[:], n) 193 | 194 | _, err := buf.Write(b[:l]) 195 | 196 | return err 197 | } 198 | 199 | func decodeUint(buf *bytes.Reader) (uint64, error) { 200 | v, err := 
binary.ReadUvarint(buf) 201 | return v, err 202 | } 203 | -------------------------------------------------------------------------------- /serialization_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "bytes" 5 | "encoding/base64" 6 | "math" 7 | "math/rand" 8 | "reflect" 9 | "testing" 10 | ) 11 | 12 | func TestEncodeDecode(t *testing.T) { 13 | testUints := []uint64{0, 10, 100, 1000, 10000, 65535, 2147483647, 2 * math.MaxUint32} 14 | buf := new(bytes.Buffer) 15 | 16 | for _, i := range testUints { 17 | err := encodeUint(buf, i) 18 | if err != nil { 19 | t.Error(err) 20 | } 21 | } 22 | 23 | readBuf := bytes.NewReader(buf.Bytes()) 24 | for _, i := range testUints { 25 | j, err := decodeUint(readBuf) 26 | if err != nil { 27 | t.Error(err) 28 | } 29 | 30 | if i != j { 31 | t.Errorf("Basic encode/decode failed. Got %d, wanted %d", j, i) 32 | } 33 | } 34 | } 35 | 36 | func TestSerialization(t *testing.T) { 37 | t1, _ := New() 38 | for i := 0; i < 100; i++ { 39 | _ = t1.Add(rand.Float64()) 40 | } 41 | 42 | serialized, _ := t1.AsBytes() 43 | 44 | t2, err := FromBytes(bytes.NewReader(serialized)) 45 | if err != nil { 46 | t.Fatal(err) 47 | } 48 | assertSerialization(t, t1, t2) 49 | 50 | err = t2.FromBytes(serialized) 51 | if err != nil { 52 | t.Fatal(err) 53 | } 54 | assertSerialization(t, t1, t2) 55 | 56 | var toBuf []byte 57 | toBuf = t1.ToBytes(toBuf) 58 | if !reflect.DeepEqual(serialized, toBuf) { 59 | t.Errorf("ToBytes serialized to something else") 60 | } 61 | 62 | // Make sure we don't re-allocate on buffer re-use 63 | toBuf2 := t1.ToBytes(toBuf[:0]) 64 | if &toBuf2[0] != &toBuf[0] { 65 | t.Errorf("Expected ToBytes() to re-use supplied slice") 66 | } 67 | if !reflect.DeepEqual(toBuf2, toBuf) { 68 | t.Errorf("ToBytes serialized to something else") 69 | } 70 | 71 | t3, _ := New() 72 | err = t3.FromBytes(serialized) 73 | if err != nil { 74 | t.Error(err) 75 | } 76 | 77 | 
assertSerialization(t, t1, t3) 78 | 79 | // Mess up t3's internal state, deserialize again. 80 | t3.compression = 2 81 | t3.count = 1000 82 | t3.summary.means = append(t3.summary.means, 2.0) 83 | t3.summary.counts[0] = 0 84 | err = t3.FromBytes(serialized) 85 | if err != nil { 86 | t.Error(err) 87 | } 88 | 89 | assertSerialization(t, t1, t3) 90 | 91 | wrong := serialized[:50] 92 | err = t3.FromBytes(wrong) 93 | if err == nil { 94 | t.Error("expected error") 95 | } 96 | wrong = wrong[:2] 97 | err = t3.FromBytes(wrong) 98 | if err == nil { 99 | t.Error("expected error") 100 | } 101 | } 102 | 103 | func assertSerialization(t *testing.T, t1, t2 *TDigest) { 104 | if t1.Count() != t2.Count() || 105 | t1.summary.Len() != t2.summary.Len() || 106 | t1.compression != t2.compression { 107 | t.Errorf("Deserialized to something different. t1=%v t2=%v", t1, t2) 108 | } 109 | 110 | b1, err := t1.AsBytes() 111 | if err != nil { 112 | t.Error(err) 113 | } 114 | 115 | b2, err := t2.AsBytes() 116 | if err != nil { 117 | t.Error(err) 118 | } 119 | 120 | if !bytes.Equal(b1, b2) { 121 | t.Errorf("Deserialized to something different. b1=%q b2=%q", b1, b2) 122 | } 123 | 124 | // t2 is fully functional. 
125 | 126 | err = t2.Add(rand.Float64()) 127 | if err != nil { 128 | t.Error(err) 129 | } 130 | 131 | err = t2.Compress() 132 | if err != nil { 133 | t.Error(err) 134 | } 135 | } 136 | 137 | func TestFromBytesIgnoresCompression(t *testing.T) { 138 | digest := uncheckedNew(Compression(42)) 139 | 140 | // Instructing FromBytes to use a compression different 141 | // than the one in the payload should be ignored 142 | payload, err := digest.AsBytes() 143 | 144 | if err != nil { 145 | t.Error(err) 146 | } 147 | 148 | other, err := FromBytes(bytes.NewReader(payload), Compression(100)) 149 | 150 | if err != nil { 151 | t.Error(err) 152 | } 153 | 154 | if other.Compression() != 42 { 155 | t.Errorf("Expected compression to be 42, got %f", other.Compression()) 156 | } 157 | } 158 | 159 | func TestLargeSerializaton(t *testing.T) { 160 | t1, err := New(Compression(10)) 161 | if err != nil { 162 | t.Error(err) 163 | } 164 | 165 | for i := 0; i < 100000; i++ { 166 | t1.AddWeighted(rand.Float64(), 1000000000) 167 | } 168 | 169 | serialized, _ := t1.AsBytes() 170 | serialized2 := t1.ToBytes(nil) 171 | if !reflect.DeepEqual(serialized, serialized2) { 172 | t.Error("serialized version differ") 173 | } 174 | 175 | t2, err := FromBytes(bytes.NewReader(serialized)) 176 | if err != nil { 177 | t.Error(err) 178 | } 179 | 180 | t3, _ := New() 181 | err = t3.FromBytes(serialized2) 182 | if err != nil { 183 | t.Error(err) 184 | } 185 | 186 | assertSerialization(t, t1, t2) 187 | assertSerialization(t, t1, t3) 188 | } 189 | 190 | func TestJavaSmallBytesCompat(t *testing.T) { 191 | // Base64 string generated via (<3 clojure): 192 | // (def t (com.tdunning.math.stats.AVLTreeDigest. 
100)) 193 | // (def r (java.util.Random.)) 194 | // (.setSeed r 0xDEADBEEF) 195 | // 196 | // (dotimes [x 100000] 197 | // (.add t (.nextDouble r))) 198 | // 199 | // (def buf (java.nio.ByteBuffer/allocate (.smallByteSize t))) 200 | // (.asSmallBytes t buf) 201 | // (.flip buf) 202 | // (.compress t) 203 | // (def serialized-tdigest (.encodeToString (java.util.Base64/getEncoder) (.array buf))) 204 | // 205 | // (println serialized-tdigest) 206 | 207 | serializedJavaTDigestB64 := "AAAAAkBZAAAAAAAAAAAEOzZpD1w24ySbN288eDfDHOI3jwpPN7jIyze1xXM2BzmuNc6x9DdUUcs2o1QFNvb5tzeNwTo2l0VYNgD89jaAiB83GxMBNTdLZzVjwOk3oKiDNxhS4jZ2blc2zTiiN8rlKDc7gN01HN5jNgF8bDYhIGo3BsH5NlbMcDdtCKQ3eJMUNzzuazQuLpY2y0lcNqNDdDcNDr03zOJ1N3ESMjcqxd42omxHNdA+mDbJmlo3KrIGN5i5/DegwGw10QY2NRuEmjdARF42g8qeN8L4yjajFVs1oIo9NvoNwDdrnuk2LeJGNwFHnTgGqu82TzfHN41Syzbd4xU2XjMVN1GPQjbMZOI2l91oNnY8CDdCy7U1wCuMNwLfyjfGDDo3FWWBNiEsSTiE9ZQ3rY03N6fEbDULhxU3i9qZNxuifjbeoMQ3vJ9mNpxU6jbvhEE3qOmYNrG09jcions3F6YRN5Ny1DUG5+E3P2m7NxXWSzd9PD03GBO9N+INZjczo844exOsNxmKIjgnk242m3GdNxrymzcJGSI1MVGaN6OzizblJ+43D9D/NvxA1Df2mZw339fFOB/KWzdN4WM4MhJoNpShjDfafXk3uSflN/uHhDeIUvI3ZOFqNqUkCDgokRU4VWp2Nzz1UjfigVQ4PHzkN8bWhzc21Kc3vOyQN8SJPjhEt344cC6EOAc/ZjfA9D04NZB9OAx8mzgsvD83oqOINzpg9jg9CWo4Y2qdN/r4XjbiH544DQY6OMvJSThcl+g4mnOyOKqdIzd91to4K72fODbsgjiPb2o4AmM6OIXueDhuDMs4PW1yN3ci2jhZGWY4aM5oOCDGwjiBKsk4FLcON7gbNTgr/zQ4e9V2N7qMkjiRTE04OiCKOA+kqDhK2u83jJvIN/P+6Tgw7v04voUSOExQKTgt8OU4ND0lN9CbPzhIfws4UJvSOBqgKzhe1TM4yVlsOLqRxjhsUw03lttXOEGkjTiTqns4kcOmOG2D5DgFx7847tKPONixTDhm8a84mAD1OFCXQzif2W84eVdkOPgJ+jjQy0g4a1HVOHLm7zjKP0k40bH4ODTQizj5Vn448ubuOQbg9TkIbEw4nuqfONUhyTktbsc48dTQOWSR7TkfJfU47iIfOQDP3jkN52Y5OibNOR1tRTk4XgM5JS+XODkp1DjNnOo4zOE/OTcKaDkfd4c5HTjLOTfMtzk1Tng5HH8aOTdpejlQok44yYMwN68whzgmY3o4kGHlOIRTqTkd2Jg45Dd0OHlnWzkEqtA5PENgOT6ckzlmuTQ5LPhpOL2F3zmPFVg5sPneOWfCETkZWu85KkV9ONN2zzlVKg85k3xqOYMdETkujEg5FZlSOYv3FznTwq05w571OXYNQDlfBkQ5NaiZOK74YzjPWAY5BSnsOUOhdTmCIsM5aphbOTm7cjmYQPw5WyLLOV8xQznRMgU5zm+AOb5MBDmEpF45lqbbOW3LNzlc5LI5ny6QObux3zmCqUY
5JJyxOXAibjm8mJA5zUCAOeW3Tznyf3w59LruOceUBzn0Gx05vTtsOfquPDoaISI58SiPOdEPpznD/cw5yU1bObG+/zm6Urs5vqXbOcfLwDmrd4s51P3sOhMXXTogDTA51iYXObgArDnGgzQ5/T2FOfi0RjndrFQ5y0IuOhXqcDorpag6RHR1Oixgxjnpq5Y5svRMOdkKbzmou5k5xefuOdiV6DooArU6SnueOkSo6ToPT2o6BvjmOgtxUjoXlSw6G2tQOfUe9jnksDw52dLDOi/y0TpDLTg6NmYNOgqIbTmzGkE52tMyOoJqNzqnDaQ6q5T1Opv6UTqQdX06g9A3OnfuGzqFI246hlrhOmJtZjpf25k6FCyPOhAMgDo1nyI6TLhWOoVgwzqXc746gJc8OlnKVjo/gk86iR5UOpq7PTqdzGY6fspZOp6HRjqEU7w6So02OiKgNToiGkg6aR5oOpHAEDpRg+M6TXTPOnXxkjq5Ct069YXFOvA6NDry/wY65ooNOtzuzDq+ECw6mx8DOpM6qzrBq7E6wn5oOsrlYDsOlM87DKunOw2+iDrzrio6xuE4OrWjPDqoohQ60tC2OvuLODrrWeE63cvKOtKc2jq/DwE6t3QKOrLbvDqrGOI6vpYpOv+cezslrTw7Gp54OxOm8zsWlYM7H8mYOz5/qzs1fHE7MGNuOzCnFDs1TVs7P/9NOx9ocTr+Wzs6+KXkOxOgZjrNhNA6gXVdOmH0LzqUolA6zwgZOw9UyTsD9I46/6KVOwVc4zsa2pc7CwuyOsUzETqLuas6mdYmOwWSnTswT4o7JX7qOwqBMzr5UBY66uyWOvW5NTsKjMk7JL4oOyJEiTsuq1Y7QQhEOzLQBjsjYeg7BRP5OvfqBTsPYUI7Lhs+O1NqIzuFBsU7eMaTOzuwvDsVjz47FheUOzc2FTtf4y87eSLlO4FCMDuE1iI7jx8lO6CXezuLCc87SkkEOw9XGzrUN/o60S9OOwRbLjs1xu87d3xMO32c8zuL04A7kPoQO4I88jtgMhk7PfEaOxILhDrxJ/g6+S+dOxZK4js70sU7ZAatO41myjuP8FU7iNOzO5IsQzuUZZ47anfmOzKbiDssSQo7RbdFOzZGbzsijbg7EdwqOzAoGzs/cx87c6CdO6fSjju/2mE7s2PuO7BVFDuvFvU7uGxCO6KcTzuFdpY7gW0tO1MZejs+0u87bhNOO5hzHDuuqfU7puR+O27/RjsnXA87FRP2OyNyRTtbJ4A7oHBjO6aMVTu2P3I7xGZWO8iz2TvO9Ns7xOtWO6PXxjt9d/47cPPYO31G+Ds3erY6yE0aOk8xCDpGySk6qq7zOvjgZTsgTBs7NM8GOx3z6Tsq29w7NmsfO0Hreztu3xY7dk9WO3TTGDtiz0s7ZOMLO46IeDuO1vk7bpo/O25jnDt3Vek7lJ5YO5mNajtsaXA7Fv+0Ou/dCTsJbAs7PTjsO22GJzt2eTk7XRjBO340CzuCQOw7clTLO19aJDs0gkk7EHVbOx2iqTsiMhI7I3XXOxQtAjsXZlM7JfrQOzPyIjtHLgk7fNvUO4nBdDuOYZI7hCk6O2l7ITtjevg7WwaEO1/S6TtFPiY7MW19O2bjLzt03jI7gsFbO4PWBDuGU9U7noVhO7Hg3zufXSU7gVMiO1o9xztFxDo7OaUcO2D1wztT9/87XyIcO0mFPzszGc47Kht5Oz23WjtmjYg7gKOVO3IZfDtIzFg7GVHmOuB6KDrOLOQ7CF6POyzH1TsuH3Q7OM/DO0xskTtuJHQ7cMB/O4bPzzuLdOU7h2SsO2f2xDs8IKw7JR0+Oyvwhjsae7E7DcaqOvnPADrXxC063tLlOvY0zTsTxUI7IkbcOzkVzzs/RpM7M1ZDOzBmnjs87aU7VcmTO0DL/zsbCFE69oQyOtABijrpNtM69c8UOu225DqbxxA6azZrOlKXRDp7vwE6p0qUOqxh8zqs7JE6rdnqOtzxNzsOlb4
7FRw2OxtFIDr6nlQ6yv8ROqncETqkc4I6m2kEOmU4oTp3bec6roWNOtkwujq4hkw6iT+5OnlU+DpkEhQ6aw67Opyl9DqM79c6S3J2Ofz8rDn30cA6HOhuOgXIsToEZQQ5znUNOgH78zoTKKU6YLyIOnWfAjqSDVk6hWPrOnGUwDpqn3c6eY3AOlo1SDqC9kU6bS35OoHBLzp8WgI6So6gOgZ8fzo4Ibw6elzBOo5ZVTp5wSU6d0e+OnCgezpxp1M6NoEzOiXlxzpUG4o6ZLCKOmLp9Tot5oU6Ja0OOhfxAzo4qNI6O9+0OjMq5zorV7w6Gv+gOhnd8zn7P1o6IRscOhLPqDngEEg5uIZJOdmxAToNmYA6Gm3GOmg3sjpCPz06WyRCOmGz3zovg7050RuqOfX3WDol3yg6ZUH8OjrgyzoIwlI6H9qmOhtrGTovTtc6SJ3rOkDdRzoYcX46D3P0OhFudToCa2A6BWyTOaegSzlmUn05X3MqOY5bPjnGv7Q55MBkOil2NDo4lt86Ro6YOlW4mjob5ig5wtjpOZheyTmLg/c5iuH1OYyCVjk+Om85JLQMOZCxJjmP6ro5kcX3OfVvATnGgWE5haCUOP8UPDk8C6Q5iKLHOUrz4zk0mMw5WFZuOUwQ6jl2t2A5qF4BOZp7xzmMEhU5yBgqOdWqmDnmxrg5e4YkOUTKvzkklbs42a5MOTN8WTkWOKU5YfgsOUAgnjmPd1g5h/cCOWbI1TmTGvQ5c2+oOYGlXTlz5jU5WuZmOUBj6Tka3hs5kuGEOUJ64DlaTDI5Qx6wOWLktTk6UT44uJu4OSVojjlryTM5HOuzOKzKFjkzpaA5IKMzOU6a+TkZ+fo4/z9xOJ4zlDlRXMw5JFNkOZ/S5Dm+nbg5mNQ2OTYfzjkIhDY5BF12OWhzpzl1yzU5L8odOUxnIDmfni05O9aXOM5YXzkHdR85Z1vyOVD4LDknGEU5AO+mOUFl9jj6flw4qTKoOMJxbDkZ23s45egCOSXNVzlR/6A5YzwWOIiTmzhDuiM4ckLKOGlfETiwlGw4nTLiOQarEzkghTg435iYOTcOYTk6mxE417BqOShVoTjZ/vA5EbZqOMiXfDj1EMo4qezxOG1Ozjixiu04dG/sOCBpCzihxbA4yQo4OK0e0jizeBg5NRWpOPpimziANrc4EZ7jOIeAXziW4aY5BGPgOOxqLzifzCM4JEU7OMG5kjht8D44Jl+qODrHFzgVIbg4mPQ2OCIORjb5Cp84e6X0OADpBDibM2E3YKiFNuQmfTiYFC04i3bKOL4ELjigoew4oYQDOJAK0Th/KW04Zp6VONEgiTjUZOo4jJV6OBFkKzeaXUc4GJisOBQsOjgCZYc39MHDOBTM6DiVqi84KANtOEaOoTiK1ic4DO70N+rSvDhFJ9Q3gn3TOHXRdTiQZXk4CJwPN8TvgzfSFxo4Gw8UN4leLTepicc39wbUOHg9JzgW8tY4bOD7OJ9hjzhJIeE4G7+IOCht/zhuOXM3EV0eOFMPMzcGRI84uDUbN458SjXUYcg3hQD/N6go1jezTCs3jAi9OA1kFDcSnxM3CklMN3sxdjbrFtQ31mLFN/TNLjdi2XI3qtLMOBHGRDgWdfk3wzlgN0BOqzc4DU43H7m6NpuDhzUjc3g3tgLNNwsZ2ja8xdE3cqBUNXhSZjgJ3V44SLYWN6ywljbIXyo4ALH1N4FO5Tdk/u032vrrNpsb/jb7F3k2lZaBN0W5CTdzvwo4K1P2NmooDzhI1os4IJJZN/xf4DddXcM3adDqN/JMMjgE5Ww3CbgmNzxbkzMyJWI3Jmn+NomgODatgGs3skvFNuHb1ze65og2929+NrFZrjcCh5c2wlG+NqssVDgEloc3RZefN1no3DatxX03KevuNrzoSzgIf/UyKB8LNvfYhjeIRBY3FDw7NkHG1zbsOyw3QEX0N3PYhDa2y0Q2Ew3jNrJhZDhi3UE1jsnzNALBbzbCKuU
20kNGNxgSUDW1cus29u8qNvHROjbGJ+U2d2R4NqmGPDh5qFE2kj5gNWP33zZqkM43sf31NzEY+DZsWkQ2qJYDNqUvJDYwDfQ2le0fNvwp2TZtVbk3V1A4NT3PtDd6xmY4FhDEN3OfxjfwGV4238DDNoB/qzaIS3E3Jz+iNmZ4nDeLlU43qQX/N6VXuzU+6Bk2N4WXNeQWsTcpydI3j8D1NrSMsjZRwxU23KKANyvmiTaKHrQ3zzqsNf/SMTchLt411j5sN1t3rDWBMKw21n5rOBeltQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQECAQEBAgEBAQEBAQEBAgICAgEBAgIBAwMBAgIBAQIDAQICAQIBAQEBAwIBAgEBAgMDAwIFAQIDAgIBBAQDAgICAQIDBAMEBAIFAgICAwUHAgIEBgYGAggDBgYCBgQEAQUDBgQDAwoCAgQEAwUCAQgIBQIKBQgEBwQNCQIGBwkFBA4MCQUFBwIICAYHBA0FBQkPEQsLDAkQEA4OCxMRDhMECggIFBMQGhQUExIECQYIChELDA4TERYMDSEbEhEKDhQbEQ8PLSobGBkQCBEPFB4RFCYbIjIcHxwhFhUjHQ8UGCckJSkmIjEmOCsiMScsHy0hHxspODcmKi4/MxcvR1RHMyQoMBotOkpXPiw4OkA/KyE3SlE3JyAygQGUAX+CAU1tYGZpXzYvQUpWd3tKPltvdGF+ckxKOUZ5T1BMap4ByAGuAa4BsgGjAYsBbHufAXOxAeUB2gHpAaUBkQGHAaYBygHJAb0BrwGgAZkBoAF6iwGdAeQB+wHfAdMB3AGfArwChQL7AZ8CpgKYArwB1QG2Ac8BhAFSXnirAeMBwQGmAeAB8wGkAW9ojQH4AbwCgQLPAZoBrAGzAfgBgwL7AZkCqgL8AdEBwAHHAeMBgALiArID1gLrAdkB+gG7AoQDiQOgA60D1QP+A/8CggKzAY0BrAHhAd8C4wKaA8YDnAP1AtkCjQK9AaUBzQGDAtwClwO5A5IDmAOHBKoDygLxAb8CugKSAtQB9gGIArMCvQOeBN0ErgSIBJ8EsQSxA/UChAOdAtICpwOZBJgE1AOhAtsB5gGWApsD8wOFBKgEggWFBfEEmQTLA/ICgQPqAswBWj5kuAHiAaUC/AHfAb8CrALSAvgChAPRAswCgAPGA4MD2QLSAp4DgATQA44CtgGfAfgBzwLiAswC6gKLA4sD5wLEAucB7gHrAfcB9AHiAegB+QGMAuAChgOxA5AD0QLiArEC3wK7AoYCpgL7ApMDnwOpA6sDggSdBKQD+gLIAqgCxgLPAtoCvgKWAvUBhgLHAoYD2wLEAvcBxAGdAbsB7QGCApQCmALJAsUCiQOpA68D/AKSAvwB9wGHAukBugGpAaUBngG6AfwBiwKUApsClQKmArECsAKZAuEBkgGsAccBxAGLAVNFWm6PAYoBe5cBwQHhAe0BzgHLAYYBggGbAVhThAGaAaEBhQFJW1p5iQFhOic2NS0mHjJAYGpsYVZcR2VsWF5JOTBKa3hUWGdSSFJXS1NAL0VRQz83LTQ2QDYjFSs+VEpabVQzJjNAWkM1LT5MPVBDKyw5IxcUFyEgPkNNTUUxKRwfHh4PGBkZKSUZEwwZHRcXDxUeGhgdKiYaHhEQCxISEh0REyAdGhoYDxMWFhIWHhEIChQNEA0TDBMMCQ8QEx8mGwsQERcWERcRDQ8TFg4REhILBxMJCQ4UEQQDCAQFDg0EERQLFhIHDA4NBgcKAwQLCQMMEQYDBwcIEAwKBgkFAwQHCAIBCAUDCAEJCwcLBwoFBgYGBQIEBQMCAgMJAwkCAwUBAwMGAwMDAgUFAgEDBgYHBAYBAwcBBQIBBgMCAgYBAQQCAwMBAgMEAgMDAQEBAQEDAQEDAgIBAgECAwECAQMDAwECAwICAQMCAQEBAQEBAgICAQIBAQI
CAQEBAQEBAQICAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQE=" 208 | 209 | tdigestAsBytes, err := base64.StdEncoding.DecodeString(serializedJavaTDigestB64) 210 | 211 | if err != nil { 212 | t.Fatal(err.Error()) 213 | } 214 | 215 | tdigest, err := FromBytes(bytes.NewReader(tdigestAsBytes)) 216 | 217 | if err != nil { 218 | t.Fatal(err.Error()) 219 | } 220 | 221 | if tdigest.Count() != 100000 { 222 | t.Fatalf("Expected deserialized t-digest to have a count of 100_000. Got %d", tdigest.Count()) 223 | } 224 | 225 | assertDifferenceSmallerThan(tdigest, 0.5, 0.02, t) 226 | assertDifferenceSmallerThan(tdigest, 0.1, 0.01, t) 227 | assertDifferenceSmallerThan(tdigest, 0.9, 0.01, t) 228 | assertDifferenceSmallerThan(tdigest, 0.01, 0.005, t) 229 | assertDifferenceSmallerThan(tdigest, 0.99, 0.005, t) 230 | assertDifferenceSmallerThan(tdigest, 0.001, 0.001, t) 231 | assertDifferenceSmallerThan(tdigest, 0.999, 0.001, t) 232 | } 233 | 234 | func BenchmarkAsBytes(b *testing.B) { 235 | b.ReportAllocs() 236 | 237 | t1, _ := New(Compression(100)) 238 | for i := 0; i < 100; i++ { 239 | t1.Add(rand.Float64()) 240 | } 241 | 242 | b.ResetTimer() 243 | 244 | for n := 0; n < b.N; n++ { 245 | t1.AsBytes() 246 | } 247 | } 248 | 249 | func BenchmarkToBytes(b *testing.B) { 250 | b.ReportAllocs() 251 | 252 | t1, _ := New(Compression(100)) 253 | for i := 0; i < 100; i++ { 254 | t1.Add(rand.Float64()) 255 | } 256 | 257 | b.ResetTimer() 258 | var buf []byte 259 | for n := 0; n < b.N; n++ { 260 | buf = t1.ToBytes(buf) 261 | } 262 | } 263 | 264 | func BenchmarkFromBytes(b *testing.B) { 265 | b.ReportAllocs() 266 | 267 | t1, _ := New(Compression(100)) 268 | for i := 0; i < 100; i++ { 269 | t1.Add(rand.Float64()) 270 | } 271 | 272 | buf, _ := t1.AsBytes() 273 | reader := bytes.NewReader(buf) 274 | 275 | b.ResetTimer() 276 | for n := 0; n < b.N; n++ { 277 | reader.Reset(buf) 278 | FromBytes(reader) 279 | } 280 | } 281 | 282 | func BenchmarkFromBytesMethod(b 
*testing.B) { 283 | b.ReportAllocs() 284 | 285 | t1, _ := New(Compression(100)) 286 | for i := 0; i < 100; i++ { 287 | t1.Add(rand.Float64()) 288 | } 289 | 290 | buf, _ := t1.AsBytes() 291 | 292 | b.ResetTimer() 293 | var t2 TDigest 294 | for n := 0; n < b.N; n++ { 295 | t2.FromBytes(buf) 296 | } 297 | } 298 | -------------------------------------------------------------------------------- /summary.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "sort" 7 | ) 8 | 9 | type summary struct { 10 | means []float64 11 | counts []uint64 12 | } 13 | 14 | func newSummary(initialCapacity int) *summary { 15 | s := &summary{ 16 | means: make([]float64, 0, initialCapacity), 17 | counts: make([]uint64, 0, initialCapacity), 18 | } 19 | return s 20 | } 21 | 22 | func (s *summary) Len() int { 23 | return len(s.means) 24 | } 25 | 26 | func (s *summary) Add(key float64, value uint64) error { 27 | if math.IsNaN(key) { 28 | return fmt.Errorf("key must not be NaN") 29 | } 30 | if value == 0 { 31 | return fmt.Errorf("Count must be >0") 32 | } 33 | 34 | idx := s.findInsertionIndex(key) 35 | 36 | s.means = append(s.means, math.NaN()) 37 | s.counts = append(s.counts, 0) 38 | 39 | copy(s.means[idx+1:], s.means[idx:]) 40 | copy(s.counts[idx+1:], s.counts[idx:]) 41 | 42 | s.means[idx] = key 43 | s.counts[idx] = value 44 | 45 | return nil 46 | } 47 | 48 | // Always insert to the right 49 | func (s *summary) findInsertionIndex(x float64) int { 50 | // Binary search is only worthwhile if we have a lot of keys. 51 | if len(s.means) < 250 { 52 | for i, mean := range s.means { 53 | if mean > x { 54 | return i 55 | } 56 | } 57 | return len(s.means) 58 | } 59 | 60 | return sort.Search(len(s.means), func(i int) bool { 61 | return s.means[i] > x 62 | }) 63 | } 64 | 65 | // This method is the hotspot when calling Add(), which in turn is called by 66 | // Compress() and Merge(). 
67 | func (s *summary) HeadSum(idx int) (sum float64) { 68 | return float64(sumUntilIndex(s.counts, idx)) 69 | } 70 | 71 | func (s *summary) Floor(x float64) int { 72 | return s.findIndex(x) - 1 73 | } 74 | 75 | func (s *summary) findIndex(x float64) int { 76 | // Binary search is only worthwhile if we have a lot of keys. 77 | if len(s.means) < 250 { 78 | for i, mean := range s.means { 79 | if mean >= x { 80 | return i 81 | } 82 | } 83 | return len(s.means) 84 | } 85 | 86 | return sort.Search(len(s.means), func(i int) bool { 87 | return s.means[i] >= x 88 | }) 89 | } 90 | 91 | func (s *summary) Mean(uncheckedIndex int) float64 { 92 | return s.means[uncheckedIndex] 93 | } 94 | 95 | func (s *summary) Count(uncheckedIndex int) uint64 { 96 | return s.counts[uncheckedIndex] 97 | } 98 | 99 | // return the index of the last item which the sum of counts 100 | // of items before it is less than or equal to `sum`. -1 in 101 | // case no centroid satisfies the requirement. 102 | // Since it's cheap, this also returns the `HeadSum` until 103 | // the found index (i.e. 
cumSum = HeadSum(FloorSum(x))) 104 | func (s *summary) FloorSum(sum float64) (index int, cumSum float64) { 105 | index = -1 106 | for i, count := range s.counts { 107 | if cumSum <= sum { 108 | index = i 109 | } else { 110 | break 111 | } 112 | cumSum += float64(count) 113 | } 114 | if index != -1 { 115 | cumSum -= float64(s.counts[index]) 116 | } 117 | return index, cumSum 118 | } 119 | 120 | func (s *summary) setAt(index int, mean float64, count uint64) { 121 | s.means[index] = mean 122 | s.counts[index] = count 123 | s.adjustRight(index) 124 | s.adjustLeft(index) 125 | } 126 | 127 | func (s *summary) adjustRight(index int) { 128 | for i := index + 1; i < len(s.means) && s.means[i-1] > s.means[i]; i++ { 129 | s.means[i-1], s.means[i] = s.means[i], s.means[i-1] 130 | s.counts[i-1], s.counts[i] = s.counts[i], s.counts[i-1] 131 | } 132 | } 133 | 134 | func (s *summary) adjustLeft(index int) { 135 | for i := index - 1; i >= 0 && s.means[i] > s.means[i+1]; i-- { 136 | s.means[i], s.means[i+1] = s.means[i+1], s.means[i] 137 | s.counts[i], s.counts[i+1] = s.counts[i+1], s.counts[i] 138 | } 139 | } 140 | 141 | func (s *summary) ForEach(f func(float64, uint64) bool) { 142 | for i, mean := range s.means { 143 | if !f(mean, s.counts[i]) { 144 | break 145 | } 146 | } 147 | } 148 | 149 | func (s *summary) Perm(rng RNG, f func(float64, uint64) bool) { 150 | for _, i := range perm(rng, s.Len()) { 151 | if !f(s.means[i], s.counts[i]) { 152 | break 153 | } 154 | } 155 | } 156 | 157 | func (s *summary) Clone() *summary { 158 | return &summary{ 159 | means: append([]float64{}, s.means...), 160 | counts: append([]uint64{}, s.counts...), 161 | } 162 | } 163 | 164 | // Randomly shuffles summary contents, so they can be added to another summary 165 | // with being pathological. Renders summary invalid. 
166 | func (s *summary) shuffle(rng RNG) { 167 | for i := len(s.means) - 1; i > 1; i-- { 168 | s.Swap(i, rng.Intn(i+1)) 169 | } 170 | } 171 | 172 | // for sort.Interface 173 | func (s *summary) Swap(i, j int) { 174 | s.means[i], s.means[j] = s.means[j], s.means[i] 175 | s.counts[i], s.counts[j] = s.counts[j], s.counts[i] 176 | } 177 | 178 | func (s *summary) Less(i, j int) bool { 179 | return s.means[i] < s.means[j] 180 | } 181 | 182 | // A simple loop unroll saves a surprising amount of time. 183 | func sumUntilIndex(s []uint64, idx int) uint64 { 184 | var cumSum uint64 185 | var i int 186 | for i = idx - 1; i >= 3; i -= 4 { 187 | cumSum += uint64(s[i]) 188 | cumSum += uint64(s[i-1]) 189 | cumSum += uint64(s[i-2]) 190 | cumSum += uint64(s[i-3]) 191 | } 192 | for ; i >= 0; i-- { 193 | cumSum += uint64(s[i]) 194 | } 195 | return cumSum 196 | } 197 | 198 | func perm(rng RNG, n int) []int { 199 | m := make([]int, n) 200 | for i := 1; i < n; i++ { 201 | j := rng.Intn(i + 1) 202 | m[i] = m[j] 203 | m[j] = i 204 | } 205 | return m 206 | } 207 | -------------------------------------------------------------------------------- /summary_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | "sort" 7 | "testing" 8 | ) 9 | 10 | func TestBasics(t *testing.T) { 11 | s := newSummary(2) 12 | 13 | err := s.Add(1, 1) 14 | 15 | if err != nil { 16 | t.Errorf("Failed to add simple item") 17 | } 18 | 19 | if s.Add(math.NaN(), 1) == nil { 20 | t.Errorf("Adding math.NaN() shouldn't be allowed") 21 | } 22 | 23 | if s.Add(1, 0) == nil { 24 | t.Errorf("Adding count=0 shouldn't be allowed") 25 | } 26 | } 27 | 28 | func checkSorted(s *summary, t *testing.T) { 29 | if !sort.Float64sAreSorted(s.means) { 30 | t.Fatalf("Keys are not sorted! 
%v", s.means) 31 | } 32 | } 33 | 34 | func TestCore(t *testing.T) { 35 | 36 | testData := make(map[float64]uint64) 37 | 38 | const maxDataSize = 10000 39 | s := newSummary(maxDataSize) 40 | checkSorted(s, t) 41 | 42 | if s.Len() != 0 { 43 | t.Errorf("Initial size should be zero regardless of capacity. Got %d", s.Len()) 44 | } 45 | 46 | // construct a summary made of unique items only 47 | for i := 0; i < maxDataSize; i++ { 48 | k := rand.Float64() 49 | v := rand.Uint64() 50 | 51 | _, exists := testData[k] 52 | if !exists { 53 | _ = s.Add(k, v) 54 | testData[k] = v 55 | } 56 | } 57 | 58 | checkSorted(s, t) 59 | 60 | if s.Len() != len(testData) { 61 | t.Errorf("Got Len() == %d. Expected %d", s.Len(), len(testData)) 62 | } 63 | 64 | for k, v := range testData { 65 | i := s.findIndex(k) 66 | 67 | if i == s.Len() { 68 | t.Errorf("Couldn't find previously added key on summary") 69 | continue 70 | } 71 | 72 | if s.means[i] != k || s.counts[i] != v { 73 | t.Errorf("Wanted to find {%.4f,%d}, but found {%.4f,%d} instead", k, v, s.means[i], s.counts[i]) 74 | } 75 | } 76 | } 77 | 78 | func TestSetAtNeverBreaksSorting(t *testing.T) { 79 | s := newSummary(10) 80 | 81 | for _, i := range []float64{10, 10, 10, 10, 10} { 82 | _ = s.Add(i, 1) 83 | } 84 | 85 | s.setAt(0, 30, 1) 86 | checkSorted(s, t) 87 | 88 | s.setAt(s.Len()-1, 0, 1) 89 | checkSorted(s, t) 90 | 91 | s.setAt(3, 10.1, 1) 92 | checkSorted(s, t) 93 | 94 | s.setAt(3, 9.9, 1) 95 | checkSorted(s, t) 96 | 97 | } 98 | 99 | func TestForEach(t *testing.T) { 100 | 101 | s := newSummary(10) 102 | for _, i := range []uint64{1, 2, 3, 4, 5, 6} { 103 | _ = s.Add(float64(i), i*10) 104 | } 105 | 106 | c := 0 107 | s.ForEach(func(mean float64, count uint64) bool { 108 | c++ 109 | return false 110 | }) 111 | 112 | if c != 1 { 113 | t.Errorf("ForEach must exit early if the closure returns false") 114 | } 115 | 116 | var tot uint64 117 | s.ForEach(func(mean float64, count uint64) bool { 118 | tot += count 119 | return true 120 | }) 121 | 
122 | if tot != 210 { 123 | t.Errorf("ForEach must walk through the whole data if it always returns true") 124 | } 125 | } 126 | 127 | func TestFloorSum(t *testing.T) { 128 | s := newSummary(100) 129 | var total uint64 130 | for i := 0; i < 100; i++ { 131 | count := uint64(rand.Intn(10)) + 1 132 | _ = s.Add(rand.Float64(), count) 133 | total += count 134 | } 135 | 136 | idx, _ := s.FloorSum(-1) 137 | if idx != -1 { 138 | t.Errorf("Expected no centroid to satisfy -1 but got index=%d", idx) 139 | } 140 | 141 | for i := float64(0); i < float64(total)+10; i++ { 142 | node, _ := s.FloorSum(i) 143 | if s.HeadSum(node) > i { 144 | t.Errorf("headSum(%d)=%.0f (>%.0f)", node, s.HeadSum(node), i) 145 | } 146 | if node+1 < s.Len() && s.HeadSum(node+1) <= i { 147 | t.Errorf("headSum(%d)=%.0f (>%.0f)", node+1, s.HeadSum(node+1), i) 148 | } 149 | } 150 | } 151 | 152 | func TestFloor(t *testing.T) { 153 | s := newSummary(200) 154 | for i := float64(0); i < 101; i++ { 155 | _ = s.Add(i/2.0, 1) 156 | } 157 | 158 | if s.Floor(-30) != -1 { 159 | t.Errorf("Shouldn't have found a floor index. Got %d", s.Floor(-30)) 160 | } 161 | 162 | for i := 0; i < s.Len(); i++ { 163 | m := s.means[i] 164 | f := s.means[s.Floor(m+0.1)] 165 | if m != f { 166 | t.Errorf("Erm, %.4f != %.4f", m, f) 167 | } 168 | } 169 | } 170 | 171 | func TestAdjustLeftRight(t *testing.T) { 172 | 173 | keys := []float64{1, 2, 3, 4, 9, 5, 6, 7, 8} 174 | counts := []uint64{1, 2, 3, 4, 9, 5, 6, 7, 8} 175 | 176 | s := summary{means: keys, counts: counts} 177 | 178 | s.adjustRight(4) 179 | 180 | if !sort.Float64sAreSorted(s.means) || s.counts[4] != 5 { 181 | t.Errorf("adjustRight should have fixed the keys/counts state. 
%v %v", s.means, s.counts) 182 | } 183 | 184 | keys = []float64{1, 2, 3, 4, 0, 5, 6, 7, 8} 185 | counts = []uint64{1, 2, 3, 4, 0, 5, 6, 7, 8} 186 | 187 | s = summary{means: keys, counts: counts} 188 | s.adjustLeft(4) 189 | 190 | if !sort.Float64sAreSorted(s.means) || s.counts[4] != 4 { 191 | t.Errorf("adjustLeft should have fixed the keys/counts state. %v %v", s.means, s.counts) 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /tdigest.go: -------------------------------------------------------------------------------- 1 | // Package tdigest provides a highly accurate mergeable data-structure 2 | // for quantile estimation. 3 | // 4 | // Typical T-Digest use cases involve accumulating metrics on several 5 | // distinct nodes of a cluster and then merging them together to get 6 | // a system-wide quantile overview. Things such as: sensory data from 7 | // IoT devices, quantiles over enormous document datasets (think 8 | // ElasticSearch), performance metrics for distributed systems, etc. 9 | // 10 | // After you create (and configure, if desired) the digest: 11 | // 12 | // digest, err := tdigest.New(tdigest.Compression(100)) 13 | // 14 | // You can then use it for registering measurements: 15 | // 16 | // digest.Add(number) 17 | // 18 | // Estimating quantiles: 19 | // 20 | // digest.Quantile(0.99) 21 | // 22 | // And merging with another digest: 23 | // 24 | // digest.Merge(otherDigest) 25 | package tdigest 26 | 27 | import ( 28 | "fmt" 29 | "math" 30 | ) 31 | 32 | // TDigest is a quantile approximation data structure. 33 | type TDigest struct { 34 | summary *summary 35 | compression float64 36 | count uint64 37 | rng RNG 38 | } 39 | 40 | // New creates a new digest. 41 | // 42 | // By default the digest is constructed with a configuration that 43 | // should be useful for most use-cases. It comes with compression 44 | // set to 100 and uses a local random number generator for 45 | // performance reasons. 
46 | func New(options ...tdigestOption) (*TDigest, error) { 47 | tdigest, err := newWithoutSummary(options...) 48 | 49 | if err != nil { 50 | return nil, err 51 | } 52 | 53 | tdigest.summary = newSummary(estimateCapacity(tdigest.compression)) 54 | return tdigest, nil 55 | } 56 | 57 | // Creates a tdigest instance without allocating a summary. 58 | func newWithoutSummary(options ...tdigestOption) (*TDigest, error) { 59 | tdigest := &TDigest{ 60 | compression: 100, 61 | count: 0, 62 | } 63 | 64 | for _, option := range options { 65 | err := option(tdigest) 66 | if err != nil { 67 | return nil, err 68 | } 69 | } 70 | 71 | if tdigest.rng == nil { 72 | tdigest.rng = newLocalRNG(1) 73 | } 74 | 75 | return tdigest, nil 76 | } 77 | 78 | func _quantile(index float64, previousIndex float64, nextIndex float64, previousMean float64, nextMean float64) float64 { 79 | delta := nextIndex - previousIndex 80 | previousWeight := (nextIndex - index) / delta 81 | nextWeight := (index - previousIndex) / delta 82 | return previousMean*previousWeight + nextMean*nextWeight 83 | } 84 | 85 | // Compression returns the TDigest compression. 86 | func (t *TDigest) Compression() float64 { 87 | return t.compression 88 | } 89 | 90 | // Quantile returns the desired percentile estimation. 91 | // 92 | // Values of p must be between 0 and 1 (inclusive), will panic otherwise. 
93 | func (t *TDigest) Quantile(q float64) float64 { 94 | if q < 0 || q > 1 { 95 | panic("q must be between 0 and 1 (inclusive)") 96 | } 97 | 98 | if t.summary.Len() == 0 { 99 | return math.NaN() 100 | } else if t.summary.Len() == 1 { 101 | return t.summary.Mean(0) 102 | } 103 | 104 | index := q * float64(t.count-1) 105 | previousMean := math.NaN() 106 | previousIndex := float64(0) 107 | next, total := t.summary.FloorSum(index) 108 | 109 | if next > 0 { 110 | previousMean = t.summary.Mean(next - 1) 111 | previousIndex = total - float64(t.summary.Count(next-1)+1)/2 112 | } 113 | 114 | for { 115 | nextIndex := total + float64(t.summary.Count(next)-1)/2 116 | if nextIndex >= index { 117 | if math.IsNaN(previousMean) { 118 | // the index is before the 1st centroid 119 | if nextIndex == previousIndex { 120 | return t.summary.Mean(next) 121 | } 122 | // assume linear growth 123 | nextIndex2 := total + float64(t.summary.Count(next)) + float64(t.summary.Count(next+1)-1)/2 124 | previousMean = (nextIndex2*t.summary.Mean(next) - nextIndex*t.summary.Mean(next+1)) / (nextIndex2 - nextIndex) 125 | } 126 | // common case: two centroids found, the result in in between 127 | return _quantile(index, previousIndex, nextIndex, previousMean, t.summary.Mean(next)) 128 | } else if next+1 == t.summary.Len() { 129 | // the index is after the last centroid 130 | nextIndex2 := float64(t.count - 1) 131 | nextMean2 := (t.summary.Mean(next)*(nextIndex2-previousIndex) - previousMean*(nextIndex2-nextIndex)) / (nextIndex - previousIndex) 132 | return _quantile(index, nextIndex, nextIndex2, t.summary.Mean(next), nextMean2) 133 | } 134 | total += float64(t.summary.Count(next)) 135 | previousMean = t.summary.Mean(next) 136 | previousIndex = nextIndex 137 | next++ 138 | } 139 | // unreachable 140 | } 141 | 142 | // boundedWeightedAverage computes the weighted average of two 143 | // centroids guaranteeing that the result will be between x1 and x2, 144 | // inclusive. 
145 | // 146 | // Refer to https://github.com/caio/go-tdigest/pull/19 for more details 147 | func boundedWeightedAverage(x1 float64, w1 float64, x2 float64, w2 float64) float64 { 148 | if x1 > x2 { 149 | x1, x2, w1, w2 = x2, x1, w2, w1 150 | } 151 | result := (x1*w1 + x2*w2) / (w1 + w2) 152 | return math.Max(x1, math.Min(result, x2)) 153 | } 154 | 155 | // AddWeighted registers a new sample in the digest. 156 | // 157 | // It's the main entry point for the digest and very likely the only 158 | // method to be used for collecting samples. The count parameter is for 159 | // when you are registering a sample that occurred multiple times - the 160 | // most common value for this is 1. 161 | // 162 | // This will emit an error if `value` is NaN or if `count` is zero. 163 | func (t *TDigest) AddWeighted(value float64, count uint64) (err error) { 164 | if count == 0 { 165 | return fmt.Errorf("illegal datapoint ", value, count) 166 | } 167 | 168 | if t.summary.Len() == 0 { 169 | err = t.summary.Add(value, count) 170 | t.count = uint64(count) 171 | return err 172 | } 173 | 174 | begin := t.summary.Floor(value) 175 | if begin == -1 { 176 | begin = 0 177 | } 178 | 179 | begin, end := t.findNeighbors(begin, value) 180 | 181 | closest := t.chooseMergeCandidate(begin, end, count) 182 | 183 | if closest == t.summary.Len() { 184 | err = t.summary.Add(value, count) 185 | if err != nil { 186 | return err 187 | } 188 | } else { 189 | c := float64(t.summary.Count(closest)) 190 | newMean := boundedWeightedAverage(t.summary.Mean(closest), c, value, float64(count)) 191 | t.summary.setAt(closest, newMean, uint64(c)+count) 192 | } 193 | t.count += uint64(count) 194 | 195 | if float64(t.summary.Len()) > 20*t.compression { 196 | err = t.Compress() 197 | } 198 | 199 | return err 200 | } 201 | 202 | // Count returns the total number of samples this digest represents 203 | // 204 | // The result represents how many times Add() was called on a digest 205 | // plus how many samples the digests 
it has been merged with had. 206 | // This is useful mainly for two scenarios: 207 | // 208 | // - Knowing if there is enough data so you can trust the quantiles 209 | // 210 | // - Knowing if you've registered too many samples already and 211 | // deciding what to do about it. 212 | // 213 | // For the second case one approach would be to create a side empty 214 | // digest and start registering samples on it as well as on the old 215 | // (big) one and then discard the bigger one after a certain criterion 216 | // is reached (say, minimum number of samples or a small relative 217 | // error between new and old digests). 218 | func (t TDigest) Count() uint64 { 219 | return t.count 220 | } 221 | 222 | // Add is an alias for AddWeighted(x,1) 223 | // Read the documentation for AddWeighted for more details. 224 | func (t *TDigest) Add(value float64) error { 225 | return t.AddWeighted(value, 1) 226 | } 227 | 228 | // Compress tries to reduce the number of individual centroids stored 229 | // in the digest. 230 | // 231 | // Compression trades off accuracy for performance and happens 232 | // automatically after a certain amount of distinct samples have been 233 | // stored. 234 | // 235 | // At any point in time you may call Compress on a digest, but you 236 | // may completely ignore this and it will compress itself automatically 237 | // after it grows too much. If you are minimizing network traffic 238 | // it might be a good idea to compress before serializing. 239 | func (t *TDigest) Compress() (err error) { 240 | if t.summary.Len() <= 1 { 241 | return nil 242 | } 243 | 244 | oldTree := t.summary 245 | t.summary = newSummary(estimateCapacity(t.compression)) 246 | t.count = 0 247 | 248 | oldTree.shuffle(t.rng) 249 | oldTree.ForEach(func(mean float64, count uint64) bool { 250 | err = t.AddWeighted(mean, count) 251 | return err == nil 252 | }) 253 | return err 254 | } 255 | 256 | // Merge joins a given digest into itself. 
257 | // 258 | // Merging is useful when you have multiple TDigest instances running 259 | // in separate threads and you want to compute quantiles over all the 260 | // samples. This is particularly important on a scatter-gather/map-reduce 261 | // scenario. 262 | func (t *TDigest) Merge(other *TDigest) (err error) { 263 | if other.summary.Len() == 0 { 264 | return nil 265 | } 266 | 267 | other.summary.Perm(t.rng, func(mean float64, count uint64) bool { 268 | err = t.AddWeighted(mean, count) 269 | return err == nil 270 | }) 271 | return err 272 | } 273 | 274 | // MergeDestructive joins a given digest into itself rendering 275 | // the other digest invalid. 276 | // 277 | // This works as Merge above but its faster. Using this method 278 | // requires caution as it makes 'other' useless - you must make 279 | // sure you discard it without making further uses of it. 280 | func (t *TDigest) MergeDestructive(other *TDigest) (err error) { 281 | if other.summary.Len() == 0 { 282 | return nil 283 | } 284 | 285 | other.summary.shuffle(t.rng) 286 | other.summary.ForEach(func(mean float64, count uint64) bool { 287 | err = t.AddWeighted(mean, count) 288 | return err == nil 289 | }) 290 | return err 291 | } 292 | 293 | // CDF computes the fraction in which all samples are less than 294 | // or equal to the given value. 
295 | func (t *TDigest) CDF(value float64) float64 { 296 | if t.summary.Len() == 0 { 297 | return math.NaN() 298 | } else if t.summary.Len() == 1 { 299 | if value < t.summary.Mean(0) { 300 | return 0 301 | } 302 | return 1 303 | } 304 | 305 | // We have at least 2 centroids 306 | left := (t.summary.Mean(1) - t.summary.Mean(0)) / 2 307 | right := left 308 | tot := 0.0 309 | 310 | for i := 1; i < t.summary.Len()-1; i++ { 311 | prevMean := t.summary.Mean(i - 1) 312 | if value < prevMean+right { 313 | v := (tot + float64(t.summary.Count(i-1))*interpolate(value, prevMean-left, prevMean+right)) / float64(t.Count()) 314 | if v > 0 { 315 | return v 316 | } 317 | return 0 318 | } 319 | 320 | tot += float64(t.summary.Count(i - 1)) 321 | left = right 322 | right = (t.summary.Mean(i+1) - t.summary.Mean(i)) / 2 323 | } 324 | 325 | // last centroid, the summary length is at least two 326 | aIdx := t.summary.Len() - 2 327 | aMean := t.summary.Mean(aIdx) 328 | if value < aMean+right { 329 | aCount := float64(t.summary.Count(aIdx)) 330 | return (tot + aCount*interpolate(value, aMean-left, aMean+right)) / float64(t.Count()) 331 | } 332 | return 1 333 | } 334 | 335 | // Clone returns a deep copy of a TDigest. 336 | func (t *TDigest) Clone() *TDigest { 337 | return &TDigest{ 338 | summary: t.summary.Clone(), 339 | compression: t.compression, 340 | count: t.count, 341 | rng: t.rng, 342 | } 343 | } 344 | 345 | func interpolate(x, x0, x1 float64) float64 { 346 | return (x - x0) / (x1 - x0) 347 | } 348 | 349 | // ForEachCentroid calls the specified function for each centroid. 350 | // 351 | // Iteration stops when the supplied function returns false, or when all 352 | // centroids have been iterated. 
353 | func (t *TDigest) ForEachCentroid(f func(mean float64, count uint64) bool) { 354 | t.summary.ForEach(f) 355 | } 356 | 357 | func (t TDigest) findNeighbors(start int, value float64) (int, int) { 358 | minDistance := math.MaxFloat64 359 | lastNeighbor := t.summary.Len() 360 | for neighbor := start; neighbor < t.summary.Len(); neighbor++ { 361 | z := math.Abs(t.summary.Mean(neighbor) - value) 362 | if z < minDistance { 363 | start = neighbor 364 | minDistance = z 365 | } else if z > minDistance { 366 | lastNeighbor = neighbor 367 | break 368 | } 369 | } 370 | return start, lastNeighbor 371 | } 372 | 373 | func (t TDigest) chooseMergeCandidate(begin, end int, count uint64) int { 374 | closest := t.summary.Len() 375 | sum := t.summary.HeadSum(begin) 376 | var n float32 377 | 378 | for neighbor := begin; neighbor != end; neighbor++ { 379 | c := float64(t.summary.Count(neighbor)) 380 | var q float64 381 | if t.count == 1 { 382 | q = 0.5 383 | } else { 384 | q = (sum + (c-1)/2) / float64(t.count-1) 385 | } 386 | k := 4 * float64(t.count) * q * (1 - q) / t.compression 387 | 388 | if c+float64(count) <= k { 389 | n++ 390 | if t.rng.Float32() < 1/n { 391 | closest = neighbor 392 | } 393 | } 394 | sum += c 395 | } 396 | return closest 397 | } 398 | 399 | // TrimmedMean returns the mean of the distribution between the two 400 | // percentiles p1 and p2. 401 | // 402 | // Values of p1 and p2 must be beetween 0 and 1 (inclusive) and p1 403 | // must be less than p2. Will panic otherwise. 
404 | func (t *TDigest) TrimmedMean(p1, p2 float64) float64 { 405 | if p1 < 0 || p1 > 1 { 406 | panic("p1 must be between 0 and 1 (inclusive)") 407 | } 408 | if p2 < 0 || p2 > 1 { 409 | panic("p2 must be between 0 and 1 (inclusive)") 410 | } 411 | if p1 >= p2 { 412 | panic("p1 must be lower than p2") 413 | } 414 | 415 | minCount := p1 * float64(t.count) 416 | maxCount := p2 * float64(t.count) 417 | 418 | var trimmedSum, trimmedCount, currCount float64 419 | for i, mean := range t.summary.means { 420 | count := float64(t.summary.counts[i]) 421 | 422 | nextCount := currCount + count 423 | if nextCount <= minCount { 424 | currCount = nextCount 425 | continue 426 | } 427 | 428 | if currCount < minCount { 429 | count = nextCount - minCount 430 | } 431 | if nextCount > maxCount { 432 | count -= nextCount - maxCount 433 | } 434 | 435 | trimmedSum += count * mean 436 | trimmedCount += count 437 | 438 | if nextCount >= maxCount { 439 | break 440 | } 441 | currCount = nextCount 442 | } 443 | 444 | if trimmedCount == 0 { 445 | return 0 446 | } 447 | return trimmedSum / trimmedCount 448 | } 449 | 450 | func estimateCapacity(compression float64) int { 451 | return int(compression) * 10 452 | } 453 | -------------------------------------------------------------------------------- /tdigest_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "math/rand" 7 | "sort" 8 | "testing" 9 | 10 | rng "github.com/leesper/go_rng" 11 | "gonum.org/v1/gonum/stat" 12 | ) 13 | 14 | func uncheckedNew(options ...tdigestOption) *TDigest { 15 | t, _ := New(options...) 16 | return t 17 | } 18 | 19 | // Test of tdigest internals and accuracy. Note no t.Parallel(): 20 | // during tests the default random seed is consistent, but varying 21 | // concurrency scheduling mixes up the random values used in each test. 
22 | // Since there's a random number call inside tdigest this breaks repeatability 23 | // for all tests. So, no test concurrency here. 24 | 25 | func TestTInternals(t *testing.T) { 26 | tdigest := uncheckedNew() 27 | 28 | if !math.IsNaN(tdigest.Quantile(0.1)) { 29 | t.Errorf("Quantile() on an empty digest should return NaN. Got: %.4f", tdigest.Quantile(0.1)) 30 | } 31 | 32 | if !math.IsNaN(tdigest.CDF(1)) { 33 | t.Errorf("CDF() on an empty digest should return NaN. Got: %.4f", tdigest.CDF(1)) 34 | } 35 | 36 | _ = tdigest.Add(0.4) 37 | 38 | if tdigest.Quantile(0.1) != 0.4 { 39 | t.Errorf("Quantile() on a single-sample digest should return the samples's mean. Got %.4f", tdigest.Quantile(0.1)) 40 | } 41 | 42 | if tdigest.CDF(0.3) != 0 { 43 | t.Errorf("CDF(x) on digest with a single centroid should return 0 if x < mean") 44 | } 45 | 46 | if tdigest.CDF(0.5) != 1 { 47 | t.Errorf("CDF(x) on digest with a single centroid should return 1 if x >= mean") 48 | } 49 | 50 | _ = tdigest.Add(0.5) 51 | 52 | if tdigest.summary.Len() != 2 { 53 | t.Errorf("Expected size 2, got %d", tdigest.summary.Len()) 54 | } 55 | 56 | err := tdigest.AddWeighted(0, 0) 57 | 58 | if err == nil { 59 | t.Errorf("Expected AddWeighted() to error out with input (0,0)") 60 | } 61 | } 62 | 63 | func closeEnough(a float64, b float64) bool { 64 | const EPS = 0.000001 65 | if (a-b < EPS) && (b-a < EPS) { 66 | return true 67 | } 68 | return false 69 | } 70 | 71 | func assertDifferenceSmallerThan(tdigest *TDigest, p float64, m float64, t *testing.T) { 72 | tp := tdigest.Quantile(p) 73 | if math.Abs(tp-p) >= m { 74 | t.Errorf("T-Digest.Quantile(%.4f) = %.4f. 
Diff (%.4f) >= %.4f", p, tp, math.Abs(tp-p), m) 75 | } 76 | } 77 | 78 | func TestUniformDistribution(t *testing.T) { 79 | tdigest := uncheckedNew() 80 | 81 | for i := 0; i < 100000; i++ { 82 | _ = tdigest.Add(rand.Float64()) 83 | } 84 | 85 | assertDifferenceSmallerThan(tdigest, 0.5, 0.02, t) 86 | assertDifferenceSmallerThan(tdigest, 0.1, 0.01, t) 87 | assertDifferenceSmallerThan(tdigest, 0.9, 0.01, t) 88 | assertDifferenceSmallerThan(tdigest, 0.01, 0.005, t) 89 | assertDifferenceSmallerThan(tdigest, 0.99, 0.005, t) 90 | assertDifferenceSmallerThan(tdigest, 0.001, 0.001, t) 91 | assertDifferenceSmallerThan(tdigest, 0.999, 0.001, t) 92 | } 93 | 94 | // Asserts quantile p is no greater than absolute m off from "true" 95 | // fractional quantile for supplied data. So m must be scaled 96 | // appropriately for source data range. 97 | func assertDifferenceFromQuantile(data []float64, tdigest *TDigest, p float64, m float64, t *testing.T) { 98 | q := quantile(p, data) 99 | tp := tdigest.Quantile(p) 100 | 101 | if math.Abs(tp-q) >= m { 102 | t.Fatalf("T-Digest.Quantile(%.4f) = %.4f vs actual %.4f. 
Diff (%.4f) >= %.4f", p, tp, q, math.Abs(tp-q), m)
	}
}

// TestSequentialInsertion adds monotonically increasing values one at a
// time and checks that the digest's quantile estimates stay within an
// error budget that grows linearly with the number of samples seen.
func TestSequentialInsertion(t *testing.T) {
	tdigest := uncheckedNew()

	data := make([]float64, 10000)
	for i := 0; i < len(data); i++ {
		data[i] = float64(i)
	}

	for i := 0; i < len(data); i++ {
		_ = tdigest.Add(data[i])

		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.001, 1.0+0.001*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.01, 1.0+0.005*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.05, 1.0+0.01*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.25, 1.0+0.03*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.5, 1.0+0.03*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.75, 1.0+0.03*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.95, 1.0+0.01*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.99, 1.0+0.005*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.999, 1.0+0.001*float64(i), t)
	}
}

// TestNonSequentialInsertion adds values in a scrambled (but deterministic)
// order and verifies quantile estimates against the sorted prefix.
func TestNonSequentialInsertion(t *testing.T) {
	tdigest := uncheckedNew()

	// Not quite a uniform distribution, but close.
	data := make([]float64, 1000)
	for i := 0; i < len(data); i++ {
		tmp := (i * 1627) % len(data)
		data[i] = float64(tmp)
	}

	sorted := make([]float64, 0, len(data))

	for i := 0; i < len(data); i++ {
		_ = tdigest.Add(data[i])
		sorted = append(sorted, data[i])

		// Estimated quantiles are all over the place for low counts, which is
		// OK given that something like P99 is not very meaningful when there are
		// 25 samples. To account for this, increase the error tolerance for
		// smaller counts.
		if i == 0 {
			continue
		}

		max := float64(len(data))
		fac := 1.0 + max/float64(i)

		sort.Float64s(sorted)
		assertDifferenceFromQuantile(sorted, tdigest, 0.001, fac+0.001*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.01, fac+0.005*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.05, fac+0.01*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.25, fac+0.01*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.5, fac+0.02*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.75, fac+0.01*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.95, fac+0.01*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.99, fac+0.005*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.999, fac+0.001*max, t)
	}
}

// TestSingletonInACrowd checks that a single outlier among many identical
// samples does not distort the low/mid quantiles, and that Quantile(1)
// still returns the outlier.
func TestSingletonInACrowd(t *testing.T) {
	tdigest := uncheckedNew()
	for i := 0; i < 10000; i++ {
		_ = tdigest.Add(10)
	}
	_ = tdigest.Add(20)
	_ = tdigest.Compress()

	for _, q := range []float64{0, 0.5, 0.8, 0.9, 0.99, 0.999} {
		if q == 0.999 {
			// Test for 0.999 disabled since it doesn't
			// pass in the reference implementation
			continue
		}
		result := tdigest.Quantile(q)
		if !closeEnough(result, 10) {
			t.Errorf("Expected Quantile(%.3f) = 10, but got %.4f (size=%d)", q, result, tdigest.summary.Len())
		}
	}

	result := tdigest.Quantile(1)
	if result != 20 {
		t.Errorf("Expected Quantile(1) = 20, but got %.4f (size=%d)", result, tdigest.summary.Len())
	}
}

// TestRespectBounds verifies that quantile estimates never fall outside
// the [min, max] range of the inserted data.
func TestRespectBounds(t *testing.T) {
	tdigest := uncheckedNew(Compression(10))

	data := []float64{0, 279, 2, 281}
	for _, f := range data {
		_ = tdigest.Add(f)
	}

	quantiles := []float64{0.01, 0.25, 0.5, 0.75, 0.999}
	for _, q := range quantiles {
		result := tdigest.Quantile(q)
		if result < 0 {
			t.Errorf("q(%.3f) = %.4f < 0", q, result)
		}
		// Reuse the already-computed result here; the upper-bound check
		// previously recomputed Quantile(q) while reporting `result`.
		if result > 281 {
			t.Errorf("q(%.3f) = %.4f > 281", q, result)
		}
	}
}

// TestWeights checks AddWeighted by mirroring each weighted insertion with
// the equivalent repeated samples and comparing quantile estimates.
func TestWeights(t *testing.T) {
	tdigest := uncheckedNew(Compression(10))

	// Create data slice with repeats matching weights we gave to tdigest
	data := []float64{}
	for i := 0; i < 100; i++ {
		_ = tdigest.AddWeighted(float64(i), uint64(i))

		for j := 0; j < i; j++ {
			data = append(data, float64(i))
		}
	}

	assertDifferenceFromQuantile(data, tdigest, 0.001, 1.0+0.001*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.01, 1.0+0.005*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.05, 1.0+0.01*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.25, 1.0+0.01*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.5, 1.0+0.02*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.75, 1.0+0.01*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.95, 1.0+0.01*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.99, 1.0+0.005*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.999, 1.0+0.001*100.0, t)
}

// TestIntegers verifies exact medians for small integer inputs and that
// ForEachCentroid sees the complete sample count.
func TestIntegers(t *testing.T) {
	tdigest := uncheckedNew()

	_ = tdigest.Add(1)
	_ = tdigest.Add(2)
	_ = tdigest.Add(3)

	if tdigest.Quantile(0.5) != 2 {
		t.Errorf("Expected p(0.5) = 2, Got %.2f instead", tdigest.Quantile(0.5))
	}

	tdigest = uncheckedNew()

	for _, i := range []float64{1, 2, 2, 2, 2, 2, 2, 2, 3} {
		_ = tdigest.Add(i)
	}

	if tdigest.Quantile(0.5) != 2 {
		t.Errorf("Expected p(0.5) = 2, Got %.2f instead", tdigest.Quantile(0.5))
	}

	var tot uint64
	tdigest.ForEachCentroid(func(mean float64, count uint64) bool {
		tot += count
		return true
	})

	if tot != 9 {
		t.Errorf("Expected the centroid count to be 9, Got %d instead", tot)
	}
}

// cdf returns the empirical CDF of x over data using the midpoint
// convention: ties at x contribute half their weight.
func cdf(x float64, data []float64) float64 {
	var n1, n2 int
	for i := 0; i < len(data); i++ {
		if data[i] < x {
			n1++
		}
		if data[i] <= x {
			n2++
		}
	}
	return float64(n1+n2) / 2.0 / float64(len(data))
}

// quantile returns the linearly-interpolated empirical quantile q of the
// (already sorted) data slice. Returns NaN for empty input.
func quantile(q float64, data []float64) float64 {
	if len(data) == 0 {
		return math.NaN()
	}

	if q == 1 || len(data) == 1 {
		return data[len(data)-1]
	}

	index := q * (float64(len(data)) - 1)
	return data[int(index)+1]*(index-float64(int(index))) + data[int(index)]*(float64(int(index)+1)-index)
}

func TestMergeNormal(t *testing.T) {
	testMerge(t, false)
}

// NOTE(review): renamed from TestMergeDescructive (typo).
func TestMergeDestructive(t *testing.T) {
	testMerge(t, true)
}

// testMerge splits a random dataset across several digests, merges them
// (destructively or not) into one, and compares quantile/CDF accuracy of
// the merged digest against a digest built from the full stream.
func testMerge(t *testing.T, destructive bool) {
	if testing.Short() {
		t.Skipf("Skipping merge test. Short flag is on")
	}

	const numItems = 100000

	for _, numSubs := range []int{2, 5, 10, 20, 50, 100} {
		data := make([]float64, numItems)

		subs := make([]*TDigest, numSubs)
		for i := 0; i < numSubs; i++ {
			subs[i] = uncheckedNew()
		}

		dist := uncheckedNew()
		for i := 0; i < numItems; i++ {
			num := rand.Float64()

			data[i] = num
			_ = dist.Add(num)
			_ = subs[i%numSubs].Add(num)
		}

		_ = dist.Compress()

		dist2 := uncheckedNew()
		for i := 0; i < numSubs; i++ {
			if destructive {
				_ = dist2.MergeDestructive(subs[i])
			} else {
				_ = dist2.Merge(subs[i])
			}

		}

		if dist.Count() != dist2.Count() {
			t.Errorf("Expected the number of centroids to be the same. %d != %d", dist.Count(), dist2.Count())
		}

		if dist2.Count() != numItems {
			t.Errorf("Items shouldn't have disappeared. %d != %d", dist2.Count(), numItems)
		}

		sort.Float64s(data)

		for _, q := range []float64{0.001, 0.01, 0.1, 0.2, 0.3, 0.5} {
			z := quantile(q, data)
			p1 := dist.Quantile(q)
			p2 := dist2.Quantile(q)

			e1 := p1 - z
			e2 := p2 - z

			if math.Abs(e2)/q >= 0.3 {
				t.Errorf("rel >= 0.3: parts=%3d q=%.3f e1=%.4f e2=%.4f rel=%.3f real=%.3f",
					numSubs, q, e1, e2, math.Abs(e2)/q, z-q)
			}
			if math.Abs(e2) >= 0.015 {
				t.Errorf("e2 >= 0.015: parts=%3d q=%.3f e1=%.4f e2=%.4f rel=%.3f real=%.3f",
					numSubs, q, e1, e2, math.Abs(e2)/q, z-q)
			}

			z = cdf(q, data)
			e1 = dist.CDF(q) - z
			e2 = dist2.CDF(q) - z

			// Messages fixed: they previously both read "CDF e2 < 0.015"
			// (the passing condition, and the wrong condition entirely for
			// the relative-error branch).
			if math.Abs(e2)/q > 0.3 {
				t.Errorf("CDF rel >= 0.3: parts=%3d q=%.3f e1=%.4f e2=%.4f rel=%.3f",
					numSubs, q, e1, e2, math.Abs(e2)/q)
			}

			if math.Abs(e2) >= 0.015 {
				t.Errorf("CDF e2 >= 0.015: parts=%3d q=%.3f e1=%.4f e2=%.4f rel=%.3f",
					numSubs, q, e1, e2, math.Abs(e2)/q)
			}
		}
	}
}

// TestCompressDoesntChangeCount checks that Compress is count-preserving.
func TestCompressDoesntChangeCount(t *testing.T) {
	tdigest := uncheckedNew()

	for i := 0; i < 1000; i++ {
		_ = tdigest.Add(rand.Float64())
	}

	initialCount := tdigest.Count()

	err := tdigest.Compress()
	if err != nil {
		t.Errorf("Compress() triggered an unexpected error: %s", err)
	}

	if tdigest.Count() != initialCount {
		t.Errorf("Compress() should not change count. Wanted %d, got %d", initialCount, tdigest.Count())
	}
}

// TestGammaDistribution exercises the digest on a heavily skewed Gamma(0.1,
// 0.1) distribution, checking both CDF and Quantile accuracy.
func TestGammaDistribution(t *testing.T) {
	const numItems = 100000

	digest := uncheckedNew()
	gammaRNG := rng.NewGammaGenerator(0xDEADBEE)

	data := make([]float64, numItems)
	for i := 0; i < numItems; i++ {
		data[i] = gammaRNG.Gamma(0.1, 0.1)
		_ = digest.Add(data[i])
	}

	sort.Float64s(data)

	softErrors := 0
	for _, q := range []float64{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999} {

		ix := float64(len(data))*q - 0.5
		index := int(math.Floor(ix))
		p := ix - float64(index)
		realQuantile := data[index]*(1-p) + data[index+1]*p

		// estimated cdf of real quantile(x)
		if math.Abs(digest.CDF(realQuantile)-q) > 0.005 {
			t.Errorf("Error in estimated CDF too high")
		}

		// real cdf of estimated quantile(x).
		// Renamed from `error`, which shadowed the predeclared identifier.
		qErr := math.Abs(q - cdf(digest.Quantile(q), data))
		if qErr > 0.005 {
			softErrors++
		}

		if qErr > 0.012 {
			t.Errorf("Error in estimated Quantile too high")
		}
	}

	if softErrors >= 3 {
		t.Errorf("Too many soft errors")
	}

	// Issue #17, verify that we are hitting the extreme CDF case
	// XXX Maybe test this properly instead of having a hardcoded value
	extreme := digest.CDF(0.71875)
	if !closeEnough(extreme, 1) {
		t.Errorf("Expected something close to 1 but got %.4f instead", extreme)
	}
}

// shouldPanic fails the test with message if f does not panic.
func shouldPanic(f func(), t *testing.T, message string) {
	defer func() {
		tryRecover := recover()
		if tryRecover == nil {
			t.Error(message)
		}
	}()
	f()
}

// TestPanic verifies that Quantile panics on out-of-range inputs.
func TestPanic(t *testing.T) {
	tdigest := uncheckedNew()

	shouldPanic(func() {
		tdigest.Quantile(-42)
	}, t, "Quantile < 0 should panic!")

	shouldPanic(func() {
		tdigest.Quantile(42)
	}, t, "Quantile > 1 should panic!")
}

// TestForEachCentroid checks both early-exit (callback returns false) and
// full iteration over all centroids.
func TestForEachCentroid(t *testing.T) {
	tdigest := uncheckedNew(Compression(10))

	for i := 0; i < 100; i++ {
		_ = tdigest.Add(float64(i))
	}

	// Iterate limited number.
	means := []float64{}
	tdigest.ForEachCentroid(func(mean float64, count uint64) bool {
		means = append(means, mean)
		return len(means) != 3
	})
	if len(means) != 3 {
		t.Errorf("ForEachCentroid handled incorrect number of data items")
	}

	// Iterate all datapoints.
	means = []float64{}
	tdigest.ForEachCentroid(func(mean float64, count uint64) bool {
		means = append(means, mean)
		return true
	})
	if len(means) != tdigest.summary.Len() {
		t.Errorf("ForEachCentroid did not handle all data")
	}
}

// TestQuantilesDontOverflow inserts slightly more than math.MaxUint32
// samples to guard against a uint32-based count overflowing.
func TestQuantilesDontOverflow(t *testing.T) {
	tdigest := uncheckedNew(Compression(100))
	// Add slightly more than math.MaxUint32 samples uniformly in the range
	// [0, 1). This would overflow a uint32-based implementation.
	_ = tdigest.Add(1)
	for i := 0; i < 1024; i++ {
		_ = tdigest.AddWeighted(float64(i)/1024, 4194304)
	}
	assertDifferenceSmallerThan(tdigest, 0.5, .02, t)
}

// TestCDFInsideLastCentroid is a regression test: CDF of a value falling
// inside the last centroid must not exceed 1.
func TestCDFInsideLastCentroid(t *testing.T) {
	// values pulled from a live digest. sorry it's a lot!
	td := &TDigest{
		summary: &summary{
			means:  []float64{2120.75048828125, 2260.3844299316406, 3900.490264892578, 3937.495807647705, 5390.479816436768, 10450.335285186768, 14152.897296905518, 16442.676349639893, 24303.143146514893, 56961.87361526489, 63891.24959182739, 73982.55232620239, 86477.50447463989, 110746.62556838989, 175479.7388496399, 300492.3404121399, 440452.5279121399, 515611.7700996399, 535827.0025215149, 546241.6822090149, 556965.3648262024, 569791.2124824524, 587320.6870918274, 603969.4175605774, 613751.6177558899, 624708.7593574524, 635060.0718574524, 641924.2007637024, 650656.4302558899, 660653.1714668274, 671380.9009590149, 687094.3667793274, 716595.8824043274, 740870.9800605774, 760276.2437324524, 768857.5786933899, 775021.0025215149, 787686.0337715149, 801473.4624824524, 815225.1255683899, 832358.6997871399, 852438.4751777649, 866134.2935371399, 1.10661549666214e+06, 1.1212118980293274e+06, 1.2230108433418274e+06, 1.5446490620918274e+06, 4.306712312091827e+06, 5.487582562091827e+06, 6.306383562091827e+06, 7.089308312091827e+06, 7.520797593341827e+06},
			counts: []uint64{0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x4, 0x5, 0x6, 0x3, 0x3, 0x4, 0x11, 0x23, 0x2f, 0x1e, 0x1b, 0x36, 0x31, 0x33, 0x4e, 0x5f, 0x61, 0x48, 0x2e, 0x26, 0x28, 0x2a, 0x31, 0x39, 0x51, 0x32, 0x2b, 0x12, 0x8, 0xb, 0xa, 0x11, 0xa, 0x11, 0x9, 0x7, 0x1, 0x1, 0x1, 0x3, 0x2, 0x1, 0x1, 0x1, 0x1},
		},
		compression: 5,
		count:       1250,
		rng:         globalRNG{},
	}

	if cdf := td.CDF(7.144560976650238e+06); cdf > 1 {
		t.Fatalf("invalid: %v", cdf)
	}
}

// TestTrimmedMean compares TrimmedMean against a brute-force trimmed mean
// over the raw samples, for several sizes and percentile windows.
func TestTrimmedMean(t *testing.T) {
	tests := []struct {
		p1, p2 float64
	}{
		{0, 1},
		{0.1, 0.9},
		{0.2, 0.8},
		{0.25, 0.75},
		{0, 0.5},
		{0.5, 1},
		{0.1, 0.7},
		{0.3, 0.9},
	}

	for _, size := range []int{100, 1000, 10000} {
		for _, test := range tests {
			td := uncheckedNew(Compression(100))

			data := make([]float64, 0, size)
			for i := 0; i < size; i++ {
				f := rand.Float64()
				data = append(data, f)
				err := td.Add(f)
				if err != nil {
					t.Fatal(err)
				}
			}

			got := td.TrimmedMean(test.p1, test.p2)
			wanted := trimmedMean(data, test.p1, test.p2)
			if math.Abs(got-wanted) > 0.01 {
				t.Fatalf("got %f, wanted %f (size=%d p1=%f p2=%f)",
					got, wanted, size, test.p1, test.p2)
			}

			for i := 0; i < 10; i++ {
				err := td.Add(float64(i * 100))
				if err != nil {
					t.Fatal(err)
				}
			}
			mean := td.TrimmedMean(0.1, 0.999)
			if mean < 0 {
				t.Fatalf("mean < 0")
			}
		}
	}
}

// TestTrimmedMeanCornerCases covers the empty, single-sample and
// two-sample digests.
func TestTrimmedMeanCornerCases(t *testing.T) {
	td := uncheckedNew(Compression(100))

	mean := td.TrimmedMean(0, 1)
	if mean != 0 {
		t.Fatalf("got %f, wanted 0", mean)
	}

	x := 1.0
	err := td.Add(x)
	if err != nil {
		t.Fatal(err)
	}

	mean = td.TrimmedMean(0, 1)
	if mean != 1 {
		t.Fatalf("got %f, wanted %f", mean, x)
	}

	err = td.Add(1000)
	if err != nil {
		t.Fatal(err)
	}

	mean = td.TrimmedMean(0, 1)
	wanted := 500.5
	if !closeEnough(mean, wanted) {
		t.Fatalf("got %f, wanted %f", mean, wanted)
	}
}

// trimmedMean is the brute-force reference: the mean of all samples whose
// values fall between the empirical p1 and p2 quantiles (inclusive).
// Note: it sorts ff in place.
func trimmedMean(ff []float64, p1, p2 float64) float64 {
	sort.Float64s(ff)
	x1 := stat.Quantile(p1, stat.Empirical, ff, nil)
	x2 := stat.Quantile(p2, stat.Empirical, ff, nil)

	var sum float64
	var count int
	for _, f := range ff {
		if f >= x1 && f <= x2 {
			sum += f
			count++
		}
	}
	return sum / float64(count)
}

// TestClone checks that a clone matches the source digest, is isolated
// from later mutations of the source, and remains usable.
func TestClone(t *testing.T) {
	seed := func(td *TDigest) {
		for i := 0; i < 100; i++ {
			err := td.Add(rand.Float64())
			if err != nil {
				t.Fatal(err)
			}
		}
	}

	td := uncheckedNew(Compression(42))
	seed(td)
	clone := td.Clone()

	// Clone behaves like td.

	if clone.Compression() != td.Compression() {
		t.Fatalf("got %f, wanted %f", clone.Compression(), td.Compression())
	}

	cloneCount := clone.Count()
	if cloneCount != td.Count() {
		t.Fatalf("got %d, wanted %d", cloneCount, td.Count())
	}

	cloneQuantile := clone.Quantile(1)
	if cloneQuantile != td.Quantile(1) {
		t.Fatalf("got %f, wanted %f", cloneQuantile, td.Quantile(1))
	}

	seed(td)
	if td.Count() == clone.Count() {
		t.Fatal("seed does not work")
	}

	// Clone is not changed after td is changed.

	if clone.Count() != cloneCount {
		t.Fatalf("got %d, wanted %d", clone.Count(), cloneCount)
	}

	if clone.Quantile(1) != cloneQuantile {
		t.Fatalf("got %f, wanted %f", clone.Quantile(1), cloneQuantile)
	}

	// Clone is fully functional.

	err := clone.Add(1)
	if err != nil {
		t.Fatal(err)
	}
}

var compressions = []float64{1, 10, 20, 30, 50, 100}

func BenchmarkTDigestAddOnce(b *testing.B) {
	for _, compression := range compressions {
		compression := compression
		b.Run(fmt.Sprintf("compression=%.0f", compression), func(b *testing.B) {
			benchmarkAddOnce(b, compression)
		})
	}
}

// benchmarkAddOnce times Add on a single long-lived digest; the random
// inputs are generated outside the timed region.
func benchmarkAddOnce(b *testing.B, compression float64) {
	t := uncheckedNew(Compression(compression))

	data := make([]float64, b.N)
	for n := 0; n < b.N; n++ {
		data[n] = rand.Float64()
	}

	b.ReportAllocs()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		err := t.Add(data[n])
		if err != nil {
			b.Error(err)
		}
	}
	b.StopTimer()
}

func BenchmarkTDigestAddMulti(b *testing.B) {
	for _, compression := range compressions {
		compression := compression
		for _, n := range []int{10, 100, 1000, 10000} {
			n := n
			name := fmt.Sprintf("compression=%.0f n=%d", compression, n)
			b.Run(name, func(b *testing.B) {
				benchmarkAddMulti(b, compression, n)
			})
		}
	}
}

// benchmarkAddMulti times building a fresh digest of `times` samples per
// iteration, including digest construction cost.
func benchmarkAddMulti(b *testing.B, compression float64, times int) {
	data := make([]float64, times)
	for i := 0; i < times; i++ {
		data[i] = rand.Float64()
	}

	b.ReportAllocs()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		t := uncheckedNew(Compression(compression))
		for i := 0; i < times; i++ {
			err := t.AddWeighted(data[i], 1)
			if err != nil {
				b.Error(err)
			}
		}
	}
	b.StopTimer()
}

func BenchmarkTDigestMerge(b *testing.B) {
	for _, compression := range compressions {
		compression := compression
		for _, n := range []int{1, 10, 100} {
			name := fmt.Sprintf("compression=%.0f n=%d", compression, n)
			b.Run(name, func(b *testing.B) {
				benchmarkMerge(b, compression, n)
			})
		}
	}
}

// benchmarkMerge times merging `times` pre-built digests into a fresh
// destination and compressing the result.
func benchmarkMerge(b *testing.B, compression float64, times int) {
	ts := make([]*TDigest, times)
	for i := 0; i < times; i++ {
		ts[i] = randomTDigest(compression)
	}

	b.ReportAllocs()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		dst := uncheckedNew(Compression(compression))

		for i := 0; i < times; i++ {
			err := dst.Merge(ts[i])
			if err != nil {
				b.Fatal(err)
			}
		}

		err := dst.Compress()
		if err != nil {
			b.Fatal(err)
		}
	}
}

// randomTDigest builds a digest filled with 20*compression uniform samples.
func randomTDigest(compression float64) *TDigest {
	t := uncheckedNew(Compression(compression))
	n := 20 * int(compression)
	for i := 0; i < n; i++ {
		err := t.Add(rand.Float64())
		if err != nil {
			panic(err)
		}
	}
	return t
}

// Pathological ordered-input case.
func BenchmarkAddOrdered(b *testing.B) {
	t, _ := New(Compression(100))

	for n := 0; n < b.N; n++ {
		err := t.Add(float64(n))
		if err != nil {
			b.Error(err)
		}
	}
}

func BenchmarkMerge(b *testing.B) {
	b.ReportAllocs()

	t, _ := New(Compression(100))
	for n := 0; n < 1000; n++ {
		_ = t.AddWeighted(rand.Float64(), uint64(rand.Intn(100)))
	}

	dest, _ := New(Compression(100))

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = dest.Merge(t)
	}
}

func BenchmarkMergeDestructive(b *testing.B) {
	b.ReportAllocs()

	t, _ := New(Compression(100))
	for n := 0; n < 1000; n++ {
		_ = t.AddWeighted(rand.Float64(), uint64(rand.Intn(100)))
	}

	dest, _ := New(Compression(100))

	b.ResetTimer()

	// After the first iteration, t's summary is scrambled, which means it's
	// mostly useless, but we can still merge it.
	for n := 0; n < b.N; n++ {
		_ = dest.MergeDestructive(t)
	}
}