├── .config └── caca.ini ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── go.mod ├── go.sum ├── options.go ├── options_test.go ├── rng.go ├── serialization.go ├── serialization_test.go ├── summary.go ├── summary_test.go ├── tdigest.go └── tdigest_test.go /.config/caca.ini: -------------------------------------------------------------------------------- 1 | [meta] 2 | state = pinned 3 | 4 | [link "Issues"] 5 | href = https://github.com/caio/go-tdigest/issues 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | go-tdigest.test 3 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | First and foremost: **thank you very much** for your interest in this 4 | project. Feel free to skip all this and open your issue / pull request 5 | if reading contribution guidelines is too much for you at this point. 6 | We value your contribution a lot more than we value your ability to 7 | follow rules (and thankfully we can afford to take this approach given 8 | this project's demand). 9 | 10 | Any kind of contribution is welcome. We can always use better docs and 11 | tests (and code, of course). If you think you can improve this project 12 | in any dimension _let's talk_ :-) 13 | 14 | ## Guidelines 15 | 16 | Be kind and respectful in all your interactions with people inside 17 | (outside too!) this community; There is no excuse for not showing 18 | basic decency. Sarcasm and generally unconstructive remarks are **not 19 | welcome**. 
20 | 21 | ### Issues 22 | 23 | When opening and interacting with issues please: 24 | 25 | - Be as clear as possible 26 | - Provide examples if you can 27 | 28 | ### Pull Requests 29 | 30 | We expect that pull requests: 31 | 32 | - Have [good commit messages][commits] 33 | - Contain tests for new features 34 | - Target and can be cleanly merged with the `master` branch 35 | - Pass the tests 36 | 37 | [commits]: https://www.git-scm.com/book/en/v2/Distributed-Git-Contributing-to-a-Project#_commit_guidelines 38 | 39 | ### Project Management 40 | 41 | Don't bother with labels, milestones, assignments, etc. We don't make 42 | use of those. 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Caio Romão Costa Nascimento 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # T-Digest 2 | 3 | A fast map-reduce and parallel streaming friendly data-structure for accurate 4 | quantile approximation. 5 | 6 | This package provides an implementation of Ted Dunning's t-digest data 7 | structure in Go. 8 | 9 | [![GoDoc](https://godoc.org/github.com/caio/go-tdigest?status.svg)](http://godoc.org/github.com/caio/go-tdigest) 10 | [![Go Report Card](https://goreportcard.com/badge/github.com/caio/go-tdigest)](https://goreportcard.com/report/github.com/caio/go-tdigest) 11 | 12 | ## Project Status 13 | 14 | This project is actively maintained. We are happy to collaborate on features 15 | and issues if/when they arrive. 16 | 17 | ## Installation 18 | 19 | This package uses go modules. Our releases are tagged and signed following 20 | the [Semantic Versioning][semver] scheme. 
21 | 22 | go get github.com/caio/go-tdigest/v4 23 | 24 | 25 | [semver]: http://semver.org/ 26 | 27 | ## Example Usage 28 | 29 | ```go 30 | package main 31 | 32 | import ( 33 | "fmt" 34 | "math/rand" 35 | 36 | "github.com/caio/go-tdigest/v4" 37 | ) 38 | 39 | func main() { 40 | // Analogue to tdigest.New(tdigest.Compression(100)) 41 | t, _ := tdigest.New() 42 | 43 | for i := 0; i < 10000; i++ { 44 | // Analogue to t.AddWeighted(rand.Float64(), 1) 45 | t.Add(rand.Float64()) 46 | } 47 | 48 | fmt.Printf("p(.5) = %.6f\n", t.Quantile(0.5)) 49 | fmt.Printf("CDF(Quantile(.5)) = %.6f\n", t.CDF(t.Quantile(0.5))) 50 | } 51 | ``` 52 | 53 | ## Configuration 54 | 55 | You can configure your digest upon creation with options documented 56 | at [options.go](options.go). Example: 57 | 58 | ```go 59 | // Construct a digest with compression=200 and its own 60 | // (thread-unsafe) RNG seeded with 0xCA10: 61 | digest, _ := tdigest.New( 62 | tdigest.Compression(200), 63 | tdigest.LocalRandomNumberGenerator(0xCA10), 64 | ) 65 | ``` 66 | 67 | ## References 68 | 69 | This is a port of the [reference][1] implementation with some ideas borrowed 70 | from the [python version][2]. If you wanna get a quick grasp of how it works 71 | and why it's useful, [this video and companion article is pretty helpful][3]. 
72 | 73 | [1]: https://github.com/tdunning/t-digest 74 | [2]: https://github.com/CamDavidsonPilon/tdigest 75 | [3]: https://www.mapr.com/blog/better-anomaly-detection-t-digest-whiteboard-walkthrough 76 | 77 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/caio/go-tdigest/v4 2 | 3 | go 1.18 4 | 5 | require ( 6 | github.com/leesper/go_rng v0.0.0-20190531154944-a612b043e353 7 | gonum.org/v1/gonum v0.11.0 8 | ) 9 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/leesper/go_rng v0.0.0-20190531154944-a612b043e353 h1:X/79QL0b4YJVO5+OsPH9rF2u428CIrGL/jLmPsoOQQ4= 2 | github.com/leesper/go_rng v0.0.0-20190531154944-a612b043e353/go.mod h1:N0SVk0uhy+E1PZ3C9ctsPRlvOPAFPkCNlcPBDkt0N3U= 3 | golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3 h1:n9HxLrNxWWtEb1cA950nuEEj3QnKbtsCJ6KjcgisNUs= 4 | gonum.org/v1/gonum v0.11.0 h1:f1IJhK4Km5tBJmaiJXtk/PkL4cdVX6J+tGiM187uT5E= 5 | gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA= 6 | -------------------------------------------------------------------------------- /options.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import "errors" 4 | 5 | type tdigestOption func(*TDigest) error 6 | 7 | // Compression sets the digest compression 8 | // 9 | // The compression parameter rules the threshold in which samples are 10 | // merged together - the more often distinct samples are merged the more 11 | // precision is lost. Compression should be tuned according to your data 12 | // distribution, but a value of 100 (the default) is often good enough. 
13 | // 14 | // A higher compression value means holding more centroids in memory 15 | // (thus: better precision), which means a bigger serialization payload, 16 | // higher memory footprint and slower addition of new samples. 17 | // 18 | // Compression must be a value greater of equal to 1, will yield an 19 | // error otherwise. 20 | func Compression(compression float64) tdigestOption { // nolint 21 | return func(t *TDigest) error { 22 | if compression < 1 { 23 | return errors.New("Compression should be >= 1") 24 | } 25 | t.compression = compression 26 | return nil 27 | } 28 | } 29 | 30 | // RandomNumberGenerator sets the RNG to be used internally 31 | // 32 | // This allows changing which random number source is used when using 33 | // the TDigest structure (rngs are used when deciding which candidate 34 | // centroid to merge with and when compressing or merging with 35 | // another digest for it increases accuracy). This functionality is 36 | // particularly useful for testing or when you want to disconnect 37 | // your sample collection from the (default) shared random source 38 | // to minimize lock contention. 39 | func RandomNumberGenerator(rng RNG) tdigestOption { // nolint 40 | return func(t *TDigest) error { 41 | t.rng = rng 42 | return nil 43 | } 44 | } 45 | 46 | // LocalRandomNumberGenerator makes the TDigest use the default 47 | // `math/random` functions but with an unshared source that is 48 | // seeded with the given `seed` parameter. 49 | func LocalRandomNumberGenerator(seed int64) tdigestOption { // nolint 50 | return RandomNumberGenerator(newLocalRNG(seed)) 51 | } 52 | -------------------------------------------------------------------------------- /options_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import "testing" 4 | 5 | func TestDefaults(t *testing.T) { 6 | digest, err := New() 7 | 8 | if err != nil { 9 | t.Errorf("Creating a default TDigest should never error out. 
Got %s", err) 10 | } 11 | 12 | if digest.compression != 100 { 13 | t.Errorf("The default compression should be 100") 14 | } 15 | } 16 | 17 | func TestCompression(t *testing.T) { 18 | digest, _ := New(Compression(40)) 19 | if digest.compression != 40 { 20 | t.Errorf("The compression option should change the new digest compression") 21 | } 22 | 23 | digest, err := New(Compression(0)) 24 | if err == nil || digest != nil { 25 | t.Errorf("Trying to create a digest with bad compression should give an error") 26 | } 27 | } 28 | 29 | func TestRandomNumberGenerator(t *testing.T) { 30 | const numTests = 100 31 | 32 | // Create two digests with unshared rngs seeded with 33 | // the same seed 34 | t1, _ := New(RandomNumberGenerator(newLocalRNG(0xDEADBEE))) 35 | t2, _ := New(LocalRandomNumberGenerator(0xDEADBEE)) 36 | 37 | // So that they should emit the same values when called 38 | // at the same frequency 39 | for i := 0; i < numTests; i++ { 40 | if t1.rng.Float32() != t2.rng.Float32() || 41 | t1.rng.Intn(10) != t2.rng.Intn(10) { 42 | t.Errorf("r1 and r2 should be distinct RNGs returning the same values") 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /rng.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "math/rand" 5 | ) 6 | 7 | // RNG is an interface that wraps the needed random number 8 | // generator calls that tdigest uses during its runtime 9 | type RNG interface { 10 | Float32() float32 11 | Intn(int) int 12 | } 13 | 14 | type globalRNG struct{} 15 | 16 | func (r globalRNG) Float32() float32 { 17 | return rand.Float32() 18 | } 19 | 20 | func (r globalRNG) Intn(i int) int { 21 | return rand.Intn(i) 22 | } 23 | 24 | type localRNG struct { 25 | localRand *rand.Rand 26 | } 27 | 28 | func newLocalRNG(seed int64) *localRNG { 29 | return &localRNG{ 30 | localRand: rand.New(rand.NewSource(seed)), 31 | } 32 | } 33 | 34 | func (r *localRNG) 
Float32() float32 { 35 | return r.localRand.Float32() 36 | } 37 | 38 | func (r *localRNG) Intn(i int) int { 39 | return r.localRand.Intn(i) 40 | } 41 | -------------------------------------------------------------------------------- /serialization.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "errors" 7 | "fmt" 8 | "math" 9 | ) 10 | 11 | const smallEncoding int32 = 2 12 | 13 | var endianess = binary.BigEndian 14 | 15 | // AsBytes serializes the digest into a byte array so it can be 16 | // saved to disk or sent over the wire. 17 | func (t TDigest) AsBytes() ([]byte, error) { 18 | // TODO get rid of the (now) useless error 19 | return t.ToBytes(make([]byte, t.requiredSize())), nil 20 | } 21 | 22 | func (t *TDigest) requiredSize() int { 23 | return 16 + (4 * len(t.summary.means)) + (len(t.summary.counts) * binary.MaxVarintLen64) 24 | } 25 | 26 | // ToBytes serializes into the supplied slice, avoiding allocation if the slice 27 | // is large enough. The result slice is returned. 28 | func (t *TDigest) ToBytes(b []byte) []byte { 29 | requiredSize := t.requiredSize() 30 | if cap(b) < requiredSize { 31 | b = make([]byte, requiredSize) 32 | } 33 | 34 | // The binary.Put* functions helpfully don't extend the slice for you, they 35 | // just panic if it's not already long enough. So pre-set the slice length; 36 | // we'll return it with the actual encoded length. 
37 | b = b[:cap(b)] 38 | 39 | endianess.PutUint32(b[0:4], uint32(smallEncoding)) 40 | endianess.PutUint64(b[4:12], math.Float64bits(t.compression)) 41 | endianess.PutUint32(b[12:16], uint32(t.summary.Len())) 42 | 43 | var x float64 44 | idx := 16 45 | for _, mean := range t.summary.means { 46 | delta := mean - x 47 | x = mean 48 | endianess.PutUint32(b[idx:], math.Float32bits(float32(delta))) 49 | idx += 4 50 | } 51 | 52 | for _, count := range t.summary.counts { 53 | idx += binary.PutUvarint(b[idx:], count) 54 | } 55 | return b[:idx] 56 | } 57 | 58 | // FromBytes reads a byte buffer with a serialized digest (from AsBytes) 59 | // and deserializes it. 60 | // 61 | // This function creates a new tdigest instance with the provided options, 62 | // but ignores the compression setting since the correct value comes 63 | // from the buffer. 64 | func FromBytes(buf *bytes.Reader, options ...tdigestOption) (*TDigest, error) { 65 | var encoding int32 66 | err := binary.Read(buf, endianess, &encoding) 67 | if err != nil { 68 | return nil, err 69 | } 70 | 71 | if encoding != smallEncoding { 72 | return nil, fmt.Errorf("unsupported encoding version: %d", encoding) 73 | } 74 | 75 | t, err := newWithoutSummary(options...) 
76 | 77 | if err != nil { 78 | return nil, err 79 | } 80 | 81 | var compression float64 82 | err = binary.Read(buf, endianess, &compression) 83 | if err != nil { 84 | return nil, err 85 | } 86 | 87 | t.compression = compression 88 | 89 | var numCentroids int32 90 | err = binary.Read(buf, endianess, &numCentroids) 91 | if err != nil { 92 | return nil, err 93 | } 94 | 95 | if numCentroids < 0 || numCentroids > 1<<22 { 96 | return nil, errors.New("bad number of centroids in serialization") 97 | } 98 | 99 | t.summary = newSummary(int(numCentroids)) 100 | t.summary.means = t.summary.means[:numCentroids] 101 | t.summary.counts = t.summary.counts[:numCentroids] 102 | 103 | var x float64 104 | for i := 0; i < int(numCentroids); i++ { 105 | var delta float32 106 | err = binary.Read(buf, endianess, &delta) 107 | if err != nil { 108 | return nil, err 109 | } 110 | x += float64(delta) 111 | t.summary.means[i] = x 112 | } 113 | 114 | for i := 0; i < int(numCentroids); i++ { 115 | count, err := decodeUint(buf) 116 | if err != nil { 117 | return nil, err 118 | } 119 | t.summary.counts[i] = count 120 | t.count += count 121 | } 122 | 123 | return t, nil 124 | } 125 | 126 | // FromBytes deserializes into the supplied TDigest struct, re-using 127 | // and overwriting any existing buffers. 128 | // 129 | // This method reinitializes the digest from the provided buffer 130 | // discarding any previously collected data. Notice that in case 131 | // of errors this may leave the digest in a unusable state. 
132 | func (t *TDigest) FromBytes(buf []byte) error { 133 | if len(buf) < 16 { 134 | return errors.New("buffer too small for deserialization") 135 | } 136 | 137 | encoding := int32(endianess.Uint32(buf)) 138 | if encoding != smallEncoding { 139 | return fmt.Errorf("unsupported encoding version: %d", encoding) 140 | } 141 | 142 | compression := math.Float64frombits(endianess.Uint64(buf[4:12])) 143 | numCentroids := int(endianess.Uint32(buf[12:16])) 144 | if numCentroids < 0 || numCentroids > 1<<22 { 145 | return errors.New("bad number of centroids in serialization") 146 | } 147 | 148 | if len(buf) < 16+(4*numCentroids) { 149 | return errors.New("buffer too small for deserialization") 150 | } 151 | 152 | t.count = 0 153 | t.compression = compression 154 | if t.summary == nil || 155 | cap(t.summary.means) < numCentroids || 156 | cap(t.summary.counts) < numCentroids { 157 | t.summary = newSummary(numCentroids) 158 | } 159 | t.summary.means = t.summary.means[:numCentroids] 160 | t.summary.counts = t.summary.counts[:numCentroids] 161 | 162 | idx := 16 163 | var x float64 164 | for i := 0; i < numCentroids; i++ { 165 | delta := math.Float32frombits(endianess.Uint32(buf[idx:])) 166 | idx += 4 167 | x += float64(delta) 168 | t.summary.means[i] = x 169 | } 170 | 171 | for i := 0; i < numCentroids; i++ { 172 | count, read := binary.Uvarint(buf[idx:]) 173 | if read < 1 { 174 | return errors.New("error decoding varint, this TDigest is now invalid") 175 | } 176 | 177 | idx += read 178 | 179 | t.summary.counts[i] = count 180 | t.count += count 181 | } 182 | 183 | if idx != len(buf) { 184 | return errors.New("buffer has unread data") 185 | } 186 | return nil 187 | } 188 | 189 | func encodeUint(buf *bytes.Buffer, n uint64) error { 190 | var b [binary.MaxVarintLen64]byte 191 | 192 | l := binary.PutUvarint(b[:], n) 193 | 194 | _, err := buf.Write(b[:l]) 195 | 196 | return err 197 | } 198 | 199 | func decodeUint(buf *bytes.Reader) (uint64, error) { 200 | v, err := 
binary.ReadUvarint(buf) 201 | return v, err 202 | } 203 | -------------------------------------------------------------------------------- /serialization_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "bytes" 5 | "encoding/base64" 6 | "math" 7 | "math/rand" 8 | "reflect" 9 | "testing" 10 | ) 11 | 12 | func TestEncodeDecode(t *testing.T) { 13 | testUints := []uint64{0, 10, 100, 1000, 10000, 65535, 2147483647, 2 * math.MaxUint32} 14 | buf := new(bytes.Buffer) 15 | 16 | for _, i := range testUints { 17 | err := encodeUint(buf, i) 18 | if err != nil { 19 | t.Error(err) 20 | } 21 | } 22 | 23 | readBuf := bytes.NewReader(buf.Bytes()) 24 | for _, i := range testUints { 25 | j, err := decodeUint(readBuf) 26 | if err != nil { 27 | t.Error(err) 28 | } 29 | 30 | if i != j { 31 | t.Errorf("Basic encode/decode failed. Got %d, wanted %d", j, i) 32 | } 33 | } 34 | } 35 | 36 | func TestSerialization(t *testing.T) { 37 | t1, _ := New() 38 | for i := 0; i < 100; i++ { 39 | _ = t1.Add(rand.Float64()) 40 | } 41 | 42 | serialized, _ := t1.AsBytes() 43 | 44 | t2, err := FromBytes(bytes.NewReader(serialized)) 45 | if err != nil { 46 | t.Fatal(err) 47 | } 48 | assertSerialization(t, t1, t2) 49 | 50 | err = t2.FromBytes(serialized) 51 | if err != nil { 52 | t.Fatal(err) 53 | } 54 | assertSerialization(t, t1, t2) 55 | 56 | var toBuf []byte 57 | toBuf = t1.ToBytes(toBuf) 58 | if !reflect.DeepEqual(serialized, toBuf) { 59 | t.Errorf("ToBytes serialized to something else") 60 | } 61 | 62 | // Make sure we don't re-allocate on buffer re-use 63 | toBuf2 := t1.ToBytes(toBuf[:0]) 64 | if &toBuf2[0] != &toBuf[0] { 65 | t.Errorf("Expected ToBytes() to re-use supplied slice") 66 | } 67 | if !reflect.DeepEqual(toBuf2, toBuf) { 68 | t.Errorf("ToBytes serialized to something else") 69 | } 70 | 71 | t3, _ := New() 72 | err = t3.FromBytes(serialized) 73 | if err != nil { 74 | t.Error(err) 75 | } 76 | 77 | 
assertSerialization(t, t1, t3) 78 | 79 | // Mess up t3's internal state, deserialize again. 80 | t3.compression = 2 81 | t3.count = 1000 82 | t3.summary.means = append(t3.summary.means, 2.0) 83 | t3.summary.counts[0] = 0 84 | err = t3.FromBytes(serialized) 85 | if err != nil { 86 | t.Error(err) 87 | } 88 | 89 | assertSerialization(t, t1, t3) 90 | 91 | wrong := serialized[:50] 92 | err = t3.FromBytes(wrong) 93 | if err == nil { 94 | t.Error("expected error") 95 | } 96 | wrong = wrong[:2] 97 | err = t3.FromBytes(wrong) 98 | if err == nil { 99 | t.Error("expected error") 100 | } 101 | } 102 | 103 | func assertSerialization(t *testing.T, t1, t2 *TDigest) { 104 | if t1.Count() != t2.Count() || 105 | t1.summary.Len() != t2.summary.Len() || 106 | t1.compression != t2.compression { 107 | t.Errorf("Deserialized to something different. t1=%v t2=%v", t1, t2) 108 | } 109 | 110 | b1, err := t1.AsBytes() 111 | if err != nil { 112 | t.Error(err) 113 | } 114 | 115 | b2, err := t2.AsBytes() 116 | if err != nil { 117 | t.Error(err) 118 | } 119 | 120 | if !bytes.Equal(b1, b2) { 121 | t.Errorf("Deserialized to something different. b1=%q b2=%q", b1, b2) 122 | } 123 | 124 | // t2 is fully functional. 
125 | 126 | err = t2.Add(rand.Float64()) 127 | if err != nil { 128 | t.Error(err) 129 | } 130 | 131 | err = t2.Compress() 132 | if err != nil { 133 | t.Error(err) 134 | } 135 | } 136 | 137 | func TestFromBytesIgnoresCompression(t *testing.T) { 138 | digest := uncheckedNew(Compression(42)) 139 | 140 | // Instructing FromBytes to use a compression different 141 | // than the one in the payload should be ignored 142 | payload, err := digest.AsBytes() 143 | 144 | if err != nil { 145 | t.Error(err) 146 | } 147 | 148 | other, err := FromBytes(bytes.NewReader(payload), Compression(100)) 149 | 150 | if err != nil { 151 | t.Error(err) 152 | } 153 | 154 | if other.Compression() != 42 { 155 | t.Errorf("Expected compression to be 42, got %f", other.Compression()) 156 | } 157 | } 158 | 159 | func TestLargeSerializaton(t *testing.T) { 160 | t1, err := New(Compression(10)) 161 | if err != nil { 162 | t.Error(err) 163 | } 164 | 165 | for i := 0; i < 100000; i++ { 166 | t1.AddWeighted(rand.Float64(), 1000000000) 167 | } 168 | 169 | serialized, _ := t1.AsBytes() 170 | serialized2 := t1.ToBytes(nil) 171 | if !reflect.DeepEqual(serialized, serialized2) { 172 | t.Error("serialized version differ") 173 | } 174 | 175 | t2, err := FromBytes(bytes.NewReader(serialized)) 176 | if err != nil { 177 | t.Error(err) 178 | } 179 | 180 | t3, _ := New() 181 | err = t3.FromBytes(serialized2) 182 | if err != nil { 183 | t.Error(err) 184 | } 185 | 186 | assertSerialization(t, t1, t2) 187 | assertSerialization(t, t1, t3) 188 | } 189 | 190 | func TestJavaSmallBytesCompat(t *testing.T) { 191 | // Base64 string generated via (<3 clojure): 192 | // (def t (com.tdunning.math.stats.AVLTreeDigest. 
100)) 193 | // (def r (java.util.Random.)) 194 | // (.setSeed r 0xDEADBEEF) 195 | // 196 | // (dotimes [x 100000] 197 | // (.add t (.nextDouble r))) 198 | // 199 | // (def buf (java.nio.ByteBuffer/allocate (.smallByteSize t))) 200 | // (.asSmallBytes t buf) 201 | // (.flip buf) 202 | // (.compress t) 203 | // (def serialized-tdigest (.encodeToString (java.util.Base64/getEncoder) (.array buf))) 204 | // 205 | // (println serialized-tdigest) 206 | 207 | serializedJavaTDigestB64 := "AAAAAkBZAAAAAAAAAAAEOzZpD1w24ySbN288eDfDHOI3jwpPN7jIyze1xXM2BzmuNc6x9DdUUcs2o1QFNvb5tzeNwTo2l0VYNgD89jaAiB83GxMBNTdLZzVjwOk3oKiDNxhS4jZ2blc2zTiiN8rlKDc7gN01HN5jNgF8bDYhIGo3BsH5NlbMcDdtCKQ3eJMUNzzuazQuLpY2y0lcNqNDdDcNDr03zOJ1N3ESMjcqxd42omxHNdA+mDbJmlo3KrIGN5i5/DegwGw10QY2NRuEmjdARF42g8qeN8L4yjajFVs1oIo9NvoNwDdrnuk2LeJGNwFHnTgGqu82TzfHN41Syzbd4xU2XjMVN1GPQjbMZOI2l91oNnY8CDdCy7U1wCuMNwLfyjfGDDo3FWWBNiEsSTiE9ZQ3rY03N6fEbDULhxU3i9qZNxuifjbeoMQ3vJ9mNpxU6jbvhEE3qOmYNrG09jcions3F6YRN5Ny1DUG5+E3P2m7NxXWSzd9PD03GBO9N+INZjczo844exOsNxmKIjgnk242m3GdNxrymzcJGSI1MVGaN6OzizblJ+43D9D/NvxA1Df2mZw339fFOB/KWzdN4WM4MhJoNpShjDfafXk3uSflN/uHhDeIUvI3ZOFqNqUkCDgokRU4VWp2Nzz1UjfigVQ4PHzkN8bWhzc21Kc3vOyQN8SJPjhEt344cC6EOAc/ZjfA9D04NZB9OAx8mzgsvD83oqOINzpg9jg9CWo4Y2qdN/r4XjbiH544DQY6OMvJSThcl+g4mnOyOKqdIzd91to4K72fODbsgjiPb2o4AmM6OIXueDhuDMs4PW1yN3ci2jhZGWY4aM5oOCDGwjiBKsk4FLcON7gbNTgr/zQ4e9V2N7qMkjiRTE04OiCKOA+kqDhK2u83jJvIN/P+6Tgw7v04voUSOExQKTgt8OU4ND0lN9CbPzhIfws4UJvSOBqgKzhe1TM4yVlsOLqRxjhsUw03lttXOEGkjTiTqns4kcOmOG2D5DgFx7847tKPONixTDhm8a84mAD1OFCXQzif2W84eVdkOPgJ+jjQy0g4a1HVOHLm7zjKP0k40bH4ODTQizj5Vn448ubuOQbg9TkIbEw4nuqfONUhyTktbsc48dTQOWSR7TkfJfU47iIfOQDP3jkN52Y5OibNOR1tRTk4XgM5JS+XODkp1DjNnOo4zOE/OTcKaDkfd4c5HTjLOTfMtzk1Tng5HH8aOTdpejlQok44yYMwN68whzgmY3o4kGHlOIRTqTkd2Jg45Dd0OHlnWzkEqtA5PENgOT6ckzlmuTQ5LPhpOL2F3zmPFVg5sPneOWfCETkZWu85KkV9ONN2zzlVKg85k3xqOYMdETkujEg5FZlSOYv3FznTwq05w571OXYNQDlfBkQ5NaiZOK74YzjPWAY5BSnsOUOhdTmCIsM5aphbOTm7cjmYQPw5WyLLOV8xQznRMgU5zm+AOb5MBDmEpF45lqbbOW3LNzlc5LI5ny6QObux3zmCqUY
5JJyxOXAibjm8mJA5zUCAOeW3Tznyf3w59LruOceUBzn0Gx05vTtsOfquPDoaISI58SiPOdEPpznD/cw5yU1bObG+/zm6Urs5vqXbOcfLwDmrd4s51P3sOhMXXTogDTA51iYXObgArDnGgzQ5/T2FOfi0RjndrFQ5y0IuOhXqcDorpag6RHR1Oixgxjnpq5Y5svRMOdkKbzmou5k5xefuOdiV6DooArU6SnueOkSo6ToPT2o6BvjmOgtxUjoXlSw6G2tQOfUe9jnksDw52dLDOi/y0TpDLTg6NmYNOgqIbTmzGkE52tMyOoJqNzqnDaQ6q5T1Opv6UTqQdX06g9A3OnfuGzqFI246hlrhOmJtZjpf25k6FCyPOhAMgDo1nyI6TLhWOoVgwzqXc746gJc8OlnKVjo/gk86iR5UOpq7PTqdzGY6fspZOp6HRjqEU7w6So02OiKgNToiGkg6aR5oOpHAEDpRg+M6TXTPOnXxkjq5Ct069YXFOvA6NDry/wY65ooNOtzuzDq+ECw6mx8DOpM6qzrBq7E6wn5oOsrlYDsOlM87DKunOw2+iDrzrio6xuE4OrWjPDqoohQ60tC2OvuLODrrWeE63cvKOtKc2jq/DwE6t3QKOrLbvDqrGOI6vpYpOv+cezslrTw7Gp54OxOm8zsWlYM7H8mYOz5/qzs1fHE7MGNuOzCnFDs1TVs7P/9NOx9ocTr+Wzs6+KXkOxOgZjrNhNA6gXVdOmH0LzqUolA6zwgZOw9UyTsD9I46/6KVOwVc4zsa2pc7CwuyOsUzETqLuas6mdYmOwWSnTswT4o7JX7qOwqBMzr5UBY66uyWOvW5NTsKjMk7JL4oOyJEiTsuq1Y7QQhEOzLQBjsjYeg7BRP5OvfqBTsPYUI7Lhs+O1NqIzuFBsU7eMaTOzuwvDsVjz47FheUOzc2FTtf4y87eSLlO4FCMDuE1iI7jx8lO6CXezuLCc87SkkEOw9XGzrUN/o60S9OOwRbLjs1xu87d3xMO32c8zuL04A7kPoQO4I88jtgMhk7PfEaOxILhDrxJ/g6+S+dOxZK4js70sU7ZAatO41myjuP8FU7iNOzO5IsQzuUZZ47anfmOzKbiDssSQo7RbdFOzZGbzsijbg7EdwqOzAoGzs/cx87c6CdO6fSjju/2mE7s2PuO7BVFDuvFvU7uGxCO6KcTzuFdpY7gW0tO1MZejs+0u87bhNOO5hzHDuuqfU7puR+O27/RjsnXA87FRP2OyNyRTtbJ4A7oHBjO6aMVTu2P3I7xGZWO8iz2TvO9Ns7xOtWO6PXxjt9d/47cPPYO31G+Ds3erY6yE0aOk8xCDpGySk6qq7zOvjgZTsgTBs7NM8GOx3z6Tsq29w7NmsfO0Hreztu3xY7dk9WO3TTGDtiz0s7ZOMLO46IeDuO1vk7bpo/O25jnDt3Vek7lJ5YO5mNajtsaXA7Fv+0Ou/dCTsJbAs7PTjsO22GJzt2eTk7XRjBO340CzuCQOw7clTLO19aJDs0gkk7EHVbOx2iqTsiMhI7I3XXOxQtAjsXZlM7JfrQOzPyIjtHLgk7fNvUO4nBdDuOYZI7hCk6O2l7ITtjevg7WwaEO1/S6TtFPiY7MW19O2bjLzt03jI7gsFbO4PWBDuGU9U7noVhO7Hg3zufXSU7gVMiO1o9xztFxDo7OaUcO2D1wztT9/87XyIcO0mFPzszGc47Kht5Oz23WjtmjYg7gKOVO3IZfDtIzFg7GVHmOuB6KDrOLOQ7CF6POyzH1TsuH3Q7OM/DO0xskTtuJHQ7cMB/O4bPzzuLdOU7h2SsO2f2xDs8IKw7JR0+Oyvwhjsae7E7DcaqOvnPADrXxC063tLlOvY0zTsTxUI7IkbcOzkVzzs/RpM7M1ZDOzBmnjs87aU7VcmTO0DL/zsbCFE69oQyOtABijrpNtM69c8UOu225DqbxxA6azZrOlKXRDp7vwE6p0qUOqxh8zqs7JE6rdnqOtzxNzsOlb4
7FRw2OxtFIDr6nlQ6yv8ROqncETqkc4I6m2kEOmU4oTp3bec6roWNOtkwujq4hkw6iT+5OnlU+DpkEhQ6aw67Opyl9DqM79c6S3J2Ofz8rDn30cA6HOhuOgXIsToEZQQ5znUNOgH78zoTKKU6YLyIOnWfAjqSDVk6hWPrOnGUwDpqn3c6eY3AOlo1SDqC9kU6bS35OoHBLzp8WgI6So6gOgZ8fzo4Ibw6elzBOo5ZVTp5wSU6d0e+OnCgezpxp1M6NoEzOiXlxzpUG4o6ZLCKOmLp9Tot5oU6Ja0OOhfxAzo4qNI6O9+0OjMq5zorV7w6Gv+gOhnd8zn7P1o6IRscOhLPqDngEEg5uIZJOdmxAToNmYA6Gm3GOmg3sjpCPz06WyRCOmGz3zovg7050RuqOfX3WDol3yg6ZUH8OjrgyzoIwlI6H9qmOhtrGTovTtc6SJ3rOkDdRzoYcX46D3P0OhFudToCa2A6BWyTOaegSzlmUn05X3MqOY5bPjnGv7Q55MBkOil2NDo4lt86Ro6YOlW4mjob5ig5wtjpOZheyTmLg/c5iuH1OYyCVjk+Om85JLQMOZCxJjmP6ro5kcX3OfVvATnGgWE5haCUOP8UPDk8C6Q5iKLHOUrz4zk0mMw5WFZuOUwQ6jl2t2A5qF4BOZp7xzmMEhU5yBgqOdWqmDnmxrg5e4YkOUTKvzkklbs42a5MOTN8WTkWOKU5YfgsOUAgnjmPd1g5h/cCOWbI1TmTGvQ5c2+oOYGlXTlz5jU5WuZmOUBj6Tka3hs5kuGEOUJ64DlaTDI5Qx6wOWLktTk6UT44uJu4OSVojjlryTM5HOuzOKzKFjkzpaA5IKMzOU6a+TkZ+fo4/z9xOJ4zlDlRXMw5JFNkOZ/S5Dm+nbg5mNQ2OTYfzjkIhDY5BF12OWhzpzl1yzU5L8odOUxnIDmfni05O9aXOM5YXzkHdR85Z1vyOVD4LDknGEU5AO+mOUFl9jj6flw4qTKoOMJxbDkZ23s45egCOSXNVzlR/6A5YzwWOIiTmzhDuiM4ckLKOGlfETiwlGw4nTLiOQarEzkghTg435iYOTcOYTk6mxE417BqOShVoTjZ/vA5EbZqOMiXfDj1EMo4qezxOG1Ozjixiu04dG/sOCBpCzihxbA4yQo4OK0e0jizeBg5NRWpOPpimziANrc4EZ7jOIeAXziW4aY5BGPgOOxqLzifzCM4JEU7OMG5kjht8D44Jl+qODrHFzgVIbg4mPQ2OCIORjb5Cp84e6X0OADpBDibM2E3YKiFNuQmfTiYFC04i3bKOL4ELjigoew4oYQDOJAK0Th/KW04Zp6VONEgiTjUZOo4jJV6OBFkKzeaXUc4GJisOBQsOjgCZYc39MHDOBTM6DiVqi84KANtOEaOoTiK1ic4DO70N+rSvDhFJ9Q3gn3TOHXRdTiQZXk4CJwPN8TvgzfSFxo4Gw8UN4leLTepicc39wbUOHg9JzgW8tY4bOD7OJ9hjzhJIeE4G7+IOCht/zhuOXM3EV0eOFMPMzcGRI84uDUbN458SjXUYcg3hQD/N6go1jezTCs3jAi9OA1kFDcSnxM3CklMN3sxdjbrFtQ31mLFN/TNLjdi2XI3qtLMOBHGRDgWdfk3wzlgN0BOqzc4DU43H7m6NpuDhzUjc3g3tgLNNwsZ2ja8xdE3cqBUNXhSZjgJ3V44SLYWN6ywljbIXyo4ALH1N4FO5Tdk/u032vrrNpsb/jb7F3k2lZaBN0W5CTdzvwo4K1P2NmooDzhI1os4IJJZN/xf4DddXcM3adDqN/JMMjgE5Ww3CbgmNzxbkzMyJWI3Jmn+NomgODatgGs3skvFNuHb1ze65og2929+NrFZrjcCh5c2wlG+NqssVDgEloc3RZefN1no3DatxX03KevuNrzoSzgIf/UyKB8LNvfYhjeIRBY3FDw7NkHG1zbsOyw3QEX0N3PYhDa2y0Q2Ew3jNrJhZDhi3UE1jsnzNALBbzbCKuU
20kNGNxgSUDW1cus29u8qNvHROjbGJ+U2d2R4NqmGPDh5qFE2kj5gNWP33zZqkM43sf31NzEY+DZsWkQ2qJYDNqUvJDYwDfQ2le0fNvwp2TZtVbk3V1A4NT3PtDd6xmY4FhDEN3OfxjfwGV4238DDNoB/qzaIS3E3Jz+iNmZ4nDeLlU43qQX/N6VXuzU+6Bk2N4WXNeQWsTcpydI3j8D1NrSMsjZRwxU23KKANyvmiTaKHrQ3zzqsNf/SMTchLt411j5sN1t3rDWBMKw21n5rOBeltQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQECAQEBAgEBAQEBAQEBAgICAgEBAgIBAwMBAgIBAQIDAQICAQIBAQEBAwIBAgEBAgMDAwIFAQIDAgIBBAQDAgICAQIDBAMEBAIFAgICAwUHAgIEBgYGAggDBgYCBgQEAQUDBgQDAwoCAgQEAwUCAQgIBQIKBQgEBwQNCQIGBwkFBA4MCQUFBwIICAYHBA0FBQkPEQsLDAkQEA4OCxMRDhMECggIFBMQGhQUExIECQYIChELDA4TERYMDSEbEhEKDhQbEQ8PLSobGBkQCBEPFB4RFCYbIjIcHxwhFhUjHQ8UGCckJSkmIjEmOCsiMScsHy0hHxspODcmKi4/MxcvR1RHMyQoMBotOkpXPiw4OkA/KyE3SlE3JyAygQGUAX+CAU1tYGZpXzYvQUpWd3tKPltvdGF+ckxKOUZ5T1BMap4ByAGuAa4BsgGjAYsBbHufAXOxAeUB2gHpAaUBkQGHAaYBygHJAb0BrwGgAZkBoAF6iwGdAeQB+wHfAdMB3AGfArwChQL7AZ8CpgKYArwB1QG2Ac8BhAFSXnirAeMBwQGmAeAB8wGkAW9ojQH4AbwCgQLPAZoBrAGzAfgBgwL7AZkCqgL8AdEBwAHHAeMBgALiArID1gLrAdkB+gG7AoQDiQOgA60D1QP+A/8CggKzAY0BrAHhAd8C4wKaA8YDnAP1AtkCjQK9AaUBzQGDAtwClwO5A5IDmAOHBKoDygLxAb8CugKSAtQB9gGIArMCvQOeBN0ErgSIBJ8EsQSxA/UChAOdAtICpwOZBJgE1AOhAtsB5gGWApsD8wOFBKgEggWFBfEEmQTLA/ICgQPqAswBWj5kuAHiAaUC/AHfAb8CrALSAvgChAPRAswCgAPGA4MD2QLSAp4DgATQA44CtgGfAfgBzwLiAswC6gKLA4sD5wLEAucB7gHrAfcB9AHiAegB+QGMAuAChgOxA5AD0QLiArEC3wK7AoYCpgL7ApMDnwOpA6sDggSdBKQD+gLIAqgCxgLPAtoCvgKWAvUBhgLHAoYD2wLEAvcBxAGdAbsB7QGCApQCmALJAsUCiQOpA68D/AKSAvwB9wGHAukBugGpAaUBngG6AfwBiwKUApsClQKmArECsAKZAuEBkgGsAccBxAGLAVNFWm6PAYoBe5cBwQHhAe0BzgHLAYYBggGbAVhThAGaAaEBhQFJW1p5iQFhOic2NS0mHjJAYGpsYVZcR2VsWF5JOTBKa3hUWGdSSFJXS1NAL0VRQz83LTQ2QDYjFSs+VEpabVQzJjNAWkM1LT5MPVBDKyw5IxcUFyEgPkNNTUUxKRwfHh4PGBkZKSUZEwwZHRcXDxUeGhgdKiYaHhEQCxISEh0REyAdGhoYDxMWFhIWHhEIChQNEA0TDBMMCQ8QEx8mGwsQERcWERcRDQ8TFg4REhILBxMJCQ4UEQQDCAQFDg0EERQLFhIHDA4NBgcKAwQLCQMMEQYDBwcIEAwKBgkFAwQHCAIBCAUDCAEJCwcLBwoFBgYGBQIEBQMCAgMJAwkCAwUBAwMGAwMDAgUFAgEDBgYHBAYBAwcBBQIBBgMCAgYBAQQCAwMBAgMEAgMDAQEBAQEDAQEDAgIBAgECAwECAQMDAwECAwICAQMCAQEBAQEBAgICAQIBAQI
CAQEBAQEBAQICAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQE=" 208 | 209 | tdigestAsBytes, err := base64.StdEncoding.DecodeString(serializedJavaTDigestB64) 210 | 211 | if err != nil { 212 | t.Fatal(err.Error()) 213 | } 214 | 215 | tdigest, err := FromBytes(bytes.NewReader(tdigestAsBytes)) 216 | 217 | if err != nil { 218 | t.Fatal(err.Error()) 219 | } 220 | 221 | if tdigest.Count() != 100000 { 222 | t.Fatalf("Expected deserialized t-digest to have a count of 100_000. Got %d", tdigest.Count()) 223 | } 224 | 225 | assertDifferenceSmallerThan(tdigest, 0.5, 0.02, t) 226 | assertDifferenceSmallerThan(tdigest, 0.1, 0.01, t) 227 | assertDifferenceSmallerThan(tdigest, 0.9, 0.01, t) 228 | assertDifferenceSmallerThan(tdigest, 0.01, 0.005, t) 229 | assertDifferenceSmallerThan(tdigest, 0.99, 0.005, t) 230 | assertDifferenceSmallerThan(tdigest, 0.001, 0.001, t) 231 | assertDifferenceSmallerThan(tdigest, 0.999, 0.001, t) 232 | } 233 | 234 | func BenchmarkAsBytes(b *testing.B) { 235 | b.ReportAllocs() 236 | 237 | t1, _ := New(Compression(100)) 238 | for i := 0; i < 100; i++ { 239 | t1.Add(rand.Float64()) 240 | } 241 | 242 | b.ResetTimer() 243 | 244 | for n := 0; n < b.N; n++ { 245 | t1.AsBytes() 246 | } 247 | } 248 | 249 | func BenchmarkToBytes(b *testing.B) { 250 | b.ReportAllocs() 251 | 252 | t1, _ := New(Compression(100)) 253 | for i := 0; i < 100; i++ { 254 | t1.Add(rand.Float64()) 255 | } 256 | 257 | b.ResetTimer() 258 | var buf []byte 259 | for n := 0; n < b.N; n++ { 260 | buf = t1.ToBytes(buf) 261 | } 262 | } 263 | 264 | func BenchmarkFromBytes(b *testing.B) { 265 | b.ReportAllocs() 266 | 267 | t1, _ := New(Compression(100)) 268 | for i := 0; i < 100; i++ { 269 | t1.Add(rand.Float64()) 270 | } 271 | 272 | buf, _ := t1.AsBytes() 273 | reader := bytes.NewReader(buf) 274 | 275 | b.ResetTimer() 276 | for n := 0; n < b.N; n++ { 277 | reader.Reset(buf) 278 | FromBytes(reader) 279 | } 280 | } 281 | 282 | func BenchmarkFromBytesMethod(b 
*testing.B) { 283 | b.ReportAllocs() 284 | 285 | t1, _ := New(Compression(100)) 286 | for i := 0; i < 100; i++ { 287 | t1.Add(rand.Float64()) 288 | } 289 | 290 | buf, _ := t1.AsBytes() 291 | 292 | b.ResetTimer() 293 | var t2 TDigest 294 | for n := 0; n < b.N; n++ { 295 | t2.FromBytes(buf) 296 | } 297 | } 298 | -------------------------------------------------------------------------------- /summary.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "sort" 7 | ) 8 | 9 | type summary struct { 10 | means []float64 11 | counts []uint64 12 | } 13 | 14 | func newSummary(initialCapacity int) *summary { 15 | s := &summary{ 16 | means: make([]float64, 0, initialCapacity), 17 | counts: make([]uint64, 0, initialCapacity), 18 | } 19 | return s 20 | } 21 | 22 | func (s *summary) Len() int { 23 | return len(s.means) 24 | } 25 | 26 | func (s *summary) Add(key float64, value uint64) error { 27 | if math.IsNaN(key) { 28 | return fmt.Errorf("key must not be NaN") 29 | } 30 | if value == 0 { 31 | return fmt.Errorf("Count must be >0") 32 | } 33 | 34 | idx := s.findInsertionIndex(key) 35 | 36 | s.means = append(s.means, math.NaN()) 37 | s.counts = append(s.counts, 0) 38 | 39 | copy(s.means[idx+1:], s.means[idx:]) 40 | copy(s.counts[idx+1:], s.counts[idx:]) 41 | 42 | s.means[idx] = key 43 | s.counts[idx] = value 44 | 45 | return nil 46 | } 47 | 48 | // Always insert to the right 49 | func (s *summary) findInsertionIndex(x float64) int { 50 | // Binary search is only worthwhile if we have a lot of keys. 51 | if len(s.means) < 250 { 52 | for i, mean := range s.means { 53 | if mean > x { 54 | return i 55 | } 56 | } 57 | return len(s.means) 58 | } 59 | 60 | return sort.Search(len(s.means), func(i int) bool { 61 | return s.means[i] > x 62 | }) 63 | } 64 | 65 | // This method is the hotspot when calling Add(), which in turn is called by 66 | // Compress() and Merge(). 
67 | func (s *summary) HeadSum(idx int) (sum float64) { 68 | return float64(sumUntilIndex(s.counts, idx)) 69 | } 70 | 71 | func (s *summary) Floor(x float64) int { 72 | return s.findIndex(x) - 1 73 | } 74 | 75 | func (s *summary) findIndex(x float64) int { 76 | // Binary search is only worthwhile if we have a lot of keys. 77 | if len(s.means) < 250 { 78 | for i, mean := range s.means { 79 | if mean >= x { 80 | return i 81 | } 82 | } 83 | return len(s.means) 84 | } 85 | 86 | return sort.Search(len(s.means), func(i int) bool { 87 | return s.means[i] >= x 88 | }) 89 | } 90 | 91 | func (s *summary) Mean(uncheckedIndex int) float64 { 92 | return s.means[uncheckedIndex] 93 | } 94 | 95 | func (s *summary) Count(uncheckedIndex int) uint64 { 96 | return s.counts[uncheckedIndex] 97 | } 98 | 99 | // return the index of the last item which the sum of counts 100 | // of items before it is less than or equal to `sum`. -1 in 101 | // case no centroid satisfies the requirement. 102 | // Since it's cheap, this also returns the `HeadSum` until 103 | // the found index (i.e. 
cumSum = HeadSum(FloorSum(x))) 104 | func (s *summary) FloorSum(sum float64) (index int, cumSum float64) { 105 | index = -1 106 | for i, count := range s.counts { 107 | if cumSum <= sum { 108 | index = i 109 | } else { 110 | break 111 | } 112 | cumSum += float64(count) 113 | } 114 | if index != -1 { 115 | cumSum -= float64(s.counts[index]) 116 | } 117 | return index, cumSum 118 | } 119 | 120 | func (s *summary) setAt(index int, mean float64, count uint64) { 121 | s.means[index] = mean 122 | s.counts[index] = count 123 | s.adjustRight(index) 124 | s.adjustLeft(index) 125 | } 126 | 127 | func (s *summary) adjustRight(index int) { 128 | for i := index + 1; i < len(s.means) && s.means[i-1] > s.means[i]; i++ { 129 | s.means[i-1], s.means[i] = s.means[i], s.means[i-1] 130 | s.counts[i-1], s.counts[i] = s.counts[i], s.counts[i-1] 131 | } 132 | } 133 | 134 | func (s *summary) adjustLeft(index int) { 135 | for i := index - 1; i >= 0 && s.means[i] > s.means[i+1]; i-- { 136 | s.means[i], s.means[i+1] = s.means[i+1], s.means[i] 137 | s.counts[i], s.counts[i+1] = s.counts[i+1], s.counts[i] 138 | } 139 | } 140 | 141 | func (s *summary) ForEach(f func(float64, uint64) bool) { 142 | for i, mean := range s.means { 143 | if !f(mean, s.counts[i]) { 144 | break 145 | } 146 | } 147 | } 148 | 149 | func (s *summary) Perm(rng RNG, f func(float64, uint64) bool) { 150 | for _, i := range perm(rng, s.Len()) { 151 | if !f(s.means[i], s.counts[i]) { 152 | break 153 | } 154 | } 155 | } 156 | 157 | func (s *summary) Clone() *summary { 158 | return &summary{ 159 | means: append([]float64{}, s.means...), 160 | counts: append([]uint64{}, s.counts...), 161 | } 162 | } 163 | 164 | // Randomly shuffles summary contents, so they can be added to another summary 165 | // with being pathological. Renders summary invalid. 
166 | func (s *summary) shuffle(rng RNG) { 167 | for i := len(s.means) - 1; i > 1; i-- { 168 | s.Swap(i, rng.Intn(i+1)) 169 | } 170 | } 171 | 172 | // for sort.Interface 173 | func (s *summary) Swap(i, j int) { 174 | s.means[i], s.means[j] = s.means[j], s.means[i] 175 | s.counts[i], s.counts[j] = s.counts[j], s.counts[i] 176 | } 177 | 178 | func (s *summary) Less(i, j int) bool { 179 | return s.means[i] < s.means[j] 180 | } 181 | 182 | // A simple loop unroll saves a surprising amount of time. 183 | func sumUntilIndex(s []uint64, idx int) uint64 { 184 | var cumSum uint64 185 | var i int 186 | for i = idx - 1; i >= 3; i -= 4 { 187 | cumSum += uint64(s[i]) 188 | cumSum += uint64(s[i-1]) 189 | cumSum += uint64(s[i-2]) 190 | cumSum += uint64(s[i-3]) 191 | } 192 | for ; i >= 0; i-- { 193 | cumSum += uint64(s[i]) 194 | } 195 | return cumSum 196 | } 197 | 198 | func perm(rng RNG, n int) []int { 199 | m := make([]int, n) 200 | for i := 1; i < n; i++ { 201 | j := rng.Intn(i + 1) 202 | m[i] = m[j] 203 | m[j] = i 204 | } 205 | return m 206 | } 207 | -------------------------------------------------------------------------------- /summary_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | "sort" 7 | "testing" 8 | ) 9 | 10 | func TestBasics(t *testing.T) { 11 | s := newSummary(2) 12 | 13 | err := s.Add(1, 1) 14 | 15 | if err != nil { 16 | t.Errorf("Failed to add simple item") 17 | } 18 | 19 | if s.Add(math.NaN(), 1) == nil { 20 | t.Errorf("Adding math.NaN() shouldn't be allowed") 21 | } 22 | 23 | if s.Add(1, 0) == nil { 24 | t.Errorf("Adding count=0 shouldn't be allowed") 25 | } 26 | } 27 | 28 | func checkSorted(s *summary, t *testing.T) { 29 | if !sort.Float64sAreSorted(s.means) { 30 | t.Fatalf("Keys are not sorted! 
%v", s.means) 31 | } 32 | } 33 | 34 | func TestCore(t *testing.T) { 35 | 36 | testData := make(map[float64]uint64) 37 | 38 | const maxDataSize = 10000 39 | s := newSummary(maxDataSize) 40 | checkSorted(s, t) 41 | 42 | if s.Len() != 0 { 43 | t.Errorf("Initial size should be zero regardless of capacity. Got %d", s.Len()) 44 | } 45 | 46 | // construct a summary made of unique items only 47 | for i := 0; i < maxDataSize; i++ { 48 | k := rand.Float64() 49 | v := rand.Uint64() 50 | 51 | _, exists := testData[k] 52 | if !exists { 53 | _ = s.Add(k, v) 54 | testData[k] = v 55 | } 56 | } 57 | 58 | checkSorted(s, t) 59 | 60 | if s.Len() != len(testData) { 61 | t.Errorf("Got Len() == %d. Expected %d", s.Len(), len(testData)) 62 | } 63 | 64 | for k, v := range testData { 65 | i := s.findIndex(k) 66 | 67 | if i == s.Len() { 68 | t.Errorf("Couldn't find previously added key on summary") 69 | continue 70 | } 71 | 72 | if s.means[i] != k || s.counts[i] != v { 73 | t.Errorf("Wanted to find {%.4f,%d}, but found {%.4f,%d} instead", k, v, s.means[i], s.counts[i]) 74 | } 75 | } 76 | } 77 | 78 | func TestSetAtNeverBreaksSorting(t *testing.T) { 79 | s := newSummary(10) 80 | 81 | for _, i := range []float64{10, 10, 10, 10, 10} { 82 | _ = s.Add(i, 1) 83 | } 84 | 85 | s.setAt(0, 30, 1) 86 | checkSorted(s, t) 87 | 88 | s.setAt(s.Len()-1, 0, 1) 89 | checkSorted(s, t) 90 | 91 | s.setAt(3, 10.1, 1) 92 | checkSorted(s, t) 93 | 94 | s.setAt(3, 9.9, 1) 95 | checkSorted(s, t) 96 | 97 | } 98 | 99 | func TestForEach(t *testing.T) { 100 | 101 | s := newSummary(10) 102 | for _, i := range []uint64{1, 2, 3, 4, 5, 6} { 103 | _ = s.Add(float64(i), i*10) 104 | } 105 | 106 | c := 0 107 | s.ForEach(func(mean float64, count uint64) bool { 108 | c++ 109 | return false 110 | }) 111 | 112 | if c != 1 { 113 | t.Errorf("ForEach must exit early if the closure returns false") 114 | } 115 | 116 | var tot uint64 117 | s.ForEach(func(mean float64, count uint64) bool { 118 | tot += count 119 | return true 120 | }) 121 | 
122 | if tot != 210 { 123 | t.Errorf("ForEach must walk through the whole data if it always returns true") 124 | } 125 | } 126 | 127 | func TestFloorSum(t *testing.T) { 128 | s := newSummary(100) 129 | var total uint64 130 | for i := 0; i < 100; i++ { 131 | count := uint64(rand.Intn(10)) + 1 132 | _ = s.Add(rand.Float64(), count) 133 | total += count 134 | } 135 | 136 | idx, _ := s.FloorSum(-1) 137 | if idx != -1 { 138 | t.Errorf("Expected no centroid to satisfy -1 but got index=%d", idx) 139 | } 140 | 141 | for i := float64(0); i < float64(total)+10; i++ { 142 | node, _ := s.FloorSum(i) 143 | if s.HeadSum(node) > i { 144 | t.Errorf("headSum(%d)=%.0f (>%.0f)", node, s.HeadSum(node), i) 145 | } 146 | if node+1 < s.Len() && s.HeadSum(node+1) <= i { 147 | t.Errorf("headSum(%d)=%.0f (>%.0f)", node+1, s.HeadSum(node+1), i) 148 | } 149 | } 150 | } 151 | 152 | func TestFloor(t *testing.T) { 153 | s := newSummary(200) 154 | for i := float64(0); i < 101; i++ { 155 | _ = s.Add(i/2.0, 1) 156 | } 157 | 158 | if s.Floor(-30) != -1 { 159 | t.Errorf("Shouldn't have found a floor index. Got %d", s.Floor(-30)) 160 | } 161 | 162 | for i := 0; i < s.Len(); i++ { 163 | m := s.means[i] 164 | f := s.means[s.Floor(m+0.1)] 165 | if m != f { 166 | t.Errorf("Erm, %.4f != %.4f", m, f) 167 | } 168 | } 169 | } 170 | 171 | func TestAdjustLeftRight(t *testing.T) { 172 | 173 | keys := []float64{1, 2, 3, 4, 9, 5, 6, 7, 8} 174 | counts := []uint64{1, 2, 3, 4, 9, 5, 6, 7, 8} 175 | 176 | s := summary{means: keys, counts: counts} 177 | 178 | s.adjustRight(4) 179 | 180 | if !sort.Float64sAreSorted(s.means) || s.counts[4] != 5 { 181 | t.Errorf("adjustRight should have fixed the keys/counts state. 
%v %v", s.means, s.counts) 182 | } 183 | 184 | keys = []float64{1, 2, 3, 4, 0, 5, 6, 7, 8} 185 | counts = []uint64{1, 2, 3, 4, 0, 5, 6, 7, 8} 186 | 187 | s = summary{means: keys, counts: counts} 188 | s.adjustLeft(4) 189 | 190 | if !sort.Float64sAreSorted(s.means) || s.counts[4] != 4 { 191 | t.Errorf("adjustLeft should have fixed the keys/counts state. %v %v", s.means, s.counts) 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /tdigest.go: -------------------------------------------------------------------------------- 1 | // Package tdigest provides a highly accurate mergeable data-structure 2 | // for quantile estimation. 3 | // 4 | // Typical T-Digest use cases involve accumulating metrics on several 5 | // distinct nodes of a cluster and then merging them together to get 6 | // a system-wide quantile overview. Things such as: sensory data from 7 | // IoT devices, quantiles over enormous document datasets (think 8 | // ElasticSearch), performance metrics for distributed systems, etc. 9 | // 10 | // After you create (and configure, if desired) the digest: 11 | // 12 | // digest, err := tdigest.New(tdigest.Compression(100)) 13 | // 14 | // You can then use it for registering measurements: 15 | // 16 | // digest.Add(number) 17 | // 18 | // Estimating quantiles: 19 | // 20 | // digest.Quantile(0.99) 21 | // 22 | // And merging with another digest: 23 | // 24 | // digest.Merge(otherDigest) 25 | package tdigest 26 | 27 | import ( 28 | "fmt" 29 | "math" 30 | ) 31 | 32 | // TDigest is a quantile approximation data structure. 33 | type TDigest struct { 34 | summary *summary 35 | compression float64 36 | count uint64 37 | rng RNG 38 | } 39 | 40 | // New creates a new digest. 41 | // 42 | // By default the digest is constructed with a configuration that 43 | // should be useful for most use-cases. It comes with compression 44 | // set to 100 and uses a local random number generator for 45 | // performance reasons. 
46 | func New(options ...tdigestOption) (*TDigest, error) { 47 | tdigest, err := newWithoutSummary(options...) 48 | 49 | if err != nil { 50 | return nil, err 51 | } 52 | 53 | tdigest.summary = newSummary(estimateCapacity(tdigest.compression)) 54 | return tdigest, nil 55 | } 56 | 57 | // Creates a tdigest instance without allocating a summary. 58 | func newWithoutSummary(options ...tdigestOption) (*TDigest, error) { 59 | tdigest := &TDigest{ 60 | compression: 100, 61 | count: 0, 62 | } 63 | 64 | for _, option := range options { 65 | err := option(tdigest) 66 | if err != nil { 67 | return nil, err 68 | } 69 | } 70 | 71 | if tdigest.rng == nil { 72 | tdigest.rng = newLocalRNG(1) 73 | } 74 | 75 | return tdigest, nil 76 | } 77 | 78 | func _quantile(index float64, previousIndex float64, nextIndex float64, previousMean float64, nextMean float64) float64 { 79 | delta := nextIndex - previousIndex 80 | previousWeight := (nextIndex - index) / delta 81 | nextWeight := (index - previousIndex) / delta 82 | return previousMean*previousWeight + nextMean*nextWeight 83 | } 84 | 85 | // Compression returns the TDigest compression. 86 | func (t *TDigest) Compression() float64 { 87 | return t.compression 88 | } 89 | 90 | // Quantile returns the desired percentile estimation. 91 | // 92 | // Values of p must be between 0 and 1 (inclusive), will panic otherwise. 
93 | func (t *TDigest) Quantile(q float64) float64 { 94 | if q < 0 || q > 1 { 95 | panic("q must be between 0 and 1 (inclusive)") 96 | } 97 | 98 | if t.summary.Len() == 0 { 99 | return math.NaN() 100 | } else if t.summary.Len() == 1 { 101 | return t.summary.Mean(0) 102 | } 103 | 104 | index := q * float64(t.count-1) 105 | previousMean := math.NaN() 106 | previousIndex := float64(0) 107 | next, total := t.summary.FloorSum(index) 108 | 109 | if next > 0 { 110 | previousMean = t.summary.Mean(next - 1) 111 | previousIndex = total - float64(t.summary.Count(next-1)+1)/2 112 | } 113 | 114 | for { 115 | nextIndex := total + float64(t.summary.Count(next)-1)/2 116 | if nextIndex >= index { 117 | if math.IsNaN(previousMean) { 118 | // the index is before the 1st centroid 119 | if nextIndex == previousIndex { 120 | return t.summary.Mean(next) 121 | } 122 | // assume linear growth 123 | nextIndex2 := total + float64(t.summary.Count(next)) + float64(t.summary.Count(next+1)-1)/2 124 | previousMean = (nextIndex2*t.summary.Mean(next) - nextIndex*t.summary.Mean(next+1)) / (nextIndex2 - nextIndex) 125 | } 126 | // common case: two centroids found, the result in in between 127 | return _quantile(index, previousIndex, nextIndex, previousMean, t.summary.Mean(next)) 128 | } else if next+1 == t.summary.Len() { 129 | // the index is after the last centroid 130 | nextIndex2 := float64(t.count - 1) 131 | nextMean2 := (t.summary.Mean(next)*(nextIndex2-previousIndex) - previousMean*(nextIndex2-nextIndex)) / (nextIndex - previousIndex) 132 | return _quantile(index, nextIndex, nextIndex2, t.summary.Mean(next), nextMean2) 133 | } 134 | total += float64(t.summary.Count(next)) 135 | previousMean = t.summary.Mean(next) 136 | previousIndex = nextIndex 137 | next++ 138 | } 139 | // unreachable 140 | } 141 | 142 | // boundedWeightedAverage computes the weighted average of two 143 | // centroids guaranteeing that the result will be between x1 and x2, 144 | // inclusive. 
145 | // 146 | // Refer to https://github.com/caio/go-tdigest/pull/19 for more details 147 | func boundedWeightedAverage(x1 float64, w1 float64, x2 float64, w2 float64) float64 { 148 | if x1 > x2 { 149 | x1, x2, w1, w2 = x2, x1, w2, w1 150 | } 151 | result := (x1*w1 + x2*w2) / (w1 + w2) 152 | return math.Max(x1, math.Min(result, x2)) 153 | } 154 | 155 | // AddWeighted registers a new sample in the digest. 156 | // 157 | // It's the main entry point for the digest and very likely the only 158 | // method to be used for collecting samples. The count parameter is for 159 | // when you are registering a sample that occurred multiple times - the 160 | // most common value for this is 1. 161 | // 162 | // This will emit an error if `value` is NaN or if `count` is zero. 163 | func (t *TDigest) AddWeighted(value float64, count uint64) (err error) { 164 | if count == 0 { 165 | return fmt.Errorf("illegal datapoint ", value, count) 166 | } 167 | 168 | if t.summary.Len() == 0 { 169 | err = t.summary.Add(value, count) 170 | t.count = uint64(count) 171 | return err 172 | } 173 | 174 | begin := t.summary.Floor(value) 175 | if begin == -1 { 176 | begin = 0 177 | } 178 | 179 | begin, end := t.findNeighbors(begin, value) 180 | 181 | closest := t.chooseMergeCandidate(begin, end, count) 182 | 183 | if closest == t.summary.Len() { 184 | err = t.summary.Add(value, count) 185 | if err != nil { 186 | return err 187 | } 188 | } else { 189 | c := float64(t.summary.Count(closest)) 190 | newMean := boundedWeightedAverage(t.summary.Mean(closest), c, value, float64(count)) 191 | t.summary.setAt(closest, newMean, uint64(c)+count) 192 | } 193 | t.count += uint64(count) 194 | 195 | if float64(t.summary.Len()) > 20*t.compression { 196 | err = t.Compress() 197 | } 198 | 199 | return err 200 | } 201 | 202 | // Count returns the total number of samples this digest represents 203 | // 204 | // The result represents how many times Add() was called on a digest 205 | // plus how many samples the digests 
it has been merged with had. 206 | // This is useful mainly for two scenarios: 207 | // 208 | // - Knowing if there is enough data so you can trust the quantiles 209 | // 210 | // - Knowing if you've registered too many samples already and 211 | // deciding what to do about it. 212 | // 213 | // For the second case one approach would be to create a side empty 214 | // digest and start registering samples on it as well as on the old 215 | // (big) one and then discard the bigger one after a certain criterion 216 | // is reached (say, minimum number of samples or a small relative 217 | // error between new and old digests). 218 | func (t TDigest) Count() uint64 { 219 | return t.count 220 | } 221 | 222 | // Add is an alias for AddWeighted(x,1) 223 | // Read the documentation for AddWeighted for more details. 224 | func (t *TDigest) Add(value float64) error { 225 | return t.AddWeighted(value, 1) 226 | } 227 | 228 | // Compress tries to reduce the number of individual centroids stored 229 | // in the digest. 230 | // 231 | // Compression trades off accuracy for performance and happens 232 | // automatically after a certain amount of distinct samples have been 233 | // stored. 234 | // 235 | // At any point in time you may call Compress on a digest, but you 236 | // may completely ignore this and it will compress itself automatically 237 | // after it grows too much. If you are minimizing network traffic 238 | // it might be a good idea to compress before serializing. 239 | func (t *TDigest) Compress() (err error) { 240 | if t.summary.Len() <= 1 { 241 | return nil 242 | } 243 | 244 | oldTree := t.summary 245 | t.summary = newSummary(estimateCapacity(t.compression)) 246 | t.count = 0 247 | 248 | oldTree.shuffle(t.rng) 249 | oldTree.ForEach(func(mean float64, count uint64) bool { 250 | err = t.AddWeighted(mean, count) 251 | return err == nil 252 | }) 253 | return err 254 | } 255 | 256 | // Merge joins a given digest into itself. 
257 | // 258 | // Merging is useful when you have multiple TDigest instances running 259 | // in separate threads and you want to compute quantiles over all the 260 | // samples. This is particularly important on a scatter-gather/map-reduce 261 | // scenario. 262 | func (t *TDigest) Merge(other *TDigest) (err error) { 263 | if other.summary.Len() == 0 { 264 | return nil 265 | } 266 | 267 | other.summary.Perm(t.rng, func(mean float64, count uint64) bool { 268 | err = t.AddWeighted(mean, count) 269 | return err == nil 270 | }) 271 | return err 272 | } 273 | 274 | // MergeDestructive joins a given digest into itself rendering 275 | // the other digest invalid. 276 | // 277 | // This works as Merge above but its faster. Using this method 278 | // requires caution as it makes 'other' useless - you must make 279 | // sure you discard it without making further uses of it. 280 | func (t *TDigest) MergeDestructive(other *TDigest) (err error) { 281 | if other.summary.Len() == 0 { 282 | return nil 283 | } 284 | 285 | other.summary.shuffle(t.rng) 286 | other.summary.ForEach(func(mean float64, count uint64) bool { 287 | err = t.AddWeighted(mean, count) 288 | return err == nil 289 | }) 290 | return err 291 | } 292 | 293 | // CDF computes the fraction in which all samples are less than 294 | // or equal to the given value. 
295 | func (t *TDigest) CDF(value float64) float64 { 296 | if t.summary.Len() == 0 { 297 | return math.NaN() 298 | } else if t.summary.Len() == 1 { 299 | if value < t.summary.Mean(0) { 300 | return 0 301 | } 302 | return 1 303 | } 304 | 305 | // We have at least 2 centroids 306 | left := (t.summary.Mean(1) - t.summary.Mean(0)) / 2 307 | right := left 308 | tot := 0.0 309 | 310 | for i := 1; i < t.summary.Len()-1; i++ { 311 | prevMean := t.summary.Mean(i - 1) 312 | if value < prevMean+right { 313 | v := (tot + float64(t.summary.Count(i-1))*interpolate(value, prevMean-left, prevMean+right)) / float64(t.Count()) 314 | if v > 0 { 315 | return v 316 | } 317 | return 0 318 | } 319 | 320 | tot += float64(t.summary.Count(i - 1)) 321 | left = right 322 | right = (t.summary.Mean(i+1) - t.summary.Mean(i)) / 2 323 | } 324 | 325 | // last centroid, the summary length is at least two 326 | aIdx := t.summary.Len() - 2 327 | aMean := t.summary.Mean(aIdx) 328 | if value < aMean+right { 329 | aCount := float64(t.summary.Count(aIdx)) 330 | return (tot + aCount*interpolate(value, aMean-left, aMean+right)) / float64(t.Count()) 331 | } 332 | return 1 333 | } 334 | 335 | // Clone returns a deep copy of a TDigest. 336 | func (t *TDigest) Clone() *TDigest { 337 | return &TDigest{ 338 | summary: t.summary.Clone(), 339 | compression: t.compression, 340 | count: t.count, 341 | rng: t.rng, 342 | } 343 | } 344 | 345 | func interpolate(x, x0, x1 float64) float64 { 346 | return (x - x0) / (x1 - x0) 347 | } 348 | 349 | // ForEachCentroid calls the specified function for each centroid. 350 | // 351 | // Iteration stops when the supplied function returns false, or when all 352 | // centroids have been iterated. 
353 | func (t *TDigest) ForEachCentroid(f func(mean float64, count uint64) bool) { 354 | t.summary.ForEach(f) 355 | } 356 | 357 | func (t TDigest) findNeighbors(start int, value float64) (int, int) { 358 | minDistance := math.MaxFloat64 359 | lastNeighbor := t.summary.Len() 360 | for neighbor := start; neighbor < t.summary.Len(); neighbor++ { 361 | z := math.Abs(t.summary.Mean(neighbor) - value) 362 | if z < minDistance { 363 | start = neighbor 364 | minDistance = z 365 | } else if z > minDistance { 366 | lastNeighbor = neighbor 367 | break 368 | } 369 | } 370 | return start, lastNeighbor 371 | } 372 | 373 | func (t TDigest) chooseMergeCandidate(begin, end int, count uint64) int { 374 | closest := t.summary.Len() 375 | sum := t.summary.HeadSum(begin) 376 | var n float32 377 | 378 | for neighbor := begin; neighbor != end; neighbor++ { 379 | c := float64(t.summary.Count(neighbor)) 380 | var q float64 381 | if t.count == 1 { 382 | q = 0.5 383 | } else { 384 | q = (sum + (c-1)/2) / float64(t.count-1) 385 | } 386 | k := 4 * float64(t.count) * q * (1 - q) / t.compression 387 | 388 | if c+float64(count) <= k { 389 | n++ 390 | if t.rng.Float32() < 1/n { 391 | closest = neighbor 392 | } 393 | } 394 | sum += c 395 | } 396 | return closest 397 | } 398 | 399 | // TrimmedMean returns the mean of the distribution between the two 400 | // percentiles p1 and p2. 401 | // 402 | // Values of p1 and p2 must be beetween 0 and 1 (inclusive) and p1 403 | // must be less than p2. Will panic otherwise. 
404 | func (t *TDigest) TrimmedMean(p1, p2 float64) float64 { 405 | if p1 < 0 || p1 > 1 { 406 | panic("p1 must be between 0 and 1 (inclusive)") 407 | } 408 | if p2 < 0 || p2 > 1 { 409 | panic("p2 must be between 0 and 1 (inclusive)") 410 | } 411 | if p1 >= p2 { 412 | panic("p1 must be lower than p2") 413 | } 414 | 415 | minCount := p1 * float64(t.count) 416 | maxCount := p2 * float64(t.count) 417 | 418 | var trimmedSum, trimmedCount, currCount float64 419 | for i, mean := range t.summary.means { 420 | count := float64(t.summary.counts[i]) 421 | 422 | nextCount := currCount + count 423 | if nextCount <= minCount { 424 | currCount = nextCount 425 | continue 426 | } 427 | 428 | if currCount < minCount { 429 | count = nextCount - minCount 430 | } 431 | if nextCount > maxCount { 432 | count -= nextCount - maxCount 433 | } 434 | 435 | trimmedSum += count * mean 436 | trimmedCount += count 437 | 438 | if nextCount >= maxCount { 439 | break 440 | } 441 | currCount = nextCount 442 | } 443 | 444 | if trimmedCount == 0 { 445 | return 0 446 | } 447 | return trimmedSum / trimmedCount 448 | } 449 | 450 | func estimateCapacity(compression float64) int { 451 | return int(compression) * 10 452 | } 453 | -------------------------------------------------------------------------------- /tdigest_test.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "math/rand" 7 | "sort" 8 | "testing" 9 | 10 | rng "github.com/leesper/go_rng" 11 | "gonum.org/v1/gonum/stat" 12 | ) 13 | 14 | func uncheckedNew(options ...tdigestOption) *TDigest { 15 | t, _ := New(options...) 16 | return t 17 | } 18 | 19 | // Test of tdigest internals and accuracy. Note no t.Parallel(): 20 | // during tests the default random seed is consistent, but varying 21 | // concurrency scheduling mixes up the random values used in each test. 
22 | // Since there's a random number call inside tdigest this breaks repeatability 23 | // for all tests. So, no test concurrency here. 24 | 25 | func TestTInternals(t *testing.T) { 26 | tdigest := uncheckedNew() 27 | 28 | if !math.IsNaN(tdigest.Quantile(0.1)) { 29 | t.Errorf("Quantile() on an empty digest should return NaN. Got: %.4f", tdigest.Quantile(0.1)) 30 | } 31 | 32 | if !math.IsNaN(tdigest.CDF(1)) { 33 | t.Errorf("CDF() on an empty digest should return NaN. Got: %.4f", tdigest.CDF(1)) 34 | } 35 | 36 | _ = tdigest.Add(0.4) 37 | 38 | if tdigest.Quantile(0.1) != 0.4 { 39 | t.Errorf("Quantile() on a single-sample digest should return the samples's mean. Got %.4f", tdigest.Quantile(0.1)) 40 | } 41 | 42 | if tdigest.CDF(0.3) != 0 { 43 | t.Errorf("CDF(x) on digest with a single centroid should return 0 if x < mean") 44 | } 45 | 46 | if tdigest.CDF(0.5) != 1 { 47 | t.Errorf("CDF(x) on digest with a single centroid should return 1 if x >= mean") 48 | } 49 | 50 | _ = tdigest.Add(0.5) 51 | 52 | if tdigest.summary.Len() != 2 { 53 | t.Errorf("Expected size 2, got %d", tdigest.summary.Len()) 54 | } 55 | 56 | err := tdigest.AddWeighted(0, 0) 57 | 58 | if err == nil { 59 | t.Errorf("Expected AddWeighted() to error out with input (0,0)") 60 | } 61 | } 62 | 63 | func closeEnough(a float64, b float64) bool { 64 | const EPS = 0.000001 65 | if (a-b < EPS) && (b-a < EPS) { 66 | return true 67 | } 68 | return false 69 | } 70 | 71 | func assertDifferenceSmallerThan(tdigest *TDigest, p float64, m float64, t *testing.T) { 72 | tp := tdigest.Quantile(p) 73 | if math.Abs(tp-p) >= m { 74 | t.Errorf("T-Digest.Quantile(%.4f) = %.4f. 
Diff (%.4f) >= %.4f", p, tp, math.Abs(tp-p), m) 75 | } 76 | } 77 | 78 | func TestUniformDistribution(t *testing.T) { 79 | tdigest := uncheckedNew() 80 | 81 | for i := 0; i < 100000; i++ { 82 | _ = tdigest.Add(rand.Float64()) 83 | } 84 | 85 | assertDifferenceSmallerThan(tdigest, 0.5, 0.02, t) 86 | assertDifferenceSmallerThan(tdigest, 0.1, 0.01, t) 87 | assertDifferenceSmallerThan(tdigest, 0.9, 0.01, t) 88 | assertDifferenceSmallerThan(tdigest, 0.01, 0.005, t) 89 | assertDifferenceSmallerThan(tdigest, 0.99, 0.005, t) 90 | assertDifferenceSmallerThan(tdigest, 0.001, 0.001, t) 91 | assertDifferenceSmallerThan(tdigest, 0.999, 0.001, t) 92 | } 93 | 94 | // Asserts quantile p is no greater than absolute m off from "true" 95 | // fractional quantile for supplied data. So m must be scaled 96 | // appropriately for source data range. 97 | func assertDifferenceFromQuantile(data []float64, tdigest *TDigest, p float64, m float64, t *testing.T) { 98 | q := quantile(p, data) 99 | tp := tdigest.Quantile(p) 100 | 101 | if math.Abs(tp-q) >= m { 102 | t.Fatalf("T-Digest.Quantile(%.4f) = %.4f vs actual %.4f. 
Diff (%.4f) >= %.4f", p, tp, q, math.Abs(tp-q), m)
	}
}

// TestSequentialInsertion adds monotonically increasing values one at a
// time and checks that the digest's quantile estimates stay within an
// error budget that grows linearly with the number of samples seen.
func TestSequentialInsertion(t *testing.T) {
	tdigest := uncheckedNew()

	data := make([]float64, 10000)
	for i := 0; i < len(data); i++ {
		data[i] = float64(i)
	}

	for i := 0; i < len(data); i++ {
		_ = tdigest.Add(data[i])

		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.001, 1.0+0.001*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.01, 1.0+0.005*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.05, 1.0+0.01*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.25, 1.0+0.03*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.5, 1.0+0.03*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.75, 1.0+0.03*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.95, 1.0+0.01*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.99, 1.0+0.005*float64(i), t)
		assertDifferenceFromQuantile(data[:i+1], tdigest, 0.999, 1.0+0.001*float64(i), t)
	}
}

// TestNonSequentialInsertion adds values in a scrambled (but deterministic)
// order and verifies quantile estimates against the sorted prefix.
func TestNonSequentialInsertion(t *testing.T) {
	tdigest := uncheckedNew()

	// Not quite a uniform distribution, but close.
	data := make([]float64, 1000)
	for i := 0; i < len(data); i++ {
		tmp := (i * 1627) % len(data)
		data[i] = float64(tmp)
	}

	sorted := make([]float64, 0, len(data))

	for i := 0; i < len(data); i++ {
		_ = tdigest.Add(data[i])
		sorted = append(sorted, data[i])

		// Estimated quantiles are all over the place for low counts, which is
		// OK given that something like P99 is not very meaningful when there are
		// 25 samples. To account for this, increase the error tolerance for
		// smaller counts.
		if i == 0 {
			continue
		}

		max := float64(len(data))
		fac := 1.0 + max/float64(i)

		sort.Float64s(sorted)
		assertDifferenceFromQuantile(sorted, tdigest, 0.001, fac+0.001*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.01, fac+0.005*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.05, fac+0.01*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.25, fac+0.01*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.5, fac+0.02*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.75, fac+0.01*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.95, fac+0.01*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.99, fac+0.005*max, t)
		assertDifferenceFromQuantile(sorted, tdigest, 0.999, fac+0.001*max, t)
	}
}

// TestSingletonInACrowd checks that a single outlier among many identical
// samples does not distort the low/mid quantiles, and that Quantile(1)
// still returns the outlier.
func TestSingletonInACrowd(t *testing.T) {
	tdigest := uncheckedNew()
	for i := 0; i < 10000; i++ {
		_ = tdigest.Add(10)
	}
	_ = tdigest.Add(20)
	_ = tdigest.Compress()

	for _, q := range []float64{0, 0.5, 0.8, 0.9, 0.99, 0.999} {
		if q == 0.999 {
			// Test for 0.999 disabled since it doesn't
			// pass in the reference implementation
			continue
		}
		result := tdigest.Quantile(q)
		if !closeEnough(result, 10) {
			t.Errorf("Expected Quantile(%.3f) = 10, but got %.4f (size=%d)", q, result, tdigest.summary.Len())
		}
	}

	result := tdigest.Quantile(1)
	if result != 20 {
		t.Errorf("Expected Quantile(1) = 20, but got %.4f (size=%d)", result, tdigest.summary.Len())
	}
}

// TestRespectBounds verifies that quantile estimates never fall outside
// the [min, max] range of the inserted data.
func TestRespectBounds(t *testing.T) {
	tdigest := uncheckedNew(Compression(10))

	data := []float64{0, 279, 2, 281}
	for _, f := range data {
		_ = tdigest.Add(f)
	}

	quantiles := []float64{0.01, 0.25, 0.5, 0.75, 0.999}
	for _, q := range quantiles {
		result := tdigest.Quantile(q)
		if result < 0 {
			t.Errorf("q(%.3f) = %.4f < 0", q, result)
		}
		// Reuse the already-computed result here; the upper-bound check
		// previously recomputed Quantile(q) while reporting `result`.
		if result > 281 {
			t.Errorf("q(%.3f) = %.4f > 281", q, result)
		}
	}
}

// TestWeights checks AddWeighted by mirroring each weighted insertion with
// the equivalent repeated samples and comparing quantile estimates.
func TestWeights(t *testing.T) {
	tdigest := uncheckedNew(Compression(10))

	// Create data slice with repeats matching weights we gave to tdigest
	data := []float64{}
	for i := 0; i < 100; i++ {
		_ = tdigest.AddWeighted(float64(i), uint64(i))

		for j := 0; j < i; j++ {
			data = append(data, float64(i))
		}
	}

	assertDifferenceFromQuantile(data, tdigest, 0.001, 1.0+0.001*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.01, 1.0+0.005*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.05, 1.0+0.01*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.25, 1.0+0.01*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.5, 1.0+0.02*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.75, 1.0+0.01*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.95, 1.0+0.01*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.99, 1.0+0.005*100.0, t)
	assertDifferenceFromQuantile(data, tdigest, 0.999, 1.0+0.001*100.0, t)
}

// TestIntegers verifies exact medians for small integer inputs and that
// ForEachCentroid sees the complete sample count.
func TestIntegers(t *testing.T) {
	tdigest := uncheckedNew()

	_ = tdigest.Add(1)
	_ = tdigest.Add(2)
	_ = tdigest.Add(3)

	if tdigest.Quantile(0.5) != 2 {
		t.Errorf("Expected p(0.5) = 2, Got %.2f instead", tdigest.Quantile(0.5))
	}

	tdigest = uncheckedNew()

	for _, i := range []float64{1, 2, 2, 2, 2, 2, 2, 2, 3} {
		_ = tdigest.Add(i)
	}

	if tdigest.Quantile(0.5) != 2 {
		t.Errorf("Expected p(0.5) = 2, Got %.2f instead", tdigest.Quantile(0.5))
	}

	var tot uint64
	tdigest.ForEachCentroid(func(mean float64, count uint64) bool {
		tot += count
		return true
	})

	if tot != 9 {
		t.Errorf("Expected the centroid count to be 9, Got %d instead", tot)
	}
}

// cdf returns the empirical CDF of x over data using the midpoint
// convention: ties at x contribute half their weight.
func cdf(x float64, data []float64) float64 {
	var n1, n2 int
	for i := 0; i < len(data); i++ {
		if data[i] < x {
			n1++
		}
		if data[i] <= x {
			n2++
		}
	}
	return float64(n1+n2) / 2.0 / float64(len(data))
}

// quantile returns the linearly-interpolated empirical quantile q of the
// (already sorted) data slice. Returns NaN for empty input.
func quantile(q float64, data []float64) float64 {
	if len(data) == 0 {
		return math.NaN()
	}

	if q == 1 || len(data) == 1 {
		return data[len(data)-1]
	}

	index := q * (float64(len(data)) - 1)
	return data[int(index)+1]*(index-float64(int(index))) + data[int(index)]*(float64(int(index)+1)-index)
}

func TestMergeNormal(t *testing.T) {
	testMerge(t, false)
}

// NOTE(review): renamed from TestMergeDescructive (typo).
func TestMergeDestructive(t *testing.T) {
	testMerge(t, true)
}

// testMerge splits a random dataset across several digests, merges them
// (destructively or not) into one, and compares quantile/CDF accuracy of
// the merged digest against a digest built from the full stream.
func testMerge(t *testing.T, destructive bool) {
	if testing.Short() {
		t.Skipf("Skipping merge test. Short flag is on")
	}

	const numItems = 100000

	for _, numSubs := range []int{2, 5, 10, 20, 50, 100} {
		data := make([]float64, numItems)

		subs := make([]*TDigest, numSubs)
		for i := 0; i < numSubs; i++ {
			subs[i] = uncheckedNew()
		}

		dist := uncheckedNew()
		for i := 0; i < numItems; i++ {
			num := rand.Float64()

			data[i] = num
			_ = dist.Add(num)
			_ = subs[i%numSubs].Add(num)
		}

		_ = dist.Compress()

		dist2 := uncheckedNew()
		for i := 0; i < numSubs; i++ {
			if destructive {
				_ = dist2.MergeDestructive(subs[i])
			} else {
				_ = dist2.Merge(subs[i])
			}

		}

		if dist.Count() != dist2.Count() {
			t.Errorf("Expected the number of centroids to be the same. %d != %d", dist.Count(), dist2.Count())
		}

		if dist2.Count() != numItems {
			t.Errorf("Items shouldn't have disappeared. %d != %d", dist2.Count(), numItems)
		}

		sort.Float64s(data)

		for _, q := range []float64{0.001, 0.01, 0.1, 0.2, 0.3, 0.5} {
			z := quantile(q, data)
			p1 := dist.Quantile(q)
			p2 := dist2.Quantile(q)

			e1 := p1 - z
			e2 := p2 - z

			if math.Abs(e2)/q >= 0.3 {
				t.Errorf("rel >= 0.3: parts=%3d q=%.3f e1=%.4f e2=%.4f rel=%.3f real=%.3f",
					numSubs, q, e1, e2, math.Abs(e2)/q, z-q)
			}
			if math.Abs(e2) >= 0.015 {
				t.Errorf("e2 >= 0.015: parts=%3d q=%.3f e1=%.4f e2=%.4f rel=%.3f real=%.3f",
					numSubs, q, e1, e2, math.Abs(e2)/q, z-q)
			}

			z = cdf(q, data)
			e1 = dist.CDF(q) - z
			e2 = dist2.CDF(q) - z

			// Messages fixed: they previously both read "CDF e2 < 0.015"
			// (the passing condition, and the wrong condition entirely for
			// the relative-error branch).
			if math.Abs(e2)/q > 0.3 {
				t.Errorf("CDF rel >= 0.3: parts=%3d q=%.3f e1=%.4f e2=%.4f rel=%.3f",
					numSubs, q, e1, e2, math.Abs(e2)/q)
			}

			if math.Abs(e2) >= 0.015 {
				t.Errorf("CDF e2 >= 0.015: parts=%3d q=%.3f e1=%.4f e2=%.4f rel=%.3f",
					numSubs, q, e1, e2, math.Abs(e2)/q)
			}
		}
	}
}

// TestCompressDoesntChangeCount checks that Compress is count-preserving.
func TestCompressDoesntChangeCount(t *testing.T) {
	tdigest := uncheckedNew()

	for i := 0; i < 1000; i++ {
		_ = tdigest.Add(rand.Float64())
	}

	initialCount := tdigest.Count()

	err := tdigest.Compress()
	if err != nil {
		t.Errorf("Compress() triggered an unexpected error: %s", err)
	}

	if tdigest.Count() != initialCount {
		t.Errorf("Compress() should not change count. Wanted %d, got %d", initialCount, tdigest.Count())
	}
}

// TestGammaDistribution exercises the digest on a heavily skewed Gamma(0.1,
// 0.1) distribution, checking both CDF and Quantile accuracy.
func TestGammaDistribution(t *testing.T) {
	const numItems = 100000

	digest := uncheckedNew()
	gammaRNG := rng.NewGammaGenerator(0xDEADBEE)

	data := make([]float64, numItems)
	for i := 0; i < numItems; i++ {
		data[i] = gammaRNG.Gamma(0.1, 0.1)
		_ = digest.Add(data[i])
	}

	sort.Float64s(data)

	softErrors := 0
	for _, q := range []float64{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999} {

		ix := float64(len(data))*q - 0.5
		index := int(math.Floor(ix))
		p := ix - float64(index)
		realQuantile := data[index]*(1-p) + data[index+1]*p

		// estimated cdf of real quantile(x)
		if math.Abs(digest.CDF(realQuantile)-q) > 0.005 {
			t.Errorf("Error in estimated CDF too high")
		}

		// real cdf of estimated quantile(x).
		// Renamed from `error`, which shadowed the predeclared identifier.
		qErr := math.Abs(q - cdf(digest.Quantile(q), data))
		if qErr > 0.005 {
			softErrors++
		}

		if qErr > 0.012 {
			t.Errorf("Error in estimated Quantile too high")
		}
	}

	if softErrors >= 3 {
		t.Errorf("Too many soft errors")
	}

	// Issue #17, verify that we are hitting the extreme CDF case
	// XXX Maybe test this properly instead of having a hardcoded value
	extreme := digest.CDF(0.71875)
	if !closeEnough(extreme, 1) {
		t.Errorf("Expected something close to 1 but got %.4f instead", extreme)
	}
}

// shouldPanic fails the test with message if f does not panic.
func shouldPanic(f func(), t *testing.T, message string) {
	defer func() {
		tryRecover := recover()
		if tryRecover == nil {
			t.Error(message)
		}
	}()
	f()
}

// TestPanic verifies that Quantile panics on out-of-range inputs.
func TestPanic(t *testing.T) {
	tdigest := uncheckedNew()

	shouldPanic(func() {
		tdigest.Quantile(-42)
	}, t, "Quantile < 0 should panic!")

	shouldPanic(func() {
		tdigest.Quantile(42)
	}, t, "Quantile > 1 should panic!")
}

// TestForEachCentroid checks both early-exit (callback returns false) and
// full iteration over all centroids.
func TestForEachCentroid(t *testing.T) {
	tdigest := uncheckedNew(Compression(10))

	for i := 0; i < 100; i++ {
		_ = tdigest.Add(float64(i))
	}

	// Iterate limited number.
	means := []float64{}
	tdigest.ForEachCentroid(func(mean float64, count uint64) bool {
		means = append(means, mean)
		return len(means) != 3
	})
	if len(means) != 3 {
		t.Errorf("ForEachCentroid handled incorrect number of data items")
	}

	// Iterate all datapoints.
	means = []float64{}
	tdigest.ForEachCentroid(func(mean float64, count uint64) bool {
		means = append(means, mean)
		return true
	})
	if len(means) != tdigest.summary.Len() {
		t.Errorf("ForEachCentroid did not handle all data")
	}
}

// TestQuantilesDontOverflow inserts slightly more than math.MaxUint32
// samples to guard against a uint32-based count overflowing.
func TestQuantilesDontOverflow(t *testing.T) {
	tdigest := uncheckedNew(Compression(100))
	// Add slightly more than math.MaxUint32 samples uniformly in the range
	// [0, 1). This would overflow a uint32-based implementation.
	_ = tdigest.Add(1)
	for i := 0; i < 1024; i++ {
		_ = tdigest.AddWeighted(float64(i)/1024, 4194304)
	}
	assertDifferenceSmallerThan(tdigest, 0.5, .02, t)
}

// TestCDFInsideLastCentroid is a regression test: CDF of a value falling
// inside the last centroid must not exceed 1.
func TestCDFInsideLastCentroid(t *testing.T) {
	// values pulled from a live digest. sorry it's a lot!
	td := &TDigest{
		summary: &summary{
			means:  []float64{2120.75048828125, 2260.3844299316406, 3900.490264892578, 3937.495807647705, 5390.479816436768, 10450.335285186768, 14152.897296905518, 16442.676349639893, 24303.143146514893, 56961.87361526489, 63891.24959182739, 73982.55232620239, 86477.50447463989, 110746.62556838989, 175479.7388496399, 300492.3404121399, 440452.5279121399, 515611.7700996399, 535827.0025215149, 546241.6822090149, 556965.3648262024, 569791.2124824524, 587320.6870918274, 603969.4175605774, 613751.6177558899, 624708.7593574524, 635060.0718574524, 641924.2007637024, 650656.4302558899, 660653.1714668274, 671380.9009590149, 687094.3667793274, 716595.8824043274, 740870.9800605774, 760276.2437324524, 768857.5786933899, 775021.0025215149, 787686.0337715149, 801473.4624824524, 815225.1255683899, 832358.6997871399, 852438.4751777649, 866134.2935371399, 1.10661549666214e+06, 1.1212118980293274e+06, 1.2230108433418274e+06, 1.5446490620918274e+06, 4.306712312091827e+06, 5.487582562091827e+06, 6.306383562091827e+06, 7.089308312091827e+06, 7.520797593341827e+06},
			counts: []uint64{0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x4, 0x5, 0x6, 0x3, 0x3, 0x4, 0x11, 0x23, 0x2f, 0x1e, 0x1b, 0x36, 0x31, 0x33, 0x4e, 0x5f, 0x61, 0x48, 0x2e, 0x26, 0x28, 0x2a, 0x31, 0x39, 0x51, 0x32, 0x2b, 0x12, 0x8, 0xb, 0xa, 0x11, 0xa, 0x11, 0x9, 0x7, 0x1, 0x1, 0x1, 0x3, 0x2, 0x1, 0x1, 0x1, 0x1},
		},
		compression: 5,
		count:       1250,
		rng:         globalRNG{},
	}

	if cdf := td.CDF(7.144560976650238e+06); cdf > 1 {
		t.Fatalf("invalid: %v", cdf)
	}
}

// TestTrimmedMean compares TrimmedMean against a brute-force trimmed mean
// over the raw samples, for several sizes and percentile windows.
func TestTrimmedMean(t *testing.T) {
	tests := []struct {
		p1, p2 float64
	}{
		{0, 1},
		{0.1, 0.9},
		{0.2, 0.8},
		{0.25, 0.75},
		{0, 0.5},
		{0.5, 1},
		{0.1, 0.7},
		{0.3, 0.9},
	}

	for _, size := range []int{100, 1000, 10000} {
		for _, test := range tests {
			td := uncheckedNew(Compression(100))

			data := make([]float64, 0, size)
			for i := 0; i < size; i++ {
				f := rand.Float64()
				data = append(data, f)
				err := td.Add(f)
				if err != nil {
					t.Fatal(err)
				}
			}

			got := td.TrimmedMean(test.p1, test.p2)
			wanted := trimmedMean(data, test.p1, test.p2)
			if math.Abs(got-wanted) > 0.01 {
				t.Fatalf("got %f, wanted %f (size=%d p1=%f p2=%f)",
					got, wanted, size, test.p1, test.p2)
			}

			for i := 0; i < 10; i++ {
				err := td.Add(float64(i * 100))
				if err != nil {
					t.Fatal(err)
				}
			}
			mean := td.TrimmedMean(0.1, 0.999)
			if mean < 0 {
				t.Fatalf("mean < 0")
			}
		}
	}
}

// TestTrimmedMeanCornerCases covers the empty, single-sample and
// two-sample digests.
func TestTrimmedMeanCornerCases(t *testing.T) {
	td := uncheckedNew(Compression(100))

	mean := td.TrimmedMean(0, 1)
	if mean != 0 {
		t.Fatalf("got %f, wanted 0", mean)
	}

	x := 1.0
	err := td.Add(x)
	if err != nil {
		t.Fatal(err)
	}

	mean = td.TrimmedMean(0, 1)
	if mean != 1 {
		t.Fatalf("got %f, wanted %f", mean, x)
	}

	err = td.Add(1000)
	if err != nil {
		t.Fatal(err)
	}

	mean = td.TrimmedMean(0, 1)
	wanted := 500.5
	if !closeEnough(mean, wanted) {
		t.Fatalf("got %f, wanted %f", mean, wanted)
	}
}

// trimmedMean is the brute-force reference: the mean of all samples whose
// values fall between the empirical p1 and p2 quantiles (inclusive).
// Note: it sorts ff in place.
func trimmedMean(ff []float64, p1, p2 float64) float64 {
	sort.Float64s(ff)
	x1 := stat.Quantile(p1, stat.Empirical, ff, nil)
	x2 := stat.Quantile(p2, stat.Empirical, ff, nil)

	var sum float64
	var count int
	for _, f := range ff {
		if f >= x1 && f <= x2 {
			sum += f
			count++
		}
	}
	return sum / float64(count)
}

// TestClone checks that a clone matches the source digest, is isolated
// from later mutations of the source, and remains usable.
func TestClone(t *testing.T) {
	seed := func(td *TDigest) {
		for i := 0; i < 100; i++ {
			err := td.Add(rand.Float64())
			if err != nil {
				t.Fatal(err)
			}
		}
	}

	td := uncheckedNew(Compression(42))
	seed(td)
	clone := td.Clone()

	// Clone behaves like td.

	if clone.Compression() != td.Compression() {
		t.Fatalf("got %f, wanted %f", clone.Compression(), td.Compression())
	}

	cloneCount := clone.Count()
	if cloneCount != td.Count() {
		t.Fatalf("got %d, wanted %d", cloneCount, td.Count())
	}

	cloneQuantile := clone.Quantile(1)
	if cloneQuantile != td.Quantile(1) {
		t.Fatalf("got %f, wanted %f", cloneQuantile, td.Quantile(1))
	}

	seed(td)
	if td.Count() == clone.Count() {
		t.Fatal("seed does not work")
	}

	// Clone is not changed after td is changed.

	if clone.Count() != cloneCount {
		t.Fatalf("got %d, wanted %d", clone.Count(), cloneCount)
	}

	if clone.Quantile(1) != cloneQuantile {
		t.Fatalf("got %f, wanted %f", clone.Quantile(1), cloneQuantile)
	}

	// Clone is fully functional.

	err := clone.Add(1)
	if err != nil {
		t.Fatal(err)
	}
}

var compressions = []float64{1, 10, 20, 30, 50, 100}

func BenchmarkTDigestAddOnce(b *testing.B) {
	for _, compression := range compressions {
		compression := compression
		b.Run(fmt.Sprintf("compression=%.0f", compression), func(b *testing.B) {
			benchmarkAddOnce(b, compression)
		})
	}
}

// benchmarkAddOnce times Add on a single long-lived digest; the random
// inputs are generated outside the timed region.
func benchmarkAddOnce(b *testing.B, compression float64) {
	t := uncheckedNew(Compression(compression))

	data := make([]float64, b.N)
	for n := 0; n < b.N; n++ {
		data[n] = rand.Float64()
	}

	b.ReportAllocs()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		err := t.Add(data[n])
		if err != nil {
			b.Error(err)
		}
	}
	b.StopTimer()
}

func BenchmarkTDigestAddMulti(b *testing.B) {
	for _, compression := range compressions {
		compression := compression
		for _, n := range []int{10, 100, 1000, 10000} {
			n := n
			name := fmt.Sprintf("compression=%.0f n=%d", compression, n)
			b.Run(name, func(b *testing.B) {
				benchmarkAddMulti(b, compression, n)
			})
		}
	}
}

// benchmarkAddMulti times building a fresh digest of `times` samples per
// iteration, including digest construction cost.
func benchmarkAddMulti(b *testing.B, compression float64, times int) {
	data := make([]float64, times)
	for i := 0; i < times; i++ {
		data[i] = rand.Float64()
	}

	b.ReportAllocs()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		t := uncheckedNew(Compression(compression))
		for i := 0; i < times; i++ {
			err := t.AddWeighted(data[i], 1)
			if err != nil {
				b.Error(err)
			}
		}
	}
	b.StopTimer()
}

func BenchmarkTDigestMerge(b *testing.B) {
	for _, compression := range compressions {
		compression := compression
		for _, n := range []int{1, 10, 100} {
			name := fmt.Sprintf("compression=%.0f n=%d", compression, n)
			b.Run(name, func(b *testing.B) {
				benchmarkMerge(b, compression, n)
			})
		}
	}
}

// benchmarkMerge times merging `times` pre-built digests into a fresh
// destination and compressing the result.
func benchmarkMerge(b *testing.B, compression float64, times int) {
	ts := make([]*TDigest, times)
	for i := 0; i < times; i++ {
		ts[i] = randomTDigest(compression)
	}

	b.ReportAllocs()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		dst := uncheckedNew(Compression(compression))

		for i := 0; i < times; i++ {
			err := dst.Merge(ts[i])
			if err != nil {
				b.Fatal(err)
			}
		}

		err := dst.Compress()
		if err != nil {
			b.Fatal(err)
		}
	}
}

// randomTDigest builds a digest filled with 20*compression uniform samples.
func randomTDigest(compression float64) *TDigest {
	t := uncheckedNew(Compression(compression))
	n := 20 * int(compression)
	for i := 0; i < n; i++ {
		err := t.Add(rand.Float64())
		if err != nil {
			panic(err)
		}
	}
	return t
}

// Pathological ordered-input case.
func BenchmarkAddOrdered(b *testing.B) {
	t, _ := New(Compression(100))

	for n := 0; n < b.N; n++ {
		err := t.Add(float64(n))
		if err != nil {
			b.Error(err)
		}
	}
}

func BenchmarkMerge(b *testing.B) {
	b.ReportAllocs()

	t, _ := New(Compression(100))
	for n := 0; n < 1000; n++ {
		_ = t.AddWeighted(rand.Float64(), uint64(rand.Intn(100)))
	}

	dest, _ := New(Compression(100))

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = dest.Merge(t)
	}
}

func BenchmarkMergeDestructive(b *testing.B) {
	b.ReportAllocs()

	t, _ := New(Compression(100))
	for n := 0; n < 1000; n++ {
		_ = t.AddWeighted(rand.Float64(), uint64(rand.Intn(100)))
	}

	dest, _ := New(Compression(100))

	b.ResetTimer()

	// After the first iteration, t's summary is scrambled, which means it's
	// mostly useless, but we can still merge it.
	for n := 0; n < b.N; n++ {
		_ = dest.MergeDestructive(t)
	}
}